veryscrape.scrapers package

Submodules

veryscrape.scrapers.google module

class veryscrape.scrapers.google.ArticleGen(q, topic='', source='')[source]

Bases: veryscrape.items.ItemGenerator

process_text(text)[source]
process_time(text)[source]
class veryscrape.scrapers.google.Google(*args, proxy_pool=None, **kwargs)[source]

Bases: veryscrape.scrape.SearchEngineScraper

extract_urls(text)[source]
item_gen

alias of ArticleGen

query_string(query)[source]
session_class

alias of GoogleSession

source = 'article'
class veryscrape.scrapers.google.GoogleSession(*args, proxy_pool=None, **kwargs)[source]

Bases: veryscrape.session.Session

error_on_failure = False
retries_to_error = 2

veryscrape.scrapers.reddit module

class veryscrape.scrapers.reddit.CommentGen(q, topic='', source='')[source]

Bases: veryscrape.items.ItemGenerator

process_text(text)[source]
process_time(text)[source]
removed_comments = {'[removed]', '[deleted]'}
class veryscrape.scrapers.reddit.Reddit(key, secret, *, proxy_pool=None)[source]

Bases: veryscrape.scrape.Scraper

get_comments(query, link)[source]
item_gen

alias of CommentGen

scrape(query, topic='', **kwargs)[source]
scrape_every = 600
session_class

alias of RedditSession

source = 'reddit'
class veryscrape.scrapers.reddit.RedditSession(*args, **kwargs)[source]

Bases: veryscrape.session.OAuth2Session

base_url = 'https://oauth.reddit.com/r/'
persist_user_agent = True
user_agent = 'python:veryscrape:v0.1.0 (by /u/jayjay)'

veryscrape.scrapers.twingly module

class veryscrape.scrapers.twingly.BlogGen(q, topic='', source='')[source]

Bases: veryscrape.items.ItemGenerator

process_text(text)[source]
process_time(text)[source]
class veryscrape.scrapers.twingly.Twingly(api_key, *, proxy_pool=None)[source]

Bases: veryscrape.scrape.SearchEngineScraper

extract_urls(text)[source]
item_gen

alias of BlogGen

query_string(query)[source]
source = 'blog'

veryscrape.scrapers.twitter module

class veryscrape.scrapers.twitter.TweetGen(q, topic='', source='')[source]

Bases: veryscrape.items.ItemGenerator

last_item = None
process_text(text)[source]
process_time(text)[source]
class veryscrape.scrapers.twitter.Twitter(key, secret, token, token_secret, *, proxy_pool=None)[source]

Bases: veryscrape.scrape.Scraper

item_gen

alias of TweetGen

scrape(query, topic='', **kwargs)[source]
session_class

alias of TwitterSession

source = 'twitter'
class veryscrape.scrapers.twitter.TwitterSession(*args, **kwargs)[source]

Bases: veryscrape.session.OAuth1Session

base_url = 'https://stream.twitter.com/1.1/'

Module contents — classes re-exported at the veryscrape.scrapers package level (Twitter, Reddit, Google, Twingly, Spider), duplicating their submodule documentation above

class veryscrape.scrapers.Twitter(key, secret, token, token_secret, *, proxy_pool=None)[source]

Bases: veryscrape.scrape.Scraper

item_gen

alias of TweetGen

scrape(query, topic='', **kwargs)[source]
session_class

alias of TwitterSession

source = 'twitter'
class veryscrape.scrapers.Reddit(key, secret, *, proxy_pool=None)[source]

Bases: veryscrape.scrape.Scraper

get_comments(query, link)[source]
item_gen

alias of CommentGen

scrape(query, topic='', **kwargs)[source]
scrape_every = 600
session_class

alias of RedditSession

source = 'reddit'
class veryscrape.scrapers.Google(*args, proxy_pool=None, **kwargs)[source]

Bases: veryscrape.scrape.SearchEngineScraper

extract_urls(text)[source]
item_gen

alias of ArticleGen

query_string(query)[source]
session_class

alias of GoogleSession

source = 'article'
class veryscrape.scrapers.Twingly(api_key, *, proxy_pool=None)[source]

Bases: veryscrape.scrape.SearchEngineScraper

extract_urls(text)[source]
item_gen

alias of BlogGen

query_string(query)[source]
source = 'blog'
class veryscrape.scrapers.Spider(*args, source_urls=(), proxy_pool=None, **kwargs)[source]

Bases: veryscrape.scrape.Scraper

close()[source]
concurrent_requests = 200
item_gen

alias of SpiderItemGen

scrape(query, topic='', **kwargs)[source]
scrape_every = 0
source = 'spider'