"""Source code for veryscrape.scrapers.spider."""
import asyncio

from .google import extract_urls
from ..items import ItemGenerator
from ..scrape import Scraper


class SpiderItemGen(ItemGenerator):
    """Item generator for spidered pages.

    The topic of data gathered by the spider is not known up front, so
    every item is tagged '__classify__' and classified later.
    """

    def __init__(self, *args, **kwargs):
        super(SpiderItemGen, self).__init__(*args, **kwargs)
        self.topic = '__classify__'
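
# Note: the '__classify__' topic matches the queue Spider._fetch_callback
# puts raw pages on below, so everything the spider gathers is routed into
# the later classification step rather than to a fixed topic.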


class Spider(Scraper):
    source = 'spider'
    scrape_every = 0
    item_gen = SpiderItemGen
    concurrent_requests = 200  # maximum number of in-flight fetches

    def __init__(self, *args, source_urls=(), proxy_pool=None, **kwargs):
        super(Spider, self).__init__(*args, proxy_pool=proxy_pool, **kwargs)
        self.loop = asyncio.get_event_loop()
        self.source_urls = source_urls  # seed URLs the crawl starts from
        self.seen_urls = set()          # every URL ever queued, never re-fetched
        self.urls = set()               # discovered URLs waiting to be fetched
        self._futures = set()           # fetches currently in flight
        self._max_seen_urls = 1e7       # cap on the size of seen_urls
        self._scrape_future = None

    async def close(self):
        # Stop the crawl loop and cancel any in-flight fetches before
        # closing the underlying client.
        if self._scrape_future is not None:
            self._scrape_future.cancel()
        for future in set(self._futures):
            future.cancel()
        await super(Spider, self).close()

    def scrape(self, query, topic='', **kwargs):
        # The spider ignores query and topic: it crawls outward from its
        # seed URLs. The crawl is started at most once; every call returns
        # the same future.
        if self._scrape_future is None:
            self._scrape_future = asyncio.ensure_future(self._scrape())
        return self._scrape_future

    async def _create_scrape_future(self, url):
        # Throttle: wait for an in-flight slot to free up before fetching.
        while len(self._futures) >= self.concurrent_requests:
            await asyncio.sleep(1e-3)
        future = asyncio.ensure_future(self.client.fetch('GET', url))
        future.add_done_callback(self._fetch_callback)
        self._futures.add(future)
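
    # Design note: an asyncio.Semaphore, acquired before each fetch and
    # released in the done callback, would express this concurrency limit
    # without polling; the sleep loop keeps the logic free of extra state.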

    def _fetch_callback(self, future):
        # Runs on the event loop via add_done_callback, so it may touch
        # self.urls and self.queues without extra synchronisation.
        self._futures.remove(future)
        if not future.cancelled() and not future.exception():
            html = future.result()
            if html is not None:
                # Topic of data gathered by spider is classified later
                self.queues['__classify__'].put_nowait(html)
                # Queue newly discovered links, up to the seen-URL cap
                for url in extract_urls(html):
                    if (url not in self.seen_urls
                            and len(self.seen_urls) < self._max_seen_urls):
                        self.urls.add(url)
                        self.seen_urls.add(url)

    async def _scrape(self):
        self.urls.update(self.source_urls)
        # Crawl until no URLs are waiting and no fetches are in flight.
        while self._futures or self.urls:
            await asyncio.sleep(1e-2)
            for url in self.urls.copy():
                await self._create_scrape_future(url)
                self.urls.remove(url)
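

# Illustrative usage sketch, not part of the module above. It assumes the
# Scraper base class is constructible with just the keyword arguments shown
# and that it provides self.client and self.queues (neither is defined in
# this file), so treat it as a sketch rather than a working entry point.
if __name__ == '__main__':
    async def _demo():  # hypothetical helper, not from veryscrape
        spider = Spider(source_urls=('https://example.com',))
        try:
            # scrape() starts the crawl once and returns its future;
            # awaiting it runs until the frontier is exhausted.
            await spider.scrape('')
        finally:
            await spider.close()

    asyncio.get_event_loop().run_until_complete(_demo())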