Source code for veryscrape.process

from collections import defaultdict, Counter
from newspaper import fulltext
from xml.sax.saxutils import unescape
import lxml.html
import re
import threading

from .items import Item

_clean_functions = defaultdict(list)
_mutex = threading.Lock()


def register(name, *funcs):
    """
    Register cleaning functions so they are run automatically on items
    with source 'name'

    :param name: name of data source (e.g. 'twitter')
    :param funcs: cleaning functions to apply
    """
    with _mutex:
        _clean_functions[name].extend(funcs)

def unregister(name, *funcs):
    """
    Unregister functions registered with 'veryscrape.process.register'

    :param name: name of data source (e.g. 'twitter'), or '*' to clear
        all registered functions
    :param funcs: cleaning functions to remove; if omitted, all functions
        registered for 'name' are removed
    """
    with _mutex:
        if name == '*':
            _clean_functions.clear()
        elif not funcs:
            del _clean_functions[name]
        else:
            for func in funcs:
                _clean_functions[name].remove(func)

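# Illustrative sketch of registering a custom cleaner. ``strip_signature``
# and the source name 'myblog' are hypothetical, not part of veryscrape:
#
#     def strip_signature(text):
#         return text.split('-- ')[0]
#
#     register('myblog', strip_signature, clean_general)
#     ...
#     unregister('myblog', strip_signature)  # or unregister('myblog') for all
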
def clean_article(content):
    """Converts html text into article text, returning '' for broken html"""
    result = ''
    try:
        result = fulltext(content)
    except Exception:
        # Catch-all to ensure all broken html is discarded instead of raising
        pass
    return result

def clean_tweet(content):
    """
    Unescapes html entities, strips hashtag symbols (keeping the tag text)
    and replaces @-mentions with the static token MENTION
    """
    user_string = r'[A-Za-z0-9_\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff]'
    content = unescape(content)
    # Hashtags (ascii '#' and fullwidth '＃'): keep the tag text only
    content = re.sub(r'[#\uff03](%s+)' % user_string,
                     lambda m: ' %s ' % m.group(1), content)
    # Mentions
    content = re.sub(r'@%s{2,}' % user_string, ' MENTION ', content)
    # Retweet markers
    content = re.sub(r'RT\x20?:?', '', content)
    return content

def clean_reddit_comment(content):
    """
    Unescapes html entities and removes deleted/removed comment
    placeholders and subreddit paths (/r/...)
    """
    content = unescape(content)
    content = re.sub(
        r'(\[deleted\])|(\[removed\])|(\[not found\])', '', content)
    content = re.sub(r'/?r/[0-9a-zA-Z_]{3,}', '', content)
    return content

def clean_general(content):
    """
    Remove any urls, non-ascii text and redundant spaces,
    and normalize long runs of punctuation to a single swearword token
    """
    # Urls
    content = re.sub(
        r'(http|https):/?/?[\w_-]*(?:\.[\w_-]*)?[\d\w.,@?^=%&:/~+#-]*',
        '', content)
    # Non-ascii characters (including tabs, newlines and carriage returns)
    content = re.sub(r'[^\x20-\x7f]+', '', content)
    # Swearwords
    content = re.sub(
        r'[.,@?^=*%$\'";{}[\]<>|\\!&:/~+#-]{4,}', ' fucking ', content)
    # Redundant spaces
    content = re.sub(r'\x20{2,}', ' ', content)
    return content

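# Illustrative call (output shown approximately; exact spacing depends on
# the regexes above):
#
#     clean_general('Check https://example.com !!!! café')
#     # -> roughly 'Check fucking caf' (url and non-ascii stripped,
#     #    the run of punctuation normalized, spaces collapsed)
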
def clean_item(item):
    """
    Clean an item of undesirable data

    :param item: item to clean with all functions registered to item.source
    :return: cleaned item
    """
    content = item.content
    for func in _clean_functions[item.source]:
        content = func(content)
    return Item(content, topic=item.topic,
                source=item.source, created_at=item.created_at)

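# Minimal sketch of cleaning a single item. Field values are made up and
# the created_at placeholder is an assumption; the keyword arguments are
# taken from the Item(...) call above:
#
#     raw = Item('RT @someone: oil &amp; gold up #markets',
#                topic='commodities', source='twitter', created_at=None)
#     clean = clean_item(raw)   # runs clean_tweet, then clean_general
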
def classify_text(text, topic_query_dict):
    """
    Attempts to classify a text based on query strings organized by topic
    (Note: this is meant to be very fast - for a web spider)

    :param text: text to classify
    :param topic_query_dict: dict of topics and queries,
        e.g. {'t1': ['q1', 'q2'], 't2': ['q3'], ...}
    :return: the topic the text belongs to ('' if no queries match)
    """
    count = Counter(re.split(r'\W+', text.lower()))
    topic = ''
    max_count = 0
    for t, queries in topic_query_dict.items():
        c = sum(count[q] for q in queries)
        if c > max_count:
            max_count = c
            topic = t
    return topic

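# Quick illustration (topics and queries are invented):
#
#     topics = {'finance': ['stocks', 'bonds'], 'tech': ['python', 'linux']}
#     classify_text('Stocks and bonds rallied today', topics)  # -> 'finance'
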
def extract_urls(text):
    """
    Extract and return all urls found in href attributes of a given html text

    :param text: html text to extract urls from
    :return: set of urls
    """
    urls = set()
    try:
        result = lxml.html.fromstring(text)
        for e in result.xpath('//*[@href]'):
            href = e.get('href')
            if href is not None:
                urls.add(href)
    except Exception:
        # Broken html yields no urls rather than raising
        pass
    return urls

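# Sketch of pulling hrefs out of a small html snippet (markup invented):
#
#     extract_urls('<p><a href="https://example.com/a">a</a>'
#                  ' <a href="/b">b</a></p>')
#     # -> {'https://example.com/a', '/b'}
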
def remove_urls(text, remove=set(' )({}[];:')):
    """
    Removes (without returning) all urls present in a text

    :param text: text to clean urls from
    :param remove: break characters marking the end of a url
    :return: text cleaned of urls
    """
    ind = text.find('http')
    while ind > -1:
        length = len(text)
        for i in range(ind + 7, length):
            if text[i] in remove:
                break
        else:
            # No break character found - the url runs to the end of the text
            i = length
        text = text[:ind] + text[i:]
        ind = text.find('http')
    return text

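# Sketch of stripping a url in place (the url text is dropped, surrounding
# break characters are kept):
#
#     remove_urls('see http://example.com/x for details')
#     # -> 'see  for details' (double space; clean_general would collapse it)
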
register('twitter', clean_tweet, clean_general)
register('reddit', clean_reddit_comment, clean_general)
register('article', clean_article, clean_general)
register('blog', clean_article, clean_general)
register('spider', clean_article, clean_general)

__all__ = [
    'clean_article', 'clean_tweet', 'clean_reddit_comment',
    'clean_general', 'clean_item', 'register', 'unregister',
    'classify_text', 'extract_urls', 'remove_urls'
]