from collections import defaultdict, Counter
from newspaper import fulltext
from xml.sax.saxutils import unescape
import lxml.html
import re
import threading
from .items import Item
# Maps source name -> ordered list of cleaning functions to apply
_clean_functions = defaultdict(list)
# Guards registration/unregistration against concurrent threads
_mutex = threading.Lock()
def register(name, *funcs):
    """
    Register cleaning functions so they are run automatically
    on items whose source is `name`.

    :param name: name of the data source (e.g. 'twitter')
    :param funcs: cleaning functions to apply, in registration order
    """
    with _mutex:
        _clean_functions[name].extend(funcs)

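# Usage sketch - `strip_hashtags` is a hypothetical cleaning function,
# not part of this module:
#
#     def strip_hashtags(text):
#         return re.sub(r'#\w+', '', text)
#
#     register('twitter', strip_hashtags)
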
def unregister(name, *funcs):
    """
    Unregister functions registered with `veryscrape.process.register`.

    :param name: name of the data source (e.g. 'twitter'), or '*' to
        clear every registered function
    :param funcs: cleaning functions to remove; if omitted, all
        functions registered for `name` are removed
    """
    with _mutex:
        if name == '*':
            _clean_functions.clear()
        elif not funcs:
            # Drop the whole source; no-op if it was never registered
            _clean_functions.pop(name, None)
        else:
            for func in funcs:
                _clean_functions[name].remove(func)

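# Usage sketch (continuing the hypothetical example above):
#
#     unregister('twitter', strip_hashtags)   # remove one function
#     unregister('twitter')                   # remove all for a source
#     unregister('*')                         # clear every registration
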
def clean_article(content):
    """Convert raw html into plain article text."""
    try:
        return fulltext(content)
    except Exception:
        # Broken or non-article html is discarded rather than raised
        return ''

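# Usage sketch (assumes newspaper3k is installed; any fetched page works):
#
#     html = open('saved_page.html').read()   # hypothetical file
#     text = clean_article(html)              # '' if parsing fails
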
def clean_general(content):
    """
    Remove any urls, non-ascii text and redundant spaces; normalize
    long runs of punctuation (censored swearing) into a single swearword.
    """
    # Urls
    content = re.sub(
        r'https?:/?/?[\w_-]*(?:\.[\w_-]*)?[\d\w.,@?^=%&:/~+#-]*',
        '', content)
    # Non-printable-ascii characters (this range also covers \t, \n, \r)
    content = re.sub(r'[^\x20-\x7e]+', '', content)
    # Swearwords: four or more consecutive punctuation characters
    content = re.sub(
        r'[.,@?^=*%$\'";{}[\]<>|\\!&:/~+#-]{4,}',
        ' fucking ', content)
    # Redundant spaces
    content = re.sub(r'\x20{2,}', ' ', content)
    return content

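# Example of the combined effect (given the regexes above):
#
#     clean_general('Read https://example.com/page now!!!! \u2014 great')
#     # -> 'Read now fucking great'
#     # (url removed, '!!!!' normalized, non-ascii dash stripped,
#     #  repeated spaces collapsed)
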
def clean_item(item):
    """
    Clean an item's content with all functions registered to item.source.

    :param item: item to clean
    :return: a new Item with cleaned content
    """
    content = item.content
    # .get avoids growing the defaultdict for sources with no functions
    for func in _clean_functions.get(item.source, ()):
        content = func(content)
    return Item(content, topic=item.topic,
                source=item.source, created_at=item.created_at)

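# Usage sketch (Item's full signature lives in .items and is not shown
# here; the fields below mirror how clean_item constructs its result):
#
#     item = Item('so much winning!!!!', topic='politics',
#                 source='twitter', created_at=None)
#     clean_item(item).content   # content after clean_tweet + clean_general
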
def classify_text(text, topic_query_dict):
    """
    Attempt to classify a text based on query strings organized by topic.
    (Note: this is meant to be very fast - it runs inside a web spider.)

    :param text: text to classify
    :param topic_query_dict: dict mapping topics to query words,
        e.g. {'t1': ['q1', 'q2'], 't2': ['q3'], ...}
    :return: the topic whose queries occur most often in the text,
        or '' if no query occurs at all
    """
    # \W+ avoids the empty strings that splitting on single
    # non-word characters would produce
    count = Counter(re.split(r'\W+', text.lower()))
    topic = ''
    max_count = 0
    for t, queries in topic_query_dict.items():
        c = sum(count[q] for q in queries)
        if c > max_count:
            max_count = c
            topic = t
    return topic

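# Example:
#
#     topics = {'finance': ['stock', 'market'], 'tech': ['python', 'gpu']}
#     classify_text('The stock market rallied today', topics)  # -> 'finance'
#     classify_text('Nothing relevant here', topics)           # -> ''
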
def remove_urls(text, remove=set(' )({}[];:')):
    """
    Remove all urls present in a text.

    :param text: text to clean urls from
    :param remove: characters that mark the end of a url
    :return: text with urls removed
    """
    ind = text.find('http')
    while ind > -1:
        length = len(text)
        # Scan past 'http://' for the first character that ends the url
        for i in range(ind + 7, length):
            if text[i] in remove:
                break
        else:
            # Url runs to the end of the text: drop it entirely
            i = length
        text = text[:ind] + text[i:]
        ind = text.find('http')
    return text

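# Example:
#
#     remove_urls('see http://example.com now')   # -> 'see  now'
#
# NOTE: clean_tweet, clean_reddit_comment and extract_urls are referenced
# below and exported in __all__, but their definitions are not part of
# this section. The sketches that follow are assumptions inferred from
# the otherwise-unused imports above (xml.sax.saxutils.unescape,
# lxml.html); they are placeholders, not the library's implementations.


def clean_tweet(content):
    """Sketch: unescape html entities, strip tweet markup, drop urls."""
    content = unescape(content)
    content = re.sub(r'\bRT\b', '', content)  # retweet marker (assumed)
    content = re.sub(r'@\w+', '', content)    # @mentions (assumed)
    return remove_urls(content)


def clean_reddit_comment(content):
    """Sketch: unescape html entities and drop deleted/removed bodies."""
    content = unescape(content)
    # '[deleted]'/'[removed]' tombstones are an assumed convention
    if content in ('[deleted]', '[removed]'):
        return ''
    return content


def extract_urls(html):
    """Sketch: pull absolute http(s) links out of html with lxml."""
    tree = lxml.html.fromstring(html)
    return [link for _, _, link, _ in tree.iterlinks()
            if link.startswith('http')]
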
register('twitter', clean_tweet, clean_general)
register('reddit', clean_reddit_comment, clean_general)
register('article', clean_article, clean_general)
register('blog', clean_article, clean_general)
register('spider', clean_article, clean_general)
__all__ = [
'clean_article', 'clean_tweet', 'clean_reddit_comment', 'clean_general',
'clean_item', 'register', 'unregister',
'classify_text', 'extract_urls', 'remove_urls'
]
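
# End-to-end sketch (hypothetical tweet; Item fields assumed as above):
#
#     raw = Item('RT @user check https://t.co/abc wow!!!!',
#                topic='politics', source='twitter', created_at=None)
#     raw = clean_item(raw)
#     # raw.content has the url, RT marker and @mention removed,
#     # with '!!!!' normalized by clean_general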