| |
|
|
|
|
| import glob |
| import re |
| import time |
| import tldextract |
| import sys |
|
|
|
|
| |
# Registered-domain labels (no subdomain, no public suffix) whose urls are
# dropped; compared against the ``domain`` field returned by tldextract.
domain_blacklist = {
    '500px',
    'aapks',
    'akamaihd',
    'amazon',
    'apple',
    'artifactfire',
    'artstation',
    'awwni',
    'bandcamp',
    'battleforthenet',
    'coinscalendar',
    'dailymotion',
    'deviantart',
    'discord',
    'discordapp',
    'dlapkandroid',
    'dropbox',
    'e621',
    'ebay',
    'edealinfo',
    'erome',
    'eroshare',
    'explosm',
    'facebook',
    'fbcdn',
    'flickr',
    'furaffinity',
    'futhead',
    'gatopardo',
    'gfycat',
    'gifsound',
    'gifsoup',
    'giphy',
    'github',
    'google',
    'gunprime',
    'gyazo',
    'hotdealstar',
    'imagefap',
    'imageshack',
    'imgflip',
    'imgur',
    'instagram',
    'karmadecay',
    'kryptocal',
    'kym-cdn',
    'liveleak',
    'livememe',
    'lmgtfy',
    'magaimg',
    'memegenerator',
    'minorplanetcenter',
    'minus',
    'mobafire',
    'morejpeg',
    'nocookie',
    'pcpartpicker',
    'photobucket',
    'pinimg',
    'pinterest',
    'pixiv',
    'pornhub',
    'prntscr',
    'puu',
    'qkme',
    'quickmeme',
    'radd',
    'redd',
    'reddit',
    'reddit-stream',
    'redditlog',
    'redditmedia',
    'reddituploads',
    'redtube',
    'reupp',
    'reverb',
    'roanoke',
    'rollingstone',
    'sli',
    'soundcloud',
    'soundgasm',
    'spankbang',
    'spotify',
    'strawpoll',
    'streamable',
    'timeanddate',
    'tinypic',
    'touhouradio',
    'tumblr',
    'twimg',
    'twitch',
    'twitter',
    'vid',
    'vimeo',
    'vine',
    'vkaao',
    'vocaroo',
    'voyagefusion',
    'walmart',
    'wciu',
    'wikimedia',
    'wikipedia',
    'xhamster',
    'xkcd',
    'xvideos',
    'youtu',
    'youtube',
    'youtubedoubler',
    'ytimg',
    'zillexplorer',
}
|
|
def domain_is_in_blacklist(url):
    """Return True when the domain tldextract extracts from `url` is listed
    in `domain_blacklist`."""
    extracted = tldextract.extract(url)
    return extracted.domain in domain_blacklist
|
|
|
|
| |
# File extensions whose urls are dropped. Kept as a tuple because it is
# passed directly to str.endswith(), which accepts a tuple of suffixes.
# NOTE: every entry needs its own trailing comma -- adjacent string literals
# are silently concatenated ('.7z' '.ai' would become the single, useless
# suffix '.7z.ai').
extentions_blacklist = (
    '.3gp',
    '.7z',
    '.ai',
    '.aif',
    '.apk',
    '.app',
    '.avi',
    '.bin',
    '.bmp',
    '.bz2',
    '.css',
    '.csv',
    '.dat',
    '.deb',
    '.dmg',
    '.doc',
    '.docx',
    '.exe',
    '.gif',
    '.gifv',
    '.gz',
    '.iso',
    '.jar',
    '.jpeg',
    '.jpg',
    '.js',
    '.log',
    '.mid',
    '.midi',
    '.mkv',
    '.mov',
    '.mp3',
    '.mp4',
    '.mpeg',
    '.mpg',
    '.ogg',
    '.ogv',
    '.otf',
    '.pdf',
    '.pkg',
    '.png',
    '.pps',
    '.ppt',
    '.pptx',
    '.psd',
    '.py',
    '.qt',
    '.ram',
    '.rar',
    '.sql',
    '.svg',
    '.swf',
    '.tar.gz',
    '.tar',
    '.tgz',
    '.tiff',
    '.ttf',
    '.txt',
    '.wav',
    '.webm',
    '.wma',
    '.wmv',
    '.xls',
    '.xlsx',
    '.xml',
    '.xz',
    '.zip',
)
|
|
def extention_is_in_blacklist(url):
    """Return True when `url`, ignoring everything from the first '?' on and
    compared case-insensitively, ends with a blacklisted extension."""
    before_query, _, _ = url.partition('?')
    return before_query.lower().endswith(extentions_blacklist)
|
|
|
|
| |
| |
| |
# Case-insensitive pattern for a well-formed url: http/https scheme, then a
# dotted host name or a dotted-quad numeric address, an optional port and an
# optional path/query that contains no whitespace.
url_regex = re.compile(
    r'^(?:http)s?://'
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    r'(?::\d+)?'
    r'(?:/?|[/?]\S+)$',
    re.IGNORECASE,
)


def url_is_malformed(url):
    """Return True when `url` does not match `url_regex`."""
    matched = url_regex.match(url)
    return matched is None
|
|
|
|
def print_progress(prefix, start_time, urls_counter,
                   domain_blacklist_counter,
                   extention_blacklist_counter,
                   short_url_counter, malformed_url_counter,
                   duplicate_url_counter):
    """Print a single-line summary of the filtering counters.

    The line starts with `prefix`, followed by the seconds elapsed since
    `start_time` (a time.time() timestamp) and each counter, separated
    by ' | '. Output is flushed immediately.
    """
    elapsed = time.time() - start_time
    segments = (
        prefix + ' | ',
        'time elapsed (s): {:.2f} | '.format(elapsed),
        'number of urls: {} | '.format(urls_counter),
        'domain blacklisted: {} | '.format(domain_blacklist_counter),
        'extention blacklisted: {} | '.format(extention_blacklist_counter),
        'short urls (<=8): {} | '.format(short_url_counter),
        'malformed urls: {} | '.format(malformed_url_counter),
        'duplicate urls: {}'.format(duplicate_url_counter),
    )
    print(''.join(segments), flush=True)
|
|
|
|
if __name__ == '__main__':

    print('remove blacklisted urls ..')

    # Both positional arguments are required; exit with a usage message
    # rather than an IndexError traceback when either one is missing.
    if len(sys.argv) < 3:
        sys.exit('usage: {} <path to url files> <output file>'.format(
            sys.argv[0]))

    # Directory holding the *.txt url files.
    path = sys.argv[1]
    # File the cleaned-up url list is written to.
    output = sys.argv[2]

    # Collect the url files found directly in that directory.
    files = glob.glob(path + '/*.txt')
    print('> found {} files'.format(len(files)))

    # Urls that survived every filter; the set also drives duplicate
    # detection.
    urls = set()
    urls_counter = 0
    domain_blacklist_counter = 0
    extention_blacklist_counter = 0
    short_url_counter = 0
    malformed_url_counter = 0
    duplicate_url_counter = 0
    start_time = time.time()
    for filename in files:
        with open(filename, 'r') as f:
            for line in f:
                url = line.strip()
                urls_counter += 1
                # Filters are applied in order; the first one that matches
                # decides which counter the url is charged to.
                if domain_is_in_blacklist(url):
                    print('[DOMAIN BLACKLIST]: {}'.format(url), flush=True)
                    domain_blacklist_counter += 1
                elif extention_is_in_blacklist(url):
                    print('[EXTENTION BLACKLIST]: {}'.format(url), flush=True)
                    extention_blacklist_counter += 1
                elif len(url) <= 8:
                    print('[SHORT URL]: {}'.format(url), flush=True)
                    short_url_counter += 1
                elif url_is_malformed(url):
                    print('[MALFORMED URL]: {}'.format(url), flush=True)
                    malformed_url_counter += 1
                elif url in urls:
                    print('[DUPLICATE URL]: {}'.format(url), flush=True)
                    duplicate_url_counter += 1
                else:
                    urls.add(url)
                # Periodic progress report, every 100000 input lines.
                if urls_counter % 100000 == 0:
                    print_progress('PROGRESS', start_time, urls_counter,
                                   domain_blacklist_counter,
                                   extention_blacklist_counter,
                                   short_url_counter, malformed_url_counter,
                                   duplicate_url_counter)

    print_progress('FINAL', start_time, urls_counter,
                   domain_blacklist_counter,
                   extention_blacklist_counter,
                   short_url_counter, malformed_url_counter,
                   duplicate_url_counter)

    # Write the surviving urls, one per line (set iteration order).
    print('> writing cleaned up url list to {}'.format(output))
    with open(output, 'w') as f:
        for url in urls:
            f.write(url + '\n')

    print('done :-)')
|
|