| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import json |
| | import time |
| | import sys |
| |
|
| |
|
# Number of input lines between two progress messages.
_PROGRESS_INTERVAL = 100000


def collect_similar_urls(record, jaccard_similarity_threshold):
    """Flatten one parsed input record into the list of URLs to group.

    Args:
        record: Mapping of a main URL to an iterable of mappings, each of
            which maps another URL to its similarity score.
        jaccard_similarity_threshold: Minimum score (inclusive) another
            URL needs in order to be grouped with the main URL.

    Returns:
        A list with every main URL followed by those of its listed URLs
        whose score reaches the threshold.
    """
    urls = []
    for main_url, candidates in record.items():
        urls.append(main_url)
        for candidate in candidates:
            urls.extend(other_url for other_url, score in candidate.items()
                        if score >= jaccard_similarity_threshold)
    return urls


def _merge_urls(urls, url_to_index, index_to_urls):
    """Put ``urls`` into a single group, merging groups they already touch.

    ``url_to_index`` and ``index_to_urls`` are updated in place. A group
    that gets absorbed into another one has its slot set to ``None`` so
    that the indices of the remaining groups never change.
    """
    known = [url_to_index[url] for url in urls if url in url_to_index]
    if known:
        # The group of the first already-seen URL absorbs all the others.
        current_index = known[0]
        other_indices = set(known)
        other_indices.discard(current_index)
    else:
        current_index = len(index_to_urls)
        index_to_urls.append(set())
        other_indices = set()

    group = index_to_urls[current_index]
    for url in urls:
        url_to_index[url] = current_index
        group.add(url)

    for index in other_indices:
        absorbed = index_to_urls[index]
        group.update(absorbed)
        for url in absorbed:
            url_to_index[url] = current_index
        index_to_urls[index] = None


def group_duplicate_urls(lines, jaccard_similarity_threshold=0.7):
    """Group URLs that are linked by a high enough similarity score.

    Args:
        lines: Iterable of strings, each holding one JSON object in the
            shape accepted by ``collect_similar_urls``. Lines containing
            only whitespace are skipped.
        jaccard_similarity_threshold: Minimum score (inclusive) for two
            URLs to end up in the same group.

    Returns:
        A list whose entries are either a set of URLs (one group) or
        ``None`` (a group that was merged into another one).

    Raises:
        ValueError: If a non-blank line is not valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    url_to_index = {}
    index_to_urls = []
    start_time = time.time()
    for counter, line in enumerate(lines, 1):
        # A blank line (e.g. a trailing newline) carries no record.
        if line.strip():
            urls = collect_similar_urls(json.loads(line),
                                        jaccard_similarity_threshold)
            _merge_urls(urls, url_to_index, index_to_urls)

        if counter % _PROGRESS_INTERVAL == 0:
            print(' > processed {} lines in {} seconds ...'.format(
                counter, time.time() - start_time))
    return index_to_urls


def count_duplicates(index_to_urls):
    """Count the URLs to keep and to remove over all duplicate groups.

    Only groups with more than one URL are counted; each contributes one
    URL to keep and the rest to remove.

    NOTE(review): the nesting of the ``total_remain`` increment was
    reconstructed from a source without indentation -- confirm that
    single-URL groups are meant to be left out of the statistics.

    Returns:
        A ``(total_remain, total_remove)`` tuple.
    """
    total_remove = 0
    total_remain = 0
    for urls in index_to_urls:
        if urls is not None and len(urls) > 1:
            total_remove += len(urls) - 1
            total_remain += 1
    return total_remain, total_remove


def write_groups(index_to_urls, output):
    """Write every group with more than one URL to ``output``.

    Each group becomes one UTF-8 encoded line holding a JSON object that
    maps the group's index (as a string) to the list of its URLs.
    """
    # Binary mode keeps the line terminator a bare '\n' on every platform.
    with open(output, 'wb') as f:
        for i, urls in enumerate(index_to_urls):
            if urls is not None and len(urls) > 1:
                record = json.dumps({str(i): list(urls)},
                                    ensure_ascii=False)
                f.write((record + '\n').encode('utf-8'))


def main(argv=None):
    """Run the script: ``<input> <output> [jaccard_similarity_threshold]``.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``. The threshold defaults to 0.7 when omitted.

    Raises:
        SystemExit: If the input or output path is missing.
    """
    if argv is None:
        argv = sys.argv

    print('grouping duplicate urls ...')

    if len(argv) < 3:
        sys.exit('usage: {} <input> <output> '
                 '[jaccard_similarity_threshold]'.format(
                     (argv or ['<script>'])[0]))

    input_path = argv[1]
    output_path = argv[2]
    if len(argv) > 3:
        jaccard_similarity_threshold = float(argv[3])
    else:
        jaccard_similarity_threshold = 0.7

    # Decode explicitly as UTF-8 (the encoding the output is written in)
    # instead of relying on the locale's default encoding.
    with open(input_path, 'r', encoding='utf-8') as f:
        index_to_urls = group_duplicate_urls(f, jaccard_similarity_threshold)

    total_remain, total_remove = count_duplicates(index_to_urls)
    print('out of {} urls, only {} are unique and {} should be removed'.format(
        total_remove+total_remain, total_remain, total_remove))

    write_groups(index_to_urls, output_path)


if __name__ == '__main__':
    main()
| |
|