"""Remove near-duplicate records from a JSON-lines instruction dataset.

Each non-blank input line is a JSON object with an ``instruction`` string.
A MinHash signature is built from the whitespace-separated words of that
string (with ``.`` characters removed) and indexed in a MinHash LSH index.
A record is written to the output only when the index reports no similar
record at a later position in the file, so of a set of mutually similar
records the last one is the one kept.  The number of kept records and the
total number of records are printed at the end.
"""
import json

from tqdm import tqdm
import datasets
from datasketch import MinHashLSH, MinHash

INPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_filtered.json"
OUTPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_deduplicated.json"

# The signatures and the index must be built with the same number of
# permutations, so the value lives in one place.
NUM_PERM = 128
# Similarity threshold handed to the LSH index.
SIMILARITY_THRESHOLD = 0.7


def _instruction_minhash(record_line):
    """Return the MinHash signature for the ``instruction`` of one JSON line.

    Args:
        record_line: One line of the input file holding a JSON object with
            an ``instruction`` string.

    Returns:
        A ``MinHash`` updated with every word of the instruction.

    Raises:
        json.JSONDecodeError: If the line is not valid JSON.
        KeyError: If the object has no ``instruction`` key.
    """
    text = json.loads(record_line)['instruction']
    signature = MinHash(num_perm=NUM_PERM)
    # str.split() without arguments already splits on newlines, so only the
    # periods need to be removed before tokenising.
    for word in text.replace('.', '').split():
        signature.update(word.encode('utf-8'))
    return signature


# Read-only access is enough for the input.  Blank lines are dropped here so
# a trailing empty line does not crash json.loads, and so both passes below
# see exactly the same records at the same indices.
with open(INPUT_PATH, encoding='utf-8') as f:
    lines = [line for line in f if line.strip()]

# Pass 1: index every record under its position in the file.
lsh = MinHashLSH(threshold=SIMILARITY_THRESHOLD, num_perm=NUM_PERM)
for idx, line in enumerate(tqdm(lines)):
    lsh.insert(str(idx), _instruction_minhash(line))

# Pass 2: keep a record only if none of its matches comes later in the file.
# Signatures are recomputed instead of cached so that memory use stays at the
# size of the raw lines plus the index.
t = 0
# The output is opened only after the input was read and indexed, so a failure
# while loading does not leave a freshly truncated output file behind.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as fw:
    for idx, line in enumerate(tqdm(lines)):
        result = lsh.query(_instruction_minhash(line))
        # A record always matches itself (same signature, same buckets), so
        # the "only one match" case is already covered by this test.
        if all(int(sim_idx) <= idx for sim_idx in result):
            t = t + 1
            fw.write(line)

print(t)           # records kept
print(len(lines))  # records examined