# Listing metadata: file size 1,015 bytes, revision fdf190d.
import json
from tqdm import tqdm
import datasets
from datasketch import MinHashLSH, MinHash
# Near-duplicate removal for instruction records stored one JSON object per
# line. Every record's 'instruction' text is fingerprinted with MinHash and
# indexed in an LSH table; a record is written to the output only when the
# index reports no matching record *later* in the file, so the last member of
# each group of near-duplicates is the one that survives.
INPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_filtered.json"
OUTPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_deduplicated.json"
LSH_THRESHOLD = 0.7  # threshold handed to MinHashLSH
NUM_PERM = 128  # must be identical for the index and for every MinHash


def instruction_tokens(instruction):
    """Return the tokens used to fingerprint *instruction*.

    Full stops are removed (not replaced by a space, so "x.y" becomes "xy"),
    newlines count as spaces, and the text is split on whitespace.
    """
    return instruction.replace('.', '').replace('\n', ' ').split()


def build_minhash(instruction):
    """Return a MinHash built from the tokens of *instruction*."""
    minhash = MinHash(num_perm=NUM_PERM)
    for word in instruction_tokens(instruction):
        minhash.update(word.encode('utf-8'))
    return minhash


def has_no_later_match(matched_keys, position):
    """Return True when no key in *matched_keys* lies after *position*.

    Keys are the stringified record positions used when inserting into the
    index. A query always returns the record's own key, so a lone match needs
    no special case: it satisfies the check like any other non-later key.
    """
    return all(int(key) <= position for key in matched_keys)


def main():
    """Deduplicate INPUT_PATH into OUTPUT_PATH and print the two counts.

    Prints the number of records kept, then the number of records read.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no 'instruction' field.
    """
    with open(INPUT_PATH, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing one) carry no record; skipping them
        # avoids a JSON decode error.
        lines = [line for line in f if line.strip()]

    # Pass 1: fingerprint every record once and index it under its position.
    lsh = MinHashLSH(threshold=LSH_THRESHOLD, num_perm=NUM_PERM)
    minhashes = []
    for idx, line in enumerate(tqdm(lines)):
        minhash = build_minhash(json.loads(line)['instruction'])
        lsh.insert(str(idx), minhash)
        minhashes.append(minhash)

    # Pass 2: reuse the stored fingerprints instead of recomputing them. The
    # output is opened only now, so a failure while reading or parsing the
    # input does not leave a truncated output file behind.
    t = 0
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as fw:
        for idx, minhash in enumerate(tqdm(minhashes)):
            if has_no_later_match(lsh.query(minhash), idx):
                t += 1
                fw.write(lines[idx])

    print(t)
    print(len(lines))


if __name__ == "__main__":
    main()
|