# codescripts / fhw_deduplication.py
# f541119578's picture
# Upload folder using huggingface_hub
# fdf190d verified
import json
from tqdm import tqdm
import datasets
from datasketch import MinHashLSH, MinHash
# Default locations and LSH settings used when the file is run as a script.
INPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_filtered.json"
OUTPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_deduplicated.json"
LSH_THRESHOLD = 0.7
NUM_PERM = 128


def instruction_tokens(instruction: str) -> list:
    """Return the whitespace-separated words of *instruction* that get hashed.

    Periods are deleted (not replaced by a space) and newlines are turned
    into spaces before the text is split on whitespace.
    """
    return instruction.replace('.', '').replace('\n', ' ').split()


def build_minhash(instruction: str, num_perm: int = NUM_PERM):
    """Return a ``MinHash`` updated with every word of *instruction*."""
    signature = MinHash(num_perm=num_perm)
    for word in instruction_tokens(instruction):
        signature.update(word.encode('utf-8'))
    return signature


def is_cluster_representative(idx: int, neighbour_keys) -> bool:
    """Decide whether the record at position *idx* is written to the output.

    *neighbour_keys* is the list of string indices returned by the LSH query
    for this record.  The record is kept when the query returned exactly one
    key, or when no returned key is numerically larger than *idx* -- in other
    words, of a group of mutually matching records only the one with the
    highest index survives.

    Note that a record is dropped as soon as *any* later candidate exists,
    even if that later candidate is itself dropped because of a still later
    one.
    """
    return len(neighbour_keys) == 1 or all(
        int(key) <= idx for key in neighbour_keys
    )


def deduplicate(
    input_path: str = INPUT_PATH,
    output_path: str = OUTPUT_PATH,
    threshold: float = LSH_THRESHOLD,
    num_perm: int = NUM_PERM,
) -> tuple:
    """Copy the JSON-lines file *input_path* to *output_path* without near-duplicates.

    Each non-blank input line must be a JSON object with an ``'instruction'``
    string.  All records are first inserted into a ``MinHashLSH`` index keyed
    by their position; a second pass queries the index for every record and
    writes the original line unchanged when ``is_cluster_representative``
    accepts it.

    Returns:
        ``(kept, total)`` -- the number of lines written and the number of
        records read.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no ``'instruction'`` field.
    """
    # Read-only access is sufficient for the input; the context manager
    # guarantees the handle is closed.  Blank lines (e.g. a trailing one)
    # are skipped because json.loads would fail on them.
    with open(input_path, 'r', encoding='utf-8') as src:
        records = [line for line in src if line.strip()]

    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for idx, line in enumerate(tqdm(records)):
        instruction = json.loads(line)['instruction']
        lsh.insert(str(idx), build_minhash(instruction, num_perm))

    # Signatures are recomputed here rather than cached, so memory use stays
    # bounded by the raw lines plus the LSH index.
    kept = 0
    with open(output_path, 'w', encoding='utf-8') as dst:
        for idx, line in enumerate(tqdm(records)):
            instruction = json.loads(line)['instruction']
            neighbours = lsh.query(build_minhash(instruction, num_perm))
            if is_cluster_representative(idx, neighbours):
                kept += 1
                dst.write(line)

    return kept, len(records)


if __name__ == "__main__":
    kept_count, total_count = deduplicate()
    print(kept_count)
    print(total_count)