File size: 1,015 Bytes
fdf190d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import json
from tqdm import tqdm
import datasets
from datasketch import MinHashLSH, MinHash

# Source and destination files, one JSON object per line.
INPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_filtered.json"
OUTPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_deduplicated.json"

# MinHash / LSH settings; the signature size must be the same for the index
# and for every signature inserted into or queried against it.
NUM_PERM = 128
THRESHOLD = 0.7


def _instruction_minhash(record_line, num_perm=NUM_PERM):
    """Return the MinHash signature of one record's ``instruction`` text.

    Args:
        record_line: One line of the input file, holding a JSON object with
            an ``instruction`` string.
        num_perm: Number of permutations used for the signature.

    Returns:
        A ``MinHash`` updated with every whitespace-separated word of the
        instruction, after removing ``.`` characters and turning newlines
        into spaces.

    Raises:
        json.JSONDecodeError: If ``record_line`` is not valid JSON.
        KeyError: If the record has no ``instruction`` field.
    """
    record = json.loads(record_line)
    signature = MinHash(num_perm=num_perm)
    words = record['instruction'].replace('.', '').replace('\n', ' ').split()
    for word in words:
        signature.update(word.encode('utf-8'))
    return signature


def deduplicate(input_path, output_path, threshold=THRESHOLD,
                num_perm=NUM_PERM):
    """Copy ``input_path`` to ``output_path`` without near-duplicate records.

    Every record is first inserted into a MinHash LSH index under its line
    index. In a second pass each record is looked up again and written out
    only when the lookup returns a single key or when none of the returned
    keys belongs to a later record; in other words a record is dropped
    whenever a later record matches it.

    Args:
        input_path: Path of the JSON-lines file to read.
        output_path: Path of the JSON-lines file to write (overwritten).
        threshold: Similarity threshold passed to ``MinHashLSH``.
        num_perm: Number of permutations for the index and the signatures.

    Returns:
        A ``(kept, total)`` tuple: how many records were written and how
        many were read.
    """
    # Read-only access is enough for the source. Whitespace-only lines are
    # skipped because they are not records and would fail JSON parsing.
    with open(input_path, encoding="utf-8") as src:
        records = [line for line in src if line.strip()]

    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for idx, line in enumerate(tqdm(records)):
        lsh.insert(str(idx), _instruction_minhash(line, num_perm))

    kept = 0
    # The destination is opened only once the index is complete, so a
    # malformed input does not leave a truncated output file behind, and the
    # ``with`` block guarantees the data is flushed and the handle closed.
    with open(output_path, "w", encoding="utf-8") as dst:
        for idx, line in enumerate(tqdm(records)):
            matches = lsh.query(_instruction_minhash(line, num_perm))
            if len(matches) == 1 or all(int(key) <= idx for key in matches):
                kept += 1
                dst.write(line)
    return kept, len(records)


kept_count, total_count = deduplicate(INPUT_PATH, OUTPUT_PATH)
print(kept_count)
print(total_count)