Asilarknes committed on
Commit
4a4c19d
·
verified ·
1 Parent(s): ba520ba

Upload kb_reindex.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. kb_reindex.py +17 -0
kb_reindex.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Re-index existing passages with title boosting.

Each passage's title is repeated in front of its text to raise the title
terms' BM25 term frequency, so the canonical article ranks higher for entity
questions.

Reads ``{KB}/passages.jsonl`` (one JSON object per line, each with ``title``
and ``text`` keys) and writes the BM25 index to ``{KB}/bm25_index``.
"""
import json
import time

import bm25s
import Stemmer

KB = "/root/kb"
# Number of times the title is prepended to the passage text.
TITLE_REPEATS = 3

stemmer = Stemmer.Stemmer("russian")

# Open with an explicit encoding so non-ASCII passages decode the same way
# regardless of the platform locale, and use a context manager so the handle
# is closed deterministically. Blank lines are skipped rather than crashing
# json.loads.
with open(f"{KB}/passages.jsonl", encoding="utf-8") as fh:
    P = [json.loads(line) for line in fh if line.strip()]

# Produces "<title>. <title>. <title>. <text>" for TITLE_REPEATS == 3.
corpus = [f"{p['title']}. " * TITLE_REPEATS + f"{p['text']}" for p in P]

# perf_counter is monotonic, so the elapsed time cannot go negative or jump
# if the system clock is adjusted during indexing.
t0 = time.perf_counter()
ctok = bm25s.tokenize(corpus, stopwords=None, stemmer=stemmer, show_progress=False)
retr = bm25s.BM25(k1=1.5, b=0.75)
retr.index(ctok, show_progress=False)
retr.save(f"{KB}/bm25_index")
print(f"reindexed {len(P)} docs in {time.perf_counter()-t0:.0f}s", flush=True)
# NOTE(review): presumably a completion marker watched by a caller — confirm
# before changing the text.
print("REIDX_DONE", flush=True)