Upload kb_reindex.py with huggingface_hub
Browse files- kb_reindex.py +17 -0
kb_reindex.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Re-index existing passages with title boosting (title repeated to raise its
|
| 2 |
+
BM25 weight) so the canonical article ranks higher for entity questions."""
|
| 3 |
+
import json, time
|
| 4 |
+
import bm25s, Stemmer
|
| 5 |
+
|
| 6 |
+
# Knowledge-base directory: passages.jsonl is read from here and the BM25
# index is written back under it.
KB = "/root/kb"
|
| 7 |
+
# Russian stemmer handed to bm25s.tokenize when the corpus is tokenized.
stemmer = Stemmer.Stemmer("russian")
|
| 8 |
+
# Load every passage (one JSON object per line). The file is opened in a
# context manager so the handle is closed deterministically, and decoded as
# UTF-8 explicitly so the result does not depend on the locale's default
# encoding. Blank lines (e.g. a trailing newline) are skipped rather than
# being fed to json.loads, which would raise on them.
with open(f"{KB}/passages.jsonl", encoding="utf-8") as passages_file:
    P = [json.loads(line) for line in passages_file if line.strip()]
|
| 9 |
+
# repeat title 3x to boost title-term frequency
|
| 10 |
+
# Build one indexable string per passage: the title three times, each
# followed by ". ", then the passage text.
corpus = []
for passage in P:
    title = passage["title"]
    body = passage["text"]
    corpus.append(f"{title}. {title}. {title}. {body}")
|
| 11 |
+
# Start of the timed section; the elapsed time printed at the end covers
# tokenization, indexing and saving (not loading the passages).
t0 = time.time()
|
| 12 |
+
# Tokenize the title-boosted documents with the Russian stemmer; no stopword
# list is supplied (stopwords=None) and the progress bar is disabled.
ctok = bm25s.tokenize(corpus, stopwords=None, stemmer=stemmer, show_progress=False)
|
| 13 |
+
# BM25 retriever with explicit k1 (term-frequency saturation) and
# b (document-length normalization) parameters.
retr = bm25s.BM25(k1=1.5, b=0.75)
|
| 14 |
+
# Build the index from the tokenized corpus (progress bar disabled).
retr.index(ctok, show_progress=False)
|
| 15 |
+
# Persist the index under the knowledge-base directory as "bm25_index".
retr.save(f"{KB}/bm25_index")
|
| 16 |
+
# Report how many passages were indexed and the whole seconds elapsed since
# t0; flushed immediately so the line is not held in the stdout buffer.
print(f"reindexed {len(P)} docs in {time.time()-t0:.0f}s", flush=True)
|
| 17 |
+
# Fixed completion marker on stdout, flushed immediately.
# NOTE(review): presumably a sentinel watched by whatever launches this
# script — confirm before changing the text.
print("REIDX_DONE", flush=True)
|