JackSparrow89 committed on
Commit
bb04c5f
·
verified ·
1 Parent(s): 15c80f2

Upload 65 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +15 -0
  2. .gitignore +34 -0
  3. Dockerfile +20 -0
  4. __pycache__/check_nfcorpus.cpython-313.pyc.2070577919488 +0 -0
  5. __pycache__/main.cpython-313.pyc.2070578258992 +0 -0
  6. check_nfcorpus.py +29 -0
  7. config.yaml +42 -0
  8. data/nfcorpus/corpus.jsonl +0 -0
  9. data/nfcorpus/qrels/dev.tsv +0 -0
  10. data/nfcorpus/qrels/test.tsv +0 -0
  11. data/nfcorpus/qrels/train.tsv +0 -0
  12. data/nfcorpus/queries.jsonl +0 -0
  13. data/scifact/corpus.jsonl +0 -0
  14. data/scifact/qrels/test.tsv +340 -0
  15. data/scifact/qrels/train.tsv +920 -0
  16. data/scifact/queries.jsonl +0 -0
  17. docker-compose.yml +16 -0
  18. evaluation/__pycache__/dataset_loader.cpython-313.pyc.2070577919488 +0 -0
  19. evaluation/__pycache__/evaluator.cpython-313.pyc.2070577919488 +0 -0
  20. evaluation/__pycache__/indexer_bridge.cpython-313.pyc.2070577919488 +0 -0
  21. evaluation/__pycache__/query_runner.cpython-313.pyc.2070577919488 +0 -0
  22. evaluation/__pycache__/run_eval.cpython-313.pyc.2070577919488 +0 -0
  23. evaluation/dataset_loader.py +132 -0
  24. evaluation/evaluator.py +197 -0
  25. evaluation/indexer_bridge.py +94 -0
  26. evaluation/query_runner.py +128 -0
  27. evaluation/run_eval.py +170 -0
  28. indexer/__pycache__/chunker.cpython-313.pyc.2070577919488 +0 -0
  29. indexer/__pycache__/crawler.cpython-313.pyc.2070577919488 +0 -0
  30. indexer/__pycache__/embedder.cpython-313.pyc.2070577919488 +0 -0
  31. indexer/__pycache__/extractor.cpython-313.pyc.2070577919488 +0 -0
  32. indexer/__pycache__/pipeline.cpython-313.pyc.2070577919488 +0 -0
  33. indexer/__pycache__/store.cpython-313.pyc.2070577919488 +0 -0
  34. indexer/__pycache__/watcher.cpython-313.pyc.2070577919488 +0 -0
  35. indexer/chunker.py +135 -0
  36. indexer/crawler.py +102 -0
  37. indexer/embedder.py +111 -0
  38. indexer/extractor.py +115 -0
  39. indexer/pipeline.py +125 -0
  40. indexer/store.py +238 -0
  41. indexer/watcher.py +187 -0
  42. main.py +298 -0
  43. requirements.txt +21 -0
  44. searcher/__init__.py +0 -0
  45. searcher/__pycache__/__init__.cpython-313.pyc.2070577919488 +0 -0
  46. searcher/__pycache__/dense_retriever.cpython-313.pyc.2070577919488 +0 -0
  47. searcher/__pycache__/facet_filter.cpython-313.pyc.2070577919488 +0 -0
  48. searcher/__pycache__/fusion_ranker.cpython-313.pyc.2070577919488 +0 -0
  49. searcher/__pycache__/highlighter.cpython-313.pyc.2070577919488 +0 -0
  50. searcher/__pycache__/query_understanding.cpython-313.pyc.2070578319792 +0 -0
.dockerignore ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .gitignore
3
+ .venv
4
+ __pycache__/
5
+ *.pyc
6
+ *.pyo
7
+ *.pyd
8
+ *.log
9
+ .pytest_cache/
10
+ .mypy_cache/
11
+ .ruff_cache/
12
+ .idea/
13
+ .vscode/
14
+ data/
15
+ results/
.gitignore ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Virtual environment
2
+ .venv/
3
+
4
+ # Vector index and database (large binary files)
5
+ data/
6
+ results/
7
+
8
+ # Logs
9
+ *.log
10
+
11
+ # Downloaded ML models (auto-downloaded at runtime)
12
+ models/
13
+ .cache/
14
+ sentence_transformers/
15
+
16
+ # Python cache
17
+ __pycache__/
18
+ *.pyc
19
+ *.pyo
20
+ *.pyd
21
+ .env
22
+ venv/
23
+ env/
24
+ .venv/
25
+ # Model cache
26
+ .cache/
27
+
28
+ # OS files
29
+ .DS_Store
30
+ Thumbs.db
31
+
32
+ # IDE
33
+ .vscode/
34
+ .idea/
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1 \
5
+ PIP_NO_CACHE_DIR=1 \
6
+ NLTK_DATA=/usr/local/share/nltk_data
7
+
8
+ WORKDIR /app
9
+
10
+ COPY requirements.txt .
11
+
12
+ RUN pip install --upgrade pip && \
13
+ pip install -r requirements.txt && \
14
+ python -c "import nltk; nltk.download('wordnet', download_dir='/usr/local/share/nltk_data'); nltk.download('omw-1.4', download_dir='/usr/local/share/nltk_data')"
15
+
16
+ COPY . .
17
+
18
+ EXPOSE 7860
19
+
20
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
__pycache__/check_nfcorpus.cpython-313.pyc.2070577919488 ADDED
Binary file (1.46 kB). View file
 
__pycache__/main.cpython-313.pyc.2070578258992 ADDED
Binary file (11.7 kB). View file
 
check_nfcorpus.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys
3
+ import os
4
+
5
+ sys.path.append(os.path.abspath("."))
6
+ # Load results
7
+ with open('results/eval_nfcorpus.json') as f:
8
+ data = json.load(f)
9
+
10
+ # Load qrels
11
+ from evaluation.dataset_loader import DatasetLoader
12
+
13
+ loader = DatasetLoader('data/nfcorpus')
14
+ qrels = loader.load_qrels()
15
+
16
+ # 🔍 Debug prints
17
+ print("Sample RESULT query_id:", list(data.keys())[0])
18
+
19
+ first_qid = list(qrels.keys())[0]
20
+ print("Sample QREL query_id:", first_qid)
21
+
22
+ print("Sample QREL doc_id:", list(qrels[first_qid].keys())[0])
23
+
24
+ print("Total QREL queries:", len(qrels))
25
+ print("Total RESULT queries:", len(data))
26
+
27
+ # 🔥 Check overlap
28
+ common = set(data.keys()) & set(qrels.keys())
29
+ print("Common query IDs:", len(common))
config.yaml ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Directories to index
2
+ watch_paths:
3
+ - ./data/scifact
4
+ - ./data/nfcorpus #modify this
5
+
6
+ # File extensions to include
7
+ include_extensions:
8
+ - ".pdf"
9
+ - ".docx"
10
+ - ".txt"
11
+ - ".md"
12
+ - ".pptx"
13
+ - ".xlsx"
14
+ - ".py"
15
+ - ".js"
16
+ - ".ipynb"
17
+
18
+
19
+ # --- Retrieval, query expansion, and reranking settings ---
20
+ top_k: 5 # final results returned to user
21
+ candidate_k: 20 # candidates fetched before reranking
22
+
23
+ query_expansion: true # WordNet synonym expansion
24
+ max_synonyms: 5 # max synonyms to append
25
+
26
+ reranking_enabled: true # cross-encoder reranking
27
+ reranker_model: "cross-encoder/ms-marco-MiniLM-L-6-v2"
28
+
29
+ # Directories to skip
30
+ skip_directories:
31
+ - ".git"
32
+ - "node_modules"
33
+ - "__pycache__"
34
+ - ".venv"
35
+
36
+ # Where to store index data
37
+ data_dir: "./data"
38
+
39
+ embedding_model: "all-MiniLM-L6-v2"
40
+ # embedding_model: BAAI/bge-small-en-v1.5
41
+
42
+ debounce_seconds: 5
data/nfcorpus/corpus.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/nfcorpus/qrels/dev.tsv ADDED
The diff for this file is too large to render. See raw diff
 
data/nfcorpus/qrels/test.tsv ADDED
The diff for this file is too large to render. See raw diff
 
data/nfcorpus/qrels/train.tsv ADDED
The diff for this file is too large to render. See raw diff
 
data/nfcorpus/queries.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/scifact/corpus.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/scifact/qrels/test.tsv ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ query-id corpus-id score
2
+ 1 31715818 1
3
+ 3 14717500 1
4
+ 5 13734012 1
5
+ 13 1606628 1
6
+ 36 5152028 1
7
+ 36 11705328 1
8
+ 42 18174210 1
9
+ 48 13734012 1
10
+ 49 5953485 1
11
+ 50 12580014 1
12
+ 51 45638119 1
13
+ 53 45638119 1
14
+ 54 49556906 1
15
+ 56 4709641 1
16
+ 57 4709641 1
17
+ 70 5956380 1
18
+ 70 4414547 1
19
+ 72 6076903 1
20
+ 75 4387784 1
21
+ 94 1215116 1
22
+ 99 18810195 1
23
+ 100 4381486 1
24
+ 113 6157837 1
25
+ 115 33872649 1
26
+ 118 6372244 1
27
+ 124 4883040 1
28
+ 127 21598000 1
29
+ 128 8290953 1
30
+ 129 27768226 1
31
+ 130 27768226 1
32
+ 132 7975937 1
33
+ 133 38485364 1
34
+ 133 6969753 1
35
+ 133 17934082 1
36
+ 133 16280642 1
37
+ 133 12640810 1
38
+ 137 26016929 1
39
+ 141 6955746 1
40
+ 141 14437255 1
41
+ 142 10582939 1
42
+ 143 10582939 1
43
+ 146 10582939 1
44
+ 148 1084345 1
45
+ 163 18872233 1
46
+ 171 12670680 1
47
+ 179 16322674 1
48
+ 179 27123743 1
49
+ 179 23557241 1
50
+ 179 17450673 1
51
+ 180 16966326 1
52
+ 183 12827098 1
53
+ 185 18340282 1
54
+ 198 2177022 1
55
+ 208 13519661 1
56
+ 212 22038539 1
57
+ 213 13625993 1
58
+ 216 21366394 1
59
+ 217 21366394 1
60
+ 218 21366394 1
61
+ 219 21366394 1
62
+ 230 3067015 1
63
+ 232 10536636 1
64
+ 233 4388470 1
65
+ 236 4388470 1
66
+ 237 4942718 1
67
+ 238 2251426 1
68
+ 239 14079881 1
69
+ 248 1568684 1
70
+ 249 1568684 1
71
+ 261 1122279 1
72
+ 261 10697096 1
73
+ 268 970012 1
74
+ 269 970012 1
75
+ 274 11614737 1
76
+ 275 4961038 1
77
+ 275 14241418 1
78
+ 275 14819804 1
79
+ 279 14376683 1
80
+ 294 10874408 1
81
+ 295 20310709 1
82
+ 298 39381118 1
83
+ 300 3553087 1
84
+ 303 4388470 1
85
+ 312 6173523 1
86
+ 314 4347374 1
87
+ 324 2014909 1
88
+ 327 17997584 1
89
+ 338 23349986 1
90
+ 343 7873737 1
91
+ 343 5884524 1
92
+ 350 16927286 1
93
+ 354 8774475 1
94
+ 362 38587347 1
95
+ 380 19005293 1
96
+ 384 13770184 1
97
+ 385 9955779 1
98
+ 385 9767444 1
99
+ 386 16495649 1
100
+ 388 1148122 1
101
+ 399 791050 1
102
+ 410 14924526 1
103
+ 411 14924526 1
104
+ 415 6309659 1
105
+ 421 11172205 1
106
+ 431 28937856 1
107
+ 436 14637235 1
108
+ 437 18399038 1
109
+ 439 4423559 1
110
+ 440 4423559 1
111
+ 443 10165258 1
112
+ 452 12804937 1
113
+ 452 464511 1
114
+ 475 18678095 1
115
+ 478 14767844 1
116
+ 491 56893404 1
117
+ 501 17930286 1
118
+ 502 13071728 1
119
+ 507 30774694 1
120
+ 508 13980338 1
121
+ 513 13230773 1
122
+ 514 16256507 1
123
+ 516 29564505 1
124
+ 517 15663829 1
125
+ 521 34873974 1
126
+ 525 13639330 1
127
+ 527 3863543 1
128
+ 528 5476778 1
129
+ 532 12991445 1
130
+ 533 12991445 1
131
+ 535 39368721 1
132
+ 536 16056514 1
133
+ 539 13282296 1
134
+ 540 11886686 1
135
+ 540 25007443 1
136
+ 544 24221369 1
137
+ 549 9433958 1
138
+ 551 33499189 1
139
+ 552 1471041 1
140
+ 554 1049501 1
141
+ 560 40096222 1
142
+ 569 23460562 1
143
+ 575 10300888 1
144
+ 577 5289038 1
145
+ 578 8764879 1
146
+ 587 16999023 1
147
+ 589 10984005 1
148
+ 593 19675911 1
149
+ 597 12779444 1
150
+ 597 36355784 1
151
+ 597 25742130 1
152
+ 598 25742130 1
153
+ 613 9638032 1
154
+ 619 20888849 1
155
+ 619 2565138 1
156
+ 623 17000834 1
157
+ 628 24512064 1
158
+ 636 24294572 1
159
+ 637 25649714 1
160
+ 641 5912283 1
161
+ 641 31554917 1
162
+ 644 13619127 1
163
+ 649 12789595 1
164
+ 659 1215116 1
165
+ 660 1215116 1
166
+ 674 2095573 1
167
+ 684 4942718 1
168
+ 690 18750453 1
169
+ 691 10991183 1
170
+ 692 24088502 1
171
+ 693 24088502 1
172
+ 700 4350400 1
173
+ 702 4350400 1
174
+ 715 18421962 1
175
+ 716 18421962 1
176
+ 718 17587795 1
177
+ 721 1834762 1
178
+ 723 5531479 1
179
+ 727 7521113 1
180
+ 728 7521113 1
181
+ 728 36444198 1
182
+ 729 26851674 1
183
+ 742 32159283 1
184
+ 743 32159283 1
185
+ 744 8460275 1
186
+ 756 2831620 1
187
+ 759 1805641 1
188
+ 768 6421792 1
189
+ 770 15476777 1
190
+ 775 32275758 1
191
+ 781 24338780 1
192
+ 783 40632104 1
193
+ 784 2356950 1
194
+ 785 12471115 1
195
+ 793 8551160 1
196
+ 800 22543403 1
197
+ 805 22180793 1
198
+ 808 36606083 1
199
+ 811 19799455 1
200
+ 814 33387953 1
201
+ 820 8646760 1
202
+ 821 8646760 1
203
+ 823 15319019 1
204
+ 830 1897324 1
205
+ 831 1897324 1
206
+ 832 30303335 1
207
+ 834 5483793 1
208
+ 837 15928989 1
209
+ 839 1469751 1
210
+ 845 17741440 1
211
+ 847 16787954 1
212
+ 852 13843341 1
213
+ 859 1982286 1
214
+ 870 195689316 1
215
+ 873 1180972 1
216
+ 873 19307912 1
217
+ 873 27393799 1
218
+ 873 29025270 1
219
+ 873 3315558 1
220
+ 879 8426046 1
221
+ 880 8426046 1
222
+ 882 14803797 1
223
+ 887 18855191 1
224
+ 903 10648422 1
225
+ 904 7370282 1
226
+ 907 6923961 1
227
+ 911 11254556 1
228
+ 913 3203590 1
229
+ 914 3203590 1
230
+ 921 1642727 1
231
+ 922 17077004 1
232
+ 936 5483793 1
233
+ 956 12956194 1
234
+ 957 123859 1
235
+ 960 8780599 1
236
+ 967 2119889 1
237
+ 967 8997410 1
238
+ 971 46695481 1
239
+ 971 27873158 1
240
+ 971 28617573 1
241
+ 971 9764256 1
242
+ 975 5304891 1
243
+ 982 2988714 1
244
+ 985 6828370 1
245
+ 993 16472469 1
246
+ 1012 9745001 1
247
+ 1014 6277638 1
248
+ 1019 11603066 1
249
+ 1020 9433958 1
250
+ 1021 9433958 1
251
+ 1024 5373138 1
252
+ 1029 13923140 1
253
+ 1029 13940200 1
254
+ 1029 11899391 1
255
+ 1041 25254425 1
256
+ 1041 16626264 1
257
+ 1049 12486491 1
258
+ 1062 20381484 1
259
+ 1086 39281140 1
260
+ 1088 37549932 1
261
+ 1089 17628888 1
262
+ 1099 7662206 1
263
+ 1100 7662206 1
264
+ 1104 3898784 1
265
+ 1107 20532591 1
266
+ 1110 13770184 1
267
+ 1121 4456756 1
268
+ 1130 17997584 1
269
+ 1132 33499189 1
270
+ 1132 9283422 1
271
+ 1137 33370 1
272
+ 1140 12009265 1
273
+ 1144 10071552 1
274
+ 1146 13906581 1
275
+ 1150 11369420 1
276
+ 1163 15305881 1
277
+ 1175 31272411 1
278
+ 1179 31272411 1
279
+ 1180 31272411 1
280
+ 1185 16737210 1
281
+ 1187 52873726 1
282
+ 1191 30655442 1
283
+ 1194 11419230 1
284
+ 1196 25649714 1
285
+ 1197 25649714 1
286
+ 1199 16760369 1
287
+ 1200 3441524 1
288
+ 1202 3475317 1
289
+ 1204 31141365 1
290
+ 1207 18909530 1
291
+ 1213 14407673 1
292
+ 1216 24142891 1
293
+ 1221 19736671 1
294
+ 1225 9650982 1
295
+ 1226 13777138 1
296
+ 1232 13905670 1
297
+ 1241 4427392 1
298
+ 1245 7662395 1
299
+ 1259 24341590 1
300
+ 1262 44172171 1
301
+ 1266 37480103 1
302
+ 1270 13900610 1
303
+ 1271 13768432 1
304
+ 1272 17081238 1
305
+ 1273 11041152 1
306
+ 1274 12428814 1
307
+ 1274 27731651 1
308
+ 1274 4406819 1
309
+ 1278 11335781 1
310
+ 1279 11335781 1
311
+ 1280 4387784 1
312
+ 1281 4387784 1
313
+ 1282 23649163 1
314
+ 1290 4687948 1
315
+ 1292 56893404 1
316
+ 1298 11718220 1
317
+ 1303 12631697 1
318
+ 1316 27910499 1
319
+ 1319 16284655 1
320
+ 1320 16284655 1
321
+ 1332 5304891 1
322
+ 1335 27910499 1
323
+ 1336 27910499 1
324
+ 1337 20231138 1
325
+ 1339 15482274 1
326
+ 1344 9559146 1
327
+ 1352 12885341 1
328
+ 1359 11614737 1
329
+ 1362 8290953 1
330
+ 1363 8290953 1
331
+ 1368 2425364 1
332
+ 1370 2425364 1
333
+ 1379 16322674 1
334
+ 1379 27123743 1
335
+ 1379 23557241 1
336
+ 1379 17450673 1
337
+ 1382 17755060 1
338
+ 1385 306006 1
339
+ 1389 23895668 1
340
+ 1395 17717391 1
data/scifact/qrels/train.tsv ADDED
@@ -0,0 +1,920 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ query-id corpus-id score
2
+ 0 31715818 1
3
+ 2 13734012 1
4
+ 4 22942787 1
5
+ 6 2613775 1
6
+ 9 44265107 1
7
+ 10 32587939 1
8
+ 11 32587939 1
9
+ 12 33409100 1
10
+ 14 641786 1
11
+ 15 22080671 1
12
+ 17 1606628 1
13
+ 18 22942787 1
14
+ 19 3202143 1
15
+ 20 3202143 1
16
+ 21 41493639 1
17
+ 22 6490571 1
18
+ 24 3471191 1
19
+ 25 2613775 1
20
+ 26 32390525 1
21
+ 27 32390525 1
22
+ 28 12670680 1
23
+ 30 24341590 1
24
+ 32 12428497 1
25
+ 34 11705328 1
26
+ 35 5152028 1
27
+ 35 11705328 1
28
+ 37 5152028 1
29
+ 37 11705328 1
30
+ 39 13497630 1
31
+ 40 13497630 1
32
+ 41 18174210 1
33
+ 43 7224723 1
34
+ 44 56893404 1
35
+ 45 56893404 1
36
+ 46 380526 1
37
+ 47 3512154 1
38
+ 47 26996935 1
39
+ 52 45638119 1
40
+ 55 49556906 1
41
+ 58 4709641 1
42
+ 60 13899137 1
43
+ 60 13901073 1
44
+ 61 13899137 1
45
+ 61 13901073 1
46
+ 62 32587939 1
47
+ 63 40349336 1
48
+ 64 40349336 1
49
+ 66 14806256 1
50
+ 67 21295300 1
51
+ 68 21295300 1
52
+ 69 5956380 1
53
+ 69 4414547 1
54
+ 71 1127562 1
55
+ 73 6076903 1
56
+ 74 4387784 1
57
+ 76 5531479 1
58
+ 77 5531479 1
59
+ 78 5099266 1
60
+ 79 5099266 1
61
+ 80 4920376 1
62
+ 81 1797622 1
63
+ 82 3619372 1
64
+ 85 7521113 1
65
+ 85 22406695 1
66
+ 86 7521113 1
67
+ 86 22406695 1
68
+ 88 7521113 1
69
+ 88 22406695 1
70
+ 89 7521113 1
71
+ 89 22406695 1
72
+ 90 22406695 1
73
+ 91 1084345 1
74
+ 92 1084345 1
75
+ 93 2692522 1
76
+ 95 1215116 1
77
+ 96 14500725 1
78
+ 98 6540064 1
79
+ 104 40164383 1
80
+ 105 36606083 1
81
+ 106 25515907 1
82
+ 106 5151024 1
83
+ 108 6191684 1
84
+ 108 22995579 1
85
+ 108 23865182 1
86
+ 109 4319174 1
87
+ 111 13513790 1
88
+ 112 6157837 1
89
+ 114 33872649 1
90
+ 116 33872649 1
91
+ 119 14606752 1
92
+ 120 14606752 1
93
+ 121 31460499 1
94
+ 122 31460499 1
95
+ 123 4883040 1
96
+ 126 24512064 1
97
+ 134 4695046 1
98
+ 138 26016929 1
99
+ 139 22080671 1
100
+ 144 10582939 1
101
+ 149 6227220 1
102
+ 152 15488881 1
103
+ 153 4702639 1
104
+ 154 4702639 1
105
+ 155 37549932 1
106
+ 156 37549932 1
107
+ 157 13439128 1
108
+ 159 9394119 1
109
+ 160 52874170 1
110
+ 161 6903077 1
111
+ 164 5824985 1
112
+ 165 5824985 1
113
+ 166 18872233 1
114
+ 167 18872233 1
115
+ 168 5824985 1
116
+ 169 5824985 1
117
+ 172 12670680 1
118
+ 173 8126244 1
119
+ 174 1710116 1
120
+ 175 1710116 1
121
+ 176 32587939 1
122
+ 177 9669099 1
123
+ 178 16322674 1
124
+ 178 27123743 1
125
+ 178 23557241 1
126
+ 178 17450673 1
127
+ 181 16966326 1
128
+ 182 11369420 1
129
+ 184 12827098 1
130
+ 186 16855829 1
131
+ 187 16855829 1
132
+ 189 4421578 1
133
+ 196 19313533 1
134
+ 197 2177022 1
135
+ 199 2177022 1
136
+ 200 18231807 1
137
+ 201 2462673 1
138
+ 203 9558539 1
139
+ 204 7898952 1
140
+ 205 7898952 1
141
+ 205 470625 1
142
+ 209 32587939 1
143
+ 210 13794374 1
144
+ 211 13794374 1
145
+ 214 13625993 1
146
+ 220 19205437 1
147
+ 221 19205437 1
148
+ 222 19205437 1
149
+ 223 2014909 1
150
+ 224 6944800 1
151
+ 225 6944800 1
152
+ 226 6944800 1
153
+ 227 26973393 1
154
+ 228 4928057 1
155
+ 229 56893404 1
156
+ 235 4388470 1
157
+ 241 2212067 1
158
+ 241 10608822 1
159
+ 242 2212067 1
160
+ 242 10608822 1
161
+ 243 8148122 1
162
+ 244 21498497 1
163
+ 245 8447873 1
164
+ 245 3430789 1
165
+ 246 8447873 1
166
+ 246 3430789 1
167
+ 247 13578199 1
168
+ 250 1568684 1
169
+ 251 1568684 1
170
+ 253 37424881 1
171
+ 254 37424881 1
172
+ 255 5850219 1
173
+ 256 5850219 1
174
+ 258 22080671 1
175
+ 259 8883846 1
176
+ 262 14610165 1
177
+ 263 11328820 1
178
+ 263 30041340 1
179
+ 263 14853989 1
180
+ 264 11328820 1
181
+ 265 2033917 1
182
+ 266 22405338 1
183
+ 267 5912283 1
184
+ 267 31554917 1
185
+ 272 11614737 1
186
+ 277 14376683 1
187
+ 278 14376683 1
188
+ 280 25001628 1
189
+ 281 4632921 1
190
+ 283 1974176 1
191
+ 285 5548081 1
192
+ 286 4709641 1
193
+ 287 4709641 1
194
+ 290 15048300 1
195
+ 292 15048300 1
196
+ 293 10874408 1
197
+ 296 4398832 1
198
+ 299 39381118 1
199
+ 301 3553087 1
200
+ 304 14797520 1
201
+ 305 14797520 1
202
+ 306 7821634 1
203
+ 308 7821634 1
204
+ 309 7821634 1
205
+ 310 6173523 1
206
+ 313 6173523 1
207
+ 315 3701541 1
208
+ 316 712078 1
209
+ 317 4506414 1
210
+ 323 2014909 1
211
+ 325 40349336 1
212
+ 326 40349336 1
213
+ 330 9505448 1
214
+ 331 9505448 1
215
+ 332 29023309 1
216
+ 333 29023309 1
217
+ 334 25079962 1
218
+ 335 1780819 1
219
+ 336 2097256 1
220
+ 337 2097256 1
221
+ 339 23349986 1
222
+ 340 7098463 1
223
+ 341 7098463 1
224
+ 342 7873737 1
225
+ 342 5884524 1
226
+ 345 4394817 1
227
+ 346 11902109 1
228
+ 347 11902109 1
229
+ 349 13497630 1
230
+ 351 14658685 1
231
+ 352 14658685 1
232
+ 355 12800122 1
233
+ 355 38380061 1
234
+ 356 6144337 1
235
+ 357 18111172 1
236
+ 358 18111172 1
237
+ 361 38587347 1
238
+ 363 5386514 1
239
+ 364 1550937 1
240
+ 365 600437 1
241
+ 366 13956305 1
242
+ 367 27099731 1
243
+ 368 27099731 1
244
+ 369 6826100 1
245
+ 370 1550937 1
246
+ 371 1550937 1
247
+ 372 24922825 1
248
+ 375 1522647 1
249
+ 376 22401061 1
250
+ 377 18810195 1
251
+ 378 45154987 1
252
+ 378 10534299 1
253
+ 378 11886686 1
254
+ 378 25007443 1
255
+ 378 17150648 1
256
+ 379 19005293 1
257
+ 381 18340282 1
258
+ 382 11659421 1
259
+ 383 13770184 1
260
+ 389 1148122 1
261
+ 390 1148122 1
262
+ 391 1148122 1
263
+ 392 1148122 1
264
+ 393 1148122 1
265
+ 394 11360768 1
266
+ 396 1456068 1
267
+ 397 1456068 1
268
+ 398 8883846 1
269
+ 400 791050 1
270
+ 401 5633876 1
271
+ 403 1921218 1
272
+ 404 1921218 1
273
+ 406 6796297 1
274
+ 407 9889151 1
275
+ 413 6309659 1
276
+ 414 6309659 1
277
+ 416 6309659 1
278
+ 417 6309659 1
279
+ 418 16660256 1
280
+ 420 9315213 1
281
+ 422 11172205 1
282
+ 423 8595678 1
283
+ 425 33257464 1
284
+ 426 16728949 1
285
+ 428 16728949 1
286
+ 429 36540079 1
287
+ 430 28937856 1
288
+ 432 8002887 1
289
+ 434 9500590 1
290
+ 435 9500590 1
291
+ 441 2014909 1
292
+ 444 10165258 1
293
+ 445 10165258 1
294
+ 447 2052720 1
295
+ 448 2052720 1
296
+ 449 12209494 1
297
+ 449 3430789 1
298
+ 453 4200695 1
299
+ 454 4200695 1
300
+ 455 12643937 1
301
+ 456 30507607 1
302
+ 458 597790 1
303
+ 461 40096222 1
304
+ 463 19736671 1
305
+ 466 22544171 1
306
+ 469 1410197 1
307
+ 470 12685434 1
308
+ 472 7185591 1
309
+ 472 26330861 1
310
+ 472 4414481 1
311
+ 473 4373433 1
312
+ 474 4373433 1
313
+ 479 6325527 1
314
+ 480 6325527 1
315
+ 481 14706752 1
316
+ 482 10991183 1
317
+ 483 22703082 1
318
+ 484 14637235 1
319
+ 485 14637235 1
320
+ 486 14637235 1
321
+ 487 14637235 1
322
+ 488 1780819 1
323
+ 489 6625693 1
324
+ 490 56893404 1
325
+ 492 19583924 1
326
+ 493 19583924 1
327
+ 494 34873974 1
328
+ 495 17077004 1
329
+ 498 17077004 1
330
+ 499 26064662 1
331
+ 500 17930286 1
332
+ 504 10883736 1
333
+ 505 22703082 1
334
+ 506 7433668 1
335
+ 509 13980338 1
336
+ 515 29564505 1
337
+ 523 14803797 1
338
+ 524 14803797 1
339
+ 526 3863543 1
340
+ 529 10546779 1
341
+ 529 25413327 1
342
+ 529 36651210 1
343
+ 530 10546779 1
344
+ 530 25413327 1
345
+ 530 36651210 1
346
+ 530 87610599 1
347
+ 531 10546779 1
348
+ 531 25413327 1
349
+ 531 36651210 1
350
+ 537 16056514 1
351
+ 541 45154987 1
352
+ 541 11886686 1
353
+ 541 25007443 1
354
+ 542 19688024 1
355
+ 545 24221369 1
356
+ 547 10648422 1
357
+ 548 18199839 1
358
+ 550 33499189 1
359
+ 553 1471041 1
360
+ 555 1049501 1
361
+ 557 1049501 1
362
+ 559 3475317 1
363
+ 562 20101846 1
364
+ 563 2867345 1
365
+ 564 2867345 1
366
+ 565 16120395 1
367
+ 566 16120395 1
368
+ 568 23418635 1
369
+ 570 20333864 1
370
+ 571 20333864 1
371
+ 572 4447055 1
372
+ 573 10300888 1
373
+ 574 10300888 1
374
+ 576 4468861 1
375
+ 579 34139429 1
376
+ 580 23460562 1
377
+ 582 14260013 1
378
+ 584 14260013 1
379
+ 585 42291761 1
380
+ 588 16999023 1
381
+ 590 10984005 1
382
+ 591 14682243 1
383
+ 592 14682243 1
384
+ 594 19675911 1
385
+ 595 4824840 1
386
+ 600 12258338 1
387
+ 601 12258338 1
388
+ 602 3701541 1
389
+ 603 6540064 1
390
+ 606 712078 1
391
+ 607 4506414 1
392
+ 609 40096222 1
393
+ 610 40096222 1
394
+ 611 32408470 1
395
+ 612 9638032 1
396
+ 614 9638032 1
397
+ 615 9638032 1
398
+ 616 18670 1
399
+ 617 18670 1
400
+ 618 6836086 1
401
+ 620 2565138 1
402
+ 621 1642727 1
403
+ 622 17000834 1
404
+ 624 20033112 1
405
+ 625 20033112 1
406
+ 626 16355392 1
407
+ 631 5468807 1
408
+ 632 5172048 1
409
+ 633 5172048 1
410
+ 635 1686997 1
411
+ 638 25649714 1
412
+ 640 6503185 1
413
+ 642 13619127 1
414
+ 643 15535511 1
415
+ 645 12810152 1
416
+ 646 12810152 1
417
+ 647 15041758 1
418
+ 648 15041758 1
419
+ 650 12789595 1
420
+ 651 9433958 1
421
+ 652 9433958 1
422
+ 653 24384587 1
423
+ 654 57574395 1
424
+ 655 57574395 1
425
+ 657 8533245 1
426
+ 658 5293024 1
427
+ 661 37204802 1
428
+ 662 37204802 1
429
+ 663 22080671 1
430
+ 665 12580014 1
431
+ 666 4469125 1
432
+ 667 6493422 1
433
+ 668 6493422 1
434
+ 668 25148216 1
435
+ 669 6493422 1
436
+ 669 25148216 1
437
+ 670 5573975 1
438
+ 671 5573975 1
439
+ 672 15635366 1
440
+ 673 2095573 1
441
+ 676 857189 1
442
+ 677 857189 1
443
+ 679 13639330 1
444
+ 680 9315213 1
445
+ 681 9315213 1
446
+ 682 9315213 1
447
+ 683 9315213 1
448
+ 685 4452659 1
449
+ 686 4452659 1
450
+ 687 4452659 1
451
+ 688 4452659 1
452
+ 689 22080671 1
453
+ 694 1071991 1
454
+ 696 16355392 1
455
+ 698 22544171 1
456
+ 703 4350400 1
457
+ 704 14658685 1
458
+ 705 22442133 1
459
+ 709 22442133 1
460
+ 710 22442133 1
461
+ 713 18421962 1
462
+ 714 18421962 1
463
+ 717 17587795 1
464
+ 724 5531479 1
465
+ 726 7521113 1
466
+ 726 36444198 1
467
+ 730 13400643 1
468
+ 732 34469966 1
469
+ 733 34469966 1
470
+ 734 4961038 1
471
+ 736 5389095 1
472
+ 737 16562534 1
473
+ 737 6609935 1
474
+ 738 16562534 1
475
+ 738 6609935 1
476
+ 738 33912020 1
477
+ 739 4446814 1
478
+ 740 23078022 1
479
+ 745 11291348 1
480
+ 746 11291348 1
481
+ 747 11291348 1
482
+ 748 11291348 1
483
+ 749 13868795 1
484
+ 751 19800147 1
485
+ 752 19800147 1
486
+ 753 1173667 1
487
+ 755 17844478 1
488
+ 757 17123657 1
489
+ 758 14195528 1
490
+ 760 1805641 1
491
+ 761 10009203 1
492
+ 762 4695046 1
493
+ 764 7552215 1
494
+ 765 7552215 1
495
+ 766 7552215 1
496
+ 767 2488880 1
497
+ 771 15476777 1
498
+ 772 24922825 1
499
+ 774 32275758 1
500
+ 776 32275758 1
501
+ 777 32275758 1
502
+ 778 13001323 1
503
+ 779 13001323 1
504
+ 780 8246922 1
505
+ 780 24338780 1
506
+ 782 8246922 1
507
+ 787 4740447 1
508
+ 788 4740447 1
509
+ 789 15493354 1
510
+ 790 15493354 1
511
+ 791 15984735 1
512
+ 792 3610080 1
513
+ 795 8551160 1
514
+ 797 8551160 1
515
+ 798 8551160 1
516
+ 799 5293024 1
517
+ 801 22180793 1
518
+ 802 22180793 1
519
+ 803 22180793 1
520
+ 804 22180793 1
521
+ 807 36606083 1
522
+ 810 13513790 1
523
+ 812 19799455 1
524
+ 813 33387953 1
525
+ 815 8148304 1
526
+ 816 8148304 1
527
+ 817 17814815 1
528
+ 818 17814815 1
529
+ 822 15319019 1
530
+ 825 15319019 1
531
+ 826 4678846 1
532
+ 828 4678846 1
533
+ 835 15928989 1
534
+ 838 15928989 1
535
+ 840 15663829 1
536
+ 841 15663829 1
537
+ 844 17741440 1
538
+ 846 22696649 1
539
+ 848 14500725 1
540
+ 853 24922825 1
541
+ 854 12206390 1
542
+ 855 8190282 1
543
+ 856 43334921 1
544
+ 857 43334921 1
545
+ 858 1982286 1
546
+ 860 16066726 1
547
+ 861 16066726 1
548
+ 863 20568364 1
549
+ 863 16361581 1
550
+ 866 37822406 1
551
+ 867 14340571 1
552
+ 871 195689316 1
553
+ 876 195689316 1
554
+ 877 313394 1
555
+ 881 14803797 1
556
+ 883 14803797 1
557
+ 884 14803797 1
558
+ 885 6477536 1
559
+ 886 6477536 1
560
+ 890 2097256 1
561
+ 891 2097256 1
562
+ 893 13509809 1
563
+ 894 14724693 1
564
+ 895 18750453 1
565
+ 896 14338915 1
566
+ 897 14338915 1
567
+ 898 13106686 1
568
+ 898 5572127 1
569
+ 899 13106686 1
570
+ 899 5572127 1
571
+ 900 18678095 1
572
+ 901 6540064 1
573
+ 902 10648422 1
574
+ 908 6923961 1
575
+ 909 11254556 1
576
+ 910 11254556 1
577
+ 912 11254556 1
578
+ 916 18037805 1
579
+ 917 34071621 1
580
+ 919 16422880 1
581
+ 923 17077004 1
582
+ 925 17077004 1
583
+ 926 16390264 1
584
+ 927 16390264 1
585
+ 928 18174210 1
586
+ 929 18174210 1
587
+ 930 16056514 1
588
+ 933 14711483 1
589
+ 934 8563659 1
590
+ 935 5483793 1
591
+ 938 26231129 1
592
+ 939 26231129 1
593
+ 940 12258338 1
594
+ 941 12258338 1
595
+ 942 11527199 1
596
+ 944 1642727 1
597
+ 945 8428935 1
598
+ 945 26112696 1
599
+ 945 4463588 1
600
+ 945 13083189 1
601
+ 946 8428935 1
602
+ 946 26112696 1
603
+ 946 4463588 1
604
+ 946 13083189 1
605
+ 949 13578199 1
606
+ 951 21414718 1
607
+ 952 3355397 1
608
+ 953 3355397 1
609
+ 954 3355397 1
610
+ 955 2078658 1
611
+ 955 30507607 1
612
+ 959 8780599 1
613
+ 962 13931771 1
614
+ 962 935538 1
615
+ 962 4306711 1
616
+ 963 4162857 1
617
+ 963 29828242 1
618
+ 964 4162857 1
619
+ 964 29828242 1
620
+ 965 40817021 1
621
+ 969 19356271 1
622
+ 969 17368516 1
623
+ 970 19356271 1
624
+ 970 17368516 1
625
+ 972 46695481 1
626
+ 972 27873158 1
627
+ 972 28617573 1
628
+ 972 9764256 1
629
+ 973 27446873 1
630
+ 973 27873158 1
631
+ 973 28617573 1
632
+ 973 9764256 1
633
+ 976 5304891 1
634
+ 977 14075252 1
635
+ 977 39264456 1
636
+ 978 14075252 1
637
+ 979 11659421 1
638
+ 980 20128547 1
639
+ 984 6828370 1
640
+ 988 3033830 1
641
+ 989 9988425 1
642
+ 990 16472469 1
643
+ 992 16472469 1
644
+ 994 16472469 1
645
+ 996 16472469 1
646
+ 997 16472469 1
647
+ 998 16472469 1
648
+ 999 16472469 1
649
+ 1000 16472469 1
650
+ 1001 5702790 1
651
+ 1002 13639330 1
652
+ 1003 14332945 1
653
+ 1003 4319844 1
654
+ 1003 4899981 1
655
+ 1004 301838 1
656
+ 1004 2734421 1
657
+ 1004 3952288 1
658
+ 1005 301838 1
659
+ 1005 2734421 1
660
+ 1005 3952288 1
661
+ 1006 4926049 1
662
+ 1008 2547636 1
663
+ 1009 1982286 1
664
+ 1011 9745001 1
665
+ 1015 6277638 1
666
+ 1016 6277638 1
667
+ 1018 11603066 1
668
+ 1023 16927286 1
669
+ 1025 32408470 1
670
+ 1026 3113630 1
671
+ 1027 3113630 1
672
+ 1028 13923140 1
673
+ 1028 11899391 1
674
+ 1030 6441369 1
675
+ 1031 12486491 1
676
+ 1032 6836086 1
677
+ 1033 6836086 1
678
+ 1034 4547102 1
679
+ 1035 4547102 1
680
+ 1036 4547102 1
681
+ 1037 16287725 1
682
+ 1038 16287725 1
683
+ 1040 25254425 1
684
+ 1040 16626264 1
685
+ 1042 17421851 1
686
+ 1043 17671145 1
687
+ 1044 22500262 1
688
+ 1045 22500262 1
689
+ 1046 418246 1
690
+ 1046 4324278 1
691
+ 1046 16712164 1
692
+ 1047 14706752 1
693
+ 1048 12486491 1
694
+ 1050 19878070 1
695
+ 1052 18816720 1
696
+ 1053 18816720 1
697
+ 1054 10072941 1
698
+ 1055 13906581 1
699
+ 1056 4200695 1
700
+ 1058 13027590 1
701
+ 1065 20418809 1
702
+ 1067 4429668 1
703
+ 1068 4429668 1
704
+ 1069 4200695 1
705
+ 1070 25649714 1
706
+ 1072 4824840 1
707
+ 1073 4824840 1
708
+ 1074 14658685 1
709
+ 1075 14658685 1
710
+ 1081 5691302 1
711
+ 1084 5691302 1
712
+ 1085 5691302 1
713
+ 1087 39281140 1
714
+ 1090 17628888 1
715
+ 1091 2603304 1
716
+ 1096 29638116 1
717
+ 1097 26851674 1
718
+ 1098 13552682 1
719
+ 1101 3874000 1
720
+ 1102 3874000 1
721
+ 1103 3898784 1
722
+ 1105 6710713 1
723
+ 1106 6710713 1
724
+ 1109 13770184 1
725
+ 1109 8582337 1
726
+ 1111 1686881 1
727
+ 1112 1686881 1
728
+ 1114 12824568 1
729
+ 1115 44048701 1
730
+ 1118 23351136 1
731
+ 1119 5323845 1
732
+ 1119 18997216 1
733
+ 1119 13907928 1
734
+ 1120 5323845 1
735
+ 1120 18997216 1
736
+ 1120 13907928 1
737
+ 1125 21009874 1
738
+ 1126 21009874 1
739
+ 1127 27466734 1
740
+ 1128 33499189 1
741
+ 1128 9283422 1
742
+ 1133 24142891 1
743
+ 1134 33370 1
744
+ 1135 33370 1
745
+ 1136 33370 1
746
+ 1138 6796297 1
747
+ 1139 12009265 1
748
+ 1141 12009265 1
749
+ 1142 5260382 1
750
+ 1145 10071552 1
751
+ 1148 4828631 1
752
+ 1153 7370282 1
753
+ 1156 12584053 1
754
+ 1157 12584053 1
755
+ 1158 12584053 1
756
+ 1159 12584053 1
757
+ 1161 13048272 1
758
+ 1162 15305881 1
759
+ 1164 4455466 1
760
+ 1165 4455466 1
761
+ 1166 9889151 1
762
+ 1168 8563659 1
763
+ 1169 4319174 1
764
+ 1170 18956141 1
765
+ 1171 18956141 1
766
+ 1173 7370282 1
767
+ 1174 31272411 1
768
+ 1176 13910150 1
769
+ 1177 13910150 1
770
+ 1178 31272411 1
771
+ 1181 301838 1
772
+ 1181 2734421 1
773
+ 1181 39128592 1
774
+ 1181 3952288 1
775
+ 1182 14541844 1
776
+ 1183 1967017 1
777
+ 1184 16737210 1
778
+ 1186 7485455 1
779
+ 1188 4394817 1
780
+ 1190 30655442 1
781
+ 1193 20532591 1
782
+ 1195 26283293 1
783
+ 1205 5558754 1
784
+ 1206 18909530 1
785
+ 1208 10284593 1
786
+ 1209 4347374 1
787
+ 1210 4928282 1
788
+ 1211 4928282 1
789
+ 1212 6493422 1
790
+ 1212 44724517 1
791
+ 1214 6493422 1
792
+ 1214 14407673 1
793
+ 1215 16355392 1
794
+ 1218 15635366 1
795
+ 1219 9393969 1
796
+ 1219 14864285 1
797
+ 1220 13023410 1
798
+ 1223 5289038 1
799
+ 1224 21932050 1
800
+ 1224 34016987 1
801
+ 1227 25641414 1
802
+ 1228 25641414 1
803
+ 1229 1676568 1
804
+ 1230 13905670 1
805
+ 1231 13905670 1
806
+ 1234 13905670 1
807
+ 1235 17973161 1
808
+ 1236 17973161 1
809
+ 1237 3654468 1
810
+ 1238 3654468 1
811
+ 1239 21387297 1
812
+ 1239 4427392 1
813
+ 1244 18949516 1
814
+ 1246 7662395 1
815
+ 1247 5114282 1
816
+ 1248 7209559 1
817
+ 1249 7209559 1
818
+ 1253 3321943 1
819
+ 1254 16939583 1
820
+ 1255 16939583 1
821
+ 1257 581832 1
822
+ 1258 12040627 1
823
+ 1260 24341590 1
824
+ 1261 13023410 1
825
+ 1263 3981729 1
826
+ 1265 37480103 1
827
+ 1268 52072815 1
828
+ 1269 13900610 1
829
+ 1275 27731651 1
830
+ 1276 3475317 1
831
+ 1284 3578380 1
832
+ 1288 4687948 1
833
+ 1289 21239672 1
834
+ 1291 56893404 1
835
+ 1293 43329366 1
836
+ 1294 2078658 1
837
+ 1294 30507607 1
838
+ 1295 21239672 1
839
+ 1297 9167230 1
840
+ 1300 6421792 1
841
+ 1302 12631697 1
842
+ 1304 12631697 1
843
+ 1305 12631697 1
844
+ 1306 6000423 1
845
+ 1306 5836 1
846
+ 1307 18231807 1
847
+ 1308 18231807 1
848
+ 1309 18231807 1
849
+ 1310 8042158 1
850
+ 1311 13763195 1
851
+ 1312 24177706 1
852
+ 1314 13072112 1
853
+ 1314 16237005 1
854
+ 1315 13072112 1
855
+ 1315 16237005 1
856
+ 1322 16284655 1
857
+ 1323 19912367 1
858
+ 1324 19912367 1
859
+ 1325 40476126 1
860
+ 1327 24241932 1
861
+ 1327 22194407 1
862
+ 1328 3475317 1
863
+ 1330 14075252 1
864
+ 1331 14075252 1
865
+ 1333 1649738 1
866
+ 1334 13923140 1
867
+ 1334 13940200 1
868
+ 1334 11899391 1
869
+ 1340 15482274 1
870
+ 1341 15482274 1
871
+ 1342 8148122 1
872
+ 1345 9559146 1
873
+ 1346 9505402 1
874
+ 1347 19005293 1
875
+ 1348 19005293 1
876
+ 1349 5377642 1
877
+ 1350 5377642 1
878
+ 1351 28369117 1
879
+ 1353 18816720 1
880
+ 1355 5256564 1
881
+ 1356 13764090 1
882
+ 1360 11614737 1
883
+ 1361 15488881 1
884
+ 1361 15058155 1
885
+ 1364 8290953 1
886
+ 1366 4406819 1
887
+ 1367 2425364 1
888
+ 1371 16256507 1
889
+ 1372 21003930 1
890
+ 1373 21003930 1
891
+ 1374 21993510 1
892
+ 1375 21993510 1
893
+ 1376 3944632 1
894
+ 1378 2488880 1
895
+ 1380 16322674 1
896
+ 1380 23557241 1
897
+ 1380 17450673 1
898
+ 1381 13481880 1
899
+ 1383 17755060 1
900
+ 1386 306006 1
901
+ 1387 9669099 1
902
+ 1390 2890952 1
903
+ 1391 6766459 1
904
+ 1392 6766459 1
905
+ 1393 2000038 1
906
+ 1393 12440953 1
907
+ 1394 2251426 1
908
+ 1397 17717391 1
909
+ 1398 17717391 1
910
+ 1400 14706752 1
911
+ 1401 5185871 1
912
+ 1402 8126244 1
913
+ 1403 33370 1
914
+ 1403 38355793 1
915
+ 1404 33370 1
916
+ 1404 38355793 1
917
+ 1405 10504681 1
918
+ 1406 2617858 1
919
+ 1407 8087082 1
920
+ 1407 29863668 1
data/scifact/queries.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
docker-compose.yml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ semantic-search:
3
+ build:
4
+ context: .
5
+ dockerfile: Dockerfile
6
+ container_name: semantic-search
7
+ ports:
8
+ - "7860:7860"
9
+ environment:
10
+ NLTK_DATA: /usr/local/share/nltk_data
11
+ volumes:
12
+ - ./config.yaml:/app/config.yaml:ro
13
+ - ./data:/app/data
14
+ - ./results:/app/results
15
+ - ./documents:/documents
16
+ restart: unless-stopped
evaluation/__pycache__/dataset_loader.cpython-313.pyc.2070577919488 ADDED
Binary file (5.68 kB). View file
 
evaluation/__pycache__/evaluator.cpython-313.pyc.2070577919488 ADDED
Binary file (8.8 kB). View file
 
evaluation/__pycache__/indexer_bridge.cpython-313.pyc.2070577919488 ADDED
Binary file (4.73 kB). View file
 
evaluation/__pycache__/query_runner.cpython-313.pyc.2070577919488 ADDED
Binary file (5.62 kB). View file
 
evaluation/__pycache__/run_eval.cpython-313.pyc.2070577919488 ADDED
Binary file (8.4 kB). View file
 
evaluation/dataset_loader.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluation/dataset_loader.py
2
+
3
+ import json
4
+ import csv
5
+ import os
6
+
7
+
8
class DatasetLoader:
    """
    Loads BEIR-format datasets (SciFact, NFCorpus, etc.)

    BEIR format:
        corpus.jsonl  — {_id, title, text}
        queries.jsonl — {_id, text}
        qrels/*.tsv   — query_id, doc_id, relevance_score

    Relevance scales:
        SciFact  — binary (0 or 1)
        NFCorpus — graded (0, 1, 2, 3) → we keep anything >= 1
    """

    def __init__(self, dataset_path: str):
        """
        Resolve the corpus, queries and qrels paths inside dataset_path.

        Args:
            dataset_path — folder holding corpus.jsonl, queries.jsonl and qrels/

        Raises:
            FileNotFoundError — when neither qrels/test.tsv nor qrels/dev.tsv exists
        """
        self.dataset_path = dataset_path
        self.corpus_path = os.path.join(dataset_path, "corpus.jsonl")
        self.queries_path = os.path.join(dataset_path, "queries.jsonl")

        # qrels path — prefer test.tsv, fall back to dev.tsv
        qrels_dir = os.path.join(dataset_path, "qrels")
        test_path = os.path.join(qrels_dir, "test.tsv")
        dev_path = os.path.join(qrels_dir, "dev.tsv")

        if os.path.exists(test_path):
            self.qrels_path = test_path
        elif os.path.exists(dev_path):
            self.qrels_path = dev_path
            print("[INFO] test.tsv not found, using dev.tsv for qrels")
        else:
            raise FileNotFoundError(
                f"No qrels file found in {qrels_dir} — "
                f"expected test.tsv or dev.tsv"
            )

    @staticmethod
    def _read_jsonl(path: str):
        """Yield one parsed JSON object per non-blank line of path."""
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # blank lines (e.g. a trailing newline) are not records
                if line:
                    yield json.loads(line)

    def load_corpus(self) -> dict:
        """
        Load all documents from corpus.jsonl.

        Returns:
            dict — {doc_id: {"title": str, "text": str}}
        """
        corpus = {
            str(doc["_id"]): {
                "title": doc.get("title", ""),
                "text": doc.get("text", ""),
            }
            for doc in self._read_jsonl(self.corpus_path)
        }
        print(f"Loaded {len(corpus)} documents from corpus")
        return corpus

    def load_queries(self) -> dict:
        """
        Load queries from queries.jsonl.

        Returns:
            dict — {query_id: query_text}
        """
        queries = {
            str(q["_id"]): q["text"]
            for q in self._read_jsonl(self.queries_path)
        }
        print(f"Loaded {len(queries)} queries")
        return queries

    def load_qrels(self) -> dict:
        """
        Load relevance judgments from the qrels file.

        Handles both:
            SciFact  — binary relevance (0 or 1)
            NFCorpus — graded relevance (0, 1, 2, 3) → keep score >= 1

        The first row is treated as a header only when its score column is
        not an integer, so headerless files do not lose a judgment.

        Returns:
            dict — {query_id: {doc_id: relevance_score}}

        Raises:
            ValueError — when a non-header row has a non-integer score
        """
        qrels = {}

        # newline="" is what the csv module expects from its input file
        with open(self.qrels_path, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f, delimiter="\t")
            first_row = True

            for row in reader:
                if len(row) < 3:
                    continue

                is_first, first_row = first_row, False
                try:
                    score = int(row[2])
                except ValueError:
                    if is_first:
                        continue  # header: query-id corpus-id score
                    raise

                # skip completely irrelevant docs
                # this handles both binary (0/1) and graded (0/1/2/3)
                if score < 1:
                    continue

                qrels.setdefault(str(row[0]), {})[str(row[1])] = score

        print(f"Loaded qrels for {len(qrels)} queries "
              f"from {os.path.basename(self.qrels_path)}")
        return qrels
115
+
116
+
117
if __name__ == "__main__":
    import sys

    # usage: python -m evaluation.dataset_loader data/nfcorpus
    # the SciFact folder is used when no path argument is supplied
    path = "data/scifact"
    if len(sys.argv) > 1:
        path = sys.argv[1]
    loader = DatasetLoader(path)

    corpus = loader.load_corpus()
    queries = loader.load_queries()
    qrels = loader.load_qrels()

    # print the first query next to its relevance judgments
    sample_qid = list(queries)[0]
    print(f"\nSample query [{sample_qid}]: {queries[sample_qid]}")
    print(f"Relevant docs : {qrels.get(sample_qid, {})}")
evaluation/evaluator.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluation/evaluator.py
2
+
3
+ import math
4
+ from collections import defaultdict
5
+
6
+
7
class Evaluator:
    """
    Scores ranked retrieval output against ground-truth qrels.

    Metrics implemented:
        NDCG@k   — Normalized Discounted Cumulative Gain; uses the graded
                   relevance value as gain, so it works for binary and
                   graded judgments alike
        MAP@k    — precision accumulated at each relevant hit in the top-k,
                   divided by the number of relevant docs
        Recall@k — share of the relevant docs that appear in the top-k
        P@k      — share of the top-k that is relevant
        MRR      — reciprocal rank of the first relevant result

    Every method takes `ranked` as a best-first list of (doc_id, score)
    pairs and `relevant` as {doc_id: relevance}; a relevance above 0
    counts as relevant.
    """

    def ndcg_at_k(self, ranked: list, relevant: dict, k: int) -> float:
        """
        NDCG@k — DCG of the top-k divided by the DCG of the ideal ordering.

        Returns 0.0 when the ideal DCG is zero (no positive judgments).
        """
        # `rank` starts at 2 so the first position is discounted by log2(2)
        dcg = 0.0
        for rank, (doc_id, _) in enumerate(ranked[:k], start=2):
            gain = relevant.get(doc_id, 0)
            if gain > 0:
                dcg += gain / math.log2(rank)

        # ideal ordering: the k largest judgments, best first
        best_gains = sorted(relevant.values(), reverse=True)[:k]
        idcg = sum(
            gain / math.log2(rank)
            for rank, gain in enumerate(best_gains, start=2)
            if gain > 0
        )

        if idcg > 0:
            return dcg / idcg
        return 0.0

    def map_at_k(self, ranked: list, relevant: dict, k: int) -> float:
        """
        MAP@k — sum of precision values taken at each relevant hit within
        the top-k, divided by the total number of relevant docs.
        """
        hits = 0
        precision_sum = 0.0
        for rank, (doc_id, _) in enumerate(ranked[:k], start=1):
            if relevant.get(doc_id, 0) > 0:
                hits += 1
                precision_sum += hits / rank

        relevant_total = len([g for g in relevant.values() if g > 0])
        return precision_sum / relevant_total if relevant_total else 0.0

    def recall_at_k(self, ranked: list, relevant: dict, k: int) -> float:
        """
        Recall@k — relevant docs found in the top-k over all relevant docs.
        """
        relevant_total = len([g for g in relevant.values() if g > 0])
        if not relevant_total:
            return 0.0
        retrieved = [
            doc_id for doc_id, _ in ranked[:k]
            if relevant.get(doc_id, 0) > 0
        ]
        return len(retrieved) / relevant_total

    def precision_at_k(self, ranked: list, relevant: dict, k: int) -> float:
        """
        P@k — relevant docs in the top-k divided by k (0.0 when k is 0).
        """
        if k == 0:
            return 0.0
        hits = 0
        for doc_id, _ in ranked[:k]:
            if relevant.get(doc_id, 0) > 0:
                hits += 1
        return hits / k

    def mrr(self, ranked: list, relevant: dict) -> float:
        """
        MRR — 1 / rank of the first relevant result, 0.0 if there is none.
        """
        rank = 0
        for doc_id, _ in ranked:
            rank += 1
            if relevant.get(doc_id, 0) > 0:
                return 1.0 / rank
        return 0.0

    def evaluate(
        self,
        all_results: dict,
        qrels: dict,
        k_values: list = None,
    ) -> dict:
        """
        Compute every metric per query and average over the scored queries.

        Args:
            all_results — {query_id: [(doc_id, score), ...]}
            qrels       — {query_id: {doc_id: relevance}}
            k_values    — cutoffs to report; defaults to [1, 5, 10, 100]

        Returns:
            dict — one averaged value (rounded to 4 places) per
            "NDCG@k" / "MAP@k" / "Recall@k" / "P@k" and "MRR", plus the
            counters "num_queries", "queries_with_results" and
            "queries_with_no_qrels". Queries without qrels are skipped.
        """
        if k_values is None:
            k_values = [1, 5, 10, 100]

        # label → scorer, in the order the keys should appear in the summary
        per_k_metrics = (
            ("NDCG", self.ndcg_at_k),
            ("MAP", self.map_at_k),
            ("Recall", self.recall_at_k),
            ("P", self.precision_at_k),
        )

        scores = defaultdict(list)
        num_queries = 0
        queries_with_results = 0
        queries_no_qrels = 0

        for query_id, ranked in all_results.items():
            relevant = qrels.get(query_id, {})

            # nothing to score against → count it and move on
            if not relevant:
                queries_no_qrels += 1
                continue

            num_queries += 1
            queries_with_results += 1 if ranked else 0

            for k in k_values:
                for label, metric_fn in per_k_metrics:
                    scores[f"{label}@{k}"].append(metric_fn(ranked, relevant, k))

            scores["MRR"].append(self.mrr(ranked, relevant))

        # diagnostic line: shows whether query ids lined up with the qrels
        print(f" Evaluated {num_queries} queries | "
              f"{queries_with_results} had results | "
              f"{queries_no_qrels} had no qrels (skipped)")

        summary = {}
        for metric, vals in scores.items():
            summary[metric] = round(sum(vals) / len(vals), 4) if vals else 0.0
        summary["num_queries"] = num_queries
        summary["queries_with_results"] = queries_with_results
        summary["queries_with_no_qrels"] = queries_no_qrels

        return summary
177
+
178
+
179
if __name__ == "__main__":
    # Toy data smoke test
    evaluator = Evaluator()

    # q1 ranks its only relevant doc first; q2 finds one of its two
    fake_qrels = {
        "q1": {"doc_1": 1},
        "q2": {"doc_4": 1, "doc_5": 1},
    }
    fake_results = {
        "q1": [("doc_1", 0.95), ("doc_2", 0.80), ("doc_3", 0.60)],
        "q2": [("doc_4", 0.70), ("doc_1", 0.50)],
    }

    metrics = evaluator.evaluate(fake_results, fake_qrels, k_values=[1, 5, 10])

    print("\nSanity check metrics:")
    for name, value in metrics.items():
        print(f" {name}: {value}")
evaluation/indexer_bridge.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluation/indexer_bridge.py
2
+
3
+ import numpy as np
4
+ from indexer.chunker import Chunker
5
+ from indexer.embedder import Embedder
6
+ from indexer.store import Store
7
+
8
+
9
class IndexerBridge:
    """
    Pushes a BEIR corpus through the existing indexing components.

    Corpus documents come from JSONL rather than files on disk, so the
    text is handed straight to Chunker → Embedder → Store.

    Every document is stored under a synthetic path
    "{dataset_name}://{doc_id}", which lets previously indexed entries of
    the same dataset be found and removed by prefix.
    """

    def __init__(self, config_path: str = "config.yaml"):
        self.chunker = Chunker(chunk_size=500, overlap=50)
        self.embedder = Embedder(config_path)
        self.store = Store(config_path)

    def _embed_and_store(self, chunk_buffer, text_buffer):
        """Embed the buffered texts and add them to the store (no-op if empty)."""
        if not chunk_buffer:
            return
        vectors = np.array(self.embedder.embed_chunks(text_buffer), dtype="float32")
        self.store.add_chunks(chunk_buffer, vectors)

    def index_corpus(self, corpus: dict, batch_size: int = 64, dataset_name: str = "dataset"):
        """
        Index the whole corpus through the store.

        Args:
            corpus       — {doc_id: {"title": str, "text": str}}
            batch_size   — buffered chunk count that triggers an embed + store
            dataset_name — prefix of the synthetic paths, e.g. "scifact"
        """
        doc_ids = list(corpus.keys())
        total = len(doc_ids)
        print(f"Indexing {total} documents from [{dataset_name}]...")

        # Drop whatever was indexed earlier under this dataset's prefix
        prefix = f"{dataset_name}://"
        stale_paths = [
            known_path
            for known_path in self.store.load_hashes()
            if known_path.startswith(prefix)
        ]
        for stale_path in stale_paths:
            self.store.remove_file_chunks(stale_path)
        if stale_paths:
            print(f"Cleared {len(stale_paths)} previously indexed [{dataset_name}] documents")

        chunk_buffer = []
        text_buffer = []

        for i, doc_id in enumerate(doc_ids, 1):
            doc = corpus[doc_id]
            full_text = f"{doc['title']} {doc['text']}".strip()
            if not full_text:
                continue

            fake_path = f"{prefix}{doc_id}"
            chunks = self.chunker.chunk_file(full_text, fake_path)

            chunk_buffer.extend(chunks)
            text_buffer.extend(chunk["text"] for chunk in chunks)

            self.store.save_file_info(fake_path, doc_id, len(chunks))

            if len(chunk_buffer) >= batch_size:
                self._embed_and_store(chunk_buffer, text_buffer)
                chunk_buffer.clear()
                text_buffer.clear()

            if i % 500 == 0:
                print(f" Indexed {i}/{total}...")

        # whatever is still buffered after the last document
        self._embed_and_store(chunk_buffer, text_buffer)
        print(f"Done. Total vectors: {self.store.get_total_vectors()}")
85
+
86
+
87
if __name__ == "__main__":
    from evaluation.dataset_loader import DatasetLoader

    # index the SciFact corpus with the default config
    scifact_corpus = DatasetLoader("data/scifact").load_corpus()
    IndexerBridge().index_corpus(scifact_corpus, batch_size=64, dataset_name="scifact")
evaluation/query_runner.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluation/query_runner.py
2
+
3
+ from searcher.search_engine import SearchEngine
4
+
5
+
6
class QueryRunner:
    """
    Sends every evaluation query through the SearchEngine and gathers the
    ranked lists in the shape the Evaluator consumes:
        {query_id: [(doc_id, score), ...]}  ranked best-first
    """

    def __init__(self, config_path: str = "config.yaml"):
        self.engine = SearchEngine(config_path)

    def _extract_doc_id(self, filepath: str) -> str:
        """
        Drop everything up to the first "://" so the id matches the qrels.

        Examples:
            "scifact://12345"   → "12345"
            "nfcorpus://MED-10" → "MED-10"
            "/real/file.pdf"    → "/real/file.pdf" (no "://", returned as is)
        """
        _scheme, separator, remainder = filepath.partition("://")
        return remainder if separator else filepath

    def run(
        self,
        queries: dict,
        top_k: int = 100,
        mode: str = "full",
    ) -> dict:
        """
        Run all queries and return ranked, de-duplicated results.

        Args:
            queries — {query_id: query_text}
            top_k   — result count requested from each retrieval call
            mode    — pipeline variant:
                        "dense"  → dense retriever only
                        "sparse" → sparse retriever only
                        "hybrid" → dense + sparse fused by the fusion ranker
                        anything else → engine.search (full pipeline)

        Returns:
            dict — {query_id: [(doc_id, score), ...]}; a query that raised
            is reported on stdout and mapped to an empty list.
        """
        results = {}
        total = len(queries)

        for i, (query_id, query_text) in enumerate(queries.items(), 1):
            if i % 50 == 0:
                print(f" Running query {i}/{total}...")

            try:
                if mode == "dense":
                    hits = self.engine.dense_retriever.retrieve(query_text, top_k=top_k)
                    # dense_score is negated before ranking best-first
                    ranked = [
                        (self._extract_doc_id(hit["filepath"]), -hit["dense_score"])
                        for hit in hits
                    ]
                elif mode == "sparse":
                    hits = self.engine.sparse_retriever.retrieve(query_text, top_k=top_k)
                    ranked = [
                        (self._extract_doc_id(hit["filepath"]), hit["sparse_score"])
                        for hit in hits
                    ]
                elif mode == "hybrid":
                    dense_hits = self.engine.dense_retriever.retrieve(query_text, top_k=top_k)
                    sparse_hits = self.engine.sparse_retriever.retrieve(query_text, top_k=top_k)
                    hits = self.engine.fusion_ranker.fuse(dense_hits, sparse_hits, top_k=top_k)
                    ranked = [
                        (self._extract_doc_id(hit["filepath"]), hit["rrf_score"])
                        for hit in hits
                    ]
                else:  # full pipeline
                    hits = self.engine.search(query_text, top_k=top_k)["results"]
                    ranked = [
                        (
                            self._extract_doc_id(hit["filepath"]),
                            hit.get("rerank_score", hit.get("rrf_score", 0)),
                        )
                        for hit in hits
                    ]

                # Several chunks can map to one doc → keep its best score
                best = {}
                for doc_id, score in ranked:
                    if doc_id in best and not score > best[doc_id]:
                        continue
                    best[doc_id] = score

                results[query_id] = sorted(
                    best.items(),
                    key=lambda pair: pair[1],
                    reverse=True,
                )

            except Exception as e:
                print(f" Error on query {query_id}: {e}")
                results[query_id] = []

        return results
114
+
115
+
116
if __name__ == "__main__":
    from evaluation.dataset_loader import DatasetLoader

    # run the SciFact queries through the full pipeline, ten results each
    queries = DatasetLoader("data/scifact").load_queries()
    results = QueryRunner().run(queries, top_k=10, mode="full")

    # show the five best hits of the first query
    sample_qid = list(results)[0]
    print(f"\nQuery {sample_qid} top results:")
    for doc_id, score in results[sample_qid][:5]:
        print(f" doc {doc_id} score={score:.4f}")
evaluation/run_eval.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluation/run_eval.py
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import time
7
+ from evaluation.dataset_loader import DatasetLoader
8
+ from evaluation.indexer_bridge import IndexerBridge
9
+ from evaluation.query_runner import QueryRunner
10
+ from evaluation.evaluator import Evaluator
11
+
12
+
13
# Pipeline variants that are run, in this order, when --mode is "all"
MODES = ["dense", "sparse", "hybrid", "full"]
# Metric keys shown as columns in the per-dataset results table
DISPLAY_METRICS = ["NDCG@10", "MAP@100", "Recall@100", "P@10", "MRR"]

# All supported datasets (name → folder) — add more here later if needed
AVAILABLE_DATASETS = {
    "scifact": "data/scifact",
    "nfcorpus": "data/nfcorpus",
}
21
+
22
+
23
def print_table(results: dict, title: str = ""):
    """
    Print one row per mode with the DISPLAY_METRICS columns.

    Args:
        results — {mode: {metric_name: value}}; a missing metric prints as 0.0
        title   — optional heading printed above the table
    """
    col_w = 14
    columns = [f"{name:>{col_w}}" for name in DISPLAY_METRICS]
    header = f"{'Mode':<10}" + "".join(columns)
    heavy_rule = "=" * len(header)

    if title:
        print(f"\n {title}")
    print(heavy_rule)
    print(header)
    print("-" * len(header))
    for mode, metrics in results.items():
        cells = [
            f"{metrics.get(name, 0.0):>{col_w}.4f}"
            for name in DISPLAY_METRICS
        ]
        print(f"{mode:<10}" + "".join(cells))
    print(heavy_rule)
38
+
39
+
40
def print_comparison_table(all_dataset_results: dict):
    """
    Print a single comparison table across all datasets.

    Shows NDCG@10, MRR and MAP@100 for each dataset, taken from its "full"
    mode results; when "full" was not run, the first available mode is used.
    Datasets with no mode results at all are left out.

    Args:
        all_dataset_results — {dataset_name: {mode: {metric_name: value}}}
    """
    print("\n" + "=" * 80)
    print("CROSS-DATASET COMPARISON — full pipeline mode")
    print("=" * 80)

    # Header
    print(f"{'Dataset':<14}{'NDCG@10':>12}{'MRR':>10}{'MAP@100':>10}")
    print("-" * 46)

    for dataset, mode_results in all_dataset_results.items():
        if not mode_results:
            continue  # nothing was evaluated for this dataset

        # use "full" mode results for comparison, fallback to first mode;
        # the fallback is only computed when it is actually needed
        metrics = mode_results.get("full")
        if metrics is None:
            metrics = next(iter(mode_results.values()))

        ndcg = metrics.get("NDCG@10", 0.0)
        mrr = metrics.get("MRR", 0.0)
        map_ = metrics.get("MAP@100", 0.0)
        print(f"{dataset:<14}{ndcg:>12.4f}{mrr:>10.4f}{map_:>10.4f}")

    print("=" * 46)
68
+
69
+
70
def run_single_dataset(dataset_name: str, dataset_path: str, args) -> dict:
    """
    Run the full eval pipeline for one dataset.

    Args:
        dataset_name — short name, also used as the synthetic path prefix
        dataset_path — folder holding the BEIR files
        args         — parsed CLI namespace; reads skip_index, config,
                       mode and top_k

    Returns:
        dict — {mode: metrics}, each metrics dict extended with
        "query_time_s" (wall-clock seconds spent running the queries)
    """
    print(f"\n{'#'*60}")
    print(f" DATASET: {dataset_name.upper()}")
    print(f"{'#'*60}")

    # 1 — load
    print("\n[1/4] Loading dataset...")
    loader = DatasetLoader(dataset_path)
    # the corpus is only needed for indexing, so skip reading it otherwise
    corpus = None if args.skip_index else loader.load_corpus()
    queries = loader.load_queries()
    qrels = loader.load_qrels()

    # 2 — index
    if not args.skip_index:
        print("\n[2/4] Indexing corpus...")
        bridge = IndexerBridge(args.config)
        # pass dataset_name so fake paths are e.g. nfcorpus://doc_id
        bridge.index_corpus(corpus, batch_size=64, dataset_name=dataset_name)
    else:
        print("\n[2/4] Skipping indexing (--skip-index)")

    # 3 — run queries
    print("\n[3/4] Running queries...")
    runner = QueryRunner(args.config)
    evaluator = Evaluator()

    modes_to_run = MODES if args.mode == "all" else [args.mode]
    all_mode_results = {}

    for mode in modes_to_run:
        print(f"\n Mode: {mode}")
        t0 = time.time()
        ranked_results = runner.run(queries, top_k=args.top_k, mode=mode)
        elapsed = time.time() - t0

        metrics = evaluator.evaluate(ranked_results, qrels, k_values=[1, 5, 10, 100])
        metrics["query_time_s"] = round(elapsed, 2)
        all_mode_results[mode] = metrics

        print(f" NDCG@10={metrics.get('NDCG@10', 0):.4f} "
              f"MAP@100={metrics.get('MAP@100', 0):.4f} "
              f"MRR={metrics.get('MRR', 0):.4f}")

    # 4 — per-dataset table
    print(f"\n[4/4] Results for {dataset_name.upper()}")
    # metrics come from the in-house Evaluator, so the title names no other tool
    print_table(all_mode_results, title=f"EVALUATION RESULTS — {dataset_name}")

    return all_mode_results
120
+
121
+
122
def main():
    """
    CLI entry point: evaluate the selected datasets and write JSON reports.

    Writes results/eval_<dataset>.json per dataset and results/eval_all.json
    for the combined run; dataset folders that do not exist are skipped
    with a warning.
    """
    parser = argparse.ArgumentParser(description="Evaluate semantic search on BEIR datasets")
    parser.add_argument(
        "--datasets",
        nargs="+",
        default=["scifact", "nfcorpus"],
        choices=list(AVAILABLE_DATASETS.keys()),
        help="Which datasets to evaluate. e.g. --datasets scifact nfcorpus"
    )
    parser.add_argument("--config", default="config.yaml")
    parser.add_argument("--top-k", default=100, type=int)
    parser.add_argument("--skip-index", action="store_true")
    # restrict to known modes: an unknown value would otherwise be run as
    # the full pipeline and reported under the mistyped name
    parser.add_argument("--mode", default="all",
                        choices=MODES + ["all"],
                        help="dense | sparse | hybrid | full | all")
    args = parser.parse_args()

    os.makedirs("results", exist_ok=True)

    all_dataset_results = {}

    for dataset_name in args.datasets:
        dataset_path = AVAILABLE_DATASETS[dataset_name]

        if not os.path.exists(dataset_path):
            print(f"\n[WARNING] Dataset folder not found: {dataset_path} — skipping {dataset_name}")
            continue

        results = run_single_dataset(dataset_name, dataset_path, args)
        all_dataset_results[dataset_name] = results

        # save per-dataset report
        report_path = f"results/eval_{dataset_name}.json"
        with open(report_path, "w") as f:
            json.dump(results, f, indent=2)
        print(f" Saved → {report_path}")

    # cross-dataset comparison (only if more than one dataset ran)
    if len(all_dataset_results) > 1:
        print_comparison_table(all_dataset_results)

    # save combined report
    combined_path = "results/eval_all.json"
    with open(combined_path, "w") as f:
        json.dump(all_dataset_results, f, indent=2)
    print(f"\nCombined report saved → {combined_path}")
167
+
168
+
169
# Run the CLI only when this module is executed as a script
if __name__ == "__main__":
    main()
indexer/__pycache__/chunker.cpython-313.pyc.2070577919488 ADDED
Binary file (5.35 kB). View file
 
indexer/__pycache__/crawler.cpython-313.pyc.2070577919488 ADDED
Binary file (4.8 kB). View file
 
indexer/__pycache__/embedder.cpython-313.pyc.2070577919488 ADDED
Binary file (4.39 kB). View file
 
indexer/__pycache__/extractor.cpython-313.pyc.2070577919488 ADDED
Binary file (5.84 kB). View file
 
indexer/__pycache__/pipeline.cpython-313.pyc.2070577919488 ADDED
Binary file (6.86 kB). View file
 
indexer/__pycache__/store.cpython-313.pyc.2070577919488 ADDED
Binary file (11.1 kB). View file
 
indexer/__pycache__/watcher.cpython-313.pyc.2070577919488 ADDED
Binary file (8.89 kB). View file
 
indexer/chunker.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # indexer/chunker.py
2
+
3
+
4
class Chunker:
    """
    Splits extracted text into overlapping chunks using a sliding window.
    Each chunk will later be embedded as a separate vector.

    Why chunk at all?
        - Embedding models have a token limit (typically 256-512 tokens)
        - A 50-page PDF as one embedding would lose detail
        - Small chunks let us pinpoint the EXACT passage that matches a query

    Why overlap?
        - A sentence at the boundary might get cut in half
        - Overlap lets text near a boundary appear in both neighbouring chunks
    """

    def __init__(self, chunk_size=500, overlap=50):
        """
        Args:
            chunk_size (int) — max number of words per chunk
            overlap (int)    — number of words shared between consecutive chunks

        Raises:
            ValueError — if chunk_size is not positive, overlap is negative,
                         or overlap >= chunk_size (the window would never
                         advance forward)
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap < 0:
            raise ValueError("overlap must not be negative")
        self.chunk_size = chunk_size
        self.overlap = overlap
        if self.overlap >= self.chunk_size:
            raise ValueError("Overlap must be smaller than chunk_size")

    def chunk_text(self, text):
        """
        Split a text string into overlapping chunks based on word count.

        Args:
            text (str) — the full extracted text from a file

        Returns:
            list[str] — list of text chunks (empty for text without words)

        Example with chunk_size=5, overlap=2:
            text = "The quick brown fox jumps over the lazy dog today"

            Chunk 0: words[0:5]  → "The quick brown fox jumps"
            Chunk 1: words[3:8]  → "fox jumps over the lazy"  (step = 5-2 = 3)
            Chunk 2: words[6:11] → "the lazy dog today"       (step = 3 again)

        The window stops as soon as it has covered the last word, so no
        trailing chunk made only of already-covered overlap is produced.
        """
        words = text.split()
        if not words:
            return []
        step = self.chunk_size - self.overlap
        chunks = []
        for start in range(0, len(words), step):
            end = start + self.chunk_size
            chunks.append(" ".join(words[start:end]))
            if end >= len(words):
                # this window already reached the end of the text; another
                # step would only repeat words from its tail
                break
        return chunks

    def chunk_file(self, text, filepath):
        """
        Chunk a file's text and attach metadata to each chunk.

        Args:
            text (str)     — extracted text content
            filepath (str) — source file path (for metadata)

        Returns:
            list[dict] — each dict contains:
                {
                    "text": "the chunk text...",
                    "filepath": "/path/to/file.pdf",
                    "chunk_index": 0,     # position in the file, from 0
                    "total_chunks": 5     # how many chunks this file produced
                }
        """
        chunks = self.chunk_text(text)
        total = len(chunks)
        return [
            {
                "text": chunk,
                "filepath": filepath,
                "chunk_index": i,
                "total_chunks": total,
            }
            for i, chunk in enumerate(chunks)
        ]
115
+
116
+
117
+ # --- Test it ---
118
+ if __name__ == "__main__":
119
+ chunker = Chunker(chunk_size=10, overlap=3)
120
+
121
+ sample = (
122
+ "The quick brown fox jumps over the lazy dog. "
123
+ "Semantic search finds files by meaning not just keywords. "
124
+ "This is a test of the chunking system for our project."
125
+ )
126
+
127
+ chunks = chunker.chunk_text(sample)
128
+ print(f"Text has {len(sample.split())} words → {len(chunks)} chunks\n")
129
+ for i, chunk in enumerate(chunks):
130
+ print(f"Chunk {i}: {chunk}")
131
+
132
+ print("\n--- With metadata ---")
133
+ results = chunker.chunk_file(sample, "/test/sample.txt")
134
+ for r in results:
135
+ print(f"[{r['chunk_index']}] {r['text'][:60]}...")
indexer/crawler.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # indexer/crawler.py
2
+
3
+ import os
4
+ import hashlib
5
+ import yaml
6
+
7
+
8
class Crawler:
    """
    Discovers files in configured directories and tracks which ones
    are new or modified using SHA-256 hashing.
    """

    def __init__(self, config_path="config.yaml"):
        """
        Load the config file and store the settings as instance variables.

        Args:
            config_path (str) — path to the YAML config; must define
                watch_paths, include_extensions, skip_directories and data_dir
        """
        # explicit encoding so the config reads the same on every platform
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        self.watch_paths = config["watch_paths"]
        self.include_extensions = config["include_extensions"]
        self.skip_directories = config["skip_directories"]
        self.data_dir = config["data_dir"]

    def discover_files(self):
        """
        Walk through all watch_paths recursively and collect every file
        that matches include_extensions, skipping skip_directories.

        Returns:
            list[str] — file paths, each built by joining onto its watch
                        path (so they are absolute only if the watch path is)
        """
        results = []
        for path in self.watch_paths:
            for dirpath, dirnames, filenames in os.walk(path):
                # prune in place so os.walk does not descend into skipped dirs
                dirnames[:] = [d for d in dirnames if d not in self.skip_directories]
                for filename in filenames:
                    if os.path.splitext(filename)[1] in self.include_extensions:
                        results.append(os.path.join(dirpath, filename))
        return results

    def compute_hash(self, filepath):
        """
        Compute the SHA-256 hash of a file's contents.

        Args:
            filepath (str) — path to the file

        Returns:
            str — hex string of the SHA-256 hash

        Raises:
            OSError — if the file cannot be opened or read
        """
        hasher = hashlib.sha256()
        with open(filepath, "rb") as f:
            # read in 8 KiB blocks so large files are not loaded at once
            while chunk := f.read(8192):
                hasher.update(chunk)
        return hasher.hexdigest()

    def get_new_and_modified(self, known_hashes=None):
        """
        Compare discovered files against previously known hashes to find
        which files are new or have been modified since last run.

        A file that cannot be hashed (e.g. it vanished or is locked) is
        reported on stdout and skipped; if it was known before, its old
        hash is carried over so it is not reported as deleted.

        Args:
            known_hashes (dict) — {filepath: hash} from previous run
                                  Pass None or {} on first run.

        Returns:
            tuple: (files_to_process, current_hashes, deleted_files)
                - files_to_process: list[str] — paths that are new or changed
                - current_hashes: dict — {filepath: hash} for current files
                - deleted_files: list[str] — known paths no longer present,
                  sorted
        """
        if known_hashes is None:
            known_hashes = {}

        files_to_process = []
        current_hashes = {}
        for file in self.discover_files():
            try:
                file_hash = self.compute_hash(file)
            except OSError as exc:
                print(f"[WARNING] Could not hash {file}: {exc}")
                if file in known_hashes:
                    current_hashes[file] = known_hashes[file]
                continue

            if file not in known_hashes or file_hash != known_hashes[file]:
                files_to_process.append(file)
            current_hashes[file] = file_hash

        deleted_files = sorted(set(known_hashes) - set(current_hashes))

        return files_to_process, current_hashes, deleted_files
90
+
91
+
92
+ # --- Test it ---
93
if __name__ == "__main__":
    crawler = Crawler()
    files = crawler.discover_files()
    print(f"Found {len(files)} files:")
    for f in files:
        print(f" {f}")

    print("\n--- Checking for new/modified ---")
    # get_new_and_modified returns three values; unpacking into two names
    # raised ValueError. The deleted-files result is not needed here.
    to_process, hashes, _deleted = crawler.get_new_and_modified()
    print(f"{len(to_process)} files to process")
indexer/embedder.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # indexer/embedder.py
2
+
3
+ import yaml
4
+ from sentence_transformers import SentenceTransformer
5
+
6
+
7
class Embedder:
    """
    Loads a sentence-transformer model and converts text into dense
    vector embeddings.

    The model name is read from the ``embedding_model`` key of the config
    file. When the name contains "bge" (case-insensitive) the instance
    runs in "BGE mode":
      - queries passed to embed_single() get BGE_QUERY_PREFIX prepended
      - document chunks passed to embed_chunks() are embedded as-is
      - queries and documents are both L2-normalized

    For any other model no prefix is added and no normalization is applied
    on either side, so query vectors are always produced the same way as
    the document vectors they are compared against.
    """

    # BGE query instruction prefix.
    # Applied to queries only, NOT to document chunks during indexing.
    BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "

    def __init__(self, config_path="config.yaml"):
        """
        Load the config and initialize the embedding model.

        Args:
            config_path (str) — path to config.yaml; must contain an
                                ``embedding_model`` entry
        """
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        model_name = config["embedding_model"]
        self.model_name = model_name

        # BGE models need a special prefix on queries (not on documents)
        self.is_bge = "bge" in model_name.lower()

        print(f"Loading embedding model '{model_name}'...")
        self.model = SentenceTransformer(model_name)
        print(f"Model loaded — BGE mode: {self.is_bge}")

    def embed_chunks(self, chunks):
        """
        Convert a list of text chunks into dense vector embeddings.
        Used during INDEXING — no query prefix is applied here.

        Args:
            chunks (list[str]) — text strings to embed

        Returns:
            The value returned by ``self.model.encode`` for the whole list
            (one embedding per chunk), L2-normalized only in BGE mode.
        """
        embeddings = self.model.encode(
            chunks,
            batch_size=64,
            show_progress_bar=False,
            normalize_embeddings=self.is_bge,
        )
        return embeddings

    def embed_single(self, text):
        """
        Embed a single query string.
        Used during SEARCH — the BGE prefix is applied here in BGE mode.

        Args:
            text (str) — a single query string

        Returns:
            The value returned by ``self.model.encode`` for the query,
            normalized exactly like the vectors from embed_chunks().
        """
        if self.is_bge:
            text = self.BGE_QUERY_PREFIX + text

        # Must mirror embed_chunks(): previously queries were always
        # normalized while non-BGE documents were not, so the two sides
        # were embedded inconsistently for every non-BGE model.
        return self.model.encode(
            text,
            normalize_embeddings=self.is_bge,
        )
92
+
93
+
94
if __name__ == "__main__":
    # Manual smoke test: embed a few sample chunks and one query, then
    # print the sizes of what came back.
    demo = Embedder()

    samples = [
        "The quarterly budget report shows increased spending",
        "Machine learning models can understand text semantics",
        "The cat sat on the mat and looked out the window",
    ]

    print("Embedding 3 test chunks...")
    doc_vectors = demo.embed_chunks(samples)
    first_vector = doc_vectors[0]
    print(f"Got {len(doc_vectors)} vectors")
    print(f"Each vector has {len(first_vector)} dimensions")
    print(f"First vector (first 5 values): {first_vector[:5]}")

    print("\n--- Single query embedding ---")
    query_vector = demo.embed_single("budget spending report")
    print(f"Query vector: {len(query_vector)} dimensions")
indexer/extractor.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # indexer/extractor.py
2
+
3
+ import os
4
+ import json
5
+ import fitz # PyMuPDF
6
+ from docx import Document
7
+ from pptx import Presentation
8
+ from openpyxl import load_workbook
9
+
10
+
11
class Extractor:
    """
    Extracts raw text content from different file types.
    Each file type has its own extraction method; extract() dispatches
    to the right one based on the file extension.
    """

    def extract(self, filepath):
        """
        Main dispatcher — picks the extraction method from the file extension.

        This is deliberately best-effort: an unknown extension or any error
        raised by a handler is reported with print() and turned into "".

        Args:
            filepath (str) — path of the file to read

        Returns:
            str — extracted text, or "" when nothing could be extracted
        """
        handlers = {
            ".pdf": self.extract_pdf,
            ".docx": self.extract_docx,
            ".pptx": self.extract_pptx,
            ".xlsx": self.extract_xlsx,
            ".ipynb": self.extract_ipynb,
            ".txt": self.extract_text,
            ".md": self.extract_text,
            ".py": self.extract_text,
            ".js": self.extract_text,
        }

        try:
            ext = os.path.splitext(filepath)[1].lower()
            handler = handlers.get(ext)
            if handler:
                return handler(filepath)
            else:
                print(f"Warning: Unrecognized file extension: {ext}")
                return ""
        except Exception as e:
            print(f"Error extracting text from {filepath}: {e}")
            return ""

    def extract_pdf(self, filepath):
        """Extract text from a PDF file using PyMuPDF, one page per line group."""
        # The context manager closes the document even when get_text()
        # raises on a damaged page (the explicit close() was skipped then).
        with fitz.open(filepath) as doc:
            return "\n".join(page.get_text() for page in doc)

    def extract_docx(self, filepath):
        """Extract the paragraph text of a Word document using python-docx."""
        doc = Document(filepath)
        return "\n".join(para.text for para in doc.paragraphs)

    def extract_pptx(self, filepath):
        """Extract the text-frame paragraphs of every slide using python-pptx."""
        prs = Presentation(filepath)
        lines = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    for para in shape.text_frame.paragraphs:
                        lines.append(para.text)
        return "\n".join(lines)

    def extract_xlsx(self, filepath):
        """
        Extract text from an Excel file using openpyxl.

        Each row becomes one line made of its non-empty cell values joined
        by spaces; sheets are emitted in workbook order.
        """
        wb = load_workbook(filepath, data_only=True)
        rows = []
        for sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]
            for row in sheet.iter_rows():
                cells = [str(cell.value) for cell in row if cell.value is not None]
                rows.append(" ".join(cells))
        return "\n".join(rows)

    def extract_ipynb(self, filepath):
        """
        Extract the source text of every cell of a Jupyter notebook.

        A notebook without a top-level "cells" list, or a cell without a
        "source" entry, contributes nothing instead of raising KeyError.
        A cell's source may be a single string or a list of strings.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            notebook = json.load(f)
        cells = []
        for cell in notebook.get("cells", []):
            source = cell.get("source", "")
            cells.append(source if isinstance(source, str) else "".join(source))
        return "\n".join(cells)

    def extract_text(self, filepath):
        """Read a plain text file (.txt, .md, .py, .js, ...) as UTF-8, dropping undecodable bytes."""
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
101
+
102
+
103
# --- Test it ---
if __name__ == "__main__":
    import sys

    # Manual check: extract one file given on the command line and show
    # how much text came out plus a 500-character preview.
    extractor = Extractor()

    if len(sys.argv) <= 1:
        print("Usage: python -m indexer.extractor <filepath>")
    else:
        filepath = sys.argv[1]
        text = extractor.extract(filepath)
        print(f"Extracted {len(text)} characters from {filepath}")
        print(f"Preview:\n{text[:500]}")
indexer/pipeline.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # indexer/pipeline.py
2
+
3
+ import hashlib
4
+ import os
5
+
6
+ from evaluation.dataset_loader import DatasetLoader
7
+ from indexer.crawler import Crawler
8
+ from indexer.extractor import Extractor
9
+ from indexer.chunker import Chunker
10
+ from indexer.embedder import Embedder
11
+ from indexer.store import Store
12
+
13
+
14
class IndexingPipeline:
    """
    Wires all indexer modules together.

    The flow for each file:
        Crawler (discover + hash check)
        → Extractor (file → raw text)
        → Chunker (text → chunks with metadata)
        → Embedder (chunks → vectors)
        → Store (vectors + metadata)

    Besides real files, every watch path that contains a ``corpus.jsonl``
    is treated as a dataset: each corpus document is indexed under a
    synthetic path of the form ``<dataset_name>://<doc_id>``.
    """

    def __init__(self, config_path="config.yaml"):
        """
        Initialize all pipeline components.

        Args:
            config_path (str) — config file handed to the crawler, embedder
                                and store
        """
        self.config_path = config_path
        self.crawler = Crawler(config_path)
        self.extractor = Extractor()
        self.chunker = Chunker(chunk_size=500, overlap=50)
        self.embedder = Embedder(config_path)
        self.store = Store(config_path)

    def _iter_dataset_documents(self):
        """
        Yield dataset corpus documents as synthetic files.

        For every watch path containing ``corpus.jsonl`` the corpus is
        loaded through DatasetLoader; a path whose corpus fails to load is
        reported and skipped. Documents with neither title nor text are
        skipped.

        Yields:
            tuple (synthetic_path, synthetic_hash, text) where
            synthetic_path is "<dataset_name>://<doc_id>", text is the
            title and body joined by a blank line, and synthetic_hash is
            the SHA-256 hex digest of that text.
        """
        for dataset_path in self.crawler.watch_paths:
            corpus_path = os.path.join(dataset_path, "corpus.jsonl")
            if not os.path.exists(corpus_path):
                continue

            dataset_name = os.path.basename(os.path.normpath(dataset_path))

            try:
                corpus = DatasetLoader(dataset_path).load_corpus()
            except Exception as e:
                print(f"[Pipeline] Could not load dataset corpus from {dataset_path}: {e}")
                continue

            for doc_id, doc in corpus.items():
                title = (doc.get("title") or "").strip()
                body = (doc.get("text") or "").strip()
                text = "\n\n".join(part for part in [title, body] if part).strip()
                if not text:
                    continue

                synthetic_path = f"{dataset_name}://{doc_id}"
                synthetic_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
                yield synthetic_path, synthetic_hash, text

    def run(self):
        """
        Execute the full indexing pipeline.

        Steps: load the stored hashes, detect new/changed/deleted files and
        dataset documents, drop everything that was deleted, then extract,
        chunk, embed and store every new or changed item.
        """
        known_hashes = self.store.load_hashes()

        # Stored hashes mix real files with synthetic dataset paths
        # ("name://doc_id"). The crawler only sees the filesystem, so it
        # must be given the filesystem entries only: when it was handed
        # everything, it reported every dataset document as deleted and
        # unchanged documents were wiped from the index on the next run.
        known_file_hashes = {}
        known_dataset_hashes = {}
        for filepath, file_hash in known_hashes.items():
            if "://" in filepath:
                known_dataset_hashes[filepath] = file_hash
            else:
                known_file_hashes[filepath] = file_hash

        print("Scanning for new/modified files...")
        files_to_process, current_hashes, deleted_files = (
            self.crawler.get_new_and_modified(known_file_hashes)
        )

        # Work items are either a plain path (text is extracted later) or a
        # (synthetic_path, text) tuple for dataset documents.
        work_items = list(files_to_process)

        dataset_documents = list(self._iter_dataset_documents())
        for filepath, file_hash, text in dataset_documents:
            current_hashes[filepath] = file_hash
            if known_dataset_hashes.get(filepath) != file_hash:
                work_items.append((filepath, text))

        current_dataset_paths = {filepath for filepath, _, _ in dataset_documents}
        deleted_files = set(deleted_files) | (
            set(known_dataset_hashes) - current_dataset_paths
        )

        for filepath in deleted_files:
            self.store.remove_file_chunks(filepath)

        if not work_items:
            print("Index is up to date.")
            print(f"Total vectors: {self.store.get_total_vectors()}")
            return

        total = len(work_items)
        for i, item in enumerate(work_items, 1):
            if isinstance(item, tuple):
                filepath, text = item
            else:
                filepath = item
                text = self.extractor.extract(filepath)

            print(f"[{i}/{total}] {filepath}")
            if not text.strip():
                print(f" Skipping (no text extracted)")
                continue
            chunks = self.chunker.chunk_file(text, filepath)
            chunk_texts = [c["text"] for c in chunks]
            embeddings = self.embedder.embed_chunks(chunk_texts)
            # replace, not append: drop the previous version's chunks first
            self.store.remove_file_chunks(filepath)
            self.store.add_chunks(chunks, embeddings)
            self.store.save_file_info(filepath, current_hashes[filepath], len(chunks))

        print(f"\nProcessed {total} files.")
        print(f"Total vectors: {self.store.get_total_vectors()}")
120
+
121
+
122
# --- Test it ---
if __name__ == "__main__":
    # Build (or incrementally refresh) the index with the default config.yaml.
    IndexingPipeline().run()
indexer/store.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # indexer/store.py
2
+
3
+ import os
4
+ import sqlite3
5
+ import numpy as np
6
+ import faiss
7
+ import yaml
8
+
9
+
10
class Store:
    """
    Handles two storage systems:

    1. FAISS — stores dense vectors for similarity search.
       The index is an IndexHNSWFlat (graph-based, approximate search)
       wrapped in an IndexIDMap2 so vectors carry the SQLite chunk id.

    2. SQLite — stores metadata: one row per chunk (``chunks``) and one
       row per indexed file with its hash (``files``).

    The FAISS index is written to disk after every change.
    """

    # HNSW graph connectivity parameter passed to IndexHNSWFlat
    HNSW_M = 32

    def __init__(self, config_path="config.yaml"):
        """
        Load config, set up file paths, initialize SQLite and the FAISS index.

        Args:
            config_path (str) — config file; must contain ``data_dir``
        """
        with open(config_path, "r") as f:
            config = yaml.safe_load(f)

        self.data_dir = config["data_dir"]
        os.makedirs(self.data_dir, exist_ok=True)

        self.faiss_path = os.path.join(self.data_dir, "index.faiss")
        self.db_path = os.path.join(self.data_dir, "metadata.db")

        self._init_db()
        self._load_or_create_index()

    def _init_db(self):
        """
        Create the SQLite tables (and the filepath lookup index) if missing.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS files (
                    filepath TEXT PRIMARY KEY,
                    file_hash TEXT NOT NULL,
                    total_chunks INTEGER
                )
            ''')

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS chunks (
                    id INTEGER PRIMARY KEY,
                    filepath TEXT NOT NULL,
                    chunk_text TEXT NOT NULL,
                    chunk_index INTEGER,
                    FOREIGN KEY (filepath) REFERENCES files(filepath)
                )
            ''')

            # remove_file_chunks() filters on filepath for every processed
            # file; without an index each call scans the whole table.
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_chunks_filepath ON chunks (filepath)"
            )

            conn.commit()
        finally:
            conn.close()

    def _load_or_create_index(self):
        """
        Load an existing FAISS index from disk, or set the index to None.
        The actual index is created on the first add_chunks() call, when
        the embedding dimension is known.
        """
        if os.path.exists(self.faiss_path):
            self.index = faiss.read_index(self.faiss_path)
            print(f"[Store] Loaded FAISS index — {self.index.ntotal} vectors")
        else:
            self.index = None
            print("[Store] No existing index found — will create on first insert")

    def _create_hnsw_index(self, dimension: int):
        """
        Create a new, empty HNSW-based FAISS index and assign it to self.index.

        IndexIDMap2 wraps the HNSW index so vectors can be added under
        custom integer ids. NOTE: HNSW itself cannot delete vectors; see
        _remove_vectors() for how removal is handled.

        Args:
            dimension — embedding size of the vectors that will be stored
        """
        hnsw_index = faiss.IndexHNSWFlat(dimension, self.HNSW_M)
        hnsw_index.hnsw.efSearch = 64  # candidate list size at query time
        hnsw_index.hnsw.efConstruction = 64  # candidate list size while building
        self.index = faiss.IndexIDMap2(hnsw_index)
        print(f"[Store] Created HNSW index — dim={dimension}, M={self.HNSW_M}")

    def get_next_id(self):
        """
        Return the next free chunk id: MAX(id) + 1, or 0 for an empty table.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            result = conn.execute("SELECT MAX(id) FROM chunks").fetchone()[0]
        finally:
            conn.close()
        return 0 if result is None else result + 1

    def add_chunks(self, chunks_with_metadata, embeddings):
        """
        Add new chunks and their embeddings to both FAISS and SQLite.

        Args:
            chunks_with_metadata (list[dict]) — each dict has the keys
                                  text, filepath, chunk_index
            embeddings — 2-D array-like with one row per chunk

        Raises:
            ValueError — when embeddings is not 2-D or its row count does
                         not match the number of chunks
        """
        # Nothing to store; also avoids reading shape[1] of an empty array.
        if not chunks_with_metadata:
            return

        embeddings = np.ascontiguousarray(embeddings, dtype="float32")
        if embeddings.ndim != 2 or embeddings.shape[0] != len(chunks_with_metadata):
            raise ValueError(
                f"expected {len(chunks_with_metadata)} embedding rows, "
                f"got array of shape {embeddings.shape}"
            )

        # create index on first insert — dimension comes from embeddings
        if self.index is None:
            self._create_hnsw_index(embeddings.shape[1])

        start_id = self.get_next_id()
        ids = np.arange(start_id, start_id + len(chunks_with_metadata), dtype=np.int64)

        self.index.add_with_ids(embeddings, ids)
        faiss.write_index(self.index, self.faiss_path)

        # save chunk metadata to SQLite under the same ids
        rows = [
            (int(vector_id), chunk["filepath"], chunk["text"], chunk["chunk_index"])
            for vector_id, chunk in zip(ids, chunks_with_metadata)
        ]
        conn = sqlite3.connect(self.db_path)
        try:
            conn.executemany(
                "INSERT INTO chunks (id, filepath, chunk_text, chunk_index) "
                "VALUES (?, ?, ?, ?)",
                rows,
            )
            conn.commit()
        finally:
            conn.close()

    def save_file_info(self, filepath, file_hash, total_chunks):
        """
        Insert or replace the ``files`` row for one file.

        Args:
            filepath — file path or synthetic path e.g. "scifact://12345"
            file_hash — hash string stored for change detection
            total_chunks — number of chunks this file was split into
        """
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute(
                "INSERT OR REPLACE INTO files (filepath, file_hash, total_chunks) "
                "VALUES (?, ?, ?)",
                (filepath, file_hash, total_chunks),
            )
            conn.commit()
        finally:
            conn.close()

    def load_hashes(self):
        """
        Load all stored file hashes from SQLite.

        Returns:
            dict — {filepath: hash_string}
        """
        conn = sqlite3.connect(self.db_path)
        try:
            rows = conn.execute("SELECT filepath, file_hash FROM files").fetchall()
        finally:
            conn.close()
        return {row[0]: row[1] for row in rows}

    def remove_file_chunks(self, filepath):
        """
        Delete all chunks for a file from both SQLite and FAISS, and the
        file's row from the ``files`` table.

        Args:
            filepath — the filepath to remove
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            ids = [
                row[0]
                for row in cursor.execute(
                    "SELECT id FROM chunks WHERE filepath = ?", (filepath,)
                )
            ]
            cursor.execute("DELETE FROM chunks WHERE filepath = ?", (filepath,))
            cursor.execute("DELETE FROM files WHERE filepath = ?", (filepath,))
            conn.commit()
        finally:
            conn.close()

        if ids and self.index is not None:
            self._remove_vectors(np.array(ids, dtype=np.int64))
            faiss.write_index(self.index, self.faiss_path)

    def _remove_vectors(self, id_array):
        """
        Remove the vectors with the given ids from the FAISS index.

        Native removal is tried first. FAISS HNSW indexes do not implement
        remove_ids (the call raises RuntimeError), so in that case the
        index is rebuilt from the vectors that remain. The rebuild costs
        time proportional to the index size.
        """
        try:
            self.index.remove_ids(id_array)
            return
        except RuntimeError:
            pass  # not supported by this index type — rebuild below

        all_ids = faiss.vector_to_array(self.index.id_map)
        keep = ~np.isin(all_ids, id_array)
        kept_ids = all_ids[keep].astype(np.int64)
        dimension = self.index.d

        kept_vectors = None
        if kept_ids.size:
            # Stored vectors come back in insertion order, which is the
            # order of id_map, so the same mask selects the survivors.
            stored = self.index.index.reconstruct_n(0, int(all_ids.size))
            kept_vectors = np.ascontiguousarray(stored[keep], dtype="float32")

        self._create_hnsw_index(dimension)
        if kept_vectors is not None:
            self.index.add_with_ids(kept_vectors, kept_ids)

    def get_total_vectors(self):
        """
        Return how many vectors are in the FAISS index.

        Returns:
            int — number of vectors, or 0 if no index exists yet
        """
        if self.index is None:
            return 0
        return self.index.ntotal
+
224
+
225
if __name__ == "__main__":
    # Manual smoke test: insert three fake chunks with random 384-dim
    # vectors and show the vector count before and after.
    store = Store()

    samples = [
        ("quarterly budget report summary", "/docs/report.pdf", 0),
        ("revenue increased by fifteen percent", "/docs/report.pdf", 1),
        ("python machine learning tutorial", "/docs/tutorial.txt", 0),
    ]
    fake_chunks = [
        {"text": text, "filepath": path, "chunk_index": position}
        for text, path, position in samples
    ]

    fake_embeddings = np.random.rand(3, 384).astype("float32")

    print(f"Vectors before: {store.get_total_vectors()}")
    store.add_chunks(fake_chunks, fake_embeddings)
    print(f"Vectors after: {store.get_total_vectors()}")
+ print(f"Vectors after: {store.get_total_vectors()}")
indexer/watcher.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # indexer/watcher.py
2
+
3
+ import os
4
+ import time
5
+ from watchdog.observers import Observer
6
+ from watchdog.events import FileSystemEventHandler
7
+ from indexer.pipeline import IndexingPipeline
8
+ import yaml
9
+
10
+
11
class IndexHandler(FileSystemEventHandler):
    """
    Handles filesystem events detected by watchdog.

    watchdog calls these methods automatically:
      - on_created(event)  → new file added
      - on_modified(event) → existing file changed
      - on_moved(event)    → file renamed / moved
      - on_deleted(event)  → file removed

    Directory events and files whose extension is not in the crawler's
    include list are ignored. Errors raised while indexing a file are
    reported with print() and do not propagate out of the handler.
    """

    def __init__(self, pipeline, config_path="config.yaml"):
        """
        Args:
            pipeline (IndexingPipeline) — existing pipeline instance
            config_path (str) — config file; ``debounce_seconds`` is read from it
        """
        with open(config_path) as f:
            config = yaml.safe_load(f)
        self._debounce_seconds = config["debounce_seconds"]

        self.pipeline = pipeline
        self.include_extensions = self.pipeline.crawler.include_extensions
        self._last_event = {}  # {filepath: timestamp of last handled event}

    def _is_duplicate(self, filepath):
        """
        Check if an event for this file was already handled recently.
        Returns True if the event should be skipped; otherwise records the
        current time for the file and returns False.
        """
        now = time.time()
        last = self._last_event.get(filepath, 0)
        if now - last < self._debounce_seconds:
            return True
        self._last_event[filepath] = now
        return False

    def _is_relevant(self, filepath):
        """
        Check if a file event is for a file type we care about.

        Args:
            filepath (str) — path from the event

        Returns:
            bool — True if the file extension is in our include list
        """
        ext = os.path.splitext(filepath)[1].lower()
        return ext in self.include_extensions

    def _index_file(self, filepath):
        """
        Extract, chunk, embed and store one file, replacing older chunks.

        The previous chunks are removed only after the new embeddings have
        been computed, so a failure during extraction or embedding leaves
        the existing index entry untouched. A file that yields no text has
        its existing chunks removed.

        Returns:
            bool — True when the file was stored, False when it was
                   skipped or an error occurred
        """
        try:
            text = self.pipeline.extractor.extract(filepath)
            if not text.strip():
                print(f" Skipping (no text extracted)")
                self.pipeline.store.remove_file_chunks(filepath)
                return False

            chunks = self.pipeline.chunker.chunk_file(text, filepath)
            chunk_texts = [c["text"] for c in chunks]
            embeddings = self.pipeline.embedder.embed_chunks(chunk_texts)
            self.pipeline.store.remove_file_chunks(filepath)
            self.pipeline.store.add_chunks(chunks, embeddings)

            file_hash = self.pipeline.crawler.compute_hash(filepath)
            self.pipeline.store.save_file_info(filepath, file_hash, len(chunks))
            return True
        except Exception as e:
            # Event-handler boundary: report and keep handling later events
            # (e.g. the file vanished between the event and the read).
            print(f" Error indexing {filepath}: {e}")
            return False

    def on_created(self, event):
        """
        Called when a new file is created.

        Args:
            event — watchdog event
        """
        if event.is_directory:
            return

        if not self._is_relevant(event.src_path):
            return

        if self._is_duplicate(event.src_path):
            return

        print(f"New file detected: {event.src_path}")
        if self._index_file(event.src_path):
            print(f" File stored: {event.src_path}")

    def on_modified(self, event):
        """
        Called when an existing file is modified.

        Args:
            event - watchdog event
        """
        if event.is_directory:
            return

        if not self._is_relevant(event.src_path):
            return

        if self._is_duplicate(event.src_path):
            return

        print(f"File modified: {event.src_path}")
        if self._index_file(event.src_path):
            print(f" File saved: {event.src_path}")

    def on_moved(self, event):
        """
        Called when a file is renamed or moved.

        The chunks stored under the old path are removed and the file is
        indexed under its new path (when that path is relevant).

        Args:
            event - watchdog event carrying src_path and dest_path
        """
        if event.is_directory:
            return

        if self._is_relevant(event.src_path):
            print(f"File deleted: {event.src_path}")
            self._last_event.pop(event.src_path, None)
            try:
                self.pipeline.store.remove_file_chunks(event.src_path)
            except Exception as e:
                print(f" Error removing {event.src_path}: {e}")

        if not self._is_relevant(event.dest_path):
            return

        if self._is_duplicate(event.dest_path):
            return

        print(f"New file detected: {event.dest_path}")
        if self._index_file(event.dest_path):
            print(f" File stored: {event.dest_path}")

    def on_deleted(self, event):
        """
        Called when a file is deleted.

        Args:
            event - watchdog event
        """
        if event.is_directory:
            return

        if not self._is_relevant(event.src_path):
            return

        print(f"File deleted: {event.src_path}")
        # forget the debounce timestamp so a quick re-create is not skipped
        self._last_event.pop(event.src_path, None)
        try:
            self.pipeline.store.remove_file_chunks(event.src_path)
        except Exception as e:
            print(f" Error removing {event.src_path}: {e}")
141
+
142
+
143
class Watcher:
    """
    Starts a watchdog observer on all configured watch_paths.
    Runs continuously until the user presses Ctrl+C.
    """

    def __init__(self, config_path="config.yaml"):
        """
        Initialize the Watcher.

        Args:
            config_path (str) — config file used for the pipeline AND the
                                event handler
        """
        self.pipeline = IndexingPipeline(config_path)
        # Forward config_path: the handler used to fall back to its default
        # "config.yaml" even when the Watcher was given a different file.
        self.handler = IndexHandler(self.pipeline, config_path)
        self.watch_paths = self.pipeline.crawler.watch_paths

    def start(self):
        """
        Start watching all configured directories (recursively) and block
        until interrupted with Ctrl+C; the observer is always stopped and
        joined on the way out.
        """
        observer = Observer()
        for path in self.watch_paths:
            observer.schedule(self.handler, path, recursive=True)
        observer.start()

        print(f"Watchdog active. Watching {', '.join(self.watch_paths)}")

        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            print("Stopping watcher...")
        finally:
            observer.stop()
            observer.join()
+ observer.join()
176
+
177
+
178
# --- Test it ---
if __name__ == "__main__":
    watcher = Watcher()

    # Step 1: bring the index up to date with the files that already exist.
    print("Running initial index...")
    watcher.pipeline.run()

    # Step 2: keep the index fresh by reacting to filesystem changes.
    print("\nStarting file watcher...")
    watcher.start()
+ watcher.start()
main.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+
3
+ import json
4
+ import os
5
+ import time
6
+ from functools import lru_cache
7
+ import yaml
8
+ from fastapi import FastAPI, Request, Form
9
+ from fastapi.responses import HTMLResponse
10
+ from fastapi.staticfiles import StaticFiles
11
+ from fastapi.templating import Jinja2Templates
12
+
13
+ from evaluation.dataset_loader import DatasetLoader
14
+
15
+ app = FastAPI(title="Semantic Search Engine")
16
+
17
+ app.mount("/static", StaticFiles(directory="static"), name="static")
18
+ templates = Jinja2Templates(directory="templates")
19
+
20
+ # ── load search engine once at startup ──────────────────────────────────────
21
+ ENGINE_ERROR = None
22
+
23
+
24
@lru_cache(maxsize=1)
def get_engine():
    """
    Build the SearchEngine from config.yaml, or return None on failure.

    The import and construction happen on the first call. On any error the
    message is stored in the module-level ENGINE_ERROR, printed, and None
    is returned. lru_cache keeps whatever the first call returned, so
    later calls return the same engine (or the same None).
    """
    global ENGINE_ERROR
    try:
        from searcher.search_engine import SearchEngine
        ENGINE_ERROR = None
        engine = SearchEngine("config.yaml")
    except Exception as exc:
        ENGINE_ERROR = str(exc)
        print(f"[Startup] Search engine unavailable: {exc}")
        engine = None
    return engine
35
+
36
+
37
+ # ── load dataset queries at startup ─────────────────────────────────────────
38
+ # These are the actual queries from SciFact and NFCorpus
39
+ # We use them to show "which dataset queries matched your search"
40
+
41
def load_dataset_queries() -> dict:
    """
    Load the queries of the SciFact and NFCorpus datasets.

    Each dataset is read from its folder under ``data/`` through
    DatasetLoader. A missing folder or a loading error is reported with
    print() and yields an empty dict for that dataset.

    Returns:
        dict — {
            "scifact": <result of load_queries() or {}>,
            "nfcorpus": <result of load_queries() or {}>,
        }
    """
    dataset_dirs = {
        "scifact": "data/scifact",
        "nfcorpus": "data/nfcorpus",
    }

    loaded = {}
    for name, path in dataset_dirs.items():
        if not os.path.exists(path):
            print(f"[Startup] Dataset path not found: {path}")
            loaded[name] = {}
            continue

        try:
            loaded[name] = DatasetLoader(path).load_queries()
            print(f"[Startup] Loaded {len(loaded[name])} queries from {name}")
        except Exception as e:
            print(f"[Startup] Could not load {name} queries: {e}")
            loaded[name] = {}

    return loaded
72
+
73
+
74
+ # load once at startup — available globally
75
+ DATASET_QUERIES = load_dataset_queries()
76
+
77
+
78
+ # ── helpers ──────────────────────────────────────────────────────────────────
79
+
80
def load_eval_results() -> dict:
    """
    Load the evaluation results from ``results/eval_all.json``.

    Returns:
        dict — the parsed JSON object, or {} when the file is missing,
               cannot be read, is not valid JSON, or does not contain a
               JSON object (callers iterate the result with .items()).
    """
    path = "results/eval_all.json"
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError,
        # e.g. a results file that is still being written.
        print(f"[Dashboard] Could not read {path}: {e}")
        return {}
    return data if isinstance(data, dict) else {}
86
+
87
+
88
def extract_doc_id(filepath: str) -> str:
    """
    Return the part after the first "://" of a synthetic dataset path
    (e.g. "scifact://123" → "123"); paths without "://" are returned as-is.
    """
    _, separator, doc_id = filepath.partition("://")
    return doc_id if separator else filepath
92
+
93
+
94
def get_dataset_from_filepath(filepath: str) -> str:
    """
    Name the source of a result path: "scifact" or "nfcorpus" when the
    matching "<name>://" marker occurs in the path (scifact is checked
    first), otherwise "filesystem".
    """
    for dataset in ("scifact", "nfcorpus"):
        if f"{dataset}://" in filepath:
            return dataset
    return "filesystem"
98
+
99
+
100
def get_file_icon(filepath: str) -> str:
    """
    Pick the emoji shown next to a search result.

    Dataset documents get a dataset-specific icon; real files are mapped
    by the lowercase text after the last "."; anything else gets "📄".
    """
    for marker, icon in (("scifact://", "🔬"), ("nfcorpus://", "🏥")):
        if marker in filepath:
            return icon

    _, dot, suffix = filepath.lower().rpartition(".")
    if not dot:
        suffix = ""  # no "." at all → no extension

    by_extension = {
        "pdf": "📄", "docx": "📝", "txt": "📃",
        "pptx": "📊", "xlsx": "📋", "py": "🐍",
    }
    return by_extension.get(suffix, "📄")
109
+
110
+
111
def find_matching_dataset_queries(
    user_query: str,
    top_results: list,
) -> list:
    """
    Find dataset queries that share words with what the user typed.

    Matching is plain word overlap (no model call): both texts are
    lowercased, split on every non-alphanumeric character, and words of
    3 characters or fewer are ignored. A dataset query matches when it
    shares at least one word with the user query.

    Args:
        user_query — the text the user searched for
        top_results — accepted for interface compatibility; not used by
                      the current implementation

    Returns:
        list of dicts, at most 8, highest overlap first — [
            {
                "query_id": "1234",
                "query_text": "Does vitamin D cause cancer?",
                "dataset": "scifact",
                "match_type": "text",
                "overlap": 2,
            },
            ...
        ]
    """
    user_words = _significant_words(user_query)
    if not user_words:
        return []

    matched = []
    # Query ids are unique inside one dataset (they are dict keys) but may
    # repeat across datasets, so no cross-dataset de-duplication by id.
    for dataset_name, queries in DATASET_QUERIES.items():
        for qid, qtext in queries.items():
            overlap = user_words & _significant_words(qtext)
            if overlap:
                matched.append({
                    "query_id": qid,
                    "query_text": qtext,
                    "dataset": dataset_name,
                    "match_type": "text",
                    "overlap": len(overlap),
                })

    # most overlapping queries first (stable: ties keep dataset order)
    matched.sort(key=lambda x: x["overlap"], reverse=True)

    # return top 8 matched queries max
    return matched[:8]


def _significant_words(text: str) -> set:
    """
    Return the lowercase words of *text* that are longer than 3 characters.

    Punctuation is treated as a separator, so "cancer?" and "cancer"
    produce the same word.
    """
    normalized = "".join(ch if ch.isalnum() else " " for ch in text.lower())
    return {word for word in normalized.split() if len(word) > 3}
169
+
170
+
171
+ # ── routes ───────────────────────────────────────────────────────────────────
172
+
173
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """
    Render the landing page with the number of loaded queries per dataset
    and the current value of ENGINE_ERROR.
    """
    context = {
        "request": request,
        "error": ENGINE_ERROR,
    }
    for dataset in ("scifact", "nfcorpus"):
        context[f"{dataset}_count"] = len(DATASET_QUERIES.get(dataset, {}))
    return templates.TemplateResponse("index.html", context)
181
+
182
+
183
@app.post("/search", response_class=HTMLResponse)
async def search(
    request: Request,
    query: str = Form(...),
    top_k: int = Form(10),
    mode: str = Form("full"),
):
    """
    Run a search and render the results page.

    Form fields:
        query — search text; a blank query re-renders the landing page
                with an error message
        top_k — requested number of results, clamped to the range 1..100
        mode  — passed through to the results template unchanged

    When the search engine is unavailable the landing page is rendered
    with the startup error instead.
    """
    def landing_page(error: str):
        # Every failure path re-renders index.html with the same counters.
        return templates.TemplateResponse("index.html", {
            "request": request,
            "error": error,
            "scifact_count": len(DATASET_QUERIES.get("scifact", {})),
            "nfcorpus_count": len(DATASET_QUERIES.get("nfcorpus", {})),
        })

    if not query.strip():
        return landing_page("Please enter a search query.")

    engine = get_engine()
    if engine is None:
        return landing_page(
            "Search is not ready yet. The semantic index is still missing or failed to build. "
            f"Startup details: {ENGINE_ERROR}"
        )

    # top_k comes straight from the form: keep zero, negative or huge
    # values away from the engine.
    top_k = max(1, min(top_k, 100))

    t0 = time.time()
    output = engine.search(query.strip(), top_k=top_k)
    elapsed = round(time.time() - t0, 3)

    # format search results
    results = []
    for r in output.get("results", []):
        filepath = r.get("filepath", "")
        doc_id = extract_doc_id(filepath)
        # prefer the most refined score that is present on the result
        score = r.get("rerank_score", r.get("rrf_score", r.get("dense_score", 0)))
        snippet = r.get("chunk_text", r.get("text", "No preview available."))

        # shorten long snippets at a word boundary
        if len(snippet) > 200:
            snippet = snippet[:200].rsplit(" ", 1)[0] + "..."

        dataset = get_dataset_from_filepath(filepath)

        results.append({
            "doc_id": doc_id,
            "filepath": filepath,
            "score": round(float(score), 4),
            "snippet": snippet,
            "icon": get_file_icon(filepath),
            "dataset": dataset,
        })

    # find matching dataset queries
    matched_queries = find_matching_dataset_queries(query.strip(), results)

    # group matched queries by dataset for display
    matched_scifact = [q for q in matched_queries if q["dataset"] == "scifact"]
    matched_nfcorpus = [q for q in matched_queries if q["dataset"] == "nfcorpus"]

    return templates.TemplateResponse("results.html", {
        "request": request,
        "query": query,
        "results": results,
        "total": len(results),
        "elapsed": elapsed,
        "mode": mode,
        "top_k": top_k,
        "matched_scifact": matched_scifact,
        "matched_nfcorpus": matched_nfcorpus,
        "total_matched": len(matched_queries),
    })
255
+
256
+
257
@app.get("/dashboard", response_class=HTMLResponse)
async def dashboard(request: Request):
    """
    Render the evaluation dashboard.

    For every dataset in the stored evaluation results, the metrics of the
    "full" mode are surfaced (0 / 0.0 when absent) next to the complete
    per-mode results.
    """
    def summarize(dataset_name, mode_results):
        full = mode_results.get("full", {})
        return {
            "name": dataset_name,
            "ndcg": full.get("NDCG@10", 0.0),
            "mrr": full.get("MRR", 0.0),
            "map": full.get("MAP@100", 0.0),
            "recall": full.get("Recall@100", 0.0),
            "precision": full.get("P@10", 0.0),
            "queries": full.get("num_queries", 0),
            "modes": mode_results,
        }

    datasets = [
        summarize(dataset_name, mode_results)
        for dataset_name, mode_results in load_eval_results().items()
    ]

    return templates.TemplateResponse("dashboard.html", {
        "request": request,
        "datasets": datasets,
    })
279
+
280
+
281
@app.get("/health")
async def health():
    """
    Report whether the search engine is available.

    Returns a JSON object with "status" ("ok" or "degraded"),
    "engine_ready" (bool) and "engine_error" (ENGINE_ERROR).
    """
    ready = get_engine() is not None
    status = "ok" if ready else "degraded"
    return {
        "status": status,
        "engine_ready": ready,
        "engine_error": ENGINE_ERROR,
    }
289
+
290
+
291
if __name__ == "__main__":
    # Script entry point: uvicorn is imported only when the module is run
    # directly, not when it is imported as "main:app".
    import uvicorn
    # Serves on all interfaces, port 7860, with auto-reload enabled.
    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)
294
+
295
+
296
+
297
+
298
+ # uvicorn main:app --reload --host 0.0.0.0 --port 8000
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ jinja2
4
+ python-multipart
5
+ sentence-transformers
6
+ transformers
7
+ torch
8
+ faiss-cpu
9
+ numpy
10
+ scipy
11
+ scikit-learn
12
+ networkx
13
+ nltk
14
+ pandas
15
+ pyyaml
16
+ python-docx
17
+ python-pptx
18
+ openpyxl
19
+ pillow
20
+ lxml
21
+ PyMuPDF
+ watchdog
searcher/__init__.py ADDED
File without changes
searcher/__pycache__/__init__.cpython-313.pyc.2070577919488 ADDED
Binary file (133 Bytes). View file
 
searcher/__pycache__/dense_retriever.cpython-313.pyc.2070577919488 ADDED
Binary file (4.17 kB). View file
 
searcher/__pycache__/facet_filter.cpython-313.pyc.2070577919488 ADDED
Binary file (3.08 kB). View file
 
searcher/__pycache__/fusion_ranker.cpython-313.pyc.2070577919488 ADDED
Binary file (3.62 kB). View file
 
searcher/__pycache__/highlighter.cpython-313.pyc.2070577919488 ADDED
Binary file (4.27 kB). View file
 
searcher/__pycache__/query_understanding.cpython-313.pyc.2070578319792 ADDED
Binary file (4.41 kB). View file