Spaces:

JackSparrow89
/

Semantic_File

Sleeping

App Files Files Community

JackSparrow89 committed on Apr 16

Commit

bb04c5f

verified ·

1 Parent(s): 15c80f2

Upload 65 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.dockerignore +15 -0
.gitignore +34 -0
Dockerfile +20 -0
__pycache__/check_nfcorpus.cpython-313.pyc.2070577919488 +0 -0
__pycache__/main.cpython-313.pyc.2070578258992 +0 -0
check_nfcorpus.py +29 -0
config.yaml +42 -0
data/nfcorpus/corpus.jsonl +0 -0
data/nfcorpus/qrels/dev.tsv +0 -0
data/nfcorpus/qrels/test.tsv +0 -0
data/nfcorpus/qrels/train.tsv +0 -0
data/nfcorpus/queries.jsonl +0 -0
data/scifact/corpus.jsonl +0 -0
data/scifact/qrels/test.tsv +340 -0
data/scifact/qrels/train.tsv +920 -0
data/scifact/queries.jsonl +0 -0
docker-compose.yml +16 -0
evaluation/__pycache__/dataset_loader.cpython-313.pyc.2070577919488 +0 -0
evaluation/__pycache__/evaluator.cpython-313.pyc.2070577919488 +0 -0
evaluation/__pycache__/indexer_bridge.cpython-313.pyc.2070577919488 +0 -0
evaluation/__pycache__/query_runner.cpython-313.pyc.2070577919488 +0 -0
evaluation/__pycache__/run_eval.cpython-313.pyc.2070577919488 +0 -0
evaluation/dataset_loader.py +132 -0
evaluation/evaluator.py +197 -0
evaluation/indexer_bridge.py +94 -0
evaluation/query_runner.py +128 -0
evaluation/run_eval.py +170 -0
indexer/__pycache__/chunker.cpython-313.pyc.2070577919488 +0 -0
indexer/__pycache__/crawler.cpython-313.pyc.2070577919488 +0 -0
indexer/__pycache__/embedder.cpython-313.pyc.2070577919488 +0 -0
indexer/__pycache__/extractor.cpython-313.pyc.2070577919488 +0 -0
indexer/__pycache__/pipeline.cpython-313.pyc.2070577919488 +0 -0
indexer/__pycache__/store.cpython-313.pyc.2070577919488 +0 -0
indexer/__pycache__/watcher.cpython-313.pyc.2070577919488 +0 -0
indexer/chunker.py +135 -0
indexer/crawler.py +102 -0
indexer/embedder.py +111 -0
indexer/extractor.py +115 -0
indexer/pipeline.py +125 -0
indexer/store.py +238 -0
indexer/watcher.py +187 -0
main.py +298 -0
requirements.txt +21 -0
searcher/__init__.py +0 -0
searcher/__pycache__/__init__.cpython-313.pyc.2070577919488 +0 -0
searcher/__pycache__/dense_retriever.cpython-313.pyc.2070577919488 +0 -0
searcher/__pycache__/facet_filter.cpython-313.pyc.2070577919488 +0 -0
searcher/__pycache__/fusion_ranker.cpython-313.pyc.2070577919488 +0 -0
searcher/__pycache__/highlighter.cpython-313.pyc.2070577919488 +0 -0
searcher/__pycache__/query_understanding.cpython-313.pyc.2070578319792 +0 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,15 @@

+.git
+.gitignore
+.venv
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.log
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+.idea/
+.vscode/
+data/
+results/

.gitignore ADDED Viewed

	@@ -0,0 +1,34 @@

+# Virtual environment
+.venv/
+# Vector index and database (large binary files)
+data/
+results/
+# Logs
+*.log
+# Downloaded ML models (auto-downloaded at runtime)
+models/
+.cache/
+sentence_transformers/
+# Python cache
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.env
+venv/
+env/
+.venv/
+# Model cache
+.cache/
+# OS files
+.DS_Store
+Thumbs.db
+# IDE
+.vscode/
+.idea/

Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+FROM python:3.10
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    NLTK_DATA=/usr/local/share/nltk_data
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --upgrade pip && \
+    pip install -r requirements.txt && \
+    python -c "import nltk; nltk.download('wordnet', download_dir='/usr/local/share/nltk_data'); nltk.download('omw-1.4', download_dir='/usr/local/share/nltk_data')"
+COPY . .
+EXPOSE 7860
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

__pycache__/check_nfcorpus.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (1.46 kB). View file

__pycache__/main.cpython-313.pyc.2070578258992 ADDED Viewed

Binary file (11.7 kB). View file

check_nfcorpus.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import json
+import sys
+import os
+sys.path.append(os.path.abspath("."))
+# Load results
+with open('results/eval_nfcorpus.json') as f:
+    data = json.load(f)
+# Load qrels
+from evaluation.dataset_loader import DatasetLoader
+loader = DatasetLoader('data/nfcorpus')
+qrels = loader.load_qrels()
+# 🔍 Debug prints
+print("Sample RESULT query_id:", list(data.keys())[0])
+first_qid = list(qrels.keys())[0]
+print("Sample QREL query_id:", first_qid)
+print("Sample QREL doc_id:", list(qrels[first_qid].keys())[0])
+print("Total QREL queries:", len(qrels))
+print("Total RESULT queries:", len(data))
+# 🔥 Check overlap
+common = set(data.keys()) & set(qrels.keys())
+print("Common query IDs:", len(common))

config.yaml ADDED Viewed

	@@ -0,0 +1,42 @@

+# Directories to index
+watch_paths:
+  - ./data/scifact
+  - ./data/nfcorpus #modify this
+# File extensions to include
+include_extensions:
+  - ".pdf"
+  - ".docx"
+  - ".txt"
+  - ".md"
+  - ".pptx"
+  - ".xlsx"
+  - ".py"
+  - ".js"
+  - ".ipynb"
+# Retrieval, query expansion and reranking settings
+top_k: 5                  # final results returned to user
+candidate_k: 20           # candidates fetched before reranking
+query_expansion: true     # WordNet synonym expansion
+max_synonyms: 5           # max synonyms to append
+reranking_enabled: true   # cross-encoder reranking
+reranker_model: "cross-encoder/ms-marco-MiniLM-L-6-v2"
+# Directories to skip
+skip_directories:
+  - ".git"
+  - "node_modules"
+  - "__pycache__"
+  - ".venv"
+# Where to store index data
+data_dir: "./data"
+embedding_model: "all-MiniLM-L6-v2"
+# embedding_model: BAAI/bge-small-en-v1.5
+debounce_seconds: 5

data/nfcorpus/corpus.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

data/nfcorpus/qrels/dev.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/nfcorpus/qrels/test.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/nfcorpus/qrels/train.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/nfcorpus/queries.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

data/scifact/corpus.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

data/scifact/qrels/test.tsv ADDED Viewed

	@@ -0,0 +1,340 @@

+query-id	corpus-id	score
+1	31715818	1
+3	14717500	1
+5	13734012	1
+13	1606628	1
+36	5152028	1
+36	11705328	1
+42	18174210	1
+48	13734012	1
+49	5953485	1
+50	12580014	1
+51	45638119	1
+53	45638119	1
+54	49556906	1
+56	4709641	1
+57	4709641	1
+70	5956380	1
+70	4414547	1
+72	6076903	1
+75	4387784	1
+94	1215116	1
+99	18810195	1
+100	4381486	1
+113	6157837	1
+115	33872649	1
+118	6372244	1
+124	4883040	1
+127	21598000	1
+128	8290953	1
+129	27768226	1
+130	27768226	1
+132	7975937	1
+133	38485364	1
+133	6969753	1
+133	17934082	1
+133	16280642	1
+133	12640810	1
+137	26016929	1
+141	6955746	1
+141	14437255	1
+142	10582939	1
+143	10582939	1
+146	10582939	1
+148	1084345	1
+163	18872233	1
+171	12670680	1
+179	16322674	1
+179	27123743	1
+179	23557241	1
+179	17450673	1
+180	16966326	1
+183	12827098	1
+185	18340282	1
+198	2177022	1
+208	13519661	1
+212	22038539	1
+213	13625993	1
+216	21366394	1
+217	21366394	1
+218	21366394	1
+219	21366394	1
+230	3067015	1
+232	10536636	1
+233	4388470	1
+236	4388470	1
+237	4942718	1
+238	2251426	1
+239	14079881	1
+248	1568684	1
+249	1568684	1
+261	1122279	1
+261	10697096	1
+268	970012	1
+269	970012	1
+274	11614737	1
+275	4961038	1
+275	14241418	1
+275	14819804	1
+279	14376683	1
+294	10874408	1
+295	20310709	1
+298	39381118	1
+300	3553087	1
+303	4388470	1
+312	6173523	1
+314	4347374	1
+324	2014909	1
+327	17997584	1
+338	23349986	1
+343	7873737	1
+343	5884524	1
+350	16927286	1
+354	8774475	1
+362	38587347	1
+380	19005293	1
+384	13770184	1
+385	9955779	1
+385	9767444	1
+386	16495649	1
+388	1148122	1
+399	791050	1
+410	14924526	1
+411	14924526	1
+415	6309659	1
+421	11172205	1
+431	28937856	1
+436	14637235	1
+437	18399038	1
+439	4423559	1
+440	4423559	1
+443	10165258	1
+452	12804937	1
+452	464511	1
+475	18678095	1
+478	14767844	1
+491	56893404	1
+501	17930286	1
+502	13071728	1
+507	30774694	1
+508	13980338	1
+513	13230773	1
+514	16256507	1
+516	29564505	1
+517	15663829	1
+521	34873974	1
+525	13639330	1
+527	3863543	1
+528	5476778	1
+532	12991445	1
+533	12991445	1
+535	39368721	1
+536	16056514	1
+539	13282296	1
+540	11886686	1
+540	25007443	1
+544	24221369	1
+549	9433958	1
+551	33499189	1
+552	1471041	1
+554	1049501	1
+560	40096222	1
+569	23460562	1
+575	10300888	1
+577	5289038	1
+578	8764879	1
+587	16999023	1
+589	10984005	1
+593	19675911	1
+597	12779444	1
+597	36355784	1
+597	25742130	1
+598	25742130	1
+613	9638032	1
+619	20888849	1
+619	2565138	1
+623	17000834	1
+628	24512064	1
+636	24294572	1
+637	25649714	1
+641	5912283	1
+641	31554917	1
+644	13619127	1
+649	12789595	1
+659	1215116	1
+660	1215116	1
+674	2095573	1
+684	4942718	1
+690	18750453	1
+691	10991183	1
+692	24088502	1
+693	24088502	1
+700	4350400	1
+702	4350400	1
+715	18421962	1
+716	18421962	1
+718	17587795	1
+721	1834762	1
+723	5531479	1
+727	7521113	1
+728	7521113	1
+728	36444198	1
+729	26851674	1
+742	32159283	1
+743	32159283	1
+744	8460275	1
+756	2831620	1
+759	1805641	1
+768	6421792	1
+770	15476777	1
+775	32275758	1
+781	24338780	1
+783	40632104	1
+784	2356950	1
+785	12471115	1
+793	8551160	1
+800	22543403	1
+805	22180793	1
+808	36606083	1
+811	19799455	1
+814	33387953	1
+820	8646760	1
+821	8646760	1
+823	15319019	1
+830	1897324	1
+831	1897324	1
+832	30303335	1
+834	5483793	1
+837	15928989	1
+839	1469751	1
+845	17741440	1
+847	16787954	1
+852	13843341	1
+859	1982286	1
+870	195689316	1
+873	1180972	1
+873	19307912	1
+873	27393799	1
+873	29025270	1
+873	3315558	1
+879	8426046	1
+880	8426046	1
+882	14803797	1
+887	18855191	1
+903	10648422	1
+904	7370282	1
+907	6923961	1
+911	11254556	1
+913	3203590	1
+914	3203590	1
+921	1642727	1
+922	17077004	1
+936	5483793	1
+956	12956194	1
+957	123859	1
+960	8780599	1
+967	2119889	1
+967	8997410	1
+971	46695481	1
+971	27873158	1
+971	28617573	1
+971	9764256	1
+975	5304891	1
+982	2988714	1
+985	6828370	1
+993	16472469	1
+1012	9745001	1
+1014	6277638	1
+1019	11603066	1
+1020	9433958	1
+1021	9433958	1
+1024	5373138	1
+1029	13923140	1
+1029	13940200	1
+1029	11899391	1
+1041	25254425	1
+1041	16626264	1
+1049	12486491	1
+1062	20381484	1
+1086	39281140	1
+1088	37549932	1
+1089	17628888	1
+1099	7662206	1
+1100	7662206	1
+1104	3898784	1
+1107	20532591	1
+1110	13770184	1
+1121	4456756	1
+1130	17997584	1
+1132	33499189	1
+1132	9283422	1
+1137	33370	1
+1140	12009265	1
+1144	10071552	1
+1146	13906581	1
+1150	11369420	1
+1163	15305881	1
+1175	31272411	1
+1179	31272411	1
+1180	31272411	1
+1185	16737210	1
+1187	52873726	1
+1191	30655442	1
+1194	11419230	1
+1196	25649714	1
+1197	25649714	1
+1199	16760369	1
+1200	3441524	1
+1202	3475317	1
+1204	31141365	1
+1207	18909530	1
+1213	14407673	1
+1216	24142891	1
+1221	19736671	1
+1225	9650982	1
+1226	13777138	1
+1232	13905670	1
+1241	4427392	1
+1245	7662395	1
+1259	24341590	1
+1262	44172171	1
+1266	37480103	1
+1270	13900610	1
+1271	13768432	1
+1272	17081238	1
+1273	11041152	1
+1274	12428814	1
+1274	27731651	1
+1274	4406819	1
+1278	11335781	1
+1279	11335781	1
+1280	4387784	1
+1281	4387784	1
+1282	23649163	1
+1290	4687948	1
+1292	56893404	1
+1298	11718220	1
+1303	12631697	1
+1316	27910499	1
+1319	16284655	1
+1320	16284655	1
+1332	5304891	1
+1335	27910499	1
+1336	27910499	1
+1337	20231138	1
+1339	15482274	1
+1344	9559146	1
+1352	12885341	1
+1359	11614737	1
+1362	8290953	1
+1363	8290953	1
+1368	2425364	1
+1370	2425364	1
+1379	16322674	1
+1379	27123743	1
+1379	23557241	1
+1379	17450673	1
+1382	17755060	1
+1385	306006	1
+1389	23895668	1
+1395	17717391	1

data/scifact/qrels/train.tsv ADDED Viewed

	@@ -0,0 +1,920 @@

+query-id	corpus-id	score
+0	31715818	1
+2	13734012	1
+4	22942787	1
+6	2613775	1
+9	44265107	1
+10	32587939	1
+11	32587939	1
+12	33409100	1
+14	641786	1
+15	22080671	1
+17	1606628	1
+18	22942787	1
+19	3202143	1
+20	3202143	1
+21	41493639	1
+22	6490571	1
+24	3471191	1
+25	2613775	1
+26	32390525	1
+27	32390525	1
+28	12670680	1
+30	24341590	1
+32	12428497	1
+34	11705328	1
+35	5152028	1
+35	11705328	1
+37	5152028	1
+37	11705328	1
+39	13497630	1
+40	13497630	1
+41	18174210	1
+43	7224723	1
+44	56893404	1
+45	56893404	1
+46	380526	1
+47	3512154	1
+47	26996935	1
+52	45638119	1
+55	49556906	1
+58	4709641	1
+60	13899137	1
+60	13901073	1
+61	13899137	1
+61	13901073	1
+62	32587939	1
+63	40349336	1
+64	40349336	1
+66	14806256	1
+67	21295300	1
+68	21295300	1
+69	5956380	1
+69	4414547	1
+71	1127562	1
+73	6076903	1
+74	4387784	1
+76	5531479	1
+77	5531479	1
+78	5099266	1
+79	5099266	1
+80	4920376	1
+81	1797622	1
+82	3619372	1
+85	7521113	1
+85	22406695	1
+86	7521113	1
+86	22406695	1
+88	7521113	1
+88	22406695	1
+89	7521113	1
+89	22406695	1
+90	22406695	1
+91	1084345	1
+92	1084345	1
+93	2692522	1
+95	1215116	1
+96	14500725	1
+98	6540064	1
+104	40164383	1
+105	36606083	1
+106	25515907	1
+106	5151024	1
+108	6191684	1
+108	22995579	1
+108	23865182	1
+109	4319174	1
+111	13513790	1
+112	6157837	1
+114	33872649	1
+116	33872649	1
+119	14606752	1
+120	14606752	1
+121	31460499	1
+122	31460499	1
+123	4883040	1
+126	24512064	1
+134	4695046	1
+138	26016929	1
+139	22080671	1
+144	10582939	1
+149	6227220	1
+152	15488881	1
+153	4702639	1
+154	4702639	1
+155	37549932	1
+156	37549932	1
+157	13439128	1
+159	9394119	1
+160	52874170	1
+161	6903077	1
+164	5824985	1
+165	5824985	1
+166	18872233	1
+167	18872233	1
+168	5824985	1
+169	5824985	1
+172	12670680	1
+173	8126244	1
+174	1710116	1
+175	1710116	1
+176	32587939	1
+177	9669099	1
+178	16322674	1
+178	27123743	1
+178	23557241	1
+178	17450673	1
+181	16966326	1
+182	11369420	1
+184	12827098	1
+186	16855829	1
+187	16855829	1
+189	4421578	1
+196	19313533	1
+197	2177022	1
+199	2177022	1
+200	18231807	1
+201	2462673	1
+203	9558539	1
+204	7898952	1
+205	7898952	1
+205	470625	1
+209	32587939	1
+210	13794374	1
+211	13794374	1
+214	13625993	1
+220	19205437	1
+221	19205437	1
+222	19205437	1
+223	2014909	1
+224	6944800	1
+225	6944800	1
+226	6944800	1
+227	26973393	1
+228	4928057	1
+229	56893404	1
+235	4388470	1
+241	2212067	1
+241	10608822	1
+242	2212067	1
+242	10608822	1
+243	8148122	1
+244	21498497	1
+245	8447873	1
+245	3430789	1
+246	8447873	1
+246	3430789	1
+247	13578199	1
+250	1568684	1
+251	1568684	1
+253	37424881	1
+254	37424881	1
+255	5850219	1
+256	5850219	1
+258	22080671	1
+259	8883846	1
+262	14610165	1
+263	11328820	1
+263	30041340	1
+263	14853989	1
+264	11328820	1
+265	2033917	1
+266	22405338	1
+267	5912283	1
+267	31554917	1
+272	11614737	1
+277	14376683	1
+278	14376683	1
+280	25001628	1
+281	4632921	1
+283	1974176	1
+285	5548081	1
+286	4709641	1
+287	4709641	1
+290	15048300	1
+292	15048300	1
+293	10874408	1
+296	4398832	1
+299	39381118	1
+301	3553087	1
+304	14797520	1
+305	14797520	1
+306	7821634	1
+308	7821634	1
+309	7821634	1
+310	6173523	1
+313	6173523	1
+315	3701541	1
+316	712078	1
+317	4506414	1
+323	2014909	1
+325	40349336	1
+326	40349336	1
+330	9505448	1
+331	9505448	1
+332	29023309	1
+333	29023309	1
+334	25079962	1
+335	1780819	1
+336	2097256	1
+337	2097256	1
+339	23349986	1
+340	7098463	1
+341	7098463	1
+342	7873737	1
+342	5884524	1
+345	4394817	1
+346	11902109	1
+347	11902109	1
+349	13497630	1
+351	14658685	1
+352	14658685	1
+355	12800122	1
+355	38380061	1
+356	6144337	1
+357	18111172	1
+358	18111172	1
+361	38587347	1
+363	5386514	1
+364	1550937	1
+365	600437	1
+366	13956305	1
+367	27099731	1
+368	27099731	1
+369	6826100	1
+370	1550937	1
+371	1550937	1
+372	24922825	1
+375	1522647	1
+376	22401061	1
+377	18810195	1
+378	45154987	1
+378	10534299	1
+378	11886686	1
+378	25007443	1
+378	17150648	1
+379	19005293	1
+381	18340282	1
+382	11659421	1
+383	13770184	1
+389	1148122	1
+390	1148122	1
+391	1148122	1
+392	1148122	1
+393	1148122	1
+394	11360768	1
+396	1456068	1
+397	1456068	1
+398	8883846	1
+400	791050	1
+401	5633876	1
+403	1921218	1
+404	1921218	1
+406	6796297	1
+407	9889151	1
+413	6309659	1
+414	6309659	1
+416	6309659	1
+417	6309659	1
+418	16660256	1
+420	9315213	1
+422	11172205	1
+423	8595678	1
+425	33257464	1
+426	16728949	1
+428	16728949	1
+429	36540079	1
+430	28937856	1
+432	8002887	1
+434	9500590	1
+435	9500590	1
+441	2014909	1
+444	10165258	1
+445	10165258	1
+447	2052720	1
+448	2052720	1
+449	12209494	1
+449	3430789	1
+453	4200695	1
+454	4200695	1
+455	12643937	1
+456	30507607	1
+458	597790	1
+461	40096222	1
+463	19736671	1
+466	22544171	1
+469	1410197	1
+470	12685434	1
+472	7185591	1
+472	26330861	1
+472	4414481	1
+473	4373433	1
+474	4373433	1
+479	6325527	1
+480	6325527	1
+481	14706752	1
+482	10991183	1
+483	22703082	1
+484	14637235	1
+485	14637235	1
+486	14637235	1
+487	14637235	1
+488	1780819	1
+489	6625693	1
+490	56893404	1
+492	19583924	1
+493	19583924	1
+494	34873974	1
+495	17077004	1
+498	17077004	1
+499	26064662	1
+500	17930286	1
+504	10883736	1
+505	22703082	1
+506	7433668	1
+509	13980338	1
+515	29564505	1
+523	14803797	1
+524	14803797	1
+526	3863543	1
+529	10546779	1
+529	25413327	1
+529	36651210	1
+530	10546779	1
+530	25413327	1
+530	36651210	1
+530	87610599	1
+531	10546779	1
+531	25413327	1
+531	36651210	1
+537	16056514	1
+541	45154987	1
+541	11886686	1
+541	25007443	1
+542	19688024	1
+545	24221369	1
+547	10648422	1
+548	18199839	1
+550	33499189	1
+553	1471041	1
+555	1049501	1
+557	1049501	1
+559	3475317	1
+562	20101846	1
+563	2867345	1
+564	2867345	1
+565	16120395	1
+566	16120395	1
+568	23418635	1
+570	20333864	1
+571	20333864	1
+572	4447055	1
+573	10300888	1
+574	10300888	1
+576	4468861	1
+579	34139429	1
+580	23460562	1
+582	14260013	1
+584	14260013	1
+585	42291761	1
+588	16999023	1
+590	10984005	1
+591	14682243	1
+592	14682243	1
+594	19675911	1
+595	4824840	1
+600	12258338	1
+601	12258338	1
+602	3701541	1
+603	6540064	1
+606	712078	1
+607	4506414	1
+609	40096222	1
+610	40096222	1
+611	32408470	1
+612	9638032	1
+614	9638032	1
+615	9638032	1
+616	18670	1
+617	18670	1
+618	6836086	1
+620	2565138	1
+621	1642727	1
+622	17000834	1
+624	20033112	1
+625	20033112	1
+626	16355392	1
+631	5468807	1
+632	5172048	1
+633	5172048	1
+635	1686997	1
+638	25649714	1
+640	6503185	1
+642	13619127	1
+643	15535511	1
+645	12810152	1
+646	12810152	1
+647	15041758	1
+648	15041758	1
+650	12789595	1
+651	9433958	1
+652	9433958	1
+653	24384587	1
+654	57574395	1
+655	57574395	1
+657	8533245	1
+658	5293024	1
+661	37204802	1
+662	37204802	1
+663	22080671	1
+665	12580014	1
+666	4469125	1
+667	6493422	1
+668	6493422	1
+668	25148216	1
+669	6493422	1
+669	25148216	1
+670	5573975	1
+671	5573975	1
+672	15635366	1
+673	2095573	1
+676	857189	1
+677	857189	1
+679	13639330	1
+680	9315213	1
+681	9315213	1
+682	9315213	1
+683	9315213	1
+685	4452659	1
+686	4452659	1
+687	4452659	1
+688	4452659	1
+689	22080671	1
+694	1071991	1
+696	16355392	1
+698	22544171	1
+703	4350400	1
+704	14658685	1
+705	22442133	1
+709	22442133	1
+710	22442133	1
+713	18421962	1
+714	18421962	1
+717	17587795	1
+724	5531479	1
+726	7521113	1
+726	36444198	1
+730	13400643	1
+732	34469966	1
+733	34469966	1
+734	4961038	1
+736	5389095	1
+737	16562534	1
+737	6609935	1
+738	16562534	1
+738	6609935	1
+738	33912020	1
+739	4446814	1
+740	23078022	1
+745	11291348	1
+746	11291348	1
+747	11291348	1
+748	11291348	1
+749	13868795	1
+751	19800147	1
+752	19800147	1
+753	1173667	1
+755	17844478	1
+757	17123657	1
+758	14195528	1
+760	1805641	1
+761	10009203	1
+762	4695046	1
+764	7552215	1
+765	7552215	1
+766	7552215	1
+767	2488880	1
+771	15476777	1
+772	24922825	1
+774	32275758	1
+776	32275758	1
+777	32275758	1
+778	13001323	1
+779	13001323	1
+780	8246922	1
+780	24338780	1
+782	8246922	1
+787	4740447	1
+788	4740447	1
+789	15493354	1
+790	15493354	1
+791	15984735	1
+792	3610080	1
+795	8551160	1
+797	8551160	1
+798	8551160	1
+799	5293024	1
+801	22180793	1
+802	22180793	1
+803	22180793	1
+804	22180793	1
+807	36606083	1
+810	13513790	1
+812	19799455	1
+813	33387953	1
+815	8148304	1
+816	8148304	1
+817	17814815	1
+818	17814815	1
+822	15319019	1
+825	15319019	1
+826	4678846	1
+828	4678846	1
+835	15928989	1
+838	15928989	1
+840	15663829	1
+841	15663829	1
+844	17741440	1
+846	22696649	1
+848	14500725	1
+853	24922825	1
+854	12206390	1
+855	8190282	1
+856	43334921	1
+857	43334921	1
+858	1982286	1
+860	16066726	1
+861	16066726	1
+863	20568364	1
+863	16361581	1
+866	37822406	1
+867	14340571	1
+871	195689316	1
+876	195689316	1
+877	313394	1
+881	14803797	1
+883	14803797	1
+884	14803797	1
+885	6477536	1
+886	6477536	1
+890	2097256	1
+891	2097256	1
+893	13509809	1
+894	14724693	1
+895	18750453	1
+896	14338915	1
+897	14338915	1
+898	13106686	1
+898	5572127	1
+899	13106686	1
+899	5572127	1
+900	18678095	1
+901	6540064	1
+902	10648422	1
+908	6923961	1
+909	11254556	1
+910	11254556	1
+912	11254556	1
+916	18037805	1
+917	34071621	1
+919	16422880	1
+923	17077004	1
+925	17077004	1
+926	16390264	1
+927	16390264	1
+928	18174210	1
+929	18174210	1
+930	16056514	1
+933	14711483	1
+934	8563659	1
+935	5483793	1
+938	26231129	1
+939	26231129	1
+940	12258338	1
+941	12258338	1
+942	11527199	1
+944	1642727	1
+945	8428935	1
+945	26112696	1
+945	4463588	1
+945	13083189	1
+946	8428935	1
+946	26112696	1
+946	4463588	1
+946	13083189	1
+949	13578199	1
+951	21414718	1
+952	3355397	1
+953	3355397	1
+954	3355397	1
+955	2078658	1
+955	30507607	1
+959	8780599	1
+962	13931771	1
+962	935538	1
+962	4306711	1
+963	4162857	1
+963	29828242	1
+964	4162857	1
+964	29828242	1
+965	40817021	1
+969	19356271	1
+969	17368516	1
+970	19356271	1
+970	17368516	1
+972	46695481	1
+972	27873158	1
+972	28617573	1
+972	9764256	1
+973	27446873	1
+973	27873158	1
+973	28617573	1
+973	9764256	1
+976	5304891	1
+977	14075252	1
+977	39264456	1
+978	14075252	1
+979	11659421	1
+980	20128547	1
+984	6828370	1
+988	3033830	1
+989	9988425	1
+990	16472469	1
+992	16472469	1
+994	16472469	1
+996	16472469	1
+997	16472469	1
+998	16472469	1
+999	16472469	1
+1000	16472469	1
+1001	5702790	1
+1002	13639330	1
+1003	14332945	1
+1003	4319844	1
+1003	4899981	1
+1004	301838	1
+1004	2734421	1
+1004	3952288	1
+1005	301838	1
+1005	2734421	1
+1005	3952288	1
+1006	4926049	1
+1008	2547636	1
+1009	1982286	1
+1011	9745001	1
+1015	6277638	1
+1016	6277638	1
+1018	11603066	1
+1023	16927286	1
+1025	32408470	1
+1026	3113630	1
+1027	3113630	1
+1028	13923140	1
+1028	11899391	1
+1030	6441369	1
+1031	12486491	1
+1032	6836086	1
+1033	6836086	1
+1034	4547102	1
+1035	4547102	1
+1036	4547102	1
+1037	16287725	1
+1038	16287725	1
+1040	25254425	1
+1040	16626264	1
+1042	17421851	1
+1043	17671145	1
+1044	22500262	1
+1045	22500262	1
+1046	418246	1
+1046	4324278	1
+1046	16712164	1
+1047	14706752	1
+1048	12486491	1
+1050	19878070	1
+1052	18816720	1
+1053	18816720	1
+1054	10072941	1
+1055	13906581	1
+1056	4200695	1
+1058	13027590	1
+1065	20418809	1
+1067	4429668	1
+1068	4429668	1
+1069	4200695	1
+1070	25649714	1
+1072	4824840	1
+1073	4824840	1
+1074	14658685	1
+1075	14658685	1
+1081	5691302	1
+1084	5691302	1
+1085	5691302	1
+1087	39281140	1
+1090	17628888	1
+1091	2603304	1
+1096	29638116	1
+1097	26851674	1
+1098	13552682	1
+1101	3874000	1
+1102	3874000	1
+1103	3898784	1
+1105	6710713	1
+1106	6710713	1
+1109	13770184	1
+1109	8582337	1
+1111	1686881	1
+1112	1686881	1
+1114	12824568	1
+1115	44048701	1
+1118	23351136	1
+1119	5323845	1
+1119	18997216	1
+1119	13907928	1
+1120	5323845	1
+1120	18997216	1
+1120	13907928	1
+1125	21009874	1
+1126	21009874	1
+1127	27466734	1
+1128	33499189	1
+1128	9283422	1
+1133	24142891	1
+1134	33370	1
+1135	33370	1
+1136	33370	1
+1138	6796297	1
+1139	12009265	1
+1141	12009265	1
+1142	5260382	1
+1145	10071552	1
+1148	4828631	1
+1153	7370282	1
+1156	12584053	1
+1157	12584053	1
+1158	12584053	1
+1159	12584053	1
+1161	13048272	1
+1162	15305881	1
+1164	4455466	1
+1165	4455466	1
+1166	9889151	1
+1168	8563659	1
+1169	4319174	1
+1170	18956141	1
+1171	18956141	1
+1173	7370282	1
+1174	31272411	1
+1176	13910150	1
+1177	13910150	1
+1178	31272411	1
+1181	301838	1
+1181	2734421	1
+1181	39128592	1
+1181	3952288	1
+1182	14541844	1
+1183	1967017	1
+1184	16737210	1
+1186	7485455	1
+1188	4394817	1
+1190	30655442	1
+1193	20532591	1
+1195	26283293	1
+1205	5558754	1
+1206	18909530	1
+1208	10284593	1
+1209	4347374	1
+1210	4928282	1
+1211	4928282	1
+1212	6493422	1
+1212	44724517	1
+1214	6493422	1
+1214	14407673	1
+1215	16355392	1
+1218	15635366	1
+1219	9393969	1
+1219	14864285	1
+1220	13023410	1
+1223	5289038	1
+1224	21932050	1
+1224	34016987	1
+1227	25641414	1
+1228	25641414	1
+1229	1676568	1
+1230	13905670	1
+1231	13905670	1
+1234	13905670	1
+1235	17973161	1
+1236	17973161	1
+1237	3654468	1
+1238	3654468	1
+1239	21387297	1
+1239	4427392	1
+1244	18949516	1
+1246	7662395	1
+1247	5114282	1
+1248	7209559	1
+1249	7209559	1
+1253	3321943	1
+1254	16939583	1
+1255	16939583	1
+1257	581832	1
+1258	12040627	1
+1260	24341590	1
+1261	13023410	1
+1263	3981729	1
+1265	37480103	1
+1268	52072815	1
+1269	13900610	1
+1275	27731651	1
+1276	3475317	1
+1284	3578380	1
+1288	4687948	1
+1289	21239672	1
+1291	56893404	1
+1293	43329366	1
+1294	2078658	1
+1294	30507607	1
+1295	21239672	1
+1297	9167230	1
+1300	6421792	1
+1302	12631697	1
+1304	12631697	1
+1305	12631697	1
+1306	6000423	1
+1306	5836	1
+1307	18231807	1
+1308	18231807	1
+1309	18231807	1
+1310	8042158	1
+1311	13763195	1
+1312	24177706	1
+1314	13072112	1
+1314	16237005	1
+1315	13072112	1
+1315	16237005	1
+1322	16284655	1
+1323	19912367	1
+1324	19912367	1
+1325	40476126	1
+1327	24241932	1
+1327	22194407	1
+1328	3475317	1
+1330	14075252	1
+1331	14075252	1
+1333	1649738	1
+1334	13923140	1
+1334	13940200	1
+1334	11899391	1
+1340	15482274	1
+1341	15482274	1
+1342	8148122	1
+1345	9559146	1
+1346	9505402	1
+1347	19005293	1
+1348	19005293	1
+1349	5377642	1
+1350	5377642	1
+1351	28369117	1
+1353	18816720	1
+1355	5256564	1
+1356	13764090	1
+1360	11614737	1
+1361	15488881	1
+1361	15058155	1
+1364	8290953	1
+1366	4406819	1
+1367	2425364	1
+1371	16256507	1
+1372	21003930	1
+1373	21003930	1
+1374	21993510	1
+1375	21993510	1
+1376	3944632	1
+1378	2488880	1
+1380	16322674	1
+1380	23557241	1
+1380	17450673	1
+1381	13481880	1
+1383	17755060	1
+1386	306006	1
+1387	9669099	1
+1390	2890952	1
+1391	6766459	1
+1392	6766459	1
+1393	2000038	1
+1393	12440953	1
+1394	2251426	1
+1397	17717391	1
+1398	17717391	1
+1400	14706752	1
+1401	5185871	1
+1402	8126244	1
+1403	33370	1
+1403	38355793	1
+1404	33370	1
+1404	38355793	1
+1405	10504681	1
+1406	2617858	1
+1407	8087082	1
+1407	29863668	1

data/scifact/queries.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,16 @@

+services:
+  semantic-search:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: semantic-search
+    ports:
+      - "7860:7860"
+    environment:
+      NLTK_DATA: /usr/local/share/nltk_data
+    volumes:
+      - ./config.yaml:/app/config.yaml:ro
+      - ./data:/app/data
+      - ./results:/app/results
+      - ./documents:/documents
+    restart: unless-stopped

evaluation/__pycache__/dataset_loader.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (5.68 kB). View file

evaluation/__pycache__/evaluator.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (8.8 kB). View file

evaluation/__pycache__/indexer_bridge.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (4.73 kB). View file

evaluation/__pycache__/query_runner.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (5.62 kB). View file

evaluation/__pycache__/run_eval.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (8.4 kB). View file

evaluation/dataset_loader.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# evaluation/dataset_loader.py
+import json
+import csv
+import os
+class DatasetLoader:
+    """
+    Loads BEIR-format datasets (SciFact, NFCorpus, etc.)
+    BEIR format:
+        corpus.jsonl  — {_id, title, text}
+        queries.jsonl — {_id, text}
+        qrels/*.tsv   — query_id, doc_id, relevance_score
+    Relevance scales:
+        SciFact  — binary (0 or 1)
+        NFCorpus — graded (0, 1, 2, 3)  → we keep anything >= 1
+    """
+    def __init__(self, dataset_path: str):
+        self.dataset_path = dataset_path
+        self.corpus_path  = os.path.join(dataset_path, "corpus.jsonl")
+        self.queries_path = os.path.join(dataset_path, "queries.jsonl")
+        # qrels path — try test.tsv first, fallback to dev.tsv
+        # NFCorpus ships with dev.tsv instead of test.tsv
+        test_path = os.path.join(dataset_path, "qrels", "test.tsv")
+        dev_path  = os.path.join(dataset_path, "qrels", "dev.tsv")
+        if os.path.exists(test_path):
+            self.qrels_path = test_path
+        elif os.path.exists(dev_path):
+            self.qrels_path = dev_path
+            print(f"[INFO] test.tsv not found, using dev.tsv for qrels")
+        else:
+            raise FileNotFoundError(
+                f"No qrels file found in {os.path.join(dataset_path, 'qrels')} — "
+                f"expected test.tsv or dev.tsv"
+            )
+    def load_corpus(self) -> dict:
+        """
+        Load all documents from corpus.jsonl.
+        Returns:
+            dict — {doc_id: {"title": str, "text": str}}
+        """
+        corpus = {}
+        with open(self.corpus_path, "r", encoding="utf-8") as f:
+            for line in f:
+                doc    = json.loads(line.strip())
+                doc_id = str(doc["_id"])
+                corpus[doc_id] = {
+                    "title": doc.get("title", ""),
+                    "text":  doc.get("text",  ""),
+                }
+        print(f"Loaded {len(corpus)} documents from corpus")
+        return corpus
+    def load_queries(self) -> dict:
+        """
+        Load test queries from queries.jsonl.
+        Returns:
+            dict — {query_id: query_text}
+        """
+        queries = {}
+        with open(self.queries_path, "r", encoding="utf-8") as f:
+            for line in f:
+                q = json.loads(line.strip())
+                queries[str(q["_id"])] = q["text"]
+        print(f"Loaded {len(queries)} queries")
+        return queries
+    def load_qrels(self) -> dict:
+        """
+        Load relevance judgments from qrels file.
+        Handles both:
+            SciFact  — binary relevance (0 or 1)
+            NFCorpus — graded relevance (0, 1, 2, 3) → keep score >= 1
+        Returns:
+            dict — {query_id: {doc_id: relevance_score}}
+        """
+        qrels = {}
+        with open(self.qrels_path, "r", encoding="utf-8") as f:
+            reader = csv.reader(f, delimiter="\t")
+            next(reader)  # skip header: query-id  corpus-id  score
+            for row in reader:
+                if len(row) < 3:
+                    continue
+                query_id = str(row[0])
+                doc_id   = str(row[1])
+                score    = int(row[2])
+                # skip completely irrelevant docs
+                # this handles both binary (0/1) and graded (0/1/2/3)
+                if score < 1:
+                    continue
+                if query_id not in qrels:
+                    qrels[query_id] = {}
+                qrels[query_id][doc_id] = score
+        print(f"Loaded qrels for {len(qrels)} queries "
+              f"from {os.path.basename(self.qrels_path)}")
+        return qrels
+if __name__ == "__main__":
+    import sys
+    # pass dataset path as argument or default to scifact
+    # usage: python -m evaluation.dataset_loader data/nfcorpus
+    path   = sys.argv[1] if len(sys.argv) > 1 else "data/scifact"
+    loader = DatasetLoader(path)
+    corpus  = loader.load_corpus()
+    queries = loader.load_queries()
+    qrels   = loader.load_qrels()
+    # show a sample
+    sample_qid = list(queries.keys())[0]
+    print(f"\nSample query  [{sample_qid}]: {queries[sample_qid]}")
+    print(f"Relevant docs : {qrels.get(sample_qid, {})}")

evaluation/evaluator.py ADDED Viewed

	@@ -0,0 +1,197 @@

+# evaluation/evaluator.py
+import math
+from collections import defaultdict
+class Evaluator:
+    """
+    Computes standard IR evaluation metrics by comparing your
+    system's ranked results against the ground-truth qrels.
+    Metrics implemented:
+        NDCG@k   — Normalized Discounted Cumulative Gain
+                   Measures ranking quality; rewards relevant docs appearing early
+                   Handles graded relevance (NFCorpus 0-3) and binary (SciFact 0-1)
+        MAP@k    — Mean Average Precision
+                   Average of precision computed at each relevant doc position
+        Recall@k — Fraction of relevant docs found in top-k
+        P@k      — Precision at k (fraction of top-k that are relevant)
+        MRR      — Mean Reciprocal Rank (position of first relevant result)
+    """
+    def ndcg_at_k(self, ranked: list, relevant: dict, k: int) -> float:
+        """
+        NDCG@k — the most important metric for ranked retrieval.
+        Score of 1.0 = perfect ranking, 0.0 = no relevant docs found.
+        Works for both:
+            - Binary relevance (SciFact): scores are 0 or 1
+            - Graded relevance (NFCorpus): scores are 0, 1, 2, or 3
+        """
+        dcg = 0.0
+        for i, (doc_id, _) in enumerate(ranked[:k]):
+            rel = relevant.get(doc_id, 0)
+            if rel > 0:
+                dcg += rel / math.log2(i + 2)   # i+2 because log2(1) = 0
+        # Ideal DCG — best possible ranking given the relevant docs
+        ideal_rels = sorted(relevant.values(), reverse=True)[:k]
+        idcg = sum(
+            rel / math.log2(i + 2)
+            for i, rel in enumerate(ideal_rels)
+            if rel > 0
+        )
+        return dcg / idcg if idcg > 0 else 0.0
+    def map_at_k(self, ranked: list, relevant: dict, k: int) -> float:
+        """
+        MAP@k — average precision across all relevant document positions.
+        For graded relevance (NFCorpus), any score >= 1 counts as relevant.
+        """
+        num_relevant  = 0
+        sum_precision = 0.0
+        for i, (doc_id, _) in enumerate(ranked[:k]):
+            if relevant.get(doc_id, 0) > 0:
+                num_relevant  += 1
+                sum_precision += num_relevant / (i + 1)
+        total_relevant = sum(1 for v in relevant.values() if v > 0)
+        if total_relevant == 0:
+            return 0.0
+        return sum_precision / total_relevant
+    def recall_at_k(self, ranked: list, relevant: dict, k: int) -> float:
+        """
+        Recall@k — what fraction of all relevant docs appear in top-k.
+        For graded relevance, any score >= 1 counts as relevant.
+        """
+        total_relevant = sum(1 for v in relevant.values() if v > 0)
+        if total_relevant == 0:
+            return 0.0
+        found = sum(
+            1 for doc_id, _ in ranked[:k]
+            if relevant.get(doc_id, 0) > 0
+        )
+        return found / total_relevant
+    def precision_at_k(self, ranked: list, relevant: dict, k: int) -> float:
+        """
+        P@k — fraction of the top-k results that are relevant.
+        For graded relevance, any score >= 1 counts as relevant.
+        """
+        if k == 0:
+            return 0.0
+        hits = sum(
+            1 for doc_id, _ in ranked[:k]
+            if relevant.get(doc_id, 0) > 0
+        )
+        return hits / k
+    def mrr(self, ranked: list, relevant: dict) -> float:
+        """
+        MRR — reciprocal of the rank of the first relevant result.
+        Score of 1.0 = first result is relevant.
+        For graded relevance, any score >= 1 counts as relevant.
+        """
+        for i, (doc_id, _) in enumerate(ranked):
+            if relevant.get(doc_id, 0) > 0:
+                return 1.0 / (i + 1)
+        return 0.0
+    def evaluate(
+        self,
+        all_results: dict,
+        qrels: dict,
+        k_values: list = None,
+    ) -> dict:
+        """
+        Compute all metrics across all queries and average them.
+        Args:
+            all_results — {query_id: [(doc_id, score), ...]}  from QueryRunner
+            qrels       — {query_id: {doc_id: relevance}}     from DatasetLoader
+            k_values    — list of k values e.g. [1, 5, 10, 100]
+        Returns:
+            dict — {
+                "NDCG@10": 0.42,
+                "MAP@100": 0.38,
+                "Recall@100": 0.71,
+                "P@10": 0.15,
+                "MRR": 0.55,
+                "num_queries": 300,
+                "queries_with_results": 298,
+                "queries_with_no_qrels": 2,
+            }
+        """
+        if k_values is None:
+            k_values = [1, 5, 10, 100]
+        scores               = defaultdict(list)
+        num_queries          = 0
+        queries_with_results = 0
+        queries_no_qrels     = 0
+        for query_id, ranked in all_results.items():
+            relevant = qrels.get(query_id, {})
+            # skip queries that have no ground truth at all
+            if not relevant:
+                queries_no_qrels += 1
+                continue
+            num_queries += 1
+            if ranked:
+                queries_with_results += 1
+            for k in k_values:
+                scores[f"NDCG@{k}"].append(self.ndcg_at_k(ranked, relevant, k))
+                scores[f"MAP@{k}"].append(self.map_at_k(ranked, relevant, k))
+                scores[f"Recall@{k}"].append(self.recall_at_k(ranked, relevant, k))
+                scores[f"P@{k}"].append(self.precision_at_k(ranked, relevant, k))
+            scores["MRR"].append(self.mrr(ranked, relevant))
+        # Print diagnostic so you can see if queries matched correctly
+        print(f"  Evaluated {num_queries} queries  |  "
+              f"{queries_with_results} had results  |  "
+              f"{queries_no_qrels} had no qrels (skipped)")
+        # Average across all queries
+        summary = {
+            metric: round(sum(vals) / len(vals), 4) if vals else 0.0
+            for metric, vals in scores.items()
+        }
+        summary["num_queries"]           = num_queries
+        summary["queries_with_results"]  = queries_with_results
+        summary["queries_with_no_qrels"] = queries_no_qrels
+        return summary
+if __name__ == "__main__":
+    # Quick sanity check with toy data
+    evaluator = Evaluator()
+    # Fake ranked results — doc_1 is relevant, doc_2 is not
+    fake_results = {
+        "q1": [("doc_1", 0.95), ("doc_2", 0.80), ("doc_3", 0.60)],
+        "q2": [("doc_4", 0.70), ("doc_1", 0.50)],
+    }
+    fake_qrels = {
+        "q1": {"doc_1": 1},
+        "q2": {"doc_4": 1, "doc_5": 1},
+    }
+    metrics = evaluator.evaluate(fake_results, fake_qrels, k_values=[1, 5, 10])
+    print("\nSanity check metrics:")
+    for k, v in metrics.items():
+        print(f"  {k}: {v}")

evaluation/indexer_bridge.py ADDED Viewed

	@@ -0,0 +1,94 @@

+# evaluation/indexer_bridge.py
+import numpy as np
+from indexer.chunker import Chunker
+from indexer.embedder import Embedder
+from indexer.store import Store
+class IndexerBridge:
+    """
+    Feeds the BEIR corpus directly into your existing indexing pipeline.
+    The corpus documents are NOT real files on disk — they come from JSONL.
+    So we bypass the Crawler/Extractor and inject text directly into
+    Chunker → Embedder → Store.
+    Each document gets a fake filepath: "{dataset_name}://{doc_id}"
+    This lets the Store treat them like any other indexed file,
+    and the Evaluator can later match doc_id back from results.
+    """
+    def __init__(self, config_path: str = "config.yaml"):
+        self.chunker  = Chunker(chunk_size=500, overlap=50)
+        self.embedder = Embedder(config_path)
+        self.store    = Store(config_path)
+    def index_corpus(self, corpus: dict, batch_size: int = 64, dataset_name: str = "dataset"):
+        """
+        Index the entire corpus into FAISS + SQLite.
+        Args:
+            corpus       — {doc_id: {"title": str, "text": str}}
+            batch_size   — number of chunks to embed at once (memory control)
+            dataset_name — used as prefix for fake file paths e.g. "scifact", "nfcorpus"
+        """
+        doc_ids = list(corpus.keys())
+        total   = len(doc_ids)
+        print(f"Indexing {total} documents from [{dataset_name}]...")
+        # Clear previous entries for THIS dataset only
+        existing_hashes  = self.store.load_hashes()
+        prefix           = f"{dataset_name}://"
+        existing_entries = [fp for fp in existing_hashes if fp.startswith(prefix)]
+        for fp in existing_entries:
+            self.store.remove_file_chunks(fp)
+        if existing_entries:
+            print(f"Cleared {len(existing_entries)} previously indexed [{dataset_name}] documents")
+        chunk_buffer = []
+        text_buffer  = []
+        def flush(chunk_buffer, text_buffer):
+            if not chunk_buffer:
+                return
+            embeddings = self.embedder.embed_chunks(text_buffer)
+            embeddings = np.array(embeddings, dtype="float32")
+            self.store.add_chunks(chunk_buffer, embeddings)
+        for i, doc_id in enumerate(doc_ids, 1):
+            doc       = corpus[doc_id]
+            full_text = f"{doc['title']} {doc['text']}".strip()
+            if not full_text:
+                continue
+            fake_path = f"{prefix}{doc_id}"
+            chunks    = self.chunker.chunk_file(full_text, fake_path)
+            for chunk in chunks:
+                chunk_buffer.append(chunk)
+                text_buffer.append(chunk["text"])
+            self.store.save_file_info(fake_path, doc_id, len(chunks))
+            if len(chunk_buffer) >= batch_size:
+                flush(chunk_buffer, text_buffer)
+                chunk_buffer.clear()
+                text_buffer.clear()
+            if i % 500 == 0:
+                print(f"  Indexed {i}/{total}...")
+        # flush any remaining chunks
+        flush(chunk_buffer, text_buffer)
+        print(f"Done. Total vectors: {self.store.get_total_vectors()}")
+if __name__ == "__main__":
+    from evaluation.dataset_loader import DatasetLoader
+    loader = DatasetLoader("data/scifact")
+    corpus = loader.load_corpus()
+    bridge = IndexerBridge()
+    bridge.index_corpus(corpus, batch_size=64, dataset_name="scifact")

evaluation/query_runner.py ADDED Viewed

	@@ -0,0 +1,128 @@

+# evaluation/query_runner.py
+from searcher.search_engine import SearchEngine
+class QueryRunner:
+    """
+    Runs all evaluation queries through your SearchEngine and collects
+    the ranked result lists for scoring.
+    The results are formatted exactly as the Evaluator expects:
+        {query_id: [(doc_id, score), ...]}   ranked best-first
+    """
+    def __init__(self, config_path: str = "config.yaml"):
+        self.engine = SearchEngine(config_path)
+    def _extract_doc_id(self, filepath: str) -> str:
+        """
+        Strip dataset prefix from fake filepath so it matches qrels doc_ids.
+        Examples:
+            "scifact://12345"    →  "12345"
+            "nfcorpus://MED-10"  →  "MED-10"
+            "/real/file.pdf"     →  "/real/file.pdf"  (real files unchanged)
+        This is critical — without stripping, doc_ids like "nfcorpus://MED-10"
+        will never match qrels keys like "MED-10" and all scores will be 0.0
+        """
+        if "://" in filepath:
+            return filepath.split("://", 1)[1]
+        return filepath
+    def run(
+        self,
+        queries: dict,
+        top_k: int = 100,
+        mode: str = "full",
+    ) -> dict:
+        """
+        Run all queries and return ranked results.
+        Args:
+            queries — {query_id: query_text}
+            top_k   — number of results per query (use 100 for eval)
+            mode    — pipeline variant to test:
+                        "dense"   → dense retrieval only
+                        "sparse"  → BM25 only
+                        "hybrid"  → dense + BM25 + RRF (no reranker)
+                        "full"    → complete pipeline with reranker
+        Returns:
+            dict — {query_id: [(doc_id, rank_score), ...]}
+        """
+        results = {}
+        total   = len(queries)
+        for i, (query_id, query_text) in enumerate(queries.items(), 1):
+            if i % 50 == 0:
+                print(f"  Running query {i}/{total}...")
+            try:
+                if mode == "dense":
+                    raw    = self.engine.dense_retriever.retrieve(query_text, top_k=top_k)
+                    ranked = [
+                        (self._extract_doc_id(r["filepath"]), -r["dense_score"])
+                        for r in raw
+                    ]
+                elif mode == "sparse":
+                    raw    = self.engine.sparse_retriever.retrieve(query_text, top_k=top_k)
+                    ranked = [
+                        (self._extract_doc_id(r["filepath"]), r["sparse_score"])
+                        for r in raw
+                    ]
+                elif mode == "hybrid":
+                    dense_raw  = self.engine.dense_retriever.retrieve(query_text, top_k=top_k)
+                    sparse_raw = self.engine.sparse_retriever.retrieve(query_text, top_k=top_k)
+                    fused      = self.engine.fusion_ranker.fuse(dense_raw, sparse_raw, top_k=top_k)
+                    ranked     = [
+                        (self._extract_doc_id(r["filepath"]), r["rrf_score"])
+                        for r in fused
+                    ]
+                else:  # full pipeline
+                    output = self.engine.search(query_text, top_k=top_k)
+                    ranked = [
+                        (
+                            self._extract_doc_id(r["filepath"]),
+                            r.get("rerank_score", r.get("rrf_score", 0))
+                        )
+                        for r in output["results"]
+                    ]
+                # Deduplicate by doc_id
+                # multiple chunks from same doc → keep only the best score
+                seen = {}
+                for doc_id, score in ranked:
+                    if doc_id not in seen or score > seen[doc_id]:
+                        seen[doc_id] = score
+                results[query_id] = sorted(
+                    seen.items(),
+                    key=lambda x: x[1],
+                    reverse=True
+                )
+            except Exception as e:
+                print(f"  Error on query {query_id}: {e}")
+                results[query_id] = []
+        return results
+if __name__ == "__main__":
+    from evaluation.dataset_loader import DatasetLoader
+    loader  = DatasetLoader("data/scifact")
+    queries = loader.load_queries()
+    runner  = QueryRunner()
+    results = runner.run(queries, top_k=10, mode="full")
+    sample_qid = list(results.keys())[0]
+    print(f"\nQuery {sample_qid} top results:")
+    for doc_id, score in results[sample_qid][:5]:
+        print(f"  doc {doc_id}  score={score:.4f}")

evaluation/run_eval.py ADDED Viewed

	@@ -0,0 +1,170 @@

+# evaluation/run_eval.py
+import argparse
+import json
+import os
+import time
+from evaluation.dataset_loader import DatasetLoader
+from evaluation.indexer_bridge  import IndexerBridge
+from evaluation.query_runner    import QueryRunner
+from evaluation.evaluator       import Evaluator
+MODES           = ["dense", "sparse", "hybrid", "full"]
+DISPLAY_METRICS = ["NDCG@10", "MAP@100", "Recall@100", "P@10", "MRR"]
+# All supported datasets — add more here later if needed
+AVAILABLE_DATASETS = {
+    "scifact":  "data/scifact",
+    "nfcorpus": "data/nfcorpus",
+}
+def print_table(results: dict, title: str = ""):
+    col_w  = 14
+    header = f"{'Mode':<10}" + "".join(f"{m:>{col_w}}" for m in DISPLAY_METRICS)
+    if title:
+        print(f"\n  {title}")
+    print("=" * len(header))
+    print(header)
+    print("-" * len(header))
+    for mode, metrics in results.items():
+        row = f"{mode:<10}"
+        for m in DISPLAY_METRICS:
+            val = metrics.get(m, 0.0)
+            row += f"{val:>{col_w}.4f}"
+        print(row)
+    print("=" * len(header))
+def print_comparison_table(all_dataset_results: dict):
+    """
+    Print a single comparison table across all datasets.
+    Shows NDCG@10 and MRR side by side for each dataset.
+    """
+    datasets = list(all_dataset_results.keys())
+    modes    = list(list(all_dataset_results.values())[0].keys())
+    print("\n" + "=" * 80)
+    print("CROSS-DATASET COMPARISON — full pipeline mode")
+    print("=" * 80)
+    # Header
+    header = f"{'Dataset':<14}" + "".join(
+        f"{'NDCG@10':>12}{'MRR':>10}{'MAP@100':>10}"
+    )
+    print(f"{'Dataset':<14}{'NDCG@10':>12}{'MRR':>10}{'MAP@100':>10}")
+    print("-" * 46)
+    for dataset, mode_results in all_dataset_results.items():
+        # use "full" mode results for comparison, fallback to first mode
+        metrics = mode_results.get("full", list(mode_results.values())[0])
+        ndcg    = metrics.get("NDCG@10", 0.0)
+        mrr     = metrics.get("MRR", 0.0)
+        map_    = metrics.get("MAP@100", 0.0)
+        print(f"{dataset:<14}{ndcg:>12.4f}{mrr:>10.4f}{map_:>10.4f}")
+    print("=" * 46)
+def run_single_dataset(dataset_name: str, dataset_path: str, args) -> dict:
+    """Run full eval pipeline for one dataset. Returns mode→metrics dict."""
+    print(f"\n{'#'*60}")
+    print(f"  DATASET: {dataset_name.upper()}")
+    print(f"{'#'*60}")
+    # 1 — load
+    print("\n[1/4] Loading dataset...")
+    loader  = DatasetLoader(dataset_path)
+    corpus  = loader.load_corpus()
+    queries = loader.load_queries()
+    qrels   = loader.load_qrels()
+    # 2 — index
+    if not args.skip_index:
+        print("\n[2/4] Indexing corpus...")
+        bridge = IndexerBridge(args.config)
+        # pass dataset_name so fake paths are e.g. nfcorpus://doc_id
+        bridge.index_corpus(corpus, batch_size=64, dataset_name=dataset_name)
+    else:
+        print("\n[2/4] Skipping indexing (--skip-index)")
+    # 3 — run queries
+    print("\n[3/4] Running queries...")
+    runner    = QueryRunner(args.config)
+    evaluator = Evaluator()
+    modes_to_run     = MODES if args.mode == "all" else [args.mode]
+    all_mode_results = {}
+    for mode in modes_to_run:
+        print(f"\n  Mode: {mode}")
+        t0             = time.time()
+        ranked_results = runner.run(queries, top_k=args.top_k, mode=mode)
+        elapsed        = time.time() - t0
+        metrics                  = evaluator.evaluate(ranked_results, qrels, k_values=[1, 5, 10, 100])
+        metrics["query_time_s"]  = round(elapsed, 2)
+        all_mode_results[mode]   = metrics
+        print(f"  NDCG@10={metrics.get('NDCG@10', 0):.4f}  "
+              f"MAP@100={metrics.get('MAP@100', 0):.4f}  "
+              f"MRR={metrics.get('MRR', 0):.4f}")
+    # 4 — per-dataset table
+    print(f"\n[4/4] Results for {dataset_name.upper()}")
+    print_table(all_mode_results, title=f"EVALUATION RESULTS — {dataset_name} (pytrec_eval)")
+    return all_mode_results
+def main():
+    parser = argparse.ArgumentParser(description="Evaluate semantic search on BEIR datasets")
+    parser.add_argument(
+        "--datasets",
+        nargs="+",
+        default=["scifact", "nfcorpus"],
+        choices=list(AVAILABLE_DATASETS.keys()),
+        help="Which datasets to evaluate. e.g. --datasets scifact nfcorpus"
+    )
+    parser.add_argument("--config",     default="config.yaml")
+    parser.add_argument("--top-k",      default=100, type=int)
+    parser.add_argument("--skip-index", action="store_true")
+    parser.add_argument("--mode",       default="all",
+                        help="dense | sparse | hybrid | full | all")
+    args = parser.parse_args()
+    os.makedirs("results", exist_ok=True)
+    all_dataset_results = {}
+    for dataset_name in args.datasets:
+        dataset_path = AVAILABLE_DATASETS[dataset_name]
+        if not os.path.exists(dataset_path):
+            print(f"\n[WARNING] Dataset folder not found: {dataset_path} — skipping {dataset_name}")
+            continue
+        results = run_single_dataset(dataset_name, dataset_path, args)
+        all_dataset_results[dataset_name] = results
+        # save per-dataset report
+        report_path = f"results/eval_{dataset_name}.json"
+        with open(report_path, "w") as f:
+            json.dump(results, f, indent=2)
+        print(f"  Saved → {report_path}")
+    # cross-dataset comparison (only if more than one dataset ran)
+    if len(all_dataset_results) > 1:
+        print_comparison_table(all_dataset_results)
+    # save combined report
+    combined_path = "results/eval_all.json"
+    with open(combined_path, "w") as f:
+        json.dump(all_dataset_results, f, indent=2)
+    print(f"\nCombined report saved → {combined_path}")
+if __name__ == "__main__":
+    main()

indexer/__pycache__/chunker.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (5.35 kB). View file

indexer/__pycache__/crawler.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (4.8 kB). View file

indexer/__pycache__/embedder.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (4.39 kB). View file

indexer/__pycache__/extractor.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (5.84 kB). View file

indexer/__pycache__/pipeline.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (6.86 kB). View file

indexer/__pycache__/store.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (11.1 kB). View file

indexer/__pycache__/watcher.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (8.89 kB). View file

indexer/chunker.py ADDED Viewed

	@@ -0,0 +1,135 @@

+# indexer/chunker.py
+class Chunker:
+    """
+    Splits extracted text into overlapping chunks using a sliding window.
+    Each chunk will later be embedded as a separate vector.
+    Why chunk at all?
+    - Embedding models have a token limit (typically 256-512 tokens)
+    - A 50-page PDF as one embedding would lose detail
+    - Small chunks let us pinpoint the EXACT passage that matches a query
+    Why overlap?
+    - A sentence at the boundary might get cut in half
+    - Overlap ensures every sentence appears fully in at least one chunk
+    """
+    def __init__(self, chunk_size=500, overlap=50):
+        """
+        Args:
+            chunk_size (int) — max number of words per chunk
+            overlap (int) — number of words shared between consecutive chunks
+        TODO:
+        - Store chunk_size and overlap as instance variables
+        - Validate that overlap is less than chunk_size
+            (if overlap >= chunk_size, chunks would never advance forward)
+        """
+        self.chunk_size = chunk_size
+        self.overlap = overlap
+        if self.overlap >= self.chunk_size:
+            raise ValueError("Overlap must be smaller than chunk_size")
+    def chunk_text(self, text):
+        """
+        Split a text string into overlapping chunks based on word count.
+        Args:
+            text (str) — the full extracted text from a file
+        Returns:
+            list[str] — list of text chunks
+        Example with chunk_size=5, overlap=2:
+            text = "The quick brown fox jumps over the lazy dog today"
+            words = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog", "today"]
+            Chunk 0: words[0:5]  → "The quick brown fox jumps"
+            Chunk 1: words[3:8]  → "fox jumps over the lazy"      (step = 5-2 = 3)
+            Chunk 2: words[6:11] → "the lazy dog today"           (step = 3 again)
+        TODO:
+        - Split the text into a list of words using .split()
+        - If the word list is empty, return an empty list
+        - Calculate step size: step = chunk_size - overlap
+        - Use a loop starting at 0, stepping by 'step', up to len(words)
+        - At each position, take words[i : i + chunk_size]
+        - Join each slice back into a string with " ".join()
+        - Return the list of chunk strings
+        HINT:
+            words = text.split()
+            step = self.chunk_size - self.overlap
+            for i in range(0, len(words), step):
+                chunk_words = words[i : i + self.chunk_size]
+        """
+        words = text.split()
+        if not words:
+            return []
+        step = self.chunk_size - self.overlap
+        chunks = []
+        for i in range(0, len(words), step):
+            chunk_words = words[i:i+self.chunk_size]
+            chunks.append(" ".join(chunk_words))
+        return chunks
+    def chunk_file(self, text, filepath):
+        """
+        Chunk a file's text and attach metadata to each chunk.
+        This metadata will be stored in SQLite alongside the vectors.
+        Args:
+            text (str) — extracted text content
+            filepath (str) — source file path (for metadata)
+        Returns:
+            list[dict] — each dict contains:
+                {
+                    "text": "the chunk text...",
+                    "filepath": "/path/to/file.pdf",
+                    "chunk_index": 0,     # position in the file
+                    "total_chunks": 5     # how many chunks this file produced
+                }
+        TODO:
+        - Call self.chunk_text(text) to get the list of chunk strings
+        - Build a list of dicts, one per chunk, with the fields shown above
+        - chunk_index starts at 0
+        HINT:
+            chunks = self.chunk_text(text)
+            for i, chunk in enumerate(chunks):
+                # build the dict here
+        """
+        chunks = self.chunk_text(text)
+        results = []
+        for i, chunk in enumerate(chunks):
+            results.append({
+                "text": chunk,
+                "filepath": filepath,
+                "chunk_index": i,
+            })
+        return results
+# --- Test it ---
+if __name__ == "__main__":
+    chunker = Chunker(chunk_size=10, overlap=3)
+    sample = (
+        "The quick brown fox jumps over the lazy dog. "
+        "Semantic search finds files by meaning not just keywords. "
+        "This is a test of the chunking system for our project."
+    )
+    chunks = chunker.chunk_text(sample)
+    print(f"Text has {len(sample.split())} words → {len(chunks)} chunks\n")
+    for i, chunk in enumerate(chunks):
+        print(f"Chunk {i}: {chunk}")
+    print("\n--- With metadata ---")
+    results = chunker.chunk_file(sample, "/test/sample.txt")
+    for r in results:
+        print(f"[{r['chunk_index']}] {r['text'][:60]}...")

indexer/crawler.py ADDED Viewed

	@@ -0,0 +1,102 @@

+# indexer/crawler.py
+import os
+import hashlib
+import yaml
+class Crawler:
+    """
+    Discovers files in configured directories and tracks which ones
+    are new or modified using SHA-256 hashing.
+    """
+    def __init__(self, config_path="config.yaml"):
+        """
+        Load the config file and store the settings as instance variables.
+        """
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        self.watch_paths = config["watch_paths"]
+        self.include_extensions = config["include_extensions"]
+        self.skip_directories = config["skip_directories"]
+        self.data_dir = config["data_dir"]
+    def discover_files(self):
+        """
+        Walk through all watch_paths recursively and collect every file
+        that matches include_extensions, skipping skip_directories.
+        Returns:
+            list[str] — list of absolute file paths
+        """
+        results=[]
+        for path in self.watch_paths:
+            for dirpath, dirnames, filenames in os.walk(path):
+                for filename in filenames:
+                    if os.path.splitext(filename)[1] in self.include_extensions:
+                        full_path = os.path.join(dirpath, filename)
+                        results.append(full_path)
+                dirnames[:] = [d for d in dirnames if d not in self.skip_directories]
+        return results
+    def compute_hash(self, filepath):
+        """
+        Compute the SHA-256 hash of a file's contents.
+        Args:
+            filepath (str) — absolute path to the file
+        Returns:
+            str — hex string of the SHA-256 hash
+        """
+        hasher = hashlib.sha256()
+        with open(filepath, "rb") as f:
+            while chunk := f.read(8192):
+                hasher.update(chunk)
+        return hasher.hexdigest()
+    def get_new_and_modified(self, known_hashes=None):
+        """
+        Compare discovered files against previously known hashes to find
+        which files are new or have been modified since last run.
+        Args:
+            known_hashes (dict) — {filepath: hash} from previous run
+                                   Pass None or {} on first run.
+        Returns:
+            tuple: (files_to_process, current_hashes, deleted_files)
+            - files_to_process: list[str] — paths that are new or changed
+            - current_hashes: dict — {filepath: hash} for ALL current files
+            - deleted files: list[str] — files that were deleted
+        """
+        if known_hashes is None:
+            known_hashes = {}
+        current_files = self.discover_files()
+        files_to_process = []
+        current_hashes = {}
+        for file in current_files:
+            file_hash = self.compute_hash(file)
+            if file not in known_hashes or file_hash != known_hashes[file]:
+                files_to_process.append(file)
+            current_hashes[file] = file_hash
+        deleted_files = set(known_hashes.keys()) - set(current_hashes.keys())
+        return files_to_process, current_hashes, deleted_files
+# --- Test it ---
+if __name__ == "__main__":
+    crawler = Crawler()
+    files = crawler.discover_files()
+    print(f"Found {len(files)} files:")
+    for f in files:
+        print(f"  {f}")
+    print("\n--- Checking for new/modified ---")
+    to_process, hashes = crawler.get_new_and_modified()
+    print(f"{len(to_process)} files to process")

indexer/embedder.py ADDED Viewed

	@@ -0,0 +1,111 @@

+# indexer/embedder.py
+import yaml
+from sentence_transformers import SentenceTransformer
+class Embedder:
+    """
+    Loads a sentence-transformer model and converts text chunks
+    into dense vector embeddings.
+    Model upgrade: all-MiniLM-L6-v2  →  BAAI/bge-small-en-v1.5
+    Why BGE over MiniLM:
+        - MiniLM   : general purpose, fast, 384-dim, NDCG ~0.65 on SciFact
+        - BGE-small: retrieval-specific training, 384-dim, NDCG ~0.72 on SciFact
+        - Same dimension (384), same API — only the model name changes
+        - BGE uses a special instruction prefix for queries (not for documents)
+          "Represent this sentence for searching relevant passages: {query}"
+          This is handled automatically in embed_single()
+    """
+    # BGE query instruction prefix — improves retrieval accuracy
+    # Applied to queries only, NOT to document chunks during indexing
+    BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "
+    def __init__(self, config_path="config.yaml"):
+        """
+        Load the config and initialize the embedding model.
+        Args:
+            config_path (str) — path to config.yaml
+        """
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        model_name     = config["embedding_model"]
+        self.model_name = model_name
+        # detect if we are using a BGE model
+        # BGE models need a special prefix on queries (not on documents)
+        self.is_bge = "bge" in model_name.lower()
+        print(f"Loading embedding model '{model_name}'...")
+        self.model = SentenceTransformer(model_name)
+        print(f"Model loaded — BGE mode: {self.is_bge}")
+    def embed_chunks(self, chunks):
+        """
+        Convert a list of text chunks into dense vector embeddings.
+        Used during INDEXING — no query prefix applied here.
+        Args:
+            chunks (list[str]) — list of text strings to embed
+        Returns:
+            numpy.ndarray — shape (num_chunks, embedding_dim)
+                            384 dimensions for both MiniLM and BGE-small
+        """
+        embeddings = self.model.encode(
+            chunks,
+            batch_size=64,
+            show_progress_bar=False,
+            normalize_embeddings=self.is_bge,  # BGE needs L2 normalization
+        )
+        return embeddings
+    def embed_single(self, text):
+        """
+        Embed a single query string.
+        Used during SEARCH — BGE prefix is applied here if using BGE model.
+        Why prefix only on queries:
+            BGE was trained with this asymmetric setup.
+            Documents are indexed as-is.
+            Queries get the instruction prefix so the model knows
+            it is searching for relevant passages, not matching exact text.
+        Args:
+            text (str) — a single query string
+        Returns:
+            numpy.ndarray — one embedding vector (384 dimensions)
+        """
+        if self.is_bge:
+            text = self.BGE_QUERY_PREFIX + text
+        return self.model.encode(
+            text,
+            normalize_embeddings=True,  # always normalize for BGE
+        )
+if __name__ == "__main__":
+    embedder = Embedder()
+    test_chunks = [
+        "The quarterly budget report shows increased spending",
+        "Machine learning models can understand text semantics",
+        "The cat sat on the mat and looked out the window"
+    ]
+    print("Embedding 3 test chunks...")
+    vectors = embedder.embed_chunks(test_chunks)
+    print(f"Got {len(vectors)} vectors")
+    print(f"Each vector has {len(vectors[0])} dimensions")
+    print(f"First vector (first 5 values): {vectors[0][:5]}")
+    print("\n--- Single query embedding ---")
+    query_vec = embedder.embed_single("budget spending report")
+    print(f"Query vector: {len(query_vec)} dimensions")

indexer/extractor.py ADDED Viewed

	@@ -0,0 +1,115 @@

+# indexer/extractor.py
+import os
+import json
+import fitz  # PyMuPDF
+from docx import Document
+from pptx import Presentation
+from openpyxl import load_workbook
+class Extractor:
+    """
+    Extracts raw text content from different file types.
+    Each file type has its own extraction method.
+    """
+    def extract(self, filepath):
+        """
+        Main dispatcher — picks the right extraction method based on file extension.
+        """
+        handlers = {
+            ".pdf": self.extract_pdf,
+            ".docx": self.extract_docx,
+            ".pptx": self.extract_pptx,
+            ".xlsx": self.extract_xlsx,
+            ".ipynb": self.extract_ipynb,
+            ".txt": self.extract_text,
+            ".md": self.extract_text,
+            ".py": self.extract_text,
+            ".js": self.extract_text,
+        }
+        try:
+            ext = os.path.splitext(filepath)[1].lower()
+            handler = handlers.get(ext)
+            if handler:
+                return handler(filepath)
+            else:
+                print(f"Warning: Unrecognized file extension: {ext}")
+                return ""
+        except Exception as e:
+            print(f"Error extracting text from {filepath}: {e}")
+            return ""
+    def extract_pdf(self, filepath):
+        """Extract text from a PDF file using PyMuPDF."""
+        doc = fitz.open(filepath)
+        pages = []
+        for page in doc:
+            pages.append(page.get_text())
+        doc.close()
+        return "\n".join(pages)
+    def extract_docx(self, filepath):
+        """Extract text from a Word document using python-docx."""
+        doc = Document(filepath)
+        paragraphs = []
+        for para in doc.paragraphs:
+            paragraphs.append(para.text)
+        return "\n".join(paragraphs)
+    def extract_pptx(self, filepath):
+        """Extract text from a PowerPoint file using python-pptx."""
+        prs = Presentation(filepath)
+        lines = []
+        for slide in prs.slides:
+            for shape in slide.shapes:
+                if shape.has_text_frame:
+                    for para in shape.text_frame.paragraphs:
+                        lines.append(para.text)
+        return "\n".join(lines)
+    def extract_xlsx(self, filepath):
+        """Extract text from an Excel file using openpyxl."""
+        wb = load_workbook(filepath, data_only=True)
+        rows = []
+        for sheet_name in wb.sheetnames:
+            sheet = wb[sheet_name]
+            for row in sheet.iter_rows():
+                cells = []
+                for cell in row:
+                    if cell.value is not None:
+                        cells.append(str(cell.value))
+                rows.append(" ".join(cells))
+        return "\n".join(rows)
+    def extract_ipynb(self, filepath):
+        """Extract text from a Jupyter notebook (.ipynb) file."""
+        with open(filepath, "r", encoding="utf-8") as f:
+            notebook = json.load(f)
+        cells = []
+        for cell in notebook["cells"]:
+            cell_text = "".join(cell["source"])
+            cells.append(cell_text)
+        return "\n".join(cells)
+    def extract_text(self, filepath):
+        """Extract text from plain text files (.txt, .md, .py, .js, etc.)"""
+        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
+            return f.read()
+# --- Test it ---
+if __name__ == "__main__":
+    import sys
+    extractor = Extractor()
+    if len(sys.argv) > 1:
+        filepath = sys.argv[1]
+        text = extractor.extract(filepath)
+        print(f"Extracted {len(text)} characters from {filepath}")
+        print(f"Preview:\n{text[:500]}")
+    else:
+        print("Usage: python -m indexer.extractor <filepath>")

indexer/pipeline.py ADDED Viewed

	@@ -0,0 +1,125 @@

+# indexer/pipeline.py
+import hashlib
+import os
+from evaluation.dataset_loader import DatasetLoader
+from indexer.crawler import Crawler
+from indexer.extractor import Extractor
+from indexer.chunker import Chunker
+from indexer.embedder import Embedder
+from indexer.store import Store
+class IndexingPipeline:
+    """
+    Wires all indexer modules together.
+    The flow for each file:
+        Crawler (discover + hash check)
+            → Extractor (file → raw text)
+                → Chunker (text → chunks with metadata)
+                    → Embedder (chunks → vectors)
+                        → Store (vectors → FAISS, metadata → SQLite)
+    """
+    def __init__(self, config_path="config.yaml"):
+        """
+        Initialize all pipeline components.
+        """
+        self.config_path = config_path
+        self.crawler = Crawler(config_path)
+        self.extractor = Extractor()
+        self.chunker = Chunker(chunk_size=500, overlap=50)
+        self.embedder = Embedder(config_path)
+        self.store = Store(config_path)
+    def _iter_dataset_documents(self):
+        """
+        Yield BEIR corpus documents as synthetic files so hosted deployments
+        can build an index from dataset folders containing corpus.jsonl.
+        """
+        for dataset_path in self.crawler.watch_paths:
+            corpus_path = os.path.join(dataset_path, "corpus.jsonl")
+            if not os.path.exists(corpus_path):
+                continue
+            dataset_name = os.path.basename(os.path.normpath(dataset_path))
+            try:
+                corpus = DatasetLoader(dataset_path).load_corpus()
+            except Exception as e:
+                print(f"[Pipeline] Could not load dataset corpus from {dataset_path}: {e}")
+                continue
+            for doc_id, doc in corpus.items():
+                title = (doc.get("title") or "").strip()
+                body = (doc.get("text") or "").strip()
+                text = "\n\n".join(part for part in [title, body] if part).strip()
+                if not text:
+                    continue
+                synthetic_path = f"{dataset_name}://{doc_id}"
+                synthetic_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
+                yield synthetic_path, synthetic_hash, text
+    def run(self):
+        """
+        Execute the full indexing pipeline.
+        """
+        known_hashes = self.store.load_hashes()
+        print("Scanning for new/modified files...")
+        files_to_process, current_hashes, deleted_files = self.crawler.get_new_and_modified(known_hashes)
+        dataset_documents = list(self._iter_dataset_documents())
+        known_dataset_hashes = {
+            filepath: file_hash
+            for filepath, file_hash in known_hashes.items()
+            if "://" in filepath
+        }
+        for filepath, file_hash, text in dataset_documents:
+            current_hashes[filepath] = file_hash
+            if known_dataset_hashes.get(filepath) != file_hash:
+                files_to_process.append((filepath, text))
+        current_dataset_paths = {filepath for filepath, _, _ in dataset_documents}
+        deleted_files = set(deleted_files) | (
+            set(known_dataset_hashes.keys()) - current_dataset_paths
+        )
+        for filepath in deleted_files:
+            self.store.remove_file_chunks(filepath)
+        if not files_to_process:
+            print("Index is up to date.")
+            print(f"Total vectors: {self.store.get_total_vectors()}")
+            return
+        total = len(files_to_process)
+        for i, item in enumerate(files_to_process, 1):
+            if isinstance(item, tuple):
+                filepath, text = item
+            else:
+                filepath = item
+                text = self.extractor.extract(filepath)
+            print(f"[{i}/{total}] {filepath}")
+            if not text.strip():
+                print(f"  Skipping (no text extracted)")
+                continue
+            chunks = self.chunker.chunk_file(text, filepath)
+            chunk_texts = [c["text"] for c in chunks]
+            embeddings = self.embedder.embed_chunks(chunk_texts)
+            self.store.remove_file_chunks(filepath)
+            self.store.add_chunks(chunks, embeddings)
+            self.store.save_file_info(filepath, current_hashes[filepath], len(chunks))
+        print(f"\nProcessed {total} files.")
+        print(f"Total vectors: {self.store.get_total_vectors()}")
+# --- Test it ---
+if __name__ == "__main__":
+    pipeline = IndexingPipeline()
+    pipeline.run()

indexer/store.py ADDED Viewed

	@@ -0,0 +1,238 @@

+# indexer/store.py
+import os
+import sqlite3
+import numpy as np
+import faiss
+import yaml
+class Store:
+    """
+    Handles two storage systems:
+    1. FAISS — stores dense vectors for fast similarity search
+               Uses IndexHNSWFlat instead of IndexFlatL2
+               HNSW = Hierarchical Navigable Small World graph
+               - IndexFlatL2  : scans every vector (slow at scale)
+               - IndexHNSWFlat: graph-based navigation (fast, same accuracy)
+    2. SQLite — stores metadata about each chunk
+    """
+    # HNSW parameter — higher = more accurate but more memory
+    # 32 is the standard default, good balance for this use case
+    HNSW_M = 32
+    def __init__(self, config_path="config.yaml"):
+        """
+        Load config, set up file paths, initialize FAISS index and SQLite.
+        """
+        with open(config_path, "r") as f:
+            config = yaml.safe_load(f)
+        self.data_dir = config["data_dir"]
+        os.makedirs(self.data_dir, exist_ok=True)
+        self.faiss_path = os.path.join(self.data_dir, "index.faiss")
+        self.db_path    = os.path.join(self.data_dir, "metadata.db")
+        self._init_db()
+        self._load_or_create_index()
+    def _init_db(self):
+        """
+        Create SQLite tables if they don't already exist.
+        """
+        conn   = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        cursor.execute('''
+            CREATE TABLE IF NOT EXISTS chunks (
+                id          INTEGER PRIMARY KEY,
+                filepath    TEXT    NOT NULL,
+                chunk_text  TEXT    NOT NULL,
+                chunk_index INTEGER,
+                FOREIGN KEY (filepath) REFERENCES files(filepath)
+            )
+        ''')
+        cursor.execute('''
+            CREATE TABLE IF NOT EXISTS files (
+                filepath     TEXT PRIMARY KEY,
+                file_hash    TEXT NOT NULL,
+                total_chunks INTEGER
+            )
+        ''')
+        conn.commit()
+        conn.close()
+    def _load_or_create_index(self):
+        """
+        Load an existing FAISS index from disk, or set to None.
+        The actual index is created on first add_chunks() call
+        so we know the embedding dimension at that point.
+        """
+        if os.path.exists(self.faiss_path):
+            self.index = faiss.read_index(self.faiss_path)
+            print(f"[Store] Loaded FAISS index — {self.index.ntotal} vectors")
+        else:
+            self.index = None
+            print("[Store] No existing index found — will create on first insert")
+    def _create_hnsw_index(self, dimension: int):
+        """
+        Create a new HNSW-based FAISS index.
+        Why HNSW over FlatL2:
+            FlatL2   — exact search, O(n) per query, slow at scale
+            HNSWFlat — approximate search, O(log n) per query, same accuracy
+                       for top-k retrieval tasks
+        IndexIDMap2 wraps HNSW to support custom integer IDs and deletion.
+        Args:
+            dimension — embedding size (384 for MiniLM and BGE-small)
+        """
+        hnsw_index      = faiss.IndexHNSWFlat(dimension, self.HNSW_M)
+        hnsw_index.hnsw.efSearch     = 64   # search quality — higher = better recall
+        hnsw_index.hnsw.efConstruction = 64 # build quality  — higher = better graph
+        self.index      = faiss.IndexIDMap2(hnsw_index)
+        print(f"[Store] Created HNSW index — dim={dimension}, M={self.HNSW_M}")
+    def get_next_id(self):
+        """
+        Get the next available chunk ID from SQLite.
+        """
+        conn   = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        cursor.execute("SELECT MAX(id) FROM chunks")
+        result = cursor.fetchone()[0]
+        conn.close()
+        return 0 if result is None else result + 1
+    def add_chunks(self, chunks_with_metadata, embeddings):
+        """
+        Add new chunks and their embeddings to both FAISS and SQLite.
+        Args:
+            chunks_with_metadata (list[dict]) — from chunker.chunk_file()
+                Each dict has: text, filepath, chunk_index
+            embeddings (numpy.ndarray) — shape (num_chunks, embedding_dim)
+                From embedder.embed_chunks()
+        """
+        embeddings = embeddings.astype("float32")
+        # create index on first insert — dimension comes from embeddings
+        if self.index is None:
+            dimension = embeddings.shape[1]
+            self._create_hnsw_index(dimension)
+        start_id = self.get_next_id()
+        ids      = np.array(
+            [start_id + i for i in range(len(chunks_with_metadata))],
+            dtype=np.int64
+        )
+        self.index.add_with_ids(embeddings, ids)
+        faiss.write_index(self.index, self.faiss_path)
+        # save chunk metadata to SQLite
+        conn   = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        for i, chunk in enumerate(chunks_with_metadata):
+            vector_id = start_id + i
+            cursor.execute(
+                "INSERT INTO chunks (id, filepath, chunk_text, chunk_index) "
+                "VALUES (?, ?, ?, ?)",
+                (vector_id, chunk["filepath"], chunk["text"], chunk["chunk_index"])
+            )
+        conn.commit()
+        conn.close()
+    def save_file_info(self, filepath, file_hash, total_chunks):
+        """
+        Save or update file info in SQLite.
+        Args:
+            filepath     — file path or fake path e.g. "scifact://12345"
+            file_hash    — SHA256 hash or doc_id string
+            total_chunks — number of chunks this file was split into
+        """
+        conn   = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        cursor.execute(
+            "INSERT OR REPLACE INTO files (filepath, file_hash, total_chunks) "
+            "VALUES (?, ?, ?)",
+            (filepath, file_hash, total_chunks)
+        )
+        conn.commit()
+        conn.close()
+    def load_hashes(self):
+        """
+        Load all stored file hashes from SQLite.
+        Returns:
+            dict — {filepath: hash_string}
+        """
+        conn   = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        cursor.execute("SELECT filepath, file_hash FROM files")
+        rows   = cursor.fetchall()
+        conn.close()
+        return {row[0]: row[1] for row in rows}
+    def remove_file_chunks(self, filepath):
+        """
+        Delete all chunks for a file from both SQLite and FAISS.
+        Args:
+            filepath — the filepath to remove
+        """
+        conn   = sqlite3.connect(self.db_path)
+        cursor = conn.cursor()
+        ids = cursor.execute(
+            "SELECT id FROM chunks WHERE filepath = ?", (filepath,)
+        ).fetchall()
+        cursor.execute("DELETE FROM chunks WHERE filepath = ?", (filepath,))
+        cursor.execute("DELETE FROM files  WHERE filepath = ?", (filepath,))
+        conn.commit()
+        conn.close()
+        if ids and self.index is not None:
+            id_array = np.array([i[0] for i in ids], dtype=np.int64)
+            self.index.remove_ids(id_array)
+            faiss.write_index(self.index, self.faiss_path)
+    def get_total_vectors(self):
+        """
+        Return how many vectors are in the FAISS index.
+        Returns:
+            int — number of vectors, or 0 if index is empty
+        """
+        if self.index is None:
+            return 0
+        return self.index.ntotal
+if __name__ == "__main__":
+    store = Store()
+    fake_chunks = [
+        {"text": "quarterly budget report summary",       "filepath": "/docs/report.pdf",   "chunk_index": 0},
+        {"text": "revenue increased by fifteen percent",  "filepath": "/docs/report.pdf",   "chunk_index": 1},
+        {"text": "python machine learning tutorial",      "filepath": "/docs/tutorial.txt", "chunk_index": 0},
+    ]
+    fake_embeddings = np.random.rand(3, 384).astype("float32")
+    print(f"Vectors before: {store.get_total_vectors()}")
+    store.add_chunks(fake_chunks, fake_embeddings)
+    print(f"Vectors after:  {store.get_total_vectors()}")

indexer/watcher.py ADDED Viewed

	@@ -0,0 +1,187 @@

+# indexer/watcher.py
+import os
+import time
+from watchdog.observers import Observer
+from watchdog.events import FileSystemEventHandler
+from indexer.pipeline import IndexingPipeline
+import yaml
+class IndexHandler(FileSystemEventHandler):
+    """
+    Handles filesystem events detected by watchdog.
+    watchdog calls these methods automatically:
+        - on_created(event)   → new file added
+        - on_modified(event)  → existing file changed
+        - on_deleted(event)   → file removed
+    """
+    def __init__(self, pipeline, config_path="config.yaml"):
+        """
+        Args:
+            pipeline (IndexingPipeline) — existing pipeline instance
+        """
+        with open(config_path) as f:
+            config = yaml.safe_load(f)
+            self._debounce_seconds = config["debounce_seconds"]
+        self.pipeline = pipeline
+        self.include_extensions = self.pipeline.crawler.include_extensions
+        self._last_event = {}    # {filepath: timestamp}
+    def _is_duplicate(self, filepath):
+        """
+        Check if we've already handled an event for this file recently.
+        Returns True if we should skip this event.
+        """
+        now = time.time()
+        last = self._last_event.get(filepath, 0)
+        if now - last < self._debounce_seconds:
+            return True
+        self._last_event[filepath] = now
+        return False
+    def _is_relevant(self, filepath):
+        """
+        Check if a file event is for a file type we care about.
+        Args:
+            filepath (str) — path from the event
+        Returns:
+            bool — True if the file extension is in our include list
+        """
+        ext = os.path.splitext(filepath)[1].lower()
+        return ext in self.include_extensions
+    def on_created(self, event):
+        """
+        Called when a new file is created.
+        Args:
+            event — watchdog event
+        """
+        if(event.is_directory):
+            return
+        if(not self._is_relevant(event.src_path)):
+            return
+        if self._is_duplicate(event.src_path):
+            return
+        print(f"New file detected: {event.src_path}")
+        text = self.pipeline.extractor.extract(event.src_path)
+        if(not text.strip()):
+            print(f"  Skipping (no text extracted)")
+            return
+        chunks = self.pipeline.chunker.chunk_file(text, event.src_path)
+        chunk_texts = [c["text"] for c in chunks]
+        embeddings = self.pipeline.embedder.embed_chunks(chunk_texts)
+        self.pipeline.store.remove_file_chunks(event.src_path)
+        self.pipeline.store.add_chunks(chunks, embeddings)
+        file_hash = self.pipeline.crawler.compute_hash(event.src_path)
+        self.pipeline.store.save_file_info(event.src_path, file_hash, len(chunks))
+        print(f"  File stored: {event.src_path}")
+    def on_modified(self, event):
+        """
+        Called when an existing file is modified.
+        Args:
+            event - watchdog event
+        """
+        if(event.is_directory):
+            return
+        if(not self._is_relevant(event.src_path)):
+            return
+        if self._is_duplicate(event.src_path):
+            return
+        print(f"File modified: {event.src_path}")
+        self.pipeline.store.remove_file_chunks(event.src_path)
+        text = self.pipeline.extractor.extract(event.src_path)
+        if(not text.strip()):
+            print(f"  Skipping (no text extracted)")
+            return
+        chunks = self.pipeline.chunker.chunk_file(text, event.src_path)
+        chunk_texts = [c["text"] for c in chunks]
+        embeddings = self.pipeline.embedder.embed_chunks(chunk_texts)
+        self.pipeline.store.add_chunks(chunks, embeddings)
+        file_hash = self.pipeline.crawler.compute_hash(event.src_path)
+        self.pipeline.store.save_file_info(event.src_path, file_hash, len(chunks))
+        print(f"  File saved: {event.src_path}")
+    def on_deleted(self, event):
+        """
+        Called when a file is deleted.
+        Args:
+            event - watchdog event
+        """
+        if(event.is_directory):
+            return
+        if(not self._is_relevant(event.src_path)):
+            return
+        print(f"File deleted: {event.src_path}")
+        self.pipeline.store.remove_file_chunks(event.src_path)
+class Watcher:
+    """
+    Starts watchdog observers on all configured watch_paths.
+    Runs continuously until the user presses Ctrl+C.
+    """
+    def __init__(self, config_path="config.yaml"):
+        """
+        Initialize the Watcher.
+        """
+        self.pipeline = IndexingPipeline(config_path)
+        self.handler = IndexHandler(self.pipeline)
+        self.watch_paths = self.pipeline.crawler.watch_paths
+    def start(self):
+        """
+        Start watching all configured directories.
+        """
+        observer = Observer()
+        for path in self.watch_paths:
+            observer.schedule(self.handler, path, recursive=True)
+        observer.start()
+        print(f"Watchdog active. Watching {', '.join(self.watch_paths)}")
+        try:
+            while True:
+                time.sleep(1)
+        except KeyboardInterrupt:
+            print("Stopping watcher...")
+        finally:
+            observer.stop()
+            observer.join()
+# --- Test it ---
+if __name__ == "__main__":
+    # First run the full pipeline to index existing files
+    print("Running initial index...")
+    watcher = Watcher()
+    watcher.pipeline.run()
+    # Then start watching for changes
+    print("\nStarting file watcher...")
+    watcher.start()

main.py ADDED Viewed

	@@ -0,0 +1,298 @@

+# main.py
+import json
+import os
+import time
+from functools import lru_cache
+import yaml
+from fastapi import FastAPI, Request, Form
+from fastapi.responses import HTMLResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+from evaluation.dataset_loader import DatasetLoader
+# FastAPI application object; files under ./static are served at /static and
+# HTML pages are rendered from the Jinja2 templates in ./templates
+app = FastAPI(title="Semantic Search Engine")
+app.mount("/static", StaticFiles(directory="static"), name="static")
+templates = Jinja2Templates(directory="templates")
+# ── load search engine once at startup ──────────────────────────────────────
+ENGINE_ERROR = None
+@lru_cache(maxsize=1)
+def get_engine():
+    global ENGINE_ERROR
+    try:
+        from searcher.search_engine import SearchEngine
+        ENGINE_ERROR = None
+        return SearchEngine("config.yaml")
+    except Exception as e:
+        ENGINE_ERROR = str(e)
+        print(f"[Startup] Search engine unavailable: {e}")
+        return None
+# ── load dataset queries at startup ─────────────────────────────────────────
+# These are the actual queries from SciFact and NFCorpus
+# We use them to show "which dataset queries matched your search"
+def load_dataset_queries() -> dict:
+    """
+    Load all queries from SciFact and NFCorpus at startup.
+    Returns:
+        dict — {
+            "scifact":  {query_id: query_text, ...},
+            "nfcorpus": {query_id: query_text, ...},
+        }
+    """
+    all_queries = {}
+    datasets = {
+        "scifact":  "data/scifact",
+        "nfcorpus": "data/nfcorpus",
+    }
+    for name, path in datasets.items():
+        if os.path.exists(path):
+            try:
+                loader             = DatasetLoader(path)
+                all_queries[name]  = loader.load_queries()
+                print(f"[Startup] Loaded {len(all_queries[name])} queries from {name}")
+            except Exception as e:
+                print(f"[Startup] Could not load {name} queries: {e}")
+                all_queries[name] = {}
+        else:
+            print(f"[Startup] Dataset path not found: {path}")
+            all_queries[name] = {}
+    return all_queries
+# load once at startup — available globally
+DATASET_QUERIES = load_dataset_queries()
+# ── helpers ──────────────────────────────────────────────────────────────────
+def load_eval_results() -> dict:
+    path = "results/eval_all.json"
+    if os.path.exists(path):
+        with open(path, "r") as f:
+            return json.load(f)
+    return {}
+def extract_doc_id(filepath: str) -> str:
+    if "://" in filepath:
+        return filepath.split("://", 1)[1]
+    return filepath
+def get_dataset_from_filepath(filepath: str) -> str:
+    if "scifact://"  in filepath: return "scifact"
+    if "nfcorpus://" in filepath: return "nfcorpus"
+    return "filesystem"
+def get_file_icon(filepath: str) -> str:
+    if "scifact://"  in filepath: return "🔬"
+    if "nfcorpus://" in filepath: return "🏥"
+    ext   = filepath.lower().split(".")[-1] if "." in filepath else ""
+    icons = {
+        "pdf": "📄", "docx": "📝", "txt": "📃",
+        "pptx": "📊", "xlsx": "📋", "py": "🐍",
+    }
+    return icons.get(ext, "📄")
+def find_matching_dataset_queries(
+    user_query: str,
+    top_results: list,
+) -> list:
+    """
+    Find which dataset queries are semantically related to what the user typed.
+    Strategy — two passes:
+        1. Exact / substring match  — query text contains user words
+        2. Doc-based match          — if a result doc came from dataset X,
+                                      show the queries that reference that doc
+                                      from the qrels (loaded separately)
+    We use simple word overlap here (no extra model call needed).
+    Returns:
+        list of dicts — [
+            {
+                "query_id":   "1234",
+                "query_text": "Does vitamin D cause cancer?",
+                "dataset":    "scifact",
+                "match_type": "text"   or "doc"
+            },
+            ...
+        ]
+    """
+    matched   = []
+    seen_ids  = set()
+    # words from user query — lowercase, skip short words
+    user_words = set(
+        w.lower() for w in user_query.split()
+        if len(w) > 3
+    )
+    # Pass 1 — text overlap match
+    # check every dataset query for word overlap with user query
+    for dataset_name, queries in DATASET_QUERIES.items():
+        for qid, qtext in queries.items():
+            q_words = set(w.lower() for w in qtext.split() if len(w) > 3)
+            overlap = user_words & q_words
+            # need at least 1 word overlap
+            if overlap and qid not in seen_ids:
+                matched.append({
+                    "query_id":   qid,
+                    "query_text": qtext,
+                    "dataset":    dataset_name,
+                    "match_type": "text",
+                    "overlap":    len(overlap),
+                })
+                seen_ids.add(qid)
+    # sort by overlap count — most overlapping queries first
+    matched.sort(key=lambda x: x["overlap"], reverse=True)
+    # return top 8 matched queries max
+    return matched[:8]
+# ── routes ───────────────────────────────────────────────────────────────────
+@app.get("/", response_class=HTMLResponse)
+async def home(request: Request):
+    return templates.TemplateResponse("index.html", {
+        "request":          request,
+        "scifact_count":    len(DATASET_QUERIES.get("scifact",  {})),
+        "nfcorpus_count":   len(DATASET_QUERIES.get("nfcorpus", {})),
+        "error":            ENGINE_ERROR,
+    })
+@app.post("/search", response_class=HTMLResponse)
+async def search(
+    request: Request,
+    query:   str = Form(...),
+    top_k:   int = Form(10),
+    mode:    str = Form("full"),
+):
+    if not query.strip():
+        return templates.TemplateResponse("index.html", {
+            "request":        request,
+            "error":          "Please enter a search query.",
+            "scifact_count":  len(DATASET_QUERIES.get("scifact", {})),
+            "nfcorpus_count": len(DATASET_QUERIES.get("nfcorpus", {})),
+        })
+    engine = get_engine()
+    if engine is None:
+        return templates.TemplateResponse("index.html", {
+            "request":        request,
+            "error":          (
+                "Search is not ready yet. The semantic index is still missing or failed to build. "
+                f"Startup details: {ENGINE_ERROR}"
+            ),
+            "scifact_count":  len(DATASET_QUERIES.get("scifact", {})),
+            "nfcorpus_count": len(DATASET_QUERIES.get("nfcorpus", {})),
+        })
+    t0      = time.time()
+    output  = engine.search(query.strip(), top_k=top_k)
+    elapsed = round(time.time() - t0, 3)
+    # format search results
+    results = []
+    for r in output.get("results", []):
+        filepath = r.get("filepath", "")
+        doc_id   = extract_doc_id(filepath)
+        score    = r.get("rerank_score", r.get("rrf_score", r.get("dense_score", 0)))
+        snippet  = r.get("chunk_text", r.get("text", "No preview available."))
+        if len(snippet) > 200:
+            snippet = snippet[:200].rsplit(" ", 1)[0] + "..."
+        dataset = get_dataset_from_filepath(filepath)
+        results.append({
+            "doc_id":   doc_id,
+            "filepath": filepath,
+            "score":    round(float(score), 4),
+            "snippet":  snippet,
+            "icon":     get_file_icon(filepath),
+            "dataset":  dataset,
+        })
+    # find matching dataset queries
+    matched_queries = find_matching_dataset_queries(query.strip(), results)
+    # group matched queries by dataset for display
+    matched_scifact  = [q for q in matched_queries if q["dataset"] == "scifact"]
+    matched_nfcorpus = [q for q in matched_queries if q["dataset"] == "nfcorpus"]
+    return templates.TemplateResponse("results.html", {
+        "request":          request,
+        "query":            query,
+        "results":          results,
+        "total":            len(results),
+        "elapsed":          elapsed,
+        "mode":             mode,
+        "top_k":            top_k,
+        "matched_scifact":  matched_scifact,
+        "matched_nfcorpus": matched_nfcorpus,
+        "total_matched":    len(matched_queries),
+    })
+@app.get("/dashboard", response_class=HTMLResponse)
+async def dashboard(request: Request):
+    eval_data = load_eval_results()
+    datasets = []
+    for dataset_name, mode_results in eval_data.items():
+        full = mode_results.get("full", {})
+        datasets.append({
+            "name":      dataset_name,
+            "ndcg":      full.get("NDCG@10",    0.0),
+            "mrr":       full.get("MRR",         0.0),
+            "map":       full.get("MAP@100",     0.0),
+            "recall":    full.get("Recall@100",  0.0),
+            "precision": full.get("P@10",        0.0),
+            "queries":   full.get("num_queries", 0),
+            "modes":     mode_results,
+        })
+    return templates.TemplateResponse("dashboard.html", {
+        "request":  request,
+        "datasets": datasets,
+    })
+@app.get("/health")
+async def health():
+    engine = get_engine()
+    return {
+        "status": "ok" if engine is not None else "degraded",
+        "engine_ready": engine is not None,
+        "engine_error": ENGINE_ERROR,
+    }
+if __name__ == "__main__":  # dev entry point: only runs when this file is executed directly
+    import uvicorn  # imported inside the guard, so it is only loaded for direct execution
+    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)
+    # CLI equivalent: uvicorn main:app --reload --host 0.0.0.0 --port 7860

requirements.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+fastapi
+uvicorn
+jinja2
+python-multipart
+sentence-transformers
+transformers
+torch
+faiss-cpu
+numpy
+scipy
+scikit-learn
+networkx
+nltk
+pandas
+pyyaml
+python-docx
+python-pptx
+openpyxl
+pillow
+lxml
+PyMuPDF

searcher/__init__.py ADDED Viewed

File without changes

searcher/__pycache__/__init__.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (133 Bytes). View file

searcher/__pycache__/dense_retriever.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (4.17 kB). View file

searcher/__pycache__/facet_filter.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (3.08 kB). View file

searcher/__pycache__/fusion_ranker.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (3.62 kB). View file

searcher/__pycache__/highlighter.cpython-313.pyc.2070577919488 ADDED Viewed

Binary file (4.27 kB). View file

searcher/__pycache__/query_understanding.cpython-313.pyc.2070578319792 ADDED Viewed

Binary file (4.41 kB). View file