Spaces:
Sleeping
Sleeping
Upload 65 files
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +15 -0
- .gitignore +34 -0
- Dockerfile +20 -0
- __pycache__/check_nfcorpus.cpython-313.pyc.2070577919488 +0 -0
- __pycache__/main.cpython-313.pyc.2070578258992 +0 -0
- check_nfcorpus.py +29 -0
- config.yaml +42 -0
- data/nfcorpus/corpus.jsonl +0 -0
- data/nfcorpus/qrels/dev.tsv +0 -0
- data/nfcorpus/qrels/test.tsv +0 -0
- data/nfcorpus/qrels/train.tsv +0 -0
- data/nfcorpus/queries.jsonl +0 -0
- data/scifact/corpus.jsonl +0 -0
- data/scifact/qrels/test.tsv +340 -0
- data/scifact/qrels/train.tsv +920 -0
- data/scifact/queries.jsonl +0 -0
- docker-compose.yml +16 -0
- evaluation/__pycache__/dataset_loader.cpython-313.pyc.2070577919488 +0 -0
- evaluation/__pycache__/evaluator.cpython-313.pyc.2070577919488 +0 -0
- evaluation/__pycache__/indexer_bridge.cpython-313.pyc.2070577919488 +0 -0
- evaluation/__pycache__/query_runner.cpython-313.pyc.2070577919488 +0 -0
- evaluation/__pycache__/run_eval.cpython-313.pyc.2070577919488 +0 -0
- evaluation/dataset_loader.py +132 -0
- evaluation/evaluator.py +197 -0
- evaluation/indexer_bridge.py +94 -0
- evaluation/query_runner.py +128 -0
- evaluation/run_eval.py +170 -0
- indexer/__pycache__/chunker.cpython-313.pyc.2070577919488 +0 -0
- indexer/__pycache__/crawler.cpython-313.pyc.2070577919488 +0 -0
- indexer/__pycache__/embedder.cpython-313.pyc.2070577919488 +0 -0
- indexer/__pycache__/extractor.cpython-313.pyc.2070577919488 +0 -0
- indexer/__pycache__/pipeline.cpython-313.pyc.2070577919488 +0 -0
- indexer/__pycache__/store.cpython-313.pyc.2070577919488 +0 -0
- indexer/__pycache__/watcher.cpython-313.pyc.2070577919488 +0 -0
- indexer/chunker.py +135 -0
- indexer/crawler.py +102 -0
- indexer/embedder.py +111 -0
- indexer/extractor.py +115 -0
- indexer/pipeline.py +125 -0
- indexer/store.py +238 -0
- indexer/watcher.py +187 -0
- main.py +298 -0
- requirements.txt +21 -0
- searcher/__init__.py +0 -0
- searcher/__pycache__/__init__.cpython-313.pyc.2070577919488 +0 -0
- searcher/__pycache__/dense_retriever.cpython-313.pyc.2070577919488 +0 -0
- searcher/__pycache__/facet_filter.cpython-313.pyc.2070577919488 +0 -0
- searcher/__pycache__/fusion_ranker.cpython-313.pyc.2070577919488 +0 -0
- searcher/__pycache__/highlighter.cpython-313.pyc.2070577919488 +0 -0
- searcher/__pycache__/query_understanding.cpython-313.pyc.2070578319792 +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.gitignore
|
| 3 |
+
.venv
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.pyc
|
| 6 |
+
*.pyo
|
| 7 |
+
*.pyd
|
| 8 |
+
*.log
|
| 9 |
+
.pytest_cache/
|
| 10 |
+
.mypy_cache/
|
| 11 |
+
.ruff_cache/
|
| 12 |
+
.idea/
|
| 13 |
+
.vscode/
|
| 14 |
+
data/
|
| 15 |
+
results/
|
.gitignore
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Virtual environment
|
| 2 |
+
.venv/
|
| 3 |
+
|
| 4 |
+
# Vector index and database (large binary files)
|
| 5 |
+
data/
|
| 6 |
+
results/
|
| 7 |
+
|
| 8 |
+
# Logs
|
| 9 |
+
*.log
|
| 10 |
+
|
| 11 |
+
# Downloaded ML models (auto-downloaded at runtime)
|
| 12 |
+
models/
|
| 13 |
+
.cache/
|
| 14 |
+
sentence_transformers/
|
| 15 |
+
|
| 16 |
+
# Python cache
|
| 17 |
+
__pycache__/
|
| 18 |
+
*.pyc
|
| 19 |
+
*.pyo
|
| 20 |
+
*.pyd
|
| 21 |
+
.env
|
| 22 |
+
venv/
|
| 23 |
+
env/
|
| 24 |
+
.venv/
|
| 25 |
+
# Model cache
|
| 26 |
+
.cache/
|
| 27 |
+
|
| 28 |
+
# OS files
|
| 29 |
+
.DS_Store
|
| 30 |
+
Thumbs.db
|
| 31 |
+
|
| 32 |
+
# IDE
|
| 33 |
+
.vscode/
|
| 34 |
+
.idea/
|
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10
|
| 2 |
+
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 4 |
+
PYTHONUNBUFFERED=1 \
|
| 5 |
+
PIP_NO_CACHE_DIR=1 \
|
| 6 |
+
NLTK_DATA=/usr/local/share/nltk_data
|
| 7 |
+
|
| 8 |
+
WORKDIR /app
|
| 9 |
+
|
| 10 |
+
COPY requirements.txt .
|
| 11 |
+
|
| 12 |
+
RUN pip install --upgrade pip && \
|
| 13 |
+
pip install -r requirements.txt && \
|
| 14 |
+
python -c "import nltk; nltk.download('wordnet', download_dir='/usr/local/share/nltk_data'); nltk.download('omw-1.4', download_dir='/usr/local/share/nltk_data')"
|
| 15 |
+
|
| 16 |
+
COPY . .
|
| 17 |
+
|
| 18 |
+
EXPOSE 7860
|
| 19 |
+
|
| 20 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
__pycache__/check_nfcorpus.cpython-313.pyc.2070577919488
ADDED
|
Binary file (1.46 kB). View file
|
|
|
__pycache__/main.cpython-313.pyc.2070578258992
ADDED
|
Binary file (11.7 kB). View file
|
|
|
check_nfcorpus.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
import os
import sys

# Both inputs are resolved relative to the current working directory, so the
# script is expected to be run from the repository root.
RESULTS_PATH = 'results/eval_nfcorpus.json'
DATASET_DIR = 'data/nfcorpus'


def _first_key(mapping):
    """Return the first key of *mapping*, or None when it is empty."""
    return next(iter(mapping), None)


def main():
    """Print a sanity check comparing evaluation results with the qrels.

    Loads the saved evaluation results (a JSON object keyed by query id) and
    the relevance judgements returned by ``DatasetLoader.load_qrels()``, then
    prints one sample id from each, their sizes, and how many query ids they
    have in common.

    NOTE(review): JSON object keys are always strings; if the loader returns
    non-string query ids, the overlap will be 0 -- confirm against
    ``evaluation/dataset_loader.py``.
    """
    # Make the working directory importable so the project-local
    # ``evaluation`` package resolves; the import must come after this line.
    sys.path.append(os.path.abspath("."))
    from evaluation.dataset_loader import DatasetLoader

    # Load results
    with open(RESULTS_PATH, encoding="utf-8") as f:
        data = json.load(f)

    # Load qrels
    loader = DatasetLoader(DATASET_DIR)
    qrels = loader.load_qrels()

    # Debug prints. Empty inputs print None instead of raising IndexError,
    # since an empty file is one of the situations this script should reveal.
    print("Sample RESULT query_id:", _first_key(data))

    first_qid = _first_key(qrels)
    print("Sample QREL query_id:", first_qid)

    first_doc_id = _first_key(qrels[first_qid]) if first_qid is not None else None
    print("Sample QREL doc_id:", first_doc_id)

    print("Total QREL queries:", len(qrels))
    print("Total RESULT queries:", len(data))

    # Check overlap between the query ids present on both sides.
    common = set(data) & set(qrels)
    print("Common query IDs:", len(common))


if __name__ == "__main__":
    main()
|
config.yaml
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Directories to index
|
| 2 |
+
watch_paths:
|
| 3 |
+
- ./data/scifact
|
| 4 |
+
- ./data/nfcorpus  # modify this: set to the directory you want indexed
|
| 5 |
+
|
| 6 |
+
# File extensions to include
|
| 7 |
+
include_extensions:
|
| 8 |
+
- ".pdf"
|
| 9 |
+
- ".docx"
|
| 10 |
+
- ".txt"
|
| 11 |
+
- ".md"
|
| 12 |
+
- ".pptx"
|
| 13 |
+
- ".xlsx"
|
| 14 |
+
- ".py"
|
| 15 |
+
- ".js"
|
| 16 |
+
- ".ipynb"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# --- Search settings: result counts, query expansion, reranking ---
|
| 20 |
+
top_k: 5 # final results returned to user
|
| 21 |
+
candidate_k: 20 # candidates fetched before reranking
|
| 22 |
+
|
| 23 |
+
query_expansion: true # WordNet synonym expansion
|
| 24 |
+
max_synonyms: 5 # max synonyms to append
|
| 25 |
+
|
| 26 |
+
reranking_enabled: true # cross-encoder reranking
|
| 27 |
+
reranker_model: "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
| 28 |
+
|
| 29 |
+
# Directories to skip
|
| 30 |
+
skip_directories:
|
| 31 |
+
- ".git"
|
| 32 |
+
- "node_modules"
|
| 33 |
+
- "__pycache__"
|
| 34 |
+
- ".venv"
|
| 35 |
+
|
| 36 |
+
# Where to store index data
|
| 37 |
+
data_dir: "./data"
|
| 38 |
+
|
| 39 |
+
embedding_model: "all-MiniLM-L6-v2"
|
| 40 |
+
# embedding_model: BAAI/bge-small-en-v1.5
|
| 41 |
+
|
| 42 |
+
debounce_seconds: 5
|
data/nfcorpus/corpus.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/nfcorpus/qrels/dev.tsv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/nfcorpus/qrels/test.tsv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/nfcorpus/qrels/train.tsv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/nfcorpus/queries.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/scifact/corpus.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/scifact/qrels/test.tsv
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
query-id corpus-id score
|
| 2 |
+
1 31715818 1
|
| 3 |
+
3 14717500 1
|
| 4 |
+
5 13734012 1
|
| 5 |
+
13 1606628 1
|
| 6 |
+
36 5152028 1
|
| 7 |
+
36 11705328 1
|
| 8 |
+
42 18174210 1
|
| 9 |
+
48 13734012 1
|
| 10 |
+
49 5953485 1
|
| 11 |
+
50 12580014 1
|
| 12 |
+
51 45638119 1
|
| 13 |
+
53 45638119 1
|
| 14 |
+
54 49556906 1
|
| 15 |
+
56 4709641 1
|
| 16 |
+
57 4709641 1
|
| 17 |
+
70 5956380 1
|
| 18 |
+
70 4414547 1
|
| 19 |
+
72 6076903 1
|
| 20 |
+
75 4387784 1
|
| 21 |
+
94 1215116 1
|
| 22 |
+
99 18810195 1
|
| 23 |
+
100 4381486 1
|
| 24 |
+
113 6157837 1
|
| 25 |
+
115 33872649 1
|
| 26 |
+
118 6372244 1
|
| 27 |
+
124 4883040 1
|
| 28 |
+
127 21598000 1
|
| 29 |
+
128 8290953 1
|
| 30 |
+
129 27768226 1
|
| 31 |
+
130 27768226 1
|
| 32 |
+
132 7975937 1
|
| 33 |
+
133 38485364 1
|
| 34 |
+
133 6969753 1
|
| 35 |
+
133 17934082 1
|
| 36 |
+
133 16280642 1
|
| 37 |
+
133 12640810 1
|
| 38 |
+
137 26016929 1
|
| 39 |
+
141 6955746 1
|
| 40 |
+
141 14437255 1
|
| 41 |
+
142 10582939 1
|
| 42 |
+
143 10582939 1
|
| 43 |
+
146 10582939 1
|
| 44 |
+
148 1084345 1
|
| 45 |
+
163 18872233 1
|
| 46 |
+
171 12670680 1
|
| 47 |
+
179 16322674 1
|
| 48 |
+
179 27123743 1
|
| 49 |
+
179 23557241 1
|
| 50 |
+
179 17450673 1
|
| 51 |
+
180 16966326 1
|
| 52 |
+
183 12827098 1
|
| 53 |
+
185 18340282 1
|
| 54 |
+
198 2177022 1
|
| 55 |
+
208 13519661 1
|
| 56 |
+
212 22038539 1
|
| 57 |
+
213 13625993 1
|
| 58 |
+
216 21366394 1
|
| 59 |
+
217 21366394 1
|
| 60 |
+
218 21366394 1
|
| 61 |
+
219 21366394 1
|
| 62 |
+
230 3067015 1
|
| 63 |
+
232 10536636 1
|
| 64 |
+
233 4388470 1
|
| 65 |
+
236 4388470 1
|
| 66 |
+
237 4942718 1
|
| 67 |
+
238 2251426 1
|
| 68 |
+
239 14079881 1
|
| 69 |
+
248 1568684 1
|
| 70 |
+
249 1568684 1
|
| 71 |
+
261 1122279 1
|
| 72 |
+
261 10697096 1
|
| 73 |
+
268 970012 1
|
| 74 |
+
269 970012 1
|
| 75 |
+
274 11614737 1
|
| 76 |
+
275 4961038 1
|
| 77 |
+
275 14241418 1
|
| 78 |
+
275 14819804 1
|
| 79 |
+
279 14376683 1
|
| 80 |
+
294 10874408 1
|
| 81 |
+
295 20310709 1
|
| 82 |
+
298 39381118 1
|
| 83 |
+
300 3553087 1
|
| 84 |
+
303 4388470 1
|
| 85 |
+
312 6173523 1
|
| 86 |
+
314 4347374 1
|
| 87 |
+
324 2014909 1
|
| 88 |
+
327 17997584 1
|
| 89 |
+
338 23349986 1
|
| 90 |
+
343 7873737 1
|
| 91 |
+
343 5884524 1
|
| 92 |
+
350 16927286 1
|
| 93 |
+
354 8774475 1
|
| 94 |
+
362 38587347 1
|
| 95 |
+
380 19005293 1
|
| 96 |
+
384 13770184 1
|
| 97 |
+
385 9955779 1
|
| 98 |
+
385 9767444 1
|
| 99 |
+
386 16495649 1
|
| 100 |
+
388 1148122 1
|
| 101 |
+
399 791050 1
|
| 102 |
+
410 14924526 1
|
| 103 |
+
411 14924526 1
|
| 104 |
+
415 6309659 1
|
| 105 |
+
421 11172205 1
|
| 106 |
+
431 28937856 1
|
| 107 |
+
436 14637235 1
|
| 108 |
+
437 18399038 1
|
| 109 |
+
439 4423559 1
|
| 110 |
+
440 4423559 1
|
| 111 |
+
443 10165258 1
|
| 112 |
+
452 12804937 1
|
| 113 |
+
452 464511 1
|
| 114 |
+
475 18678095 1
|
| 115 |
+
478 14767844 1
|
| 116 |
+
491 56893404 1
|
| 117 |
+
501 17930286 1
|
| 118 |
+
502 13071728 1
|
| 119 |
+
507 30774694 1
|
| 120 |
+
508 13980338 1
|
| 121 |
+
513 13230773 1
|
| 122 |
+
514 16256507 1
|
| 123 |
+
516 29564505 1
|
| 124 |
+
517 15663829 1
|
| 125 |
+
521 34873974 1
|
| 126 |
+
525 13639330 1
|
| 127 |
+
527 3863543 1
|
| 128 |
+
528 5476778 1
|
| 129 |
+
532 12991445 1
|
| 130 |
+
533 12991445 1
|
| 131 |
+
535 39368721 1
|
| 132 |
+
536 16056514 1
|
| 133 |
+
539 13282296 1
|
| 134 |
+
540 11886686 1
|
| 135 |
+
540 25007443 1
|
| 136 |
+
544 24221369 1
|
| 137 |
+
549 9433958 1
|
| 138 |
+
551 33499189 1
|
| 139 |
+
552 1471041 1
|
| 140 |
+
554 1049501 1
|
| 141 |
+
560 40096222 1
|
| 142 |
+
569 23460562 1
|
| 143 |
+
575 10300888 1
|
| 144 |
+
577 5289038 1
|
| 145 |
+
578 8764879 1
|
| 146 |
+
587 16999023 1
|
| 147 |
+
589 10984005 1
|
| 148 |
+
593 19675911 1
|
| 149 |
+
597 12779444 1
|
| 150 |
+
597 36355784 1
|
| 151 |
+
597 25742130 1
|
| 152 |
+
598 25742130 1
|
| 153 |
+
613 9638032 1
|
| 154 |
+
619 20888849 1
|
| 155 |
+
619 2565138 1
|
| 156 |
+
623 17000834 1
|
| 157 |
+
628 24512064 1
|
| 158 |
+
636 24294572 1
|
| 159 |
+
637 25649714 1
|
| 160 |
+
641 5912283 1
|
| 161 |
+
641 31554917 1
|
| 162 |
+
644 13619127 1
|
| 163 |
+
649 12789595 1
|
| 164 |
+
659 1215116 1
|
| 165 |
+
660 1215116 1
|
| 166 |
+
674 2095573 1
|
| 167 |
+
684 4942718 1
|
| 168 |
+
690 18750453 1
|
| 169 |
+
691 10991183 1
|
| 170 |
+
692 24088502 1
|
| 171 |
+
693 24088502 1
|
| 172 |
+
700 4350400 1
|
| 173 |
+
702 4350400 1
|
| 174 |
+
715 18421962 1
|
| 175 |
+
716 18421962 1
|
| 176 |
+
718 17587795 1
|
| 177 |
+
721 1834762 1
|
| 178 |
+
723 5531479 1
|
| 179 |
+
727 7521113 1
|
| 180 |
+
728 7521113 1
|
| 181 |
+
728 36444198 1
|
| 182 |
+
729 26851674 1
|
| 183 |
+
742 32159283 1
|
| 184 |
+
743 32159283 1
|
| 185 |
+
744 8460275 1
|
| 186 |
+
756 2831620 1
|
| 187 |
+
759 1805641 1
|
| 188 |
+
768 6421792 1
|
| 189 |
+
770 15476777 1
|
| 190 |
+
775 32275758 1
|
| 191 |
+
781 24338780 1
|
| 192 |
+
783 40632104 1
|
| 193 |
+
784 2356950 1
|
| 194 |
+
785 12471115 1
|
| 195 |
+
793 8551160 1
|
| 196 |
+
800 22543403 1
|
| 197 |
+
805 22180793 1
|
| 198 |
+
808 36606083 1
|
| 199 |
+
811 19799455 1
|
| 200 |
+
814 33387953 1
|
| 201 |
+
820 8646760 1
|
| 202 |
+
821 8646760 1
|
| 203 |
+
823 15319019 1
|
| 204 |
+
830 1897324 1
|
| 205 |
+
831 1897324 1
|
| 206 |
+
832 30303335 1
|
| 207 |
+
834 5483793 1
|
| 208 |
+
837 15928989 1
|
| 209 |
+
839 1469751 1
|
| 210 |
+
845 17741440 1
|
| 211 |
+
847 16787954 1
|
| 212 |
+
852 13843341 1
|
| 213 |
+
859 1982286 1
|
| 214 |
+
870 195689316 1
|
| 215 |
+
873 1180972 1
|
| 216 |
+
873 19307912 1
|
| 217 |
+
873 27393799 1
|
| 218 |
+
873 29025270 1
|
| 219 |
+
873 3315558 1
|
| 220 |
+
879 8426046 1
|
| 221 |
+
880 8426046 1
|
| 222 |
+
882 14803797 1
|
| 223 |
+
887 18855191 1
|
| 224 |
+
903 10648422 1
|
| 225 |
+
904 7370282 1
|
| 226 |
+
907 6923961 1
|
| 227 |
+
911 11254556 1
|
| 228 |
+
913 3203590 1
|
| 229 |
+
914 3203590 1
|
| 230 |
+
921 1642727 1
|
| 231 |
+
922 17077004 1
|
| 232 |
+
936 5483793 1
|
| 233 |
+
956 12956194 1
|
| 234 |
+
957 123859 1
|
| 235 |
+
960 8780599 1
|
| 236 |
+
967 2119889 1
|
| 237 |
+
967 8997410 1
|
| 238 |
+
971 46695481 1
|
| 239 |
+
971 27873158 1
|
| 240 |
+
971 28617573 1
|
| 241 |
+
971 9764256 1
|
| 242 |
+
975 5304891 1
|
| 243 |
+
982 2988714 1
|
| 244 |
+
985 6828370 1
|
| 245 |
+
993 16472469 1
|
| 246 |
+
1012 9745001 1
|
| 247 |
+
1014 6277638 1
|
| 248 |
+
1019 11603066 1
|
| 249 |
+
1020 9433958 1
|
| 250 |
+
1021 9433958 1
|
| 251 |
+
1024 5373138 1
|
| 252 |
+
1029 13923140 1
|
| 253 |
+
1029 13940200 1
|
| 254 |
+
1029 11899391 1
|
| 255 |
+
1041 25254425 1
|
| 256 |
+
1041 16626264 1
|
| 257 |
+
1049 12486491 1
|
| 258 |
+
1062 20381484 1
|
| 259 |
+
1086 39281140 1
|
| 260 |
+
1088 37549932 1
|
| 261 |
+
1089 17628888 1
|
| 262 |
+
1099 7662206 1
|
| 263 |
+
1100 7662206 1
|
| 264 |
+
1104 3898784 1
|
| 265 |
+
1107 20532591 1
|
| 266 |
+
1110 13770184 1
|
| 267 |
+
1121 4456756 1
|
| 268 |
+
1130 17997584 1
|
| 269 |
+
1132 33499189 1
|
| 270 |
+
1132 9283422 1
|
| 271 |
+
1137 33370 1
|
| 272 |
+
1140 12009265 1
|
| 273 |
+
1144 10071552 1
|
| 274 |
+
1146 13906581 1
|
| 275 |
+
1150 11369420 1
|
| 276 |
+
1163 15305881 1
|
| 277 |
+
1175 31272411 1
|
| 278 |
+
1179 31272411 1
|
| 279 |
+
1180 31272411 1
|
| 280 |
+
1185 16737210 1
|
| 281 |
+
1187 52873726 1
|
| 282 |
+
1191 30655442 1
|
| 283 |
+
1194 11419230 1
|
| 284 |
+
1196 25649714 1
|
| 285 |
+
1197 25649714 1
|
| 286 |
+
1199 16760369 1
|
| 287 |
+
1200 3441524 1
|
| 288 |
+
1202 3475317 1
|
| 289 |
+
1204 31141365 1
|
| 290 |
+
1207 18909530 1
|
| 291 |
+
1213 14407673 1
|
| 292 |
+
1216 24142891 1
|
| 293 |
+
1221 19736671 1
|
| 294 |
+
1225 9650982 1
|
| 295 |
+
1226 13777138 1
|
| 296 |
+
1232 13905670 1
|
| 297 |
+
1241 4427392 1
|
| 298 |
+
1245 7662395 1
|
| 299 |
+
1259 24341590 1
|
| 300 |
+
1262 44172171 1
|
| 301 |
+
1266 37480103 1
|
| 302 |
+
1270 13900610 1
|
| 303 |
+
1271 13768432 1
|
| 304 |
+
1272 17081238 1
|
| 305 |
+
1273 11041152 1
|
| 306 |
+
1274 12428814 1
|
| 307 |
+
1274 27731651 1
|
| 308 |
+
1274 4406819 1
|
| 309 |
+
1278 11335781 1
|
| 310 |
+
1279 11335781 1
|
| 311 |
+
1280 4387784 1
|
| 312 |
+
1281 4387784 1
|
| 313 |
+
1282 23649163 1
|
| 314 |
+
1290 4687948 1
|
| 315 |
+
1292 56893404 1
|
| 316 |
+
1298 11718220 1
|
| 317 |
+
1303 12631697 1
|
| 318 |
+
1316 27910499 1
|
| 319 |
+
1319 16284655 1
|
| 320 |
+
1320 16284655 1
|
| 321 |
+
1332 5304891 1
|
| 322 |
+
1335 27910499 1
|
| 323 |
+
1336 27910499 1
|
| 324 |
+
1337 20231138 1
|
| 325 |
+
1339 15482274 1
|
| 326 |
+
1344 9559146 1
|
| 327 |
+
1352 12885341 1
|
| 328 |
+
1359 11614737 1
|
| 329 |
+
1362 8290953 1
|
| 330 |
+
1363 8290953 1
|
| 331 |
+
1368 2425364 1
|
| 332 |
+
1370 2425364 1
|
| 333 |
+
1379 16322674 1
|
| 334 |
+
1379 27123743 1
|
| 335 |
+
1379 23557241 1
|
| 336 |
+
1379 17450673 1
|
| 337 |
+
1382 17755060 1
|
| 338 |
+
1385 306006 1
|
| 339 |
+
1389 23895668 1
|
| 340 |
+
1395 17717391 1
|
data/scifact/qrels/train.tsv
ADDED
|
@@ -0,0 +1,920 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
query-id corpus-id score
|
| 2 |
+
0 31715818 1
|
| 3 |
+
2 13734012 1
|
| 4 |
+
4 22942787 1
|
| 5 |
+
6 2613775 1
|
| 6 |
+
9 44265107 1
|
| 7 |
+
10 32587939 1
|
| 8 |
+
11 32587939 1
|
| 9 |
+
12 33409100 1
|
| 10 |
+
14 641786 1
|
| 11 |
+
15 22080671 1
|
| 12 |
+
17 1606628 1
|
| 13 |
+
18 22942787 1
|
| 14 |
+
19 3202143 1
|
| 15 |
+
20 3202143 1
|
| 16 |
+
21 41493639 1
|
| 17 |
+
22 6490571 1
|
| 18 |
+
24 3471191 1
|
| 19 |
+
25 2613775 1
|
| 20 |
+
26 32390525 1
|
| 21 |
+
27 32390525 1
|
| 22 |
+
28 12670680 1
|
| 23 |
+
30 24341590 1
|
| 24 |
+
32 12428497 1
|
| 25 |
+
34 11705328 1
|
| 26 |
+
35 5152028 1
|
| 27 |
+
35 11705328 1
|
| 28 |
+
37 5152028 1
|
| 29 |
+
37 11705328 1
|
| 30 |
+
39 13497630 1
|
| 31 |
+
40 13497630 1
|
| 32 |
+
41 18174210 1
|
| 33 |
+
43 7224723 1
|
| 34 |
+
44 56893404 1
|
| 35 |
+
45 56893404 1
|
| 36 |
+
46 380526 1
|
| 37 |
+
47 3512154 1
|
| 38 |
+
47 26996935 1
|
| 39 |
+
52 45638119 1
|
| 40 |
+
55 49556906 1
|
| 41 |
+
58 4709641 1
|
| 42 |
+
60 13899137 1
|
| 43 |
+
60 13901073 1
|
| 44 |
+
61 13899137 1
|
| 45 |
+
61 13901073 1
|
| 46 |
+
62 32587939 1
|
| 47 |
+
63 40349336 1
|
| 48 |
+
64 40349336 1
|
| 49 |
+
66 14806256 1
|
| 50 |
+
67 21295300 1
|
| 51 |
+
68 21295300 1
|
| 52 |
+
69 5956380 1
|
| 53 |
+
69 4414547 1
|
| 54 |
+
71 1127562 1
|
| 55 |
+
73 6076903 1
|
| 56 |
+
74 4387784 1
|
| 57 |
+
76 5531479 1
|
| 58 |
+
77 5531479 1
|
| 59 |
+
78 5099266 1
|
| 60 |
+
79 5099266 1
|
| 61 |
+
80 4920376 1
|
| 62 |
+
81 1797622 1
|
| 63 |
+
82 3619372 1
|
| 64 |
+
85 7521113 1
|
| 65 |
+
85 22406695 1
|
| 66 |
+
86 7521113 1
|
| 67 |
+
86 22406695 1
|
| 68 |
+
88 7521113 1
|
| 69 |
+
88 22406695 1
|
| 70 |
+
89 7521113 1
|
| 71 |
+
89 22406695 1
|
| 72 |
+
90 22406695 1
|
| 73 |
+
91 1084345 1
|
| 74 |
+
92 1084345 1
|
| 75 |
+
93 2692522 1
|
| 76 |
+
95 1215116 1
|
| 77 |
+
96 14500725 1
|
| 78 |
+
98 6540064 1
|
| 79 |
+
104 40164383 1
|
| 80 |
+
105 36606083 1
|
| 81 |
+
106 25515907 1
|
| 82 |
+
106 5151024 1
|
| 83 |
+
108 6191684 1
|
| 84 |
+
108 22995579 1
|
| 85 |
+
108 23865182 1
|
| 86 |
+
109 4319174 1
|
| 87 |
+
111 13513790 1
|
| 88 |
+
112 6157837 1
|
| 89 |
+
114 33872649 1
|
| 90 |
+
116 33872649 1
|
| 91 |
+
119 14606752 1
|
| 92 |
+
120 14606752 1
|
| 93 |
+
121 31460499 1
|
| 94 |
+
122 31460499 1
|
| 95 |
+
123 4883040 1
|
| 96 |
+
126 24512064 1
|
| 97 |
+
134 4695046 1
|
| 98 |
+
138 26016929 1
|
| 99 |
+
139 22080671 1
|
| 100 |
+
144 10582939 1
|
| 101 |
+
149 6227220 1
|
| 102 |
+
152 15488881 1
|
| 103 |
+
153 4702639 1
|
| 104 |
+
154 4702639 1
|
| 105 |
+
155 37549932 1
|
| 106 |
+
156 37549932 1
|
| 107 |
+
157 13439128 1
|
| 108 |
+
159 9394119 1
|
| 109 |
+
160 52874170 1
|
| 110 |
+
161 6903077 1
|
| 111 |
+
164 5824985 1
|
| 112 |
+
165 5824985 1
|
| 113 |
+
166 18872233 1
|
| 114 |
+
167 18872233 1
|
| 115 |
+
168 5824985 1
|
| 116 |
+
169 5824985 1
|
| 117 |
+
172 12670680 1
|
| 118 |
+
173 8126244 1
|
| 119 |
+
174 1710116 1
|
| 120 |
+
175 1710116 1
|
| 121 |
+
176 32587939 1
|
| 122 |
+
177 9669099 1
|
| 123 |
+
178 16322674 1
|
| 124 |
+
178 27123743 1
|
| 125 |
+
178 23557241 1
|
| 126 |
+
178 17450673 1
|
| 127 |
+
181 16966326 1
|
| 128 |
+
182 11369420 1
|
| 129 |
+
184 12827098 1
|
| 130 |
+
186 16855829 1
|
| 131 |
+
187 16855829 1
|
| 132 |
+
189 4421578 1
|
| 133 |
+
196 19313533 1
|
| 134 |
+
197 2177022 1
|
| 135 |
+
199 2177022 1
|
| 136 |
+
200 18231807 1
|
| 137 |
+
201 2462673 1
|
| 138 |
+
203 9558539 1
|
| 139 |
+
204 7898952 1
|
| 140 |
+
205 7898952 1
|
| 141 |
+
205 470625 1
|
| 142 |
+
209 32587939 1
|
| 143 |
+
210 13794374 1
|
| 144 |
+
211 13794374 1
|
| 145 |
+
214 13625993 1
|
| 146 |
+
220 19205437 1
|
| 147 |
+
221 19205437 1
|
| 148 |
+
222 19205437 1
|
| 149 |
+
223 2014909 1
|
| 150 |
+
224 6944800 1
|
| 151 |
+
225 6944800 1
|
| 152 |
+
226 6944800 1
|
| 153 |
+
227 26973393 1
|
| 154 |
+
228 4928057 1
|
| 155 |
+
229 56893404 1
|
| 156 |
+
235 4388470 1
|
| 157 |
+
241 2212067 1
|
| 158 |
+
241 10608822 1
|
| 159 |
+
242 2212067 1
|
| 160 |
+
242 10608822 1
|
| 161 |
+
243 8148122 1
|
| 162 |
+
244 21498497 1
|
| 163 |
+
245 8447873 1
|
| 164 |
+
245 3430789 1
|
| 165 |
+
246 8447873 1
|
| 166 |
+
246 3430789 1
|
| 167 |
+
247 13578199 1
|
| 168 |
+
250 1568684 1
|
| 169 |
+
251 1568684 1
|
| 170 |
+
253 37424881 1
|
| 171 |
+
254 37424881 1
|
| 172 |
+
255 5850219 1
|
| 173 |
+
256 5850219 1
|
| 174 |
+
258 22080671 1
|
| 175 |
+
259 8883846 1
|
| 176 |
+
262 14610165 1
|
| 177 |
+
263 11328820 1
|
| 178 |
+
263 30041340 1
|
| 179 |
+
263 14853989 1
|
| 180 |
+
264 11328820 1
|
| 181 |
+
265 2033917 1
|
| 182 |
+
266 22405338 1
|
| 183 |
+
267 5912283 1
|
| 184 |
+
267 31554917 1
|
| 185 |
+
272 11614737 1
|
| 186 |
+
277 14376683 1
|
| 187 |
+
278 14376683 1
|
| 188 |
+
280 25001628 1
|
| 189 |
+
281 4632921 1
|
| 190 |
+
283 1974176 1
|
| 191 |
+
285 5548081 1
|
| 192 |
+
286 4709641 1
|
| 193 |
+
287 4709641 1
|
| 194 |
+
290 15048300 1
|
| 195 |
+
292 15048300 1
|
| 196 |
+
293 10874408 1
|
| 197 |
+
296 4398832 1
|
| 198 |
+
299 39381118 1
|
| 199 |
+
301 3553087 1
|
| 200 |
+
304 14797520 1
|
| 201 |
+
305 14797520 1
|
| 202 |
+
306 7821634 1
|
| 203 |
+
308 7821634 1
|
| 204 |
+
309 7821634 1
|
| 205 |
+
310 6173523 1
|
| 206 |
+
313 6173523 1
|
| 207 |
+
315 3701541 1
|
| 208 |
+
316 712078 1
|
| 209 |
+
317 4506414 1
|
| 210 |
+
323 2014909 1
|
| 211 |
+
325 40349336 1
|
| 212 |
+
326 40349336 1
|
| 213 |
+
330 9505448 1
|
| 214 |
+
331 9505448 1
|
| 215 |
+
332 29023309 1
|
| 216 |
+
333 29023309 1
|
| 217 |
+
334 25079962 1
|
| 218 |
+
335 1780819 1
|
| 219 |
+
336 2097256 1
|
| 220 |
+
337 2097256 1
|
| 221 |
+
339 23349986 1
|
| 222 |
+
340 7098463 1
|
| 223 |
+
341 7098463 1
|
| 224 |
+
342 7873737 1
|
| 225 |
+
342 5884524 1
|
| 226 |
+
345 4394817 1
|
| 227 |
+
346 11902109 1
|
| 228 |
+
347 11902109 1
|
| 229 |
+
349 13497630 1
|
| 230 |
+
351 14658685 1
|
| 231 |
+
352 14658685 1
|
| 232 |
+
355 12800122 1
|
| 233 |
+
355 38380061 1
|
| 234 |
+
356 6144337 1
|
| 235 |
+
357 18111172 1
|
| 236 |
+
358 18111172 1
|
| 237 |
+
361 38587347 1
|
| 238 |
+
363 5386514 1
|
| 239 |
+
364 1550937 1
|
| 240 |
+
365 600437 1
|
| 241 |
+
366 13956305 1
|
| 242 |
+
367 27099731 1
|
| 243 |
+
368 27099731 1
|
| 244 |
+
369 6826100 1
|
| 245 |
+
370 1550937 1
|
| 246 |
+
371 1550937 1
|
| 247 |
+
372 24922825 1
|
| 248 |
+
375 1522647 1
|
| 249 |
+
376 22401061 1
|
| 250 |
+
377 18810195 1
|
| 251 |
+
378 45154987 1
|
| 252 |
+
378 10534299 1
|
| 253 |
+
378 11886686 1
|
| 254 |
+
378 25007443 1
|
| 255 |
+
378 17150648 1
|
| 256 |
+
379 19005293 1
|
| 257 |
+
381 18340282 1
|
| 258 |
+
382 11659421 1
|
| 259 |
+
383 13770184 1
|
| 260 |
+
389 1148122 1
|
| 261 |
+
390 1148122 1
|
| 262 |
+
391 1148122 1
|
| 263 |
+
392 1148122 1
|
| 264 |
+
393 1148122 1
|
| 265 |
+
394 11360768 1
|
| 266 |
+
396 1456068 1
|
| 267 |
+
397 1456068 1
|
| 268 |
+
398 8883846 1
|
| 269 |
+
400 791050 1
|
| 270 |
+
401 5633876 1
|
| 271 |
+
403 1921218 1
|
| 272 |
+
404 1921218 1
|
| 273 |
+
406 6796297 1
|
| 274 |
+
407 9889151 1
|
| 275 |
+
413 6309659 1
|
| 276 |
+
414 6309659 1
|
| 277 |
+
416 6309659 1
|
| 278 |
+
417 6309659 1
|
| 279 |
+
418 16660256 1
|
| 280 |
+
420 9315213 1
|
| 281 |
+
422 11172205 1
|
| 282 |
+
423 8595678 1
|
| 283 |
+
425 33257464 1
|
| 284 |
+
426 16728949 1
|
| 285 |
+
428 16728949 1
|
| 286 |
+
429 36540079 1
|
| 287 |
+
430 28937856 1
|
| 288 |
+
432 8002887 1
|
| 289 |
+
434 9500590 1
|
| 290 |
+
435 9500590 1
|
| 291 |
+
441 2014909 1
|
| 292 |
+
444 10165258 1
|
| 293 |
+
445 10165258 1
|
| 294 |
+
447 2052720 1
|
| 295 |
+
448 2052720 1
|
| 296 |
+
449 12209494 1
|
| 297 |
+
449 3430789 1
|
| 298 |
+
453 4200695 1
|
| 299 |
+
454 4200695 1
|
| 300 |
+
455 12643937 1
|
| 301 |
+
456 30507607 1
|
| 302 |
+
458 597790 1
|
| 303 |
+
461 40096222 1
|
| 304 |
+
463 19736671 1
|
| 305 |
+
466 22544171 1
|
| 306 |
+
469 1410197 1
|
| 307 |
+
470 12685434 1
|
| 308 |
+
472 7185591 1
|
| 309 |
+
472 26330861 1
|
| 310 |
+
472 4414481 1
|
| 311 |
+
473 4373433 1
|
| 312 |
+
474 4373433 1
|
| 313 |
+
479 6325527 1
|
| 314 |
+
480 6325527 1
|
| 315 |
+
481 14706752 1
|
| 316 |
+
482 10991183 1
|
| 317 |
+
483 22703082 1
|
| 318 |
+
484 14637235 1
|
| 319 |
+
485 14637235 1
|
| 320 |
+
486 14637235 1
|
| 321 |
+
487 14637235 1
|
| 322 |
+
488 1780819 1
|
| 323 |
+
489 6625693 1
|
| 324 |
+
490 56893404 1
|
| 325 |
+
492 19583924 1
|
| 326 |
+
493 19583924 1
|
| 327 |
+
494 34873974 1
|
| 328 |
+
495 17077004 1
|
| 329 |
+
498 17077004 1
|
| 330 |
+
499 26064662 1
|
| 331 |
+
500 17930286 1
|
| 332 |
+
504 10883736 1
|
| 333 |
+
505 22703082 1
|
| 334 |
+
506 7433668 1
|
| 335 |
+
509 13980338 1
|
| 336 |
+
515 29564505 1
|
| 337 |
+
523 14803797 1
|
| 338 |
+
524 14803797 1
|
| 339 |
+
526 3863543 1
|
| 340 |
+
529 10546779 1
|
| 341 |
+
529 25413327 1
|
| 342 |
+
529 36651210 1
|
| 343 |
+
530 10546779 1
|
| 344 |
+
530 25413327 1
|
| 345 |
+
530 36651210 1
|
| 346 |
+
530 87610599 1
|
| 347 |
+
531 10546779 1
|
| 348 |
+
531 25413327 1
|
| 349 |
+
531 36651210 1
|
| 350 |
+
537 16056514 1
|
| 351 |
+
541 45154987 1
|
| 352 |
+
541 11886686 1
|
| 353 |
+
541 25007443 1
|
| 354 |
+
542 19688024 1
|
| 355 |
+
545 24221369 1
|
| 356 |
+
547 10648422 1
|
| 357 |
+
548 18199839 1
|
| 358 |
+
550 33499189 1
|
| 359 |
+
553 1471041 1
|
| 360 |
+
555 1049501 1
|
| 361 |
+
557 1049501 1
|
| 362 |
+
559 3475317 1
|
| 363 |
+
562 20101846 1
|
| 364 |
+
563 2867345 1
|
| 365 |
+
564 2867345 1
|
| 366 |
+
565 16120395 1
|
| 367 |
+
566 16120395 1
|
| 368 |
+
568 23418635 1
|
| 369 |
+
570 20333864 1
|
| 370 |
+
571 20333864 1
|
| 371 |
+
572 4447055 1
|
| 372 |
+
573 10300888 1
|
| 373 |
+
574 10300888 1
|
| 374 |
+
576 4468861 1
|
| 375 |
+
579 34139429 1
|
| 376 |
+
580 23460562 1
|
| 377 |
+
582 14260013 1
|
| 378 |
+
584 14260013 1
|
| 379 |
+
585 42291761 1
|
| 380 |
+
588 16999023 1
|
| 381 |
+
590 10984005 1
|
| 382 |
+
591 14682243 1
|
| 383 |
+
592 14682243 1
|
| 384 |
+
594 19675911 1
|
| 385 |
+
595 4824840 1
|
| 386 |
+
600 12258338 1
|
| 387 |
+
601 12258338 1
|
| 388 |
+
602 3701541 1
|
| 389 |
+
603 6540064 1
|
| 390 |
+
606 712078 1
|
| 391 |
+
607 4506414 1
|
| 392 |
+
609 40096222 1
|
| 393 |
+
610 40096222 1
|
| 394 |
+
611 32408470 1
|
| 395 |
+
612 9638032 1
|
| 396 |
+
614 9638032 1
|
| 397 |
+
615 9638032 1
|
| 398 |
+
616 18670 1
|
| 399 |
+
617 18670 1
|
| 400 |
+
618 6836086 1
|
| 401 |
+
620 2565138 1
|
| 402 |
+
621 1642727 1
|
| 403 |
+
622 17000834 1
|
| 404 |
+
624 20033112 1
|
| 405 |
+
625 20033112 1
|
| 406 |
+
626 16355392 1
|
| 407 |
+
631 5468807 1
|
| 408 |
+
632 5172048 1
|
| 409 |
+
633 5172048 1
|
| 410 |
+
635 1686997 1
|
| 411 |
+
638 25649714 1
|
| 412 |
+
640 6503185 1
|
| 413 |
+
642 13619127 1
|
| 414 |
+
643 15535511 1
|
| 415 |
+
645 12810152 1
|
| 416 |
+
646 12810152 1
|
| 417 |
+
647 15041758 1
|
| 418 |
+
648 15041758 1
|
| 419 |
+
650 12789595 1
|
| 420 |
+
651 9433958 1
|
| 421 |
+
652 9433958 1
|
| 422 |
+
653 24384587 1
|
| 423 |
+
654 57574395 1
|
| 424 |
+
655 57574395 1
|
| 425 |
+
657 8533245 1
|
| 426 |
+
658 5293024 1
|
| 427 |
+
661 37204802 1
|
| 428 |
+
662 37204802 1
|
| 429 |
+
663 22080671 1
|
| 430 |
+
665 12580014 1
|
| 431 |
+
666 4469125 1
|
| 432 |
+
667 6493422 1
|
| 433 |
+
668 6493422 1
|
| 434 |
+
668 25148216 1
|
| 435 |
+
669 6493422 1
|
| 436 |
+
669 25148216 1
|
| 437 |
+
670 5573975 1
|
| 438 |
+
671 5573975 1
|
| 439 |
+
672 15635366 1
|
| 440 |
+
673 2095573 1
|
| 441 |
+
676 857189 1
|
| 442 |
+
677 857189 1
|
| 443 |
+
679 13639330 1
|
| 444 |
+
680 9315213 1
|
| 445 |
+
681 9315213 1
|
| 446 |
+
682 9315213 1
|
| 447 |
+
683 9315213 1
|
| 448 |
+
685 4452659 1
|
| 449 |
+
686 4452659 1
|
| 450 |
+
687 4452659 1
|
| 451 |
+
688 4452659 1
|
| 452 |
+
689 22080671 1
|
| 453 |
+
694 1071991 1
|
| 454 |
+
696 16355392 1
|
| 455 |
+
698 22544171 1
|
| 456 |
+
703 4350400 1
|
| 457 |
+
704 14658685 1
|
| 458 |
+
705 22442133 1
|
| 459 |
+
709 22442133 1
|
| 460 |
+
710 22442133 1
|
| 461 |
+
713 18421962 1
|
| 462 |
+
714 18421962 1
|
| 463 |
+
717 17587795 1
|
| 464 |
+
724 5531479 1
|
| 465 |
+
726 7521113 1
|
| 466 |
+
726 36444198 1
|
| 467 |
+
730 13400643 1
|
| 468 |
+
732 34469966 1
|
| 469 |
+
733 34469966 1
|
| 470 |
+
734 4961038 1
|
| 471 |
+
736 5389095 1
|
| 472 |
+
737 16562534 1
|
| 473 |
+
737 6609935 1
|
| 474 |
+
738 16562534 1
|
| 475 |
+
738 6609935 1
|
| 476 |
+
738 33912020 1
|
| 477 |
+
739 4446814 1
|
| 478 |
+
740 23078022 1
|
| 479 |
+
745 11291348 1
|
| 480 |
+
746 11291348 1
|
| 481 |
+
747 11291348 1
|
| 482 |
+
748 11291348 1
|
| 483 |
+
749 13868795 1
|
| 484 |
+
751 19800147 1
|
| 485 |
+
752 19800147 1
|
| 486 |
+
753 1173667 1
|
| 487 |
+
755 17844478 1
|
| 488 |
+
757 17123657 1
|
| 489 |
+
758 14195528 1
|
| 490 |
+
760 1805641 1
|
| 491 |
+
761 10009203 1
|
| 492 |
+
762 4695046 1
|
| 493 |
+
764 7552215 1
|
| 494 |
+
765 7552215 1
|
| 495 |
+
766 7552215 1
|
| 496 |
+
767 2488880 1
|
| 497 |
+
771 15476777 1
|
| 498 |
+
772 24922825 1
|
| 499 |
+
774 32275758 1
|
| 500 |
+
776 32275758 1
|
| 501 |
+
777 32275758 1
|
| 502 |
+
778 13001323 1
|
| 503 |
+
779 13001323 1
|
| 504 |
+
780 8246922 1
|
| 505 |
+
780 24338780 1
|
| 506 |
+
782 8246922 1
|
| 507 |
+
787 4740447 1
|
| 508 |
+
788 4740447 1
|
| 509 |
+
789 15493354 1
|
| 510 |
+
790 15493354 1
|
| 511 |
+
791 15984735 1
|
| 512 |
+
792 3610080 1
|
| 513 |
+
795 8551160 1
|
| 514 |
+
797 8551160 1
|
| 515 |
+
798 8551160 1
|
| 516 |
+
799 5293024 1
|
| 517 |
+
801 22180793 1
|
| 518 |
+
802 22180793 1
|
| 519 |
+
803 22180793 1
|
| 520 |
+
804 22180793 1
|
| 521 |
+
807 36606083 1
|
| 522 |
+
810 13513790 1
|
| 523 |
+
812 19799455 1
|
| 524 |
+
813 33387953 1
|
| 525 |
+
815 8148304 1
|
| 526 |
+
816 8148304 1
|
| 527 |
+
817 17814815 1
|
| 528 |
+
818 17814815 1
|
| 529 |
+
822 15319019 1
|
| 530 |
+
825 15319019 1
|
| 531 |
+
826 4678846 1
|
| 532 |
+
828 4678846 1
|
| 533 |
+
835 15928989 1
|
| 534 |
+
838 15928989 1
|
| 535 |
+
840 15663829 1
|
| 536 |
+
841 15663829 1
|
| 537 |
+
844 17741440 1
|
| 538 |
+
846 22696649 1
|
| 539 |
+
848 14500725 1
|
| 540 |
+
853 24922825 1
|
| 541 |
+
854 12206390 1
|
| 542 |
+
855 8190282 1
|
| 543 |
+
856 43334921 1
|
| 544 |
+
857 43334921 1
|
| 545 |
+
858 1982286 1
|
| 546 |
+
860 16066726 1
|
| 547 |
+
861 16066726 1
|
| 548 |
+
863 20568364 1
|
| 549 |
+
863 16361581 1
|
| 550 |
+
866 37822406 1
|
| 551 |
+
867 14340571 1
|
| 552 |
+
871 195689316 1
|
| 553 |
+
876 195689316 1
|
| 554 |
+
877 313394 1
|
| 555 |
+
881 14803797 1
|
| 556 |
+
883 14803797 1
|
| 557 |
+
884 14803797 1
|
| 558 |
+
885 6477536 1
|
| 559 |
+
886 6477536 1
|
| 560 |
+
890 2097256 1
|
| 561 |
+
891 2097256 1
|
| 562 |
+
893 13509809 1
|
| 563 |
+
894 14724693 1
|
| 564 |
+
895 18750453 1
|
| 565 |
+
896 14338915 1
|
| 566 |
+
897 14338915 1
|
| 567 |
+
898 13106686 1
|
| 568 |
+
898 5572127 1
|
| 569 |
+
899 13106686 1
|
| 570 |
+
899 5572127 1
|
| 571 |
+
900 18678095 1
|
| 572 |
+
901 6540064 1
|
| 573 |
+
902 10648422 1
|
| 574 |
+
908 6923961 1
|
| 575 |
+
909 11254556 1
|
| 576 |
+
910 11254556 1
|
| 577 |
+
912 11254556 1
|
| 578 |
+
916 18037805 1
|
| 579 |
+
917 34071621 1
|
| 580 |
+
919 16422880 1
|
| 581 |
+
923 17077004 1
|
| 582 |
+
925 17077004 1
|
| 583 |
+
926 16390264 1
|
| 584 |
+
927 16390264 1
|
| 585 |
+
928 18174210 1
|
| 586 |
+
929 18174210 1
|
| 587 |
+
930 16056514 1
|
| 588 |
+
933 14711483 1
|
| 589 |
+
934 8563659 1
|
| 590 |
+
935 5483793 1
|
| 591 |
+
938 26231129 1
|
| 592 |
+
939 26231129 1
|
| 593 |
+
940 12258338 1
|
| 594 |
+
941 12258338 1
|
| 595 |
+
942 11527199 1
|
| 596 |
+
944 1642727 1
|
| 597 |
+
945 8428935 1
|
| 598 |
+
945 26112696 1
|
| 599 |
+
945 4463588 1
|
| 600 |
+
945 13083189 1
|
| 601 |
+
946 8428935 1
|
| 602 |
+
946 26112696 1
|
| 603 |
+
946 4463588 1
|
| 604 |
+
946 13083189 1
|
| 605 |
+
949 13578199 1
|
| 606 |
+
951 21414718 1
|
| 607 |
+
952 3355397 1
|
| 608 |
+
953 3355397 1
|
| 609 |
+
954 3355397 1
|
| 610 |
+
955 2078658 1
|
| 611 |
+
955 30507607 1
|
| 612 |
+
959 8780599 1
|
| 613 |
+
962 13931771 1
|
| 614 |
+
962 935538 1
|
| 615 |
+
962 4306711 1
|
| 616 |
+
963 4162857 1
|
| 617 |
+
963 29828242 1
|
| 618 |
+
964 4162857 1
|
| 619 |
+
964 29828242 1
|
| 620 |
+
965 40817021 1
|
| 621 |
+
969 19356271 1
|
| 622 |
+
969 17368516 1
|
| 623 |
+
970 19356271 1
|
| 624 |
+
970 17368516 1
|
| 625 |
+
972 46695481 1
|
| 626 |
+
972 27873158 1
|
| 627 |
+
972 28617573 1
|
| 628 |
+
972 9764256 1
|
| 629 |
+
973 27446873 1
|
| 630 |
+
973 27873158 1
|
| 631 |
+
973 28617573 1
|
| 632 |
+
973 9764256 1
|
| 633 |
+
976 5304891 1
|
| 634 |
+
977 14075252 1
|
| 635 |
+
977 39264456 1
|
| 636 |
+
978 14075252 1
|
| 637 |
+
979 11659421 1
|
| 638 |
+
980 20128547 1
|
| 639 |
+
984 6828370 1
|
| 640 |
+
988 3033830 1
|
| 641 |
+
989 9988425 1
|
| 642 |
+
990 16472469 1
|
| 643 |
+
992 16472469 1
|
| 644 |
+
994 16472469 1
|
| 645 |
+
996 16472469 1
|
| 646 |
+
997 16472469 1
|
| 647 |
+
998 16472469 1
|
| 648 |
+
999 16472469 1
|
| 649 |
+
1000 16472469 1
|
| 650 |
+
1001 5702790 1
|
| 651 |
+
1002 13639330 1
|
| 652 |
+
1003 14332945 1
|
| 653 |
+
1003 4319844 1
|
| 654 |
+
1003 4899981 1
|
| 655 |
+
1004 301838 1
|
| 656 |
+
1004 2734421 1
|
| 657 |
+
1004 3952288 1
|
| 658 |
+
1005 301838 1
|
| 659 |
+
1005 2734421 1
|
| 660 |
+
1005 3952288 1
|
| 661 |
+
1006 4926049 1
|
| 662 |
+
1008 2547636 1
|
| 663 |
+
1009 1982286 1
|
| 664 |
+
1011 9745001 1
|
| 665 |
+
1015 6277638 1
|
| 666 |
+
1016 6277638 1
|
| 667 |
+
1018 11603066 1
|
| 668 |
+
1023 16927286 1
|
| 669 |
+
1025 32408470 1
|
| 670 |
+
1026 3113630 1
|
| 671 |
+
1027 3113630 1
|
| 672 |
+
1028 13923140 1
|
| 673 |
+
1028 11899391 1
|
| 674 |
+
1030 6441369 1
|
| 675 |
+
1031 12486491 1
|
| 676 |
+
1032 6836086 1
|
| 677 |
+
1033 6836086 1
|
| 678 |
+
1034 4547102 1
|
| 679 |
+
1035 4547102 1
|
| 680 |
+
1036 4547102 1
|
| 681 |
+
1037 16287725 1
|
| 682 |
+
1038 16287725 1
|
| 683 |
+
1040 25254425 1
|
| 684 |
+
1040 16626264 1
|
| 685 |
+
1042 17421851 1
|
| 686 |
+
1043 17671145 1
|
| 687 |
+
1044 22500262 1
|
| 688 |
+
1045 22500262 1
|
| 689 |
+
1046 418246 1
|
| 690 |
+
1046 4324278 1
|
| 691 |
+
1046 16712164 1
|
| 692 |
+
1047 14706752 1
|
| 693 |
+
1048 12486491 1
|
| 694 |
+
1050 19878070 1
|
| 695 |
+
1052 18816720 1
|
| 696 |
+
1053 18816720 1
|
| 697 |
+
1054 10072941 1
|
| 698 |
+
1055 13906581 1
|
| 699 |
+
1056 4200695 1
|
| 700 |
+
1058 13027590 1
|
| 701 |
+
1065 20418809 1
|
| 702 |
+
1067 4429668 1
|
| 703 |
+
1068 4429668 1
|
| 704 |
+
1069 4200695 1
|
| 705 |
+
1070 25649714 1
|
| 706 |
+
1072 4824840 1
|
| 707 |
+
1073 4824840 1
|
| 708 |
+
1074 14658685 1
|
| 709 |
+
1075 14658685 1
|
| 710 |
+
1081 5691302 1
|
| 711 |
+
1084 5691302 1
|
| 712 |
+
1085 5691302 1
|
| 713 |
+
1087 39281140 1
|
| 714 |
+
1090 17628888 1
|
| 715 |
+
1091 2603304 1
|
| 716 |
+
1096 29638116 1
|
| 717 |
+
1097 26851674 1
|
| 718 |
+
1098 13552682 1
|
| 719 |
+
1101 3874000 1
|
| 720 |
+
1102 3874000 1
|
| 721 |
+
1103 3898784 1
|
| 722 |
+
1105 6710713 1
|
| 723 |
+
1106 6710713 1
|
| 724 |
+
1109 13770184 1
|
| 725 |
+
1109 8582337 1
|
| 726 |
+
1111 1686881 1
|
| 727 |
+
1112 1686881 1
|
| 728 |
+
1114 12824568 1
|
| 729 |
+
1115 44048701 1
|
| 730 |
+
1118 23351136 1
|
| 731 |
+
1119 5323845 1
|
| 732 |
+
1119 18997216 1
|
| 733 |
+
1119 13907928 1
|
| 734 |
+
1120 5323845 1
|
| 735 |
+
1120 18997216 1
|
| 736 |
+
1120 13907928 1
|
| 737 |
+
1125 21009874 1
|
| 738 |
+
1126 21009874 1
|
| 739 |
+
1127 27466734 1
|
| 740 |
+
1128 33499189 1
|
| 741 |
+
1128 9283422 1
|
| 742 |
+
1133 24142891 1
|
| 743 |
+
1134 33370 1
|
| 744 |
+
1135 33370 1
|
| 745 |
+
1136 33370 1
|
| 746 |
+
1138 6796297 1
|
| 747 |
+
1139 12009265 1
|
| 748 |
+
1141 12009265 1
|
| 749 |
+
1142 5260382 1
|
| 750 |
+
1145 10071552 1
|
| 751 |
+
1148 4828631 1
|
| 752 |
+
1153 7370282 1
|
| 753 |
+
1156 12584053 1
|
| 754 |
+
1157 12584053 1
|
| 755 |
+
1158 12584053 1
|
| 756 |
+
1159 12584053 1
|
| 757 |
+
1161 13048272 1
|
| 758 |
+
1162 15305881 1
|
| 759 |
+
1164 4455466 1
|
| 760 |
+
1165 4455466 1
|
| 761 |
+
1166 9889151 1
|
| 762 |
+
1168 8563659 1
|
| 763 |
+
1169 4319174 1
|
| 764 |
+
1170 18956141 1
|
| 765 |
+
1171 18956141 1
|
| 766 |
+
1173 7370282 1
|
| 767 |
+
1174 31272411 1
|
| 768 |
+
1176 13910150 1
|
| 769 |
+
1177 13910150 1
|
| 770 |
+
1178 31272411 1
|
| 771 |
+
1181 301838 1
|
| 772 |
+
1181 2734421 1
|
| 773 |
+
1181 39128592 1
|
| 774 |
+
1181 3952288 1
|
| 775 |
+
1182 14541844 1
|
| 776 |
+
1183 1967017 1
|
| 777 |
+
1184 16737210 1
|
| 778 |
+
1186 7485455 1
|
| 779 |
+
1188 4394817 1
|
| 780 |
+
1190 30655442 1
|
| 781 |
+
1193 20532591 1
|
| 782 |
+
1195 26283293 1
|
| 783 |
+
1205 5558754 1
|
| 784 |
+
1206 18909530 1
|
| 785 |
+
1208 10284593 1
|
| 786 |
+
1209 4347374 1
|
| 787 |
+
1210 4928282 1
|
| 788 |
+
1211 4928282 1
|
| 789 |
+
1212 6493422 1
|
| 790 |
+
1212 44724517 1
|
| 791 |
+
1214 6493422 1
|
| 792 |
+
1214 14407673 1
|
| 793 |
+
1215 16355392 1
|
| 794 |
+
1218 15635366 1
|
| 795 |
+
1219 9393969 1
|
| 796 |
+
1219 14864285 1
|
| 797 |
+
1220 13023410 1
|
| 798 |
+
1223 5289038 1
|
| 799 |
+
1224 21932050 1
|
| 800 |
+
1224 34016987 1
|
| 801 |
+
1227 25641414 1
|
| 802 |
+
1228 25641414 1
|
| 803 |
+
1229 1676568 1
|
| 804 |
+
1230 13905670 1
|
| 805 |
+
1231 13905670 1
|
| 806 |
+
1234 13905670 1
|
| 807 |
+
1235 17973161 1
|
| 808 |
+
1236 17973161 1
|
| 809 |
+
1237 3654468 1
|
| 810 |
+
1238 3654468 1
|
| 811 |
+
1239 21387297 1
|
| 812 |
+
1239 4427392 1
|
| 813 |
+
1244 18949516 1
|
| 814 |
+
1246 7662395 1
|
| 815 |
+
1247 5114282 1
|
| 816 |
+
1248 7209559 1
|
| 817 |
+
1249 7209559 1
|
| 818 |
+
1253 3321943 1
|
| 819 |
+
1254 16939583 1
|
| 820 |
+
1255 16939583 1
|
| 821 |
+
1257 581832 1
|
| 822 |
+
1258 12040627 1
|
| 823 |
+
1260 24341590 1
|
| 824 |
+
1261 13023410 1
|
| 825 |
+
1263 3981729 1
|
| 826 |
+
1265 37480103 1
|
| 827 |
+
1268 52072815 1
|
| 828 |
+
1269 13900610 1
|
| 829 |
+
1275 27731651 1
|
| 830 |
+
1276 3475317 1
|
| 831 |
+
1284 3578380 1
|
| 832 |
+
1288 4687948 1
|
| 833 |
+
1289 21239672 1
|
| 834 |
+
1291 56893404 1
|
| 835 |
+
1293 43329366 1
|
| 836 |
+
1294 2078658 1
|
| 837 |
+
1294 30507607 1
|
| 838 |
+
1295 21239672 1
|
| 839 |
+
1297 9167230 1
|
| 840 |
+
1300 6421792 1
|
| 841 |
+
1302 12631697 1
|
| 842 |
+
1304 12631697 1
|
| 843 |
+
1305 12631697 1
|
| 844 |
+
1306 6000423 1
|
| 845 |
+
1306 5836 1
|
| 846 |
+
1307 18231807 1
|
| 847 |
+
1308 18231807 1
|
| 848 |
+
1309 18231807 1
|
| 849 |
+
1310 8042158 1
|
| 850 |
+
1311 13763195 1
|
| 851 |
+
1312 24177706 1
|
| 852 |
+
1314 13072112 1
|
| 853 |
+
1314 16237005 1
|
| 854 |
+
1315 13072112 1
|
| 855 |
+
1315 16237005 1
|
| 856 |
+
1322 16284655 1
|
| 857 |
+
1323 19912367 1
|
| 858 |
+
1324 19912367 1
|
| 859 |
+
1325 40476126 1
|
| 860 |
+
1327 24241932 1
|
| 861 |
+
1327 22194407 1
|
| 862 |
+
1328 3475317 1
|
| 863 |
+
1330 14075252 1
|
| 864 |
+
1331 14075252 1
|
| 865 |
+
1333 1649738 1
|
| 866 |
+
1334 13923140 1
|
| 867 |
+
1334 13940200 1
|
| 868 |
+
1334 11899391 1
|
| 869 |
+
1340 15482274 1
|
| 870 |
+
1341 15482274 1
|
| 871 |
+
1342 8148122 1
|
| 872 |
+
1345 9559146 1
|
| 873 |
+
1346 9505402 1
|
| 874 |
+
1347 19005293 1
|
| 875 |
+
1348 19005293 1
|
| 876 |
+
1349 5377642 1
|
| 877 |
+
1350 5377642 1
|
| 878 |
+
1351 28369117 1
|
| 879 |
+
1353 18816720 1
|
| 880 |
+
1355 5256564 1
|
| 881 |
+
1356 13764090 1
|
| 882 |
+
1360 11614737 1
|
| 883 |
+
1361 15488881 1
|
| 884 |
+
1361 15058155 1
|
| 885 |
+
1364 8290953 1
|
| 886 |
+
1366 4406819 1
|
| 887 |
+
1367 2425364 1
|
| 888 |
+
1371 16256507 1
|
| 889 |
+
1372 21003930 1
|
| 890 |
+
1373 21003930 1
|
| 891 |
+
1374 21993510 1
|
| 892 |
+
1375 21993510 1
|
| 893 |
+
1376 3944632 1
|
| 894 |
+
1378 2488880 1
|
| 895 |
+
1380 16322674 1
|
| 896 |
+
1380 23557241 1
|
| 897 |
+
1380 17450673 1
|
| 898 |
+
1381 13481880 1
|
| 899 |
+
1383 17755060 1
|
| 900 |
+
1386 306006 1
|
| 901 |
+
1387 9669099 1
|
| 902 |
+
1390 2890952 1
|
| 903 |
+
1391 6766459 1
|
| 904 |
+
1392 6766459 1
|
| 905 |
+
1393 2000038 1
|
| 906 |
+
1393 12440953 1
|
| 907 |
+
1394 2251426 1
|
| 908 |
+
1397 17717391 1
|
| 909 |
+
1398 17717391 1
|
| 910 |
+
1400 14706752 1
|
| 911 |
+
1401 5185871 1
|
| 912 |
+
1402 8126244 1
|
| 913 |
+
1403 33370 1
|
| 914 |
+
1403 38355793 1
|
| 915 |
+
1404 33370 1
|
| 916 |
+
1404 38355793 1
|
| 917 |
+
1405 10504681 1
|
| 918 |
+
1406 2617858 1
|
| 919 |
+
1407 8087082 1
|
| 920 |
+
1407 29863668 1
|
data/scifact/queries.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
semantic-search:
|
| 3 |
+
build:
|
| 4 |
+
context: .
|
| 5 |
+
dockerfile: Dockerfile
|
| 6 |
+
container_name: semantic-search
|
| 7 |
+
ports:
|
| 8 |
+
- "7860:7860"
|
| 9 |
+
environment:
|
| 10 |
+
NLTK_DATA: /usr/local/share/nltk_data
|
| 11 |
+
volumes:
|
| 12 |
+
- ./config.yaml:/app/config.yaml:ro
|
| 13 |
+
- ./data:/app/data
|
| 14 |
+
- ./results:/app/results
|
| 15 |
+
- ./documents:/documents
|
| 16 |
+
restart: unless-stopped
|
evaluation/__pycache__/dataset_loader.cpython-313.pyc.2070577919488
ADDED
|
Binary file (5.68 kB). View file
|
|
|
evaluation/__pycache__/evaluator.cpython-313.pyc.2070577919488
ADDED
|
Binary file (8.8 kB). View file
|
|
|
evaluation/__pycache__/indexer_bridge.cpython-313.pyc.2070577919488
ADDED
|
Binary file (4.73 kB). View file
|
|
|
evaluation/__pycache__/query_runner.cpython-313.pyc.2070577919488
ADDED
|
Binary file (5.62 kB). View file
|
|
|
evaluation/__pycache__/run_eval.cpython-313.pyc.2070577919488
ADDED
|
Binary file (8.4 kB). View file
|
|
|
evaluation/dataset_loader.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# evaluation/dataset_loader.py
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import csv
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class DatasetLoader:
    """
    Loads BEIR-format datasets (SciFact, NFCorpus, etc.)

    BEIR format:
        corpus.jsonl  — {_id, title, text}
        queries.jsonl — {_id, text}
        qrels/*.tsv   — query_id, doc_id, relevance_score

    Relevance scales:
        binary — 0 or 1
        graded — integer grades; any judgment with score >= 1 is kept
    """

    def __init__(self, dataset_path: str):
        """
        Resolve the corpus, queries and qrels paths inside `dataset_path`.

        Args:
            dataset_path — directory holding corpus.jsonl, queries.jsonl
                           and a qrels/ sub-directory

        Raises:
            FileNotFoundError — neither qrels/test.tsv nor qrels/dev.tsv exists
        """
        self.dataset_path = dataset_path
        self.corpus_path = os.path.join(dataset_path, "corpus.jsonl")
        self.queries_path = os.path.join(dataset_path, "queries.jsonl")

        # qrels path — prefer the test split, fall back to dev.tsv for
        # datasets that do not provide a test.tsv
        test_path = os.path.join(dataset_path, "qrels", "test.tsv")
        dev_path = os.path.join(dataset_path, "qrels", "dev.tsv")

        if os.path.exists(test_path):
            self.qrels_path = test_path
        elif os.path.exists(dev_path):
            self.qrels_path = dev_path
            print("[INFO] test.tsv not found, using dev.tsv for qrels")
        else:
            raise FileNotFoundError(
                f"No qrels file found in {os.path.join(dataset_path, 'qrels')} — "
                f"expected test.tsv or dev.tsv"
            )

    @staticmethod
    def _iter_jsonl(path: str):
        """Yield one decoded JSON object per non-blank line of `path`."""
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # blank lines (e.g. a trailing newline at end of file) are
                # not JSON documents; json.loads("") would raise on them
                if line:
                    yield json.loads(line)

    def load_corpus(self) -> dict:
        """
        Load all documents from corpus.jsonl.

        Returns:
            dict — {doc_id: {"title": str, "text": str}}
                   missing "title"/"text" fields default to ""
        """
        corpus = {
            str(doc["_id"]): {
                "title": doc.get("title", ""),
                "text": doc.get("text", ""),
            }
            for doc in self._iter_jsonl(self.corpus_path)
        }
        print(f"Loaded {len(corpus)} documents from corpus")
        return corpus

    def load_queries(self) -> dict:
        """
        Load queries from queries.jsonl.

        Returns:
            dict — {query_id: query_text}
        """
        queries = {
            str(q["_id"]): q["text"]
            for q in self._iter_jsonl(self.queries_path)
        }
        print(f"Loaded {len(queries)} queries")
        return queries

    def load_qrels(self) -> dict:
        """
        Load relevance judgments from the qrels file chosen in __init__.

        Handles both binary (0/1) and graded relevance: only judgments
        with score >= 1 are kept. Rows with fewer than three columns are
        ignored. A header row (query-id, corpus-id, score) is skipped when
        present; a file without a header keeps its first judgment.

        Returns:
            dict — {query_id: {doc_id: relevance_score}}

        Raises:
            ValueError — a non-header row has a non-integer score
        """
        qrels = {}

        # newline="" is what the csv module expects for files it parses
        with open(self.qrels_path, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f, delimiter="\t")

            for row_index, row in enumerate(reader):
                if len(row) < 3:
                    continue

                try:
                    score = int(row[2])
                except ValueError:
                    if row_index == 0:
                        # header: query-id  corpus-id  score
                        continue
                    raise

                # skip completely irrelevant docs
                # this handles both binary (0/1) and graded scales
                if score < 1:
                    continue

                qrels.setdefault(str(row[0]), {})[str(row[1])] = score

        print(f"Loaded qrels for {len(qrels)} queries "
              f"from {os.path.basename(self.qrels_path)}")
        return qrels
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
if __name__ == "__main__":
    import sys

    # Manual smoke test: load a dataset and print one sample query.
    # pass dataset path as argument or default to scifact
    # usage: python -m evaluation.dataset_loader data/nfcorpus
    path = sys.argv[1] if len(sys.argv) > 1 else "data/scifact"
    loader = DatasetLoader(path)

    corpus = loader.load_corpus()
    queries = loader.load_queries()
    qrels = loader.load_qrels()

    # show a sample — the first query in file order, if there is one
    # (an empty queries.jsonl must not crash the smoke test)
    sample_qid = next(iter(queries), None)
    if sample_qid is None:
        print("\nNo queries loaded — nothing to sample")
    else:
        print(f"\nSample query [{sample_qid}]: {queries[sample_qid]}")
        print(f"Relevant docs : {qrels.get(sample_qid, {})}")
|
evaluation/evaluator.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# evaluation/evaluator.py
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
from collections import defaultdict
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Evaluator:
    """
    Computes standard IR evaluation metrics by comparing your
    system's ranked results against the ground-truth qrels.

    Every metric takes:
        ranked   — [(doc_id, score), ...] in rank order (best first)
        relevant — {doc_id: relevance}; any relevance > 0 counts as relevant

    A doc_id that appears more than once in `ranked` is only considered
    at its first (best) position, so no metric can exceed 1.0.

    Metrics implemented:
        NDCG@k   — Normalized Discounted Cumulative Gain
                   Measures ranking quality; rewards relevant docs appearing early
                   Handles graded and binary relevance
        MAP@k    — Mean Average Precision
                   Average of precision computed at each relevant doc position
        Recall@k — Fraction of relevant docs found in top-k
        P@k      — Precision at k (fraction of top-k that are relevant)
        MRR      — Mean Reciprocal Rank (position of first relevant result)
    """

    @staticmethod
    def _top_doc_ids(ranked: list, k: int = None) -> list:
        """
        Return the first `k` distinct doc ids of `ranked`, in rank order.

        k=None means no cut-off; k <= 0 yields an empty list (a negative
        k must not be interpreted as a slice from the end).
        """
        if k is not None and k <= 0:
            return []

        seen = set()
        top = []
        for doc_id, _ in ranked:
            if doc_id in seen:
                continue
            seen.add(doc_id)
            top.append(doc_id)
            if k is not None and len(top) == k:
                break
        return top

    def ndcg_at_k(self, ranked: list, relevant: dict, k: int) -> float:
        """
        NDCG@k — the most important metric for ranked retrieval.
        Score of 1.0 = perfect ranking, 0.0 = no relevant docs found.

        Works for both binary and graded relevance: the gain of a
        document is its relevance score itself.
        """
        if k <= 0:
            return 0.0

        dcg = 0.0
        for i, doc_id in enumerate(self._top_doc_ids(ranked, k)):
            rel = relevant.get(doc_id, 0)
            if rel > 0:
                dcg += rel / math.log2(i + 2)  # i+2 because log2(1) = 0

        # Ideal DCG — best possible ranking given the relevant docs
        ideal_rels = sorted(relevant.values(), reverse=True)[:k]
        idcg = sum(
            rel / math.log2(i + 2)
            for i, rel in enumerate(ideal_rels)
            if rel > 0
        )

        return dcg / idcg if idcg > 0 else 0.0

    def map_at_k(self, ranked: list, relevant: dict, k: int) -> float:
        """
        MAP@k — average precision across all relevant document positions.

        For graded relevance, any score >= 1 counts as relevant.
        The sum is normalized by the total number of relevant docs.
        """
        num_relevant = 0
        sum_precision = 0.0

        for i, doc_id in enumerate(self._top_doc_ids(ranked, k)):
            if relevant.get(doc_id, 0) > 0:
                num_relevant += 1
                sum_precision += num_relevant / (i + 1)

        total_relevant = sum(1 for v in relevant.values() if v > 0)
        if total_relevant == 0:
            return 0.0
        return sum_precision / total_relevant

    def recall_at_k(self, ranked: list, relevant: dict, k: int) -> float:
        """
        Recall@k — what fraction of all relevant docs appear in top-k.

        For graded relevance, any score >= 1 counts as relevant.
        """
        total_relevant = sum(1 for v in relevant.values() if v > 0)
        if total_relevant == 0:
            return 0.0
        found = sum(
            1 for doc_id in self._top_doc_ids(ranked, k)
            if relevant.get(doc_id, 0) > 0
        )
        return found / total_relevant

    def precision_at_k(self, ranked: list, relevant: dict, k: int) -> float:
        """
        P@k — fraction of the top-k results that are relevant.

        For graded relevance, any score >= 1 counts as relevant.
        Returns 0.0 for k <= 0. The denominator is always k, even when
        fewer than k results were returned.
        """
        if k <= 0:
            return 0.0
        hits = sum(
            1 for doc_id in self._top_doc_ids(ranked, k)
            if relevant.get(doc_id, 0) > 0
        )
        return hits / k

    def mrr(self, ranked: list, relevant: dict) -> float:
        """
        MRR — reciprocal of the rank of the first relevant result.
        Score of 1.0 = first result is relevant, 0.0 = none found.

        For graded relevance, any score >= 1 counts as relevant.
        """
        for i, doc_id in enumerate(self._top_doc_ids(ranked)):
            if relevant.get(doc_id, 0) > 0:
                return 1.0 / (i + 1)
        return 0.0

    def evaluate(
        self,
        all_results: dict,
        qrels: dict,
        k_values: list = None,
    ) -> dict:
        """
        Compute all metrics across all queries and average them.

        Only queries present in `all_results` are evaluated; those without
        any entry in `qrels` are counted and skipped.

        Args:
            all_results — {query_id: [(doc_id, score), ...]}
            qrels       — {query_id: {doc_id: relevance}}
            k_values    — list of k values e.g. [1, 5, 10, 100]
                          (that list is the default when omitted)

        Returns:
            dict — per-metric means rounded to 4 decimals plus counters, e.g. {
                "NDCG@10": 0.42,
                "MAP@100": 0.38,
                "Recall@100": 0.71,
                "P@10": 0.15,
                "MRR": 0.55,
                "num_queries": 300,
                "queries_with_results": 298,
                "queries_with_no_qrels": 2,
            }
        """
        if k_values is None:
            k_values = [1, 5, 10, 100]

        scores = defaultdict(list)
        num_queries = 0
        queries_with_results = 0
        queries_no_qrels = 0

        for query_id, ranked in all_results.items():
            relevant = qrels.get(query_id, {})

            # skip queries that have no ground truth at all
            if not relevant:
                queries_no_qrels += 1
                continue

            num_queries += 1
            if ranked:
                queries_with_results += 1

            for k in k_values:
                scores[f"NDCG@{k}"].append(self.ndcg_at_k(ranked, relevant, k))
                scores[f"MAP@{k}"].append(self.map_at_k(ranked, relevant, k))
                scores[f"Recall@{k}"].append(self.recall_at_k(ranked, relevant, k))
                scores[f"P@{k}"].append(self.precision_at_k(ranked, relevant, k))

            scores["MRR"].append(self.mrr(ranked, relevant))

        # Print diagnostic so you can see if queries matched correctly
        print(f" Evaluated {num_queries} queries | "
              f"{queries_with_results} had results | "
              f"{queries_no_qrels} had no qrels (skipped)")

        # Average across all queries
        summary = {
            metric: round(sum(vals) / len(vals), 4) if vals else 0.0
            for metric, vals in scores.items()
        }
        summary["num_queries"] = num_queries
        summary["queries_with_results"] = queries_with_results
        summary["queries_with_no_qrels"] = queries_no_qrels

        return summary
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
if __name__ == "__main__":
    # Toy-data smoke test: run the full evaluation on two hand-made queries.

    # Ground truth — q1 has one relevant doc, q2 has two
    toy_qrels = {
        "q1": {"doc_1": 1},
        "q2": {"doc_4": 1, "doc_5": 1},
    }

    # Ranked output of an imaginary system, best hit first:
    # q1 ranks its relevant doc_1 on top; q2 finds doc_4 but never doc_5
    toy_results = {
        "q1": [("doc_1", 0.95), ("doc_2", 0.80), ("doc_3", 0.60)],
        "q2": [("doc_4", 0.70), ("doc_1", 0.50)],
    }

    report = Evaluator().evaluate(toy_results, toy_qrels, k_values=[1, 5, 10])

    print("\nSanity check metrics:")
    for metric_name in report:
        print(f" {metric_name}: {report[metric_name]}")
|
evaluation/indexer_bridge.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# evaluation/indexer_bridge.py
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
from indexer.chunker import Chunker
|
| 5 |
+
from indexer.embedder import Embedder
|
| 6 |
+
from indexer.store import Store
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class IndexerBridge:
    """
    Feeds a BEIR-style corpus straight into the existing indexing pipeline.

    The corpus documents are NOT real files on disk — they come from JSONL.
    So the Crawler/Extractor are bypassed and text is injected directly into
    Chunker → Embedder → Store.

    Each document gets a fake filepath: "{dataset_name}://{doc_id}"
    This lets the Store treat them like any other indexed file,
    and the doc_id can later be recovered from search results.
    """

    def __init__(self, config_path: str = "config.yaml"):
        """
        Build the pipeline stages used for indexing.

        Args:
            config_path — path to the YAML config handed to Embedder and Store
        """
        self.chunker = Chunker(chunk_size=500, overlap=50)
        self.embedder = Embedder(config_path)
        self.store = Store(config_path)

    def _flush(self, chunk_buffer: list, text_buffer: list) -> None:
        """Embed the buffered chunk texts and add chunks + vectors to the store."""
        if not chunk_buffer:
            return
        embeddings = np.array(self.embedder.embed_chunks(text_buffer), dtype="float32")
        self.store.add_chunks(chunk_buffer, embeddings)

    def index_corpus(self, corpus: dict, batch_size: int = 64, dataset_name: str = "dataset"):
        """
        Index the entire corpus through Chunker → Embedder → Store.

        Entries previously indexed under the same dataset prefix are removed
        first, so re-running replaces rather than duplicates them.

        Args:
            corpus       — {doc_id: {"title": str, "text": str}};
                           a missing or None title/text is treated as ""
            batch_size   — number of buffered chunks that triggers an embed + store
                           (checked once per document, so a batch may be larger)
            dataset_name — used as prefix for fake file paths e.g. "scifact", "nfcorpus"
        """
        total = len(corpus)
        print(f"Indexing {total} documents from [{dataset_name}]...")

        # Clear previous entries for THIS dataset only
        prefix = f"{dataset_name}://"
        stale = [fp for fp in self.store.load_hashes() if fp.startswith(prefix)]
        for fp in stale:
            self.store.remove_file_chunks(fp)
        if stale:
            print(f"Cleared {len(stale)} previously indexed [{dataset_name}] documents")

        chunk_buffer = []
        text_buffer = []

        for i, (doc_id, doc) in enumerate(corpus.items(), 1):
            # .get() + "or" keeps a missing/None field from raising KeyError
            # or leaking the literal text "None" into the index.
            title = doc.get("title") or ""
            body = doc.get("text") or ""
            full_text = f"{title} {body}".strip()
            if not full_text:
                continue

            fake_path = f"{prefix}{doc_id}"
            chunks = self.chunker.chunk_file(full_text, fake_path)

            for chunk in chunks:
                chunk_buffer.append(chunk)
                text_buffer.append(chunk["text"])

            self.store.save_file_info(fake_path, doc_id, len(chunks))

            if len(chunk_buffer) >= batch_size:
                self._flush(chunk_buffer, text_buffer)
                chunk_buffer.clear()
                text_buffer.clear()

            if i % 500 == 0:
                print(f" Indexed {i}/{total}...")

        # flush any remaining chunks
        self._flush(chunk_buffer, text_buffer)
        print(f"Done. Total vectors: {self.store.get_total_vectors()}")
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
if __name__ == "__main__":
    # Manual run: index the SciFact corpus found under data/scifact.
    from evaluation.dataset_loader import DatasetLoader

    scifact_corpus = DatasetLoader("data/scifact").load_corpus()
    IndexerBridge().index_corpus(scifact_corpus, batch_size=64, dataset_name="scifact")
|
evaluation/query_runner.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# evaluation/query_runner.py
|
| 2 |
+
|
| 3 |
+
from searcher.search_engine import SearchEngine
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class QueryRunner:
    """
    Runs all evaluation queries through the SearchEngine and collects
    the ranked result lists for scoring.

    The results are formatted as:
        {query_id: [(doc_id, score), ...]}  ranked best-first
    """

    # Pipeline variants accepted by run().
    VALID_MODES = ("dense", "sparse", "hybrid", "full")

    def __init__(self, config_path: str = "config.yaml"):
        """
        Args:
            config_path — path to the YAML config handed to SearchEngine
        """
        self.engine = SearchEngine(config_path)

    def _extract_doc_id(self, filepath: str) -> str:
        """
        Strip dataset prefix from fake filepath so it matches qrels doc_ids.

        Examples:
            "scifact://12345"   → "12345"
            "nfcorpus://MED-10" → "MED-10"
            "/real/file.pdf"    → "/real/file.pdf"  (real files unchanged)

        This is critical — without stripping, doc_ids like "nfcorpus://MED-10"
        will never match qrels keys like "MED-10" and all scores will be 0.0
        """
        if "://" in filepath:
            return filepath.split("://", 1)[1]
        return filepath

    def _retrieve(self, query_text: str, top_k: int, mode: str) -> list:
        """Run one query in the given mode and return [(doc_id, score), ...]."""
        engine = self.engine

        if mode == "dense":
            hits = engine.dense_retriever.retrieve(query_text, top_k=top_k)
            # The score is negated so that sorting descending puts the smallest
            # raw dense_score first — presumably it is a distance; verify
            # against the dense retriever.
            return [(self._extract_doc_id(h["filepath"]), -h["dense_score"]) for h in hits]

        if mode == "sparse":
            hits = engine.sparse_retriever.retrieve(query_text, top_k=top_k)
            return [(self._extract_doc_id(h["filepath"]), h["sparse_score"]) for h in hits]

        if mode == "hybrid":
            dense_hits = engine.dense_retriever.retrieve(query_text, top_k=top_k)
            sparse_hits = engine.sparse_retriever.retrieve(query_text, top_k=top_k)
            fused = engine.fusion_ranker.fuse(dense_hits, sparse_hits, top_k=top_k)
            return [(self._extract_doc_id(h["filepath"]), h["rrf_score"]) for h in fused]

        # full pipeline
        output = engine.search(query_text, top_k=top_k)
        return [
            (
                self._extract_doc_id(h["filepath"]),
                h.get("rerank_score", h.get("rrf_score", 0)),
            )
            for h in output["results"]
        ]

    @staticmethod
    def _dedupe(ranked: list) -> list:
        """Keep the best score per doc_id and return the pairs sorted best-first."""
        best = {}
        for doc_id, score in ranked:
            if doc_id not in best or score > best[doc_id]:
                best[doc_id] = score
        return sorted(best.items(), key=lambda pair: pair[1], reverse=True)

    def run(
        self,
        queries: dict,
        top_k: int = 100,
        mode: str = "full",
    ) -> dict:
        """
        Run all queries and return ranked results.

        Args:
            queries — {query_id: query_text}
            top_k   — number of results per query (use 100 for eval)
            mode    — pipeline variant to test:
                        "dense"  → dense retrieval only
                        "sparse" → BM25 only
                        "hybrid" → dense + BM25 + RRF (no reranker)
                        "full"   → complete pipeline with reranker

        Returns:
            dict — {query_id: [(doc_id, rank_score), ...]}
                   A query whose retrieval raises is reported on stdout
                   and mapped to an empty list.

        Raises:
            ValueError — if mode is not one of VALID_MODES. Previously any
                         unknown value silently ran the full pipeline.
        """
        if mode not in self.VALID_MODES:
            raise ValueError(
                f"Unknown mode {mode!r}; expected one of {', '.join(self.VALID_MODES)}"
            )

        results = {}
        total = len(queries)

        for i, (query_id, query_text) in enumerate(queries.items(), 1):
            if i % 50 == 0:
                print(f" Running query {i}/{total}...")

            try:
                # multiple chunks from same doc → keep only the best score
                results[query_id] = self._dedupe(self._retrieve(query_text, top_k, mode))
            except Exception as e:
                # Best-effort: one failing query must not abort the whole run.
                print(f" Error on query {query_id}: {e}")
                results[query_id] = []

        return results
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
if __name__ == "__main__":
    # Manual run: push the SciFact queries through the full pipeline and
    # show the top five results of the first query.
    from evaluation.dataset_loader import DatasetLoader

    scifact_queries = DatasetLoader("data/scifact").load_queries()
    ranked_by_query = QueryRunner().run(scifact_queries, top_k=10, mode="full")

    first_qid = list(ranked_by_query)[0]
    print(f"\nQuery {first_qid} top results:")
    for doc_id, score in ranked_by_query[first_qid][:5]:
        print(f" doc {doc_id} score={score:.4f}")
|
evaluation/run_eval.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# evaluation/run_eval.py
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import time
|
| 7 |
+
from evaluation.dataset_loader import DatasetLoader
|
| 8 |
+
from evaluation.indexer_bridge import IndexerBridge
|
| 9 |
+
from evaluation.query_runner import QueryRunner
|
| 10 |
+
from evaluation.evaluator import Evaluator
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Retrieval pipeline variants, in the order they are run when --mode is "all".
MODES = [
    "dense",
    "sparse",
    "hybrid",
    "full",
]

# Metric columns shown in the per-dataset results table.
DISPLAY_METRICS = [
    "NDCG@10",
    "MAP@100",
    "Recall@100",
    "P@10",
    "MRR",
]

# Supported datasets mapped to their data folders — register new ones here.
AVAILABLE_DATASETS = dict(
    scifact="data/scifact",
    nfcorpus="data/nfcorpus",
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def print_table(results: dict, title: str = "", metrics=None):
    """
    Print one row of metric values per pipeline mode as a fixed-width table.

    Args:
        results — {mode: {metric_name: value}}
        title   — optional heading printed above the table
        metrics — metric names to show as columns, in order;
                  defaults to DISPLAY_METRICS when omitted

    A metric missing from a mode's dict is shown as 0.0000.
    """
    columns = DISPLAY_METRICS if metrics is None else list(metrics)
    col_w = 14
    header = f"{'Mode':<10}" + "".join(f"{name:>{col_w}}" for name in columns)
    rule = "=" * len(header)

    if title:
        print(f"\n {title}")
    print(rule)
    print(header)
    print("-" * len(header))
    for mode, mode_metrics in results.items():
        cells = "".join(
            f"{mode_metrics.get(name, 0.0):>{col_w}.4f}" for name in columns
        )
        print(f"{mode:<10}{cells}")
    print(rule)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def print_comparison_table(all_dataset_results: dict):
    """
    Print a single comparison table across all datasets.

    Shows NDCG@10, MRR and MAP@100 for each dataset. Each row uses the
    dataset's "full" mode metrics; when "full" was not run, the first mode
    present is used instead. A dataset with no mode results, or a missing
    metric, is shown as 0.0000.

    Args:
        all_dataset_results — {dataset_name: {mode: {metric_name: value}}}
    """
    print("\n" + "=" * 80)
    print("CROSS-DATASET COMPARISON — full pipeline mode")
    print("=" * 80)

    # Header
    print(f"{'Dataset':<14}{'NDCG@10':>12}{'MRR':>10}{'MAP@100':>10}")
    print("-" * 46)

    for dataset, mode_results in all_dataset_results.items():
        metrics = mode_results.get("full")
        if metrics is None:
            # Resolve the fallback lazily: the old eager list(...)[0] default
            # raised IndexError whenever a dataset had no mode results.
            metrics = next(iter(mode_results.values()), {})
        ndcg = metrics.get("NDCG@10", 0.0)
        mrr = metrics.get("MRR", 0.0)
        map_ = metrics.get("MAP@100", 0.0)
        print(f"{dataset:<14}{ndcg:>12.4f}{mrr:>10.4f}{map_:>10.4f}")

    print("=" * 46)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def run_single_dataset(dataset_name: str, dataset_path: str, args) -> dict:
    """
    Run full eval pipeline for one dataset. Returns mode→metrics dict.

    Steps: load corpus/queries/qrels, index the corpus (unless
    args.skip_index is set), run every requested mode through the
    QueryRunner, score each run with the Evaluator, and print a table.

    Args:
        dataset_name — dataset key; also passed to the indexer as the fake-path prefix
        dataset_path — folder passed to DatasetLoader
        args         — parsed CLI namespace; reads config, skip_index, mode and top_k

    Returns:
        dict — {mode: metrics}; each metrics dict also carries "query_time_s",
               the seconds spent running that mode's queries
    """
    banner = "#" * 60
    print(f"\n{banner}")
    print(f" DATASET: {dataset_name.upper()}")
    print(f"{banner}")

    # 1 — load
    print("\n[1/4] Loading dataset...")
    loader = DatasetLoader(dataset_path)
    corpus = loader.load_corpus()
    queries = loader.load_queries()
    qrels = loader.load_qrels()

    # 2 — index
    if args.skip_index:
        print("\n[2/4] Skipping indexing (--skip-index)")
    else:
        print("\n[2/4] Indexing corpus...")
        bridge = IndexerBridge(args.config)
        # pass dataset_name so fake paths are e.g. nfcorpus://doc_id
        bridge.index_corpus(corpus, batch_size=64, dataset_name=dataset_name)

    # 3 — run queries
    print("\n[3/4] Running queries...")
    runner = QueryRunner(args.config)
    evaluator = Evaluator()

    modes_to_run = MODES if args.mode == "all" else [args.mode]
    all_mode_results = {}

    for mode in modes_to_run:
        print(f"\n Mode: {mode}")
        # perf_counter is monotonic, so the measured interval cannot be
        # skewed by a system clock adjustment the way time.time() can.
        started = time.perf_counter()
        ranked_results = runner.run(queries, top_k=args.top_k, mode=mode)
        elapsed = time.perf_counter() - started

        metrics = evaluator.evaluate(ranked_results, qrels, k_values=[1, 5, 10, 100])
        metrics["query_time_s"] = round(elapsed, 2)
        all_mode_results[mode] = metrics

        print(f" NDCG@10={metrics.get('NDCG@10', 0):.4f} "
              f"MAP@100={metrics.get('MAP@100', 0):.4f} "
              f"MRR={metrics.get('MRR', 0):.4f}")

    # 4 — per-dataset table
    print(f"\n[4/4] Results for {dataset_name.upper()}")
    print_table(all_mode_results, title=f"EVALUATION RESULTS — {dataset_name} (pytrec_eval)")

    return all_mode_results
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def main():
    """
    Command-line entry point: evaluate each requested dataset and save reports.

    Writes one JSON report per dataset to results/eval_<dataset>.json and a
    combined report to results/eval_all.json. Datasets whose folder is
    missing are skipped with a warning. A cross-dataset comparison table is
    printed when more than one dataset was evaluated.
    """
    parser = argparse.ArgumentParser(description="Evaluate semantic search on BEIR datasets")
    parser.add_argument(
        "--datasets",
        nargs="+",
        default=["scifact", "nfcorpus"],
        choices=list(AVAILABLE_DATASETS.keys()),
        help="Which datasets to evaluate. e.g. --datasets scifact nfcorpus"
    )
    parser.add_argument("--config", default="config.yaml")
    parser.add_argument("--top-k", default=100, type=int)
    parser.add_argument("--skip-index", action="store_true")
    # choices makes argparse reject a mistyped mode up front instead of
    # passing an unknown value through to the query runner.
    parser.add_argument("--mode", default="all",
                        choices=[*MODES, "all"],
                        help="dense | sparse | hybrid | full | all")
    args = parser.parse_args()

    os.makedirs("results", exist_ok=True)

    all_dataset_results = {}

    for dataset_name in args.datasets:
        dataset_path = AVAILABLE_DATASETS[dataset_name]

        if not os.path.exists(dataset_path):
            print(f"\n[WARNING] Dataset folder not found: {dataset_path} — skipping {dataset_name}")
            continue

        results = run_single_dataset(dataset_name, dataset_path, args)
        all_dataset_results[dataset_name] = results

        # save per-dataset report
        report_path = f"results/eval_{dataset_name}.json"
        with open(report_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
        print(f" Saved → {report_path}")

    # cross-dataset comparison (only if more than one dataset ran)
    if len(all_dataset_results) > 1:
        print_comparison_table(all_dataset_results)

    # save combined report
    combined_path = "results/eval_all.json"
    with open(combined_path, "w", encoding="utf-8") as f:
        json.dump(all_dataset_results, f, indent=2)
    print(f"\nCombined report saved → {combined_path}")


if __name__ == "__main__":
    main()
|
indexer/__pycache__/chunker.cpython-313.pyc.2070577919488
ADDED
|
Binary file (5.35 kB). View file
|
|
|
indexer/__pycache__/crawler.cpython-313.pyc.2070577919488
ADDED
|
Binary file (4.8 kB). View file
|
|
|
indexer/__pycache__/embedder.cpython-313.pyc.2070577919488
ADDED
|
Binary file (4.39 kB). View file
|
|
|
indexer/__pycache__/extractor.cpython-313.pyc.2070577919488
ADDED
|
Binary file (5.84 kB). View file
|
|
|
indexer/__pycache__/pipeline.cpython-313.pyc.2070577919488
ADDED
|
Binary file (6.86 kB). View file
|
|
|
indexer/__pycache__/store.cpython-313.pyc.2070577919488
ADDED
|
Binary file (11.1 kB). View file
|
|
|
indexer/__pycache__/watcher.cpython-313.pyc.2070577919488
ADDED
|
Binary file (8.89 kB). View file
|
|
|
indexer/chunker.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# indexer/chunker.py
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class Chunker:
    """
    Splits extracted text into overlapping chunks using a sliding window.
    Each chunk will later be embedded as a separate vector.

    Why chunk at all?
    - Embedding models have a token limit (typically 256-512 tokens)
    - A 50-page PDF as one embedding would lose detail
    - Small chunks let us pinpoint the EXACT passage that matches a query

    Why overlap?
    - A sentence at the boundary might get cut in half
    - Overlap lets text near a boundary appear in both neighbouring chunks
    """

    def __init__(self, chunk_size=500, overlap=50):
        """
        Args:
            chunk_size (int) — max number of words per chunk
            overlap (int)    — number of words shared between consecutive chunks

        Raises:
            ValueError — if overlap is negative, or not smaller than chunk_size
                         (the window would then never advance forward)
        """
        if overlap >= chunk_size:
            raise ValueError("Overlap must be smaller than chunk_size")
        if overlap < 0:
            raise ValueError("Overlap must not be negative")
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text):
        """
        Split a text string into overlapping chunks based on word count.

        Args:
            text (str) — the full extracted text from a file

        Returns:
            list[str] — list of text chunks; empty when text has no words

        Example with chunk_size=5, overlap=2:
            text  = "The quick brown fox jumps over the lazy dog today"

            Chunk 0: words[0:5]  → "The quick brown fox jumps"
            Chunk 1: words[3:8]  → "fox jumps over the lazy"   (step = 5-2 = 3)
            Chunk 2: words[6:11] → "the lazy dog today"        (step = 3 again)
        """
        words = text.split()
        if not words:
            return []

        step = self.chunk_size - self.overlap
        chunks = []
        for start in range(0, len(words), step):
            end = start + self.chunk_size
            chunks.append(" ".join(words[start:end]))
            if end >= len(words):
                # This window already reaches the final word; another window
                # would only repeat words from the tail of this chunk.
                break
        return chunks

    def chunk_file(self, text, filepath):
        """
        Chunk a file's text and attach metadata to each chunk.

        Args:
            text (str)     — extracted text content
            filepath (str) — source file path (for metadata)

        Returns:
            list[dict] — each dict contains:
                {
                    "text": "the chunk text...",
                    "filepath": "/path/to/file.pdf",
                    "chunk_index": 0,       # position in the file, from 0
                    "total_chunks": 5       # how many chunks this file produced
                }
        """
        chunks = self.chunk_text(text)
        total = len(chunks)
        return [
            {
                "text": chunk,
                "filepath": filepath,
                "chunk_index": index,
                "total_chunks": total,
            }
            for index, chunk in enumerate(chunks)
        ]
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# --- Test it ---
|
| 118 |
+
if __name__ == "__main__":
    # Manual demo: chunk a short sample and print the chunks, first as plain
    # strings and then with their metadata.
    demo_text = (
        "The quick brown fox jumps over the lazy dog. "
        "Semantic search finds files by meaning not just keywords. "
        "This is a test of the chunking system for our project."
    )
    demo = Chunker(chunk_size=10, overlap=3)

    pieces = demo.chunk_text(demo_text)
    print(f"Text has {len(demo_text.split())} words → {len(pieces)} chunks\n")
    for idx, piece in enumerate(pieces):
        print(f"Chunk {idx}: {piece}")

    print("\n--- With metadata ---")
    for entry in demo.chunk_file(demo_text, "/test/sample.txt"):
        print(f"[{entry['chunk_index']}] {entry['text'][:60]}...")
|
indexer/crawler.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# indexer/crawler.py
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import hashlib
|
| 5 |
+
import yaml
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Crawler:
    """
    Discovers files in configured directories and tracks which ones
    are new, modified or deleted using SHA-256 hashing.
    """

    def __init__(self, config_path="config.yaml"):
        """
        Load the config file and store the settings as instance variables.

        Args:
            config_path (str) — path to a YAML file providing watch_paths,
                                include_extensions, skip_directories and data_dir
        """
        # Read as UTF-8 explicitly rather than with the platform's locale
        # encoding, so non-ASCII paths in the config load the same everywhere.
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        self.watch_paths = config["watch_paths"]
        self.include_extensions = config["include_extensions"]
        self.skip_directories = config["skip_directories"]
        self.data_dir = config["data_dir"]

    def discover_files(self):
        """
        Walk through all watch_paths recursively and collect every file
        that matches include_extensions, skipping skip_directories.

        The extension comparison is exact (case-sensitive).

        Returns:
            list[str] — file paths, each rooted at its watch path as configured
        """
        results = []
        for path in self.watch_paths:
            for dirpath, dirnames, filenames in os.walk(path):
                # Prune in place so os.walk never descends into skipped dirs.
                dirnames[:] = [d for d in dirnames if d not in self.skip_directories]
                for filename in filenames:
                    if os.path.splitext(filename)[1] in self.include_extensions:
                        results.append(os.path.join(dirpath, filename))
        return results

    def compute_hash(self, filepath):
        """
        Compute the SHA-256 hash of a file's contents.

        Args:
            filepath (str) — path to the file

        Returns:
            str — hex string of the SHA-256 hash
        """
        hasher = hashlib.sha256()
        with open(filepath, "rb") as f:
            # Read in 8 KiB blocks so large files are never held in memory whole.
            while chunk := f.read(8192):
                hasher.update(chunk)
        return hasher.hexdigest()

    def get_new_and_modified(self, known_hashes=None):
        """
        Compare discovered files against previously known hashes to find
        which files are new or have been modified since last run.

        Args:
            known_hashes (dict) — {filepath: hash} from previous run
                                  Pass None or {} on first run.

        Returns:
            tuple: (files_to_process, current_hashes, deleted_files)
                - files_to_process: list[str] — paths that are new or changed
                - current_hashes:   dict      — {filepath: hash} for ALL current files
                - deleted_files:    set[str]  — known paths that no longer exist
        """
        if known_hashes is None:
            known_hashes = {}

        files_to_process = []
        current_hashes = {}
        for file in self.discover_files():
            file_hash = self.compute_hash(file)
            # .get() yields None for unseen files, which never equals a hash.
            if known_hashes.get(file) != file_hash:
                files_to_process.append(file)
            current_hashes[file] = file_hash

        deleted_files = set(known_hashes.keys()) - set(current_hashes.keys())

        return files_to_process, current_hashes, deleted_files
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# --- Test it ---
|
| 93 |
+
if __name__ == "__main__":
    # Manual check: list the discovered files, then count how many would be
    # (re)processed on a first run with no known hashes.
    crawler = Crawler()
    files = crawler.discover_files()
    print(f"Found {len(files)} files:")
    for f in files:
        print(f" {f}")

    print("\n--- Checking for new/modified ---")
    # get_new_and_modified() returns three values; unpacking only two
    # raised "ValueError: too many values to unpack".
    to_process, hashes, deleted = crawler.get_new_and_modified()
    print(f"{len(to_process)} files to process")
|
indexer/embedder.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# indexer/embedder.py
|
| 2 |
+
|
| 3 |
+
import yaml
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Embedder:
    """
    Loads a sentence-transformer model and converts text chunks
    into dense vector embeddings.

    Model upgrade: all-MiniLM-L6-v2 → BAAI/bge-small-en-v1.5

    Why BGE over MiniLM (figures as quoted by the original author):
    - MiniLM   : general purpose, fast, 384-dim, NDCG ~0.65 on SciFact
    - BGE-small: retrieval-specific training, 384-dim, NDCG ~0.72 on SciFact
    - Same dimension (384), same API — only the model name changes
    - BGE uses a special instruction prefix for queries (not for documents)
      "Represent this sentence for searching relevant passages: {query}"
      This is handled automatically in embed_single()
    """

    # BGE query instruction prefix — improves retrieval accuracy
    # Applied to queries only, NOT to document chunks during indexing
    BGE_QUERY_PREFIX = "Represent this sentence for searching relevant passages: "

    def __init__(self, config_path="config.yaml"):
        """
        Load the config and initialize the embedding model.

        Args:
            config_path (str) — path to config.yaml; must define "embedding_model"
        """
        # Read as UTF-8 explicitly rather than with the platform's locale encoding.
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        model_name = config["embedding_model"]
        self.model_name = model_name

        # detect if we are using a BGE model
        # BGE models need a special prefix on queries (not on documents)
        self.is_bge = "bge" in model_name.lower()

        print(f"Loading embedding model '{model_name}'...")
        self.model = SentenceTransformer(model_name)
        print(f"Model loaded — BGE mode: {self.is_bge}")

    def embed_chunks(self, chunks):
        """
        Convert a list of text chunks into dense vector embeddings.
        Used during INDEXING — no query prefix applied here.

        Args:
            chunks (list[str]) — list of text strings to embed

        Returns:
            The model's encode() output for the whole list — one embedding
            per chunk, L2-normalized only when a BGE model is in use.
        """
        embeddings = self.model.encode(
            chunks,
            batch_size=64,
            show_progress_bar=False,
            normalize_embeddings=self.is_bge,  # BGE needs L2 normalization
        )
        return embeddings

    def embed_single(self, text):
        """
        Embed a single query string.
        Used during SEARCH — BGE prefix is applied here if using BGE model.

        Why prefix only on queries:
            BGE was trained with this asymmetric setup.
            Documents are indexed as-is.
            Queries get the instruction prefix so the model knows
            it is searching for relevant passages, not matching exact text.

        Args:
            text (str) — a single query string

        Returns:
            The model's encode() output for the query — one embedding vector.
        """
        if self.is_bge:
            text = self.BGE_QUERY_PREFIX + text

        return self.model.encode(
            text,
            # Must match embed_chunks(): normalizing queries but not the
            # indexed documents (the old behavior for non-BGE models) puts
            # the two on different scales and distorts similarity scores.
            normalize_embeddings=self.is_bge,
        )
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
if __name__ == "__main__":
    # Manual smoke test: embed a few sample chunks and one query.
    embedder = Embedder()

    sample_chunks = [
        "The quarterly budget report shows increased spending",
        "Machine learning models can understand text semantics",
        "The cat sat on the mat and looked out the window",
    ]

    print("Embedding 3 test chunks...")
    chunk_vectors = embedder.embed_chunks(sample_chunks)
    first_vector = chunk_vectors[0]
    print(f"Got {len(chunk_vectors)} vectors")
    print(f"Each vector has {len(first_vector)} dimensions")
    print(f"First vector (first 5 values): {first_vector[:5]}")

    print("\n--- Single query embedding ---")
    query_vector = embedder.embed_single("budget spending report")
    print(f"Query vector: {len(query_vector)} dimensions")
|
indexer/extractor.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# indexer/extractor.py
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
import fitz # PyMuPDF
|
| 6 |
+
from docx import Document
|
| 7 |
+
from pptx import Presentation
|
| 8 |
+
from openpyxl import load_workbook
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Extractor:
    """
    Extracts raw text content from different file types.
    Each file type has its own extraction method; extract() dispatches
    on the file extension.
    """

    def extract(self, filepath):
        """
        Main dispatcher — picks the extraction method based on file extension.

        Args:
            filepath (str) — path of the file to read

        Returns:
            str — the extracted text, or "" when the extension is not
                  recognized or the extraction raised any exception
                  (the error is printed, never propagated).
        """
        handlers = {
            ".pdf": self.extract_pdf,
            ".docx": self.extract_docx,
            ".pptx": self.extract_pptx,
            ".xlsx": self.extract_xlsx,
            ".ipynb": self.extract_ipynb,
            ".txt": self.extract_text,
            ".md": self.extract_text,
            ".py": self.extract_text,
            ".js": self.extract_text,
        }

        try:
            ext = os.path.splitext(filepath)[1].lower()
            handler = handlers.get(ext)
            if handler is None:
                print(f"Warning: Unrecognized file extension: {ext}")
                return ""
            return handler(filepath)
        except Exception as e:
            # Deliberate best-effort boundary: one unreadable file must not
            # abort an indexing run.
            print(f"Error extracting text from {filepath}: {e}")
            return ""

    def extract_pdf(self, filepath):
        """Extract text from a PDF file using PyMuPDF, one entry per page."""
        # The context manager closes the document even if get_text() raises
        # on a damaged page (the original only closed it on success).
        with fitz.open(filepath) as doc:
            pages = [page.get_text() for page in doc]
        return "\n".join(pages)

    def extract_docx(self, filepath):
        """Extract paragraph text from a Word document using python-docx."""
        doc = Document(filepath)
        return "\n".join(para.text for para in doc.paragraphs)

    def extract_pptx(self, filepath):
        """Extract text-frame paragraphs from a PowerPoint file using python-pptx."""
        prs = Presentation(filepath)
        lines = []
        for slide in prs.slides:
            for shape in slide.shapes:
                # Shapes without a text frame are skipped.
                if shape.has_text_frame:
                    lines.extend(para.text for para in shape.text_frame.paragraphs)
        return "\n".join(lines)

    def extract_xlsx(self, filepath):
        """
        Extract text from an Excel file using openpyxl.

        Each row becomes one line; non-empty cell values are converted with
        str() and joined by single spaces. Rows without values yield an
        empty line.
        """
        wb = load_workbook(filepath, data_only=True)
        rows = []
        for sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]
            for row in sheet.iter_rows():
                cells = [str(cell.value) for cell in row if cell.value is not None]
                rows.append(" ".join(cells))
        return "\n".join(rows)

    def extract_ipynb(self, filepath):
        """
        Extract text from a Jupyter notebook (.ipynb) file.

        Reads the top-level "cells" list and concatenates each cell's
        "source" (a list of lines or a single string); cells are separated
        by newlines.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            notebook = json.load(f)
        return "\n".join("".join(cell["source"]) for cell in notebook["cells"])

    def extract_text(self, filepath):
        """Extract text from plain text files (.txt, .md, .py, .js, etc.)"""
        # Undecodable bytes are dropped rather than raising.
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# --- Test it ---
|
| 104 |
+
if __name__ == "__main__":
    import sys

    # Manual check: extract one file given on the command line.
    extractor = Extractor()
    cli_args = sys.argv[1:]

    if not cli_args:
        print("Usage: python -m indexer.extractor <filepath>")
    else:
        target = cli_args[0]
        extracted = extractor.extract(target)
        print(f"Extracted {len(extracted)} characters from {target}")
        print(f"Preview:\n{extracted[:500]}")
|
indexer/pipeline.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# indexer/pipeline.py
|
| 2 |
+
|
| 3 |
+
import hashlib
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
from evaluation.dataset_loader import DatasetLoader
|
| 7 |
+
from indexer.crawler import Crawler
|
| 8 |
+
from indexer.extractor import Extractor
|
| 9 |
+
from indexer.chunker import Chunker
|
| 10 |
+
from indexer.embedder import Embedder
|
| 11 |
+
from indexer.store import Store
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class IndexingPipeline:
    """
    Wires all indexer modules together.

    The flow for each file:
        Crawler (discover + hash check)
            → Extractor (file → raw text)
            → Chunker (text → chunks with metadata)
            → Embedder (chunks → vectors)
            → Store (vectors + chunk metadata + file hashes)
    """

    def __init__(self, config_path="config.yaml"):
        """
        Initialize all pipeline components.

        Args:
            config_path (str) — config file handed to Crawler, Embedder
                                and Store
        """
        self.config_path = config_path
        self.crawler = Crawler(config_path)
        self.extractor = Extractor()
        self.chunker = Chunker(chunk_size=500, overlap=50)
        self.embedder = Embedder(config_path)
        self.store = Store(config_path)

    def _iter_dataset_documents(self):
        """
        Yield BEIR corpus documents as synthetic files so hosted deployments
        can build an index from dataset folders containing corpus.jsonl.

        Yields:
            tuple(str, str, str) — (synthetic_path, sha256_of_text, text),
            where synthetic_path looks like "<dataset folder name>://<doc_id>".
            Watch paths without a corpus.jsonl, corpora that fail to load,
            and documents with neither title nor text are skipped.
        """
        for dataset_path in self.crawler.watch_paths:
            corpus_path = os.path.join(dataset_path, "corpus.jsonl")
            if not os.path.exists(corpus_path):
                continue

            dataset_name = os.path.basename(os.path.normpath(dataset_path))

            try:
                corpus = DatasetLoader(dataset_path).load_corpus()
            except Exception as e:
                # Best-effort: a broken dataset must not block the other paths.
                print(f"[Pipeline] Could not load dataset corpus from {dataset_path}: {e}")
                continue

            for doc_id, doc in corpus.items():
                title = (doc.get("title") or "").strip()
                body = (doc.get("text") or "").strip()
                text = "\n\n".join(part for part in [title, body] if part).strip()
                if not text:
                    continue

                synthetic_path = f"{dataset_name}://{doc_id}"
                synthetic_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
                yield synthetic_path, synthetic_hash, text

    def run(self):
        """
        Execute the full indexing pipeline.

        Steps:
            1. Ask the crawler which real files are new/modified/deleted.
            2. Merge in dataset documents (paths containing "://"), treating
               them as new/modified/deleted by comparing stored hashes.
            3. Remove everything that was deleted.
            4. Extract → chunk → embed → store each pending item.
        """
        known_hashes = self.store.load_hashes()
        print("Scanning for new/modified files...")
        files_to_process, current_hashes, deleted_files = (
            self.crawler.get_new_and_modified(known_hashes)
        )

        dataset_documents = list(self._iter_dataset_documents())
        # Synthetic dataset paths are the stored paths containing "://".
        known_dataset_hashes = {
            filepath: file_hash
            for filepath, file_hash in known_hashes.items()
            if "://" in filepath
        }

        for filepath, file_hash, text in dataset_documents:
            current_hashes[filepath] = file_hash
            if known_dataset_hashes.get(filepath) != file_hash:
                # Dataset items carry their text along as a (path, text) tuple.
                files_to_process.append((filepath, text))

        current_dataset_paths = {filepath for filepath, _, _ in dataset_documents}
        deleted_files = set(deleted_files) | (
            set(known_dataset_hashes.keys()) - current_dataset_paths
        )

        for filepath in deleted_files:
            self.store.remove_file_chunks(filepath)

        if not files_to_process:
            print("Index is up to date.")
            print(f"Total vectors: {self.store.get_total_vectors()}")
            return

        total = len(files_to_process)
        for i, item in enumerate(files_to_process, 1):
            if isinstance(item, tuple):
                filepath, text = item
            else:
                filepath = item
                text = self.extractor.extract(filepath)

            print(f"[{i}/{total}] {filepath}")
            if not text.strip():
                print(" Skipping (no text extracted)")
                # A file that used to have content but is now empty or
                # unreadable must not leave its old chunks searchable.
                self.store.remove_file_chunks(filepath)
                continue

            chunks = self.chunker.chunk_file(text, filepath)
            chunk_texts = [c["text"] for c in chunks]
            embeddings = self.embedder.embed_chunks(chunk_texts)
            # Replace, don't append: drop the previous version's chunks first.
            self.store.remove_file_chunks(filepath)
            self.store.add_chunks(chunks, embeddings)
            self.store.save_file_info(filepath, current_hashes[filepath], len(chunks))

        print(f"\nProcessed {total} files.")
        print(f"Total vectors: {self.store.get_total_vectors()}")
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# --- Test it ---
|
| 123 |
+
if __name__ == "__main__":
    # Build the pipeline from the default config and index everything once.
    IndexingPipeline().run()
|
indexer/store.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# indexer/store.py
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sqlite3
|
| 5 |
+
import numpy as np
|
| 6 |
+
import faiss
|
| 7 |
+
import yaml
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class Store:
    """
    Handles two storage systems:

    1. FAISS — stores dense vectors for similarity search.
       Uses IndexHNSWFlat (wrapped in IndexIDMap2) instead of IndexFlatL2.
       HNSW = Hierarchical Navigable Small World graph
         - IndexFlatL2  : exact, scans every vector (slow at scale)
         - IndexHNSWFlat: approximate, graph-based navigation (fast, with
                          recall controlled by efSearch)

    2. SQLite — stores metadata about each chunk and each indexed file.

    The SQLite chunk id and the FAISS vector id are the same integer.
    """

    # HNSW parameter — higher = more accurate but more memory
    HNSW_M = 32

    def __init__(self, config_path="config.yaml"):
        """
        Load config, set up file paths, initialize SQLite and the FAISS index.

        Args:
            config_path (str) — path to config.yaml; must contain "data_dir"
        """
        with open(config_path, "r") as f:
            config = yaml.safe_load(f)

        self.data_dir = config["data_dir"]
        os.makedirs(self.data_dir, exist_ok=True)

        self.faiss_path = os.path.join(self.data_dir, "index.faiss")
        self.db_path = os.path.join(self.data_dir, "metadata.db")

        self._init_db()
        self._load_or_create_index()

    def _init_db(self):
        """
        Create the SQLite tables (chunks, files) if they don't already exist.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS chunks (
                    id INTEGER PRIMARY KEY,
                    filepath TEXT NOT NULL,
                    chunk_text TEXT NOT NULL,
                    chunk_index INTEGER,
                    FOREIGN KEY (filepath) REFERENCES files(filepath)
                )
            ''')

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS files (
                    filepath TEXT PRIMARY KEY,
                    file_hash TEXT NOT NULL,
                    total_chunks INTEGER
                )
            ''')

            conn.commit()
        finally:
            # Close even when a statement fails so the handle is not leaked.
            conn.close()

    def _load_or_create_index(self):
        """
        Load an existing FAISS index from disk, or set self.index to None.
        The actual index is created on the first add_chunks() call, when
        the embedding dimension is known.
        """
        if os.path.exists(self.faiss_path):
            self.index = faiss.read_index(self.faiss_path)
            print(f"[Store] Loaded FAISS index — {self.index.ntotal} vectors")
        else:
            self.index = None
            print("[Store] No existing index found — will create on first insert")

    def _new_hnsw_index(self, dimension):
        """Build and return an empty IndexIDMap2(IndexHNSWFlat) of the given dimension."""
        hnsw_index = faiss.IndexHNSWFlat(dimension, self.HNSW_M)
        hnsw_index.hnsw.efSearch = 64        # search quality — higher = better recall
        hnsw_index.hnsw.efConstruction = 64  # build quality — higher = better graph
        return faiss.IndexIDMap2(hnsw_index)

    def _create_hnsw_index(self, dimension: int):
        """
        Create a new HNSW-based FAISS index and store it in self.index.

        IndexIDMap2 wraps HNSW so vectors can be addressed by custom integer
        ids. Note that HNSW itself has no in-place deletion; see
        remove_file_chunks() for how removal is handled.

        Args:
            dimension — embedding size (384 for MiniLM and BGE-small)
        """
        self.index = self._new_hnsw_index(dimension)
        print(f"[Store] Created HNSW index — dim={dimension}, M={self.HNSW_M}")

    def _rebuild_index_without(self, removed_ids):
        """
        Replace self.index by a fresh HNSW index holding every stored vector
        except those whose id is in removed_ids.

        Used as the deletion path for HNSW, which cannot remove vectors in
        place. Vectors are read back from the wrapped index in insertion
        order, which is the order of IndexIDMap2.id_map.
        """
        stored_ids = faiss.vector_to_array(self.index.id_map)
        keep = ~np.isin(stored_ids, removed_ids)
        dimension = self.index.d

        rebuilt = self._new_hnsw_index(dimension)
        if keep.any():
            base = self.index.index
            vectors = base.reconstruct_n(0, int(base.ntotal))
            rebuilt.add_with_ids(
                np.ascontiguousarray(vectors[keep], dtype="float32"),
                np.ascontiguousarray(stored_ids[keep], dtype=np.int64),
            )
        self.index = rebuilt

    def get_next_id(self):
        """
        Get the next available chunk id from SQLite.

        Returns:
            int — 0 for an empty chunks table, otherwise MAX(id) + 1
        """
        conn = sqlite3.connect(self.db_path)
        try:
            highest = conn.execute("SELECT MAX(id) FROM chunks").fetchone()[0]
        finally:
            conn.close()
        return 0 if highest is None else highest + 1

    def add_chunks(self, chunks_with_metadata, embeddings):
        """
        Add new chunks and their embeddings to both FAISS and SQLite.

        Args:
            chunks_with_metadata (list[dict]) — each dict has the keys
                                   "text", "filepath" and "chunk_index"
            embeddings — array-like of shape (num_chunks, embedding_dim),
                         one row per chunk, in the same order

        An empty chunk list is a no-op.
        """
        if not chunks_with_metadata:
            # Nothing to store; also avoids reading shape[1] of an empty array.
            return

        embeddings = np.asarray(embeddings, dtype="float32")

        # create index on first insert — dimension comes from embeddings
        if self.index is None:
            self._create_hnsw_index(embeddings.shape[1])

        start_id = self.get_next_id()
        ids = np.arange(start_id, start_id + len(chunks_with_metadata), dtype=np.int64)

        self.index.add_with_ids(embeddings, ids)
        faiss.write_index(self.index, self.faiss_path)

        # save chunk metadata to SQLite under the same ids
        rows = [
            (start_id + i, chunk["filepath"], chunk["text"], chunk["chunk_index"])
            for i, chunk in enumerate(chunks_with_metadata)
        ]
        conn = sqlite3.connect(self.db_path)
        try:
            conn.executemany(
                "INSERT INTO chunks (id, filepath, chunk_text, chunk_index) "
                "VALUES (?, ?, ?, ?)",
                rows,
            )
            conn.commit()
        finally:
            conn.close()

    def save_file_info(self, filepath, file_hash, total_chunks):
        """
        Save or update file info in SQLite.

        Args:
            filepath     — file path or synthetic path e.g. "scifact://12345"
            file_hash    — hash string identifying the file's current content
            total_chunks — number of chunks this file was split into
        """
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute(
                "INSERT OR REPLACE INTO files (filepath, file_hash, total_chunks) "
                "VALUES (?, ?, ?)",
                (filepath, file_hash, total_chunks),
            )
            conn.commit()
        finally:
            conn.close()

    def load_hashes(self):
        """
        Load all stored file hashes from SQLite.

        Returns:
            dict — {filepath: hash_string}
        """
        conn = sqlite3.connect(self.db_path)
        try:
            rows = conn.execute("SELECT filepath, file_hash FROM files").fetchall()
        finally:
            conn.close()
        return {filepath: file_hash for filepath, file_hash in rows}

    def remove_file_chunks(self, filepath):
        """
        Delete all chunks for a file from both SQLite and FAISS, and drop
        the file's row from the files table.

        Args:
            filepath — the filepath to remove
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            rows = cursor.execute(
                "SELECT id FROM chunks WHERE filepath = ?", (filepath,)
            ).fetchall()
            cursor.execute("DELETE FROM chunks WHERE filepath = ?", (filepath,))
            cursor.execute("DELETE FROM files WHERE filepath = ?", (filepath,))
            conn.commit()
        finally:
            conn.close()

        if rows and self.index is not None:
            id_array = np.array([row[0] for row in rows], dtype=np.int64)
            try:
                self.index.remove_ids(id_array)
            except RuntimeError:
                # FAISS raises for index types without remove_ids support
                # (HNSW is one of them); rebuild without the removed vectors
                # so SQLite and FAISS stay in sync.
                self._rebuild_index_without(id_array)
            faiss.write_index(self.index, self.faiss_path)

    def get_total_vectors(self):
        """
        Return how many vectors are in the FAISS index.

        Returns:
            int — number of vectors, or 0 if no index exists yet
        """
        if self.index is None:
            return 0
        return self.index.ntotal
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
if __name__ == "__main__":
    # Manual smoke test: insert three fake chunks with random vectors.
    store = Store()

    sample_rows = [
        ("quarterly budget report summary", "/docs/report.pdf", 0),
        ("revenue increased by fifteen percent", "/docs/report.pdf", 1),
        ("python machine learning tutorial", "/docs/tutorial.txt", 0),
    ]
    fake_chunks = [
        {"text": text, "filepath": path, "chunk_index": position}
        for text, path, position in sample_rows
    ]

    fake_embeddings = np.random.rand(3, 384).astype("float32")

    print(f"Vectors before: {store.get_total_vectors()}")
    store.add_chunks(fake_chunks, fake_embeddings)
    print(f"Vectors after: {store.get_total_vectors()}")
|
indexer/watcher.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# indexer/watcher.py
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
from watchdog.observers import Observer
|
| 6 |
+
from watchdog.events import FileSystemEventHandler
|
| 7 |
+
from indexer.pipeline import IndexingPipeline
|
| 8 |
+
import yaml
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class IndexHandler(FileSystemEventHandler):
    """
    Handles filesystem events detected by watchdog.

    watchdog calls these methods automatically:
      - on_created(event)  → new file added
      - on_modified(event) → existing file changed
      - on_deleted(event)  → file removed
    """

    def __init__(self, pipeline, config_path="config.yaml"):
        """
        Args:
            pipeline (IndexingPipeline) — existing pipeline instance whose
                     extractor, chunker, embedder, store and crawler are reused
            config_path (str) — config file; must contain "debounce_seconds"
        """
        with open(config_path) as f:
            config = yaml.safe_load(f)
        self._debounce_seconds = config["debounce_seconds"]

        self.pipeline = pipeline
        self.include_extensions = self.pipeline.crawler.include_extensions
        self._last_event = {}  # {filepath: timestamp of last handled event}

    def _is_duplicate(self, filepath):
        """
        Check if we've already handled an event for this file recently.

        Returns True if the event should be skipped; otherwise records the
        current time for the path and returns False.
        """
        now = time.time()
        last = self._last_event.get(filepath, 0)
        if now - last < self._debounce_seconds:
            return True
        self._last_event[filepath] = now
        return False

    def _is_relevant(self, filepath):
        """
        Check if a file event is for a file type we care about.

        Args:
            filepath (str) — path from the event

        Returns:
            bool — True if the file extension is in our include list
        """
        ext = os.path.splitext(filepath)[1].lower()
        return ext in self.include_extensions

    def _is_indexable_file(self, event):
        """Return True for events about a non-directory path with an included extension."""
        return not event.is_directory and self._is_relevant(event.src_path)

    def on_created(self, event):
        """
        Called when a new file is created: extract, chunk, embed and store it.

        Args:
            event — watchdog event
        """
        if not self._is_indexable_file(event):
            return
        if self._is_duplicate(event.src_path):
            return

        path = event.src_path
        print(f"New file detected: {path}")
        text = self.pipeline.extractor.extract(path)
        if not text.strip():
            print(" Skipping (no text extracted)")
            return

        chunks = self.pipeline.chunker.chunk_file(text, path)
        chunk_texts = [c["text"] for c in chunks]
        embeddings = self.pipeline.embedder.embed_chunks(chunk_texts)
        # Drop anything already stored under this path before inserting.
        self.pipeline.store.remove_file_chunks(path)
        self.pipeline.store.add_chunks(chunks, embeddings)

        file_hash = self.pipeline.crawler.compute_hash(path)
        self.pipeline.store.save_file_info(path, file_hash, len(chunks))
        print(f" File stored: {path}")

    def on_modified(self, event):
        """
        Called when an existing file is modified: remove its old chunks,
        then re-extract and re-index it.

        Args:
            event - watchdog event
        """
        if not self._is_indexable_file(event):
            return
        if self._is_duplicate(event.src_path):
            return

        path = event.src_path
        print(f"File modified: {path}")

        # Old content is removed first, so a file that became empty
        # disappears from the index.
        self.pipeline.store.remove_file_chunks(path)
        text = self.pipeline.extractor.extract(path)
        if not text.strip():
            print(" Skipping (no text extracted)")
            return

        chunks = self.pipeline.chunker.chunk_file(text, path)
        chunk_texts = [c["text"] for c in chunks]
        embeddings = self.pipeline.embedder.embed_chunks(chunk_texts)
        self.pipeline.store.add_chunks(chunks, embeddings)

        file_hash = self.pipeline.crawler.compute_hash(path)
        self.pipeline.store.save_file_info(path, file_hash, len(chunks))
        print(f" File saved: {path}")

    def on_deleted(self, event):
        """
        Called when a file is deleted: remove its chunks from the store.

        Args:
            event - watchdog event
        """
        if not self._is_indexable_file(event):
            return

        print(f"File deleted: {event.src_path}")
        self.pipeline.store.remove_file_chunks(event.src_path)
        # Forget the debounce timestamp: otherwise a file re-created right
        # after deletion would be ignored as a "duplicate" and stay out of
        # the index, and the dict would keep entries for deleted files.
        self._last_event.pop(event.src_path, None)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
class Watcher:
    """
    Starts a watchdog observer on all configured watch_paths.
    Runs continuously until the user presses Ctrl+C.
    """

    def __init__(self, config_path="config.yaml"):
        """
        Initialize the Watcher.

        Args:
            config_path (str) — config file used for both the pipeline
                                and the event handler
        """
        self.pipeline = IndexingPipeline(config_path)
        # Pass config_path through: previously the handler silently fell back
        # to "config.yaml" even when a different config was requested.
        self.handler = IndexHandler(self.pipeline, config_path)
        self.watch_paths = self.pipeline.crawler.watch_paths

    def start(self):
        """
        Start watching all configured directories (recursively) and block
        until interrupted; the observer is always stopped and joined.
        """
        observer = Observer()
        for path in self.watch_paths:
            observer.schedule(self.handler, path, recursive=True)
        observer.start()

        print(f"Watchdog active. Watching {', '.join(self.watch_paths)}")

        try:
            # Events are handled on the observer's thread; just keep the
            # main thread alive.
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            print("Stopping watcher...")
        finally:
            observer.stop()
            observer.join()
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# --- Test it ---
|
| 179 |
+
if __name__ == "__main__":
    # Step 1: bring the index up to date with what is already on disk.
    print("Running initial index...")
    file_watcher = Watcher()
    file_watcher.pipeline.run()

    # Step 2: keep it up to date by reacting to filesystem events.
    print("\nStarting file watcher...")
    file_watcher.start()
|
main.py
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# main.py
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import time
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
import yaml
|
| 8 |
+
from fastapi import FastAPI, Request, Form
|
| 9 |
+
from fastapi.responses import HTMLResponse
|
| 10 |
+
from fastapi.staticfiles import StaticFiles
|
| 11 |
+
from fastapi.templating import Jinja2Templates
|
| 12 |
+
|
| 13 |
+
from evaluation.dataset_loader import DatasetLoader
|
| 14 |
+
|
| 15 |
+
# Application object, static-file mount and template loader.
app = FastAPI(title="Semantic Search Engine")
app.mount(path="/static", app=StaticFiles(directory="static"), name="static")

templates = Jinja2Templates(directory="templates")
|
| 19 |
+
|
| 20 |
+
# ── search engine (created on first use, then cached) ───────────────────────
# Text of the last engine start-up failure, or None when there was none.
ENGINE_ERROR = None


@lru_cache(maxsize=1)
def get_engine():
    """
    Return the shared SearchEngine, or None if it could not be created.

    The result is memoised by ``lru_cache``, so construction is attempted
    only once; a failed attempt (None) is cached as well. On failure the
    exception text is stored in the module-level ENGINE_ERROR.

    Returns:
        SearchEngine built from "config.yaml", or None
    """
    global ENGINE_ERROR
    try:
        # Imported lazily so a broken searcher package does not stop the app.
        from searcher.search_engine import SearchEngine
        ENGINE_ERROR = None
        engine = SearchEngine("config.yaml")
    except Exception as exc:
        ENGINE_ERROR = str(exc)
        print(f"[Startup] Search engine unavailable: {exc}")
        return None
    return engine
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# ── dataset queries (loaded at import time) ─────────────────────────────────
# The SciFact and NFCorpus query texts are kept in memory so a search can
# show which dataset queries resemble what the user typed.

def load_dataset_queries() -> dict:
    """
    Read the query sets of the SciFact and NFCorpus datasets.

    A dataset whose directory is missing, or whose loading raises, is
    reported on stdout and mapped to an empty dict.

    Returns:
        dict — {
            "scifact":  {query_id: query_text, ...},
            "nfcorpus": {query_id: query_text, ...},
        }
    """
    dataset_dirs = (
        ("scifact", "data/scifact"),
        ("nfcorpus", "data/nfcorpus"),
    )
    loaded = {}

    for name, directory in dataset_dirs:
        loaded[name] = {}  # stays empty unless loading succeeds

        if not os.path.exists(directory):
            print(f"[Startup] Dataset path not found: {directory}")
            continue

        try:
            loaded[name] = DatasetLoader(directory).load_queries()
            print(f"[Startup] Loaded {len(loaded[name])} queries from {name}")
        except Exception as exc:
            print(f"[Startup] Could not load {name} queries: {exc}")
            loaded[name] = {}

    return loaded


# Built once when the module is imported; read by the request handlers.
DATASET_QUERIES = load_dataset_queries()
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# ── helpers ──────────────────────────────────────────────────────────────────
|
| 79 |
+
|
| 80 |
+
def load_eval_results() -> dict:
    """
    Load the saved evaluation metrics from results/eval_all.json.

    The file is read as UTF-8. A missing file yields an empty dict, as
    before. An unreadable file, invalid JSON, or a top-level value that is
    not a JSON object also yields an empty dict (with a message on stdout)
    so that callers iterating ``.items()`` do not crash.

    Returns:
        dict — the parsed JSON object, or {} when nothing usable exists
    """
    path = "results/eval_all.json"
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"[Dashboard] Could not read {path}: {e}")
        return {}

    if not isinstance(data, dict):
        print(f"[Dashboard] Ignoring {path}: top-level value is not an object")
        return {}
    return data
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def extract_doc_id(filepath: str) -> str:
    """
    Strip a leading "<scheme>://" marker from a stored path.

    Returns:
        str — everything after the first "://", or filepath unchanged
              when it contains no "://"
    """
    _scheme, separator, remainder = filepath.partition("://")
    return remainder if separator else filepath
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def get_dataset_from_filepath(filepath: str) -> str:
    """
    Name the origin of a result from the marker contained in its path.

    Returns:
        str — "scifact" or "nfcorpus" when the matching "<name>://" marker
              occurs in filepath (checked in that order), else "filesystem"
    """
    markers = (
        ("scifact://", "scifact"),
        ("nfcorpus://", "nfcorpus"),
    )
    for marker, dataset in markers:
        if marker in filepath:
            return dataset
    return "filesystem"
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def get_file_icon(filepath: str) -> str:
    """
    Pick the emoji shown next to a search result.

    Dataset markers win over file extensions; the extension is whatever
    follows the last "." in the lower-cased path.

    Returns:
        str — a single emoji; "📄" when nothing more specific applies
    """
    dataset_icons = (
        ("scifact://", "🔬"),
        ("nfcorpus://", "🏥"),
    )
    for marker, icon in dataset_icons:
        if marker in filepath:
            return icon

    _stem, dot, suffix = filepath.lower().rpartition(".")
    extension = suffix if dot else ""

    by_extension = {
        "pdf": "📄",
        "docx": "📝",
        "txt": "📃",
        "pptx": "📊",
        "xlsx": "📋",
        "py": "🐍",
    }
    return by_extension.get(extension, "📄")
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def find_matching_dataset_queries(
    user_query: str,
    top_results: list,
    max_matches: int = 8,
) -> list:
    """
    Find dataset queries that share words with what the user typed.

    Matching is plain word overlap (no model call): both texts are split
    on whitespace, surrounding punctuation is stripped, everything is
    lower-cased and words shorter than four characters are ignored. A
    dataset query matches when it shares at least one such word.

    Queries are tracked per dataset, so the same query id appearing in
    two datasets yields two separate matches.

    Args:
        user_query  - text typed by the user
        top_results - formatted search results; currently unused (no
                      doc-based matching is implemented here)
        max_matches - maximum number of matches returned (default 8)

    Returns:
        list of dicts, highest overlap first — [
            {
                "query_id":   "1234",
                "query_text": "Does vitamin D cause cancer?",
                "dataset":    "scifact",
                "match_type": "text",
                "overlap":    2,
            },
            ...
        ]
    """
    user_words = _significant_words(user_query)
    if not user_words:
        return []

    matched = []
    for dataset_name, queries in DATASET_QUERIES.items():
        for qid, qtext in queries.items():
            overlap = user_words & _significant_words(qtext)
            if overlap:
                matched.append({
                    "query_id": qid,
                    "query_text": qtext,
                    "dataset": dataset_name,
                    "match_type": "text",
                    "overlap": len(overlap),
                })

    # Stable sort: ties keep dataset / insertion order.
    matched.sort(key=lambda m: m["overlap"], reverse=True)
    return matched[:max(max_matches, 0)]


def _significant_words(text: str, min_length: int = 4) -> set:
    """Return the lower-cased words of text, punctuation-trimmed, of at least min_length chars."""
    import string

    trimmed = (w.strip(string.punctuation).lower() for w in text.split())
    return {w for w in trimmed if len(w) >= min_length}
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# ── routes ───────────────────────────────────────────────────────────────────
|
| 172 |
+
|
| 173 |
+
@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    """
    Render the landing page.

    The template receives the number of loaded SciFact and NFCorpus
    queries and the current value of ENGINE_ERROR.
    """
    context = {
        "request": request,
        "scifact_count": len(DATASET_QUERIES.get("scifact", {})),
        "nfcorpus_count": len(DATASET_QUERIES.get("nfcorpus", {})),
        "error": ENGINE_ERROR,
    }
    return templates.TemplateResponse("index.html", context)
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
@app.post("/search", response_class=HTMLResponse)
async def search(
    request: Request,
    query: str = Form(...),
    top_k: int = Form(10),
    mode: str = Form("full"),
):
    """
    Run a search and render the results page.

    The index page is rendered with an error message instead when the
    query is blank, the engine is unavailable, or the engine raises.

    Args:
        request - incoming request, passed through to the templates
        query   - text typed by the user
        top_k   - requested number of results; clamped to 1.._MAX_TOP_K
                  because it comes straight from the submitted form
        mode    - echoed back to the template; it is not passed to the
                  engine by this handler
    """
    query_text = query.strip()
    if not query_text:
        return _render_index(request, "Please enter a search query.")

    engine = get_engine()
    if engine is None:
        return _render_index(request, (
            "Search is not ready yet. The semantic index is still missing or failed to build. "
            f"Startup details: {ENGINE_ERROR}"
        ))

    top_k = max(1, min(top_k, _MAX_TOP_K))

    t0 = time.time()
    try:
        output = engine.search(query_text, top_k=top_k)
    except Exception as e:
        # Request boundary: report the failure on the page instead of a bare 500.
        print(f"[Search] Query failed: {e}")
        return _render_index(request, f"Search failed: {e}")
    elapsed = round(time.time() - t0, 3)

    results = [_format_result(r) for r in output.get("results", [])]

    # find matching dataset queries and group them by dataset for display
    matched_queries = find_matching_dataset_queries(query_text, results)
    matched_scifact = [q for q in matched_queries if q["dataset"] == "scifact"]
    matched_nfcorpus = [q for q in matched_queries if q["dataset"] == "nfcorpus"]

    return templates.TemplateResponse("results.html", {
        "request": request,
        "query": query,
        "results": results,
        "total": len(results),
        "elapsed": elapsed,
        "mode": mode,
        "top_k": top_k,
        "matched_scifact": matched_scifact,
        "matched_nfcorpus": matched_nfcorpus,
        "total_matched": len(matched_queries),
    })


# Upper bound for the user-supplied top_k.
_MAX_TOP_K = 100
# Snippets longer than this are cut at the last space before the limit.
_SNIPPET_CHARS = 200


def _render_index(request, error):
    """Render index.html with the dataset query counts and the given error text."""
    return templates.TemplateResponse("index.html", {
        "request": request,
        "error": error,
        "scifact_count": len(DATASET_QUERIES.get("scifact", {})),
        "nfcorpus_count": len(DATASET_QUERIES.get("nfcorpus", {})),
    })


def _format_result(r: dict) -> dict:
    """Turn one raw engine result into the dict the results template displays."""
    filepath = r.get("filepath") or ""

    # Use the first score that is present: rerank, then RRF, then dense.
    score = 0
    for key in ("rerank_score", "rrf_score", "dense_score"):
        if r.get(key) is not None:
            score = r[key]
            break

    snippet = r.get("chunk_text") or r.get("text") or "No preview available."
    if len(snippet) > _SNIPPET_CHARS:
        snippet = snippet[:_SNIPPET_CHARS].rsplit(" ", 1)[0] + "..."

    return {
        "doc_id": extract_doc_id(filepath),
        "filepath": filepath,
        "score": round(float(score), 4),
        "snippet": snippet,
        "icon": get_file_icon(filepath),
        "dataset": get_dataset_from_filepath(filepath),
    }
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
@app.get("/dashboard", response_class=HTMLResponse)
async def dashboard(request: Request):
    """
    Render the evaluation dashboard.

    One row is built per dataset in the saved evaluation results. The
    headline metrics come from that dataset's "full" entry and default to
    zero when absent; the complete per-mode data is passed along too.
    """
    # (template field, key inside the "full" results)
    metric_keys = (
        ("ndcg", "NDCG@10"),
        ("mrr", "MRR"),
        ("map", "MAP@100"),
        ("recall", "Recall@100"),
        ("precision", "P@10"),
    )

    datasets = []
    for name, by_mode in load_eval_results().items():
        full = by_mode.get("full", {})
        row = {"name": name}
        for field, key in metric_keys:
            row[field] = full.get(key, 0.0)
        row["queries"] = full.get("num_queries", 0)
        row["modes"] = by_mode
        datasets.append(row)

    context = {"request": request, "datasets": datasets}
    return templates.TemplateResponse("dashboard.html", context)
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
@app.get("/health")
async def health():
    """
    Report whether the search engine is available.

    Returns:
        dict — "status" ("ok" or "degraded"), "engine_ready" (bool) and
               "engine_error" (current value of ENGINE_ERROR)
    """
    # Call get_engine() first so ENGINE_ERROR reflects the load attempt.
    ready = get_engine() is not None
    status = "ok" if ready else "degraded"
    return {
        "status": status,
        "engine_ready": ready,
        "engine_error": ENGINE_ERROR,
    }
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
if __name__ == "__main__":
    # Script entry point: serve this module's app with auto-reload enabled.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 7860, "reload": True}
    uvicorn.run("main:app", **server_options)
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
# CLI alternative (note: the __main__ block above serves on port 7860, not 8000):
#   uvicorn main:app --reload --host 0.0.0.0 --port 8000
|
requirements.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
jinja2
|
| 4 |
+
python-multipart
|
| 5 |
+
sentence-transformers
|
| 6 |
+
transformers
|
| 7 |
+
torch
|
| 8 |
+
faiss-cpu
|
| 9 |
+
numpy
|
| 10 |
+
scipy
|
| 11 |
+
scikit-learn
|
| 12 |
+
networkx
|
| 13 |
+
nltk
|
| 14 |
+
pandas
|
| 15 |
+
pyyaml
|
| 16 |
+
python-docx
|
| 17 |
+
python-pptx
|
| 18 |
+
openpyxl
|
| 19 |
+
pillow
|
| 20 |
+
lxml
|
| 21 |
+
PyMuPDF
|
searcher/__init__.py
ADDED
|
File without changes
|
searcher/__pycache__/__init__.cpython-313.pyc.2070577919488
ADDED
|
Binary file (133 Bytes). View file
|
|
|
searcher/__pycache__/dense_retriever.cpython-313.pyc.2070577919488
ADDED
|
Binary file (4.17 kB). View file
|
|
|
searcher/__pycache__/facet_filter.cpython-313.pyc.2070577919488
ADDED
|
Binary file (3.08 kB). View file
|
|
|
searcher/__pycache__/fusion_ranker.cpython-313.pyc.2070577919488
ADDED
|
Binary file (3.62 kB). View file
|
|
|
searcher/__pycache__/highlighter.cpython-313.pyc.2070577919488
ADDED
|
Binary file (4.27 kB). View file
|
|
|
searcher/__pycache__/query_understanding.cpython-313.pyc.2070578319792
ADDED
|
Binary file (4.41 kB). View file
|
|
|