Spaces:
Sleeping
Sleeping
tarakjc2c
committed on
Commit
·
bdf83a5
1
Parent(s):
b7f3196
Fix: Auto-build corpora on first launch
Browse files- app_retrieval_cached.py +36 -0
app_retrieval_cached.py
CHANGED
|
@@ -9,6 +9,8 @@ from pathlib import Path
|
|
| 9 |
import pickle
|
| 10 |
import hashlib
|
| 11 |
import json
|
|
|
|
|
|
|
| 12 |
from retriever.index_bm25 import BM25Index
|
| 13 |
from retriever.index_dense import DenseIndex
|
| 14 |
from retriever.ingest import load_jsonl
|
|
@@ -26,6 +28,36 @@ print(" With disk caching for fast startup!")
|
|
| 26 |
print("=" * 70)
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def _default_corpora_config() -> Dict[str, dict]:
|
| 30 |
return {
|
| 31 |
"medical_qa": {"path": "data/corpora/medical_qa.jsonl",
|
|
@@ -139,6 +171,10 @@ class CachedRetriever:
|
|
| 139 |
return dense_index
|
| 140 |
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
# Initialize cached retriever (fast if cached, slow first time)
|
| 143 |
print("\nInitializing retrieval system...")
|
| 144 |
cfg = _available(_default_corpora_config())
|
|
|
|
| 9 |
import pickle
|
| 10 |
import hashlib
|
| 11 |
import json
|
| 12 |
+
import subprocess
|
| 13 |
+
import sys
|
| 14 |
from retriever.index_bm25 import BM25Index
|
| 15 |
from retriever.index_dense import DenseIndex
|
| 16 |
from retriever.ingest import load_jsonl
|
|
|
|
| 28 |
print("=" * 70)
|
| 29 |
|
| 30 |
|
| 31 |
+
def _ensure_corpora_exist():
    """Build the corpora files if any of them is missing.

    Looks in ``data/corpora`` (relative to the current working directory)
    for the three expected JSONL files. When all of them are present the
    function returns immediately. Otherwise it runs
    ``adapters/build_corpora.py`` with the current Python interpreter and
    lets the script's output stream straight to the console.

    Raises:
        RuntimeError: If the build script cannot be started or exits with a
            non-zero status. The underlying error is chained as
            ``__cause__`` so the original failure stays visible in the
            traceback.
    """
    data_dir = Path("data/corpora")
    required_files = [
        data_dir / "medical_qa.jsonl",
        data_dir / "miriad_text.jsonl",
        data_dir / "unidoc_qa.jsonl",
    ]

    missing = [f for f in required_files if not f.exists()]
    if not missing:
        return  # All files exist

    print("\n" + "=" * 70)
    print("⚠️ Corpora files not found. Building them now...")
    print("   Missing: " + ", ".join(str(f) for f in missing))
    print("   This will take 2-3 minutes on first launch.")
    print("=" * 70 + "\n")

    try:
        # Run build_corpora.py with the same interpreter that runs this app;
        # output is not captured so build progress is shown live.
        subprocess.run(
            [sys.executable, "adapters/build_corpora.py"],
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the case where the interpreter itself cannot be
        # launched; both failures get the same manual-recovery hint.
        print(f"\n✗ Failed to build corpora: {e}")
        raise RuntimeError(
            "Could not build corpora files. "
            "Please run 'python adapters/build_corpora.py' manually."
        ) from e

    # A zero exit status does not prove the files were written, so only
    # report success once they are actually on disk. Missing files are
    # reported as a warning rather than an error, so startup continues with
    # whatever corpora are present.
    still_missing = [f for f in required_files if not f.exists()]
    if still_missing:
        print(
            "\n⚠️ Build finished but some corpora files are still missing: "
            + ", ".join(str(f) for f in still_missing)
            + "\n"
        )
    else:
        print("\n✓ Corpora files built successfully!\n")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
def _default_corpora_config() -> Dict[str, dict]:
|
| 62 |
return {
|
| 63 |
"medical_qa": {"path": "data/corpora/medical_qa.jsonl",
|
|
|
|
| 171 |
return dense_index
|
| 172 |
|
| 173 |
|
| 174 |
+
|
| 175 |
+
# Ensure corpora files exist (auto-build if missing)
|
| 176 |
+
_ensure_corpora_exist()
|
| 177 |
+
|
| 178 |
# Initialize cached retriever (fast if cached, slow first time)
|
| 179 |
print("\nInitializing retrieval system...")
|
| 180 |
cfg = _available(_default_corpora_config())
|