tarakjc2c committed on
Commit
bdf83a5
·
1 Parent(s): b7f3196

Fix: Auto-build corpora on first launch

Browse files
Files changed (1) hide show
  1. app_retrieval_cached.py +36 -0
app_retrieval_cached.py CHANGED
@@ -9,6 +9,8 @@ from pathlib import Path
9
  import pickle
10
  import hashlib
11
  import json
 
 
12
  from retriever.index_bm25 import BM25Index
13
  from retriever.index_dense import DenseIndex
14
  from retriever.ingest import load_jsonl
@@ -26,6 +28,36 @@ print(" With disk caching for fast startup!")
26
  print("=" * 70)
27
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def _default_corpora_config() -> Dict[str, dict]:
30
  return {
31
  "medical_qa": {"path": "data/corpora/medical_qa.jsonl",
@@ -139,6 +171,10 @@ class CachedRetriever:
139
  return dense_index
140
 
141
 
 
 
 
 
142
  # Initialize cached retriever (fast if cached, slow first time)
143
  print("\nInitializing retrieval system...")
144
  cfg = _available(_default_corpora_config())
 
9
  import pickle
10
  import hashlib
11
  import json
12
+ import subprocess
13
+ import sys
14
  from retriever.index_bm25 import BM25Index
15
  from retriever.index_dense import DenseIndex
16
  from retriever.ingest import load_jsonl
 
28
  print("=" * 70)
29
 
30
 
31
def _ensure_corpora_exist():
    """Build the corpora files under ``data/corpora`` if any are missing.

    Looks for the three required JSONL corpora relative to the current
    working directory. If all of them exist the function returns
    immediately. Otherwise it runs ``adapters/build_corpora.py`` with the
    interpreter that is running this process, letting the script's output
    stream straight to the console.

    Raises:
        RuntimeError: if the build script cannot be launched or exits with a
            non-zero status. The original error is chained as ``__cause__``.
    """
    data_dir = Path("data/corpora")
    required_files = [
        data_dir / "medical_qa.jsonl",
        data_dir / "miriad_text.jsonl",
        data_dir / "unidoc_qa.jsonl",
    ]

    if all(f.exists() for f in required_files):
        return  # All files exist

    print("\n" + "=" * 70)
    print("⚠️ Corpora files not found. Building them now...")
    print(" This will take 2-3 minutes on first launch.")
    print("=" * 70 + "\n")

    try:
        # Run build_corpora.py with the same interpreter (and therefore the
        # same environment) as the app itself.
        subprocess.run(
            [sys.executable, "adapters/build_corpora.py"],
            check=True,
            capture_output=False,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the case where the interpreter itself cannot be
        # launched (e.g. sys.executable is empty), which check=True does not
        # report as CalledProcessError.
        print(f"\n✗ Failed to build corpora: {e}")
        raise RuntimeError(
            "Could not build corpora files. "
            "Please run 'python adapters/build_corpora.py' manually."
        ) from e

    # The script can exit 0 without having produced every file; say so rather
    # than claiming success. This is only a warning so that start-up proceeds
    # exactly as it did before when some corpora are unavailable.
    still_missing = [f for f in required_files if not f.exists()]
    if still_missing:
        names = ", ".join(str(f) for f in still_missing)
        print(f"\n⚠️ Build finished but these corpora are still missing: {names}\n")
    else:
        print("\n✓ Corpora files built successfully!\n")
59
+
60
+
61
  def _default_corpora_config() -> Dict[str, dict]:
62
  return {
63
  "medical_qa": {"path": "data/corpora/medical_qa.jsonl",
 
171
  return dense_index
172
 
173
 
174
+
175
+ # Ensure corpora files exist (auto-build if missing)
176
+ _ensure_corpora_exist()
177
+
178
  # Initialize cached retriever (fast if cached, slow first time)
179
  print("\nInitializing retrieval system...")
180
  cfg = _available(_default_corpora_config())