JackSparrow89 committed on
Commit
6ff8eae
·
verified ·
1 Parent(s): 57c96c1

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +64 -7
main.py CHANGED
@@ -12,10 +12,13 @@ from fastapi.templating import Jinja2Templates
12
 
13
  from evaluation.dataset_loader import DatasetLoader
14
 
 
 
 
15
  app = FastAPI(title="Semantic Search Engine")
16
 
17
- app.mount("/static", StaticFiles(directory="static"), name="static")
18
- templates = Jinja2Templates(directory="templates")
19
 
20
  # ── load search engine once at startup ──────────────────────────────────────
21
  ENGINE_ERROR = None
@@ -27,13 +30,24 @@ def get_engine():
27
  try:
28
  from searcher.search_engine import SearchEngine
29
  ENGINE_ERROR = None
30
- return SearchEngine("config.yaml")
31
  except Exception as e:
32
  ENGINE_ERROR = str(e)
33
  print(f"[Startup] Search engine unavailable: {e}")
34
  return None
35
 
36
 
 
 
 
 
 
 
 
 
 
 
 
37
  # ── load dataset queries at startup ─────────────────────────────────────────
38
  # These are the actual queries from SciFact and NFCorpus
39
  # We use them to show "which dataset queries matched your search"
@@ -50,9 +64,11 @@ def load_dataset_queries() -> dict:
50
  """
51
  all_queries = {}
52
 
 
 
53
  datasets = {
54
- "scifact": "data/scifact",
55
- "nfcorpus": "data/nfcorpus",
56
  }
57
 
58
  for name, path in datasets.items():
@@ -72,19 +88,60 @@ def load_dataset_queries() -> dict:
72
 
73
 
74
  # load once at startup — available globally
75
- DATASET_QUERIES = load_dataset_queries()
 
 
 
 
 
 
 
 
76
 
77
 
78
  # ── helpers ──────────────────────────────────────────────────────────────────
79
 
80
  def load_eval_results() -> dict:
81
- path = "results/eval_all.json"
82
  if os.path.exists(path):
83
  with open(path, "r") as f:
84
  return json.load(f)
85
  return {}
86
 
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  def extract_doc_id(filepath: str) -> str:
89
  if "://" in filepath:
90
  return filepath.split("://", 1)[1]
 
12
 
13
  from evaluation.dataset_loader import DatasetLoader
14
 
15
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
16
+ CONFIG_PATH = os.path.join(BASE_DIR, "config.yaml")
17
+
18
  app = FastAPI(title="Semantic Search Engine")
19
 
20
+ app.mount("/static", StaticFiles(directory=os.path.join(BASE_DIR, "static")), name="static")
21
+ templates = Jinja2Templates(directory=os.path.join(BASE_DIR, "templates"))
22
 
23
  # ── load search engine once at startup ──────────────────────────────────────
24
  ENGINE_ERROR = None
 
30
  try:
31
  from searcher.search_engine import SearchEngine
32
  ENGINE_ERROR = None
33
+ return SearchEngine(CONFIG_PATH)
34
  except Exception as e:
35
  ENGINE_ERROR = str(e)
36
  print(f"[Startup] Search engine unavailable: {e}")
37
  return None
38
 
39
 
40
def resolve_path(path: str) -> str:
    """Return ``path`` anchored at ``BASE_DIR`` unless it is already absolute.

    Args:
        path: A filesystem path, either absolute or relative.

    Returns:
        ``path`` unchanged when it is absolute; otherwise ``BASE_DIR``
        joined with ``path``.
    """
    return path if os.path.isabs(path) else os.path.join(BASE_DIR, path)
44
+
45
+
46
def get_config() -> dict:
    """Load the YAML configuration stored at ``CONFIG_PATH``.

    Returns:
        The parsed configuration. An empty dict is returned when the file
        contains no YAML document, so callers can always use ``.get()``.

    Raises:
        OSError: If the configuration file cannot be opened.
    """
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        # safe_load yields None for an empty document; normalize it so the
        # declared ``dict`` return type actually holds.
        return yaml.safe_load(f) or {}
49
+
50
+
51
  # ── load dataset queries at startup ─────────────────────────────────────────
52
  # These are the actual queries from SciFact and NFCorpus
53
  # We use them to show "which dataset queries matched your search"
 
64
  """
65
  all_queries = {}
66
 
67
+ config = get_config()
68
+ watch_paths = config.get("watch_paths", [])
69
  datasets = {
70
+ "scifact": resolve_path(watch_paths[0]) if len(watch_paths) > 0 else resolve_path("data/scifact"),
71
+ "nfcorpus": resolve_path(watch_paths[1]) if len(watch_paths) > 1 else resolve_path("data/nfcorpus"),
72
  }
73
 
74
  for name, path in datasets.items():
 
88
 
89
 
90
  # load once at startup — available globally
91
+ DATASET_QUERIES = {}
92
+
93
+
94
@app.on_event("startup")
async def startup_event():
    """Run the application's startup steps in a fixed order.

    The steps are: reload the dataset queries, make sure an index exists,
    clear whatever ``get_engine`` has cached, then call ``get_engine`` again.
    """
    # Order matters: the index check runs before the engine is (re)created.
    # NOTE(review): cache_clear suggests get_engine is a functools-cached
    # loader — confirm against its definition.
    startup_steps = (
        refresh_dataset_queries,
        ensure_index_ready,
        get_engine.cache_clear,
        get_engine,
    )
    for step in startup_steps:
        step()
100
 
101
 
102
  # ── helpers ──────────────────────────────────────────────────────────────────
103
 
104
  def load_eval_results() -> dict:
105
+ path = resolve_path("results/eval_all.json")
106
  if os.path.exists(path):
107
  with open(path, "r") as f:
108
  return json.load(f)
109
  return {}
110
 
111
 
112
def refresh_dataset_queries() -> None:
    """Rebind the module-level ``DATASET_QUERIES`` to a fresh load.

    The new value is whatever ``load_dataset_queries()`` returns.
    """
    global DATASET_QUERIES
    fresh_queries = load_dataset_queries()
    DATASET_QUERIES = fresh_queries
115
+
116
+
117
def ensure_index_ready() -> None:
    """Build the FAISS index when none exists on disk yet.

    The index is expected at ``<data_dir>/index.faiss``, where ``data_dir``
    comes from the configuration. If that file already exists, or if none of
    the configured ``watch_paths`` exist, nothing is built. Otherwise the
    indexing pipeline is run and the outcome is reported on stdout.

    Raises:
        KeyError: If the configuration has no ``data_dir`` entry.
    """
    config = get_config()
    data_dir = resolve_path(config["data_dir"])
    faiss_path = os.path.join(data_dir, "index.faiss")

    if os.path.exists(faiss_path):
        print(f"[Startup] Existing FAISS index found at {faiss_path}")
        return

    # A bare ``watch_paths:`` key parses to None in YAML; treat it as empty
    # instead of failing while iterating over None.
    watch_paths = [resolve_path(path) for path in config.get("watch_paths") or []]

    if not any(os.path.exists(path) for path in watch_paths):
        print("[Startup] Skipping indexing because no configured dataset paths are available.")
        return

    print("[Startup] No FAISS index found. Running indexing pipeline...")
    # Imported here, after the early returns, so the indexing module is only
    # loaded when a build is actually required.
    from indexer.pipeline import IndexingPipeline

    pipeline = IndexingPipeline(CONFIG_PATH)
    pipeline.run()

    # The pipeline's return value is not inspected; success is judged solely
    # by whether the index file now exists.
    if os.path.exists(faiss_path):
        print(f"[Startup] Index build complete: {faiss_path}")
    else:
        print(f"[Startup] Index build did not produce {faiss_path}")
143
+
144
+
145
  def extract_doc_id(filepath: str) -> str:
146
  if "://" in filepath:
147
  return filepath.split("://", 1)[1]