JackSparrow89 committed on
Commit
4f7687e
·
verified ·
1 Parent(s): bb04c5f

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +246 -244
main.py CHANGED
@@ -1,23 +1,23 @@
1
- # main.py
2
-
3
  import json
4
  import os
5
  import time
6
  from functools import lru_cache
7
  import yaml
8
- from fastapi import FastAPI, Request, Form
9
- from fastapi.responses import HTMLResponse
10
- from fastapi.staticfiles import StaticFiles
11
- from fastapi.templating import Jinja2Templates
12
-
13
  from evaluation.dataset_loader import DatasetLoader
14
-
15
- app = FastAPI(title="Semantic Search Engine")
16
-
17
- app.mount("/static", StaticFiles(directory="static"), name="static")
18
- templates = Jinja2Templates(directory="templates")
19
-
20
- # ── load search engine once at startup ──────────────────────────────────────
21
  ENGINE_ERROR = None
22
 
23
 
@@ -32,163 +32,163 @@ def get_engine():
32
  ENGINE_ERROR = str(e)
33
  print(f"[Startup] Search engine unavailable: {e}")
34
  return None
35
-
36
-
37
- # ── load dataset queries at startup ─────────────────────────────────────────
38
- # These are the actual queries from SciFact and NFCorpus
39
- # We use them to show "which dataset queries matched your search"
40
-
41
- def load_dataset_queries() -> dict:
42
- """
43
- Load all queries from SciFact and NFCorpus at startup.
44
-
45
- Returns:
46
- dict β€” {
47
- "scifact": {query_id: query_text, ...},
48
- "nfcorpus": {query_id: query_text, ...},
49
- }
50
- """
51
- all_queries = {}
52
-
53
- datasets = {
54
- "scifact": "data/scifact",
55
- "nfcorpus": "data/nfcorpus",
56
- }
57
-
58
- for name, path in datasets.items():
59
- if os.path.exists(path):
60
- try:
61
- loader = DatasetLoader(path)
62
- all_queries[name] = loader.load_queries()
63
- print(f"[Startup] Loaded {len(all_queries[name])} queries from {name}")
64
- except Exception as e:
65
- print(f"[Startup] Could not load {name} queries: {e}")
66
- all_queries[name] = {}
67
- else:
68
- print(f"[Startup] Dataset path not found: {path}")
69
- all_queries[name] = {}
70
-
71
- return all_queries
72
-
73
-
74
- # load once at startup — available globally
75
- DATASET_QUERIES = load_dataset_queries()
76
-
77
-
78
- # ── helpers ──────────────────────────────────────────────────────────────────
79
-
80
- def load_eval_results() -> dict:
81
- path = "results/eval_all.json"
82
- if os.path.exists(path):
83
- with open(path, "r") as f:
84
- return json.load(f)
85
- return {}
86
-
87
-
88
- def extract_doc_id(filepath: str) -> str:
89
- if "://" in filepath:
90
- return filepath.split("://", 1)[1]
91
- return filepath
92
-
93
-
94
- def get_dataset_from_filepath(filepath: str) -> str:
95
- if "scifact://" in filepath: return "scifact"
96
- if "nfcorpus://" in filepath: return "nfcorpus"
97
- return "filesystem"
98
-
99
-
100
- def get_file_icon(filepath: str) -> str:
101
- if "scifact://" in filepath: return "🔬"
102
- if "nfcorpus://" in filepath: return "🏥"
103
- ext = filepath.lower().split(".")[-1] if "." in filepath else ""
104
- icons = {
105
- "pdf": "📄", "docx": "📝", "txt": "📃",
106
- "pptx": "📊", "xlsx": "📋", "py": "🐍",
107
- }
108
- return icons.get(ext, "📄")
109
-
110
-
111
- def find_matching_dataset_queries(
112
- user_query: str,
113
- top_results: list,
114
- ) -> list:
115
- """
116
- Find which dataset queries are semantically related to what the user typed.
117
-
118
- Strategy β€” two passes:
119
- 1. Exact / substring match β€” query text contains user words
120
- 2. Doc-based match β€” if a result doc came from dataset X,
121
- show the queries that reference that doc
122
- from the qrels (loaded separately)
123
-
124
- We use simple word overlap here (no extra model call needed).
125
-
126
- Returns:
127
- list of dicts β€” [
128
- {
129
- "query_id": "1234",
130
- "query_text": "Does vitamin D cause cancer?",
131
- "dataset": "scifact",
132
- "match_type": "text" or "doc"
133
- },
134
- ...
135
- ]
136
- """
137
- matched = []
138
- seen_ids = set()
139
-
140
- # words from user query β€” lowercase, skip short words
141
- user_words = set(
142
- w.lower() for w in user_query.split()
143
- if len(w) > 3
144
- )
145
-
146
- # Pass 1 β€” text overlap match
147
- # check every dataset query for word overlap with user query
148
- for dataset_name, queries in DATASET_QUERIES.items():
149
- for qid, qtext in queries.items():
150
- q_words = set(w.lower() for w in qtext.split() if len(w) > 3)
151
- overlap = user_words & q_words
152
-
153
- # need at least 1 word overlap
154
- if overlap and qid not in seen_ids:
155
- matched.append({
156
- "query_id": qid,
157
- "query_text": qtext,
158
- "dataset": dataset_name,
159
- "match_type": "text",
160
- "overlap": len(overlap),
161
- })
162
- seen_ids.add(qid)
163
-
164
- # sort by overlap count β€” most overlapping queries first
165
- matched.sort(key=lambda x: x["overlap"], reverse=True)
166
-
167
- # return top 8 matched queries max
168
- return matched[:8]
169
-
170
-
171
- # ── routes ───────────────────────────────────────────────────────────────────
172
-
173
- @app.get("/", response_class=HTMLResponse)
174
  async def home(request: Request):
175
- return templates.TemplateResponse("index.html", {
176
  "request": request,
177
  "scifact_count": len(DATASET_QUERIES.get("scifact", {})),
178
  "nfcorpus_count": len(DATASET_QUERIES.get("nfcorpus", {})),
179
  "error": ENGINE_ERROR,
180
  })
181
-
182
-
183
- @app.post("/search", response_class=HTMLResponse)
184
- async def search(
185
- request: Request,
186
- query: str = Form(...),
187
- top_k: int = Form(10),
188
- mode: str = Form("full"),
189
- ):
190
  if not query.strip():
191
- return templates.TemplateResponse("index.html", {
192
  "request": request,
193
  "error": "Please enter a search query.",
194
  "scifact_count": len(DATASET_QUERIES.get("scifact", {})),
@@ -197,7 +197,7 @@ async def search(
197
 
198
  engine = get_engine()
199
  if engine is None:
200
- return templates.TemplateResponse("index.html", {
201
  "request": request,
202
  "error": (
203
  "Search is not ready yet. The semantic index is still missing or failed to build. "
@@ -209,76 +209,78 @@ async def search(
209
 
210
  t0 = time.time()
211
  output = engine.search(query.strip(), top_k=top_k)
212
- elapsed = round(time.time() - t0, 3)
213
-
214
- # format search results
215
- results = []
216
- for r in output.get("results", []):
217
- filepath = r.get("filepath", "")
218
- doc_id = extract_doc_id(filepath)
219
- score = r.get("rerank_score", r.get("rrf_score", r.get("dense_score", 0)))
220
- snippet = r.get("chunk_text", r.get("text", "No preview available."))
221
-
222
- if len(snippet) > 200:
223
- snippet = snippet[:200].rsplit(" ", 1)[0] + "..."
224
-
225
- dataset = get_dataset_from_filepath(filepath)
226
-
227
- results.append({
228
- "doc_id": doc_id,
229
- "filepath": filepath,
230
- "score": round(float(score), 4),
231
- "snippet": snippet,
232
- "icon": get_file_icon(filepath),
233
- "dataset": dataset,
234
- })
235
-
236
- # find matching dataset queries
237
- matched_queries = find_matching_dataset_queries(query.strip(), results)
238
-
239
- # group matched queries by dataset for display
240
- matched_scifact = [q for q in matched_queries if q["dataset"] == "scifact"]
241
- matched_nfcorpus = [q for q in matched_queries if q["dataset"] == "nfcorpus"]
242
-
243
- return templates.TemplateResponse("results.html", {
244
- "request": request,
245
- "query": query,
246
- "results": results,
247
- "total": len(results),
248
- "elapsed": elapsed,
249
- "mode": mode,
250
- "top_k": top_k,
251
- "matched_scifact": matched_scifact,
252
- "matched_nfcorpus": matched_nfcorpus,
253
- "total_matched": len(matched_queries),
254
- })
255
-
256
-
257
- @app.get("/dashboard", response_class=HTMLResponse)
258
- async def dashboard(request: Request):
259
- eval_data = load_eval_results()
260
-
261
- datasets = []
262
- for dataset_name, mode_results in eval_data.items():
263
- full = mode_results.get("full", {})
264
- datasets.append({
265
- "name": dataset_name,
266
- "ndcg": full.get("NDCG@10", 0.0),
267
- "mrr": full.get("MRR", 0.0),
268
- "map": full.get("MAP@100", 0.0),
269
- "recall": full.get("Recall@100", 0.0),
270
- "precision": full.get("P@10", 0.0),
271
- "queries": full.get("num_queries", 0),
272
- "modes": mode_results,
273
- })
274
-
275
- return templates.TemplateResponse("dashboard.html", {
276
- "request": request,
277
- "datasets": datasets,
278
- })
279
-
280
-
281
- @app.get("/health")
 
 
282
  async def health():
283
  engine = get_engine()
284
  return {
@@ -286,13 +288,13 @@ async def health():
286
  "engine_ready": engine is not None,
287
  "engine_error": ENGINE_ERROR,
288
  }
289
-
290
-
291
- if __name__ == "__main__":
292
- import uvicorn
293
- uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)
294
-
295
-
296
-
297
-
298
  # uvicorn main:app --reload --host 0.0.0.0 --port 8000
 
1
+ # main.py
2
+
3
  import json
4
  import os
5
  import time
6
  from functools import lru_cache
7
  import yaml
8
+ from fastapi import FastAPI, Request, Form
9
+ from fastapi.responses import HTMLResponse
10
+ from fastapi.staticfiles import StaticFiles
11
+ from fastapi.templating import Jinja2Templates
12
+
13
  from evaluation.dataset_loader import DatasetLoader
14
+
15
+ app = FastAPI(title="Semantic Search Engine")
16
+
17
+ app.mount("/static", StaticFiles(directory="static"), name="static")
18
+ templates = Jinja2Templates(directory="templates")
19
+
20
+ # ── load search engine once at startup ──────────────────────────────────────
21
  ENGINE_ERROR = None
22
 
23
 
 
32
  ENGINE_ERROR = str(e)
33
  print(f"[Startup] Search engine unavailable: {e}")
34
  return None
35
+
36
+
37
+ # ── load dataset queries at startup ─────────────────────────────────────────
38
+ # These are the actual queries from SciFact and NFCorpus
39
+ # We use them to show "which dataset queries matched your search"
40
+
41
+ def load_dataset_queries() -> dict:
42
+ """
43
+ Load all queries from SciFact and NFCorpus at startup.
44
+
45
+ Returns:
46
+ dict β€” {
47
+ "scifact": {query_id: query_text, ...},
48
+ "nfcorpus": {query_id: query_text, ...},
49
+ }
50
+ """
51
+ all_queries = {}
52
+
53
+ datasets = {
54
+ "scifact": "data/scifact",
55
+ "nfcorpus": "data/nfcorpus",
56
+ }
57
+
58
+ for name, path in datasets.items():
59
+ if os.path.exists(path):
60
+ try:
61
+ loader = DatasetLoader(path)
62
+ all_queries[name] = loader.load_queries()
63
+ print(f"[Startup] Loaded {len(all_queries[name])} queries from {name}")
64
+ except Exception as e:
65
+ print(f"[Startup] Could not load {name} queries: {e}")
66
+ all_queries[name] = {}
67
+ else:
68
+ print(f"[Startup] Dataset path not found: {path}")
69
+ all_queries[name] = {}
70
+
71
+ return all_queries
72
+
73
+
74
+ # load once at startup — available globally
75
+ DATASET_QUERIES = load_dataset_queries()
76
+
77
+
78
+ # ── helpers ──────────────────────────────────────────────────────────────────
79
+
80
+ def load_eval_results() -> dict:
81
+ path = "results/eval_all.json"
82
+ if os.path.exists(path):
83
+ with open(path, "r") as f:
84
+ return json.load(f)
85
+ return {}
86
+
87
+
88
+ def extract_doc_id(filepath: str) -> str:
89
+ if "://" in filepath:
90
+ return filepath.split("://", 1)[1]
91
+ return filepath
92
+
93
+
94
+ def get_dataset_from_filepath(filepath: str) -> str:
95
+ if "scifact://" in filepath: return "scifact"
96
+ if "nfcorpus://" in filepath: return "nfcorpus"
97
+ return "filesystem"
98
+
99
+
100
+ def get_file_icon(filepath: str) -> str:
101
+ if "scifact://" in filepath: return "🔬"
102
+ if "nfcorpus://" in filepath: return "🏥"
103
+ ext = filepath.lower().split(".")[-1] if "." in filepath else ""
104
+ icons = {
105
+ "pdf": "📄", "docx": "📝", "txt": "📃",
106
+ "pptx": "📊", "xlsx": "📋", "py": "🐍",
107
+ }
108
+ return icons.get(ext, "📄")
109
+
110
+
111
+ def find_matching_dataset_queries(
112
+ user_query: str,
113
+ top_results: list,
114
+ ) -> list:
115
+ """
116
+ Find which dataset queries are semantically related to what the user typed.
117
+
118
+ Strategy β€” two passes:
119
+ 1. Exact / substring match β€” query text contains user words
120
+ 2. Doc-based match β€” if a result doc came from dataset X,
121
+ show the queries that reference that doc
122
+ from the qrels (loaded separately)
123
+
124
+ We use simple word overlap here (no extra model call needed).
125
+
126
+ Returns:
127
+ list of dicts β€” [
128
+ {
129
+ "query_id": "1234",
130
+ "query_text": "Does vitamin D cause cancer?",
131
+ "dataset": "scifact",
132
+ "match_type": "text" or "doc"
133
+ },
134
+ ...
135
+ ]
136
+ """
137
+ matched = []
138
+ seen_ids = set()
139
+
140
+ # words from user query β€” lowercase, skip short words
141
+ user_words = set(
142
+ w.lower() for w in user_query.split()
143
+ if len(w) > 3
144
+ )
145
+
146
+ # Pass 1 β€” text overlap match
147
+ # check every dataset query for word overlap with user query
148
+ for dataset_name, queries in DATASET_QUERIES.items():
149
+ for qid, qtext in queries.items():
150
+ q_words = set(w.lower() for w in qtext.split() if len(w) > 3)
151
+ overlap = user_words & q_words
152
+
153
+ # need at least 1 word overlap
154
+ if overlap and qid not in seen_ids:
155
+ matched.append({
156
+ "query_id": qid,
157
+ "query_text": qtext,
158
+ "dataset": dataset_name,
159
+ "match_type": "text",
160
+ "overlap": len(overlap),
161
+ })
162
+ seen_ids.add(qid)
163
+
164
+ # sort by overlap count β€” most overlapping queries first
165
+ matched.sort(key=lambda x: x["overlap"], reverse=True)
166
+
167
+ # return top 8 matched queries max
168
+ return matched[:8]
169
+
170
+
171
+ # ── routes ───────────────────────────────────────────────────────────────────
172
+
173
+ @app.get("/", response_class=HTMLResponse)
174
  async def home(request: Request):
175
+ return templates.TemplateResponse(request, "index.html", {
176
  "request": request,
177
  "scifact_count": len(DATASET_QUERIES.get("scifact", {})),
178
  "nfcorpus_count": len(DATASET_QUERIES.get("nfcorpus", {})),
179
  "error": ENGINE_ERROR,
180
  })
181
+
182
+
183
+ @app.post("/search", response_class=HTMLResponse)
184
+ async def search(
185
+ request: Request,
186
+ query: str = Form(...),
187
+ top_k: int = Form(10),
188
+ mode: str = Form("full"),
189
+ ):
190
  if not query.strip():
191
+ return templates.TemplateResponse(request, "index.html", {
192
  "request": request,
193
  "error": "Please enter a search query.",
194
  "scifact_count": len(DATASET_QUERIES.get("scifact", {})),
 
197
 
198
  engine = get_engine()
199
  if engine is None:
200
+ return templates.TemplateResponse(request, "index.html", {
201
  "request": request,
202
  "error": (
203
  "Search is not ready yet. The semantic index is still missing or failed to build. "
 
209
 
210
  t0 = time.time()
211
  output = engine.search(query.strip(), top_k=top_k)
212
+ elapsed = round(time.time() - t0, 3)
213
+
214
+ # format search results
215
+ results = []
216
+ for r in output.get("results", []):
217
+ filepath = r.get("filepath", "")
218
+ doc_id = extract_doc_id(filepath)
219
+ score = r.get("rerank_score", r.get("rrf_score", r.get("dense_score", 0)))
220
+ snippet = r.get("chunk_text", r.get("text", "No preview available."))
221
+
222
+ if len(snippet) > 200:
223
+ snippet = snippet[:200].rsplit(" ", 1)[0] + "..."
224
+
225
+ dataset = get_dataset_from_filepath(filepath)
226
+
227
+ results.append({
228
+ "doc_id": doc_id,
229
+ "filepath": filepath,
230
+ "score": round(float(score), 4),
231
+ "snippet": snippet,
232
+ "icon": get_file_icon(filepath),
233
+ "dataset": dataset,
234
+ })
235
+
236
+ # find matching dataset queries
237
+ matched_queries = find_matching_dataset_queries(query.strip(), results)
238
+
239
+ # group matched queries by dataset for display
240
+ matched_scifact = [q for q in matched_queries if q["dataset"] == "scifact"]
241
+ matched_nfcorpus = [q for q in matched_queries if q["dataset"] == "nfcorpus"]
242
+
243
+ return templates.TemplateResponse(request, "results.html", {
244
+ "request": request,
245
+ "query": query,
246
+ "results": results,
247
+ "total": len(results),
248
+ "elapsed": elapsed,
249
+ "mode": mode,
250
+ "top_k": top_k,
251
+ "matched_scifact": matched_scifact,
252
+ "matched_nfcorpus": matched_nfcorpus,
253
+ "scifact_matches": matched_scifact,
254
+ "nfcorpus_matches": matched_nfcorpus,
255
+ "total_matched": len(matched_queries),
256
+ })
257
+
258
+
259
+ @app.get("/dashboard", response_class=HTMLResponse)
260
+ async def dashboard(request: Request):
261
+ eval_data = load_eval_results()
262
+
263
+ datasets = []
264
+ for dataset_name, mode_results in eval_data.items():
265
+ full = mode_results.get("full", {})
266
+ datasets.append({
267
+ "name": dataset_name,
268
+ "ndcg": full.get("NDCG@10", 0.0),
269
+ "mrr": full.get("MRR", 0.0),
270
+ "map": full.get("MAP@100", 0.0),
271
+ "recall": full.get("Recall@100", 0.0),
272
+ "precision": full.get("P@10", 0.0),
273
+ "queries": full.get("num_queries", 0),
274
+ "modes": mode_results,
275
+ })
276
+
277
+ return templates.TemplateResponse(request, "dashboard.html", {
278
+ "request": request,
279
+ "datasets": datasets,
280
+ })
281
+
282
+
283
+ @app.get("/health")
284
  async def health():
285
  engine = get_engine()
286
  return {
 
288
  "engine_ready": engine is not None,
289
  "engine_error": ENGINE_ERROR,
290
  }
291
+
292
+
293
+ if __name__ == "__main__":
294
+ import uvicorn
295
+ uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)
296
+
297
+
298
+
299
+
300
  # uvicorn main:app --reload --host 0.0.0.0 --port 8000