JackSparrow89 committed on
Commit
8eca49d
·
verified ·
1 Parent(s): d885c3f

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +73 -1
main.py CHANGED
@@ -4,8 +4,9 @@ import json
4
  import os
5
  import time
6
  from functools import lru_cache
 
7
  import yaml
8
- from fastapi import FastAPI, Request, Form
9
  from fastapi.responses import HTMLResponse
10
  from fastapi.staticfiles import StaticFiles
11
  from fastapi.templating import Jinja2Templates
@@ -91,6 +92,22 @@ def load_dataset_queries() -> dict:
91
  DATASET_QUERIES = {}
92
 
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  @app.on_event("startup")
95
  async def startup_event():
96
  refresh_dataset_queries()
@@ -208,6 +225,14 @@ def get_file_icon(filepath: str) -> str:
208
  return icons.get(ext, "📄")
209
 
210
 
 
 
 
 
 
 
 
 
211
  def find_matching_dataset_queries(
212
  user_query: str,
213
  top_results: list,
@@ -327,6 +352,7 @@ async def search(
327
  results.append({
328
  "doc_id": doc_id,
329
  "filepath": filepath,
 
330
  "score": round(float(score), 4),
331
  "snippet": snippet,
332
  "icon": get_file_icon(filepath),
@@ -380,6 +406,52 @@ async def dashboard(request: Request):
380
  })
381
 
382
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  @app.get("/health")
384
  async def health():
385
  engine = get_engine()
 
4
  import os
5
  import time
6
  from functools import lru_cache
7
+ from urllib.parse import quote
8
  import yaml
9
+ from fastapi import FastAPI, Request, Form, HTTPException, Query
10
  from fastapi.responses import HTMLResponse
11
  from fastapi.staticfiles import StaticFiles
12
  from fastapi.templating import Jinja2Templates
 
92
  DATASET_QUERIES = {}
93
 
94
 
95
@lru_cache(maxsize=8)
def load_dataset_corpus(dataset_name: str) -> dict:
    """Load (and memoize) the corpus for one of the known benchmark datasets.

    Dataset directories are taken from the ``watch_paths`` config entry by
    position: index 0 is used for ``scifact`` and index 1 for ``nfcorpus``.
    When an index is missing, ``data/scifact`` / ``data/nfcorpus`` is used
    instead. Every candidate goes through ``resolve_path``.

    Args:
        dataset_name: Dataset key, ``"scifact"`` or ``"nfcorpus"``.

    Returns:
        Whatever ``DatasetLoader(path).load_corpus()`` returns for the dataset
        directory, or an empty dict when the name is unknown or the directory
        does not exist. ``lru_cache`` memoizes the result per name, including
        the empty-dict case.
    """
    watch_paths = get_config().get("watch_paths", [])

    # Pick the raw location for each dataset first, then resolve both.
    scifact_source = watch_paths[0] if len(watch_paths) > 0 else "data/scifact"
    nfcorpus_source = watch_paths[1] if len(watch_paths) > 1 else "data/nfcorpus"
    known_locations = {
        "scifact": resolve_path(scifact_source),
        "nfcorpus": resolve_path(nfcorpus_source),
    }

    corpus_dir = known_locations.get(dataset_name)
    if corpus_dir and os.path.exists(corpus_dir):
        return DatasetLoader(corpus_dir).load_corpus()
    return {}
109
+
110
+
111
  @app.on_event("startup")
112
  async def startup_event():
113
  refresh_dataset_queries()
 
225
  return icons.get(ext, "📄")
226
 
227
 
228
def build_open_url(filepath: str) -> str:
    """Build the ``/document`` link used to open a search result.

    Results whose dataset (per ``get_dataset_from_filepath``) is ``scifact``
    or ``nfcorpus`` are addressed by dataset name and document id; every
    other result is addressed by its file path. All query values are
    percent-encoded with ``urllib.parse.quote``.

    Args:
        filepath: File path of the search result.

    Returns:
        A relative URL pointing at the ``/document`` endpoint.
    """
    source = get_dataset_from_filepath(filepath)

    # Anything outside the known corpora is opened straight from its path.
    if source not in {"scifact", "nfcorpus"}:
        return f"/document?path={quote(filepath)}"

    corpus_doc_id = extract_doc_id(filepath)
    return f"/document?dataset={quote(source)}&doc_id={quote(corpus_doc_id)}"
234
+
235
+
236
  def find_matching_dataset_queries(
237
  user_query: str,
238
  top_results: list,
 
352
  results.append({
353
  "doc_id": doc_id,
354
  "filepath": filepath,
355
+ "open_url": build_open_url(filepath),
356
  "score": round(float(score), 4),
357
  "snippet": snippet,
358
  "icon": get_file_icon(filepath),
 
406
  })
407
 
408
 
409
def _is_within_watch_paths(candidate: str) -> bool:
    """Return True when ``candidate`` resolves inside a configured watch path.

    Both sides are passed through ``os.path.realpath`` (and ``normcase``)
    before comparison, so ``..`` segments and symlinks cannot be used to step
    outside a watch path.
    """
    real_candidate = os.path.normcase(os.path.realpath(candidate))
    for watch_path in get_config().get("watch_paths", []):
        real_root = os.path.normcase(os.path.realpath(resolve_path(watch_path)))
        try:
            if os.path.commonpath([real_candidate, real_root]) == real_root:
                return True
        except ValueError:
            # The two paths cannot be compared (e.g. different drives on
            # Windows); treat that as "not inside this watch path".
            continue
    return False


@app.get("/document", response_class=HTMLResponse)
async def document(
    request: Request,
    dataset: str | None = Query(default=None),
    doc_id: str | None = Query(default=None),
    path: str | None = Query(default=None),
):
    """Render a single document, either from a dataset corpus or from disk.

    Args:
        request: Incoming request, forwarded to the template.
        dataset: Dataset name; used together with ``doc_id``.
        doc_id: Document id inside the dataset corpus.
        path: File path to open when no dataset/doc_id pair is given.

    Returns:
        The rendered ``document.html`` template.

    Raises:
        HTTPException: 404 when the dataset document or the file is missing,
            403 when ``path`` points outside the configured watch paths, and
            400 when neither a dataset/doc_id pair nor a path is supplied.
    """
    if dataset and doc_id:
        corpus = load_dataset_corpus(dataset)
        doc = corpus.get(doc_id)
        if doc is None:
            raise HTTPException(status_code=404, detail="Document not found in dataset corpus.")

        title = doc.get("title") or doc_id
        text = doc.get("text") or "No document text available."
        return templates.TemplateResponse(request, "document.html", {
            "request": request,
            "title": title,
            "doc_id": doc_id,
            "source": dataset,
            "filepath": f"{dataset}://{doc_id}",
            "text": text,
            "is_dataset": True,
        })

    if path:
        from indexer.extractor import Extractor

        resolved = resolve_path(path)

        # ``path`` comes straight from the query string. Without this check
        # the endpoint would extract and display any file the server process
        # can read. The containment check runs before the existence check so
        # responses do not reveal whether arbitrary files exist.
        # NOTE(review): assumes every indexed file lives under a configured
        # watch path -- confirm against the indexer.
        if not _is_within_watch_paths(resolved):
            raise HTTPException(status_code=403, detail="Path is outside the configured watch paths.")

        if not os.path.exists(resolved):
            raise HTTPException(status_code=404, detail="File path no longer exists on disk.")

        text = Extractor().extract(resolved) or "No text could be extracted from this file."
        return templates.TemplateResponse(request, "document.html", {
            "request": request,
            "title": os.path.basename(resolved),
            "doc_id": os.path.basename(resolved),
            "source": "filesystem",
            "filepath": resolved,
            "text": text,
            "is_dataset": False,
        })

    raise HTTPException(status_code=400, detail="Provide either dataset/doc_id or path.")
453
+
454
+
455
  @app.get("/health")
456
  async def health():
457
  engine = get_engine()