Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -4,8 +4,9 @@ import json
|
|
| 4 |
import os
|
| 5 |
import time
|
| 6 |
from functools import lru_cache
|
|
|
|
| 7 |
import yaml
|
| 8 |
-
from fastapi import FastAPI, Request, Form
|
| 9 |
from fastapi.responses import HTMLResponse
|
| 10 |
from fastapi.staticfiles import StaticFiles
|
| 11 |
from fastapi.templating import Jinja2Templates
|
|
@@ -91,6 +92,22 @@ def load_dataset_queries() -> dict:
|
|
| 91 |
DATASET_QUERIES = {}
|
| 92 |
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
@app.on_event("startup")
|
| 95 |
async def startup_event():
|
| 96 |
refresh_dataset_queries()
|
|
@@ -208,6 +225,14 @@ def get_file_icon(filepath: str) -> str:
|
|
| 208 |
return icons.get(ext, "📄")
|
| 209 |
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
def find_matching_dataset_queries(
|
| 212 |
user_query: str,
|
| 213 |
top_results: list,
|
|
@@ -327,6 +352,7 @@ async def search(
|
|
| 327 |
results.append({
|
| 328 |
"doc_id": doc_id,
|
| 329 |
"filepath": filepath,
|
|
|
|
| 330 |
"score": round(float(score), 4),
|
| 331 |
"snippet": snippet,
|
| 332 |
"icon": get_file_icon(filepath),
|
|
@@ -380,6 +406,52 @@ async def dashboard(request: Request):
|
|
| 380 |
})
|
| 381 |
|
| 382 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
@app.get("/health")
|
| 384 |
async def health():
|
| 385 |
engine = get_engine()
|
|
|
|
| 4 |
import os
|
| 5 |
import time
|
| 6 |
from functools import lru_cache
|
| 7 |
+
from urllib.parse import quote
|
| 8 |
import yaml
|
| 9 |
+
from fastapi import FastAPI, Request, Form, HTTPException, Query
|
| 10 |
from fastapi.responses import HTMLResponse
|
| 11 |
from fastapi.staticfiles import StaticFiles
|
| 12 |
from fastapi.templating import Jinja2Templates
|
|
|
|
| 92 |
DATASET_QUERIES = {}
|
| 93 |
|
| 94 |
|
| 95 |
+
@lru_cache(maxsize=8)
def _load_corpus_from_path(dataset_path) -> dict:
    """Load the corpus stored at *dataset_path* and memoize the result."""
    return DatasetLoader(dataset_path).load_corpus()


def load_dataset_corpus(dataset_name: str) -> dict:
    """Return the corpus for a known dataset, or ``{}`` when unavailable.

    Known datasets are ``"scifact"`` and ``"nfcorpus"``. Their locations come
    from the first and second entries of the ``watch_paths`` config list,
    falling back to ``data/scifact`` and ``data/nfcorpus`` respectively.

    Only successful loads are cached, and the cache is keyed by the resolved
    path rather than by the caller-supplied name. Unknown names therefore
    never occupy (or evict) a cache slot, and a dataset directory that is
    missing on the first request is picked up once it appears.

    Args:
        dataset_name: Dataset identifier, typically taken from a request.

    Returns:
        Whatever ``DatasetLoader(...).load_corpus()`` returns for the dataset,
        or an empty dict when the name is unknown or the path does not exist.
        The returned object is shared with the cache; callers should treat it
        as read-only.
    """
    # name -> (index into watch_paths, fallback location)
    known_datasets = {
        "scifact": (0, "data/scifact"),
        "nfcorpus": (1, "data/nfcorpus"),
    }
    slot = known_datasets.get(dataset_name)
    if slot is None:
        # Reject unknown names before touching config, disk, or the cache.
        return {}

    index, fallback = slot
    watch_paths = get_config().get("watch_paths", [])
    configured = watch_paths[index] if len(watch_paths) > index else fallback
    dataset_path = resolve_path(configured)

    # A missing path is deliberately NOT cached so it can be retried later.
    if not dataset_path or not os.path.exists(dataset_path):
        return {}

    return _load_corpus_from_path(dataset_path)


# Keep the lru_cache management hooks the decorated function used to expose.
load_dataset_corpus.cache_clear = _load_corpus_from_path.cache_clear
load_dataset_corpus.cache_info = _load_corpus_from_path.cache_info
|
| 109 |
+
|
| 110 |
+
|
| 111 |
@app.on_event("startup")
|
| 112 |
async def startup_event():
|
| 113 |
refresh_dataset_queries()
|
|
|
|
| 225 |
return icons.get(ext, "📄")
|
| 226 |
|
| 227 |
|
| 228 |
+
def build_open_url(filepath: str) -> str:
    """Build the ``/document`` link used to open a search result.

    Files that ``get_dataset_from_filepath`` attributes to the ``scifact`` or
    ``nfcorpus`` dataset are addressed by dataset name and document id; every
    other file is addressed by its path. All query values are percent-encoded
    with ``urllib.parse.quote``.

    Args:
        filepath: Path of the indexed file as stored in the search result.

    Returns:
        A relative URL pointing at the ``/document`` route.
    """
    source = get_dataset_from_filepath(filepath)

    # Anything outside the two known datasets is opened straight from disk.
    if source not in {"scifact", "nfcorpus"}:
        return f"/document?path={quote(filepath)}"

    document_id = extract_doc_id(filepath)
    return f"/document?dataset={quote(source)}&doc_id={quote(document_id)}"
|
| 234 |
+
|
| 235 |
+
|
| 236 |
def find_matching_dataset_queries(
|
| 237 |
user_query: str,
|
| 238 |
top_results: list,
|
|
|
|
| 352 |
results.append({
|
| 353 |
"doc_id": doc_id,
|
| 354 |
"filepath": filepath,
|
| 355 |
+
"open_url": build_open_url(filepath),
|
| 356 |
"score": round(float(score), 4),
|
| 357 |
"snippet": snippet,
|
| 358 |
"icon": get_file_icon(filepath),
|
|
|
|
| 406 |
})
|
| 407 |
|
| 408 |
|
| 409 |
+
def _is_within_watch_paths(target) -> bool:
    """Return True when *target* resolves to a location under a watch path.

    Both sides are canonicalized with ``os.path.realpath`` so ``..`` segments
    and symlinks cannot be used to step outside the configured roots.
    """
    real_target = os.path.realpath(target)
    for root in get_config().get("watch_paths", []):
        real_root = os.path.realpath(resolve_path(root))
        try:
            if os.path.commonpath([real_root, real_target]) == real_root:
                return True
        except ValueError:
            # commonpath refuses paths that cannot share a prefix
            # (e.g. different drives); such a root simply does not match.
            continue
    return False


@app.get("/document", response_class=HTMLResponse)
async def document(
    request: Request,
    dataset: str | None = Query(default=None),
    doc_id: str | None = Query(default=None),
    path: str | None = Query(default=None),
):
    """Render ``document.html`` for one dataset document or one file on disk.

    Args:
        request: Incoming request, forwarded to the template.
        dataset: Dataset name; used together with ``doc_id``.
        doc_id: Identifier of the document inside the dataset corpus.
        path: Filesystem path of a file to extract and display. Only used
            when ``dataset``/``doc_id`` are not both provided.

    Returns:
        The rendered ``document.html`` template response.

    Raises:
        HTTPException: 404 when the document or file cannot be found, 403 when
            ``path`` points outside the configured ``watch_paths``, and 400
            when neither ``dataset``/``doc_id`` nor ``path`` is supplied.
    """
    if dataset and doc_id:
        corpus = load_dataset_corpus(dataset)
        doc = corpus.get(doc_id)
        if doc is None:
            raise HTTPException(status_code=404, detail="Document not found in dataset corpus.")

        title = doc.get("title") or doc_id
        text = doc.get("text") or "No document text available."
        return templates.TemplateResponse(request, "document.html", {
            "request": request,
            "title": title,
            "doc_id": doc_id,
            "source": dataset,
            "filepath": f"{dataset}://{doc_id}",
            "text": text,
            "is_dataset": True,
        })

    if path:
        from indexer.extractor import Extractor

        resolved = resolve_path(path)

        # ``path`` is attacker-controllable: without this check any file the
        # server process can read (e.g. /etc/passwd, secrets) could be shown.
        # NOTE(review): assumes every file opened by path lives under a
        # configured watch path - confirm against the indexer.
        if not _is_within_watch_paths(resolved):
            raise HTTPException(status_code=403, detail="File path is outside the indexed directories.")

        # isfile (not exists) so a directory is never handed to the extractor.
        if not os.path.isfile(resolved):
            raise HTTPException(status_code=404, detail="File path no longer exists on disk.")

        text = Extractor().extract(resolved) or "No text could be extracted from this file."
        return templates.TemplateResponse(request, "document.html", {
            "request": request,
            "title": os.path.basename(resolved),
            "doc_id": os.path.basename(resolved),
            "source": "filesystem",
            "filepath": resolved,
            "text": text,
            "is_dataset": False,
        })

    raise HTTPException(status_code=400, detail="Provide either dataset/doc_id or path.")
|
| 453 |
+
|
| 454 |
+
|
| 455 |
@app.get("/health")
|
| 456 |
async def health():
|
| 457 |
engine = get_engine()
|