Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- README.md +1 -10
- eval/evaluate.py +113 -0
- requirements.txt +15 -0
- src/api_gateway/app.py +85 -0
- src/bench/benchmark_cache.py +0 -0
- src/bench/benchmark_search.py +0 -0
- src/doc_service/app.py +38 -0
- src/doc_service/utils.py +53 -0
- src/embed_service/app.py +63 -0
- src/embed_service/cache_manager.py +48 -0
- src/embed_service/embedder.py +19 -0
- src/explain_service/app.py +15 -0
- src/explain_service/explainer.py +114 -0
- src/search_service/app.py +33 -0
- src/search_service/indexer.py +48 -0
- src/ui/streamlit_app.py +407 -0
README.md
CHANGED
|
@@ -1,10 +1 @@
|
|
| 1 |
-
|
| 2 |
-
title: Docfetch
|
| 3 |
-
emoji: π
|
| 4 |
-
colorFrom: red
|
| 5 |
-
colorTo: pink
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
---
|
| 9 |
-
|
| 10 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
+
# Multi-document-Embedding-Search-Engine-with-Caching
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval/evaluate.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import requests
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
API_URL = "http://localhost:8000/search"
|
| 7 |
+
|
| 8 |
+
# =====================================================
|
| 9 |
+
# Utility: MRR
|
| 10 |
+
# =====================================================
|
| 11 |
+
def compute_mrr(all_ranks):
    """Return the mean reciprocal rank (MRR) of ``all_ranks``.

    Args:
        all_ranks: iterable of 1-based rank positions, one per query.
            ``None`` or a non-positive value marks a query whose expected
            document was not retrieved; it contributes a reciprocal rank
            of 0 instead of raising.

    Returns:
        The mean of the reciprocal ranks as a ``float``; ``0.0`` when
        ``all_ranks`` is empty.
    """
    if not all_ranks:
        return 0.0
    reciprocal = [
        1.0 / r if r is not None and r > 0 else 0.0
        for r in all_ranks
    ]
    return float(np.mean(reciprocal))
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# =====================================================
|
| 19 |
+
# Utility: NDCG@K
|
| 20 |
+
# =====================================================
|
| 21 |
+
def compute_ndcg(results, k):
    """Compute NDCG@k for a binary relevance list.

    ``results`` holds one relevance flag per retrieved document, in rank
    order (e.g. ``[1, 0, 0]``). Only the first ``k`` entries are scored and
    only entries equal to ``1`` earn gain. The ideal DCG is that of a single
    relevant document at rank 1.
    """
    gains = [
        1 / np.log2(position + 1)
        for position, flag in enumerate(results[:k], start=1)
        if flag == 1
    ]
    dcg = sum(gains)

    ideal = 1 / np.log2(1 + 1)  # ideal rank = 1
    if ideal == 0:
        return 0
    return dcg / ideal
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# =====================================================
|
| 33 |
+
# MAIN EVALUATION FUNCTION
|
| 34 |
+
# =====================================================
|
| 35 |
+
def run_evaluation(query_file="generated_queries.json", top_k=10):
    """
    Evaluate the search API against a file of labelled queries.

    Args:
        query_file: path to a JSON list of objects with ``query`` and
            ``doc_id`` keys. The expected hit is ``doc_id + ".txt"``.
        top_k: number of results requested per query (default 10).

    Returns:
        A summary dict with ``accuracy`` (percent of evaluated queries whose
        expected file was retrieved), ``mrr``, ``ndcg``, ``total_queries``,
        ``correct_count``, ``incorrect_count`` (everything that was not a
        hit, including failed requests), ``failed_count`` (requests that
        errored or returned a non-200 status) and per-query ``details``.
        All metrics are ``0.0`` when no query could be evaluated.
    """
    with open(query_file, encoding="utf-8") as f:
        queries = json.load(f)

    correct = []
    ranks = []
    ndcg_scores = []
    detailed = []
    failed = 0

    for item in queries:
        query = item["query"]
        expected = item["doc_id"] + ".txt"

        # ----------------------------
        # CALL API
        # ----------------------------
        # A timeout keeps one stuck request from hanging the whole run; a
        # transport error is counted as a failed request and skipped.
        try:
            resp = requests.post(
                API_URL,
                json={"query": query, "top_k": top_k},
                timeout=120,
            )
        except requests.RequestException:
            failed += 1
            continue
        if resp.status_code != 200:
            failed += 1
            continue

        results = resp.json().get("results", [])
        retrieved = [r["filename"] for r in results]

        # relevance array for NDCG
        relevance = [1 if fn == expected else 0 for fn in retrieved]

        # ----------------------------
        # ACCURACY
        # ----------------------------
        hit = expected in retrieved
        correct.append(1 if hit else 0)

        # ----------------------------
        # RANK for MRR
        # ----------------------------
        if hit:
            rank_position = retrieved.index(expected) + 1
            ranks.append(rank_position)
        else:
            rank_position = None

        # ----------------------------
        # NDCG
        # ----------------------------
        ndcg_scores.append(compute_ndcg(relevance, top_k))

        # ----------------------------
        # Save detail
        # ----------------------------
        detailed.append({
            "query": query,
            "expected": expected,
            "retrieved": retrieved,
            "rank": rank_position,
            "is_correct": hit,
        })

    # =====================================================
    # FINAL METRICS
    # =====================================================
    evaluated = len(correct)
    if evaluated:
        accuracy = round(float(np.mean(correct)) * 100, 2)
        # ``ranks`` only holds hits. Rescaling by hits/evaluated makes each
        # miss count as a reciprocal rank of 0, which is the standard MRR.
        mrr = round(compute_mrr(ranks) * len(ranks) / evaluated, 4)
        mean_ndcg = round(float(np.mean(ndcg_scores)), 4)
    else:
        accuracy = mrr = mean_ndcg = 0.0

    summary = {
        "accuracy": accuracy,
        "mrr": mrr,
        "ndcg": mean_ndcg,
        "total_queries": len(queries),
        "correct_count": sum(correct),
        "incorrect_count": len(queries) - sum(correct),
        "failed_count": failed,
        "details": detailed,
    }

    return summary
|
requirements.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn[standard]
|
| 3 |
+
google-genai
|
| 4 |
+
sentence-transformers
|
| 5 |
+
faiss-cpu
|
| 6 |
+
numpy
|
| 7 |
+
scikit-learn
|
| 8 |
+
|
| 9 |
+
requests
|
| 10 |
+
pydantic
|
| 11 |
+
|
| 12 |
+
streamlit
|
| 13 |
+
|
| 14 |
+
# Only really needed in Colab for exposing Streamlit, safe to keep
|
| 15 |
+
pyngrok
|
src/api_gateway/app.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/api_gateway/app.py
|
| 2 |
+
from fastapi import FastAPI
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
import requests
|
| 5 |
+
import time
|
| 6 |
+
app = FastAPI(title="API Gateway")
|
| 7 |
+
|
| 8 |
+
DOC_URL = "http://localhost:9001"
|
| 9 |
+
EMBED_URL = "http://localhost:9002"
|
| 10 |
+
SEARCH_URL = "http://localhost:9003"
|
| 11 |
+
EXPLAIN_URL = "http://localhost:9004"
|
| 12 |
+
DATA_FOLDER = "data/docs"
|
| 13 |
+
|
| 14 |
+
class SearchQuery(BaseModel):
    """Request body for the gateway's ``/search`` endpoint."""

    # Free-text query to embed and search for.
    query: str
    # Number of nearest neighbours to request from the search service.
    top_k: int = 5
|
| 17 |
+
|
| 18 |
+
@app.post("/initialize")
def initialize():
    """Load documents, embed them and build the vector index.

    Calls the document, embedding and search services in sequence. Every
    failure is reported as a dict with ``error`` and ``detail`` keys; on
    success returns the number of documents loaded, the number of
    embeddings produced and the search service's build response.
    """
    # 1) load docs
    try:
        d = requests.post(f"{DOC_URL}/load_docs", json={"folder": DATA_FOLDER}, timeout=20)
    except requests.RequestException as exc:
        return {"error": "doc_load_failed", "detail": str(exc)}
    if d.status_code != 200:
        return {"error": "doc_load_failed", "detail": d.text}
    doc_payload = d.json()
    # The document service reports its own failures as a 200 response whose
    # body carries an "error" key, so the status code alone is not enough.
    if "error" in doc_payload:
        return {"error": "doc_load_failed", "detail": doc_payload["error"]}
    docs = doc_payload.get("documents", [])
    if not docs:
        # Nothing to embed: an empty index cannot be built downstream.
        return {"error": "no_documents", "detail": f"no documents found in {DATA_FOLDER}"}

    # 2) prepare docs for embed_batch: ensure keys filename,text,hash
    batch_docs = [
        {
            "filename": x["filename"],
            "text": x.get("clean_text", x.get("text", "")),
            "hash": x["hash"],
        }
        for x in docs
    ]

    # 3) embed batch
    try:
        e = requests.post(f"{EMBED_URL}/embed_batch", json={"docs": batch_docs}, timeout=60)
    except requests.RequestException as exc:
        return {"error": "embed_failed", "detail": str(exc)}
    if e.status_code != 200:
        return {"error": "embed_failed", "detail": e.text}
    embed_out = e.json()
    # Row i of ``embeddings`` belongs to ``meta[i]``: both are derived from
    # the same ordered result list.
    embeddings = [r["embedding"] for r in embed_out["results"]]
    meta = {i: r["filename"] for i, r in enumerate(embed_out["results"])}

    # 4) build index
    try:
        b = requests.post(
            f"{SEARCH_URL}/build_index",
            json={"embeddings": embeddings, "meta": meta},
            timeout=60,
        )
    except requests.RequestException as exc:
        return {"error": "build_index_failed", "detail": str(exc)}
    if b.status_code != 200:
        return {"error": "build_index_failed", "detail": b.text}

    return {"docs_loaded": len(docs), "embeddings": len(embeddings), "build": b.json()}
|
| 43 |
+
|
| 44 |
+
@app.post("/search")
def search(req: SearchQuery):
    """Embed the query, search the index and explain each hit.

    Returns ``{"results": [...]}`` where each entry has ``filename``,
    ``score`` (the value reported by the search service), ``preview``,
    ``full_text`` and ``explanation``. Failures of the embedding or search
    step are returned as a dict with ``error`` and ``detail`` keys. Hits
    whose document cannot be fetched are skipped, and a failed explanation
    yields an empty ``explanation`` rather than failing the whole search.
    """
    # Local imports: these stdlib names are not imported at module level.
    import uuid
    from urllib.parse import quote

    # embed query
    # A random id avoids the collisions a timestamp can produce when two
    # requests arrive together; a collision would make the embed service
    # return the other query's cached vector.
    # NOTE(review): /embed_document stores every embedding it computes, so
    # each query still adds a row to the embed cache.
    unique_id = uuid.uuid4().hex
    try:
        q = requests.post(
            f"{EMBED_URL}/embed_document",
            json={"filename": f"query_{unique_id}", "text": req.query, "hash": unique_id},
            timeout=10,
        )
    except requests.RequestException as exc:
        return {"error": "embed_query_failed", "detail": str(exc)}
    if q.status_code != 200:
        return {"error": "embed_query_failed", "detail": q.text}
    q_emb = q.json()["embedding"]

    # search vectors
    try:
        s = requests.post(
            f"{SEARCH_URL}/search_vectors",
            json={"query_embedding": q_emb, "top_k": req.top_k},
            timeout=10,
        )
    except requests.RequestException as exc:
        return {"error": "search_failed", "detail": str(exc)}
    if s.status_code != 200:
        return {"error": "search_failed", "detail": s.text}
    sdata = s.json()
    if "error" in sdata:
        return {"error": "search_index_error", "detail": sdata}

    scores = sdata["scores"]
    ids = sdata["ids"]
    meta = sdata["meta"]  # { "0": filename, ... }

    # for each id load doc from doc service and call explain
    results = []
    for score, idx in zip(scores, ids):
        filename = meta.get(str(idx))
        if filename is None:
            continue
        try:
            # Quote the name so spaces, '#', '?' etc. survive the URL path.
            doc_resp = requests.get(f"{DOC_URL}/get_doc/{quote(filename, safe='')}", timeout=10)
        except requests.RequestException:
            continue
        if doc_resp.status_code != 200:
            continue
        doc = doc_resp.json()  # has clean_text, original_text, ...
        if "error" in doc:
            # The doc service answers 200 with an error body for unknown files.
            continue
        # explain (best effort: a slow or failing explainer must not fail the search)
        try:
            exp = requests.post(
                f"{EXPLAIN_URL}/explain",
                json={"query": req.query, "document_text": doc.get("clean_text", "")},
                timeout=10,
            )
            explanation = exp.json() if exp.status_code == 200 else {}
        except (requests.RequestException, ValueError):
            # ValueError covers a 200 response whose body is not valid JSON.
            explanation = {}
        results.append({
            "filename": filename,
            "score": float(score),
            "preview": doc.get("clean_text", "")[:350],
            "full_text": doc.get("original_text", ""),
            "explanation": explanation,
        })
    return {"results": results}
|
src/bench/benchmark_cache.py
ADDED
|
File without changes
|
src/bench/benchmark_search.py
ADDED
|
File without changes
|
src/doc_service/app.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/doc_service/app.py
|
| 2 |
+
from fastapi import FastAPI
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from .utils import preprocess_documents, load_original_text
|
| 5 |
+
|
| 6 |
+
app = FastAPI(title="Document Service")
|
| 7 |
+
|
| 8 |
+
class FolderRequest(BaseModel):
    """Request body for ``/load_docs``."""

    # Directory path handed to ``preprocess_documents``.
    folder: str
|
| 10 |
+
|
| 11 |
+
# In-memory stores (simple)
|
| 12 |
+
_DOCUMENTS = {} # filename -> dict with clean_text, hash, length, original_text
|
| 13 |
+
|
| 14 |
+
@app.post("/load_docs")
def load_docs(req: FolderRequest):
    """Preprocess every document in ``req.folder`` and store it in memory.

    Returns ``{"count": n, "documents": [...]}`` describing exactly the
    documents loaded by this request, so ``count`` always matches the list
    and files removed from disk are not handed back for indexing. Entries
    from earlier loads stay available through ``/get_doc`` and
    ``/all_docs``. On failure returns ``{"error": <message>}``.

    NOTE(review): ``folder`` comes straight from the request body, so any
    directory readable by this process can be loaded. Confirm the service
    is only reachable by trusted callers.
    """
    try:
        docs = preprocess_documents(req.folder)
    except Exception as e:
        # Service boundary: report the failure in the body, as callers expect.
        return {"error": str(e)}

    loaded = [
        {
            "filename": d["filename"],
            "clean_text": d["clean_text"],
            "hash": d["hash"],
            "length": d["length"],
            "original_text": d["original_text"],
        }
        for d in docs
    ]
    for record in loaded:
        _DOCUMENTS[record["filename"]] = record
    return {"count": len(loaded), "documents": loaded}
|
| 29 |
+
|
| 30 |
+
@app.get("/get_doc/{filename}")
def get_doc(filename: str):
    """Return the stored record for ``filename``, or a ``not_found`` error dict."""
    try:
        return _DOCUMENTS[filename]
    except KeyError:
        return {"error": "not_found", "message": f"{filename} not found"}
|
| 35 |
+
|
| 36 |
+
@app.get("/all_docs")
def all_docs():
    """Return every stored document record together with the total count."""
    stored = list(_DOCUMENTS.values())
    return {"count": len(stored), "documents": stored}
|
src/doc_service/utils.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/doc_service/utils.py
|
| 2 |
+
import os
|
| 3 |
+
import hashlib
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
def load_text_files(folder_path: str):
    """Read every ``.txt`` file in ``folder_path``, in sorted filename order.

    Returns a list of dicts with ``filename``, ``path`` and ``text`` keys.
    Files are decoded as UTF-8 and undecodable bytes are dropped.
    """
    txt_names = [
        name for name in sorted(os.listdir(folder_path))
        if name.endswith(".txt")
    ]

    collected = []
    for name in txt_names:
        location = os.path.join(folder_path, name)
        with open(location, "r", encoding="utf-8", errors="ignore") as handle:
            contents = handle.read()
        collected.append({"filename": name, "path": location, "text": contents})
    return collected
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def load_original_text(folder_path: str, filename: str):
    """Return the contents of ``filename`` inside ``folder_path``.

    The file is decoded as UTF-8; undecodable bytes are dropped.
    """
    target = os.path.join(folder_path, filename)
    with open(target, "r", encoding="utf-8", errors="ignore") as source:
        contents = source.read()
    return contents
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def clean_text(text: str) -> str:
    """Normalise ``text`` for embedding.

    Lower-cases the input, removes ``<...>`` tag-like spans, collapses each
    run of whitespace into one space and trims both ends.
    """
    lowered = text.lower()
    without_tags = re.sub(r'<.*?>', '', lowered)
    collapsed = re.sub(r'\s+', ' ', without_tags)
    return collapsed.strip()
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def compute_hash(text: str) -> str:
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    encoded = text.encode("utf-8")
    digest = hashlib.md5(encoded)
    return digest.hexdigest()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def preprocess_documents(folder_path: str):
    """Load and normalise every text file in ``folder_path``.

    Returns one dict per file with ``filename``, ``clean_text`` (the
    normalised text), ``hash`` (digest of the normalised text), ``length``
    (whitespace-separated token count of the normalised text) and
    ``original_text`` (the file contents as read).
    """
    def _to_record(raw):
        normalized = clean_text(raw["text"])
        return {
            "filename": raw["filename"],
            "clean_text": normalized,
            "hash": compute_hash(normalized),
            "length": len(normalized.split()),
            "original_text": raw["text"],
        }

    return [_to_record(raw) for raw in load_text_files(folder_path)]
|
src/embed_service/app.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/embed_service/app.py
|
| 2 |
+
from fastapi import FastAPI
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from .embedder import Embedder
|
| 5 |
+
from .cache_manager import CacheManager
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
app = FastAPI(title="Embed Service")
|
| 9 |
+
|
| 10 |
+
embedder = Embedder()
|
| 11 |
+
cache = CacheManager()
|
| 12 |
+
|
| 13 |
+
class EmbedRequest(BaseModel):
    """Request body for ``/embed_document``."""

    # Cache key for the embedding.
    filename: str
    # Text to embed on a cache miss.
    text: str
    # Compared with the cached hash to decide whether the entry is current.
    hash: str
|
| 17 |
+
|
| 18 |
+
@app.post("/embed_document")
def embed_document(req: EmbedRequest):
    """Return the embedding for one document, computing and caching it on a miss.

    The response carries ``filename``, ``cached`` (whether the vector came
    from the cache) and ``embedding`` as a plain list.
    """
    hit = cache.exists(req.filename, req.hash)
    if hit:
        vector = cache.get_embedding(req.filename)
    else:
        vector = embedder.embed_text(req.text)
        cache.add_embedding(req.filename, req.hash, vector)
    return {"filename": req.filename, "cached": hit, "embedding": vector.tolist()}
|
| 26 |
+
|
| 27 |
+
class BatchEmbedRequest(BaseModel):
    """Request body for ``/embed_batch``."""

    # Dicts read via "filename", "hash" and "text" (or "clean_text").
    docs: list
|
| 29 |
+
|
| 30 |
+
@app.post("/embed_batch")
def embed_batch(req: BatchEmbedRequest):
    """Embed a batch of documents, reusing cached vectors where possible.

    Cached documents are appended to the result list first, in input
    order. Uncached ones are embedded in a single model call, cached, and
    appended afterwards. Returns ``{"count": n, "results": [...]}``.
    """
    results = []
    pending = []  # (filename, hash, text) for documents missing from the cache

    for doc in req.docs:
        name = doc.get("filename")
        digest = doc.get("hash")
        body = doc.get("text") or doc.get("clean_text") or ""
        if not cache.exists(name, digest):
            pending.append((name, digest, body))
            continue
        results.append({
            "filename": name,
            "cached": True,
            "embedding": cache.get_embedding(name).tolist(),
        })

    if pending:
        vectors = embedder.embed_batch([body for _, _, body in pending])
        for (name, digest, _), vector in zip(pending, vectors):
            cache.add_embedding(name, digest, vector)
            results.append({"filename": name, "cached": False, "embedding": vector.tolist()})

    return {"count": len(results), "results": results}
|
| 52 |
+
|
| 53 |
+
@app.get("/all_embeddings")
def get_all_embeddings():
    """Return the cache metadata and the full embedding matrix as nested lists."""
    meta, matrix = cache.all_embeddings()
    payload = {"meta": meta, "embeddings": matrix.tolist()}
    return payload
|
| 57 |
+
|
| 58 |
+
# convenience endpoint called earlier by older code
|
| 59 |
+
@app.post("/embed_all")
def embed_all_docs(docs: list):
    """Embed a list of ``{filename, clean_text, hash}`` dicts via ``embed_batch``.

    ``clean_text`` is preferred over ``text`` when both are present.
    """
    normalized = []
    for d in docs:
        normalized.append({
            "filename": d["filename"],
            "text": d.get("clean_text") or d.get("text", ""),
            "hash": d["hash"],
        })
    return embed_batch(BatchEmbedRequest(docs=normalized))
|
src/embed_service/cache_manager.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/embed_service/cache_manager.py
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
CACHE_DIR = "cache"
META_PATH = f"{CACHE_DIR}/embed_meta.json"
EMB_PATH = f"{CACHE_DIR}/embeddings.npy"


class CacheManager:
    """Disk-backed cache of document embeddings.

    ``meta`` maps ``filename -> {"hash": str, "index": int}`` and
    ``embeddings`` is a 2-D float32 array whose row ``index`` holds that
    file's vector. Both are persisted under ``CACHE_DIR`` after every
    change.
    """

    def __init__(self):
        """Create the cache directory and load any persisted state."""
        os.makedirs(CACHE_DIR, exist_ok=True)
        if os.path.exists(META_PATH):
            with open(META_PATH, "r") as f:
                self.meta = json.load(f)
        else:
            self.meta = {}  # filename -> {"hash":..., "index": int}

        if os.path.exists(EMB_PATH):
            self.embeddings = np.load(EMB_PATH)
        else:
            # Empty placeholder; replaced by the first embedding that arrives.
            self.embeddings = np.zeros((0, 384), dtype="float32")

        # The two files are written separately, so they can disagree (for
        # example when one was deleted). Drop entries that point past the
        # end of the matrix so they are re-embedded instead of raising
        # IndexError on lookup.
        rows = self.embeddings.shape[0]
        self.meta = {
            name: entry
            for name, entry in self.meta.items()
            if 0 <= int(entry["index"]) < rows
        }

    def save(self):
        """Write the metadata and the embedding matrix to disk."""
        with open(META_PATH, "w") as f:
            json.dump(self.meta, f, indent=2)
        np.save(EMB_PATH, self.embeddings)

    def exists(self, filename: str, file_hash: str) -> bool:
        """Return True when ``filename`` is cached with exactly ``file_hash``."""
        return filename in self.meta and self.meta[filename]["hash"] == file_hash

    def get_embedding(self, filename: str):
        """Return the cached vector for ``filename`` (KeyError if unknown)."""
        idx = int(self.meta[filename]["index"])
        return self.embeddings[idx]

    def add_embedding(self, filename: str, file_hash: str, embedding):
        """Store ``embedding`` for ``filename`` and persist the cache.

        When ``filename`` already has a row of the same width (its content
        changed and it was re-embedded), that row is overwritten in place.
        Otherwise the vector is appended as a new row.
        """
        row = embedding.astype("float32").reshape(1, -1)
        previous = self.meta.get(filename)
        reusable = (
            previous is not None
            and 0 <= int(previous["index"]) < self.embeddings.shape[0]
            and self.embeddings.shape[1] == row.shape[1]
        )
        if reusable:
            # Overwriting keeps the matrix from growing by one orphaned row
            # every time a file's hash changes.
            idx = int(previous["index"])
            self.embeddings[idx] = row[0]
        else:
            idx = len(self.embeddings)
            if self.embeddings.shape[0] == 0:
                self.embeddings = row
            else:
                self.embeddings = np.vstack([self.embeddings, row])
        self.meta[filename] = {"hash": file_hash, "index": idx}
        self.save()

    def all_embeddings(self):
        """Return ``(meta, embeddings)`` — the live objects, not copies."""
        return self.meta, self.embeddings
|
src/embed_service/embedder.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/embed_service/embedder.py
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
class Embedder:
    """Wrapper around a SentenceTransformer model that returns float32 arrays."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the sentence-transformer model called ``model_name``."""
        print(f"Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)

    def _encode_float32(self, payload):
        """Encode ``payload`` (one string or a list of strings) as float32."""
        encoded = self.model.encode(payload, convert_to_numpy=True)
        return encoded.astype("float32")

    def embed_text(self, text: str):
        """Return the embedding of a single text."""
        return self._encode_float32(text)

    def embed_batch(self, texts: list):
        """Return the embeddings of ``texts``, one per input."""
        return self._encode_float32(texts)

    def dim(self):
        """Return the model's sentence-embedding dimension."""
        size = self.model.get_sentence_embedding_dimension()
        return size
|
src/explain_service/app.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/explain_service/app.py
|
| 2 |
+
from fastapi import FastAPI
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from .explainer import Explainer
|
| 5 |
+
|
| 6 |
+
app = FastAPI(title="Explain Service")
|
| 7 |
+
explainer = Explainer()
|
| 8 |
+
|
| 9 |
+
class ExplainRequest(BaseModel):
    """Request body for ``/explain``."""

    # The user's search query.
    query: str
    # Text of the document whose match should be explained.
    document_text: str
|
| 12 |
+
|
| 13 |
+
@app.post("/explain")
def explain_doc(req: ExplainRequest):
    """Return the explainer's output for the given query/document pair."""
    explanation = explainer.explain(req.query, req.document_text)
    return explanation
|
src/explain_service/explainer.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/explain_service/explainer.py
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
from google import genai
|
| 7 |
+
|
| 8 |
+
STOPWORDS = set("""
a an the and or but if while with without for on in into by to from of is are was were be been being as it this that these those
""".split())


class Explainer:
    """Explain why a document matches a query.

    Combines keyword overlap, the sentences most similar to the query and
    an optional LLM-written summary.
    """

    # Returned by ``llm_explain`` when no API key is configured.
    _NO_KEY_MESSAGE = "LLM explanation unavailable: GEMINI_API_KEY is not set."

    def __init__(self):
        """Load the sentence model and create the LLM client if a key is set."""
        import os  # local import: this module's header does not import os

        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        # LLM Client
        # SECURITY: the key is read from the environment and must never be
        # committed to source control. The key that used to be hard-coded
        # here is public and should be revoked.
        api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
        self.client = genai.Client(api_key=api_key) if api_key else None

    # ---------------------------
    # TOKENIZER
    # ---------------------------
    def tokenize(self, text: str):
        """Return the lower-cased ASCII-letter tokens of ``text`` minus stopwords."""
        text = text.lower()
        tokens = re.findall(r"[a-zA-Z]+", text)
        return [t for t in tokens if t not in STOPWORDS]

    # ---------------------------
    # KEYWORD OVERLAP
    # ---------------------------
    def keyword_overlap(self, query: str, doc: str):
        """Return ``(shared_tokens, ratio)`` for the query and the document.

        ``shared_tokens`` is sorted so the output is deterministic.
        ``ratio`` is the share of distinct query tokens found in the
        document; it is ``0.0`` when the query has no tokens.
        """
        q_tokens = set(self.tokenize(query))
        d_tokens = set(self.tokenize(doc))

        overlap = q_tokens & d_tokens
        # The epsilon avoids division by zero for a query with no tokens.
        overlap_ratio = len(overlap) / (len(q_tokens) + 1e-8)

        return sorted(overlap), float(overlap_ratio)

    # ---------------------------
    # BEST SENTENCES MATCHING QUERY
    # ---------------------------
    def best_sentences(self, query: str, doc: str, top_k=2):
        """Return up to ``top_k`` sentences of ``doc`` most similar to ``query``.

        Sentences are split on ``.``, ``!`` and ``?``. Each result is a dict
        with ``sentence`` and ``score`` (cosine similarity of the embeddings),
        best first. Returns ``[]`` when the document has no sentences.
        """
        sentences = [s.strip() for s in re.split(r"[.!?]", doc)]
        sentences = [s for s in sentences if s]

        if not sentences:
            return []

        q_emb = self.model.encode(query, convert_to_numpy=True)
        s_embs = self.model.encode(sentences, convert_to_numpy=True)

        # Normalise both sides so the dot product is the cosine similarity.
        q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-10)
        s_norm = s_embs / (np.linalg.norm(s_embs, axis=1, keepdims=True) + 1e-10)

        sims = (s_norm @ q_emb).tolist()
        top_ids = np.argsort(sims)[::-1][:top_k]

        return [
            {"sentence": sentences[int(idx)], "score": float(sims[int(idx)])}
            for idx in top_ids
        ]

    # ---------------------------
    # LLM-LEVEL EXPLANATION
    # ---------------------------
    def llm_explain(self, query, doc_text, top_sentences):
        """Ask the LLM for a short natural-language relevance explanation.

        Only the first 500 characters of ``doc_text`` are sent. Returns a
        fixed notice when no API key is configured and an empty string when
        the model returns no text. Errors raised by the API call propagate
        to the caller.
        """
        if self.client is None:
            return self._NO_KEY_MESSAGE

        formatted_sentences = "\n".join(
            f"- {s['sentence']} (score: {s['score']:.2f})" for s in top_sentences
        )

        prompt = (
            "\n"
            "You are an AI assistant that explains WHY a document matches a user query.\n"
            "\n"
            "QUERY:\n"
            f"{query}\n"
            "\n"
            "DOCUMENT EXCERPT:\n"
            f"{doc_text[:500]}\n"
            "\n"
            "MOST RELEVANT SENTENCES:\n"
            f"{formatted_sentences}\n"
            "\n"
            "Write 2-3 natural sentences explaining WHY this document is relevant.\n"
        )

        response = self.client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
            config={"temperature": 0.4},
        )

        # ``text`` can be None when the model returns no text part.
        return (response.text or "").strip()

    # ---------------------------
    # MAIN EXPLAIN FUNCTION
    # ---------------------------
    def explain(self, query: str, doc_text: str):
        """Return keyword overlap, top sentences and the LLM summary as a dict."""
        keywords, overlap_ratio = self.keyword_overlap(query, doc_text)
        top_sents = self.best_sentences(query, doc_text)
        llm_summary = self.llm_explain(query, doc_text, top_sents)

        return {
            "keyword_overlap": keywords,
            "overlap_ratio": overlap_ratio,
            "top_sentences": top_sents,
            "llm_explanation": llm_summary,
        }
|
src/search_service/app.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/search_service/app.py
|
| 2 |
+
from fastapi import FastAPI
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from .indexer import FAISSIndexer
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
app = FastAPI(title="Search Service")
|
| 8 |
+
|
| 9 |
+
indexer = FAISSIndexer()
|
| 10 |
+
# attempt load if exists
|
| 11 |
+
indexer.try_load()
|
| 12 |
+
|
| 13 |
+
class BuildIndexRequest(BaseModel):
    """Request body for ``/build_index``."""

    # One embedding (list of numbers) per document.
    embeddings: list
    # Mapping of row position -> filename.
    meta: dict
|
| 16 |
+
|
| 17 |
+
@app.post("/build_index")
def build_index(req: BuildIndexRequest):
    """Build the index from the posted embeddings and report how many were indexed."""
    matrix = np.array(req.embeddings, dtype="float32")
    indexer.build(matrix, req.meta)
    n_vectors = matrix.shape[0]
    return {"status": "index_built", "count": n_vectors}
|
| 22 |
+
|
| 23 |
+
class SearchRequest(BaseModel):
    """Request body for ``/search_vectors``."""

    # Query vector as a list of numbers.
    query_embedding: list
    # Number of neighbours to return.
    top_k: int = 5
|
| 26 |
+
|
| 27 |
+
@app.post("/search_vectors")
def search_vectors(req: SearchRequest):
    """Search the index with the posted query vector.

    Returns ``scores``, ``ids`` and the index's ``meta`` mapping, or
    ``{"error": "index_not_built"}`` when no index is loaded.
    """
    if indexer.index is not None:
        vector = np.array(req.query_embedding, dtype="float32")
        scores, ids = indexer.search(vector, req.top_k)
        return {"scores": scores, "ids": ids, "meta": indexer.meta}
    return {"error": "index_not_built"}
|
src/search_service/indexer.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/search_service/indexer.py
|
| 2 |
+
import numpy as np
|
| 3 |
+
import faiss
|
| 4 |
+
import os
|
| 5 |
+
import pickle
|
| 6 |
+
|
| 7 |
+
class FAISSIndexer:
    """Flat L2 FAISS index over L2-normalised embeddings, persisted to disk.

    ``meta`` maps ``str(row_index) -> filename``. Stored vectors and
    queries are both normalised, so the values returned by ``search`` are
    FAISS ``IndexFlatL2`` distances between unit vectors: smaller means
    more similar.
    """

    def __init__(self):
        self.index = None
        self.meta = None
        self.index_path = "faiss_index.bin"
        self.meta_path = "faiss_meta.pkl"

    def try_load(self):
        """Load a previously saved index and metadata if both files exist.

        Returns ``(None, None)`` when either file is missing, otherwise
        ``(meta, None)`` after setting ``self.index`` and ``self.meta``.
        """
        if not os.path.exists(self.meta_path) or not os.path.exists(self.index_path):
            return None, None
        # SECURITY: pickle.load can execute arbitrary code. This is only
        # acceptable because the file is written by build() itself; never
        # point meta_path at a file from an untrusted source.
        with open(self.meta_path, "rb") as f:
            meta = pickle.load(f)
        index = faiss.read_index(self.index_path)
        self.index = index
        self.meta = meta
        return meta, None

    def build(self, embeddings, meta):
        """Build, persist and activate an index over ``embeddings``.

        Args:
            embeddings: array-like of shape (N, dim).
            meta: mapping of row index -> filename; keys are stored as str.

        Raises:
            ValueError: if ``embeddings`` is not two-dimensional (for
                example when an empty list was posted).
        """
        # FAISS requires C-contiguous float32 input.
        embeddings = np.ascontiguousarray(embeddings, dtype="float32")
        if embeddings.ndim != 2:
            raise ValueError(
                f"embeddings must have shape (N, dim), got {embeddings.shape}"
            )
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-10
        embeddings = np.ascontiguousarray(embeddings / norms, dtype="float32")
        dim = embeddings.shape[1]
        index = faiss.IndexFlatL2(dim)
        index.add(embeddings)
        faiss.write_index(index, self.index_path)
        # normalize meta keys to str(index)->filename
        meta_map = {str(k): v for k, v in meta.items()}
        with open(self.meta_path, "wb") as f:
            pickle.dump(meta_map, f)
        self.index = index
        self.meta = meta_map

    def search(self, query_emb, top_k):
        """Return ``(distances, ids)`` of the nearest stored vectors.

        ``top_k`` is clamped to the number of indexed vectors, so the
        result never contains FAISS's ``-1`` padding ids. An empty index or
        a non-positive ``top_k`` yields two empty lists.

        Raises:
            ValueError: if no index has been built or loaded.
        """
        if self.index is None:
            raise ValueError("FAISS index is not loaded!")
        total = self.index.ntotal
        if total == 0 or top_k <= 0:
            return [], []
        k = min(int(top_k), total)
        q = np.asarray(query_emb, dtype="float32")
        q = q / (np.linalg.norm(q) + 1e-10)
        q = np.ascontiguousarray(q.reshape(1, -1), dtype="float32")
        distances, ids = self.index.search(q, k)
        # distances shape (1, k), ids shape (1, k)
        return distances[0].tolist(), ids[0].tolist()
|
src/ui/streamlit_app.py
ADDED
|
@@ -0,0 +1,407 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import requests
|
| 3 |
+
import json
|
| 4 |
+
import html
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
import importlib
|
| 8 |
+
|
| 9 |
+
# ------------------------------------------
# Add project root + eval folder to path
# ------------------------------------------
# Resolve __file__ to an absolute path first: with a relative __file__ the
# dirname() chain below would collapse to "" and the wrong directories would
# be added to sys.path.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
SRC_DIR = os.path.dirname(CURRENT_DIR)
PROJECT_ROOT = os.path.dirname(SRC_DIR)

# Streamlit re-executes this script on every interaction, so add each entry
# only once instead of growing sys.path on every rerun.
for _extra_path in (PROJECT_ROOT, os.path.join(PROJECT_ROOT, "eval")):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

# ------------------------------------------
# FORCE RELOAD evaluate module
# ------------------------------------------
# Reloading on every run presumably lets edits to eval/evaluate.py take effect
# without restarting the Streamlit server -- TODO confirm this is still wanted.
import eval.evaluate as eval_module
importlib.reload(eval_module)
from eval.evaluate import run_evaluation
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# Default base URL of the API gateway; the sidebar "API Endpoint" field below
# is pre-filled with it and its value is assigned back to this name.
API_GATEWAY_URL = "http://localhost:8000"

st.set_page_config(
    page_title="Gemini Search",
    page_icon="✨",  # restored from a mis-encoded "β¨" sequence
    layout="wide",
    initial_sidebar_state="expanded",  # keep the settings sidebar open on load
)
|
| 35 |
+
|
| 36 |
+
# =======================
# GEMINI UI STYLING
# =======================
# One global stylesheet injected as raw HTML. It restyles Streamlit's text
# input and buttons and defines the custom classes used by the result cards
# further down (result-card, card-title, card-preview, score-badge,
# keyword-pill, gradient-text).
st.markdown("""
<style>
/* Global Font & Background */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');

html, body, [class*="css"] {
    font-family: 'Inter', sans-serif;
    background-color: #ffffff; /* White Background */
    color: #1f1f1f; /* Dark text for contrast */
}

/* --- INPUT FIELD FIX --- */
/* 1. Remove the default Streamlit border/background on the container */
.stTextInput > div[data-baseweb="input"] {
    background-color: transparent !important;
    border: none !important;
    border-radius: 24px !important;
    box-shadow: none !important;
}

/* 2. Style the actual input element */
.stTextInput input {
    border-radius: 24px !important;
    background-color: #f0f4f9 !important; /* Light ash input */
    border: 1px solid transparent !important;
    color: #1f1f1f !important;
    padding: 12px 20px !important;
    font-size: 16px !important;
    transition: all 0.2s ease;
}

/* 3. Focus state - clean blue border, no default red overlay */
.stTextInput input:focus {
    background-color: #ffffff !important;
    border-color: #0b57d0 !important; /* Gemini Blue */
    box-shadow: 0 0 0 2px rgba(11, 87, 208, 0.2) !important;
    outline: none !important;
}

/* Button Styling */
.stButton > button {
    border-radius: 20px;
    font-weight: 500;
    border: none;
    padding: 0.5rem 1.5rem;
    transition: all 0.3s ease;
    white-space: nowrap; /* Forces text to stay on one line */
    min-width: 140px; /* Ensures button is never too skinny */
}

/* Primary Search Button */
button[kind="primary"] {
    background: linear-gradient(90deg, #4b90ff, #ff5546);
    color: white;
}
button[kind="primary"]:hover {
    opacity: 0.9;
    box-shadow: 0 4px 12px rgba(75, 144, 255, 0.3);
}

/* Result Card - Light Ash Background */
.result-card {
    background-color: #f0f4f9; /* Light Ash */
    border-radius: 16px;
    padding: 1.5rem;
    margin-bottom: 1rem;
    border: none; /* Removed border for cleaner look on light mode */
    /* Animate the property the hover rule actually changes */
    transition: box-shadow 0.2s;
}
.result-card:hover {
    box-shadow: 0 4px 12px rgba(0,0,0,0.05);
}

/* Typography in Cards */
.card-title {
    color: #1f1f1f; /* Dark Title */
    font-size: 1.1rem;
    font-weight: 600;
    margin-bottom: 0.5rem;
    display: flex;
    align-items: center;
    gap: 8px;
}

.card-preview {
    color: #444746; /* Darker gray for readable preview */
    font-size: 0.95rem;
    line-height: 1.5;
    margin-bottom: 1rem;
}

/* Pills & Badges */
.score-badge {
    background-color: #c4eed0; /* Light Green bg */
    color: #0f5223; /* Dark Green text */
    padding: 4px 12px;
    border-radius: 12px;
    font-size: 0.75rem;
    font-weight: 500;
    display: inline-block;
}

.keyword-pill {
    background-color: #c2e7ff; /* Light Blue bg */
    color: #004a77; /* Dark Blue text */
    padding: 2px 10px;
    border-radius: 8px;
    font-size: 0.8rem;
    margin-right: 6px;
    display: inline-block;
    margin-bottom: 4px;
}

/* Gradient Text for Header */
.gradient-text {
    background: linear-gradient(to right, #4285f4, #9b72cb, #d96570);
    -webkit-background-clip: text;
    background-clip: text; /* standard property alongside the prefixed one */
    -webkit-text-fill-color: transparent;
    font-weight: 700;
    font-size: 3rem;
}

/* Custom Info Box */
.stAlert {
    background-color: #f0f4f9;
    color: #1f1f1f;
}
</style>
""", unsafe_allow_html=True)
|
| 168 |
+
|
| 169 |
+
# =======================
# SIDEBAR (Settings)
# =======================
# Collects the search settings (top_k, API endpoint) and exposes the button
# that triggers the evaluation report at the bottom of the page.
with st.sidebar:
    st.markdown("### ⚙️ Settings")  # heading emoji restored from mojibake
    top_k = st.slider("Top-K Results", 1, 10, 5)
    url_input = st.text_input("API Endpoint", API_GATEWAY_URL)
    st.divider()
    st.subheader(" Evaluation")
    run_eval = st.button("Run Evaluation Script")
    st.divider()
    st.caption(" Powered by Sentence-Transformers")

# Normalise the user-supplied endpoint: surrounding whitespace and a trailing
# slash would otherwise produce URLs such as "http://host:8000//search".
# If the field was cleared, keep the default instead of an empty base URL.
API_GATEWAY_URL = url_input.strip().rstrip("/") or API_GATEWAY_URL
|
| 183 |
+
|
| 184 |
+
# =======================
# MAIN HEADER (Gemini Style)
# =======================
# The greeting and tagline are raw HTML so they can be centred and, for the
# greeting, use the "gradient-text" class from the global stylesheet.
GREETING_HTML = '<div style="text-align: center; margin-bottom: 10px;"><span class="gradient-text">Hello, Explorer</span></div>'
TAGLINE_HTML = '<div style="text-align: center; color: #444746; font-size: 1.2rem; margin-bottom: 30px;">How can I help you find documents today?</div>'

# Only the wide middle column of the 1:6:1 split is used; the outer two act
# as margins.
header_columns = st.columns([1, 6, 1])
with header_columns[1]:
    for banner_html in (GREETING_HTML, TAGLINE_HTML):
        st.markdown(banner_html, unsafe_allow_html=True)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
# =======================
# SEARCH BAR CENTERED
# =======================
# The query box lives in the middle column of a 1:4:1 split so it appears
# centred on the wide layout.
# NOTE(review): the nesting of the button row inside the centre column is
# reconstructed; the original indentation was not available -- confirm.
search_columns = st.columns([1, 4, 1])

with search_columns[1]:
    # The label is required by Streamlit but hidden via label_visibility.
    query = st.text_input(
        "Search Query",
        placeholder="Ask a question about your documents...",
        label_visibility="collapsed",
    )

    # Buttons row: the button sits in the narrow middle column of a 2:1:2 split.
    button_columns = st.columns([2, 1, 2])
    with button_columns[1]:
        submit_btn = st.button(
            "Sparkle Search",
            type="primary",
            use_container_width=True,
        )
|
| 211 |
+
|
| 212 |
+
# =======================
# SEARCH HANDLER
# =======================
# Sends the query to the gateway's /search endpoint and renders one card plus
# one details expander per returned document.
#
# Everything that comes back from the API (filenames, keywords, excerpts) is
# HTML-escaped before being interpolated into unsafe_allow_html markup, so a
# document containing markup cannot inject HTML into the page.
#
# NOTE(review): the emoji in the messages below were mis-encoded in the
# original text and have been restored on a best-effort basis -- confirm the
# exact glyphs.

# Upper bound on how long the UI waits for the gateway before reporting an error.
_SEARCH_TIMEOUT_SECONDS = 30

# Doc Icon (SVG) - dark blue stroke for visibility on the light background.
# Hoisted out of the result loop because it never changes.
_DOC_ICON_SVG = """<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="#0b57d0" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M14.5 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V7.5L14.5 2z"></path><polyline points="14 2 14 8 20 8"></polyline></svg>"""


def _keyword_pills_html(keywords):
    """Return one escaped ``keyword-pill`` span per keyword, concatenated."""
    return "".join(
        f"<span class='keyword-pill'>{html.escape(str(kw))}</span>"
        for kw in keywords
    )


def _result_card_html(filename, score, preview, keywords):
    """Build the HTML for one result card; all dynamic text is escaped.

    The markup is assembled without newlines or leading indentation so the
    Markdown renderer cannot mistake any part of it for an indented code block.
    """
    safe_filename = html.escape(str(filename))
    safe_preview = html.escape(str(preview))
    keyword_html = _keyword_pills_html(keywords)
    return (
        '<div class="result-card">'
        '<div style="display:flex; justify-content:space-between; align-items:start;">'
        f'<div class="card-title">{_DOC_ICON_SVG} {safe_filename}</div>'
        f'<div class="score-badge">match: {score:.4f}</div>'
        '</div>'
        f'<p class="card-preview">{safe_preview}...</p>'
        '<div style="margin-top: 10px;">'
        '<div style="font-weight:600; color:#1f1f1f; margin-bottom:6px;">'
        'Keyword Overlap:'
        '</div>'
        f'{keyword_html}'
        '</div>'
        '</div>'
    )


def _excerpt_html(sentence, confidence):
    """Build the quote box for one key excerpt; the sentence is escaped."""
    safe_sentence = html.escape(str(sentence))
    return (
        '<div style="background: #ffffff; border-left: 3px solid #4285f4; '
        'padding: 10px; margin-bottom: 5px; border-radius: 0 8px 8px 0; '
        'box-shadow: 0 1px 3px rgba(0,0,0,0.05);">'
        f'<span style="color: #1f1f1f;">"{safe_sentence}"</span>'
        '<span style="color: #5e5e5e; font-size: 0.8em; margin-left: 10px;">'
        f'(conf: {confidence:.2f})</span>'
        '</div>'
    )


if submit_btn and not query.strip():
    # Previously a click with an empty query silently did nothing.
    st.warning("Please enter a search query.")
elif submit_btn:

    # Gemini-style spinner
    with st.spinner("✨ Analyzing semantics..."):
        try:
            response = requests.post(
                f"{API_GATEWAY_URL}/search",
                json={"query": query, "top_k": top_k},
                timeout=_SEARCH_TIMEOUT_SECONDS,
            )
        except requests.RequestException as exc:
            # Gateway unreachable, DNS failure, timeout, malformed URL, ...
            st.error(f"❌ Connection Error: {exc}")
            st.stop()

    if response.status_code != 200:
        st.error(f"❌ Connection Error: {response.text}")
        st.stop()

    try:
        data = response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        st.error("❌ Invalid JSON response.")
        st.stop()

    # Treat a missing key, a non-object payload and an empty list the same way.
    results = data.get("results") if isinstance(data, dict) else None
    if not results:
        st.info("No relevant documents found for that query.")
        st.stop()

    # Results Header
    st.markdown("### ✨ Search Results")
    st.markdown("---")

    # =======================
    # DISPLAY RESULTS (Card Style)
    # =======================
    for item in results:
        filename = item.get("filename", "unknown")
        score = float(item.get("score") or 0.0)
        explanation = item.get("explanation") or {}
        preview = item.get("preview", "")
        full_text = item.get("full_text", "")

        keywords = explanation.get("keyword_overlap") or []

        # Main Card Render
        st.markdown(
            _result_card_html(filename, score, preview, keywords),
            unsafe_allow_html=True,
        )

        # Details Expander (standard Streamlit widget, styled via global CSS)
        with st.expander(f"View Details & Full Text for {filename}"):

            overlap_ratio = float(explanation.get("overlap_ratio") or 0)
            sentences = explanation.get("top_sentences") or []

            st.caption(f"Semantic Overlap Ratio: {overlap_ratio:.3f}")

            if sentences:
                st.markdown("**Key Excerpts:**")
                for excerpt in sentences:
                    st.markdown(
                        _excerpt_html(
                            excerpt.get("sentence", ""),
                            float(excerpt.get("score") or 0.0),
                        ),
                        unsafe_allow_html=True,
                    )

            llm_expl = explanation.get("llm_explanation")
            if llm_expl:
                st.markdown("**Why this document?**")
                st.write(llm_expl)

            st.markdown("---")
            st.markdown("**📄 Full Document Content:**")
            # Code block keeps the raw text readable and is never parsed as HTML.
            st.code(full_text, language="text")
|
| 308 |
+
# =======================
# EVALUATION REPORT
# =======================
# Runs the evaluation script when the sidebar button is pressed and renders
# summary metrics, per-query cards and a full table.
#
# NOTE(review): run_evaluation is called without the endpoint chosen in the
# sidebar, so it presumably targets whatever URL eval/evaluate.py defines --
# verify this is intended.


def _format_retrieved(retrieved):
    """Return the retrieved documents as a comma-separated string."""
    if isinstance(retrieved, (list, tuple)):
        return ", ".join(str(doc) for doc in retrieved)
    return str(retrieved)


def _eval_card_html(item, background, accent, retrieved_label):
    """Build one evaluation card; every value taken from ``item`` is escaped.

    ``background`` and ``accent`` are CSS colours for the card body and its
    left border; ``retrieved_label`` is the caption for the retrieved docs.
    The markup is a single line so the Markdown renderer cannot interpret any
    part of it as an indented code block.
    """
    query_text = html.escape(str(item["query"]))
    expected = html.escape(str(item["expected"]))
    retrieved = html.escape(_format_retrieved(item["retrieved"]))
    rank = html.escape(str(item["rank"]))
    return (
        f'<div style="padding:14px; background:{background}; '
        f'border-left:5px solid {accent}; border-radius:8px; '
        'margin-bottom:10px;">'
        f'<b> Query:</b> {query_text}<br>'
        f'<b>Expected:</b> {expected}<br>'
        f'<b>{retrieved_label}:</b> {retrieved}<br>'
        f'<b>Rank:</b> {rank}'
        '</div>'
    )


if run_eval:

    with st.spinner("Running evaluation... this may take 10–20 seconds..."):
        try:
            results = run_evaluation(top_k=10)
        except Exception as exc:  # UI boundary: show the failure, not a traceback
            st.error(f"Evaluation failed: {exc}")
            st.stop()

    st.success("Evaluation Complete!")

    # -----------------------------
    # Summary Metrics (Horizontal)
    # -----------------------------
    st.markdown("## Evaluation Summary")

    summary_metrics = (
        ("Accuracy", f"{results['accuracy']}%"),
        ("MRR", results["mrr"]),
        ("NDCG", results["ndcg"]),
        ("Queries", results["total_queries"]),
    )
    for column, (label, value) in zip(st.columns(4), summary_metrics):
        with column:
            st.metric(label, value)

    st.markdown(
        f"**Correct:** {results['correct_count']} | "
        f"**Incorrect:** {results['incorrect_count']}"
    )

    st.markdown("---")

    details = results["details"]

    # -----------------------------
    # Incorrect Results
    # -----------------------------
    st.markdown("## Incorrect Fetches ")

    wrong = [d for d in details if not d["is_correct"]]

    if wrong:
        for item in wrong:
            st.markdown(
                _eval_card_html(item, "#ffe5e5", "#ff4d4f", "Retrieved"),
                unsafe_allow_html=True,
            )
    else:
        st.success(" No incorrect queries!")

    st.markdown("---")

    # -----------------------------
    # Correct Results
    # -----------------------------
    st.markdown("## Correct Fetches")

    correct_items = [d for d in details if d["is_correct"]]

    if correct_items:
        for item in correct_items:
            st.markdown(
                _eval_card_html(item, "#e8ffe5", "#2ecc71", "Top-K Retrieved"),
                unsafe_allow_html=True,
            )
    else:
        st.info("No correct queries.")

    st.markdown("---")

    # -----------------------------
    # Full Table
    # -----------------------------
    st.markdown("## Full Evaluation Table")

    table_data = [
        {
            "Query": item["query"],
            "Expected Doc": item["expected"],
            "Retrieved (Top-10)": _format_retrieved(item["retrieved"]),
            "Correct?": "Yes" if item["is_correct"] else "No",
            "Rank": item["rank"],
        }
        for item in details
    ]

    st.dataframe(table_data, use_container_width=True)
|