Sathvik-kota committed on
Commit
d0abef8
Β·
verified Β·
1 Parent(s): ff0177a

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,10 +1 @@
1
- ---
2
- title: Docfetch
3
- emoji: πŸ‘€
4
- colorFrom: red
5
- colorTo: pink
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # Multi-document-Embedding-Search-Engine-with-Caching
 
 
 
 
 
 
 
 
 
eval/evaluate.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import requests
3
+ import numpy as np
4
+
5
+
6
+ API_URL = "http://localhost:8000/search"
7
+
8
# =====================================================
# Utility: MRR
# =====================================================
def compute_mrr(all_ranks):
    """Mean Reciprocal Rank over a list of 1-based hit ranks (0.0 if empty)."""
    if not all_ranks:
        return 0.0
    reciprocal = [1.0 / rank for rank in all_ranks]
    return float(np.mean(reciprocal))
16
+
17
+
18
# =====================================================
# Utility: NDCG@K
# =====================================================
def compute_ndcg(results, k):
    """NDCG@k for a binary relevance list, assuming a single relevant doc.

    results: e.g. [1, 0, 0, ...] relevance flags for the retrieved docs.
    The ideal DCG places the one relevant doc at rank 1, so IDCG = 1/log2(2).
    """
    gain = 0
    for position, relevant in enumerate(results[:k], start=1):
        if relevant == 1:
            gain += 1 / np.log2(position + 1)

    ideal = 1 / np.log2(1 + 1)  # single relevant doc at rank 1
    return gain / ideal if ideal != 0 else 0
30
+
31
+
32
# =====================================================
# MAIN EVALUATION FUNCTION
# =====================================================
def run_evaluation(query_file="generated_queries.json", top_k=10):
    """Evaluate the /search endpoint against a labelled query file.

    top_k is FIXED = 10 for a realistic evaluation.

    Each entry in query_file has {"query": ..., "doc_id": ...}; the expected
    hit is "<doc_id>.txt". Queries whose API call fails are skipped.

    Returns a summary dict: accuracy (%), mrr, ndcg, counts, and per-query
    details.

    Fix: the previous version called np.mean on possibly-empty lists (no
    queries, or every request failing), producing nan plus a RuntimeWarning;
    empty metric lists now yield 0.0.
    """
    with open(query_file) as f:
        queries = json.load(f)

    correct = []
    ranks = []
    ndcg_scores = []
    detailed = []

    for item in queries:
        query = item["query"]
        expected = item["doc_id"] + ".txt"

        # ----------------------------
        # CALL API
        # ----------------------------
        resp = requests.post(API_URL, json={"query": query, "top_k": top_k})
        if resp.status_code != 200:
            continue  # skip failed requests; they don't count toward metrics

        results = resp.json().get("results", [])
        retrieved = [r["filename"] for r in results]

        # binary relevance array for NDCG
        relevance = [1 if fn == expected else 0 for fn in retrieved]

        # ----------------------------
        # ACCURACY (hit anywhere in top_k)
        # ----------------------------
        hit = expected in retrieved
        correct.append(1 if hit else 0)

        # ----------------------------
        # RANK for MRR (1-based)
        # ----------------------------
        if hit:
            rank_position = retrieved.index(expected) + 1
            ranks.append(rank_position)
        else:
            rank_position = None

        # ----------------------------
        # NDCG
        # ----------------------------
        ndcg_scores.append(compute_ndcg(relevance, top_k))

        # ----------------------------
        # Save detail
        # ----------------------------
        detailed.append({
            "query": query,
            "expected": expected,
            "retrieved": retrieved,
            "rank": rank_position,
            "is_correct": hit
        })

    # =====================================================
    # FINAL METRICS (guard empty lists -> 0.0, not nan)
    # =====================================================
    accuracy = round(float(np.mean(correct)) * 100, 2) if correct else 0.0
    mrr = round(compute_mrr(ranks), 4)
    mean_ndcg = round(float(np.mean(ndcg_scores)), 4) if ndcg_scores else 0.0

    summary = {
        "accuracy": accuracy,
        "mrr": mrr,
        "ndcg": mean_ndcg,
        "total_queries": len(queries),
        "correct_count": sum(correct),
        "incorrect_count": len(queries) - sum(correct),
        "details": detailed
    }

    return summary
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ google-genai
4
+ sentence-transformers
5
+ faiss-cpu
6
+ numpy
7
+ scikit-learn
8
+
9
+ requests
10
+ pydantic
11
+
12
+ streamlit
13
+
14
+ # Only really needed in Colab for exposing Streamlit, safe to keep
15
+ pyngrok
src/api_gateway/app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/api_gateway/app.py
2
+ from fastapi import FastAPI
3
+ from pydantic import BaseModel
4
+ import requests
5
+ import time
6
+ app = FastAPI(title="API Gateway")
7
+
8
+ DOC_URL = "http://localhost:9001"
9
+ EMBED_URL = "http://localhost:9002"
10
+ SEARCH_URL = "http://localhost:9003"
11
+ EXPLAIN_URL = "http://localhost:9004"
12
+ DATA_FOLDER = "data/docs"
13
+
14
class SearchQuery(BaseModel):
    """Request body for POST /search."""
    query: str  # free-text search query
    top_k: int = 5  # number of results to return
17
+
18
@app.post("/initialize")
def initialize():
    """Load all docs, embed them in one batch, and build the vector index.

    Orchestrates the doc, embed, and search services; any failing step
    short-circuits with an {"error": ..., "detail": ...} payload.
    """
    # 1) load docs via the document service
    doc_resp = requests.post(f"{DOC_URL}/load_docs", json={"folder": DATA_FOLDER}, timeout=20)
    if doc_resp.status_code != 200:
        return {"error": "doc_load_failed", "detail": doc_resp.text}
    docs = doc_resp.json().get("documents", [])

    # 2) shape the payload for embed_batch: ensure keys filename,text,hash
    payload = [
        {
            "filename": doc["filename"],
            "text": doc.get("clean_text", doc.get("text", "")),
            "hash": doc["hash"],
        }
        for doc in docs
    ]

    # 3) embed everything in a single batch call
    embed_resp = requests.post(f"{EMBED_URL}/embed_batch", json={"docs": payload}, timeout=60)
    if embed_resp.status_code != 200:
        return {"error": "embed_failed", "detail": embed_resp.text}
    embed_out = embed_resp.json()
    embeddings = [row["embedding"] for row in embed_out["results"]]
    meta = {position: row["filename"] for position, row in enumerate(embed_out["results"])}

    # 4) build the search index over the embeddings
    build_resp = requests.post(
        f"{SEARCH_URL}/build_index",
        json={"embeddings": embeddings, "meta": meta},
        timeout=60,
    )
    if build_resp.status_code != 200:
        return {"error": "build_index_failed", "detail": build_resp.text}

    return {"docs_loaded": len(docs), "embeddings": len(embeddings), "build": build_resp.json()}
43
+
44
@app.post("/search")
def search(req: SearchQuery):
    """Embed the query, run a vector search, then hydrate and explain each hit.

    Returns {"results": [...]} with filename, score, preview, full_text, and
    explanation per hit, or an {"error": ...} payload when a backend fails.
    """
    # embed query — unique filename/hash so the embed cache never serves a stale hit
    unique_id = str(time.time())
    q = requests.post(
        f"{EMBED_URL}/embed_document",
        json={"filename": f"query_{unique_id}", "text": req.query, "hash": unique_id},
        timeout=10,
    )
    if q.status_code != 200:
        return {"error": "embed_query_failed", "detail": q.text}
    q_emb = q.json()["embedding"]

    # search vectors
    s = requests.post(
        f"{SEARCH_URL}/search_vectors",
        json={"query_embedding": q_emb, "top_k": req.top_k},
        timeout=10,
    )
    if s.status_code != 200:
        return {"error": "search_failed", "detail": s.text}
    sdata = s.json()
    if "error" in sdata:
        return {"error": "search_index_error", "detail": sdata}

    scores = sdata["scores"]
    ids = sdata["ids"]
    meta = sdata["meta"]  # { "0": filename, ... }

    # for each id, fetch the doc from the doc service and ask for an explanation
    results = []
    for score, idx in zip(scores, ids):
        filename = meta.get(str(idx))
        if filename is None:
            continue
        # BUG FIX: the URL previously contained a literal placeholder instead of
        # interpolating the filename, so every per-document fetch failed.
        doc_resp = requests.get(f"{DOC_URL}/get_doc/{filename}", timeout=10)
        if doc_resp.status_code != 200:
            continue
        doc = doc_resp.json()  # has clean_text, original_text, ...
        # explain (best-effort: a failed explain call yields an empty explanation)
        exp = requests.post(
            f"{EXPLAIN_URL}/explain",
            json={"query": req.query, "document_text": doc.get("clean_text", "")},
            timeout=10,
        )
        explanation = exp.json() if exp.status_code == 200 else {}
        results.append({
            "filename": filename,
            "score": float(score),
            "preview": doc.get("clean_text", "")[:350],
            "full_text": doc.get("original_text", ""),
            "explanation": explanation
        })
    return {"results": results}
src/bench/benchmark_cache.py ADDED
File without changes
src/bench/benchmark_search.py ADDED
File without changes
src/doc_service/app.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/doc_service/app.py
2
+ from fastapi import FastAPI
3
+ from pydantic import BaseModel
4
+ from .utils import preprocess_documents, load_original_text
5
+
6
+ app = FastAPI(title="Document Service")
7
+
8
class FolderRequest(BaseModel):
    """Request body for POST /load_docs."""
    folder: str  # path to a folder of .txt documents


# In-memory stores (simple)
_DOCUMENTS = {}  # filename -> dict with clean_text, hash, length, original_text
13
+
14
@app.post("/load_docs")
def load_docs(req: FolderRequest):
    """Preprocess every .txt file in req.folder and cache the records in memory.

    Returns {"count", "documents"} on success, or {"error": str} on failure.
    """
    try:
        processed = preprocess_documents(req.folder)
        for entry in processed:
            _DOCUMENTS[entry["filename"]] = {
                "filename": entry["filename"],
                "clean_text": entry["clean_text"],
                "hash": entry["hash"],
                "length": entry["length"],
                "original_text": entry["original_text"],
            }
        return {"count": len(processed), "documents": list(_DOCUMENTS.values())}
    except Exception as e:
        return {"error": str(e)}
29
+
30
@app.get("/get_doc/{filename}")
def get_doc(filename: str):
    """Return the cached record for one document, or a not_found payload.

    BUG FIX: the route path and the error message previously contained a
    literal placeholder instead of the `{filename}` path parameter, so FastAPI
    never bound the filename from the URL.
    """
    if filename not in _DOCUMENTS:
        return {"error": "not_found", "message": f"{filename} not found"}
    return _DOCUMENTS[filename]
35
+
36
@app.get("/all_docs")
def all_docs():
    """Return every cached document record with a total count."""
    records = list(_DOCUMENTS.values())
    return {"count": len(_DOCUMENTS), "documents": records}
src/doc_service/utils.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/doc_service/utils.py
2
+ import os
3
+ import hashlib
4
+ import re
5
+
6
def load_text_files(folder_path: str):
    """Read every .txt file in *folder_path* (sorted by name) into memory.

    Returns a list of {"filename", "path", "text"} dicts; non-.txt files
    are ignored and undecodable bytes are dropped (errors="ignore").
    """
    loaded = []
    for name in sorted(os.listdir(folder_path)):
        if not name.endswith(".txt"):
            continue
        path = os.path.join(folder_path, name)
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            contents = handle.read()
        loaded.append({
            "filename": name,
            "path": path,
            "text": contents
        })
    return loaded
19
+
20
+
21
def load_original_text(folder_path: str, filename: str):
    """Return the raw on-disk contents of folder_path/filename."""
    full_path = os.path.join(folder_path, filename)
    with open(full_path, "r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()
25
+
26
+
27
def clean_text(text: str) -> str:
    """Lowercase, strip HTML-like tags, and collapse all whitespace runs."""
    lowered = text.lower()
    without_tags = re.sub(r'<.*?>', '', lowered)
    return re.sub(r'\s+', ' ', without_tags).strip()
32
+
33
+
34
def compute_hash(text: str) -> str:
    """MD5 hex digest of *text* — content-change detection, not security."""
    digest = hashlib.md5(text.encode("utf-8"))
    return digest.hexdigest()
36
+
37
+
38
def preprocess_documents(folder_path: str):
    """Load every .txt doc and attach cleaned text, content hash, and word count.

    Each result dict carries: filename, clean_text, hash (md5 of clean_text),
    length (word count of clean_text), and the untouched original_text.
    """
    processed = []
    for doc in load_text_files(folder_path):
        cleaned = clean_text(doc["text"])
        processed.append({
            "filename": doc["filename"],
            "clean_text": cleaned,
            "hash": compute_hash(cleaned),
            "length": len(cleaned.split()),
            "original_text": doc["text"]
        })
    return processed
src/embed_service/app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/embed_service/app.py
2
+ from fastapi import FastAPI
3
+ from pydantic import BaseModel
4
+ from .embedder import Embedder
5
+ from .cache_manager import CacheManager
6
+ import numpy as np
7
+
8
+ app = FastAPI(title="Embed Service")
9
+
10
+ embedder = Embedder()
11
+ cache = CacheManager()
12
+
13
class EmbedRequest(BaseModel):
    """Request body for POST /embed_document."""
    filename: str  # cache key for this document
    text: str  # text to embed
    hash: str  # content hash used for cache invalidation
17
+
18
@app.post("/embed_document")
def embed_document(req: EmbedRequest):
    """Embed one document, serving from the cache when filename+hash match."""
    if cache.exists(req.filename, req.hash):
        cached_vec = cache.get_embedding(req.filename)
        return {"filename": req.filename, "cached": True, "embedding": cached_vec.tolist()}
    fresh_vec = embedder.embed_text(req.text)
    cache.add_embedding(req.filename, req.hash, fresh_vec)
    return {"filename": req.filename, "cached": False, "embedding": fresh_vec.tolist()}
26
+
27
class BatchEmbedRequest(BaseModel):
    """Request body for POST /embed_batch."""
    # each item: {"filename": str, "hash": str, "text" or "clean_text": str}
    docs: list
29
+
30
@app.post("/embed_batch")
def embed_batch(req: BatchEmbedRequest):
    """Embed many docs at once; cache hits are answered first, misses go
    through the model in a single batch.

    Note: results list cache hits before fresh embeddings, not input order.
    """
    results = []
    pending = []  # (filename, hash, text) triples that missed the cache
    for doc in req.docs:
        name = doc.get("filename")
        digest = doc.get("hash")
        body = doc.get("text") or doc.get("clean_text") or ""
        if cache.exists(name, digest):
            results.append({
                "filename": name,
                "cached": True,
                "embedding": cache.get_embedding(name).tolist()
            })
        else:
            pending.append((name, digest, body))

    if pending:
        vectors = embedder.embed_batch([text for _, _, text in pending])
        for (name, digest, _), vec in zip(pending, vectors):
            cache.add_embedding(name, digest, vec)
            results.append({"filename": name, "cached": False, "embedding": vec.tolist()})

    return {"count": len(results), "results": results}
52
+
53
@app.get("/all_embeddings")
def get_all_embeddings():
    """Dump the cache metadata and the full embedding matrix."""
    metadata, matrix = cache.all_embeddings()
    return {"meta": metadata, "embeddings": matrix.tolist()}
57
+
58
# convenience endpoint called earlier by older code
@app.post("/embed_all")
def embed_all_docs(docs: list):
    """Legacy wrapper: normalise {filename, clean_text, hash} docs and
    delegate to embed_batch."""
    normalized = [
        {
            "filename": d["filename"],
            "text": d.get("clean_text") or d.get("text", ""),
            "hash": d["hash"],
        }
        for d in docs
    ]
    return embed_batch(BatchEmbedRequest(docs=normalized))
src/embed_service/cache_manager.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/embed_service/cache_manager.py
2
+ import os
3
+ import json
4
+ import numpy as np
5
+
6
+ CACHE_DIR = "cache"
7
+ META_PATH = f"{CACHE_DIR}/embed_meta.json"
8
+ EMB_PATH = f"{CACHE_DIR}/embeddings.npy"
9
+
10
class CacheManager:
    """Disk-backed embedding cache keyed by filename + content hash.

    Metadata (filename -> {"hash", "index"}) lives in a JSON file; the
    embedding rows live in a parallel .npy matrix. Both are rewritten on
    every add_embedding call.
    """

    def __init__(self):
        os.makedirs(CACHE_DIR, exist_ok=True)
        if os.path.exists(META_PATH):
            with open(META_PATH, "r") as fh:
                self.meta = json.load(fh)
        else:
            self.meta = {}  # filename -> {"hash":..., "index": int}

        if os.path.exists(EMB_PATH):
            self.embeddings = np.load(EMB_PATH)
        else:
            # empty (0, dim) matrix; replaced outright by the first embedding
            self.embeddings = np.zeros((0, 384), dtype="float32")

    def save(self):
        """Persist metadata (JSON) and the embedding matrix (.npy) to disk."""
        with open(META_PATH, "w") as fh:
            json.dump(self.meta, fh, indent=2)
        np.save(EMB_PATH, self.embeddings)

    def exists(self, filename: str, file_hash: str) -> bool:
        """True iff *filename* is cached with exactly this content hash."""
        entry = self.meta.get(filename)
        return entry is not None and entry["hash"] == file_hash

    def get_embedding(self, filename: str):
        """Return the cached row for *filename* (KeyError if absent)."""
        row = int(self.meta[filename]["index"])
        return self.embeddings[row]

    def add_embedding(self, filename: str, file_hash: str, embedding):
        """Append one embedding row, record its index, and persist."""
        vec = embedding.astype("float32").reshape(1, -1)
        row = len(self.embeddings)
        self.meta[filename] = {"hash": file_hash, "index": row}
        if self.embeddings.shape[0] == 0:
            self.embeddings = vec
        else:
            self.embeddings = np.vstack([self.embeddings, vec])
        self.save()

    def all_embeddings(self):
        """Return (meta dict, embedding matrix) as currently held in memory."""
        return self.meta, self.embeddings
src/embed_service/embedder.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/embed_service/embedder.py
2
+ from sentence_transformers import SentenceTransformer
3
+ import numpy as np
4
+
5
class Embedder:
    """Thin wrapper around a SentenceTransformer producing float32 vectors."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        print(f"Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)

    def embed_text(self, text: str):
        """Encode a single string into one float32 vector."""
        return self.model.encode(text, convert_to_numpy=True).astype("float32")

    def embed_batch(self, texts: list):
        """Encode a list of strings into a float32 matrix (one row each)."""
        return self.model.encode(texts, convert_to_numpy=True).astype("float32")

    def dim(self):
        """Dimensionality of the model's sentence embeddings."""
        return self.model.get_sentence_embedding_dimension()
src/explain_service/app.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/explain_service/app.py
2
+ from fastapi import FastAPI
3
+ from pydantic import BaseModel
4
+ from .explainer import Explainer
5
+
6
+ app = FastAPI(title="Explain Service")
7
+ explainer = Explainer()
8
+
9
class ExplainRequest(BaseModel):
    """Request body for POST /explain."""
    query: str  # user search query
    document_text: str  # (cleaned) document text to explain against
12
+
13
@app.post("/explain")
def explain_doc(req: ExplainRequest):
    """Delegate to the module-level Explainer instance."""
    outcome = explainer.explain(req.query, req.document_text)
    return outcome
src/explain_service/explainer.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/explain_service/explainer.py
2
+
3
+ import re
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ from google import genai
7
+
8
+ STOPWORDS = set("""
9
+ a an the and or but if while with without for on in into by to from of is are was were be been being as it this that these those
10
+ """.split())
11
+
12
+
13
class Explainer:
    """Explain why a document matched a query.

    Combines three signals: keyword overlap, the document sentences most
    similar to the query (sentence-transformer cosine), and an LLM-written
    natural-language summary.
    """

    def __init__(self):
        import os  # local import: keep the module's import surface unchanged

        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        # SECURITY FIX: the Gemini API key was hard-coded in source (a leaked
        # credential — it must be revoked). Read it from the environment
        # instead; genai.Client also falls back to its own env handling when
        # api_key is None.
        api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
        self.client = genai.Client(api_key=api_key)

    # ---------------------------
    # TOKENIZER
    # ---------------------------
    def tokenize(self, text: str):
        """Lowercase, keep alphabetic word runs, and drop stopwords."""
        text = text.lower()
        tokens = re.findall(r"[a-zA-Z]+", text)
        tokens = [t for t in tokens if t not in STOPWORDS]
        return tokens

    # ---------------------------
    # KEYWORD OVERLAP
    # ---------------------------
    def keyword_overlap(self, query: str, doc: str):
        """Return (shared tokens, |shared| / |query tokens|)."""
        q_tokens = set(self.tokenize(query))
        d_tokens = set(self.tokenize(doc))

        overlap = q_tokens.intersection(d_tokens)
        overlap_ratio = len(overlap) / (len(q_tokens) + 1e-8)  # epsilon: empty query

        return list(overlap), float(overlap_ratio)

    # ---------------------------
    # BEST SENTENCES MATCHING QUERY
    # ---------------------------
    def best_sentences(self, query: str, doc: str, top_k=2):
        """Top-k sentences by cosine similarity to the query embedding."""
        sentences = re.split(r"[.!?]", doc)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 0]

        if len(sentences) == 0:
            return []

        q_emb = self.model.encode(query, convert_to_numpy=True)
        s_embs = self.model.encode(sentences, convert_to_numpy=True)

        # normalise so the dot product below is cosine similarity
        q_emb = q_emb / (np.linalg.norm(q_emb) + 1e-10)
        s_norm = s_embs / (np.linalg.norm(s_embs, axis=1, keepdims=True) + 1e-10)

        sims = (s_norm @ q_emb).tolist()
        top_ids = np.argsort(sims)[::-1][:top_k]

        results = []
        for idx in top_ids:
            results.append({
                "sentence": sentences[idx],
                "score": float(sims[idx])
            })

        return results

    # ---------------------------
    # LLM-LEVEL EXPLANATION
    # ---------------------------
    def llm_explain(self, query, doc_text, top_sentences):
        """Ask Gemini for a short natural-language relevance explanation.

        Raises on API failure; explain() handles that.
        """
        formatted_sentences = "\n".join(
            [f"- {s['sentence']} (score: {s['score']:.2f})" for s in top_sentences]
        )

        prompt = f"""
You are an AI assistant that explains WHY a document matches a user query.

QUERY:
{query}

DOCUMENT EXCERPT:
{doc_text[:500]}

MOST RELEVANT SENTENCES:
{formatted_sentences}

Write 2–3 natural sentences explaining WHY this document is relevant.
"""

        response = self.client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
            config={"temperature": 0.4}
        )

        return response.text.strip()

    # ---------------------------
    # MAIN EXPLAIN FUNCTION
    # ---------------------------
    def explain(self, query: str, doc_text: str):
        """Build the full explanation payload for one (query, document) pair.

        ROBUSTNESS FIX: an LLM/API failure no longer crashes the whole
        /explain request — the heuristic signals are still returned with a
        placeholder summary.
        """
        keywords, overlap_ratio = self.keyword_overlap(query, doc_text)
        top_sents = self.best_sentences(query, doc_text)
        try:
            llm_summary = self.llm_explain(query, doc_text, top_sents)
        except Exception as exc:
            llm_summary = f"LLM explanation unavailable: {exc}"

        return {
            "keyword_overlap": keywords,
            "overlap_ratio": overlap_ratio,
            "top_sentences": top_sents,
            "llm_explanation": llm_summary
        }
src/search_service/app.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/search_service/app.py
2
+ from fastapi import FastAPI
3
+ from pydantic import BaseModel
4
+ from .indexer import FAISSIndexer
5
+ import numpy as np
6
+
7
+ app = FastAPI(title="Search Service")
8
+
9
+ indexer = FAISSIndexer()
10
+ # attempt load if exists
11
+ indexer.try_load()
12
+
13
class BuildIndexRequest(BaseModel):
    """Request body for POST /build_index."""
    embeddings: list  # embedding rows, list of list[float]
    meta: dict  # row index -> filename mapping
16
+
17
@app.post("/build_index")
def build_index(req: BuildIndexRequest):
    """(Re)build the FAISS index from raw embedding rows plus metadata."""
    matrix = np.array(req.embeddings, dtype="float32")
    indexer.build(matrix, req.meta)
    return {"status": "index_built", "count": matrix.shape[0]}
22
+
23
class SearchRequest(BaseModel):
    """Request body for POST /search_vectors."""
    query_embedding: list  # query vector as list[float]
    top_k: int = 5  # number of nearest neighbours to return
26
+
27
@app.post("/search_vectors")
def search_vectors(req: SearchRequest):
    """Run a nearest-neighbour query against the in-memory FAISS index."""
    if indexer.index is None:
        return {"error": "index_not_built"}
    vector = np.array(req.query_embedding, dtype="float32")
    scores, ids = indexer.search(vector, req.top_k)
    return {"scores": scores, "ids": ids, "meta": indexer.meta}
src/search_service/indexer.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/search_service/indexer.py
2
+ import numpy as np
3
+ import faiss
4
+ import os
5
+ import pickle
6
+
7
class FAISSIndexer:
    """Exact (flat) FAISS index over L2-normalised embeddings, persisted to disk.

    Rows are normalised before insertion, so the L2 distance returned by
    search() is a monotone transform of cosine similarity (lower == closer).
    """

    def __init__(self):
        self.index = None  # faiss.Index once built or loaded
        self.meta = None   # {str(row_id): filename}
        self.index_path = "faiss_index.bin"
        self.meta_path = "faiss_meta.pkl"

    def try_load(self):
        """Load a previously persisted index + metadata, if both files exist.

        Returns (meta, index) on success, (None, None) otherwise.
        BUG FIX: the previous version returned (meta, None) even on success,
        making a successful load indistinguishable from a partial failure.
        """
        if not os.path.exists(self.meta_path) or not os.path.exists(self.index_path):
            return None, None
        with open(self.meta_path, "rb") as f:
            meta = pickle.load(f)
        index = faiss.read_index(self.index_path)
        self.index = index
        self.meta = meta
        return meta, index

    def build(self, embeddings, meta):
        """Normalise *embeddings* (N, dim), build a flat L2 index, persist both.

        meta keys are normalised to str(row_id) -> filename so lookups match
        the string keys produced by JSON round-trips.
        """
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-10
        embeddings = embeddings / norms
        dim = embeddings.shape[1]
        index = faiss.IndexFlatL2(dim)
        index.add(embeddings)
        faiss.write_index(index, self.index_path)
        meta_map = {str(k): v for k, v in meta.items()}
        with open(self.meta_path, "wb") as f:
            pickle.dump(meta_map, f)
        self.index = index
        self.meta = meta_map

    def search(self, query_emb, top_k):
        """Return (distances, ids) lists for the top_k nearest rows.

        Raises ValueError when no index has been built or loaded.
        """
        if self.index is None:
            raise ValueError("FAISS index is not loaded!")
        q = query_emb / (np.linalg.norm(query_emb) + 1e-10)
        q = q.reshape(1, -1)
        distances, ids = self.index.search(q, top_k)  # shapes (1, k)
        return distances[0].tolist(), ids[0].tolist()
src/ui/streamlit_app.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ import json
4
+ import html
5
+ import sys
6
+ import os
7
+ import importlib
8
+
9
+ # ------------------------------------------
10
+ # Add project root + eval folder to path
11
+ # ------------------------------------------
12
+ CURRENT_DIR = os.path.dirname(__file__)
13
+ SRC_DIR = os.path.dirname(CURRENT_DIR)
14
+ PROJECT_ROOT = os.path.dirname(SRC_DIR)
15
+
16
+ sys.path.append(PROJECT_ROOT)
17
+ sys.path.append(os.path.join(PROJECT_ROOT, "eval"))
18
+
19
+ # ------------------------------------------
20
+ # FORCE RELOAD evaluate module
21
+ # ------------------------------------------
22
+ import eval.evaluate as eval_module
23
+ importlib.reload(eval_module)
24
+ from eval.evaluate import run_evaluation
25
+
26
+
27
+ API_GATEWAY_URL = "http://localhost:8000"
28
+
29
+ st.set_page_config(
30
+ page_title="Gemini Search",
31
+ page_icon="✨",
32
+ layout="wide",
33
+ initial_sidebar_state="expanded", # Changed from "collapsed" to "expanded"
34
+ )
35
+
36
+ # =======================
37
+ # GEMINI UI STYLING
38
+ # =======================
39
+ st.markdown("""
40
+ <style>
41
+ /* Global Font & Background */
42
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
43
+
44
+ html, body, [class*="css"] {
45
+ font-family: 'Inter', sans-serif;
46
+ background-color: #ffffff; /* White Background */
47
+ color: #1f1f1f; /* Dark text for contrast */
48
+ }
49
+
50
+ /* --- INPUT FIELD FIX --- */
51
+ /* 1. Remove the default Streamlit border/background on the container */
52
+ .stTextInput > div[data-baseweb="input"] {
53
+ background-color: transparent !important;
54
+ border: none !important;
55
+ border-radius: 24px !important;
56
+ box-shadow: none !important;
57
+ }
58
+
59
+ /* 2. Style the actual input element */
60
+ .stTextInput input {
61
+ border-radius: 24px !important;
62
+ background-color: #f0f4f9 !important; /* Light ash input */
63
+ border: 1px solid transparent !important;
64
+ color: #1f1f1f !important;
65
+ padding: 12px 20px !important;
66
+ font-size: 16px !important;
67
+ transition: all 0.2s ease;
68
+ }
69
+
70
+ /* 3. Focus state - clean blue border, no default red overlay */
71
+ .stTextInput input:focus {
72
+ background-color: #ffffff !important;
73
+ border-color: #0b57d0 !important; /* Gemini Blue */
74
+ box-shadow: 0 0 0 2px rgba(11, 87, 208, 0.2) !important;
75
+ outline: none !important;
76
+ }
77
+
78
+ /* Button Styling */
79
+ .stButton > button {
80
+ border-radius: 20px;
81
+ font-weight: 500;
82
+ border: none;
83
+ padding: 0.5rem 1.5rem;
84
+ transition: all 0.3s ease;
85
+ white-space: nowrap; /* Forces text to stay on one line */
86
+ min-width: 140px; /* Ensures button is never too skinny */
87
+ }
88
+
89
+ /* Primary Search Button */
90
+ button[kind="primary"] {
91
+ background: linear-gradient(90deg, #4b90ff, #ff5546);
92
+ color: white;
93
+ }
94
+ button[kind="primary"]:hover {
95
+ opacity: 0.9;
96
+ box-shadow: 0 4px 12px rgba(75, 144, 255, 0.3);
97
+ }
98
+
99
+ /* Result Card - Light Ash Background */
100
+ .result-card {
101
+ background-color: #f0f4f9; /* Light Ash */
102
+ border-radius: 16px;
103
+ padding: 1.5rem;
104
+ margin-bottom: 1rem;
105
+ border: none; /* Removed border for cleaner look on light mode */
106
+ transition: transform 0.2s;
107
+ }
108
+ .result-card:hover {
109
+ box-shadow: 0 4px 12px rgba(0,0,0,0.05);
110
+ }
111
+
112
+ /* Typography in Cards */
113
+ .card-title {
114
+ color: #1f1f1f; /* Dark Title */
115
+ font-size: 1.1rem;
116
+ font-weight: 600;
117
+ margin-bottom: 0.5rem;
118
+ display: flex;
119
+ align-items: center;
120
+ gap: 8px;
121
+ }
122
+
123
+ .card-preview {
124
+ color: #444746; /* Darker gray for readable preview */
125
+ font-size: 0.95rem;
126
+ line-height: 1.5;
127
+ margin-bottom: 1rem;
128
+ }
129
+
130
+ /* Pills & Badges */
131
+ .score-badge {
132
+ background-color: #c4eed0; /* Light Green bg */
133
+ color: #0f5223; /* Dark Green text */
134
+ padding: 4px 12px;
135
+ border-radius: 12px;
136
+ font-size: 0.75rem;
137
+ font-weight: 500;
138
+ display: inline-block;
139
+ }
140
+
141
+ .keyword-pill {
142
+ background-color: #c2e7ff; /* Light Blue bg */
143
+ color: #004a77; /* Dark Blue text */
144
+ padding: 2px 10px;
145
+ border-radius: 8px;
146
+ font-size: 0.8rem;
147
+ margin-right: 6px;
148
+ display: inline-block;
149
+ margin-bottom: 4px;
150
+ }
151
+
152
+ /* Gradient Text for Header */
153
+ .gradient-text {
154
+ background: linear-gradient(to right, #4285f4, #9b72cb, #d96570);
155
+ -webkit-background-clip: text;
156
+ -webkit-text-fill-color: transparent;
157
+ font-weight: 700;
158
+ font-size: 3rem;
159
+ }
160
+
161
+ /* Custom Info Box */
162
+ .stAlert {
163
+ background-color: #f0f4f9;
164
+ color: #1f1f1f;
165
+ }
166
+ </style>
167
+ """, unsafe_allow_html=True)
168
+
169
+ # =======================
170
+ # SIDEBAR (Settings)
171
+ # =======================
172
+ with st.sidebar:
173
+ st.markdown("### βš™οΈ Settings")
174
+ top_k = st.slider("Top-K Results", 1, 10, 5)
175
+ url_input = st.text_input("API Endpoint", API_GATEWAY_URL)
176
+ st.divider()
177
+ st.subheader(" Evaluation")
178
+ run_eval = st.button("Run Evaluation Script")
179
+ st.divider()
180
+ st.caption(" Powered by Sentence-Transformers")
181
+
182
+ API_GATEWAY_URL = url_input
183
+
184
+ # =======================
185
+ # MAIN HEADER (Gemini Style)
186
+ # =======================
187
+ col1, col2, col3 = st.columns([1, 6, 1])
188
+ with col2:
189
+ # Use HTML for the gradient text title
190
+ st.markdown('<div style="text-align: center; margin-bottom: 10px;"><span class="gradient-text">Hello, Explorer</span></div>', unsafe_allow_html=True)
191
+ st.markdown('<div style="text-align: center; color: #444746; font-size: 1.2rem; margin-bottom: 30px;">How can I help you find documents today?</div>', unsafe_allow_html=True)
192
+
193
+
194
+ # =======================
195
+ # SEARCH BAR CENTERED
196
+ # =======================
197
+ # Centering the search bar using columns
198
+ sc1, sc2, sc3 = st.columns([1, 4, 1])
199
+
200
+ with sc2:
201
+ query = st.text_input(
202
+ "Search Query", # Label hidden by CSS/Config if needed, or set visibility hidden
203
+ placeholder="Ask a question about your documents...",
204
+ label_visibility="collapsed"
205
+ )
206
+
207
+ # Buttons row
208
+ b1, b2, b3 = st.columns([2, 1, 2])
209
+ with b2:
210
+ submit_btn = st.button("Sparkle Search", type="primary", use_container_width=True)
211
+
212
# =======================
# SEARCH HANDLER
# =======================
if submit_btn and query.strip():

    # Query the gateway while showing a spinner. A timeout plus explicit
    # exception handling keeps the UI responsive when the backend is down
    # (previously the call had no timeout and any connection error raised
    # an unhandled exception in the Streamlit script).
    with st.spinner("✨ Analyzing semantics..."):
        try:
            response = requests.post(
                f"{API_GATEWAY_URL}/search",
                json={"query": query, "top_k": top_k},
                timeout=60,
            )
        except requests.RequestException as exc:
            st.error(f"❌ Connection Error: {exc}")
            st.stop()

    if response.status_code != 200:
        st.error(f"❌ Connection Error: {response.text}")
        st.stop()

    try:
        data = response.json()
    except ValueError:
        # response.json() raises a ValueError subclass on a malformed body;
        # the previous bare `except:` also swallowed KeyboardInterrupt etc.
        st.error("❌ Invalid JSON response.")
        st.stop()

    if "results" not in data:
        st.info("No relevant documents found for that query.")
        st.stop()

    # Results Header
    st.markdown("### ✨ Search Results")
    st.markdown("---")

    # =======================
    # DISPLAY RESULTS (Card Style)
    # =======================
    for item in data["results"]:
        filename = item["filename"]
        score = item["score"]
        explanation = item["explanation"]
        preview = item["preview"]
        full_text = item["full_text"]

        # Escape everything interpolated into unsafe_allow_html markup so
        # document-derived text cannot inject HTML into the page (previously
        # only the preview was escaped).
        safe_filename = html.escape(filename)
        safe_preview = html.escape(preview)

        # Prepare keyword pill HTML (escaped for the same reason).
        keywords = explanation.get("keyword_overlap", [])
        keyword_html = "".join(
            f"<span class='keyword-pill'>{html.escape(str(kw))}</span>"
            for kw in keywords
        )

        # Doc Icon (SVG) - dark blue stroke for visibility on light bg
        doc_icon = """<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="#0b57d0" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M14.5 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V7.5L14.5 2z"></path><polyline points="14 2 14 8 20 8"></polyline></svg>"""

        # Main Card Render. Displays the retrieved document's filename in the
        # title (the previous version extracted `filename` but never used it
        # and rendered a literal placeholder instead).
        st.markdown(f"""
        <div class="result-card">
            <div style="display:flex; justify-content:space-between; align-items:start;">
                <div class="card-title">
                    {doc_icon} {safe_filename}
                </div>
                <div class="score-badge">match: {score:.4f}</div>
            </div>
            <p class="card-preview">{safe_preview}...</p>
            <div style="margin-top: 10px;">
                <div style="font-weight:600; color:#1f1f1f; margin-bottom:6px;">
                    Keyword Overlap:
                </div>
                {keyword_html}
            </div>
        </div>
        """, unsafe_allow_html=True)

        # Details Expander (standard Streamlit, styled via global CSS)
        with st.expander(f"View Details & Full Text for {filename}"):

            overlap_ratio = explanation.get("overlap_ratio", 0)
            sentences = explanation.get("top_sentences", [])

            st.caption(f"Semantic Overlap Ratio: {overlap_ratio:.3f}")

            if sentences:
                st.markdown("**Key Excerpts:**")
                for s in sentences:
                    # Quote box tuned for light mode; sentence text escaped
                    # before being injected as raw HTML.
                    st.markdown(f"""
                    <div style="background: #ffffff; border-left: 3px solid #4285f4; padding: 10px; margin-bottom: 5px; border-radius: 0 8px 8px 0; box-shadow: 0 1px 3px rgba(0,0,0,0.05);">
                        <span style="color: #1f1f1f;">"{html.escape(str(s['sentence']))}"</span>
                        <span style="color: #5e5e5e; font-size: 0.8em; margin-left: 10px;">(conf: {s['score']:.2f})</span>
                    </div>
                    """, unsafe_allow_html=True)

            llm_expl = explanation.get("llm_explanation")
            if llm_expl:
                st.markdown("**Why this document?**")
                st.write(llm_expl)

            st.markdown("---")
            st.markdown("**📄 Full Document Content:**")
            st.code(full_text, language="text")  # code block for readable raw text
308
# =======================
# EVALUATION HANDLER
# =======================
if run_eval:

    st.info("Running evaluation... this may take 10–20 seconds...")

    # Fixed top_k=10 matches the evaluation harness in eval/evaluate.py.
    results = run_evaluation(top_k=10)

    st.success("Evaluation Complete!")

    # -----------------------------
    # Summary Metrics (Horizontal)
    # -----------------------------
    st.markdown("## Evaluation Summary")

    c1, c2, c3, c4 = st.columns(4)
    with c1:
        st.metric("Accuracy", f"{results['accuracy']}%")
    with c2:
        st.metric("MRR", results["mrr"])
    with c3:
        st.metric("NDCG", results["ndcg"])
    with c4:
        st.metric("Queries", results["total_queries"])

    st.markdown(
        f"**Correct:** {results['correct_count']} &nbsp;&nbsp;|&nbsp;&nbsp; "
        f"**Incorrect:** {results['incorrect_count']}"
    )

    st.markdown("---")

    def _esc(value):
        # Escape arbitrary query/document text before interpolating it into
        # raw-HTML cards rendered with unsafe_allow_html=True (previously
        # these values were injected unescaped).
        return html.escape(str(value))

    # -----------------------------
    # Incorrect Results
    # -----------------------------
    st.markdown("## Incorrect Fetches ")

    wrong = [d for d in results["details"] if not d["is_correct"]]

    if wrong:
        for item in wrong:
            st.markdown(f"""
            <div style="
                padding:14px;
                background:#ffe5e5;
                border-left:5px solid #ff4d4f;
                border-radius:8px;
                margin-bottom:10px;">
                <b> Query:</b> {_esc(item['query'])}<br>
                <b>Expected:</b> {_esc(item['expected'])}<br>
                <b>Retrieved:</b> {_esc(item['retrieved'])}<br>
                <b>Rank:</b> {_esc(item['rank'])}
            </div>
            """, unsafe_allow_html=True)
    else:
        st.success(" No incorrect queries!")

    st.markdown("---")

    # -----------------------------
    # Correct Results
    # -----------------------------
    st.markdown("## Correct Fetches")

    correct_items = [d for d in results["details"] if d["is_correct"]]

    if correct_items:
        for item in correct_items:
            st.markdown(f"""
            <div style="
                padding:14px;
                background:#e8ffe5;
                border-left:5px solid #2ecc71;
                border-radius:8px;
                margin-bottom:10px;">
                <b> Query:</b> {_esc(item['query'])}<br>
                <b>Expected:</b> {_esc(item['expected'])}<br>
                <b>Top-K Retrieved:</b> {_esc(item['retrieved'])}<br>
                <b>Rank:</b> {_esc(item['rank'])}
            </div>
            """, unsafe_allow_html=True)
    else:
        st.info("No correct queries.")

    st.markdown("---")

    # -----------------------------
    # Full Table
    # -----------------------------
    st.markdown("## Full Evaluation Table")

    # st.dataframe renders text safely, so no escaping is needed here.
    table_data = [
        {
            "Query": item["query"],
            "Expected Doc": item["expected"],
            "Retrieved (Top-10)": ", ".join(item["retrieved"]),
            "Correct?": "Yes" if item["is_correct"] else "No",
            "Rank": item["rank"],
        }
        for item in results["details"]
    ]

    st.dataframe(table_data, use_container_width=True)