"""Load a FAISS index plus chunk metadata and retrieve top-k chunks for a query."""

import json
import re

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Matches typical email addresses so they can be redacted from chunk text.
EMAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}", re.IGNORECASE)


def load_index_and_meta(meta_path, chunks_path, index_path):
    """Load the retrieval artifacts from disk.

    Args:
        meta_path: Path to a JSONL file; each line is an object with at least
            "page" and "chunk_id" keys, in the same order as the index vectors.
        chunks_path: Path to a JSONL file; each line is an object with
            "page", "chunk_id", and "text" keys.
        index_path: Path to a serialized FAISS index readable by
            ``faiss.read_index``.

    Returns:
        A tuple ``(meta_list, mapping, index, embed_model)`` where ``mapping``
        maps ``(page, chunk_id)`` to email-redacted chunk text.
    """
    meta_list = []
    with open(meta_path, "r", encoding="utf-8") as f:
        for line in f:
            meta_list.append(json.loads(line))

    mapping = {}
    with open(chunks_path, "r", encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            # Redact emails once at load time; retrieval serves this text as-is.
            text = EMAIL_PATTERN.sub("[REDACTED_EMAIL]", obj["text"])
            mapping[(obj["page"], obj["chunk_id"])] = text

    index = faiss.read_index(index_path)
    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
    return meta_list, mapping, index, embed_model


def retrieve_top_k(query, top_k, min_score, index, embed_model, meta_list, mapping):
    """Return up to ``top_k`` chunks most similar to ``query``.

    The query embedding is L2-normalized before searching, and results with a
    score below ``min_score`` are dropped.

    NOTE(review): the ``score < min_score`` filter assumes larger scores are
    better, i.e. an inner-product (cosine) index — confirm against how the
    index was built; with a raw L2 index this filter would be inverted.

    Args:
        query: Free-text query string.
        top_k: Maximum number of results to request from the index.
        min_score: Minimum similarity score for a result to be kept.
        index: FAISS index whose vector order matches ``meta_list``.
        embed_model: SentenceTransformer used to embed ``query``.
        meta_list: Per-vector metadata dicts with "page" and "chunk_id".
        mapping: ``(page, chunk_id) -> text`` lookup (text already redacted).

    Returns:
        List of dicts with "score", "page", "chunk_id", and "text", in the
        order returned by the index search.
    """
    qvec = embed_model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(qvec)
    D, I = index.search(qvec, top_k)

    results = []
    for score, idx in zip(D[0], I[0]):
        if score < min_score:
            continue
        # FAISS pads with -1 when fewer than top_k vectors exist; also guard
        # against metadata/index drift.
        if idx < 0 or idx >= len(meta_list):
            continue
        m = meta_list[idx]
        page = m["page"]
        chunk = m["chunk_id"]
        # Text was already email-redacted when the mapping was built, so no
        # second redaction pass is needed here (the original re-ran the regex
        # redundantly; the output is identical without it).
        results.append({
            "score": float(score),
            "page": page,
            "chunk_id": chunk,
            "text": mapping.get((page, chunk), ""),
        })
    return results