Spaces:
Sleeping
Sleeping
| import json | |
| import faiss | |
| import numpy as np | |
| import re | |
| from sentence_transformers import SentenceTransformer | |
# Compiled once at import time; used to mask e-mail addresses in chunk text
# with a placeholder before that text is stored or returned.
EMAIL_PATTERN = re.compile(
    r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}",
    re.IGNORECASE,
)
def load_index_and_meta(meta_path, chunks_path, index_path, model_name="all-MiniLM-L6-v2"):
    """Load chunk metadata, redacted chunk texts, the FAISS index and the embedder.

    Args:
        meta_path: Path to a JSON Lines file. Each non-blank line is parsed
            into one metadata record; records are kept in file order.
        chunks_path: Path to a JSON Lines file whose records carry ``page``,
            ``chunk_id`` and ``text`` keys.
        index_path: Path handed to ``faiss.read_index``.
        model_name: Name of the SentenceTransformer model to load. Defaults to
            the previously hard-coded ``"all-MiniLM-L6-v2"``.
            NOTE(review): presumably this has to be the model the index was
            built with -- confirm before overriding.

    Returns:
        A ``(meta_list, mapping, index, embed_model)`` tuple. ``mapping`` maps
        ``(page, chunk_id)`` to the chunk text with e-mail addresses replaced
        by ``[REDACTED_EMAIL]``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a chunk record lacks ``page``, ``chunk_id`` or ``text``.
    """

    def _iter_jsonl(path):
        # Skip blank lines (e.g. stray empty lines at the end of the file)
        # instead of letting json.loads fail on them.
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    yield json.loads(line)

    # File order is preserved: retrieve_top_k looks records up by position.
    meta_list = list(_iter_jsonl(meta_path))

    mapping = {}
    for obj in _iter_jsonl(chunks_path):
        # Redact at load time so raw addresses are never held in the mapping.
        text = EMAIL_PATTERN.sub("[REDACTED_EMAIL]", obj["text"])
        mapping[(obj["page"], obj["chunk_id"])] = text

    index = faiss.read_index(index_path)
    embed_model = SentenceTransformer(model_name)
    return meta_list, mapping, index, embed_model
def retrieve_top_k(query, top_k, min_score, index, embed_model, meta_list, mapping):
    """Return up to ``top_k`` chunks whose search score is at least ``min_score``.

    The query is embedded with ``embed_model``, L2-normalized, and searched
    against ``index``. Each hit is resolved to its metadata record by position
    in ``meta_list`` and to its text through ``mapping``.

    Args:
        query: Query string to embed.
        top_k: Maximum number of neighbours to request. Values ``<= 0`` yield
            an empty list.
        min_score: Hits whose score is below this value are dropped.
            NOTE(review): this assumes the index returns similarity scores
            where higher is better (e.g. inner product) -- confirm against
            how the index was built.
        index: Object exposing ``search(vectors, k)`` returning
            ``(scores, ids)`` arrays.
        embed_model: Object exposing ``encode(list_of_str, convert_to_numpy=True)``.
        meta_list: Sequence of records with ``page`` and ``chunk_id`` keys,
            indexed by the ids the index returns.
        mapping: Dict from ``(page, chunk_id)`` to chunk text.

    Returns:
        A list of dicts with ``score``, ``page``, ``chunk_id`` and ``text``
        keys, in the order the index returned the hits. ``text`` is ``""``
        when the chunk is not in ``mapping``.
    """
    # The faiss Python search wrapper asserts k > 0; answer a non-positive
    # request with "no results" rather than an AssertionError.
    if top_k <= 0:
        return []

    qvec = embed_model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(qvec)  # normalizes the query vector in place
    scores, ids = index.search(qvec, top_k)

    results = []
    for score, raw_idx in zip(scores[0], ids[0]):
        if score < min_score:
            continue
        idx = int(raw_idx)
        # Negative ids (faiss pads missing neighbours with -1) and ids past
        # the end of the metadata list are ignored.
        if idx < 0 or idx >= len(meta_list):
            continue
        record = meta_list[idx]
        page = record["page"]
        chunk = record["chunk_id"]
        # Redact again here: `mapping` may not have been built by
        # load_index_and_meta, and the substitution is idempotent.
        text = EMAIL_PATTERN.sub("[REDACTED_EMAIL]", mapping.get((page, chunk), ""))
        results.append({
            "score": float(score),
            "page": page,
            "chunk_id": chunk,
            "text": text,
        })
    return results