# Flykite-HR-RAG-Bot / utils.py
# Author: saibsund
# Commit: "Upload 4 files" (8d25d65, verified)
import json
import faiss
import numpy as np
import re
from sentence_transformers import SentenceTransformer
# Matches e-mail-like tokens (local-part@domain.tld with a TLD of at least two
# letters). Compiled once at import time because it is applied to every chunk
# on load and again to every retrieved hit.
EMAIL_PATTERN = re.compile(
    r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}",
    flags=re.IGNORECASE,
)
def _iter_jsonl(path):
    """Yield one decoded JSON object per non-blank line of the file at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank / whitespace-only lines (e.g. a stray empty line at
            # the end of the file); json.loads would raise JSONDecodeError.
            if line.strip():
                yield json.loads(line)


def load_index_and_meta(
    meta_path,
    chunks_path,
    index_path,
    model_name="all-MiniLM-L6-v2",
):
    """Load the retrieval artifacts: metadata, chunk texts, index and embedder.

    Args:
        meta_path: Path to a JSON Lines file; each non-blank line is decoded
            and appended to the metadata list in file order.
        chunks_path: Path to a JSON Lines file whose objects carry "page",
            "chunk_id" and "text" keys.
        index_path: Path passed to ``faiss.read_index``.
        model_name: Name handed to ``SentenceTransformer``. Defaults to the
            previously hard-coded "all-MiniLM-L6-v2".

    Returns:
        A tuple ``(meta_list, mapping, index, embed_model)`` where
        ``meta_list`` is the list of metadata objects, ``mapping`` maps
        ``(page, chunk_id)`` to the chunk text with e-mail addresses replaced
        by "[REDACTED_EMAIL]", ``index`` is the loaded FAISS index and
        ``embed_model`` is the sentence-transformer instance.

    Raises:
        OSError: If one of the JSON Lines files cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a chunk object lacks "page", "chunk_id" or "text".
    """
    # NOTE(review): retrieve_top_k indexes this list by FAISS id, so the line
    # order presumably mirrors the order vectors were added - confirm.
    meta_list = list(_iter_jsonl(meta_path))

    # Redact at load time so raw addresses are never kept in memory.
    mapping = {
        (obj["page"], obj["chunk_id"]): EMAIL_PATTERN.sub(
            "[REDACTED_EMAIL]", obj["text"]
        )
        for obj in _iter_jsonl(chunks_path)
    }

    index = faiss.read_index(index_path)
    embed_model = SentenceTransformer(model_name)
    return meta_list, mapping, index, embed_model
def retrieve_top_k(query, top_k, min_score, index, embed_model, meta_list, mapping):
    """Return the chunks most similar to *query*, filtered by score.

    The query is embedded with ``embed_model``, cast to float32 and
    L2-normalised in place before ``index.search`` is asked for ``top_k``
    neighbours. A hit is dropped when its score is below ``min_score`` or its
    id does not address an entry of ``meta_list``.

    Args:
        query: Query text to embed.
        top_k: Number of neighbours requested from the index.
        min_score: Hits scoring strictly below this value are skipped.
        index: Object exposing ``search(vectors, k)`` returning (scores, ids).
        embed_model: Object exposing ``encode(texts, convert_to_numpy=True)``.
        meta_list: Sequence of dicts with "page" and "chunk_id" keys,
            addressed by the ids the index returns.
        mapping: Dict from ``(page, chunk_id)`` to chunk text.

    Returns:
        A list of dicts with keys "score", "page", "chunk_id" and "text", in
        the order the index returned them. Missing chunk texts become "" and
        e-mail addresses in the text are replaced by "[REDACTED_EMAIL]".
    """
    query_vec = embed_model.encode([query], convert_to_numpy=True)
    query_vec = query_vec.astype("float32")
    faiss.normalize_L2(query_vec)

    scores, ids = index.search(query_vec, top_k)

    hits = []
    # Only one query vector was submitted, so only row 0 carries results.
    for score, row in zip(scores[0], ids[0]):
        # Skip weak matches and ids with no metadata entry (negative ids are
        # presumably the index's "no neighbour" padding - confirm).
        if score < min_score or not (0 <= row < len(meta_list)):
            continue

        entry = meta_list[row]
        page = entry["page"]
        chunk = entry["chunk_id"]

        # Redact again in case the caller supplied an unredacted mapping.
        raw_text = mapping.get((page, chunk), "")
        hit = {
            "score": float(score),
            "page": page,
            "chunk_id": chunk,
            "text": EMAIL_PATTERN.sub("[REDACTED_EMAIL]", raw_text),
        }
        hits.append(hit)

    return hits