|
|
|
|
|
import json
|
|
|
import numpy as np
|
|
|
from sentence_transformers import SentenceTransformer
|
|
|
import faiss
|
|
|
from pathlib import Path
|
|
|
|
|
|
# Directory containing the prebuilt FAISS index and its metadata sidecar file.
DATA_DIR = Path("data")



# Hugging Face model id used to embed queries; must match the model that
# built the vectors stored in the index, or similarity scores are meaningless.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"



# Default number of nearest neighbours returned per query.
TOP_K = 5
|
|
|
|
|
|
def load_index():
    """Load and return the FAISS index persisted at DATA_DIR/vector_store.index."""
    index_path = DATA_DIR / "vector_store.index"
    return faiss.read_index(str(index_path))
|
|
|
|
|
|
def load_metadata():
    """Return the per-vector metadata list parsed from DATA_DIR/metadata.json."""
    metadata_path = DATA_DIR / "metadata.json"
    return json.loads(metadata_path.read_text(encoding="utf-8"))
|
|
|
|
|
|
def embed_query(model, query):
    """Embed a single query string and L2-normalize it in place.

    Normalizing makes inner-product search equivalent to cosine similarity,
    assuming the indexed vectors were normalized the same way.
    Returns a (1, dim) numpy array.
    """
    vectors = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(vectors)
    return vectors
|
|
|
|
|
|
def search(query, top_k=TOP_K):
    """Return the top_k most similar stored documents for a query string.

    Loads the embedding model, FAISS index, and metadata on every call
    (acceptable for a one-shot CLI; cache these if calling repeatedly).

    Args:
        query: Natural-language query text.
        top_k: Maximum number of results to return.

    Returns:
        A list of dicts, each {"score": float, "doc": metadata_entry},
        ordered from most to least similar. May contain fewer than top_k
        entries if the index holds fewer vectors.
    """
    model = SentenceTransformer(EMBED_MODEL)
    index = load_index()
    metadata = load_metadata()

    q_emb = embed_query(model, query)
    D, I = index.search(q_emb.astype("float32"), top_k)

    results = []
    for score, idx in zip(D[0], I[0]):
        # FAISS pads with -1 when fewer than top_k vectors exist; without
        # this guard, metadata[-1] would silently return the LAST entry.
        if idx < 0:
            continue
        results.append({"score": float(score), "doc": metadata[idx]})
    return results
|
|
|
|
|
|
if __name__ == "__main__":
    q = input("Enter your question/query: ").strip()
    if not q:
        print("Empty query; nothing to search.")
    else:
        # Use the module-level TOP_K default rather than re-hard-coding it.
        res = search(q)
        if not res:
            print("No results found.")
        for i, r in enumerate(res, 1):
            print(f"\n=== Result {i} (score={r['score']:.4f}) ===")
            print("Source:", r["doc"]["source_file"], "page:", r["doc"]["page"])
            # Truncate long chunks so the terminal output stays readable.
            print("Preview:", r["doc"]["text"][:800])
|
|
|
|