Railway / helpers /live_sources.py
NeerajRavi's picture
Update helpers/live_sources.py
de1c3c3 verified
# Retrieves links for all modules
import json
from pathlib import Path
import faiss
from sentence_transformers import SentenceTransformer
# On-disk layout, anchored two directory levels above this file so the paths
# do not depend on the process's current working directory.
BASE_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = BASE_DIR.joinpath("data")
VECTOR_DIR = DATA_DIR.joinpath("vector_store")
LIVE_FAISS_INDEX_PATH = VECTOR_DIR.joinpath("live_faiss.index")
LIVE_METADATA_PATH = VECTOR_DIR.joinpath("live_metadata.json")

# The resources below are loaded once, at import time, and shared by every
# call to retrieve_live_sources().

# FAISS expects a plain string path rather than a Path object.
index = faiss.read_index(str(LIVE_FAISS_INDEX_PATH))

# METADATA is indexed by the integer labels returned from index.search();
# presumably a JSON array whose order matches the index rows - verify
# against the script that builds the vector store.
with LIVE_METADATA_PATH.open("r", encoding="utf-8") as f:
    METADATA = json.load(f)

# Sentence-embedding model used to encode queries.
# NOTE(review): should be the same model the index was built with - confirm.
model = SentenceTransformer("all-MiniLM-L6-v2")
def retrieve_live_sources(
    query: str,
    *,
    top_k: int = 2,
    search_k: int = 2000
) -> list:
    """Return up to ``top_k`` distinct live sources matching ``query``.

    The query is embedded with the module-level ``model`` (normalised
    embeddings), the module-level FAISS ``index`` is searched for raw
    candidates, and the candidates are walked in the order FAISS returned
    them, keeping only the first hit for each distinct ``document_path``.

    Args:
        query: Free-text query to look up.
        top_k: Maximum number of distinct sources to return. Values below 1
            produce an empty list.
        search_k: Number of raw candidates requested from FAISS before
            de-duplication. It is clamped to the number of vectors in the
            index; values below 1 produce an empty list.

    Returns:
        A list of dicts, in FAISS result order, each with the keys ``url``
        (the entry's ``document_path``), ``authority``, ``description``
        (the entry's ``text``) and ``similarity`` (the raw FAISS score as a
        Python float).

    Raises:
        IndexError: If FAISS returns a label with no matching ``METADATA``
            entry (assuming ``METADATA`` is a list), i.e. the index and
            metadata files are out of sync.
    """
    if top_k <= 0:
        return []

    # Never ask FAISS for more neighbours than the index holds: the surplus
    # slots would only come back as "-1" padding.
    candidate_count = min(search_k, index.ntotal)
    if candidate_count <= 0:
        return []

    query_embedding = model.encode(
        [query],
        normalize_embeddings=True,
        convert_to_numpy=True
    )

    scores, indices = index.search(query_embedding, candidate_count)

    results = []
    seen_urls = set()

    # Only one query was submitted, so row 0 holds all of its candidates.
    for score, idx in zip(scores[0], indices[0]):
        # FAISS marks "no result" slots with label -1. Without this guard,
        # METADATA[-1] would silently resolve to the *last* metadata entry
        # and surface it with a meaningless sentinel score.
        if idx < 0:
            continue

        meta = METADATA[int(idx)]
        url = meta.get("document_path")

        # Skip entries without a link and further hits for a URL that has
        # already been reported.
        if not url or url in seen_urls:
            continue

        seen_urls.add(url)
        results.append({
            "url": url,
            "authority": meta.get("authority"),
            "description": meta.get("text"),
            "similarity": float(score)
        })

        if len(results) >= top_k:
            break

    return results