Spaces:

NeerajRavi
/

Railway

Sleeping

Railway / helpers /live_sources.py

Update helpers/live_sources.py

de1c3c3 verified 3 months ago

1.35 kB

	# Retrieves links for all modules
	import json
	from pathlib import Path
	import faiss
	from sentence_transformers import SentenceTransformer
	# Filesystem layout: BASE_DIR is two directory levels above this file, and
	# the live vector store is kept under BASE_DIR/data/vector_store.
	BASE_DIR = Path(__file__).resolve().parent.parent
	DATA_DIR = BASE_DIR.joinpath("data")
	VECTOR_DIR = DATA_DIR.joinpath("vector_store")
	LIVE_FAISS_INDEX_PATH = VECTOR_DIR.joinpath("live_faiss.index")
	LIVE_METADATA_PATH = VECTOR_DIR.joinpath("live_metadata.json")
	# The resources below are loaded once, when this module is imported.
	# The Path is converted to str before being handed to faiss.read_index.
	index = faiss.read_index(str(LIVE_FAISS_INDEX_PATH))
	# Parsed JSON metadata; retrieve_live_sources looks entries up by the
	# ids that the FAISS search returns.
	METADATA = json.loads(LIVE_METADATA_PATH.read_text(encoding="utf-8"))
	# Sentence-embedding model used to encode incoming queries.
	model = SentenceTransformer("all-MiniLM-L6-v2")
	def retrieve_live_sources(
	    query: str,
	    *,
	    top_k: int = 2,
	    search_k: int = 2000
	):
	    """Return up to ``top_k`` distinct live sources matching ``query``.

	    The query is encoded with the module-level ``model`` (normalized
	    embedding), the module-level FAISS ``index`` is searched for
	    ``search_k`` candidates, and candidates are walked in the order FAISS
	    returns them. Candidates without a ``"document_path"`` or whose
	    ``"document_path"`` was already emitted are skipped, so each URL
	    appears at most once.

	    Args:
	        query: Free-text query to embed and search for.
	        top_k: Maximum number of results to return. Values <= 0 yield an
	            empty list.
	        search_k: Number of candidates requested from the index. It is
	            larger than ``top_k`` so that enough candidates remain after
	            de-duplicating by URL.

	    Returns:
	        A list of dicts with the keys ``"url"``, ``"authority"``,
	        ``"description"`` and ``"similarity"`` (the score reported by the
	        index, converted to ``float``).
	    """
	    # Guard up front: the length check below only runs after an append,
	    # so without this a non-positive top_k would still yield one result.
	    if top_k <= 0:
	        return []

	    query_embedding = model.encode(
	        [query],
	        normalize_embeddings=True,
	        convert_to_numpy=True,
	    )
	    # A single query was encoded, so only row 0 of each output is used.
	    scores, indices = index.search(query_embedding, search_k)

	    results = []
	    seen_urls = set()
	    for score, idx in zip(scores[0], indices[0]):
	        # FAISS pads the label row with -1 when it finds fewer than
	        # search_k neighbours. Indexing METADATA with -1 would silently
	        # wrap around to the last entry and return a bogus source.
	        if idx < 0:
	            continue

	        meta = METADATA[idx]
	        url = meta.get("document_path")
	        if not url or url in seen_urls:
	            continue

	        seen_urls.add(url)
	        results.append({
	            "url": url,
	            "authority": meta.get("authority"),
	            "description": meta.get("text"),
	            "similarity": float(score),
	        })
	        if len(results) >= top_k:
	            break

	    return results