NeerajRavi committed on
Commit
de1c3c3
·
verified ·
1 Parent(s): 38c1d86

Update helpers/live_sources.py

Browse files
Files changed (1) hide show
  1. helpers/live_sources.py +42 -41
helpers/live_sources.py CHANGED
@@ -1,42 +1,43 @@
1
- # Retrieves links for all modules
2
- import json
3
- from pathlib import Path
4
- import faiss
5
- from sentence_transformers import SentenceTransformer
6
- DATA_DIR = Path("data")
7
- VECTOR_DIR = DATA_DIR / "vector_store"
8
- LIVE_FAISS_INDEX_PATH = VECTOR_DIR / "live_faiss.index"
9
- LIVE_METADATA_PATH = VECTOR_DIR / "live_metadata.json"
10
- index = faiss.read_index(str(LIVE_FAISS_INDEX_PATH))
11
- with open(LIVE_METADATA_PATH, "r", encoding="utf-8") as f:
12
- METADATA = json.load(f)
13
- model = SentenceTransformer("all-MiniLM-L6-v2")
14
- def retrieve_live_sources(
15
- query: str,
16
- *,
17
- top_k: int = 2,
18
- search_k: int = 2000
19
- ):
20
- query_embedding = model.encode(
21
- [query],
22
- normalize_embeddings=True,
23
- convert_to_numpy=True
24
- )
25
- scores, indices = index.search(query_embedding, search_k)
26
- results = []
27
- seen_urls = set()
28
- for score, idx in zip(scores[0], indices[0]):
29
- meta = METADATA[idx]
30
- url = meta.get("document_path")
31
- if not url or url in seen_urls:
32
- continue
33
- seen_urls.add(url)
34
- results.append({
35
- "url": url,
36
- "authority": meta.get("authority"),
37
- "description": meta.get("text"),
38
- "similarity": float(score)
39
- })
40
- if len(results) >= top_k:
41
- break
 
42
  return results
 
1
+ # Retrieves links for all modules
2
+ import json
3
+ from pathlib import Path
4
+ import faiss
5
+ from sentence_transformers import SentenceTransformer
6
+ BASE_DIR = Path(__file__).resolve().parent.parent
7
+ DATA_DIR = BASE_DIR / "data"
8
+ VECTOR_DIR = DATA_DIR / "vector_store"
9
+ LIVE_FAISS_INDEX_PATH = VECTOR_DIR / "live_faiss.index"
10
+ LIVE_METADATA_PATH = VECTOR_DIR / "live_metadata.json"
11
+ index = faiss.read_index(str(LIVE_FAISS_INDEX_PATH))
12
+ with open(LIVE_METADATA_PATH, "r", encoding="utf-8") as f:
13
+ METADATA = json.load(f)
14
+ model = SentenceTransformer("all-MiniLM-L6-v2")
15
def retrieve_live_sources(
    query: str,
    *,
    top_k: int = 2,
    search_k: int = 2000
):
    """Return up to *top_k* unique live-source links most similar to *query*.

    The query is embedded with the module-level SentenceTransformer model and
    matched against the FAISS index; *search_k* candidates are retrieved so
    that duplicates (same ``document_path``) can be collapsed before the
    *top_k* cutoff is applied.

    Args:
        query: Free-text query to embed and search for.
        top_k: Maximum number of unique results to return.
        search_k: Number of raw nearest neighbours to pull from the index
            before URL de-duplication.

    Returns:
        A list of dicts with keys ``url``, ``authority``, ``description``,
        and ``similarity`` (cosine score as a float), ordered by decreasing
        similarity.
    """
    query_embedding = model.encode(
        [query],
        normalize_embeddings=True,
        convert_to_numpy=True
    )
    scores, indices = index.search(query_embedding, search_k)

    results = []
    seen_urls = set()
    for score, idx in zip(scores[0], indices[0]):
        # FAISS pads `indices` with -1 when fewer than search_k vectors
        # exist; METADATA[-1] would silently return the LAST entry as a
        # bogus match, so such slots must be skipped.
        if idx < 0:
            continue
        meta = METADATA[idx]
        url = meta.get("document_path")
        if not url or url in seen_urls:
            continue
        seen_urls.add(url)
        results.append({
            "url": url,
            "authority": meta.get("authority"),
            "description": meta.get("text"),
            "similarity": float(score)
        })
        if len(results) >= top_k:
            break
    return results