Vishwanath77 committed on
Commit
535ee95
·
verified ·
1 Parent(s): b9ced55

Upload 2 files

Browse files
Files changed (2) hide show
  1. src/apps/utils/main.py +18 -3
  2. src/apps/utils/retriever.py +35 -19
src/apps/utils/main.py CHANGED
@@ -6,10 +6,25 @@ import pickle
6
  import os
7
 
8
  BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
9
- pkl_path = os.path.join(BASE_DIR, "volumes", "metadata", "new_pdfs_corpus_data.pkl")
 
 
 
 
 
 
 
 
 
 
10
 
11
- with open(pkl_path, "rb") as p:
12
- metadata = pickle.load(p)
 
 
 
 
 
13
 
14
  # ids = list(metadata.keys())
15
 
 
import os

BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))


def get_path(folder, filename):
    """Resolve *filename* under *folder*, trying both known project layouts.

    Checks the local ``volumes/<folder>/<filename>`` layout first, then the
    root-level ``<folder>/<filename>`` layout used on Hugging Face. If the
    file exists in neither place, the ``volumes/`` candidate is returned
    anyway so that any downstream "file not found" error points at the
    primary expected path.
    """
    candidates = (
        os.path.join(BASE_DIR, "volumes", folder, filename),
        os.path.join(BASE_DIR, folder, filename),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    # Neither layout has the file: report the primary (volumes/) location.
    return candidates[0]
20
 
21
pkl_path = get_path("metadata", "new_pdfs_corpus_data.pkl")

# Default to an empty mapping so `metadata` is always bound; without this,
# a missing pickle file leaves `metadata` undefined and any later access
# (e.g. metadata.keys() below) raises NameError instead of a clear message.
metadata = {}

if not os.path.exists(pkl_path):
    print(f"⚠️ ERROR: Metadata file not found at {pkl_path}")
else:
    # NOTE(review): pickle.load is only safe because this .pkl ships with the
    # app; never point this at user-supplied data.
    with open(pkl_path, "rb") as p:
        metadata = pickle.load(p)

# ids = list(metadata.keys())
30
 
src/apps/utils/retriever.py CHANGED
@@ -1,19 +1,35 @@
1
- import os
2
- os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # Temporary fix
3
- os.environ["FAISS_NO_OPENMP"] = "1" # Prevent FAISS from using OpenMP
4
-
5
- import faiss
6
- from numpy.linalg import norm
7
-
8
- BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
9
- index_path = os.path.join(BASE_DIR, "volumes", "indexes", "law_corpus_index2.bin")
10
- if not os.path.exists(index_path):
11
- index_path = os.path.join(BASE_DIR, "indexes", "law_corpus_index2.bin")
12
- index = faiss.read_index(index_path)
13
- print("Index loaded successfully!")
14
- print("Number of vectors in the index:", index.ntotal)
15
-
16
- def vector_db_retriever(query_embeddings, top_k=10):
17
- query_embeddings = query_embeddings / norm(query_embeddings[0])
18
- distances, indices = index.search(query_embeddings, top_k)
19
- return indices, distances
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # Temporary fix
3
+ os.environ["FAISS_NO_OPENMP"] = "1" # Prevent FAISS from using OpenMP
4
+
5
+ import faiss
6
+ from numpy.linalg import norm
7
+
8
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))


def get_path(folder, filename):
    """Locate *filename* inside *folder* across the two supported layouts.

    Prefers the local ``volumes/`` tree; falls back to the repo-root layout
    used on Hugging Face. When the file exists in neither place, the
    ``volumes/`` path is still returned so the eventual error names the
    primary expected location.
    """
    primary = os.path.join(BASE_DIR, "volumes", folder, filename)
    fallback = os.path.join(BASE_DIR, folder, filename)
    if os.path.exists(primary):
        return primary
    if os.path.exists(fallback):
        return fallback
    return primary
19
+
20
index_path = get_path("indexes", "law_corpus_index2.bin")

# Fail fast with a clear error: the previous code printed a warning, then
# fell through to faiss.read_index anyway, which crashes on a missing file
# with an opaque RuntimeError.
if not os.path.exists(index_path):
    raise FileNotFoundError(f"⚠️ ERROR: Index file not found at {index_path}")

index = faiss.read_index(index_path)
print("Index loaded successfully!")
print("Number of vectors in the index:", index.ntotal)
31
+
32
def vector_db_retriever(query_embeddings, top_k=10):
    """Search the FAISS index for the nearest neighbours of each query.

    Parameters
    ----------
    query_embeddings : 2-D array, shape (n_queries, dim)
        Raw (un-normalised) query vectors.
    top_k : int, optional
        Number of neighbours to return per query (default 10).

    Returns
    -------
    tuple
        ``(indices, distances)`` as produced by ``index.search``, one row
        per query.
    """
    # Normalise each row by its OWN L2 norm. The previous code divided every
    # row by norm(query_embeddings[0]) — the first row's norm — which is only
    # correct for a single query; for batches it mis-scales rows 1..n.
    # (Single-query behaviour is unchanged.)
    # NOTE(review): normalising implies an inner-product/cosine index — TODO
    # confirm the index type matches.
    norms = norm(query_embeddings, axis=1, keepdims=True)
    query_embeddings = query_embeddings / norms
    distances, indices = index.search(query_embeddings, top_k)
    return indices, distances