Spaces:
Running
Running
Upload 2 files
Browse files- src/apps/utils/main.py +18 -3
- src/apps/utils/retriever.py +35 -19
src/apps/utils/main.py
CHANGED
|
@@ -6,10 +6,25 @@ import pickle
|
|
| 6 |
import os
|
| 7 |
|
| 8 |
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# ids = list(metadata.keys())
|
| 15 |
|
|
|
|
| 6 |
import os
|
| 7 |
|
| 8 |
# Four dirname() calls walk up from this file's absolute path; with the file
# at src/apps/utils/main.py that lands on the directory that contains src/.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
| 9 |
+
def get_path(folder, filename):
    """Locate ``filename`` inside ``folder`` under ``BASE_DIR``.

    Two layouts are probed, in order:

    1. ``BASE_DIR/volumes/<folder>/<filename>`` (local structure)
    2. ``BASE_DIR/<folder>/<filename>`` (Hugging Face structure)

    The first candidate that exists on disk is returned. When neither
    exists, the ``volumes`` candidate is returned anyway so that the
    caller's error refers to the first expected location.
    """
    volumes_candidate = os.path.join(BASE_DIR, "volumes", folder, filename)
    root_candidate = os.path.join(BASE_DIR, folder, filename)

    for candidate in (volumes_candidate, root_candidate):
        if os.path.exists(candidate):
            return candidate

    # Nothing found: hand back the primary location for error reporting.
    return volumes_candidate
|
| 20 |
|
| 21 |
+
# Resolve and load the pickled corpus metadata at import time.
pkl_path = get_path("metadata", "new_pdfs_corpus_data.pkl")

if not os.path.exists(pkl_path):
    print(f"⚠️ ERROR: Metadata file not found at {pkl_path}")
    # Keep the module importable and the name bound: without this, any later
    # reference to `metadata` fails with NameError instead of a clear
    # "no data" condition.
    # NOTE(review): an empty dict is assumed to be a valid stand-in because
    # the metadata appears to be used as a mapping (`metadata.keys()`) —
    # confirm against callers.
    metadata = {}
else:
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load metadata files that come from a trusted source.
    with open(pkl_path, "rb") as p:
        metadata = pickle.load(p)
|
| 28 |
|
| 29 |
# ids = list(metadata.keys())
|
| 30 |
|
src/apps/utils/retriever.py
CHANGED
|
@@ -1,19 +1,35 @@
|
|
| 1 |
-
import os
|
| 2 |
-
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # Temporary fix
|
| 3 |
-
os.environ["FAISS_NO_OPENMP"] = "1" # Prevent FAISS from using OpenMP
|
| 4 |
-
|
| 5 |
-
import faiss
|
| 6 |
-
from numpy.linalg import norm
|
| 7 |
-
|
| 8 |
-
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # Temporary fix
|
| 3 |
+
os.environ["FAISS_NO_OPENMP"] = "1" # Prevent FAISS from using OpenMP
|
| 4 |
+
|
| 5 |
+
import faiss
|
| 6 |
+
from numpy.linalg import norm
|
| 7 |
+
|
| 8 |
+
# Four dirname() calls walk up from this file's absolute path; with the file
# at src/apps/utils/retriever.py that lands on the directory that contains src/.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
| 9 |
+
def get_path(folder, filename):
    """Return the on-disk location of ``filename`` within ``folder``.

    The standard ``BASE_DIR/volumes/<folder>/<filename>`` location is
    preferred. If it does not exist, the root-level
    ``BASE_DIR/<folder>/<filename>`` location is used when present.
    When neither exists, the ``volumes`` location is returned unchanged.
    """
    primary = os.path.join(BASE_DIR, "volumes", folder, filename)
    if os.path.exists(primary):
        return primary

    fallback = os.path.join(BASE_DIR, folder, filename)
    return fallback if os.path.exists(fallback) else primary
|
| 19 |
+
|
| 20 |
+
# Resolve and load the FAISS index at import time.
index_path = get_path("indexes", "law_corpus_index2.bin")

# A missing file is only reported here. No placeholder index is created:
# the read below is still attempted and is left to fail on its own if the
# file truly does not exist.
if not os.path.exists(index_path):
    print(f"⚠️ ERROR: Index file not found at {index_path}")

index = faiss.read_index(index_path)
print("Index loaded successfully!")
print("Number of vectors in the index:", index.ntotal)
|
| 31 |
+
|
| 32 |
+
def vector_db_retriever(query_embeddings, top_k=10):
    """Search the module-level FAISS ``index`` for the closest stored vectors.

    Every query vector is scaled to unit L2 norm before the search.

    Args:
        query_embeddings: Query vectors, one per row.
            NOTE(review): presumably a 2-D float32 array, as FAISS search
            expects — confirm against callers.
        top_k: Number of results requested per query vector.

    Returns:
        A tuple ``(indices, distances)`` holding the two arrays produced by
        ``index.search``. Note the order is swapped relative to what
        ``index.search`` itself returns (``distances, indices``).
    """
    # Normalise each row by its OWN norm. Dividing the whole batch by the
    # norm of row 0 is only correct for a single query; with several queries
    # every row after the first would keep a non-unit length.
    row_norms = norm(query_embeddings, axis=-1, keepdims=True)
    query_embeddings = query_embeddings / row_norms

    distances, indices = index.search(query_embeddings, top_k)
    return indices, distances
|