Spaces:
Runtime error
Runtime error
Fnu Mahnoor committed on
Commit ·
ab97519
1
Parent(s): 299a880
Fix inference
Browse files
- app.py +7 -3
- requirements.txt +1 -1
- src/embeddings.py +6 -2
- src/embeddings_utils.py +24 -2
- src/graph_index.py +11 -6
app.py
CHANGED
|
@@ -98,7 +98,7 @@ def main():
|
|
| 98 |
|
| 99 |
try:
|
| 100 |
# 1. Standard Vector Indexing
|
| 101 |
-
session_folder_str = upload_and_index_session(
|
| 102 |
saved_paths,
|
| 103 |
model_name=emb_model_val,
|
| 104 |
max_tokens=tokens,
|
|
@@ -106,7 +106,9 @@ def main():
|
|
| 106 |
)
|
| 107 |
|
| 108 |
session_path = Path(session_folder_str)
|
| 109 |
-
|
|
|
|
|
|
|
| 110 |
# 2. Graph Initialization
|
| 111 |
gm = HierarchicalGraphManager(storage_path=session_path / "graph_data.pkl")
|
| 112 |
|
|
@@ -123,10 +125,12 @@ def main():
|
|
| 123 |
def summarizer(prompt):
|
| 124 |
return generate_answer(prompt, [""], backend=llm_backend, model_name=llm_model_val, use_hf_api=use_hf_api)
|
| 125 |
gm.build_hierarchy(llm_summarizer_callback=summarizer)
|
|
|
|
|
|
|
| 126 |
gm.save() # Persists everything to graph_data.pkl
|
| 127 |
|
| 128 |
|
| 129 |
-
yield f"✅ Success! Session {session_path.name} is fully indexed (Vector + Graph)."
|
| 130 |
|
| 131 |
except Exception as e:
|
| 132 |
logging.exception("Ingestion failed")
|
|
|
|
| 98 |
|
| 99 |
try:
|
| 100 |
# 1. Standard Vector Indexing
|
| 101 |
+
session_folder_str, total_chunks = upload_and_index_session(
|
| 102 |
saved_paths,
|
| 103 |
model_name=emb_model_val,
|
| 104 |
max_tokens=tokens,
|
|
|
|
| 106 |
)
|
| 107 |
|
| 108 |
session_path = Path(session_folder_str)
|
| 109 |
+
|
| 110 |
+
yield f"📦 Phase 1 Complete: Created {total_chunks} semantic chunks."
|
| 111 |
+
|
| 112 |
# 2. Graph Initialization
|
| 113 |
gm = HierarchicalGraphManager(storage_path=session_path / "graph_data.pkl")
|
| 114 |
|
|
|
|
| 125 |
def summarizer(prompt):
|
| 126 |
return generate_answer(prompt, [""], backend=llm_backend, model_name=llm_model_val, use_hf_api=use_hf_api)
|
| 127 |
gm.build_hierarchy(llm_summarizer_callback=summarizer)
|
| 128 |
+
|
| 129 |
+
yield "⏳Finalizing and saving graph..."
|
| 130 |
gm.save() # Persists everything to graph_data.pkl
|
| 131 |
|
| 132 |
|
| 133 |
+
yield f"✅ Success! Session {session_path.name} is fully indexed (Vector + Graph).\nCreated {total_chunks} semantic chunks."
|
| 134 |
|
| 135 |
except Exception as e:
|
| 136 |
logging.exception("Ingestion failed")
|
requirements.txt
CHANGED
|
@@ -87,4 +87,4 @@ python-louvain>=0.16 # The "Community" detection engine
|
|
| 87 |
cdlib # Advanced community detection (optional)
|
| 88 |
|
| 89 |
# --- SPEED OPTIMIZATIONS ---
|
| 90 |
-
lxml # Much faster HTML/Docx parsing
|
|
|
|
| 87 |
cdlib # Advanced community detection (optional)
|
| 88 |
|
| 89 |
# --- SPEED OPTIMIZATIONS ---
|
| 90 |
+
lxml # Much faster HTML/Docx parsing
|
src/embeddings.py
CHANGED
|
@@ -3,6 +3,7 @@ import shutil
|
|
| 3 |
import torch
|
| 4 |
import numpy as np
|
| 5 |
import faiss
|
|
|
|
| 6 |
from pathlib import Path
|
| 7 |
from datetime import datetime
|
| 8 |
from typing import List, Dict, Optional, Tuple
|
|
@@ -139,9 +140,12 @@ def append_file(file_path: str, model_name: str, index_path: str, meta_path: str
|
|
| 139 |
metas.append({"source": str(p), "page": page_num, "chunk_id": i, "text": c})
|
| 140 |
chunks.append(c)
|
| 141 |
|
|
|
|
| 142 |
embeddings = EmbeddingManager.embed(chunks, model_name)
|
| 143 |
add_embeddings_to_index(index_path, embeddings)
|
| 144 |
append_metadata(meta_path, metas)
|
|
|
|
|
|
|
| 145 |
return {"indexed_chunks": len(chunks)}
|
| 146 |
|
| 147 |
def upload_and_index_session(file_paths: list, model_name: str = "nomic-ai/nomic-embed-text-v1", max_tokens: int = 400, overlap_sentences: int = 2,):
|
|
@@ -157,8 +161,8 @@ def upload_and_index_session(file_paths: list, model_name: str = "nomic-ai/nomic
|
|
| 157 |
res = append_file(str(dest), model_name, idx_path, meta_path, force=True, max_tokens=max_tokens, overlap_sentences=overlap_sentences)
|
| 158 |
total += res.get("indexed_chunks", 0)
|
| 159 |
|
| 160 |
-
|
| 161 |
-
return str(upload_root)
|
| 162 |
|
| 163 |
if __name__ == "__main__":
|
| 164 |
# Add your argparse logic here if needed
|
|
|
|
| 3 |
import torch
|
| 4 |
import numpy as np
|
| 5 |
import faiss
|
| 6 |
+
import logging
|
| 7 |
from pathlib import Path
|
| 8 |
from datetime import datetime
|
| 9 |
from typing import List, Dict, Optional, Tuple
|
|
|
|
| 140 |
metas.append({"source": str(p), "page": page_num, "chunk_id": i, "text": c})
|
| 141 |
chunks.append(c)
|
| 142 |
|
| 143 |
+
logging.info(f"Indexing {len(chunks)} chunks and metadata {len(metas)}.")
|
| 144 |
embeddings = EmbeddingManager.embed(chunks, model_name)
|
| 145 |
add_embeddings_to_index(index_path, embeddings)
|
| 146 |
append_metadata(meta_path, metas)
|
| 147 |
+
|
| 148 |
+
logging.info(f"Appended file {file_path} to index and metadata.")
|
| 149 |
return {"indexed_chunks": len(chunks)}
|
| 150 |
|
| 151 |
def upload_and_index_session(file_paths: list, model_name: str = "nomic-ai/nomic-embed-text-v1", max_tokens: int = 400, overlap_sentences: int = 2,):
|
|
|
|
| 161 |
res = append_file(str(dest), model_name, idx_path, meta_path, force=True, max_tokens=max_tokens, overlap_sentences=overlap_sentences)
|
| 162 |
total += res.get("indexed_chunks", 0)
|
| 163 |
|
| 164 |
+
logging.info(f"Session created at {upload_root}. Total chunks: {total}")
|
| 165 |
+
return str(upload_root), total
|
| 166 |
|
| 167 |
if __name__ == "__main__":
|
| 168 |
# Add your argparse logic here if needed
|
src/embeddings_utils.py
CHANGED
|
@@ -70,15 +70,36 @@ def add_embeddings_to_index(index_path: str, embeddings: np.ndarray):
|
|
| 70 |
|
| 71 |
faiss.write_index(idx, index_path)
|
| 72 |
|
| 73 |
-
|
|
|
|
| 74 |
"""
|
| 75 |
Efficiently appends to a pickle file using 'ab' (append binary) mode.
|
| 76 |
This avoids loading the entire existing metadata list into memory.
|
|
|
|
| 77 |
"""
|
| 78 |
os.makedirs(os.path.dirname(meta_path), exist_ok=True)
|
|
|
|
|
|
|
| 79 |
with open(meta_path, "ab") as f:
|
| 80 |
-
# Pickle can store multiple objects in one file; load_metadata handles this.
|
| 81 |
pickle.dump(new_meta, f, protocol=pickle.HIGHEST_PROTOCOL)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
def load_metadata(path: str) -> list:
|
| 84 |
"""Loads all objects from an appended pickle file into a single flat list."""
|
|
@@ -93,6 +114,7 @@ def load_metadata(path: str) -> list:
|
|
| 93 |
break
|
| 94 |
return all_data
|
| 95 |
|
|
|
|
| 96 |
def compute_embeddings(
|
| 97 |
texts: List[str],
|
| 98 |
model_name: str = "nomic-ai/nomic-embed-text-v1",
|
|
|
|
| 70 |
|
| 71 |
faiss.write_index(idx, index_path)
|
| 72 |
|
| 73 |
+
|
| 74 |
+
def append_metadata(meta_path: str, new_meta: list) -> int:
|
| 75 |
"""
|
| 76 |
Efficiently appends to a pickle file using 'ab' (append binary) mode.
|
| 77 |
This avoids loading the entire existing metadata list into memory.
|
| 78 |
+
And returns the TOTAL count of chunks in the file.
|
| 79 |
"""
|
| 80 |
os.makedirs(os.path.dirname(meta_path), exist_ok=True)
|
| 81 |
+
|
| 82 |
+
# 1. Perform the append
|
| 83 |
with open(meta_path, "ab") as f:
|
|
|
|
| 84 |
pickle.dump(new_meta, f, protocol=pickle.HIGHEST_PROTOCOL)
|
| 85 |
+
|
| 86 |
+
# 2. Calculate the total size by reading the "stacked" objects
|
| 87 |
+
total_count = 0
|
| 88 |
+
try:
|
| 89 |
+
with open(meta_path, "rb") as f:
|
| 90 |
+
while True:
|
| 91 |
+
try:
|
| 92 |
+
data = pickle.load(f)
|
| 93 |
+
# If data is a list, add its length; if it's a single dict, add 1
|
| 94 |
+
total_count += len(data) if isinstance(data, list) else 1
|
| 95 |
+
except EOFError:
|
| 96 |
+
break
|
| 97 |
+
except Exception as e:
|
| 98 |
+
logging.error(f"Error calculating metadata size: {e}")
|
| 99 |
+
|
| 100 |
+
logging.info(f"Total metadata chunks after append: {total_count}")
|
| 101 |
+
return total_count
|
| 102 |
+
|
| 103 |
|
| 104 |
def load_metadata(path: str) -> list:
|
| 105 |
"""Loads all objects from an appended pickle file into a single flat list."""
|
|
|
|
| 114 |
break
|
| 115 |
return all_data
|
| 116 |
|
| 117 |
+
|
| 118 |
def compute_embeddings(
|
| 119 |
texts: List[str],
|
| 120 |
model_name: str = "nomic-ai/nomic-embed-text-v1",
|
src/graph_index.py
CHANGED
|
@@ -6,7 +6,7 @@ from pathlib import Path
|
|
| 6 |
import logging
|
| 7 |
import numpy as np
|
| 8 |
import faiss
|
| 9 |
-
|
| 10 |
class HierarchicalGraphManager:
|
| 11 |
def __init__(self, storage_path: str = "data/uploads/graph_data.pkl"):
|
| 12 |
self.storage_path = Path(storage_path)
|
|
@@ -20,7 +20,7 @@ class HierarchicalGraphManager:
|
|
| 20 |
Builds a semantic proximity graph by extracting vectors from FAISS.
|
| 21 |
No LLM is used for the graph construction phase.
|
| 22 |
"""
|
| 23 |
-
logging.info(f"🕸️ Building Semantic Graph from: {idx_path.name}")
|
| 24 |
|
| 25 |
try:
|
| 26 |
# 1. Load FAISS index
|
|
@@ -42,14 +42,19 @@ class HierarchicalGraphManager:
|
|
| 42 |
return
|
| 43 |
|
| 44 |
# 3. Load Metadata (Chunks)
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
# 4. Semantic Similarity Matrix
|
| 49 |
# Using CPU-efficient dot product on normalized vectors
|
| 50 |
faiss.normalize_L2(embeddings)
|
| 51 |
sim_matrix = np.dot(embeddings, embeddings.T)
|
| 52 |
-
|
|
|
|
| 53 |
# 5. Build Relationships based on Semantic Threshold
|
| 54 |
triples = []
|
| 55 |
for i in range(n_total):
|
|
|
|
| 6 |
import logging
|
| 7 |
import numpy as np
|
| 8 |
import faiss
|
| 9 |
+
from .embeddings_utils import load_metadata
|
| 10 |
class HierarchicalGraphManager:
|
| 11 |
def __init__(self, storage_path: str = "data/uploads/graph_data.pkl"):
|
| 12 |
self.storage_path = Path(storage_path)
|
|
|
|
| 20 |
Builds a semantic proximity graph by extracting vectors from FAISS.
|
| 21 |
No LLM is used for the graph construction phase.
|
| 22 |
"""
|
| 23 |
+
logging.info(f"🕸️ Building Semantic Graph from: {meta_path}, {idx_path.name}")
|
| 24 |
|
| 25 |
try:
|
| 26 |
# 1. Load FAISS index
|
|
|
|
| 42 |
return
|
| 43 |
|
| 44 |
# 3. Load Metadata (Chunks)
|
| 45 |
+
metadata = load_metadata(meta_path)
|
| 46 |
+
|
| 47 |
+
if n_total != len(metadata):
|
| 48 |
+
logging.warning(f"⚠️ Data Mismatch: FAISS index has {n_total} items, "
|
| 49 |
+
f"but Metadata has {len(metadata)}. Processing first {n_total}.")
|
| 50 |
+
metadata = metadata[:n_total]
|
| 51 |
+
|
| 52 |
# 4. Semantic Similarity Matrix
|
| 53 |
# Using CPU-efficient dot product on normalized vectors
|
| 54 |
faiss.normalize_L2(embeddings)
|
| 55 |
sim_matrix = np.dot(embeddings, embeddings.T)
|
| 56 |
+
logging.info(f"Computed similarity matrix of shape {sim_matrix.shape}")
|
| 57 |
+
|
| 58 |
# 5. Build Relationships based on Semantic Threshold
|
| 59 |
triples = []
|
| 60 |
for i in range(n_total):
|