Spaces:

Mahnoor00
/

advance-multidoc-rag

Runtime error

App Files Files Community

Fnu Mahnoor committed on Jan 17

Commit

ab97519

1 Parent(s): 299a880

Fix inference

Browse files

Files changed (5) hide show

app.py +7 -3
requirements.txt +1 -1
src/embeddings.py +6 -2
src/embeddings_utils.py +24 -2
src/graph_index.py +11 -6

app.py CHANGED Viewed

@@ -98,7 +98,7 @@ def main():
             try:
                 # 1. Standard Vector Indexing
-                session_folder_str = upload_and_index_session(
                     saved_paths,
                     model_name=emb_model_val,
                     max_tokens=tokens,
@@ -106,7 +106,9 @@ def main():
                 )
                 session_path = Path(session_folder_str)
                 # 2. Graph Initialization
                 gm = HierarchicalGraphManager(storage_path=session_path / "graph_data.pkl")
@@ -123,10 +125,12 @@ def main():
                 def summarizer(prompt):
                     return generate_answer(prompt, [""], backend=llm_backend, model_name=llm_model_val, use_hf_api=use_hf_api)
                 gm.build_hierarchy(llm_summarizer_callback=summarizer)
                 gm.save() # Persists everything to graph_data.pkl
-                yield f"✅ Success! Session {session_path.name} is fully indexed (Vector + Graph)."
             except Exception as e:
                 logging.exception("Ingestion failed")

             try:
                 # 1. Standard Vector Indexing
+                session_folder_str, total_chunks = upload_and_index_session(
                     saved_paths,
                     model_name=emb_model_val,
                     max_tokens=tokens,
                 )
                 session_path = Path(session_folder_str)
+                yield f"📦 Phase 1 Complete: Created {total_chunks} semantic chunks."
                 # 2. Graph Initialization
                 gm = HierarchicalGraphManager(storage_path=session_path / "graph_data.pkl")
                 def summarizer(prompt):
                     return generate_answer(prompt, [""], backend=llm_backend, model_name=llm_model_val, use_hf_api=use_hf_api)
                 gm.build_hierarchy(llm_summarizer_callback=summarizer)
+                yield  "⏳Finalizing and saving graph..."
                 gm.save() # Persists everything to graph_data.pkl
+                yield f"✅ Success! Session {session_path.name} is fully indexed (Vector + Graph).\nCreated {total_chunks} semantic chunks."
             except Exception as e:
                 logging.exception("Ingestion failed")

requirements.txt CHANGED Viewed

@@ -87,4 +87,4 @@ python-louvain>=0.16     # The "Community" detection engine
 cdlib                    # Advanced community detection (optional)
 # --- SPEED OPTIMIZATIONS ---
-lxml                     # Much faster HTML/Docx parsing

 cdlib                    # Advanced community detection (optional)
 # --- SPEED OPTIMIZATIONS ---
+lxml                     # Much faster HTML/Docx parsing

src/embeddings.py CHANGED Viewed

@@ -3,6 +3,7 @@ import shutil
 import torch
 import numpy as np
 import faiss
 from pathlib import Path
 from datetime import datetime
 from typing import List, Dict, Optional, Tuple
@@ -139,9 +140,12 @@ def append_file(file_path: str, model_name: str, index_path: str, meta_path: str
             metas.append({"source": str(p), "page": page_num, "chunk_id": i, "text": c})
             chunks.append(c)
     embeddings = EmbeddingManager.embed(chunks, model_name)
     add_embeddings_to_index(index_path, embeddings)
     append_metadata(meta_path, metas)
     return {"indexed_chunks": len(chunks)}
 def upload_and_index_session(file_paths: list, model_name: str = "nomic-ai/nomic-embed-text-v1",  max_tokens: int = 400, overlap_sentences: int = 2,):
@@ -157,8 +161,8 @@ def upload_and_index_session(file_paths: list, model_name: str = "nomic-ai/nomic
         res = append_file(str(dest), model_name, idx_path, meta_path, force=True, max_tokens=max_tokens, overlap_sentences=overlap_sentences)
         total += res.get("indexed_chunks", 0)
-    print (f"Session created at {upload_root}. Total chunks: {total}")
-    return str(upload_root)
 if __name__ == "__main__":
     # Add your argparse logic here if needed

 import torch
 import numpy as np
 import faiss
+import logging
 from pathlib import Path
 from datetime import datetime
 from typing import List, Dict, Optional, Tuple
             metas.append({"source": str(p), "page": page_num, "chunk_id": i, "text": c})
             chunks.append(c)
+    logging.info(f"Indexing {len(chunks)} chunks and metadata {len(metas)}.")
     embeddings = EmbeddingManager.embed(chunks, model_name)
     add_embeddings_to_index(index_path, embeddings)
     append_metadata(meta_path, metas)
+    logging.info(f"Appended file {file_path} to index and metadata.")
     return {"indexed_chunks": len(chunks)}
 def upload_and_index_session(file_paths: list, model_name: str = "nomic-ai/nomic-embed-text-v1",  max_tokens: int = 400, overlap_sentences: int = 2,):
         res = append_file(str(dest), model_name, idx_path, meta_path, force=True, max_tokens=max_tokens, overlap_sentences=overlap_sentences)
         total += res.get("indexed_chunks", 0)
+    logging.info(f"Session created at {upload_root}. Total chunks: {total}")
+    return str(upload_root), total
 if __name__ == "__main__":
     # Add your argparse logic here if needed

src/embeddings_utils.py CHANGED Viewed

@@ -70,15 +70,36 @@ def add_embeddings_to_index(index_path: str, embeddings: np.ndarray):
     faiss.write_index(idx, index_path)
-def append_metadata(meta_path: str, new_meta: list):
     """
     Efficiently appends to a pickle file using 'ab' (append binary) mode.
     This avoids loading the entire existing metadata list into memory.
     """
     os.makedirs(os.path.dirname(meta_path), exist_ok=True)
     with open(meta_path, "ab") as f:
-        # Pickle can store multiple objects in one file; load_metadata handles this.
         pickle.dump(new_meta, f, protocol=pickle.HIGHEST_PROTOCOL)
 def load_metadata(path: str) -> list:
     """Loads all objects from an appended pickle file into a single flat list."""
@@ -93,6 +114,7 @@ def load_metadata(path: str) -> list:
                 break
     return all_data
 def compute_embeddings(
     texts: List[str],
     model_name: str = "nomic-ai/nomic-embed-text-v1",

     faiss.write_index(idx, index_path)
def append_metadata(meta_path: str, new_meta: list) -> int:
    """
    Append one batch of metadata to a pickle file and return the running total.

    The batch is written with 'ab' (append binary) mode, so the file ends up
    holding a sequence of independently pickled objects and the existing
    content never has to be loaded just to add to it.

    After the write, the file is re-read from the start to count the stored
    chunks. Records are unpickled one at a time and discarded, but every
    record is still visited, so the cost of each call grows with the size of
    the file.

    Args:
        meta_path: Path of the pickle file. A bare filename (no directory
            component) is accepted and refers to the current directory.
        new_meta: The batch of metadata entries to append as one pickled object.

    Returns:
        Total number of chunks found in the file after the append. A stored
        list contributes its length; any other stored object counts as 1.
        If reading the file back fails, the error is logged and the count
        accumulated up to that point is returned.
    """
    # Imported locally so the function does not rely on a module-level
    # `import logging` that may not be present in this file.
    import logging

    # os.path.dirname() returns "" for a bare filename, and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    parent_dir = os.path.dirname(meta_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # 1. Perform the append
    with open(meta_path, "ab") as f:
        pickle.dump(new_meta, f, protocol=pickle.HIGHEST_PROTOCOL)

    # 2. Calculate the total size by reading the "stacked" objects
    # NOTE: pickle.load must only be used on files this application wrote itself.
    total_count = 0
    try:
        with open(meta_path, "rb") as f:
            while True:
                try:
                    data = pickle.load(f)
                except EOFError:
                    # End of the stacked pickle stream.
                    break
                # If data is a list, add its length; if it's a single object, add 1
                total_count += len(data) if isinstance(data, list) else 1
    except Exception as e:
        # Best effort: the append itself already succeeded, so a failure while
        # counting is reported but does not abort the caller.
        logging.error(f"Error calculating metadata size: {e}")

    logging.info(f"Total metadata chunks after append: {total_count}")
    return total_count
 def load_metadata(path: str) -> list:
     """Loads all objects from an appended pickle file into a single flat list."""
                 break
     return all_data
 def compute_embeddings(
     texts: List[str],
     model_name: str = "nomic-ai/nomic-embed-text-v1",

src/graph_index.py CHANGED Viewed

@@ -6,7 +6,7 @@ from pathlib import Path
 import logging
 import numpy as np
 import faiss
 class HierarchicalGraphManager:
     def __init__(self, storage_path: str = "data/uploads/graph_data.pkl"):
         self.storage_path = Path(storage_path)
@@ -20,7 +20,7 @@ class HierarchicalGraphManager:
         Builds a semantic proximity graph by extracting vectors from FAISS.
         No LLM is used for the graph construction phase.
         """
-        logging.info(f"🕸️ Building Semantic Graph from: {idx_path.name}")
         try:
             # 1. Load FAISS index
@@ -42,14 +42,19 @@ class HierarchicalGraphManager:
                 return
             # 3. Load Metadata (Chunks)
-            with open(meta_path, 'rb') as f:
-                metadata = pickle.load(f)
             # 4. Semantic Similarity Matrix
             # Using CPU-efficient dot product on normalized vectors
             faiss.normalize_L2(embeddings)
             sim_matrix = np.dot(embeddings, embeddings.T)
             # 5. Build Relationships based on Semantic Threshold
             triples = []
             for i in range(n_total):

 import logging
 import numpy as np
 import faiss
+from .embeddings_utils import load_metadata
 class HierarchicalGraphManager:
     def __init__(self, storage_path: str = "data/uploads/graph_data.pkl"):
         self.storage_path = Path(storage_path)
         Builds a semantic proximity graph by extracting vectors from FAISS.
         No LLM is used for the graph construction phase.
         """
+        logging.info(f"🕸️ Building Semantic Graph from: {meta_path}, {idx_path.name}")
         try:
             # 1. Load FAISS index
                 return
             # 3. Load Metadata (Chunks)
+            metadata = load_metadata(meta_path)
+            if n_total != len(metadata):
+                logging.warning(f"⚠️ Data Mismatch: FAISS index has {n_total} items, "
+                                f"but Metadata has {len(metadata)}. Processing first {n_total}.")
+                metadata = metadata[:n_total]
             # 4. Semantic Similarity Matrix
             # Using CPU-efficient dot product on normalized vectors
             faiss.normalize_L2(embeddings)
             sim_matrix = np.dot(embeddings, embeddings.T)
+            logging.info(f"Computed similarity matrix of shape {sim_matrix.shape}")
             # 5. Build Relationships based on Semantic Threshold
             triples = []
             for i in range(n_total):