Fnu Mahnoor committed
Commit ab97519 · 1 Parent(s): 299a880

Fix inference
app.py CHANGED
@@ -98,7 +98,7 @@ def main():
 
         try:
             # 1. Standard Vector Indexing
-            session_folder_str = upload_and_index_session(
+            session_folder_str, total_chunks = upload_and_index_session(
                 saved_paths,
                 model_name=emb_model_val,
                 max_tokens=tokens,
@@ -106,7 +106,9 @@ def main():
             )
 
             session_path = Path(session_folder_str)
-
+
+            yield f"📦 Phase 1 Complete: Created {total_chunks} semantic chunks."
+
             # 2. Graph Initialization
             gm = HierarchicalGraphManager(storage_path=session_path / "graph_data.pkl")
 
@@ -123,10 +125,12 @@ def main():
             def summarizer(prompt):
                 return generate_answer(prompt, [""], backend=llm_backend, model_name=llm_model_val, use_hf_api=use_hf_api)
             gm.build_hierarchy(llm_summarizer_callback=summarizer)
+
+            yield "⏳Finalizing and saving graph..."
             gm.save() # Persists everything to graph_data.pkl
 
 
-            yield f"✅ Success! Session {session_path.name} is fully indexed (Vector + Graph)."
+            yield f"✅ Success! Session {session_path.name} is fully indexed (Vector + Graph).\nCreated {total_chunks} semantic chunks."
 
         except Exception as e:
             logging.exception("Ingestion failed")
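For context on the new `yield` statements: the ingestion handler is now a generator that streams phase-by-phase status text instead of reporting only once at the end. A minimal sketch of the pattern, with a hypothetical `ingest` stand-in and a plain console consumer (UI frameworks such as Gradio render each yielded value as it arrives):

```python
# Minimal sketch of a generator-style status handler (hypothetical names).
def ingest(paths):
    yield "📦 Phase 1 Complete: Created 42 semantic chunks."
    yield "⏳ Finalizing and saving graph..."
    yield "✅ Success! Session demo is fully indexed (Vector + Graph)."

for status in ingest(["doc.pdf"]):
    print(status)  # a UI framework would re-render its status widget here
```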
requirements.txt CHANGED
@@ -87,4 +87,4 @@ python-louvain>=0.16 # The "Community" detection engine
 cdlib # Advanced community detection (optional)
 
 # --- SPEED OPTIMIZATIONS ---
-lxml # Much faster HTML/Docx parsing
+lxml # Much faster HTML/Docx parsing
src/embeddings.py CHANGED
@@ -3,6 +3,7 @@ import shutil
 import torch
 import numpy as np
 import faiss
+import logging
 from pathlib import Path
 from datetime import datetime
 from typing import List, Dict, Optional, Tuple
@@ -139,9 +140,12 @@ def append_file(file_path: str, model_name: str, index_path: str, meta_path: str
         metas.append({"source": str(p), "page": page_num, "chunk_id": i, "text": c})
         chunks.append(c)
 
+    logging.info(f"Indexing {len(chunks)} chunks and metadata {len(metas)}.")
     embeddings = EmbeddingManager.embed(chunks, model_name)
     add_embeddings_to_index(index_path, embeddings)
     append_metadata(meta_path, metas)
+
+    logging.info(f"Appended file {file_path} to index and metadata.")
     return {"indexed_chunks": len(chunks)}
 
 def upload_and_index_session(file_paths: list, model_name: str = "nomic-ai/nomic-embed-text-v1", max_tokens: int = 400, overlap_sentences: int = 2,):
@@ -157,8 +161,8 @@ def upload_and_index_session(file_paths: list, model_name: str = "nomic-ai/nomic
         res = append_file(str(dest), model_name, idx_path, meta_path, force=True, max_tokens=max_tokens, overlap_sentences=overlap_sentences)
         total += res.get("indexed_chunks", 0)
 
-    print (f"Session created at {upload_root}. Total chunks: {total}")
-    return str(upload_root)
+    logging.info(f"Session created at {upload_root}. Total chunks: {total}")
+    return str(upload_root), total
 
 if __name__ == "__main__":
     # Add your argparse logic here if needed
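Because `upload_and_index_session` now returns a two-tuple, every caller must unpack both values, as app.py above now does; a caller still expecting a bare path string would silently receive a tuple and then fail at `Path(...)` with a `TypeError`. A minimal sketch of the new contract (the import path and input file are assumptions based on the repo layout):

```python
from src.embeddings import upload_and_index_session

# The function now returns (session_folder, total_chunks), not just the folder.
session_folder_str, total_chunks = upload_and_index_session(
    ["data/uploads/report.pdf"],  # hypothetical input file
    model_name="nomic-ai/nomic-embed-text-v1",
    max_tokens=400,
)
print(f"{total_chunks} chunks indexed under {session_folder_str}")
```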
src/embeddings_utils.py CHANGED
@@ -70,15 +70,36 @@ def add_embeddings_to_index(index_path: str, embeddings: np.ndarray):
 
     faiss.write_index(idx, index_path)
 
-def append_metadata(meta_path: str, new_meta: list):
+
+def append_metadata(meta_path: str, new_meta: list) -> int:
     """
     Efficiently appends to a pickle file using 'ab' (append binary) mode.
     This avoids loading the entire existing metadata list into memory.
+    And returns the TOTAL count of chunks in the file.
     """
     os.makedirs(os.path.dirname(meta_path), exist_ok=True)
+
+    # 1. Perform the append
     with open(meta_path, "ab") as f:
-        # Pickle can store multiple objects in one file; load_metadata handles this.
         pickle.dump(new_meta, f, protocol=pickle.HIGHEST_PROTOCOL)
+
+    # 2. Calculate the total size by reading the "stacked" objects
+    total_count = 0
+    try:
+        with open(meta_path, "rb") as f:
+            while True:
+                try:
+                    data = pickle.load(f)
+                    # If data is a list, add its length; if it's a single dict, add 1
+                    total_count += len(data) if isinstance(data, list) else 1
+                except EOFError:
+                    break
+    except Exception as e:
+        logging.error(f"Error calculating metadata size: {e}")
+
+    logging.info(f"Total metadata chunks after append: {total_count}")
+    return total_count
+
 
 def load_metadata(path: str) -> list:
     """Loads all objects from an appended pickle file into a single flat list."""
@@ -93,6 +114,7 @@ def load_metadata(path: str) -> list:
             break
     return all_data
 
+
 def compute_embeddings(
     texts: List[str],
     model_name: str = "nomic-ai/nomic-embed-text-v1",
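The metadata file here is a "stacked" pickle: each `append_metadata` call dumps one more object onto the end of the same file, and readers loop `pickle.load` until `EOFError`, exactly as the new counting pass and `load_metadata` do. A self-contained round trip illustrating the format (paths are arbitrary):

```python
import os
import pickle
import tempfile

# Stacked pickle: two separate appends produce two back-to-back objects.
path = os.path.join(tempfile.mkdtemp(), "meta.pkl")
with open(path, "ab") as f:
    pickle.dump([{"chunk_id": 0}, {"chunk_id": 1}], f)
with open(path, "ab") as f:
    pickle.dump([{"chunk_id": 2}], f)

# Reading: loop load() until EOFError, flattening the appended lists.
records = []
with open(path, "rb") as f:
    while True:
        try:
            records.extend(pickle.load(f))
        except EOFError:
            break
print(len(records))  # 3 — the count append_metadata would report
```

Note that the counting pass re-reads the whole file on every append, so each call costs O(total records); if that ever becomes a bottleneck, a running counter kept by the caller would avoid the rescan.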
src/graph_index.py CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
 import logging
 import numpy as np
 import faiss
-
+from .embeddings_utils import load_metadata
 class HierarchicalGraphManager:
     def __init__(self, storage_path: str = "data/uploads/graph_data.pkl"):
         self.storage_path = Path(storage_path)
@@ -20,7 +20,7 @@ class HierarchicalGraphManager:
         Builds a semantic proximity graph by extracting vectors from FAISS.
         No LLM is used for the graph construction phase.
         """
-        logging.info(f"🕸️ Building Semantic Graph from: {idx_path.name}")
+        logging.info(f"🕸️ Building Semantic Graph from: {meta_path}, {idx_path.name}")
 
         try:
             # 1. Load FAISS index
@@ -42,14 +42,19 @@ class HierarchicalGraphManager:
                 return
 
             # 3. Load Metadata (Chunks)
-            with open(meta_path, 'rb') as f:
-                metadata = pickle.load(f)
-
+            metadata = load_metadata(meta_path)
+
+            if n_total != len(metadata):
+                logging.warning(f"⚠️ Data Mismatch: FAISS index has {n_total} items, "
+                                f"but Metadata has {len(metadata)}. Processing first {n_total}.")
+                metadata = metadata[:n_total]
+
             # 4. Semantic Similarity Matrix
             # Using CPU-efficient dot product on normalized vectors
             faiss.normalize_L2(embeddings)
             sim_matrix = np.dot(embeddings, embeddings.T)
-
+            logging.info(f"Computed similarity matrix of shape {sim_matrix.shape}")
+
             # 5. Build Relationships based on Semantic Threshold
             triples = []
             for i in range(n_total):
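The similarity step relies on the identity that the dot product of L2-normalized vectors equals their cosine similarity. A self-contained sketch with toy vectors (`faiss.normalize_L2` performs the same row-wise, in-place normalization on a float32 matrix):

```python
import numpy as np

# Toy embedding matrix: 3 chunks, 4 dimensions.
embeddings = np.array([[1.0, 0.0, 0.0, 0.0],
                       [0.9, 0.1, 0.0, 0.0],
                       [0.0, 0.0, 1.0, 0.0]], dtype=np.float32)

# Row-wise L2 normalization (equivalent to faiss.normalize_L2(embeddings)).
embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

# Dot product of unit vectors == cosine similarity.
sim_matrix = embeddings @ embeddings.T
print(np.round(sim_matrix, 3))
# Diagonal is 1.0; rows 0 and 1 are near-duplicates (~0.994), row 2 is orthogonal.
```

The full N×N matrix is simple and fast for session-sized corpora, but its memory grows quadratically with chunk count; a top-k FAISS search would be the usual escape hatch at larger scale.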