NavyDevilDoc committed on
Commit
10e6f84
·
verified ·
1 Parent(s): b663de0

Update src/rag_engine.py

Browse files
Files changed (1) hide show
  1. src/rag_engine.py +117 -275
src/rag_engine.py CHANGED
@@ -1,204 +1,147 @@
1
  import os
2
  import shutil
3
  import logging
4
- from typing import List, Literal, Tuple
5
-
6
- # --- LANGCHAIN & DB IMPORTS ---
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_openai import OpenAIEmbeddings
 
9
  from langchain_core.documents import Document
10
- from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
11
- from sentence_transformers import CrossEncoder
12
-
13
-
14
- # --- CUSTOM CORE IMPORTS ---
15
  from core.PineconeManager import PineconeManager
16
- from core.ParagraphChunker import ParagraphChunker
17
- from core.TokenChunker import TokenChunker
18
  from core.AcronymManager import AcronymManager
 
19
 
20
- # --- CONFIGURATION ---
21
- UPLOAD_DIR = "source_documents"
22
- EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
23
- RERANK_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
24
  PINECONE_KEY = os.getenv("PINECONE_API_KEY")
25
-
26
- # Configure Logging
27
- logging.basicConfig(level=logging.INFO)
28
  logger = logging.getLogger(__name__)
29
 
30
- # --- LAZY LOADING GLOBALS ---
31
- _embedding_func = None
32
- _rerank_model = None
 
 
 
 
33
 
34
  def get_embedding_func(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
35
- """
36
- Dynamically loads the correct embedding model based on the selection.
37
- """
38
  try:
39
- # 1. OpenAI Models
40
  if "openai" in model_name.lower():
41
- if not os.getenv("OPENAI_API_KEY"):
42
- raise ValueError("OpenAI API Key not found.")
43
-
44
- # Map friendly names to actual API model names if needed
45
- # But usually we just pass the exact string like "text-embedding-3-small"
46
  return OpenAIEmbeddings(model=model_name)
47
-
48
- # 2. Hugging Face Models (Local / CPU-friendly)
49
  else:
50
- # Default to all-MiniLM if something weird is passed, or use the specific HF model
51
  return HuggingFaceEmbeddings(model_name=model_name)
52
-
53
  except Exception as e:
54
  logger.error(f"Failed to load embedding model '{model_name}': {e}")
55
- # Fallback to the safe default if everything explodes
56
  return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
57
 
58
- def get_rerank_model():
59
- """Lazy loads the Cross-Encoder model."""
60
- global _rerank_model
61
- if _rerank_model is None:
62
- logger.info(f" Loading Reranker: {RERANK_MODEL_NAME}...")
63
- _rerank_model = CrossEncoder(RERANK_MODEL_NAME)
64
- logger.info("✅ Reranker Loaded.")
65
- return _rerank_model
66
-
67
- # --- PART 1: CHUNKING LOGIC (The New System) ---
68
-
69
- def _process_markdown(file_path: str, chunk_size: int = 1000, chunk_overlap: int = 100) -> List[Document]:
70
- """Internal helper to process Markdown files using Header Semantic Splitting."""
 
71
  try:
72
- with open(file_path, 'r', encoding='utf-8') as f:
73
- markdown_text = f.read()
74
-
75
- headers_to_split_on = [
76
- ("#", "Header 1"),
77
- ("##", "Header 2"),
78
- ("###", "Header 3"),
79
- ]
80
-
81
- markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
82
- md_header_splits = markdown_splitter.split_text(markdown_text)
83
-
84
- text_splitter = RecursiveCharacterTextSplitter(
85
- chunk_size=chunk_size,
86
- chunk_overlap=chunk_overlap
87
- )
88
- final_docs = text_splitter.split_documents(md_header_splits)
89
 
90
- for doc in final_docs:
91
- doc.metadata['source'] = os.path.basename(file_path)
92
- doc.metadata['file_type'] = 'md'
93
- doc.metadata['strategy'] = 'markdown_header'
94
-
95
- return final_docs
96
- except Exception as e:
97
- logger.error(f"Error processing Markdown file {file_path}: {e}")
98
- return []
99
-
100
- def process_file(
101
- file_path: str,
102
- chunking_strategy: Literal["paragraph", "token"] = "paragraph",
103
- chunk_size: int = 512,
104
- chunk_overlap: int = 100,
105
- model_name: str = "gpt-4o"
106
- ) -> List[Document]:
107
- """
108
- Main chunking engine. Routes file to specific chunkers based on type/strategy.
109
- """
110
- if not os.path.exists(file_path):
111
- logger.error(f"File not found: {file_path}")
112
- return []
113
-
114
- file_extension = os.path.splitext(file_path)[1].lower()
115
- file_name = os.path.basename(file_path)
116
- logger.info(f"Processing {file_name} using strategy: {chunking_strategy}")
117
-
118
- # 1. Handle Markdown
119
- if file_extension == ".md":
120
- return _process_markdown(file_path, chunk_size, chunk_overlap)
121
-
122
- # 2. Handle PDF and TXT
123
- elif file_extension in [".pdf", ".txt"]:
124
  if chunking_strategy == "token":
125
- chunker = TokenChunker(
126
- model_name=model_name,
127
- chunk_size=chunk_size,
128
- chunk_overlap=chunk_overlap
129
- )
130
  else:
131
- chunker = ParagraphChunker(model_name=model_name)
132
-
133
- try:
134
- if file_extension == ".pdf":
135
- docs = chunker.process_document(file_path)
136
- elif file_extension == ".txt":
137
- docs = chunker.process_text_file(file_path)
138
 
139
- # Ensure metadata consistency
140
- for doc in docs:
141
- doc.metadata["source"] = file_name
142
- doc.metadata["strategy"] = chunking_strategy
143
-
144
- return docs
145
 
146
- except Exception as e:
147
- logger.error(f"Error using {chunking_strategy} chunker on {file_name}: {e}")
148
- return []
149
- else:
150
- logger.warning(f"Unsupported file extension: {file_extension}")
151
  return []
152
 
153
- # --- PART 2: DATABASE & FILE MANAGEMENT (Pinecone Version) ---
154
-
155
- def save_uploaded_file(uploaded_file, username: str = "default") -> str:
156
- """Saves a StreamlitUploadedFile to disk so the loaders can read it."""
 
 
 
 
 
157
  try:
158
- user_dir = os.path.join(UPLOAD_DIR, username)
159
- os.makedirs(user_dir, exist_ok=True)
160
- file_path = os.path.join(user_dir, uploaded_file.name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- with open(file_path, "wb") as f:
163
- f.write(uploaded_file.getbuffer())
164
- return file_path
 
 
 
 
 
 
165
  except Exception as e:
166
- logger.error(f"Error saving file: {e}")
167
- return None
168
 
169
  def process_and_add_text(text: str, source_name: str, username: str, index_name: str) -> Tuple[bool, str]:
170
- """
171
- Ingests raw text.
172
- UPGRADE: Performs 'Clean Replace' - deletes old version of this source before adding new.
173
- """
174
  if not PINECONE_KEY or not index_name: return False, "Pinecone Configuration Missing."
175
-
176
  try:
177
  pm = PineconeManager(PINECONE_KEY)
178
 
179
- # 1. PRE-EMPTIVE DELETE (The Fix)
180
- # We wipe any existing vectors with this source name to prevent duplicates.
181
- # This effectively makes this an "Update/Replace" operation.
182
  pm.delete_file(index_name, source_name, namespace=username)
183
 
184
- # 2. SAVE PHYSICAL BACKUP (For Quiz Engine)
185
  user_docs_dir = os.path.join(UPLOAD_DIR, username)
186
  os.makedirs(user_docs_dir, exist_ok=True)
187
  backup_path = os.path.join(user_docs_dir, source_name)
188
-
189
  with open(backup_path, "w", encoding='utf-8') as f:
190
  f.write(text)
191
 
192
- # 3. UPLOAD TO PINECONE
193
- emb_fn = get_embedding_func() # Uses default or last active model logic internally
194
-
195
- doc = Document(
196
- page_content=text,
197
- metadata={"source": source_name, "strategy": "flattened", "file_type": "generated"}
198
- )
199
-
200
  vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
201
- # Custom ID isn't strictly necessary for single-doc flattened text, but good for consistency
202
  vstore.add_documents([doc], ids=[f"{source_name}_0"])
203
 
204
  return True, f"Updated: {source_name}"
@@ -207,12 +150,7 @@ def process_and_add_text(text: str, source_name: str, username: str, index_name:
207
  return False, str(e)
208
 
209
  def ingest_file(file_path: str, username: str, index_name: str, embed_model_name: str = None, strategy: str = "paragraph") -> Tuple[bool, str]:
210
- """
211
- Chunks and uploads file.
212
- UPGRADE: Performs 'Clean Replace' - deletes old chunks before uploading new ones.
213
- """
214
  if not PINECONE_KEY or not index_name: return False, "Pinecone Configuration Missing."
215
-
216
  try:
217
  # 1. Chunking
218
  docs = process_file(file_path, chunking_strategy=strategy)
@@ -226,26 +164,20 @@ def ingest_file(file_path: str, username: str, index_name: str, embed_model_name
226
  # 3. Pinecone Manager
227
  pm = PineconeManager(PINECONE_KEY)
228
 
229
- # 4. SAFETY CHECK (Dimensions)
230
  emb_fn = get_embedding_func(embed_model_name)
231
  test_vec = emb_fn.embed_query("test")
232
  model_dim = len(test_vec)
233
-
234
  if not pm.check_dimension_compatibility(index_name, model_dim):
235
  return False, f"Dimension Mismatch! Index '{index_name}' expects {model_dim}d vectors."
236
 
237
- # 5. PRE-EMPTIVE DELETE (The Fix)
238
- # Wipe the slate clean for this specific filename
239
  filename = os.path.basename(file_path)
240
  pm.delete_file(index_name, filename, namespace=username)
241
 
242
- # 6. UPLOAD NEW CHUNKS
243
  vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
244
-
245
- # Generate readable IDs: "filename_0", "filename_1"
246
- # This helps with the 'Frankenstein' sorting fix we added earlier
247
  custom_ids = [f"{doc.metadata.get('source', filename)}_{i}" for i, doc in enumerate(docs)]
248
-
249
  vstore.add_documents(docs, ids=custom_ids)
250
 
251
  return True, f"Successfully updated {filename} ({len(docs)} chunks)."
@@ -254,147 +186,57 @@ def ingest_file(file_path: str, username: str, index_name: str, embed_model_name
254
  logger.error(f"Ingestion failed: {e}")
255
  return False, str(e)
256
 
257
- def search_knowledge_base(query: str, username: str, index_name: str, embed_model_name: str, k: int = 10, final_k: int = 4) -> List[Document]:
258
- """Retrieves from Pinecone -> Reranks."""
259
- if not PINECONE_KEY or not index_name: return []
 
260
 
261
- try:
262
- # 1. Expand Query (Acronyms)
263
- acronym_mgr = AcronymManager()
264
- expanded_query = acronym_mgr.expand_query(query)
265
-
266
- # 2. Vector Search
267
- pm = PineconeManager(PINECONE_KEY)
268
- emb_fn = get_embedding_func(embed_model_name)
269
- vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
270
-
271
- results = vstore.similarity_search(expanded_query, k=k)
272
- if not results: return []
273
-
274
- # 3. Reranking
275
- candidate_docs = results
276
- candidate_texts = [doc.page_content for doc in candidate_docs]
277
- pairs = [[expanded_query, text] for text in candidate_texts]
278
-
279
- reranker = get_rerank_model()
280
- scores = reranker.predict(pairs)
281
-
282
- # Sort
283
- scored_docs = list(zip(candidate_docs, scores))
284
- scored_docs.sort(key=lambda x: x[1], reverse=True)
285
-
286
- return [doc for doc, score in scored_docs[:final_k]]
287
-
288
- except Exception as e:
289
- logger.error(f"Search Error: {e}")
290
- return []
291
 
292
  def list_documents(username: str) -> List[dict]:
293
- """
294
- NOTE: Pinecone does not support easy listing of all unique files.
295
- We return the Local Cache (source_documents) as a proxy for what is
296
- available for the Quiz Engine.
297
- """
298
  user_dir = os.path.join(UPLOAD_DIR, username)
299
  if not os.path.exists(user_dir): return []
300
-
301
- files = []
302
- for f in os.listdir(user_dir):
303
- if f.lower().endswith(('.pdf', '.txt', '.md')):
304
- files.append({"filename": f, "source": f, "strategy": "local_cache"})
305
- return files
306
-
307
- def delete_document(username: str, filename: str, index_name: str) -> Tuple[bool, str]:
308
- """Deletes from Pinecone AND Local Disk."""
309
- if not PINECONE_KEY or not index_name: return False, "Config Missing."
310
-
311
- try:
312
- # 1. Delete from Pinecone
313
- pm = PineconeManager(PINECONE_KEY)
314
- pm.delete_file(index_name, filename, namespace=username)
315
-
316
- # 2. Delete from Disk (Clean up Quiz Cache)
317
- local_path = os.path.join(UPLOAD_DIR, username, filename)
318
- if os.path.exists(local_path):
319
- os.remove(local_path)
320
-
321
- return True, f"Deleted {filename} from Index and Disk."
322
- except Exception as e:
323
- return False, str(e)
324
-
325
- def reset_knowledge_base(username: str) -> Tuple[bool, str]:
326
- """
327
- WARNING: This deletes the USER NAMESPACE in Pinecone, not the whole Index.
328
- """
329
- # Pinecone delete_all is index-wide usually.
330
- # For safety in namespace-based multi-tenancy, we usually skip this
331
- # or implement a delete_all(delete_all=True, namespace=username)
332
- return False, "Resetting entire DB via API is disabled for safety. Use Delete."
333
 
334
  def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, str]:
335
- """
336
- Downloads text from Pinecone and reconstructs local source files.
337
- FIX: Sorts chunks numerically (_0, _1, _2) to prevent 'Frankenstein' files.
338
- """
339
- if not PINECONE_KEY or not index_name:
340
- return False, "Pinecone config missing."
341
-
342
  try:
343
  pm = PineconeManager(PINECONE_KEY)
344
-
345
- # 1. Get all Vector IDs
346
  ids = pm.get_all_ids(index_name, username)
347
  if not ids: return False, "No data found in Pinecone."
348
 
349
- # 2. Fetch content
350
  batch_size = 100
351
- reconstructed_files = {} # { "filename.txt": [ (index, text), (index, text) ] }
352
-
353
  for i in range(0, len(ids), batch_size):
354
  batch_ids = ids[i : i + batch_size]
355
  response = pm.fetch_vectors(index_name, batch_ids, username)
356
  vectors = response.vectors
357
-
358
  for vec_id, vec_data in vectors.items():
359
  meta = vec_data.metadata or {}
360
  source = meta.get('source', 'unknown.txt')
361
- # Try to get text from 'text' (langchain default) or 'page_content' (our backup)
362
  text = meta.get('text') or meta.get('page_content') or ''
363
-
364
- # EXTRACT CHUNK INDEX FROM ID (e.g., "doc.txt_12" -> 12)
365
  try:
366
- # Assumes ID format "filename_index" from our new ingestion logic
367
- if "_" in vec_id:
368
- chunk_index = int(vec_id.rsplit('_', 1)[-1])
369
- else:
370
- chunk_index = 0
371
- except ValueError:
372
- chunk_index = 0 # Fallback
373
-
374
- if source not in reconstructed_files:
375
- reconstructed_files[source] = []
376
  reconstructed_files[source].append((chunk_index, text))
377
 
378
- # 3. Write to Disk (Sorted)
379
  user_dir = os.path.join(UPLOAD_DIR, username)
380
  os.makedirs(user_dir, exist_ok=True)
381
-
382
  count = 0
383
  for filename, chunks in reconstructed_files.items():
384
- # SORT BY INDEX (The Fix)
385
- # This ensures Paragraph 1 comes before Paragraph 2
386
- chunks.sort(key=lambda x: x[0])
387
-
388
- # Join text only
389
  full_text = "\n\n".join([c[1] for c in chunks])
390
-
391
  file_path = os.path.join(user_dir, filename)
392
- with open(file_path, "w", encoding="utf-8") as f:
393
- f.write(full_text)
394
  count += 1
395
-
396
  return True, f"Restored {count} files (Sorted) from Pinecone!"
397
-
398
  except Exception as e:
399
  logger.error(f"Cache rebuild failed: {e}")
400
  return False, str(e)
 
1
  import os
2
  import shutil
3
  import logging
4
+ from typing import List, Tuple, Optional
5
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredWordDocumentLoader, UnstructuredPowerPointLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_openai import OpenAIEmbeddings
9
+ from langchain_community.vectorstores import Pinecone as LangchainPinecone
10
  from langchain_core.documents import Document
 
 
 
 
 
11
  from core.PineconeManager import PineconeManager
 
 
12
  from core.AcronymManager import AcronymManager
13
+ from flashrank import Ranker, RerankRequest # NEW IMPORT
14
 
15
+ # CONFIGURATION
 
 
 
16
  PINECONE_KEY = os.getenv("PINECONE_API_KEY")
17
+ UPLOAD_DIR = "source_documents"
 
 
18
  logger = logging.getLogger(__name__)
19
 
20
# Module-level FlashRank reranker (small TinyBERT model), built a single time
# when this module is imported. It stays None if construction fails, and
# callers are expected to check for that before reranking.
reranker = None
try:
    reranker = Ranker(
        model_name="ms-marco-TinyBERT-L-2-v2",
        cache_dir="/tmp/flashrank_cache",
    )
except Exception as e:
    logger.warning(f"Reranker failed to load: {e}")
27
 
def get_embedding_func(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """Return an embeddings object for ``model_name``.

    Names containing "openai" (case-insensitive) are routed to
    ``OpenAIEmbeddings`` and require ``OPENAI_API_KEY`` in the environment;
    every other name is passed to ``HuggingFaceEmbeddings``.

    Args:
        model_name: Embedding model identifier. ``None`` or an empty string
            selects the default MiniLM model.

    Returns:
        The constructed embeddings object. If construction fails for any
        reason, the error is logged and the default MiniLM Hugging Face
        embeddings are returned instead.
    """
    default_model = "sentence-transformers/all-MiniLM-L6-v2"

    # ingest_file declares `embed_model_name: str = None` and forwards it
    # here unchanged; without this guard `.lower()` raised AttributeError and
    # the default was only reached through the error-logging path below.
    if not model_name:
        model_name = default_model

    try:
        if "openai" in model_name.lower():
            if not os.getenv("OPENAI_API_KEY"):
                raise ValueError("OpenAI API Key not found.")
            return OpenAIEmbeddings(model=model_name)
        return HuggingFaceEmbeddings(model_name=model_name)
    except Exception as e:
        logger.error(f"Failed to load embedding model '{model_name}': {e}")
        # Best-effort fallback so callers always receive a usable embedder.
        return HuggingFaceEmbeddings(model_name=default_model)
38
 
39
def save_uploaded_file(uploaded_file, username: str) -> str:
    """Write an uploaded file to ``UPLOAD_DIR/<username>/`` and return its path.

    Args:
        uploaded_file: Object exposing ``.name`` (the client-supplied file
            name) and ``.getbuffer()`` (the file's bytes).
        username: Name of the per-user sub-directory; created if missing.

    Returns:
        Path of the file that was written.

    Raises:
        ValueError: If ``uploaded_file.name`` has no usable file component.
    """
    # The name comes from the uploader, so keep only its final component:
    # a value such as "../../x" or "C:\\dir\\x" must not escape the user dir.
    safe_name = os.path.basename(str(uploaded_file.name).replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload file name: {uploaded_file.name!r}")

    user_dir = os.path.join(UPLOAD_DIR, username)
    os.makedirs(user_dir, exist_ok=True)
    file_path = os.path.join(user_dir, safe_name)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    return file_path
46
+
47
class ParagraphChunker:
    """Split raw text into paragraph strings separated by blank lines."""

    def split_text(self, text):
        """Return the non-empty, stripped paragraphs of ``text``.

        A paragraph boundary is a blank line: two newlines with nothing but
        whitespace between them. This also covers Windows ``\\r\\n`` line
        endings and blank lines that contain stray spaces or tabs, which a
        plain ``split('\\n\\n')`` would leave glued together.

        Args:
            text: The text to split. ``None`` or ``""`` yields ``[]``.

        Returns:
            List of paragraph strings in their original order.
        """
        # Local import keeps this block self-contained.
        import re

        if not text:
            return []
        return [part.strip() for part in re.split(r"\n\s*\n", text) if part.strip()]
50
+
51
def process_file(file_path: str, chunking_strategy: str = "paragraph") -> List[Document]:
    """Load a document from disk and split it into ``Document`` chunks.

    The loader is chosen from the file extension (.pdf, .txt, .md, .docx,
    .pptx). Everything the loader returns is joined with blank lines into one
    string, which is then split either with ``RecursiveCharacterTextSplitter``
    (``chunk_size=500``, ``chunk_overlap=50``) when ``chunking_strategy`` is
    ``"token"``, or with ``ParagraphChunker`` for any other value.

    Args:
        file_path: Path of the file to read.
        chunking_strategy: ``"token"`` or anything else (paragraph splitting).

    Returns:
        The chunks, each tagged with ``source`` (the file's base name) and
        ``strategy`` metadata. An empty list is returned for an unsupported
        extension or when loading/splitting raises (the error is logged).
    """
    extension = os.path.splitext(file_path)[1].lower()
    try:
        if extension in (".txt", ".md"):
            loader = TextLoader(file_path, encoding='utf-8')
        elif extension == ".pdf":
            loader = PyPDFLoader(file_path)
        elif extension == ".docx":
            loader = UnstructuredWordDocumentLoader(file_path)
        elif extension == ".pptx":
            loader = UnstructuredPowerPointLoader(file_path)
        else:
            return []

        loaded = loader.load()
        combined = "\n\n".join([part.page_content for part in loaded])

        if chunking_strategy != "token":
            paragraphs = ParagraphChunker().split_text(combined)
            chunks = [Document(page_content=paragraph) for paragraph in paragraphs]
        else:
            splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            chunks = splitter.create_documents([combined])

        # Tag every chunk with where it came from and how it was produced.
        source_name = os.path.basename(file_path)
        for chunk in chunks:
            chunk.metadata["source"] = source_name
            chunk.metadata["strategy"] = chunking_strategy

        return chunks
    except Exception as e:
        logger.error(f"Error processing {file_path}: {e}")
        return []
82
 
83
def search_knowledge_base(query: str, username: str, index_name: str, embed_model_name: str, k: int = 5, final_k: int = 5):
    """Search the user's Pinecone namespace and rerank the hits.

    Steps:
      1. Run a similarity search for ``max(k, final_k * 3)`` candidates.
      2. Rerank the candidates with the module-level ``reranker``.
      3. Return the ``final_k`` best-scoring documents.

    Args:
        query: The search text.
        username: Used as the Pinecone namespace.
        index_name: Pinecone index to query.
        embed_model_name: Embedding model passed to ``get_embedding_func``.
        k: Minimum number of candidates to retrieve before reranking.
        final_k: Number of documents to return.

    Returns:
        A list of at most ``final_k`` ``Document`` objects. Reranked documents
        carry a ``rerank_score`` metadata entry. If the reranker is unavailable
        the first ``final_k`` similarity hits are returned unchanged. An empty
        list is returned when Pinecone is not configured or on any error
        (which is logged).
    """
    if not PINECONE_KEY or not index_name:
        return []

    try:
        pm = PineconeManager(PINECONE_KEY)
        emb_fn = get_embedding_func(embed_model_name)
        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)

        # Over-fetch so the reranker has something to choose from. `k` was
        # previously ignored; honour it as a floor on the candidate count
        # (with the default arguments this is still final_k * 3).
        broad_k = max(k, final_k * 3)
        initial_docs = vstore.similarity_search(query, k=broad_k)

        if not initial_docs or not reranker:
            return initial_docs[:final_k]

        # Hand the reranker a copy of each metadata dict so the score added
        # below never leaks into the documents returned by the vector store.
        passages = [
            {"id": str(i), "text": doc.page_content, "meta": dict(doc.metadata or {})}
            for i, doc in enumerate(initial_docs)
        ]

        rerank_request = RerankRequest(query=query, passages=passages)
        ranked_results = reranker.rerank(rerank_request)

        final_docs = []
        for res in ranked_results[:final_k]:
            meta = dict(res.get("meta") or {})
            meta["rerank_score"] = res.get("score")  # kept for debugging
            final_docs.append(Document(page_content=res["text"], metadata=meta))

        return final_docs

    except Exception as e:
        logger.error(f"Search failed: {e}")
        return []
125
 
126
  def process_and_add_text(text: str, source_name: str, username: str, index_name: str) -> Tuple[bool, str]:
 
 
 
 
127
  if not PINECONE_KEY or not index_name: return False, "Pinecone Configuration Missing."
 
128
  try:
129
  pm = PineconeManager(PINECONE_KEY)
130
 
131
+ # 1. PRE-EMPTIVE DELETE
 
 
132
  pm.delete_file(index_name, source_name, namespace=username)
133
 
134
+ # 2. SAVE BACKUP
135
  user_docs_dir = os.path.join(UPLOAD_DIR, username)
136
  os.makedirs(user_docs_dir, exist_ok=True)
137
  backup_path = os.path.join(user_docs_dir, source_name)
 
138
  with open(backup_path, "w", encoding='utf-8') as f:
139
  f.write(text)
140
 
141
+ # 3. UPLOAD
142
+ emb_fn = get_embedding_func()
143
+ doc = Document(page_content=text, metadata={"source": source_name, "strategy": "flattened", "file_type": "generated"})
 
 
 
 
 
144
  vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
 
145
  vstore.add_documents([doc], ids=[f"{source_name}_0"])
146
 
147
  return True, f"Updated: {source_name}"
 
150
  return False, str(e)
151
 
152
  def ingest_file(file_path: str, username: str, index_name: str, embed_model_name: str = None, strategy: str = "paragraph") -> Tuple[bool, str]:
 
 
 
 
153
  if not PINECONE_KEY or not index_name: return False, "Pinecone Configuration Missing."
 
154
  try:
155
  # 1. Chunking
156
  docs = process_file(file_path, chunking_strategy=strategy)
 
164
  # 3. Pinecone Manager
165
  pm = PineconeManager(PINECONE_KEY)
166
 
167
+ # 4. SAFETY CHECK
168
  emb_fn = get_embedding_func(embed_model_name)
169
  test_vec = emb_fn.embed_query("test")
170
  model_dim = len(test_vec)
 
171
  if not pm.check_dimension_compatibility(index_name, model_dim):
172
  return False, f"Dimension Mismatch! Index '{index_name}' expects {model_dim}d vectors."
173
 
174
+ # 5. PRE-EMPTIVE DELETE
 
175
  filename = os.path.basename(file_path)
176
  pm.delete_file(index_name, filename, namespace=username)
177
 
178
+ # 6. UPLOAD
179
  vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
 
 
 
180
  custom_ids = [f"{doc.metadata.get('source', filename)}_{i}" for i, doc in enumerate(docs)]
 
181
  vstore.add_documents(docs, ids=custom_ids)
182
 
183
  return True, f"Successfully updated {filename} ({len(docs)} chunks)."
 
186
  logger.error(f"Ingestion failed: {e}")
187
  return False, str(e)
188
 
189
def delete_document(username: str, filename: str, index_name: str):
    """Delete a document's local copy and its vectors in Pinecone.

    The file ``UPLOAD_DIR/<username>/<filename>`` is removed if it exists.
    Then, when both ``PINECONE_KEY`` and ``index_name`` are set, the file's
    vectors are deleted from the user's namespace; a failure there is logged
    and not raised.

    Args:
        username: Per-user directory name and Pinecone namespace.
        filename: Name of the document to delete.
        index_name: Pinecone index holding the vectors.
    """
    local_copy = os.path.join(UPLOAD_DIR, username, filename)
    if os.path.exists(local_copy):
        os.remove(local_copy)

    if not (PINECONE_KEY and index_name):
        return

    try:
        PineconeManager(PINECONE_KEY).delete_file(index_name, filename, namespace=username)
    except Exception as e:
        logger.error(f"Pinecone delete failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
def list_documents(username: str, extensions: Tuple[str, ...] = ('.txt', '.md', '.pdf', '.docx')) -> List[dict]:
    """List the documents stored locally for ``username``.

    Args:
        username: Name of the sub-directory of ``UPLOAD_DIR`` to scan.
        extensions: File suffixes to include, matched case-insensitively.
            The default is the original fixed set; pass e.g.
            ``('.txt', '.md', '.pdf', '.docx', '.pptx')`` to also list the
            PowerPoint files that ``process_file`` can ingest.

    Returns:
        One ``{"filename": name, "source": name}`` dict per matching entry,
        in ``os.listdir`` order. An empty list if the user directory does not
        exist or is not a directory.
    """
    user_dir = os.path.join(UPLOAD_DIR, username)
    # isdir (rather than exists) so a stray file at this path cannot make
    # os.listdir raise NotADirectoryError.
    if not os.path.isdir(user_dir):
        return []

    suffixes = tuple(ext.lower() for ext in extensions)
    return [
        {"filename": name, "source": name}
        for name in os.listdir(user_dir)
        if name.lower().endswith(suffixes)
    ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
def rebuild_cache_from_pinecone(username: str, index_name: str) -> Tuple[bool, str]:
    """Recreate the user's local source files from the text stored in Pinecone.

    All vector IDs in the user's namespace are fetched in batches of 100.
    Each vector's text (metadata key ``text``, else ``page_content``) is
    grouped by its ``source`` metadata value, ordered by the integer after
    the last ``_`` in the vector ID (0 when absent or non-numeric), joined
    with blank lines, and written to ``UPLOAD_DIR/<username>/<source>``.

    Args:
        username: Pinecone namespace and local sub-directory name.
        index_name: Pinecone index to read from.

    Returns:
        ``(True, message)`` on success, or ``(False, reason)`` when the
        configuration is missing, no IDs are found, or an error occurs
        (errors are logged).
    """
    if not (PINECONE_KEY and index_name):
        return False, "Pinecone config missing."

    try:
        pm = PineconeManager(PINECONE_KEY)
        ids = pm.get_all_ids(index_name, username)
        if not ids:
            return False, "No data found in Pinecone."

        batch_size = 100
        reconstructed_files = {}  # source name -> [(chunk_index, text), ...]
        for start in range(0, len(ids), batch_size):
            response = pm.fetch_vectors(index_name, ids[start : start + batch_size], username)
            for vec_id, vec_data in response.vectors.items():
                meta = vec_data.metadata or {}
                source = meta.get('source', 'unknown.txt')
                text = meta.get('text') or meta.get('page_content') or ''

                # The chunk position is the numeric suffix of the vector ID.
                suffix = vec_id.rsplit('_', 1)[-1] if "_" in vec_id else "0"
                try:
                    chunk_index = int(suffix)
                except ValueError:
                    chunk_index = 0

                reconstructed_files.setdefault(source, []).append((chunk_index, text))

        user_dir = os.path.join(UPLOAD_DIR, username)
        os.makedirs(user_dir, exist_ok=True)

        count = 0
        for filename, chunks in reconstructed_files.items():
            # Restore the original chunk order before stitching the text.
            ordered = sorted(chunks, key=lambda pair: pair[0])
            full_text = "\n\n".join([piece for _, piece in ordered])
            file_path = os.path.join(user_dir, filename)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(full_text)
            count += 1

        return True, f"Restored {count} files (Sorted) from Pinecone!"
    except Exception as e:
        logger.error(f"Cache rebuild failed: {e}")
        return False, str(e)