rohannsinghal committed on
Commit
d81c7f2
·
1 Parent(s): 163de00

Changes in add_documents

Browse files
Files changed (1) hide show
  1. app/main_api.py +34 -46
app/main_api.py CHANGED
@@ -729,7 +729,7 @@ class DeadlockFreeRAGPipeline:
729
  logger.info(f"πŸš€ Deadlock-Free RAG Pipeline initialized: {collection_name}")
730
 
731
  async def add_documents(self, chunks: List[Dict[str, Any]]):
732
- """GEMINI'S FIX: Direct embedding management - no deadlock"""
733
  if not chunks:
734
  return
735
 
@@ -770,15 +770,15 @@ class DeadlockFreeRAGPipeline:
770
 
771
  if not quality_chunks:
772
  return
 
 
 
773
 
774
- # GEMINI'S FIX: Step 1 - Get texts
775
- texts = [chunk['content'] for chunk in quality_chunks[:100]] # Reduced from 150 for speed
776
-
777
- # GEMINI'S FIX: Step 2 - Embed all texts via Kaggle (Manager gets sauce first)
778
  logger.info(f"πŸš€ Embedding {len(texts)} chunks via Kaggle...")
779
  embeddings = await self.kaggle_client.generate_embeddings(texts)
780
 
781
- # --- ADD THIS DEBUGGING SECTION ---
782
  logger.info("--- HF DEBUG ---")
783
  logger.info(f"Type of embeddings received: {type(embeddings)}")
784
  if isinstance(embeddings, list) and len(embeddings) > 0:
@@ -787,58 +787,46 @@ class DeadlockFreeRAGPipeline:
787
  if isinstance(embeddings[0], list):
788
  logger.info(f"Dimension of first embedding: {len(embeddings[0])}")
789
  logger.info(f"First few values: {embeddings[0][:5] if len(embeddings[0]) > 5 else embeddings[0]}")
790
- else:
791
- logger.info("Embeddings variable is either not a list or is empty!")
792
  logger.info("--- END HF DEBUG ---")
793
- # ---------------------------------
794
 
795
  if not embeddings or len(embeddings) != len(texts):
796
- logger.error("Embedding failed or returned mismatched count.")
797
  logger.error(f"Expected {len(texts)} embeddings, got {len(embeddings) if embeddings else 0}")
798
  return
799
-
800
- # Validate embeddings format
801
- if not isinstance(embeddings, list):
802
- logger.error(f"Embeddings must be a list, got {type(embeddings)}")
803
- return
804
-
805
- # Check if all embeddings are lists of floats
806
- for i, emb in enumerate(embeddings[:3]): # Check first 3
807
- if not isinstance(emb, list):
808
- logger.error(f"Embedding {i} is not a list: {type(emb)}")
809
- return
810
- if not all(isinstance(x, (int, float)) for x in emb[:5]): # Check first 5 values
811
- logger.error(f"Embedding {i} contains non-numeric values")
812
- return
813
-
814
- # GEMINI'S FIX: Step 3 - Add to Chroma with pre-calculated embeddings
815
- # This completely avoids the deadlock!
816
  try:
817
- logger.info("πŸ”„ Adding texts to Chroma with embeddings...")
818
- self.vectorstore.add_texts(
819
- texts=texts,
820
- metadatas=[chunk['metadata'] for chunk in quality_chunks[:100]],
821
- embeddings=embeddings # <-- THE CRITICAL FIX - MAKE SURE THIS IS HERE
 
 
 
 
 
 
 
 
 
822
  )
823
- logger.info(f"βœ… Added {len(texts)} documents with embeddings to vector store (DEADLOCK-FREE)")
824
 
 
 
825
  except Exception as e:
826
- logger.error(f"❌ ChromaDB add_texts failed: {e}")
827
  logger.error(f"❌ Error type: {type(e)}")
828
 
829
- # Try adding without embeddings as fallback (this will show if it's an embedding format issue)
830
- try:
831
- logger.info("πŸ”„ Trying to add texts without embeddings (fallback)...")
832
- self.vectorstore.add_texts(
833
- texts=texts,
834
- metadatas=[chunk['metadata'] for chunk in quality_chunks[:100]]
835
- # No embeddings parameter - let Chroma handle it
836
- )
837
- logger.info("βœ… Fallback successful - issue is with embedding format")
838
- except Exception as fallback_error:
839
- logger.error(f"❌ Even fallback failed: {fallback_error}")
840
 
841
- raise e # Re-raise original error
 
 
842
 
843
  async def answer_question(self, question: str) -> str:
844
  """GEMINI'S FIX: Direct query embedding - no deadlock"""
 
729
  logger.info(f"πŸš€ Deadlock-Free RAG Pipeline initialized: {collection_name}")
730
 
731
  async def add_documents(self, chunks: List[Dict[str, Any]]):
732
+ """FINAL FIX: Bypasses the faulty LangChain wrapper to talk to ChromaDB directly."""
733
  if not chunks:
734
  return
735
 
 
770
 
771
  if not quality_chunks:
772
  return
773
+
774
+ documents_to_process = quality_chunks[:100]
775
+ texts = [chunk['content'] for chunk in documents_to_process]
776
 
777
+ # GEMINI'S FIX: Step 2 - Embed all texts via Kaggle (this works perfectly)
 
 
 
778
  logger.info(f"πŸš€ Embedding {len(texts)} chunks via Kaggle...")
779
  embeddings = await self.kaggle_client.generate_embeddings(texts)
780
 
781
+ # Debug logging (keep this to confirm data is still perfect)
782
  logger.info("--- HF DEBUG ---")
783
  logger.info(f"Type of embeddings received: {type(embeddings)}")
784
  if isinstance(embeddings, list) and len(embeddings) > 0:
 
787
  if isinstance(embeddings[0], list):
788
  logger.info(f"Dimension of first embedding: {len(embeddings[0])}")
789
  logger.info(f"First few values: {embeddings[0][:5] if len(embeddings[0]) > 5 else embeddings[0]}")
 
 
790
  logger.info("--- END HF DEBUG ---")
 
791
 
792
  if not embeddings or len(embeddings) != len(texts):
793
+ logger.error("Embedding generation failed.")
794
  logger.error(f"Expected {len(texts)} embeddings, got {len(embeddings) if embeddings else 0}")
795
  return
796
+
797
+ # --- THE FINAL FIX: BYPASS LANGCHAIN BUG ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
798
  try:
799
+ logger.info("🎯 FINAL FIX: Bypassing faulty LangChain wrapper, adding to ChromaDB directly...")
800
+
801
+ # Get the raw, underlying collection object from Chroma
802
+ collection = self.vectorstore._collection
803
+
804
+ # The direct `add` method requires a unique ID for each document
805
+ ids = [str(uuid.uuid4()) for _ in texts]
806
+
807
+ # Use the direct .add() method instead of the buggy .add_texts()
808
+ collection.add(
809
+ embeddings=embeddings,
810
+ documents=texts,
811
+ metadatas=[chunk['metadata'] for chunk in documents_to_process],
812
+ ids=ids
813
  )
 
814
 
815
+ logger.info(f"πŸŽ‰ FINAL SUCCESS! Directly added {len(texts)} documents to ChromaDB collection (BYPASSED LANGCHAIN BUG)")
816
+
817
  except Exception as e:
818
+ logger.error(f"❌ Direct ChromaDB add failed: {e}")
819
  logger.error(f"❌ Error type: {type(e)}")
820
 
821
+ # Additional debug info
822
+ logger.error(f"❌ Collection info: {type(collection)}")
823
+ logger.error(f"❌ Embeddings type: {type(embeddings)}")
824
+ logger.error(f"❌ Texts count: {len(texts)}")
825
+ logger.error(f"❌ IDs count: {len(ids)}")
 
 
 
 
 
 
826
 
827
+ # Re-raise the exception to be caught by the main error handler
828
+ raise e
829
+
830
 
831
  async def answer_question(self, question: str) -> str:
832
  """GEMINI'S FIX: Direct query embedding - no deadlock"""