Spaces:

rohannsinghal
/

hackrx6.0

Sleeping

App Files Files Community

rohannsinghal committed on Aug 9, 2025

Commit

d81c7f2

1 Parent(s): 163de00

changes in add_documents

Browse files

Files changed (1) hide show

app/main_api.py +34 -46

app/main_api.py CHANGED Viewed

@@ -729,7 +729,7 @@ class DeadlockFreeRAGPipeline:
         logger.info(f"🚀 Deadlock-Free RAG Pipeline initialized: {collection_name}")
     async def add_documents(self, chunks: List[Dict[str, Any]]):
-        """GEMINI'S FIX: Direct embedding management - no deadlock"""
         if not chunks:
             return
@@ -770,15 +770,15 @@ class DeadlockFreeRAGPipeline:
         if not quality_chunks:
             return
-        # GEMINI'S FIX: Step 1 - Get texts
-        texts = [chunk['content'] for chunk in quality_chunks[:100]]  # Reduced from 150 for speed
-        # GEMINI'S FIX: Step 2 - Embed all texts via Kaggle (Manager gets sauce first)
         logger.info(f"🚀 Embedding {len(texts)} chunks via Kaggle...")
         embeddings = await self.kaggle_client.generate_embeddings(texts)
-        # --- ADD THIS DEBUGGING SECTION ---
         logger.info("--- HF DEBUG ---")
         logger.info(f"Type of embeddings received: {type(embeddings)}")
         if isinstance(embeddings, list) and len(embeddings) > 0:
@@ -787,58 +787,46 @@ class DeadlockFreeRAGPipeline:
             if isinstance(embeddings[0], list):
                 logger.info(f"Dimension of first embedding: {len(embeddings[0])}")
                 logger.info(f"First few values: {embeddings[0][:5] if len(embeddings[0]) > 5 else embeddings[0]}")
-        else:
-            logger.info("Embeddings variable is either not a list or is empty!")
         logger.info("--- END HF DEBUG ---")
-        # ---------------------------------
         if not embeddings or len(embeddings) != len(texts):
-            logger.error("Embedding failed or returned mismatched count.")
             logger.error(f"Expected {len(texts)} embeddings, got {len(embeddings) if embeddings else 0}")
             return
-        # Validate embeddings format
-        if not isinstance(embeddings, list):
-            logger.error(f"Embeddings must be a list, got {type(embeddings)}")
-            return
-        # Check if all embeddings are lists of floats
-        for i, emb in enumerate(embeddings[:3]):  # Check first 3
-            if not isinstance(emb, list):
-                logger.error(f"Embedding {i} is not a list: {type(emb)}")
-                return
-            if not all(isinstance(x, (int, float)) for x in emb[:5]):  # Check first 5 values
-                logger.error(f"Embedding {i} contains non-numeric values")
-                return
-        # GEMINI'S FIX: Step 3 - Add to Chroma with pre-calculated embeddings
-        # This completely avoids the deadlock!
         try:
-            logger.info("🔄 Adding texts to Chroma with embeddings...")
-            self.vectorstore.add_texts(
-                texts=texts,
-                metadatas=[chunk['metadata'] for chunk in quality_chunks[:100]],
-                embeddings=embeddings  # <-- THE CRITICAL FIX - MAKE SURE THIS IS HERE
             )
-            logger.info(f"✅ Added {len(texts)} documents with embeddings to vector store (DEADLOCK-FREE)")
         except Exception as e:
-            logger.error(f"❌ ChromaDB add_texts failed: {e}")
             logger.error(f"❌ Error type: {type(e)}")
-            # Try adding without embeddings as fallback (this will show if it's an embedding format issue)
-            try:
-                logger.info("🔄 Trying to add texts without embeddings (fallback)...")
-                self.vectorstore.add_texts(
-                    texts=texts,
-                    metadatas=[chunk['metadata'] for chunk in quality_chunks[:100]]
-                    # No embeddings parameter - let Chroma handle it
-                )
-                logger.info("✅ Fallback successful - issue is with embedding format")
-            except Exception as fallback_error:
-                logger.error(f"❌ Even fallback failed: {fallback_error}")
-            raise e  # Re-raise original error
     async def answer_question(self, question: str) -> str:
         """GEMINI'S FIX: Direct query embedding - no deadlock"""

         logger.info(f"🚀 Deadlock-Free RAG Pipeline initialized: {collection_name}")
     async def add_documents(self, chunks: List[Dict[str, Any]]):
+        """FINAL FIX: Bypasses the faulty LangChain wrapper to talk to ChromaDB directly."""
         if not chunks:
             return
         if not quality_chunks:
             return
+        documents_to_process = quality_chunks[:100]
+        texts = [chunk['content'] for chunk in documents_to_process]
+        # GEMINI'S FIX: Step 2 - Embed all texts via Kaggle (this works perfectly)
         logger.info(f"🚀 Embedding {len(texts)} chunks via Kaggle...")
         embeddings = await self.kaggle_client.generate_embeddings(texts)
+        # Debug logging (keep this to confirm data is still perfect)
         logger.info("--- HF DEBUG ---")
         logger.info(f"Type of embeddings received: {type(embeddings)}")
         if isinstance(embeddings, list) and len(embeddings) > 0:
             if isinstance(embeddings[0], list):
                 logger.info(f"Dimension of first embedding: {len(embeddings[0])}")
                 logger.info(f"First few values: {embeddings[0][:5] if len(embeddings[0]) > 5 else embeddings[0]}")
         logger.info("--- END HF DEBUG ---")
         if not embeddings or len(embeddings) != len(texts):
+            logger.error("Embedding generation failed.")
             logger.error(f"Expected {len(texts)} embeddings, got {len(embeddings) if embeddings else 0}")
             return
+        # --- THE FINAL FIX: BYPASS LANGCHAIN BUG ---
         try:
+            logger.info("🎯 FINAL FIX: Bypassing faulty LangChain wrapper, adding to ChromaDB directly...")
+            # Get the raw, underlying collection object from Chroma
+            collection = self.vectorstore._collection
+            # The direct `add` method requires a unique ID for each document
+            ids = [str(uuid.uuid4()) for _ in texts]
+            # Use the direct .add() method instead of the buggy .add_texts()
+            collection.add(
+                embeddings=embeddings,
+                documents=texts,
+                metadatas=[chunk['metadata'] for chunk in documents_to_process],
+                ids=ids
             )
+            logger.info(f"🎉 FINAL SUCCESS! Directly added {len(texts)} documents to ChromaDB collection (BYPASSED LANGCHAIN BUG)")
         except Exception as e:
+            logger.error(f"❌ Direct ChromaDB add failed: {e}")
             logger.error(f"❌ Error type: {type(e)}")
+            # Additional debug info
+            logger.error(f"❌ Collection info: {type(collection)}")
+            logger.error(f"❌ Embeddings type: {type(embeddings)}")
+            logger.error(f"❌ Texts count: {len(texts)}")
+            logger.error(f"❌ IDs count: {len(ids)}")
+            # Re-raise the exception to be caught by the main error handler
+            raise e
     async def answer_question(self, question: str) -> str:
         """GEMINI'S FIX: Direct query embedding - no deadlock"""