Spaces:
Sleeping
Sleeping
Commit
·
d81c7f2
1
Parent(s):
163de00
changes in add_document
Browse files- app/main_api.py +34 -46
app/main_api.py
CHANGED
|
@@ -729,7 +729,7 @@ class DeadlockFreeRAGPipeline:
|
|
| 729 |
logger.info(f"π Deadlock-Free RAG Pipeline initialized: {collection_name}")
|
| 730 |
|
| 731 |
async def add_documents(self, chunks: List[Dict[str, Any]]):
|
| 732 |
-
"""
|
| 733 |
if not chunks:
|
| 734 |
return
|
| 735 |
|
|
@@ -770,15 +770,15 @@ class DeadlockFreeRAGPipeline:
|
|
| 770 |
|
| 771 |
if not quality_chunks:
|
| 772 |
return
|
|
|
|
|
|
|
|
|
|
| 773 |
|
| 774 |
-
# GEMINI'S FIX: Step
|
| 775 |
-
texts = [chunk['content'] for chunk in quality_chunks[:100]] # Reduced from 150 for speed
|
| 776 |
-
|
| 777 |
-
# GEMINI'S FIX: Step 2 - Embed all texts via Kaggle (Manager gets sauce first)
|
| 778 |
logger.info(f"π Embedding {len(texts)} chunks via Kaggle...")
|
| 779 |
embeddings = await self.kaggle_client.generate_embeddings(texts)
|
| 780 |
|
| 781 |
-
#
|
| 782 |
logger.info("--- HF DEBUG ---")
|
| 783 |
logger.info(f"Type of embeddings received: {type(embeddings)}")
|
| 784 |
if isinstance(embeddings, list) and len(embeddings) > 0:
|
|
@@ -787,58 +787,46 @@ class DeadlockFreeRAGPipeline:
|
|
| 787 |
if isinstance(embeddings[0], list):
|
| 788 |
logger.info(f"Dimension of first embedding: {len(embeddings[0])}")
|
| 789 |
logger.info(f"First few values: {embeddings[0][:5] if len(embeddings[0]) > 5 else embeddings[0]}")
|
| 790 |
-
else:
|
| 791 |
-
logger.info("Embeddings variable is either not a list or is empty!")
|
| 792 |
logger.info("--- END HF DEBUG ---")
|
| 793 |
-
# ---------------------------------
|
| 794 |
|
| 795 |
if not embeddings or len(embeddings) != len(texts):
|
| 796 |
-
logger.error("Embedding failed
|
| 797 |
logger.error(f"Expected {len(texts)} embeddings, got {len(embeddings) if embeddings else 0}")
|
| 798 |
return
|
| 799 |
-
|
| 800 |
-
#
|
| 801 |
-
if not isinstance(embeddings, list):
|
| 802 |
-
logger.error(f"Embeddings must be a list, got {type(embeddings)}")
|
| 803 |
-
return
|
| 804 |
-
|
| 805 |
-
# Check if all embeddings are lists of floats
|
| 806 |
-
for i, emb in enumerate(embeddings[:3]): # Check first 3
|
| 807 |
-
if not isinstance(emb, list):
|
| 808 |
-
logger.error(f"Embedding {i} is not a list: {type(emb)}")
|
| 809 |
-
return
|
| 810 |
-
if not all(isinstance(x, (int, float)) for x in emb[:5]): # Check first 5 values
|
| 811 |
-
logger.error(f"Embedding {i} contains non-numeric values")
|
| 812 |
-
return
|
| 813 |
-
|
| 814 |
-
# GEMINI'S FIX: Step 3 - Add to Chroma with pre-calculated embeddings
|
| 815 |
-
# This completely avoids the deadlock!
|
| 816 |
try:
|
| 817 |
-
logger.info("
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 822 |
)
|
| 823 |
-
logger.info(f"✅ Added {len(texts)} documents with embeddings to vector store (DEADLOCK-FREE)")
|
| 824 |
|
|
|
|
|
|
|
| 825 |
except Exception as e:
|
| 826 |
-
logger.error(f"❌ ChromaDB
|
| 827 |
logger.error(f"❌ Error type: {type(e)}")
|
| 828 |
|
| 829 |
-
#
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
metadatas=[chunk['metadata'] for chunk in quality_chunks[:100]]
|
| 835 |
-
# No embeddings parameter - let Chroma handle it
|
| 836 |
-
)
|
| 837 |
-
logger.info("✅ Fallback successful - issue is with embedding format")
|
| 838 |
-
except Exception as fallback_error:
|
| 839 |
-
logger.error(f"❌ Even fallback failed: {fallback_error}")
|
| 840 |
|
| 841 |
-
|
|
|
|
|
|
|
| 842 |
|
| 843 |
async def answer_question(self, question: str) -> str:
|
| 844 |
"""GEMINI'S FIX: Direct query embedding - no deadlock"""
|
|
|
|
| 729 |
logger.info(f"π Deadlock-Free RAG Pipeline initialized: {collection_name}")
|
| 730 |
|
| 731 |
async def add_documents(self, chunks: List[Dict[str, Any]]):
|
| 732 |
+
"""FINAL FIX: Bypasses the faulty LangChain wrapper to talk to ChromaDB directly."""
|
| 733 |
if not chunks:
|
| 734 |
return
|
| 735 |
|
|
|
|
| 770 |
|
| 771 |
if not quality_chunks:
|
| 772 |
return
|
| 773 |
+
|
| 774 |
+
documents_to_process = quality_chunks[:100]
|
| 775 |
+
texts = [chunk['content'] for chunk in documents_to_process]
|
| 776 |
|
| 777 |
+
# GEMINI'S FIX: Step 2 - Embed all texts via Kaggle (this works perfectly)
|
|
|
|
|
|
|
|
|
|
| 778 |
logger.info(f"π Embedding {len(texts)} chunks via Kaggle...")
|
| 779 |
embeddings = await self.kaggle_client.generate_embeddings(texts)
|
| 780 |
|
| 781 |
+
# Debug logging (keep this to confirm data is still perfect)
|
| 782 |
logger.info("--- HF DEBUG ---")
|
| 783 |
logger.info(f"Type of embeddings received: {type(embeddings)}")
|
| 784 |
if isinstance(embeddings, list) and len(embeddings) > 0:
|
|
|
|
| 787 |
if isinstance(embeddings[0], list):
|
| 788 |
logger.info(f"Dimension of first embedding: {len(embeddings[0])}")
|
| 789 |
logger.info(f"First few values: {embeddings[0][:5] if len(embeddings[0]) > 5 else embeddings[0]}")
|
|
|
|
|
|
|
| 790 |
logger.info("--- END HF DEBUG ---")
|
|
|
|
| 791 |
|
| 792 |
if not embeddings or len(embeddings) != len(texts):
|
| 793 |
+
logger.error("Embedding generation failed.")
|
| 794 |
logger.error(f"Expected {len(texts)} embeddings, got {len(embeddings) if embeddings else 0}")
|
| 795 |
return
|
| 796 |
+
|
| 797 |
+
# --- THE FINAL FIX: BYPASS LANGCHAIN BUG ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 798 |
try:
|
| 799 |
+
logger.info("🎯 FINAL FIX: Bypassing faulty LangChain wrapper, adding to ChromaDB directly...")
|
| 800 |
+
|
| 801 |
+
# Get the raw, underlying collection object from Chroma
|
| 802 |
+
collection = self.vectorstore._collection
|
| 803 |
+
|
| 804 |
+
# The direct `add` method requires a unique ID for each document
|
| 805 |
+
ids = [str(uuid.uuid4()) for _ in texts]
|
| 806 |
+
|
| 807 |
+
# Use the direct .add() method instead of the buggy .add_texts()
|
| 808 |
+
collection.add(
|
| 809 |
+
embeddings=embeddings,
|
| 810 |
+
documents=texts,
|
| 811 |
+
metadatas=[chunk['metadata'] for chunk in documents_to_process],
|
| 812 |
+
ids=ids
|
| 813 |
)
|
|
|
|
| 814 |
|
| 815 |
+
logger.info(f"π FINAL SUCCESS! Directly added {len(texts)} documents to ChromaDB collection (BYPASSED LANGCHAIN BUG)")
|
| 816 |
+
|
| 817 |
except Exception as e:
|
| 818 |
+
logger.error(f"❌ Direct ChromaDB add failed: {e}")
|
| 819 |
logger.error(f"❌ Error type: {type(e)}")
|
| 820 |
|
| 821 |
+
# Additional debug info
|
| 822 |
+
logger.error(f"❌ Collection info: {type(collection)}")
|
| 823 |
+
logger.error(f"❌ Embeddings type: {type(embeddings)}")
|
| 824 |
+
logger.error(f"❌ Texts count: {len(texts)}")
|
| 825 |
+
logger.error(f"❌ IDs count: {len(ids)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 826 |
|
| 827 |
+
# Re-raise the exception to be caught by the main error handler
|
| 828 |
+
raise e
|
| 829 |
+
|
| 830 |
|
| 831 |
async def answer_question(self, question: str) -> str:
|
| 832 |
"""GEMINI'S FIX: Direct query embedding - no deadlock"""
|