Spaces:
Sleeping
Sleeping
Commit
·
e85c76b
1
Parent(s):
5375a1a
Made changes to the add_document function
Browse files
- app/main_api.py +56 -8
app/main_api.py
CHANGED
|
@@ -778,20 +778,68 @@ class DeadlockFreeRAGPipeline:
|
|
| 778 |
logger.info(f"π Embedding {len(texts)} chunks via Kaggle...")
|
| 779 |
embeddings = await self.kaggle_client.generate_embeddings(texts)
|
| 780 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
if not embeddings or len(embeddings) != len(texts):
|
| 782 |
logger.error("Embedding failed or returned mismatched count.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 783 |
return
|
| 784 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 785 |
# GEMINI'S FIX: Step 3 - Add to Chroma with pre-calculated embeddings
|
| 786 |
# This completely avoids the deadlock!
|
| 787 |
-
|
| 788 |
-
texts
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 795 |
async def answer_question(self, question: str) -> str:
|
| 796 |
"""GEMINI'S FIX: Direct query embedding - no deadlock"""
|
| 797 |
# Security check
|
|
|
|
| 778 |
logger.info(f"π Embedding {len(texts)} chunks via Kaggle...")
|
| 779 |
embeddings = await self.kaggle_client.generate_embeddings(texts)
|
| 780 |
|
| 781 |
+
# --- ADD THIS DEBUGGING SECTION ---
|
| 782 |
+
logger.info("--- HF DEBUG ---")
|
| 783 |
+
logger.info(f"Type of embeddings received: {type(embeddings)}")
|
| 784 |
+
if isinstance(embeddings, list) and len(embeddings) > 0:
|
| 785 |
+
logger.info(f"Number of embeddings: {len(embeddings)}")
|
| 786 |
+
logger.info(f"Type of first item: {type(embeddings[0])}")
|
| 787 |
+
if isinstance(embeddings[0], list):
|
| 788 |
+
logger.info(f"Dimension of first embedding: {len(embeddings[0])}")
|
| 789 |
+
logger.info(f"First few values: {embeddings[0][:5] if len(embeddings[0]) > 5 else embeddings[0]}")
|
| 790 |
+
else:
|
| 791 |
+
logger.info("Embeddings variable is either not a list or is empty!")
|
| 792 |
+
logger.info("--- END HF DEBUG ---")
|
| 793 |
+
# ---------------------------------
|
| 794 |
+
|
| 795 |
if not embeddings or len(embeddings) != len(texts):
|
| 796 |
logger.error("Embedding failed or returned mismatched count.")
|
| 797 |
+
logger.error(f"Expected {len(texts)} embeddings, got {len(embeddings) if embeddings else 0}")
|
| 798 |
+
return
|
| 799 |
+
|
| 800 |
+
# Validate embeddings format
|
| 801 |
+
if not isinstance(embeddings, list):
|
| 802 |
+
logger.error(f"Embeddings must be a list, got {type(embeddings)}")
|
| 803 |
return
|
| 804 |
|
| 805 |
+
# Check if all embeddings are lists of floats
|
| 806 |
+
for i, emb in enumerate(embeddings[:3]): # Check first 3
|
| 807 |
+
if not isinstance(emb, list):
|
| 808 |
+
logger.error(f"Embedding {i} is not a list: {type(emb)}")
|
| 809 |
+
return
|
| 810 |
+
if not all(isinstance(x, (int, float)) for x in emb[:5]): # Check first 5 values
|
| 811 |
+
logger.error(f"Embedding {i} contains non-numeric values")
|
| 812 |
+
return
|
| 813 |
+
|
| 814 |
# GEMINI'S FIX: Step 3 - Add to Chroma with pre-calculated embeddings
|
| 815 |
# This completely avoids the deadlock!
|
| 816 |
+
try:
|
| 817 |
+
logger.info("π Adding texts to Chroma with embeddings...")
|
| 818 |
+
self.vectorstore.add_texts(
|
| 819 |
+
texts=texts,
|
| 820 |
+
metadatas=[chunk['metadata'] for chunk in quality_chunks[:100]],
|
| 821 |
+
embeddings=embeddings # <-- THE CRITICAL FIX - MAKE SURE THIS IS HERE
|
| 822 |
+
)
|
| 823 |
+
logger.info(f"β
Added {len(texts)} documents with embeddings to vector store (DEADLOCK-FREE)")
|
| 824 |
+
|
| 825 |
+
except Exception as e:
|
| 826 |
+
logger.error(f"β ChromaDB add_texts failed: {e}")
|
| 827 |
+
logger.error(f"β Error type: {type(e)}")
|
| 828 |
+
|
| 829 |
+
# Try adding without embeddings as fallback (this will show if it's an embedding format issue)
|
| 830 |
+
try:
|
| 831 |
+
logger.info("π Trying to add texts without embeddings (fallback)...")
|
| 832 |
+
self.vectorstore.add_texts(
|
| 833 |
+
texts=texts,
|
| 834 |
+
metadatas=[chunk['metadata'] for chunk in quality_chunks[:100]]
|
| 835 |
+
# No embeddings parameter - let Chroma handle it
|
| 836 |
+
)
|
| 837 |
+
logger.info("β
Fallback successful - issue is with embedding format")
|
| 838 |
+
except Exception as fallback_error:
|
| 839 |
+
logger.error(f"β Even fallback failed: {fallback_error}")
|
| 840 |
+
|
| 841 |
+
raise e # Re-raise original error
|
| 842 |
+
|
| 843 |
async def answer_question(self, question: str) -> str:
|
| 844 |
"""GEMINI'S FIX: Direct query embedding - no deadlock"""
|
| 845 |
# Security check
|