abhi2400 committed
Commit 20ad7d8 · verified · 1 Parent(s): f600a89

Update main.py

Files changed (1):
main.py +805 -805
main.py CHANGED
@@ -1,806 +1,806 @@
import os
import logging
import requests
import fitz  # PyMuPDF
import google.generativeai as genai
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, validator
from typing import List, Dict
import re
import asyncio
from concurrent.futures import ThreadPoolExecutor
import time
import numpy as np
from sentence_transformers import SentenceTransformer
import hashlib
from pinecone import Pinecone, ServerlessSpec

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(_name_)
+logger = logging.getLogger(__name__)

# Initialize FastAPI
app = FastAPI(title="Debug Document QA API", version="5.1.0")

# Configuration
GEMINI_API_KEY = "AIzaSyBPa-4UMLTi81OgKUhTBuqczGzaKec4zP4"
PINECONE_API_KEY = "pcsk_7M5Zsf_84MeAJ4hBxCMN5z4AT3gkNNnTqqicAzA5A6o5m9XViUkCFRTjsk46FVc6mKiynD"
INDEX_NAME = "qa-fast-v2"

# Initialize services
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel('gemini-2.0-flash')

# Lightweight embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model.max_seq_length = 256

# Initialize Pinecone
try:
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Create index if it doesn't exist
    if INDEX_NAME not in pc.list_indexes().names():
        pc.create_index(
            name=INDEX_NAME,
            dimension=384,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-east-1')
        )
        time.sleep(5)

    index = pc.Index(INDEX_NAME)
    logger.info("✅ Pinecone connected successfully")
except Exception as e:
    logger.error(f"❌ Pinecone failed: {e}")
    index = None

executor = ThreadPoolExecutor(max_workers=4)
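
# Note: dimension=384 above must match the embedding model's output size;
# all-MiniLM-L6-v2 produces 384-dimensional vectors, which can be confirmed
# with embedding_model.get_sentence_embedding_dimension().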

# Models
class QARequest(BaseModel):
    documents: str
    questions: List[str]

    @validator('documents')
    def validate_url(cls, v):
        # Remove query parameters for the extension check
        base_url = v.split('?')[0]
        if not base_url.lower().endswith('.pdf'):
            raise ValueError('Must be PDF URL')
        return v

class QAResponse(BaseModel):
    answers: List[str]
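
# Example request/response shapes for the models above (illustrative values):
#   request:  {"documents": "https://example.com/policy.pdf",
#              "questions": ["What is the grace period for premium payment?"]}
#   response: {"answers": ["A grace period of 30 days is allowed ..."]}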

# Document processor
class DocumentProcessor:
-    def _init_(self):
+    def __init__(self):
        self.cache = {}

    def download_pdf(self, url: str) -> bytes:
        """Internal PDF download method with better error handling"""
        try:
            logger.info(f"📥 Downloading PDF from: {url}")

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept': 'application/pdf,application/octet-stream,*/*'
            }

            response = requests.get(url, headers=headers, timeout=30, stream=True)
            response.raise_for_status()

            content = response.content
            logger.info(f"📄 Downloaded {len(content)} bytes")
            return content

        except Exception as e:
            logger.error(f"❌ Failed to download PDF: {str(e)}")
            raise HTTPException(status_code=400, detail=f"Failed to download PDF: {str(e)}")

    def extract_text(self, pdf_bytes: bytes) -> str:
        """Extract text with better debugging"""
        try:
            logger.info(f"📖 Extracting text from {len(pdf_bytes)} bytes PDF")
            doc = fitz.open(stream=pdf_bytes, filetype="pdf")

            text_parts = []
            max_pages = min(doc.page_count, 50)  # Process more pages

            for page_num in range(max_pages):
                page = doc.load_page(page_num)
                text = page.get_text()
                if text.strip():
                    text_parts.append(text)
                    logger.info(f"Page {page_num + 1}: {len(text)} characters")

            doc.close()

            # Combine and clean
            full_text = ' '.join(text_parts)
            full_text = re.sub(r'\s+', ' ', full_text.strip())
            full_text = re.sub(r'Page \d+', '', full_text, flags=re.IGNORECASE)

            logger.info(f"📄 Total extracted text: {len(full_text)} characters from {max_pages} pages")

            # Log sample text for debugging
            sample_text = full_text[:500] if len(full_text) > 500 else full_text
            logger.info(f"📝 Sample text: {sample_text}...")

            return full_text

        except Exception as e:
            logger.error(f"❌ Text extraction failed: {e}")
            raise HTTPException(status_code=500, detail=f"Cannot extract text: {e}")

    def create_chunks(self, text: str) -> List[Dict]:
        """Create chunks with better debugging"""
        logger.info(f"🔪 Creating chunks from {len(text)} characters")

        chunk_size = 1000  # Smaller chunks for better matching
        overlap = 150

        # Split into sentences
        sentences = re.split(r'(?<=[.!?])\s+', text)
        logger.info(f"📋 Found {len(sentences)} sentences")

        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < 15:  # Skip very short sentences
                continue

            if current_length + len(sentence) > chunk_size and current_chunk:
                # Create chunk
                chunk_text = ' '.join(current_chunk)
                chunks.append({
                    'text': chunk_text,
                    'id': f"chunk_{len(chunks)}"
                })

                # Keep some overlap
                if len(current_chunk) > 1:
                    current_chunk = current_chunk[-1:] + [sentence]
                    current_length = sum(len(s) for s in current_chunk)
                else:
                    current_chunk = [sentence]
                    current_length = len(sentence)
            else:
                current_chunk.append(sentence)
                current_length += len(sentence)

        # Add final chunk
        if current_chunk:
            chunk_text = ' '.join(current_chunk)
            chunks.append({
                'text': chunk_text,
                'id': f"chunk_{len(chunks)}"
            })

        logger.info(f"📊 Created {len(chunks)} chunks")

        # Log sample chunks for debugging
        for i, chunk in enumerate(chunks[:3]):
            logger.info(f"Chunk {i}: {chunk['text'][:100]}...")

        return chunks
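
    # Worked example of the rule above: with chunk_size=1000, sentences are
    # accumulated until the next one would push a chunk past 1000 characters;
    # the chunk is then flushed and only the last sentence is carried over, so
    # adjacent chunks overlap by one sentence (the `overlap = 150` value is
    # not used in the loop).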

    async def store_in_pinecone(self, chunks: List[Dict], doc_id: str):
        """Store chunks in Pinecone with debugging"""
        if not index:
            logger.error("❌ Pinecone not available - storing chunks in memory fallback")
            # Store in memory as fallback
            self.cache[doc_id] = chunks
            return f"memory_{doc_id}"

        try:
            namespace = f"doc_{doc_id}"
            logger.info(f"💾 Storing {len(chunks)} chunks in Pinecone namespace: {namespace}")

            # Create embeddings in batch
            texts = [chunk['text'] for chunk in chunks]
            logger.info("🧠 Creating embeddings...")
            embeddings = embedding_model.encode(texts, batch_size=16, show_progress_bar=False)
            logger.info(f"✅ Created {len(embeddings)} embeddings, dimension: {len(embeddings[0])}")

            # Prepare vectors
            vectors = []
            for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
                vectors.append({
                    'id': f"{doc_id}_{chunk['id']}",
                    'values': embedding.tolist(),
                    'metadata': {
                        'text': chunk['text'][:1000],  # Limit metadata size
                        'chunk_id': chunk['id']
                    }
                })

            # Upsert in batches
            batch_size = 50
            for i in range(0, len(vectors), batch_size):
                batch = vectors[i:i + batch_size]
                result = index.upsert(vectors=batch, namespace=namespace)
                logger.info(f"📤 Upserted batch {i//batch_size + 1}: {result}")

            # Verify storage
            time.sleep(2)  # Wait for indexing
            stats = index.describe_index_stats()
            logger.info(f"📊 Index stats: {stats}")

            return namespace

        except Exception as e:
            logger.error(f"❌ Pinecone storage failed: {e}")
            # Fallback to memory storage
            self.cache[doc_id] = chunks
            return f"memory_{doc_id}"

# QA Processor
class QAProcessor:
-    def _init_(self):
+    def __init__(self):
        self.answer_cache = {}

    async def search_context(self, question: str, namespace: str, doc_processor: DocumentProcessor) -> List[str]:
        """Enhanced context search with memory fallback"""
        logger.info(f"🔍 Searching for: '{question}' in namespace: {namespace}")

        # Check if using memory fallback
        if namespace.startswith("memory_"):
            doc_id = namespace.replace("memory_", "")
            if doc_id in doc_processor.cache:
                chunks = doc_processor.cache[doc_id]
                logger.info(f"🧠 Using memory fallback with {len(chunks)} chunks")

                # Simple keyword matching for memory fallback
                question_words = set(question.lower().split())
                scored_chunks = []

                for chunk in chunks:
                    chunk_words = set(chunk['text'].lower().split())
                    overlap = len(question_words.intersection(chunk_words))
                    if overlap > 0:
                        scored_chunks.append((chunk['text'], overlap))

                scored_chunks.sort(key=lambda x: x[1], reverse=True)
                contexts = [chunk for chunk, _ in scored_chunks[:8]]
                logger.info(f"📋 Found {len(contexts)} relevant chunks via memory search")
                return contexts

        if not index:
            logger.error("❌ Both Pinecone and memory fallback failed")
            return []

        try:
            # Create query embedding
            logger.info("🧠 Creating query embedding...")
            query_embedding = embedding_model.encode([question])[0]
            logger.info(f"✅ Query embedding created: dimension {len(query_embedding)}")

            # Search in Pinecone
            logger.info(f"🔍 Querying Pinecone in namespace: {namespace}")
            results = index.query(
                vector=query_embedding.tolist(),
                top_k=15,
                namespace=namespace,
                include_metadata=True
            )

            logger.info(f"📊 Pinecone returned {len(results.matches)} matches")

            # Log match scores for debugging
            for i, match in enumerate(results.matches[:5]):
                logger.info(f"Match {i}: score={match.score:.4f}, text={match.metadata['text'][:100]}...")

            # Collect contexts with lower threshold
            contexts = []
            for match in results.matches:
                if match.score > 0.1:  # Lower threshold
                    contexts.append(match.metadata['text'])

            logger.info(f"📋 Selected {len(contexts)} contexts above threshold")
            return contexts

        except Exception as e:
            logger.error(f"❌ Search failed: {e}")
            return []
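
    # Illustration of the 0.1 cutoff above: cosine scores of, say,
    # [0.62, 0.41, 0.08] would keep the first two matches and drop the third.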

    async def generate_answer(self, question: str, contexts: List[str]) -> str:
        """Generate concise, accurate answers"""
        logger.info(f"🤖 Generating answer for: '{question}' with {len(contexts)} contexts")

        if not contexts:
            logger.warning("⚠ No contexts found - trying direct text search")
            return "Answer not found in document."

        # Combine contexts intelligently
        combined_context = '\n'.join(contexts[:8])
        if len(combined_context) > 5000:
            combined_context = combined_context[:5000]

        logger.info(f"📝 Combined context length: {len(combined_context)}")

        # Enhanced prompt for better extraction
        prompt = f"""You are an expert at extracting specific information from insurance policy documents.

Your task: Find the exact answer to the question from the policy document below. Be concise and specific.

Insurance Policy Document:
{combined_context}

Question: {question}

Instructions:
- Give a direct, concise answer
- Include specific numbers, periods, percentages when mentioned
- If there are conditions, mention the key ones briefly
- Don't start with "Based on" or "According to"
- Keep answer under 100 words
- If no relevant information exists, say "Answer not found in document"

Answer:"""

        try:
            logger.info("🤖 Calling Gemini...")
            response = await asyncio.wait_for(
                asyncio.get_event_loop().run_in_executor(
                    executor,
                    lambda: model.generate_content(
                        prompt,
                        generation_config=genai.types.GenerationConfig(
                            temperature=0.05,  # Very low for consistency
                            max_output_tokens=150,  # Shorter answers
                            candidate_count=1,
                            top_p=0.9
                        )
                    )
                ),
                timeout=15.0
            )

            answer = response.text.strip()
            logger.info(f"✅ Generated answer: {answer[:100]}...")

            # Clean answer
            answer = self.clean_answer(answer)

            # Enhanced validation
            if self.is_valid_answer(answer, question):
                return answer

            # Try rule-based extraction for specific patterns
            logger.info("🛠 Trying enhanced rule-based extraction...")
            rule_answer = self.enhanced_rule_extraction(question, combined_context)
            if rule_answer != "Answer not found in document.":
                return rule_answer

            # Final fallback - return AI answer if it's not completely empty
            if answer and len(answer) > 10 and "not found" not in answer.lower():
                return answer

            return "Answer not found in document."

        except Exception as e:
            logger.error(f"❌ Generation failed: {e}")
            return self.enhanced_rule_extraction(question, combined_context)

    def clean_answer(self, answer: str) -> str:
        """Clean and format answer"""
        if not answer:
            return ""

        # Remove common prefixes
        prefixes = [
            "ANSWER:", "Based on the", "According to", "The context",
            "The document", "From the policy", "Answer:"
        ]

        for prefix in prefixes:
            if answer.startswith(prefix):
                answer = answer[len(prefix):].strip()
                if answer.startswith(':'):
                    answer = answer[1:].strip()

        # Clean formatting
        answer = re.sub(r'\s+', ' ', answer.strip())

        # Capitalize first letter
        if answer and answer[0].islower():
            answer = answer[0].upper() + answer[1:]

        return answer

    def is_valid_answer(self, answer: str, question: str) -> bool:
        """Check if answer is valid and relevant"""
        if not answer or len(answer) < 5:
            return False

        # Check for non-answers
        non_answers = [
            "answer not found", "not mentioned", "does not contain",
            "no information", "cannot be determined", "not specified"
        ]

        if any(phrase in answer.lower() for phrase in non_answers):
            return False

        # Check if answer contains relevant keywords from question
        question_words = set(question.lower().split())
        answer_words = set(answer.lower().split())

        # Should have some overlap
        overlap = len(question_words.intersection(answer_words))
        return overlap >= 1

    def enhanced_rule_extraction(self, question: str, context: str) -> str:
        """Enhanced rule-based extraction for insurance-specific queries"""
        logger.info(f"🛠 Enhanced rule extraction for: '{question}'")

        q_lower = question.lower()

        # Define comprehensive patterns for insurance terms
        insurance_patterns = {
            'grace period': {
                'patterns': [
                    r'grace period.*?(\d+)\s*(days?|months?)',
                    r'(\d+)\s*days?\s*grace\s*period',
                    r'premium.*?grace.*?(\d+)\s*days?',
                    r'grace.*?(\d+)\s*days?'
                ],
                'extract_sentence': True
            },
            'waiting period': {
                'patterns': [
                    r'waiting period.*?(\d+)\s*(days?|months?|years?)',
                    r'(\d+)\s*(days?|months?|years?).*?waiting\s*period',
                    r'pre.*?existing.*?(\d+)\s*(months?|years?)',
                    r'(\d+)\s*months?.*?continuous\s*coverage'
                ],
                'extract_sentence': True
            },
            'maternity': {
                'patterns': [
                    r'maternity.*?(\d+)\s*(months?|years?)',
                    r'(\d+)\s*months?.*?maternity',
                    r'pregnancy.*?(\d+)\s*months?',
                    r'childbirth.*?(\d+)\s*months?',
                    r'continuous.*?covered.*?(\d+)\s*months?'
                ],
                'extract_full': True
            },
            'cataract': {
                'patterns': [
                    r'cataract.*?(\d+)\s*(years?|months?)',
                    r'(\d+)\s*years?.*?cataract',
                    r'eye.*?surgery.*?(\d+)\s*years?',
                    r'cataract.*?waiting.*?(\d+)'
                ],
                'extract_sentence': True
            },
            'ncd|no claim discount': {
                'patterns': [
                    r'no claim discount.*?(\d+)%',
                    r'ncd.*?(\d+)%',
                    r'(\d+)%.*?no claim',
                    r'cumulative bonus.*?(\d+)%',
                    r'(\d+)%.*?claim.*?free'
                ],
                'extract_sentence': True
            },
            'room rent|icu': {
                'patterns': [
                    r'room rent.*?(\d+)%',
                    r'icu.*?(\d+)%',
                    r'(\d+)%.*?room rent',
                    r'(\d+)%.*?sum insured'
                ],
                'extract_sentence': True
            },
            'ayush': {
                'patterns': [
                    r'ayurveda.*?yoga.*?naturopathy',
                    r'ayush.*?hospital',
                    r'unani.*?siddha.*?homeopathy'
                ],
                'extract_full': True
            },
            'hospital': {
                'patterns': [
                    r'hospital.*?means.*?institution',
                    r'(\d+).*?inpatient beds',
                    r'qualified nursing staff'
                ],
                'extract_full': True
            }
        }

        # Find relevant pattern category
        for key, config in insurance_patterns.items():
            if any(word in q_lower for word in key.split('|')):
                logger.info(f"🔍 Checking patterns for: {key}")

                for pattern in config['patterns']:
                    matches = list(re.finditer(pattern, context, re.IGNORECASE))
                    if matches:
                        logger.info(f"✅ Pattern matched: {pattern}")

                        # Extract based on configuration
                        if config.get('extract_full'):
                            # Extract larger context around match
                            match = matches[0]
                            start = max(0, match.start() - 200)
                            end = min(len(context), match.end() + 200)
                            full_context = context[start:end]

                            # Find complete sentences
                            sentences = re.split(r'[.!?]+', full_context)
                            relevant_sentences = []

                            for sentence in sentences:
                                if (re.search(pattern, sentence, re.IGNORECASE) or
                                        any(word in sentence.lower() for word in key.split('|'))):
                                    relevant_sentences.append(sentence.strip())

                            if relevant_sentences:
                                result = '. '.join(relevant_sentences[:2])
                                return self.clean_extracted_answer(result)

                        else:  # extract_sentence
                            # Find the sentence containing the match
                            match = matches[0]
                            # Expand search area
                            start = max(0, match.start() - 150)
                            end = min(len(context), match.end() + 150)
                            sentence_area = context[start:end]

                            sentences = re.split(r'[.!?]+', sentence_area)
                            for sentence in sentences:
                                if re.search(pattern, sentence, re.IGNORECASE) and len(sentence.strip()) > 15:
                                    result = sentence.strip()
                                    return self.clean_extracted_answer(result)

        # Fallback: keyword-based extraction
        return self.keyword_based_extraction(question, context)

    def keyword_based_extraction(self, question: str, context: str) -> str:
        """Extract answer based on keyword matching"""
        question_keywords = [word.lower() for word in question.split() if len(word) > 3]

        if not question_keywords:
            return "Answer not found in document."

        sentences = re.split(r'[.!?]+', context)
        scored_sentences = []

        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < 20:
                continue

            sentence_lower = sentence.lower()
            score = 0

            # Count keyword matches
            for keyword in question_keywords:
                if keyword in sentence_lower:
                    score += 1

            # Bonus for numbers (common in insurance)
            if re.search(r'\d+', sentence):
                score += 0.5

            # Bonus for insurance terms
            insurance_terms = ['policy', 'coverage', 'benefit', 'premium', 'claim', 'period', 'limit']
            for term in insurance_terms:
                if term in sentence_lower:
                    score += 0.3

            if score >= 1.5:  # Threshold for relevance
                scored_sentences.append((sentence, score))

        if scored_sentences:
            # Sort by score and return best match
            scored_sentences.sort(key=lambda x: x[1], reverse=True)
            best_sentence = scored_sentences[0][0]
            return self.clean_extracted_answer(best_sentence)

        return "Answer not found in document."

    def clean_extracted_answer(self, answer: str) -> str:
        """Clean extracted answers"""
        if not answer:
            return ""

        # Remove common prefixes and suffixes
        prefixes_to_remove = [
            "however,", "therefore,", "moreover,", "furthermore,",
            "in addition,", "also,", "but,", "and,"
        ]

        answer = answer.strip()
        for prefix in prefixes_to_remove:
            if answer.lower().startswith(prefix):
                answer = answer[len(prefix):].strip()

        # Ensure proper capitalization
        if answer and answer[0].islower():
            answer = answer[0].upper() + answer[1:]

        # Add period if missing
        if answer and not answer.endswith(('.', '!', '?')):
            answer += '.'

        return answer

    async def process_all_questions(self, questions: List[str], namespace: str, doc_processor: DocumentProcessor) -> List[str]:
        """Process all questions with better error handling"""
        logger.info(f"🚀 Processing {len(questions)} questions")

        async def process_single(question: str) -> str:
            try:
                logger.info(f"❓ Processing: {question}")

                # Search and answer
                contexts = await self.search_context(question, namespace, doc_processor)
                answer = await self.generate_answer(question, contexts)

                logger.info(f"✅ Answer for '{question[:30]}...': {answer[:100]}...")
                return answer

            except Exception as e:
                logger.error(f"❌ Question processing failed: {e}")
                return "Answer not found in document."

        # Process questions sequentially for better debugging
        answers = []
        for question in questions:
            answer = await process_single(question)
            answers.append(answer)

        return answers

# Initialize processors
doc_processor = DocumentProcessor()
qa_processor = QAProcessor()

# API Routes
@app.get("/")
async def root():
    return {
        "message": "Debug Document QA API",
        "version": "5.1.0",
        "status": "ready",
        "pinecone": "connected" if index else "disconnected"
    }

@app.post("/hackrx/run", response_model=QAResponse)
async def process_qa(request: QARequest):
    """Debug QA endpoint with detailed logging"""

    start_time = time.time()
    logger.info(f"🚀 Starting QA processing for {len(request.questions)} questions")
    logger.info(f"📄 Document URL: {request.documents}")

    try:
        # Generate document ID
        doc_id = hashlib.md5(request.documents.encode()).hexdigest()[:12]
        namespace = f"doc_{doc_id}"
        logger.info(f"🆔 Document ID: {doc_id}, Namespace: {namespace}")

        # Check if document already processed
        doc_exists = False
        if index:
            try:
                # Test query to see if namespace exists
                test_result = index.query(
                    vector=[0.0] * 384,
                    top_k=1,
                    namespace=namespace,
                    include_metadata=False
                )
                doc_exists = len(test_result.matches) > 0
                logger.info(f"📋 Document exists in Pinecone: {doc_exists}")
            except Exception as e:
                logger.error(f"❌ Error checking document existence: {e}")
                doc_exists = False

        # Also check memory cache
        if not doc_exists and doc_id in doc_processor.cache:
            doc_exists = True
            namespace = f"memory_{doc_id}"
            logger.info("📋 Document exists in memory cache")

        # Process document if needed
        if not doc_exists:
            logger.info("📝 Processing new document...")

            # Download and extract
            pdf_bytes = doc_processor.download_pdf(request.documents)
            text = doc_processor.extract_text(pdf_bytes)

            if len(text) < 100:
                raise HTTPException(status_code=400, detail="No meaningful content found in PDF")

            # Create chunks and store
            chunks = doc_processor.create_chunks(text)
            namespace = await doc_processor.store_in_pinecone(chunks, doc_id)

            if not namespace:
                raise HTTPException(status_code=500, detail="Failed to process document")

            logger.info(f"✅ Document processed in {time.time() - start_time:.2f}s")
        else:
            logger.info("📋 Using cached document")

        # Process all questions
        answers = await qa_processor.process_all_questions(request.questions, namespace, doc_processor)

        total_time = time.time() - start_time
        logger.info(f"🎯 All processing completed in {total_time:.2f}s")
        logger.info(f"📊 Final answers: {[ans[:50] + '...' if len(ans) > 50 else ans for ans in answers]}")

        return QAResponse(answers=answers)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Processing failed: {e}")
        raise HTTPException(status_code=500, detail=f"Processing error: {str(e)}")
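
# Example call against a local run of this app (URL and question are illustrative):
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/hackrx/run",
#       json={"documents": "https://example.com/policy.pdf",
#             "questions": ["What is the grace period for premium payment?"]},
#   )
#   print(resp.json()["answers"])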

@app.get("/debug/stats")
async def debug_stats():
    """Debug endpoint to check system status"""
    stats = {
        "pinecone_connected": index is not None,
        "embedding_model": str(embedding_model),
        "cache_size": len(doc_processor.cache),
        "answer_cache_size": len(qa_processor.answer_cache)
    }

    if index:
        try:
            index_stats = index.describe_index_stats()
            stats["index_stats"] = index_stats
        except Exception as e:
            stats["index_error"] = str(e)

    return stats

@app.delete("/debug/clear")
async def clear_all_cache():
    """Clear all caches and namespaces"""
    doc_processor.cache.clear()
    qa_processor.answer_cache.clear()

    # Optionally clear Pinecone namespaces (be careful!)
    # if index:
    #     try:
    #         index.delete(delete_all=True)
    #     except Exception as e:
    #         pass

    return {"message": "All caches cleared"}

@app.get("/health")
async def health():
    return {
        "status": "healthy",
        "pinecone": "connected" if index else "disconnected",
        "gemini": "configured"
    }

-if _name_ == "_main_":
+if __name__ == "__main__":
    import uvicorn

    print("🚀 Starting DEBUG Document QA API...")
    print("🔍 Debug features enabled:")
    print("   - Detailed logging")
    print("   - Memory fallback for Pinecone")
    print("   - Enhanced rule-based matching")
    print("   - Debug endpoints (/debug/stats, /debug/clear)")
    print("   - Lower similarity thresholds")

    uvicorn.run(app, host="0.0.0.0", port=8000)
 