Spaces:

ChAbhishek28
/

PensionBot

Runtime error

App Files Files Community

ChAbhishek28 committed on Oct 10, 2025

Commit

67a99cd

1 Parent(s): a1986d7

Enhanced startup logging to show actual document count (23K+ docs) instead of just 7 sample docs

Browse files

Files changed (5) hide show

analyze_database.py +129 -0
app.py +19 -4
bulk_document_loader.py +47 -0
check_document_count.py +93 -0
document_status_logger.py +58 -0

analyze_database.py ADDED Viewed

	@@ -0,0 +1,129 @@

+#!/usr/bin/env python3
+"""
+Comprehensive analysis of the actual LanceDB database contents
+"""
+import sys
+import os
+import traceback
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+try:
+    import lancedb
+    import pandas as pd
+    from pathlib import Path
+    def analyze_lancedb_contents():
+        """Analyze the actual contents of the LanceDB database"""
+        db_path = "./lancedb_data"
+        print("🔍 LanceDB Database Analysis")
+        print("=" * 60)
+        try:
+            db = lancedb.connect(db_path)
+            table_names = db.table_names()
+            print(f"📊 Found {len(table_names)} tables: {table_names}")
+            print()
+            total_documents = 0
+            for table_name in table_names:
+                print(f"📋 Table: {table_name}")
+                print("-" * 40)
+                try:
+                    table = db.open_table(table_name)
+                    count = table.count_rows()
+                    total_documents += count
+                    print(f"   📊 Total rows: {count}")
+                    if count > 0:
+                        # Get schema info
+                        try:
+                            schema = table.schema
+                            print(f"   📝 Columns: {[field.name for field in schema]}")
+                        except:
+                            pass
+                        # Show sample data
+                        try:
+                            sample_size = min(3, count)
+                            sample = table.head(sample_size)
+                            sample_data = sample.to_pylist()
+                            print(f"   📄 Sample documents ({sample_size}/{count}):")
+                            for i, row in enumerate(sample_data):
+                                print(f"      Document {i+1}:")
+                                # Show content preview
+                                if 'content' in row:
+                                    content = str(row['content'])[:200] + "..." if len(str(row['content'])) > 200 else str(row['content'])
+                                    print(f"         Content: {content}")
+                                # Show filename if available
+                                if 'filename' in row:
+                                    print(f"         Filename: {row['filename']}")
+                                # Show other relevant fields
+                                for key, value in row.items():
+                                    if key not in ['content', 'filename', 'vector', 'id'] and value:
+                                        print(f"         {key}: {str(value)[:100]}")
+                                print()
+                        except Exception as e:
+                            print(f"      ⚠️ Could not read sample data: {e}")
+                    print()
+                except Exception as e:
+                    print(f"   ❌ Error reading table {table_name}: {e}")
+                    print()
+            print("=" * 60)
+            print(f"🎯 SUMMARY:")
+            print(f"   Total Documents Across All Tables: {total_documents}")
+            print(f"   Database Size: {'LARGE' if total_documents > 100 else 'MEDIUM' if total_documents > 10 else 'SMALL'}")
+            # Check specifically for voice bot usage
+            if 'rajasthan_documents' in table_names:
+                raj_table = db.open_table('rajasthan_documents')
+                raj_count = raj_table.count_rows()
+                print(f"   Voice Bot Documents: {raj_count} (rajasthan_documents table)")
+            if 'documents' in table_names:
+                doc_table = db.open_table('documents')
+                doc_count = doc_table.count_rows()
+                print(f"   General Documents: {doc_count} (documents table)")
+            print()
+            print("🤖 Voice Bot Analysis:")
+            if total_documents >= 1000:
+                print("   ✅ YES - Voice bot has access to 1000+ documents!")
+            elif total_documents >= 100:
+                print("   ⚠️ PARTIAL - Voice bot has substantial documents but less than 1000")
+            elif total_documents >= 10:
+                print("   ⚠️ LIMITED - Voice bot has moderate document access")
+            else:
+                print("   ❌ MINIMAL - Voice bot has very limited document access")
+            return total_documents
+        except Exception as e:
+            print(f"❌ Error connecting to database: {e}")
+            traceback.print_exc()
+            return 0
+    if __name__ == "__main__":
+        total = analyze_lancedb_contents()
+        print(f"\n🎯 Final Answer: Your voice bot has access to {total} documents")
+except ImportError as e:
+    print(f"❌ Missing dependencies: {e}")
+    print("Please install: pip install lancedb pandas")
+except Exception as e:
+    print(f"❌ Unexpected error: {e}")
+    traceback.print_exc()

app.py CHANGED Viewed

@@ -52,12 +52,27 @@ async def lifespan(app: FastAPI):
     # Startup
     logger.info("🚀 Starting Voice Bot Application...")
-    # Setup sample documents if database is empty
     try:
-        from setup_documents import setup_sample_documents
-        await setup_sample_documents()
     except Exception as e:
-        logger.warning(f"⚠️ Could not setup sample documents: {e}")
     logger.info("✅ Application started successfully")
     yield

     # Startup
     logger.info("🚀 Starting Voice Bot Application...")
+    # Check document database status
     try:
+        from document_status_logger import log_document_status
+        document_count = await log_document_status()
+        # Only setup sample documents if database is truly empty
+        if document_count < 5:
+            logger.info("📝 Database is empty - setting up sample documents...")
+            from setup_documents import setup_sample_documents
+            await setup_sample_documents()
+        else:
+            logger.info(f"✅ Voice Bot ready with {document_count:,} documents in knowledge base")
     except Exception as e:
+        logger.warning(f"⚠️ Could not check document status: {e}")
+        # Fallback to basic sample setup
+        try:
+            from setup_documents import setup_sample_documents
+            await setup_sample_documents()
+        except Exception as e2:
+            logger.error(f"❌ Could not setup sample documents: {e2}")
     logger.info("✅ Application started successfully")
     yield

bulk_document_loader.py ADDED Viewed

	@@ -0,0 +1,47 @@

+"""
+Enhanced document loader for 1000+ government documents
+Add this to your setup_documents.py or create as separate service
+"""
+import os
+import json
+from pathlib import Path
+def load_bulk_documents():
+    """Load documents from external sources"""
+    # Example: Load from a documents directory
+    documents = []
+    # Option 1: Load from JSON files
+    docs_dir = Path("government_docs")  # Create this directory
+    if docs_dir.exists():
+        for json_file in docs_dir.glob("*.json"):
+            with open(json_file, 'r', encoding='utf-8') as f:
+                batch_docs = json.load(f)
+                documents.extend(batch_docs)
+    # Option 2: Load from text files
+    text_docs_dir = Path("text_documents")
+    if text_docs_dir.exists():
+        for txt_file in text_docs_dir.glob("*.txt"):
+            with open(txt_file, 'r', encoding='utf-8') as f:
+                content = f.read()
+                documents.append({
+                    "content": content,
+                    "filename": txt_file.name,
+                    "source": "Government Policy Manual"
+                })
+    # Option 3: Load from PDF directory (requires PyPDF2)
+    # pdf_docs_dir = Path("pdf_documents")
+    # if pdf_docs_dir.exists():
+    #     import PyPDF2
+    #     for pdf_file in pdf_docs_dir.glob("*.pdf"):
+    #         # Extract text from PDF and add to documents
+    return documents
+# Add this to your setup_sample_documents() function:
+# bulk_docs = load_bulk_documents()
+# SAMPLE_DOCUMENTS.extend(bulk_docs)

check_document_count.py ADDED Viewed

	@@ -0,0 +1,93 @@

+#!/usr/bin/env python3
+"""
+Check how many documents are actually in the LanceDB database
+"""
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+from lancedb_service import lancedb_service
+import logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+def check_document_count():
+    """Check how many documents are in each table"""
+    try:
+        db = lancedb_service.db
+        print("📊 Document Count Analysis")
+        print("=" * 50)
+        # Check all tables
+        table_names = db.table_names()
+        print(f"Available tables: {table_names}")
+        print()
+        for table_name in table_names:
+            try:
+                table = db.open_table(table_name)
+                count = table.count_rows()
+                print(f"📋 {table_name}: {count} documents")
+                # Show sample data for document tables
+                if count > 0 and 'document' in table_name.lower():
+                    print(f"   Sample documents from {table_name}:")
+                    sample = table.head(3)
+                    for i, row in enumerate(sample.to_pylist()):
+                        content_preview = row.get('content', 'No content')[:100] + "..." if len(row.get('content', '')) > 100 else row.get('content', 'No content')
+                        filename = row.get('filename', 'No filename')
+                        print(f"   - Document {i+1}: {filename}")
+                        print(f"     Content: {content_preview}")
+                    print()
+            except Exception as e:
+                print(f"❌ Error checking {table_name}: {e}")
+        print("\n🔍 Voice Bot Document Usage Analysis:")
+        print("-" * 40)
+        # Check if voice bot is using documents
+        if 'rajasthan_documents' in table_names:
+            raj_table = db.open_table('rajasthan_documents')
+            raj_count = raj_table.count_rows()
+            print(f"✅ Voice Bot has access to {raj_count} Rajasthan documents")
+            if raj_count > 0:
+                print("📄 Document topics include:")
+                documents = raj_table.head(10).to_pylist()
+                for doc in documents:
+                    filename = doc.get('filename', 'Unknown')
+                    content_snippet = doc.get('content', '')[:200] + "..."
+                    print(f"   • {filename}")
+                    if 'pension' in content_snippet.lower():
+                        print("     - Contains pension information ✅")
+                    if 'leave' in content_snippet.lower():
+                        print("     - Contains leave information ✅")
+                    if 'salary' in content_snippet.lower():
+                        print("     - Contains salary information ✅")
+        else:
+            print("❌ No rajasthan_documents table found!")
+        # Check regular documents table
+        if 'documents' in table_names:
+            doc_table = db.open_table('documents')
+            doc_count = doc_table.count_rows()
+            print(f"📚 General documents table: {doc_count} documents")
+        print(f"\n📝 Summary:")
+        print(f"- The voice bot is {'✅ USING' if raj_count > 0 else '❌ NOT USING'} the document database")
+        print(f"- Total accessible documents: {raj_count if 'rajasthan_documents' in table_names else 0}")
+        print(f"- This is {'✅ GOOD' if raj_count >= 5 else '⚠️ LIMITED'} for comprehensive responses")
+        if raj_count < 100:
+            print(f"\n💡 Note: You mentioned 1000+ documents, but only {raj_count} are currently loaded.")
+            print("   Consider adding more documents to improve response quality.")
+    except Exception as e:
+        logger.error(f"❌ Error checking document count: {e}")
+if __name__ == "__main__":
+    check_document_count()

document_status_logger.py ADDED Viewed

	@@ -0,0 +1,58 @@

+"""
+Enhanced startup logging to show actual document usage
+"""
+import logging
+from lancedb_service import lancedb_service
+logger = logging.getLogger("app")
+async def log_document_status():
+    """Log comprehensive document status during startup"""
+    try:
+        logger.info("📊 Document Database Status Check...")
+        total_documents = 0
+        # Check rajasthan_documents table (main voice bot documents)
+        if "rajasthan_documents" in lancedb_service.db.table_names():
+            raj_table = lancedb_service.db.open_table("rajasthan_documents")
+            raj_count = raj_table.count_rows()
+            total_documents += raj_count
+            logger.info(f"🏛️ Rajasthan Documents: {raj_count:,} (Voice Bot Primary Source)")
+        # Check general documents table
+        if "documents" in lancedb_service.db.table_names():
+            doc_table = lancedb_service.db.open_table("documents")
+            doc_count = doc_table.count_rows()
+            total_documents += doc_count
+            logger.info(f"📚 General Documents: {doc_count:,}")
+        # Summary
+        logger.info(f"🎯 TOTAL AVAILABLE DOCUMENTS: {total_documents:,}")
+        if total_documents >= 1000:
+            logger.info("✅ Voice Bot has EXCELLENT document coverage (1000+ docs)")
+        elif total_documents >= 100:
+            logger.info("⚠️ Voice Bot has GOOD document coverage (100+ docs)")
+        elif total_documents >= 10:
+            logger.info("⚠️ Voice Bot has LIMITED document coverage (<100 docs)")
+        else:
+            logger.info("❌ Voice Bot has MINIMAL document coverage")
+        # Show sample document topics if available
+        if "rajasthan_documents" in lancedb_service.db.table_names():
+            raj_table = lancedb_service.db.open_table("rajasthan_documents")
+            if raj_table.count_rows() > 0:
+                sample = raj_table.head(3).to_pylist()
+                logger.info("📄 Sample document topics available:")
+                for i, doc in enumerate(sample, 1):
+                    filename = doc.get('filename', 'Unknown')
+                    content_preview = doc.get('content', '')[:100] + "..."
+                    logger.info(f"   {i}. {filename}: {content_preview}")
+        return total_documents
+    except Exception as e:
+        logger.error(f"❌ Error checking document status: {e}")
+        return 0