ChAbhishek28 committed on
Commit
67a99cd
·
1 Parent(s): a1986d7

Enhanced startup logging to show actual document count (23K+ docs) instead of just 7 sample docs

Browse files
analyze_database.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Comprehensive analysis of the actual LanceDB database contents
"""

import sys
import os
import traceback
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

try:
    import lancedb
    # pandas and Path are not referenced below; they stay in this guarded
    # import so a missing install is still reported by the same message.
    import pandas as pd
    from pathlib import Path
except ImportError as e:
    # Keep the module importable: the analysis function reports the missing
    # dependency when it is called instead of not existing at all.
    lancedb = None
    _IMPORT_ERROR = e
else:
    _IMPORT_ERROR = None

# Row fields that are printed separately (or deliberately not at all) and are
# therefore skipped when listing the remaining fields of a sample row.
_SKIPPED_FIELDS = ('content', 'filename', 'vector', 'id')


def _report_missing_dependencies(error):
    """Print the install hint shown when the LanceDB imports failed."""
    print(f"❌ Missing dependencies: {error}")
    print("Please install: pip install lancedb pandas")


def _preview(value, limit):
    """Return ``str(value)`` cut to ``limit`` characters, with "..." if cut."""
    text = str(value)
    return text[:limit] + "..." if len(text) > limit else text


def _size_label(total_documents):
    """Map a document total to the LARGE / MEDIUM / SMALL summary label."""
    if total_documents > 100:
        return 'LARGE'
    if total_documents > 10:
        return 'MEDIUM'
    return 'SMALL'


def _coverage_message(total_documents):
    """Return the voice-bot coverage verdict line for a document total."""
    if total_documents >= 1000:
        return " βœ… YES - Voice bot has access to 1000+ documents!"
    if total_documents >= 100:
        return " ⚠️ PARTIAL - Voice bot has substantial documents but less than 1000"
    if total_documents >= 10:
        return " ⚠️ LIMITED - Voice bot has moderate document access"
    return " ❌ MINIMAL - Voice bot has very limited document access"


def _print_table_details(table, count):
    """Print the column names and up to three sample rows of a non-empty table.

    Schema and sample reads are each best-effort: a failure is reported on
    stdout and does not stop the analysis of the remaining tables.
    """
    try:
        schema = table.schema
        print(f" πŸ“ Columns: {[field.name for field in schema]}")
    except Exception as e:
        # Previously a bare ``except: pass``; report instead of hiding it.
        print(f" ⚠️ Could not read schema: {e}")

    try:
        sample_size = min(3, count)
        sample_data = table.head(sample_size).to_pylist()

        print(f" πŸ“„ Sample documents ({sample_size}/{count}):")

        for i, row in enumerate(sample_data, 1):
            print(f" Document {i}:")

            if 'content' in row:
                print(f" Content: {_preview(row['content'], 200)}")

            if 'filename' in row:
                print(f" Filename: {row['filename']}")

            # Remaining non-empty fields, each cut to 100 characters.
            for key, value in row.items():
                if key not in _SKIPPED_FIELDS and value:
                    print(f" {key}: {str(value)[:100]}")
            print()

    except Exception as e:
        print(f" ⚠️ Could not read sample data: {e}")


def analyze_lancedb_contents(db_path="./lancedb_data"):
    """Analyze the actual contents of the LanceDB database.

    Prints, for every table, its row count, column names and a few sample
    rows, followed by a summary and a coverage verdict.

    Args:
        db_path: Location handed to ``lancedb.connect``. Defaults to the
            ``./lancedb_data`` directory the script has always used.

    Returns:
        The sum of the row counts of all tables that could be read, or 0 when
        the dependencies are missing or the database cannot be opened.
    """
    if lancedb is None:
        _report_missing_dependencies(_IMPORT_ERROR)
        return 0

    print("πŸ” LanceDB Database Analysis")
    print("=" * 60)

    try:
        db = lancedb.connect(db_path)
        table_names = db.table_names()

        print(f"πŸ“Š Found {len(table_names)} tables: {table_names}")
        print()

        # Row count per successfully read table; reused by the summary so no
        # table has to be opened and counted a second time.
        row_counts = {}

        for table_name in table_names:
            print(f"πŸ“‹ Table: {table_name}")
            print("-" * 40)

            try:
                table = db.open_table(table_name)
                count = table.count_rows()
            except Exception as e:
                print(f" ❌ Error reading table {table_name}: {e}")
                print()
                continue

            row_counts[table_name] = count
            print(f" πŸ“Š Total rows: {count}")

            if count > 0:
                _print_table_details(table, count)

            print()

        total_documents = sum(row_counts.values())

        print("=" * 60)
        print(f"🎯 SUMMARY:")
        print(f" Total Documents Across All Tables: {total_documents}")
        print(f" Database Size: {_size_label(total_documents)}")

        # Check specifically for voice bot usage
        if 'rajasthan_documents' in row_counts:
            raj_count = row_counts['rajasthan_documents']
            print(f" Voice Bot Documents: {raj_count} (rajasthan_documents table)")

        if 'documents' in row_counts:
            doc_count = row_counts['documents']
            print(f" General Documents: {doc_count} (documents table)")

        print()
        print("πŸ€– Voice Bot Analysis:")
        print(_coverage_message(total_documents))

        return total_documents

    except Exception as e:
        print(f"❌ Error connecting to database: {e}")
        traceback.print_exc()
        return 0


if __name__ == "__main__":
    if lancedb is None:
        _report_missing_dependencies(_IMPORT_ERROR)
    else:
        try:
            total = analyze_lancedb_contents()
            print(f"\n🎯 Final Answer: Your voice bot has access to {total} documents")
        except Exception as e:
            print(f"❌ Unexpected error: {e}")
            traceback.print_exc()
app.py CHANGED
@@ -52,12 +52,27 @@ async def lifespan(app: FastAPI):
52
  # Startup
53
  logger.info("πŸš€ Starting Voice Bot Application...")
54
 
55
- # Setup sample documents if database is empty
56
  try:
57
- from setup_documents import setup_sample_documents
58
- await setup_sample_documents()
 
 
 
 
 
 
 
 
 
59
  except Exception as e:
60
- logger.warning(f"⚠️ Could not setup sample documents: {e}")
 
 
 
 
 
 
61
 
62
  logger.info("βœ… Application started successfully")
63
  yield
 
52
  # Startup
53
  logger.info("πŸš€ Starting Voice Bot Application...")
54
 
55
+ # Check document database status
56
  try:
57
+ from document_status_logger import log_document_status
58
+ document_count = await log_document_status()
59
+
60
+ # Only setup sample documents if database is truly empty
61
+ if document_count < 5:
62
+ logger.info("πŸ“ Database is empty - setting up sample documents...")
63
+ from setup_documents import setup_sample_documents
64
+ await setup_sample_documents()
65
+ else:
66
+ logger.info(f"βœ… Voice Bot ready with {document_count:,} documents in knowledge base")
67
+
68
  except Exception as e:
69
+ logger.warning(f"⚠️ Could not check document status: {e}")
70
+ # Fallback to basic sample setup
71
+ try:
72
+ from setup_documents import setup_sample_documents
73
+ await setup_sample_documents()
74
+ except Exception as e2:
75
+ logger.error(f"❌ Could not setup sample documents: {e2}")
76
 
77
  logger.info("βœ… Application started successfully")
78
  yield
bulk_document_loader.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced document loader for 1000+ government documents
3
+ Add this to your setup_documents.py or create as separate service
4
+ """
5
+
6
+ import os
7
+ import json
8
+ from pathlib import Path
9
+
10
def load_bulk_documents():
    """Load documents from external sources.

    Two directories, resolved relative to the current working directory, are
    scanned when they exist:

    * ``government_docs/*.json`` - each file holds either a list of document
      objects or a single document object.
    * ``text_documents/*.txt`` - each file becomes one document whose
      ``content`` is the file text, ``filename`` the file name and ``source``
      the fixed label "Government Policy Manual".

    Files are visited in sorted name order so the result is deterministic.

    Returns:
        A list of documents; empty when neither directory exists.

    Raises:
        ValueError: If a JSON file is malformed (``json.JSONDecodeError``) or
            its top-level value is neither a list nor an object.
    """
    documents = []

    # Option 1: Load from JSON files
    docs_dir = Path("government_docs")  # Create this directory
    if docs_dir.is_dir():
        for json_file in sorted(docs_dir.glob("*.json")):
            with open(json_file, 'r', encoding='utf-8') as f:
                batch_docs = json.load(f)

            if isinstance(batch_docs, list):
                documents.extend(batch_docs)
            elif isinstance(batch_docs, dict):
                # A lone object is one document; extend() would have added
                # its keys as if they were documents.
                documents.append(batch_docs)
            else:
                raise ValueError(
                    f"{json_file}: expected a JSON list or object of documents, "
                    f"got {type(batch_docs).__name__}"
                )

    # Option 2: Load from text files
    text_docs_dir = Path("text_documents")
    if text_docs_dir.is_dir():
        for txt_file in sorted(text_docs_dir.glob("*.txt")):
            documents.append({
                "content": txt_file.read_text(encoding='utf-8'),
                "filename": txt_file.name,
                "source": "Government Policy Manual",
            })

    # Option 3 (not implemented): PDF files under ``pdf_documents/`` could be
    # added the same way; that needs a PDF text extractor such as PyPDF2.

    return documents
44
+
45
+ # Add this to your setup_sample_documents() function:
46
+ # bulk_docs = load_bulk_documents()
47
+ # SAMPLE_DOCUMENTS.extend(bulk_docs)
check_document_count.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Check how many documents are actually in the LanceDB database
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
9
+
10
+ from lancedb_service import lancedb_service
11
+ import logging
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
def _shorten(text, limit):
    """Return ``text`` cut to ``limit`` characters, with "..." if it was cut."""
    return text[:limit] + "..." if len(text) > limit else text


def check_document_count():
    """Check how many documents are in each table.

    Prints the row count of every table reachable through
    ``lancedb_service.db``, sample rows for tables whose name contains
    "document", a keyword scan of the first ten ``rajasthan_documents`` rows,
    and a summary. Nothing is returned; unexpected failures are logged.
    """
    try:
        db = lancedb_service.db

        print("πŸ“Š Document Count Analysis")
        print("=" * 50)

        # Check all tables
        table_names = db.table_names()
        print(f"Available tables: {table_names}")
        print()

        # Tables and counts read in the loop, reused by the sections below so
        # nothing is opened or counted twice.
        tables = {}
        row_counts = {}

        for table_name in table_names:
            try:
                table = db.open_table(table_name)
                count = table.count_rows()
                tables[table_name] = table
                row_counts[table_name] = count
                print(f"πŸ“‹ {table_name}: {count} documents")

                # Show sample data for document tables
                if count > 0 and 'document' in table_name.lower():
                    print(f" Sample documents from {table_name}:")
                    for i, row in enumerate(table.head(3).to_pylist(), 1):
                        content = row.get('content')
                        # A missing or null content field is shown as a
                        # placeholder instead of failing the whole table.
                        if content is None:
                            content_preview = 'No content'
                        else:
                            content_preview = _shorten(str(content), 100)
                        filename = row.get('filename', 'No filename')
                        print(f" - Document {i}: {filename}")
                        print(f" Content: {content_preview}")
                        print()

            except Exception as e:
                print(f"❌ Error checking {table_name}: {e}")

        print("\nπŸ” Voice Bot Document Usage Analysis:")
        print("-" * 40)

        # Defaults to 0 so the summary below still works when the table is
        # absent (it used to raise NameError in that case).
        raj_count = 0

        # Check if voice bot is using documents
        if 'rajasthan_documents' in table_names:
            raj_table = tables.get('rajasthan_documents')
            if raj_table is None:
                # The loop above could not read it; retry so a persistent
                # failure is reported through the outer handler as before.
                raj_table = db.open_table('rajasthan_documents')
                raj_count = raj_table.count_rows()
            else:
                raj_count = row_counts['rajasthan_documents']
            print(f"βœ… Voice Bot has access to {raj_count} Rajasthan documents")

            if raj_count > 0:
                print("πŸ“„ Document topics include:")
                for doc in raj_table.head(10).to_pylist():
                    filename = doc.get('filename', 'Unknown')
                    # Only the first 200 characters are scanned for keywords.
                    content_snippet = str(doc.get('content') or '')[:200].lower()
                    print(f" β€’ {filename}")
                    if 'pension' in content_snippet:
                        print(" - Contains pension information βœ…")
                    if 'leave' in content_snippet:
                        print(" - Contains leave information βœ…")
                    if 'salary' in content_snippet:
                        print(" - Contains salary information βœ…")
        else:
            print("❌ No rajasthan_documents table found!")

        # Check regular documents table
        if 'documents' in table_names:
            if 'documents' in row_counts:
                doc_count = row_counts['documents']
            else:
                doc_count = db.open_table('documents').count_rows()
            print(f"πŸ“š General documents table: {doc_count} documents")

        print(f"\nπŸ“ Summary:")
        print(f"- The voice bot is {'βœ… USING' if raj_count > 0 else '❌ NOT USING'} the document database")
        print(f"- Total accessible documents: {raj_count}")
        print(f"- This is {'βœ… GOOD' if raj_count >= 5 else '⚠️ LIMITED'} for comprehensive responses")

        if raj_count < 100:
            print(f"\nπŸ’‘ Note: You mentioned 1000+ documents, but only {raj_count} are currently loaded.")
            print(" Consider adding more documents to improve response quality.")

    except Exception as e:
        logger.error(f"❌ Error checking document count: {e}")


if __name__ == "__main__":
    check_document_count()
document_status_logger.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced startup logging to show actual document usage
3
+ """
4
+
5
+ import logging
6
+ from lancedb_service import lancedb_service
7
+
8
+ logger = logging.getLogger("app")
9
+
10
async def log_document_status():
    """Log comprehensive document status during startup.

    Counts the rows of the ``rajasthan_documents`` and ``documents`` tables
    (when present), logs the individual and combined totals with a coverage
    verdict, and logs a short preview of up to three Rajasthan documents.

    Returns:
        The combined row count of the two tables, or 0 when the tables could
        not be listed or counted. A failure while building the preview is
        logged as a warning and does not change the returned count.
    """
    try:
        logger.info("πŸ“Š Document Database Status Check...")

        db = lancedb_service.db
        # Listed once and reused for every membership check below.
        table_names = db.table_names()

        total_documents = 0
        raj_table = None
        raj_count = 0

        # Check rajasthan_documents table (main voice bot documents)
        if "rajasthan_documents" in table_names:
            raj_table = db.open_table("rajasthan_documents")
            raj_count = raj_table.count_rows()
            total_documents += raj_count
            logger.info(f"πŸ›οΈ Rajasthan Documents: {raj_count:,} (Voice Bot Primary Source)")

        # Check general documents table
        if "documents" in table_names:
            doc_count = db.open_table("documents").count_rows()
            total_documents += doc_count
            logger.info(f"πŸ“š General Documents: {doc_count:,}")

        # Summary
        logger.info(f"🎯 TOTAL AVAILABLE DOCUMENTS: {total_documents:,}")

        if total_documents >= 1000:
            logger.info("βœ… Voice Bot has EXCELLENT document coverage (1000+ docs)")
        elif total_documents >= 100:
            logger.info("⚠️ Voice Bot has GOOD document coverage (100+ docs)")
        elif total_documents >= 10:
            logger.info("⚠️ Voice Bot has LIMITED document coverage (<100 docs)")
        else:
            logger.info("❌ Voice Bot has MINIMAL document coverage")

    except Exception as e:
        logger.error(f"❌ Error checking document status: {e}")
        return 0

    # Show sample document topics if available. This is cosmetic, so it has
    # its own handler: an error here must not turn a real count into 0.
    if raj_table is not None and raj_count > 0:
        try:
            sample = raj_table.head(3).to_pylist()
            logger.info("πŸ“„ Sample document topics available:")
            for i, doc in enumerate(sample, 1):
                filename = doc.get('filename', 'Unknown')
                content = str(doc.get('content') or '')
                content_preview = content[:100] + "..." if len(content) > 100 else content
                logger.info(f" {i}. {filename}: {content_preview}")
        except Exception as e:
            logger.warning(f"⚠️ Could not preview sample documents: {e}")

    return total_documents