File size: 3,446 Bytes
3998131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Smoke-test retrieval from the vector database.
Runs a fixed set of sample queries and prints the top matches so that
semantic search quality can be checked by eye (nothing is asserted).
"""

import logging
from typing import List, Tuple

from .embeddings import EmbeddingGenerator
from .vector_db import LegalVectorDB
from .config import LOG_LEVEL, LOG_FORMAT

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Root logging setup; level and format come from the package's config module.
logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL)


def test_query(
    query: str,
    vector_db: LegalVectorDB,
    embedder: EmbeddingGenerator,
    n_results: int = 3
) -> None:
    """
    Run a single query against the vector database and print the hits.

    The query is embedded with ``embedder.generate_embedding``, looked up via
    ``vector_db.query_with_embedding`` and each hit is printed with its
    distance, a few metadata fields and a text preview of at most 200
    characters. Nothing is returned; all output goes to stdout.

    Args:
        query: Query string
        vector_db: Vector database instance
        embedder: Embedding generator instance
        n_results: Number of results to retrieve
    """
    print(f"\n{'=' * 80}")
    print(f"Query: {query}")
    print(f"{'=' * 80}")

    # Generate query embedding
    query_embedding = embedder.generate_embedding(query)

    # Search (the embedding is handed over as a plain list)
    results = vector_db.query_with_embedding(query_embedding.tolist(), n_results=n_results)

    # Result fields are nested per query; index 0 is the single query sent
    # here. An empty or None outer list is treated as "no hits" instead of
    # letting the [0] lookup raise.
    documents = (results['documents'] or [[]])[0]
    if not documents:
        print("No results found!")
        return

    # Display results
    for i, (doc, metadata, distance) in enumerate(zip(
        documents,
        results['metadatas'][0],
        results['distances'][0]
    ), 1):
        # NOTE(review): a hit may presumably come back with None metadata
        # (Chroma-style backends do this for chunks stored without any);
        # fall back to an empty mapping so the .get() lookups print 'N/A'.
        metadata = metadata or {}
        # Only show the ellipsis when the text was actually cut off.
        ellipsis = "..." if len(doc) > 200 else ""
        print(f"\nResult {i} (Distance: {distance:.4f}):")
        print(f"  Source: {metadata.get('source_file', 'N/A')}")
        print(f"  Section: {metadata.get('article_section', 'N/A')}")
        print(f"  Words: {metadata.get('word_count', 'N/A')}")
        print(f"  Text preview: {doc[:200]}{ellipsis}")
        print("-" * 80)


def main():
    """Run the sample retrieval queries and report the outcome.

    Loads the embedding model and the vector database, stops early when the
    database reports zero indexed chunks, and otherwise passes each sample
    query to ``test_query``.

    Returns:
        0 when all queries ran without raising; 1 when the database is empty
        or any exception was raised (the exception is logged with traceback).
    """
    banner = "=" * 80

    print(banner)
    print("Testing Vector Database Retrieval")
    print(banner)

    try:
        # Bring up the two collaborators
        print("\nInitializing embedding model and vector database...")
        embedder = EmbeddingGenerator()
        vector_db = LegalVectorDB()

        db_count = vector_db.get_count()
        print(f"✓ Embedding model loaded: {embedder.model_name}")
        print(f"✓ Vector database loaded: {db_count} chunks indexed")

        # Nothing to search: tell the user how to build the index first
        if db_count == 0:
            print("\n✗ Error: Vector database is empty!")
            print("Please run 'python -m module_a.build_vector_db' first")
            return 1

        # Sample queries taken from the implementation guide
        sample_queries = (
            "I am a single mother, how to get citizenship for my child?",
            "Can daughters inherit property like sons?",
            "What documents needed for marriage registration?",
            "citizenship through mother",
            "right to equality",
            "fundamental rights of citizens",
        )

        print(f"\nRunning {len(sample_queries)} test queries...")

        for sample in sample_queries:
            test_query(sample, vector_db, embedder, n_results=3)

        print(f"\n{banner}")
        print("Retrieval Testing Complete!")
        print(banner)
        print("\n✓ All test queries executed successfully")
        print("✓ Vector database is working correctly")
        print("\nNext step: Integrate with LLM for Step 4 (RAG chain)")

    except Exception as exc:
        # Top-level boundary: log at ERROR level with traceback, return failure
        logger.exception(f"Testing failed: {exc}")
        print(f"\n✗ Testing failed: {exc}")
        return 1

    return 0


if __name__ == "__main__":
    # Hand main()'s status code to the shell. SystemExit is raised directly
    # because the bare exit() helper is injected by the site module for
    # interactive use and is missing when Python runs with -S or frozen.
    raise SystemExit(main())