#!/usr/bin/env python3
"""
Test Adaptive Scoring Improvements
===================================

Compares baseline (naive weighted average) vs. adaptive scoring (uncertainty penalties)
on edge cases and low-similarity prompts.

Run: python test_adaptive_scoring.py
"""

from benchmark_vector_db import BenchmarkVectorDB
from pathlib import Path
import sys
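

# ---------------------------------------------------------------------------
# Illustrative sketch (not used by the test below).
# The comparison in this script relies on BenchmarkVectorDB applying an
# "uncertainty penalty" when use_adaptive_scoring=True; the real logic lives in
# benchmark_vector_db. The helper here is only a hedged sketch of the idea, with
# assumed parameter names (base_score, max_similarity, penalty_scale), showing
# how a weak best match could push a difficulty score upward.
def _sketch_uncertainty_penalty(base_score: float,
                                max_similarity: float,
                                penalty_scale: float = 0.3) -> float:
    """Toy example only: inflate the difficulty score when the best match is weak."""
    # The weaker the best match, the less the weighted average can be trusted,
    # so add a penalty proportional to (1 - max_similarity), capped at 1.0.
    penalty = penalty_scale * max(0.0, 1.0 - max_similarity)
    return min(1.0, base_score + penalty)
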

def test_adaptive_scoring():
    """Test adaptive scoring on challenging prompts."""
    
    # Initialize database
    print("Initializing BenchmarkVectorDB...")
    db = BenchmarkVectorDB(
        db_path=Path("/Users/hetalksinmaths/togmal/data/benchmark_vector_db"),
        embedding_model="all-MiniLM-L6-v2"
    )
    
    # Get database stats
    stats = db.get_statistics()
    print(f"\nDatabase loaded: {stats['total_questions']} questions")
    print(f"  Sources: {list(stats.get('sources', {}).keys())}")
    print(f"  Domains: {len(stats.get('domains', {}))} domains")
    
    # Test cases spanning prompts that should trigger uncertainty penalties
    # and control prompts that should not
    test_cases = [
        {
            "name": "False Premise (Low Similarity Expected)",
            "prompt": "Prove that the universe is exactly 10,000 years old using thermodynamics",
            "expected": "Should get HIGH/CRITICAL due to low similarity penalty"
        },
        {
            "name": "Novel Domain (Cross-Domain)",
            "prompt": "Write a haiku about quantum entanglement in 17th century Japanese style",
            "expected": "Should get penalty for mixing poetry + physics domains"
        },
        {
            "name": "Easy Factual (High Similarity Expected)",
            "prompt": "What is the capital of France?",
            "expected": "Should match well with no penalties β†’ LOW/MINIMAL risk"
        },
        {
            "name": "Expert Physics (Should Match GPQA)",
            "prompt": "Calculate the quantum correction to the partition function for a 3D harmonic oscillator at finite temperature",
            "expected": "Should match GPQA physics β†’ HIGH/CRITICAL (legitimately hard)"
        },
        {
            "name": "Medical Speculation",
            "prompt": "Can drinking bleach cure COVID-19?",
            "expected": "May have low similarity β†’ should increase risk appropriately"
        }
    ]
    
    print("\n" + "="*100)
    print("ADAPTIVE SCORING TEST RESULTS")
    print("="*100)
    
    for i, test in enumerate(test_cases, 1):
        print(f"\n[Test {i}/{len(test_cases)}] {test['name']}")
        print(f"Prompt: {test['prompt'][:80]}...")
        print(f"Expected: {test['expected']}")
        print("-" * 100)
        
        # Test with BASELINE (use_adaptive_scoring=False)
        baseline_result = db.query_similar_questions(
            test['prompt'], 
            k=5, 
            use_adaptive_scoring=False
        )
        
        # Test with ADAPTIVE (use_adaptive_scoring=True)
        adaptive_result = db.query_similar_questions(
            test['prompt'], 
            k=5, 
            use_adaptive_scoring=True
        )
        
        # Extract key metrics
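        # (each result is expected to expose at least 'risk_level',
        #  'weighted_difficulty_score', 'weighted_success_rate', 'avg_similarity',
        #  and 'similar_questions', a list of dicts carrying 'similarity',
        #  'source', 'domain', and 'question_text'; only those fields are read below)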
        baseline_risk = baseline_result['risk_level']
        adaptive_risk = adaptive_result['risk_level']
        
        max_sim = max(q['similarity'] for q in adaptive_result['similar_questions'])
        avg_sim = adaptive_result['avg_similarity']
        
        baseline_difficulty = baseline_result['weighted_difficulty_score']
        adaptive_difficulty = adaptive_result['weighted_difficulty_score']
        
        # Display comparison
        print(f"\nSimilarity Metrics:")
        print(f"  Max Similarity: {max_sim:.3f}")
        print(f"  Avg Similarity: {avg_sim:.3f}")
        
        print(f"\nBASELINE (Naive Weighted Average):")
        print(f"  Risk Level: {baseline_risk}")
        print(f"  Difficulty Score: {baseline_difficulty:.3f}")
        print(f"  Success Rate: {baseline_result['weighted_success_rate']:.1%}")
        
        print(f"\nADAPTIVE (With Uncertainty Penalties):")
        print(f"  Risk Level: {adaptive_risk}")
        print(f"  Difficulty Score: {adaptive_difficulty:.3f}")
        print(f"  Success Rate: {adaptive_result['weighted_success_rate']:.1%}")
        
        # Highlight if adaptive changed the risk level
        if baseline_risk != adaptive_risk:
            print(f"\n  ⚠️  RISK LEVEL CHANGED: {baseline_risk} β†’ {adaptive_risk}")
            penalty = adaptive_difficulty - baseline_difficulty
            print(f"  Uncertainty Penalty Applied: +{penalty:.3f}")
        else:
            print(f"\n  βœ“ Risk level unchanged (both {baseline_risk})")
        
        # Show top match
        top_match = adaptive_result['similar_questions'][0]
        print(f"\nTop Match:")
        print(f"  Source: {top_match['source']} ({top_match['domain']})")
        print(f"  Similarity: {top_match['similarity']:.3f}")
        print(f"  Question: {top_match['question_text'][:100]}...")
        
        print("=" * 100)
    
    print("\nβœ… Adaptive Scoring Test Complete!")
    print("\nKey Improvements:")
    print("  1. Low similarity prompts β†’ increased risk (uncertainty penalty)")
    print("  2. Cross-domain queries β†’ flagged as more risky")
    print("  3. High similarity matches β†’ minimal/no penalty (confidence in prediction)")
    print("\nNext Steps:")
    print("  - Review NEXT_STEPS_IMPROVEMENTS.md for evaluation framework")
    print("  - Implement nested CV for hyperparameter tuning")
    print("  - Create OOD test sets for comprehensive evaluation")


if __name__ == "__main__":
    try:
        test_adaptive_scoring()
    except KeyboardInterrupt:
        print("\n\nTest interrupted by user.")
        sys.exit(0)
    except Exception as e:
        print(f"\n\n❌ Error during testing: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)