#!/usr/bin/env python3
"""
Test Adaptive Scoring Improvements
===================================
Compares baseline (naive weighted average) vs. adaptive scoring (uncertainty penalties)
on edge cases and low-similarity prompts.
Run: python test_adaptive_scoring.py
"""
import sys
from pathlib import Path

from benchmark_vector_db import BenchmarkVectorDB

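# NOTE: The exact penalty formula lives in BenchmarkVectorDB.query_similar_questions;
# the sketch below is an assumption for orientation only (alpha and the linear form
# are illustrative, not confirmed by this file):
#
#     adjusted_difficulty = weighted_difficulty + alpha * (1.0 - max_similarity)
#
# i.e. the weaker the best match, the larger the uncertainty penalty, which is why
# the low-similarity prompts below are expected to land at HIGH/CRITICAL.
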
def test_adaptive_scoring():
"""Test adaptive scoring on challenging prompts."""
# Initialize database
print("Initializing BenchmarkVectorDB...")
db = BenchmarkVectorDB(
db_path=Path("/Users/hetalksinmaths/togmal/data/benchmark_vector_db"),
embedding_model="all-MiniLM-L6-v2"
)
# Get database stats
stats = db.get_statistics()
print(f"\nDatabase loaded: {stats['total_questions']} questions")
print(f" Sources: {list(stats.get('sources', {}).keys())}")
print(f" Domains: {len(stats.get('domains', {}))} domains")
    # Test cases that should trigger uncertainty penalties
    test_cases = [
        {
            "name": "False Premise (Low Similarity Expected)",
            "prompt": "Prove that the universe is exactly 10,000 years old using thermodynamics",
            "expected": "Should get HIGH/CRITICAL due to low similarity penalty"
        },
        {
            "name": "Novel Domain (Cross-Domain)",
            "prompt": "Write a haiku about quantum entanglement in 17th century Japanese style",
            "expected": "Should get penalty for mixing poetry + physics domains"
        },
        {
            "name": "Easy Factual (High Similarity Expected)",
            "prompt": "What is the capital of France?",
            "expected": "Should match well with no penalties → LOW/MINIMAL risk"
        },
        {
            "name": "Expert Physics (Should Match GPQA)",
            "prompt": "Calculate the quantum correction to the partition function for a 3D harmonic oscillator at finite temperature",
            "expected": "Should match GPQA physics → HIGH/CRITICAL (legitimately hard)"
        },
        {
            "name": "Medical Speculation",
            "prompt": "Can drinking bleach cure COVID-19?",
            "expected": "May have low similarity → should increase risk appropriately"
        }
    ]
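
    # Hypothetical, machine-checkable variant of the "expected" strings above
    # (the RISK_ORDER values are assumed from the labels used in this file,
    # not confirmed by BenchmarkVectorDB):
    #
    #     RISK_ORDER = ["MINIMAL", "LOW", "MODERATE", "HIGH", "CRITICAL"]
    #     got_riskier = RISK_ORDER.index(adaptive_risk) >= RISK_ORDER.index(baseline_risk)
    #
    # Kept as comments: this script is a manual comparison harness, not an assert suite.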
print("\n" + "="*100)
print("ADAPTIVE SCORING TEST RESULTS")
print("="*100)
for i, test in enumerate(test_cases, 1):
print(f"\n[Test {i}/{len(test_cases)}] {test['name']}")
print(f"Prompt: {test['prompt'][:80]}...")
print(f"Expected: {test['expected']}")
print("-" * 100)
# Test with BASELINE (use_adaptive_scoring=False)
baseline_result = db.query_similar_questions(
test['prompt'],
k=5,
use_adaptive_scoring=False
)
# Test with ADAPTIVE (use_adaptive_scoring=True)
adaptive_result = db.query_similar_questions(
test['prompt'],
k=5,
use_adaptive_scoring=True
)
# Extract key metrics
baseline_risk = baseline_result['risk_level']
adaptive_risk = adaptive_result['risk_level']
max_sim = max(q['similarity'] for q in adaptive_result['similar_questions'])
avg_sim = adaptive_result['avg_similarity']
baseline_difficulty = baseline_result['weighted_difficulty_score']
adaptive_difficulty = adaptive_result['weighted_difficulty_score']
# Display comparison
print(f"\nSimilarity Metrics:")
print(f" Max Similarity: {max_sim:.3f}")
print(f" Avg Similarity: {avg_sim:.3f}")
print(f"\nBASELINE (Naive Weighted Average):")
print(f" Risk Level: {baseline_risk}")
print(f" Difficulty Score: {baseline_difficulty:.3f}")
print(f" Success Rate: {baseline_result['weighted_success_rate']:.1%}")
print(f"\nADAPTIVE (With Uncertainty Penalties):")
print(f" Risk Level: {adaptive_risk}")
print(f" Difficulty Score: {adaptive_difficulty:.3f}")
print(f" Success Rate: {adaptive_result['weighted_success_rate']:.1%}")
        # Highlight if adaptive scoring changed the risk level
        if baseline_risk != adaptive_risk:
            print(f"\n  ⚠️  RISK LEVEL CHANGED: {baseline_risk} → {adaptive_risk}")
            penalty = adaptive_difficulty - baseline_difficulty
            # {penalty:+.3f} keeps the sign correct even if adaptive lowered the score
            print(f"  Uncertainty Penalty Applied: {penalty:+.3f}")
        else:
            print(f"\n  ✓ Risk level unchanged (both {baseline_risk})")
        # Show top match
        top_match = adaptive_result['similar_questions'][0]
        print("\nTop Match:")
        print(f"  Source: {top_match['source']} ({top_match['domain']})")
        print(f"  Similarity: {top_match['similarity']:.3f}")
        print(f"  Question: {top_match['question_text'][:100]}...")

    print("=" * 100)
print("\nβ
Adaptive Scoring Test Complete!")
print("\nKey Improvements:")
print(" 1. Low similarity prompts β increased risk (uncertainty penalty)")
print(" 2. Cross-domain queries β flagged as more risky")
print(" 3. High similarity matches β minimal/no penalty (confidence in prediction)")
print("\nNext Steps:")
print(" - Review NEXT_STEPS_IMPROVEMENTS.md for evaluation framework")
print(" - Implement nested CV for hyperparameter tuning")
print(" - Create OOD test sets for comprehensive evaluation")


if __name__ == "__main__":
    try:
        test_adaptive_scoring()
    except KeyboardInterrupt:
        print("\n\nTest interrupted by user.")
        sys.exit(0)
    except Exception as e:
        print(f"\n\n❌ Error during testing: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)