#!/usr/bin/env python3
"""
Test script to verify the evaluation pipeline works with mock mode.
"""

import os
import sys

# Add src to path for imports
sys.path.append('src')

from evaluator import evaluator
from models_registry import models_registry
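# NOTE: models_registry is not referenced directly below; it is imported on the
# assumption that doing so registers the available models with the evaluator.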


def test_evaluation_pipeline():
    """Test the complete evaluation pipeline with mock mode."""
    print("🧪 Testing Evaluation Pipeline with Mock Mode")
    print("=" * 50)
    
    # Enable mock mode so the pipeline can be exercised without a real model
    # backend (assumed behavior of the MOCK_MODE flag).
    os.environ["MOCK_MODE"] = "true"
    
    # Test parameters
    dataset_name = "nyc_taxi_small"
    dialect = "presto"
    case_id = "avg_fare_amount"
    model_name = "CodeLlama-7B-Instruct"
    
    # Load prompt template
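    # (assumed to contain placeholders that evaluate_model_on_case fills in
    # with the case's question and schema)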
    template_path = f"prompts/template_{dialect}.txt"
    with open(template_path, 'r', encoding='utf-8') as f:
        prompt_template = f.read()
    
    print(f"Testing evaluation:")
    print(f"  Dataset: {dataset_name}")
    print(f"  Dialect: {dialect}")
    print(f"  Case: {case_id}")
    print(f"  Model: {model_name}")
    print()
    
    try:
        # Run evaluation
        result = evaluator.evaluate_model_on_case(
            model_name, dataset_name, case_id, dialect, prompt_template
        )
        
        print("✅ Evaluation completed successfully!")
        print()
        print("Results:")
        print(f"  Model: {result['model_name']}")
        print(f"  Question: {result['question']}")
        print(f"  Reference SQL: {result['reference_sql']}")
        print(f"  Generated SQL: {result['candidate_sql']}")
        print(f"  Composite Score: {result['composite_score']:.4f}")
        print(f"  Correctness: {result['correctness_exact']:.2f}")
        print(f"  Execution Success: {result['exec_success']:.2f}")
        print(f"  Result Match F1: {result['result_match_f1']:.4f}")
        print(f"  Latency: {result['latency_ms']:.1f}ms")
        print(f"  Dialect OK: {result['dialect_ok']:.2f}")
        
        # Check if we got reasonable results
        if result['composite_score'] > 0:
            print("\n🎉 SUCCESS: Evaluation pipeline is working!")
            return True
        else:
            print("\n❌ ISSUE: All scores are zero")
            return False
            
    except Exception as e:
        print(f"❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    success = test_evaluation_pipeline()
    sys.exit(0 if success else 1)