File size: 3,223 Bytes
62fbd09
 
 
 
 
 
 
 
 
 
 
 
 
dc97fe1
62fbd09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc97fe1
62fbd09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import requests
import sys
import time
import subprocess
import os
from typing import Dict, List, Any

# Make the current working directory importable so the `backend` package
# imported by the checks below can be resolved.
# NOTE(review): this relies on the script being launched from the directory
# that contains `backend/` (presumably the project root) — confirm.
sys.path.append(os.getcwd())

def test_internal_logic():
    """Validate task enumeration and grading of the environment in-process.

    Imports ``CustomerSupportEnv`` and ``TASKS`` from ``backend.env`` and then
    checks that:

    1. the environment can be constructed and ``get_tasks()`` returns at
       least three tasks,
    2. every task has ``has_grader``, ``has_evaluator`` and ``grader`` set to
       exactly ``True``,
    3. ``grade()`` runs on a sample trajectory for the first task and yields
       a score within ``[0.0, 1.0]``.

    Each failure is printed with a short explanation.

    Returns:
        bool: ``True`` when every check passes, ``False`` otherwise.
    """
    print("🔍 [TEST] Internal Logic & Task Enumeration...")
    try:
        # TASKS is not used below; importing it still fails this check when
        # backend.env does not expose it.
        from backend.env import CustomerSupportEnv, TASKS  # noqa: F401
    except ImportError as e:
        print(f"❌ Error: Could not import environment components: {e}")
        return False

    # 1. Check get_tasks exists and returns correct number.
    # Guarded so that a missing method or a failing constructor is reported
    # as a validation failure instead of aborting the script with a traceback.
    try:
        env = CustomerSupportEnv()
        tasks = env.get_tasks()
        task_count = len(tasks)
    except Exception as e:
        print(f"❌ Error: Could not enumerate tasks via get_tasks(): {e}")
        return False

    print(f"✅ Found {task_count} tasks via get_tasks().")

    if task_count < 3:
        print(f"❌ Error: Only found {task_count} tasks, expected at least 3.")
        return False

    # 2. Check each task has required metadata (flags must be the literal True,
    # not merely truthy).
    required_keys = ('has_grader', 'has_evaluator', 'grader')
    for task in tasks:
        task_id = task.get('id')
        for key in required_keys:
            if task.get(key) is not True:
                print(f"❌ Error: Task {task_id} {key} is NOT True.")
                return False

    # 3. Test Grading
    mock_state = {
        "classification": "refund",
        "priority": "high",
        "status": "closed",
        "response": "sorry",
        "sentiment": "angry",
    }
    ground_truth = {
        "expected_classification": "refund",
        "expected_priority": "high",
        "sentiment": "angry",
    }
    try:
        score = env.grade(tasks[0]['id'], [{"state": mock_state}], ground_truth)
        print(f"✅ Grading execution successful. Score: {score:.3f}")
        if not (0.0 <= score <= 1.0):
            print("❌ Error: Score out of range!")
            return False
    except Exception as e:
        # Also covers a non-numeric score, which fails in the format/compare above.
        print(f"❌ Error: grade() method failed: {e}")
        return False

    print("✅ Internal logic tests passed!\n")
    return True

def _wait_for_server(process, url, timeout=15.0, poll_interval=0.25):
    """Poll *url* until the server answers, its process exits, or *timeout* elapses.

    Args:
        process: The ``subprocess.Popen`` handle of the server.
        url: URL to probe; any HTTP response (whatever its status) counts as ready.
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between failed probes.

    Returns:
        bool: ``True`` once a response is received, ``False`` if the process
        has terminated or the deadline passed first.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if process.poll() is not None:
            # The server died (e.g. import error, port in use); waiting is pointless.
            return False
        try:
            requests.get(url, timeout=2)
        except requests.exceptions.RequestException:
            time.sleep(poll_interval)
        else:
            return True
    return False


def test_endpoints():
    """Start the API with uvicorn and smoke-test ``/tasks`` and ``/grader``.

    Launches ``backend.main:app`` on port 7861 in a child process, waits until
    it answers, then checks that:

    * ``GET /tasks`` returns HTTP 200 with at least three entries,
    * ``GET /grader?task_id=<first task id>`` returns HTTP 200 with a JSON
      body containing ``"score"``.

    The child process is always terminated and reaped before returning.

    Returns:
        bool: ``True`` when both endpoints behave as expected, ``False`` otherwise.
    """
    print("🔍 [TEST] API Endpoints...")

    base_url = "http://localhost:7861"
    request_timeout = 10  # seconds; without it a hung server blocks the script forever

    # Start the server
    cmd = [sys.executable, "-m", "uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "7861"]
    # Output is discarded rather than piped: a PIPE that is never read can fill
    # up and block the server process.
    process = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    try:
        if not _wait_for_server(process, f"{base_url}/tasks"):
            print("❌ Error: API server did not become ready.")
            return False

        # Test /tasks
        r = requests.get(f"{base_url}/tasks", timeout=request_timeout)
        tasks = r.json() if r.status_code == 200 else None
        if tasks is None or len(tasks) < 3:
            print("❌ Error: /tasks endpoint failed.")
            return False

        # Test /grader (params= lets requests URL-encode the task id)
        task_id = tasks[0]["id"]
        r_grader = requests.get(
            f"{base_url}/grader",
            params={"task_id": task_id},
            timeout=request_timeout,
        )
        if r_grader.status_code != 200 or "score" not in r_grader.json():
            print("❌ Error: /grader endpoint failed.")
            return False

        print("✅ API endpoint tests passed!")
        return True
    except Exception as e:
        print(f"❌ Error during API test: {e}")
        return False
    finally:
        # Stop the server and reap it so no orphan/zombie process is left behind.
        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()

def main():
    """Run all validation stages and exit with status 0 on success, 1 on failure.

    The endpoint test is only run when the internal-logic test passes
    (short-circuiting ``and``).
    """
    print("🚀 Starting consolidated validation...")
    passed = test_internal_logic() and test_endpoints()
    if passed:
        print("\n✨ ALL VALIDATION CHECKS PASSED!")
    else:
        print("\n❌ VALIDATION FAILED.")
    sys.exit(0 if passed else 1)


if __name__ == "__main__":
    main()