import os
from datetime import datetime
import openai
import json

from chat_utils import chat
from tests.test_config import TEST_QUESTIONS
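
# Illustrative shape of a TEST_QUESTIONS entry, inferred from the keys this
# script reads; the real entries live in tests/test_config.py and the values
# below are placeholders only:
# {
#     "q": "<question text>",
#     "type": "<question type>",
#     "difficulty": "<difficulty label>",
#     "expected_answer_summary": "<short expected answer>",
#     "expected_sources": ["<source name>"],
#     "expected_azhwar": ["<azhwar name>"],
#     "expected_topics": ["<topic>"],
#     "expected_keywords": ["<keyword>"],
#     "n_results": 1,
# }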

def validate_with_ai(test_entry, bot_response):
    """
    Validator works with narrative bot responses. 
    The bot does not need to output JSON.
    The LLM analyzes the bot response and returns a JSON validation.
    """
    prompt = f"""
You are a validator AI. The user provided the following bot response:

Bot Response:
\"\"\"{bot_response}\"\"\"

Expected attributes:
- Sources: {test_entry.get('expected_sources', [])}
- Azhwar: {test_entry.get('expected_azhwar', [])}
- Topics: {test_entry.get('expected_topics', [])}
- Keywords: {test_entry.get('expected_keywords', [])}
- Number of results: {test_entry.get('n_results', 1)}

Check the bot response and answer **only** in JSON with two fields:
{{
  "valid": true/false,   // True if bot response matches the expected attributes
  "feedback": "short explanation why it passed or failed"
}}

Do **not** ask the bot to output the JSON itself; analyze the narrative response yourself and return only the JSON object described above.
"""
    resp = openai.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
    )
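    # The reply should be bare JSON; if it cannot be parsed (e.g. it arrives
    # wrapped in a Markdown code fence), report the failure as feedback.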
    try:
        content = resp.choices[0].message.content
        return json.loads(content)
    except Exception as e:
        return {"valid": False, "feedback": f"Validator parsing error: {e}"}

def run_tests(debug_mode=False):
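    # Shared chat state: the same history list and thread id are passed to
    # chat() for every test question in this run.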
    history = []
    thread_id = "test_thread"

    # Create log directory if it doesn't exist
    log_dir = "outputs/tests"
    os.makedirs(log_dir, exist_ok=True)

    # Markdown log file with timestamp
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file_path = os.path.join(log_dir, f"{run_id}.md")

    # Keep track of summary
    total_tests = len(TEST_QUESTIONS)
    passed_tests = 0
    results_summary = []

    with open(log_file_path, "w", encoding="utf-8") as f:
        f.write(f"# Sanatan AI Test Run - {run_id}\n\n")
        for idx, test in enumerate(TEST_QUESTIONS, start=1):
            f.write(f"## Test {idx}: {test['q']}\n\n")
            f.write(f"**Type:** {test['type']}  \n")
            f.write(f"**Difficulty:** {test['difficulty']}  \n")
            f.write(f"**Expected Summary:** {test.get('expected_answer_summary', '')}\n\n")

            print(f"\n=== Testing Question ===\n{test['q']}")
            bot_response = chat(debug_mode, test["q"], history, thread_id)
            f.write(f"### Bot Response\n```\n{bot_response}\n```\n\n")

            validation = validate_with_ai(test, bot_response)
            f.write(f"### Validation\n- **Valid:** {validation['valid']}\n- **Feedback:** {validation['feedback']}\n\n")

            print(f"Valid: {validation['valid']}\nFeedback: {validation['feedback']}")

            # Track results for summary
            results_summary.append({
                "question": test['q'],
                "valid": validation['valid']
            })
            if validation['valid']:
                passed_tests += 1

        # Write run summary
        failed_tests = total_tests - passed_tests
        pass_rate = (passed_tests / total_tests) * 100 if total_tests > 0 else 0
        f.write(f"# Run Summary\n\n")
        f.write(f"- **Total Tests:** {total_tests}\n")
        f.write(f"- **Passed:** {passed_tests}\n")
        f.write(f"- **Failed:** {failed_tests}\n")
        f.write(f"- **Pass Rate:** {pass_rate:.2f}%\n\n")

        # Optional: Table of all test results
        f.write("## Test Results Table\n\n")
        f.write("| Test | Question | Valid |\n")
        f.write("|------|----------|-------|\n")
        for i, res in enumerate(results_summary, start=1):
            valid_str = "✅" if res['valid'] else "❌"
            f.write(f"| {i} | {res['question']} | {valid_str} |\n")

    print(f"\nTest run complete. Markdown log saved to {log_file_path}")

if __name__ == "__main__":
    run_tests(debug_mode=True)