import os
import json
from datetime import datetime

import openai

from chat_utils import chat
from tests.test_config import TEST_QUESTIONS
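# Illustrative shape of a TEST_QUESTIONS entry (hypothetical values; the real
# entries live in tests/test_config.py). Only the keys this script reads are shown:
#
# {
#     "q": "Which Azhwar composed the Thiruppavai?",
#     "type": "factual",
#     "difficulty": "easy",
#     "expected_answer_summary": "Andal composed the Thiruppavai.",
#     "expected_sources": ["Thiruppavai"],
#     "expected_azhwar": ["Andal"],
#     "expected_topics": ["bhakti"],
#     "expected_keywords": ["Andal", "Thiruppavai"],
#     "n_results": 1,
# }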


def validate_with_ai(test_entry, bot_response):
    """
    Validate a narrative bot response against the expected attributes of a test entry.

    The bot does not need to output JSON: the validator LLM reads the narrative
    response and returns a JSON verdict with "valid" and "feedback" fields.
    """
    prompt = f"""
You are a validator AI. The user provided the following bot response:

Bot Response:
\"\"\"{bot_response}\"\"\"

Expected attributes:
- Sources: {test_entry.get('expected_sources', [])}
- Azhwar: {test_entry.get('expected_azhwar', [])}
- Topics: {test_entry.get('expected_topics', [])}
- Keywords: {test_entry.get('expected_keywords', [])}
- Number of results: {test_entry.get('n_results', 1)}

Check the bot response and answer **only** in JSON with two fields:
{{
    "valid": true/false,  // true if the bot response matches the expected attributes
    "feedback": "short explanation of why it passed or failed"
}}

Do **not** ask the bot to output the JSON itself. Parse the narrative internally and return JSON.
"""
    resp = openai.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
    )
    try:
        content = resp.choices[0].message.content
        return json.loads(content)
    except Exception as e:
        return {"valid": False, "feedback": f"Validator parsing error: {e}"}
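
# Note: if the validator model wraps its JSON reply in Markdown code fences,
# json.loads() above raises and the test is reported as failed with a parsing
# error. Stripping fences from the reply before parsing would be one possible
# hardening step; it is not done here.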


def run_tests(debug_mode=False):
    history = []
    thread_id = "test_thread"

    # Create the log directory if it doesn't exist
    log_dir = "outputs/tests"
    os.makedirs(log_dir, exist_ok=True)

    # Markdown log file named with a timestamp
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file_path = os.path.join(log_dir, f"{run_id}.md")

    # Track the run summary
    total_tests = len(TEST_QUESTIONS)
    passed_tests = 0
    results_summary = []
    with open(log_file_path, "w", encoding="utf-8") as f:
        f.write(f"# Sanatan AI Test Run - {run_id}\n\n")

        for idx, test in enumerate(TEST_QUESTIONS, start=1):
            f.write(f"## Test {idx}: {test['q']}\n\n")
            f.write(f"**Type:** {test['type']}  \n")
            f.write(f"**Difficulty:** {test['difficulty']}  \n")
            f.write(f"**Expected Summary:** {test.get('expected_answer_summary', '')}\n\n")

            print(f"\n=== Testing Question ===\n{test['q']}")
            bot_response = chat(debug_mode, test["q"], history, thread_id)
            f.write(f"### Bot Response\n```\n{bot_response}\n```\n\n")

            validation = validate_with_ai(test, bot_response)
            f.write(
                "### Validation\n"
                f"- **Valid:** {validation['valid']}\n"
                f"- **Feedback:** {validation['feedback']}\n\n"
            )
            print(f"Valid: {validation['valid']}\nFeedback: {validation['feedback']}")

            # Track results for the summary
            results_summary.append({
                "question": test['q'],
                "valid": validation['valid'],
            })
            if validation['valid']:
                passed_tests += 1

        # Write the run summary
        failed_tests = total_tests - passed_tests
        pass_rate = (passed_tests / total_tests) * 100 if total_tests > 0 else 0
        f.write("# Run Summary\n\n")
        f.write(f"- **Total Tests:** {total_tests}\n")
        f.write(f"- **Passed:** {passed_tests}\n")
        f.write(f"- **Failed:** {failed_tests}\n")
        f.write(f"- **Pass Rate:** {pass_rate:.2f}%\n\n")

        # Optional: table of all test results
        f.write("## Test Results Table\n\n")
        f.write("| Test | Question | Valid |\n")
        f.write("|------|----------|-------|\n")
        for i, res in enumerate(results_summary, start=1):
            valid_str = "✅" if res['valid'] else "❌"
            f.write(f"| {i} | {res['question']} | {valid_str} |\n")

    print(f"\nTest run complete. Markdown log saved to {log_file_path}")


if __name__ == "__main__":
    run_tests(debug_mode=True)
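
# Typical invocation (assumptions: the OpenAI key is available to the openai
# library, e.g. via the OPENAI_API_KEY environment variable, and the script is
# run from the repository root so that tests.test_config is importable; the
# exact script path below is a placeholder, not confirmed by the source):
#
#     python path/to/this_script.py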