# sanatan_ai/tests/test_evaluator.py
import json
import os
from datetime import datetime

import openai

from chat_utils import chat
from tests.test_config import TEST_QUESTIONS
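
# NOTE: the validator below calls the module-level OpenAI client
# (openai>=1.0 style `openai.chat.completions.create`), which is assumed to
# pick up its API key from the OPENAI_API_KEY environment variable.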


def validate_with_ai(test_entry, bot_response):
    """
    Validate a narrative bot response with an LLM.

    The bot does not need to output JSON; the validator LLM analyzes the
    narrative response and returns a JSON verdict.
    """
    prompt = f"""
You are a validator AI. The user provided the following bot response:

Bot Response:
\"\"\"{bot_response}\"\"\"

Expected attributes:
- Sources: {test_entry.get('expected_sources', [])}
- Azhwar: {test_entry.get('expected_azhwar', [])}
- Topics: {test_entry.get('expected_topics', [])}
- Keywords: {test_entry.get('expected_keywords', [])}
- Number of results: {test_entry.get('n_results', 1)}

Check the bot response and answer **only** in JSON with two fields:
{{
    "valid": true/false,  // true if the bot response matches the expected attributes
    "feedback": "short explanation of why it passed or failed"
}}

Do **not** expect the bot response to contain JSON; analyze the narrative yourself and return only the JSON object above.
"""
    resp = openai.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
    )
    try:
        content = resp.choices[0].message.content
        return json.loads(content)
    except Exception as e:
        # The model may return malformed or markdown-fenced JSON; treat that
        # as a validation failure rather than crashing the run.
        return {"valid": False, "feedback": f"Validator parsing error: {e}"}


def run_tests(debug_mode=False):
    history = []
    thread_id = "test_thread"

    # Create the log directory if it doesn't exist
    log_dir = "outputs/tests"
    os.makedirs(log_dir, exist_ok=True)

    # Markdown log file named with a run timestamp
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file_path = os.path.join(log_dir, f"{run_id}.md")

    # Track the run summary
    total_tests = len(TEST_QUESTIONS)
    passed_tests = 0
    results_summary = []
    with open(log_file_path, "w", encoding="utf-8") as f:
        f.write(f"# Sanatan AI Test Run - {run_id}\n\n")

        for idx, test in enumerate(TEST_QUESTIONS, start=1):
            f.write(f"## Test {idx}: {test['q']}\n\n")
            f.write(f"**Type:** {test['type']}  \n")
            f.write(f"**Difficulty:** {test['difficulty']}  \n")
            f.write(f"**Expected Summary:** {test.get('expected_answer_summary', '')}\n\n")

            print(f"\n=== Testing Question ===\n{test['q']}")
            bot_response = chat(debug_mode, test["q"], history, thread_id)
            f.write(f"### Bot Response\n```\n{bot_response}\n```\n\n")

            validation = validate_with_ai(test, bot_response)
            f.write(
                f"### Validation\n"
                f"- **Valid:** {validation['valid']}\n"
                f"- **Feedback:** {validation['feedback']}\n\n"
            )
            print(f"Valid: {validation['valid']}\nFeedback: {validation['feedback']}")

            # Track results for the summary table
            results_summary.append({
                "question": test["q"],
                "valid": validation["valid"],
            })
            if validation["valid"]:
                passed_tests += 1

        # Write the run summary
        failed_tests = total_tests - passed_tests
        pass_rate = (passed_tests / total_tests) * 100 if total_tests > 0 else 0

        f.write("# Run Summary\n\n")
        f.write(f"- **Total Tests:** {total_tests}\n")
        f.write(f"- **Passed:** {passed_tests}\n")
        f.write(f"- **Failed:** {failed_tests}\n")
        f.write(f"- **Pass Rate:** {pass_rate:.2f}%\n\n")

        # Table of all test results
        f.write("## Test Results Table\n\n")
        f.write("| Test | Question | Valid |\n")
        f.write("|------|----------|-------|\n")
        for i, res in enumerate(results_summary, start=1):
            valid_str = "✅" if res["valid"] else "❌"
            f.write(f"| {i} | {res['question']} | {valid_str} |\n")

    print(f"\nTest run complete. Markdown log saved to {log_file_path}")


if __name__ == "__main__":
    run_tests(debug_mode=True)
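
# Assumed invocation (hypothetical, depends on the repo layout): run from the
# repo root so that `chat_utils` and `tests.test_config` both resolve, e.g.
#
#   python -m tests.test_evaluator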