# sanatan_ai/tests/test_evaluator.py
import json
import os
from datetime import datetime

import openai

from chat_utils import chat
from tests.test_config import TEST_QUESTIONS
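
# NOTE: the validator below calls the module-level OpenAI client
# (openai>=1.0 style `openai.chat.completions.create`), which is assumed to
# pick up its API key from the OPENAI_API_KEY environment variable.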


def validate_with_ai(test_entry, bot_response):
    """
    Validate a narrative bot response with an LLM.

    The bot does not need to output JSON; the validator LLM analyzes the
    narrative response and returns a JSON verdict.
    """
    prompt = f"""
You are a validator AI. The user provided the following bot response:

Bot Response:
\"\"\"{bot_response}\"\"\"

Expected attributes:
- Sources: {test_entry.get('expected_sources', [])}
- Azhwar: {test_entry.get('expected_azhwar', [])}
- Topics: {test_entry.get('expected_topics', [])}
- Keywords: {test_entry.get('expected_keywords', [])}
- Number of results: {test_entry.get('n_results', 1)}

Check the bot response and answer **only** in JSON with two fields:
{{
    "valid": true/false,  // true if the bot response matches the expected attributes
    "feedback": "short explanation of why it passed or failed"
}}

Do **not** expect the bot response to contain JSON; analyze the narrative yourself and return only the JSON object above.
"""
    resp = openai.chat.completions.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}],
    )
    try:
        content = resp.choices[0].message.content
        return json.loads(content)
    except Exception as e:
        # The model may return malformed or markdown-fenced JSON; treat that
        # as a validation failure rather than crashing the run.
        return {"valid": False, "feedback": f"Validator parsing error: {e}"}


def run_tests(debug_mode=False):
    history = []
    thread_id = "test_thread"

    # Create the log directory if it doesn't exist
    log_dir = "outputs/tests"
    os.makedirs(log_dir, exist_ok=True)

    # Markdown log file named with a run timestamp
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file_path = os.path.join(log_dir, f"{run_id}.md")

    # Track the run summary
    total_tests = len(TEST_QUESTIONS)
    passed_tests = 0
    results_summary = []
    with open(log_file_path, "w", encoding="utf-8") as f:
        f.write(f"# Sanatan AI Test Run - {run_id}\n\n")

        for idx, test in enumerate(TEST_QUESTIONS, start=1):
            f.write(f"## Test {idx}: {test['q']}\n\n")
            f.write(f"**Type:** {test['type']}  \n")
            f.write(f"**Difficulty:** {test['difficulty']}  \n")
            f.write(f"**Expected Summary:** {test.get('expected_answer_summary', '')}\n\n")

            print(f"\n=== Testing Question ===\n{test['q']}")
            bot_response = chat(debug_mode, test["q"], history, thread_id)
            f.write(f"### Bot Response\n```\n{bot_response}\n```\n\n")

            validation = validate_with_ai(test, bot_response)
            f.write(
                f"### Validation\n"
                f"- **Valid:** {validation['valid']}\n"
                f"- **Feedback:** {validation['feedback']}\n\n"
            )
            print(f"Valid: {validation['valid']}\nFeedback: {validation['feedback']}")

            # Track results for the summary table
            results_summary.append({
                "question": test["q"],
                "valid": validation["valid"],
            })
            if validation["valid"]:
                passed_tests += 1

        # Write the run summary
        failed_tests = total_tests - passed_tests
        pass_rate = (passed_tests / total_tests) * 100 if total_tests > 0 else 0

        f.write("# Run Summary\n\n")
        f.write(f"- **Total Tests:** {total_tests}\n")
        f.write(f"- **Passed:** {passed_tests}\n")
        f.write(f"- **Failed:** {failed_tests}\n")
        f.write(f"- **Pass Rate:** {pass_rate:.2f}%\n\n")

        # Table of all test results
        f.write("## Test Results Table\n\n")
        f.write("| Test | Question | Valid |\n")
        f.write("|------|----------|-------|\n")
        for i, res in enumerate(results_summary, start=1):
            valid_str = "✅" if res["valid"] else "❌"
            f.write(f"| {i} | {res['question']} | {valid_str} |\n")

    print(f"\nTest run complete. Markdown log saved to {log_file_path}")


if __name__ == "__main__":
    run_tests(debug_mode=True)
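
# Assumed invocation (hypothetical, depends on the repo layout): run from the
# repo root so that `chat_utils` and `tests.test_config` both resolve, e.g.
#
#   python -m tests.test_evaluator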