Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """Quick test to verify the evaluator fix works.""" | |
| import sys | |
| from pathlib import Path | |
| # Add src to path | |
| sys.path.append(str(Path(__file__).parent.parent)) | |
| from src.logger import setup_logging | |
| from src.config import Config | |
| from src.evaluator import KoreanQAEvaluator | |
def test_single_evaluation(threshold: float = 0.8):
    """Run one Korean QA case through the evaluator and verify score extraction.

    Smoke test for the evaluator fix: evaluates a single hard-coded
    question/answer pair and logs, per metric, whether a real DeepEval
    score was extracted or a mock/fallback score is still being used.

    Args:
        threshold: Pass/fail score threshold forwarded to
            ``KoreanQAEvaluator`` (default 0.8, matching the original
            hard-coded value).

    Returns:
        The evaluator's result dict on success, or ``None`` if any step
        raised an exception.
    """
    # Setup logging
    logger_setup = setup_logging(log_level="INFO")
    logger = logger_setup.get_logger(__name__)
    try:
        # Load config.yaml from the directory containing this script.
        config_path = Path(__file__).parent / "config.yaml"
        config = Config(str(config_path))

        # Initialize evaluator
        evaluator = KoreanQAEvaluator(
            model_name=config.gemini_model,
            api_key=config.google_api_key,
            threshold=threshold,
            verbose_mode=True,
        )

        # Test case. NOTE(review): these literals look mojibake-garbled
        # (double-encoded Korean) — kept byte-for-byte; confirm against the
        # original file's encoding before relying on their content.
        input_text = "์ด๋ฒ ๋ฌ ์ฐ๋ฆฌ ํ์ฌ ์ ์ฒด ๋งค์ถ์ ์ผ๋ง์ผ?"
        actual_output = "2025๋ 1์ ์ผ๊ด Global ์ ์ฒด ๋งค์ถ์ 335.4์ต์์ ๋๋ค."

        # Run evaluation
        logger.info("Testing single case evaluation...")
        results = evaluator.evaluate_single_case(input_text, actual_output)

        # Check if we got real scores
        detailed_results = results.get('detailed_results', [])
        if detailed_results:
            first_case = detailed_results[0]
            metrics = first_case.get('metrics', {})
            logger.info("Evaluation results:")
            for metric_name, metric_data in metrics.items():
                score = metric_data.get('score')
                passed = metric_data.get('passed')
                reason = metric_data.get('reason', '')
                # Guard: 'score' may be missing/None and f"{None:.4f}"
                # raises TypeError, which would abort the whole report.
                score_text = f"{score:.4f}" if score is not None else "N/A"
                logger.info(f"  {metric_name}: {score_text} ({'PASS' if passed else 'FAIL'})")
                # A reason prefixed with 'Mock'/'Fallback' means the real
                # DeepEval score was not extracted for this metric.
                if reason and not reason.startswith('Mock') and not reason.startswith('Fallback'):
                    logger.info("  โ Real DeepEval score extracted successfully!")
                else:
                    logger.warning("  โ Still using fallback/mock scores")
        return results
    except Exception as e:
        # logger.exception preserves the traceback (logger.error dropped it).
        logger.exception(f"Test failed: {e}")
        return None
# Script entry point: run the single-case smoke test when executed directly.
if __name__ == "__main__":
    test_single_evaluation()