Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """Quick test to verify the evaluator fix works.""" | |
| import sys | |
| from pathlib import Path | |
| # Add src to path | |
| sys.path.append(str(Path(__file__).parent.parent)) | |
| from src.logger import setup_logging | |
| from src.config import Config | |
| from src.evaluator import KoreanQAEvaluator | |
def test_single_evaluation(threshold: float = 0.8):
    """Run one Korean QA case through the evaluator and verify score extraction.

    Smoke test for the evaluator fix: evaluates a single hard-coded
    question/answer pair and logs, per metric, whether a real DeepEval
    score was extracted or a mock/fallback score is still being used.

    Args:
        threshold: Pass/fail score threshold forwarded to
            ``KoreanQAEvaluator`` (default 0.8, matching the original
            hard-coded value).

    Returns:
        The evaluator's result dict on success, or ``None`` if any step
        raised an exception.
    """
    # Setup logging
    logger_setup = setup_logging(log_level="INFO")
    logger = logger_setup.get_logger(__name__)
    try:
        # Load config.yaml from the directory containing this script.
        config_path = Path(__file__).parent / "config.yaml"
        config = Config(str(config_path))

        # Initialize evaluator
        evaluator = KoreanQAEvaluator(
            model_name=config.gemini_model,
            api_key=config.google_api_key,
            threshold=threshold,
            verbose_mode=True,
        )

        # Test case. NOTE(review): these literals look mojibake-garbled
        # (double-encoded Korean) — kept byte-for-byte; confirm against the
        # original file's encoding before relying on their content.
        input_text = "์ด๋ฒ ๋ฌ ์ฐ๋ฆฌ ํ์ฌ ์ ์ฒด ๋งค์ถ์ ์ผ๋ง์ผ?"
        actual_output = "2025๋ 1์ ์ผ๊ด Global ์ ์ฒด ๋งค์ถ์ 335.4์ต์์ ๋๋ค."

        # Run evaluation
        logger.info("Testing single case evaluation...")
        results = evaluator.evaluate_single_case(input_text, actual_output)

        # Check if we got real scores
        detailed_results = results.get('detailed_results', [])
        if detailed_results:
            first_case = detailed_results[0]
            metrics = first_case.get('metrics', {})
            logger.info("Evaluation results:")
            for metric_name, metric_data in metrics.items():
                score = metric_data.get('score')
                passed = metric_data.get('passed')
                reason = metric_data.get('reason', '')
                # Guard: 'score' may be missing/None and f"{None:.4f}"
                # raises TypeError, which would abort the whole report.
                score_text = f"{score:.4f}" if score is not None else "N/A"
                logger.info(f"  {metric_name}: {score_text} ({'PASS' if passed else 'FAIL'})")
                # A reason prefixed with 'Mock'/'Fallback' means the real
                # DeepEval score was not extracted for this metric.
                if reason and not reason.startswith('Mock') and not reason.startswith('Fallback'):
                    logger.info("  โ Real DeepEval score extracted successfully!")
                else:
                    logger.warning("  โ Still using fallback/mock scores")
        return results
    except Exception as e:
        # logger.exception preserves the traceback (logger.error dropped it).
        logger.exception(f"Test failed: {e}")
        return None
# Script entry point: run the single-case smoke test when executed directly.
if __name__ == "__main__":
    test_single_evaluation()