"""Tests for the evaluation framework.""" import pytest from src.evaluation.gleu_scorer import GLEUScorer def test_gleu_scorer_instantiation(): """Test that GLEU scorer can be created.""" scorer = GLEUScorer() assert scorer is not None def test_gleu_perfect_score(): """Test that identical predictions and references score high.""" scorer = GLEUScorer() preds = ["The cat sat on the mat.", "Hello world."] refs = ["The cat sat on the mat.", "Hello world."] score = scorer.compute_gleu(preds, refs) assert score > 90.0 # Should be near-perfect def test_gleu_empty_input(): """Test empty input handling.""" scorer = GLEUScorer() assert scorer.compute_gleu([], []) == 0.0 def test_awl_coverage_score(): """Test AWL coverage scoring.""" from src.vocabulary.awl_loader import AWLLoader from src.style.fingerprinter import StyleFingerprinter from src.evaluation.style_metrics import StyleEvaluator import tempfile, os with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: f.write("analysis\nresearch\nmethod\nsignificant\nestablish\n") awl_path = f.name try: awl = AWLLoader(primary_path=awl_path, synonyms_path=None) fp = StyleFingerprinter(spacy_model="en_core_web_sm", awl_path=awl_path) evaluator = StyleEvaluator(fp, awl) coverage = evaluator.awl_coverage("The analysis shows significant research results.") assert 0.0 <= coverage <= 1.0 finally: os.unlink(awl_path)