Spaces:
Sleeping
Sleeping
Claude committed on
Fix CI : adapter les tests au 5e concurrent VLM introduit en Sprint 10
Browse files
- test_report.py : 3 assertions == 4 → == 5 (3 OCR + 1 pipeline LLM + 1 VLM zero-shot)
- test_sprint3_llm_pipelines.py : exclure gpt-4o-vision (zero-shot) des vérifications
"non-pipeline", car un concurrent VLM zero-shot EST correctement un pipeline
https://claude.ai/code/session_017gXea9mxBQqDTAsSQd7aAq
tests/test_report.py
CHANGED
|
@@ -32,8 +32,8 @@ class TestGenerateSampleBenchmark:
|
|
| 32 |
assert isinstance(sample_benchmark, BenchmarkResult)
|
| 33 |
|
| 34 |
def test_correct_engine_count(self, sample_benchmark):
|
| 35 |
-
# 3 moteurs OCR + 1 pipeline tesseract → gpt-4o
|
| 36 |
-
assert len(sample_benchmark.engine_reports) == 4
|
| 37 |
|
| 38 |
def test_correct_doc_count(self, sample_benchmark):
|
| 39 |
assert sample_benchmark.document_count == 3
|
|
@@ -89,8 +89,8 @@ class TestBuildReportData:
|
|
| 89 |
|
| 90 |
def test_engines_count(self, sample_benchmark):
|
| 91 |
data = _build_report_data(sample_benchmark, {})
|
| 92 |
-
# 3 moteurs OCR + 1 pipeline tesseract → gpt-4o
|
| 93 |
-
assert len(data["engines"]) == 4
|
| 94 |
|
| 95 |
def test_engine_fields(self, sample_benchmark):
|
| 96 |
data = _build_report_data(sample_benchmark, {})
|
|
@@ -221,7 +221,7 @@ class TestReportGenerator:
|
|
| 221 |
data = json.loads(match.group(1))
|
| 222 |
assert "engines" in data
|
| 223 |
assert "documents" in data
|
| 224 |
-
assert len(data["engines"]) == 4
|
| 225 |
|
| 226 |
|
| 227 |
# ---------------------------------------------------------------------------
|
|
|
|
| 32 |
assert isinstance(sample_benchmark, BenchmarkResult)
|
| 33 |
|
| 34 |
def test_correct_engine_count(self, sample_benchmark):
|
| 35 |
+
# 3 moteurs OCR + 1 pipeline tesseract → gpt-4o + 1 VLM zero-shot (Sprint 10)
|
| 36 |
+
assert len(sample_benchmark.engine_reports) == 5
|
| 37 |
|
| 38 |
def test_correct_doc_count(self, sample_benchmark):
|
| 39 |
assert sample_benchmark.document_count == 3
|
|
|
|
| 89 |
|
| 90 |
def test_engines_count(self, sample_benchmark):
|
| 91 |
data = _build_report_data(sample_benchmark, {})
|
| 92 |
+
# 3 moteurs OCR + 1 pipeline tesseract → gpt-4o + 1 VLM zero-shot (Sprint 10)
|
| 93 |
+
assert len(data["engines"]) == 5
|
| 94 |
|
| 95 |
def test_engine_fields(self, sample_benchmark):
|
| 96 |
data = _build_report_data(sample_benchmark, {})
|
|
|
|
| 221 |
data = json.loads(match.group(1))
|
| 222 |
assert "engines" in data
|
| 223 |
assert "documents" in data
|
| 224 |
+
assert len(data["engines"]) == 5 # 3 OCR + 1 pipeline LLM + 1 VLM zero-shot (Sprint 10)
|
| 225 |
|
| 226 |
|
| 227 |
# ---------------------------------------------------------------------------
|
tests/test_sprint3_llm_pipelines.py
CHANGED
|
@@ -372,8 +372,10 @@ class TestFixturesPipeline:
|
|
| 372 |
assert steps[1]["type"] == "llm"
|
| 373 |
|
| 374 |
def test_non_pipeline_reports_empty_pipeline_info(self, benchmark):
|
|
|
|
|
|
|
| 375 |
for report in benchmark.engine_reports:
|
| 376 |
-
if report.engine_name != "tesseract → gpt-4o":
|
| 377 |
assert not report.is_pipeline
|
| 378 |
assert report.pipeline_info == {}
|
| 379 |
|
|
@@ -401,8 +403,10 @@ class TestReportWithPipeline:
|
|
| 401 |
assert pipeline_e["is_pipeline"] is True
|
| 402 |
|
| 403 |
def test_non_pipeline_engines_not_flagged(self, report_data):
|
|
|
|
|
|
|
| 404 |
for e in report_data["engines"]:
|
| 405 |
-
if e["name"] != "tesseract → gpt-4o":
|
| 406 |
assert e["is_pipeline"] is False
|
| 407 |
|
| 408 |
def test_pipeline_has_over_normalization_in_info(self, report_data):
|
|
|
|
| 372 |
assert steps[1]["type"] == "llm"
|
| 373 |
|
| 374 |
def test_non_pipeline_reports_empty_pipeline_info(self, benchmark):
|
| 375 |
+
# Les concurrents pipeline (LLM ou VLM) ont un pipeline_info non vide
|
| 376 |
+
pipeline_engines = {"tesseract → gpt-4o", "gpt-4o-vision (zero-shot)"}
|
| 377 |
for report in benchmark.engine_reports:
|
| 378 |
+
if report.engine_name not in pipeline_engines:
|
| 379 |
assert not report.is_pipeline
|
| 380 |
assert report.pipeline_info == {}
|
| 381 |
|
|
|
|
| 403 |
assert pipeline_e["is_pipeline"] is True
|
| 404 |
|
| 405 |
def test_non_pipeline_engines_not_flagged(self, report_data):
|
| 406 |
+
# Les concurrents pipeline (LLM ou VLM zero-shot) sont correctement marqués is_pipeline=True
|
| 407 |
+
pipeline_engines = {"tesseract → gpt-4o", "gpt-4o-vision (zero-shot)"}
|
| 408 |
for e in report_data["engines"]:
|
| 409 |
+
if e["name"] not in pipeline_engines:
|
| 410 |
assert e["is_pipeline"] is False
|
| 411 |
|
| 412 |
def test_pipeline_has_over_normalization_in_info(self, report_data):
|