Spaces:
Sleeping
Sleeping
File size: 1,607 Bytes
2d0ef3b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | from pathlib import Path
from app.pipelines.classification_pipeline import classification_pipeline
import app.pipelines.classification_pipeline as pipeline_module
def test_classify_file_uses_pdf_first_page_only(monkeypatch):
    """Check that ``classify_file`` asks for first-page-only PDF extraction.

    The extraction, language, label and classifier services referenced by
    the pipeline module are replaced with stubs, so the test only observes
    the ``pdf_first_page_only`` flag handed to ``extract_text`` and the
    dictionary that ``classify_file`` returns.
    """
    seen_first_page_flags: list[bool] = []

    def _recording_extract_text(file_name, file_path, pdf_first_page_only=False):
        # Remember the extraction mode requested on each call.
        seen_first_page_flags.append(pdf_first_page_only)
        return "This is enough content for preprocessing and classification."

    # (service object, attribute to replace, stand-in callable)
    stubs = (
        (pipeline_module.extraction_service, "extract_text", _recording_extract_text),
        (pipeline_module.language_service, "detect_language", lambda text: "en"),
        (pipeline_module.label_service, "get_labels", lambda: ["news", "sport"]),
        (pipeline_module.classifier_service, "classify", lambda text, labels: "news"),
    )
    for service, attribute, stand_in in stubs:
        monkeypatch.setattr(service, attribute, stand_in)

    pdf_name = "sample.pdf"
    result = classification_pipeline.classify_file(pdf_name, Path(pdf_name))

    # Exactly one extraction call, made with the first-page-only flag set.
    assert seen_first_page_flags == [True]
    assert result == {"label": "news", "language": "en"}
def test_transform_file_uses_full_extraction(monkeypatch):
    """Check that ``transform_file`` extracts without the first-page-only flag.

    ``extract_text`` on the pipeline module's extraction service is replaced
    with a stub that records the ``pdf_first_page_only`` value it receives;
    the test then verifies both that value and the returned content.
    """
    full_text = "This is full extracted content."
    seen_first_page_flags: list[bool] = []

    def _recording_extract_text(file_name, file_path, pdf_first_page_only=False):
        # Remember the extraction mode requested on each call.
        seen_first_page_flags.append(pdf_first_page_only)
        return full_text

    monkeypatch.setattr(
        pipeline_module.extraction_service,
        "extract_text",
        _recording_extract_text,
    )

    pdf_name = "sample.pdf"
    content = classification_pipeline.transform_file(pdf_name, Path(pdf_name))

    # Exactly one extraction call, made without the first-page-only flag.
    assert seen_first_page_flags == [False]
    assert content == full_text
|