Spaces:

AyoubChLin
/

classifier-general

Sleeping

classifier-general / tests / test_classification_pipeline_behavior.py

feat: update classifier model to local zero-shot NLI and enhance language detection with local library

2d0ef3b 10 days ago

1.61 kB

	from pathlib import Path

	from app.pipelines.classification_pipeline import classification_pipeline
	import app.pipelines.classification_pipeline as pipeline_module


	def test_classify_file_uses_pdf_first_page_only(monkeypatch):
	    """Verify that ``classify_file`` requests first-page-only extraction.

	    The extraction, language, label and classifier services reachable from
	    the pipeline module are replaced with stubs, so the test observes only
	    the ``pdf_first_page_only`` flag handed to ``extract_text`` and the
	    dictionary returned by ``classify_file``.
	    """
	    first_page_requests: list[bool] = []

	    def _recording_extractor(file_name, file_path, pdf_first_page_only=False):
	        # Remember the flag so the assertion below can inspect it.
	        first_page_requests.append(pdf_first_page_only)
	        return "This is enough content for preprocessing and classification."

	    def _always_english(text):
	        return "en"

	    def _fixed_labels():
	        return ["news", "sport"]

	    def _always_news(text, labels):
	        return "news"

	    # (owner, attribute name, stub) triples, patched in this order.
	    replacements = (
	        (pipeline_module.extraction_service, "extract_text", _recording_extractor),
	        (pipeline_module.language_service, "detect_language", _always_english),
	        (pipeline_module.label_service, "get_labels", _fixed_labels),
	        (pipeline_module.classifier_service, "classify", _always_news),
	    )
	    for owner, attribute, stub in replacements:
	        monkeypatch.setattr(owner, attribute, stub)

	    outcome = classification_pipeline.classify_file("sample.pdf", Path("sample.pdf"))

	    # Exactly one extraction call, made with the first-page-only flag set.
	    assert first_page_requests == [True]
	    assert outcome == {"label": "news", "language": "en"}


	def test_transform_file_uses_full_extraction(monkeypatch):
	    """Verify that ``transform_file`` extracts without the first-page-only flag.

	    ``extract_text`` is replaced with a recording stub; the test expects one
	    call whose ``pdf_first_page_only`` value is ``False`` (passed explicitly
	    or left at the stub's default) and expects the stub's text to be returned
	    unchanged.
	    """
	    expected_text = "This is full extracted content."
	    first_page_requests: list[bool] = []

	    def _recording_extractor(file_name, file_path, pdf_first_page_only=False):
	        # Remember the flag so the assertion below can inspect it.
	        first_page_requests.append(pdf_first_page_only)
	        return expected_text

	    monkeypatch.setattr(
	        pipeline_module.extraction_service,
	        "extract_text",
	        _recording_extractor,
	    )

	    document = Path("sample.pdf")
	    transformed = classification_pipeline.transform_file("sample.pdf", document)

	    assert first_page_requests == [False]
	    assert transformed == expected_text