classifier-general / app /pipelines /classification_pipeline.py
AyoubChLin's picture
feat: update classifier model to local zero-shot NLI and enhance language detection with local library
2d0ef3b
from pathlib import Path
from app.core.exceptions import ClassificationError, ExtractionError, LanguageDetectionError, ValidationError
from app.pipelines.text_pipeline import preprocess_text
from app.services.classifier_service import classifier_service
from app.services.extraction_service import extraction_service
from app.services.label_service import label_service
from app.services.language_service import language_service
class ClassificationPipeline:
    """Glue layer that chains the extraction, preprocessing, language and classifier services.

    The class keeps no state of its own; every method delegates to the
    service objects imported at module level.
    """

    @staticmethod
    def _require_text(text):
        """Return ``text`` unchanged, or raise if nothing usable was extracted.

        Raises:
            ExtractionError: if ``text`` is falsy (``None`` or ``""``) or
                contains only whitespace.
        """
        if text and text.strip():
            return text
        raise ExtractionError("No text extracted from file")

    def classify_text(self, text: str) -> str:
        """Preprocess ``text`` and classify it against the configured labels.

        Args:
            text: Raw input text.

        Returns:
            The value produced by ``classifier_service.classify`` for the
            preprocessed text and the labels from ``label_service``.
        """
        return classifier_service.classify(
            preprocess_text(text),
            label_service.get_labels(),
        )

    def detect_language(self, text: str) -> str:
        """Preprocess ``text`` and return the language reported for it.

        Args:
            text: Raw input text.

        Returns:
            The value produced by ``language_service.detect_language`` for
            the preprocessed text.
        """
        return language_service.detect_language(preprocess_text(text))

    def transform_file(self, original_filename: str, file_path: Path) -> str:
        """Extract the text content of a file.

        Args:
            original_filename: Forwarded to the extraction service
                (presumably used to pick an extractor - confirm there).
            file_path: Location of the file to read.

        Returns:
            The extracted text, exactly as returned by the extraction service.

        Raises:
            ExtractionError: if the extracted text is empty or whitespace-only.
        """
        extracted = extraction_service.extract_text(original_filename, file_path)
        return self._require_text(extracted)

    def classify_file(self, original_filename: str, file_path: Path) -> dict:
        """Extract text from a file, then detect its language and classify it.

        Extraction is requested with ``pdf_first_page_only=True``.

        Args:
            original_filename: Forwarded to the extraction service.
            file_path: Location of the file to read.

        Returns:
            A dict with the keys ``"label"`` and ``"language"``. When the
            detected language is not ``"en"`` the dict additionally carries
            ``"type": "not english"``; for English input that key is absent.

        Raises:
            ExtractionError: if the extracted text is empty or whitespace-only.
        """
        extracted = extraction_service.extract_text(
            original_filename,
            file_path,
            pdf_first_page_only=True,
        )
        cleaned = preprocess_text(self._require_text(extracted))

        # Language detection runs before the labels are fetched and before
        # classification, matching the original call order.
        detected_language = language_service.detect_language(cleaned)
        candidate_labels = label_service.get_labels()
        predicted_label = classifier_service.classify(cleaned, candidate_labels)

        outcome = {"label": predicted_label, "language": detected_language}
        if detected_language == "en":
            return outcome

        # Only non-English results are tagged; English results omit "type".
        outcome["type"] = "not english"
        return outcome
# Module-level pipeline instance, created once when this module is imported.
classification_pipeline = ClassificationPipeline()
# Names exported by `from ... import *`: the pipeline instance plus the
# exception types imported from app.core.exceptions, re-exported here.
__all__ = [
    "classification_pipeline",
    "ClassificationError",
    "LanguageDetectionError",
    "ExtractionError",
    "ValidationError",
]