Spaces:
Sleeping
Sleeping
feat: update classifier model to local zero-shot NLI and enhance language detection with local library
2d0ef3b | from pathlib import Path | |
| from app.core.exceptions import ClassificationError, ExtractionError, LanguageDetectionError, ValidationError | |
| from app.pipelines.text_pipeline import preprocess_text | |
| from app.services.classifier_service import classifier_service | |
| from app.services.extraction_service import extraction_service | |
| from app.services.label_service import label_service | |
| from app.services.language_service import language_service | |
class ClassificationPipeline:
    """Coordinate extraction, preprocessing, language detection and classification.

    Every method delegates to the service objects imported at module level
    (``extraction_service``, ``language_service``, ``label_service``,
    ``classifier_service``); the pipeline itself keeps no instance state.
    """

    @staticmethod
    def _require_text(text: str) -> str:
        """Return *text* unchanged, rejecting empty or whitespace-only input.

        Shared by the file-based entry points so the "nothing extracted"
        check and its error message live in exactly one place.

        Raises:
            ExtractionError: If *text* is falsy or contains only whitespace.
        """
        if not text or not text.strip():
            raise ExtractionError("No text extracted from file")
        return text

    def classify_text(self, text: str) -> str:
        """Preprocess *text* and classify it.

        Args:
            text: Raw input text.

        Returns:
            Whatever ``classifier_service.classify`` returns for the
            preprocessed text and the labels from ``label_service.get_labels()``.
        """
        preprocessed_text = preprocess_text(text)
        labels = label_service.get_labels()
        return classifier_service.classify(preprocessed_text, labels)

    def detect_language(self, text: str) -> str:
        """Preprocess *text* and return the result of language detection.

        Args:
            text: Raw input text.

        Returns:
            The value produced by ``language_service.detect_language`` for
            the preprocessed text.
        """
        preprocessed_text = preprocess_text(text)
        return language_service.detect_language(preprocessed_text)

    def transform_file(self, original_filename: str, file_path: Path) -> str:
        """Extract the text content of a file.

        Args:
            original_filename: Name of the file as supplied by the caller;
                forwarded to the extraction service.
            file_path: Location of the file to read.

        Returns:
            The extracted text, exactly as returned by the extraction service.

        Raises:
            ExtractionError: If the extracted text is empty or whitespace-only.
        """
        text = extraction_service.extract_text(original_filename, file_path)
        return self._require_text(text)

    def classify_file(self, original_filename: str, file_path: Path) -> dict:
        """Extract text from a file, then detect its language and classify it.

        Extraction is requested with ``pdf_first_page_only=True``.

        Args:
            original_filename: Name of the file as supplied by the caller;
                forwarded to the extraction service.
            file_path: Location of the file to read.

        Returns:
            A dict with the keys ``"label"`` (classifier output) and
            ``"language"`` (detected language). When the detected language
            is not ``"en"``, the dict also carries ``"type": "not english"``.

        Raises:
            ExtractionError: If the extracted text is empty or whitespace-only.
        """
        text = self._require_text(
            extraction_service.extract_text(
                original_filename, file_path, pdf_first_page_only=True
            )
        )

        preprocessed_text = preprocess_text(text)
        language = language_service.detect_language(preprocessed_text)
        labels = label_service.get_labels()
        topic = classifier_service.classify(preprocessed_text, labels)

        result = {"label": topic, "language": language}
        # Classification still runs for non-English input; the result is only flagged.
        if language != "en":
            result["type"] = "not english"
        return result
# Module-level pipeline instance, created once when this module is imported.
classification_pipeline = ClassificationPipeline()

# Names exposed by `from <this module> import *`: the pipeline instance plus
# the exception classes this module imports from app.core.exceptions.
__all__ = [
    "classification_pipeline",
    "ClassificationError",
    "LanguageDetectionError",
    "ExtractionError",
    "ValidationError",
]