Spaces:
Sleeping
Sleeping
File size: 2,001 Bytes
from pathlib import Path
from app.core.exceptions import ClassificationError, ExtractionError, LanguageDetectionError, ValidationError
from app.pipelines.text_pipeline import preprocess_text
from app.services.classifier_service import classifier_service
from app.services.extraction_service import extraction_service
from app.services.label_service import label_service
from app.services.language_service import language_service
class ClassificationPipeline:
    """Combine extraction, preprocessing, language detection and labelling.

    The class keeps no state of its own: every method delegates to the
    service objects imported at module level. Exceptions raised by those
    services are not caught here and propagate to the caller.
    """

    def _extract_non_blank_text(
        self, original_filename: str, file_path: Path, **extract_kwargs: object
    ) -> str:
        """Extract text from a file and reject empty or whitespace-only output.

        Args:
            original_filename: Name the file was uploaded/stored under; passed
                through to the extraction service unchanged.
            file_path: Location of the file on disk.
            **extract_kwargs: Extra keyword arguments forwarded verbatim to
                ``extraction_service.extract_text``.

        Returns:
            The extracted text, exactly as the extraction service returned it.

        Raises:
            ExtractionError: If the service returns a falsy value or a string
                containing only whitespace.
        """
        text = extraction_service.extract_text(
            original_filename, file_path, **extract_kwargs
        )
        # ``not text`` also guards against a None result before ``strip`` is
        # called on it.
        if not text or not text.strip():
            raise ExtractionError("No text extracted from file")
        return text

    def classify_text(self, text: str) -> str:
        """Return the label the classifier assigns to ``text``.

        The text is preprocessed first, then classified against the labels
        currently provided by ``label_service``.
        """
        preprocessed_text = preprocess_text(text)
        labels = label_service.get_labels()
        return classifier_service.classify(preprocessed_text, labels)

    def detect_language(self, text: str) -> str:
        """Return the language detected for ``text`` after preprocessing.

        NOTE(review): ``classify_file`` compares this value with ``"en"``, so
        it is presumably a short language code -- confirm against
        ``language_service``.
        """
        preprocessed_text = preprocess_text(text)
        return language_service.detect_language(preprocessed_text)

    def transform_file(self, original_filename: str, file_path: Path) -> str:
        """Return the raw text extracted from a file.

        Raises:
            ExtractionError: If no non-whitespace text could be extracted.
        """
        return self._extract_non_blank_text(original_filename, file_path)

    def classify_file(self, original_filename: str, file_path: Path) -> dict:
        """Extract text from a file and report its label and language.

        Extraction is requested with ``pdf_first_page_only=True``
        (presumably limiting PDFs to their first page -- confirm against
        ``extraction_service``).

        Returns:
            A dict with keys ``"label"`` and ``"language"``. When the detected
            language is not ``"en"`` it also carries ``"type": "not english"``.

        Raises:
            ExtractionError: If no non-whitespace text could be extracted.
        """
        text = self._extract_non_blank_text(
            original_filename, file_path, pdf_first_page_only=True
        )

        # Preprocess once and reuse the result for both detection and
        # classification.
        preprocessed_text = preprocess_text(text)
        language = language_service.detect_language(preprocessed_text)
        labels = label_service.get_labels()
        topic = classifier_service.classify(preprocessed_text, labels)

        result = {"label": topic, "language": language}
        if language != "en":
            # The label is still returned; this key only flags the input as
            # non-English.
            result["type"] = "not english"
        return result
# Module-level pipeline instance, created once at import time and exported
# through ``__all__`` below.
classification_pipeline = ClassificationPipeline()

# Public API: the pipeline instance plus the exception types imported from
# app.core.exceptions, re-exported so they can be imported from this module.
__all__ = [
    "classification_pipeline",
    "ClassificationError",
    "LanguageDetectionError",
    "ExtractionError",
    "ValidationError",
]
|