Spaces:

AyoubChLin
/

classifier-general

Sleeping

App Files Files Community

classifier-general / app /pipelines /classification_pipeline.py

AyoubChLin

feat: update classifier model to local zero-shot NLI and enhance language detection with local library

2d0ef3b about 1 month ago

raw

history blame contribute delete

2 kB

	from pathlib import Path

	from app.core.exceptions import ClassificationError, ExtractionError, LanguageDetectionError, ValidationError
	from app.pipelines.text_pipeline import preprocess_text
	from app.services.classifier_service import classifier_service
	from app.services.extraction_service import extraction_service
	from app.services.label_service import label_service
	from app.services.language_service import language_service


	class ClassificationPipeline:
	    """Chain the extraction, preprocessing, language and classifier services.

	    The class keeps no state of its own; each method forwards to the
	    module-level service objects imported by this file.
	    """

	    def classify_text(self, text: str) -> str:
	        """Return the classifier service's result for *text*.

	        The text is passed through ``preprocess_text`` first and is then
	        classified against the labels from ``label_service.get_labels()``.
	        """
	        cleaned = preprocess_text(text)
	        candidate_labels = label_service.get_labels()
	        return classifier_service.classify(cleaned, candidate_labels)

	    def detect_language(self, text: str) -> str:
	        """Return the language service's result for the preprocessed *text*."""
	        return language_service.detect_language(preprocess_text(text))

	    def transform_file(self, original_filename: str, file_path: Path) -> str:
	        """Extract the text of a file and return it without preprocessing.

	        Raises:
	            ExtractionError: If the extraction service returns nothing or
	                only whitespace.
	        """
	        extracted = extraction_service.extract_text(original_filename, file_path)
	        if extracted and extracted.strip():
	            return extracted
	        raise ExtractionError("No text extracted from file")

	    def classify_file(self, original_filename: str, file_path: Path) -> dict:
	        """Extract a file's text, then detect its language and classify it.

	        Extraction is requested with ``pdf_first_page_only=True``
	        (presumably limiting PDFs to their first page -- confirm against
	        the extraction service).

	        Returns:
	            A dict with the keys ``"label"`` and ``"language"``. When the
	            detected language is not ``"en"`` it also carries
	            ``"type": "not english"``.

	        Raises:
	            ExtractionError: If the extraction service returns nothing or
	                only whitespace.
	        """
	        extracted = extraction_service.extract_text(
	            original_filename,
	            file_path,
	            pdf_first_page_only=True,
	        )
	        if not (extracted and extracted.strip()):
	            raise ExtractionError("No text extracted from file")

	        cleaned = preprocess_text(extracted)

	        # Same order as before: language first, then labels, then the classifier.
	        detected_language = language_service.detect_language(cleaned)
	        candidate_labels = label_service.get_labels()
	        outcome = {
	            "label": classifier_service.classify(cleaned, candidate_labels),
	            "language": detected_language,
	        }

	        # Non-English input is still classified; it is only flagged.
	        if detected_language != "en":
	            outcome["type"] = "not english"
	        return outcome


	# Module-level pipeline instance, created when this module is imported.
	classification_pipeline = ClassificationPipeline()


	# Names exported by a star-import of this module: the pipeline instance
	# plus the exception classes imported from app.core.exceptions above.
	__all__ = [
	"classification_pipeline",
	"ClassificationError",
	"LanguageDetectionError",
	"ExtractionError",
	"ValidationError",
	]