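"""Zero-shot text classification backed by a natural-language-inference (NLI) model.

Each candidate label is rendered into the hypothesis "This text is about {label}."
and paired with the input text as the premise; the label whose pair receives the
highest entailment logit is returned.
"""
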
import logging
from typing import Any

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from app.core.config import settings
from app.core.exceptions import ClassificationError

logger = logging.getLogger(__name__)


class ClassifierService:
    _HYPOTHESIS_TEMPLATE = "This text is about {}."

    def __init__(self) -> None:
        self._tokenizer: Any | None = None
        self._model: Any | None = None

    def _load_model(self) -> tuple[Any, Any]:
        if self._tokenizer is None or self._model is None:
            try:
                tokenizer = AutoTokenizer.from_pretrained(
                    settings.classifier_model,
                    token=settings.huggingface_token,
                )
                model = AutoModelForSequenceClassification.from_pretrained(
                    settings.classifier_model,
                    token=settings.huggingface_token,
                )
                model.eval()
                model.to("cpu")
                if settings.enable_model_quantization:
                    try:
                        # Dynamic INT8 quantization for CPU inference.
                        quantized_model = torch.ao.quantization.quantize_dynamic(
                            model,
                            {torch.nn.Linear},
                            dtype=torch.qint8,
                        )
                        model = quantized_model
                    except Exception:
                        logger.warning(
                            "Model quantization failed; using non-quantized model instead.",
                            exc_info=True,
                        )
                self._tokenizer = tokenizer
                self._model = model
            except Exception as exc:
                raise ClassificationError("Unable to initialize classifier model") from exc
        return self._tokenizer, self._model

    def warmup(self) -> None:
        """Eagerly load the model so the first request does not pay the cold-start cost."""
        self._load_model()

    @staticmethod
    def _normalize_labels(labels: list[str]) -> list[str]:
        # Keep non-empty strings, strip whitespace, and deduplicate while preserving order.
        cleaned = [label.strip() for label in labels if isinstance(label, str) and label.strip()]
        return list(dict.fromkeys(cleaned))
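
    # Example: ClassifierService._normalize_labels(["  finance ", "finance", "", "sports"])
    # returns ["finance", "sports"]: whitespace stripped, empties dropped, order-preserving dedup.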

    @staticmethod
    def _parse_label_id(value: Any) -> int | None:
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _extract_task_specific_entailment_id(model: Any) -> int | None:
        task_specific_params = getattr(model.config, "task_specific_params", {}) or {}
        if not isinstance(task_specific_params, dict):
            return None
        zero_shot_params = task_specific_params.get("zero-shot-classification", {})
        if not isinstance(zero_shot_params, dict):
            return None
        return ClassifierService._parse_label_id(zero_shot_params.get("entailment_id"))

    @staticmethod
    def _has_generic_label_names(model: Any) -> bool:
        label2id = getattr(model.config, "label2id", {}) or {}
        id2label = getattr(model.config, "id2label", {}) or {}
        labels: list[str] = []
        labels.extend(label for label in label2id.keys() if isinstance(label, str))
        labels.extend(label for label in id2label.values() if isinstance(label, str))
        if not labels:
            return False
        return all(label.lower().startswith("label_") for label in labels)

    @staticmethod
    def _resolve_entailment_id(model: Any) -> int:
        # Prefer an explicit "entail*" entry in label2id, then id2label, then the
        # model's zero-shot task params, then the environment override.
        label2id = getattr(model.config, "label2id", {}) or {}
        for label, label_id in label2id.items():
            if isinstance(label, str) and label.lower().startswith("entail"):
                parsed = ClassifierService._parse_label_id(label_id)
                if parsed is not None:
                    return parsed
        id2label = getattr(model.config, "id2label", {}) or {}
        for label_id, label in id2label.items():
            if isinstance(label, str) and label.lower().startswith("entail"):
                parsed = ClassifierService._parse_label_id(label_id)
                if parsed is not None:
                    return parsed
        task_specific_entailment_id = ClassifierService._extract_task_specific_entailment_id(model)
        if task_specific_entailment_id is not None:
            return task_specific_entailment_id
        if settings.classifier_entailment_label_id is not None:
            return settings.classifier_entailment_label_id
        num_labels = ClassifierService._parse_label_id(getattr(model.config, "num_labels", None))
        if num_labels == 3 and (
            ClassifierService._has_generic_label_names(model) or (not label2id and not id2label)
        ):
            logger.warning(
                "Falling back to entailment label id 2 because model config labels are generic or missing "
                "and no explicit entailment mapping was found. Set CLASSIFIER_ENTAILMENT_LABEL_ID "
                "to override this behavior."
            )
            return 2
        raise ClassificationError(
            "Classifier model is missing an entailment label mapping. "
            "Set CLASSIFIER_ENTAILMENT_LABEL_ID in the environment when the model config "
            "does not expose an entailment label."
        )
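
    # For reference: a typical NLI checkpoint such as facebook/bart-large-mnli ships
    # id2label == {0: "contradiction", 1: "neutral", 2: "entailment"}, so the prefix
    # match in _resolve_entailment_id returns 2 without touching any fallback.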

    def classify(self, text: str, labels: list[str]) -> str:
        candidate_labels = self._normalize_labels(labels)
        if not candidate_labels:
            raise ClassificationError("No labels configured")
        tokenizer, model = self._load_model()
        entailment_id = self._resolve_entailment_id(model)
        try:
            # One (premise, hypothesis) pair per candidate label; truncate only the
            # premise so the hypothesis is never cut off.
            sequence_pairs = [[text, self._HYPOTHESIS_TEMPLATE.format(label)] for label in candidate_labels]
            inputs = tokenizer(
                sequence_pairs,
                padding=True,
                truncation="only_first",
                return_tensors="pt",
            )
            with torch.no_grad():
                logits = model(**inputs).logits
            if logits.ndim != 2:
                raise ClassificationError("Classifier returned unexpected logits shape")
            if entailment_id < 0 or entailment_id >= logits.shape[-1]:
                raise ClassificationError("Entailment label index is out of range for classifier output")
            entailment_logits = logits[:, entailment_id]
            best_index = int(torch.argmax(entailment_logits).item())
            return candidate_labels[best_index]
        except ClassificationError:
            raise  # preserve the specific message instead of rewrapping it below
        except Exception as exc:
            raise ClassificationError("Classifier prediction failed") from exc


classifier_service = ClassifierService()
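

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only). It assumes settings.classifier_model
    # points at an NLI checkpoint (e.g. facebook/bart-large-mnli) and that any required
    # Hugging Face token and quantization settings are configured; the first call
    # downloads the model, so do not run this in unit tests.
    classifier_service.warmup()  # load the model eagerly instead of on first classify()
    predicted = classifier_service.classify(
        "The quarterly report shows revenue grew 12% year over year.",  # hypothetical input
        ["finance", "sports", "weather"],  # hypothetical candidate labels
    )
    print(predicted)  # a well-trained NLI model is expected to pick "finance"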