# NOTE(review): repository-upload residue ("Sky-Blue-da-ba-dee's picture",
# "added files", commit ac9ddbb) — commented out so the module parses.
"""Prediction helpers for different model types.
This module provides `ModelPredictor`, a lightweight wrapper that unifies
inference for SetFit, scikit-learn RandomForest pipelines, and HuggingFace
transformer sequence classification models. It standardizes inputs/outputs
to a NumPy array of shape (n_samples, n_labels).
"""
import os
from typing import List, Union
import joblib
import numpy as np
from setfit import SetFitModel
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
TextInput = Union[str, List[str]]
class ModelPredictor:
    """Unified predictor for SetFit, Random Forest and Transformer models.

    Expected directory layout::

        models/
        ├── java/
        │   ├── setfit/                 # SetFit saved model directory
        │   ├── random_forest.joblib    # sklearn pipeline
        │   └── transformer/            # HF model + tokenizer (config.json, etc.)
        ├── python/
        │   ├── setfit/
        │   ├── random_forest.joblib
        │   └── transformer/
        └── pharo/
            ├── setfit/
            ├── random_forest.joblib
            └── transformer/
    """

    def __init__(
        self,
        lang: str,
        model_type: str,
        model_root: str = "models",
        threshold: float = 0.5,
        max_length: int = 128,
    ) -> None:
        """Load the requested model from disk.

        Parameters
        ----------
        lang : str
            One of {"java", "python", "pharo"}.
        model_type : str
            One of {"setfit", "random_forest", "transformer"}.
        model_root : str
            Root directory where models are stored.
        threshold : float
            Decision threshold for multi-label Transformer predictions.
            Ignored for SetFit and Random Forest (they already output labels).
        max_length : int
            Max sequence length for Transformer tokenization.

        Raises
        ------
        FileNotFoundError
            If the expected model file/directory is missing.
        ValueError
            If ``model_type`` is not one of the supported values.
        """
        self.lang = lang
        self.model_type = model_type
        self.model_root = model_root
        self.threshold = float(threshold)
        self.max_length = int(max_length)
        # device only matters for the Transformer branch
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if model_type == "setfit":
            model_path = os.path.join(self.model_root, self.lang, "setfit")
            if not os.path.isdir(model_path):
                raise FileNotFoundError(f"SetFit model not found at: {model_path}")
            self.model = SetFitModel.from_pretrained(model_path)
        elif model_type == "random_forest":
            model_path = os.path.join(self.model_root, self.lang, "random_forest.joblib")
            if not os.path.isfile(model_path):
                raise FileNotFoundError(f"Random Forest model not found at: {model_path}")
            self.model = joblib.load(model_path)
        elif model_type == "transformer":
            model_path = os.path.join(self.model_root, self.lang, "transformer")
            if not os.path.isdir(model_path):
                raise FileNotFoundError(f"Transformer model not found at: {model_path}")
            # load tokenizer and model from the same directory used during training
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(
                self.device
            )
            self.model.eval()
        else:
            raise ValueError(f"Unsupported model_type: {model_type}")

    def predict(self, texts: Union[str, List[str]]) -> np.ndarray:
        """Run prediction on one or many text samples.

        Parameters
        ----------
        texts : str | list[str]
            A single text or a list of texts.

        Returns
        -------
        np.ndarray
            Array of shape (n_samples, n_labels) with integer (typically
            binary) values.

        Raises
        ------
        ValueError
            If ``self.model_type`` is not one of the supported values.
        """
        if isinstance(texts, str):
            texts = [texts]

        if self.model_type == "setfit":
            raw_outputs = self.model(texts)
            outputs = np.array(list(raw_outputs), dtype=int)
        elif self.model_type == "random_forest":
            outputs = np.asarray(self.model.predict(texts), dtype=int)
        elif self.model_type == "transformer":
            enc = self.tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
            )
            enc = {k: v.to(self.device) for k, v in enc.items()}
            with torch.no_grad():
                logits = self.model(**enc).logits
            # multi-label head: independent sigmoid per label, thresholded
            probs = torch.sigmoid(logits)
            outputs = (probs > self.threshold).long().cpu().numpy().astype(int)
        else:
            raise ValueError(f"Unsupported model_type: {self.model_type}")

        # Single-label models return a flat array with one prediction per
        # sample; make it a column so the result is always (n_samples,
        # n_labels).  The previous reshape(1, -1) produced (1, n_samples)
        # whenever more than one text was passed, violating the contract.
        if outputs.ndim == 1:
            outputs = outputs.reshape(-1, 1)
        return outputs