File size: 5,195 Bytes
ac9ddbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""Prediction helpers for different model types.

This module provides `ModelPredictor`, a lightweight wrapper that unifies
inference for SetFit, scikit-learn RandomForest pipelines, and HuggingFace
transformer sequence classification models. It standardizes inputs/outputs
to a NumPy array of shape (n_samples, n_labels).
"""

import os
from typing import List, Union

import joblib
import numpy as np
from setfit import SetFitModel
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

TextInput = Union[str, List[str]]


class ModelPredictor:
    """Unified predictor for SetFit, Random Forest and Transformer models.

    Expected directory layout:

        models/
          β”œβ”€β”€ java/
          β”‚     β”œβ”€β”€ setfit/                # SetFit saved model directory
          β”‚     β”œβ”€β”€ random_forest.joblib   # sklearn pipeline
          β”‚     └── transformer/           # HF model + tokenizer (config.json, etc.)
          β”œβ”€β”€ python/
          β”‚     β”œβ”€β”€ setfit/
          β”‚     β”œβ”€β”€ random_forest.joblib
          β”‚     └── transformer/
          └── pharo/
                β”œβ”€β”€ setfit/
                β”œβ”€β”€ random_forest.joblib
                └── transformer/
    """

    def __init__(
        self,
        lang: str,
        model_type: str,
        model_root: str = "models",
        threshold: float = 0.5,
        max_length: int = 128,
    ) -> None:
        """Parameters

        ----------
        lang : str
            One of {"java", "python", "pharo"}.
        model_type : str
            One of {"setfit", "random_forest", "transformer"}.
        model_root : str
            Root directory where models are stored.
        threshold : float
            Decision threshold for multi-label Transformer predictions.
            Ignored for SetFit and Random Forest (they already output labels).
        max_length : int
            Max sequence length for Transformer tokenization.

        """
        self.lang = lang
        self.model_type = model_type
        self.model_root = model_root
        self.threshold = float(threshold)
        self.max_length = int(max_length)

        # device only matters for the Transformer backend
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if model_type == "setfit":
            self._load_setfit()
        elif model_type == "random_forest":
            self._load_random_forest()
        elif model_type == "transformer":
            self._load_transformer()
        else:
            raise ValueError(f"Unsupported model_type: {model_type}")

    def _load_setfit(self) -> None:
        # Load a SetFit model from its saved directory.
        model_path = os.path.join(self.model_root, self.lang, "setfit")
        if not os.path.isdir(model_path):
            raise FileNotFoundError(f"SetFit model not found at: {model_path}")
        self.model = SetFitModel.from_pretrained(model_path)

    def _load_random_forest(self) -> None:
        # Load a pickled sklearn pipeline (vectorizer + classifier).
        model_path = os.path.join(self.model_root, self.lang, "random_forest.joblib")
        if not os.path.isfile(model_path):
            raise FileNotFoundError(f"Random Forest model not found at: {model_path}")
        self.model = joblib.load(model_path)

    def _load_transformer(self) -> None:
        # Load tokenizer and model from the same directory used during training.
        model_path = os.path.join(self.model_root, self.lang, "transformer")
        if not os.path.isdir(model_path):
            raise FileNotFoundError(f"Transformer model not found at: {model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(
            self.device
        )
        self.model.eval()

    def _predict_transformer(self, texts: List[str]) -> np.ndarray:
        # Tokenize, run the model, and threshold sigmoid probabilities
        # (multi-label: each label decided independently).
        enc = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        enc = {k: v.to(self.device) for k, v in enc.items()}

        with torch.no_grad():
            logits = self.model(**enc).logits
        probs = torch.sigmoid(logits)
        return (probs > self.threshold).long().cpu().numpy().astype(int)

    def predict(self, texts: Union[str, List[str]]) -> np.ndarray:
        """Run prediction on one or many text samples.

        Parameters
        ----------
        texts : str | list[str]
            A single text or a list of texts.

        Returns
        -------
        np.ndarray
            Array of shape (n_samples, n_labels) with integer (typically binary) values.

        """
        if isinstance(texts, str):
            texts = [texts]

        if self.model_type == "setfit":
            outputs = np.array(list(self.model(texts)), dtype=int)
        elif self.model_type == "random_forest":
            outputs = np.array(list(self.model.predict(texts)), dtype=int)
        elif self.model_type == "transformer":
            outputs = self._predict_transformer(texts)
        else:
            raise ValueError(f"Unsupported model_type: {self.model_type}")

        # Ensure 2D shape (n_samples, n_labels). Anchoring the first axis to
        # len(texts) handles both 1-D cases correctly:
        #   - one sample, multi-label:  (n_labels,)   -> (1, n_labels)
        #   - many samples, one label:  (n_samples,)  -> (n_samples, 1)
        # The previous reshape(1, -1) mislabeled the second case as
        # (1, n_samples), breaking the documented contract.
        if outputs.ndim == 1:
            outputs = outputs.reshape(len(texts), -1)

        return outputs