File size: 4,524 Bytes
99f1de6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import json
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from typing import Dict, List, Any

class EndpointHandler():
    def __init__(self, path=""):
        """
        Load the sequence-classification model and tokenizer.

        Args:
            path (str): Path to the directory containing the model files
                (supplied by Inference Endpoints at container startup).
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForSequenceClassification.from_pretrained(path)

        # Inference Endpoints allocates a GPU when the instance type has one;
        # fall back to CPU otherwise.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()  # disable dropout/batch-norm updates for inference
        print("Model and tokenizer loaded successfully.")


    def __call__(self, data: Dict[str, Any]) -> List[List[Dict[str, Any]]]:
        """
        Process an inference request.

        Args:
            data (Dict[str, Any]): Request payload. Required key "inputs"
                (a string or a list of strings). Optional key "parameters"
                (dict) is accepted for API compatibility but currently unused.

        Returns:
            List[List[Dict[str, Any]]]: One inner list per input string,
            containing a {"label": ..., "score": ...} entry for every class,
            e.g. [[{"label": "AI", "score": 0.98}, ...]]. A single-string
            input still produces the list-of-lists shape (one inner list).

        Raises:
            ValueError: If the "inputs" key is missing from the payload.
        """
        inputs = data.pop("inputs", None)
        data.pop("parameters", {})  # reserved for future options; not used yet

        if inputs is None:
            raise ValueError("Missing 'inputs' key in request data")

        # Tokenize; the tokenizer accepts a single string or a list of
        # strings, and padding + truncation make mixed-length batches valid.
        tokenized_inputs = self.tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.tokenizer.model_max_length,  # model's max sequence length
        ).to(self.device)

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = self.model(**tokenized_inputs)

        # Convert raw logits into per-class probabilities.
        probabilities = torch.softmax(outputs.logits, dim=-1)

        # One prediction list per batch row, mapping each class index to its
        # configured label string.
        id2label = self.model.config.id2label
        results = [
            [
                {"label": id2label[class_idx], "score": score}
                for class_idx, score in enumerate(row.tolist())
            ]
            for row in probabilities
        ]

        return results