import json
import torch
from typing import Any, Dict, List


class EndpointHandler():
    """Hugging Face Inference Endpoints handler for sequence classification.

    Loads a model and tokenizer from a local path and, for each input
    string, returns the softmax probability of every label defined in the
    model's config.
    """

    def __init__(self, path: str = ""):
        """Load the model and tokenizer.

        Args:
            path: Directory containing the model files (supplied by
                Inference Endpoints at startup).
        """
        # Imported here (not at module top) so this module can be imported
        # without `transformers` installed (e.g. for tests); the endpoint
        # runtime only needs it when the handler is constructed.
        from transformers import AutoModelForSequenceClassification, AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForSequenceClassification.from_pretrained(path)

        # Inference Endpoints handles GPU allocation; use CUDA if present.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()  # inference only: disables dropout etc.
        print("Model and tokenizer loaded successfully.")

    def __call__(self, data: Dict[str, Any]) -> List[List[Dict[str, Any]]]:
        """Process one inference request.

        Args:
            data: Request payload. Required key ``"inputs"`` — a string or
                a list of strings. Optional key ``"parameters"`` is
                accepted but currently unused.

        Returns:
            One inner list per input string, each holding
            ``{"label": ..., "score": ...}`` for every class in label-id
            order. A single-string input still yields a one-element outer
            list, e.g. ``[[{"label": "AI", "score": 0.98}, ...]]``.

        Raises:
            ValueError: If ``"inputs"`` is missing from the payload.
        """
        # .get() rather than .pop(): don't mutate the caller's dict.
        inputs = data.get("inputs")
        if inputs is None:
            raise ValueError("Missing 'inputs' key in request data")

        # Tokenize; padding/truncation make single strings and batches
        # both work. Cap at the tokenizer's declared max sequence length.
        tokenized_inputs = self.tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.tokenizer.model_max_length,
        ).to(self.device)

        with torch.no_grad():  # no gradients needed for inference
            logits = self.model(**tokenized_inputs).logits

        # Convert logits to per-class probabilities.
        probabilities = torch.softmax(logits, dim=-1)

        # Map each class index to its label string from the model config.
        id2label = self.model.config.id2label
        return [
            [
                {"label": id2label[j], "score": score}
                for j, score in enumerate(row.tolist())
            ]
            for row in probabilities
        ]