MLGResearch
/

cleaver_t5g_ss

import os
from typing import Any, Dict, List

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
class EndpointHandler:
    """Inference handler wrapping a T5Gemma seq2seq model and its tokenizer.

    The handler is constructed once with the model directory and is then
    called once per request with the decoded request payload.
    """

    def __init__(self, path: str = "") -> None:
        """Load the tokenizer and model found at *path*.

        ``trust_remote_code=True`` is passed to both loaders because new
        architectures may ship their modelling code with the checkpoint.

        Args:
            path: Directory (or hub id) holding the model and tokenizer files.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            path,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        # Place the model on the accelerator when one is present; otherwise
        # it stays on the CPU, where it was loaded.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(device)
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[str]:
        """Generate one completion per input prompt.

        Args:
            data: Request payload. ``"inputs"`` is a single prompt string or
                a list of prompt strings. Generation keyword arguments are
                read from the nested ``"parameters"`` object; any other
                top-level keys are also forwarded to ``generate`` for
                backward compatibility, with ``"parameters"`` taking
                precedence on conflicts.

        Returns:
            The decoded generations, in the same order as the inputs. A
            single string input still yields a one-element list; a payload
            without ``"inputs"`` yields an empty list.

        Raises:
            ValueError: If ``"parameters"`` is present but is not a mapping.
        """
        # Work on a shallow copy so the caller's request dict is not mutated.
        payload = dict(data)
        prompts = payload.pop("inputs", [])
        nested = payload.pop("parameters", None) or {}
        if not isinstance(nested, dict):
            raise ValueError('"parameters" must be a JSON object of generation arguments')
        generate_kwargs = {**payload, **nested}

        if isinstance(prompts, str):
            prompts = [prompts]

        device = self.model.device
        results = []
        # Prompts are processed one at a time, so no padding is involved.
        for prompt in prompts:
            messages = [{"role": "user", "content": prompt}]
            # Ask for a dict explicitly so the attention mask travels with
            # the input ids regardless of the library's default return type.
            encoded = self.tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt",
                return_dict=True,
            ).to(device)
            output_tokens = self.model.generate(**encoded, **generate_kwargs)
            # Decode the first (only) sequence returned for this prompt.
            results.append(
                self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)
            )
        return results