"""
Custom handler for TranslateGemma on HuggingFace Inference Endpoints.
Properly handles the special chat template format.
"""
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
from typing import Dict, Any
class EndpointHandler:
    """Custom inference handler for TranslateGemma on HF Inference Endpoints.

    Translates text between languages using either the model's structured
    chat-template message format (standard language codes) or a free-form
    "Translate to ..." instruction in ``target_lang_code`` for languages the
    template does not cover.
    """

    def __init__(self, path: str = ""):
        """Load the TranslateGemma processor and model.

        Args:
            path: Local directory supplied by the Inference Endpoints runtime.
                Deliberately unused here — presumably the handler repo contains
                only this file, so weights are always pulled from the Hub
                (TODO confirm against the deployed repo layout).
        """
        # Load from HuggingFace Hub directly (see note on `path` above).
        model_id = "google/translategemma-12b-it"
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.model = AutoModelForImageTextToText.from_pretrained(
            model_id,
            device_map="auto",           # place/shard across available devices
            torch_dtype=torch.bfloat16,  # half the fp32 memory footprint
        )

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Process a translation request.

        Expected input format::

            {
                "inputs": {
                    "text": "Text to translate",
                    "source_lang_code": "en",
                    "target_lang_code": "ja_JP"
                },
                "parameters": {"max_new_tokens": 200}
            }

        ``target_lang_code`` may alternatively be a free-form prompt starting
        with "Translate to" for languages without a supported code.

        Returns:
            ``{"translation", "source_lang", "target_lang"}`` on success, or
            ``{"error": ...}`` for malformed input.
        """
        inputs_data = data.get("inputs", data)
        # Robustness fix: an explicit ``"parameters": null`` no longer crashes.
        parameters = data.get("parameters", {}) or {}

        if not isinstance(inputs_data, dict):
            # Fallback for simple string input
            return {"error": "Expected dict with text, source_lang_code, target_lang_code"}

        text = inputs_data.get("text", "")
        source_lang = inputs_data.get("source_lang_code", "en")
        target_lang = inputs_data.get("target_lang_code", "en")

        # Robustness fix: reject empty text instead of running a pointless
        # (and slow) generation pass on it.
        if not text:
            return {"error": "Field 'text' must be a non-empty string"}

        # Check if target_lang is a custom prompt (for unsupported languages).
        # Bug fix: isinstance guard — a non-string target_lang_code previously
        # raised AttributeError on .startswith() and 500'd the endpoint.
        is_custom_prompt = (
            isinstance(target_lang, str) and target_lang.startswith("Translate to")
        )

        if is_custom_prompt:
            # Custom prompt format for unsupported languages, using Gemma's
            # raw turn markers. Add explicit instruction to return ONLY the
            # translation.
            prompt = f"<start_of_turn>user\n{target_lang} Output only the translation, no explanations.\n\n{text}<end_of_turn>\n<start_of_turn>model\n"
            tokenized = self.processor.tokenizer(
                prompt,
                return_tensors="pt",
                add_special_tokens=True,
            )
            inputs = {k: v.to(self.model.device) for k, v in tokenized.items()}
        else:
            # Standard language code: structured message format consumed by
            # the TranslateGemma chat template.
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "source_lang_code": source_lang,
                            "target_lang_code": target_lang,
                            "text": text,
                        }
                    ],
                }
            ]
            # Bug fix: dropped the ``dtype=torch.bfloat16`` argument from
            # ``.to()``. Token ids are integer tensors, so the cast is at best
            # a no-op, and it raises TypeError on transformers versions where
            # BatchEncoding.to() accepts only a device.
            inputs = self.processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            ).to(self.model.device)

        max_new_tokens = parameters.get("max_new_tokens", 2000)
        with torch.inference_mode():
            generation = self.model.generate(
                **inputs,
                do_sample=False,  # deterministic greedy decoding
                max_new_tokens=max_new_tokens,
            )

        # Decode only the newly generated tokens (skip the echoed prompt).
        input_len = inputs["input_ids"].shape[1]
        generated_tokens = generation[0][input_len:]
        translation = self.processor.decode(generated_tokens, skip_special_tokens=True)

        return {
            "translation": translation.strip(),
            "source_lang": source_lang,
            "target_lang": target_lang,
        }