from typing import Any, Dict, List

import base64
import io

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor


class EndpointHandler:
    """Custom Inference Endpoints handler for an image-to-text model.

    Loads the repository's custom processor/model (via ``trust_remote_code``)
    and decodes predictions with the processor's ``batch_decode`` — presumably
    CTC decoding, per the original comments (confirm against the repo code).
    """

    def __init__(self, path: str = "") -> None:
        """Load the processor and model from *path* and select the device.

        Args:
            path: local path of the model repository snapshot.
        """
        # trust_remote_code is required because the processor/model classes
        # are defined by custom code shipped inside the model repo.
        self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(path, trust_remote_code=True)

        # Prefer GPU when available; fall back to CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

    @staticmethod
    def _to_pil(inputs_data: Any) -> Image.Image:
        """Coerce the deserialized request payload into a PIL image.

        Accepts a PIL Image (the Hub widget's normal payload), raw image
        bytes, or a base64-encoded string.

        Raises:
            ValueError: if the payload is none of the supported types.
        """
        if isinstance(inputs_data, Image.Image):
            return inputs_data
        if isinstance(inputs_data, (bytes, bytearray)):
            # Raw image bytes (e.g. a POSTed file body).
            return Image.open(io.BytesIO(inputs_data))
        if isinstance(inputs_data, str):
            # Assume a base64-encoded image, the common JSON payload form.
            return Image.open(io.BytesIO(base64.b64decode(inputs_data)))
        raise ValueError(
            f"Unsupported input type: {type(inputs_data).__name__}; "
            "expected a PIL Image, raw bytes, or a base64 string"
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run inference on a single image.

        Args:
            data: request payload; the image is under the ``inputs`` key
                (PIL Image from the widget, or bytes / base64 string).

        Returns:
            ``[{"generated_text": <prediction>}]`` — the list-of-dicts shape
            the Hub image-to-text widget expects.
        """
        # The Hub's image-to-text widget sends the image under "inputs";
        # fall back to the whole payload if that key is absent.
        image = self._to_pil(data.pop("inputs", data))

        # 1. Preprocess with the repo's custom processor.
        processed_inputs = self.processor(image)
        pixel_values = processed_inputs["pixel_values"].to(self.device)

        # 2. Inference without autograd bookkeeping.
        with torch.no_grad():
            logits = self.model(pixel_values).logits

        # 3. Decode; batch size is 1, so take the first decoded string.
        prediction = self.processor.batch_decode(logits)[0]
        return [{"generated_text": prediction}]