from typing import Dict, List, Any
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoImageProcessor,
)
import torch
from PIL import Image
import base64
import io

# Pick dtype and device: bfloat16 on GPU, float32 on CPU
# (float16 is poorly supported for CPU inference and can produce NaNs)
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
device = "cuda" if torch.cuda.is_available() else "cpu"
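
# Note: Hugging Face Inference Endpoints discover a custom handler as a
# handler.py at the repository root exposing an EndpointHandler class with
# this __init__(path) / __call__(data) contract; `path` points at the model repo.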

class EndpointHandler:
    def __init__(self, path=""):
        print(f"Initializing model on device: {device}")
        print(f"Using dtype: {dtype}")
        
        # Load tokenizer and image processor (custom code shipped with the
        # model, hence trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
        self.image_processor = AutoImageProcessor.from_pretrained(path, trust_remote_code=True)
        
        # Load model with explicit device mapping
        if device == "cuda":
            self.model = AutoModel.from_pretrained(
                path, 
                torch_dtype=dtype, 
                trust_remote_code=True,
                device_map="auto"  # Automatically map to available GPUs
            )
        else:
            self.model = AutoModel.from_pretrained(
                path, 
                torch_dtype=dtype, 
                trust_remote_code=True
            )
            self.model = self.model.to(device)
        
        print(f"Model loaded successfully on device: {self.model.device}")
        print(f"Model dtype: {next(self.model.parameters()).dtype}")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
       data args:
            inputs (:obj: `str` or `list`): messages in chat format or text input
            parameters (:obj: `dict`): generation parameters
      Return:
            A :obj:`list` | `dict`: will be serialized and returned
        """
        print("Call inside handler")
        # get inputs
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", {})
        print("parameters", parameters)
        
        # Drop TGI-style parameters that transformers' generate() does not accept
        parameters.pop("details", None)
        parameters.pop("stop", None)
        parameters.pop("return_full_text", None)

        # Default generation parameters. temperature=0 (the default) means
        # greedy decoding; generate() rejects temperature=0 when sampling.
        max_new_tokens = parameters.pop("max_new_tokens", 512)
        temperature = parameters.pop("temperature", 0)
        if temperature and temperature > 0:
            parameters["temperature"] = float(temperature)
            parameters.setdefault("do_sample", True)
        else:
            parameters["do_sample"] = False
        
        try:
            # Handle different input formats
            if isinstance(inputs, str):
                # If it's a string, treat it as a simple text prompt
                input_ids = self.tokenizer.encode(inputs, return_tensors="pt").to(self.model.device)
                generated_ids = self.model.generate(
                    input_ids,
                    max_new_tokens=max_new_tokens,
                    **parameters
                )
                prompt_len = input_ids.shape[1]
                generated_ids = generated_ids[:, prompt_len:]
                output_text = self.tokenizer.batch_decode(
                    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
                )
                return [{"generated_text": output_text[0]}]
            
            elif isinstance(inputs, list):
                # Handle chat format with images
                messages = inputs
                
                # Apply chat template
                input_ids = self.tokenizer.apply_chat_template(
                    messages, tokenize=True, add_generation_prompt=True
                )
                # Debug: log the fully rendered prompt
                input_text = self.tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)
                print(input_text)

                input_ids = torch.tensor([input_ids]).to(self.model.device)
                
                # Process ALL images if present
                pixel_values_list = []
                grid_thws_list = []
                
                # Look for images in the messages
                for message in messages:
                    if isinstance(message.get("content"), list):
                        for content_item in message["content"]:
                            if content_item.get("type") == "image_url":
                                image_data = content_item.get("image_url").get("url", "")
                                if image_data.startswith("data:image"):
                                    # Decode base64 image
                                    image_data = image_data.split(",")[1]
                                    image_bytes = base64.b64decode(image_data)
                                    image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
                                    
                                    # Process each image individually
                                    info = self.image_processor.preprocess(images=[image])
                                    pixel_values = torch.tensor(info['pixel_values']).to(dtype=dtype, device=self.model.device)
                                    grid_thws = torch.tensor(info['image_grid_thw']).to(self.model.device)
                                    
                                    pixel_values_list.append(pixel_values)
                                    grid_thws_list.append(grid_thws)
                
                # Generate response
                if pixel_values_list and grid_thws_list:
                    # Multi-modal generation with images
                    # Concatenate all pixel_values and grid_thws for batch processing
                    all_pixel_values = torch.cat(pixel_values_list, dim=0)
                    all_grid_thws = torch.cat(grid_thws_list, dim=0)
                    
                    print(f"Processing {len(pixel_values_list)} images")
                    print(f"pixel_values shape: {all_pixel_values.shape}")
                    print(f"grid_thws shape: {all_grid_thws.shape}")
                    print("grid_thws", all_grid_thws)
                    
                    # Ensure all tensors are on the same device as the model
                    all_pixel_values = all_pixel_values.to(self.model.device)
                    all_grid_thws = all_grid_thws.to(self.model.device)
                    
                    with torch.no_grad():
                        generated_ids = self.model.generate(
                            input_ids,
                            pixel_values=all_pixel_values,
                            grid_thws=all_grid_thws,
                            max_new_tokens=max_new_tokens,
                            **parameters
                        )
                else:
                    # Text-only generation
                    generated_ids = self.model.generate(
                        input_ids,
                        max_new_tokens=max_new_tokens,
                        **parameters
                    )
                
                prompt_len = input_ids.shape[1]
                generated_ids = generated_ids[:, prompt_len:]
                output_text = self.tokenizer.batch_decode(
                    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
                )
                print("##Model Response##", output_text)
                return [{"generated_text": output_text[0]}]
            
            else:
                raise ValueError(f"Unsupported input type: {type(inputs)}")
                
        except Exception as e:
            print(f"Error during inference: {str(e)}")
            return [{"error": str(e)}]