from typing import Dict, List, Any

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Hugging Face Hub identifier of the checkpoint served by this handler.
MODEL_ID = "Qwen/Qwen2-1.5B-Instruct"


class EndpointHandler:
    """Callable wrapper around a ``text-generation`` pipeline for MODEL_ID.

    An instance is built once and then called with a request payload; the
    call forwards the payload's inputs and generation parameters to the
    pipeline and returns the pipeline's result unchanged.
    """

    def __init__(self, path=""):
        """Load the tokenizer and model and build the generation pipeline.

        Args:
            path: Accepted for interface compatibility but not used; the
                weights are always loaded by the ``MODEL_ID`` identifier.
        """
        cuda_available = torch.cuda.is_available()

        # Load the model
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            # Half precision only when a CUDA device is present.
            torch_dtype=torch.float16 if cuda_available else torch.float32,
            # Include device_map for correct device allocation
            device_map="cuda" if cuda_available else "auto",
        )

        # Create inference pipeline without specifying the device; placement
        # was already decided by device_map above.
        self.pipeline = pipeline(
            "text-generation", model=model, tokenizer=tokenizer
        )

    def __call__(self, data: Any) -> List[List[Dict[str, Any]]]:
        """Run text generation for one request payload.

        Args:
            data: Mapping with an ``"inputs"`` entry (a prompt string or a
                value the pipeline accepts directly) and an optional
                ``"parameters"`` mapping of keyword arguments forwarded to
                the pipeline. When ``"inputs"`` is absent, the remaining
                payload itself is passed as the input.

        Returns:
            Whatever the pipeline returns for the given inputs.
        """
        # Work on a shallow copy so the caller's payload is not mutated.
        request = dict(data)
        inputs = request.pop("inputs", request)
        # A payload may carry "parameters": null; treat that as "none given".
        parameters = request.pop("parameters", None) or {}

        # A single prompt is wrapped so the pipeline always receives a batch.
        if isinstance(inputs, str):
            inputs = [inputs]

        # Get predictions from the pipeline
        return self.pipeline(inputs, **parameters)


# Example usage
if __name__ == "__main__":
    handler = EndpointHandler()
    data = {
        "inputs": "Hello, how can I",
        "parameters": {"max_length": 50, "num_return_sequences": 1},
    }
    result = handler(data)
    print(result)