File size: 1,351 Bytes

c1cb360
 
1e592e3
fe371ad
 
1aeb34c
eff3ac4
b47e2d8
c1cb360
b47e2d8
52afa01
e126c73
e8628b3
e126c73
 
 
b3aebd1
eff3ac4
1aeb34c
1e592e3
 
 
 
 
eff3ac4
1e592e3
 
b47e2d8
eff3ac4

from typing import Dict, List, Any
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

class EndpointHandler:
    """Text-generation request handler built on a ``transformers`` pipeline.

    The tokenizer and model are loaded once in ``__init__``; every call then
    forwards the request's prompts and generation parameters to the pipeline.
    """

    # Identifier of the checkpoint loaded for both the tokenizer and the model.
    MODEL_ID = "Qwen/Qwen2-1.5B-Instruct"

    def __init__(self, path=""):
        """Load the tokenizer and model and build the generation pipeline.

        Args:
            path: Accepted but not used by this handler; the checkpoint is
                always ``MODEL_ID``.
        """
        # Query CUDA availability once so dtype and placement always agree.
        use_cuda = torch.cuda.is_available()

        tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
        model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_ID,
            # Half precision when a GPU is available, full precision otherwise.
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device_map="cuda" if use_cuda else "auto",
        )

        # No `device` argument here: placement was already requested through
        # `device_map` when the model was loaded.
        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

    def __call__(self, data: Any) -> List[List[Dict[str, Any]]]:
        """Run the pipeline for a single request.

        Args:
            data: Request payload. Normally a dict with an ``"inputs"`` entry
                (one prompt string or a list of prompts) and an optional
                ``"parameters"`` dict whose items are forwarded to the
                pipeline as keyword arguments. A dict without ``"inputs"`` is
                itself used as the input (with ``"parameters"`` removed); a
                non-dict payload is used as the input directly.

        Returns:
            The pipeline's output, returned unchanged.
        """
        if isinstance(data, dict):
            # Work on a shallow copy so the caller's payload is left intact.
            payload = dict(data)
            inputs = payload.pop("inputs", payload)
            # A missing or null "parameters" entry means "no extra arguments".
            parameters = payload.pop("parameters", None) or {}
        else:
            inputs, parameters = data, {}

        # A single prompt is wrapped so the pipeline always receives a batch.
        if isinstance(inputs, str):
            inputs = [inputs]

        return self.pipeline(inputs, **parameters)

# Demo: build the handler and run one sample request when executed as a script.
if __name__ == "__main__":
    sample_request = {
        "inputs": "Hello, how can I",
        "parameters": {"max_length": 50, "num_return_sequences": 1},
    }
    endpoint = EndpointHandler()
    print(endpoint(sample_request))