from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


class Model:
    """Thin wrapper that loads a causal language model and generates text.

    The tokenizer and the model weights are both loaded from the same
    location (by default ``"."``, i.e. the directory the process is run
    from). Calling the instance with a prompt returns the decoded text of
    the first generated sequence.
    """

    def __init__(self, model_path: str = "."):
        """Load the tokenizer and model from ``model_path``.

        Args:
            model_path: Location handed to both ``from_pretrained`` calls.
                Defaults to ``"."``.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # Half precision only when a CUDA device is present; otherwise stay
        # in full precision.
        dtype = torch.float16 if device == "cuda" else torch.float32

        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # device_map="auto" delegates weight placement to the library.
        # NOTE(review): this option normally needs the `accelerate` package
        # to be installed -- confirm for the deployment environment.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype=dtype,
            device_map="auto",
        )

    def __call__(
        self,
        prompt: str,
        *,
        max_new_tokens: int = 128,
        temperature: float = 0.7,
        do_sample: bool = True,
    ) -> str:
        """Generate text for ``prompt`` and return it decoded.

        Args:
            prompt: Input text to tokenize and feed to the model.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature; only forwarded when
                ``do_sample`` is true.
            do_sample: Whether to sample instead of decoding greedily.

        Returns:
            The first output sequence decoded with special tokens skipped.
            NOTE(review): for decoder-only models ``generate`` returns the
            prompt tokens followed by the new ones, so the returned text is
            expected to start with the prompt -- confirm callers rely on
            that.
        """
        # Move the encoded prompt to whichever device the model reports.
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": do_sample,
        }
        if do_sample:
            # Temperature is a sampling-only knob; skip it for greedy runs.
            generation_kwargs["temperature"] = temperature

        output_ids = self.model.generate(**inputs, **generation_kwargs)
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)