import transformers
import torch
# from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

from deepeval.models import DeepEvalBaseLLM


class Llemma_Finetuned(DeepEvalBaseLLM):
    """DeepEval model wrapper around an already-loaded causal LM and tokenizer.

    The wrapper does not load weights itself: the caller passes in a model
    and tokenizer, and ``generate`` samples a completion for a prompt.
    """

    # Upper bound on newly generated tokens per call; override on a subclass
    # or an instance to change it.
    MAX_NEW_TOKENS = 100

    def __init__(self, model, tokenizer):
        """Store the model and tokenizer used for generation.

        Args:
            model: Object exposing ``generate(**inputs, ...)`` and ``to(device)``
                (presumably a Hugging Face causal LM -- confirm with callers).
            tokenizer: Callable that turns a list of prompts into tensors and
                exposes ``batch_decode``.
        """
        # NOTE(review): the base-class __init__ is intentionally not called,
        # matching the original behaviour -- confirm against the installed
        # DeepEval version if base-class attributes are needed.
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        """Return the model instance supplied at construction time."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Sample a completion for ``prompt`` and return the decoded text.

        Args:
            prompt: Text to feed to the model.

        Returns:
            The first decoded sequence from ``tokenizer.batch_decode``, called
            without extra arguments (for decoder-only models this presumably
            still contains the prompt and special tokens -- confirm).
        """
        model = self.load_model()

        # The device to load the model onto. A model that already lives on a
        # CUDA device stays where it is; otherwise prefer CUDA and fall back
        # to CPU so the call does not crash on machines without a GPU.
        current_device = getattr(model, "device", None)
        if isinstance(current_device, torch.device) and current_device.type == "cuda":
            device = current_device
        else:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Only move the model when it is not already on the target device:
        # avoids a redundant transfer on every call and avoids calling
        # ``.to()`` on models that are already placed correctly.
        if current_device != device:
            model.to(device)

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=self.MAX_NEW_TOKENS,
            do_sample=True,
        )
        return self.tokenizer.batch_decode(generated_ids)[0]

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the synchronous ``generate``.

        The call is not offloaded to another thread, so it runs to completion
        inside the awaiting coroutine.
        """
        return self.generate(prompt)

    def get_model_name(self) -> str:
        """Return the display name of this model."""
        return "Llemma Finetuned"