# handler.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Dict, List, Any


class EndpointHandler:
    def __init__(self, path: str = ""):
        """
        Initialize the model and tokenizer.

        :param path: Path to the model repository (not used directly since we load
                     from the Hugging Face Hub).
        """
        # Define the base model and adapter model names
        self.base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
        self.adapter_model_name = "Danna8/MistralF"

        # Load the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.adapter_model_name)

        # Load the base model with optimizations
        self.model = AutoModelForCausalLM.from_pretrained(
            self.base_model_name,
            torch_dtype=torch.float16,  # Use FP16 for efficiency
            device_map="auto"           # Automatically map to available GPU(s)
        )

        # Load the PEFT adapter on top of the base model and activate it;
        # transformers' PEFT integration registers it under the name "default"
        # unless another adapter name is specified.
        self.model.load_adapter(self.adapter_model_name)
        self.model.set_adapter("default")  # Adjust the adapter name if needed

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.

        :param data: Input data containing the text to process.
        :return: List of generated outputs.
        """
        # Extract the input text from the request
        inputs = data.get("inputs", "")
        if not inputs:
            return [{"error": "No input provided"}]

        # Tokenize the input and move it to the model's device
        tokenized_inputs = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)

        # Generate output
        outputs = self.model.generate(
            **tokenized_inputs,
            max_new_tokens=50,
            do_sample=True,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=self.tokenizer.eos_token_id  # Ensure proper padding
        )

        # Decode the output
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Return the result in the expected format
        return [{"generated_text": generated_text}]
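

# A minimal local smoke test, sketched under the assumption that the code runs on a
# machine with a CUDA-capable GPU and that requests use the standard Inference
# Endpoints payload shape ({"inputs": "..."}). The deployed endpoint instantiates
# EndpointHandler itself, so this block is only for local verification; the example
# prompt below is illustrative, not part of the handler contract.
if __name__ == "__main__":
    handler = EndpointHandler()
    example_request = {"inputs": "Explain what a LoRA adapter is in one sentence."}
    result = handler(example_request)
    print(result)  # e.g. [{"generated_text": "..."}]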