# MistralF / handler.py — custom inference endpoint handler
# Source: Hugging Face repo Danna8/MistralF ("Create handler.py", commit d1f0c49)
# handler.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Dict, List, Any
class EndpointHandler:
    """Hugging Face Inference Endpoints handler serving the Danna8/MistralF
    LoRA adapter on top of mistralai/Mistral-7B-Instruct-v0.3."""

    # Default generation settings. Per-request overrides are accepted via
    # data["parameters"] (the standard Inference Endpoints request contract).
    DEFAULT_GENERATION_KWARGS: Dict[str, Any] = {
        "max_new_tokens": 50,
        "do_sample": True,
        "top_p": 0.95,
        "temperature": 0.7,
    }

    def __init__(self, path: str = ""):
        """
        Load the tokenizer, the FP16 base model, and the adapter.

        :param path: Path to the model repository (unused — weights are pulled
            from the Hugging Face Hub by repo name).
        """
        self.base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
        self.adapter_model_name = "Danna8/MistralF"
        # Tokenizer comes from the adapter repo so any tokens added during
        # fine-tuning stay in sync with the adapter weights.
        self.tokenizer = AutoTokenizer.from_pretrained(self.adapter_model_name)
        # FP16 + device_map="auto" lets accelerate place the weights on the
        # available GPU(s), falling back to CPU when none is present.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        # load_adapter() attaches the PEFT adapter (named "default" when no
        # name is given); set_adapter() makes the activation explicit.
        # NOTE: the previous set_active_adapters() call is an
        # adapter-transformers API and does not exist on plain transformers
        # models — it would raise AttributeError at startup.
        self.model.load_adapter(self.adapter_model_name)
        self.model.set_adapter("default")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle one inference request.

        :param data: {"inputs": "<prompt>", "parameters": {... optional
            generate() overrides such as max_new_tokens ...}}
        :return: [{"generated_text": ...}] on success,
            [{"error": ...}] when no input was provided.
        """
        inputs = data.get("inputs", "")
        if not inputs:
            return [{"error": "No input provided"}]

        # Merge per-request overrides over the defaults (request wins).
        gen_kwargs = {**self.DEFAULT_GENERATION_KWARGS, **data.get("parameters", {})}

        # Move input tensors to wherever accelerate placed the model instead
        # of hard-coding "cuda", which crashes on CPU-only hosts.
        tokenized_inputs = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)

        # No autograd bookkeeping is needed for generation.
        with torch.inference_mode():
            outputs = self.model.generate(
                **tokenized_inputs,
                pad_token_id=self.tokenizer.eos_token_id,  # Mistral defines no pad token
                **gen_kwargs,
            )

        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return [{"generated_text": generated_text}]