from typing import Any, Dict, List

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


class EndpointHandler:
    def __init__(self, path: str = ""):
        """
        Initialize the model and tokenizer.

        :param path: Path to the model repository (unused here; the base
            model and adapter are loaded from the Hugging Face Hub by name).
        """
        # Base Mistral checkpoint and the fine-tuned adapter applied on top of it.
        self.base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
        self.adapter_model_name = "Danna8/MistralF"

        # The tokenizer is loaded from the adapter repository rather than the base model.
        self.tokenizer = AutoTokenizer.from_pretrained(self.adapter_model_name)

        # Load the base model in half precision; device_map="auto" lets
        # Accelerate place the weights on the available device(s).
        self.model = AutoModelForCausalLM.from_pretrained(
            self.base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )

        # Attach the fine-tuned PEFT adapter (requires the `peft` package)
        # and activate it. The transformers PEFT integration registers an
        # unnamed adapter as "default".
        self.model.load_adapter(self.adapter_model_name)
        self.model.set_adapter("default")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.

        :param data: Input data containing the text to process.
        :return: List of generated outputs.
        """
        # Inference Endpoints sends the request payload under the "inputs" key.
        inputs = data.get("inputs", "")
        if not inputs:
            return [{"error": "No input provided"}]

        # Move the tokenized inputs to whichever device the model was
        # dispatched to (hard-coding "cuda" fails on a CPU-only host).
        tokenized_inputs = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)

        # Sample a completion: nucleus sampling (top_p) with a mild
        # temperature, capped at 50 new tokens.
        outputs = self.model.generate(
            **tokenized_inputs,
            max_new_tokens=50,
            do_sample=True,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Note: generate() returns the prompt followed by the completion,
        # so the decoded text includes the original input.
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return [{"generated_text": generated_text}]