File size: 2,126 Bytes
d1f0c49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# handler.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Dict, List, Any

class EndpointHandler:
    """Hugging Face Inference Endpoint handler serving a Mistral-7B base model
    with a PEFT adapter layered on top.

    Loads the base model in FP16 with automatic device placement, applies the
    adapter, and exposes text generation through ``__call__``.
    """

    def __init__(self, path: str = ""):
        """
        Initialize the model and tokenizer.
        :param path: Path to the model repository (not used directly since we load from Hugging Face Hub).
        """
        # Define the base model and adapter model names
        self.base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
        self.adapter_model_name = "Danna8/MistralF"

        # Load the tokenizer from the adapter repo so any added/special tokens
        # introduced during fine-tuning are picked up.
        self.tokenizer = AutoTokenizer.from_pretrained(self.adapter_model_name)

        # Load the base model with optimizations
        self.model = AutoModelForCausalLM.from_pretrained(
            self.base_model_name,
            torch_dtype=torch.float16,  # Use FP16 for efficiency
            device_map="auto"  # Automatically map to available devices (GPU/CPU/offload)
        )

        # Load the adapter. transformers' PEFT integration activates an adapter
        # as soon as it is loaded, so no further activation call is needed.
        # NOTE(review): the previous `set_active_adapters("default")` call is an
        # API of the separate `adapters` library and does not exist on
        # transformers models — it raised AttributeError at startup.
        self.model.load_adapter(self.adapter_model_name)

        # Default sampling configuration; individual requests may override any
        # of these via data["parameters"] (standard HF endpoint payload field).
        self.default_generation_kwargs: Dict[str, Any] = {
            "max_new_tokens": 50,
            "do_sample": True,
            "top_p": 0.95,
            "temperature": 0.7,
        }

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.
        :param data: Input data containing the text to process. Expected shape:
            {"inputs": "<prompt>", "parameters": {...optional generate() overrides...}}.
        :return: List with a single dict: {"generated_text": ...} on success,
            or {"error": ...} when no input was provided.
        """
        # Extract the input text from the request
        inputs = data.get("inputs", "")
        if not inputs:
            return [{"error": "No input provided"}]

        # Merge per-request generation parameters over the defaults
        # (backward-compatible: absent "parameters" reproduces old behavior).
        generation_kwargs = dict(self.default_generation_kwargs)
        overrides = data.get("parameters")
        if isinstance(overrides, dict):
            generation_kwargs.update(overrides)

        # Tokenize and move tensors to wherever the model (or its first shard,
        # under device_map="auto") lives — hard-coding "cuda" broke CPU hosts.
        tokenized_inputs = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)

        # Generate output without tracking gradients (inference only).
        with torch.inference_mode():
            outputs = self.model.generate(
                **tokenized_inputs,
                pad_token_id=self.tokenizer.eos_token_id,  # Ensure proper padding
                **generation_kwargs,
            )

        # Decode the output (prompt + continuation, matching prior behavior)
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Return the result in the expected format
        return [{"generated_text": generated_text}]