from typing import Dict, List, Any
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch
import time

class EndpointHandler:
    def __init__(self, path="5iveDesignStudio/autotrain-TenderGPT-Festive-v2-0"):
        # resolve the base model name from the PEFT adapter config
        config = PeftConfig.from_pretrained(path)

        # 4-bit NF4 quantization keeps the base model small enough for a single GPU
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        
        self.model = AutoModelForCausalLM.from_pretrained(
            config.base_model_name_or_path,
            return_dict=True,
            device_map={"": 0},
            trust_remote_code=True,
            quantization_config=bnb_config,
        )
        
        self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # wrap the quantized base model with the fine-tuned LoRA adapter
        self.model = PeftModel.from_pretrained(self.model, path)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        """
        Args:
            data (:obj:`dict`): the request payload, e.g. {"inputs": "some question", "parameters": {...}} containing:
                - "inputs": the question to answer
                - "parameters": optional generation overrides (e.g. max_new_tokens)
        Return:
            A :obj:`list` with a single dict like {"generated_text": "...", "time": "..."} containing:
                - "generated_text": the answer to the question
                - "time": how long the prediction took
        """

        # process input
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)

        prompt = f"""Below is an instruction that describes a task. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
>>TITLE<<: Tender Response.
>>CONTEXT<<: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe in a conversational tone. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
>>QUESTION<<: {inputs}
>>ANSWER<<:
""".strip()

        # preprocess
        batch = self.tokenizer(
            prompt,
            padding=True,
            truncation=True,
            return_tensors="pt"
        ).to(self.device)

        # configure generation; values supplied in "parameters" override these defaults
        generation_config = self.model.generation_config
        generation_config.top_p = 0.75
        generation_config.temperature = 0.7
        generation_config.max_new_tokens = 140
        generation_config.num_return_sequences = 1
        generation_config.pad_token_id = self.tokenizer.eos_token_id
        generation_config.eos_token_id = self.tokenizer.eos_token_id
        if parameters is not None:
            for key, value in parameters.items():
                setattr(generation_config, key, value)

        start = time.time()
        with torch.cuda.amp.autocast():
            output_tokens = self.model.generate(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                generation_config=generation_config,
            )
        end = time.time()

        # decode without skipping special tokens so the marker-based cleanup below still works
        generated_text = self.tokenizer.decode(output_tokens[0])

        # the decoded text echoes the prompt, so ">>ANSWER<<:" is always present
        answer = generated_text.split('>>END<<')[0].split('>>ANSWER<<:')[1].strip()

        if "CONTEXT:" in answer:
            if "RESPONSE:" in answer:
                answerclean = answer.partition("RESPONSE:")[2]
            else:
                answerclean = "I'm sorry, but I'm not able to help with your tender topic."
        else:
            answerclean = answer

        if "<|endoftext|>:" in answerclean:
            answerclean = answerclean.replace('<|endoftext|>', '')
        else:
            first_full_stop = answerclean.index('.')
            last_full_stop = answerclean.rindex('.')
            answerclean = answerclean[0:last_full_stop+1]

        prediction = {'generated_text': answerclean, 'time': f"{(end - start):.2f} s"}

        return [prediction]
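
# A minimal local smoke test (a sketch, not part of the Inference Endpoints
# contract): the handler is normally instantiated by the endpoint runtime, but
# it can be exercised directly like this. The payload shape assumed here
# ("inputs" plus optional "parameters") follows the docstring above, and the
# example question is illustrative only.
if __name__ == "__main__":
    handler = EndpointHandler()
    payload = {
        "inputs": "What information should a tender response include?",
        "parameters": {"max_new_tokens": 120},
    }
    print(handler(payload))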