import time
from typing import Any, Dict, List

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


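# Custom handler for a Hugging Face Inference Endpoint: the quantized base
# model and its PEFT/LoRA adapter are loaded once at startup, and __call__
# then serves individual generation requests.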
class EndpointHandler:
    def __init__(self, path="5iveDesignStudio/autotrain-TenderGPT-Festive-v2-0"):
        # The adapter config records which base model the adapter was trained on.
        config = PeftConfig.from_pretrained(path)

        # 4-bit NF4 quantization with nested (double) quantization; matmuls are
        # computed in float16.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        # Quantization is driven entirely by quantization_config; passing
        # load_in_4bit=True here as well would be redundant (and recent
        # transformers versions reject the duplicated flag).
        self.model = AutoModelForCausalLM.from_pretrained(
            config.base_model_name_or_path,
            return_dict=True,
            device_map={"": 0},
            trust_remote_code=True,
            quantization_config=bnb_config,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
        # The tokenizer has no dedicated pad token, so reuse EOS for padding.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Wrap the quantized base model with the fine-tuned adapter weights.
        self.model = PeftModel.from_pretrained(self.model, path)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        """
        Args:
            data (:obj:`dict`): the request payload. The value of
                ``data["inputs"]`` (or ``data`` itself when no ``inputs`` key
                is present) is interpolated into the prompt as the question.

        Return:
            A :obj:`list` containing a single :obj:`dict` like
            ``{"generated_text": "...", "time": "1.23 s"}``:

            - "generated_text": the answer generated for the question
            - "time": the wall-clock time taken by generation
        """

        inputs = data.pop("inputs", data)
        # "parameters" is accepted for API compatibility but is not currently
        # applied to generation.
        parameters = data.pop("parameters", None)

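        # The ">>TITLE<<"/">>CONTEXT<<"/">>QUESTION<<"/">>ANSWER<<" markers
        # presumably mirror the prompt format used during fine-tuning (they
        # resemble the special tokens found in Falcon-style tokenizers).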
        prompt = f"""Below is an instruction that describes a task. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.

>>TITLE<<: Tender Response.

>>CONTEXT<<: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while staying safe and keeping a conversational tone. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

>>QUESTION<<: {inputs}

>>ANSWER<<:
""".strip()

        batch = self.tokenizer(
            prompt,
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)

        # Sampling settings; pad falls back to EOS because the tokenizer has no
        # dedicated pad token.
        generation_config = self.model.generation_config
        generation_config.top_p = 0.75
        generation_config.temperature = 0.7
        generation_config.max_new_tokens = 140
        generation_config.num_return_sequences = 1
        generation_config.pad_token_id = self.tokenizer.eos_token_id
        generation_config.eos_token_id = self.tokenizer.eos_token_id

        start = time.time()
        with torch.cuda.amp.autocast():
            output_tokens = self.model.generate(
                input_ids=batch.input_ids,
                generation_config=generation_config,
            )
        end = time.time()

        generated_text = self.tokenizer.decode(output_tokens[0])

        # The output echoes the prompt, so keep only the text between the
        # ">>ANSWER<<:" marker and an optional ">>END<<" marker.
        answer = generated_text.split(">>END<<")[0].split(">>ANSWER<<:")[1].strip()

if "CONTEXT:" in answer: |
|
|
if "RESPONSE:" in answer: |
|
|
answerclean = answer.partition("RESPONSE:")[2] |
|
|
else: |
|
|
answerclean = "I'm sorry, but I'm not able to help with your tender topic." |
|
|
else: |
|
|
answerclean = answer |
|
|
|
|
|
if "<|endoftext|>:" in answerclean: |
|
|
answerclean = answerclean.replace('<|endoftext|>', '') |
|
|
else: |
|
|
first_full_stop = answerclean.index('.') |
|
|
last_full_stop = answerclean.rindex('.') |
|
|
answerclean = answerclean[0:last_full_stop+1] |
|
|
|
|
|
        prediction = {"generated_text": answerclean, "time": f"{(end - start):.2f} s"}

        return [prediction]
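

# Minimal local smoke test: a sketch, assuming a CUDA-capable machine and
# access to the model repo. The sample question is purely illustrative.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({"inputs": "What accreditations should a tender response mention?"})
    print(result[0]["generated_text"])
    print(result[0]["time"])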