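"""Custom Inference Endpoints handler for a 4-bit quantized PEFT (LoRA) causal LM.

Loads the base model named in the adapter's PEFT config with bitsandbytes NF4
quantization, attaches the fine-tuned adapter, and answers tender-related
questions from a fixed instruction prompt.
"""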
import time
from typing import Any, Dict, List

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

class EndpointHandler:
    def __init__(self, path="5iveDesignStudio/autotrain-TenderGPT-Festive-v2-0"):
        # Resolve the base model from the adapter's PEFT config.
        config = PeftConfig.from_pretrained(path)
        # 4-bit NF4 quantization with double quantization, computing in fp16.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            config.base_model_name_or_path,
            return_dict=True,
            device_map={"": 0},
            trust_remote_code=True,
            quantization_config=bnb_config,  # load_in_4bit is already implied by the config
        )
        self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        # Attach the fine-tuned LoRA adapter on top of the quantized base model.
        self.model = PeftModel.from_pretrained(self.model, path)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        """
        Args:
            data (:obj:`dict`): The request payload, e.g. {"inputs": "some question", "parameters": {...}} containing:
                - "inputs": the question to answer
                - "parameters": optional generation overrides (e.g. max_new_tokens)
        Return:
            A :obj:`list` with one dict like {"generated_text": "...", "time": "..."} containing:
                - "generated_text": the answer to the question
                - "time": how long the prediction took, in seconds
        """
        # process input
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)
        prompt = f"""Below is an instruction that describes a task. If a question does not make any sense, or is not factually coherent, explain why instead of answering something that is not correct. If you don't know the answer to a question, please don't share false information.
>>TITLE<<: Tender Response.
>>CONTEXT<<: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe, in a conversational tone. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
>>QUESTION<<: {inputs}
>>ANSWER<<:
""".strip()
        # preprocess
        batch = self.tokenizer(
            prompt,
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)
        # configure generation defaults, then apply any caller-supplied kwargs on top
        generation_config = self.model.generation_config
        generation_config.do_sample = True  # required for top_p/temperature to take effect
        generation_config.top_p = 0.75
        generation_config.temperature = 0.7
        generation_config.max_new_tokens = 140
        generation_config.num_return_sequences = 1
        generation_config.pad_token_id = self.tokenizer.eos_token_id
        generation_config.eos_token_id = self.tokenizer.eos_token_id
        if parameters is not None:
            for key, value in parameters.items():
                setattr(generation_config, key, value)
        start = time.time()
        with torch.cuda.amp.autocast():
            output_tokens = self.model.generate(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                generation_config=generation_config,
            )
        end = time.time()
        generated_text = self.tokenizer.decode(output_tokens[0])
        # postprocess: keep only the text generated after the answer marker
        answer = generated_text.split('>>END<<')[0].split('>>ANSWER<<:')[1].strip()
        if "CONTEXT:" in answer:
            # the model echoed the prompt scaffolding; salvage the response if present
            if "RESPONSE:" in answer:
                answerclean = answer.partition("RESPONSE:")[2]
            else:
                answerclean = "I'm sorry, but I'm not able to help with your tender topic."
        else:
            answerclean = answer
        if "<|endoftext|>" in answerclean:
            answerclean = answerclean.replace('<|endoftext|>', '')
        else:
            # drop any trailing partial sentence by cutting at the last full stop
            last_full_stop = answerclean.rfind('.')
            if last_full_stop != -1:
                answerclean = answerclean[:last_full_stop + 1]
        prediction = {'generated_text': answerclean, 'time': f"{(end - start):.2f} s"}
        return [prediction]
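
# A minimal local smoke test (an illustrative sketch, not part of the endpoint
# contract; assumes a CUDA GPU and access to the adapter repo): instantiate the
# handler and call it the way Inference Endpoints would, with an {"inputs": ...}
# payload. The question below is a hypothetical example.
if __name__ == "__main__":
    handler = EndpointHandler()
    payload = {
        "inputs": "What should a tender response include?",
        "parameters": {"max_new_tokens": 140},
    }
    result = handler(payload)
    print(result[0]["generated_text"])
    print(f"took {result[0]['time']}")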