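"""Custom inference handler for the TenderGPT endpoint.

Loads the base causal LM in 4-bit NF4 precision via bitsandbytes, attaches the
fine-tuned PEFT/LoRA adapter from this repository, and answers questions using
a fixed instruction prompt. Returns the answer together with the generation
time.
"""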
import time
from typing import Any, Dict

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


class EndpointHandler:
    def __init__(self, path="5iveDesignStudio/autotrain-TenderGPT-Festive-v2-0"):
        # Read the adapter config to find the base model, then load the base
        # model in 4-bit NF4 with double quantization to fit on a single GPU.
        config = PeftConfig.from_pretrained(path)
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            config.base_model_name_or_path,
            return_dict=True,
            device_map={"": 0},
            trust_remote_code=True,
            # 4-bit loading is configured here; passing load_in_4bit again is redundant.
            quantization_config=bnb_config,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        # Attach the fine-tuned LoRA adapter on top of the quantized base model.
        self.model = PeftModel.from_pretrained(self.model, path)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
        """
        Args:
            data (:obj:`dict`): The request payload, e.g. {"inputs": "some question"}, containing:
                - "inputs": the question to answer
                - "parameters" (optional): generation overrides such as max_new_tokens
        Return:
            A :obj:`dict` like {"answer": "some text", "time": "..."} containing:
                - "answer": the answer to the question
                - "time": how long generation took, in seconds
        """
        # Process the input: accept either {"inputs": ...} or the raw payload itself.
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)
        prompt = f"""Below is an instruction that describes a task. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
>>TITLE<<: Tender Response.
>>CONTEXT<<: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe in a conversational tone. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
>>QUESTION<<: {inputs}
>>ANSWER<<:
""".strip()
        # preprocess: tokenize the prompt and move it to the model's device
        batch = self.tokenizer(
            prompt,
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)
        # Set up generation defaults; any per-request "parameters" from the
        # payload are applied as ad hoc overrides in the generate() call below.
        generation_config = self.model.generation_config
        generation_config.top_p = 0.75
        generation_config.temperature = 0.7
        generation_config.max_new_tokens = 140
        generation_config.num_return_sequences = 1
        generation_config.pad_token_id = self.tokenizer.eos_token_id
        generation_config.eos_token_id = self.tokenizer.eos_token_id
        start = time.time()
        with torch.cuda.amp.autocast():
            output_tokens = self.model.generate(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                generation_config=generation_config,
                **(parameters or {}),
            )
        end = time.time()
        generated_text = self.tokenizer.decode(
            output_tokens[0]
        )
        # Keep only the text between the >>ANSWER<<: marker and >>END<< (if the model emits one).
        answer = generated_text.split(">>END<<")[0].split(">>ANSWER<<:")[1].strip()
        prediction = {"answer": answer, "time": f"{(end - start):.2f} s"}
        return prediction
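

# --- Usage sketch (illustrative only) --------------------------------------
# A minimal local smoke test, assuming a CUDA GPU with enough memory for the
# 4-bit base model plus the adapter. On a deployed Inference Endpoint the
# serving toolkit instantiates EndpointHandler and calls it with the request
# payload, so this block never runs in production; the question below is a
# made-up example.
if __name__ == "__main__":
    handler = EndpointHandler()
    payload = {
        "inputs": "What is your approach to quality assurance?",
        "parameters": {"max_new_tokens": 200},  # optional generation overrides
    }
    print(handler(payload))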