|
|
import torch |
|
|
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer |
|
|
from typing import Dict, List, Any |
|
|
|
|
|
|
|
|
class EndpointHandler:
    """Serve text generation from a 4-bit quantized causal language model.

    The model and tokenizer are loaded once in ``__init__``; every call to the
    instance generates one sampled continuation for a single prompt.
    """

    def __init__(self, path="ajayarora1235/Rap-Nemo-0"):
        """Load the quantized model and its tokenizer.

        Args:
            path: Identifier or directory handed to ``from_pretrained`` for
                both the model and the tokenizer.
        """
        # 4-bit NF4 weights, no nested (double) quantization, float16 compute.
        nf4_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=False,
            bnb_4bit_compute_dtype="float16",
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            quantization_config=nf4_config,
        )
        # NOTE(review): these two settings look like leftovers from a
        # fine-tuning script; use_cache=False in particular may slow down
        # generation. Kept unchanged - confirm they are intended for serving.
        self.model.config.use_cache = False
        self.model.config.pretraining_tp = 1

        self.tokenizer = AutoTokenizer.from_pretrained(path)
        # Reuse the unknown token for padding and pad on the right.
        self.tokenizer.pad_token = self.tokenizer.unk_token
        self.tokenizer.padding_side = "right"

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Generate a continuation for the prompt in ``data``.

        Args:
            data: Request payload. ``data["inputs"]`` is the prompt text.
                The optional ``data["kwargs"]`` mapping is forwarded to
                ``model.generate`` and overrides the defaults
                (``max_new_tokens=500``, ``do_sample=True``).

        Returns:
            A one-element list ``[{"output": text}]`` where ``text`` is the
            decoded first generated sequence.

        Raises:
            KeyError: If ``data`` has no ``"inputs"`` entry.
        """
        input_text = data["inputs"]
        # ``or {}`` also covers a request that sends ``"kwargs": None``.
        request_kwargs = data.get("kwargs") or {}

        # Merge defaults and overrides into one mapping. Passing the defaults
        # as explicit keywords next to **kwargs raises TypeError as soon as a
        # request overrides ``max_new_tokens`` or ``do_sample``.
        generation_kwargs = {
            "max_new_tokens": 500,
            "do_sample": True,
            **request_kwargs,
        }

        input_tokens = self.tokenizer.encode(input_text, return_tensors="pt")
        # The tokenizer returns CPU tensors; generation needs the prompt on
        # the same device as the model weights.
        input_tokens = input_tokens.to(self.model.device)

        # Inference only: skip gradient tracking.
        with torch.no_grad():
            output_tokens = self.model.generate(input_tokens, **generation_kwargs)

        output_text = self.tokenizer.decode(output_tokens[0])
        return [{"output": output_text}]
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: build the handler, send one sample prompt through it
    # and print the raw response.
    endpoint = EndpointHandler()
    sample_request = {
        "inputs": "Write a verse in the style of Lupe Fiasco about falling in love with Chipotle."
    }
    print(endpoint(sample_request))