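"""Custom inference handler for ajayarora1235/Rap-Nemo-0.

A sketch following the Hugging Face Inference Endpoints custom-handler
convention: an EndpointHandler class whose __call__ receives a payload of
the form {"inputs": ...}. The model is loaded in 4-bit NF4 precision via
bitsandbytes to keep GPU memory usage low.
"""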
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
from typing import Dict, List, Any


class EndpointHandler:
    def __init__(self, path="ajayarora1235/Rap-Nemo-0"):
        # Activate 4-bit precision base model loading
        use_4bit = True
        # Compute dtype for 4-bit base models
        bnb_4bit_compute_dtype = "float16"
        # Quantization type (fp4 or nf4)
        bnb_4bit_quant_type = "nf4"
        # Activate nested quantization for 4-bit base models (double quantization)
        use_nested_quant = False

        nf4_config = BitsAndBytesConfig(
            load_in_4bit=use_4bit,
            bnb_4bit_quant_type=bnb_4bit_quant_type,
            bnb_4bit_use_double_quant=use_nested_quant,
            bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            quantization_config=nf4_config,
            device_map="auto",  # place the quantized weights on the available GPU(s)
        )
        # use_cache=False is a training-time setting; re-enable the KV cache
        # so autoregressive generation is not needlessly slow
        self.model.config.use_cache = True
        self.model.config.pretraining_tp = 1

        # The tokenizer has no pad token of its own, so reuse the unk token
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.tokenizer.pad_token = self.tokenizer.unk_token
        self.tokenizer.padding_side = "right"

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        input_text = data["inputs"]
        kwargs = data.get("kwargs", {})

        # Tokenize the input text and move it to the model's device
        input_tokens = self.tokenizer.encode(input_text, return_tensors="pt")
        input_tokens = input_tokens.to(self.model.device)

        # Generate output tokens; extra generation arguments arrive via "kwargs"
        with torch.no_grad():
            output_tokens = self.model.generate(input_tokens, max_new_tokens=500, do_sample=True, **kwargs)

        # Decode output tokens, dropping special markers such as BOS/EOS
        output_text = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        return [{"output": output_text}]


# Example usage
if __name__ == "__main__":
    handler = EndpointHandler()
    input_data = {"inputs": "Write a verse in the style of Lupe Fiasco about falling in love with Chipotle."}
    output_data = handler(input_data)
    print(output_data)
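
    # The optional "kwargs" field is forwarded to model.generate(), so sampling
    # parameters can be tuned per request. A minimal sketch with illustrative
    # values; any keyword accepted by transformers' generate() should work here,
    # except those already fixed above (max_new_tokens, do_sample).
    sampled_output = handler({
        "inputs": "Write a verse about late-night tacos.",
        "kwargs": {"temperature": 0.9, "top_p": 0.95},
    })
    print(sampled_output)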