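"""Custom inference handler for ajayarora1235/Rap-Nemo-0.

A sketch following the Hugging Face Inference Endpoints custom-handler
convention: an EndpointHandler class whose __call__ receives a payload of
the form {"inputs": ...}. The model is loaded in 4-bit NF4 precision via
bitsandbytes to keep GPU memory usage low.
"""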
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
from typing import Dict, List, Any


class EndpointHandler:
    def __init__(self, path="ajayarora1235/Rap-Nemo-0"):
        # Activate 4-bit precision base model loading
        use_4bit = True
        # Compute dtype for 4-bit base models
        bnb_4bit_compute_dtype = "float16"
        # Quantization type (fp4 or nf4)
        bnb_4bit_quant_type = "nf4"
        # Activate nested quantization for 4-bit base models (double quantization)
        use_nested_quant = False

        nf4_config = BitsAndBytesConfig(
            load_in_4bit=use_4bit,
            bnb_4bit_quant_type=bnb_4bit_quant_type,
            bnb_4bit_use_double_quant=use_nested_quant,
            bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            quantization_config=nf4_config,
            device_map="auto",  # place the quantized weights on the available GPU(s)
        )
        # use_cache=False is a training-time setting; re-enable the KV cache
        # so autoregressive generation is not needlessly slow
        self.model.config.use_cache = True
        self.model.config.pretraining_tp = 1

        # The tokenizer has no pad token of its own, so reuse the unk token
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.tokenizer.pad_token = self.tokenizer.unk_token
        self.tokenizer.padding_side = "right"

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        input_text = data["inputs"]
        kwargs = data.get("kwargs", {})

        # Tokenize the input text and move it to the model's device
        input_tokens = self.tokenizer.encode(input_text, return_tensors="pt")
        input_tokens = input_tokens.to(self.model.device)

        # Generate output tokens; extra generation arguments arrive via "kwargs"
        with torch.no_grad():
            output_tokens = self.model.generate(input_tokens, max_new_tokens=500, do_sample=True, **kwargs)

        # Decode output tokens, dropping special markers such as BOS/EOS
        output_text = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        return [{"output": output_text}]


# Example usage
if __name__ == "__main__":
    handler = EndpointHandler()
    input_data = {"inputs": "Write a verse in the style of Lupe Fiasco about falling in love with Chipotle."}
    output_data = handler(input_data)
    print(output_data)
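
    # The optional "kwargs" field is forwarded to model.generate(), so sampling
    # parameters can be tuned per request. A minimal sketch with illustrative
    # values; any keyword accepted by transformers' generate() should work here,
    # except those already fixed above (max_new_tokens, do_sample).
    sampled_output = handler({
        "inputs": "Write a verse about late-night tacos.",
        "kwargs": {"temperature": 0.9, "top_p": 0.95},
    })
    print(sampled_output)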