from typing import Dict, List, Any
# Not all of these imports are used directly below; importing them up front
# verifies that every dependency of the model's remote code is installed.
import torch, PIL, transformers, triton, sentencepiece
import google.protobuf  # the protobuf package is imported as `google.protobuf`, not `protobuf`
import torchvision, einops
import xformers, accelerate
from transformers import AutoModelForCausalLM, LlamaTokenizer
class EndpointHandler:
    def __init__(self, path=""):
        self.model = AutoModelForCausalLM.from_pretrained(
            'THUDM/cogvlm-chat-hf',
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            # cache_dir='/tmp'
        )
        # CogVLM ships without its own tokenizer; the model card pairs it with Vicuna's.
        self.tokenizer = LlamaTokenizer.from_pretrained('lmsys/vicuna-7b-v1.5')
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Args:
            data (:obj:`dict`):
                The request payload. ``data["inputs"]`` must be a dict of
                model-ready tensors (``input_ids``, ``token_type_ids``,
                ``attention_mask``, ``images``), e.g. as produced by the
                model's ``build_conversation_input_ids`` helper (see the
                sketch at the end of this file).
        Return:
            A :obj:`list` containing one dict, e.g.
            ``[{"generated_text": "..."}]``, where ``generated_text`` is the
            decoded model output for the prompt.
        """
        inputs = data.pop("inputs", data)
        gen_kwargs = {"max_length": 2048, "do_sample": False}
        outputs = self.model.generate(**inputs, **gen_kwargs)
        # Keep only the newly generated tokens; drop the echoed prompt.
        outputs = outputs[:, inputs['input_ids'].shape[1]:]
        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Wrap in a list of dicts to match the declared return type.
        return [{"generated_text": prediction}]
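
# ---------------------------------------------------------------------------
# Usage sketch, not part of the deployed handler: a minimal local smoke test,
# assuming the `build_conversation_input_ids` helper that the
# THUDM/cogvlm-chat-hf model card documents for its remote code. The image
# path and query below are placeholders.
if __name__ == "__main__":
    from PIL import Image

    handler = EndpointHandler()
    image = Image.open("example.jpg").convert("RGB")  # placeholder path
    features = handler.model.build_conversation_input_ids(
        handler.tokenizer,
        query="Describe this image.",
        history=[],
        images=[image],
    )
    # Add a batch dimension the way the model card's example does.
    inputs = {
        "input_ids": features["input_ids"].unsqueeze(0),
        "token_type_ids": features["token_type_ids"].unsqueeze(0),
        "attention_mask": features["attention_mask"].unsqueeze(0),
        "images": [[features["images"][0].to(torch.bfloat16)]],
    }
    print(handler({"inputs": inputs}))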