# coding=utf-8
import os

from transformers import AutoTokenizer
from lyraChatGLM import GLM6B, FasterChatGLM

current_workdir = os.path.dirname(__file__)
MAX_OUT_LEN = 100
chatglm6b_dir = os.path.join(current_workdir, "models")
tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)

# Prompt: "Why do we need to accelerate deep learning models?"
input_str = ["为什么我们需要对深度学习模型加速?"]
inputs = tokenizer(input_str, return_tensors="pt", padding=True)
input_ids = inputs.input_ids.to('cuda:0')

# Path to the pre-built kernel plan for the accelerated model.
plan_path = os.path.join(current_workdir, "models/glm6b-bs8.ftm")
# Optimized inference kernel for the chat model; the constants below
# (32 heads x 128 dims per head, 28 decoder layers, 150528-token vocab)
# match the ChatGLM-6B architecture.
kernel = GLM6B(plan_path=plan_path,
               batch_size=1,
               num_beams=1,
               use_cache=True,
               num_heads=32,
               emb_size_per_heads=128,
               decoder_layers=28,
               vocab_size=150528,
               max_seq_len=MAX_OUT_LEN)

chat = FasterChatGLM(model_dir=chatglm6b_dir, kernel=kernel).half().cuda()

# Generate a response with the accelerated model.
sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)

# De-tokenize the model output back to text.
res = tokenizer.decode(sample_output[0], skip_special_tokens=True)
print(res)
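
# A minimal sketch for batched prompts: input_str may hold several strings,
# in which case all generated sequences can be decoded at once with the
# standard transformers batch_decode helper. This assumes the kernel's
# batch_size (and the plan file) were built for the chosen batch size.
all_res = tokenizer.batch_decode(sample_output, skip_special_tokens=True)
print("\n".join(all_res))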