moyanwang committed
Commit · 5f232f5
1 Parent(s): 05581a1
update demo
demo.py
CHANGED
@@ -1,47 +1,33 @@
-
+# coding=utf-8
 
 from transformers import AutoTokenizer
 from faster_chat_glm import GLM6B, FasterChatGLM
 
 
-MAX_OUT_LEN =
-BATCH_SIZE = 8
-USE_CACHE = True
-
-print("Prepare config and inputs....")
+MAX_OUT_LEN = 100
 chatglm6b_dir = './models'
 tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)
-
-input_str = ["音乐推荐应该考虑哪些因素?帮我写一篇不少于800字的方案。 ", ] * BATCH_SIZE
+input_str = ["为什么我们需要对深度学习模型加速?", ]
 inputs = tokenizer(input_str, return_tensors="pt", padding=True)
-input_ids = inputs.input_ids
-input_ids = input_ids.to('cuda:0')
-print(input_ids.shape)
+input_ids = inputs.input_ids.to('cuda:0')
 
 
-
-if USE_CACHE:
-    plan_path = f'./models/glm6b-kv-cache-dy-bs{BATCH_SIZE}.ftm'
-else:
-    plan_path = f'./models/glm6b-bs{BATCH_SIZE}.ftm'
-
+plan_path = './models/glm6b-bs8.ftm'
 # kernel for chat model.
 kernel = GLM6B(plan_path=plan_path,
-               batch_size=
+               batch_size=1,
                num_beams=1,
-               use_cache=
+               use_cache=True,
                num_heads=32,
                emb_size_per_heads=128,
                decoder_layers=28,
                vocab_size=150528,
                max_seq_len=MAX_OUT_LEN)
-
-chat = FasterChatGLM(model_dir=
+
+chat = FasterChatGLM(model_dir="./models", kernel=kernel).half().cuda()
 
 # generate
 sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)
 # de-tokenize model output to text
 res = tokenizer.decode(sample_output[0], skip_special_tokens=True)
-print(res)
-res = tokenizer.decode(sample_output[BATCH_SIZE-1], skip_special_tokens=True)
-print(res)
+print(res)