moyanwang committed
Commit · 5f232f5
1 Parent(s): 05581a1
update demo
demo.py
CHANGED
@@ -1,47 +1,33 @@
-
+# coding=utf-8
 
 from transformers import AutoTokenizer
 from faster_chat_glm import GLM6B, FasterChatGLM
 
 
-MAX_OUT_LEN =
-BATCH_SIZE = 8
-USE_CACHE = True
-
-print("Prepare config and inputs....")
+MAX_OUT_LEN = 100
 chatglm6b_dir = './models'
 tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)
-
-input_str = ["音乐推荐应该考虑哪些因素?帮我写一篇不少于800字的方案。 ", ] * BATCH_SIZE
+input_str = ["为什么我们需要对深度学习模型加速?", ]
 inputs = tokenizer(input_str, return_tensors="pt", padding=True)
-input_ids = inputs.input_ids
-input_ids = input_ids.to('cuda:0')
-print(input_ids.shape)
+input_ids = inputs.input_ids.to('cuda:0')
 
 
-
-if USE_CACHE:
-    plan_path = f'./models/glm6b-kv-cache-dy-bs{BATCH_SIZE}.ftm'
-else:
-    plan_path = f'./models/glm6b-bs{BATCH_SIZE}.ftm'
-
+plan_path = './models/glm6b-bs8.ftm'
 # kernel for chat model.
 kernel = GLM6B(plan_path=plan_path,
-               batch_size=
+               batch_size=1,
                num_beams=1,
-               use_cache=
+               use_cache=True,
                num_heads=32,
                emb_size_per_heads=128,
                decoder_layers=28,
                vocab_size=150528,
                max_seq_len=MAX_OUT_LEN)
-
-chat = FasterChatGLM(model_dir=
+
+chat = FasterChatGLM(model_dir="./models", kernel=kernel).half().cuda()
 
 # generate
 sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)
 # de-tokenize model output to text
 res = tokenizer.decode(sample_output[0], skip_special_tokens=True)
-print(res)
-res = tokenizer.decode(sample_output[BATCH_SIZE-1], skip_special_tokens=True)
-print(res)
+print(res)