TMElyralab
/

lyraChatGLM

Model card Files Files and versions

bigmoyan committed on May 12, 2023

Commit

f8cc4df

·

1 Parent(s): 78ab63e

Update README.md

Files changed (1) hide show

README.md +7 -9

README.md CHANGED Viewed

@@ -41,19 +41,16 @@ from transformers import AutoTokenizer
 from faster_chat_glm import GLM6B, FasterChatGLM
-tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)
-BATCH_SIZE = 8
-MAX_OUT_LEN = 50
-# prepare input
-input_str = ["为什么我们需要对深度学习模型加速? ", ] *
 inputs = tokenizer(input_str, return_tensors="pt", padding=True)
 input_ids = inputs.input_ids.to('cuda:0')
 # kernel for chat model.
-kernel = GLM6B(plan_path="./models/glm6b-bs{BATCH_SIZE}.ftm",
                batch_size=1,
                num_beams=1,
                use_cache=True,
@@ -62,7 +59,8 @@ kernel = GLM6B(plan_path="./models/glm6b-bs{BATCH_SIZE}.ftm",
                decoder_layers=28,
                vocab_size=150528,
                max_seq_len=MAX_OUT_LEN)
-chat = FasterChatGLM(model_dir=chatglm6b_dir, kernel=kernel).half().cuda()
 # generate
 sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)

 from faster_chat_glm import GLM6B, FasterChatGLM
+MAX_OUT_LEN = 100
+tokenizer = AutoTokenizer.from_pretrained('./models', trust_remote_code=True)
+input_str = ["为什么我们需要对深度学习模型加速？", ]
 inputs = tokenizer(input_str, return_tensors="pt", padding=True)
 input_ids = inputs.input_ids.to('cuda:0')
+plan_path = './models/glm6b-bs8.ftm'
 # kernel for chat model.
+kernel = GLM6B(plan_path=plan_path,
                batch_size=1,
                num_beams=1,
                use_cache=True,
                decoder_layers=28,
                vocab_size=150528,
                max_seq_len=MAX_OUT_LEN)
+chat = FasterChatGLM(model_dir="./models", kernel=kernel).half().cuda()
 # generate
 sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)