carsonhxsu
committed on
Commit
·
53f87ca
1
Parent(s):
1c9695e
Update code
Browse files- README.md +1 -1
- demo.py +5 -5
- lyraChatGLM/ftlib/libth_transformer_sm70_cu11.so +0 -3
- lyraChatGLM/ftlib/libth_transformer_sm70_cu12.so +0 -3
- lyraChatGLM/ftlib/libth_transformer_sm80_cu11.so +0 -3
- lyraChatGLM/ftlib/libth_transformer_sm80_cu12.so +0 -3
- lyraChatGLM/lyra_glm.py +4 -1
- lyraChatGLM/model.py +1 -0
README.md
CHANGED
|
@@ -86,7 +86,7 @@ python demo.py
|
|
| 86 |
```python
|
| 87 |
from lyraChatGLM import LyraChatGLM6B
|
| 88 |
|
| 89 |
-
model_path = "./models/1-gpu-fp16.
|
| 90 |
tokenizer_path = "./models"
|
| 91 |
data_type = "fp16"
|
| 92 |
int8_mode = 0 # 1 for INT8 WEIGHT ONLY PTQ
|
|
|
|
| 86 |
```python
|
| 87 |
from lyraChatGLM import LyraChatGLM6B
|
| 88 |
|
| 89 |
+
model_path = "./models/1-gpu-fp16.bin"
|
| 90 |
tokenizer_path = "./models"
|
| 91 |
data_type = "fp16"
|
| 92 |
int8_mode = 0 # 1 for INT8 WEIGHT ONLY PTQ
|
demo.py
CHANGED
|
@@ -3,13 +3,13 @@ import numpy as np
|
|
| 3 |
|
| 4 |
model_path = "./models/1-gpu-fp16.bin"
|
| 5 |
tokenizer_path = "./models"
|
| 6 |
-
|
| 7 |
int8_mode = 0
|
| 8 |
max_output_length = 150
|
| 9 |
-
arch = "
|
| 10 |
-
cuda_version =
|
| 11 |
|
| 12 |
-
model = LyraChatGLM6B(model_path, tokenizer_path,
|
| 13 |
|
| 14 |
prompt = "今天天气大概 25度,有点小雨,吹着风,我想去户外散步,应该穿什么样的衣服裤子鞋子搭配。"
|
| 15 |
# test_batch_size = 256
|
|
@@ -19,4 +19,4 @@ prompts = [prompt, ]
|
|
| 19 |
# # If you want to get different output in same batch, you can set do_sample to True
|
| 20 |
output_texts = model.generate(prompts, output_length=max_output_length,top_k=30, top_p=0.85, temperature=0.35, repetition_penalty=1.2, do_sample=False)
|
| 21 |
|
| 22 |
-
print(output_texts)
|
|
|
|
| 3 |
|
| 4 |
model_path = "./models/1-gpu-fp16.bin"
|
| 5 |
tokenizer_path = "./models"
|
| 6 |
+
inference_data_type = "fp16"
|
| 7 |
int8_mode = 0
|
| 8 |
max_output_length = 150
|
| 9 |
+
arch = "Volta" # Ampere or Volta
|
| 10 |
+
cuda_version = 11 # cuda version, we currently support 11 and 12
|
| 11 |
|
| 12 |
+
model = LyraChatGLM6B(model_path, tokenizer_path, inference_data_type, int8_mode, arch, cuda_version)
|
| 13 |
|
| 14 |
prompt = "今天天气大概 25度,有点小雨,吹着风,我想去户外散步,应该穿什么样的衣服裤子鞋子搭配。"
|
| 15 |
# test_batch_size = 256
|
|
|
|
| 19 |
# # If you want to get different output in same batch, you can set do_sample to True
|
| 20 |
output_texts = model.generate(prompts, output_length=max_output_length,top_k=30, top_p=0.85, temperature=0.35, repetition_penalty=1.2, do_sample=False)
|
| 21 |
|
| 22 |
+
print(output_texts)
|
lyraChatGLM/ftlib/libth_transformer_sm70_cu11.so
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a4a778897f6c5f77b0ea1cb14bb63732da9c3cc4e16ff16d9f911dcc8b6f6be5
|
| 3 |
-
size 114267536
|
|
|
|
|
|
|
|
|
|
|
|
lyraChatGLM/ftlib/libth_transformer_sm70_cu12.so
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:99ac80b2f4c161bbacbf64a7607f323c612c7c5f26b83eaec7f559425f3a818b
|
| 3 |
-
size 114186112
|
|
|
|
|
|
|
|
|
|
|
|
lyraChatGLM/ftlib/libth_transformer_sm80_cu11.so
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a1d6cd03321b671275fcabb4136562845233875564047ccde20401fca4df45c2
|
| 3 |
-
size 200834616
|
|
|
|
|
|
|
|
|
|
|
|
lyraChatGLM/ftlib/libth_transformer_sm80_cu12.so
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:2da10aad8e92bcdf45b15884cee63e845f582cd28bcc0f7f1c2a4f6a101e9646
|
| 3 |
-
size 200916960
|
|
|
|
|
|
|
|
|
|
|
|
lyraChatGLM/lyra_glm.py
CHANGED
|
@@ -134,7 +134,10 @@ class LyraChatGLM6B:
|
|
| 134 |
ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
|
| 135 |
ones_float = torch.ones(size=[batch_size], dtype=torch.float32)
|
| 136 |
|
| 137 |
-
input_token_ids = self.tokenizer(prompts, return_tensors="pt", padding=True).input_ids.int()
|
|
|
|
|
|
|
|
|
|
| 138 |
input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
|
| 139 |
mask_positions = torch.IntTensor([seq.index(130001) for seq in input_token_ids.tolist()])
|
| 140 |
|
|
|
|
| 134 |
ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
|
| 135 |
ones_float = torch.ones(size=[batch_size], dtype=torch.float32)
|
| 136 |
|
| 137 |
+
# input_token_ids = self.tokenizer(prompts, return_tensors="pt", padding=True).input_ids.int()
|
| 138 |
+
raw_input_token_ids = self.tokenizer(prompts, padding=True)
|
| 139 |
+
input_token_ids = torch.tensor (raw_input_token_ids["input_ids"],dtype=torch.int32)
|
| 140 |
+
|
| 141 |
input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
|
| 142 |
mask_positions = torch.IntTensor([seq.index(130001) for seq in input_token_ids.tolist()])
|
| 143 |
|
lyraChatGLM/model.py
CHANGED
|
@@ -123,6 +123,7 @@ class ChatGLM6BModel(nn.Module):
|
|
| 123 |
self.adapter_inter_size,
|
| 124 |
self.use_attention_linear_bias,
|
| 125 |
self.model_path,
|
|
|
|
| 126 |
inference_data_type,
|
| 127 |
self.shared_contexts_ratio)
|
| 128 |
self.build_model = True
|
|
|
|
| 123 |
self.adapter_inter_size,
|
| 124 |
self.use_attention_linear_bias,
|
| 125 |
self.model_path,
|
| 126 |
+
self.weights_data_type,
|
| 127 |
inference_data_type,
|
| 128 |
self.shared_contexts_ratio)
|
| 129 |
self.build_model = True
|