carsonhxsu
committed on
Commit
·
53f87ca
1
Parent(s):
1c9695e
Update code
Browse files- README.md +1 -1
- demo.py +5 -5
- lyraChatGLM/ftlib/libth_transformer_sm70_cu11.so +0 -3
- lyraChatGLM/ftlib/libth_transformer_sm70_cu12.so +0 -3
- lyraChatGLM/ftlib/libth_transformer_sm80_cu11.so +0 -3
- lyraChatGLM/ftlib/libth_transformer_sm80_cu12.so +0 -3
- lyraChatGLM/lyra_glm.py +4 -1
- lyraChatGLM/model.py +1 -0
README.md
CHANGED
|
@@ -86,7 +86,7 @@ python demo.py
|
|
| 86 |
```python
|
| 87 |
from lyraChatGLM import LyraChatGLM6B
|
| 88 |
|
| 89 |
-
model_path = "./models/1-gpu-fp16.
|
| 90 |
tokenizer_path = "./models"
|
| 91 |
data_type = "fp16"
|
| 92 |
int8_mode = 0 # 1 for INT8 WEIGHT ONLY PTQ
|
|
|
|
| 86 |
```python
|
| 87 |
from lyraChatGLM import LyraChatGLM6B
|
| 88 |
|
| 89 |
+
model_path = "./models/1-gpu-fp16.bin"
|
| 90 |
tokenizer_path = "./models"
|
| 91 |
data_type = "fp16"
|
| 92 |
int8_mode = 0 # 1 for INT8 WEIGHT ONLY PTQ
|
demo.py
CHANGED
|
@@ -3,13 +3,13 @@ import numpy as np
|
|
| 3 |
|
| 4 |
model_path = "./models/1-gpu-fp16.bin"
|
| 5 |
tokenizer_path = "./models"
|
| 6 |
-
|
| 7 |
int8_mode = 0
|
| 8 |
max_output_length = 150
|
| 9 |
-
arch = "
|
| 10 |
-
cuda_version =
|
| 11 |
|
| 12 |
-
model = LyraChatGLM6B(model_path, tokenizer_path,
|
| 13 |
|
| 14 |
prompt = "今天天气大概 25度,有点小雨,吹着风,我想去户外散步,应该穿什么样的衣服裤子鞋子搭配。"
|
| 15 |
# test_batch_size = 256
|
|
@@ -19,4 +19,4 @@ prompts = [prompt, ]
|
|
| 19 |
# # If you want to get different output in same batch, you can set do_sample to True
|
| 20 |
output_texts = model.generate(prompts, output_length=max_output_length,top_k=30, top_p=0.85, temperature=0.35, repetition_penalty=1.2, do_sample=False)
|
| 21 |
|
| 22 |
-
print(output_texts)
|
|
|
|
| 3 |
|
| 4 |
model_path = "./models/1-gpu-fp16.bin"
|
| 5 |
tokenizer_path = "./models"
|
| 6 |
+
inference_data_type = "fp16"
|
| 7 |
int8_mode = 0
|
| 8 |
max_output_length = 150
|
| 9 |
+
arch = "Volta" # Ampere or Volta
|
| 10 |
+
cuda_version = 11 # cuda version, we currently support 11 and 12
|
| 11 |
|
| 12 |
+
model = LyraChatGLM6B(model_path, tokenizer_path, inference_data_type, int8_mode, arch, cuda_version)
|
| 13 |
|
| 14 |
prompt = "今天天气大概 25度,有点小雨,吹着风,我想去户外散步,应该穿什么样的衣服裤子鞋子搭配。"
|
| 15 |
# test_batch_size = 256
|
|
|
|
| 19 |
# # If you want to get different output in same batch, you can set do_sample to True
|
| 20 |
output_texts = model.generate(prompts, output_length=max_output_length,top_k=30, top_p=0.85, temperature=0.35, repetition_penalty=1.2, do_sample=False)
|
| 21 |
|
| 22 |
+
print(output_texts)
|
lyraChatGLM/ftlib/libth_transformer_sm70_cu11.so
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a4a778897f6c5f77b0ea1cb14bb63732da9c3cc4e16ff16d9f911dcc8b6f6be5
|
| 3 |
-
size 114267536
|
|
|
|
|
|
|
|
|
|
|
|
lyraChatGLM/ftlib/libth_transformer_sm70_cu12.so
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:99ac80b2f4c161bbacbf64a7607f323c612c7c5f26b83eaec7f559425f3a818b
|
| 3 |
-
size 114186112
|
|
|
|
|
|
|
|
|
|
|
|
lyraChatGLM/ftlib/libth_transformer_sm80_cu11.so
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a1d6cd03321b671275fcabb4136562845233875564047ccde20401fca4df45c2
|
| 3 |
-
size 200834616
|
|
|
|
|
|
|
|
|
|
|
|
lyraChatGLM/ftlib/libth_transformer_sm80_cu12.so
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:2da10aad8e92bcdf45b15884cee63e845f582cd28bcc0f7f1c2a4f6a101e9646
|
| 3 |
-
size 200916960
|
|
|
|
|
|
|
|
|
|
|
|
lyraChatGLM/lyra_glm.py
CHANGED
|
@@ -134,7 +134,10 @@ class LyraChatGLM6B:
|
|
| 134 |
ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
|
| 135 |
ones_float = torch.ones(size=[batch_size], dtype=torch.float32)
|
| 136 |
|
| 137 |
-
input_token_ids = self.tokenizer(prompts, return_tensors="pt", padding=True).input_ids.int()
|
|
|
|
|
|
|
|
|
|
| 138 |
input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
|
| 139 |
mask_positions = torch.IntTensor([seq.index(130001) for seq in input_token_ids.tolist()])
|
| 140 |
|
|
|
|
| 134 |
ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
|
| 135 |
ones_float = torch.ones(size=[batch_size], dtype=torch.float32)
|
| 136 |
|
| 137 |
+
# input_token_ids = self.tokenizer(prompts, return_tensors="pt", padding=True).input_ids.int()
|
| 138 |
+
raw_input_token_ids = self.tokenizer(prompts, padding=True)
|
| 139 |
+
input_token_ids = torch.tensor (raw_input_token_ids["input_ids"],dtype=torch.int32)
|
| 140 |
+
|
| 141 |
input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
|
| 142 |
mask_positions = torch.IntTensor([seq.index(130001) for seq in input_token_ids.tolist()])
|
| 143 |
|
lyraChatGLM/model.py
CHANGED
|
@@ -123,6 +123,7 @@ class ChatGLM6BModel(nn.Module):
|
|
| 123 |
self.adapter_inter_size,
|
| 124 |
self.use_attention_linear_bias,
|
| 125 |
self.model_path,
|
|
|
|
| 126 |
inference_data_type,
|
| 127 |
self.shared_contexts_ratio)
|
| 128 |
self.build_model = True
|
|
|
|
| 123 |
self.adapter_inter_size,
|
| 124 |
self.use_attention_linear_bias,
|
| 125 |
self.model_path,
|
| 126 |
+
self.weights_data_type,
|
| 127 |
inference_data_type,
|
| 128 |
self.shared_contexts_ratio)
|
| 129 |
self.build_model = True
|