flpelerin committed on
Commit
44afb45
·
1 Parent(s): d5344c9

Update 2 files

Browse files

- /trainer.cli.py
- /export.py

Files changed (2) hide show
  1. export.py +250 -0
  2. trainer.cli.py +3 -0
export.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def SerializeFP32(file, tensor):
    """Write every element of `tensor` to `file` as raw float32 values.

    The tensor is detached, moved to the CPU, converted to float32 and
    flattened in row-major order. The resulting buffer is written in the
    machine's native byte order with no header or padding, i.e. the same
    bytes `struct.pack(f'{n}f', *values)` would produce.

    Args:
        file: binary file-like object exposing `write(bytes)`.
        tensor: torch tensor of any shape.
    """
    # reshape() (unlike view()) also accepts non-contiguous tensors such as
    # transposed weights, copying only when it has to.
    flat = tensor.detach().cpu().float().reshape(-1)

    # tobytes() dumps the buffer directly instead of unpacking every element
    # into a separate Python float argument for struct.pack().
    file.write(flat.numpy().tobytes())
5
+
6
+
7
def SerializeINT8(file, tensor):
    """Write every element of `tensor` to `file` as raw signed 8-bit values.

    The tensor is detached, moved to the CPU and flattened in row-major
    order; its values are cast to int8 and written one byte per element with
    no header or padding.

    Args:
        file: binary file-like object exposing `write(bytes)`.
        tensor: torch tensor of any shape whose values fit in int8.
    """
    # reshape() (unlike view()) also accepts non-contiguous tensors.
    flat = tensor.detach().cpu().reshape(-1)

    # The dtype is given by name so no numpy import is needed here, and
    # tobytes() avoids unpacking every element into struct.pack() arguments.
    file.write(flat.numpy().astype("int8").tobytes())
11
+
12
+
13
def QuantizeINT8(w, group_size):
    """Quantize `w` to int8 using one symmetric scale per group.

    The tensor is converted to float32 and split into consecutive groups of
    `group_size` elements. Each group gets the scale `max(|group|) / 127`,
    and its values are divided by that scale and rounded to int8.

    Args:
        w: torch tensor whose element count is a multiple of `group_size`.
        group_size: number of consecutive elements sharing one scale.

    Returns:
        A tuple `(int8val, scale, maxerr)`:
        int8val -- int8 tensor of shape (num_groups, group_size);
        scale   -- float32 tensor of shape (num_groups,);
        maxerr  -- largest absolute difference (Python float) between the
                   float32 input and its dequantized reconstruction.

    Raises:
        AssertionError: if `w.numel()` is not divisible by `group_size`.
    """
    # Imported locally so the function does not rely on a module-level import.
    import torch

    assert w.numel() % group_size == 0, \
        f"tensor of {w.numel()} elements cannot be split into groups of {group_size}"

    groups = w.float().reshape(-1, group_size)
    scale = groups.abs().max(dim=1).values / 127.0

    # An all-zero group has scale 0; dividing by it would yield NaN, and the
    # NaN -> int8 cast has no defined result. Divide by 1 for those groups
    # instead (their values are 0 anyway) but still report a scale of 0.
    divisor = torch.where(scale == 0, torch.ones_like(scale), scale)
    int8val = torch.round(groups / divisor[:, None]).to(torch.int8)

    # Reconstruct the float values to measure the worst-case rounding error.
    dequantized = int8val.float() * scale[:, None]
    maxerr = (dequantized - groups).abs().max().item()

    return int8val, scale, maxerr
31
+
32
+
33
def WriteWeightsFP32(file, model, key):
    """Log and write the tensor stored under `key` to `file` as float32.

    Prints the key together with the tensor's shape listed in reverse
    dimension order, then hands the tensor to SerializeFP32.

    Args:
        file: binary file-like object to write to.
        model: mapping from parameter names to tensors.
        key: name of the parameter to write.
    """
    tensor = model[key]
    reversed_dims = list(tensor.shape)[::-1]
    print(f"writing {key} {reversed_dims}")
    SerializeFP32(file, tensor)
36
+
37
+
38
def WriteWeightsINT8(file, model, key, group_size=64):
    """Quantize the tensor stored under `key` and write it to `file`.

    The int8 values are written first, followed by the float32 scales (one
    per group of `group_size` elements). A summary line with the original
    shape and the maximum quantization error is printed afterwards.

    Args:
        file: binary file-like object to write to.
        model: mapping from parameter names to tensors.
        key: name of the parameter to quantize and write.
        group_size: number of elements sharing one scale (default 64).
    """
    tensor = model[key]
    quantized, scales, max_error = QuantizeINT8(tensor, group_size)

    SerializeINT8(file, quantized)
    SerializeFP32(file, scales)

    original_shape = tuple(tensor.shape)
    print(f"{key} quantized {original_shape} to Q8_0 with max error {max_error}")
46
+
47
+
48
def WriteLayersFP32(file, model, layer, n_layers):
    """Write one float32 parameter for every layer, in layer order.

    Args:
        file: binary file-like object to write to.
        model: mapping from parameter names to tensors.
        layer: %-format pattern containing one `%d` placeholder for the
            layer index, e.g. 'layers.%d.norm.weight'.
        n_layers: number of layers; indices 0 .. n_layers - 1 are written.
    """
    layer_index = 0
    while layer_index < n_layers:
        WriteWeightsFP32(file, model, layer % layer_index)
        layer_index += 1
52
+
53
+
54
def WriteLayersINT8(file, model, layer, n_layers, group_size=64):
    """Quantize one parameter for every layer and write the results.

    All layers are quantized first (printing one summary line per layer).
    The output then consists of the int8 values of every layer, in layer
    order, followed by the float32 scales of every layer, in layer order.

    Args:
        file: binary file-like object to write to.
        model: mapping from parameter names to tensors.
        layer: %-format pattern containing one `%d` placeholder for the
            layer index.
        n_layers: number of layers; indices 0 .. n_layers - 1 are written.
        group_size: number of elements sharing one scale (default 64).
    """
    all_values = []
    all_scales = []

    for index in range(n_layers):
        key = layer % index
        tensor = model[key]
        values, scales, max_error = QuantizeINT8(tensor, group_size)

        all_values.append(values)
        all_scales.append(scales)

        print(f"{key} quantized {tuple(tensor.shape)} to Q8_0 with max error {max_error}")

    # Values and scales are stored as two separate runs, not interleaved.
    for values in all_values:
        SerializeINT8(file, values)

    for scales in all_scales:
        SerializeFP32(file, scales)
72
+
73
+
74
def LoadConfig(config_path):
    """Load a JSON configuration file.

    Args:
        config_path: path of the JSON file to read.

    Returns:
        The decoded JSON document (a dict for the configs used here).
    """
    # Imported locally so the function does not rely on a module-level import.
    import json

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(config_path, encoding="utf-8") as f:
        return json.load(f)
79
+
80
+
81
def LoadModel(model_path):
    """Load a saved state dict onto the CPU and normalise its key names.

    Every key starting with 'backbone.' is renamed to the same key without
    that prefix; all other keys are left untouched.

    Args:
        model_path: path of the file to load with `torch.load`.

    Returns:
        The loaded mapping, modified in place.
    """
    # Imported locally so the function does not rely on a module-level import.
    import torch

    # NOTE(review): torch.load unpickles the file; only use it on checkpoints
    # from a trusted source.
    model = torch.load(model_path, map_location='cpu')

    # remove the 'backbone.' prefix from the keys
    unwanted_prefix = 'backbone.'
    # Iterate over a snapshot of the keys because the dict is mutated below.
    for key in list(model):
        if key.startswith(unwanted_prefix):
            model[key[len(unwanted_prefix):]] = model.pop(key)

    return model
91
+
92
+
93
def ExportModelFP32(model, config, output_path):
    """Export all model weights to `output_path` as raw float32 data.

    Example of the expected model structure (key - shape):
        embedding.weight               - [50280, 768]
        layers.0.mixer.D               - [1536]
        layers.0.mixer.in_proj.weight  - [3072, 768]
        layers.0.mixer.conv1d.weight   - [1536, 1, 4]
        layers.0.mixer.conv1d.bias     - [1536]
        layers.0.mixer.x_proj.weight   - [80, 1536]
        layers.0.mixer.dt_proj.weight  - [1536, 48]
        layers.0.mixer.dt_proj.bias    - [1536]
        layers.0.mixer.A_log           - [1536, 16]
        layers.0.mixer.out_proj.weight - [768, 1536]
        layers.0.norm.weight           - [768]
        norm_f.weight                  - [768]
        lm_head.weight                 - [50280, 768]

    File layout: the embedding, then each per-layer parameter written for
    all layers in turn (in_proj, conv1d weight, conv1d bias, x_proj,
    dt_proj weight, dt_proj bias, A, D, out_proj, norm), then norm_f and
    lm_head.

    Side effect: every 'layers.N.mixer.A_log' entry of `model` is replaced
    in place by 'layers.N.mixer.A' holding -exp(A_log).

    Args:
        model: mapping from parameter names to tensors (mutated, see above).
        config: mapping providing 'n_layer'.
        output_path: path of the binary file to create.
    """
    # Imported locally so the function does not rely on a module-level import.
    import torch

    n_layers = config['n_layer']

    # Export A = -exp(A_log) rather than A_log itself. Keys that were already
    # converted by an earlier export are skipped.
    for n in range(n_layers):
        a_log = f'layers.{n}.mixer.A_log'
        if a_log in model:
            model[f'layers.{n}.mixer.A'] = -torch.exp(model.pop(a_log))

    # Per-layer parameters, in the order they appear in the file.
    per_layer_patterns = (
        'layers.%d.mixer.in_proj.weight',
        'layers.%d.mixer.conv1d.weight',
        'layers.%d.mixer.conv1d.bias',
        'layers.%d.mixer.x_proj.weight',
        'layers.%d.mixer.dt_proj.weight',
        'layers.%d.mixer.dt_proj.bias',
        'layers.%d.mixer.A',
        'layers.%d.mixer.D',
        'layers.%d.mixer.out_proj.weight',
        'layers.%d.norm.weight',
    )

    # The context manager closes the file even if a write fails part-way.
    with open(output_path, 'wb') as out_file:
        WriteWeightsFP32(out_file, model, 'embedding.weight')

        for pattern in per_layer_patterns:
            WriteLayersFP32(out_file, model, pattern, n_layers)

        WriteWeightsFP32(out_file, model, 'norm_f.weight')
        WriteWeightsFP32(out_file, model, 'lm_head.weight')

    print(f"Exported FP32 model to {output_path}")
141
+
142
+
143
def ExportModelINT8(model, config, output_path, group_size=64):
    """Export the model to `output_path` with the large matrices in int8.

    Example of the expected model structure (key - shape):
        embedding.weight               - [50280, 768]
        layers.0.mixer.D               - [1536]
        layers.0.mixer.in_proj.weight  - [3072, 768]
        layers.0.mixer.conv1d.weight   - [1536, 1, 4]
        layers.0.mixer.conv1d.bias     - [1536]
        layers.0.mixer.x_proj.weight   - [80, 1536]
        layers.0.mixer.dt_proj.weight  - [1536, 48]
        layers.0.mixer.dt_proj.bias    - [1536]
        layers.0.mixer.A_log           - [1536, 16]
        layers.0.mixer.out_proj.weight - [768, 1536]
        layers.0.norm.weight           - [768]
        norm_f.weight                  - [768]
        lm_head.weight                 - [50280, 768]

    The embedding, in_proj, x_proj, out_proj and lm_head weights are
    group-quantized to int8; every other parameter is written as float32.
    The write order is the same as in the statements below.

    Side effect: every 'layers.N.mixer.A_log' entry of `model` is replaced
    in place by 'layers.N.mixer.A' holding -exp(A_log).

    Args:
        model: mapping from parameter names to tensors (mutated, see above).
        config: mapping providing 'n_layer'.
        output_path: path of the binary file to create.
        group_size: number of elements sharing one quantization scale
            (default 64); forwarded to every quantized write.
    """
    # Imported locally so the function does not rely on a module-level import.
    import torch

    n_layers = config['n_layer']

    # Export A = -exp(A_log) rather than A_log itself. Keys that were already
    # converted by an earlier export are skipped.
    for n in range(n_layers):
        a_log = f'layers.{n}.mixer.A_log'
        if a_log in model:
            model[f'layers.{n}.mixer.A'] = -torch.exp(model.pop(a_log))

    # The context manager closes the file even if a write fails part-way.
    with open(output_path, 'wb') as out_file:
        WriteWeightsINT8(out_file, model, 'embedding.weight', group_size)

        WriteLayersINT8(out_file, model, 'layers.%d.mixer.in_proj.weight', n_layers, group_size)

        WriteLayersFP32(out_file, model, 'layers.%d.mixer.conv1d.weight', n_layers)
        WriteLayersFP32(out_file, model, 'layers.%d.mixer.conv1d.bias', n_layers)

        WriteLayersINT8(out_file, model, 'layers.%d.mixer.x_proj.weight', n_layers, group_size)

        WriteLayersFP32(out_file, model, 'layers.%d.mixer.dt_proj.weight', n_layers)
        WriteLayersFP32(out_file, model, 'layers.%d.mixer.dt_proj.bias', n_layers)

        WriteLayersFP32(out_file, model, 'layers.%d.mixer.A', n_layers)
        WriteLayersFP32(out_file, model, 'layers.%d.mixer.D', n_layers)

        WriteLayersINT8(out_file, model, 'layers.%d.mixer.out_proj.weight', n_layers, group_size)

        WriteLayersFP32(out_file, model, 'layers.%d.norm.weight', n_layers)
        WriteWeightsFP32(out_file, model, 'norm_f.weight')

        WriteWeightsINT8(out_file, model, 'lm_head.weight', group_size)

    print(f"Exported INT8 model to {output_path}")
197
+
198
+
199
def ExportConfig(model, config, output_path, group_size=64):
    """Write the model dimensions to a C header file.

    The generated header has this form:

        #pragma once

        #define VOCAB_SIZE 256
        #define ROUNDED_VOCAB_SIZE 256

        #define N_LAYER 12
        #define D_MODEL 768
        #define D_INNER 1536
        #define DT_RANK 48
        #define D_STATE 16
        #define D_CONV 4

        #define GS 64

    VOCAB_SIZE, N_LAYER and D_MODEL come from `config`; ROUNDED_VOCAB_SIZE is
    VOCAB_SIZE rounded up to a multiple of 8; D_INNER is twice D_MODEL;
    DT_RANK, D_STATE and D_CONV are read from the shapes of layer 0's
    dt_proj weight, A matrix and conv1d weight.

    Args:
        model: mapping from parameter names to tensors.
        config: mapping providing 'vocab_size', 'n_layer' and 'd_model'.
        output_path: path of the header file to create.
        group_size: value written as GS (default 64); pass the group size
            used for the INT8 export if it differs.
    """
    vocab_size = config['vocab_size']
    # Round up to the next multiple of 8 (unchanged if already a multiple).
    rounded_vocab_size = (vocab_size + 7) // 8 * 8

    # The export functions replace 'A_log' by 'A' (computed elementwise, so
    # the shape is identical). Accept either so this function also works
    # when it is called before any weight export.
    a_key = 'layers.0.mixer.A'
    if a_key not in model:
        a_key = 'layers.0.mixer.A_log'

    dt_rank = model['layers.0.mixer.dt_proj.weight'].shape[1]
    d_state = model[a_key].shape[1]
    d_conv = model['layers.0.mixer.conv1d.weight'].shape[2]

    with open(output_path, 'w') as f:
        f.write("#pragma once\n\n")
        f.write("#define VOCAB_SIZE %d\n" % vocab_size)
        f.write("#define ROUNDED_VOCAB_SIZE %d\n\n" % rounded_vocab_size)
        f.write("#define N_LAYER %d\n" % config['n_layer'])
        f.write("#define D_MODEL %d\n" % config['d_model'])
        f.write("#define D_INNER %d\n" % (2 * config['d_model']))
        f.write("#define DT_RANK %d\n" % dt_rank)
        f.write("#define D_STATE %d\n" % d_state)
        f.write("#define D_CONV %d\n\n" % d_conv)
        f.write("#define GS %d\n" % group_size)

    print(f"Exported C compatible config (header) to {output_path}")
233
+
234
+
235
+
236
+
237
+
238
+
239
def ExportAll(model, tokenizer):
    """Save the model, then export weights, tokenizer and config.

    Steps, in order:
      1. call `model.save()`;
      2. load 'pytorch_model.bin' and 'config.json' from the working
         directory (presumably produced by `model.save()` -- TODO confirm);
      3. write the tokenizer to 'tokenizer.bin';
      4. export 'model.fp32.bin', 'model.int8.bin' and 'config.h'.

    Args:
        model: object exposing `save()`.
        tokenizer: object exposing `to_file(path)`.
    """
    model.save()

    # From here on work with the reloaded weight mapping, not the model object.
    weights = LoadModel('pytorch_model.bin')
    settings = LoadConfig('config.json')

    tokenizer.to_file('tokenizer.bin')

    ExportModelFP32(weights, settings, 'model.fp32.bin')
    ExportModelINT8(weights, settings, 'model.int8.bin')

    # Runs after the weight exports, which rewrite A_log into A in `weights`.
    ExportConfig(weights, settings, 'config.h')
250
+
trainer.cli.py CHANGED
@@ -11,6 +11,8 @@ from model import Model
11
  from logger import Wandb
12
 
13
 
 
 
14
 
15
  parser = ArgumentParser(
16
  prog='Trainer implementation, using Pytorch',
@@ -54,3 +56,4 @@ if __name__ == '__main__':
54
  trainer.train(batches)
55
 
56
 
 
 
11
  from logger import Wandb
12
 
13
 
14
+ from export import ExportAll
15
+
16
 
17
  parser = ArgumentParser(
18
  prog='Trainer implementation, using Pytorch',
 
56
  trainer.train(batches)
57
 
58
 
59
+ ExportAll(model, tokenizer)