xmcmic committed
Commit 8bcc1f4 · 1 Parent(s): a7453ff

Delete VQA_lora_PMC_LLaMA_PMCCLIP/blank/checkpoint-4146

VQA_lora_PMC_LLaMA_PMCCLIP/blank/checkpoint-4146/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c2209e7ef5909f2aa8036c1815d1a42892c16a4979e4f4839f21d6884194e926
- size 13765332257

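The three deleted lines above are not the ~13 GB weights file itself but its Git LFS pointer: the pointer spec version, the sha256 oid of the stored blob, and its size in bytes. As a minimal sketch, assuming the pointer text has been read into a string (the helper name is illustrative, not part of this repo):

def parse_lfs_pointer(text):
    # each pointer line is "<key> <value>"; the keys here are version, oid, size
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    oid = fields["oid"].split(":", 1)[1]  # drop the "sha256:" prefix
    return fields["version"], oid, int(fields["size"])

# the pointer above parses to:
# ("https://git-lfs.github.com/spec/v1", "c2209e7e...4194e926", 13765332257)
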
VQA_lora_PMC_LLaMA_PMCCLIP/blank/checkpoint-4146/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:892d43a8999e7538367537428e69900cbe92e843a5532bdcee4524cbea41990b
- size 14583

VQA_lora_PMC_LLaMA_PMCCLIP/blank/checkpoint-4146/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:9d539f8550bc2792403faa05e505ea0b067c57108e26ac6b15870a6c758fd70f
- size 14583

VQA_lora_PMC_LLaMA_PMCCLIP/blank/checkpoint-4146/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:2bc589a2a3a575df97929b97991b612debb43736d0ee4d0122ead5336f1bc7fc
- size 14583

VQA_lora_PMC_LLaMA_PMCCLIP/blank/checkpoint-4146/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:acbd698f57f06f61a281a7413d875ee291d48d4dc78781e051af966690d01f7f
- size 14583

VQA_lora_PMC_LLaMA_PMCCLIP/blank/checkpoint-4146/trainer_state.json DELETED
The diff for this file is too large to render.
 
VQA_lora_PMC_LLaMA_PMCCLIP/blank/checkpoint-4146/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:40ca1bc78f3ddde4a145da0ab038cf559f011d4d8aeee481922129ce92346bb7
- size 4731

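Each pointer above pairs a sha256 oid with an exact byte count, so a re-downloaded blob can be checked against its pointer before use. A minimal verification sketch, assuming the blob exists locally (the function name and path are illustrative):

import hashlib

def verify_lfs_blob(blob_path, expected_oid, expected_size):
    # stream in 1 MiB chunks so even the ~13 GB pytorch_model.bin needs constant memory
    h = hashlib.sha256()
    size = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
            size += len(chunk)
    return h.hexdigest() == expected_oid and size == expected_size

# e.g. for training_args.bin above:
# verify_lfs_blob("training_args.bin",
#                 "40ca1bc78f3ddde4a145da0ab038cf559f011d4d8aeee481922129ce92346bb7", 4731)
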
VQA_lora_PMC_LLaMA_PMCCLIP/blank/checkpoint-4146/zero_to_fp32.py DELETED
@@ -1,432 +0,0 @@
- # Copyright (c) Microsoft Corporation.
- # SPDX-License-Identifier: Apache-2.0
-
- # DeepSpeed Team
-
- # This script extracts fp32 consolidated weights from zero 2 and 3 DeepSpeed checkpoints. It gets
- # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
- # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
- # application.
- #
- # example: python zero_to_fp32.py . pytorch_model.bin
-
- import argparse
- import torch
- import glob
- import math
- import os
- import re
- from collections import OrderedDict
-
- # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
- # DeepSpeed data structures it has to be available in the current python environment.
- from deepspeed.utils import logger
- from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
-                                             FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES)
-
- debug = 0
-
- # load to cpu
- device = torch.device('cpu')
-
-
- def atoi(text):
-     return int(text) if text.isdigit() else text
-
-
- def natural_keys(text):
-     '''
-     alist.sort(key=natural_keys) sorts in human order
-     http://nedbatchelder.com/blog/200712/human_sorting.html
-     (See Toothy's implementation in the comments)
-     '''
-     return [atoi(c) for c in re.split(r'(\d+)', text)]
-
-
- def get_model_state_file(checkpoint_dir, zero_stage):
-     if not os.path.isdir(checkpoint_dir):
-         raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
-
-     # there should be only one file
-     if zero_stage == 2:
-         file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
-     elif zero_stage == 3:
-         file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
-
-     if not os.path.exists(file):
-         raise FileNotFoundError(f"can't find model states file at '{file}'")
-
-     return file
-
-
- def get_optim_files(checkpoint_dir):
-     # XXX: need to test that this simple glob rule works for multi-node setup too
-     optim_files = sorted(glob.glob(os.path.join(checkpoint_dir, "*_optim_states.pt")), key=natural_keys)
-
-     if len(optim_files) == 0:
-         raise FileNotFoundError(f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")
-
-     return optim_files
-
-
- def parse_model_state(file):
-     state_dict = torch.load(file, map_location=device)
-
-     if BUFFER_NAMES not in state_dict:
-         raise ValueError(f"{file} is not a model state checkpoint")
-     buffer_names = state_dict[BUFFER_NAMES]
-     if debug:
-         print("Found buffers:", buffer_names)
-
-     # recover just the buffers while restoring them to fp32 if they were saved in fp16
-     buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
-     param_shapes = state_dict[PARAM_SHAPES]
-
-     ds_version = state_dict.get(DS_VERSION, None)
-
-     return buffers, param_shapes, ds_version
-
-
- def parse_optim_states(files, ds_checkpoint_dir):
-
-     total_files = len(files)
-     state_dicts = []
-     for f in files:
-         state_dicts.append(torch.load(f, map_location=device))
-
-     if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
-         raise ValueError(f"{files[0]} is not a zero checkpoint")
-     zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
-     world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
-
-     # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
-     # parameters can be different from data parallelism for non-expert parameters. So we can just
-     # use the max of the partition_count to get the dp world_size.
-
-     if type(world_size) is list:
-         world_size = max(world_size)
-
-     if world_size != total_files:
-         raise ValueError(
-             f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
-             "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
-         )
-
-     # the groups are named differently in each stage
-     if zero_stage == 2:
-         fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
-     elif zero_stage == 3:
-         fp32_groups_key = FP32_FLAT_GROUPS
-     else:
-         raise ValueError(f"unknown zero stage {zero_stage}")
-
-     if zero_stage == 2:
-         fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
-     elif zero_stage == 3:
-         # if there is more than one param group, there will be multiple flattened tensors - one
-         # flattened tensor per group - for simplicity merge them into a single tensor
-         #
-         # XXX: could make the script more memory efficient for when there are multiple groups - it
-         # will require matching the sub-lists of param_shapes for each param group flattened tensor
-
-         fp32_flat_groups = [
-             torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
-         ]
-
-     return zero_stage, world_size, fp32_flat_groups
-
-
- def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
-     """
-     Returns fp32 state_dict reconstructed from ds checkpoint
-
-     Args:
-         - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
-
-     """
-     print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
-
-     optim_files = get_optim_files(ds_checkpoint_dir)
-     zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
-     print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
-
-     model_file = get_model_state_file(ds_checkpoint_dir, zero_stage)
-     buffers, param_shapes, ds_version = parse_model_state(model_file)
-     print(f'Parsing checkpoint created by deepspeed=={ds_version}')
-
-     if zero_stage == 2:
-         return _get_fp32_state_dict_from_zero2_checkpoint(world_size, param_shapes, fp32_flat_groups, buffers)
-     elif zero_stage == 3:
-         return _get_fp32_state_dict_from_zero3_checkpoint(world_size, param_shapes, fp32_flat_groups, buffers)
-
-
- def _get_fp32_state_dict_from_zero2_checkpoint(world_size, param_shapes, fp32_flat_groups, buffers):
-
-     # Reconstruction protocol:
-     #
-     # XXX: document this
-
-     if debug:
-         for i in range(world_size):
-             for j in range(len(fp32_flat_groups[0])):
-                 print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
-
-     # XXX: memory usage doubles here (zero2)
-     num_param_groups = len(fp32_flat_groups[0])
-     merged_single_partition_of_fp32_groups = []
-     for i in range(num_param_groups):
-         merged_partitions = [sd[i] for sd in fp32_flat_groups]
-         full_single_fp32_vector = torch.cat(merged_partitions, 0)
-         merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
-     avail_numel = sum(
-         [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
-
-     if debug:
-         wanted_params = sum([len(shapes) for shapes in param_shapes])
-         wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
-         # not asserting if there is a mismatch due to possible padding
-         print(f"Have {avail_numel} numels to process.")
-         print(f"Need {wanted_numel} numels in {wanted_params} params.")
-
-     state_dict = OrderedDict()
-
-     # buffers
-     state_dict.update(buffers)
-     if debug:
-         print(f"added {len(buffers)} buffers")
-
-     # params
-     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
-     # out-of-core computing solution
-     total_numel = 0
-     total_params = 0
-     for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
-         offset = 0
-         avail_numel = full_single_fp32_vector.numel()
-         for name, shape in shapes.items():
-
-             unpartitioned_numel = shape.numel()
-             total_numel += unpartitioned_numel
-             total_params += 1
-
-             if debug:
-                 print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
-             state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
-             offset += unpartitioned_numel
-
-         # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
-         # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
-         # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
-         # live optimizer object, so we are checking that the numbers are within the right range
-         align_to = 2 * world_size
-
-         def zero2_align(x):
-             return align_to * math.ceil(x / align_to)
-
-         if debug:
-             print(f"original offset={offset}, avail_numel={avail_numel}")
-
-         offset = zero2_align(offset)
-         avail_numel = zero2_align(avail_numel)
-
-         if debug:
-             print(f"aligned offset={offset}, avail_numel={avail_numel}")
-
-         # Sanity check
-         if offset != avail_numel:
-             raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
-
-     print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
-
-     return state_dict
-
-
- def zero3_partitioned_param_info(unpartitioned_numel, world_size):
-     remainder = unpartitioned_numel % world_size
-     padding_numel = (world_size - remainder) if remainder else 0
-     partitioned_numel = math.ceil(unpartitioned_numel / world_size)
-     return partitioned_numel, padding_numel
-
-
- def _get_fp32_state_dict_from_zero3_checkpoint(world_size, param_shapes, fp32_flat_groups, buffers):
-
-     # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
-     # param, re-consolidating each param, while dealing with padding if any
-
-     avail_numel = fp32_flat_groups[0].numel() * world_size
-     # merge list of dicts, preserving order
-     param_shapes = {k: v for d in param_shapes for k, v in d.items()}
-
-     if debug:
-         for i in range(world_size):
-             print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
-
-     wanted_params = len(param_shapes)
-     wanted_numel = sum(shape.numel() for shape in param_shapes.values())
-     # not asserting if there is a mismatch due to possible padding
-     print(f"Have {avail_numel} numels to process.")
-     print(f"Need {wanted_numel} numels in {wanted_params} params.")
-
-     state_dict = OrderedDict()
-
-     # buffers
-     state_dict.update(buffers)
-     if debug:
-         print(f"added {len(buffers)} buffers")
-
-     # params
-     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
-     # out-of-core computing solution
-     offset = 0
-     total_numel = 0
-     total_params = 0
-     for name, shape in param_shapes.items():
-
-         unpartitioned_numel = shape.numel()
-         total_numel += unpartitioned_numel
-         total_params += 1
-
-         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
-
-         if debug:
-             print(
-                 f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
-             )
-
-         # XXX: memory usage doubles here
-         state_dict[name] = torch.cat(
-             tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
-             0).narrow(0, 0, unpartitioned_numel).view(shape)
-         offset += partitioned_numel
-
-     offset *= world_size
-
-     # Sanity check
-     if offset != avail_numel:
-         raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
-
-     print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
-
-     return state_dict
-
-
- def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
-     """
-     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
-     ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
-     via a model hub.
-
-     Args:
-         - ``checkpoint_dir``: path to the desired checkpoint folder
-         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
-
-     Returns:
-         - pytorch ``state_dict``
-
-     Note: this approach may not work if your application doesn't have sufficient free CPU memory and
-     you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
-     the checkpoint.
-
-     A typical usage might be ::
-
-         from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
-         # do the training and checkpoint saving
-         state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
-         model = model.cpu() # move to cpu
-         model.load_state_dict(state_dict)
-         # submit to model hub or save the model to share with others
-
-     In this example the ``model`` will no longer be usable in the deepspeed context of the same
-     application. i.e. you will need to re-initialize the deepspeed engine, since
-     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
-
-     If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
-
-     """
-     if tag is None:
-         latest_path = os.path.join(checkpoint_dir, 'latest')
-         if os.path.isfile(latest_path):
-             with open(latest_path, 'r') as fd:
-                 tag = fd.read().strip()
-         else:
-             raise ValueError(f"Unable to find 'latest' file at {latest_path}")
-
-     ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
-
-     if not os.path.isdir(ds_checkpoint_dir):
-         raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
-
-     return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
-
-
- def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
-     """
-     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
-     loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
-
-     Args:
-         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
-         - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
-         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
-     """
-
-     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
-     print(f"Saving fp32 state dict to {output_file}")
-     torch.save(state_dict, output_file)
-
-
- def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
-     """
-     1. Put the provided model to cpu
-     2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
-     3. Load it into the provided model
-
-     Args:
-         - ``model``: the model object to update
-         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
-         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
-
-     Returns:
-         - ``model``: modified model
-
-     Make sure you have plenty of CPU memory available before you call this function. If you don't
-     have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
-     conveniently placed for you in the checkpoint folder.
-
-     A typical usage might be ::
-
-         from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
-         model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
-         # submit to model hub or save the model to share with others
-
-     Note that once this was run, the ``model`` will no longer be usable in the deepspeed context
-     of the same application. i.e. you will need to re-initialize the deepspeed engine, since
-     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
-
-     """
-     logger.info(f"Extracting fp32 weights")
-     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
-
-     logger.info(f"Overwriting model with fp32 weights")
-     model = model.cpu()
-     model.load_state_dict(state_dict, strict=False)
-
-     return model
-
-
- if __name__ == "__main__":
-
-     parser = argparse.ArgumentParser()
-     parser.add_argument("checkpoint_dir",
-                         type=str,
-                         help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
-     parser.add_argument(
-         "output_file",
-         type=str,
-         help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
-     parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
-     args = parser.parse_args()
-
-     debug = args.debug
-
-     convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)
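
For reference, the deleted zero_to_fp32.py is the helper that DeepSpeed copies into the top-level checkpoint dir, and its docstrings above describe the intended usage. A minimal sketch of both entry points (paths and the tag folder name are illustrative; the deepspeed package must be installed because the pickled checkpoints reference its data structures):

# offline, from inside the checkpoint folder, as the script header suggests:
#   python zero_to_fp32.py . pytorch_model.bin

# or programmatically, pointing at the folder that contains the tag
# sub-folder (e.g. global_step4146) and the 'latest' file:
from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

convert_zero_checkpoint_to_fp32_state_dict("checkpoint-4146", "pytorch_model.bin")

As a sanity check on the ZeRO-3 partition arithmetic the script relies on: zero3_partitioned_param_info(10, 4) returns (3, 2), since each of 4 ranks stores ceil(10/4) = 3 numels, and 4*3 - 10 = 2 of those are padding that the reconstruction trims off with narrow().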