BryanW committed on
Commit
38f7dd9
·
verified ·
1 Parent(s): 1254814

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/__init__.cpython-312.pyc +0 -0
  2. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_cli.cpython-312.pyc +0 -0
  3. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_ddp_comm_hook.cpython-312.pyc +0 -0
  4. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_distributed_data_loop.cpython-312.pyc +0 -0
  5. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_merge_weights.cpython-312.pyc +0 -0
  6. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_notebook.cpython-312.pyc +0 -0
  7. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_ops.cpython-312.pyc +0 -0
  8. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_script.cpython-312.pyc +0 -0
  9. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_sync.cpython-312.pyc +0 -0
  10. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__init__.py +13 -0
  11. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/__init__.cpython-312.pyc +0 -0
  12. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_checkpointing.cpython-312.pyc +0 -0
  13. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_ds_alst_ulysses_sp.cpython-312.pyc +0 -0
  14. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_metrics.cpython-312.pyc +0 -0
  15. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_peak_memory_usage.cpython-312.pyc +0 -0
  16. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_performance.cpython-312.pyc +0 -0
  17. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_pippy.cpython-312.pyc +0 -0
  18. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_zero3_integration.cpython-312.pyc +0 -0
  19. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_checkpointing.py +269 -0
  20. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_ds_alst_ulysses_sp.py +129 -0
  21. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_ds_multiple_model.py +331 -0
  22. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_metrics.py +307 -0
  23. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_peak_memory_usage.py +314 -0
  24. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_performance.py +299 -0
  25. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_pippy.py +117 -0
  26. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_zero3_integration.py +59 -0
  27. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/test_cli.py +32 -0
  28. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/test_ops.py +181 -0
  29. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/__diff.cpython-312.pyc +0 -0
  30. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/__info__.cpython-312.pyc +0 -0
  31. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/__init__.cpython-312.pyc +0 -0
  32. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/_objects.cpython-312.pyc +0 -0
  33. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/_shims.cpython-312.pyc +0 -0
  34. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/detect.cpython-312.pyc +0 -0
  35. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/logger.cpython-312.pyc +0 -0
  36. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/objtypes.cpython-312.pyc +0 -0
  37. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/pointers.cpython-312.pyc +0 -0
  38. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/session.cpython-312.pyc +0 -0
  39. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/settings.cpython-312.pyc +0 -0
  40. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/source.cpython-312.pyc +0 -0
  41. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/temp.cpython-312.pyc +0 -0
  42. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__init__.py +22 -0
  43. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__main__.py +35 -0
  44. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_abc.cpython-312.pyc +0 -0
  45. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_detect.cpython-312.pyc +0 -0
  46. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_dictviews.cpython-312.pyc +0 -0
  47. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_fglobals.cpython-312.pyc +0 -0
  48. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_logger.cpython-312.pyc +0 -0
  49. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_mixins.cpython-312.pyc +0 -0
  50. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_moduledict.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (235 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_cli.cpython-312.pyc ADDED
Binary file (975 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_ddp_comm_hook.cpython-312.pyc ADDED
Binary file (4.73 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_distributed_data_loop.cpython-312.pyc ADDED
Binary file (18.4 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_merge_weights.cpython-312.pyc ADDED
Binary file (8.45 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_notebook.cpython-312.pyc ADDED
Binary file (5.46 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_ops.cpython-312.pyc ADDED
Binary file (9.47 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_script.cpython-312.pyc ADDED
Binary file (48.3 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/__pycache__/test_sync.cpython-312.pyc ADDED
Binary file (19.5 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (249 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_checkpointing.cpython-312.pyc ADDED
Binary file (11.1 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_ds_alst_ulysses_sp.cpython-312.pyc ADDED
Binary file (5.07 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_metrics.cpython-312.pyc ADDED
Binary file (16.2 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_peak_memory_usage.cpython-312.pyc ADDED
Binary file (14.4 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_performance.cpython-312.pyc ADDED
Binary file (11.6 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_pippy.cpython-312.pyc ADDED
Binary file (3.66 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/__pycache__/test_zero3_integration.cpython-312.pyc ADDED
Binary file (1.59 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_checkpointing.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import argparse
15
+ import json
16
+ import os
17
+
18
+ import evaluate
19
+ import torch
20
+ from datasets import load_dataset
21
+ from torch.optim import AdamW
22
+ from torch.utils.data import DataLoader
23
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
24
+
25
+ from accelerate import Accelerator, DistributedType
26
+ from accelerate.utils.deepspeed import DummyOptim, DummyScheduler
27
+
28
+
29
+ MAX_GPU_BATCH_SIZE = 16
30
+ EVAL_BATCH_SIZE = 32
31
+
32
+
33
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16, model_name: str = "bert-base-cased"):
    """
    Build the training and validation `DataLoader`s for the GLUE MRPC dataset.

    Args:
        accelerator (`Accelerator`):
            Only its `distributed_type` is read, to choose the padding strategy at collate time.
        batch_size (`int`, *optional*, defaults to 16):
            Batch size of the training DataLoader. The validation DataLoader always uses
            the module-level `EVAL_BATCH_SIZE`.
        model_name (`str`, *optional*, defaults to `"bert-base-cased"`):
            Identifier handed to `AutoTokenizer.from_pretrained`.

    Returns:
        `tuple`: `(train_dataloader, eval_dataloader)`.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    raw_splits = load_dataset("glue", "mrpc")

    def encode_pairs(examples):
        # max_length=None => truncation uses the model max length (which is the default anyway)
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every split, drop the raw columns, then expose the targets under the
    # `labels` name that transformers models expect.
    encoded_splits = raw_splits.map(
        encode_pairs,
        batched=True,
        remove_columns=["idx", "sentence1", "sentence2"],
        load_from_cache_file=False,
    ).rename_column("label", "labels")

    def collate_fn(examples):
        # On TPU (XLA) every batch is padded to the same fixed length, otherwise training is
        # very slow; everywhere else padding to the longest sample of the batch is enough.
        if accelerator.distributed_type == DistributedType.XLA:
            pad_options = {"padding": "max_length", "max_length": 128}
        else:
            pad_options = {"padding": "longest"}
        return tokenizer.pad(examples, return_tensors="pt", **pad_options)

    train_dataloader = DataLoader(
        encoded_splits["train"], batch_size=batch_size, shuffle=True, collate_fn=collate_fn
    )
    eval_dataloader = DataLoader(
        encoded_splits["validation"], batch_size=EVAL_BATCH_SIZE, shuffle=False, collate_fn=collate_fn
    )
    return train_dataloader, eval_dataloader
76
+
77
+
78
def evaluation_loop(accelerator, model, eval_dataloader, metric):
    """
    Evaluate `model` on `eval_dataloader` and return the accuracy computed by `metric`.

    Args:
        accelerator: Provides `device`, `gather` and `use_distributed`.
        model: Called as `model(**batch)`; its output must expose `logits`.
        eval_dataloader: Iterable of batches. `len()` and `.dataset` are only used when
            `accelerator.use_distributed` is true.
        metric: Object offering `add_batch(predictions=..., references=...)` and `compute()`.

    Returns:
        The `"accuracy"` entry of the dictionary returned by `metric.compute()`.
    """
    model.eval()
    gathered_so_far = 0
    for batch_idx, batch in enumerate(eval_dataloader):
        # This explicit move could be skipped when the accelerator uses `device_placement=True`.
        batch.to(accelerator.device)
        with torch.no_grad():
            outputs = model(**batch)
        predicted = outputs.logits.argmax(dim=-1)
        # A single gather for both tensors is slightly faster than gathering them separately.
        predictions, references = accelerator.gather((predicted, batch["labels"]))

        if accelerator.use_distributed:
            # In a multiprocess run the last batch contains duplicated samples: keep only
            # as many entries as the dataset still has left to be counted.
            if batch_idx == len(eval_dataloader) - 1:
                remaining = len(eval_dataloader.dataset) - gathered_so_far
                predictions = predictions[:remaining]
                references = references[:remaining]
            else:
                gathered_so_far += references.shape[0]

        metric.add_batch(predictions=predictions, references=references)

    return metric.compute()["accuracy"]
104
+
105
+
106
def training_function(config, args):
    """
    Train (or verify a resumed checkpoint of) a sequence-classification model on GLUE MRPC.

    Args:
        config (`dict`): Must provide `"lr"`, `"num_epochs"`, `"seed"` and `"batch_size"`.
        args: Namespace with `model_name_or_path`, `output_dir`, `resume_from_checkpoint`
            and `partial_train_epoch`.

    When `args.resume_from_checkpoint` is set, the checkpoint is loaded, evaluated and compared
    against the matching `state_<epoch>.json` file in `args.output_dir`; the function then
    returns without training. Otherwise, after every epoch the accelerator state is saved to
    `<output_dir>/epoch_<n>` and the main process writes `<output_dir>/state_<n>.json`.
    """
    accelerator = Accelerator()

    # Hyper-parameters
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])
    model_name = args.model_name_or_path

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size, model_name)

    # The model is built after seeding so that the seed also controls new weights initialization.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True)

    ds_plugin = accelerator.state.deepspeed_plugin

    # A real optimizer is needed unless the DeepSpeed config declares its own.
    needs_real_optimizer = ds_plugin is None or "optimizer" not in ds_plugin.deepspeed_config
    optimizer_cls = AdamW if needs_real_optimizer else DummyOptim
    optimizer = optimizer_cls(params=model.parameters(), lr=lr)

    gradient_accumulation_steps = (
        1 if ds_plugin is None else ds_plugin.deepspeed_config["gradient_accumulation_steps"]
    )
    max_training_steps = (len(train_dataloader) * num_epochs) // gradient_accumulation_steps

    # Same rule for the scheduler: only build one when the DeepSpeed config has none.
    if ds_plugin is not None and "scheduler" in ds_plugin.deepspeed_config:
        lr_scheduler = DummyScheduler(optimizer, total_num_steps=max_training_steps, warmup_num_steps=0)
    else:
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=max_training_steps,
        )

    # `prepare` returns the objects in the order they were passed in.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # Total number of training steps iterated over so far
    overall_step = 0
    # Starting epoch, tracked so that checkpoint files are named properly
    starting_epoch = 0
    metric = evaluate.load("glue", "mrpc")
    ending_epoch = num_epochs if args.partial_train_epoch is None else args.partial_train_epoch

    if args.resume_from_checkpoint:
        accelerator.load_state(args.resume_from_checkpoint)

        # The checkpoint folder name contains "epoch_<number>": collect the leading digits.
        after_marker = args.resume_from_checkpoint.split("epoch_")[1]
        digits = []
        for ch in after_marker:
            if not ch.isdigit():
                break
            digits.append(ch)
        resumed_epoch = int("".join(digits))
        starting_epoch = resumed_epoch + 1

        accuracy = evaluation_loop(accelerator, model, eval_dataloader, metric)
        accelerator.print("resumed checkpoint performance:", accuracy)
        accelerator.print("resumed checkpoint's scheduler's lr:", lr_scheduler.get_lr()[0])
        accelerator.print("resumed optimizers's lr:", optimizer.param_groups[0]["lr"])

        with open(os.path.join(args.output_dir, f"state_{resumed_epoch}.json")) as f:
            resumed_state = json.load(f)

        # The restored run must reproduce exactly what was recorded when the checkpoint was saved.
        assert resumed_state["accuracy"] == accuracy, "Accuracy mismatch, loading from checkpoint failed"
        assert resumed_state["lr"] == lr_scheduler.get_lr()[0], (
            "Scheduler learning rate mismatch, loading from checkpoint failed"
        )
        assert resumed_state["optimizer_lr"] == optimizer.param_groups[0]["lr"], (
            "Optimizer learning rate mismatch, loading from checkpoint failed"
        )
        assert resumed_state["epoch"] == resumed_epoch, "Epoch mismatch, loading from checkpoint failed"
        return

    # Training
    for epoch in range(starting_epoch, ending_epoch):
        model.train()
        for step, batch in enumerate(train_dataloader):
            loss = model(**batch).loss / gradient_accumulation_steps
            accelerator.backward(loss)
            if step % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            overall_step += 1

        checkpoint_dir = os.path.join(args.output_dir, f"epoch_{epoch}")
        accelerator.save_state(checkpoint_dir)

        # Snapshot of what a later resume from this checkpoint is expected to reproduce.
        state = {
            "accuracy": evaluation_loop(accelerator, model, eval_dataloader, metric),
            "lr": lr_scheduler.get_lr()[0],
            "optimizer_lr": optimizer.param_groups[0]["lr"],
            "epoch": epoch,
            "step": overall_step,
        }
        accelerator.print(f"epoch {epoch}:", state)

        accelerator.wait_for_everyone()
        if accelerator.is_main_process:
            with open(os.path.join(args.output_dir, f"state_{epoch}.json"), "w") as f:
                json.dump(state, f)

    accelerator.end_training()
227
+
228
+
229
def main():
    """
    Parse the command-line options and run `training_function`.

    Options:
        --model_name_or_path: model identifier or path (default `"bert-base-cased"`).
        --output_dir: where checkpoint folders and state files are written (default `"."`).
        --resume_from_checkpoint: checkpoint folder to resume from (default `None`).
        --partial_train_epoch: stop training after this many epochs (default `None`).
        --num_epochs: number of training epochs (default 2).

    Learning rate (2e-5), seed (42) and batch size (16) are fixed here; only the number of
    epochs is taken from the command line.
    """
    # The description previously claimed this script tracks peak GPU memory usage, a leftover
    # from another test script; it now describes what this script does.
    parser = argparse.ArgumentParser(
        description="Simple example of a training script that saves a checkpoint every epoch and verifies resuming from one."
    )
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="bert-base-cased",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=False,
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=".",
        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help="If the training should continue from a checkpoint folder.",
    )
    parser.add_argument(
        "--partial_train_epoch",
        type=int,
        default=None,
        help="If passed, the training will stop after this number of epochs.",
    )
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=2,
        help="Number of train epochs.",
    )
    args = parser.parse_args()
    config = {"lr": 2e-5, "num_epochs": args.num_epochs, "seed": 42, "batch_size": 16}

    training_function(config, args)


if __name__ == "__main__":
    main()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_ds_alst_ulysses_sp.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Test script for verifying ALST/Ulysses SP works
17
+ """
18
+
19
+ import torch
20
+ from deepspeed.runtime.utils import move_to_device
21
+ from transformers import AutoModelForCausalLM, AutoTokenizer
22
+
23
+ from accelerate import Accelerator
24
+ from accelerate.utils import ParallelismConfig, set_seed
25
+ from accelerate.utils.dataclasses import DeepSpeedSequenceParallelConfig
26
+
27
+
28
# Fixed seed for the whole script.
set_seed(42)

world_size = 2
model_name = "hf-internal-testing/tiny-random-LlamaForCausalLM"
micro_batch_size = 1

# Sequence parallelism over `world_size` ranks, using the DeepSpeed backend.
sp_handler = DeepSpeedSequenceParallelConfig(
    sp_seq_length=256,
    sp_seq_length_is_variable=True,
    sp_attn_implementation="sdpa",
)
parallelism_config = ParallelismConfig(
    sp_backend="deepspeed",
    sp_size=world_size,
    # dp_shard_size=1,  # set if dp is wanted as well
    sp_handler=sp_handler,
)

accelerator = Accelerator(parallelism_config=parallelism_config)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Synthetic data: `samples` rows of `seqlen` consecutive token ids starting at 101, and
# position ids that keep counting across rows (0 .. samples * seqlen - 1).
samples = 4
seqlen = 32
num_tokens = seqlen * samples
input_ids = torch.arange(1, num_tokens + 1).view(-1, seqlen) + 100
position_ids = torch.arange(num_tokens).view(-1, seqlen)

ds = torch.utils.data.TensorDataset(input_ids, position_ids)
59
+
60
+
61
def collate_fn(batch):
    """
    Turn a one-sample batch into a dict of model inputs.

    Only the first element of `batch` is used; it must unpack into
    `(input_ids, position_ids)`. Each tensor gets a new leading dimension of size 1,
    and `labels` is built from the same token ids as `input_ids`.
    """
    token_ids, positions = batch[0]
    return {
        "input_ids": token_ids.unsqueeze(0),
        "position_ids": positions.unsqueeze(0),
        "labels": token_ids.unsqueeze(0),
    }
68
+
69
+
70
dl = torch.utils.data.DataLoader(ds, batch_size=micro_batch_size, collate_fn=collate_fn)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

rank = torch.distributed.get_rank()
is_rank_zero = rank == 0

if is_rank_zero:
    print(f"DL orig: {len(dl)} samples")

model, optimizer, dl = accelerator.prepare(model, optimizer, dl)

if is_rank_zero:
    print(f"DL w/ adapter: {len(dl)} samples")

sp_size = parallelism_config.sp_size if parallelism_config else 1
if sp_size > 1:
    # Process group and size used below to aggregate the per-shard losses.
    sp_group = accelerator.torch_device_mesh["sp"].get_group()
    sp_world_size = parallelism_config.sp_size

unwrapped_model = accelerator.unwrap_model(model)

# Normal training loop
for step_idx, batch in enumerate(dl):
    optimizer.zero_grad()

    if is_rank_zero:
        print(f"batch {step_idx}: seqlen: {len(batch['input_ids'][0])}")
    batch = move_to_device(batch, model.device)
    outputs = model(**batch)

    # NOTE(review): `shift_labels` is not produced by `collate_fn`; presumably the prepared
    # dataloader adds it - confirm against the sequence-parallel dataloader adapter.
    shift_labels = batch["shift_labels"]
    loss = unwrapped_model.loss_function(
        logits=outputs.logits,
        labels=None,
        shift_labels=shift_labels,
        vocab_size=unwrapped_model.config.vocab_size,
    )

    if sp_size > 1:
        # differentiable weighted per-shard-loss aggregation across ranks
        losses_per_rank = torch.distributed.nn.functional.all_gather(loss, group=sp_group)
        # special dealing with SFT that has prompt tokens that aren't used in loss computation
        good_tokens = (shift_labels != -100).view(-1).sum()
        good_tokens_per_rank = torch.distributed.nn.functional.all_gather(good_tokens, group=sp_group)
        # Weight every shard's loss by its number of contributing tokens; shards without
        # any contributing token are left out of the sum.
        total_loss = sum(
            losses_per_rank[shard] * good_tokens_per_rank[shard]
            for shard in range(sp_world_size)
            if good_tokens_per_rank[shard] > 0
        )
        total_good_tokens = sum(good_tokens_per_rank)
        # `max(..., 1)` avoids a division by zero when no shard has contributing tokens.
        loss = total_loss / max(total_good_tokens, 1)

    if is_rank_zero:
        accelerator.print(f"{step_idx}: {loss=}")
    accelerator.log(dict(train_loss=loss, step=step_idx))

    accelerator.backward(loss)
    optimizer.step()

accelerator.end_training()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_ds_multiple_model.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Test script for verifying multiple models can be utilized with Accelerate + DeepSpeed:
17
+
18
+ Scenario 1: One model is training, another model is being used for inference/logits to impact training in some form.
19
+ Scenario 2: Two models are training simultaneously, which means two optimizers, etc.
20
+ """
21
+
22
+ import argparse
23
+ from pathlib import Path
24
+
25
+ import evaluate
26
+ import torch
27
+ from datasets import load_dataset
28
+ from torch.optim import AdamW
29
+ from torch.utils.data import DataLoader
30
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
31
+
32
+ from accelerate import Accelerator, DeepSpeedPlugin, DistributedType
33
+ from accelerate.state import AcceleratorState
34
+ from accelerate.utils.deepspeed import get_active_deepspeed_plugin
35
+
36
+
37
+ EVAL_BATCH_SIZE = 16
38
+
39
+
40
class NoiseModel(torch.nn.Module):
    """Module that scales whatever it receives by a single `noise_factor` parameter."""

    def __init__(self, noise_factor=0.1):
        """Register `noise_factor` as a float32 `torch.nn.Parameter`."""
        super().__init__()
        initial_value = torch.tensor(noise_factor, dtype=torch.float32)
        self.noise_factor = torch.nn.Parameter(initial_value)

    def forward(self, loss):
        """Return `loss` multiplied by the `noise_factor` parameter."""
        scaled = loss * self.noise_factor
        return scaled
47
+
48
+
49
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16, model_name: str = "bert-base-cased"):
    """
    Create the train and validation `DataLoader`s for the GLUE MRPC dataset.

    Args:
        accelerator (`Accelerator`):
            Its `distributed_type` decides how batches are padded when they are collated.
        batch_size (`int`, *optional*, defaults to 16):
            Batch size of the train DataLoader; the validation DataLoader uses the
            module-level `EVAL_BATCH_SIZE` instead.
        model_name (`str`, *optional*, defaults to `"bert-base-cased"`):
            Identifier passed to `AutoTokenizer.from_pretrained`.

    Returns:
        `tuple`: `(train_dataloader, eval_dataloader)`.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    mrpc = load_dataset("glue", "mrpc")

    def tokenize_pairs(examples):
        # max_length=None => truncation uses the model max length (which is the default anyway)
        first, second = examples["sentence1"], examples["sentence2"]
        return tokenizer(first, second, truncation=True, max_length=None)

    # Tokenize all examples of all splits and drop the raw text columns.
    tokenized = mrpc.map(
        tokenize_pairs,
        batched=True,
        remove_columns=["idx", "sentence1", "sentence2"],
        load_from_cache_file=False,
    )
    # transformers models expect the targets under the name `labels`.
    tokenized = tokenized.rename_column("label", "labels")

    def collate_fn(examples):
        # Outside of TPU (XLA), padding to the longest sample of the batch is enough.
        if accelerator.distributed_type != DistributedType.XLA:
            return tokenizer.pad(examples, padding="longest", return_tensors="pt")
        # On TPU it's best to pad everything to the same length or training will be very slow.
        return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")

    loader_options = {"collate_fn": collate_fn}
    train_dataloader = DataLoader(tokenized["train"], shuffle=True, batch_size=batch_size, **loader_options)
    eval_dataloader = DataLoader(
        tokenized["validation"], shuffle=False, batch_size=EVAL_BATCH_SIZE, **loader_options
    )
    return train_dataloader, eval_dataloader
92
+
93
+
94
test_file_path = __file__
path = Path(test_file_path).resolve()

# Climb six directory levels up from this file.
# NOTE(review): presumably this lands on the repository root that holds `tests/deepspeed`
# in the upstream source layout - confirm for an installed (site-packages) copy.
ancestor = path
for _ in range(6):
    ancestor = ancestor.parent
test_file_dir_str = str(ancestor)

# Create our DS plugins
# We use custom schedulers and optimizers, hence `model_only`
ds_config_file = {
    stage: f"{test_file_dir_str}/tests/deepspeed/ds_config_{stage}_model_only.json"
    for stage in ("zero2", "zero3")
}
104
+
105
+
106
def single_model_training(config, args):
    """
    Trains one classifier under the ZeRO-2 plugin while a `NoiseModel`, prepared under the ZeRO-3 plugin and never
    given an optimizer, contributes an extra term to the loss.

    Args:
        config (`dict`):
            Hyper-parameters. The keys `num_epochs`, `batch_size` and `lr` are read.
        args (`argparse.Namespace`):
            Parsed command line arguments. Only `model_name_or_path` is read.
    """
    num_epochs = config["num_epochs"]
    zero2_plugin = DeepSpeedPlugin(hf_ds_config=ds_config_file["zero2"])
    zero3_plugin = DeepSpeedPlugin(hf_ds_config=ds_config_file["zero3"])

    # Initialize accelerator with both plugins registered under the names used by `select_deepspeed_plugin` below
    accelerator = Accelerator(
        deepspeed_plugins={"training": zero2_plugin, "inference": zero3_plugin},
        mixed_precision="bf16",
    )

    # Initialize model under zero2 plugin
    assert get_active_deepspeed_plugin(accelerator.state) is zero2_plugin
    train_model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
    train_dataloader, eval_dataloader = get_dataloaders(
        accelerator, batch_size=config["batch_size"], model_name=args.model_name_or_path
    )
    max_training_steps = len(train_dataloader) * num_epochs
    optimizer = AdamW(train_model.parameters(), lr=config["lr"])
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=max_training_steps
    )

    train_dataloader, eval_dataloader, train_model, optimizer, lr_scheduler = accelerator.prepare(
        train_dataloader, eval_dataloader, train_model, optimizer, lr_scheduler
    )

    # Now prepare the noise model under zero3 plugin
    accelerator.state.select_deepspeed_plugin("inference")
    assert get_active_deepspeed_plugin(accelerator.state) is zero3_plugin
    inference_model = accelerator.prepare(NoiseModel())
    inference_model.eval()

    # Switch back to the training plugin before running the training loop
    accelerator.state.select_deepspeed_plugin("training")

    # Now we train the model
    best_performance = 0
    metric = evaluate.load("glue", "mrpc")
    performance_metric = {}
    for epoch in range(num_epochs):
        train_model.train()
        inference_model.train()
        for batch in train_dataloader:
            with accelerator.accumulate(train_model):
                outputs_1 = train_model(**batch)
                # The noise model only perturbs the loss, so no gradient is tracked through it
                with torch.no_grad():
                    outputs_2 = inference_model(outputs_1.loss)
                # Combine the losses
                loss = outputs_1.loss + outputs_2
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

        train_model.eval()
        for batch in eval_dataloader:
            with torch.no_grad():
                outputs = train_model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            # It is slightly faster to call this once, than multiple times
            predictions, references = accelerator.gather_for_metrics((predictions, batch["labels"]))
            metric.add_batch(predictions=predictions, references=references)

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
        accuracy = eval_metric["accuracy"]
        performance_metric[f"epoch-{epoch}"] = accuracy
        best_performance = max(best_performance, accuracy)

    # The best accuracy seen must strictly beat the accuracy measured after the first epoch
    assert best_performance > performance_metric["epoch-0"]
187
+
188
+
189
def multiple_model_training(config, args):
    """
    Trains two copies of the same classifier side by side on identical batches, one prepared under the ZeRO-2 plugin
    and one under the ZeRO-3 plugin, each through its own `Accelerator`.

    Args:
        config (`dict`):
            Hyper-parameters. The keys `num_epochs`, `batch_size` and `lr` are read.
        args (`argparse.Namespace`):
            Parsed command line arguments. Only `model_name_or_path` is read.
    """
    # This will essentially be like a k-fold model, but one model is Zero-2 and another model is Zero-3
    num_epochs = config["num_epochs"]
    learning_rate = config["lr"]
    zero2_plugin = DeepSpeedPlugin(hf_ds_config=ds_config_file["zero2"])
    zero3_plugin = DeepSpeedPlugin(hf_ds_config=ds_config_file["zero3"])

    # Initialize accelerator
    zero2_accelerator = Accelerator(
        deepspeed_plugins={"zero2": zero2_plugin, "zero3": zero3_plugin},
        mixed_precision="bf16",
    )

    # Since an `AcceleratorState` has already been made, we can just reuse it here
    zero3_accelerator = Accelerator()

    # Initialize model under zero2 plugin
    assert get_active_deepspeed_plugin(zero2_accelerator.state) is zero2_plugin
    zero2_model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
    train_dataloader, eval_dataloader = get_dataloaders(
        zero2_accelerator, batch_size=config["batch_size"], model_name=args.model_name_or_path
    )
    max_training_steps = len(train_dataloader) * num_epochs
    zero2_optimizer = AdamW(zero2_model.parameters(), lr=learning_rate)
    zero2_lr_scheduler = get_linear_schedule_with_warmup(
        zero2_optimizer, num_warmup_steps=0, num_training_steps=max_training_steps
    )

    train_dataloader, eval_dataloader, zero2_model, zero2_optimizer, zero2_lr_scheduler = zero2_accelerator.prepare(
        train_dataloader, eval_dataloader, zero2_model, zero2_optimizer, zero2_lr_scheduler
    )
    assert zero2_accelerator.deepspeed_engine_wrapped.engine is zero2_model

    # now do Zero3
    zero3_accelerator.state.select_deepspeed_plugin("zero3")
    # No dataloader is prepared with the zero3 accelerator, so copy the micro batch size over from the zero2 config
    micro_batch_key = "train_micro_batch_size_per_gpu"
    zero3_plugin.deepspeed_config[micro_batch_key] = zero2_plugin.deepspeed_config[micro_batch_key]
    assert get_active_deepspeed_plugin(zero3_accelerator.state) is zero3_plugin
    zero3_model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
    zero3_optimizer = AdamW(zero3_model.parameters(), lr=learning_rate)
    zero3_lr_scheduler = get_linear_schedule_with_warmup(
        zero3_optimizer, num_warmup_steps=0, num_training_steps=max_training_steps
    )
    zero3_model, zero3_optimizer, zero3_lr_scheduler = zero3_accelerator.prepare(
        zero3_model, zero3_optimizer, zero3_lr_scheduler
    )
    assert zero3_accelerator.deepspeed_engine_wrapped.engine is zero3_model

    # Now we train the models
    best_performance_a = 0
    best_performance_b = 0
    metric_a = evaluate.load("glue", "mrpc")
    metric_b = evaluate.load("glue", "mrpc")
    performance_metric_a = {}
    performance_metric_b = {}
    for epoch in range(num_epochs):
        zero2_model.train()
        zero3_model.train()
        for batch in train_dataloader:
            with zero2_accelerator.accumulate(zero2_model, zero3_model):
                # Full optimization step for the zero2 model ...
                outputs_1 = zero2_model(**batch)
                zero2_accelerator.backward(outputs_1.loss)
                zero2_optimizer.step()
                zero2_lr_scheduler.step()
                zero2_optimizer.zero_grad()
                # ... followed by a full optimization step for the zero3 model on the same batch
                outputs_2 = zero3_model(**batch)
                zero3_accelerator.backward(outputs_2.loss)
                zero3_optimizer.step()
                zero3_lr_scheduler.step()
                zero3_optimizer.zero_grad()

        zero2_model.eval()
        zero3_model.eval()
        for batch in eval_dataloader:
            with torch.no_grad():
                logits_a = zero2_model(**batch).logits
                logits_b = zero3_model(**batch).logits
            # Turn the logits of each model into class predictions
            predictions_a = logits_a.argmax(dim=-1)
            predictions_b = logits_b.argmax(dim=-1)
            # It is slightly faster to call this once, than multiple times
            predictions_a, predictions_b, references = zero2_accelerator.gather_for_metrics(
                (predictions_a, predictions_b, batch["labels"])
            )
            metric_a.add_batch(predictions=predictions_a, references=references)
            metric_b.add_batch(predictions=predictions_b, references=references)

        eval_metric_a = metric_a.compute()
        eval_metric_b = metric_b.compute()
        # Use accelerator.print to print only on the main process.
        zero2_accelerator.print(f"epoch {epoch}:", eval_metric_a, eval_metric_b)
        accuracy_a = eval_metric_a["accuracy"]
        accuracy_b = eval_metric_b["accuracy"]
        performance_metric_a[f"epoch-{epoch}"] = accuracy_a
        performance_metric_b[f"epoch-{epoch}"] = accuracy_b

        best_performance_a = max(best_performance_a, accuracy_a)
        best_performance_b = max(best_performance_b, accuracy_b)

    # Both models must end up strictly better than they were after their first epoch
    assert best_performance_a > performance_metric_a["epoch-0"]
    assert best_performance_b > performance_metric_b["epoch-0"]
300
+
301
+
302
def main():
    """Parses the command line, then runs the single-model and the multiple-model scenario one after the other."""
    parser = argparse.ArgumentParser(description="Simple example of training script tracking peak GPU memory usage.")
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="bert-base-cased",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=False,
    )
    parser.add_argument(
        "--performance_lower_bound",
        type=float,
        default=None,
        help="Optional lower bound for the performance metric. If set, the training will throw error when the performance metric drops below this value.",
    )
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=3,
        help="Number of train epochs.",
    )
    args = parser.parse_args()

    config = {"lr": 2e-5, "num_epochs": args.num_epochs, "seed": 42, "batch_size": 8}
    single_model_training(config, args)
    # Reset the shared accelerator state so the second scenario can build its own accelerators from scratch
    AcceleratorState._reset_state(True)
    multiple_model_training(config, args)


if __name__ == "__main__":
    main()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_metrics.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import logging
16
+ import math
17
+ import os
18
+ from copy import deepcopy
19
+
20
+ import datasets
21
+ import evaluate
22
+ import torch
23
+ import transformers
24
+ from datasets import load_dataset
25
+ from torch.utils.data import DataLoader, IterableDataset
26
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
27
+
28
+ from accelerate import Accelerator, DataLoaderConfiguration, DistributedType
29
+ from accelerate.data_loader import DataLoaderDispatcher
30
+ from accelerate.test_utils import RegressionDataset, RegressionModel, torch_device
31
+ from accelerate.utils import is_torch_xla_available, set_seed
32
+
33
+
34
+ os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
35
+
36
+
37
class ListHandler(logging.Handler):
    """Logging handler that stores every record it is asked to emit in the list `self.logs`."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Records received so far, in arrival order
        self.logs = []

    def emit(self, record):
        """Keep `record` in memory instead of writing it anywhere."""
        self.logs.append(record)
44
+
45
+
46
def get_basic_setup(accelerator, num_samples=82, batch_size=16):
    """
    Builds the objects needed for a basic regression run.

    Args:
        accelerator (`Accelerator`):
            Accelerator used to place the plain model and to prepare the copy and the dataloader.
        num_samples (`int`, *optional*, defaults to 82):
            Length of the `RegressionDataset`.
        batch_size (`int`, *optional*, defaults to 16):
            Batch size of the dataloader before preparation.

    Returns:
        `tuple`: the plain model moved to `accelerator.device`, the prepared deep copy of that model, and the
        prepared dataloader.
    """
    set_seed(42)
    model = RegressionModel()
    # Copy before anything is moved or wrapped so both models start from the same weights
    ddp_model = deepcopy(model)
    dataloader = DataLoader(RegressionDataset(length=num_samples), batch_size=batch_size)
    model.to(accelerator.device)
    ddp_model, dataloader = accelerator.prepare(ddp_model, dataloader)
    return model, ddp_model, dataloader
56
+
57
+
58
def get_dataloader(accelerator: Accelerator, use_longest=False):
    """
    Builds an unshuffled `DataLoader` (batch size 16) over the tokenized validation split of GLUE MRPC.

    Args:
        accelerator (`Accelerator`):
            Used so that the main process runs the tokenization `map` first.
        use_longest (`bool`, *optional*, defaults to `False`):
            Pad each batch to its longest sequence when `True`, otherwise pad every batch to 128 tokens.
    """
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/mrpc-bert-base-cased")
    dataset = load_dataset("glue", "mrpc", split="validation")

    def tokenize_function(examples):
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    with accelerator.main_process_first():
        tokenized_datasets = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=["idx", "sentence1", "sentence2"],
        )

    # The models expect the target column to be called `labels`
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    def collate_fn(examples):
        if not use_longest:
            return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")
        return tokenizer.pad(examples, padding="longest", return_tensors="pt")

    return DataLoader(tokenized_datasets, shuffle=False, collate_fn=collate_fn, batch_size=16)
81
+
82
+
83
def get_mrpc_setup(dispatch_batches, split_batches):
    """
    Creates an accelerator with the requested dataloader configuration plus a baseline and a prepared MRPC setup.

    Returns:
        `tuple`: a dict with the keys `"ddp"` (prepared model, prepared dataloader, `torch_device`) and `"no"`
        (original model, original dataloader, `accelerator.device`), followed by the accelerator itself.
    """
    accelerator = Accelerator(
        dataloader_config=DataLoaderConfiguration(dispatch_batches=dispatch_batches, split_batches=split_batches)
    )
    # Pad to the longest sequence of each batch only when batches are not dispatched
    dataloader = get_dataloader(accelerator, not dispatch_batches)
    model = AutoModelForSequenceClassification.from_pretrained(
        "hf-internal-testing/mrpc-bert-base-cased", return_dict=True
    )
    ddp_model, ddp_dataloader = accelerator.prepare(model, dataloader)
    setup = {
        "ddp": [ddp_model, ddp_dataloader, torch_device],
        "no": [model, dataloader, accelerator.device],
    }
    return setup, accelerator
95
+
96
+
97
def generate_predictions(model, dataloader, accelerator):
    """
    Runs `model` over every batch of `dataloader` without tracking gradients and gathers the results.

    Each batch must be a mapping with exactly two values: the model input followed by the target.

    Returns:
        `tuple`: the concatenated gathered outputs and the concatenated gathered targets.
    """
    gathered_logits, gathered_targets = [], []
    for batch in dataloader:
        features, target = batch.values()
        with torch.no_grad():
            logit = model(features)
            logit, target = accelerator.gather_for_metrics((logit, target))
        gathered_logits.append(logit)
        gathered_targets.append(target)
    return torch.cat(gathered_logits), torch.cat(gathered_targets)
111
+
112
+
113
def test_torch_metrics(
    accelerator: Accelerator, num_samples=82, dispatch_batches=False, split_batches=False, batch_size=16
):
    """
    Checks that gathering the predictions of a prepared regression model yields exactly `num_samples` items.

    `dispatch_batches` and `split_batches` are accepted but not read by this function.
    """
    _, ddp_model, dataloader = get_basic_setup(accelerator, num_samples, batch_size)
    logits = generate_predictions(ddp_model, dataloader, accelerator)[0]
    assert len(logits) == num_samples, (
        f"Unexpected number of inputs:\n Expected: {num_samples}\n Actual: {len(logits)}"
    )
121
+
122
+
123
def test_mrpc(dispatch_batches: bool = False, split_batches: bool = False):
    """
    Compares the MRPC accuracy and F1 of a plain evaluation loop against a loop run on the prepared model and
    dataloader with `gather_for_metrics`; both must agree.
    """
    metric = evaluate.load("glue", "mrpc")
    setup, accelerator = get_mrpc_setup(dispatch_batches, split_batches)

    # First do baseline
    model, dataloader, device = setup["no"]
    model.to(device)
    model.eval()
    for batch in dataloader:
        batch.to(device)
        with torch.inference_mode():
            outputs = model(**batch)
        metric.add_batch(predictions=outputs.logits.argmax(dim=-1), references=batch["labels"])
    baseline = metric.compute()

    # Then do distributed
    model, dataloader, device = setup["ddp"]
    model.eval()
    for batch in dataloader:
        with torch.inference_mode():
            outputs = model(**batch)
        preds, references = accelerator.gather_for_metrics((outputs.logits.argmax(dim=-1), batch["labels"]))
        metric.add_batch(predictions=preds, references=references)
    distributed = metric.compute()

    for key in ("accuracy", "f1"):
        assert math.isclose(baseline[key], distributed[key]), (
            f"Baseline and Distributed are not the same for key {key}:\n\tBaseline: {baseline[key]}\n\tDistributed: {distributed[key]}\n"
        )
154
+
155
+
156
def test_gather_for_metrics_with_non_tensor_objects_iterable_dataset():
    """
    Iterates a prepared dataloader over an iterable dataset of 30 plain Python integers and checks that
    `gather_for_metrics` returns all 30 items without anything being logged by `accelerate.accelerator`.
    """

    class DummyIterableDataset(IterableDataset):
        def __init__(self, data):
            self.data = data

        def __len__(self):
            return len(self.data)

        def __iter__(self):
            yield from self.data

    dataloader = DataLoader(DummyIterableDataset(list(range(30))), batch_size=4)
    accelerator = Accelerator()
    prepared_dataloader = accelerator.prepare(dataloader)

    # Only the main process records what the accelerator logger emits
    if accelerator.is_main_process:
        logger = logging.root.manager.loggerDict["accelerate.accelerator"]
        list_handler = ListHandler()
        logger.addHandler(list_handler)

    batches_for_metrics = [accelerator.gather_for_metrics(batch) for batch in prepared_dataloader]

    assert torch.cat(batches_for_metrics).size(0) == 30

    if accelerator.is_main_process:
        assert len(list_handler.logs) == 0
        logger.removeHandler(list_handler)
186
+
187
+
188
def test_gather_for_metrics_with_iterable_dataset():
    """
    Iterates a prepared dataloader over an iterable dataset holding a 30 element tensor and checks that the
    dataloader became a `DataLoaderDispatcher`, that `gather_for_metrics` returns all 30 items and that nothing was
    logged by `accelerate.accelerator`.
    """

    class DummyIterableDataset(IterableDataset):
        def __init__(self, data):
            self.data = data

        def __len__(self):
            return len(self.data)

        def __iter__(self):
            yield from self.data

    dataloader = DataLoader(DummyIterableDataset(torch.as_tensor(range(30))), batch_size=4)

    accelerator = Accelerator()
    prepared_dataloader = accelerator.prepare(dataloader)

    assert isinstance(prepared_dataloader, DataLoaderDispatcher)

    # Only the main process records what the accelerator logger emits
    if accelerator.is_main_process:
        logger = logging.root.manager.loggerDict["accelerate.accelerator"]
        list_handler = ListHandler()
        logger.addHandler(list_handler)

    batches_for_metrics = [accelerator.gather_for_metrics(batch) for batch in prepared_dataloader]

    assert torch.cat(batches_for_metrics).size(0) == 30

    if accelerator.is_main_process:
        assert len(list_handler.logs) == 0

        # `logger` and `list_handler` only exist on the main process, so detach the handler here
        logger.removeHandler(list_handler)
222
+
223
+
224
def test_gather_for_metrics_drop_last():
    """
    Checks that gathering the second batch of a `drop_last=True` dataloader returns one full per-device batch from
    every process.
    """
    accelerator = Accelerator()
    per_device_batch_size = 5
    # Two full batches per process plus one left-over item that `drop_last` must discard
    num_items = (10 * accelerator.num_processes) + 1
    dataloader = accelerator.prepare(
        DataLoader(range(num_items), batch_size=per_device_batch_size, drop_last=True)
    )

    iterator = iter(dataloader)
    next(iterator)  # Skip first batch tensor([0, 1, 2, 3, 4], device='cuda:0')
    gathered_items = accelerator.gather_for_metrics(next(iterator))

    # Should return a full set of complete batches from each GPU
    num_expected_items = per_device_batch_size * accelerator.num_processes
    assert gathered_items.size(0) == (num_expected_items), (
        f"Expected number of items: {num_expected_items}, Actual: {gathered_items.size(0)}"
    )
241
+
242
+
243
def main():
    """Runs every `gather_for_metrics` check of this script, skipping the ones that do not apply to the device."""
    accelerator = Accelerator(
        dataloader_config=DataLoaderConfiguration(split_batches=False, dispatch_batches=False)
    )
    # Keep library logging quiet everywhere except on the local main process
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_warning()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # TorchXLA does not support batch dispatching. 'put_on_device' is always False for
    # TorchXLA, which can cause a value error in 'prepare_data_loader' function.
    if accelerator.state.distributed_type == DistributedType.XLA:
        dispatch_batches_options = [False]
    else:
        dispatch_batches_options = [True, False]

    # Temporarily close this test for TorchXLA due to the 'Cannot set version_counter for
    # inference tensor' error in inference mode. Reopen it after TorchXLA fixes this bug.
    # These are a bit slower so they should only be run on the GPU or TPU
    if accelerator.device.type != "cpu" and not is_torch_xla_available():
        if accelerator.is_local_main_process:
            print("**Testing gather_for_metrics**")
        for split_batches in [True, False]:
            for dispatch_batches in dispatch_batches_options:
                if accelerator.is_local_main_process:
                    print(f"With: `split_batches={split_batches}`, `dispatch_batches={dispatch_batches}`")
                test_mrpc(dispatch_batches, split_batches)
                accelerator.state._reset_state()
        print("test_gather_for_metrics_with_iterable_dataset")
        test_gather_for_metrics_with_iterable_dataset()
        print("test gather_for_metrics_with_non_tensor_objects_iterable_dataset")
        test_gather_for_metrics_with_non_tensor_objects_iterable_dataset()

    # MpDeviceLoader in TorchXLA is an asynchronous loader that preloads several batches into cache.
    # This can cause the 'end_of_dataloader' of DataLoaderStateMixin to be set earlier than intended.
    # Skip this test when TorchXLA is enabled.
    if accelerator.state.distributed_type != DistributedType.XLA:
        if accelerator.is_local_main_process:
            print("**Test torch metrics**")
        for split_batches in [True, False]:
            for dispatch_batches in dispatch_batches_options:
                # A fresh accelerator is built for every combination because the state is reset after each run
                dataloader_config = DataLoaderConfiguration(
                    split_batches=split_batches, dispatch_batches=dispatch_batches
                )
                accelerator = Accelerator(dataloader_config=dataloader_config)
                if accelerator.is_local_main_process:
                    print(f"With: `split_batches={split_batches}`, `dispatch_batches={dispatch_batches}`, length=99")
                test_torch_metrics(accelerator, 99)
                accelerator.state._reset_state()

    if accelerator.is_local_main_process:
        print("**Test last batch is not dropped when perfectly divisible**")
    accelerator = Accelerator()
    test_torch_metrics(accelerator, 512)
    accelerator.state._reset_state()

    if accelerator.is_local_main_process:
        print("**Test that `drop_last` is taken into account**")
    test_gather_for_metrics_drop_last()
    accelerator.end_training()
    accelerator.state._reset_state()
299
+
300
+
301
def _mp_fn(index):
    """Entry point for xla_spawn (TPUs); `index` is accepted but not used."""
    main()


if __name__ == "__main__":
    main()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_peak_memory_usage.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import argparse
15
+ import gc
16
+ import json
17
+ import os
18
+
19
+ import torch
20
+ from datasets import load_dataset
21
+ from torch.optim import AdamW
22
+ from torch.utils.data import DataLoader
23
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
24
+
25
+ from accelerate import Accelerator, DistributedType
26
+ from accelerate.utils import (
27
+ is_hpu_available,
28
+ is_mlu_available,
29
+ is_musa_available,
30
+ is_npu_available,
31
+ is_sdaa_available,
32
+ is_xpu_available,
33
+ )
34
+ from accelerate.utils.deepspeed import DummyOptim, DummyScheduler
35
+
36
+
37
+ MAX_GPU_BATCH_SIZE = 16
38
+ EVAL_BATCH_SIZE = 32
39
+
40
+
41
+ # Converting Bytes to Megabytes
42
def b2mb(x):
    """Convert a byte count to whole megabytes (1 MB = 2**20 bytes), truncating toward zero."""
    bytes_per_megabyte = 2**20
    return int(x / bytes_per_megabyte)
44
+
45
+
46
+ # This context manager is used to track the peak memory usage of the process
47
class TorchTracemalloc:
    """
    Context manager that records device memory statistics around the wrapped block.

    The first available backend among CUDA, MLU, SDAA, MUSA, NPU, XPU and HPU is used. After the block has exited,
    the following attributes are set:

        - `begin`: bytes allocated when the block was entered
        - `end`: bytes allocated when the block was left
        - `peak`: value of the backend's maximum-allocated counter, which is reset on entry
        - `used`: `end - begin`, in megabytes
        - `peaked`: `peak - begin`, in megabytes

    If none of the backends is available, no attribute is set on entry and `__exit__` raises `AttributeError`.
    """

    def __enter__(self):
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.cuda.memory_allocated()
        elif is_mlu_available():
            torch.mlu.empty_cache()
            torch.mlu.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.mlu.memory_allocated()
        elif is_sdaa_available():
            torch.sdaa.empty_cache()
            torch.sdaa.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.sdaa.memory_allocated()
        elif is_musa_available():
            torch.musa.empty_cache()
            torch.musa.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.musa.memory_allocated()
        elif is_npu_available():
            torch.npu.empty_cache()
            torch.npu.reset_max_memory_allocated()  # reset the peak gauge to zero
            self.begin = torch.npu.memory_allocated()
        elif is_xpu_available():
            torch.xpu.empty_cache()
            torch.xpu.reset_peak_memory_stats()  # reset the peak gauge to zero
            self.begin = torch.xpu.memory_allocated()
        elif is_hpu_available():
            # torch.hpu.empty_cache() # not available on hpu as it reserves all device memory for the current process
            torch.hpu.reset_peak_memory_stats()  # reset the peak gauge to zero
            self.begin = torch.hpu.memory_allocated()
        return self

    def __exit__(self, *exc):
        gc.collect()
        # Every branch must fill in `self.end` and `self.peak` and leave `self.begin` (the baseline taken in
        # `__enter__`) untouched, otherwise the deltas computed at the bottom are wrong or cannot be computed.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            self.end = torch.cuda.memory_allocated()
            self.peak = torch.cuda.max_memory_allocated()
        elif is_mlu_available():
            torch.mlu.empty_cache()
            self.end = torch.mlu.memory_allocated()
            self.peak = torch.mlu.max_memory_allocated()
        elif is_sdaa_available():
            torch.sdaa.empty_cache()
            self.end = torch.sdaa.memory_allocated()
            self.peak = torch.sdaa.max_memory_allocated()
        elif is_musa_available():
            torch.musa.empty_cache()
            self.end = torch.musa.memory_allocated()
            self.peak = torch.musa.max_memory_allocated()
        elif is_npu_available():
            torch.npu.empty_cache()
            self.end = torch.npu.memory_allocated()
            self.peak = torch.npu.max_memory_allocated()
        elif is_xpu_available():
            torch.xpu.empty_cache()
            self.end = torch.xpu.memory_allocated()
            self.peak = torch.xpu.max_memory_allocated()
        elif is_hpu_available():
            # torch.hpu.empty_cache() # not available on hpu as it reserves all device memory for the current process
            self.end = torch.hpu.memory_allocated()
            self.peak = torch.hpu.max_memory_allocated()
        self.used = b2mb(self.end - self.begin)
        self.peaked = b2mb(self.peak - self.begin)
        # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")
113
+
114
+
115
def get_dataloaders(
    accelerator: Accelerator,
    batch_size: int = 16,
    model_name: str = "bert-base-cased",
    n_train: int = 320,
    n_val: int = 160,
):
    """
    Builds the train and validation `DataLoader`s for a slice of the GLUE MRPC dataset.

    Args:
        accelerator (`Accelerator`):
            Its `distributed_type` decides how batches are padded.
        batch_size (`int`, *optional*):
            Batch size of the train DataLoader. The validation DataLoader always uses `EVAL_BATCH_SIZE`.
        model_name (`str`, *optional*):
            Name or path of the tokenizer to load.
        n_train (`int`, *optional*):
            Number of leading training examples to keep.
        n_val (`int`, *optional*):
            Number of leading validation examples to keep.

    Returns:
        `tuple`: the shuffled train DataLoader and the unshuffled validation DataLoader.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    splits = {"train": f"train[:{n_train}]", "validation": f"validation[:{n_val}]"}
    datasets = load_dataset("glue", "mrpc", split=splits)

    def tokenize_function(examples):
        # max_length=None => use the model max length (it's actually the default)
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every example of every split, then rename the 'label' column to 'labels', which is the name the
    # models of the transformers library expect
    tokenized_datasets = datasets.map(
        tokenize_function, batched=True, remove_columns=["idx", "sentence1", "sentence2"], load_from_cache_file=False
    ).rename_column("label", "labels")

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
        if accelerator.distributed_type != DistributedType.XLA:
            return tokenizer.pad(examples, padding="longest", return_tensors="pt")
        return tokenizer.pad(examples, padding="max_length", max_length=128, return_tensors="pt")

    # Instantiate dataloaders.
    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
    )
    return train_dataloader, eval_dataloader
171
+
172
+
173
def training_function(config, args):
    """
    Trains a sequence classifier and records the total peak device memory of every epoch.

    The per-epoch peaks (in MB) are written by the main process to `peak_memory_utilization.json` inside
    `args.output_dir`. When `args.peak_memory_upper_bound` is set, each epoch's peak is asserted against it.

    Args:
        config (`dict`):
            Hyper-parameters. The keys `lr`, `num_epochs`, `seed` and `batch_size` are read.
        args (`argparse.Namespace`):
            Parsed command line arguments. `model_name_or_path`, `n_train`, `n_val`, `peak_memory_upper_bound` and
            `output_dir` are read.
    """
    # Initialize accelerator
    accelerator = Accelerator()

    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])
    model_name = args.model_name_or_path

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size, model_name, args.n_train, args.n_val)

    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True)

    # Instantiate optimizer: a real AdamW unless the DeepSpeed config already declares an optimizer
    if (
        accelerator.state.deepspeed_plugin is not None
        and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config
    ):
        optimizer_cls = DummyOptim
    else:
        optimizer_cls = AdamW
    optimizer = optimizer_cls(params=model.parameters(), lr=lr)

    # Without DeepSpeed there is no gradient accumulation
    if accelerator.state.deepspeed_plugin is None:
        gradient_accumulation_steps = 1
    else:
        gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[
            "gradient_accumulation_steps"
        ]
    max_training_steps = (len(train_dataloader) * num_epochs) // gradient_accumulation_steps

    # Instantiate scheduler: a real linear schedule unless the DeepSpeed config already declares a scheduler
    if (
        accelerator.state.deepspeed_plugin is not None
        and "scheduler" in accelerator.state.deepspeed_plugin.deepspeed_config
    ):
        lr_scheduler = DummyScheduler(optimizer, total_num_steps=max_training_steps, warmup_num_steps=0)
    else:
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=max_training_steps,
        )

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # Now we train the model
    train_total_peak_memory = {}
    for epoch in range(num_epochs):
        with TorchTracemalloc() as tracemalloc:
            model.train()
            for step, batch in enumerate(train_dataloader):
                loss = model(**batch).loss / gradient_accumulation_steps
                accelerator.backward(loss)
                if step % gradient_accumulation_steps == 0:
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()

        # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
        memory_at_start = b2mb(tracemalloc.begin)
        total_peak = tracemalloc.peaked + memory_at_start
        accelerator.print(f"Memory before entering the train : {memory_at_start}")
        accelerator.print(f"Memory consumed at the end of the train (end-begin): {tracemalloc.used}")
        accelerator.print(f"Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}")
        accelerator.print(f"Total Peak Memory consumed during the train (max): {total_peak}")
        train_total_peak_memory[f"epoch-{epoch}"] = total_peak
        if args.peak_memory_upper_bound is not None:
            assert train_total_peak_memory[f"epoch-{epoch}"] <= args.peak_memory_upper_bound, (
                "Peak memory usage exceeded the upper bound"
            )

    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        with open(os.path.join(args.output_dir, "peak_memory_utilization.json"), "w") as f:
            json.dump(train_total_peak_memory, f)
    accelerator.end_training()
267
+
268
+
269
def main():
    """Parses the command line and launches `training_function` with a fixed set of hyper-parameters."""
    parser = argparse.ArgumentParser(description="Simple example of training script tracking peak GPU memory usage.")
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="bert-base-cased",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=False,
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=".",
        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
    )
    parser.add_argument(
        "--peak_memory_upper_bound",
        type=float,
        default=None,
        help="The upper bound of peak memory usage in MB. If set, the training will throw an error if the peak memory usage exceeds this value.",
    )
    parser.add_argument(
        "--n_train",
        type=int,
        default=320,
        help="Number of training examples to use.",
    )
    parser.add_argument(
        "--n_val",
        type=int,
        default=160,
        help="Number of validation examples to use.",
    )
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=1,
        help="Number of train epochs.",
    )
    args = parser.parse_args()

    # Only the number of epochs comes from the command line; the other hyper-parameters are fixed
    config = {"lr": 2e-5, "num_epochs": args.num_epochs, "seed": 42, "batch_size": 16}
    training_function(config, args)


if __name__ == "__main__":
    main()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_performance.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import argparse
15
+ import json
16
+ import os
17
+ from contextlib import nullcontext
18
+ from pathlib import Path
19
+
20
+ import evaluate
21
+ import torch
22
+ from datasets import load_dataset
23
+ from torch.optim import AdamW
24
+ from torch.utils.data import DataLoader
25
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
26
+
27
+ from accelerate import Accelerator, DistributedType
28
+ from accelerate.parallelism_config import ParallelismConfig
29
+ from accelerate.utils import SAFE_WEIGHTS_NAME, set_seed
30
+ from accelerate.utils.deepspeed import DummyOptim, DummyScheduler
31
+
32
+
33
# NOTE(review): not referenced anywhere else in this script - confirm whether it is still needed.
MAX_GPU_BATCH_SIZE = 16
# Batch size of the validation DataLoader built in `get_dataloaders`.
EVAL_BATCH_SIZE = 32
35
+
36
+
37
def get_dataloaders(accelerator: Accelerator, batch_size: int = 16, model_name: str = "bert-base-cased"):
    """
    Builds the train and validation `DataLoader`s for the `mrpc` configuration of the `glue` dataset.

    Args:
        accelerator (`Accelerator`):
            The `Accelerator` whose `distributed_type` selects the padding strategy used when collating.
        batch_size (`int`, *optional*):
            The batch size of the train DataLoader (the validation DataLoader uses `EVAL_BATCH_SIZE`).
        model_name (`str`, *optional*):
            The name or path handed to `AutoTokenizer.from_pretrained`.

    Returns:
        A `(train_dataloader, eval_dataloader)` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    raw_splits = load_dataset("glue", "mrpc")

    def tokenize_function(examples):
        # `max_length=None` lets the tokenizer fall back to the model max length.
        return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None)

    # Tokenize every split, drop the raw text columns, and expose the target under `labels`,
    # which is the column name the transformers models expect.
    encoded_splits = raw_splits.map(
        tokenize_function,
        batched=True,
        remove_columns=["idx", "sentence1", "sentence2"],
        load_from_cache_file=False,
    ).rename_column("label", "labels")

    def collate_fn(examples):
        # On TPU a fixed length is used for every batch; otherwise pad to the longest example.
        if accelerator.distributed_type == DistributedType.XLA:
            pad_options = {"padding": "max_length", "max_length": 128}
        else:
            pad_options = {"padding": "longest"}
        return tokenizer.pad(examples, return_tensors="pt", **pad_options)

    train_dataloader = DataLoader(
        encoded_splits["train"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size
    )
    eval_dataloader = DataLoader(
        encoded_splits["validation"], shuffle=False, collate_fn=collate_fn, batch_size=EVAL_BATCH_SIZE
    )
    return train_dataloader, eval_dataloader
81
+
82
+
83
def training_function(config, args):
    """
    Train and evaluate a sequence-classification model, asserting on the learning-rate schedule and on accuracy.

    Args:
        config (`dict`):
            Hyper-parameters; the keys `lr`, `num_epochs`, `seed` and `batch_size` are read.
        args:
            Parsed command-line arguments; the attributes `tp_size`, `tp_plan`, `model_name_or_path`,
            `add_pad_token`, `performance_lower_bound` and `output_dir` are read.

    Raises:
        AssertionError: If the learning rate after the first optimizer step or after training is not the
            expected value, if the best accuracy is below `args.performance_lower_bound`, or if the saved
            weights file is missing.
    """
    accelerator_kwargs = {}
    # need this for DeepSpeed tests as `args.tp_size` would be None and `torch.distributed.init_device_mesh` would fail
    if args.tp_size is not None:
        accelerator_kwargs["parallelism_config"] = ParallelismConfig(tp_size=args.tp_size)

    # Initialize accelerator
    accelerator = Accelerator(**accelerator_kwargs)

    # Sample hyper-parameters for learning rate, batch size, seed and a few other HPs
    lr = config["lr"]
    num_epochs = int(config["num_epochs"])
    seed = int(config["seed"])
    batch_size = int(config["batch_size"])
    model_name = args.model_name_or_path

    set_seed(seed)
    train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size, model_name)

    # Add TP related kwargs if provided
    model_kwargs = {}
    if args.tp_plan is not None:
        model_kwargs["tp_plan"] = args.tp_plan
    if args.tp_size is not None:
        model_kwargs["tp_size"] = args.tp_size

    # Instantiate the model (we build the model here so that the seed also control new weights initialization)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, **model_kwargs)

    # Only fills in a pad token id when requested and when the model config has none.
    if args.add_pad_token:
        if model.config.pad_token_id is None:
            model.config.pad_token_id = 0

    # Instantiate optimizer
    # `DummyOptim` is used only when a DeepSpeed plugin is active and its config has an "optimizer" entry.
    optimizer_cls = (
        AdamW
        if accelerator.state.deepspeed_plugin is None
        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
        else DummyOptim
    )
    optimizer = optimizer_cls(params=model.parameters(), lr=lr)

    # Computed before `accelerator.prepare`, i.e. from the unprepared train dataloader length.
    max_training_steps = len(train_dataloader) * num_epochs

    # Instantiate scheduler
    # `linear_decay_scheduler` records whether the learning-rate assertions below apply.
    linear_decay_scheduler = False
    if (
        accelerator.state.deepspeed_plugin is None
        or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
    ):
        lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=max_training_steps,
        )
        linear_decay_scheduler = True
    else:
        lr_scheduler = DummyScheduler(optimizer, total_num_steps=max_training_steps, warmup_num_steps=0)

    # Prepare everything
    # There is no specific order to remember, we just need to unpack the objects in the same order we gave them to the
    # prepare method.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # We also need to keep track of the starting epoch so files are named properly
    starting_epoch = 0

    # Now we train the model
    metric = evaluate.load("glue", "mrpc")
    best_performance = 0
    performance_metric = {}
    # Learning rate expected after one optimizer step, with the total step count divided by the
    # number of processes and by the gradient accumulation steps.
    expected_lr_after_first_optim_step = lr * (
        1 - 1 / (max_training_steps / accelerator.num_processes / accelerator.gradient_accumulation_steps)
    )
    lr_scheduler_check_completed = False
    for epoch in range(starting_epoch, num_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(model):
                outputs = model(**batch)
                loss = outputs.loss
                accelerator.backward(loss)
                # The optimizer step runs inside `implicit_replication` only when a TP plan is set.
                context = nullcontext
                if args.tp_plan is not None:
                    from torch.distributed._tensor.experimental import implicit_replication

                    context = implicit_replication
                with context():
                    optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            # assert the learning rate after first optimizer step
            # (performed once, on the first step where gradients are synchronized)
            if (
                accelerator.sync_gradients
                and not lr_scheduler_check_completed
                and linear_decay_scheduler
                and accelerator.state.mixed_precision == "no"
            ):
                assert lr_scheduler.get_last_lr()[0] == expected_lr_after_first_optim_step, (
                    f"Wrong lr found at second step, expected {expected_lr_after_first_optim_step}, got {lr_scheduler.get_last_lr()[0]}"
                )
                lr_scheduler_check_completed = True

        model.eval()
        samples_seen = 0
        for step, batch in enumerate(eval_dataloader):
            # We could avoid this line since we set the accelerator with `device_placement=True`.
            batch.to(accelerator.device)
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            # It is slightly faster to call this once, than multiple times
            predictions, references = accelerator.gather(
                (predictions, batch["labels"])
            )  # If we are in a multiprocess environment, the last batch has duplicates
            if accelerator.use_distributed:
                # Trim the last gathered batch to the samples not yet counted.
                if step == len(eval_dataloader) - 1:
                    predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
                    references = references[: len(eval_dataloader.dataset) - samples_seen]
                else:
                    samples_seen += references.shape[0]
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        # Use accelerator.print to print only on the main process.
        accelerator.print(f"epoch {epoch}:", eval_metric)
        performance_metric[f"epoch-{epoch}"] = eval_metric["accuracy"]

        # Track the best accuracy seen over all epochs.
        if best_performance < eval_metric["accuracy"]:
            best_performance = eval_metric["accuracy"]

    # check that the LR is 0
    if linear_decay_scheduler and accelerator.state.mixed_precision == "no":
        assert lr_scheduler.get_last_lr()[0] == 0, (
            f"Wrong lr found at last step, expected 0, got {lr_scheduler.get_last_lr()[0]}"
        )

    if args.performance_lower_bound is not None:
        assert args.performance_lower_bound <= best_performance, (
            f"Best performance metric {best_performance} is lower than the lower bound {args.performance_lower_bound}"
        )

    accelerator.wait_for_everyone()
    # Only the main process writes the per-epoch accuracies to disk.
    if accelerator.is_main_process:
        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
            json.dump(performance_metric, f)

    # TODO: skip saving of the model test for TP until the feature lands
    if args.tp_plan is None:
        # Finally try saving the model
        accelerator.save_model(model, args.output_dir)
    accelerator.wait_for_everyone()
    if args.tp_plan is None:
        assert Path(args.output_dir, SAFE_WEIGHTS_NAME).exists(), (
            "Model was not saved when calling `Accelerator.save_model`"
        )
    accelerator.end_training()
246
+
247
+
248
def _str_to_bool(value):
    """
    Convert a command-line string into a real boolean.

    `argparse` with `type=bool` calls `bool()` on the raw string, so every non-empty value (including
    "False" and "0") is parsed as `True`. This converter maps the usual spellings explicitly.

    Args:
        value (`str` or `bool`):
            The raw option value. Matching is case-insensitive and ignores surrounding whitespace.

    Returns:
        `bool`: The parsed flag. The empty string maps to `False`, as it did with `type=bool`.

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1", "on"):
        return True
    if normalized in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def main():
    """Parse the command-line options and start the performance-checking training run."""
    # The description used to be copied from the peak-memory script; it now describes this script.
    parser = argparse.ArgumentParser(description="Simple example of training script tracking model performance.")
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="bert-base-cased",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
        required=False,
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=".",
        help="Optional save directory where all checkpoint folders will be stored. Default is the current working directory.",
    )
    parser.add_argument(
        "--performance_lower_bound",
        type=float,
        default=None,
        help="Optional lower bound for the performance metric. If set, the training will throw error when the performance metric drops below this value.",
    )
    parser.add_argument(
        "--num_epochs",
        type=int,
        default=3,
        help="Number of train epochs.",
    )
    parser.add_argument(
        "--add_pad_token",
        # `type=bool` would turn "--add_pad_token False" into True; parse the text explicitly instead.
        type=_str_to_bool,
        default=False,
        help="To add pad token if not exists.",
    )
    parser.add_argument(
        "--tp_plan",
        type=str,
        default=None,
        help="pass 'auto' to use TP",
    )
    parser.add_argument(
        "--tp_size",
        type=int,
        default=None,
        help="TP size to be used to shard the model",
    )
    args = parser.parse_args()
    # Fixed hyper-parameters; only the epoch count comes from the command line.
    config = {"lr": 2e-5, "num_epochs": args.num_epochs, "seed": 42, "batch_size": 16}
    training_function(config, args)


if __name__ == "__main__":
    main()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_pippy.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ from transformers import (
16
+ BertConfig,
17
+ BertForMaskedLM,
18
+ GPT2Config,
19
+ GPT2ForSequenceClassification,
20
+ )
21
+
22
+ from accelerate import PartialState
23
+ from accelerate.inference import prepare_pippy
24
+ from accelerate.test_utils import torch_device
25
+ from accelerate.utils import DistributedType, set_seed
26
+
27
+
28
# Maps a short model name to a tuple of (model class, config class, sequence length).
# `get_model_and_data_for_text` unpacks the tuple in exactly that order.
model_to_config = {
    "bert": (BertForMaskedLM, BertConfig, 512),
    "gpt2": (GPT2ForSequenceClassification, GPT2Config, 1024),
}
32
+
33
+
34
def get_model_and_data_for_text(model_name, device, num_processes: int = 2):
    """
    Build a default-configured model together with random token-id inputs for it.

    Args:
        model_name (`str`):
            A key of `model_to_config`.
        device:
            The device on which the random input tensors are created.
        num_processes (`int`, *optional*, defaults to 2):
            The number of rows in the inference input.

    Returns:
        A `(model, trace_input, inference_inputs)` tuple, where `trace_input` has a single row and
        `inference_inputs` has `num_processes` rows.
    """
    model_cls, config_cls, seq_len = model_to_config[model_name]
    config_args = {}
    # Eventually needed for batch inference tests on gpt-2 when bs != 1
    # if model_name == "gpt2":
    #     config_args["pad_token_id"] = 0
    model_config = config_cls(**config_args)
    model = model_cls(model_config)

    def random_token_ids(rows):
        # Integer ids drawn from [0, vocab_size), one row per requested sample.
        return torch.randint(
            low=0,
            high=model_config.vocab_size,
            size=(rows, seq_len),
            device=device,
            dtype=torch.int64,
            requires_grad=False,
        )

    trace_input = random_token_ids(1)
    inference_inputs = random_token_ids(num_processes)
    return model, trace_input, inference_inputs
46
+
47
+
48
def test_bert(batch_size: int = 2):
    """
    Run a BERT model prepared with `prepare_pippy` and check that only the last process gets an output.

    Args:
        batch_size (`int`, *optional*, defaults to 2):
            The number of rows in the inference input.
    """
    set_seed(42)
    state = PartialState()
    bert, example_input, batch = get_model_and_data_for_text("bert", "cpu", batch_size)
    bert = prepare_pippy(bert, example_args=(example_input,), no_split_module_classes=bert._no_split_modules)
    # The inputs are built on CPU and moved to the test device just before the forward pass.
    batch = batch.to(torch_device)
    with torch.no_grad():
        output = bert(batch)
    # Zach: Check that we just grab the real outputs we need at the end
    if state.is_last_process:
        assert output is not None, "Output was not generated in the last process!"
    else:
        assert output is None, "Output was not generated on just the last process!"
62
+
63
+
64
def test_gpt2(batch_size: int = 2):
    """
    Run a GPT2 model prepared with `prepare_pippy` and check that only the last process gets an output.

    Args:
        batch_size (`int`, *optional*, defaults to 2):
            The number of rows in the inference input.
    """
    set_seed(42)
    state = PartialState()
    gpt2, example_input, batch = get_model_and_data_for_text("gpt2", "cpu", batch_size)
    gpt2 = prepare_pippy(gpt2, example_args=(example_input,), no_split_module_classes=gpt2._no_split_modules)
    # The inputs are built on CPU and moved to the test device just before the forward pass.
    batch = batch.to(torch_device)
    with torch.no_grad():
        output = gpt2(batch)
    # Zach: Check that we just grab the real outputs we need at the end
    if state.is_last_process:
        assert output is not None, "Output was not generated in the last process!"
    else:
        assert output is None, "Output was not generated on just the last process!"
78
+
79
+
80
+ # Currently disabled, enable again once PyTorch pippy interface can trace a resnet34
81
+ # def test_resnet(batch_size: int = 2):
82
+ # set_seed(42)
83
+ # state = PartialState()
84
+ # model = resnet34()
85
+ # input_tensor = torch.rand(1, 3, 224, 224)
86
+ # model = prepare_pippy(
87
+ # model,
88
+ # example_args=(input_tensor,),
89
+ # )
90
+ # inference_inputs = torch.rand(batch_size, 3, 224, 224)
91
+ # inputs = send_to_device(inference_inputs, torch_device)
92
+ # with torch.no_grad():
93
+ # output = model(inputs)
94
+ # # Zach: Check that we just grab the real outputs we need at the end
95
+ # if not state.is_last_process:
96
+ # assert output is None, "Output was not generated on just the last process!"
97
+ # else:
98
+ # assert output is not None, "Output was not generated in the last process!"
99
+
100
+
101
if __name__ == "__main__":
    state = PartialState()
    state.print("Testing pippy integration...")
    try:
        # The tests only run for the multi-device distributed types listed here.
        multi_device_types = [DistributedType.MULTI_GPU, DistributedType.MULTI_XPU, DistributedType.MULTI_HPU]
        if state.distributed_type not in multi_device_types:
            print("Less than two GPUs found, not running tests!")
        else:
            state.print("Testing GPT2...")
            test_gpt2()
            # Issue: When modifying the tokenizer for batch GPT2 inference, there's an issue
            # due to references
            # NameError: cannot access free variable 'chunk_args_list' where it is not associated with a value in enclosing scope
            # test_gpt2(3)
            state.print("Testing BERT...")
            test_bert()
    finally:
        # Always tear the process group down, whether or not the tests ran or failed.
        state.destroy_process_group()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/external_deps/test_zero3_integration.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch.distributed
16
+
17
+ from accelerate.test_utils import require_huggingface_suite, torch_device
18
+ from accelerate.utils import is_transformers_available
19
+
20
+
21
+ if is_transformers_available():
22
+ from transformers import AutoModel, TrainingArguments
23
+
24
+
25
# Model identifier passed to `AutoModel.from_pretrained` in `init_torch_dist_then_launch_deepspeed`.
GPT2_TINY = "sshleifer/tiny-gpt2"
26
+
27
+
28
@require_huggingface_suite
def init_torch_dist_then_launch_deepspeed():
    """
    Initialize `torch.distributed` first, then build `TrainingArguments` with a ZeRO stage 3 config.

    Asserts that both the `TrainingArguments` and the model loaded afterwards were created.
    """
    # Device-specific backends; every other device falls back to "nccl".
    backend_by_device = {"xpu": "xccl", "hpu": "hccl"}
    torch.distributed.init_process_group(backend=backend_by_device.get(torch_device, "nccl"))

    train_args = TrainingArguments(
        output_dir="./",
        deepspeed={
            "zero_optimization": {
                "stage": 3,
            },
            "train_batch_size": "auto",
            "train_micro_batch_size_per_gpu": "auto",
        },
    )
    model = AutoModel.from_pretrained(GPT2_TINY)
    assert train_args is not None
    assert model is not None
52
+
53
+
54
def main():
    """Script entry point: runs `init_torch_dist_then_launch_deepspeed`."""
    init_torch_dist_then_launch_deepspeed()


# Run the check only when the file is executed as a script.
if __name__ == "__main__":
    main()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/test_cli.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+
16
+ from accelerate.utils import is_xpu_available
17
+
18
+
19
def main():
    """Print how many CUDA or XPU accelerators are visible to this process."""
    # Default report when neither CUDA nor XPU is available.
    accelerator_type, num_accelerators = "GPU", 0
    if torch.cuda.is_available():
        accelerator_type, num_accelerators = "GPU", torch.cuda.device_count()
    elif is_xpu_available():
        accelerator_type, num_accelerators = "XPU", torch.xpu.device_count()
    print(f"Successfully ran on {num_accelerators} {accelerator_type}s")


if __name__ == "__main__":
    main()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/test_utils/scripts/test_ops.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import torch
18
+
19
+ from accelerate import PartialState
20
+ from accelerate.test_utils.testing import assert_exception
21
+ from accelerate.utils.dataclasses import DistributedType
22
+ from accelerate.utils.operations import (
23
+ DistributedOperationException,
24
+ broadcast,
25
+ copy_tensor_to_devices,
26
+ gather,
27
+ gather_object,
28
+ pad_across_processes,
29
+ reduce,
30
+ )
31
+
32
+
33
def create_tensor(state):
    """
    Return a 1-D float tensor of `state.num_processes` consecutive values on `state.device`.

    The values start at `state.num_processes * state.process_index + 1`, so each process index gets its
    own range.
    """
    offset = state.num_processes * state.process_index
    values = torch.arange(state.num_processes) + 1.0 + offset
    return values.to(state.device)
35
+
36
+
37
def test_gather(state):
    """Check that gathering the per-process tensors yields 1..num_processes**2 in order."""
    expected = list(range(1, state.num_processes**2 + 1))
    gathered = gather(create_tensor(state))
    assert gathered.tolist() == expected
41
+
42
+
43
def test_gather_object(state):
    """Check that `gather_object` collects one process index per process, in order."""
    # Gather objects in TorchXLA is not supported, so there is nothing to check there.
    if state.distributed_type != DistributedType.XLA:
        gathered_obj = gather_object([state.process_index])
        assert len(gathered_obj) == state.num_processes, f"{gathered_obj}, {len(gathered_obj)} != {state.num_processes}"
        assert gathered_obj == list(range(state.num_processes)), f"{gathered_obj} != {list(range(state.num_processes))}"
51
+
52
+
53
def test_gather_non_contiguous(state):
    """Check that `gather` accepts a non-contiguous tensor without raising."""
    # Skipped on XLA because the 'is_contiguous' function of XLA tensor always returns True.
    if state.distributed_type != DistributedType.XLA:
        # Transposing after allocating on the device produces the non-contiguous layout.
        transposed = torch.arange(12, device=state.device).view(4, 3).t()
        assert not transposed.is_contiguous()
        # Shouldn't error out
        gather(transposed)
63
+
64
+
65
def test_broadcast(state):
    """Check that `broadcast` leaves every process holding the values 1..num_processes."""
    result = broadcast(create_tensor(state))
    expected_values = list(range(1, state.num_processes + 1))
    assert result.shape == torch.Size([state.num_processes])
    assert result.tolist() == expected_values
70
+
71
+
72
def test_pad_across_processes(state):
    """Check that `pad_across_processes` pads shorter tensors with a trailing zero."""
    longest = state.num_processes + 1
    # The main process holds one extra element, so the other processes need padding.
    length = longest if state.is_main_process else state.num_processes
    padded = pad_across_processes(torch.arange(length).to(state.device))
    assert padded.shape == torch.Size([longest])
    if not state.is_main_process:
        assert padded.tolist() == [*range(state.num_processes), 0]
83
+
84
+
85
def test_reduce_sum(state):
    """Check the "sum" reduction of the per-process tensors; only runs with exactly two processes."""
    # For now runs on only two processes
    if state.num_processes == 2:
        reduced_tensor = reduce(create_tensor(state), "sum")
        truth_tensor = torch.tensor([4.0, 6]).to(state.device)
        assert torch.allclose(reduced_tensor, truth_tensor), f"{reduced_tensor} != {truth_tensor}"
93
+
94
+
95
def test_reduce_mean(state):
    """Check the "mean" reduction of the per-process tensors; only runs with exactly two processes."""
    # For now runs on only two processes
    if state.num_processes == 2:
        reduced_tensor = reduce(create_tensor(state), "mean")
        truth_tensor = torch.tensor([2.0, 3]).to(state.device)
        assert torch.allclose(reduced_tensor, truth_tensor), f"{reduced_tensor} != {truth_tensor}"
103
+
104
+
105
def test_op_checker(state):
    """
    Check that, with `state.debug` enabled, mismatched inputs make the distributed ops raise.

    Process 0 and the other processes build tensors of different shapes; `pad_across_processes`,
    `reduce` and `broadcast` are each expected to raise `DistributedOperationException`.
    """
    # Must be in a distributed state, and gathering is currently not supported in TorchXLA.
    if state.distributed_type in [DistributedType.NO, DistributedType.XLA]:
        return

    def make_payload(rank_zero_rows, other_rows):
        # Only the tensor for the current process is built.
        rows = rank_zero_rows if state.process_index == 0 else other_rows
        return {"tensor": torch.tensor(rows).to(state.device)}

    state.debug = True

    # `pad_across_processes`
    data = make_payload([[0.0, 1, 2, 3, 4]], [[[0.0, 1, 2, 3, 4, 5]]])
    with assert_exception(DistributedOperationException):
        pad_across_processes(data, dim=0)

    # `reduce`
    data = make_payload([[0.0, 1, 2, 3, 4]], [[[0.0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]])
    with assert_exception(DistributedOperationException):
        reduce(data)

    # `broadcast`
    data = make_payload([[0.0, 1, 2, 3, 4]], [[[0.0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]])
    with assert_exception(DistributedOperationException):
        broadcast(data)

    state.debug = False
138
+
139
+
140
def test_copy_tensor_to_devices(state):
    """Check that a tensor held only by the main process is copied to every process."""
    if state.distributed_type not in [DistributedType.MULTI_GPU, DistributedType.XLA]:
        return
    # Only the main process starts with the data; the others pass `None`.
    source = torch.tensor([1, 2, 3], dtype=torch.int).to(state.device) if state.is_main_process else None
    received = copy_tensor_to_devices(source)
    expected = torch.tensor([1, 2, 3], dtype=torch.int, device=state.device)
    assert torch.allclose(received, expected)
149
+
150
+
151
def _mp_fn(index):
    """Entry point for xla_spawn (TPUs); `index` is accepted but not used, and `main` is run."""
    # For xla_spawn (TPUs)
    main()
154
+
155
+
156
def main():
    """Run every operation check in order on a shared `PartialState`, then destroy the process group."""
    state = PartialState()
    state.print(f"State: {state}")

    # (progress message, check) pairs, executed in this order.
    checks = (
        ("testing gather", test_gather),
        ("testing gather_object", test_gather_object),
        ("testing gather non-contiguous", test_gather_non_contiguous),
        ("testing broadcast", test_broadcast),
        ("testing pad_across_processes", test_pad_across_processes),
        ("testing reduce_sum", test_reduce_sum),
        ("testing reduce_mean", test_reduce_mean),
        ("testing op_checker", test_op_checker),
        ("testing sending tensors across devices", test_copy_tensor_to_devices),
    )
    for message, check in checks:
        state.print(message)
        check(state)

    state.destroy_process_group()


if __name__ == "__main__":
    main()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/__diff.cpython-312.pyc ADDED
Binary file (9 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/__info__.cpython-312.pyc ADDED
Binary file (10.7 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (4.74 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/_objects.cpython-312.pyc ADDED
Binary file (25.7 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/_shims.cpython-312.pyc ADDED
Binary file (7.71 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/detect.cpython-312.pyc ADDED
Binary file (13.4 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/logger.cpython-312.pyc ADDED
Binary file (12.7 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/objtypes.cpython-312.pyc ADDED
Binary file (684 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/pointers.cpython-312.pyc ADDED
Binary file (4.99 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/session.cpython-312.pyc ADDED
Binary file (26.4 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/settings.cpython-312.pyc ADDED
Binary file (412 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/source.cpython-312.pyc ADDED
Binary file (40.7 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/__pycache__/temp.cpython-312.pyc ADDED
Binary file (9.49 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ #
3
+ # Author: Mike McKerns (mmckerns @caltech and @uqfoundation)
4
+ # Copyright (c) 2018-2024 The Uncertainty Quantification Foundation.
5
+ # License: 3-clause BSD. The full license text is available at:
6
+ # - https://github.com/uqfoundation/dill/blob/master/LICENSE
7
+ """
8
+ to run this test suite, first build and install `dill`.
9
+
10
+ $ python -m pip install ../..
11
+
12
+
13
+ then run the tests with:
14
+
15
+ $ python -m dill.tests
16
+
17
+
18
+ or, if `nose` is installed:
19
+
20
+ $ nosetests
21
+
22
+ """
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__main__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ #
3
+ # Author: Mike McKerns (mmckerns @caltech and @uqfoundation)
4
+ # Copyright (c) 2018-2024 The Uncertainty Quantification Foundation.
5
+ # License: 3-clause BSD. The full license text is available at:
6
+ # - https://github.com/uqfoundation/dill/blob/master/LICENSE
7
+
8
+ import glob
9
+ import os
10
+ import sys
11
+ import subprocess as sp
12
+ python = sys.executable
13
+ try:
14
+ import pox
15
+ python = pox.which_python(version=True) or python
16
+ except ImportError:
17
+ pass
18
+ shell = sys.platform[:3] == 'win'
19
+
20
+ suite = os.path.dirname(__file__) or os.path.curdir
21
+ tests = glob.glob(suite + os.path.sep + 'test_*.py')
22
+
23
+
24
+ if __name__ == '__main__':
25
+
26
+ failed = 0
27
+ for test in tests:
28
+ p = sp.Popen([python, test], shell=shell).wait()
29
+ if p:
30
+ print('F', end='', flush=True)
31
+ failed = 1
32
+ else:
33
+ print('.', end='', flush=True)
34
+ print('')
35
+ exit(failed)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_abc.cpython-312.pyc ADDED
Binary file (7.97 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_detect.cpython-312.pyc ADDED
Binary file (7.58 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_dictviews.cpython-312.pyc ADDED
Binary file (2.21 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_fglobals.cpython-312.pyc ADDED
Binary file (2.94 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_logger.cpython-312.pyc ADDED
Binary file (3.7 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_mixins.cpython-312.pyc ADDED
Binary file (6.95 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/dill/tests/__pycache__/test_moduledict.cpython-312.pyc ADDED
Binary file (2.14 kB). View file