justinstrong commited on
Commit
d390ed0
·
verified ·
1 Parent(s): 4f31a1a

Upload logs/training_20260324_024348.log with huggingface_hub

Browse files
Files changed (1) hide show
  1. logs/training_20260324_024348.log +245 -0
logs/training_20260324_024348.log ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ === Starting Training ===
2
+ The following values were not passed to `accelerate launch` and had defaults used instead:
3
+ `--num_processes` was set to a value of `1`
4
+ `--num_machines` was set to a value of `1`
5
+ `--mixed_precision` was set to a value of `'no'`
6
+ `--dynamo_backend` was set to a value of `'no'`
7
+ To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.
8
+ WARNING:lerobot.configs.policies:Device 'mps' is not available. Switching to 'cuda'.
9
+ WARNING:lerobot.configs.policies:Device 'mps' is not available. Switching to 'cuda'.
10
+ INFO 2026-03-24 02:43:56 ot_train.py:198 {'batch_size': 32,
11
+ 'checkpoint_path': None,
12
+ 'cudnn_deterministic': False,
13
+ 'dataset': {'episodes': None,
14
+ 'image_transforms': {'enable': False,
15
+ 'max_num_transforms': 3,
16
+ 'random_order': False,
17
+ 'tfs': {'affine': {'kwargs': {'degrees': [-5.0,
18
+ 5.0],
19
+ 'translate': [0.05,
20
+ 0.05]},
21
+ 'type': 'RandomAffine',
22
+ 'weight': 1.0},
23
+ 'brightness': {'kwargs': {'brightness': [0.8,
24
+ 1.2]},
25
+ 'type': 'ColorJitter',
26
+ 'weight': 1.0},
27
+ 'contrast': {'kwargs': {'contrast': [0.8,
28
+ 1.2]},
29
+ 'type': 'ColorJitter',
30
+ 'weight': 1.0},
31
+ 'hue': {'kwargs': {'hue': [-0.05,
32
+ 0.05]},
33
+ 'type': 'ColorJitter',
34
+ 'weight': 1.0},
35
+ 'saturation': {'kwargs': {'saturation': [0.5,
36
+ 1.5]},
37
+ 'type': 'ColorJitter',
38
+ 'weight': 1.0},
39
+ 'sharpness': {'kwargs': {'sharpness': [0.5,
40
+ 1.5]},
41
+ 'type': 'SharpnessJitter',
42
+ 'weight': 1.0}}},
43
+ 'repo_id': 'so100:/ephemeral/community_dataset_v3:/workspace/pi05-so100-diverse/filtered_index.json:/workspace/pi05-so100-diverse/norm_stats.json',
44
+ 'revision': None,
45
+ 'root': None,
46
+ 'streaming': False,
47
+ 'use_imagenet_stats': True,
48
+ 'video_backend': 'torchcodec'},
49
+ 'env': None,
50
+ 'eval': {'batch_size': 50, 'n_episodes': 50, 'use_async_envs': False},
51
+ 'eval_freq': 20000,
52
+ 'job_name': 'pi05',
53
+ 'log_freq': 50,
54
+ 'num_workers': 4,
55
+ 'optimizer': {'betas': [0.9, 0.95],
56
+ 'eps': 1e-08,
57
+ 'grad_clip_norm': 1.0,
58
+ 'lr': 2.5e-05,
59
+ 'type': 'adamw',
60
+ 'weight_decay': 0.01},
61
+ 'output_dir': '/ephemeral/production_run',
62
+ 'peft': None,
63
+ 'policy': {'action_expert_variant': 'gemma_300m',
64
+ 'chunk_size': 50,
65
+ 'compile_mode': 'max-autotune',
66
+ 'compile_model': False,
67
+ 'device': 'cuda',
68
+ 'dtype': 'bfloat16',
69
+ 'empty_cameras': 0,
70
+ 'freeze_vision_encoder': False,
71
+ 'gradient_checkpointing': False,
72
+ 'image_resolution': [224, 224],
73
+ 'input_features': {'observation.images.base_0_rgb': {'shape': [3,
74
+ 224,
75
+ 224],
76
+ 'type': <FeatureType.VISUAL: 'VISUAL'>},
77
+ 'observation.images.left_wrist_0_rgb': {'shape': [3,
78
+ 224,
79
+ 224],
80
+ 'type': <FeatureType.VISUAL: 'VISUAL'>},
81
+ 'observation.images.right_wrist_0_rgb': {'shape': [3,
82
+ 224,
83
+ 224],
84
+ 'type': <FeatureType.VISUAL: 'VISUAL'>},
85
+ 'observation.state': {'shape': [32],
86
+ 'type': <FeatureType.STATE: 'STATE'>}},
87
+ 'license': None,
88
+ 'max_action_dim': 32,
89
+ 'max_period': 4.0,
90
+ 'max_state_dim': 32,
91
+ 'min_period': 0.004,
92
+ 'n_action_steps': 50,
93
+ 'n_obs_steps': 1,
94
+ 'normalization_mapping': {'ACTION': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
95
+ 'STATE': <NormalizationMode.MEAN_STD: 'MEAN_STD'>,
96
+ 'VISUAL': <NormalizationMode.IDENTITY: 'IDENTITY'>},
97
+ 'num_inference_steps': 10,
98
+ 'optimizer_betas': [0.9, 0.95],
99
+ 'optimizer_eps': 1e-08,
100
+ 'optimizer_grad_clip_norm': 1.0,
101
+ 'optimizer_lr': 2.5e-05,
102
+ 'optimizer_weight_decay': 0.01,
103
+ 'output_features': {'action': {'shape': [32],
104
+ 'type': <FeatureType.ACTION: 'ACTION'>}},
105
+ 'paligemma_variant': 'gemma_2b',
106
+ 'pretrained_path': 'lerobot/pi05_base',
107
+ 'private': None,
108
+ 'push_to_hub': True,
109
+ 'repo_id': 'StrongRoboticsLab/pi05-so100-diverse',
110
+ 'rtc_config': None,
111
+ 'scheduler_decay_lr': 2.5e-06,
112
+ 'scheduler_decay_steps': 170000,
113
+ 'scheduler_warmup_steps': 1000,
114
+ 'tags': None,
115
+ 'time_sampling_beta_alpha': 1.5,
116
+ 'time_sampling_beta_beta': 1.0,
117
+ 'time_sampling_offset': 0.001,
118
+ 'time_sampling_scale': 0.999,
119
+ 'tokenizer_max_length': 200,
120
+ 'train_expert_only': True,
121
+ 'type': 'pi05',
122
+ 'use_amp': False,
123
+ 'use_peft': False},
124
+ 'rabc_epsilon': 1e-06,
125
+ 'rabc_head_mode': 'sparse',
126
+ 'rabc_kappa': 0.01,
127
+ 'rabc_progress_path': None,
128
+ 'rename_map': {'observation.images.image': 'observation.images.base_0_rgb',
129
+ 'observation.images.image2': 'observation.images.left_wrist_0_rgb'},
130
+ 'resume': False,
131
+ 'save_checkpoint': True,
132
+ 'save_freq': 500,
133
+ 'scheduler': {'decay_lr': 2.5e-06,
134
+ 'num_decay_steps': 170000,
135
+ 'num_warmup_steps': 1000,
136
+ 'peak_lr': 2.5e-05,
137
+ 'type': 'cosine_decay_with_warmup'},
138
+ 'seed': 1000,
139
+ 'steps': 170000,
140
+ 'tolerance_s': 0.0001,
141
+ 'use_policy_training_preset': True,
142
+ 'use_rabc': False,
143
+ 'wandb': {'add_tags': True,
144
+ 'disable_artifact': False,
145
+ 'enable': True,
146
+ 'entity': None,
147
+ 'mode': None,
148
+ 'notes': None,
149
+ 'project': 'pi05-so100-diverse',
150
+ 'run_id': None}}
151
+ INFO 2026-03-24 02:43:57 db_utils.py:117 Logs will be synced with wandb.
152
+ INFO 2026-03-24 02:43:57 db_utils.py:118 Track this run --> https://wandb.ai/gptjustin-strong-robotics-lab/pi05-so100-diverse/runs/pajke257
153
+ INFO 2026-03-24 02:43:57 ot_train.py:222 Creating dataset
154
+ INFO 2026-03-24 02:44:00 ot_train.py:240 Creating policy
155
+ INFO 2026-03-24 02:46:22 _client.py:1025 HTTP Request: HEAD https://huggingface.co/lerobot/pi05_base/resolve/main/model.safetensors "HTTP/1.1 302 Found"
156
+ INFO 2026-03-24 02:46:22 _client.py:1025 HTTP Request: GET https://huggingface.co/api/models/lerobot/pi05_base/xet-read-token/9e55186ad36e66b95cda57bc47818d9e6237ae30 "HTTP/1.1 200 OK"
157
+ WARNING 2026-03-24 02:46:40 ng_pi05.py:1102 Vision embedding key might need handling: paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.patch_embedding.bias
158
+ WARNING 2026-03-24 02:46:40 ng_pi05.py:1102 Vision embedding key might need handling: paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.patch_embedding.weight
159
+ INFO 2026-03-24 02:46:46 _client.py:1025 HTTP Request: HEAD https://huggingface.co/lerobot/pi05_base/resolve/main/policy_preprocessor.json "HTTP/1.1 307 Temporary Redirect"
160
+ INFO 2026-03-24 02:46:47 _client.py:1025 HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/lerobot/pi05_base/9e55186ad36e66b95cda57bc47818d9e6237ae30/policy_preprocessor.json "HTTP/1.1 200 OK"
161
+ INFO 2026-03-24 02:46:47 _client.py:1025 HTTP Request: GET https://huggingface.co/api/resolve-cache/models/lerobot/pi05_base/9e55186ad36e66b95cda57bc47818d9e6237ae30/policy_preprocessor.json "HTTP/1.1 200 OK"
162
+ INFO 2026-03-24 02:46:47 _client.py:1025 HTTP Request: HEAD https://huggingface.co/google/paligemma-3b-pt-224/resolve/main/config.json "HTTP/1.1 200 OK"
163
+ INFO 2026-03-24 02:46:47 _client.py:1025 HTTP Request: GET https://huggingface.co/google/paligemma-3b-pt-224/resolve/main/config.json "HTTP/1.1 200 OK"
164
+ INFO 2026-03-24 02:46:47 _client.py:1025 HTTP Request: HEAD https://huggingface.co/google/paligemma-3b-pt-224/resolve/main/tokenizer_config.json "HTTP/1.1 200 OK"
165
+ INFO 2026-03-24 02:46:47 _client.py:1025 HTTP Request: GET https://huggingface.co/google/paligemma-3b-pt-224/resolve/main/tokenizer_config.json "HTTP/1.1 200 OK"
166
+ INFO 2026-03-24 02:46:47 _client.py:1025 HTTP Request: GET https://huggingface.co/api/models/google/paligemma-3b-pt-224/tree/main/additional_chat_templates?recursive=false&expand=false "HTTP/1.1 404 Not Found"
167
+ INFO 2026-03-24 02:46:47 _client.py:1025 HTTP Request: GET https://huggingface.co/api/models/google/paligemma-3b-pt-224/tree/main?recursive=true&expand=false "HTTP/1.1 200 OK"
168
+ INFO 2026-03-24 02:46:47 _client.py:1025 HTTP Request: HEAD https://huggingface.co/google/paligemma-3b-pt-224/resolve/main/tokenizer.json "HTTP/1.1 302 Found"
169
+ INFO 2026-03-24 02:46:47 _client.py:1025 HTTP Request: GET https://huggingface.co/api/models/google/paligemma-3b-pt-224/xet-read-token/35e4f46485b4d07967e7e9935bc3786aad50687c "HTTP/1.1 200 OK"
170
+ INFO 2026-03-24 02:46:48 _client.py:1025 HTTP Request: HEAD https://huggingface.co/google/paligemma-3b-pt-224/resolve/main/added_tokens.json "HTTP/1.1 200 OK"
171
+ INFO 2026-03-24 02:46:48 _client.py:1025 HTTP Request: GET https://huggingface.co/google/paligemma-3b-pt-224/resolve/main/added_tokens.json "HTTP/1.1 200 OK"
172
+ INFO 2026-03-24 02:46:48 _client.py:1025 HTTP Request: HEAD https://huggingface.co/google/paligemma-3b-pt-224/resolve/main/special_tokens_map.json "HTTP/1.1 200 OK"
173
+ INFO 2026-03-24 02:46:48 _client.py:1025 HTTP Request: GET https://huggingface.co/google/paligemma-3b-pt-224/resolve/main/special_tokens_map.json "HTTP/1.1 200 OK"
174
+ INFO 2026-03-24 02:46:48 _client.py:1025 HTTP Request: HEAD https://huggingface.co/google/paligemma-3b-pt-224/resolve/main/chat_template.jinja "HTTP/1.1 404 Not Found"
175
+ INFO 2026-03-24 02:46:49 _client.py:1025 HTTP Request: GET https://huggingface.co/api/models/google/paligemma-3b-pt-224 "HTTP/1.1 200 OK"
176
+ INFO 2026-03-24 02:46:49 _client.py:1025 HTTP Request: HEAD https://huggingface.co/lerobot/pi05_base/resolve/main/policy_postprocessor.json "HTTP/1.1 307 Temporary Redirect"
177
+ INFO 2026-03-24 02:46:49 _client.py:1025 HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/lerobot/pi05_base/9e55186ad36e66b95cda57bc47818d9e6237ae30/policy_postprocessor.json "HTTP/1.1 200 OK"
178
+ INFO 2026-03-24 02:46:49 _client.py:1025 HTTP Request: GET https://huggingface.co/api/resolve-cache/models/lerobot/pi05_base/9e55186ad36e66b95cda57bc47818d9e6237ae30/policy_postprocessor.json "HTTP/1.1 200 OK"
179
+ INFO 2026-03-24 02:46:49 ot_train.py:295 Creating optimizer and scheduler
180
+ INFO 2026-03-24 02:46:49 ot_train.py:330 Output dir: /ephemeral/production_run
181
+ INFO 2026-03-24 02:46:49 ot_train.py:337 cfg.steps=170000 (170K)
182
+ INFO 2026-03-24 02:46:49 ot_train.py:338 dataset.num_frames=4923840 (5M)
183
+ INFO 2026-03-24 02:46:49 ot_train.py:339 dataset.num_episodes=10155
184
+ INFO 2026-03-24 02:46:49 ot_train.py:342 Effective batch size: 32 x 1 = 32
185
+ INFO 2026-03-24 02:46:49 ot_train.py:343 num_learnable_params=693422112 (693M)
186
+ INFO 2026-03-24 02:46:49 ot_train.py:344 num_total_params=4143404816 (4B)
187
+ The PI05 model is a direct port of the OpenPI implementation.
188
+ This implementation follows the original OpenPI structure for compatibility.
189
+ Original implementation: https://github.com/Physical-Intelligence/openpi
190
+ Loading model from: lerobot/pi05_base
191
+ ✓ Loaded state dict from model.safetensors
192
+ Remapped 812 state dict keys
193
+ All keys loaded successfully!
194
+
195
+ Traceback (most recent call last):
196
+ File "<frozen runpy>", line 198, in _run_module_as_main
197
+ File "<frozen runpy>", line 88, in _run_code
198
+ File "/workspace/pi05-so100-diverse/lerobot/src/lerobot/scripts/lerobot_train.py", line 575, in <module>
199
+ main()
200
+ File "/workspace/pi05-so100-diverse/lerobot/src/lerobot/scripts/lerobot_train.py", line 571, in main
201
+ train()
202
+ File "/workspace/pi05-so100-diverse/lerobot/src/lerobot/configs/parser.py", line 233, in wrapper_inner
203
+ response = fn(cfg, *args, **kwargs)
204
+ ^^^^^^^^^^^^^^^^^^^^^^^^
205
+ File "/workspace/pi05-so100-diverse/lerobot/src/lerobot/scripts/lerobot_train.py", line 419, in train
206
+ train_tracker, output_dict = update_policy(
207
+ ^^^^^^^^^^^^^^
208
+ File "/workspace/pi05-so100-diverse/lerobot/src/lerobot/scripts/lerobot_train.py", line 118, in update_policy
209
+ loss, output_dict = policy.forward(batch)
210
+ ^^^^^^^^^^^^^^^^^^^^^
211
+ File "/workspace/pi05-so100-diverse/lerobot/src/lerobot/policies/pi05/modeling_pi05.py", line 1264, in forward
212
+ losses = self.model.forward(images, img_masks, tokens, masks, actions)
213
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
214
+ File "/workspace/pi05-so100-diverse/lerobot/src/lerobot/policies/pi05/modeling_pi05.py", line 771, in forward
215
+ suffix_out = self._apply_checkpoint(
216
+ ^^^^^^^^^^^^^^^^^^^^^^^
217
+ File "/workspace/pi05-so100-diverse/lerobot/src/lerobot/policies/pi05/modeling_pi05.py", line 617, in _apply_checkpoint
218
+ return func(*args, **kwargs)
219
+ ^^^^^^^^^^^^^^^^^^^^^
220
+ File "/workspace/pi05-so100-diverse/lerobot/src/lerobot/policies/pi05/modeling_pi05.py", line 761, in forward_func
221
+ (_, suffix_out), _ = self.paligemma_with_expert.forward(
222
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
223
+ File "/workspace/pi05-so100-diverse/lerobot/src/lerobot/policies/pi05/modeling_pi05.py", line 513, in forward
224
+ inputs_embeds = compute_layer_complete(
225
+ ^^^^^^^^^^^^^^^^^^^^^^^
226
+ File "/workspace/pi05-so100-diverse/lerobot/src/lerobot/policies/pi05/modeling_pi05.py", line 264, in compute_layer_complete
227
+ att_output, _ = modeling_gemma.eager_attention_forward(
228
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
229
+ File "/usr/local/lib/python3.12/dist-packages/transformers/models/gemma/modeling_gemma.py", line 210, in eager_attention_forward
230
+ attn_weights = attn_weights + attention_mask
231
+ ~~~~~~~~~~~~~^~~~~~~~~~~~~~~~
232
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1014.00 MiB. GPU 0 has a total capacity of 79.19 GiB of which 222.50 MiB is free. Including non-PyTorch memory, this process has 78.96 GiB memory in use. Of the allocated memory 76.51 GiB is allocated by PyTorch, and 1.86 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
233
+ Traceback (most recent call last):
234
+ File "<frozen runpy>", line 198, in _run_module_as_main
235
+ File "<frozen runpy>", line 88, in _run_code
236
+ File "/usr/local/lib/python3.12/dist-packages/accelerate/commands/launch.py", line 1415, in <module>
237
+ main()
238
+ File "/usr/local/lib/python3.12/dist-packages/accelerate/commands/launch.py", line 1411, in main
239
+ launch_command(args)
240
+ File "/usr/local/lib/python3.12/dist-packages/accelerate/commands/launch.py", line 1405, in launch_command
241
+ simple_launcher(args)
242
+ File "/usr/local/lib/python3.12/dist-packages/accelerate/commands/launch.py", line 993, in simple_launcher
243
+ raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
244
+ subprocess.CalledProcessError: Command '['/usr/bin/python3.12', '-m', 'lerobot.scripts.lerobot_train', '--dataset.repo_id=so100:/ephemeral/community_dataset_v3:/workspace/pi05-so100-diverse/filtered_index.json:/workspace/pi05-so100-diverse/norm_stats.json', '--policy.path=lerobot/pi05_base', '--policy.train_expert_only=true', '--policy.dtype=bfloat16', '--policy.gradient_checkpointing=false', '--policy.push_to_hub=true', '--policy.repo_id=StrongRoboticsLab/pi05-so100-diverse', '--policy.normalization_mapping={"VISUAL": "IDENTITY", "STATE": "MEAN_STD", "ACTION": "MEAN_STD"}', '--policy.scheduler_warmup_steps=1000', '--policy.scheduler_decay_steps=170000', '--rename_map={"observation.images.image": "observation.images.base_0_rgb", "observation.images.image2": "observation.images.left_wrist_0_rgb"}', '--batch_size=32', '--steps=170000', '--save_freq=500', '--log_freq=50', '--num_workers=4', '--wandb.enable=true', '--wandb.project=pi05-so100-diverse', '--output_dir=/ephemeral/production_run']' returned non-zero exit status 1.
245
+ === Training Complete (exit: 1) ===