gemma3-1B_0_split / slurm.out
Nicolas-BZRD · Upload folder using huggingface_hub · commit cf33aaa (verified)
3: W1124 00:03:06.850000 675180 torch/distributed/run.py:792]
3: W1124 00:03:06.850000 675180 torch/distributed/run.py:792] *****************************************
3: W1124 00:03:06.850000 675180 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
3: W1124 00:03:06.850000 675180 torch/distributed/run.py:792] *****************************************
0: W1124 00:03:06.866000 4127050 torch/distributed/run.py:792]
0: W1124 00:03:06.866000 4127050 torch/distributed/run.py:792] *****************************************
0: W1124 00:03:06.866000 4127050 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
0: W1124 00:03:06.866000 4127050 torch/distributed/run.py:792] *****************************************
2: W1124 00:03:06.882000 628563 torch/distributed/run.py:792]
2: W1124 00:03:06.882000 628563 torch/distributed/run.py:792] *****************************************
2: W1124 00:03:06.882000 628563 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
2: W1124 00:03:06.882000 628563 torch/distributed/run.py:792] *****************************************
1: W1124 00:03:06.884000 2620875 torch/distributed/run.py:792]
1: W1124 00:03:06.884000 2620875 torch/distributed/run.py:792] *****************************************
1: W1124 00:03:06.884000 2620875 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
1: W1124 00:03:06.884000 2620875 torch/distributed/run.py:792] *****************************************
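The banner above is torchrun's standard notice: it defaults `OMP_NUM_THREADS` to 1 per process to avoid oversubscribing the node, and asks the user to tune it. A minimal sketch of the tuning it suggests — the helper name and the core/rank counts are illustrative, not from this log:

```python
import os

# Hypothetical helper: choose OMP_NUM_THREADS as CPU cores divided by the
# number of local ranks, instead of torchrun's conservative default of 1.
def omp_threads(cpu_cores: int, local_world_size: int) -> int:
    return max(1, cpu_cores // local_world_size)

# e.g. set before launching torchrun on a 96-core node with 4 local ranks
os.environ["OMP_NUM_THREADS"] = str(omp_threads(96, 4))  # "24"
```

Exporting the variable in the launch script before `torchrun` starts has the same effect.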
0: [2025-11-24 00:03:26,198] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:4127210] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
0: [2025-11-24 00:03:26,198] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:4127210] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
1: [2025-11-24 00:03:26,198] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:2620950] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
1: [2025-11-24 00:03:26,198] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:2620950] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
3: [2025-11-24 00:03:26,198] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:675256] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
3: [2025-11-24 00:03:26,198] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:675256] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
2: [2025-11-24 00:03:26,199] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:628638] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
2: [2025-11-24 00:03:26,199] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:628638] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
0: [2025-11-24 00:03:57,538] [WARNING] [axolotl.utils.config.normalize_config:139] [PID:4127210] [RANK:0] Invalid value for save_steps (1.6666666666666667) from saves_per_epoch and/or num_epochs. Saving at training end only.
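A hedged sketch (not Axolotl's actual code) of how the fractional `save_steps` in the warning above can arise: one save per epoch combined with `num_epochs: 0.6` implies a save interval of 1/0.6 of the run, which exceeds 1.0 and so cannot be used as a fraction of total steps — hence the fallback to saving at training end only.

```python
# Values from the config logged below; the formula is an assumption that
# happens to reproduce the warning's value exactly.
saves_per_epoch = 1
num_epochs = 0.6
save_steps = 1 / (saves_per_epoch * num_epochs)
print(save_steps)  # 1.6666666666666667, the value in the warning
```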
0: [2025-11-24 00:03:57,695] [INFO] [axolotl.cli.config.load_cfg:245] [PID:4127210] [RANK:0] config:
0: {
0: "activation_offloading": false,
0: "auto_resume_from_checkpoints": true,
0: "axolotl_config_path": "/lustre/fswork/projects/rech/dgo/udv55np/train/tmp/1763938979818356030.yaml",
0: "base_model": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-1b",
0: "base_model_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-1b",
0: "batch_size": 16,
0: "bf16": true,
0: "capabilities": {
0: "bf16": true,
0: "compute_capability": "sm_90",
0: "fp8": false,
0: "n_gpu": 16,
0: "n_node": 1
0: },
0: "chat_template": "gemma3",
0: "context_parallel_size": 1,
0: "dataloader_num_workers": 16,
0: "dataloader_pin_memory": true,
0: "dataloader_prefetch_factor": 256,
0: "dataset_prepared_path": "/lustre/fswork/projects/rech/dgo/udv55np/dataset_gemma/Nemotron-Super-49B-v1_5/split_0",
0: "dataset_processes": 192,
0: "datasets": [
0: {
0: "chat_template": "tokenizer_default",
0: "data_files": [
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0007.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0009.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0005.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0006.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0014.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0010.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0012.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0008.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0001.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0002.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0013.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0015.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0004.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0011.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0000.jsonl",
0: "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking/0003.jsonl"
0: ],
0: "ds_type": "json",
0: "field_messages": "conversations",
0: "message_property_mappings": {
0: "content": "content",
0: "role": "role"
0: },
0: "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking",
0: "trust_remote_code": false,
0: "type": "chat_template"
0: }
0: ],
0: "ddp": true,
0: "deepspeed": {
0: "bf16": {
0: "enabled": true
0: },
0: "gradient_accumulation_steps": "auto",
0: "gradient_clipping": "auto",
0: "train_batch_size": "auto",
0: "train_micro_batch_size_per_gpu": "auto",
0: "wall_clock_breakdown": false,
0: "zero_optimization": {
0: "contiguous_gradients": true,
0: "overlap_comm": true,
0: "reduce_bucket_size": "auto",
0: "stage": 3,
0: "stage3_gather_16bit_weights_on_model_save": true,
0: "stage3_param_persistence_threshold": "auto",
0: "stage3_prefetch_bucket_size": "auto",
0: "sub_group_size": 0
0: }
0: },
0: "device": "cuda:0",
0: "device_map": {
0: "": 0
0: },
0: "dion_rank_fraction": 1.0,
0: "dion_rank_multiple_of": 1,
0: "env_capabilities": {
0: "torch_version": "2.6.0"
0: },
0: "eot_tokens": [
0: "<end_of_turn>"
0: ],
0: "eval_batch_size": 1,
0: "eval_causal_lm_metrics": [
0: "sacrebleu",
0: "comet",
0: "ter",
0: "chrf"
0: ],
0: "eval_max_new_tokens": 128,
0: "eval_sample_packing": true,
0: "eval_table_size": 0,
0: "evals_per_epoch": 0,
0: "flash_attention": true,
0: "fp16": false,
0: "gradient_accumulation_steps": 1,
0: "gradient_checkpointing": true,
0: "gradient_checkpointing_kwargs": {
0: "use_reentrant": true
0: },
0: "learning_rate": 2e-05,
0: "lisa_layers_attribute": "model.layers",
0: "load_best_model_at_end": false,
0: "load_in_4bit": false,
0: "load_in_8bit": false,
0: "local_rank": 0,
0: "logging_steps": 10,
0: "lora_dropout": 0.0,
0: "loraplus_lr_embedding": 1e-06,
0: "lr_scheduler": "warmup_stable_decay",
0: "lr_scheduler_kwargs": {
0: "min_lr_ratio": 0.1,
0: "num_decay_steps": 200
0: },
0: "max_prompt_len": 512,
0: "mean_resizing_embeddings": false,
0: "micro_batch_size": 1,
0: "model_config_type": "gemma3_text",
0: "num_epochs": 0.6,
0: "optimizer": "adamw_torch_fused",
0: "output_dir": "/lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-1b/0",
0: "pad_to_sequence_len": true,
0: "pretrain_multipack_attn": true,
0: "pretrain_multipack_buffer_size": 10000,
0: "profiler_steps_start": 0,
0: "qlora_sharded_model_loading": false,
0: "ray_num_workers": 1,
0: "resources_per_worker": {
0: "GPU": 1
0: },
0: "sample_packing": true,
0: "sample_packing_bin_size": 200,
0: "sample_packing_group_size": 100000,
0: "save_only_model": true,
0: "save_safetensors": true,
0: "save_total_limit": 20,
0: "saves_per_epoch": 1,
0: "sequence_len": 16384,
0: "shuffle_before_merging_datasets": false,
0: "shuffle_merged_datasets": true,
0: "skip_prepare_dataset": false,
0: "strict": false,
0: "tensor_parallel_size": 1,
0: "tf32": false,
0: "tiled_mlp_use_original_mlp": true,
0: "tokenizer_config": "/lustre/fswork/projects/rech/qwv/udv55np/Gemma/base/gemma-3-27b",
0: "torch_dtype": "torch.bfloat16",
0: "train_on_inputs": false,
0: "trl": {
0: "log_completions": false,
0: "mask_truncated_completions": false,
0: "ref_model_mixup_alpha": 0.9,
0: "ref_model_sync_steps": 64,
0: "scale_rewards": true,
0: "sync_ref_model": false,
0: "use_vllm": false,
0: "vllm_server_host": "0.0.0.0",
0: "vllm_server_port": 8000
0: },
0: "use_ray": false,
0: "use_tensorboard": true,
0: "val_set_size": 0.0,
0: "vllm": {
0: "device": "auto",
0: "dtype": "auto",
0: "gpu_memory_utilization": 0.9,
0: "host": "0.0.0.0",
0: "port": 8000
0: },
0: "warmup_steps": 100,
0: "weight_decay": 0.0,
0: "world_size": 16
0: }
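The config above selects `lr_scheduler: warmup_stable_decay` with `warmup_steps: 100`, `num_decay_steps: 200`, and `min_lr_ratio: 0.1`. A minimal sketch of that schedule shape, assuming linear warmup and linear decay (the function and its exact interpolation are illustrative, not Axolotl's implementation):

```python
def wsd_lr_multiplier(step: int, total_steps: int,
                      warmup_steps: int = 100,
                      num_decay_steps: int = 200,
                      min_lr_ratio: float = 0.1) -> float:
    """Warmup-stable-decay sketch: ramp 0 -> 1 over warmup_steps, hold at 1,
    then decay linearly to min_lr_ratio over the final num_decay_steps."""
    if step < warmup_steps:
        return step / warmup_steps
    decay_start = total_steps - num_decay_steps
    if step < decay_start:
        return 1.0
    frac = (step - decay_start) / num_decay_steps
    return 1.0 - frac * (1.0 - min_lr_ratio)
```

With `learning_rate: 2e-05`, the effective LR at each step would be `2e-5 * wsd_lr_multiplier(step, total_steps)`.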
0: [2025-11-24 00:03:57,696] [INFO] [axolotl.cli.checks.check_user_token:35] [PID:4127210] [RANK:0] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used.
1: [2025-11-24 00:03:59,617] [INFO] [axolotl.utils.data.sft._load_raw_datasets:314] [PID:2620950] [RANK:0] Loading raw datasets...
1: [2025-11-24 00:04:05,080] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:88] [PID:2620950] [RANK:0] Loading dataset: /lustre/fswork/projects/rech/qwv/udv55np/dataset/ift/Nemotron-Super-49B-v1_5/no_thinking with base_type: chat_template and prompt_style: None
1: Dropping Long Sequences (>16384) (num_proc=192): 100%|██████████| 557277/557277 [00:03<00:00, 140183.34 examples/s]
1: Drop Samples with Zero Trainable Tokens (num_proc=192): 100%|██████████| 556595/556595 [00:05<00:00, 107236.31 examples/s]
1: Add position_id column (Sample Packing) (num_proc=192): 100%|██████████| 556595/556595 [00:05<00:00, 111162.45 examples/s]
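The "Add position_id column (Sample Packing)" step above reflects a common sample-packing convention: when several short samples share one packed sequence, each sample's position ids restart at 0 so positional encodings treat the samples independently. A toy sketch of that idea, assuming simple concatenation (the helper is hypothetical, not Axolotl's code):

```python
def position_ids_for_pack(sample_lengths: list[int]) -> list[int]:
    """Concatenate per-sample position ids, restarting at 0 per sample."""
    ids: list[int] = []
    for n in sample_lengths:
        ids.extend(range(n))
    return ids

position_ids_for_pack([3, 2])  # -> [0, 1, 2, 0, 1]
```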
1: Saving the dataset (141/192 shards): 79%|███████▉ | 442153/556595 [00:03<00:00, 169693.92 examples/s]
1: β–ˆβ–ˆβ–ˆβ–‰ | 442153/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (143/192 shards): 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 445052/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (144/192 shards): 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 445052/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (145/192 shards): 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 447052/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (146/192 shards): 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 454648/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (147/192 shards): 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 455547/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (148/192 shards): 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 458446/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (149/192 shards): 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 467143/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (150/192 shards): 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 467143/556595 [00:03<00:00, 169693.92
1: examples/s] Saving the dataset (151/192 shards): 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 468042/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (152/192 shards): 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 468042/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (153/192 shards): 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 468042/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (154/192 shards): 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 468941/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (155/192 shards): 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 469840/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (156/192 shards): 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 471840/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (157/192 shards): 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 474739/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (158/192 shards): 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 474739/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (159/192 shards):
1: 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 474739/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (160/192 shards): 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 474739/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (161/192 shards): 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 475638/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (162/192 shards): 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 484537/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (163/192 shards): 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 484537/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (164/192 shards): 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 496132/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (165/192 shards): 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 503929/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (166/192 shards): 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 503929/556595 [00:03<00:00, 169693.92 examples/s] Saving the dataset (167/192 shards): 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 503929/55659
1: 5 [00:03<00:00, 169693.92 examples/s] Saving the dataset (167/192 shards): 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 506828/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (168/192 shards): 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 508828/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (169/192 shards): 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 508828/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (170/192 shards): 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 509727/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (171/192 shards): 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 518221/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (172/192 shards): 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 521119/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (173/192 shards): 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 524017/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (174/192 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 526915/556595 [00:03<00:00, 468570.22 examples
1: /s] Saving the dataset (175/192 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 526915/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (176/192 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 527814/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (177/192 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 527814/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (178/192 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 529611/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (179/192 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 529611/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (180/192 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 529611/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (181/192 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 529611/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (182/192 shards): 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ| 529611/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (183/192
1: shards): 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 538306/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (184/192 shards): 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 553697/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (185/192 shards): 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 553697/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (186/192 shards): 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 553697/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (187/192 shards): 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 553697/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (188/192 shards): 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 553697/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (189/192 shards): 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 553697/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (190/192 shards): 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 553697/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (191/192 shards): 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ
1: β–ˆβ–ˆβ–ˆβ–‰| 553697/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (192/192 shards): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 556595/556595 [00:03<00:00, 468570.22 examples/s] Saving the dataset (192/192 shards): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 556595/556595 [00:03<00:00, 149145.23 examples/s]
0: [2025-11-24 00:05:58,155] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:472] [PID:4127210] [RANK:0] Loading prepared dataset from disk at /lustre/fswork/projects/rech/dgo/udv55np/dataset_gemma/Nemotron-Super-49B-v1_5/split_0/06698e902d3dba325ca34849b1dea5ea...
0: [2025-11-24 00:07:08,642] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:436] [PID:4127210] [RANK:0] gather_len_batches: [18975, 18976, 18976, 18976, 18976, 18975, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976]
0: [2025-11-24 00:07:08,811] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:495] [PID:4127210] [RANK:0] sample_packing_eff_est across ranks: [0.9988827705383301, 0.9989354014396667, 0.9989354014396667, 0.9988827705383301, 0.9989354014396667, 0.9988827705383301, 0.9988827705383301, 0.9989354014396667, 0.9988827705383301, 0.9988827705383301, 0.9988827705383301, 0.9989880323410034, 0.9989354014396667, 0.9988827705383301, 0.9989354014396667, 0.9989354014396667]
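The per-rank packing-efficiency estimates above are easier to read as a single summary number. A minimal sketch (plain Python, using the 16 values from the log line) that aggregates them:

```python
from statistics import mean

# sample_packing_eff_est values reported by axolotl for the 16 ranks above
eff = [
    0.9988827705383301, 0.9989354014396667, 0.9989354014396667, 0.9988827705383301,
    0.9989354014396667, 0.9988827705383301, 0.9988827705383301, 0.9989354014396667,
    0.9988827705383301, 0.9988827705383301, 0.9988827705383301, 0.9989880323410034,
    0.9989354014396667, 0.9988827705383301, 0.9989354014396667, 0.9989354014396667,
]
# Mean efficiency across ranks: ~0.99891, i.e. packed batches are ~99.9% full.
print(f"mean packing efficiency: {mean(eff):.6f}")
```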
0: [2025-11-24 00:07:08,819] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:127] [PID:4127210] [RANK:0] Maximum number of steps set at 711
0: [2025-11-24 00:07:09,986] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:110] [PID:4127210] [RANK:0] Patched Trainer.evaluation_loop with nanmean loss calculation
0: [2025-11-24 00:07:09,987] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:164] [PID:4127210] [RANK:0] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation
0: The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
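The warning above points at `TRANSFORMERS_VERBOSITY` for more detail. A minimal sketch of raising the library's log level; note the environment variable only takes effect if set before `transformers` is imported (the equivalent runtime call is `transformers.utils.logging.set_verbosity_info()`):

```python
import os

# Raise Transformers logging to "info" to see the details behind the
# "generation flags are not valid" warning. Must be set before importing
# transformers; valid levels: debug, info, warning, error, critical.
os.environ["TRANSFORMERS_VERBOSITY"] = "info"
```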
0: [2025-11-24 00:07:22,370] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:4127210] [RANK:0] Converting modules to torch.bfloat16
0: [2025-11-24 00:08:20,774] [INFO] [axolotl.train.save_initial_configs:416] [PID:4127210] [RANK:0] Pre-saving tokenizer to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-1b/0...
0: [2025-11-24 00:08:21,511] [INFO] [axolotl.train.save_initial_configs:419] [PID:4127210] [RANK:0] Pre-saving model config to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-1b/0...
0: [2025-11-24 00:08:21,526] [INFO] [axolotl.train.execute_training:203] [PID:4127210] [RANK:0] Starting trainer...
0: [2025-11-24 00:09:55,377] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:436] [PID:4127210] [RANK:0] gather_len_batches: [18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976, 18976]
0: Parameter Offload - Persistent parameters statistics: param_count = 157, numel = 134272
2: It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
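The warning above recommends reloading Gemma3 checkpoints with eager attention. A minimal sketch of the call it suggests; the helper name and the lazy import are our additions, and the checkpoint path is a placeholder:

```python
def load_gemma3_eager(checkpoint_path: str):
    """Load a Gemma3 causal-LM checkpoint with the `eager` attention
    implementation, as the training warning recommends instead of
    `flash_attention_2`."""
    # Imported lazily so this sketch stays importable without model weights.
    from transformers import AutoModelForCausalLM

    return AutoModelForCausalLM.from_pretrained(
        checkpoint_path,
        attn_implementation="eager",  # instead of "flash_attention_2"
    )
```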
0: {'loss': 1.1018, 'grad_norm': 1.7590831241890867, 'learning_rate': 3.62e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.01}
0:   3%|β–Ž | 18/711 [03:39<13:19, 1.15s/it]
0: {'loss': 0.9902, 'grad_norm': 1.160769879445861, 'learning_rate': 5.420000000000001e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.02}
0: {'loss': 0.9244, 'grad_norm': 1.058149478414618, 'learning_rate': 7.22e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.03}
0:   5%|▍ | 34/711 [03:55<11:26, 1.01s/it]
0: {'loss': 0.8932, 'grad_norm': 0.9348341250213281, 'learning_rate': 9.020000000000002e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.03}
0: {'loss': 0.8577, 'grad_norm': 0.794975085250345, 'learning_rate': 1.0820000000000001e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.04}
0:   7%|β–‹ | 50/711 [04:12<11:11, 1.02s/it]
0: {'loss': 0.8286, 'grad_norm': 0.8899530497730146, 'learning_rate': 1.2620000000000001e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.05}
0:  10%|β–‰ | 68/711 [04:30<10:51, 1.01s/it]
0: {'loss': 0.8345, 'grad_norm': 0.9574271371234939, 'learning_rate': 1.4420000000000001e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.06}
0: {'loss': 0.8208, 'grad_norm': 0.9418691707757363, 'learning_rate': 1.6220000000000004e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.07}
0:  12%|β–ˆβ– | 84/711 [04:46<10:40, 1.02s/it]
0: {'loss': 0.7899, 'grad_norm': 0.9662815954776516, 'learning_rate': 1.802e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.08}
0: {'loss': 0.7661, 'grad_norm': 1.2089037525815847, 'learning_rate': 1.982e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.08}
0:  14%|β–ˆβ– | 99/711 [05:02<10:24, 1.02s/it]
0: {'loss': 0.8007, 'grad_norm': 0.8960002301387379, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.09}
0:  16%|β–ˆβ–‹ | 116/711 [05:19<10:04, 1.02s/it]
0: {'loss': 0.7871, 'grad_norm': 1.0229518399908935, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.1}
0: {'loss': 0.763, 'grad_norm': 1.0001793363102398, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.11}
0:  18%|β–ˆβ–Š | 131/711 [05:34<09:48, 1.02s/it]
0: {'loss': 0.767, 'grad_norm': 0.9082590810280519, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.12}
0:  21%|β–ˆβ–ˆ | 148/711 [05:51<09:31, 1.02s/it]
0: {'loss': 0.7652, 'grad_norm': 0.9284057369823192, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.13}
0: {'loss': 0.7668, 'grad_norm': 0.8767250768222354, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.13}
0:  23%|β–ˆβ–ˆβ–Ž | 163/711 [06:07<09:17, 1.02s/it]
0: {'loss': 0.7515, 'grad_norm': 0.9756062058818691, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.14}
0:  25%|β–ˆβ–ˆβ–Œ | 179/711 [06:23<09:02, 1.02s/it]
0: {'loss': 0.7428, 'grad_norm': 0.9528470394392502, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.15}
0: {'loss': 0.7358, 'grad_norm': 0.9445388231828549, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.16}
0:  27%|██▋       | 194/711 [06:38<08:45, 1.02s/it]
0: {'loss': 0.7419, 'grad_norm': 0.9093993578822128, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.17}
0:  30%|██▉       | 210/711 [06:55<08:28, 1.01s/it]
0: {'loss': 0.7289, 'grad_norm': 0.9027102829983507, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.18}
0: {'loss': 0.7468, 'grad_norm': 0.8749606638751157, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.19}
0:  32%|███▏      | 225/711 [07:10<08:14, 1.02s/it]
0: {'loss': 0.7306, 'grad_norm': 0.9160800008404012, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.19}
0:  34%|███▍      | 240/711 [07:25<07:56, 1.01s/it]
0: {'loss': 0.7361, 'grad_norm': 0.8938800949879834, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.2}
0: {'loss': 0.7165, 'grad_norm': 0.8865745843602661, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.21}
0:  36%|███▌      | 254/711 [07:39<07:42, 1.01s/it]
0: {'loss': 0.7099, 'grad_norm': 0.8798367435457611, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.22}
0:  38%|███▊      | 270/711 [07:55<07:27, 1.01s/it]
0: {'loss': 0.7113, 'grad_norm': 0.9763568156539483, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.23}
0: {'loss': 0.7247, 'grad_norm': 0.9774181687381295, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.24}
0:  40%|███▉      | 284/711 [08:10<07:16, 1.02s/it]
0: {'loss': 0.6936, 'grad_norm': 0.8964349971328741, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.24}
0:  42%|████▏     | 299/711 [08:25<06:59, 1.02s/it]
0: {'loss': 0.707, 'grad_norm': 0.9596158535601407, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.25}
0: {'loss': 0.7079, 'grad_norm': 0.8256624207228515, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.26}
0:  44%|████▍     | 313/711 [08:39<06:47, 1.02s/it]
0: {'loss': 0.7113, 'grad_norm': 0.8590726758549844, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.27}
0:  46%|████▌     | 328/711 [08:55<06:29, 1.02s/it]
0: {'loss': 0.7097, 'grad_norm': 0.9011059611829694, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.28}
0: {'loss': 0.695, 'grad_norm': 0.8452924322256501, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.29}
0:  48%|████▊     | 341/711 [09:08<06:15, 1.01s/it]
0: {'loss': 0.7032, 'grad_norm': 0.8466692851184044, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.3}
0:  50%|█████     | 357/711 [09:24<05:58, 1.01s/it]
0: {'loss': 0.7127, 'grad_norm': 0.8028586105919348, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.3}
0: {'loss': 0.7049, 'grad_norm': 0.8418301197643927, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.31}
0:  52%|█████▏    | 370/711 [09:37<05:45, 1.01s/it]
0: {'loss': 0.7056, 'grad_norm': 0.8018926220188637, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.32}
0:  54%|█████▍    | 384/711 [09:52<05:32, 1.02s/it]
0: {'loss': 0.6974, 'grad_norm': 0.8565445228240428, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.33}
0:  56%|█████▌    | 399/711 [10:07<05:15, 1.01s/it]
0: {'loss': 0.6793, 'grad_norm': 0.9500235419290328, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.34}
0: {'loss': 0.6814, 'grad_norm': 0.8451661040419431, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.35}
0:  58%|█████▊    | 412/711 [10:20<05:05, 1.02s/it]
0: {'loss': 0.6906, 'grad_norm': 0.8679849121193738, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.35}
0:  60%|██████    | 427/711 [10:35<04:46, 1.01s/it]
0: {'loss': 0.6849, 'grad_norm': 0.8279829459119256, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.36}
0: {'loss': 0.6902, 'grad_norm': 0.870874120336189, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.37}
0:  62%|██████▏   | 440/711 [10:49<04:34, 1.01s/it]
0: {'loss': 0.681, 'grad_norm': 0.8159001929242623, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.38}
0:  64%|██████▍   | 454/711 [11:03<04:20, 1.02s/it]
0: {'loss': 0.6751, 'grad_norm': 0.9324021579543116, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.39}
0:  66%|██████▌   | 468/711 [11:17<04:05, 1.01s/it]
0: {'loss': 0.681, 'grad_norm': 1.1760742860535562, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.4}
0: {'loss': 0.6857, 'grad_norm': 0.7580929709516384, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.4}
0:  68%|██████▊   | 480/711 [11:29<03:54, 1.01s/it]
0: {'loss': 0.6551, 'grad_norm': 0.7688484391693519, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.41}
0:  69%|██████▉   | 494/711 [11:43<03:41, 1.02s/it]
0: {'loss': 0.665, 'grad_norm': 0.7880196713924965, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.42}
0:  71%|███████▏  | 508/711 [11:58<03:25, 1.01s/it]
0: {'loss': 0.6839, 'grad_norm': 0.815597267678134, 'learning_rate': 2e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.43}
0: {'loss': 0.6696, 'grad_norm': 0.8299691960041005, 'learning_rate': 1.9929032311830303e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.44}
0:  73%|███████▎  | 520/711 [12:10<03:14, 1.02s/it]
0: {'loss': 0.672, 'grad_norm': 0.8102022933620939, 'learning_rate': 1.9642643171092488e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.45}
0:  75%|███████▌  | 534/711 [12:24<02:59, 1.01s/it]
0: {'loss': 0.6854, 'grad_norm': 0.8052276313272076, 'learning_rate': 1.9143443472194178e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.46}
0:  77%|███████▋  | 548/711 [12:38<02:45, 1.02s/it]
0: {'loss': 0.6813, 'grad_norm': 0.8340810817094527, 'learning_rate': 1.8443725168471054e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.46}
0: {'loss': 0.6641, 'grad_norm': 0.85639870414892, 'learning_rate': 1.7560717646792704e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.47}
0:  79%|███████▉  | 560/711 [12:50<02:33, 1.01s/it]
0: {'loss': 0.6813, 'grad_norm': 0.7779683047257264, 'learning_rate': 1.651616348287679e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.48}
0:  81%|████████  | 574/711 [13:05<02:18, 1.01s/it]
0: {'loss': 0.6657, 'grad_norm': 0.8763000605487214, 'learning_rate': 1.5335783066915437e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.49}
0:  83%|████████▎ | 587/711 [13:18<02:06, 1.02s/it]
0: {'loss': 0.6539, 'grad_norm': 0.7886914559405968, 'learning_rate': 1.4048641282207624e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.5}
0: {'loss': 0.6791, 'grad_norm': 0.7967066194263275, 'learning_rate': 1.2686431831271523e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.58, 'epoch': 0.51}
0:  84%|████████▍ | 600/711 [13:31<01:55, 1.04s/it]
0: {'loss': 0.6489, 'grad_norm': 0.8076470421353021, 'learning_rate': 1.1282696831703156e-05, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.79, 'epoch': 0.51}
0:  86%|████████▌ | 612/711 [13:44<01:41, 1.03s/it]
0: {'loss': 0.6488, 'grad_norm': 0.8168905882802561, 'learning_rate': 9.872000897921262e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.79, 'epoch': 0.52}
0:  88%|████████▊ | 625/711 [13:57<01:27, 1.01s/it]
0: {'loss': 0.6461, 'grad_norm': 0.8173208558740509, 'learning_rate': 8.489080045646938e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.79, 'epoch': 0.53}
0:  90%|████████▉ | 639/711 [14:11<01:13, 1.02s/it]
0: {'loss': 0.653, 'grad_norm': 0.7392727758714567, 'learning_rate': 7.167986375914347e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.79, 'epoch': 0.54}
0: {'loss': 0.6542, 'grad_norm': 0.8009550021670708, 'learning_rate': 5.941249599330827e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.79, 'epoch': 0.55}
0:  91%|█████████▏| 650/711 [14:22<01:02, 1.02s/it]
0: {'loss': 0.6595, 'grad_norm': 0.7139199193006327, 'learning_rate': 4.839076046641802e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.79, 'epoch': 0.56}
0: β–ˆβ–| 651/711 [14:23<01:01, 1.02s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 652/711 [14:24<00:59, 1.02s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 653/711 [14:25<00:59, 1.02s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 654/711 [14:26<00:57, 1.02s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 655/711 [14:27<00:56, 1.01s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 656/711 [14:28<00:56, 1.02s/it] 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 657/711 [14:29<00:55, 1.02s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 658/711 [14:30<00:53, 1.02s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 659/711 [14:31<00:52, 1.02s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 660/711 [14:32<00:51, 1.01s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 660/711 [14:32<00:51, 1.01s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 661/711 [14:33<00:50, 1.02s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 662/711 [14:34<00:49, 1.02s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž| 663/711 [14:35<00:48, 1.01s/it] 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆοΏ½
0: {'loss': 0.6349, 'grad_norm': 0.6774414042634137, 'learning_rate': 3.888604888618787e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.79, 'epoch': 0.56}
0:  95%|█████████▌| 676/711 [14:49<00:35, 1.01s/it]
0: {'loss': 0.6585, 'grad_norm': 0.6957704121795029, 'learning_rate': 3.11323987960523e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.79, 'epoch': 0.57}
0:  97%|█████████▋| 689/711 [15:02<00:22, 1.01s/it]
0: {'loss': 0.6588, 'grad_norm': 0.7024004831662435, 'learning_rate': 2.532073079411971e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.79, 'epoch': 0.58}
0: {'loss': 0.6492, 'grad_norm': 1.1172218022043618, 'learning_rate': 2.159414743441803e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.79, 'epoch': 0.59}
0:  98%|█████████▊| 700/711 [15:13<00:11, 1.02s/it]
0: {'loss': 0.6439, 'grad_norm': 0.7226102601600088, 'learning_rate': 2.0044409567084157e-06, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.79, 'epoch': 0.6}
0: [2025-11-24 00:25:23,227] [INFO] [axolotl.core.trainers.base._save:613] [PID:4127210] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-1b/0/checkpoint-711
0: [2025-11-24 00:25:25,011] [INFO] [axolotl.core.trainers.base._save:662] [PID:4127210] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
0: {'train_runtime': 927.6736, 'train_samples_per_second': 12.263, 'train_steps_per_second': 0.766, 'train_loss': 0.7229751322507523, 'memory/max_mem_active(gib)': 52.06, 'memory/max_mem_allocated(gib)': 52.06, 'memory/device_mem_reserved(gib)': 60.79, 'epoch': 0.6}
0: 100%|██████████| 711/711 [15:27<00:00, 1.30s/it]
0: [2025-11-24 00:25:28,197] [INFO] [axolotl.train.save_trained_model:228] [PID:4127210] [RANK:0] Training completed! Saving trained model to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-1b/0.
0: [2025-11-24 00:25:29,151] [INFO] [axolotl.core.trainers.base._save:613] [PID:4127210] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-1b/0
0: [2025-11-24 00:25:31,046] [INFO] [axolotl.core.trainers.base._save:662] [PID:4127210] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
0: [2025-11-24 00:25:31,413] [INFO] [axolotl.train.save_trained_model:350] [PID:4127210] [RANK:0] Model successfully saved to /lustre/fswork/projects/rech/dgo/udv55np/ift/Nemotron-Super-49B-v1_5/gemma-3-1b/0