[2026-06-14 14:08:51,250] [DEBUG] [axolotl.utils.config.resolve_dtype:74] [PID:3393] bf16 support detected, enabling for this configuration. [2026-06-14 14:08:51,433] [WARNING] [axolotl.utils.config.normalize_config:281] [PID:3393] Gemma4 requires use_reentrant=False for gradient checkpointing in distributed training. Setting use_reentrant=False. [2026-06-14 14:08:51,433] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:3393] baseline 0.000GB () [2026-06-14 14:08:51,434] [INFO] [axolotl.cli.config.load_cfg:333] [PID:3393] config: { "activation_offloading": true, "adapter": "lora", "attn_implementation": "sdpa", "attn_needs_dtype_cast": false, "attn_supports_packing": false, "attn_uses_flash_lib": false, "axolotl_config_path": "./config.yaml", "base_model": "google/gemma-4-E4B-it", "base_model_config": "google/gemma-4-E4B-it", "batch_size": 16, "bf16": true, "capabilities": { "bf16": true, "compute_capability": "sm_80", "fp8": false, "n_gpu": 1, "n_node": 1, "tf32": true }, "chat_template": "jinja", "chat_template_jinja": "./jinja", "context_parallel_size": 1, "cut_cross_entropy": true, "dataloader_num_workers": 1, "dataloader_pin_memory": true, "dataloader_prefetch_factor": 256, "dataset_num_proc": 31, "dataset_prepared_path": "./dataset-e4b", "datasets": [ { "chat_template": "tokenizer_default", "field_messages": "messages", "field_tools": "tools", "message_property_mappings": { "content": "content", "role": "role" }, "path": "jacob-ml/Jacob-2-SSFT-filtered", "split": "train", "trust_remote_code": false, "type": "chat_template" } ], "ddp": false, "device": "cuda:0", "dion_rank_fraction": 1.0, "dion_rank_multiple_of": 1, "eaft_alpha": 1.0, "eaft_k": 20, "env_capabilities": { "torch_version": "2.10.0" }, "eval_batch_size": 2, "eval_causal_lm_metrics": [ "sacrebleu", "comet", "ter", "chrf" ], "eval_max_new_tokens": 128, "eval_table_size": 0, "experimental_skip_move_to_device": true, "fp16": false, "freeze_mm_modules": true, "generate_samples": false, "generation_do_sample": true, "generation_max_new_tokens": 50, "generation_prompt_ratio": 0.5, "generation_temperature": 0.7, "gradient_accumulation_steps": 8, "gradient_checkpointing": true, "gradient_checkpointing_kwargs": { "use_reentrant": false }, "hub_model_id": "jacob-ml/Jacob-2-E4B", "include_tkps": true, "is_multimodal": true, "layer_offloading": true, "learning_rate": 0.0002, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, "load_in_4bit": false, "load_in_8bit": true, "local_rank": 0, "logging_steps": 1, "lora_alpha": 16, "lora_dropout": 0.0, "lora_r": 16, "lora_target_modules": "model.language_model.layers.[\\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj", "loraplus_lr_embedding": 1e-06, "lr_scheduler": "cosine", "mean_resizing_embeddings": false, "merge_method": "memory_efficient", "micro_batch_size": 2, "model_config_type": "gemma4", "model_config_type_text": "gemma4_text", "num_epochs": 1.0, "num_generation_samples": 3, "optimizer": "adamw_torch_8bit", "otel_metrics_host": "localhost", "otel_metrics_port": 8000, "output_dir": "./outputs/Jacob-2-E4B", "pad_to_sequence_len": false, "plugins": [ "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin" ], "pretrain_multipack_attn": true, "processor_config": "google/gemma-4-E4B-it", "profiler_steps_start": 0, "qgalore_cos_threshold": 0.4, "qgalore_gamma_proj": 2, "qgalore_proj_bits": 4, "qgalore_proj_group_size": 256, "qgalore_proj_quant": true, "qgalore_proj_type": "std", "qgalore_queue_size": 5, "qgalore_rank": 256, "qgalore_scale": 0.25, "qgalore_update_proj_gap": 200, "qlora_sharded_model_loading": false, "quantize_moe_experts": false, "ray_num_workers": 1, "relora_prune_method": "magnitude", "resources_per_worker": { "GPU": 1 }, "sample_packing": false, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, "save_safetensors": true, "sequence_len": 8192, "shuffle_before_merging_datasets": false, "shuffle_merged_datasets": true, "skip_prepare_dataset": false, "streaming_multipack_buffer_size": 10000, "strict": false, "tensor_parallel_size": 1, "tf32": false, "tiled_mlp_use_original_mlp": true, "tokenizer_config": "google/gemma-4-E4B-it", "tokenizer_save_jinja_files": true, "torch_dtype": "torch.bfloat16", "train_on_inputs": false, "trl": { "async_prefetch": false, "log_completions": false, "mask_truncated_completions": false, "ref_model_mixup_alpha": 0.9, "ref_model_sync_steps": 64, "replay_buffer_size": 0, "replay_recompute_logps": true, "reroll_max_groups": 1, "reroll_start_fraction": 1.0, "reward_num_workers": 1, "scale_rewards": true, "skip_zero_advantage_batches": true, "sync_ref_model": false, "use_data_producer": false, "use_vllm": false, "vllm_lora_sync": false, "vllm_server_host": "0.0.0.0", "vllm_server_port": 8000 }, "use_otel_metrics": false, "use_ray": false, "val_set_size": 0.0, "vllm": { "device": "auto", "dtype": "auto", "gpu_memory_utilization": 0.9, "host": "0.0.0.0", "port": 8000 }, "warmup_ratio": 0.1, "weight_decay": 0.0, "world_size": 1 } [2026-06-14 14:08:55,226] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:311] [PID:3393] EOS: 1 / [2026-06-14 14:08:55,226] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:3393] BOS: 2 / [2026-06-14 14:08:55,226] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:3393] PAD: 0 / [2026-06-14 14:08:55,226] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:3393] UNK: 3 / [2026-06-14 14:08:55,227] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:482] [PID:3393] Unable to find prepared dataset in dataset-e4b/226f5539ba5a2355ba6a34bd68b2a326 [2026-06-14 14:08:55,228] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:3393] Loading raw datasets... [2026-06-14 14:08:55,228] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:3393] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. Downloading (incomplete total...): 0.00B [00:00, ?B/s] Fetching 0 files: 0it [00:00, ?it/s] Fetching 0 files: 0it [00:00, ?it/s] Download complete: : 0.00B [00:00, ?B/s] Download complete: : 0.00B [00:00, ?B/s] [2026-06-14 14:08:56,312] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:3393] Loading dataset: jacob-ml/Jacob-2-SSFT-filtered with base_type: chat_template and prompt_style: None [2026-06-14 14:08:56,315] [INFO] [axolotl.prompt_strategies.chat_template.__call__:1209] [PID:3393] Using chat template: --- {%- macro format_parameters(properties, required, filter_keys=false) -%} {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} {%- set ns = namespace(found_first=false) -%} {%- for key, value in properties | dictsort -%} {%- set add_comma = false -%} {%- if not filter_keys or key not in standard_keys -%} {%- if ns.found_first %},{% endif -%} {%- set ns.found_first = true -%} {{ key }}:{ {%- if value['description'] -%} description:<|"|>{{ value['description'] }}<|"|> {%- set add_comma = true -%} {%- endif -%} {%- if value['type'] | upper == 'STRING' -%} {%- if value['enum'] -%} {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} enum:{{ format_argument(value['enum']) }} {%- endif -%} {%- elif value['type'] | upper == 'ARRAY' -%} {%- if value['items'] is mapping and value['items'] -%} {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} items:{ {%- set ns_items = namespace(found_first=false) -%} {%- for item_key, item_value in value['items'] | dictsort -%} {%- if item_value is not none -%} {%- if ns_items.found_first %},{% endif -%} {%- set ns_items.found_first = true -%} {%- if item_key == 'properties' -%} properties:{ {%- if item_value is mapping -%} {{- format_parameters(item_value, value['items']['required'] | default([])) -}} {%- endif -%} } {%- elif item_key == 'required' -%} required:[ {%- for req_item in item_value -%} <|"|>{{- req_item -}}<|"|> {%- if not loop.last %},{% endif -%} {%- endfor -%} ] {%- elif item_key == 'type' -%} {%- if item_value is string -%} type:{{ format_argument(item_value | upper) }} {%- else -%} type:{{ format_argument(item_value | map('upper') | list) }} {%- endif -%} {%- else -%} {{ item_key }}:{{ format_argument(item_value) }} {%- endif -%} {%- endif -%} {%- endfor -%} } {%- endif -%} {%- endif -%} {%- if value['nullable'] %} {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} nullable:true {%- endif -%} {%- if value['type'] | upper == 'OBJECT' -%} {%- if value['properties'] is defined and value['properties'] is mapping -%} {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} properties:{ {{- format_parameters(value['properties'], value['required'] | default([])) -}} } {%- elif value is mapping -%} {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} properties:{ {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} } {%- endif -%} {%- if value['required'] -%} {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} required:[ {%- for item in value['required'] | default([]) -%} <|"|>{{- item -}}<|"|> {%- if not loop.last %},{% endif -%} {%- endfor -%} ] {%- endif -%} {%- endif -%} {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} type:<|"|>{{ value['type'] | upper }}<|"|>} {%- endif -%} {%- endfor -%} {%- endmacro -%} {%- macro format_function_declaration(tool_data) -%} declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> {%- set params = tool_data['function']['parameters'] -%} {%- if params -%} ,parameters:{ {%- if params['properties'] -%} properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, {%- endif -%} {%- if params['required'] -%} required:[ {%- for item in params['required'] -%} <|"|>{{- item -}}<|"|> {{- ',' if not loop.last -}} {%- endfor -%} ], {%- endif -%} {%- if params['type'] -%} type:<|"|>{{- params['type'] | upper -}}<|"|>} {%- endif -%} {%- endif -%} {%- if 'response' in tool_data['function'] -%} {%- set response_declaration = tool_data['function']['response'] -%} ,response:{ {%- if response_declaration['description'] -%} description:<|"|>{{- response_declaration['description'] -}}<|"|>, {%- endif -%} {%- if response_declaration['type'] | upper == 'OBJECT' -%} type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} {%- endif -%} {%- endif -%} } {%- endmacro -%} {%- macro format_argument(argument, escape_keys=True) -%} {%- if argument is string -%} {{- '<|"|>' + argument + '<|"|>' -}} {%- elif argument is boolean -%} {{- 'true' if argument else 'false' -}} {%- elif argument is mapping -%} {{- '{' -}} {%- set ns = namespace(found_first=false) -%} {%- for key, value in argument | dictsort -%} {%- if ns.found_first %},{% endif -%} {%- set ns.found_first = true -%} {%- if escape_keys -%} {{- '<|"|>' + key + '<|"|>' -}} {%- else -%} {{- key -}} {%- endif -%} :{{- format_argument(value, escape_keys=escape_keys) -}} {%- endfor -%} {{- '}' -}} {%- elif argument is sequence -%} {{- '[' -}} {%- for item in argument -%} {{- format_argument(item, escape_keys=escape_keys) -}} {%- if not loop.last %},{% endif -%} {%- endfor -%} {{- ']' -}} {%- else -%} {{- argument -}} {%- endif -%} {%- endmacro -%} {%- macro strip_thinking(text) -%} {%- set ns = namespace(result='') -%} {%- for part in text.split('') -%} {%- if '<|channel>' in part -%} {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} {%- else -%} {%- set ns.result = ns.result + part -%} {%- endif -%} {%- endfor -%} {{- ns.result | trim -}} {%- endmacro -%} {%- macro format_tool_response_block(tool_name, response) -%} {{- '<|tool_response>' -}} {%- if response is mapping -%} {{- 'response:' + tool_name + '{' -}} {%- for key, value in response | dictsort -%} {{- key -}}:{{- format_argument(value, escape_keys=False) -}} {%- if not loop.last %},{% endif -%} {%- endfor -%} {{- '}' -}} {%- else -%} {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} {%- endif -%} {{- '' -}} {%- endmacro -%} {%- set ns = namespace(prev_message_type=None) -%} {%- set loop_messages = messages -%} {{- bos_token -}} {#- Handle System/Tool Definitions Block -#} {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} {{- '<|turn>system\n' -}} {#- Inject Thinking token at the very top of the FIRST system turn -#} {%- if enable_thinking is defined and enable_thinking -%} {{- '<|think|>\n' -}} {%- set ns.prev_message_type = 'think' -%} {%- endif -%} {%- if messages[0]['role'] in ['system', 'developer'] -%} {%- if messages[0]['content'] is string -%} {{- messages[0]['content'] | trim -}} {%- elif messages[0]['content'] is sequence -%} {%- for item in messages[0]['content'] -%} {{- item['text'] | trim + ' '-}} {%- endfor -%} {%- endif -%} {%- set loop_messages = messages[1:] -%} {%- endif -%} {%- if tools -%} {%- for tool in tools %} {{- '<|tool>' -}} {{- format_function_declaration(tool) | trim -}} {{- '' -}} {%- endfor %} {%- set ns.prev_message_type = 'tool' -%} {%- endif -%} {{- '\n' -}} {%- endif %} {#- Pre-scan: find last user message index for reasoning guard -#} {%- set ns_turn = namespace(last_user_idx=-1) -%} {%- for i in range(loop_messages | length) -%} {%- if loop_messages[i]['role'] == 'user' -%} {%- set ns_turn.last_user_idx = i -%} {%- endif -%} {%- endfor -%} {#- Loop through messages -#} {%- for message in loop_messages -%} {%- if message['role'] != 'tool' -%} {%- set ns.prev_message_type = None -%} {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} {%- if message['role'] == 'tool' and 'name' in message -%} {%- set _tool_name = message['name'] -%} {%- endif -%} {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} {%- set prev_nt = namespace(role=None, found=false) -%} {%- if loop.index0 > 0 -%} {%- for j in range(loop.index0 - 1, -1, -1) -%} {%- if not prev_nt.found -%} {%- if loop_messages[j]['role'] != 'tool' -%} {%- set prev_nt.role = loop_messages[j]['role'] -%} {%- set prev_nt.found = true -%} {%- endif -%} {%- endif -%} {%- endfor -%} {%- endif -%} {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} {%- if not continue_same_model_turn -%} {{- '<|turn>' + role + '\n' }} {%- endif -%} {#- Render reasoning/reasoning_content as thinking channel -#} {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} {{- '<|channel>thought\n' + thinking_text + '\n' -}} {%- endif -%} {%- if message['tool_calls'] -%} {%- for tool_call in message['tool_calls'] -%} {%- set function = tool_call['function'] -%} {{- '<|tool_call>call:' + function['name'] + '{' -}} {%- if function['arguments'] is mapping -%} {%- set ns_args = namespace(found_first=false) -%} {%- for key, value in function['arguments'] | dictsort -%} {%- if ns_args.found_first %},{% endif -%} {%- set ns_args.found_first = true -%} {{- key -}}:{{- format_argument(value, escape_keys=False) -}} {%- endfor -%} {%- elif function['arguments'] is string -%} {{- function['arguments'] -}} {%- endif -%} {{- '}' -}} {%- endfor -%} {%- set ns.prev_message_type = 'tool_call' -%} {%- endif -%} {%- set ns_tr_out = namespace(flag=false) -%} {%- if message.get('tool_responses') -%} {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} {%- for tool_response in message['tool_responses'] -%} {{- format_tool_response_block(tool_response['name'] | default('unknown', true), tool_response['response']) -}} {%- set ns_tr_out.flag = true -%} {%- set ns.prev_message_type = 'tool_response' -%} {%- endfor -%} {%- elif message.get('tool_calls') -%} {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} {%- set ns_tool_scan = namespace(stopped=false) -%} {%- for k in range(loop.index0 + 1, loop_messages | length) -%} {%- if ns_tool_scan.stopped -%} {%- elif loop_messages[k]['role'] != 'tool' -%} {%- set ns_tool_scan.stopped = true -%} {%- else -%} {%- set follow = loop_messages[k] -%} {#- Resolve tool_call_id to function name -#} {%- set ns_tname = namespace(name=follow['name'] | default('unknown', true)) -%} {%- for tc in message['tool_calls'] -%} {%- if tc.get('id') == follow.get('tool_call_id') -%} {%- set ns_tname.name = tc['function']['name'] -%} {%- endif -%} {%- endfor -%} {#- Handle content as string or content-parts array -#} {%- set tool_body = follow.get('content') -%} {%- if tool_body is string -%} {{- format_tool_response_block(ns_tname.name, tool_body) -}} {%- elif tool_body is sequence and tool_body is not string -%} {%- set ns_txt = namespace(s='') -%} {%- for part in tool_body -%} {%- if part.get('type') == 'text' -%} {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} {%- endif -%} {%- endfor -%} {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} {%- for part in tool_body -%} {%- if part.get('type') == 'image' -%} {{- '<|image|>' -}} {%- elif part.get('type') == 'audio' -%} {{- '<|audio|>' -}} {%- elif part.get('type') == 'video' -%} {{- '<|video|>' -}} {%- endif -%} {%- endfor -%} {%- else -%} {{- format_tool_response_block(ns_tname.name, tool_body) -}} {%- endif -%} {%- set ns_tr_out.flag = true -%} {%- set ns.prev_message_type = 'tool_response' -%} {%- endif -%} {%- endfor -%} {%- endif -%} {%- set captured_content -%} {%- if message['content'] is string -%} {%- if role == 'model' -%} {{- strip_thinking(message['content']) -}} {%- else -%} {{- message['content'] | trim -}} {%- endif -%} {%- elif message['content'] is sequence -%} {%- for item in message['content'] -%} {%- if item['type'] == 'text' -%} {%- if role == 'model' -%} {{- strip_thinking(item['text']) -}} {%- else -%} {{- item['text'] | trim -}} {%- endif -%} {%- elif item['type'] == 'image' -%} {{- '<|image|>' -}} {%- set ns.prev_message_type = 'image' -%} {%- elif item['type'] == 'audio' -%} {{- '<|audio|>' -}} {%- set ns.prev_message_type = 'audio' -%} {%- elif item['type'] == 'video' -%} {{- '<|video|>' -}} {%- set ns.prev_message_type = 'video' -%} {%- endif -%} {%- endfor -%} {%- endif -%} {%- endset -%} {{- captured_content -}} {%- set has_content = captured_content | trim | length > 0 -%} {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} {{- '<|tool_response>' -}} {%- elif not (ns_tr_out.flag and not has_content) -%} {{- '\n' -}} {%- endif -%} {%- endif -%} {%- endfor -%} {%- if add_generation_prompt -%} {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} {{- '<|turn>model\n' -}} {%- endif -%} {%- endif -%} --- [2026-06-14 14:08:56,434] [WARNING] [axolotl.prompt_strategies.chat_template._validate_eot_and_eos_tokens:357] [PID:3393] EOS token '' not found in chat_template. Please check if your template/EOS token is correct. Tokenizing Prompts (num_proc=31): 0%| | 0/4209 [00:008192) (num_proc=31): 0%| | 0/4209 [00:008192) (num_proc=31): 3%|██▋ | 136/4209 [00:01<00:58, 69.73 examples/s] Dropping Invalid Sequences (8192) (num_proc=31): 36%|█████████████████████████████▏ | 1496/4209 [00:02<00:02, 992.91 examples/s] Dropping Invalid Sequences (8192) (num_proc=31): 55%|████████████████████████████████████████████▍ | 2311/4209 [00:02<00:01, 1592.28 examples/s] Dropping Invalid Sequences (8192) (num_proc=31): 74%|████████████████████████████████████████████████████████████▏ | 3126/4209 [00:02<00:00, 2289.56 examples/s] Dropping Invalid Sequences (8192) (num_proc=31): 100%|█████████████████████████████████████████████████████████████████████████████████| 4209/4209 [00:02<00:00, 1595.10 examples/s] [2026-06-14 14:11:58,919] [INFO] [axolotl.utils.data.utils._drop_outside_range:306] [PID:3393] Dropped 15 sequences outside valid range ([None, 8192]) Saving the dataset (0/16 shards): 0%| | 0/4194 [00:00 [2026-06-14 14:12:19,460] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:312] [PID:3393] BOS: 2 / [2026-06-14 14:12:19,460] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:313] [PID:3393] PAD: 0 / [2026-06-14 14:12:19,460] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:314] [PID:3393] UNK: 3 / [2026-06-14 14:12:24,886] [DEBUG] [axolotl.train.setup_model_and_tokenizer:81] [PID:3393] Loading model [2026-06-14 14:12:24,930] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:75] [PID:3393] Patched OptimState8bit for torch.compile compatibility [2026-06-14 14:12:24,930] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:122] [PID:3393] Patched OptimState4bit for torch.compile compatibility [2026-06-14 14:12:24,930] [DEBUG] [axolotl.monkeypatch.torchao_optim.patch_torchao_optim_state_8bit:154] [PID:3393] Patched OptimStateFp8 for torch.compile compatibility [2026-06-14 14:12:24,936] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:94] [PID:3393] Patched Trainer.evaluation_loop with nanmean loss calculation [2026-06-14 14:12:24,937] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:148] [PID:3393] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation [2026-06-14 14:12:25,040] [INFO] [axolotl.monkeypatch.models.gemma4.fused_attn.patch_gemma4_fused_attn:207] [PID:3393] Patched Gemma4TextAttention.forward with fused RMSNorm+RoPE Triton kernels [2026-06-14 14:12:25,040] [INFO] [axolotl.monkeypatch.models.gemma4.fused_attn.patch_gemma4_fused_attn:211] [PID:3393] Installed Gemma4 shared_kv_states side channel (PR #3611) [2026-06-14 14:12:25,062] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:3393] Applying Cut Cross Entropy to model type: gemma4 Loading weights: 0%| | 0/2076 [00:00