Linksome commited on 20 days ago

Commit

b629b51

verified ·

1 Parent(s): 8dc2b7d

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +60 -0
LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/config.yaml +723 -0
LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/output.log +423 -0
LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-metadata.json +41 -0
LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-summary.json +1 -0
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/output.log +299 -0
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/requirements.txt +257 -0
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/wandb-metadata.json +41 -0
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug-internal.log +6 -0
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug.log +23 -0
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/config.yaml +723 -0
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/output.log +191 -0
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/requirements.txt +257 -0
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-metadata.json +41 -0
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-summary.json +1 -0
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log +11 -0
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log +25 -0
LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/config.yaml +723 -0
LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/requirements.txt +257 -0
LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-metadata.json +41 -0
LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-summary.json +1 -0
LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log +14 -0
LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log +25 -0
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/config.yaml +723 -0
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/requirements.txt +257 -0
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-metadata.json +41 -0
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-summary.json +1 -0
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log +14 -0
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log +25 -0
LlamaFactory/wandb/run-20260204_090320-aseg728n/files/config.yaml +723 -0
LlamaFactory/wandb/run-20260204_090320-aseg728n/files/requirements.txt +257 -0
LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-metadata.json +41 -0
LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-summary.json +1 -0
LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log +13 -0
LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log +25 -0
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/config.yaml +723 -0
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/requirements.txt +257 -0
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-metadata.json +41 -0
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-summary.json +1 -0
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log +12 -0
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log +25 -0
LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/output.log +0 -0
LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/requirements.txt +257 -0
LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/wandb-metadata.json +41 -0
LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log +6 -0
LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log +23 -0
LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/config.yaml +723 -0
LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/requirements.txt +257 -0
LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-metadata.json +41 -0
LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-summary.json +1 -0

.gitattributes CHANGED Viewed

@@ -210,3 +210,63 @@ v127rc_exp2/B_mul/checkpoint-9500/tokenizer.json filter=lfs diff=lfs merge=lfs -
 v127rc_exp2/B_mul/checkpoint-9400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 v127rc_exp2/B_mul/checkpoint-9300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 v127rc_exp2/B_mul/checkpoint-9200/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 v127rc_exp2/B_mul/checkpoint-9400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 v127rc_exp2/B_mul/checkpoint-9300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 v127rc_exp2/B_mul/checkpoint-9200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-9100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-9000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-8900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-8800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-8700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-8600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-8500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-8400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-8300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-8200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-8100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-8000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-7900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-7800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-7700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-7600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-7500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-7400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-7300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-7200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-7100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-6900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-6800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-6700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-6600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-6500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-6400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-6300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-6200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-6100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-5900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-5800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-5700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-5600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-5500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-5400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-5300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-5200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-5100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-4900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-4800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-4700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-4600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-4400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-4300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-4200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-4100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-3900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-3800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-3700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-3600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-3500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-3400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-3300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+v127rc_exp2/B_mul/checkpoint-3200/tokenizer.json filter=lfs diff=lfs merge=lfs -text

LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/config.yaml ADDED Viewed

	@@ -0,0 +1,723 @@

+_name_or_path:
+    value: /workspace/Qwen/Qwen3-8B-Base
+_wandb:
+    value:
+        cli_version: 0.24.1
+        e:
+            mfjy22anxcucsb3vwlaimrwvqrgvipis:
+                args:
+                    - /workspace/v127rc_exp1/C.yaml
+                cpu_count: 16
+                cpu_count_logical: 32
+                cudaVersion: "13.0"
+                disk:
+                    /:
+                        total: "21474836480"
+                        used: "1858306048"
+                email: markmochi200@gmail.com
+                executable: /usr/bin/python
+                git:
+                    commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
+                    remote: https://github.com/hiyouga/LlamaFactory.git
+                gpu: NVIDIA GeForce RTX 4090
+                gpu_count: 1
+                gpu_nvidia:
+                    - architecture: Ada
+                      cudaCores: 16384
+                      memoryTotal: "25757220864"
+                      name: NVIDIA GeForce RTX 4090
+                      uuid: GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de
+                host: 47a53adf0198
+                memory:
+                    total: "201701408768"
+                os: Linux-6.8.0-94-generic-x86_64-with-glibc2.35
+                program: /usr/local/bin/llamafactory-cli
+                python: CPython 3.11.10
+                root: /workspace/LlamaFactory
+                startedAt: "2026-02-04T03:57:46.163443Z"
+                writerId: mfjy22anxcucsb3vwlaimrwvqrgvipis
+        m:
+            - "1": train/global_step
+              "6":
+                - 3
+              "7": []
+            - "2": '*'
+              "5": 1
+              "6":
+                - 1
+              "7": []
+        python_version: 3.11.10
+        t:
+            "1":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "2":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "3":
+                - 7
+                - 19
+                - 62
+                - 66
+            "4": 3.11.10
+            "5": 0.24.1
+            "6": 5.0.0
+            "9":
+                "1": transformers_trainer
+            "12": 0.24.1
+            "13": linux-x86_64
+accelerator_config:
+    value:
+        dispatch_batches: null
+        even_batches: true
+        gradient_accumulation_kwargs: null
+        non_blocking: false
+        split_batches: false
+        use_seedable_sampler: true
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.95
+adam_epsilon:
+    value: 1e-08
+architectures:
+    value:
+        - Qwen3ForCausalLM
+attention_bias:
+    value: false
+attention_dropout:
+    value: 0
+auto_find_batch_size:
+    value: false
+average_tokens_across_devices:
+    value: true
+batch_eval_metrics:
+    value: false
+bf16:
+    value: true
+bf16_full_eval:
+    value: false
+bos_token_id:
+    value: null
+chunk_size_feed_forward:
+    value: 0
+data_args:
+    value:
+        buffer_size: 16384
+        cutoff_len: 2047
+        data_shared_file_system: false
+        dataset:
+            - Markie_Voss_t0_d35_r286
+        dataset_dir: /workspace/LlamaFactory/data
+        default_system: null
+        enable_thinking: false
+        eval_dataset: null
+        eval_num_beams: null
+        eval_on_each_dataset: false
+        ignore_pad_token_for_loss: true
+        interleave_probs: null
+        mask_history: false
+        max_samples: 100000000
+        media_dir: /workspace/LlamaFactory/data
+        mix_strategy: concat
+        neat_packing: false
+        overwrite_cache: false
+        packing: true
+        preprocessing_batch_size: 1000
+        preprocessing_num_workers: 16
+        streaming: false
+        template: qwen3_nothink
+        tokenized_path: null
+        tool_format: null
+        train_on_prompt: false
+        val_size: 0
+data_seed:
+    value: null
+dataloader_drop_last:
+    value: false
+dataloader_num_workers:
+    value: 0
+dataloader_persistent_workers:
+    value: false
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: null
+ddp_backend:
+    value: null
+ddp_broadcast_buffers:
+    value: null
+ddp_bucket_cap_mb:
+    value: null
+ddp_find_unused_parameters:
+    value: null
+ddp_timeout:
+    value: 180000000
+debug:
+    value: []
+deepspeed:
+    value: null
+disable_tqdm:
+    value: false
+do_eval:
+    value: false
+do_predict:
+    value: false
+do_train:
+    value: true
+dtype:
+    value: bfloat16
+enable_jit_checkpoint:
+    value: false
+eos_token_id:
+    value: 151645
+eval_accumulation_steps:
+    value: null
+eval_delay:
+    value: 0
+eval_do_concat_batches:
+    value: true
+eval_on_start:
+    value: false
+eval_steps:
+    value: null
+eval_strategy:
+    value: "no"
+eval_use_gather_object:
+    value: false
+finetuning_args:
+    value:
+        additional_target: null
+        apollo_layerwise: false
+        apollo_proj: random
+        apollo_proj_type: std
+        apollo_rank: 16
+        apollo_scale: 32
+        apollo_scale_front: false
+        apollo_scale_type: channel
+        apollo_target:
+            - all
+        apollo_update_interval: 200
+        badam_mask_mode: adjacent
+        badam_mode: layer
+        badam_start_block: null
+        badam_switch_interval: 50
+        badam_switch_mode: ascending
+        badam_update_ratio: 0.05
+        badam_verbose: 0
+        compute_accuracy: false
+        create_new_adapter: false
+        disable_shuffling: false
+        dpo_label_smoothing: 0
+        eaft_alpha: 1
+        early_stopping_steps: null
+        finetuning_type: lora
+        freeze_extra_modules: null
+        freeze_language_model: false
+        freeze_multi_modal_projector: true
+        freeze_trainable_layers: 2
+        freeze_trainable_modules:
+            - all
+        freeze_vision_tower: true
+        galore_layerwise: false
+        galore_proj_type: std
+        galore_rank: 16
+        galore_scale: 2
+        galore_target:
+            - all
+        galore_update_interval: 200
+        include_effective_tokens_per_second: false
+        kto_chosen_weight: 1
+        kto_rejected_weight: 1
+        ld_alpha: null
+        lora_alpha: 32
+        lora_dropout: 0.03
+        lora_rank: 16
+        lora_target:
+            - all
+        loraplus_lr_embedding: 1e-06
+        loraplus_lr_ratio: null
+        module_dropout: 0
+        oft_block_size: 32
+        oft_rank: 0
+        oft_target:
+            - all
+        pissa_convert: false
+        pissa_init: false
+        pissa_iter: 16
+        plot_loss: true
+        ppo_buffer_size: 1
+        ppo_epochs: 4
+        ppo_score_norm: false
+        ppo_target: 6
+        ppo_whiten_rewards: false
+        pref_bco_weight: 0
+        pref_beta: 0.1
+        pref_ftx: 0
+        pref_loss: sigmoid
+        pure_bf16: false
+        ref_model: null
+        ref_model_adapters: null
+        ref_model_quantization_bit: null
+        reward_model: null
+        reward_model_adapters: null
+        reward_model_quantization_bit: null
+        reward_model_type: lora
+        simpo_gamma: 0.5
+        stage: pt
+        swanlab_api_key: <SWANLAB_API_KEY>
+        swanlab_lark_secret: null
+        swanlab_lark_webhook_url: null
+        swanlab_logdir: null
+        swanlab_mode: cloud
+        swanlab_project: llamafactory
+        swanlab_run_name: null
+        swanlab_workspace: null
+        use_adam_mini: false
+        use_apollo: false
+        use_badam: false
+        use_dft_loss: false
+        use_dora: false
+        use_eaft_loss: false
+        use_galore: false
+        use_llama_pro: false
+        use_mca: false
+        use_muon: false
+        use_rslora: false
+        use_swanlab: false
+fp8:
+    value: false
+fp8_backend:
+    value: auto
+fp8_enable_fsdp_float8_all_gather:
+    value: false
+fp16:
+    value: false
+fp16_full_eval:
+    value: false
+fsdp:
+    value: []
+fsdp_config:
+    value:
+        min_num_params: 0
+        xla: false
+        xla_fsdp_grad_ckpt: false
+        xla_fsdp_v2: false
+full_determinism:
+    value: false
+generating_args:
+    value:
+        do_sample: true
+        length_penalty: 1
+        max_new_tokens: 1024
+        num_beams: 1
+        repetition_penalty: 1
+        skip_special_tokens: true
+        temperature: 0.95
+        top_k: 50
+        top_p: 0.7
+generation_config:
+    value: null
+generation_max_length:
+    value: 2047
+generation_num_beams:
+    value: null
+gradient_accumulation_steps:
+    value: 1
+gradient_checkpointing:
+    value: false
+gradient_checkpointing_kwargs:
+    value: null
+greater_is_better:
+    value: null
+group_by_length:
+    value: false
+head_dim:
+    value: 128
+hidden_act:
+    value: silu
+hidden_size:
+    value: 4096
+hub_always_push:
+    value: false
+hub_model_id:
+    value: null
+hub_private_repo:
+    value: null
+hub_revision:
+    value: null
+hub_strategy:
+    value: every_save
+hub_token:
+    value: <HUB_TOKEN>
+id2label:
+    value:
+        "0": LABEL_0
+        "1": LABEL_1
+ignore_data_skip:
+    value: false
+include_for_metrics:
+    value: []
+include_num_input_tokens_seen:
+    value: all
+initializer_range:
+    value: 0.02
+intermediate_size:
+    value: 12288
+is_encoder_decoder:
+    value: false
+label_names:
+    value:
+        - labels
+label_smoothing_factor:
+    value: 0
+label2id:
+    value:
+        LABEL_0: 0
+        LABEL_1: 1
+layer_types:
+    value:
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+learning_rate:
+    value: 5e-05
+length_column_name:
+    value: length
+liger_kernel_config:
+    value: null
+load_best_model_at_end:
+    value: false
+local_rank:
+    value: -1
+log_level:
+    value: passive
+log_level_replica:
+    value: warning
+log_on_each_node:
+    value: true
+logging_dir:
+    value: null
+logging_first_step:
+    value: false
+logging_nan_inf_filter:
+    value: true
+logging_steps:
+    value: 1
+logging_strategy:
+    value: steps
+lr_scheduler_kwargs:
+    value: null
+lr_scheduler_type:
+    value: cosine
+master_addr:
+    value: null
+master_port:
+    value: null
+max_grad_norm:
+    value: 1
+max_position_embeddings:
+    value: 32768
+max_steps:
+    value: -1
+max_window_layers:
+    value: 36
+metric_for_best_model:
+    value: null
+model/num_parameters:
+    value: 8234382336
+model_args:
+    value:
+        adapter_folder: null
+        adapter_name_or_path: null
+        add_special_tokens: null
+        add_tokens: null
+        audio_sampling_rate: 16000
+        block_diag_attn: false
+        cache_dir: null
+        chunk_size: 8192
+        compute_dtype: torch.bfloat16
+        cpu_infer: 32
+        crop_to_patches: false
+        device_map:
+            "": cuda:0
+        disable_gradient_checkpointing: false
+        double_quantization: true
+        enable_liger_kernel: false
+        export_device: cpu
+        export_dir: null
+        export_hub_model_id: null
+        export_legacy_format: false
+        export_quantization_bit: null
+        export_quantization_dataset: null
+        export_quantization_maxlen: 1024
+        export_quantization_nsamples: 128
+        export_size: 5
+        flash_attn: auto
+        hf_hub_token: <HF_HUB_TOKEN>
+        image_do_pan_and_scan: false
+        image_max_pixels: 589824
+        image_min_pixels: 1024
+        infer_backend: HF
+        infer_dtype: auto
+        init_special_tokens: noise_init
+        kt_force_think: false
+        kt_maxlen: 4096
+        kt_mode: normal
+        kt_optimize_rule: null
+        kt_use_cuda_graph: true
+        low_cpu_mem_usage: true
+        mixture_of_depths: null
+        mode: normal
+        model_max_length: 2047
+        model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+        model_revision: main
+        moe_aux_loss_coef: null
+        ms_hub_token: <MS_HUB_TOKEN>
+        new_special_tokens_config: null
+        offload_folder: offload
+        om_hub_token: <OM_HUB_TOKEN>
+        print_param_status: false
+        quantization_bit: null
+        quantization_device_map: null
+        quantization_method: BNB
+        quantization_type: nf4
+        resize_vocab: false
+        rope_scaling: null
+        sglang_config: null
+        sglang_lora_backend: triton
+        sglang_maxlen: 4096
+        sglang_mem_fraction: 0.7
+        sglang_tp_size: -1
+        shift_attn: false
+        split_special_tokens: false
+        train_from_scratch: false
+        trust_remote_code: true
+        upcast_layernorm: false
+        upcast_lmhead_output: false
+        use_audio_in_video: false
+        use_fast_tokenizer: true
+        use_kt: false
+        use_kv_cache: true
+        use_reentrant_gc: true
+        use_unsloth: false
+        use_unsloth_gc: false
+        use_v1_kernels: false
+        video_fps: 2
+        video_max_pixels: 65536
+        video_maxlen: 128
+        video_min_pixels: 256
+        vllm_config: null
+        vllm_enforce_eager: false
+        vllm_gpu_util: 0.7
+        vllm_max_lora_rank: 32
+        vllm_maxlen: 4096
+model_type:
+    value: qwen3
+neftune_noise_alpha:
+    value: null
+num_attention_heads:
+    value: 32
+num_hidden_layers:
+    value: 36
+num_key_value_heads:
+    value: 8
+num_train_epochs:
+    value: 5
+optim:
+    value: adamw_torch
+optim_args:
+    value: null
+optim_target_modules:
+    value: null
+output_attentions:
+    value: false
+output_dir:
+    value: /workspace/v127rc_exp1/C
+output_hidden_states:
+    value: false
+overwrite_output_dir:
+    value: false
+pad_token_id:
+    value: 151643
+parallelism_config:
+    value: null
+peft_config:
+    value:
+        default:
+            alora_invocation_tokens: null
+            arrow_config: null
+            auto_mapping: null
+            base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+            bias: none
+            corda_config: null
+            ensure_weight_tying: false
+            eva_config: null
+            exclude_modules: null
+            fan_in_fan_out: false
+            inference_mode: false
+            init_lora_weights: true
+            layer_replication: null
+            layers_pattern: null
+            layers_to_transform: null
+            lora_alpha: 32
+            lora_bias: false
+            lora_dropout: 0.03
+            megatron_config: null
+            megatron_core: megatron.core
+            modules_to_save: null
+            peft_type: LORA
+            peft_version: 0.18.1
+            qalora_group_size: 16
+            r: 16
+            revision: null
+            runtime_config:
+                ephemeral_gpu_offload: false
+            target_modules:
+                - up_proj
+                - q_proj
+                - gate_proj
+                - k_proj
+                - v_proj
+                - o_proj
+                - down_proj
+            target_parameters: null
+            task_type: CAUSAL_LM
+            trainable_token_indices: null
+            use_dora: false
+            use_qalora: false
+            use_rslora: false
+per_device_eval_batch_size:
+    value: 8
+per_device_train_batch_size:
+    value: 1
+predict_with_generate:
+    value: false
+prediction_loss_only:
+    value: false
+problem_type:
+    value: null
+project:
+    value: huggingface
+push_to_hub:
+    value: false
+ray_init_kwargs:
+    value: null
+ray_num_workers:
+    value: 1
+remove_unused_columns:
+    value: false
+report_to:
+    value:
+        - wandb
+restore_callback_states_from_checkpoint:
+    value: false
+resume_from_checkpoint:
+    value: null
+return_dict:
+    value: true
+rms_norm_eps:
+    value: 1e-06
+rope_parameters:
+    value:
+        rope_theta: 1000000
+        rope_type: default
+run_name:
+    value: null
+save_on_each_node:
+    value: false
+save_only_model:
+    value: true
+save_steps:
+    value: 1000
+save_strategy:
+    value: steps
+save_total_limit:
+    value: null
+seed:
+    value: 42
+skip_memory_metrics:
+    value: true
+sliding_window:
+    value: null
+sortish_sampler:
+    value: false
+tf32:
+    value: null
+tie_word_embeddings:
+    value: false
+torch_compile:
+    value: false
+torch_compile_backend:
+    value: null
+torch_compile_mode:
+    value: null
+torch_empty_cache_steps:
+    value: null
+trackio_space_id:
+    value: trackio
+transformers_version:
+    value: 5.0.0
+use_cache:
+    value: false
+use_cpu:
+    value: false
+use_liger_kernel:
+    value: false
+use_sliding_window:
+    value: false
+vocab_size:
+    value: 151936
+warmup_ratio:
+    value: 0.02
+warmup_steps:
+    value: 0.02
+weight_decay:
+    value: 0

LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/output.log ADDED Viewed

	@@ -0,0 +1,423 @@

+  0%|                                                                                                                                                                                       | 0/18595 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+{'loss': '1.682', 'grad_norm': '0.2716', 'learning_rate': '0', 'epoch': '0.0002689', 'num_input_tokens_seen': 2047, 'train_runtime': '2.959', 'train_tokens_per_second': '691.9'}
+{'loss': '1.8', 'grad_norm': '0.2907', 'learning_rate': '1.344e-07', 'epoch': '0.0005378', 'num_input_tokens_seen': 4094, 'train_runtime': '3.966', 'train_tokens_per_second': '1032'}
+{'loss': '1.755', 'grad_norm': '0.2774', 'learning_rate': '2.688e-07', 'epoch': '0.0008067', 'num_input_tokens_seen': 6141, 'train_runtime': '4.979', 'train_tokens_per_second': '1233'}
+{'loss': '1.725', 'grad_norm': '0.278', 'learning_rate': '4.032e-07', 'epoch': '0.001076', 'num_input_tokens_seen': 8188, 'train_runtime': '5.988', 'train_tokens_per_second': '1367'}
+{'loss': '1.856', 'grad_norm': '0.2819', 'learning_rate': '5.376e-07', 'epoch': '0.001344', 'num_input_tokens_seen': 10235, 'train_runtime': '6.995', 'train_tokens_per_second': '1463'}
+{'loss': '1.864', 'grad_norm': '0.2434', 'learning_rate': '6.72e-07', 'epoch': '0.001613', 'num_input_tokens_seen': 12282, 'train_runtime': '8.002', 'train_tokens_per_second': '1535'}
+{'loss': '1.791', 'grad_norm': '0.2673', 'learning_rate': '8.065e-07', 'epoch': '0.001882', 'num_input_tokens_seen': 14329, 'train_runtime': '9.01', 'train_tokens_per_second': '1590'}
+{'loss': '1.831', 'grad_norm': '0.2574', 'learning_rate': '9.409e-07', 'epoch': '0.002151', 'num_input_tokens_seen': 16376, 'train_runtime': '10.02', 'train_tokens_per_second': '1634'}
+{'loss': '1.92', 'grad_norm': '0.2803', 'learning_rate': '1.075e-06', 'epoch': '0.00242', 'num_input_tokens_seen': 18423, 'train_runtime': '11.06', 'train_tokens_per_second': '1665'}
+{'loss': '1.949', 'grad_norm': '0.281', 'learning_rate': '1.21e-06', 'epoch': '0.002689', 'num_input_tokens_seen': 20470, 'train_runtime': '12.07', 'train_tokens_per_second': '1696'}
+{'loss': '1.955', 'grad_norm': '0.298', 'learning_rate': '1.344e-06', 'epoch': '0.002958', 'num_input_tokens_seen': 22517, 'train_runtime': '13.08', 'train_tokens_per_second': '1722'}
+{'loss': '1.811', 'grad_norm': '0.2719', 'learning_rate': '1.478e-06', 'epoch': '0.003227', 'num_input_tokens_seen': 24564, 'train_runtime': '14.09', 'train_tokens_per_second': '1743'}
+{'loss': '1.629', 'grad_norm': '0.266', 'learning_rate': '1.613e-06', 'epoch': '0.003496', 'num_input_tokens_seen': 26611, 'train_runtime': '15.1', 'train_tokens_per_second': '1763'}
+{'loss': '1.768', 'grad_norm': '0.268', 'learning_rate': '1.747e-06', 'epoch': '0.003764', 'num_input_tokens_seen': 28658, 'train_runtime': '16.1', 'train_tokens_per_second': '1779'}
+{'loss': '1.612', 'grad_norm': '0.252', 'learning_rate': '1.882e-06', 'epoch': '0.004033', 'num_input_tokens_seen': 30705, 'train_runtime': '17.11', 'train_tokens_per_second': '1794'}
+{'loss': '1.622', 'grad_norm': '0.2607', 'learning_rate': '2.016e-06', 'epoch': '0.004302', 'num_input_tokens_seen': 32752, 'train_runtime': '18.12', 'train_tokens_per_second': '1807'}
+{'loss': '1.857', 'grad_norm': '0.2805', 'learning_rate': '2.151e-06', 'epoch': '0.004571', 'num_input_tokens_seen': 34799, 'train_runtime': '19.13', 'train_tokens_per_second': '1819'}
+{'loss': '1.851', 'grad_norm': '0.2441', 'learning_rate': '2.285e-06', 'epoch': '0.00484', 'num_input_tokens_seen': 36846, 'train_runtime': '20.14', 'train_tokens_per_second': '1830'}
+{'loss': '1.826', 'grad_norm': '0.2659', 'learning_rate': '2.419e-06', 'epoch': '0.005109', 'num_input_tokens_seen': 38893, 'train_runtime': '21.15', 'train_tokens_per_second': '1839'}
+{'loss': '1.536', 'grad_norm': '0.2742', 'learning_rate': '2.554e-06', 'epoch': '0.005378', 'num_input_tokens_seen': 40940, 'train_runtime': '22.16', 'train_tokens_per_second': '1847'}
+{'loss': '1.67', 'grad_norm': '0.2687', 'learning_rate': '2.688e-06', 'epoch': '0.005647', 'num_input_tokens_seen': 42987, 'train_runtime': '23.17', 'train_tokens_per_second': '1855'}
+{'loss': '1.548', 'grad_norm': '0.2588', 'learning_rate': '2.823e-06', 'epoch': '0.005916', 'num_input_tokens_seen': 45034, 'train_runtime': '24.18', 'train_tokens_per_second': '1862'}
+{'loss': '1.866', 'grad_norm': '0.2874', 'learning_rate': '2.957e-06', 'epoch': '0.006184', 'num_input_tokens_seen': 47081, 'train_runtime': '25.19', 'train_tokens_per_second': '1869'}
+{'loss': '1.764', 'grad_norm': '0.2764', 'learning_rate': '3.091e-06', 'epoch': '0.006453', 'num_input_tokens_seen': 49128, 'train_runtime': '26.2', 'train_tokens_per_second': '1875'}
+{'loss': '1.937', 'grad_norm': '0.2965', 'learning_rate': '3.226e-06', 'epoch': '0.006722', 'num_input_tokens_seen': 51175, 'train_runtime': '27.21', 'train_tokens_per_second': '1881'}
+{'loss': '1.627', 'grad_norm': '0.2888', 'learning_rate': '3.36e-06', 'epoch': '0.006991', 'num_input_tokens_seen': 53222, 'train_runtime': '28.23', 'train_tokens_per_second': '1886'}
+{'loss': '1.792', 'grad_norm': '0.3194', 'learning_rate': '3.495e-06', 'epoch': '0.00726', 'num_input_tokens_seen': 55269, 'train_runtime': '29.24', 'train_tokens_per_second': '1890'}
+{'loss': '1.725', 'grad_norm': '0.2937', 'learning_rate': '3.629e-06', 'epoch': '0.007529', 'num_input_tokens_seen': 57316, 'train_runtime': '30.25', 'train_tokens_per_second': '1895'}
+{'loss': '1.871', 'grad_norm': '0.2757', 'learning_rate': '3.763e-06', 'epoch': '0.007798', 'num_input_tokens_seen': 59363, 'train_runtime': '31.26', 'train_tokens_per_second': '1899'}
+{'loss': '1.838', 'grad_norm': '0.2773', 'learning_rate': '3.898e-06', 'epoch': '0.008067', 'num_input_tokens_seen': 61410, 'train_runtime': '32.27', 'train_tokens_per_second': '1903'}
+{'loss': '1.909', 'grad_norm': '0.3041', 'learning_rate': '4.032e-06', 'epoch': '0.008336', 'num_input_tokens_seen': 63457, 'train_runtime': '33.28', 'train_tokens_per_second': '1907'}
+{'loss': '1.725', 'grad_norm': '0.2885', 'learning_rate': '4.167e-06', 'epoch': '0.008604', 'num_input_tokens_seen': 65504, 'train_runtime': '34.29', 'train_tokens_per_second': '1910'}
+{'loss': '1.747', 'grad_norm': '0.3163', 'learning_rate': '4.301e-06', 'epoch': '0.008873', 'num_input_tokens_seen': 67551, 'train_runtime': '35.3', 'train_tokens_per_second': '1914'}
+{'loss': '1.909', 'grad_norm': '0.2977', 'learning_rate': '4.435e-06', 'epoch': '0.009142', 'num_input_tokens_seen': 69598, 'train_runtime': '36.31', 'train_tokens_per_second': '1917'}
+{'loss': '1.641', 'grad_norm': '0.275', 'learning_rate': '4.57e-06', 'epoch': '0.009411', 'num_input_tokens_seen': 71645, 'train_runtime': '37.32', 'train_tokens_per_second': '1920'}
+{'loss': '1.782', 'grad_norm': '0.3019', 'learning_rate': '4.704e-06', 'epoch': '0.00968', 'num_input_tokens_seen': 73692, 'train_runtime': '38.33', 'train_tokens_per_second': '1922'}
+{'loss': '1.83', 'grad_norm': '0.3124', 'learning_rate': '4.839e-06', 'epoch': '0.009949', 'num_input_tokens_seen': 75739, 'train_runtime': '39.34', 'train_tokens_per_second': '1925'}
+{'loss': '1.856', 'grad_norm': '0.2672', 'learning_rate': '4.973e-06', 'epoch': '0.01022', 'num_input_tokens_seen': 77786, 'train_runtime': '40.36', 'train_tokens_per_second': '1927'}
+{'loss': '1.965', 'grad_norm': '0.297', 'learning_rate': '5.108e-06', 'epoch': '0.01049', 'num_input_tokens_seen': 79833, 'train_runtime': '41.37', 'train_tokens_per_second': '1930'}
+{'loss': '1.935', 'grad_norm': '0.337', 'learning_rate': '5.242e-06', 'epoch': '0.01076', 'num_input_tokens_seen': 81880, 'train_runtime': '42.39', 'train_tokens_per_second': '1932'}
+{'loss': '1.725', 'grad_norm': '0.3097', 'learning_rate': '5.376e-06', 'epoch': '0.01102', 'num_input_tokens_seen': 83927, 'train_runtime': '43.4', 'train_tokens_per_second': '1934'}
+{'loss': '1.534', 'grad_norm': '0.2637', 'learning_rate': '5.511e-06', 'epoch': '0.01129', 'num_input_tokens_seen': 85974, 'train_runtime': '44.42', 'train_tokens_per_second': '1935'}
+{'loss': '1.764', 'grad_norm': '0.2742', 'learning_rate': '5.645e-06', 'epoch': '0.01156', 'num_input_tokens_seen': 88021, 'train_runtime': '45.43', 'train_tokens_per_second': '1937'}
+{'loss': '1.696', 'grad_norm': '0.2804', 'learning_rate': '5.78e-06', 'epoch': '0.01183', 'num_input_tokens_seen': 90068, 'train_runtime': '46.45', 'train_tokens_per_second': '1939'}
+{'loss': '1.725', 'grad_norm': '0.279', 'learning_rate': '5.914e-06', 'epoch': '0.0121', 'num_input_tokens_seen': 92115, 'train_runtime': '47.46', 'train_tokens_per_second': '1941'}
+{'loss': '1.981', 'grad_norm': '0.3061', 'learning_rate': '6.048e-06', 'epoch': '0.01237', 'num_input_tokens_seen': 94162, 'train_runtime': '48.47', 'train_tokens_per_second': '1943'}
+{'loss': '1.589', 'grad_norm': '0.2909', 'learning_rate': '6.183e-06', 'epoch': '0.01264', 'num_input_tokens_seen': 96209, 'train_runtime': '49.48', 'train_tokens_per_second': '1944'}
+{'loss': '1.776', 'grad_norm': '0.338', 'learning_rate': '6.317e-06', 'epoch': '0.01291', 'num_input_tokens_seen': 98256, 'train_runtime': '50.49', 'train_tokens_per_second': '1946'}
+{'loss': '1.855', 'grad_norm': '0.2965', 'learning_rate': '6.452e-06', 'epoch': '0.01318', 'num_input_tokens_seen': 100303, 'train_runtime': '51.51', 'train_tokens_per_second': '1947'}
+{'loss': '1.635', 'grad_norm': '0.3187', 'learning_rate': '6.586e-06', 'epoch': '0.01344', 'num_input_tokens_seen': 102350, 'train_runtime': '52.52', 'train_tokens_per_second': '1949'}
+{'loss': '1.884', 'grad_norm': '0.3086', 'learning_rate': '6.72e-06', 'epoch': '0.01371', 'num_input_tokens_seen': 104397, 'train_runtime': '53.53', 'train_tokens_per_second': '1950'}
+{'loss': '1.779', 'grad_norm': '0.3112', 'learning_rate': '6.855e-06', 'epoch': '0.01398', 'num_input_tokens_seen': 106444, 'train_runtime': '54.55', 'train_tokens_per_second': '1951'}
+{'loss': '1.85', 'grad_norm': '0.3581', 'learning_rate': '6.989e-06', 'epoch': '0.01425', 'num_input_tokens_seen': 108491, 'train_runtime': '55.56', 'train_tokens_per_second': '1953'}
+{'loss': '1.611', 'grad_norm': '0.7226', 'learning_rate': '7.124e-06', 'epoch': '0.01452', 'num_input_tokens_seen': 110538, 'train_runtime': '56.57', 'train_tokens_per_second': '1954'}
+{'loss': '1.643', 'grad_norm': '0.2939', 'learning_rate': '7.258e-06', 'epoch': '0.01479', 'num_input_tokens_seen': 112585, 'train_runtime': '57.58', 'train_tokens_per_second': '1955'}
+{'loss': '1.978', 'grad_norm': '0.3302', 'learning_rate': '7.392e-06', 'epoch': '0.01506', 'num_input_tokens_seen': 114632, 'train_runtime': '58.6', 'train_tokens_per_second': '1956'}
+{'loss': '1.473', 'grad_norm': '0.3044', 'learning_rate': '7.527e-06', 'epoch': '0.01533', 'num_input_tokens_seen': 116679, 'train_runtime': '59.62', 'train_tokens_per_second': '1957'}
+{'loss': '1.559', 'grad_norm': '0.3122', 'learning_rate': '7.661e-06', 'epoch': '0.0156', 'num_input_tokens_seen': 118726, 'train_runtime': '60.64', 'train_tokens_per_second': '1958'}
+{'loss': '1.793', 'grad_norm': '0.344', 'learning_rate': '7.796e-06', 'epoch': '0.01586', 'num_input_tokens_seen': 120773, 'train_runtime': '61.65', 'train_tokens_per_second': '1959'}
+{'loss': '1.589', 'grad_norm': '0.3391', 'learning_rate': '7.93e-06', 'epoch': '0.01613', 'num_input_tokens_seen': 122820, 'train_runtime': '62.66', 'train_tokens_per_second': '1960'}
+{'loss': '1.713', 'grad_norm': '0.3023', 'learning_rate': '8.065e-06', 'epoch': '0.0164', 'num_input_tokens_seen': 124867, 'train_runtime': '63.68', 'train_tokens_per_second': '1961'}
+{'loss': '1.704', 'grad_norm': '0.3436', 'learning_rate': '8.199e-06', 'epoch': '0.01667', 'num_input_tokens_seen': 126914, 'train_runtime': '64.7', 'train_tokens_per_second': '1962'}
+{'loss': '1.908', 'grad_norm': '0.3627', 'learning_rate': '8.333e-06', 'epoch': '0.01694', 'num_input_tokens_seen': 128961, 'train_runtime': '65.71', 'train_tokens_per_second': '1963'}
+{'loss': '1.799', 'grad_norm': '0.3663', 'learning_rate': '8.468e-06', 'epoch': '0.01721', 'num_input_tokens_seen': 131008, 'train_runtime': '66.72', 'train_tokens_per_second': '1963'}
+{'loss': '1.855', 'grad_norm': '0.3834', 'learning_rate': '8.602e-06', 'epoch': '0.01748', 'num_input_tokens_seen': 133055, 'train_runtime': '67.74', 'train_tokens_per_second': '1964'}
+{'loss': '1.805', 'grad_norm': '0.3678', 'learning_rate': '8.737e-06', 'epoch': '0.01775', 'num_input_tokens_seen': 135102, 'train_runtime': '68.76', 'train_tokens_per_second': '1965'}
+{'loss': '1.436', 'grad_norm': '0.3304', 'learning_rate': '8.871e-06', 'epoch': '0.01802', 'num_input_tokens_seen': 137149, 'train_runtime': '69.77', 'train_tokens_per_second': '1966'}
+{'loss': '1.746', 'grad_norm': '0.307', 'learning_rate': '9.005e-06', 'epoch': '0.01828', 'num_input_tokens_seen': 139196, 'train_runtime': '70.78', 'train_tokens_per_second': '1966'}
+{'loss': '1.823', 'grad_norm': '0.3547', 'learning_rate': '9.14e-06', 'epoch': '0.01855', 'num_input_tokens_seen': 141243, 'train_runtime': '71.8', 'train_tokens_per_second': '1967'}
+{'loss': '1.66', 'grad_norm': '0.3379', 'learning_rate': '9.274e-06', 'epoch': '0.01882', 'num_input_tokens_seen': 143290, 'train_runtime': '72.82', 'train_tokens_per_second': '1968'}
+{'loss': '1.913', 'grad_norm': '0.3416', 'learning_rate': '9.409e-06', 'epoch': '0.01909', 'num_input_tokens_seen': 145337, 'train_runtime': '73.84', 'train_tokens_per_second': '1968'}
+{'loss': '1.814', 'grad_norm': '0.3721', 'learning_rate': '9.543e-06', 'epoch': '0.01936', 'num_input_tokens_seen': 147384, 'train_runtime': '74.85', 'train_tokens_per_second': '1969'}
+{'loss': '1.797', 'grad_norm': '0.373', 'learning_rate': '9.677e-06', 'epoch': '0.01963', 'num_input_tokens_seen': 149431, 'train_runtime': '75.87', 'train_tokens_per_second': '1970'}
+{'loss': '1.704', 'grad_norm': '0.3735', 'learning_rate': '9.812e-06', 'epoch': '0.0199', 'num_input_tokens_seen': 151478, 'train_runtime': '76.88', 'train_tokens_per_second': '1970'}
+{'loss': '1.578', 'grad_norm': '0.3312', 'learning_rate': '9.946e-06', 'epoch': '0.02017', 'num_input_tokens_seen': 153525, 'train_runtime': '77.9', 'train_tokens_per_second': '1971'}
+{'loss': '1.712', 'grad_norm': '0.3716', 'learning_rate': '1.008e-05', 'epoch': '0.02044', 'num_input_tokens_seen': 155572, 'train_runtime': '78.91', 'train_tokens_per_second': '1971'}
+{'loss': '1.758', 'grad_norm': '0.3477', 'learning_rate': '1.022e-05', 'epoch': '0.0207', 'num_input_tokens_seen': 157619, 'train_runtime': '79.93', 'train_tokens_per_second': '1972'}
+{'loss': '1.85', 'grad_norm': '0.374', 'learning_rate': '1.035e-05', 'epoch': '0.02097', 'num_input_tokens_seen': 159666, 'train_runtime': '80.94', 'train_tokens_per_second': '1973'}
+{'loss': '1.77', 'grad_norm': '0.3782', 'learning_rate': '1.048e-05', 'epoch': '0.02124', 'num_input_tokens_seen': 161713, 'train_runtime': '81.96', 'train_tokens_per_second': '1973'}
+{'loss': '1.592', 'grad_norm': '0.3265', 'learning_rate': '1.062e-05', 'epoch': '0.02151', 'num_input_tokens_seen': 163760, 'train_runtime': '82.98', 'train_tokens_per_second': '1974'}
+{'loss': '1.684', 'grad_norm': '0.3949', 'learning_rate': '1.075e-05', 'epoch': '0.02178', 'num_input_tokens_seen': 165807, 'train_runtime': '83.99', 'train_tokens_per_second': '1974'}
+{'loss': '1.416', 'grad_norm': '0.339', 'learning_rate': '1.089e-05', 'epoch': '0.02205', 'num_input_tokens_seen': 167854, 'train_runtime': '85.01', 'train_tokens_per_second': '1975'}
+{'loss': '1.275', 'grad_norm': '0.3412', 'learning_rate': '1.102e-05', 'epoch': '0.02232', 'num_input_tokens_seen': 169901, 'train_runtime': '86.02', 'train_tokens_per_second': '1975'}
+{'loss': '1.798', 'grad_norm': '0.4259', 'learning_rate': '1.116e-05', 'epoch': '0.02259', 'num_input_tokens_seen': 171948, 'train_runtime': '87.04', 'train_tokens_per_second': '1976'}
+{'loss': '1.631', 'grad_norm': '0.3738', 'learning_rate': '1.129e-05', 'epoch': '0.02286', 'num_input_tokens_seen': 173995, 'train_runtime': '88.05', 'train_tokens_per_second': '1976'}
+{'loss': '1.695', 'grad_norm': '0.3967', 'learning_rate': '1.142e-05', 'epoch': '0.02312', 'num_input_tokens_seen': 176042, 'train_runtime': '89.07', 'train_tokens_per_second': '1976'}
+{'loss': '1.809', 'grad_norm': '0.3775', 'learning_rate': '1.156e-05', 'epoch': '0.02339', 'num_input_tokens_seen': 178089, 'train_runtime': '90.09', 'train_tokens_per_second': '1977'}
+{'loss': '1.628', 'grad_norm': '0.3732', 'learning_rate': '1.169e-05', 'epoch': '0.02366', 'num_input_tokens_seen': 180136, 'train_runtime': '91.1', 'train_tokens_per_second': '1977'}
+{'loss': '1.771', 'grad_norm': '0.397', 'learning_rate': '1.183e-05', 'epoch': '0.02393', 'num_input_tokens_seen': 182183, 'train_runtime': '92.12', 'train_tokens_per_second': '1978'}
+{'loss': '1.708', 'grad_norm': '0.4329', 'learning_rate': '1.196e-05', 'epoch': '0.0242', 'num_input_tokens_seen': 184230, 'train_runtime': '93.14', 'train_tokens_per_second': '1978'}
+{'loss': '1.629', 'grad_norm': '0.391', 'learning_rate': '1.21e-05', 'epoch': '0.02447', 'num_input_tokens_seen': 186277, 'train_runtime': '94.15', 'train_tokens_per_second': '1978'}
+{'loss': '1.69', 'grad_norm': '0.416', 'learning_rate': '1.223e-05', 'epoch': '0.02474', 'num_input_tokens_seen': 188324, 'train_runtime': '95.17', 'train_tokens_per_second': '1979'}
+{'loss': '1.882', 'grad_norm': '0.4379', 'learning_rate': '1.237e-05', 'epoch': '0.02501', 'num_input_tokens_seen': 190371, 'train_runtime': '96.19', 'train_tokens_per_second': '1979'}
+{'loss': '1.764', 'grad_norm': '0.417', 'learning_rate': '1.25e-05', 'epoch': '0.02528', 'num_input_tokens_seen': 192418, 'train_runtime': '97.2', 'train_tokens_per_second': '1980'}
+{'loss': '1.675', 'grad_norm': '0.4218', 'learning_rate': '1.263e-05', 'epoch': '0.02554', 'num_input_tokens_seen': 194465, 'train_runtime': '98.22', 'train_tokens_per_second': '1980'}
+{'loss': '1.749', 'grad_norm': '0.4339', 'learning_rate': '1.277e-05', 'epoch': '0.02581', 'num_input_tokens_seen': 196512, 'train_runtime': '99.24', 'train_tokens_per_second': '1980'}
+{'loss': '1.792', 'grad_norm': '0.4553', 'learning_rate': '1.29e-05', 'epoch': '0.02608', 'num_input_tokens_seen': 198559, 'train_runtime': '100.3', 'train_tokens_per_second': '1981'}
+{'loss': '1.597', 'grad_norm': '0.4142', 'learning_rate': '1.304e-05', 'epoch': '0.02635', 'num_input_tokens_seen': 200606, 'train_runtime': '101.3', 'train_tokens_per_second': '1981'}
+{'loss': '1.534', 'grad_norm': '0.4112', 'learning_rate': '1.317e-05', 'epoch': '0.02662', 'num_input_tokens_seen': 202653, 'train_runtime': '102.3', 'train_tokens_per_second': '1981'}
+{'loss': '1.607', 'grad_norm': '0.4382', 'learning_rate': '1.331e-05', 'epoch': '0.02689', 'num_input_tokens_seen': 204700, 'train_runtime': '103.3', 'train_tokens_per_second': '1982'}
+{'loss': '1.306', 'grad_norm': '0.3857', 'learning_rate': '1.344e-05', 'epoch': '0.02716', 'num_input_tokens_seen': 206747, 'train_runtime': '104.3', 'train_tokens_per_second': '1982'}
+{'loss': '1.775', 'grad_norm': '0.4403', 'learning_rate': '1.358e-05', 'epoch': '0.02743', 'num_input_tokens_seen': 208794, 'train_runtime': '105.3', 'train_tokens_per_second': '1982'}
+{'loss': '1.163', 'grad_norm': '0.4105', 'learning_rate': '1.371e-05', 'epoch': '0.0277', 'num_input_tokens_seen': 210841, 'train_runtime': '106.4', 'train_tokens_per_second': '1982'}
+{'loss': '1.773', 'grad_norm': '0.467', 'learning_rate': '1.384e-05', 'epoch': '0.02796', 'num_input_tokens_seen': 212888, 'train_runtime': '107.4', 'train_tokens_per_second': '1983'}
+{'loss': '1.548', 'grad_norm': '0.4103', 'learning_rate': '1.398e-05', 'epoch': '0.02823', 'num_input_tokens_seen': 214935, 'train_runtime': '108.4', 'train_tokens_per_second': '1983'}
+{'loss': '1.663', 'grad_norm': '0.4564', 'learning_rate': '1.411e-05', 'epoch': '0.0285', 'num_input_tokens_seen': 216982, 'train_runtime': '109.4', 'train_tokens_per_second': '1983'}
+{'loss': '1.709', 'grad_norm': '0.5568', 'learning_rate': '1.425e-05', 'epoch': '0.02877', 'num_input_tokens_seen': 219029, 'train_runtime': '110.4', 'train_tokens_per_second': '1984'}
+{'loss': '1.683', 'grad_norm': '0.4596', 'learning_rate': '1.438e-05', 'epoch': '0.02904', 'num_input_tokens_seen': 221076, 'train_runtime': '111.4', 'train_tokens_per_second': '1984'}
+{'loss': '1.786', 'grad_norm': '0.488', 'learning_rate': '1.452e-05', 'epoch': '0.02931', 'num_input_tokens_seen': 223123, 'train_runtime': '112.5', 'train_tokens_per_second': '1984'}
+{'loss': '1.593', 'grad_norm': '0.4877', 'learning_rate': '1.465e-05', 'epoch': '0.02958', 'num_input_tokens_seen': 225170, 'train_runtime': '113.5', 'train_tokens_per_second': '1984'}
+{'loss': '1.144', 'grad_norm': '0.4087', 'learning_rate': '1.478e-05', 'epoch': '0.02985', 'num_input_tokens_seen': 227217, 'train_runtime': '114.5', 'train_tokens_per_second': '1985'}
+{'loss': '1.632', 'grad_norm': '0.4522', 'learning_rate': '1.492e-05', 'epoch': '0.03012', 'num_input_tokens_seen': 229264, 'train_runtime': '115.5', 'train_tokens_per_second': '1985'}
+{'loss': '1.575', 'grad_norm': '0.4504', 'learning_rate': '1.505e-05', 'epoch': '0.03038', 'num_input_tokens_seen': 231311, 'train_runtime': '116.5', 'train_tokens_per_second': '1985'}
+{'loss': '1.705', 'grad_norm': '0.4647', 'learning_rate': '1.519e-05', 'epoch': '0.03065', 'num_input_tokens_seen': 233358, 'train_runtime': '117.5', 'train_tokens_per_second': '1985'}
+{'loss': '1.651', 'grad_norm': '0.4929', 'learning_rate': '1.532e-05', 'epoch': '0.03092', 'num_input_tokens_seen': 235405, 'train_runtime': '118.6', 'train_tokens_per_second': '1986'}
+{'loss': '1.614', 'grad_norm': '0.4435', 'learning_rate': '1.546e-05', 'epoch': '0.03119', 'num_input_tokens_seen': 237452, 'train_runtime': '119.6', 'train_tokens_per_second': '1986'}
+{'loss': '1.159', 'grad_norm': '0.4458', 'learning_rate': '1.559e-05', 'epoch': '0.03146', 'num_input_tokens_seen': 239499, 'train_runtime': '120.6', 'train_tokens_per_second': '1986'}
+{'loss': '1.606', 'grad_norm': '0.5428', 'learning_rate': '1.573e-05', 'epoch': '0.03173', 'num_input_tokens_seen': 241546, 'train_runtime': '121.6', 'train_tokens_per_second': '1986'}
+{'loss': '1.744', 'grad_norm': '0.5349', 'learning_rate': '1.586e-05', 'epoch': '0.032', 'num_input_tokens_seen': 243593, 'train_runtime': '122.6', 'train_tokens_per_second': '1986'}
+{'loss': '1.527', 'grad_norm': '0.5387', 'learning_rate': '1.599e-05', 'epoch': '0.03227', 'num_input_tokens_seen': 245640, 'train_runtime': '123.6', 'train_tokens_per_second': '1987'}
+{'loss': '1.52', 'grad_norm': '0.5221', 'learning_rate': '1.613e-05', 'epoch': '0.03254', 'num_input_tokens_seen': 247687, 'train_runtime': '124.7', 'train_tokens_per_second': '1987'}
+{'loss': '1.561', 'grad_norm': '0.537', 'learning_rate': '1.626e-05', 'epoch': '0.0328', 'num_input_tokens_seen': 249734, 'train_runtime': '125.7', 'train_tokens_per_second': '1987'}
+{'loss': '1.633', 'grad_norm': '0.5059', 'learning_rate': '1.64e-05', 'epoch': '0.03307', 'num_input_tokens_seen': 251781, 'train_runtime': '126.7', 'train_tokens_per_second': '1987'}
+{'loss': '1.475', 'grad_norm': '0.4845', 'learning_rate': '1.653e-05', 'epoch': '0.03334', 'num_input_tokens_seen': 253828, 'train_runtime': '127.7', 'train_tokens_per_second': '1987'}
+{'loss': '1.531', 'grad_norm': '0.5408', 'learning_rate': '1.667e-05', 'epoch': '0.03361', 'num_input_tokens_seen': 255875, 'train_runtime': '128.7', 'train_tokens_per_second': '1988'}
+{'loss': '1.483', 'grad_norm': '0.5341', 'learning_rate': '1.68e-05', 'epoch': '0.03388', 'num_input_tokens_seen': 257922, 'train_runtime': '129.8', 'train_tokens_per_second': '1988'}
+{'loss': '1.496', 'grad_norm': '0.62', 'learning_rate': '1.694e-05', 'epoch': '0.03415', 'num_input_tokens_seen': 259969, 'train_runtime': '130.8', 'train_tokens_per_second': '1988'}
+{'loss': '1.392', 'grad_norm': '0.5367', 'learning_rate': '1.707e-05', 'epoch': '0.03442', 'num_input_tokens_seen': 262016, 'train_runtime': '131.8', 'train_tokens_per_second': '1988'}
+{'loss': '1.658', 'grad_norm': '0.6011', 'learning_rate': '1.72e-05', 'epoch': '0.03469', 'num_input_tokens_seen': 264063, 'train_runtime': '132.8', 'train_tokens_per_second': '1988'}
+{'loss': '1.736', 'grad_norm': '0.6064', 'learning_rate': '1.734e-05', 'epoch': '0.03496', 'num_input_tokens_seen': 266110, 'train_runtime': '133.8', 'train_tokens_per_second': '1988'}
+{'loss': '1.581', 'grad_norm': '0.5968', 'learning_rate': '1.747e-05', 'epoch': '0.03522', 'num_input_tokens_seen': 268157, 'train_runtime': '134.8', 'train_tokens_per_second': '1989'}
+{'loss': '1.429', 'grad_norm': '0.4829', 'learning_rate': '1.761e-05', 'epoch': '0.03549', 'num_input_tokens_seen': 270204, 'train_runtime': '135.9', 'train_tokens_per_second': '1989'}
+{'loss': '1.463', 'grad_norm': '0.5296', 'learning_rate': '1.774e-05', 'epoch': '0.03576', 'num_input_tokens_seen': 272251, 'train_runtime': '136.9', 'train_tokens_per_second': '1989'}
+{'loss': '1.526', 'grad_norm': '0.6281', 'learning_rate': '1.788e-05', 'epoch': '0.03603', 'num_input_tokens_seen': 274298, 'train_runtime': '137.9', 'train_tokens_per_second': '1989'}
+{'loss': '1.534', 'grad_norm': '0.6035', 'learning_rate': '1.801e-05', 'epoch': '0.0363', 'num_input_tokens_seen': 276345, 'train_runtime': '138.9', 'train_tokens_per_second': '1989'}
+{'loss': '1.653', 'grad_norm': '0.5799', 'learning_rate': '1.815e-05', 'epoch': '0.03657', 'num_input_tokens_seen': 278392, 'train_runtime': '139.9', 'train_tokens_per_second': '1989'}
+{'loss': '1.519', 'grad_norm': '0.6246', 'learning_rate': '1.828e-05', 'epoch': '0.03684', 'num_input_tokens_seen': 280439, 'train_runtime': '141', 'train_tokens_per_second': '1989'}
+{'loss': '1.389', 'grad_norm': '0.5421', 'learning_rate': '1.841e-05', 'epoch': '0.03711', 'num_input_tokens_seen': 282486, 'train_runtime': '142', 'train_tokens_per_second': '1990'}
+{'loss': '1.675', 'grad_norm': '0.6183', 'learning_rate': '1.855e-05', 'epoch': '0.03738', 'num_input_tokens_seen': 284533, 'train_runtime': '143', 'train_tokens_per_second': '1990'}
+{'loss': '1.464', 'grad_norm': '0.5757', 'learning_rate': '1.868e-05', 'epoch': '0.03764', 'num_input_tokens_seen': 286580, 'train_runtime': '144', 'train_tokens_per_second': '1990'}
+{'loss': '1.458', 'grad_norm': '0.5838', 'learning_rate': '1.882e-05', 'epoch': '0.03791', 'num_input_tokens_seen': 288627, 'train_runtime': '145', 'train_tokens_per_second': '1990'}
+{'loss': '1.58', 'grad_norm': '0.6429', 'learning_rate': '1.895e-05', 'epoch': '0.03818', 'num_input_tokens_seen': 290674, 'train_runtime': '146.1', 'train_tokens_per_second': '1990'}
+{'loss': '1.327', 'grad_norm': '0.571', 'learning_rate': '1.909e-05', 'epoch': '0.03845', 'num_input_tokens_seen': 292721, 'train_runtime': '147.1', 'train_tokens_per_second': '1990'}
+{'loss': '1.603', 'grad_norm': '0.6355', 'learning_rate': '1.922e-05', 'epoch': '0.03872', 'num_input_tokens_seen': 294768, 'train_runtime': '148.1', 'train_tokens_per_second': '1990'}
+{'loss': '1.377', 'grad_norm': '0.5791', 'learning_rate': '1.935e-05', 'epoch': '0.03899', 'num_input_tokens_seen': 296815, 'train_runtime': '149.1', 'train_tokens_per_second': '1991'}
+{'loss': '1.409', 'grad_norm': '0.6662', 'learning_rate': '1.949e-05', 'epoch': '0.03926', 'num_input_tokens_seen': 298862, 'train_runtime': '150.1', 'train_tokens_per_second': '1991'}
+{'loss': '1.234', 'grad_norm': '0.5858', 'learning_rate': '1.962e-05', 'epoch': '0.03953', 'num_input_tokens_seen': 300909, 'train_runtime': '151.2', 'train_tokens_per_second': '1991'}
+{'loss': '1.58', 'grad_norm': '0.6273', 'learning_rate': '1.976e-05', 'epoch': '0.0398', 'num_input_tokens_seen': 302956, 'train_runtime': '152.2', 'train_tokens_per_second': '1991'}
+{'loss': '1.393', 'grad_norm': '0.6303', 'learning_rate': '1.989e-05', 'epoch': '0.04006', 'num_input_tokens_seen': 305003, 'train_runtime': '153.2', 'train_tokens_per_second': '1991'}
+{'loss': '1.48', 'grad_norm': '0.7072', 'learning_rate': '2.003e-05', 'epoch': '0.04033', 'num_input_tokens_seen': 307050, 'train_runtime': '154.2', 'train_tokens_per_second': '1991'}
+{'loss': '1.548', 'grad_norm': '0.7448', 'learning_rate': '2.016e-05', 'epoch': '0.0406', 'num_input_tokens_seen': 309097, 'train_runtime': '155.2', 'train_tokens_per_second': '1991'}
+{'loss': '1.567', 'grad_norm': '0.7425', 'learning_rate': '2.03e-05', 'epoch': '0.04087', 'num_input_tokens_seen': 311144, 'train_runtime': '156.2', 'train_tokens_per_second': '1991'}
+{'loss': '1.282', 'grad_norm': '0.5985', 'learning_rate': '2.043e-05', 'epoch': '0.04114', 'num_input_tokens_seen': 313191, 'train_runtime': '157.3', 'train_tokens_per_second': '1992'}
+{'loss': '1.438', 'grad_norm': '0.7234', 'learning_rate': '2.056e-05', 'epoch': '0.04141', 'num_input_tokens_seen': 315238, 'train_runtime': '158.3', 'train_tokens_per_second': '1992'}
+{'loss': '1.454', 'grad_norm': '0.6636', 'learning_rate': '2.07e-05', 'epoch': '0.04168', 'num_input_tokens_seen': 317285, 'train_runtime': '159.3', 'train_tokens_per_second': '1992'}
+{'loss': '1.461', 'grad_norm': '0.7192', 'learning_rate': '2.083e-05', 'epoch': '0.04195', 'num_input_tokens_seen': 319332, 'train_runtime': '160.3', 'train_tokens_per_second': '1992'}
+{'loss': '1.385', 'grad_norm': '0.7114', 'learning_rate': '2.097e-05', 'epoch': '0.04222', 'num_input_tokens_seen': 321379, 'train_runtime': '161.3', 'train_tokens_per_second': '1992'}
+{'loss': '1.568', 'grad_norm': '0.9612', 'learning_rate': '2.11e-05', 'epoch': '0.04248', 'num_input_tokens_seen': 323426, 'train_runtime': '162.4', 'train_tokens_per_second': '1992'}
+{'loss': '1.551', 'grad_norm': '0.7511', 'learning_rate': '2.124e-05', 'epoch': '0.04275', 'num_input_tokens_seen': 325473, 'train_runtime': '163.4', 'train_tokens_per_second': '1992'}
+{'loss': '1.468', 'grad_norm': '0.771', 'learning_rate': '2.137e-05', 'epoch': '0.04302', 'num_input_tokens_seen': 327520, 'train_runtime': '164.4', 'train_tokens_per_second': '1992'}
+{'loss': '1.486', 'grad_norm': '0.7804', 'learning_rate': '2.151e-05', 'epoch': '0.04329', 'num_input_tokens_seen': 329567, 'train_runtime': '165.4', 'train_tokens_per_second': '1992'}
+{'loss': '1.426', 'grad_norm': '0.8093', 'learning_rate': '2.164e-05', 'epoch': '0.04356', 'num_input_tokens_seen': 331614, 'train_runtime': '166.4', 'train_tokens_per_second': '1992'}
+{'loss': '1.331', 'grad_norm': '0.7181', 'learning_rate': '2.177e-05', 'epoch': '0.04383', 'num_input_tokens_seen': 333661, 'train_runtime': '167.4', 'train_tokens_per_second': '1993'}
+{'loss': '1.026', 'grad_norm': '0.7177', 'learning_rate': '2.191e-05', 'epoch': '0.0441', 'num_input_tokens_seen': 335708, 'train_runtime': '168.5', 'train_tokens_per_second': '1993'}
+{'loss': '1.391', 'grad_norm': '0.7581', 'learning_rate': '2.204e-05', 'epoch': '0.04437', 'num_input_tokens_seen': 337755, 'train_runtime': '169.5', 'train_tokens_per_second': '1993'}
+{'loss': '1.388', 'grad_norm': '0.8128', 'learning_rate': '2.218e-05', 'epoch': '0.04464', 'num_input_tokens_seen': 339802, 'train_runtime': '170.5', 'train_tokens_per_second': '1993'}
+{'loss': '1.494', 'grad_norm': '0.8851', 'learning_rate': '2.231e-05', 'epoch': '0.0449', 'num_input_tokens_seen': 341849, 'train_runtime': '171.5', 'train_tokens_per_second': '1993'}
+{'loss': '1.275', 'grad_norm': '0.741', 'learning_rate': '2.245e-05', 'epoch': '0.04517', 'num_input_tokens_seen': 343896, 'train_runtime': '172.5', 'train_tokens_per_second': '1993'}
+{'loss': '1.307', 'grad_norm': '0.7937', 'learning_rate': '2.258e-05', 'epoch': '0.04544', 'num_input_tokens_seen': 345943, 'train_runtime': '173.6', 'train_tokens_per_second': '1993'}
+{'loss': '1.188', 'grad_norm': '0.758', 'learning_rate': '2.272e-05', 'epoch': '0.04571', 'num_input_tokens_seen': 347990, 'train_runtime': '174.6', 'train_tokens_per_second': '1993'}
+{'loss': '1.371', 'grad_norm': '0.8093', 'learning_rate': '2.285e-05', 'epoch': '0.04598', 'num_input_tokens_seen': 350037, 'train_runtime': '175.6', 'train_tokens_per_second': '1993'}
+{'loss': '1.234', 'grad_norm': '0.7643', 'learning_rate': '2.298e-05', 'epoch': '0.04625', 'num_input_tokens_seen': 352084, 'train_runtime': '176.6', 'train_tokens_per_second': '1994'}
+{'loss': '1.437', 'grad_norm': '0.8591', 'learning_rate': '2.312e-05', 'epoch': '0.04652', 'num_input_tokens_seen': 354131, 'train_runtime': '177.6', 'train_tokens_per_second': '1994'}
+{'loss': '1.425', 'grad_norm': '1.101', 'learning_rate': '2.325e-05', 'epoch': '0.04679', 'num_input_tokens_seen': 356178, 'train_runtime': '178.7', 'train_tokens_per_second': '1994'}
+{'loss': '1.402', 'grad_norm': '0.8633', 'learning_rate': '2.339e-05', 'epoch': '0.04706', 'num_input_tokens_seen': 358225, 'train_runtime': '179.7', 'train_tokens_per_second': '1994'}
+{'loss': '1.33', 'grad_norm': '0.9336', 'learning_rate': '2.352e-05', 'epoch': '0.04732', 'num_input_tokens_seen': 360272, 'train_runtime': '180.7', 'train_tokens_per_second': '1994'}
+{'loss': '1.189', 'grad_norm': '0.9058', 'learning_rate': '2.366e-05', 'epoch': '0.04759', 'num_input_tokens_seen': 362319, 'train_runtime': '181.7', 'train_tokens_per_second': '1994'}
+{'loss': '1.383', 'grad_norm': '1.003', 'learning_rate': '2.379e-05', 'epoch': '0.04786', 'num_input_tokens_seen': 364366, 'train_runtime': '182.7', 'train_tokens_per_second': '1994'}
+{'loss': '1.263', 'grad_norm': '0.949', 'learning_rate': '2.392e-05', 'epoch': '0.04813', 'num_input_tokens_seen': 366413, 'train_runtime': '183.7', 'train_tokens_per_second': '1994'}
+{'loss': '1.473', 'grad_norm': '1.062', 'learning_rate': '2.406e-05', 'epoch': '0.0484', 'num_input_tokens_seen': 368460, 'train_runtime': '184.8', 'train_tokens_per_second': '1994'}
+{'loss': '1.218', 'grad_norm': '0.862', 'learning_rate': '2.419e-05', 'epoch': '0.04867', 'num_input_tokens_seen': 370507, 'train_runtime': '185.8', 'train_tokens_per_second': '1994'}
+{'loss': '1.232', 'grad_norm': '1.03', 'learning_rate': '2.433e-05', 'epoch': '0.04894', 'num_input_tokens_seen': 372554, 'train_runtime': '186.8', 'train_tokens_per_second': '1994'}
+{'loss': '1.243', 'grad_norm': '0.9608', 'learning_rate': '2.446e-05', 'epoch': '0.04921', 'num_input_tokens_seen': 374601, 'train_runtime': '187.8', 'train_tokens_per_second': '1994'}
+{'loss': '1.423', 'grad_norm': '0.9823', 'learning_rate': '2.46e-05', 'epoch': '0.04948', 'num_input_tokens_seen': 376648, 'train_runtime': '188.8', 'train_tokens_per_second': '1995'}
+{'loss': '1.176', 'grad_norm': '0.9865', 'learning_rate': '2.473e-05', 'epoch': '0.04974', 'num_input_tokens_seen': 378695, 'train_runtime': '189.9', 'train_tokens_per_second': '1995'}
+{'loss': '1.323', 'grad_norm': '1.114', 'learning_rate': '2.487e-05', 'epoch': '0.05001', 'num_input_tokens_seen': 380742, 'train_runtime': '190.9', 'train_tokens_per_second': '1995'}
+{'loss': '1.394', 'grad_norm': '1.221', 'learning_rate': '2.5e-05', 'epoch': '0.05028', 'num_input_tokens_seen': 382789, 'train_runtime': '191.9', 'train_tokens_per_second': '1995'}
+{'loss': '1.228', 'grad_norm': '0.9599', 'learning_rate': '2.513e-05', 'epoch': '0.05055', 'num_input_tokens_seen': 384836, 'train_runtime': '192.9', 'train_tokens_per_second': '1995'}
+{'loss': '0.9697', 'grad_norm': '6.034', 'learning_rate': '2.527e-05', 'epoch': '0.05082', 'num_input_tokens_seen': 386883, 'train_runtime': '193.9', 'train_tokens_per_second': '1995'}
+{'loss': '1.253', 'grad_norm': '1.301', 'learning_rate': '2.54e-05', 'epoch': '0.05109', 'num_input_tokens_seen': 388930, 'train_runtime': '195', 'train_tokens_per_second': '1995'}
+{'loss': '1.398', 'grad_norm': '1.082', 'learning_rate': '2.554e-05', 'epoch': '0.05136', 'num_input_tokens_seen': 390977, 'train_runtime': '196', 'train_tokens_per_second': '1995'}
+{'loss': '1.206', 'grad_norm': '0.9854', 'learning_rate': '2.567e-05', 'epoch': '0.05163', 'num_input_tokens_seen': 393024, 'train_runtime': '197', 'train_tokens_per_second': '1995'}
+{'loss': '1.27', 'grad_norm': '1.037', 'learning_rate': '2.581e-05', 'epoch': '0.0519', 'num_input_tokens_seen': 395071, 'train_runtime': '198', 'train_tokens_per_second': '1995'}
+{'loss': '1.292', 'grad_norm': '1.057', 'learning_rate': '2.594e-05', 'epoch': '0.05216', 'num_input_tokens_seen': 397118, 'train_runtime': '199', 'train_tokens_per_second': '1995'}
+{'loss': '1.272', 'grad_norm': '1.172', 'learning_rate': '2.608e-05', 'epoch': '0.05243', 'num_input_tokens_seen': 399165, 'train_runtime': '200.1', 'train_tokens_per_second': '1995'}
+{'loss': '1.189', 'grad_norm': '1.225', 'learning_rate': '2.621e-05', 'epoch': '0.0527', 'num_input_tokens_seen': 401212, 'train_runtime': '201.1', 'train_tokens_per_second': '1995'}
+{'loss': '1.259', 'grad_norm': '1.042', 'learning_rate': '2.634e-05', 'epoch': '0.05297', 'num_input_tokens_seen': 403259, 'train_runtime': '202.1', 'train_tokens_per_second': '1995'}
+{'loss': '1.39', 'grad_norm': '1.074', 'learning_rate': '2.648e-05', 'epoch': '0.05324', 'num_input_tokens_seen': 405306, 'train_runtime': '203.1', 'train_tokens_per_second': '1995'}
+{'loss': '1.152', 'grad_norm': '1.069', 'learning_rate': '2.661e-05', 'epoch': '0.05351', 'num_input_tokens_seen': 407353, 'train_runtime': '204.1', 'train_tokens_per_second': '1996'}
+{'loss': '1.174', 'grad_norm': '1.032', 'learning_rate': '2.675e-05', 'epoch': '0.05378', 'num_input_tokens_seen': 409400, 'train_runtime': '205.1', 'train_tokens_per_second': '1996'}
+{'loss': '1.095', 'grad_norm': '1.298', 'learning_rate': '2.688e-05', 'epoch': '0.05405', 'num_input_tokens_seen': 411447, 'train_runtime': '206.2', 'train_tokens_per_second': '1996'}
+{'loss': '1.309', 'grad_norm': '1.595', 'learning_rate': '2.702e-05', 'epoch': '0.05432', 'num_input_tokens_seen': 413494, 'train_runtime': '207.2', 'train_tokens_per_second': '1996'}
+{'loss': '1.312', 'grad_norm': '1.278', 'learning_rate': '2.715e-05', 'epoch': '0.05458', 'num_input_tokens_seen': 415541, 'train_runtime': '208.2', 'train_tokens_per_second': '1996'}
+{'loss': '1.237', 'grad_norm': '1.138', 'learning_rate': '2.728e-05', 'epoch': '0.05485', 'num_input_tokens_seen': 417588, 'train_runtime': '209.2', 'train_tokens_per_second': '1996'}
+{'loss': '1.268', 'grad_norm': '1.082', 'learning_rate': '2.742e-05', 'epoch': '0.05512', 'num_input_tokens_seen': 419635, 'train_runtime': '210.2', 'train_tokens_per_second': '1996'}
+{'loss': '1.158', 'grad_norm': '1.197', 'learning_rate': '2.755e-05', 'epoch': '0.05539', 'num_input_tokens_seen': 421682, 'train_runtime': '211.3', 'train_tokens_per_second': '1996'}
+{'loss': '1.114', 'grad_norm': '1.105', 'learning_rate': '2.769e-05', 'epoch': '0.05566', 'num_input_tokens_seen': 423729, 'train_runtime': '212.3', 'train_tokens_per_second': '1996'}
+{'loss': '1.301', 'grad_norm': '1.159', 'learning_rate': '2.782e-05', 'epoch': '0.05593', 'num_input_tokens_seen': 425776, 'train_runtime': '213.3', 'train_tokens_per_second': '1996'}
+{'loss': '1.239', 'grad_norm': '1.363', 'learning_rate': '2.796e-05', 'epoch': '0.0562', 'num_input_tokens_seen': 427823, 'train_runtime': '214.3', 'train_tokens_per_second': '1996'}
+{'loss': '1.175', 'grad_norm': '1.773', 'learning_rate': '2.809e-05', 'epoch': '0.05647', 'num_input_tokens_seen': 429870, 'train_runtime': '215.3', 'train_tokens_per_second': '1996'}
+{'loss': '1.356', 'grad_norm': '1.097', 'learning_rate': '2.823e-05', 'epoch': '0.05674', 'num_input_tokens_seen': 431917, 'train_runtime': '216.3', 'train_tokens_per_second': '1996'}
+{'loss': '1.229', 'grad_norm': '1.221', 'learning_rate': '2.836e-05', 'epoch': '0.057', 'num_input_tokens_seen': 433964, 'train_runtime': '217.4', 'train_tokens_per_second': '1996'}
+{'loss': '1.174', 'grad_norm': '1.157', 'learning_rate': '2.849e-05', 'epoch': '0.05727', 'num_input_tokens_seen': 436011, 'train_runtime': '218.4', 'train_tokens_per_second': '1996'}
+{'loss': '1.068', 'grad_norm': '1.14', 'learning_rate': '2.863e-05', 'epoch': '0.05754', 'num_input_tokens_seen': 438058, 'train_runtime': '219.4', 'train_tokens_per_second': '1997'}
+{'loss': '1.053', 'grad_norm': '1.118', 'learning_rate': '2.876e-05', 'epoch': '0.05781', 'num_input_tokens_seen': 440105, 'train_runtime': '220.4', 'train_tokens_per_second': '1997'}
+{'loss': '1.067', 'grad_norm': '1.103', 'learning_rate': '2.89e-05', 'epoch': '0.05808', 'num_input_tokens_seen': 442152, 'train_runtime': '221.4', 'train_tokens_per_second': '1997'}
+{'loss': '1.2', 'grad_norm': '1.188', 'learning_rate': '2.903e-05', 'epoch': '0.05835', 'num_input_tokens_seen': 444199, 'train_runtime': '222.5', 'train_tokens_per_second': '1997'}
+{'loss': '1.151', 'grad_norm': '1.14', 'learning_rate': '2.917e-05', 'epoch': '0.05862', 'num_input_tokens_seen': 446246, 'train_runtime': '223.5', 'train_tokens_per_second': '1997'}
+{'loss': '1.142', 'grad_norm': '1.394', 'learning_rate': '2.93e-05', 'epoch': '0.05889', 'num_input_tokens_seen': 448293, 'train_runtime': '224.5', 'train_tokens_per_second': '1997'}
+{'loss': '1.119', 'grad_norm': '1.263', 'learning_rate': '2.944e-05', 'epoch': '0.05916', 'num_input_tokens_seen': 450340, 'train_runtime': '225.5', 'train_tokens_per_second': '1997'}
+{'loss': '1.024', 'grad_norm': '1.189', 'learning_rate': '2.957e-05', 'epoch': '0.05942', 'num_input_tokens_seen': 452387, 'train_runtime': '226.5', 'train_tokens_per_second': '1997'}
+{'loss': '1.227', 'grad_norm': '1.225', 'learning_rate': '2.97e-05', 'epoch': '0.05969', 'num_input_tokens_seen': 454434, 'train_runtime': '227.5', 'train_tokens_per_second': '1997'}
+{'loss': '0.956', 'grad_norm': '1.254', 'learning_rate': '2.984e-05', 'epoch': '0.05996', 'num_input_tokens_seen': 456481, 'train_runtime': '228.6', 'train_tokens_per_second': '1997'}
+{'loss': '1.182', 'grad_norm': '1.283', 'learning_rate': '2.997e-05', 'epoch': '0.06023', 'num_input_tokens_seen': 458528, 'train_runtime': '229.6', 'train_tokens_per_second': '1997'}
+{'loss': '1.112', 'grad_norm': '1.259', 'learning_rate': '3.011e-05', 'epoch': '0.0605', 'num_input_tokens_seen': 460575, 'train_runtime': '230.6', 'train_tokens_per_second': '1997'}
+{'loss': '1.142', 'grad_norm': '1.316', 'learning_rate': '3.024e-05', 'epoch': '0.06077', 'num_input_tokens_seen': 462622, 'train_runtime': '231.6', 'train_tokens_per_second': '1997'}
+{'loss': '0.6945', 'grad_norm': '0.9457', 'learning_rate': '3.038e-05', 'epoch': '0.06104', 'num_input_tokens_seen': 464669, 'train_runtime': '232.6', 'train_tokens_per_second': '1997'}
+{'loss': '1.13', 'grad_norm': '1.351', 'learning_rate': '3.051e-05', 'epoch': '0.06131', 'num_input_tokens_seen': 466716, 'train_runtime': '233.7', 'train_tokens_per_second': '1997'}
+{'loss': '1.041', 'grad_norm': '1.223', 'learning_rate': '3.065e-05', 'epoch': '0.06158', 'num_input_tokens_seen': 468763, 'train_runtime': '234.7', 'train_tokens_per_second': '1998'}
+{'loss': '0.9028', 'grad_norm': '1.262', 'learning_rate': '3.078e-05', 'epoch': '0.06184', 'num_input_tokens_seen': 470810, 'train_runtime': '235.7', 'train_tokens_per_second': '1998'}
+{'loss': '1.115', 'grad_norm': '1.139', 'learning_rate': '3.091e-05', 'epoch': '0.06211', 'num_input_tokens_seen': 472857, 'train_runtime': '236.7', 'train_tokens_per_second': '1998'}
+{'loss': '1.181', 'grad_norm': '1.256', 'learning_rate': '3.105e-05', 'epoch': '0.06238', 'num_input_tokens_seen': 474904, 'train_runtime': '237.7', 'train_tokens_per_second': '1998'}
+{'loss': '1.177', 'grad_norm': '1.267', 'learning_rate': '3.118e-05', 'epoch': '0.06265', 'num_input_tokens_seen': 476951, 'train_runtime': '238.7', 'train_tokens_per_second': '1998'}
+{'loss': '1.119', 'grad_norm': '1.256', 'learning_rate': '3.132e-05', 'epoch': '0.06292', 'num_input_tokens_seen': 478998, 'train_runtime': '239.8', 'train_tokens_per_second': '1997'}
+{'loss': '1.147', 'grad_norm': '1.309', 'learning_rate': '3.145e-05', 'epoch': '0.06319', 'num_input_tokens_seen': 481045, 'train_runtime': '240.8', 'train_tokens_per_second': '1997'}
+{'loss': '1.062', 'grad_norm': '1.274', 'learning_rate': '3.159e-05', 'epoch': '0.06346', 'num_input_tokens_seen': 483092, 'train_runtime': '241.9', 'train_tokens_per_second': '1997'}
+{'loss': '1.136', 'grad_norm': '1.696', 'learning_rate': '3.172e-05', 'epoch': '0.06373', 'num_input_tokens_seen': 485139, 'train_runtime': '242.9', 'train_tokens_per_second': '1998'}
+{'loss': '1.039', 'grad_norm': '1.379', 'learning_rate': '3.185e-05', 'epoch': '0.064', 'num_input_tokens_seen': 487186, 'train_runtime': '243.9', 'train_tokens_per_second': '1998'}
+{'loss': '1.247', 'grad_norm': '1.521', 'learning_rate': '3.199e-05', 'epoch': '0.06426', 'num_input_tokens_seen': 489233, 'train_runtime': '244.9', 'train_tokens_per_second': '1998'}
+{'loss': '1.183', 'grad_norm': '1.438', 'learning_rate': '3.212e-05', 'epoch': '0.06453', 'num_input_tokens_seen': 491280, 'train_runtime': '245.9', 'train_tokens_per_second': '1998'}
+{'loss': '1.089', 'grad_norm': '1.296', 'learning_rate': '3.226e-05', 'epoch': '0.0648', 'num_input_tokens_seen': 493327, 'train_runtime': '246.9', 'train_tokens_per_second': '1998'}
+{'loss': '1.182', 'grad_norm': '1.303', 'learning_rate': '3.239e-05', 'epoch': '0.06507', 'num_input_tokens_seen': 495374, 'train_runtime': '248', 'train_tokens_per_second': '1998'}
+{'loss': '1.184', 'grad_norm': '1.186', 'learning_rate': '3.253e-05', 'epoch': '0.06534', 'num_input_tokens_seen': 497421, 'train_runtime': '249', 'train_tokens_per_second': '1998'}
+{'loss': '0.8177', 'grad_norm': '1.317', 'learning_rate': '3.266e-05', 'epoch': '0.06561', 'num_input_tokens_seen': 499468, 'train_runtime': '250', 'train_tokens_per_second': '1998'}
+{'loss': '1.119', 'grad_norm': '1.171', 'learning_rate': '3.28e-05', 'epoch': '0.06588', 'num_input_tokens_seen': 501515, 'train_runtime': '251', 'train_tokens_per_second': '1998'}
+{'loss': '0.9268', 'grad_norm': '1.47', 'learning_rate': '3.293e-05', 'epoch': '0.06615', 'num_input_tokens_seen': 503562, 'train_runtime': '252', 'train_tokens_per_second': '1998'}
+{'loss': '0.8829', 'grad_norm': '1.611', 'learning_rate': '3.306e-05', 'epoch': '0.06642', 'num_input_tokens_seen': 505609, 'train_runtime': '253.1', 'train_tokens_per_second': '1998'}
+{'loss': '1.069', 'grad_norm': '1.647', 'learning_rate': '3.32e-05', 'epoch': '0.06668', 'num_input_tokens_seen': 507656, 'train_runtime': '254.1', 'train_tokens_per_second': '1998'}
+{'loss': '0.9165', 'grad_norm': '1.581', 'learning_rate': '3.333e-05', 'epoch': '0.06695', 'num_input_tokens_seen': 509703, 'train_runtime': '255.1', 'train_tokens_per_second': '1998'}
+{'loss': '1.079', 'grad_norm': '1.815', 'learning_rate': '3.347e-05', 'epoch': '0.06722', 'num_input_tokens_seen': 511750, 'train_runtime': '256.1', 'train_tokens_per_second': '1998'}
+{'loss': '0.8549', 'grad_norm': '1.626', 'learning_rate': '3.36e-05', 'epoch': '0.06749', 'num_input_tokens_seen': 513797, 'train_runtime': '257.1', 'train_tokens_per_second': '1998'}
+{'loss': '0.8964', 'grad_norm': '1.216', 'learning_rate': '3.374e-05', 'epoch': '0.06776', 'num_input_tokens_seen': 515844, 'train_runtime': '258.2', 'train_tokens_per_second': '1998'}
+{'loss': '0.9361', 'grad_norm': '1.345', 'learning_rate': '3.387e-05', 'epoch': '0.06803', 'num_input_tokens_seen': 517891, 'train_runtime': '259.2', 'train_tokens_per_second': '1998'}
+{'loss': '0.8836', 'grad_norm': '1.337', 'learning_rate': '3.401e-05', 'epoch': '0.0683', 'num_input_tokens_seen': 519938, 'train_runtime': '260.2', 'train_tokens_per_second': '1998'}
+{'loss': '1.104', 'grad_norm': '1.467', 'learning_rate': '3.414e-05', 'epoch': '0.06857', 'num_input_tokens_seen': 521985, 'train_runtime': '261.2', 'train_tokens_per_second': '1998'}
+{'loss': '1.308', 'grad_norm': '1.429', 'learning_rate': '3.427e-05', 'epoch': '0.06884', 'num_input_tokens_seen': 524032, 'train_runtime': '262.2', 'train_tokens_per_second': '1998'}
+{'loss': '1.079', 'grad_norm': '1.394', 'learning_rate': '3.441e-05', 'epoch': '0.0691', 'num_input_tokens_seen': 526079, 'train_runtime': '263.3', 'train_tokens_per_second': '1998'}
+{'loss': '1.033', 'grad_norm': '1.304', 'learning_rate': '3.454e-05', 'epoch': '0.06937', 'num_input_tokens_seen': 528126, 'train_runtime': '264.3', 'train_tokens_per_second': '1998'}
+{'loss': '0.9466', 'grad_norm': '1.488', 'learning_rate': '3.468e-05', 'epoch': '0.06964', 'num_input_tokens_seen': 530173, 'train_runtime': '265.3', 'train_tokens_per_second': '1999'}
+{'loss': '1.045', 'grad_norm': '1.277', 'learning_rate': '3.481e-05', 'epoch': '0.06991', 'num_input_tokens_seen': 532220, 'train_runtime': '266.3', 'train_tokens_per_second': '1999'}
+{'loss': '0.9476', 'grad_norm': '1.584', 'learning_rate': '3.495e-05', 'epoch': '0.07018', 'num_input_tokens_seen': 534267, 'train_runtime': '267.3', 'train_tokens_per_second': '1999'}
+{'loss': '0.7732', 'grad_norm': '1.766', 'learning_rate': '3.508e-05', 'epoch': '0.07045', 'num_input_tokens_seen': 536314, 'train_runtime': '268.3', 'train_tokens_per_second': '1999'}
+{'loss': '0.9556', 'grad_norm': '1.519', 'learning_rate': '3.522e-05', 'epoch': '0.07072', 'num_input_tokens_seen': 538361, 'train_runtime': '269.4', 'train_tokens_per_second': '1999'}
+{'loss': '0.7371', 'grad_norm': '1.619', 'learning_rate': '3.535e-05', 'epoch': '0.07099', 'num_input_tokens_seen': 540408, 'train_runtime': '270.4', 'train_tokens_per_second': '1999'}
+{'loss': '1.137', 'grad_norm': '1.548', 'learning_rate': '3.548e-05', 'epoch': '0.07126', 'num_input_tokens_seen': 542455, 'train_runtime': '271.4', 'train_tokens_per_second': '1999'}
+{'loss': '0.9216', 'grad_norm': '1.755', 'learning_rate': '3.562e-05', 'epoch': '0.07152', 'num_input_tokens_seen': 544502, 'train_runtime': '272.4', 'train_tokens_per_second': '1999'}
+{'loss': '1.001', 'grad_norm': '1.717', 'learning_rate': '3.575e-05', 'epoch': '0.07179', 'num_input_tokens_seen': 546549, 'train_runtime': '273.4', 'train_tokens_per_second': '1999'}
+{'loss': '0.8521', 'grad_norm': '1.497', 'learning_rate': '3.589e-05', 'epoch': '0.07206', 'num_input_tokens_seen': 548596, 'train_runtime': '274.5', 'train_tokens_per_second': '1999'}
+{'loss': '0.9487', 'grad_norm': '1.475', 'learning_rate': '3.602e-05', 'epoch': '0.07233', 'num_input_tokens_seen': 550643, 'train_runtime': '275.5', 'train_tokens_per_second': '1999'}
+{'loss': '1.001', 'grad_norm': '1.486', 'learning_rate': '3.616e-05', 'epoch': '0.0726', 'num_input_tokens_seen': 552690, 'train_runtime': '276.5', 'train_tokens_per_second': '1999'}
+{'loss': '0.9512', 'grad_norm': '1.384', 'learning_rate': '3.629e-05', 'epoch': '0.07287', 'num_input_tokens_seen': 554737, 'train_runtime': '277.5', 'train_tokens_per_second': '1999'}
+{'loss': '0.9048', 'grad_norm': '1.256', 'learning_rate': '3.642e-05', 'epoch': '0.07314', 'num_input_tokens_seen': 556784, 'train_runtime': '278.5', 'train_tokens_per_second': '1999'}
+{'loss': '0.966', 'grad_norm': '1.58', 'learning_rate': '3.656e-05', 'epoch': '0.07341', 'num_input_tokens_seen': 558831, 'train_runtime': '279.5', 'train_tokens_per_second': '1999'}
+{'loss': '1.121', 'grad_norm': '1.473', 'learning_rate': '3.669e-05', 'epoch': '0.07368', 'num_input_tokens_seen': 560878, 'train_runtime': '280.6', 'train_tokens_per_second': '1999'}
+{'loss': '0.9792', 'grad_norm': '1.466', 'learning_rate': '3.683e-05', 'epoch': '0.07394', 'num_input_tokens_seen': 562925, 'train_runtime': '281.6', 'train_tokens_per_second': '1999'}
+{'loss': '0.7847', 'grad_norm': '1.34', 'learning_rate': '3.696e-05', 'epoch': '0.07421', 'num_input_tokens_seen': 564972, 'train_runtime': '282.6', 'train_tokens_per_second': '1999'}
+{'loss': '0.9178', 'grad_norm': '1.556', 'learning_rate': '3.71e-05', 'epoch': '0.07448', 'num_input_tokens_seen': 567019, 'train_runtime': '283.6', 'train_tokens_per_second': '1999'}
+{'loss': '0.7879', 'grad_norm': '1.819', 'learning_rate': '3.723e-05', 'epoch': '0.07475', 'num_input_tokens_seen': 569066, 'train_runtime': '284.6', 'train_tokens_per_second': '1999'}
+{'loss': '0.9185', 'grad_norm': '1.563', 'learning_rate': '3.737e-05', 'epoch': '0.07502', 'num_input_tokens_seen': 571113, 'train_runtime': '285.6', 'train_tokens_per_second': '1999'}
+{'loss': '0.9971', 'grad_norm': '1.695', 'learning_rate': '3.75e-05', 'epoch': '0.07529', 'num_input_tokens_seen': 573160, 'train_runtime': '286.7', 'train_tokens_per_second': '1999'}
+{'loss': '0.7991', 'grad_norm': '1.747', 'learning_rate': '3.763e-05', 'epoch': '0.07556', 'num_input_tokens_seen': 575207, 'train_runtime': '287.7', 'train_tokens_per_second': '1999'}
+{'loss': '0.7907', 'grad_norm': '1.532', 'learning_rate': '3.777e-05', 'epoch': '0.07583', 'num_input_tokens_seen': 577254, 'train_runtime': '288.7', 'train_tokens_per_second': '2000'}
+{'loss': '0.977', 'grad_norm': '1.455', 'learning_rate': '3.79e-05', 'epoch': '0.0761', 'num_input_tokens_seen': 579301, 'train_runtime': '289.7', 'train_tokens_per_second': '2000'}
+{'loss': '0.7108', 'grad_norm': '1.527', 'learning_rate': '3.804e-05', 'epoch': '0.07636', 'num_input_tokens_seen': 581348, 'train_runtime': '290.7', 'train_tokens_per_second': '2000'}
+{'loss': '1.001', 'grad_norm': '1.406', 'learning_rate': '3.817e-05', 'epoch': '0.07663', 'num_input_tokens_seen': 583395, 'train_runtime': '291.8', 'train_tokens_per_second': '2000'}
+{'loss': '0.9314', 'grad_norm': '1.623', 'learning_rate': '3.831e-05', 'epoch': '0.0769', 'num_input_tokens_seen': 585442, 'train_runtime': '292.8', 'train_tokens_per_second': '2000'}
+{'loss': '0.8645', 'grad_norm': '1.463', 'learning_rate': '3.844e-05', 'epoch': '0.07717', 'num_input_tokens_seen': 587489, 'train_runtime': '293.8', 'train_tokens_per_second': '2000'}
+{'loss': '0.9374', 'grad_norm': '1.443', 'learning_rate': '3.858e-05', 'epoch': '0.07744', 'num_input_tokens_seen': 589536, 'train_runtime': '294.8', 'train_tokens_per_second': '2000'}
+{'loss': '0.7219', 'grad_norm': '1.712', 'learning_rate': '3.871e-05', 'epoch': '0.07771', 'num_input_tokens_seen': 591583, 'train_runtime': '295.8', 'train_tokens_per_second': '2000'}
+{'loss': '0.8756', 'grad_norm': '1.486', 'learning_rate': '3.884e-05', 'epoch': '0.07798', 'num_input_tokens_seen': 593630, 'train_runtime': '296.8', 'train_tokens_per_second': '2000'}
+{'loss': '0.703', 'grad_norm': '1.555', 'learning_rate': '3.898e-05', 'epoch': '0.07825', 'num_input_tokens_seen': 595677, 'train_runtime': '297.9', 'train_tokens_per_second': '2000'}
+{'loss': '0.8544', 'grad_norm': '1.665', 'learning_rate': '3.911e-05', 'epoch': '0.07852', 'num_input_tokens_seen': 597724, 'train_runtime': '298.9', 'train_tokens_per_second': '2000'}
+{'loss': '1.088', 'grad_norm': '2.26', 'learning_rate': '3.925e-05', 'epoch': '0.07878', 'num_input_tokens_seen': 599771, 'train_runtime': '299.9', 'train_tokens_per_second': '2000'}
+{'loss': '0.8961', 'grad_norm': '1.421', 'learning_rate': '3.938e-05', 'epoch': '0.07905', 'num_input_tokens_seen': 601818, 'train_runtime': '300.9', 'train_tokens_per_second': '2000'}
+{'loss': '1.096', 'grad_norm': '1.708', 'learning_rate': '3.952e-05', 'epoch': '0.07932', 'num_input_tokens_seen': 603865, 'train_runtime': '301.9', 'train_tokens_per_second': '2000'}
+{'loss': '0.9044', 'grad_norm': '1.57', 'learning_rate': '3.965e-05', 'epoch': '0.07959', 'num_input_tokens_seen': 605912, 'train_runtime': '303', 'train_tokens_per_second': '2000'}
+{'loss': '0.9157', 'grad_norm': '1.404', 'learning_rate': '3.978e-05', 'epoch': '0.07986', 'num_input_tokens_seen': 607959, 'train_runtime': '304', 'train_tokens_per_second': '2000'}
+{'loss': '0.9376', 'grad_norm': '1.561', 'learning_rate': '3.992e-05', 'epoch': '0.08013', 'num_input_tokens_seen': 610006, 'train_runtime': '305', 'train_tokens_per_second': '2000'}
+{'loss': '1.079', 'grad_norm': '1.473', 'learning_rate': '4.005e-05', 'epoch': '0.0804', 'num_input_tokens_seen': 612053, 'train_runtime': '306', 'train_tokens_per_second': '2000'}
+{'loss': '0.8078', 'grad_norm': '1.753', 'learning_rate': '4.019e-05', 'epoch': '0.08067', 'num_input_tokens_seen': 614100, 'train_runtime': '307', 'train_tokens_per_second': '2000'}
+{'loss': '0.9436', 'grad_norm': '1.635', 'learning_rate': '4.032e-05', 'epoch': '0.08094', 'num_input_tokens_seen': 616147, 'train_runtime': '308', 'train_tokens_per_second': '2000'}
+{'loss': '0.8635', 'grad_norm': '1.619', 'learning_rate': '4.046e-05', 'epoch': '0.0812', 'num_input_tokens_seen': 618194, 'train_runtime': '309.1', 'train_tokens_per_second': '2000'}
+{'loss': '0.8744', 'grad_norm': '1.512', 'learning_rate': '4.059e-05', 'epoch': '0.08147', 'num_input_tokens_seen': 620241, 'train_runtime': '310.1', 'train_tokens_per_second': '2000'}
+{'loss': '0.9712', 'grad_norm': '1.711', 'learning_rate': '4.073e-05', 'epoch': '0.08174', 'num_input_tokens_seen': 622288, 'train_runtime': '311.1', 'train_tokens_per_second': '2000'}
+{'loss': '0.8672', 'grad_norm': '1.683', 'learning_rate': '4.086e-05', 'epoch': '0.08201', 'num_input_tokens_seen': 624335, 'train_runtime': '312.1', 'train_tokens_per_second': '2000'}
+{'loss': '0.8807', 'grad_norm': '1.646', 'learning_rate': '4.099e-05', 'epoch': '0.08228', 'num_input_tokens_seen': 626382, 'train_runtime': '313.1', 'train_tokens_per_second': '2000'}
+{'loss': '0.9382', 'grad_norm': '1.572', 'learning_rate': '4.113e-05', 'epoch': '0.08255', 'num_input_tokens_seen': 628429, 'train_runtime': '314.1', 'train_tokens_per_second': '2000'}
+{'loss': '0.9096', 'grad_norm': '1.767', 'learning_rate': '4.126e-05', 'epoch': '0.08282', 'num_input_tokens_seen': 630476, 'train_runtime': '315.2', 'train_tokens_per_second': '2000'}
+{'loss': '0.9922', 'grad_norm': '1.578', 'learning_rate': '4.14e-05', 'epoch': '0.08309', 'num_input_tokens_seen': 632523, 'train_runtime': '316.2', 'train_tokens_per_second': '2001'}
+{'loss': '0.6242', 'grad_norm': '1.54', 'learning_rate': '4.153e-05', 'epoch': '0.08336', 'num_input_tokens_seen': 634570, 'train_runtime': '317.2', 'train_tokens_per_second': '2001'}
+{'loss': '0.8425', 'grad_norm': '1.811', 'learning_rate': '4.167e-05', 'epoch': '0.08362', 'num_input_tokens_seen': 636617, 'train_runtime': '318.2', 'train_tokens_per_second': '2001'}
+{'loss': '0.9227', 'grad_norm': '1.62', 'learning_rate': '4.18e-05', 'epoch': '0.08389', 'num_input_tokens_seen': 638664, 'train_runtime': '319.2', 'train_tokens_per_second': '2001'}
+{'loss': '1.007', 'grad_norm': '1.642', 'learning_rate': '4.194e-05', 'epoch': '0.08416', 'num_input_tokens_seen': 640711, 'train_runtime': '320.3', 'train_tokens_per_second': '2001'}
+{'loss': '0.7684', 'grad_norm': '1.521', 'learning_rate': '4.207e-05', 'epoch': '0.08443', 'num_input_tokens_seen': 642758, 'train_runtime': '321.3', 'train_tokens_per_second': '2001'}
+{'loss': '0.9068', 'grad_norm': '1.779', 'learning_rate': '4.22e-05', 'epoch': '0.0847', 'num_input_tokens_seen': 644805, 'train_runtime': '322.3', 'train_tokens_per_second': '2001'}
+{'loss': '0.8407', 'grad_norm': '1.588', 'learning_rate': '4.234e-05', 'epoch': '0.08497', 'num_input_tokens_seen': 646852, 'train_runtime': '323.3', 'train_tokens_per_second': '2001'}
+{'loss': '0.9359', 'grad_norm': '1.685', 'learning_rate': '4.247e-05', 'epoch': '0.08524', 'num_input_tokens_seen': 648899, 'train_runtime': '324.3', 'train_tokens_per_second': '2001'}
+{'loss': '0.8513', 'grad_norm': '1.823', 'learning_rate': '4.261e-05', 'epoch': '0.08551', 'num_input_tokens_seen': 650946, 'train_runtime': '325.4', 'train_tokens_per_second': '2001'}
+{'loss': '1.09', 'grad_norm': '2.251', 'learning_rate': '4.274e-05', 'epoch': '0.08578', 'num_input_tokens_seen': 652993, 'train_runtime': '326.4', 'train_tokens_per_second': '2001'}
+{'loss': '0.8893', 'grad_norm': '1.614', 'learning_rate': '4.288e-05', 'epoch': '0.08604', 'num_input_tokens_seen': 655040, 'train_runtime': '327.4', 'train_tokens_per_second': '2001'}
+{'loss': '0.499', 'grad_norm': '1.693', 'learning_rate': '4.301e-05', 'epoch': '0.08631', 'num_input_tokens_seen': 657087, 'train_runtime': '328.4', 'train_tokens_per_second': '2001'}
+{'loss': '1.006', 'grad_norm': '1.781', 'learning_rate': '4.315e-05', 'epoch': '0.08658', 'num_input_tokens_seen': 659134, 'train_runtime': '329.4', 'train_tokens_per_second': '2001'}
+{'loss': '0.6728', 'grad_norm': '1.412', 'learning_rate': '4.328e-05', 'epoch': '0.08685', 'num_input_tokens_seen': 661181, 'train_runtime': '330.4', 'train_tokens_per_second': '2001'}
+{'loss': '0.6491', 'grad_norm': '1.683', 'learning_rate': '4.341e-05', 'epoch': '0.08712', 'num_input_tokens_seen': 663228, 'train_runtime': '331.5', 'train_tokens_per_second': '2001'}
+{'loss': '0.9646', 'grad_norm': '1.918', 'learning_rate': '4.355e-05', 'epoch': '0.08739', 'num_input_tokens_seen': 665275, 'train_runtime': '332.5', 'train_tokens_per_second': '2001'}
+{'loss': '0.6656', 'grad_norm': '1.711', 'learning_rate': '4.368e-05', 'epoch': '0.08766', 'num_input_tokens_seen': 667322, 'train_runtime': '333.5', 'train_tokens_per_second': '2001'}
+{'loss': '0.7556', 'grad_norm': '1.799', 'learning_rate': '4.382e-05', 'epoch': '0.08793', 'num_input_tokens_seen': 669369, 'train_runtime': '334.5', 'train_tokens_per_second': '2001'}
+{'loss': '0.8211', 'grad_norm': '1.622', 'learning_rate': '4.395e-05', 'epoch': '0.0882', 'num_input_tokens_seen': 671416, 'train_runtime': '335.5', 'train_tokens_per_second': '2001'}
+{'loss': '0.8586', 'grad_norm': '1.673', 'learning_rate': '4.409e-05', 'epoch': '0.08846', 'num_input_tokens_seen': 673463, 'train_runtime': '336.6', 'train_tokens_per_second': '2001'}
+{'loss': '0.8275', 'grad_norm': '1.59', 'learning_rate': '4.422e-05', 'epoch': '0.08873', 'num_input_tokens_seen': 675510, 'train_runtime': '337.6', 'train_tokens_per_second': '2001'}
+{'loss': '0.7986', 'grad_norm': '1.536', 'learning_rate': '4.435e-05', 'epoch': '0.089', 'num_input_tokens_seen': 677557, 'train_runtime': '338.6', 'train_tokens_per_second': '2001'}
+{'loss': '0.8409', 'grad_norm': '1.524', 'learning_rate': '4.449e-05', 'epoch': '0.08927', 'num_input_tokens_seen': 679604, 'train_runtime': '339.6', 'train_tokens_per_second': '2001'}
+{'loss': '0.7889', 'grad_norm': '1.606', 'learning_rate': '4.462e-05', 'epoch': '0.08954', 'num_input_tokens_seen': 681651, 'train_runtime': '340.6', 'train_tokens_per_second': '2001'}
+{'loss': '0.8146', 'grad_norm': '1.721', 'learning_rate': '4.476e-05', 'epoch': '0.08981', 'num_input_tokens_seen': 683698, 'train_runtime': '341.6', 'train_tokens_per_second': '2001'}
+{'loss': '0.9218', 'grad_norm': '1.753', 'learning_rate': '4.489e-05', 'epoch': '0.09008', 'num_input_tokens_seen': 685745, 'train_runtime': '342.7', 'train_tokens_per_second': '2001'}
+{'loss': '0.6649', 'grad_norm': '1.632', 'learning_rate': '4.503e-05', 'epoch': '0.09035', 'num_input_tokens_seen': 687792, 'train_runtime': '343.7', 'train_tokens_per_second': '2001'}
+{'loss': '0.7102', 'grad_norm': '1.424', 'learning_rate': '4.516e-05', 'epoch': '0.09062', 'num_input_tokens_seen': 689839, 'train_runtime': '344.7', 'train_tokens_per_second': '2001'}
+{'loss': '1.134', 'grad_norm': '2.3', 'learning_rate': '4.53e-05', 'epoch': '0.09088', 'num_input_tokens_seen': 691886, 'train_runtime': '345.7', 'train_tokens_per_second': '2001'}
+{'loss': '0.9732', 'grad_norm': '2.07', 'learning_rate': '4.543e-05', 'epoch': '0.09115', 'num_input_tokens_seen': 693933, 'train_runtime': '346.7', 'train_tokens_per_second': '2001'}
+{'loss': '0.8109', 'grad_norm': '1.658', 'learning_rate': '4.556e-05', 'epoch': '0.09142', 'num_input_tokens_seen': 695980, 'train_runtime': '347.8', 'train_tokens_per_second': '2001'}
+{'loss': '0.8198', 'grad_norm': '1.551', 'learning_rate': '4.57e-05', 'epoch': '0.09169', 'num_input_tokens_seen': 698027, 'train_runtime': '348.8', 'train_tokens_per_second': '2001'}
+{'loss': '0.6508', 'grad_norm': '1.996', 'learning_rate': '4.583e-05', 'epoch': '0.09196', 'num_input_tokens_seen': 700074, 'train_runtime': '349.8', 'train_tokens_per_second': '2001'}
+{'loss': '0.6369', 'grad_norm': '1.678', 'learning_rate': '4.597e-05', 'epoch': '0.09223', 'num_input_tokens_seen': 702121, 'train_runtime': '350.8', 'train_tokens_per_second': '2001'}
+{'loss': '0.8778', 'grad_norm': '1.761', 'learning_rate': '4.61e-05', 'epoch': '0.0925', 'num_input_tokens_seen': 704168, 'train_runtime': '351.8', 'train_tokens_per_second': '2001'}
+{'loss': '0.5125', 'grad_norm': '2.032', 'learning_rate': '4.624e-05', 'epoch': '0.09277', 'num_input_tokens_seen': 706215, 'train_runtime': '352.9', 'train_tokens_per_second': '2001'}
+{'loss': '0.5776', 'grad_norm': '1.902', 'learning_rate': '4.637e-05', 'epoch': '0.09304', 'num_input_tokens_seen': 708262, 'train_runtime': '353.9', 'train_tokens_per_second': '2001'}
+{'loss': '0.8128', 'grad_norm': '1.934', 'learning_rate': '4.651e-05', 'epoch': '0.0933', 'num_input_tokens_seen': 710309, 'train_runtime': '354.9', 'train_tokens_per_second': '2001'}
+{'loss': '0.8', 'grad_norm': '2.005', 'learning_rate': '4.664e-05', 'epoch': '0.09357', 'num_input_tokens_seen': 712356, 'train_runtime': '355.9', 'train_tokens_per_second': '2002'}
+{'loss': '0.9134', 'grad_norm': '1.872', 'learning_rate': '4.677e-05', 'epoch': '0.09384', 'num_input_tokens_seen': 714403, 'train_runtime': '356.9', 'train_tokens_per_second': '2002'}
+{'loss': '0.8195', 'grad_norm': '1.896', 'learning_rate': '4.691e-05', 'epoch': '0.09411', 'num_input_tokens_seen': 716450, 'train_runtime': '357.9', 'train_tokens_per_second': '2002'}
+{'loss': '0.9879', 'grad_norm': '1.732', 'learning_rate': '4.704e-05', 'epoch': '0.09438', 'num_input_tokens_seen': 718497, 'train_runtime': '359', 'train_tokens_per_second': '2002'}
+{'loss': '0.7241', 'grad_norm': '1.685', 'learning_rate': '4.718e-05', 'epoch': '0.09465', 'num_input_tokens_seen': 720544, 'train_runtime': '360', 'train_tokens_per_second': '2002'}
+{'loss': '0.8061', 'grad_norm': '1.555', 'learning_rate': '4.731e-05', 'epoch': '0.09492', 'num_input_tokens_seen': 722591, 'train_runtime': '361', 'train_tokens_per_second': '2002'}
+{'loss': '0.8035', 'grad_norm': '1.807', 'learning_rate': '4.745e-05', 'epoch': '0.09519', 'num_input_tokens_seen': 724638, 'train_runtime': '362', 'train_tokens_per_second': '2002'}
+{'loss': '0.4991', 'grad_norm': '1.543', 'learning_rate': '4.758e-05', 'epoch': '0.09546', 'num_input_tokens_seen': 726685, 'train_runtime': '363', 'train_tokens_per_second': '2002'}
+{'loss': '0.8125', 'grad_norm': '1.724', 'learning_rate': '4.772e-05', 'epoch': '0.09572', 'num_input_tokens_seen': 728732, 'train_runtime': '364.1', 'train_tokens_per_second': '2002'}
+{'loss': '0.752', 'grad_norm': '1.793', 'learning_rate': '4.785e-05', 'epoch': '0.09599', 'num_input_tokens_seen': 730779, 'train_runtime': '365.1', 'train_tokens_per_second': '2002'}
+{'loss': '0.9271', 'grad_norm': '2.305', 'learning_rate': '4.798e-05', 'epoch': '0.09626', 'num_input_tokens_seen': 732826, 'train_runtime': '366.1', 'train_tokens_per_second': '2002'}
+{'loss': '0.6132', 'grad_norm': '2.224', 'learning_rate': '4.812e-05', 'epoch': '0.09653', 'num_input_tokens_seen': 734873, 'train_runtime': '367.1', 'train_tokens_per_second': '2002'}
+{'loss': '0.6797', 'grad_norm': '1.914', 'learning_rate': '4.825e-05', 'epoch': '0.0968', 'num_input_tokens_seen': 736920, 'train_runtime': '368.1', 'train_tokens_per_second': '2002'}
+{'loss': '0.9467', 'grad_norm': '2.078', 'learning_rate': '4.839e-05', 'epoch': '0.09707', 'num_input_tokens_seen': 738967, 'train_runtime': '369.1', 'train_tokens_per_second': '2002'}
+{'loss': '0.8589', 'grad_norm': '2.175', 'learning_rate': '4.852e-05', 'epoch': '0.09734', 'num_input_tokens_seen': 741014, 'train_runtime': '370.2', 'train_tokens_per_second': '2002'}
+{'loss': '0.8454', 'grad_norm': '1.922', 'learning_rate': '4.866e-05', 'epoch': '0.09761', 'num_input_tokens_seen': 743061, 'train_runtime': '371.2', 'train_tokens_per_second': '2002'}
+{'loss': '0.8227', 'grad_norm': '1.937', 'learning_rate': '4.879e-05', 'epoch': '0.09788', 'num_input_tokens_seen': 745108, 'train_runtime': '372.2', 'train_tokens_per_second': '2002'}
+{'loss': '0.7916', 'grad_norm': '1.935', 'learning_rate': '4.892e-05', 'epoch': '0.09814', 'num_input_tokens_seen': 747155, 'train_runtime': '373.2', 'train_tokens_per_second': '2002'}
+{'loss': '0.6554', 'grad_norm': '1.673', 'learning_rate': '4.906e-05', 'epoch': '0.09841', 'num_input_tokens_seen': 749202, 'train_runtime': '374.2', 'train_tokens_per_second': '2002'}
+{'loss': '0.8427', 'grad_norm': '1.627', 'learning_rate': '4.919e-05', 'epoch': '0.09868', 'num_input_tokens_seen': 751249, 'train_runtime': '375.2', 'train_tokens_per_second': '2002'}
+{'loss': '0.7', 'grad_norm': '1.613', 'learning_rate': '4.933e-05', 'epoch': '0.09895', 'num_input_tokens_seen': 753296, 'train_runtime': '376.3', 'train_tokens_per_second': '2002'}
+{'loss': '1.085', 'grad_norm': '1.733', 'learning_rate': '4.946e-05', 'epoch': '0.09922', 'num_input_tokens_seen': 755343, 'train_runtime': '377.3', 'train_tokens_per_second': '2002'}
+{'loss': '0.7366', 'grad_norm': '1.8', 'learning_rate': '4.96e-05', 'epoch': '0.09949', 'num_input_tokens_seen': 757390, 'train_runtime': '378.3', 'train_tokens_per_second': '2002'}
+{'loss': '0.8539', 'grad_norm': '2.089', 'learning_rate': '4.973e-05', 'epoch': '0.09976', 'num_input_tokens_seen': 759437, 'train_runtime': '379.3', 'train_tokens_per_second': '2002'}
+{'loss': '1.091', 'grad_norm': '2.162', 'learning_rate': '4.987e-05', 'epoch': '0.1', 'num_input_tokens_seen': 761484, 'train_runtime': '380.3', 'train_tokens_per_second': '2002'}
+{'loss': '0.5954', 'grad_norm': '1.539', 'learning_rate': '5e-05', 'epoch': '0.1003', 'num_input_tokens_seen': 763531, 'train_runtime': '381.4', 'train_tokens_per_second': '2002'}
+{'loss': '0.8637', 'grad_norm': '3.224', 'learning_rate': '5e-05', 'epoch': '0.1006', 'num_input_tokens_seen': 765578, 'train_runtime': '382.4', 'train_tokens_per_second': '2002'}
+{'loss': '1.156', 'grad_norm': '2.482', 'learning_rate': '5e-05', 'epoch': '0.1008', 'num_input_tokens_seen': 767625, 'train_runtime': '383.4', 'train_tokens_per_second': '2002'}
+{'loss': '0.9774', 'grad_norm': '2.115', 'learning_rate': '5e-05', 'epoch': '0.1011', 'num_input_tokens_seen': 769672, 'train_runtime': '384.4', 'train_tokens_per_second': '2002'}
+{'loss': '0.7794', 'grad_norm': '2.068', 'learning_rate': '5e-05', 'epoch': '0.1014', 'num_input_tokens_seen': 771719, 'train_runtime': '385.4', 'train_tokens_per_second': '2002'}
+{'loss': '0.7327', 'grad_norm': '2.226', 'learning_rate': '5e-05', 'epoch': '0.1016', 'num_input_tokens_seen': 773766, 'train_runtime': '386.5', 'train_tokens_per_second': '2002'}
+{'loss': '0.7302', 'grad_norm': '1.966', 'learning_rate': '5e-05', 'epoch': '0.1019', 'num_input_tokens_seen': 775813, 'train_runtime': '387.5', 'train_tokens_per_second': '2002'}
+{'loss': '0.6878', 'grad_norm': '1.581', 'learning_rate': '5e-05', 'epoch': '0.1022', 'num_input_tokens_seen': 777860, 'train_runtime': '388.5', 'train_tokens_per_second': '2002'}
+{'loss': '0.737', 'grad_norm': '1.672', 'learning_rate': '5e-05', 'epoch': '0.1024', 'num_input_tokens_seen': 779907, 'train_runtime': '389.5', 'train_tokens_per_second': '2002'}
+{'loss': '0.9472', 'grad_norm': '2.17', 'learning_rate': '5e-05', 'epoch': '0.1027', 'num_input_tokens_seen': 781954, 'train_runtime': '390.5', 'train_tokens_per_second': '2002'}
+{'loss': '0.5831', 'grad_norm': '1.504', 'learning_rate': '5e-05', 'epoch': '0.103', 'num_input_tokens_seen': 784001, 'train_runtime': '391.6', 'train_tokens_per_second': '2002'}
+{'loss': '0.6743', 'grad_norm': '1.78', 'learning_rate': '5e-05', 'epoch': '0.1033', 'num_input_tokens_seen': 786048, 'train_runtime': '392.6', 'train_tokens_per_second': '2002'}
+{'loss': '0.5688', 'grad_norm': '1.95', 'learning_rate': '5e-05', 'epoch': '0.1035', 'num_input_tokens_seen': 788095, 'train_runtime': '393.6', 'train_tokens_per_second': '2002'}
+{'loss': '0.929', 'grad_norm': '2.087', 'learning_rate': '5e-05', 'epoch': '0.1038', 'num_input_tokens_seen': 790142, 'train_runtime': '394.6', 'train_tokens_per_second': '2002'}
+{'loss': '0.4627', 'grad_norm': '2.017', 'learning_rate': '5e-05', 'epoch': '0.1041', 'num_input_tokens_seen': 792189, 'train_runtime': '395.6', 'train_tokens_per_second': '2002'}
+{'loss': '0.8193', 'grad_norm': '2.009', 'learning_rate': '5e-05', 'epoch': '0.1043', 'num_input_tokens_seen': 794236, 'train_runtime': '396.7', 'train_tokens_per_second': '2002'}
+  File "/usr/local/bin/llamafactory-cli", line 8, in <module>
+    sys.exit(main())
+             ^^^^^^
+  File "/workspace/LlamaFactory/src/llamafactory/cli.py", line 24, in main
+    launcher.launch()
+  File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 157, in launch
+    run_exp()
+  File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 125, in run_exp
+    _training_function(config={"args": args, "callbacks": callbacks})
+  File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 91, in _training_function
+    run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
+  File "/workspace/LlamaFactory/src/llamafactory/train/pt/workflow.py", line 63, in run_pt
+    train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2174, in train
+    return inner_training_loop(
+           ^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2536, in _inner_training_loop
+    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
+                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3837, in training_step
+    self.accelerator.backward(loss, **kwargs)
+  File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2740, in backward
+    loss.backward(**kwargs)
+  File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 521, in backward
+    torch.autograd.backward(
+  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 289, in backward
+    _engine_run_backward(
+  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 769, in _engine_run_backward
+    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+KeyboardInterrupt

LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "os":  "Linux-6.8.0-94-generic-x86_64-with-glibc2.35",
+  "python":  "CPython 3.11.10",
+  "startedAt":  "2026-02-04T03:57:46.163443Z",
+  "args":  [
+    "/workspace/v127rc_exp1/C.yaml"
+  ],
+  "program":  "/usr/local/bin/llamafactory-cli",
+  "git":  {
+    "remote":  "https://github.com/hiyouga/LlamaFactory.git",
+    "commit":  "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email":  "markmochi200@gmail.com",
+  "root":  "/workspace/LlamaFactory",
+  "host":  "47a53adf0198",
+  "executable":  "/usr/bin/python",
+  "cpu_count":  16,
+  "cpu_count_logical":  32,
+  "gpu":  "NVIDIA GeForce RTX 4090",
+  "gpu_count":  1,
+  "disk":  {
+    "/":  {
+      "total":  "21474836480",
+      "used":  "1858306048"
+    }
+  },
+  "memory":  {
+    "total":  "201701408768"
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA GeForce RTX 4090",
+      "memoryTotal":  "25757220864",
+      "cudaCores":  16384,
+      "architecture":  "Ada",
+      "uuid":  "GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de"
+    }
+  ],
+  "cudaVersion":  "13.0",
+  "writerId":  "mfjy22anxcucsb3vwlaimrwvqrgvipis"
+}

LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"train/global_step":388,"train/grad_norm":2.0090420246124268,"train/learning_rate":4.9999916410392856e-05,"_wandb":{"runtime":396},"_runtime":396,"train/loss":0.8193472027778625,"_step":387,"train/epoch":0.1043291207313794,"train_runtime":396.6553,"train/train_tokens_per_second":2002.333,"_timestamp":1.770177862347725e+09,"train/num_input_tokens_seen":794236}

LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/output.log ADDED Viewed

	@@ -0,0 +1,299 @@

+  0%|                                                                                                                                                                                       | 0/40950 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+{'loss': '1.719', 'grad_norm': '0.3142', 'learning_rate': '0', 'epoch': '0.0001221', 'num_input_tokens_seen': 2047, 'train_runtime': '3.017', 'train_tokens_per_second': '678.5'}
+{'loss': '1.142', 'grad_norm': '0.2725', 'learning_rate': '6.105e-08', 'epoch': '0.0002442', 'num_input_tokens_seen': 4094, 'train_runtime': '4.05', 'train_tokens_per_second': '1011'}
+{'loss': '1.39', 'grad_norm': '0.379', 'learning_rate': '1.221e-07', 'epoch': '0.0003663', 'num_input_tokens_seen': 6141, 'train_runtime': '5.087', 'train_tokens_per_second': '1207'}
+{'loss': '1.457', 'grad_norm': '0.2879', 'learning_rate': '1.832e-07', 'epoch': '0.0004884', 'num_input_tokens_seen': 8188, 'train_runtime': '6.124', 'train_tokens_per_second': '1337'}
+{'loss': '1.286', 'grad_norm': '0.2564', 'learning_rate': '2.442e-07', 'epoch': '0.0006105', 'num_input_tokens_seen': 10235, 'train_runtime': '7.165', 'train_tokens_per_second': '1429'}
+{'loss': '0.01258', 'grad_norm': '0.042', 'learning_rate': '3.053e-07', 'epoch': '0.0007326', 'num_input_tokens_seen': 12282, 'train_runtime': '8.201', 'train_tokens_per_second': '1498'}
+{'loss': '0.8563', 'grad_norm': '0.267', 'learning_rate': '3.663e-07', 'epoch': '0.0008547', 'num_input_tokens_seen': 14329, 'train_runtime': '9.241', 'train_tokens_per_second': '1551'}
+{'loss': '1.581', 'grad_norm': '0.2901', 'learning_rate': '4.274e-07', 'epoch': '0.0009768', 'num_input_tokens_seen': 16376, 'train_runtime': '10.28', 'train_tokens_per_second': '1593'}
+{'loss': '1.573', 'grad_norm': '0.2915', 'learning_rate': '4.884e-07', 'epoch': '0.001099', 'num_input_tokens_seen': 18423, 'train_runtime': '11.32', 'train_tokens_per_second': '1628'}
+{'loss': '1.346', 'grad_norm': '0.2841', 'learning_rate': '5.495e-07', 'epoch': '0.001221', 'num_input_tokens_seen': 20470, 'train_runtime': '12.35', 'train_tokens_per_second': '1657'}
+{'loss': '1.651', 'grad_norm': '0.4522', 'learning_rate': '6.105e-07', 'epoch': '0.001343', 'num_input_tokens_seen': 22517, 'train_runtime': '13.39', 'train_tokens_per_second': '1682'}
+{'loss': '1.487', 'grad_norm': '0.3466', 'learning_rate': '6.716e-07', 'epoch': '0.001465', 'num_input_tokens_seen': 24564, 'train_runtime': '14.44', 'train_tokens_per_second': '1701'}
+{'loss': '0.8106', 'grad_norm': '0.2226', 'learning_rate': '7.326e-07', 'epoch': '0.001587', 'num_input_tokens_seen': 26611, 'train_runtime': '15.48', 'train_tokens_per_second': '1719'}
+{'loss': '0.5651', 'grad_norm': '0.2162', 'learning_rate': '7.937e-07', 'epoch': '0.001709', 'num_input_tokens_seen': 28658, 'train_runtime': '16.52', 'train_tokens_per_second': '1735'}
+{'loss': '1.622', 'grad_norm': '0.3259', 'learning_rate': '8.547e-07', 'epoch': '0.001832', 'num_input_tokens_seen': 30705, 'train_runtime': '17.56', 'train_tokens_per_second': '1749'}
+{'loss': '1.418', 'grad_norm': '0.285', 'learning_rate': '9.158e-07', 'epoch': '0.001954', 'num_input_tokens_seen': 32752, 'train_runtime': '18.6', 'train_tokens_per_second': '1761'}
+{'loss': '1.69', 'grad_norm': '0.3264', 'learning_rate': '9.768e-07', 'epoch': '0.002076', 'num_input_tokens_seen': 34799, 'train_runtime': '19.64', 'train_tokens_per_second': '1772'}
+{'loss': '1.73', 'grad_norm': '0.3089', 'learning_rate': '1.038e-06', 'epoch': '0.002198', 'num_input_tokens_seen': 36846, 'train_runtime': '20.68', 'train_tokens_per_second': '1782'}
+{'loss': '1.511', 'grad_norm': '0.3119', 'learning_rate': '1.099e-06', 'epoch': '0.00232', 'num_input_tokens_seen': 38893, 'train_runtime': '21.72', 'train_tokens_per_second': '1791'}
+{'loss': '1.435', 'grad_norm': '0.3081', 'learning_rate': '1.16e-06', 'epoch': '0.002442', 'num_input_tokens_seen': 40940, 'train_runtime': '22.77', 'train_tokens_per_second': '1798'}
+{'loss': '1.785', 'grad_norm': '0.4437', 'learning_rate': '1.221e-06', 'epoch': '0.002564', 'num_input_tokens_seen': 42987, 'train_runtime': '23.81', 'train_tokens_per_second': '1806'}
+{'loss': '1.101', 'grad_norm': '0.3949', 'learning_rate': '1.282e-06', 'epoch': '0.002686', 'num_input_tokens_seen': 45034, 'train_runtime': '24.85', 'train_tokens_per_second': '1812'}
+{'loss': '0.7684', 'grad_norm': '0.2791', 'learning_rate': '1.343e-06', 'epoch': '0.002808', 'num_input_tokens_seen': 47081, 'train_runtime': '25.89', 'train_tokens_per_second': '1819'}
+{'loss': '0.9445', 'grad_norm': '0.2267', 'learning_rate': '1.404e-06', 'epoch': '0.00293', 'num_input_tokens_seen': 49128, 'train_runtime': '26.93', 'train_tokens_per_second': '1825'}
+{'loss': '1.328', 'grad_norm': '0.5019', 'learning_rate': '1.465e-06', 'epoch': '0.003053', 'num_input_tokens_seen': 51175, 'train_runtime': '27.96', 'train_tokens_per_second': '1830'}
+{'loss': '1.597', 'grad_norm': '0.3425', 'learning_rate': '1.526e-06', 'epoch': '0.003175', 'num_input_tokens_seen': 53222, 'train_runtime': '29.01', 'train_tokens_per_second': '1834'}
+{'loss': '1.797', 'grad_norm': '0.3407', 'learning_rate': '1.587e-06', 'epoch': '0.003297', 'num_input_tokens_seen': 55269, 'train_runtime': '30.05', 'train_tokens_per_second': '1839'}
+{'loss': '0.7549', 'grad_norm': '0.2074', 'learning_rate': '1.648e-06', 'epoch': '0.003419', 'num_input_tokens_seen': 57316, 'train_runtime': '31.1', 'train_tokens_per_second': '1843'}
+{'loss': '0.6662', 'grad_norm': '0.2184', 'learning_rate': '1.709e-06', 'epoch': '0.003541', 'num_input_tokens_seen': 59363, 'train_runtime': '32.14', 'train_tokens_per_second': '1847'}
+{'loss': '0.9995', 'grad_norm': '0.2354', 'learning_rate': '1.77e-06', 'epoch': '0.003663', 'num_input_tokens_seen': 61410, 'train_runtime': '33.19', 'train_tokens_per_second': '1850'}
+{'loss': '1.189', 'grad_norm': '0.2462', 'learning_rate': '1.832e-06', 'epoch': '0.003785', 'num_input_tokens_seen': 63457, 'train_runtime': '34.23', 'train_tokens_per_second': '1854'}
+{'loss': '1.353', 'grad_norm': '0.2564', 'learning_rate': '1.893e-06', 'epoch': '0.003907', 'num_input_tokens_seen': 65504, 'train_runtime': '35.28', 'train_tokens_per_second': '1857'}
+{'loss': '1.41', 'grad_norm': '0.3253', 'learning_rate': '1.954e-06', 'epoch': '0.004029', 'num_input_tokens_seen': 67551, 'train_runtime': '36.32', 'train_tokens_per_second': '1860'}
+{'loss': '1.575', 'grad_norm': '0.303', 'learning_rate': '2.015e-06', 'epoch': '0.004151', 'num_input_tokens_seen': 69598, 'train_runtime': '37.36', 'train_tokens_per_second': '1863'}
+{'loss': '1.542', 'grad_norm': '0.3227', 'learning_rate': '2.076e-06', 'epoch': '0.004274', 'num_input_tokens_seen': 71645, 'train_runtime': '38.4', 'train_tokens_per_second': '1866'}
+{'loss': '1.281', 'grad_norm': '0.3266', 'learning_rate': '2.137e-06', 'epoch': '0.004396', 'num_input_tokens_seen': 73692, 'train_runtime': '39.45', 'train_tokens_per_second': '1868'}
+{'loss': '1.936', 'grad_norm': '0.601', 'learning_rate': '2.198e-06', 'epoch': '0.004518', 'num_input_tokens_seen': 75739, 'train_runtime': '40.49', 'train_tokens_per_second': '1871'}
+{'loss': '1.855', 'grad_norm': '0.2591', 'learning_rate': '2.259e-06', 'epoch': '0.00464', 'num_input_tokens_seen': 77786, 'train_runtime': '41.53', 'train_tokens_per_second': '1873'}
+{'loss': '0.8793', 'grad_norm': '0.308', 'learning_rate': '2.32e-06', 'epoch': '0.004762', 'num_input_tokens_seen': 79833, 'train_runtime': '42.57', 'train_tokens_per_second': '1875'}
+{'loss': '1.274', 'grad_norm': '0.2598', 'learning_rate': '2.381e-06', 'epoch': '0.004884', 'num_input_tokens_seen': 81880, 'train_runtime': '43.61', 'train_tokens_per_second': '1877'}
+{'loss': '1.502', 'grad_norm': '0.3138', 'learning_rate': '2.442e-06', 'epoch': '0.005006', 'num_input_tokens_seen': 83927, 'train_runtime': '44.66', 'train_tokens_per_second': '1879'}
+{'loss': '1.367', 'grad_norm': '0.2641', 'learning_rate': '2.503e-06', 'epoch': '0.005128', 'num_input_tokens_seen': 85974, 'train_runtime': '45.7', 'train_tokens_per_second': '1881'}
+{'loss': '0.7333', 'grad_norm': '0.226', 'learning_rate': '2.564e-06', 'epoch': '0.00525', 'num_input_tokens_seen': 88021, 'train_runtime': '46.75', 'train_tokens_per_second': '1883'}
+{'loss': '1.199', 'grad_norm': '0.277', 'learning_rate': '2.625e-06', 'epoch': '0.005372', 'num_input_tokens_seen': 90068, 'train_runtime': '47.82', 'train_tokens_per_second': '1883'}
+{'loss': '1.659', 'grad_norm': '0.3296', 'learning_rate': '2.686e-06', 'epoch': '0.005495', 'num_input_tokens_seen': 92115, 'train_runtime': '48.86', 'train_tokens_per_second': '1885'}
+{'loss': '1.699', 'grad_norm': '0.3483', 'learning_rate': '2.747e-06', 'epoch': '0.005617', 'num_input_tokens_seen': 94162, 'train_runtime': '49.91', 'train_tokens_per_second': '1887'}
+{'loss': '1.513', 'grad_norm': '0.3496', 'learning_rate': '2.808e-06', 'epoch': '0.005739', 'num_input_tokens_seen': 96209, 'train_runtime': '50.96', 'train_tokens_per_second': '1888'}
+{'loss': '1.737', 'grad_norm': '0.3098', 'learning_rate': '2.869e-06', 'epoch': '0.005861', 'num_input_tokens_seen': 98256, 'train_runtime': '52.01', 'train_tokens_per_second': '1889'}
+{'loss': '1.359', 'grad_norm': '0.3305', 'learning_rate': '2.93e-06', 'epoch': '0.005983', 'num_input_tokens_seen': 100303, 'train_runtime': '53.06', 'train_tokens_per_second': '1891'}
+{'loss': '1.805', 'grad_norm': '0.3772', 'learning_rate': '2.991e-06', 'epoch': '0.006105', 'num_input_tokens_seen': 102350, 'train_runtime': '54.1', 'train_tokens_per_second': '1892'}
+{'loss': '1.882', 'grad_norm': '0.3816', 'learning_rate': '3.053e-06', 'epoch': '0.006227', 'num_input_tokens_seen': 104397, 'train_runtime': '55.15', 'train_tokens_per_second': '1893'}
+{'loss': '1.566', 'grad_norm': '0.333', 'learning_rate': '3.114e-06', 'epoch': '0.006349', 'num_input_tokens_seen': 106444, 'train_runtime': '56.19', 'train_tokens_per_second': '1895'}
+{'loss': '1.816', 'grad_norm': '0.3612', 'learning_rate': '3.175e-06', 'epoch': '0.006471', 'num_input_tokens_seen': 108491, 'train_runtime': '57.23', 'train_tokens_per_second': '1896'}
+{'loss': '1.933', 'grad_norm': '0.5047', 'learning_rate': '3.236e-06', 'epoch': '0.006593', 'num_input_tokens_seen': 110538, 'train_runtime': '58.28', 'train_tokens_per_second': '1897'}
+{'loss': '1.34', 'grad_norm': '0.2829', 'learning_rate': '3.297e-06', 'epoch': '0.006716', 'num_input_tokens_seen': 112585, 'train_runtime': '59.32', 'train_tokens_per_second': '1898'}
+{'loss': '0.851', 'grad_norm': '0.4326', 'learning_rate': '3.358e-06', 'epoch': '0.006838', 'num_input_tokens_seen': 114632, 'train_runtime': '60.37', 'train_tokens_per_second': '1899'}
+{'loss': '0.7931', 'grad_norm': '0.3166', 'learning_rate': '3.419e-06', 'epoch': '0.00696', 'num_input_tokens_seen': 116679, 'train_runtime': '61.41', 'train_tokens_per_second': '1900'}
+{'loss': '1.728', 'grad_norm': '0.3289', 'learning_rate': '3.48e-06', 'epoch': '0.007082', 'num_input_tokens_seen': 118726, 'train_runtime': '62.45', 'train_tokens_per_second': '1901'}
+{'loss': '0.7369', 'grad_norm': '0.2613', 'learning_rate': '3.541e-06', 'epoch': '0.007204', 'num_input_tokens_seen': 120773, 'train_runtime': '63.49', 'train_tokens_per_second': '1902'}
+{'loss': '1.464', 'grad_norm': '0.2617', 'learning_rate': '3.602e-06', 'epoch': '0.007326', 'num_input_tokens_seen': 122820, 'train_runtime': '64.53', 'train_tokens_per_second': '1903'}
+{'loss': '1.883', 'grad_norm': '0.3848', 'learning_rate': '3.663e-06', 'epoch': '0.007448', 'num_input_tokens_seen': 124867, 'train_runtime': '65.58', 'train_tokens_per_second': '1904'}
+{'loss': '0.5969', 'grad_norm': '0.2306', 'learning_rate': '3.724e-06', 'epoch': '0.00757', 'num_input_tokens_seen': 126914, 'train_runtime': '66.63', 'train_tokens_per_second': '1905'}
+{'loss': '1.594', 'grad_norm': '0.2975', 'learning_rate': '3.785e-06', 'epoch': '0.007692', 'num_input_tokens_seen': 128961, 'train_runtime': '67.68', 'train_tokens_per_second': '1906'}
+{'loss': '1.062', 'grad_norm': '0.253', 'learning_rate': '3.846e-06', 'epoch': '0.007814', 'num_input_tokens_seen': 131008, 'train_runtime': '68.72', 'train_tokens_per_second': '1906'}
+{'loss': '1.625', 'grad_norm': '0.3242', 'learning_rate': '3.907e-06', 'epoch': '0.007937', 'num_input_tokens_seen': 133055, 'train_runtime': '69.77', 'train_tokens_per_second': '1907'}
+{'loss': '1.335', 'grad_norm': '0.3814', 'learning_rate': '3.968e-06', 'epoch': '0.008059', 'num_input_tokens_seen': 135102, 'train_runtime': '70.82', 'train_tokens_per_second': '1908'}
+{'loss': '1.049', 'grad_norm': '0.2831', 'learning_rate': '4.029e-06', 'epoch': '0.008181', 'num_input_tokens_seen': 137149, 'train_runtime': '71.86', 'train_tokens_per_second': '1909'}
+{'loss': '1.03', 'grad_norm': '0.2496', 'learning_rate': '4.09e-06', 'epoch': '0.008303', 'num_input_tokens_seen': 139196, 'train_runtime': '72.9', 'train_tokens_per_second': '1909'}
+{'loss': '1.344', 'grad_norm': '0.3791', 'learning_rate': '4.151e-06', 'epoch': '0.008425', 'num_input_tokens_seen': 141243, 'train_runtime': '74.09', 'train_tokens_per_second': '1906'}
+{'loss': '1.543', 'grad_norm': '0.3291', 'learning_rate': '4.212e-06', 'epoch': '0.008547', 'num_input_tokens_seen': 143290, 'train_runtime': '75.13', 'train_tokens_per_second': '1907'}
+{'loss': '1.627', 'grad_norm': '0.3203', 'learning_rate': '4.274e-06', 'epoch': '0.008669', 'num_input_tokens_seen': 145337, 'train_runtime': '76.17', 'train_tokens_per_second': '1908'}
+{'loss': '1.25', 'grad_norm': '0.3174', 'learning_rate': '4.335e-06', 'epoch': '0.008791', 'num_input_tokens_seen': 147384, 'train_runtime': '77.21', 'train_tokens_per_second': '1909'}
+{'loss': '1.305', 'grad_norm': '0.3542', 'learning_rate': '4.396e-06', 'epoch': '0.008913', 'num_input_tokens_seen': 149431, 'train_runtime': '78.26', 'train_tokens_per_second': '1909'}
+{'loss': '0.7812', 'grad_norm': '0.2824', 'learning_rate': '4.457e-06', 'epoch': '0.009035', 'num_input_tokens_seen': 151478, 'train_runtime': '79.3', 'train_tokens_per_second': '1910'}
+{'loss': '1.514', 'grad_norm': '0.3974', 'learning_rate': '4.518e-06', 'epoch': '0.009158', 'num_input_tokens_seen': 153525, 'train_runtime': '80.34', 'train_tokens_per_second': '1911'}
+{'loss': '0.8486', 'grad_norm': '0.394', 'learning_rate': '4.579e-06', 'epoch': '0.00928', 'num_input_tokens_seen': 155572, 'train_runtime': '81.39', 'train_tokens_per_second': '1911'}
+{'loss': '1.741', 'grad_norm': '0.4167', 'learning_rate': '4.64e-06', 'epoch': '0.009402', 'num_input_tokens_seen': 157619, 'train_runtime': '82.43', 'train_tokens_per_second': '1912'}
+{'loss': '1.393', 'grad_norm': '0.3378', 'learning_rate': '4.701e-06', 'epoch': '0.009524', 'num_input_tokens_seen': 159666, 'train_runtime': '83.47', 'train_tokens_per_second': '1913'}
+{'loss': '1.174', 'grad_norm': '0.3005', 'learning_rate': '4.762e-06', 'epoch': '0.009646', 'num_input_tokens_seen': 161713, 'train_runtime': '84.52', 'train_tokens_per_second': '1913'}
+{'loss': '0.7404', 'grad_norm': '0.2695', 'learning_rate': '4.823e-06', 'epoch': '0.009768', 'num_input_tokens_seen': 163760, 'train_runtime': '85.56', 'train_tokens_per_second': '1914'}
+{'loss': '1.576', 'grad_norm': '0.345', 'learning_rate': '4.884e-06', 'epoch': '0.00989', 'num_input_tokens_seen': 165807, 'train_runtime': '86.6', 'train_tokens_per_second': '1915'}
+{'loss': '1.073', 'grad_norm': '0.3396', 'learning_rate': '4.945e-06', 'epoch': '0.01001', 'num_input_tokens_seen': 167854, 'train_runtime': '87.64', 'train_tokens_per_second': '1915'}
+{'loss': '1.579', 'grad_norm': '0.3497', 'learning_rate': '5.006e-06', 'epoch': '0.01013', 'num_input_tokens_seen': 169901, 'train_runtime': '88.68', 'train_tokens_per_second': '1916'}
+{'loss': '0.784', 'grad_norm': '0.3244', 'learning_rate': '5.067e-06', 'epoch': '0.01026', 'num_input_tokens_seen': 171948, 'train_runtime': '89.72', 'train_tokens_per_second': '1916'}
+{'loss': '1.157', 'grad_norm': '0.2747', 'learning_rate': '5.128e-06', 'epoch': '0.01038', 'num_input_tokens_seen': 173995, 'train_runtime': '90.77', 'train_tokens_per_second': '1917'}
+{'loss': '0.9066', 'grad_norm': '0.233', 'learning_rate': '5.189e-06', 'epoch': '0.0105', 'num_input_tokens_seen': 176042, 'train_runtime': '91.81', 'train_tokens_per_second': '1918'}
+{'loss': '0.7513', 'grad_norm': '0.2136', 'learning_rate': '5.25e-06', 'epoch': '0.01062', 'num_input_tokens_seen': 178089, 'train_runtime': '92.85', 'train_tokens_per_second': '1918'}
+{'loss': '0.8007', 'grad_norm': '0.3918', 'learning_rate': '5.311e-06', 'epoch': '0.01074', 'num_input_tokens_seen': 180136, 'train_runtime': '93.89', 'train_tokens_per_second': '1919'}
+{'loss': '1.275', 'grad_norm': '0.3246', 'learning_rate': '5.372e-06', 'epoch': '0.01087', 'num_input_tokens_seen': 182183, 'train_runtime': '94.93', 'train_tokens_per_second': '1919'}
+{'loss': '0.6336', 'grad_norm': '0.2194', 'learning_rate': '5.433e-06', 'epoch': '0.01099', 'num_input_tokens_seen': 184230, 'train_runtime': '95.97', 'train_tokens_per_second': '1920'}
+{'loss': '0.668', 'grad_norm': '0.2253', 'learning_rate': '5.495e-06', 'epoch': '0.01111', 'num_input_tokens_seen': 186277, 'train_runtime': '97.01', 'train_tokens_per_second': '1920'}
+{'loss': '1.824', 'grad_norm': '0.354', 'learning_rate': '5.556e-06', 'epoch': '0.01123', 'num_input_tokens_seen': 188324, 'train_runtime': '98.05', 'train_tokens_per_second': '1921'}
+{'loss': '1.28', 'grad_norm': '0.4487', 'learning_rate': '5.617e-06', 'epoch': '0.01136', 'num_input_tokens_seen': 190371, 'train_runtime': '99.09', 'train_tokens_per_second': '1921'}
+{'loss': '0.6494', 'grad_norm': '0.2398', 'learning_rate': '5.678e-06', 'epoch': '0.01148', 'num_input_tokens_seen': 192418, 'train_runtime': '100.1', 'train_tokens_per_second': '1922'}
+{'loss': '0.6123', 'grad_norm': '0.2938', 'learning_rate': '5.739e-06', 'epoch': '0.0116', 'num_input_tokens_seen': 194465, 'train_runtime': '101.2', 'train_tokens_per_second': '1922'}
+{'loss': '1.243', 'grad_norm': '0.3335', 'learning_rate': '5.8e-06', 'epoch': '0.01172', 'num_input_tokens_seen': 196512, 'train_runtime': '102.2', 'train_tokens_per_second': '1922'}
+{'loss': '1.335', 'grad_norm': '0.3472', 'learning_rate': '5.861e-06', 'epoch': '0.01184', 'num_input_tokens_seen': 198559, 'train_runtime': '103.3', 'train_tokens_per_second': '1923'}
+{'loss': '1.112', 'grad_norm': '0.2869', 'learning_rate': '5.922e-06', 'epoch': '0.01197', 'num_input_tokens_seen': 200606, 'train_runtime': '104.3', 'train_tokens_per_second': '1923'}
+{'loss': '1.557', 'grad_norm': '0.4047', 'learning_rate': '5.983e-06', 'epoch': '0.01209', 'num_input_tokens_seen': 202653, 'train_runtime': '105.3', 'train_tokens_per_second': '1924'}
+{'loss': '1.697', 'grad_norm': '0.4249', 'learning_rate': '6.044e-06', 'epoch': '0.01221', 'num_input_tokens_seen': 204700, 'train_runtime': '106.4', 'train_tokens_per_second': '1924'}
+{'loss': '0.8076', 'grad_norm': '0.2638', 'learning_rate': '6.105e-06', 'epoch': '0.01233', 'num_input_tokens_seen': 206747, 'train_runtime': '107.4', 'train_tokens_per_second': '1925'}
+{'loss': '1.775', 'grad_norm': '0.3715', 'learning_rate': '6.166e-06', 'epoch': '0.01245', 'num_input_tokens_seen': 208794, 'train_runtime': '108.5', 'train_tokens_per_second': '1925'}
+{'loss': '1.606', 'grad_norm': '0.3108', 'learning_rate': '6.227e-06', 'epoch': '0.01258', 'num_input_tokens_seen': 210841, 'train_runtime': '109.5', 'train_tokens_per_second': '1925'}
+{'loss': '1.637', 'grad_norm': '0.3672', 'learning_rate': '6.288e-06', 'epoch': '0.0127', 'num_input_tokens_seen': 212888, 'train_runtime': '110.6', 'train_tokens_per_second': '1926'}
+{'loss': '1.369', 'grad_norm': '0.4352', 'learning_rate': '6.349e-06', 'epoch': '0.01282', 'num_input_tokens_seen': 214935, 'train_runtime': '111.6', 'train_tokens_per_second': '1926'}
+{'loss': '1.386', 'grad_norm': '0.308', 'learning_rate': '6.41e-06', 'epoch': '0.01294', 'num_input_tokens_seen': 216982, 'train_runtime': '112.6', 'train_tokens_per_second': '1926'}
+{'loss': '1.196', 'grad_norm': '0.3402', 'learning_rate': '6.471e-06', 'epoch': '0.01306', 'num_input_tokens_seen': 219029, 'train_runtime': '113.7', 'train_tokens_per_second': '1927'}
+{'loss': '1.117', 'grad_norm': '0.3496', 'learning_rate': '6.532e-06', 'epoch': '0.01319', 'num_input_tokens_seen': 221076, 'train_runtime': '114.7', 'train_tokens_per_second': '1927'}
+{'loss': '1.772', 'grad_norm': '0.3945', 'learning_rate': '6.593e-06', 'epoch': '0.01331', 'num_input_tokens_seen': 223123, 'train_runtime': '115.8', 'train_tokens_per_second': '1927'}
+{'loss': '0.9553', 'grad_norm': '0.2856', 'learning_rate': '6.654e-06', 'epoch': '0.01343', 'num_input_tokens_seen': 225170, 'train_runtime': '116.8', 'train_tokens_per_second': '1928'}
+{'loss': '1.563', 'grad_norm': '0.3784', 'learning_rate': '6.716e-06', 'epoch': '0.01355', 'num_input_tokens_seen': 227217, 'train_runtime': '117.8', 'train_tokens_per_second': '1928'}
+{'loss': '1.567', 'grad_norm': '0.3456', 'learning_rate': '6.777e-06', 'epoch': '0.01368', 'num_input_tokens_seen': 229264, 'train_runtime': '118.9', 'train_tokens_per_second': '1928'}
+{'loss': '0.7048', 'grad_norm': '0.2298', 'learning_rate': '6.838e-06', 'epoch': '0.0138', 'num_input_tokens_seen': 231311, 'train_runtime': '119.9', 'train_tokens_per_second': '1929'}
+{'loss': '1.194', 'grad_norm': '0.3506', 'learning_rate': '6.899e-06', 'epoch': '0.01392', 'num_input_tokens_seen': 233358, 'train_runtime': '121', 'train_tokens_per_second': '1929'}
+{'loss': '0.7762', 'grad_norm': '0.2345', 'learning_rate': '6.96e-06', 'epoch': '0.01404', 'num_input_tokens_seen': 235405, 'train_runtime': '122', 'train_tokens_per_second': '1929'}
+{'loss': '1.459', 'grad_norm': '0.3409', 'learning_rate': '7.021e-06', 'epoch': '0.01416', 'num_input_tokens_seen': 237452, 'train_runtime': '123.1', 'train_tokens_per_second': '1930'}
+{'loss': '0.6121', 'grad_norm': '0.2403', 'learning_rate': '7.082e-06', 'epoch': '0.01429', 'num_input_tokens_seen': 239499, 'train_runtime': '124.1', 'train_tokens_per_second': '1930'}
+{'loss': '1.599', 'grad_norm': '0.299', 'learning_rate': '7.143e-06', 'epoch': '0.01441', 'num_input_tokens_seen': 241546, 'train_runtime': '125.1', 'train_tokens_per_second': '1930'}
+{'loss': '1.771', 'grad_norm': '0.391', 'learning_rate': '7.204e-06', 'epoch': '0.01453', 'num_input_tokens_seen': 243593, 'train_runtime': '126.2', 'train_tokens_per_second': '1930'}
+{'loss': '1.541', 'grad_norm': '0.3111', 'learning_rate': '7.265e-06', 'epoch': '0.01465', 'num_input_tokens_seen': 245640, 'train_runtime': '127.2', 'train_tokens_per_second': '1931'}
+{'loss': '0.7969', 'grad_norm': '0.2717', 'learning_rate': '7.326e-06', 'epoch': '0.01477', 'num_input_tokens_seen': 247687, 'train_runtime': '128.3', 'train_tokens_per_second': '1931'}
+{'loss': '1.567', 'grad_norm': '0.3719', 'learning_rate': '7.387e-06', 'epoch': '0.0149', 'num_input_tokens_seen': 249734, 'train_runtime': '129.3', 'train_tokens_per_second': '1931'}
+{'loss': '1.782', 'grad_norm': '0.3787', 'learning_rate': '7.448e-06', 'epoch': '0.01502', 'num_input_tokens_seen': 251781, 'train_runtime': '130.4', 'train_tokens_per_second': '1931'}
+{'loss': '0.7362', 'grad_norm': '0.2492', 'learning_rate': '7.509e-06', 'epoch': '0.01514', 'num_input_tokens_seen': 253828, 'train_runtime': '131.4', 'train_tokens_per_second': '1932'}
+{'loss': '1.653', 'grad_norm': '0.3752', 'learning_rate': '7.57e-06', 'epoch': '0.01526', 'num_input_tokens_seen': 255875, 'train_runtime': '132.5', 'train_tokens_per_second': '1932'}
+{'loss': '1.619', 'grad_norm': '0.4029', 'learning_rate': '7.631e-06', 'epoch': '0.01538', 'num_input_tokens_seen': 257922, 'train_runtime': '133.5', 'train_tokens_per_second': '1932'}
+{'loss': '1.128', 'grad_norm': '0.3188', 'learning_rate': '7.692e-06', 'epoch': '0.01551', 'num_input_tokens_seen': 259969, 'train_runtime': '134.6', 'train_tokens_per_second': '1932'}
+{'loss': '1.338', 'grad_norm': '0.3356', 'learning_rate': '7.753e-06', 'epoch': '0.01563', 'num_input_tokens_seen': 262016, 'train_runtime': '135.6', 'train_tokens_per_second': '1932'}
+{'loss': '0.7656', 'grad_norm': '0.2505', 'learning_rate': '7.814e-06', 'epoch': '0.01575', 'num_input_tokens_seen': 264063, 'train_runtime': '136.6', 'train_tokens_per_second': '1933'}
+{'loss': '1.375', 'grad_norm': '0.3852', 'learning_rate': '7.875e-06', 'epoch': '0.01587', 'num_input_tokens_seen': 266110, 'train_runtime': '137.7', 'train_tokens_per_second': '1933'}
+{'loss': '0.5618', 'grad_norm': '0.24', 'learning_rate': '7.937e-06', 'epoch': '0.016', 'num_input_tokens_seen': 268157, 'train_runtime': '138.7', 'train_tokens_per_second': '1933'}
+{'loss': '1.335', 'grad_norm': '0.4018', 'learning_rate': '7.998e-06', 'epoch': '0.01612', 'num_input_tokens_seen': 270204, 'train_runtime': '139.8', 'train_tokens_per_second': '1933'}
+{'loss': '1.063', 'grad_norm': '0.2842', 'learning_rate': '8.059e-06', 'epoch': '0.01624', 'num_input_tokens_seen': 272251, 'train_runtime': '140.8', 'train_tokens_per_second': '1933'}
+{'loss': '1.795', 'grad_norm': '0.4447', 'learning_rate': '8.12e-06', 'epoch': '0.01636', 'num_input_tokens_seen': 274298, 'train_runtime': '141.9', 'train_tokens_per_second': '1933'}
+{'loss': '1.664', 'grad_norm': '0.3341', 'learning_rate': '8.181e-06', 'epoch': '0.01648', 'num_input_tokens_seen': 276345, 'train_runtime': '142.9', 'train_tokens_per_second': '1934'}
+{'loss': '1.237', 'grad_norm': '0.2907', 'learning_rate': '8.242e-06', 'epoch': '0.01661', 'num_input_tokens_seen': 278392, 'train_runtime': '144', 'train_tokens_per_second': '1934'}
+{'loss': '1.617', 'grad_norm': '0.3788', 'learning_rate': '8.303e-06', 'epoch': '0.01673', 'num_input_tokens_seen': 280439, 'train_runtime': '145', 'train_tokens_per_second': '1934'}
+{'loss': '1.089', 'grad_norm': '0.3043', 'learning_rate': '8.364e-06', 'epoch': '0.01685', 'num_input_tokens_seen': 282486, 'train_runtime': '146.1', 'train_tokens_per_second': '1934'}
+{'loss': '1.12', 'grad_norm': '0.3281', 'learning_rate': '8.425e-06', 'epoch': '0.01697', 'num_input_tokens_seen': 284533, 'train_runtime': '147.1', 'train_tokens_per_second': '1934'}
+{'loss': '1.408', 'grad_norm': '0.3588', 'learning_rate': '8.486e-06', 'epoch': '0.01709', 'num_input_tokens_seen': 286580, 'train_runtime': '148.1', 'train_tokens_per_second': '1935'}
+{'loss': '1.173', 'grad_norm': '0.3316', 'learning_rate': '8.547e-06', 'epoch': '0.01722', 'num_input_tokens_seen': 288627, 'train_runtime': '149.2', 'train_tokens_per_second': '1935'}
+{'loss': '1.621', 'grad_norm': '0.3899', 'learning_rate': '8.608e-06', 'epoch': '0.01734', 'num_input_tokens_seen': 290674, 'train_runtime': '150.2', 'train_tokens_per_second': '1935'}
+{'loss': '1.247', 'grad_norm': '0.3735', 'learning_rate': '8.669e-06', 'epoch': '0.01746', 'num_input_tokens_seen': 292721, 'train_runtime': '151.3', 'train_tokens_per_second': '1935'}
+{'loss': '1.872', 'grad_norm': '0.4948', 'learning_rate': '8.73e-06', 'epoch': '0.01758', 'num_input_tokens_seen': 294768, 'train_runtime': '152.3', 'train_tokens_per_second': '1935'}
+{'loss': '0.6525', 'grad_norm': '0.2687', 'learning_rate': '8.791e-06', 'epoch': '0.0177', 'num_input_tokens_seen': 296815, 'train_runtime': '153.4', 'train_tokens_per_second': '1936'}
+{'loss': '1.418', 'grad_norm': '0.4128', 'learning_rate': '8.852e-06', 'epoch': '0.01783', 'num_input_tokens_seen': 298862, 'train_runtime': '154.4', 'train_tokens_per_second': '1936'}
+{'loss': '1.428', 'grad_norm': '0.3661', 'learning_rate': '8.913e-06', 'epoch': '0.01795', 'num_input_tokens_seen': 300909, 'train_runtime': '155.4', 'train_tokens_per_second': '1936'}
+{'loss': '1.003', 'grad_norm': '0.3327', 'learning_rate': '8.974e-06', 'epoch': '0.01807', 'num_input_tokens_seen': 302956, 'train_runtime': '156.5', 'train_tokens_per_second': '1936'}
+{'loss': '1.531', 'grad_norm': '0.4244', 'learning_rate': '9.035e-06', 'epoch': '0.01819', 'num_input_tokens_seen': 305003, 'train_runtime': '157.5', 'train_tokens_per_second': '1936'}
+{'loss': '1.635', 'grad_norm': '0.4266', 'learning_rate': '9.096e-06', 'epoch': '0.01832', 'num_input_tokens_seen': 307050, 'train_runtime': '158.6', 'train_tokens_per_second': '1936'}
+{'loss': '1.504', 'grad_norm': '0.3605', 'learning_rate': '9.158e-06', 'epoch': '0.01844', 'num_input_tokens_seen': 309097, 'train_runtime': '159.6', 'train_tokens_per_second': '1936'}
+{'loss': '1.709', 'grad_norm': '0.3912', 'learning_rate': '9.219e-06', 'epoch': '0.01856', 'num_input_tokens_seen': 311144, 'train_runtime': '160.7', 'train_tokens_per_second': '1936'}
+{'loss': '1.367', 'grad_norm': '0.3813', 'learning_rate': '9.28e-06', 'epoch': '0.01868', 'num_input_tokens_seen': 313191, 'train_runtime': '161.7', 'train_tokens_per_second': '1937'}
+{'loss': '1.261', 'grad_norm': '0.3283', 'learning_rate': '9.341e-06', 'epoch': '0.0188', 'num_input_tokens_seen': 315238, 'train_runtime': '162.8', 'train_tokens_per_second': '1937'}
+{'loss': '1.142', 'grad_norm': '0.2797', 'learning_rate': '9.402e-06', 'epoch': '0.01893', 'num_input_tokens_seen': 317285, 'train_runtime': '163.8', 'train_tokens_per_second': '1937'}
+{'loss': '1.054', 'grad_norm': '0.3778', 'learning_rate': '9.463e-06', 'epoch': '0.01905', 'num_input_tokens_seen': 319332, 'train_runtime': '164.8', 'train_tokens_per_second': '1937'}
+{'loss': '1.37', 'grad_norm': '0.3661', 'learning_rate': '9.524e-06', 'epoch': '0.01917', 'num_input_tokens_seen': 321379, 'train_runtime': '165.9', 'train_tokens_per_second': '1937'}
+{'loss': '1.425', 'grad_norm': '0.5471', 'learning_rate': '9.585e-06', 'epoch': '0.01929', 'num_input_tokens_seen': 323426, 'train_runtime': '166.9', 'train_tokens_per_second': '1937'}
+{'loss': '1.088', 'grad_norm': '0.3833', 'learning_rate': '9.646e-06', 'epoch': '0.01941', 'num_input_tokens_seen': 325473, 'train_runtime': '168', 'train_tokens_per_second': '1937'}
+{'loss': '1.332', 'grad_norm': '0.4081', 'learning_rate': '9.707e-06', 'epoch': '0.01954', 'num_input_tokens_seen': 327520, 'train_runtime': '169', 'train_tokens_per_second': '1938'}
+{'loss': '1.821', 'grad_norm': '0.4351', 'learning_rate': '9.768e-06', 'epoch': '0.01966', 'num_input_tokens_seen': 329567, 'train_runtime': '170.1', 'train_tokens_per_second': '1938'}
+{'loss': '1.693', 'grad_norm': '2.017', 'learning_rate': '9.829e-06', 'epoch': '0.01978', 'num_input_tokens_seen': 331614, 'train_runtime': '171.1', 'train_tokens_per_second': '1938'}
+{'loss': '1.377', 'grad_norm': '0.3394', 'learning_rate': '9.89e-06', 'epoch': '0.0199', 'num_input_tokens_seen': 333661, 'train_runtime': '172.2', 'train_tokens_per_second': '1938'}
+{'loss': '1.329', 'grad_norm': '0.3503', 'learning_rate': '9.951e-06', 'epoch': '0.02002', 'num_input_tokens_seen': 335708, 'train_runtime': '173.2', 'train_tokens_per_second': '1938'}
+{'loss': '1.319', 'grad_norm': '0.3434', 'learning_rate': '1.001e-05', 'epoch': '0.02015', 'num_input_tokens_seen': 337755, 'train_runtime': '174.3', 'train_tokens_per_second': '1938'}
+{'loss': '0.7777', 'grad_norm': '0.3284', 'learning_rate': '1.007e-05', 'epoch': '0.02027', 'num_input_tokens_seen': 339802, 'train_runtime': '175.3', 'train_tokens_per_second': '1938'}
+{'loss': '1.453', 'grad_norm': '0.3621', 'learning_rate': '1.013e-05', 'epoch': '0.02039', 'num_input_tokens_seen': 341849, 'train_runtime': '176.4', 'train_tokens_per_second': '1938'}
+{'loss': '1.899', 'grad_norm': '0.5323', 'learning_rate': '1.02e-05', 'epoch': '0.02051', 'num_input_tokens_seen': 343896, 'train_runtime': '177.4', 'train_tokens_per_second': '1939'}
+{'loss': '2.037', 'grad_norm': '0.5038', 'learning_rate': '1.026e-05', 'epoch': '0.02063', 'num_input_tokens_seen': 345943, 'train_runtime': '178.4', 'train_tokens_per_second': '1939'}
+{'loss': '1.384', 'grad_norm': '0.3607', 'learning_rate': '1.032e-05', 'epoch': '0.02076', 'num_input_tokens_seen': 347990, 'train_runtime': '179.5', 'train_tokens_per_second': '1939'}
+{'loss': '1.661', 'grad_norm': '0.4242', 'learning_rate': '1.038e-05', 'epoch': '0.02088', 'num_input_tokens_seen': 350037, 'train_runtime': '180.5', 'train_tokens_per_second': '1939'}
+{'loss': '1.68', 'grad_norm': '0.4849', 'learning_rate': '1.044e-05', 'epoch': '0.021', 'num_input_tokens_seen': 352084, 'train_runtime': '181.6', 'train_tokens_per_second': '1939'}
+{'loss': '1.685', 'grad_norm': '0.555', 'learning_rate': '1.05e-05', 'epoch': '0.02112', 'num_input_tokens_seen': 354131, 'train_runtime': '182.6', 'train_tokens_per_second': '1939'}
+{'loss': '1.141', 'grad_norm': '0.351', 'learning_rate': '1.056e-05', 'epoch': '0.02125', 'num_input_tokens_seen': 356178, 'train_runtime': '183.7', 'train_tokens_per_second': '1939'}
+{'loss': '1.29', 'grad_norm': '0.4115', 'learning_rate': '1.062e-05', 'epoch': '0.02137', 'num_input_tokens_seen': 358225, 'train_runtime': '184.7', 'train_tokens_per_second': '1939'}
+{'loss': '1.293', 'grad_norm': '0.3835', 'learning_rate': '1.068e-05', 'epoch': '0.02149', 'num_input_tokens_seen': 360272, 'train_runtime': '185.8', 'train_tokens_per_second': '1939'}
+{'loss': '1.556', 'grad_norm': '0.4774', 'learning_rate': '1.074e-05', 'epoch': '0.02161', 'num_input_tokens_seen': 362319, 'train_runtime': '186.8', 'train_tokens_per_second': '1940'}
+{'loss': '1.218', 'grad_norm': '0.4011', 'learning_rate': '1.081e-05', 'epoch': '0.02173', 'num_input_tokens_seen': 364366, 'train_runtime': '187.9', 'train_tokens_per_second': '1940'}
+{'loss': '1.299', 'grad_norm': '0.3859', 'learning_rate': '1.087e-05', 'epoch': '0.02186', 'num_input_tokens_seen': 366413, 'train_runtime': '188.9', 'train_tokens_per_second': '1940'}
+{'loss': '1.037', 'grad_norm': '0.3694', 'learning_rate': '1.093e-05', 'epoch': '0.02198', 'num_input_tokens_seen': 368460, 'train_runtime': '189.9', 'train_tokens_per_second': '1940'}
+{'loss': '0.6335', 'grad_norm': '0.2866', 'learning_rate': '1.099e-05', 'epoch': '0.0221', 'num_input_tokens_seen': 370507, 'train_runtime': '191', 'train_tokens_per_second': '1940'}
+{'loss': '0.6538', 'grad_norm': '0.321', 'learning_rate': '1.105e-05', 'epoch': '0.02222', 'num_input_tokens_seen': 372554, 'train_runtime': '192', 'train_tokens_per_second': '1940'}
+{'loss': '1.187', 'grad_norm': '0.3279', 'learning_rate': '1.111e-05', 'epoch': '0.02234', 'num_input_tokens_seen': 374601, 'train_runtime': '193.1', 'train_tokens_per_second': '1940'}
+{'loss': '1.375', 'grad_norm': '0.447', 'learning_rate': '1.117e-05', 'epoch': '0.02247', 'num_input_tokens_seen': 376648, 'train_runtime': '194.1', 'train_tokens_per_second': '1940'}
+{'loss': '0.8847', 'grad_norm': '0.3551', 'learning_rate': '1.123e-05', 'epoch': '0.02259', 'num_input_tokens_seen': 378695, 'train_runtime': '195.2', 'train_tokens_per_second': '1940'}
+{'loss': '1.745', 'grad_norm': '0.5382', 'learning_rate': '1.129e-05', 'epoch': '0.02271', 'num_input_tokens_seen': 380742, 'train_runtime': '196.2', 'train_tokens_per_second': '1940'}
+{'loss': '1.602', 'grad_norm': '0.4624', 'learning_rate': '1.136e-05', 'epoch': '0.02283', 'num_input_tokens_seen': 382789, 'train_runtime': '197.3', 'train_tokens_per_second': '1941'}
+{'loss': '1.474', 'grad_norm': '0.478', 'learning_rate': '1.142e-05', 'epoch': '0.02295', 'num_input_tokens_seen': 384836, 'train_runtime': '198.3', 'train_tokens_per_second': '1941'}
+{'loss': '1.639', 'grad_norm': '0.4799', 'learning_rate': '1.148e-05', 'epoch': '0.02308', 'num_input_tokens_seen': 386883, 'train_runtime': '199.3', 'train_tokens_per_second': '1941'}
+{'loss': '0.8179', 'grad_norm': '0.3443', 'learning_rate': '1.154e-05', 'epoch': '0.0232', 'num_input_tokens_seen': 388930, 'train_runtime': '200.4', 'train_tokens_per_second': '1941'}
+{'loss': '1.302', 'grad_norm': '0.4595', 'learning_rate': '1.16e-05', 'epoch': '0.02332', 'num_input_tokens_seen': 390977, 'train_runtime': '201.4', 'train_tokens_per_second': '1941'}
+{'loss': '0.6097', 'grad_norm': '0.2905', 'learning_rate': '1.166e-05', 'epoch': '0.02344', 'num_input_tokens_seen': 393024, 'train_runtime': '202.5', 'train_tokens_per_second': '1941'}
+{'loss': '0.8993', 'grad_norm': '0.3459', 'learning_rate': '1.172e-05', 'epoch': '0.02357', 'num_input_tokens_seen': 395071, 'train_runtime': '203.5', 'train_tokens_per_second': '1941'}
+{'loss': '1.096', 'grad_norm': '0.4137', 'learning_rate': '1.178e-05', 'epoch': '0.02369', 'num_input_tokens_seen': 397118, 'train_runtime': '204.6', 'train_tokens_per_second': '1941'}
+{'loss': '1.411', 'grad_norm': '0.4383', 'learning_rate': '1.184e-05', 'epoch': '0.02381', 'num_input_tokens_seen': 399165, 'train_runtime': '205.6', 'train_tokens_per_second': '1941'}
+{'loss': '1.204', 'grad_norm': '0.4678', 'learning_rate': '1.19e-05', 'epoch': '0.02393', 'num_input_tokens_seen': 401212, 'train_runtime': '206.7', 'train_tokens_per_second': '1941'}
+{'loss': '1.366', 'grad_norm': '0.424', 'learning_rate': '1.197e-05', 'epoch': '0.02405', 'num_input_tokens_seen': 403259, 'train_runtime': '207.7', 'train_tokens_per_second': '1942'}
+{'loss': '0.992', 'grad_norm': '0.3474', 'learning_rate': '1.203e-05', 'epoch': '0.02418', 'num_input_tokens_seen': 405306, 'train_runtime': '208.7', 'train_tokens_per_second': '1942'}
+{'loss': '1.445', 'grad_norm': '0.485', 'learning_rate': '1.209e-05', 'epoch': '0.0243', 'num_input_tokens_seen': 407353, 'train_runtime': '209.8', 'train_tokens_per_second': '1942'}
+{'loss': '1.563', 'grad_norm': '0.4729', 'learning_rate': '1.215e-05', 'epoch': '0.02442', 'num_input_tokens_seen': 409400, 'train_runtime': '210.8', 'train_tokens_per_second': '1942'}
+{'loss': '1.273', 'grad_norm': '0.4405', 'learning_rate': '1.221e-05', 'epoch': '0.02454', 'num_input_tokens_seen': 411447, 'train_runtime': '211.9', 'train_tokens_per_second': '1942'}
+{'loss': '1.156', 'grad_norm': '0.4594', 'learning_rate': '1.227e-05', 'epoch': '0.02466', 'num_input_tokens_seen': 413494, 'train_runtime': '212.9', 'train_tokens_per_second': '1942'}
+{'loss': '1.284', 'grad_norm': '0.5923', 'learning_rate': '1.233e-05', 'epoch': '0.02479', 'num_input_tokens_seen': 415541, 'train_runtime': '214', 'train_tokens_per_second': '1942'}
+{'loss': '1.57', 'grad_norm': '0.517', 'learning_rate': '1.239e-05', 'epoch': '0.02491', 'num_input_tokens_seen': 417588, 'train_runtime': '215', 'train_tokens_per_second': '1942'}
+{'loss': '0.7737', 'grad_norm': '0.3465', 'learning_rate': '1.245e-05', 'epoch': '0.02503', 'num_input_tokens_seen': 419635, 'train_runtime': '216.1', 'train_tokens_per_second': '1942'}
+{'loss': '1.291', 'grad_norm': '0.4848', 'learning_rate': '1.252e-05', 'epoch': '0.02515', 'num_input_tokens_seen': 421682, 'train_runtime': '217.1', 'train_tokens_per_second': '1942'}
+{'loss': '1.418', 'grad_norm': '0.4659', 'learning_rate': '1.258e-05', 'epoch': '0.02527', 'num_input_tokens_seen': 423729, 'train_runtime': '218.2', 'train_tokens_per_second': '1942'}
+{'loss': '1.081', 'grad_norm': '0.4705', 'learning_rate': '1.264e-05', 'epoch': '0.0254', 'num_input_tokens_seen': 425776, 'train_runtime': '219.2', 'train_tokens_per_second': '1942'}
+{'loss': '1.675', 'grad_norm': '0.4767', 'learning_rate': '1.27e-05', 'epoch': '0.02552', 'num_input_tokens_seen': 427823, 'train_runtime': '220.2', 'train_tokens_per_second': '1942'}
+{'loss': '1.681', 'grad_norm': '0.5783', 'learning_rate': '1.276e-05', 'epoch': '0.02564', 'num_input_tokens_seen': 429870, 'train_runtime': '221.3', 'train_tokens_per_second': '1943'}
+{'loss': '1.452', 'grad_norm': '0.4866', 'learning_rate': '1.282e-05', 'epoch': '0.02576', 'num_input_tokens_seen': 431917, 'train_runtime': '222.3', 'train_tokens_per_second': '1943'}
+{'loss': '0.9691', 'grad_norm': '0.4056', 'learning_rate': '1.288e-05', 'epoch': '0.02589', 'num_input_tokens_seen': 433964, 'train_runtime': '223.4', 'train_tokens_per_second': '1943'}
+{'loss': '0.6256', 'grad_norm': '0.3151', 'learning_rate': '1.294e-05', 'epoch': '0.02601', 'num_input_tokens_seen': 436011, 'train_runtime': '224.4', 'train_tokens_per_second': '1943'}
+{'loss': '0.6349', 'grad_norm': '0.3113', 'learning_rate': '1.3e-05', 'epoch': '0.02613', 'num_input_tokens_seen': 438058, 'train_runtime': '225.5', 'train_tokens_per_second': '1943'}
+{'loss': '1.575', 'grad_norm': '0.6033', 'learning_rate': '1.306e-05', 'epoch': '0.02625', 'num_input_tokens_seen': 440105, 'train_runtime': '226.5', 'train_tokens_per_second': '1943'}
+{'loss': '1.585', 'grad_norm': '0.5161', 'learning_rate': '1.313e-05', 'epoch': '0.02637', 'num_input_tokens_seen': 442152, 'train_runtime': '227.6', 'train_tokens_per_second': '1943'}
+{'loss': '1.182', 'grad_norm': '0.4157', 'learning_rate': '1.319e-05', 'epoch': '0.0265', 'num_input_tokens_seen': 444199, 'train_runtime': '228.6', 'train_tokens_per_second': '1943'}
+{'loss': '1.789', 'grad_norm': '0.6525', 'learning_rate': '1.325e-05', 'epoch': '0.02662', 'num_input_tokens_seen': 446246, 'train_runtime': '229.7', 'train_tokens_per_second': '1943'}
+{'loss': '0.6394', 'grad_norm': '0.332', 'learning_rate': '1.331e-05', 'epoch': '0.02674', 'num_input_tokens_seen': 448293, 'train_runtime': '230.7', 'train_tokens_per_second': '1943'}
+{'loss': '1.595', 'grad_norm': '0.5779', 'learning_rate': '1.337e-05', 'epoch': '0.02686', 'num_input_tokens_seen': 450340, 'train_runtime': '231.7', 'train_tokens_per_second': '1943'}
+{'loss': '0.8082', 'grad_norm': '0.3568', 'learning_rate': '1.343e-05', 'epoch': '0.02698', 'num_input_tokens_seen': 452387, 'train_runtime': '232.8', 'train_tokens_per_second': '1943'}
+{'loss': '1.479', 'grad_norm': '0.5858', 'learning_rate': '1.349e-05', 'epoch': '0.02711', 'num_input_tokens_seen': 454434, 'train_runtime': '233.8', 'train_tokens_per_second': '1943'}
+{'loss': '1.147', 'grad_norm': '0.4227', 'learning_rate': '1.355e-05', 'epoch': '0.02723', 'num_input_tokens_seen': 456481, 'train_runtime': '234.9', 'train_tokens_per_second': '1943'}
+{'loss': '1.603', 'grad_norm': '0.4923', 'learning_rate': '1.361e-05', 'epoch': '0.02735', 'num_input_tokens_seen': 458528, 'train_runtime': '235.9', 'train_tokens_per_second': '1943'}
+{'loss': '1.538', 'grad_norm': '0.5759', 'learning_rate': '1.368e-05', 'epoch': '0.02747', 'num_input_tokens_seen': 460575, 'train_runtime': '237', 'train_tokens_per_second': '1944'}
+{'loss': '0.7194', 'grad_norm': '0.3567', 'learning_rate': '1.374e-05', 'epoch': '0.02759', 'num_input_tokens_seen': 462622, 'train_runtime': '238', 'train_tokens_per_second': '1944'}
+{'loss': '1.721', 'grad_norm': '0.5946', 'learning_rate': '1.38e-05', 'epoch': '0.02772', 'num_input_tokens_seen': 464669, 'train_runtime': '239.1', 'train_tokens_per_second': '1944'}
+{'loss': '1.277', 'grad_norm': '0.5085', 'learning_rate': '1.386e-05', 'epoch': '0.02784', 'num_input_tokens_seen': 466716, 'train_runtime': '240.1', 'train_tokens_per_second': '1944'}
+{'loss': '1.659', 'grad_norm': '0.6458', 'learning_rate': '1.392e-05', 'epoch': '0.02796', 'num_input_tokens_seen': 468763, 'train_runtime': '241.2', 'train_tokens_per_second': '1944'}
+{'loss': '1.555', 'grad_norm': '0.4655', 'learning_rate': '1.398e-05', 'epoch': '0.02808', 'num_input_tokens_seen': 470810, 'train_runtime': '242.2', 'train_tokens_per_second': '1944'}
+{'loss': '1.236', 'grad_norm': '0.5168', 'learning_rate': '1.404e-05', 'epoch': '0.02821', 'num_input_tokens_seen': 472857, 'train_runtime': '243.2', 'train_tokens_per_second': '1944'}
+{'loss': '1.659', 'grad_norm': '0.5702', 'learning_rate': '1.41e-05', 'epoch': '0.02833', 'num_input_tokens_seen': 474904, 'train_runtime': '244.3', 'train_tokens_per_second': '1944'}
+{'loss': '1.295', 'grad_norm': '0.4997', 'learning_rate': '1.416e-05', 'epoch': '0.02845', 'num_input_tokens_seen': 476951, 'train_runtime': '245.3', 'train_tokens_per_second': '1944'}
+{'loss': '1.355', 'grad_norm': '0.5255', 'learning_rate': '1.422e-05', 'epoch': '0.02857', 'num_input_tokens_seen': 478998, 'train_runtime': '246.4', 'train_tokens_per_second': '1944'}
+{'loss': '1.326', 'grad_norm': '0.59', 'learning_rate': '1.429e-05', 'epoch': '0.02869', 'num_input_tokens_seen': 481045, 'train_runtime': '247.4', 'train_tokens_per_second': '1944'}
+{'loss': '0.8155', 'grad_norm': '0.4466', 'learning_rate': '1.435e-05', 'epoch': '0.02882', 'num_input_tokens_seen': 483092, 'train_runtime': '248.5', 'train_tokens_per_second': '1944'}
+{'loss': '1.097', 'grad_norm': '0.431', 'learning_rate': '1.441e-05', 'epoch': '0.02894', 'num_input_tokens_seen': 485139, 'train_runtime': '249.5', 'train_tokens_per_second': '1944'}
+{'loss': '1.442', 'grad_norm': '0.6068', 'learning_rate': '1.447e-05', 'epoch': '0.02906', 'num_input_tokens_seen': 487186, 'train_runtime': '250.6', 'train_tokens_per_second': '1944'}
+{'loss': '0.6167', 'grad_norm': '0.3797', 'learning_rate': '1.453e-05', 'epoch': '0.02918', 'num_input_tokens_seen': 489233, 'train_runtime': '251.6', 'train_tokens_per_second': '1944'}
+{'loss': '1.099', 'grad_norm': '0.4898', 'learning_rate': '1.459e-05', 'epoch': '0.0293', 'num_input_tokens_seen': 491280, 'train_runtime': '252.7', 'train_tokens_per_second': '1944'}
+{'loss': '1.663', 'grad_norm': '0.7464', 'learning_rate': '1.465e-05', 'epoch': '0.02943', 'num_input_tokens_seen': 493327, 'train_runtime': '253.7', 'train_tokens_per_second': '1944'}
+{'loss': '0.7168', 'grad_norm': '0.4142', 'learning_rate': '1.471e-05', 'epoch': '0.02955', 'num_input_tokens_seen': 495374, 'train_runtime': '254.7', 'train_tokens_per_second': '1945'}
+{'loss': '2.189', 'grad_norm': '0.7521', 'learning_rate': '1.477e-05', 'epoch': '0.02967', 'num_input_tokens_seen': 497421, 'train_runtime': '255.8', 'train_tokens_per_second': '1945'}
+{'loss': '1.161', 'grad_norm': '0.5383', 'learning_rate': '1.484e-05', 'epoch': '0.02979', 'num_input_tokens_seen': 499468, 'train_runtime': '256.8', 'train_tokens_per_second': '1945'}
+{'loss': '0.7095', 'grad_norm': '0.363', 'learning_rate': '1.49e-05', 'epoch': '0.02991', 'num_input_tokens_seen': 501515, 'train_runtime': '257.9', 'train_tokens_per_second': '1945'}
+{'loss': '1.675', 'grad_norm': '0.5704', 'learning_rate': '1.496e-05', 'epoch': '0.03004', 'num_input_tokens_seen': 503562, 'train_runtime': '258.9', 'train_tokens_per_second': '1945'}
+{'loss': '1.544', 'grad_norm': '0.6231', 'learning_rate': '1.502e-05', 'epoch': '0.03016', 'num_input_tokens_seen': 505609, 'train_runtime': '260', 'train_tokens_per_second': '1945'}
+{'loss': '1.202', 'grad_norm': '0.5518', 'learning_rate': '1.508e-05', 'epoch': '0.03028', 'num_input_tokens_seen': 507656, 'train_runtime': '261', 'train_tokens_per_second': '1945'}
+{'loss': '1.31', 'grad_norm': '0.4917', 'learning_rate': '1.514e-05', 'epoch': '0.0304', 'num_input_tokens_seen': 509703, 'train_runtime': '262.1', 'train_tokens_per_second': '1945'}
+{'loss': '1.394', 'grad_norm': '0.4971', 'learning_rate': '1.52e-05', 'epoch': '0.03053', 'num_input_tokens_seen': 511750, 'train_runtime': '263.1', 'train_tokens_per_second': '1945'}
+{'loss': '1.184', 'grad_norm': '0.4955', 'learning_rate': '1.526e-05', 'epoch': '0.03065', 'num_input_tokens_seen': 513797, 'train_runtime': '264.2', 'train_tokens_per_second': '1945'}
+{'loss': '1.614', 'grad_norm': '0.8268', 'learning_rate': '1.532e-05', 'epoch': '0.03077', 'num_input_tokens_seen': 515844, 'train_runtime': '265.2', 'train_tokens_per_second': '1945'}
+{'loss': '1.911', 'grad_norm': '0.7492', 'learning_rate': '1.538e-05', 'epoch': '0.03089', 'num_input_tokens_seen': 517891, 'train_runtime': '266.3', 'train_tokens_per_second': '1945'}
+{'loss': '1.459', 'grad_norm': '0.5983', 'learning_rate': '1.545e-05', 'epoch': '0.03101', 'num_input_tokens_seen': 519938, 'train_runtime': '267.3', 'train_tokens_per_second': '1945'}
+{'loss': '1.595', 'grad_norm': '0.6762', 'learning_rate': '1.551e-05', 'epoch': '0.03114', 'num_input_tokens_seen': 521985, 'train_runtime': '268.4', 'train_tokens_per_second': '1945'}
+{'loss': '0.6932', 'grad_norm': '0.4707', 'learning_rate': '1.557e-05', 'epoch': '0.03126', 'num_input_tokens_seen': 524032, 'train_runtime': '269.4', 'train_tokens_per_second': '1945'}
+{'loss': '2.117', 'grad_norm': '0.7636', 'learning_rate': '1.563e-05', 'epoch': '0.03138', 'num_input_tokens_seen': 526079, 'train_runtime': '270.5', 'train_tokens_per_second': '1945'}
+{'loss': '1.121', 'grad_norm': '0.478', 'learning_rate': '1.569e-05', 'epoch': '0.0315', 'num_input_tokens_seen': 528126, 'train_runtime': '271.5', 'train_tokens_per_second': '1945'}
+{'loss': '1.432', 'grad_norm': '0.6419', 'learning_rate': '1.575e-05', 'epoch': '0.03162', 'num_input_tokens_seen': 530173, 'train_runtime': '272.6', 'train_tokens_per_second': '1945'}
+{'loss': '0.7377', 'grad_norm': '0.413', 'learning_rate': '1.581e-05', 'epoch': '0.03175', 'num_input_tokens_seen': 532220, 'train_runtime': '273.6', 'train_tokens_per_second': '1945'}
+{'loss': '1.552', 'grad_norm': '0.6274', 'learning_rate': '1.587e-05', 'epoch': '0.03187', 'num_input_tokens_seen': 534267, 'train_runtime': '274.7', 'train_tokens_per_second': '1945'}
+{'loss': '1.128', 'grad_norm': '0.536', 'learning_rate': '1.593e-05', 'epoch': '0.03199', 'num_input_tokens_seen': 536314, 'train_runtime': '275.7', 'train_tokens_per_second': '1945'}
+{'loss': '1.204', 'grad_norm': '0.544', 'learning_rate': '1.6e-05', 'epoch': '0.03211', 'num_input_tokens_seen': 538361, 'train_runtime': '276.8', 'train_tokens_per_second': '1945'}
+{'loss': '0.01306', 'grad_norm': '0.06258', 'learning_rate': '1.606e-05', 'epoch': '0.03223', 'num_input_tokens_seen': 540408, 'train_runtime': '277.8', 'train_tokens_per_second': '1945'}
+{'loss': '1.558', 'grad_norm': '0.6964', 'learning_rate': '1.612e-05', 'epoch': '0.03236', 'num_input_tokens_seen': 542455, 'train_runtime': '278.8', 'train_tokens_per_second': '1945'}
+{'loss': '1.02', 'grad_norm': '0.509', 'learning_rate': '1.618e-05', 'epoch': '0.03248', 'num_input_tokens_seen': 544502, 'train_runtime': '279.9', 'train_tokens_per_second': '1945'}
+{'loss': '1.581', 'grad_norm': '0.6765', 'learning_rate': '1.624e-05', 'epoch': '0.0326', 'num_input_tokens_seen': 546549, 'train_runtime': '280.9', 'train_tokens_per_second': '1946'}
+{'loss': '0.7899', 'grad_norm': '0.4745', 'learning_rate': '1.63e-05', 'epoch': '0.03272', 'num_input_tokens_seen': 548596, 'train_runtime': '282', 'train_tokens_per_second': '1946'}
+{'loss': '1.312', 'grad_norm': '0.613', 'learning_rate': '1.636e-05', 'epoch': '0.03284', 'num_input_tokens_seen': 550643, 'train_runtime': '283', 'train_tokens_per_second': '1946'}
+{'loss': '1.312', 'grad_norm': '0.6338', 'learning_rate': '1.642e-05', 'epoch': '0.03297', 'num_input_tokens_seen': 552690, 'train_runtime': '284.1', 'train_tokens_per_second': '1946'}
+{'loss': '0.7668', 'grad_norm': '0.4715', 'learning_rate': '1.648e-05', 'epoch': '0.03309', 'num_input_tokens_seen': 554737, 'train_runtime': '285.1', 'train_tokens_per_second': '1946'}
+{'loss': '1.125', 'grad_norm': '0.6008', 'learning_rate': '1.654e-05', 'epoch': '0.03321', 'num_input_tokens_seen': 556784, 'train_runtime': '286.1', 'train_tokens_per_second': '1946'}
+{'loss': '1.317', 'grad_norm': '0.6867', 'learning_rate': '1.661e-05', 'epoch': '0.03333', 'num_input_tokens_seen': 558831, 'train_runtime': '287.2', 'train_tokens_per_second': '1946'}
+{'loss': '1.421', 'grad_norm': '0.6412', 'learning_rate': '1.667e-05', 'epoch': '0.03346', 'num_input_tokens_seen': 560878, 'train_runtime': '288.2', 'train_tokens_per_second': '1946'}
+{'loss': '1.625', 'grad_norm': '0.7158', 'learning_rate': '1.673e-05', 'epoch': '0.03358', 'num_input_tokens_seen': 562925, 'train_runtime': '289.3', 'train_tokens_per_second': '1946'}
+{'loss': '1.191', 'grad_norm': '0.6911', 'learning_rate': '1.679e-05', 'epoch': '0.0337', 'num_input_tokens_seen': 564972, 'train_runtime': '290.3', 'train_tokens_per_second': '1946'}
+{'loss': '0.6447', 'grad_norm': '0.5162', 'learning_rate': '1.685e-05', 'epoch': '0.03382', 'num_input_tokens_seen': 567019, 'train_runtime': '291.4', 'train_tokens_per_second': '1946'}
+{'loss': '0.8032', 'grad_norm': '0.4759', 'learning_rate': '1.691e-05', 'epoch': '0.03394', 'num_input_tokens_seen': 569066, 'train_runtime': '292.4', 'train_tokens_per_second': '1946'}
+{'loss': '1.107', 'grad_norm': '0.5404', 'learning_rate': '1.697e-05', 'epoch': '0.03407', 'num_input_tokens_seen': 571113, 'train_runtime': '293.4', 'train_tokens_per_second': '1946'}
+{'loss': '1.319', 'grad_norm': '0.7111', 'learning_rate': '1.703e-05', 'epoch': '0.03419', 'num_input_tokens_seen': 573160, 'train_runtime': '294.5', 'train_tokens_per_second': '1946'}
+{'loss': '1.366', 'grad_norm': '0.6837', 'learning_rate': '1.709e-05', 'epoch': '0.03431', 'num_input_tokens_seen': 575207, 'train_runtime': '295.5', 'train_tokens_per_second': '1946'}
+{'loss': '1.553', 'grad_norm': '0.767', 'learning_rate': '1.716e-05', 'epoch': '0.03443', 'num_input_tokens_seen': 577254, 'train_runtime': '296.6', 'train_tokens_per_second': '1946'}
+{'loss': '0.7748', 'grad_norm': '0.5244', 'learning_rate': '1.722e-05', 'epoch': '0.03455', 'num_input_tokens_seen': 579301, 'train_runtime': '297.6', 'train_tokens_per_second': '1947'}
+{'loss': '0.6421', 'grad_norm': '0.4703', 'learning_rate': '1.728e-05', 'epoch': '0.03468', 'num_input_tokens_seen': 581348, 'train_runtime': '298.6', 'train_tokens_per_second': '1947'}
+{'loss': '1.299', 'grad_norm': '0.7458', 'learning_rate': '1.734e-05', 'epoch': '0.0348', 'num_input_tokens_seen': 583395, 'train_runtime': '299.7', 'train_tokens_per_second': '1947'}
+{'loss': '1.37', 'grad_norm': '0.766', 'learning_rate': '1.74e-05', 'epoch': '0.03492', 'num_input_tokens_seen': 585442, 'train_runtime': '300.7', 'train_tokens_per_second': '1947'}
+{'loss': '1.21', 'grad_norm': '0.7069', 'learning_rate': '1.746e-05', 'epoch': '0.03504', 'num_input_tokens_seen': 587489, 'train_runtime': '301.8', 'train_tokens_per_second': '1947'}
+{'loss': '1.371', 'grad_norm': '0.7178', 'learning_rate': '1.752e-05', 'epoch': '0.03516', 'num_input_tokens_seen': 589536, 'train_runtime': '302.8', 'train_tokens_per_second': '1947'}
+{'loss': '0.6646', 'grad_norm': '0.5199', 'learning_rate': '1.758e-05', 'epoch': '0.03529', 'num_input_tokens_seen': 591583, 'train_runtime': '303.9', 'train_tokens_per_second': '1947'}
+{'loss': '1.354', 'grad_norm': '0.6725', 'learning_rate': '1.764e-05', 'epoch': '0.03541', 'num_input_tokens_seen': 593630, 'train_runtime': '304.9', 'train_tokens_per_second': '1947'}
+{'loss': '0.01388', 'grad_norm': '0.07445', 'learning_rate': '1.77e-05', 'epoch': '0.03553', 'num_input_tokens_seen': 595677, 'train_runtime': '305.9', 'train_tokens_per_second': '1947'}
+{'loss': '0.7751', 'grad_norm': '0.5144', 'learning_rate': '1.777e-05', 'epoch': '0.03565', 'num_input_tokens_seen': 597724, 'train_runtime': '307', 'train_tokens_per_second': '1947'}
+{'loss': '1.217', 'grad_norm': '6.529', 'learning_rate': '1.783e-05', 'epoch': '0.03578', 'num_input_tokens_seen': 599771, 'train_runtime': '308', 'train_tokens_per_second': '1947'}
+{'loss': '1.091', 'grad_norm': '0.6843', 'learning_rate': '1.789e-05', 'epoch': '0.0359', 'num_input_tokens_seen': 601818, 'train_runtime': '309.1', 'train_tokens_per_second': '1947'}
+{'loss': '1.242', 'grad_norm': '0.6793', 'learning_rate': '1.795e-05', 'epoch': '0.03602', 'num_input_tokens_seen': 603865, 'train_runtime': '310.1', 'train_tokens_per_second': '1947'}
+{'loss': '1.213', 'grad_norm': '0.6243', 'learning_rate': '1.801e-05', 'epoch': '0.03614', 'num_input_tokens_seen': 605912, 'train_runtime': '311.1', 'train_tokens_per_second': '1947'}

LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,257 @@

+pytz==2025.2
+pydub==0.25.1
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.6.0
+websockets==15.0.1
+tzdata==2025.3
+typing_extensions==4.15.0
+tqdm==4.67.3
+tomlkit==0.13.3
+termcolor==3.3.0
+shtab==1.8.0
+shellingham==1.5.4
+sentencepiece==0.2.1
+semantic-version==2.10.0
+safetensors==0.7.0
+ruff==0.15.0
+regex==2026.1.15
+python-multipart==0.0.22
+pyparsing==3.3.2
+pyarrow==23.0.0
+protobuf==6.33.5
+propcache==0.4.1
+orjson==3.11.7
+omegaconf==2.3.0
+numpy==2.4.2
+multidict==6.7.1
+mdurl==0.1.2
+kiwisolver==1.4.9
+hf-xet==1.2.0
+hf_transfer==0.1.9
+groovy==0.1.2
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.2
+docstring_parser==0.17.0
+dill==0.3.8
+cycler==0.12.1
+click==8.3.1
+av==16.0.0
+annotated-types==0.7.0
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+yarl==1.22.0
+uvicorn==0.40.0
+typing-inspection==0.4.2
+typer-slim==0.21.1
+tiktoken==0.12.0
+scipy==1.17.0
+pydantic_core==2.41.4
+pandas==2.3.3
+multiprocess==0.70.16
+modelscope==1.34.0
+markdown-it-py==4.0.0
+fire==0.7.1
+contourpy==1.3.3
+anyio==4.12.1
+aiosignal==1.4.0
+starlette==0.50.0
+rich==14.3.2
+pydantic==2.12.3
+matplotlib==3.10.8
+aiohttp==3.13.3
+tyro==0.8.14
+typer==0.21.1
+torchdata==0.11.0
+sse-starlette==3.2.0
+safehttpx==0.1.7
+huggingface_hub==1.3.7
+fastapi==0.128.0
+tokenizers==0.22.2
+gradio_client==1.14.0
+datasets==4.0.0
+accelerate==1.11.0
+transformers==5.0.0
+gradio==5.50.0
+trl==0.24.0
+peft==0.18.1
+jieba==0.42.1
+rouge-chinese==1.0.3
+joblib==1.5.3
+nltk==3.9.2
+llamafactory==0.9.5.dev0
+py-cpuinfo==9.0.0
+nvidia-ml-py==13.590.48
+hjson==3.1.0
+ninja==1.13.0
+msgpack==1.1.2
+deepspeed==0.16.9
+smmap==5.0.2
+sentry-sdk==2.51.0
+gitdb==4.0.12
+GitPython==3.1.46
+wandb==0.24.1
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+blinker==1.4
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2

LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "os":  "Linux-6.8.0-90-generic-x86_64-with-glibc2.35",
+  "python":  "CPython 3.11.10",
+  "startedAt":  "2026-02-04T04:03:32.123297Z",
+  "args":  [
+    "/workspace/v127rc_exp1/B.yaml"
+  ],
+  "program":  "/usr/local/bin/llamafactory-cli",
+  "git":  {
+    "remote":  "https://github.com/hiyouga/LlamaFactory.git",
+    "commit":  "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email":  "markmochi200@gmail.com",
+  "root":  "/workspace/LlamaFactory",
+  "host":  "34f54978776c",
+  "executable":  "/usr/bin/python",
+  "cpu_count":  24,
+  "cpu_count_logical":  48,
+  "gpu":  "NVIDIA GeForce RTX 4090",
+  "gpu_count":  1,
+  "disk":  {
+    "/":  {
+      "total":  "21474836480",
+      "used":  "1931460608"
+    }
+  },
+  "memory":  {
+    "total":  "405012275200"
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA GeForce RTX 4090",
+      "memoryTotal":  "25757220864",
+      "cudaCores":  16384,
+      "architecture":  "Ada",
+      "uuid":  "GPU-acb5171c-45e7-5653-1120-9d0cd2a192a6"
+    }
+  ],
+  "cudaVersion":  "12.8",
+  "writerId":  "vighgaih8gdd38lqtuv2307y0stf4bym"
+}

LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,6 @@

+{"time":"2026-02-04T04:03:32.369728388Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
+{"time":"2026-02-04T04:03:32.692853515Z","level":"INFO","msg":"stream: created new stream","id":"hwsb1mff"}
+{"time":"2026-02-04T04:03:32.693536225Z","level":"INFO","msg":"handler: started","stream_id":"hwsb1mff"}
+{"time":"2026-02-04T04:03:32.695103475Z","level":"INFO","msg":"stream: started","id":"hwsb1mff"}
+{"time":"2026-02-04T04:03:32.695123335Z","level":"INFO","msg":"writer: started","stream_id":"hwsb1mff"}
+{"time":"2026-02-04T04:03:32.695124927Z","level":"INFO","msg":"sender: started","stream_id":"hwsb1mff"}

LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug.log ADDED Viewed

	@@ -0,0 +1,23 @@

+2026-02-04 04:03:32,144 INFO    MainThread:7849 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
+2026-02-04 04:03:32,144 INFO    MainThread:7849 [wandb_setup.py:_flush():81] Configure stats pid to 7849
+2026-02-04 04:03:32,144 INFO    MainThread:7849 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-02-04 04:03:32,145 INFO    MainThread:7849 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug.log
+2026-02-04 04:03:32,147 INFO    MainThread:7849 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug-internal.log
+2026-02-04 04:03:32,147 INFO    MainThread:7849 [wandb_init.py:init():844] calling init triggers
+2026-02-04 04:03:32,147 INFO    MainThread:7849 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-02-04 04:03:32,148 INFO    MainThread:7849 [wandb_init.py:init():892] starting backend
+2026-02-04 04:03:32,362 INFO    MainThread:7849 [wandb_init.py:init():895] sending inform_init request
+2026-02-04 04:03:32,368 INFO    MainThread:7849 [wandb_init.py:init():903] backend started and connected
+2026-02-04 04:03:32,369 INFO    MainThread:7849 [wandb_init.py:init():973] updated telemetry
+2026-02-04 04:03:32,417 INFO    MainThread:7849 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-02-04 04:03:33,108 INFO    MainThread:7849 [wandb_init.py:init():1042] starting run threads in backend
+2026-02-04 04:03:33,181 INFO    MainThread:7849 [wandb_run.py:_console_start():2529] atexit reg
+2026-02-04 04:03:33,181 INFO    MainThread:7849 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+2026-02-04 04:03:33,182 INFO    MainThread:7849 [wandb_run.py:_redirect():2446] Wrapping output streams.
+2026-02-04 04:03:33,182 INFO    MainThread:7849 [wandb_run.py:_redirect():2469] Redirects installed.
+2026-02-04 04:03:33,184 INFO    MainThread:7849 [wandb_init.py:init():1082] run started, returning control to user process
+2026-02-04 04:03:33,185 INFO    MainThread:7849 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['gate_proj', 'down_proj', 'o_proj', 'up_proj', 'k_proj', 'q_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/B', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 585, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
+2026-02-04 04:03:33,192 INFO    MainThread:7849 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x76ded82a5690>>
+2026-02-04 04:03:33,193 INFO    MainThread:7849 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
+2026-02-04 04:03:33,195 INFO    MainThread:7849 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t35_d0_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}

LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/config.yaml ADDED Viewed

	@@ -0,0 +1,723 @@

+_name_or_path:
+    value: /workspace/Qwen/Qwen3-8B-Base
+_wandb:
+    value:
+        cli_version: 0.24.1
+        e:
+            jy6in5azojamixlag12ky8yqk0a5luc8:
+                args:
+                    - /workspace/v127rc_exp1/C.yaml
+                cpu_count: 16
+                cpu_count_logical: 32
+                cudaVersion: "13.0"
+                disk:
+                    /:
+                        total: "21474836480"
+                        used: "1858318336"
+                email: markmochi200@gmail.com
+                executable: /usr/bin/python
+                git:
+                    commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
+                    remote: https://github.com/hiyouga/LlamaFactory.git
+                gpu: NVIDIA GeForce RTX 4090
+                gpu_count: 1
+                gpu_nvidia:
+                    - architecture: Ada
+                      cudaCores: 16384
+                      memoryTotal: "25757220864"
+                      name: NVIDIA GeForce RTX 4090
+                      uuid: GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de
+                host: 47a53adf0198
+                memory:
+                    total: "201701408768"
+                os: Linux-6.8.0-94-generic-x86_64-with-glibc2.35
+                program: /usr/local/bin/llamafactory-cli
+                python: CPython 3.11.10
+                root: /workspace/LlamaFactory
+                startedAt: "2026-02-04T04:05:44.037622Z"
+                writerId: jy6in5azojamixlag12ky8yqk0a5luc8
+        m:
+            - "1": train/global_step
+              "6":
+                - 3
+              "7": []
+            - "2": '*'
+              "5": 1
+              "6":
+                - 1
+              "7": []
+        python_version: 3.11.10
+        t:
+            "1":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "2":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "3":
+                - 7
+                - 19
+                - 62
+                - 66
+            "4": 3.11.10
+            "5": 0.24.1
+            "6": 5.0.0
+            "9":
+                "1": transformers_trainer
+            "12": 0.24.1
+            "13": linux-x86_64
+accelerator_config:
+    value:
+        dispatch_batches: null
+        even_batches: true
+        gradient_accumulation_kwargs: null
+        non_blocking: false
+        split_batches: false
+        use_seedable_sampler: true
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.95
+adam_epsilon:
+    value: 1e-08
+architectures:
+    value:
+        - Qwen3ForCausalLM
+attention_bias:
+    value: false
+attention_dropout:
+    value: 0
+auto_find_batch_size:
+    value: false
+average_tokens_across_devices:
+    value: true
+batch_eval_metrics:
+    value: false
+bf16:
+    value: true
+bf16_full_eval:
+    value: false
+bos_token_id:
+    value: null
+chunk_size_feed_forward:
+    value: 0
+data_args:
+    value:
+        buffer_size: 16384
+        cutoff_len: 2047
+        data_shared_file_system: false
+        dataset:
+            - Markie_Voss_t0_d35_r286
+        dataset_dir: /workspace/LlamaFactory/data
+        default_system: null
+        enable_thinking: false
+        eval_dataset: null
+        eval_num_beams: null
+        eval_on_each_dataset: false
+        ignore_pad_token_for_loss: true
+        interleave_probs: null
+        mask_history: false
+        max_samples: 100000000
+        media_dir: /workspace/LlamaFactory/data
+        mix_strategy: concat
+        neat_packing: false
+        overwrite_cache: false
+        packing: true
+        preprocessing_batch_size: 1000
+        preprocessing_num_workers: 16
+        streaming: false
+        template: qwen3_nothink
+        tokenized_path: null
+        tool_format: null
+        train_on_prompt: false
+        val_size: 0
+data_seed:
+    value: null
+dataloader_drop_last:
+    value: false
+dataloader_num_workers:
+    value: 0
+dataloader_persistent_workers:
+    value: false
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: null
+ddp_backend:
+    value: null
+ddp_broadcast_buffers:
+    value: null
+ddp_bucket_cap_mb:
+    value: null
+ddp_find_unused_parameters:
+    value: null
+ddp_timeout:
+    value: 180000000
+debug:
+    value: []
+deepspeed:
+    value: null
+disable_tqdm:
+    value: false
+do_eval:
+    value: false
+do_predict:
+    value: false
+do_train:
+    value: true
+dtype:
+    value: bfloat16
+enable_jit_checkpoint:
+    value: false
+eos_token_id:
+    value: 151645
+eval_accumulation_steps:
+    value: null
+eval_delay:
+    value: 0
+eval_do_concat_batches:
+    value: true
+eval_on_start:
+    value: false
+eval_steps:
+    value: null
+eval_strategy:
+    value: "no"
+eval_use_gather_object:
+    value: false
+finetuning_args:
+    value:
+        additional_target: null
+        apollo_layerwise: false
+        apollo_proj: random
+        apollo_proj_type: std
+        apollo_rank: 16
+        apollo_scale: 32
+        apollo_scale_front: false
+        apollo_scale_type: channel
+        apollo_target:
+            - all
+        apollo_update_interval: 200
+        badam_mask_mode: adjacent
+        badam_mode: layer
+        badam_start_block: null
+        badam_switch_interval: 50
+        badam_switch_mode: ascending
+        badam_update_ratio: 0.05
+        badam_verbose: 0
+        compute_accuracy: false
+        create_new_adapter: false
+        disable_shuffling: false
+        dpo_label_smoothing: 0
+        eaft_alpha: 1
+        early_stopping_steps: null
+        finetuning_type: lora
+        freeze_extra_modules: null
+        freeze_language_model: false
+        freeze_multi_modal_projector: true
+        freeze_trainable_layers: 2
+        freeze_trainable_modules:
+            - all
+        freeze_vision_tower: true
+        galore_layerwise: false
+        galore_proj_type: std
+        galore_rank: 16
+        galore_scale: 2
+        galore_target:
+            - all
+        galore_update_interval: 200
+        include_effective_tokens_per_second: false
+        kto_chosen_weight: 1
+        kto_rejected_weight: 1
+        ld_alpha: null
+        lora_alpha: 32
+        lora_dropout: 0.03
+        lora_rank: 16
+        lora_target:
+            - all
+        loraplus_lr_embedding: 1e-06
+        loraplus_lr_ratio: null
+        module_dropout: 0
+        oft_block_size: 32
+        oft_rank: 0
+        oft_target:
+            - all
+        pissa_convert: false
+        pissa_init: false
+        pissa_iter: 16
+        plot_loss: true
+        ppo_buffer_size: 1
+        ppo_epochs: 4
+        ppo_score_norm: false
+        ppo_target: 6
+        ppo_whiten_rewards: false
+        pref_bco_weight: 0
+        pref_beta: 0.1
+        pref_ftx: 0
+        pref_loss: sigmoid
+        pure_bf16: false
+        ref_model: null
+        ref_model_adapters: null
+        ref_model_quantization_bit: null
+        reward_model: null
+        reward_model_adapters: null
+        reward_model_quantization_bit: null
+        reward_model_type: lora
+        simpo_gamma: 0.5
+        stage: pt
+        swanlab_api_key: <SWANLAB_API_KEY>
+        swanlab_lark_secret: null
+        swanlab_lark_webhook_url: null
+        swanlab_logdir: null
+        swanlab_mode: cloud
+        swanlab_project: llamafactory
+        swanlab_run_name: null
+        swanlab_workspace: null
+        use_adam_mini: false
+        use_apollo: false
+        use_badam: false
+        use_dft_loss: false
+        use_dora: false
+        use_eaft_loss: false
+        use_galore: false
+        use_llama_pro: false
+        use_mca: false
+        use_muon: false
+        use_rslora: false
+        use_swanlab: false
+fp8:
+    value: false
+fp8_backend:
+    value: auto
+fp8_enable_fsdp_float8_all_gather:
+    value: false
+fp16:
+    value: false
+fp16_full_eval:
+    value: false
+fsdp:
+    value: []
+fsdp_config:
+    value:
+        min_num_params: 0
+        xla: false
+        xla_fsdp_grad_ckpt: false
+        xla_fsdp_v2: false
+full_determinism:
+    value: false
+generating_args:
+    value:
+        do_sample: true
+        length_penalty: 1
+        max_new_tokens: 1024
+        num_beams: 1
+        repetition_penalty: 1
+        skip_special_tokens: true
+        temperature: 0.95
+        top_k: 50
+        top_p: 0.7
+generation_config:
+    value: null
+generation_max_length:
+    value: 2047
+generation_num_beams:
+    value: null
+gradient_accumulation_steps:
+    value: 1
+gradient_checkpointing:
+    value: false
+gradient_checkpointing_kwargs:
+    value: null
+greater_is_better:
+    value: null
+group_by_length:
+    value: false
+head_dim:
+    value: 128
+hidden_act:
+    value: silu
+hidden_size:
+    value: 4096
+hub_always_push:
+    value: false
+hub_model_id:
+    value: null
+hub_private_repo:
+    value: null
+hub_revision:
+    value: null
+hub_strategy:
+    value: every_save
+hub_token:
+    value: <HUB_TOKEN>
+id2label:
+    value:
+        "0": LABEL_0
+        "1": LABEL_1
+ignore_data_skip:
+    value: false
+include_for_metrics:
+    value: []
+include_num_input_tokens_seen:
+    value: all
+initializer_range:
+    value: 0.02
+intermediate_size:
+    value: 12288
+is_encoder_decoder:
+    value: false
+label_names:
+    value:
+        - labels
+label_smoothing_factor:
+    value: 0
+label2id:
+    value:
+        LABEL_0: 0
+        LABEL_1: 1
+layer_types:
+    value:
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+learning_rate:
+    value: 5e-05
+length_column_name:
+    value: length
+liger_kernel_config:
+    value: null
+load_best_model_at_end:
+    value: false
+local_rank:
+    value: -1
+log_level:
+    value: passive
+log_level_replica:
+    value: warning
+log_on_each_node:
+    value: true
+logging_dir:
+    value: null
+logging_first_step:
+    value: false
+logging_nan_inf_filter:
+    value: true
+logging_steps:
+    value: 1
+logging_strategy:
+    value: steps
+lr_scheduler_kwargs:
+    value: null
+lr_scheduler_type:
+    value: cosine
+master_addr:
+    value: null
+master_port:
+    value: null
+max_grad_norm:
+    value: 1
+max_position_embeddings:
+    value: 32768
+max_steps:
+    value: -1
+max_window_layers:
+    value: 36
+metric_for_best_model:
+    value: null
+model/num_parameters:
+    value: 8234382336
+model_args:
+    value:
+        adapter_folder: null
+        adapter_name_or_path: null
+        add_special_tokens: null
+        add_tokens: null
+        audio_sampling_rate: 16000
+        block_diag_attn: false
+        cache_dir: null
+        chunk_size: 8192
+        compute_dtype: torch.bfloat16
+        cpu_infer: 32
+        crop_to_patches: false
+        device_map:
+            "": cuda:0
+        disable_gradient_checkpointing: false
+        double_quantization: true
+        enable_liger_kernel: false
+        export_device: cpu
+        export_dir: null
+        export_hub_model_id: null
+        export_legacy_format: false
+        export_quantization_bit: null
+        export_quantization_dataset: null
+        export_quantization_maxlen: 1024
+        export_quantization_nsamples: 128
+        export_size: 5
+        flash_attn: auto
+        hf_hub_token: <HF_HUB_TOKEN>
+        image_do_pan_and_scan: false
+        image_max_pixels: 589824
+        image_min_pixels: 1024
+        infer_backend: HF
+        infer_dtype: auto
+        init_special_tokens: noise_init
+        kt_force_think: false
+        kt_maxlen: 4096
+        kt_mode: normal
+        kt_optimize_rule: null
+        kt_use_cuda_graph: true
+        low_cpu_mem_usage: true
+        mixture_of_depths: null
+        mode: normal
+        model_max_length: 2047
+        model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+        model_revision: main
+        moe_aux_loss_coef: null
+        ms_hub_token: <MS_HUB_TOKEN>
+        new_special_tokens_config: null
+        offload_folder: offload
+        om_hub_token: <OM_HUB_TOKEN>
+        print_param_status: false
+        quantization_bit: null
+        quantization_device_map: null
+        quantization_method: BNB
+        quantization_type: nf4
+        resize_vocab: false
+        rope_scaling: null
+        sglang_config: null
+        sglang_lora_backend: triton
+        sglang_maxlen: 4096
+        sglang_mem_fraction: 0.7
+        sglang_tp_size: -1
+        shift_attn: false
+        split_special_tokens: false
+        train_from_scratch: false
+        trust_remote_code: true
+        upcast_layernorm: false
+        upcast_lmhead_output: false
+        use_audio_in_video: false
+        use_fast_tokenizer: true
+        use_kt: false
+        use_kv_cache: true
+        use_reentrant_gc: true
+        use_unsloth: false
+        use_unsloth_gc: false
+        use_v1_kernels: false
+        video_fps: 2
+        video_max_pixels: 65536
+        video_maxlen: 128
+        video_min_pixels: 256
+        vllm_config: null
+        vllm_enforce_eager: false
+        vllm_gpu_util: 0.7
+        vllm_max_lora_rank: 32
+        vllm_maxlen: 4096
+model_type:
+    value: qwen3
+neftune_noise_alpha:
+    value: null
+num_attention_heads:
+    value: 32
+num_hidden_layers:
+    value: 36
+num_key_value_heads:
+    value: 8
+num_train_epochs:
+    value: 5
+optim:
+    value: adamw_torch
+optim_args:
+    value: null
+optim_target_modules:
+    value: null
+output_attentions:
+    value: false
+output_dir:
+    value: /workspace/v127rc_exp1/C
+output_hidden_states:
+    value: false
+overwrite_output_dir:
+    value: false
+pad_token_id:
+    value: 151643
+parallelism_config:
+    value: null
+peft_config:
+    value:
+        default:
+            alora_invocation_tokens: null
+            arrow_config: null
+            auto_mapping: null
+            base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+            bias: none
+            corda_config: null
+            ensure_weight_tying: false
+            eva_config: null
+            exclude_modules: null
+            fan_in_fan_out: false
+            inference_mode: false
+            init_lora_weights: true
+            layer_replication: null
+            layers_pattern: null
+            layers_to_transform: null
+            lora_alpha: 32
+            lora_bias: false
+            lora_dropout: 0.03
+            megatron_config: null
+            megatron_core: megatron.core
+            modules_to_save: null
+            peft_type: LORA
+            peft_version: 0.18.1
+            qalora_group_size: 16
+            r: 16
+            revision: null
+            runtime_config:
+                ephemeral_gpu_offload: false
+            target_modules:
+                - o_proj
+                - down_proj
+                - gate_proj
+                - v_proj
+                - k_proj
+                - q_proj
+                - up_proj
+            target_parameters: null
+            task_type: CAUSAL_LM
+            trainable_token_indices: null
+            use_dora: false
+            use_qalora: false
+            use_rslora: false
+per_device_eval_batch_size:
+    value: 8
+per_device_train_batch_size:
+    value: 1
+predict_with_generate:
+    value: false
+prediction_loss_only:
+    value: false
+problem_type:
+    value: null
+project:
+    value: huggingface
+push_to_hub:
+    value: false
+ray_init_kwargs:
+    value: null
+ray_num_workers:
+    value: 1
+remove_unused_columns:
+    value: false
+report_to:
+    value:
+        - wandb
+restore_callback_states_from_checkpoint:
+    value: false
+resume_from_checkpoint:
+    value: null
+return_dict:
+    value: true
+rms_norm_eps:
+    value: 1e-06
+rope_parameters:
+    value:
+        rope_theta: 1000000
+        rope_type: default
+run_name:
+    value: null
+save_on_each_node:
+    value: false
+save_only_model:
+    value: true
+save_steps:
+    value: 266
+save_strategy:
+    value: steps
+save_total_limit:
+    value: null
+seed:
+    value: 42
+skip_memory_metrics:
+    value: true
+sliding_window:
+    value: null
+sortish_sampler:
+    value: false
+tf32:
+    value: null
+tie_word_embeddings:
+    value: false
+torch_compile:
+    value: false
+torch_compile_backend:
+    value: null
+torch_compile_mode:
+    value: null
+torch_empty_cache_steps:
+    value: null
+trackio_space_id:
+    value: trackio
+transformers_version:
+    value: 5.0.0
+use_cache:
+    value: false
+use_cpu:
+    value: false
+use_liger_kernel:
+    value: false
+use_sliding_window:
+    value: false
+vocab_size:
+    value: 151936
+warmup_ratio:
+    value: 0.02
+warmup_steps:
+    value: 0.02
+weight_decay:
+    value: 0

LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/output.log ADDED Viewed

	@@ -0,0 +1,191 @@

+  0%|                                                                                                                                                                                       | 0/18595 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+{'loss': '1.682', 'grad_norm': '0.2716', 'learning_rate': '0', 'epoch': '0.0002689', 'num_input_tokens_seen': 2047, 'train_runtime': '2.905', 'train_tokens_per_second': '704.7'}
+{'loss': '1.8', 'grad_norm': '0.2904', 'learning_rate': '1.344e-07', 'epoch': '0.0005378', 'num_input_tokens_seen': 4094, 'train_runtime': '3.914', 'train_tokens_per_second': '1046'}
+{'loss': '1.751', 'grad_norm': '0.2786', 'learning_rate': '2.688e-07', 'epoch': '0.0008067', 'num_input_tokens_seen': 6141, 'train_runtime': '4.925', 'train_tokens_per_second': '1247'}
+{'loss': '1.725', 'grad_norm': '0.2775', 'learning_rate': '4.032e-07', 'epoch': '0.001076', 'num_input_tokens_seen': 8188, 'train_runtime': '5.934', 'train_tokens_per_second': '1380'}
+{'loss': '1.857', 'grad_norm': '0.282', 'learning_rate': '5.376e-07', 'epoch': '0.001344', 'num_input_tokens_seen': 10235, 'train_runtime': '6.944', 'train_tokens_per_second': '1474'}
+{'loss': '1.865', 'grad_norm': '0.2441', 'learning_rate': '6.72e-07', 'epoch': '0.001613', 'num_input_tokens_seen': 12282, 'train_runtime': '7.952', 'train_tokens_per_second': '1545'}
+{'loss': '1.791', 'grad_norm': '0.2674', 'learning_rate': '8.065e-07', 'epoch': '0.001882', 'num_input_tokens_seen': 14329, 'train_runtime': '8.964', 'train_tokens_per_second': '1599'}
+{'loss': '1.834', 'grad_norm': '0.2586', 'learning_rate': '9.409e-07', 'epoch': '0.002151', 'num_input_tokens_seen': 16376, 'train_runtime': '9.974', 'train_tokens_per_second': '1642'}
+{'loss': '1.92', 'grad_norm': '0.2805', 'learning_rate': '1.075e-06', 'epoch': '0.00242', 'num_input_tokens_seen': 18423, 'train_runtime': '10.98', 'train_tokens_per_second': '1677'}
+{'loss': '1.945', 'grad_norm': '0.2809', 'learning_rate': '1.21e-06', 'epoch': '0.002689', 'num_input_tokens_seen': 20470, 'train_runtime': '11.99', 'train_tokens_per_second': '1707'}
+{'loss': '1.955', 'grad_norm': '0.2961', 'learning_rate': '1.344e-06', 'epoch': '0.002958', 'num_input_tokens_seen': 22517, 'train_runtime': '13.01', 'train_tokens_per_second': '1731'}
+{'loss': '1.811', 'grad_norm': '0.2714', 'learning_rate': '1.478e-06', 'epoch': '0.003227', 'num_input_tokens_seen': 24564, 'train_runtime': '14.02', 'train_tokens_per_second': '1753'}
+{'loss': '1.631', 'grad_norm': '0.2661', 'learning_rate': '1.613e-06', 'epoch': '0.003496', 'num_input_tokens_seen': 26611, 'train_runtime': '15.03', 'train_tokens_per_second': '1771'}
+{'loss': '1.769', 'grad_norm': '0.268', 'learning_rate': '1.747e-06', 'epoch': '0.003764', 'num_input_tokens_seen': 28658, 'train_runtime': '16.04', 'train_tokens_per_second': '1787'}
+{'loss': '1.611', 'grad_norm': '0.2518', 'learning_rate': '1.882e-06', 'epoch': '0.004033', 'num_input_tokens_seen': 30705, 'train_runtime': '17.05', 'train_tokens_per_second': '1801'}
+{'loss': '1.624', 'grad_norm': '0.2597', 'learning_rate': '2.016e-06', 'epoch': '0.004302', 'num_input_tokens_seen': 32752, 'train_runtime': '18.06', 'train_tokens_per_second': '1814'}
+{'loss': '1.854', 'grad_norm': '0.2804', 'learning_rate': '2.151e-06', 'epoch': '0.004571', 'num_input_tokens_seen': 34799, 'train_runtime': '19.07', 'train_tokens_per_second': '1825'}
+{'loss': '1.849', 'grad_norm': '0.521', 'learning_rate': '2.285e-06', 'epoch': '0.00484', 'num_input_tokens_seen': 36846, 'train_runtime': '20.08', 'train_tokens_per_second': '1835'}
+{'loss': '1.825', 'grad_norm': '0.2669', 'learning_rate': '2.419e-06', 'epoch': '0.005109', 'num_input_tokens_seen': 38893, 'train_runtime': '21.1', 'train_tokens_per_second': '1843'}
+{'loss': '1.534', 'grad_norm': '0.2729', 'learning_rate': '2.554e-06', 'epoch': '0.005378', 'num_input_tokens_seen': 40940, 'train_runtime': '22.11', 'train_tokens_per_second': '1852'}
+{'loss': '1.67', 'grad_norm': '0.2686', 'learning_rate': '2.688e-06', 'epoch': '0.005647', 'num_input_tokens_seen': 42987, 'train_runtime': '23.13', 'train_tokens_per_second': '1859'}
+{'loss': '1.549', 'grad_norm': '0.2592', 'learning_rate': '2.823e-06', 'epoch': '0.005916', 'num_input_tokens_seen': 45034, 'train_runtime': '24.14', 'train_tokens_per_second': '1866'}
+{'loss': '1.868', 'grad_norm': '0.2874', 'learning_rate': '2.957e-06', 'epoch': '0.006184', 'num_input_tokens_seen': 47081, 'train_runtime': '25.15', 'train_tokens_per_second': '1872'}
+{'loss': '1.767', 'grad_norm': '0.2763', 'learning_rate': '3.091e-06', 'epoch': '0.006453', 'num_input_tokens_seen': 49128, 'train_runtime': '26.16', 'train_tokens_per_second': '1878'}
+{'loss': '1.936', 'grad_norm': '0.2961', 'learning_rate': '3.226e-06', 'epoch': '0.006722', 'num_input_tokens_seen': 51175, 'train_runtime': '27.18', 'train_tokens_per_second': '1883'}
+{'loss': '1.625', 'grad_norm': '0.2881', 'learning_rate': '3.36e-06', 'epoch': '0.006991', 'num_input_tokens_seen': 53222, 'train_runtime': '28.19', 'train_tokens_per_second': '1888'}
+{'loss': '1.795', 'grad_norm': '0.3211', 'learning_rate': '3.495e-06', 'epoch': '0.00726', 'num_input_tokens_seen': 55269, 'train_runtime': '29.2', 'train_tokens_per_second': '1893'}
+{'loss': '1.725', 'grad_norm': '0.2936', 'learning_rate': '3.629e-06', 'epoch': '0.007529', 'num_input_tokens_seen': 57316, 'train_runtime': '30.22', 'train_tokens_per_second': '1897'}
+{'loss': '1.871', 'grad_norm': '0.2756', 'learning_rate': '3.763e-06', 'epoch': '0.007798', 'num_input_tokens_seen': 59363, 'train_runtime': '31.23', 'train_tokens_per_second': '1901'}
+{'loss': '1.84', 'grad_norm': '0.2772', 'learning_rate': '3.898e-06', 'epoch': '0.008067', 'num_input_tokens_seen': 61410, 'train_runtime': '32.24', 'train_tokens_per_second': '1905'}
+{'loss': '1.908', 'grad_norm': '0.3025', 'learning_rate': '4.032e-06', 'epoch': '0.008336', 'num_input_tokens_seen': 63457, 'train_runtime': '33.26', 'train_tokens_per_second': '1908'}
+{'loss': '1.725', 'grad_norm': '0.2884', 'learning_rate': '4.167e-06', 'epoch': '0.008604', 'num_input_tokens_seen': 65504, 'train_runtime': '34.27', 'train_tokens_per_second': '1911'}
+{'loss': '1.747', 'grad_norm': '0.3165', 'learning_rate': '4.301e-06', 'epoch': '0.008873', 'num_input_tokens_seen': 67551, 'train_runtime': '35.28', 'train_tokens_per_second': '1915'}
+{'loss': '1.909', 'grad_norm': '0.2975', 'learning_rate': '4.435e-06', 'epoch': '0.009142', 'num_input_tokens_seen': 69598, 'train_runtime': '36.3', 'train_tokens_per_second': '1917'}
+{'loss': '1.64', 'grad_norm': '0.2753', 'learning_rate': '4.57e-06', 'epoch': '0.009411', 'num_input_tokens_seen': 71645, 'train_runtime': '37.31', 'train_tokens_per_second': '1920'}
+{'loss': '1.781', 'grad_norm': '0.2986', 'learning_rate': '4.704e-06', 'epoch': '0.00968', 'num_input_tokens_seen': 73692, 'train_runtime': '38.33', 'train_tokens_per_second': '1923'}
+{'loss': '1.831', 'grad_norm': '0.3018', 'learning_rate': '4.839e-06', 'epoch': '0.009949', 'num_input_tokens_seen': 75739, 'train_runtime': '39.38', 'train_tokens_per_second': '1923'}
+{'loss': '1.859', 'grad_norm': '0.2658', 'learning_rate': '4.973e-06', 'epoch': '0.01022', 'num_input_tokens_seen': 77786, 'train_runtime': '40.4', 'train_tokens_per_second': '1925'}
+{'loss': '1.964', 'grad_norm': '0.297', 'learning_rate': '5.108e-06', 'epoch': '0.01049', 'num_input_tokens_seen': 79833, 'train_runtime': '41.41', 'train_tokens_per_second': '1928'}
+{'loss': '1.935', 'grad_norm': '0.3385', 'learning_rate': '5.242e-06', 'epoch': '0.01076', 'num_input_tokens_seen': 81880, 'train_runtime': '42.43', 'train_tokens_per_second': '1930'}
+{'loss': '1.726', 'grad_norm': '0.3095', 'learning_rate': '5.376e-06', 'epoch': '0.01102', 'num_input_tokens_seen': 83927, 'train_runtime': '43.44', 'train_tokens_per_second': '1932'}
+{'loss': '1.533', 'grad_norm': '0.2799', 'learning_rate': '5.511e-06', 'epoch': '0.01129', 'num_input_tokens_seen': 85974, 'train_runtime': '44.45', 'train_tokens_per_second': '1934'}
+{'loss': '1.762', 'grad_norm': '0.2744', 'learning_rate': '5.645e-06', 'epoch': '0.01156', 'num_input_tokens_seen': 88021, 'train_runtime': '45.47', 'train_tokens_per_second': '1936'}
+{'loss': '1.697', 'grad_norm': '0.2797', 'learning_rate': '5.78e-06', 'epoch': '0.01183', 'num_input_tokens_seen': 90068, 'train_runtime': '46.48', 'train_tokens_per_second': '1938'}
+{'loss': '1.725', 'grad_norm': '0.2793', 'learning_rate': '5.914e-06', 'epoch': '0.0121', 'num_input_tokens_seen': 92115, 'train_runtime': '47.5', 'train_tokens_per_second': '1939'}
+{'loss': '1.981', 'grad_norm': '0.3054', 'learning_rate': '6.048e-06', 'epoch': '0.01237', 'num_input_tokens_seen': 94162, 'train_runtime': '48.51', 'train_tokens_per_second': '1941'}
+{'loss': '1.591', 'grad_norm': '0.2925', 'learning_rate': '6.183e-06', 'epoch': '0.01264', 'num_input_tokens_seen': 96209, 'train_runtime': '49.53', 'train_tokens_per_second': '1943'}
+{'loss': '1.777', 'grad_norm': '0.339', 'learning_rate': '6.317e-06', 'epoch': '0.01291', 'num_input_tokens_seen': 98256, 'train_runtime': '50.54', 'train_tokens_per_second': '1944'}
+{'loss': '1.856', 'grad_norm': '0.2972', 'learning_rate': '6.452e-06', 'epoch': '0.01318', 'num_input_tokens_seen': 100303, 'train_runtime': '51.55', 'train_tokens_per_second': '1946'}
+{'loss': '1.637', 'grad_norm': '0.3191', 'learning_rate': '6.586e-06', 'epoch': '0.01344', 'num_input_tokens_seen': 102350, 'train_runtime': '52.57', 'train_tokens_per_second': '1947'}
+{'loss': '1.885', 'grad_norm': '0.3083', 'learning_rate': '6.72e-06', 'epoch': '0.01371', 'num_input_tokens_seen': 104397, 'train_runtime': '53.58', 'train_tokens_per_second': '1948'}
+{'loss': '1.777', 'grad_norm': '0.3115', 'learning_rate': '6.855e-06', 'epoch': '0.01398', 'num_input_tokens_seen': 106444, 'train_runtime': '54.59', 'train_tokens_per_second': '1950'}
+{'loss': '1.848', 'grad_norm': '0.3558', 'learning_rate': '6.989e-06', 'epoch': '0.01425', 'num_input_tokens_seen': 108491, 'train_runtime': '55.61', 'train_tokens_per_second': '1951'}
+{'loss': '1.613', 'grad_norm': '0.3172', 'learning_rate': '7.124e-06', 'epoch': '0.01452', 'num_input_tokens_seen': 110538, 'train_runtime': '56.63', 'train_tokens_per_second': '1952'}
+{'loss': '1.642', 'grad_norm': '0.2996', 'learning_rate': '7.258e-06', 'epoch': '0.01479', 'num_input_tokens_seen': 112585, 'train_runtime': '57.64', 'train_tokens_per_second': '1953'}
+{'loss': '1.979', 'grad_norm': '0.331', 'learning_rate': '7.392e-06', 'epoch': '0.01506', 'num_input_tokens_seen': 114632, 'train_runtime': '58.66', 'train_tokens_per_second': '1954'}
+{'loss': '1.473', 'grad_norm': '0.305', 'learning_rate': '7.527e-06', 'epoch': '0.01533', 'num_input_tokens_seen': 116679, 'train_runtime': '59.67', 'train_tokens_per_second': '1955'}
+{'loss': '1.56', 'grad_norm': '0.2983', 'learning_rate': '7.661e-06', 'epoch': '0.0156', 'num_input_tokens_seen': 118726, 'train_runtime': '60.69', 'train_tokens_per_second': '1956'}
+{'loss': '1.792', 'grad_norm': '0.3465', 'learning_rate': '7.796e-06', 'epoch': '0.01586', 'num_input_tokens_seen': 120773, 'train_runtime': '61.71', 'train_tokens_per_second': '1957'}
+{'loss': '1.589', 'grad_norm': '0.3406', 'learning_rate': '7.93e-06', 'epoch': '0.01613', 'num_input_tokens_seen': 122820, 'train_runtime': '62.73', 'train_tokens_per_second': '1958'}
+{'loss': '1.715', 'grad_norm': '0.3038', 'learning_rate': '8.065e-06', 'epoch': '0.0164', 'num_input_tokens_seen': 124867, 'train_runtime': '63.74', 'train_tokens_per_second': '1959'}
+{'loss': '1.703', 'grad_norm': '0.3439', 'learning_rate': '8.199e-06', 'epoch': '0.01667', 'num_input_tokens_seen': 126914, 'train_runtime': '64.76', 'train_tokens_per_second': '1960'}
+{'loss': '1.909', 'grad_norm': '0.363', 'learning_rate': '8.333e-06', 'epoch': '0.01694', 'num_input_tokens_seen': 128961, 'train_runtime': '65.77', 'train_tokens_per_second': '1961'}
+{'loss': '1.798', 'grad_norm': '0.3657', 'learning_rate': '8.468e-06', 'epoch': '0.01721', 'num_input_tokens_seen': 131008, 'train_runtime': '66.79', 'train_tokens_per_second': '1961'}
+{'loss': '1.853', 'grad_norm': '0.3834', 'learning_rate': '8.602e-06', 'epoch': '0.01748', 'num_input_tokens_seen': 133055, 'train_runtime': '67.81', 'train_tokens_per_second': '1962'}
+{'loss': '1.806', 'grad_norm': '0.7619', 'learning_rate': '8.737e-06', 'epoch': '0.01775', 'num_input_tokens_seen': 135102, 'train_runtime': '68.83', 'train_tokens_per_second': '1963'}
+{'loss': '1.435', 'grad_norm': '0.3309', 'learning_rate': '8.871e-06', 'epoch': '0.01802', 'num_input_tokens_seen': 137149, 'train_runtime': '69.84', 'train_tokens_per_second': '1964'}
+{'loss': '1.746', 'grad_norm': '0.3073', 'learning_rate': '9.005e-06', 'epoch': '0.01828', 'num_input_tokens_seen': 139196, 'train_runtime': '70.86', 'train_tokens_per_second': '1965'}
+{'loss': '1.822', 'grad_norm': '0.354', 'learning_rate': '9.14e-06', 'epoch': '0.01855', 'num_input_tokens_seen': 141243, 'train_runtime': '71.87', 'train_tokens_per_second': '1965'}
+{'loss': '1.661', 'grad_norm': '0.3499', 'learning_rate': '9.274e-06', 'epoch': '0.01882', 'num_input_tokens_seen': 143290, 'train_runtime': '72.89', 'train_tokens_per_second': '1966'}
+{'loss': '1.913', 'grad_norm': '0.3419', 'learning_rate': '9.409e-06', 'epoch': '0.01909', 'num_input_tokens_seen': 145337, 'train_runtime': '73.9', 'train_tokens_per_second': '1967'}
+{'loss': '1.815', 'grad_norm': '0.4037', 'learning_rate': '9.543e-06', 'epoch': '0.01936', 'num_input_tokens_seen': 147384, 'train_runtime': '74.93', 'train_tokens_per_second': '1967'}
+{'loss': '1.798', 'grad_norm': '0.3734', 'learning_rate': '9.677e-06', 'epoch': '0.01963', 'num_input_tokens_seen': 149431, 'train_runtime': '75.94', 'train_tokens_per_second': '1968'}
+{'loss': '1.703', 'grad_norm': '0.3758', 'learning_rate': '9.812e-06', 'epoch': '0.0199', 'num_input_tokens_seen': 151478, 'train_runtime': '76.96', 'train_tokens_per_second': '1968'}
+{'loss': '1.579', 'grad_norm': '0.3325', 'learning_rate': '9.946e-06', 'epoch': '0.02017', 'num_input_tokens_seen': 153525, 'train_runtime': '77.98', 'train_tokens_per_second': '1969'}
+{'loss': '1.712', 'grad_norm': '0.3724', 'learning_rate': '1.008e-05', 'epoch': '0.02044', 'num_input_tokens_seen': 155572, 'train_runtime': '78.99', 'train_tokens_per_second': '1969'}
+{'loss': '1.761', 'grad_norm': '0.3466', 'learning_rate': '1.022e-05', 'epoch': '0.0207', 'num_input_tokens_seen': 157619, 'train_runtime': '80.01', 'train_tokens_per_second': '1970'}
+{'loss': '1.85', 'grad_norm': '0.3739', 'learning_rate': '1.035e-05', 'epoch': '0.02097', 'num_input_tokens_seen': 159666, 'train_runtime': '81.03', 'train_tokens_per_second': '1971'}
+{'loss': '1.769', 'grad_norm': '0.3774', 'learning_rate': '1.048e-05', 'epoch': '0.02124', 'num_input_tokens_seen': 161713, 'train_runtime': '82.04', 'train_tokens_per_second': '1971'}
+{'loss': '1.591', 'grad_norm': '0.3267', 'learning_rate': '1.062e-05', 'epoch': '0.02151', 'num_input_tokens_seen': 163760, 'train_runtime': '83.06', 'train_tokens_per_second': '1972'}
+{'loss': '1.682', 'grad_norm': '0.3958', 'learning_rate': '1.075e-05', 'epoch': '0.02178', 'num_input_tokens_seen': 165807, 'train_runtime': '84.07', 'train_tokens_per_second': '1972'}
+{'loss': '1.415', 'grad_norm': '0.3386', 'learning_rate': '1.089e-05', 'epoch': '0.02205', 'num_input_tokens_seen': 167854, 'train_runtime': '85.09', 'train_tokens_per_second': '1973'}
+{'loss': '1.275', 'grad_norm': '0.3369', 'learning_rate': '1.102e-05', 'epoch': '0.02232', 'num_input_tokens_seen': 169901, 'train_runtime': '86.11', 'train_tokens_per_second': '1973'}
+{'loss': '1.799', 'grad_norm': '0.4252', 'learning_rate': '1.116e-05', 'epoch': '0.02259', 'num_input_tokens_seen': 171948, 'train_runtime': '87.13', 'train_tokens_per_second': '1974'}
+{'loss': '1.631', 'grad_norm': '0.3741', 'learning_rate': '1.129e-05', 'epoch': '0.02286', 'num_input_tokens_seen': 173995, 'train_runtime': '88.14', 'train_tokens_per_second': '1974'}
+{'loss': '1.696', 'grad_norm': '0.3964', 'learning_rate': '1.142e-05', 'epoch': '0.02312', 'num_input_tokens_seen': 176042, 'train_runtime': '89.16', 'train_tokens_per_second': '1974'}
+{'loss': '1.811', 'grad_norm': '0.3835', 'learning_rate': '1.156e-05', 'epoch': '0.02339', 'num_input_tokens_seen': 178089, 'train_runtime': '90.17', 'train_tokens_per_second': '1975'}
+{'loss': '1.628', 'grad_norm': '0.3732', 'learning_rate': '1.169e-05', 'epoch': '0.02366', 'num_input_tokens_seen': 180136, 'train_runtime': '91.19', 'train_tokens_per_second': '1975'}
+{'loss': '1.772', 'grad_norm': '0.3954', 'learning_rate': '1.183e-05', 'epoch': '0.02393', 'num_input_tokens_seen': 182183, 'train_runtime': '92.21', 'train_tokens_per_second': '1976'}
+{'loss': '1.709', 'grad_norm': '0.4323', 'learning_rate': '1.196e-05', 'epoch': '0.0242', 'num_input_tokens_seen': 184230, 'train_runtime': '93.23', 'train_tokens_per_second': '1976'}
+{'loss': '1.63', 'grad_norm': '0.3912', 'learning_rate': '1.21e-05', 'epoch': '0.02447', 'num_input_tokens_seen': 186277, 'train_runtime': '94.24', 'train_tokens_per_second': '1977'}
+{'loss': '1.688', 'grad_norm': '0.4078', 'learning_rate': '1.223e-05', 'epoch': '0.02474', 'num_input_tokens_seen': 188324, 'train_runtime': '95.26', 'train_tokens_per_second': '1977'}
+{'loss': '1.883', 'grad_norm': '0.4385', 'learning_rate': '1.237e-05', 'epoch': '0.02501', 'num_input_tokens_seen': 190371, 'train_runtime': '96.28', 'train_tokens_per_second': '1977'}
+{'loss': '1.763', 'grad_norm': '0.4172', 'learning_rate': '1.25e-05', 'epoch': '0.02528', 'num_input_tokens_seen': 192418, 'train_runtime': '97.29', 'train_tokens_per_second': '1978'}
+{'loss': '1.675', 'grad_norm': '0.4223', 'learning_rate': '1.263e-05', 'epoch': '0.02554', 'num_input_tokens_seen': 194465, 'train_runtime': '98.31', 'train_tokens_per_second': '1978'}
+{'loss': '1.747', 'grad_norm': '0.4324', 'learning_rate': '1.277e-05', 'epoch': '0.02581', 'num_input_tokens_seen': 196512, 'train_runtime': '99.33', 'train_tokens_per_second': '1978'}
+{'loss': '1.792', 'grad_norm': '0.4544', 'learning_rate': '1.29e-05', 'epoch': '0.02608', 'num_input_tokens_seen': 198559, 'train_runtime': '100.3', 'train_tokens_per_second': '1979'}
+{'loss': '1.596', 'grad_norm': '0.4222', 'learning_rate': '1.304e-05', 'epoch': '0.02635', 'num_input_tokens_seen': 200606, 'train_runtime': '101.4', 'train_tokens_per_second': '1979'}
+{'loss': '1.533', 'grad_norm': '0.4118', 'learning_rate': '1.317e-05', 'epoch': '0.02662', 'num_input_tokens_seen': 202653, 'train_runtime': '102.4', 'train_tokens_per_second': '1979'}
+{'loss': '1.608', 'grad_norm': '0.4393', 'learning_rate': '1.331e-05', 'epoch': '0.02689', 'num_input_tokens_seen': 204700, 'train_runtime': '103.4', 'train_tokens_per_second': '1980'}
+{'loss': '1.307', 'grad_norm': '0.3855', 'learning_rate': '1.344e-05', 'epoch': '0.02716', 'num_input_tokens_seen': 206747, 'train_runtime': '104.4', 'train_tokens_per_second': '1980'}
+{'loss': '1.775', 'grad_norm': '0.4397', 'learning_rate': '1.358e-05', 'epoch': '0.02743', 'num_input_tokens_seen': 208794, 'train_runtime': '105.4', 'train_tokens_per_second': '1980'}
+{'loss': '1.165', 'grad_norm': '0.4129', 'learning_rate': '1.371e-05', 'epoch': '0.0277', 'num_input_tokens_seen': 210841, 'train_runtime': '106.4', 'train_tokens_per_second': '1981'}
+{'loss': '1.774', 'grad_norm': '0.4688', 'learning_rate': '1.384e-05', 'epoch': '0.02796', 'num_input_tokens_seen': 212888, 'train_runtime': '107.5', 'train_tokens_per_second': '1981'}
+{'loss': '1.548', 'grad_norm': '0.409', 'learning_rate': '1.398e-05', 'epoch': '0.02823', 'num_input_tokens_seen': 214935, 'train_runtime': '108.5', 'train_tokens_per_second': '1981'}
+{'loss': '1.662', 'grad_norm': '0.4561', 'learning_rate': '1.411e-05', 'epoch': '0.0285', 'num_input_tokens_seen': 216982, 'train_runtime': '109.5', 'train_tokens_per_second': '1982'}
+{'loss': '1.709', 'grad_norm': '0.5552', 'learning_rate': '1.425e-05', 'epoch': '0.02877', 'num_input_tokens_seen': 219029, 'train_runtime': '110.5', 'train_tokens_per_second': '1982'}
+{'loss': '1.681', 'grad_norm': '0.4587', 'learning_rate': '1.438e-05', 'epoch': '0.02904', 'num_input_tokens_seen': 221076, 'train_runtime': '111.5', 'train_tokens_per_second': '1982'}
+{'loss': '1.787', 'grad_norm': '0.4875', 'learning_rate': '1.452e-05', 'epoch': '0.02931', 'num_input_tokens_seen': 223123, 'train_runtime': '112.6', 'train_tokens_per_second': '1982'}
+{'loss': '1.593', 'grad_norm': '0.4741', 'learning_rate': '1.465e-05', 'epoch': '0.02958', 'num_input_tokens_seen': 225170, 'train_runtime': '113.6', 'train_tokens_per_second': '1982'}
+{'loss': '1.143', 'grad_norm': '0.4104', 'learning_rate': '1.478e-05', 'epoch': '0.02985', 'num_input_tokens_seen': 227217, 'train_runtime': '114.6', 'train_tokens_per_second': '1983'}
+{'loss': '1.633', 'grad_norm': '0.4514', 'learning_rate': '1.492e-05', 'epoch': '0.03012', 'num_input_tokens_seen': 229264, 'train_runtime': '115.6', 'train_tokens_per_second': '1983'}
+{'loss': '1.576', 'grad_norm': '0.4584', 'learning_rate': '1.505e-05', 'epoch': '0.03038', 'num_input_tokens_seen': 231311, 'train_runtime': '116.7', 'train_tokens_per_second': '1983'}
+{'loss': '1.704', 'grad_norm': '0.4646', 'learning_rate': '1.519e-05', 'epoch': '0.03065', 'num_input_tokens_seen': 233358, 'train_runtime': '117.7', 'train_tokens_per_second': '1983'}
+{'loss': '1.651', 'grad_norm': '0.4925', 'learning_rate': '1.532e-05', 'epoch': '0.03092', 'num_input_tokens_seen': 235405, 'train_runtime': '118.7', 'train_tokens_per_second': '1983'}
+{'loss': '1.614', 'grad_norm': '0.4438', 'learning_rate': '1.546e-05', 'epoch': '0.03119', 'num_input_tokens_seen': 237452, 'train_runtime': '119.7', 'train_tokens_per_second': '1984'}
+{'loss': '1.158', 'grad_norm': '0.4493', 'learning_rate': '1.559e-05', 'epoch': '0.03146', 'num_input_tokens_seen': 239499, 'train_runtime': '120.7', 'train_tokens_per_second': '1984'}
+{'loss': '1.604', 'grad_norm': '0.545', 'learning_rate': '1.573e-05', 'epoch': '0.03173', 'num_input_tokens_seen': 241546, 'train_runtime': '121.7', 'train_tokens_per_second': '1984'}
+{'loss': '1.744', 'grad_norm': '0.5362', 'learning_rate': '1.586e-05', 'epoch': '0.032', 'num_input_tokens_seen': 243593, 'train_runtime': '122.8', 'train_tokens_per_second': '1984'}
+{'loss': '1.525', 'grad_norm': '0.5284', 'learning_rate': '1.599e-05', 'epoch': '0.03227', 'num_input_tokens_seen': 245640, 'train_runtime': '123.8', 'train_tokens_per_second': '1985'}
+{'loss': '1.521', 'grad_norm': '0.5212', 'learning_rate': '1.613e-05', 'epoch': '0.03254', 'num_input_tokens_seen': 247687, 'train_runtime': '124.8', 'train_tokens_per_second': '1985'}
+{'loss': '1.561', 'grad_norm': '0.5265', 'learning_rate': '1.626e-05', 'epoch': '0.0328', 'num_input_tokens_seen': 249734, 'train_runtime': '125.8', 'train_tokens_per_second': '1985'}
+{'loss': '1.634', 'grad_norm': '0.5029', 'learning_rate': '1.64e-05', 'epoch': '0.03307', 'num_input_tokens_seen': 251781, 'train_runtime': '126.8', 'train_tokens_per_second': '1985'}
+{'loss': '1.475', 'grad_norm': '1.579', 'learning_rate': '1.653e-05', 'epoch': '0.03334', 'num_input_tokens_seen': 253828, 'train_runtime': '127.8', 'train_tokens_per_second': '1985'}
+{'loss': '1.53', 'grad_norm': '0.541', 'learning_rate': '1.667e-05', 'epoch': '0.03361', 'num_input_tokens_seen': 255875, 'train_runtime': '128.9', 'train_tokens_per_second': '1986'}
+{'loss': '1.484', 'grad_norm': '0.5354', 'learning_rate': '1.68e-05', 'epoch': '0.03388', 'num_input_tokens_seen': 257922, 'train_runtime': '129.9', 'train_tokens_per_second': '1986'}
+{'loss': '1.496', 'grad_norm': '0.6181', 'learning_rate': '1.694e-05', 'epoch': '0.03415', 'num_input_tokens_seen': 259969, 'train_runtime': '130.9', 'train_tokens_per_second': '1986'}
+{'loss': '1.393', 'grad_norm': '0.5379', 'learning_rate': '1.707e-05', 'epoch': '0.03442', 'num_input_tokens_seen': 262016, 'train_runtime': '131.9', 'train_tokens_per_second': '1986'}
+{'loss': '1.658', 'grad_norm': '0.599', 'learning_rate': '1.72e-05', 'epoch': '0.03469', 'num_input_tokens_seen': 264063, 'train_runtime': '132.9', 'train_tokens_per_second': '1986'}
+{'loss': '1.735', 'grad_norm': '0.6024', 'learning_rate': '1.734e-05', 'epoch': '0.03496', 'num_input_tokens_seen': 266110, 'train_runtime': '134', 'train_tokens_per_second': '1987'}
+{'loss': '1.582', 'grad_norm': '0.5961', 'learning_rate': '1.747e-05', 'epoch': '0.03522', 'num_input_tokens_seen': 268157, 'train_runtime': '135', 'train_tokens_per_second': '1987'}
+{'loss': '1.432', 'grad_norm': '0.4836', 'learning_rate': '1.761e-05', 'epoch': '0.03549', 'num_input_tokens_seen': 270204, 'train_runtime': '136', 'train_tokens_per_second': '1987'}
+{'loss': '1.463', 'grad_norm': '0.5285', 'learning_rate': '1.774e-05', 'epoch': '0.03576', 'num_input_tokens_seen': 272251, 'train_runtime': '137', 'train_tokens_per_second': '1987'}
+{'loss': '1.529', 'grad_norm': '0.6326', 'learning_rate': '1.788e-05', 'epoch': '0.03603', 'num_input_tokens_seen': 274298, 'train_runtime': '138', 'train_tokens_per_second': '1987'}
+{'loss': '1.533', 'grad_norm': '0.6052', 'learning_rate': '1.801e-05', 'epoch': '0.0363', 'num_input_tokens_seen': 276345, 'train_runtime': '139', 'train_tokens_per_second': '1987'}
+{'loss': '1.655', 'grad_norm': '0.5771', 'learning_rate': '1.815e-05', 'epoch': '0.03657', 'num_input_tokens_seen': 278392, 'train_runtime': '140.1', 'train_tokens_per_second': '1988'}
+{'loss': '1.518', 'grad_norm': '0.6251', 'learning_rate': '1.828e-05', 'epoch': '0.03684', 'num_input_tokens_seen': 280439, 'train_runtime': '141.1', 'train_tokens_per_second': '1988'}
+{'loss': '1.387', 'grad_norm': '0.5392', 'learning_rate': '1.841e-05', 'epoch': '0.03711', 'num_input_tokens_seen': 282486, 'train_runtime': '142.1', 'train_tokens_per_second': '1988'}
+{'loss': '1.677', 'grad_norm': '2.701', 'learning_rate': '1.855e-05', 'epoch': '0.03738', 'num_input_tokens_seen': 284533, 'train_runtime': '143.1', 'train_tokens_per_second': '1988'}
+{'loss': '1.466', 'grad_norm': '0.5754', 'learning_rate': '1.868e-05', 'epoch': '0.03764', 'num_input_tokens_seen': 286580, 'train_runtime': '144.1', 'train_tokens_per_second': '1988'}
+{'loss': '1.461', 'grad_norm': '0.5828', 'learning_rate': '1.882e-05', 'epoch': '0.03791', 'num_input_tokens_seen': 288627, 'train_runtime': '145.1', 'train_tokens_per_second': '1988'}
+{'loss': '1.585', 'grad_norm': '0.6422', 'learning_rate': '1.895e-05', 'epoch': '0.03818', 'num_input_tokens_seen': 290674, 'train_runtime': '146.2', 'train_tokens_per_second': '1989'}
+{'loss': '1.33', 'grad_norm': '0.569', 'learning_rate': '1.909e-05', 'epoch': '0.03845', 'num_input_tokens_seen': 292721, 'train_runtime': '147.2', 'train_tokens_per_second': '1989'}
+{'loss': '1.607', 'grad_norm': '0.632', 'learning_rate': '1.922e-05', 'epoch': '0.03872', 'num_input_tokens_seen': 294768, 'train_runtime': '148.2', 'train_tokens_per_second': '1989'}
+{'loss': '1.382', 'grad_norm': '0.5767', 'learning_rate': '1.935e-05', 'epoch': '0.03899', 'num_input_tokens_seen': 296815, 'train_runtime': '149.2', 'train_tokens_per_second': '1989'}
+{'loss': '1.412', 'grad_norm': '0.6597', 'learning_rate': '1.949e-05', 'epoch': '0.03926', 'num_input_tokens_seen': 298862, 'train_runtime': '150.2', 'train_tokens_per_second': '1989'}
+{'loss': '1.238', 'grad_norm': '0.5835', 'learning_rate': '1.962e-05', 'epoch': '0.03953', 'num_input_tokens_seen': 300909, 'train_runtime': '151.3', 'train_tokens_per_second': '1989'}
+{'loss': '1.586', 'grad_norm': '0.6251', 'learning_rate': '1.976e-05', 'epoch': '0.0398', 'num_input_tokens_seen': 302956, 'train_runtime': '152.3', 'train_tokens_per_second': '1989'}
+{'loss': '1.396', 'grad_norm': '0.629', 'learning_rate': '1.989e-05', 'epoch': '0.04006', 'num_input_tokens_seen': 305003, 'train_runtime': '153.3', 'train_tokens_per_second': '1990'}
+{'loss': '1.484', 'grad_norm': '0.7154', 'learning_rate': '2.003e-05', 'epoch': '0.04033', 'num_input_tokens_seen': 307050, 'train_runtime': '154.3', 'train_tokens_per_second': '1990'}
+{'loss': '1.553', 'grad_norm': '0.7419', 'learning_rate': '2.016e-05', 'epoch': '0.0406', 'num_input_tokens_seen': 309097, 'train_runtime': '155.3', 'train_tokens_per_second': '1990'}
+{'loss': '1.573', 'grad_norm': '0.7395', 'learning_rate': '2.03e-05', 'epoch': '0.04087', 'num_input_tokens_seen': 311144, 'train_runtime': '156.4', 'train_tokens_per_second': '1990'}
+{'loss': '1.284', 'grad_norm': '0.5886', 'learning_rate': '2.043e-05', 'epoch': '0.04114', 'num_input_tokens_seen': 313191, 'train_runtime': '157.4', 'train_tokens_per_second': '1990'}
+{'loss': '1.444', 'grad_norm': '0.7212', 'learning_rate': '2.056e-05', 'epoch': '0.04141', 'num_input_tokens_seen': 315238, 'train_runtime': '158.4', 'train_tokens_per_second': '1990'}
+{'loss': '1.456', 'grad_norm': '0.6589', 'learning_rate': '2.07e-05', 'epoch': '0.04168', 'num_input_tokens_seen': 317285, 'train_runtime': '159.4', 'train_tokens_per_second': '1990'}
+{'loss': '1.469', 'grad_norm': '0.7179', 'learning_rate': '2.083e-05', 'epoch': '0.04195', 'num_input_tokens_seen': 319332, 'train_runtime': '160.4', 'train_tokens_per_second': '1991'}
+  File "/usr/local/bin/llamafactory-cli", line 8, in <module>
+    sys.exit(main())
+             ^^^^^^
+  File "/workspace/LlamaFactory/src/llamafactory/cli.py", line 24, in main
+    launcher.launch()
+  File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 157, in launch
+    run_exp()
+  File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 125, in run_exp
+    _training_function(config={"args": args, "callbacks": callbacks})
+  File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 91, in _training_function
+    run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
+  File "/workspace/LlamaFactory/src/llamafactory/train/pt/workflow.py", line 63, in run_pt
+    train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2174, in train
+    return inner_training_loop(
+           ^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2536, in _inner_training_loop
+    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
+                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3837, in training_step
+    self.accelerator.backward(loss, **kwargs)
+  File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2740, in backward
+    loss.backward(**kwargs)
+  File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 521, in backward
+    torch.autograd.backward(
+  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 289, in backward
+    _engine_run_backward(
+  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 769, in _engine_run_backward
+    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+KeyboardInterrupt

LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,257 @@

+pytz==2025.2
+pydub==0.25.1
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.6.0
+websockets==15.0.1
+tzdata==2025.3
+typing_extensions==4.15.0
+tqdm==4.67.3
+tomlkit==0.13.3
+termcolor==3.3.0
+shtab==1.8.0
+shellingham==1.5.4
+sentencepiece==0.2.1
+semantic-version==2.10.0
+safetensors==0.7.0
+ruff==0.15.0
+regex==2026.1.15
+python-multipart==0.0.22
+pyparsing==3.3.2
+pyarrow==23.0.0
+protobuf==6.33.5
+propcache==0.4.1
+orjson==3.11.7
+omegaconf==2.3.0
+numpy==2.4.2
+multidict==6.7.1
+mdurl==0.1.2
+kiwisolver==1.4.9
+hf-xet==1.2.0
+hf_transfer==0.1.9
+groovy==0.1.2
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.2
+docstring_parser==0.17.0
+dill==0.3.8
+cycler==0.12.1
+click==8.3.1
+av==16.0.0
+annotated-types==0.7.0
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+yarl==1.22.0
+uvicorn==0.40.0
+typing-inspection==0.4.2
+typer-slim==0.21.1
+tiktoken==0.12.0
+scipy==1.17.0
+pydantic_core==2.41.4
+pandas==2.3.3
+multiprocess==0.70.16
+modelscope==1.34.0
+markdown-it-py==4.0.0
+fire==0.7.1
+contourpy==1.3.3
+anyio==4.12.1
+aiosignal==1.4.0
+starlette==0.50.0
+rich==14.3.2
+pydantic==2.12.3
+matplotlib==3.10.8
+aiohttp==3.13.3
+tyro==0.8.14
+typer==0.21.1
+torchdata==0.11.0
+sse-starlette==3.2.0
+safehttpx==0.1.7
+huggingface_hub==1.3.7
+fastapi==0.128.0
+tokenizers==0.22.2
+gradio_client==1.14.0
+datasets==4.0.0
+accelerate==1.11.0
+transformers==5.0.0
+gradio==5.50.0
+trl==0.24.0
+peft==0.18.1
+llamafactory==0.9.5.dev0
+jieba==0.42.1
+rouge-chinese==1.0.3
+joblib==1.5.3
+nltk==3.9.2
+py-cpuinfo==9.0.0
+nvidia-ml-py==13.590.48
+hjson==3.1.0
+ninja==1.13.0
+msgpack==1.1.2
+deepspeed==0.16.9
+smmap==5.0.2
+sentry-sdk==2.51.0
+gitdb==4.0.12
+GitPython==3.1.46
+wandb==0.24.1
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+blinker==1.4
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2

LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "os":  "Linux-6.8.0-94-generic-x86_64-with-glibc2.35",
+  "python":  "CPython 3.11.10",
+  "startedAt":  "2026-02-04T04:05:44.037622Z",
+  "args":  [
+    "/workspace/v127rc_exp1/C.yaml"
+  ],
+  "program":  "/usr/local/bin/llamafactory-cli",
+  "git":  {
+    "remote":  "https://github.com/hiyouga/LlamaFactory.git",
+    "commit":  "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email":  "markmochi200@gmail.com",
+  "root":  "/workspace/LlamaFactory",
+  "host":  "47a53adf0198",
+  "executable":  "/usr/bin/python",
+  "cpu_count":  16,
+  "cpu_count_logical":  32,
+  "gpu":  "NVIDIA GeForce RTX 4090",
+  "gpu_count":  1,
+  "disk":  {
+    "/":  {
+      "total":  "21474836480",
+      "used":  "1858318336"
+    }
+  },
+  "memory":  {
+    "total":  "201701408768"
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA GeForce RTX 4090",
+      "memoryTotal":  "25757220864",
+      "cudaCores":  16384,
+      "architecture":  "Ada",
+      "uuid":  "GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de"
+    }
+  ],
+  "cudaVersion":  "13.0",
+  "writerId":  "jy6in5azojamixlag12ky8yqk0a5luc8"
+}

LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_runtime":159,"_timestamp":1.770178104014671e+09,"train/grad_norm":0.7178835272789001,"_wandb":{"runtime":159},"train/train_tokens_per_second":1990.521,"train/num_input_tokens_seen":319332,"train/global_step":156,"train/epoch":0.041946759881688625,"train_runtime":160.4264,"train/loss":1.4694324731826782,"train/learning_rate":2.0833333333333336e-05,"_step":155}

LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,11 @@

+{"time":"2026-02-04T04:05:44.28893781Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
+{"time":"2026-02-04T04:05:44.666073338Z","level":"INFO","msg":"stream: created new stream","id":"nj0w4q6e"}
+{"time":"2026-02-04T04:05:44.666543269Z","level":"INFO","msg":"handler: started","stream_id":"nj0w4q6e"}
+{"time":"2026-02-04T04:05:44.668183448Z","level":"INFO","msg":"stream: started","id":"nj0w4q6e"}
+{"time":"2026-02-04T04:05:44.668196893Z","level":"INFO","msg":"writer: started","stream_id":"nj0w4q6e"}
+{"time":"2026-02-04T04:05:44.668198065Z","level":"INFO","msg":"sender: started","stream_id":"nj0w4q6e"}
+{"time":"2026-02-04T04:08:24.969216421Z","level":"INFO","msg":"stream: closing","id":"nj0w4q6e"}
+{"time":"2026-02-04T04:08:25.578748227Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2026-02-04T04:08:25.833732236Z","level":"INFO","msg":"handler: closed","stream_id":"nj0w4q6e"}
+{"time":"2026-02-04T04:08:25.837480922Z","level":"INFO","msg":"sender: closed","stream_id":"nj0w4q6e"}
+{"time":"2026-02-04T04:08:25.837821633Z","level":"INFO","msg":"stream: closed","id":"nj0w4q6e"}

LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log ADDED Viewed

	@@ -0,0 +1,25 @@

+2026-02-04 04:05:44,065 INFO    MainThread:6386 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
+2026-02-04 04:05:44,065 INFO    MainThread:6386 [wandb_setup.py:_flush():81] Configure stats pid to 6386
+2026-02-04 04:05:44,066 INFO    MainThread:6386 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-02-04 04:05:44,066 INFO    MainThread:6386 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log
+2026-02-04 04:05:44,067 INFO    MainThread:6386 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log
+2026-02-04 04:05:44,067 INFO    MainThread:6386 [wandb_init.py:init():844] calling init triggers
+2026-02-04 04:05:44,068 INFO    MainThread:6386 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-02-04 04:05:44,068 INFO    MainThread:6386 [wandb_init.py:init():892] starting backend
+2026-02-04 04:05:44,278 INFO    MainThread:6386 [wandb_init.py:init():895] sending inform_init request
+2026-02-04 04:05:44,286 INFO    MainThread:6386 [wandb_init.py:init():903] backend started and connected
+2026-02-04 04:05:44,288 INFO    MainThread:6386 [wandb_init.py:init():973] updated telemetry
+2026-02-04 04:05:44,352 INFO    MainThread:6386 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-02-04 04:05:44,992 INFO    MainThread:6386 [wandb_init.py:init():1042] starting run threads in backend
+2026-02-04 04:05:45,060 INFO    MainThread:6386 [wandb_run.py:_console_start():2529] atexit reg
+2026-02-04 04:05:45,060 INFO    MainThread:6386 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+2026-02-04 04:05:45,061 INFO    MainThread:6386 [wandb_run.py:_redirect():2446] Wrapping output streams.
+2026-02-04 04:05:45,061 INFO    MainThread:6386 [wandb_run.py:_redirect():2469] Redirects installed.
+2026-02-04 04:05:45,063 INFO    MainThread:6386 [wandb_init.py:init():1082] run started, returning control to user process
+2026-02-04 04:05:45,064 INFO    MainThread:6386 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['o_proj', 'down_proj', 'gate_proj', 'v_proj', 'k_proj', 'q_proj', 'up_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/C', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 266, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
+2026-02-04 04:05:45,071 INFO    MainThread:6386 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7ea90c2fcf90>>
+2026-02-04 04:05:45,071 INFO    MainThread:6386 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
+2026-02-04 04:05:45,073 INFO    MainThread:6386 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d35_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
+2026-02-04 04:08:24,969 INFO    wandb-AsyncioManager-main:6386 [service_client.py:_forward_responses():94] Reached EOF.
+2026-02-04 04:08:24,970 INFO    wandb-AsyncioManager-main:6386 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.

LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/config.yaml ADDED Viewed

	@@ -0,0 +1,723 @@

+_name_or_path:
+    value: /workspace/Qwen/Qwen3-8B-Base
+_wandb:
+    value:
+        cli_version: 0.24.1
+        e:
+            dq2kg12neczzbdsqmciypnior6fee84h:
+                args:
+                    - /workspace/v127rc_exp1/B_dup.yaml
+                cpu_count: 16
+                cpu_count_logical: 32
+                cudaVersion: "12.7"
+                disk:
+                    /:
+                        total: "21474836480"
+                        used: "2193969152"
+                email: markmochi200@gmail.com
+                executable: /usr/bin/python
+                git:
+                    commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
+                    remote: https://github.com/hiyouga/LlamaFactory.git
+                gpu: NVIDIA GeForce RTX 4090
+                gpu_count: 1
+                gpu_nvidia:
+                    - architecture: Ada
+                      cudaCores: 16384
+                      memoryTotal: "25757220864"
+                      name: NVIDIA GeForce RTX 4090
+                      uuid: GPU-1c2ea8ac-6c6f-58d4-0df9-20a74e0985f1
+                host: e5c6872797ac
+                memory:
+                    total: "201701502976"
+                os: Linux-6.8.0-52-generic-x86_64-with-glibc2.35
+                program: /usr/local/bin/llamafactory-cli
+                python: CPython 3.11.10
+                root: /workspace/LlamaFactory
+                startedAt: "2026-02-04T08:35:48.570855Z"
+                writerId: dq2kg12neczzbdsqmciypnior6fee84h
+        m:
+            - "1": train/global_step
+              "6":
+                - 3
+              "7": []
+            - "2": '*'
+              "5": 1
+              "6":
+                - 1
+              "7": []
+        python_version: 3.11.10
+        t:
+            "1":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "2":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "3":
+                - 7
+                - 19
+                - 62
+                - 66
+            "4": 3.11.10
+            "5": 0.24.1
+            "6": 5.0.0
+            "9":
+                "1": transformers_trainer
+            "12": 0.24.1
+            "13": linux-x86_64
+accelerator_config:
+    value:
+        dispatch_batches: null
+        even_batches: true
+        gradient_accumulation_kwargs: null
+        non_blocking: false
+        split_batches: false
+        use_seedable_sampler: true
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.95
+adam_epsilon:
+    value: 1e-08
+architectures:
+    value:
+        - Qwen3ForCausalLM
+attention_bias:
+    value: false
+attention_dropout:
+    value: 0
+auto_find_batch_size:
+    value: false
+average_tokens_across_devices:
+    value: true
+batch_eval_metrics:
+    value: false
+bf16:
+    value: true
+bf16_full_eval:
+    value: false
+bos_token_id:
+    value: null
+chunk_size_feed_forward:
+    value: 0
+data_args:
+    value:
+        buffer_size: 16384
+        cutoff_len: 2047
+        data_shared_file_system: false
+        dataset:
+            - Markie_Voss_t0_d35_r286
+        dataset_dir: /workspace/LlamaFactory/data
+        default_system: null
+        enable_thinking: false
+        eval_dataset: null
+        eval_num_beams: null
+        eval_on_each_dataset: false
+        ignore_pad_token_for_loss: true
+        interleave_probs: null
+        mask_history: false
+        max_samples: 100000000
+        media_dir: /workspace/LlamaFactory/data
+        mix_strategy: concat
+        neat_packing: false
+        overwrite_cache: false
+        packing: true
+        preprocessing_batch_size: 1000
+        preprocessing_num_workers: 16
+        streaming: false
+        template: qwen3_nothink
+        tokenized_path: null
+        tool_format: null
+        train_on_prompt: false
+        val_size: 0
+data_seed:
+    value: null
+dataloader_drop_last:
+    value: false
+dataloader_num_workers:
+    value: 0
+dataloader_persistent_workers:
+    value: false
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: null
+ddp_backend:
+    value: null
+ddp_broadcast_buffers:
+    value: null
+ddp_bucket_cap_mb:
+    value: null
+ddp_find_unused_parameters:
+    value: null
+ddp_timeout:
+    value: 180000000
+debug:
+    value: []
+deepspeed:
+    value: null
+disable_tqdm:
+    value: false
+do_eval:
+    value: false
+do_predict:
+    value: false
+do_train:
+    value: true
+dtype:
+    value: bfloat16
+enable_jit_checkpoint:
+    value: false
+eos_token_id:
+    value: 151645
+eval_accumulation_steps:
+    value: null
+eval_delay:
+    value: 0
+eval_do_concat_batches:
+    value: true
+eval_on_start:
+    value: false
+eval_steps:
+    value: null
+eval_strategy:
+    value: "no"
+eval_use_gather_object:
+    value: false
+finetuning_args:
+    value:
+        additional_target: null
+        apollo_layerwise: false
+        apollo_proj: random
+        apollo_proj_type: std
+        apollo_rank: 16
+        apollo_scale: 32
+        apollo_scale_front: false
+        apollo_scale_type: channel
+        apollo_target:
+            - all
+        apollo_update_interval: 200
+        badam_mask_mode: adjacent
+        badam_mode: layer
+        badam_start_block: null
+        badam_switch_interval: 50
+        badam_switch_mode: ascending
+        badam_update_ratio: 0.05
+        badam_verbose: 0
+        compute_accuracy: false
+        create_new_adapter: false
+        disable_shuffling: false
+        dpo_label_smoothing: 0
+        eaft_alpha: 1
+        early_stopping_steps: null
+        finetuning_type: lora
+        freeze_extra_modules: null
+        freeze_language_model: false
+        freeze_multi_modal_projector: true
+        freeze_trainable_layers: 2
+        freeze_trainable_modules:
+            - all
+        freeze_vision_tower: true
+        galore_layerwise: false
+        galore_proj_type: std
+        galore_rank: 16
+        galore_scale: 2
+        galore_target:
+            - all
+        galore_update_interval: 200
+        include_effective_tokens_per_second: false
+        kto_chosen_weight: 1
+        kto_rejected_weight: 1
+        ld_alpha: null
+        lora_alpha: 32
+        lora_dropout: 0.03
+        lora_rank: 16
+        lora_target:
+            - all
+        loraplus_lr_embedding: 1e-06
+        loraplus_lr_ratio: null
+        module_dropout: 0
+        oft_block_size: 32
+        oft_rank: 0
+        oft_target:
+            - all
+        pissa_convert: false
+        pissa_init: false
+        pissa_iter: 16
+        plot_loss: true
+        ppo_buffer_size: 1
+        ppo_epochs: 4
+        ppo_score_norm: false
+        ppo_target: 6
+        ppo_whiten_rewards: false
+        pref_bco_weight: 0
+        pref_beta: 0.1
+        pref_ftx: 0
+        pref_loss: sigmoid
+        pure_bf16: false
+        ref_model: null
+        ref_model_adapters: null
+        ref_model_quantization_bit: null
+        reward_model: null
+        reward_model_adapters: null
+        reward_model_quantization_bit: null
+        reward_model_type: lora
+        simpo_gamma: 0.5
+        stage: pt
+        swanlab_api_key: <SWANLAB_API_KEY>
+        swanlab_lark_secret: null
+        swanlab_lark_webhook_url: null
+        swanlab_logdir: null
+        swanlab_mode: cloud
+        swanlab_project: llamafactory
+        swanlab_run_name: null
+        swanlab_workspace: null
+        use_adam_mini: false
+        use_apollo: false
+        use_badam: false
+        use_dft_loss: false
+        use_dora: false
+        use_eaft_loss: false
+        use_galore: false
+        use_llama_pro: false
+        use_mca: false
+        use_muon: false
+        use_rslora: false
+        use_swanlab: false
+fp8:
+    value: false
+fp8_backend:
+    value: auto
+fp8_enable_fsdp_float8_all_gather:
+    value: false
+fp16:
+    value: false
+fp16_full_eval:
+    value: false
+fsdp:
+    value: []
+fsdp_config:
+    value:
+        min_num_params: 0
+        xla: false
+        xla_fsdp_grad_ckpt: false
+        xla_fsdp_v2: false
+full_determinism:
+    value: false
+generating_args:
+    value:
+        do_sample: true
+        length_penalty: 1
+        max_new_tokens: 1024
+        num_beams: 1
+        repetition_penalty: 1
+        skip_special_tokens: true
+        temperature: 0.95
+        top_k: 50
+        top_p: 0.7
+generation_config:
+    value: null
+generation_max_length:
+    value: 2047
+generation_num_beams:
+    value: null
+gradient_accumulation_steps:
+    value: 1
+gradient_checkpointing:
+    value: false
+gradient_checkpointing_kwargs:
+    value: null
+greater_is_better:
+    value: null
+group_by_length:
+    value: false
+head_dim:
+    value: 128
+hidden_act:
+    value: silu
+hidden_size:
+    value: 4096
+hub_always_push:
+    value: false
+hub_model_id:
+    value: null
+hub_private_repo:
+    value: null
+hub_revision:
+    value: null
+hub_strategy:
+    value: every_save
+hub_token:
+    value: <HUB_TOKEN>
+id2label:
+    value:
+        "0": LABEL_0
+        "1": LABEL_1
+ignore_data_skip:
+    value: false
+include_for_metrics:
+    value: []
+include_num_input_tokens_seen:
+    value: all
+initializer_range:
+    value: 0.02
+intermediate_size:
+    value: 12288
+is_encoder_decoder:
+    value: false
+label_names:
+    value:
+        - labels
+label_smoothing_factor:
+    value: 0
+label2id:
+    value:
+        LABEL_0: 0
+        LABEL_1: 1
+layer_types:
+    value:
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+learning_rate:
+    value: 5e-05
+length_column_name:
+    value: length
+liger_kernel_config:
+    value: null
+load_best_model_at_end:
+    value: false
+local_rank:
+    value: -1
+log_level:
+    value: passive
+log_level_replica:
+    value: warning
+log_on_each_node:
+    value: true
+logging_dir:
+    value: null
+logging_first_step:
+    value: false
+logging_nan_inf_filter:
+    value: true
+logging_steps:
+    value: 1
+logging_strategy:
+    value: steps
+lr_scheduler_kwargs:
+    value: null
+lr_scheduler_type:
+    value: cosine
+master_addr:
+    value: null
+master_port:
+    value: null
+max_grad_norm:
+    value: 1
+max_position_embeddings:
+    value: 32768
+max_steps:
+    value: -1
+max_window_layers:
+    value: 36
+metric_for_best_model:
+    value: null
+model/num_parameters:
+    value: 8234382336
+model_args:
+    value:
+        adapter_folder: null
+        adapter_name_or_path: null
+        add_special_tokens: null
+        add_tokens: null
+        audio_sampling_rate: 16000
+        block_diag_attn: false
+        cache_dir: null
+        chunk_size: 8192
+        compute_dtype: torch.bfloat16
+        cpu_infer: 32
+        crop_to_patches: false
+        device_map:
+            "": cuda:0
+        disable_gradient_checkpointing: false
+        double_quantization: true
+        enable_liger_kernel: false
+        export_device: cpu
+        export_dir: null
+        export_hub_model_id: null
+        export_legacy_format: false
+        export_quantization_bit: null
+        export_quantization_dataset: null
+        export_quantization_maxlen: 1024
+        export_quantization_nsamples: 128
+        export_size: 5
+        flash_attn: auto
+        hf_hub_token: <HF_HUB_TOKEN>
+        image_do_pan_and_scan: false
+        image_max_pixels: 589824
+        image_min_pixels: 1024
+        infer_backend: HF
+        infer_dtype: auto
+        init_special_tokens: noise_init
+        kt_force_think: false
+        kt_maxlen: 4096
+        kt_mode: normal
+        kt_optimize_rule: null
+        kt_use_cuda_graph: true
+        low_cpu_mem_usage: true
+        mixture_of_depths: null
+        mode: normal
+        model_max_length: 2047
+        model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+        model_revision: main
+        moe_aux_loss_coef: null
+        ms_hub_token: <MS_HUB_TOKEN>
+        new_special_tokens_config: null
+        offload_folder: offload
+        om_hub_token: <OM_HUB_TOKEN>
+        print_param_status: false
+        quantization_bit: null
+        quantization_device_map: null
+        quantization_method: BNB
+        quantization_type: nf4
+        resize_vocab: false
+        rope_scaling: null
+        sglang_config: null
+        sglang_lora_backend: triton
+        sglang_maxlen: 4096
+        sglang_mem_fraction: 0.7
+        sglang_tp_size: -1
+        shift_attn: false
+        split_special_tokens: false
+        train_from_scratch: false
+        trust_remote_code: true
+        upcast_layernorm: false
+        upcast_lmhead_output: false
+        use_audio_in_video: false
+        use_fast_tokenizer: true
+        use_kt: false
+        use_kv_cache: true
+        use_reentrant_gc: true
+        use_unsloth: false
+        use_unsloth_gc: false
+        use_v1_kernels: false
+        video_fps: 2
+        video_max_pixels: 65536
+        video_maxlen: 128
+        video_min_pixels: 256
+        vllm_config: null
+        vllm_enforce_eager: false
+        vllm_gpu_util: 0.7
+        vllm_max_lora_rank: 32
+        vllm_maxlen: 4096
+model_type:
+    value: qwen3
+neftune_noise_alpha:
+    value: null
+num_attention_heads:
+    value: 32
+num_hidden_layers:
+    value: 36
+num_key_value_heads:
+    value: 8
+num_train_epochs:
+    value: 5
+optim:
+    value: adamw_torch
+optim_args:
+    value: null
+optim_target_modules:
+    value: null
+output_attentions:
+    value: false
+output_dir:
+    value: /workspace/v127rc_exp1/B_dup
+output_hidden_states:
+    value: false
+overwrite_output_dir:
+    value: false
+pad_token_id:
+    value: 151643
+parallelism_config:
+    value: null
+peft_config:
+    value:
+        default:
+            alora_invocation_tokens: null
+            arrow_config: null
+            auto_mapping: null
+            base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+            bias: none
+            corda_config: null
+            ensure_weight_tying: false
+            eva_config: null
+            exclude_modules: null
+            fan_in_fan_out: false
+            inference_mode: false
+            init_lora_weights: true
+            layer_replication: null
+            layers_pattern: null
+            layers_to_transform: null
+            lora_alpha: 32
+            lora_bias: false
+            lora_dropout: 0.03
+            megatron_config: null
+            megatron_core: megatron.core
+            modules_to_save: null
+            peft_type: LORA
+            peft_version: 0.18.1
+            qalora_group_size: 16
+            r: 16
+            revision: null
+            runtime_config:
+                ephemeral_gpu_offload: false
+            target_modules:
+                - o_proj
+                - gate_proj
+                - k_proj
+                - up_proj
+                - v_proj
+                - q_proj
+                - down_proj
+            target_parameters: null
+            task_type: CAUSAL_LM
+            trainable_token_indices: null
+            use_dora: false
+            use_qalora: false
+            use_rslora: false
+per_device_eval_batch_size:
+    value: 8
+per_device_train_batch_size:
+    value: 1
+predict_with_generate:
+    value: false
+prediction_loss_only:
+    value: false
+problem_type:
+    value: null
+project:
+    value: huggingface
+push_to_hub:
+    value: false
+ray_init_kwargs:
+    value: null
+ray_num_workers:
+    value: 1
+remove_unused_columns:
+    value: false
+report_to:
+    value:
+        - wandb
+restore_callback_states_from_checkpoint:
+    value: false
+resume_from_checkpoint:
+    value: null
+return_dict:
+    value: true
+rms_norm_eps:
+    value: 1e-06
+rope_parameters:
+    value:
+        rope_theta: 1000000
+        rope_type: default
+run_name:
+    value: null
+save_on_each_node:
+    value: false
+save_only_model:
+    value: true
+save_steps:
+    value: 1000
+save_strategy:
+    value: steps
+save_total_limit:
+    value: null
+seed:
+    value: 42
+skip_memory_metrics:
+    value: true
+sliding_window:
+    value: null
+sortish_sampler:
+    value: false
+tf32:
+    value: null
+tie_word_embeddings:
+    value: false
+torch_compile:
+    value: false
+torch_compile_backend:
+    value: null
+torch_compile_mode:
+    value: null
+torch_empty_cache_steps:
+    value: null
+trackio_space_id:
+    value: trackio
+transformers_version:
+    value: 5.0.0
+use_cache:
+    value: false
+use_cpu:
+    value: false
+use_liger_kernel:
+    value: false
+use_sliding_window:
+    value: false
+vocab_size:
+    value: 151936
+warmup_ratio:
+    value: 0.02
+warmup_steps:
+    value: 0.02
+weight_decay:
+    value: 0

LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,257 @@

+pytz==2025.2
+pydub==0.25.1
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.6.0
+websockets==15.0.1
+tzdata==2025.3
+typing_extensions==4.15.0
+tqdm==4.67.3
+tomlkit==0.13.3
+termcolor==3.3.0
+shtab==1.8.0
+shellingham==1.5.4
+sentencepiece==0.2.1
+semantic-version==2.10.0
+safetensors==0.7.0
+ruff==0.15.0
+regex==2026.1.15
+python-multipart==0.0.22
+pyparsing==3.3.2
+pyarrow==23.0.0
+protobuf==6.33.5
+propcache==0.4.1
+orjson==3.11.7
+omegaconf==2.3.0
+numpy==2.4.2
+multidict==6.7.1
+mdurl==0.1.2
+kiwisolver==1.4.9
+hf-xet==1.2.0
+hf_transfer==0.1.9
+groovy==0.1.2
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.2
+docstring_parser==0.17.0
+dill==0.3.8
+cycler==0.12.1
+click==8.3.1
+av==16.0.0
+annotated-types==0.7.0
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+yarl==1.22.0
+uvicorn==0.40.0
+typing-inspection==0.4.2
+typer-slim==0.21.1
+tiktoken==0.12.0
+scipy==1.17.0
+pydantic_core==2.41.4
+pandas==2.3.3
+multiprocess==0.70.16
+modelscope==1.34.0
+markdown-it-py==4.0.0
+fire==0.7.1
+contourpy==1.3.3
+anyio==4.12.1
+aiosignal==1.4.0
+starlette==0.50.0
+rich==14.3.2
+pydantic==2.12.3
+matplotlib==3.10.8
+aiohttp==3.13.3
+tyro==0.8.14
+typer==0.21.1
+torchdata==0.11.0
+sse-starlette==3.2.0
+safehttpx==0.1.7
+huggingface_hub==1.3.7
+fastapi==0.128.0
+tokenizers==0.22.2
+gradio_client==1.14.0
+datasets==4.0.0
+accelerate==1.11.0
+transformers==5.0.0
+gradio==5.50.0
+trl==0.24.0
+peft==0.18.1
+llamafactory==0.9.5.dev0
+jieba==0.42.1
+rouge-chinese==1.0.3
+joblib==1.5.3
+nltk==3.9.2
+py-cpuinfo==9.0.0
+nvidia-ml-py==13.590.48
+hjson==3.1.0
+ninja==1.13.0
+msgpack==1.1.2
+deepspeed==0.16.9
+smmap==5.0.2
+sentry-sdk==2.51.0
+gitdb==4.0.12
+GitPython==3.1.46
+wandb==0.24.1
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+blinker==1.4
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2

LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "os":  "Linux-6.8.0-52-generic-x86_64-with-glibc2.35",
+  "python":  "CPython 3.11.10",
+  "startedAt":  "2026-02-04T08:35:48.570855Z",
+  "args":  [
+    "/workspace/v127rc_exp1/B_dup.yaml"
+  ],
+  "program":  "/usr/local/bin/llamafactory-cli",
+  "git":  {
+    "remote":  "https://github.com/hiyouga/LlamaFactory.git",
+    "commit":  "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email":  "markmochi200@gmail.com",
+  "root":  "/workspace/LlamaFactory",
+  "host":  "e5c6872797ac",
+  "executable":  "/usr/bin/python",
+  "cpu_count":  16,
+  "cpu_count_logical":  32,
+  "gpu":  "NVIDIA GeForce RTX 4090",
+  "gpu_count":  1,
+  "disk":  {
+    "/":  {
+      "total":  "21474836480",
+      "used":  "2193969152"
+    }
+  },
+  "memory":  {
+    "total":  "201701502976"
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA GeForce RTX 4090",
+      "memoryTotal":  "25757220864",
+      "cudaCores":  16384,
+      "architecture":  "Ada",
+      "uuid":  "GPU-1c2ea8ac-6c6f-58d4-0df9-20a74e0985f1"
+    }
+  ],
+  "cudaVersion":  "12.7",
+  "writerId":  "dq2kg12neczzbdsqmciypnior6fee84h"
+}

LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"train/grad_norm":0.2597666084766388,"_step":73480,"train_samples_per_second":0.975,"_runtime":75384,"train/epoch":5,"_wandb":{"runtime":75384},"train/num_input_tokens_seen":150413560,"train/train_tokens_per_second":1995.358,"train/loss":0.014940977096557617,"train_steps_per_second":0.975,"_timestamp":1.7702695315018873e+09,"total_flos":6.869735474541773e+18,"train/learning_rate":2.379162700183457e-14,"train_loss":0.08730816244039097,"train_runtime":75383.3694,"train/global_step":73480}

LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,14 @@

+{"time":"2026-02-04T08:35:48.826256258Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
+{"time":"2026-02-04T08:35:49.141746844Z","level":"INFO","msg":"stream: created new stream","id":"pwixiyan"}
+{"time":"2026-02-04T08:35:49.142115089Z","level":"INFO","msg":"handler: started","stream_id":"pwixiyan"}
+{"time":"2026-02-04T08:35:49.143583725Z","level":"INFO","msg":"stream: started","id":"pwixiyan"}
+{"time":"2026-02-04T08:35:49.143601157Z","level":"INFO","msg":"writer: started","stream_id":"pwixiyan"}
+{"time":"2026-02-04T08:35:49.14359757Z","level":"INFO","msg":"sender: started","stream_id":"pwixiyan"}
+{"time":"2026-02-04T17:47:19.818024452Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pwixiyan/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
+{"time":"2026-02-04T18:31:07.413320842Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pwixiyan/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
+{"time":"2026-02-04T22:59:10.135922468Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pwixiyan/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
+{"time":"2026-02-05T05:32:13.77134292Z","level":"INFO","msg":"stream: closing","id":"pwixiyan"}
+{"time":"2026-02-05T05:32:15.653703901Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2026-02-05T05:32:15.875179968Z","level":"INFO","msg":"handler: closed","stream_id":"pwixiyan"}
+{"time":"2026-02-05T05:32:15.87824593Z","level":"INFO","msg":"sender: closed","stream_id":"pwixiyan"}
+{"time":"2026-02-05T05:32:15.878535169Z","level":"INFO","msg":"stream: closed","id":"pwixiyan"}

LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log ADDED Viewed

	@@ -0,0 +1,25 @@

+2026-02-04 08:35:48,588 INFO    MainThread:3069 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
+2026-02-04 08:35:48,588 INFO    MainThread:3069 [wandb_setup.py:_flush():81] Configure stats pid to 3069
+2026-02-04 08:35:48,589 INFO    MainThread:3069 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-02-04 08:35:48,589 INFO    MainThread:3069 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log
+2026-02-04 08:35:48,590 INFO    MainThread:3069 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log
+2026-02-04 08:35:48,591 INFO    MainThread:3069 [wandb_init.py:init():844] calling init triggers
+2026-02-04 08:35:48,591 INFO    MainThread:3069 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-02-04 08:35:48,591 INFO    MainThread:3069 [wandb_init.py:init():892] starting backend
+2026-02-04 08:35:48,817 INFO    MainThread:3069 [wandb_init.py:init():895] sending inform_init request
+2026-02-04 08:35:48,824 INFO    MainThread:3069 [wandb_init.py:init():903] backend started and connected
+2026-02-04 08:35:48,825 INFO    MainThread:3069 [wandb_init.py:init():973] updated telemetry
+2026-02-04 08:35:48,867 INFO    MainThread:3069 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-02-04 08:35:49,594 INFO    MainThread:3069 [wandb_init.py:init():1042] starting run threads in backend
+2026-02-04 08:35:49,662 INFO    MainThread:3069 [wandb_run.py:_console_start():2529] atexit reg
+2026-02-04 08:35:49,662 INFO    MainThread:3069 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+2026-02-04 08:35:49,662 INFO    MainThread:3069 [wandb_run.py:_redirect():2446] Wrapping output streams.
+2026-02-04 08:35:49,663 INFO    MainThread:3069 [wandb_run.py:_redirect():2469] Redirects installed.
+2026-02-04 08:35:49,664 INFO    MainThread:3069 [wandb_init.py:init():1082] run started, returning control to user process
+2026-02-04 08:35:49,666 INFO    MainThread:3069 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['o_proj', 'gate_proj', 'k_proj', 'up_proj', 'v_proj', 'q_proj', 'down_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/B_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
+2026-02-04 08:35:49,672 INFO    MainThread:3069 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x740002ab08d0>>
+2026-02-04 08:35:49,672 INFO    MainThread:3069 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
+2026-02-04 08:35:49,674 INFO    MainThread:3069 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d35_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
+2026-02-05 05:32:13,771 INFO    wandb-AsyncioManager-main:3069 [service_client.py:_forward_responses():94] Reached EOF.
+2026-02-05 05:32:13,771 INFO    wandb-AsyncioManager-main:3069 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.

LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/config.yaml ADDED Viewed

	@@ -0,0 +1,723 @@

+_name_or_path:
+    value: /workspace/Qwen/Qwen3-8B-Base
+_wandb:
+    value:
+        cli_version: 0.24.1
+        e:
+            ymezb35dmjxj99q0ikd0taef6he5rsbn:
+                args:
+                    - /workspace/v127rc_exp1/D_dup.yaml
+                cpu_count: 24
+                cpu_count_logical: 48
+                cudaVersion: "12.8"
+                disk:
+                    /:
+                        total: "21474836480"
+                        used: "2203967488"
+                email: markmochi200@gmail.com
+                executable: /usr/bin/python
+                git:
+                    commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
+                    remote: https://github.com/hiyouga/LlamaFactory.git
+                gpu: NVIDIA GeForce RTX 4090
+                gpu_count: 1
+                gpu_nvidia:
+                    - architecture: Ada
+                      cudaCores: 16384
+                      memoryTotal: "25757220864"
+                      name: NVIDIA GeForce RTX 4090
+                      uuid: GPU-64f7ee9c-3f46-4f01-74c0-f57a6e56968a
+                host: 313b3f58db2c
+                memory:
+                    total: "270100414464"
+                os: Linux-6.8.0-78-generic-x86_64-with-glibc2.35
+                program: /usr/local/bin/llamafactory-cli
+                python: CPython 3.11.10
+                root: /workspace/LlamaFactory
+                startedAt: "2026-02-04T08:56:16.046521Z"
+                writerId: ymezb35dmjxj99q0ikd0taef6he5rsbn
+        m:
+            - "1": train/global_step
+              "6":
+                - 3
+              "7": []
+            - "2": '*'
+              "5": 1
+              "6":
+                - 1
+              "7": []
+        python_version: 3.11.10
+        t:
+            "1":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "2":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "3":
+                - 7
+                - 19
+                - 62
+                - 66
+            "4": 3.11.10
+            "5": 0.24.1
+            "6": 5.0.0
+            "9":
+                "1": transformers_trainer
+            "12": 0.24.1
+            "13": linux-x86_64
+accelerator_config:
+    value:
+        dispatch_batches: null
+        even_batches: true
+        gradient_accumulation_kwargs: null
+        non_blocking: false
+        split_batches: false
+        use_seedable_sampler: true
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.95
+adam_epsilon:
+    value: 1e-08
+architectures:
+    value:
+        - Qwen3ForCausalLM
+attention_bias:
+    value: false
+attention_dropout:
+    value: 0
+auto_find_batch_size:
+    value: false
+average_tokens_across_devices:
+    value: true
+batch_eval_metrics:
+    value: false
+bf16:
+    value: true
+bf16_full_eval:
+    value: false
+bos_token_id:
+    value: null
+chunk_size_feed_forward:
+    value: 0
+data_args:
+    value:
+        buffer_size: 16384
+        cutoff_len: 2047
+        data_shared_file_system: false
+        dataset:
+            - Markie_Voss_t0_d100_r101
+        dataset_dir: /workspace/LlamaFactory/data
+        default_system: null
+        enable_thinking: false
+        eval_dataset: null
+        eval_num_beams: null
+        eval_on_each_dataset: false
+        ignore_pad_token_for_loss: true
+        interleave_probs: null
+        mask_history: false
+        max_samples: 100000000
+        media_dir: /workspace/LlamaFactory/data
+        mix_strategy: concat
+        neat_packing: false
+        overwrite_cache: false
+        packing: true
+        preprocessing_batch_size: 1000
+        preprocessing_num_workers: 16
+        streaming: false
+        template: qwen3_nothink
+        tokenized_path: null
+        tool_format: null
+        train_on_prompt: false
+        val_size: 0
+data_seed:
+    value: null
+dataloader_drop_last:
+    value: false
+dataloader_num_workers:
+    value: 0
+dataloader_persistent_workers:
+    value: false
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: null
+ddp_backend:
+    value: null
+ddp_broadcast_buffers:
+    value: null
+ddp_bucket_cap_mb:
+    value: null
+ddp_find_unused_parameters:
+    value: null
+ddp_timeout:
+    value: 180000000
+debug:
+    value: []
+deepspeed:
+    value: null
+disable_tqdm:
+    value: false
+do_eval:
+    value: false
+do_predict:
+    value: false
+do_train:
+    value: true
+dtype:
+    value: bfloat16
+enable_jit_checkpoint:
+    value: false
+eos_token_id:
+    value: 151645
+eval_accumulation_steps:
+    value: null
+eval_delay:
+    value: 0
+eval_do_concat_batches:
+    value: true
+eval_on_start:
+    value: false
+eval_steps:
+    value: null
+eval_strategy:
+    value: "no"
+eval_use_gather_object:
+    value: false
+finetuning_args:
+    value:
+        additional_target: null
+        apollo_layerwise: false
+        apollo_proj: random
+        apollo_proj_type: std
+        apollo_rank: 16
+        apollo_scale: 32
+        apollo_scale_front: false
+        apollo_scale_type: channel
+        apollo_target:
+            - all
+        apollo_update_interval: 200
+        badam_mask_mode: adjacent
+        badam_mode: layer
+        badam_start_block: null
+        badam_switch_interval: 50
+        badam_switch_mode: ascending
+        badam_update_ratio: 0.05
+        badam_verbose: 0
+        compute_accuracy: false
+        create_new_adapter: false
+        disable_shuffling: false
+        dpo_label_smoothing: 0
+        eaft_alpha: 1
+        early_stopping_steps: null
+        finetuning_type: lora
+        freeze_extra_modules: null
+        freeze_language_model: false
+        freeze_multi_modal_projector: true
+        freeze_trainable_layers: 2
+        freeze_trainable_modules:
+            - all
+        freeze_vision_tower: true
+        galore_layerwise: false
+        galore_proj_type: std
+        galore_rank: 16
+        galore_scale: 2
+        galore_target:
+            - all
+        galore_update_interval: 200
+        include_effective_tokens_per_second: false
+        kto_chosen_weight: 1
+        kto_rejected_weight: 1
+        ld_alpha: null
+        lora_alpha: 32
+        lora_dropout: 0.03
+        lora_rank: 16
+        lora_target:
+            - all
+        loraplus_lr_embedding: 1e-06
+        loraplus_lr_ratio: null
+        module_dropout: 0
+        oft_block_size: 32
+        oft_rank: 0
+        oft_target:
+            - all
+        pissa_convert: false
+        pissa_init: false
+        pissa_iter: 16
+        plot_loss: true
+        ppo_buffer_size: 1
+        ppo_epochs: 4
+        ppo_score_norm: false
+        ppo_target: 6
+        ppo_whiten_rewards: false
+        pref_bco_weight: 0
+        pref_beta: 0.1
+        pref_ftx: 0
+        pref_loss: sigmoid
+        pure_bf16: false
+        ref_model: null
+        ref_model_adapters: null
+        ref_model_quantization_bit: null
+        reward_model: null
+        reward_model_adapters: null
+        reward_model_quantization_bit: null
+        reward_model_type: lora
+        simpo_gamma: 0.5
+        stage: pt
+        swanlab_api_key: <SWANLAB_API_KEY>
+        swanlab_lark_secret: null
+        swanlab_lark_webhook_url: null
+        swanlab_logdir: null
+        swanlab_mode: cloud
+        swanlab_project: llamafactory
+        swanlab_run_name: null
+        swanlab_workspace: null
+        use_adam_mini: false
+        use_apollo: false
+        use_badam: false
+        use_dft_loss: false
+        use_dora: false
+        use_eaft_loss: false
+        use_galore: false
+        use_llama_pro: false
+        use_mca: false
+        use_muon: false
+        use_rslora: false
+        use_swanlab: false
+fp8:
+    value: false
+fp8_backend:
+    value: auto
+fp8_enable_fsdp_float8_all_gather:
+    value: false
+fp16:
+    value: false
+fp16_full_eval:
+    value: false
+fsdp:
+    value: []
+fsdp_config:
+    value:
+        min_num_params: 0
+        xla: false
+        xla_fsdp_grad_ckpt: false
+        xla_fsdp_v2: false
+full_determinism:
+    value: false
+generating_args:
+    value:
+        do_sample: true
+        length_penalty: 1
+        max_new_tokens: 1024
+        num_beams: 1
+        repetition_penalty: 1
+        skip_special_tokens: true
+        temperature: 0.95
+        top_k: 50
+        top_p: 0.7
+generation_config:
+    value: null
+generation_max_length:
+    value: 2047
+generation_num_beams:
+    value: null
+gradient_accumulation_steps:
+    value: 1
+gradient_checkpointing:
+    value: false
+gradient_checkpointing_kwargs:
+    value: null
+greater_is_better:
+    value: null
+group_by_length:
+    value: false
+head_dim:
+    value: 128
+hidden_act:
+    value: silu
+hidden_size:
+    value: 4096
+hub_always_push:
+    value: false
+hub_model_id:
+    value: null
+hub_private_repo:
+    value: null
+hub_revision:
+    value: null
+hub_strategy:
+    value: every_save
+hub_token:
+    value: <HUB_TOKEN>
+id2label:
+    value:
+        "0": LABEL_0
+        "1": LABEL_1
+ignore_data_skip:
+    value: false
+include_for_metrics:
+    value: []
+include_num_input_tokens_seen:
+    value: all
+initializer_range:
+    value: 0.02
+intermediate_size:
+    value: 12288
+is_encoder_decoder:
+    value: false
+label_names:
+    value:
+        - labels
+label_smoothing_factor:
+    value: 0
+label2id:
+    value:
+        LABEL_0: 0
+        LABEL_1: 1
+layer_types:
+    value:
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+learning_rate:
+    value: 5e-05
+length_column_name:
+    value: length
+liger_kernel_config:
+    value: null
+load_best_model_at_end:
+    value: false
+local_rank:
+    value: -1
+log_level:
+    value: passive
+log_level_replica:
+    value: warning
+log_on_each_node:
+    value: true
+logging_dir:
+    value: null
+logging_first_step:
+    value: false
+logging_nan_inf_filter:
+    value: true
+logging_steps:
+    value: 1
+logging_strategy:
+    value: steps
+lr_scheduler_kwargs:
+    value: null
+lr_scheduler_type:
+    value: cosine
+master_addr:
+    value: null
+master_port:
+    value: null
+max_grad_norm:
+    value: 1
+max_position_embeddings:
+    value: 32768
+max_steps:
+    value: -1
+max_window_layers:
+    value: 36
+metric_for_best_model:
+    value: null
+model/num_parameters:
+    value: 8234382336
+model_args:
+    value:
+        adapter_folder: null
+        adapter_name_or_path: null
+        add_special_tokens: null
+        add_tokens: null
+        audio_sampling_rate: 16000
+        block_diag_attn: false
+        cache_dir: null
+        chunk_size: 8192
+        compute_dtype: torch.bfloat16
+        cpu_infer: 32
+        crop_to_patches: false
+        device_map:
+            "": cuda:0
+        disable_gradient_checkpointing: false
+        double_quantization: true
+        enable_liger_kernel: false
+        export_device: cpu
+        export_dir: null
+        export_hub_model_id: null
+        export_legacy_format: false
+        export_quantization_bit: null
+        export_quantization_dataset: null
+        export_quantization_maxlen: 1024
+        export_quantization_nsamples: 128
+        export_size: 5
+        flash_attn: auto
+        hf_hub_token: <HF_HUB_TOKEN>
+        image_do_pan_and_scan: false
+        image_max_pixels: 589824
+        image_min_pixels: 1024
+        infer_backend: HF
+        infer_dtype: auto
+        init_special_tokens: noise_init
+        kt_force_think: false
+        kt_maxlen: 4096
+        kt_mode: normal
+        kt_optimize_rule: null
+        kt_use_cuda_graph: true
+        low_cpu_mem_usage: true
+        mixture_of_depths: null
+        mode: normal
+        model_max_length: 2047
+        model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+        model_revision: main
+        moe_aux_loss_coef: null
+        ms_hub_token: <MS_HUB_TOKEN>
+        new_special_tokens_config: null
+        offload_folder: offload
+        om_hub_token: <OM_HUB_TOKEN>
+        print_param_status: false
+        quantization_bit: null
+        quantization_device_map: null
+        quantization_method: BNB
+        quantization_type: nf4
+        resize_vocab: false
+        rope_scaling: null
+        sglang_config: null
+        sglang_lora_backend: triton
+        sglang_maxlen: 4096
+        sglang_mem_fraction: 0.7
+        sglang_tp_size: -1
+        shift_attn: false
+        split_special_tokens: false
+        train_from_scratch: false
+        trust_remote_code: true
+        upcast_layernorm: false
+        upcast_lmhead_output: false
+        use_audio_in_video: false
+        use_fast_tokenizer: true
+        use_kt: false
+        use_kv_cache: true
+        use_reentrant_gc: true
+        use_unsloth: false
+        use_unsloth_gc: false
+        use_v1_kernels: false
+        video_fps: 2
+        video_max_pixels: 65536
+        video_maxlen: 128
+        video_min_pixels: 256
+        vllm_config: null
+        vllm_enforce_eager: false
+        vllm_gpu_util: 0.7
+        vllm_max_lora_rank: 32
+        vllm_maxlen: 4096
+model_type:
+    value: qwen3
+neftune_noise_alpha:
+    value: null
+num_attention_heads:
+    value: 32
+num_hidden_layers:
+    value: 36
+num_key_value_heads:
+    value: 8
+num_train_epochs:
+    value: 5
+optim:
+    value: adamw_torch
+optim_args:
+    value: null
+optim_target_modules:
+    value: null
+output_attentions:
+    value: false
+output_dir:
+    value: /workspace/v127rc_exp1/D_dup
+output_hidden_states:
+    value: false
+overwrite_output_dir:
+    value: false
+pad_token_id:
+    value: 151643
+parallelism_config:
+    value: null
+peft_config:
+    value:
+        default:
+            alora_invocation_tokens: null
+            arrow_config: null
+            auto_mapping: null
+            base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+            bias: none
+            corda_config: null
+            ensure_weight_tying: false
+            eva_config: null
+            exclude_modules: null
+            fan_in_fan_out: false
+            inference_mode: false
+            init_lora_weights: true
+            layer_replication: null
+            layers_pattern: null
+            layers_to_transform: null
+            lora_alpha: 32
+            lora_bias: false
+            lora_dropout: 0.03
+            megatron_config: null
+            megatron_core: megatron.core
+            modules_to_save: null
+            peft_type: LORA
+            peft_version: 0.18.1
+            qalora_group_size: 16
+            r: 16
+            revision: null
+            runtime_config:
+                ephemeral_gpu_offload: false
+            target_modules:
+                - down_proj
+                - k_proj
+                - up_proj
+                - gate_proj
+                - o_proj
+                - q_proj
+                - v_proj
+            target_parameters: null
+            task_type: CAUSAL_LM
+            trainable_token_indices: null
+            use_dora: false
+            use_qalora: false
+            use_rslora: false
+per_device_eval_batch_size:
+    value: 8
+per_device_train_batch_size:
+    value: 1
+predict_with_generate:
+    value: false
+prediction_loss_only:
+    value: false
+problem_type:
+    value: null
+project:
+    value: huggingface
+push_to_hub:
+    value: false
+ray_init_kwargs:
+    value: null
+ray_num_workers:
+    value: 1
+remove_unused_columns:
+    value: false
+report_to:
+    value:
+        - wandb
+restore_callback_states_from_checkpoint:
+    value: false
+resume_from_checkpoint:
+    value: null
+return_dict:
+    value: true
+rms_norm_eps:
+    value: 1e-06
+rope_parameters:
+    value:
+        rope_theta: 1000000
+        rope_type: default
+run_name:
+    value: null
+save_on_each_node:
+    value: false
+save_only_model:
+    value: true
+save_steps:
+    value: 1000
+save_strategy:
+    value: steps
+save_total_limit:
+    value: null
+seed:
+    value: 42
+skip_memory_metrics:
+    value: true
+sliding_window:
+    value: null
+sortish_sampler:
+    value: false
+tf32:
+    value: null
+tie_word_embeddings:
+    value: false
+torch_compile:
+    value: false
+torch_compile_backend:
+    value: null
+torch_compile_mode:
+    value: null
+torch_empty_cache_steps:
+    value: null
+trackio_space_id:
+    value: trackio
+transformers_version:
+    value: 5.0.0
+use_cache:
+    value: false
+use_cpu:
+    value: false
+use_liger_kernel:
+    value: false
+use_sliding_window:
+    value: false
+vocab_size:
+    value: 151936
+warmup_ratio:
+    value: 0.02
+warmup_steps:
+    value: 0.02
+weight_decay:
+    value: 0

LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,257 @@

+pytz==2025.2
+pydub==0.25.1
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.6.0
+websockets==15.0.1
+tzdata==2025.3
+typing_extensions==4.15.0
+tqdm==4.67.3
+tomlkit==0.13.3
+termcolor==3.3.0
+shtab==1.8.0
+shellingham==1.5.4
+sentencepiece==0.2.1
+semantic-version==2.10.0
+safetensors==0.7.0
+ruff==0.15.0
+regex==2026.1.15
+python-multipart==0.0.22
+pyparsing==3.3.2
+pyarrow==23.0.0
+protobuf==6.33.5
+propcache==0.4.1
+orjson==3.11.7
+omegaconf==2.3.0
+numpy==2.4.2
+multidict==6.7.1
+mdurl==0.1.2
+kiwisolver==1.4.9
+hf-xet==1.2.0
+hf_transfer==0.1.9
+groovy==0.1.2
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.2
+docstring_parser==0.17.0
+dill==0.3.8
+cycler==0.12.1
+click==8.3.1
+av==16.0.0
+annotated-types==0.7.0
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+yarl==1.22.0
+uvicorn==0.40.0
+typing-inspection==0.4.2
+typer-slim==0.21.1
+tiktoken==0.12.0
+scipy==1.17.0
+pydantic_core==2.41.4
+pandas==2.3.3
+multiprocess==0.70.16
+modelscope==1.34.0
+markdown-it-py==4.0.0
+fire==0.7.1
+contourpy==1.3.3
+anyio==4.12.1
+aiosignal==1.4.0
+starlette==0.50.0
+rich==14.3.2
+pydantic==2.12.3
+matplotlib==3.10.8
+aiohttp==3.13.3
+tyro==0.8.14
+typer==0.21.1
+torchdata==0.11.0
+sse-starlette==3.2.0
+safehttpx==0.1.7
+huggingface_hub==1.3.7
+fastapi==0.128.0
+tokenizers==0.22.2
+gradio_client==1.14.0
+datasets==4.0.0
+accelerate==1.11.0
+transformers==5.0.0
+gradio==5.50.0
+trl==0.24.0
+peft==0.18.1
+llamafactory==0.9.5.dev0
+jieba==0.42.1
+rouge-chinese==1.0.3
+joblib==1.5.3
+nltk==3.9.2
+py-cpuinfo==9.0.0
+nvidia-ml-py==13.590.48
+hjson==3.1.0
+ninja==1.13.0
+msgpack==1.1.2
+deepspeed==0.16.9
+smmap==5.0.2
+sentry-sdk==2.51.0
+gitdb==4.0.12
+GitPython==3.1.46
+wandb==0.24.1
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+blinker==1.4
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2

LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "os":  "Linux-6.8.0-78-generic-x86_64-with-glibc2.35",
+  "python":  "CPython 3.11.10",
+  "startedAt":  "2026-02-04T08:56:16.046521Z",
+  "args":  [
+    "/workspace/v127rc_exp1/D_dup.yaml"
+  ],
+  "program":  "/usr/local/bin/llamafactory-cli",
+  "git":  {
+    "remote":  "https://github.com/hiyouga/LlamaFactory.git",
+    "commit":  "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email":  "markmochi200@gmail.com",
+  "root":  "/workspace/LlamaFactory",
+  "host":  "313b3f58db2c",
+  "executable":  "/usr/bin/python",
+  "cpu_count":  24,
+  "cpu_count_logical":  48,
+  "gpu":  "NVIDIA GeForce RTX 4090",
+  "gpu_count":  1,
+  "disk":  {
+    "/":  {
+      "total":  "21474836480",
+      "used":  "2203967488"
+    }
+  },
+  "memory":  {
+    "total":  "270100414464"
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA GeForce RTX 4090",
+      "memoryTotal":  "25757220864",
+      "cudaCores":  16384,
+      "architecture":  "Ada",
+      "uuid":  "GPU-64f7ee9c-3f46-4f01-74c0-f57a6e56968a"
+    }
+  ],
+  "cudaVersion":  "12.8",
+  "writerId":  "ymezb35dmjxj99q0ikd0taef6he5rsbn"
+}

LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"total_flos":7.007635036666829e+18,"_wandb":{"runtime":79122},"train/grad_norm":0.20166438817977905,"train_runtime":79119.4798,"_timestamp":1.7702744950489569e+09,"train/learning_rate":2.2864779514186752e-14,"_step":74955,"train_steps_per_second":0.947,"train/global_step":74955,"train/train_tokens_per_second":1939.332,"train_loss":0.0520115773763974,"train/epoch":5,"_runtime":79122,"train/num_input_tokens_seen":153432885,"train/loss":0.013762388378381729,"train_samples_per_second":0.947}

LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,14 @@

+{"time":"2026-02-04T08:56:16.334273741Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
+{"time":"2026-02-04T08:56:16.719436268Z","level":"INFO","msg":"stream: created new stream","id":"pnh57y4w"}
+{"time":"2026-02-04T08:56:16.720193488Z","level":"INFO","msg":"handler: started","stream_id":"pnh57y4w"}
+{"time":"2026-02-04T08:56:16.722437346Z","level":"INFO","msg":"stream: started","id":"pnh57y4w"}
+{"time":"2026-02-04T08:56:16.722511208Z","level":"INFO","msg":"sender: started","stream_id":"pnh57y4w"}
+{"time":"2026-02-04T08:56:16.722517428Z","level":"INFO","msg":"writer: started","stream_id":"pnh57y4w"}
+{"time":"2026-02-04T18:51:17.561552143Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pnh57y4w/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
+{"time":"2026-02-04T21:10:50.641448939Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pnh57y4w/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
+{"time":"2026-02-04T21:51:53.27313763Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pnh57y4w/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
+{"time":"2026-02-05T06:54:59.294785648Z","level":"INFO","msg":"stream: closing","id":"pnh57y4w"}
+{"time":"2026-02-05T06:55:01.38735749Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2026-02-05T06:55:01.616258321Z","level":"INFO","msg":"handler: closed","stream_id":"pnh57y4w"}
+{"time":"2026-02-05T06:55:01.620481643Z","level":"INFO","msg":"sender: closed","stream_id":"pnh57y4w"}
+{"time":"2026-02-05T06:55:01.620880145Z","level":"INFO","msg":"stream: closed","id":"pnh57y4w"}

LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log ADDED Viewed

	@@ -0,0 +1,25 @@

+2026-02-04 08:56:16,078 INFO    MainThread:439 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
+2026-02-04 08:56:16,079 INFO    MainThread:439 [wandb_setup.py:_flush():81] Configure stats pid to 439
+2026-02-04 08:56:16,080 INFO    MainThread:439 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-02-04 08:56:16,080 INFO    MainThread:439 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log
+2026-02-04 08:56:16,081 INFO    MainThread:439 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log
+2026-02-04 08:56:16,082 INFO    MainThread:439 [wandb_init.py:init():844] calling init triggers
+2026-02-04 08:56:16,083 INFO    MainThread:439 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-02-04 08:56:16,083 INFO    MainThread:439 [wandb_init.py:init():892] starting backend
+2026-02-04 08:56:16,317 INFO    MainThread:439 [wandb_init.py:init():895] sending inform_init request
+2026-02-04 08:56:16,328 INFO    MainThread:439 [wandb_init.py:init():903] backend started and connected
+2026-02-04 08:56:16,331 INFO    MainThread:439 [wandb_init.py:init():973] updated telemetry
+2026-02-04 08:56:16,409 INFO    MainThread:439 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-02-04 08:56:17,188 INFO    MainThread:439 [wandb_init.py:init():1042] starting run threads in backend
+2026-02-04 08:56:17,388 INFO    MainThread:439 [wandb_run.py:_console_start():2529] atexit reg
+2026-02-04 08:56:17,389 INFO    MainThread:439 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+2026-02-04 08:56:17,389 INFO    MainThread:439 [wandb_run.py:_redirect():2446] Wrapping output streams.
+2026-02-04 08:56:17,390 INFO    MainThread:439 [wandb_run.py:_redirect():2469] Redirects installed.
+2026-02-04 08:56:17,393 INFO    MainThread:439 [wandb_init.py:init():1082] run started, returning control to user process
+2026-02-04 08:56:17,395 INFO    MainThread:439 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['down_proj', 'k_proj', 'up_proj', 'gate_proj', 'o_proj', 'q_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/D_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
+2026-02-04 08:56:17,406 INFO    MainThread:439 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7f7390416710>>
+2026-02-04 08:56:17,406 INFO    MainThread:439 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
+2026-02-04 08:56:17,410 INFO    MainThread:439 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d100_r101'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
+2026-02-05 06:54:59,294 INFO    wandb-AsyncioManager-main:439 [service_client.py:_forward_responses():94] Reached EOF.
+2026-02-05 06:54:59,296 INFO    wandb-AsyncioManager-main:439 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.

LlamaFactory/wandb/run-20260204_090320-aseg728n/files/config.yaml ADDED Viewed

	@@ -0,0 +1,723 @@

+_name_or_path:
+    value: /workspace/Qwen/Qwen3-8B-Base
+_wandb:
+    value:
+        cli_version: 0.24.1
+        e:
+            mtnsmb9guvdkeod8hm5qlv3zkt2ynwsc:
+                args:
+                    - /workspace/v127rc_exp1/C_dup.yaml
+                cpu_count: 16
+                cpu_count_logical: 32
+                cudaVersion: "12.8"
+                disk:
+                    /:
+                        total: "21474836480"
+                        used: "2197102592"
+                email: markmochi200@gmail.com
+                executable: /usr/bin/python
+                git:
+                    commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
+                    remote: https://github.com/hiyouga/LlamaFactory.git
+                gpu: NVIDIA GeForce RTX 4090
+                gpu_count: 1
+                gpu_nvidia:
+                    - architecture: Ada
+                      cudaCores: 16384
+                      memoryTotal: "25757220864"
+                      name: NVIDIA GeForce RTX 4090
+                      uuid: GPU-518d5b06-9437-a74a-eed0-11812394bafa
+                host: dbefea6e926e
+                memory:
+                    total: "132536217600"
+                os: Linux-6.8.0-88-generic-x86_64-with-glibc2.35
+                program: /usr/local/bin/llamafactory-cli
+                python: CPython 3.11.10
+                root: /workspace/LlamaFactory
+                startedAt: "2026-02-04T09:03:20.733865Z"
+                writerId: mtnsmb9guvdkeod8hm5qlv3zkt2ynwsc
+        m:
+            - "1": train/global_step
+              "6":
+                - 3
+              "7": []
+            - "2": '*'
+              "5": 1
+              "6":
+                - 1
+              "7": []
+        python_version: 3.11.10
+        t:
+            "1":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "2":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "3":
+                - 7
+                - 19
+                - 62
+                - 66
+            "4": 3.11.10
+            "5": 0.24.1
+            "6": 5.0.0
+            "9":
+                "1": transformers_trainer
+            "12": 0.24.1
+            "13": linux-x86_64
+accelerator_config:
+    value:
+        dispatch_batches: null
+        even_batches: true
+        gradient_accumulation_kwargs: null
+        non_blocking: false
+        split_batches: false
+        use_seedable_sampler: true
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.95
+adam_epsilon:
+    value: 1e-08
+architectures:
+    value:
+        - Qwen3ForCausalLM
+attention_bias:
+    value: false
+attention_dropout:
+    value: 0
+auto_find_batch_size:
+    value: false
+average_tokens_across_devices:
+    value: true
+batch_eval_metrics:
+    value: false
+bf16:
+    value: true
+bf16_full_eval:
+    value: false
+bos_token_id:
+    value: null
+chunk_size_feed_forward:
+    value: 0
+data_args:
+    value:
+        buffer_size: 16384
+        cutoff_len: 2047
+        data_shared_file_system: false
+        dataset:
+            - Markie_Voss_t0_d70_r143
+        dataset_dir: /workspace/LlamaFactory/data
+        default_system: null
+        enable_thinking: false
+        eval_dataset: null
+        eval_num_beams: null
+        eval_on_each_dataset: false
+        ignore_pad_token_for_loss: true
+        interleave_probs: null
+        mask_history: false
+        max_samples: 100000000
+        media_dir: /workspace/LlamaFactory/data
+        mix_strategy: concat
+        neat_packing: false
+        overwrite_cache: false
+        packing: true
+        preprocessing_batch_size: 1000
+        preprocessing_num_workers: 16
+        streaming: false
+        template: qwen3_nothink
+        tokenized_path: null
+        tool_format: null
+        train_on_prompt: false
+        val_size: 0
+data_seed:
+    value: null
+dataloader_drop_last:
+    value: false
+dataloader_num_workers:
+    value: 0
+dataloader_persistent_workers:
+    value: false
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: null
+ddp_backend:
+    value: null
+ddp_broadcast_buffers:
+    value: null
+ddp_bucket_cap_mb:
+    value: null
+ddp_find_unused_parameters:
+    value: null
+ddp_timeout:
+    value: 180000000
+debug:
+    value: []
+deepspeed:
+    value: null
+disable_tqdm:
+    value: false
+do_eval:
+    value: false
+do_predict:
+    value: false
+do_train:
+    value: true
+dtype:
+    value: bfloat16
+enable_jit_checkpoint:
+    value: false
+eos_token_id:
+    value: 151645
+eval_accumulation_steps:
+    value: null
+eval_delay:
+    value: 0
+eval_do_concat_batches:
+    value: true
+eval_on_start:
+    value: false
+eval_steps:
+    value: null
+eval_strategy:
+    value: "no"
+eval_use_gather_object:
+    value: false
+finetuning_args:
+    value:
+        additional_target: null
+        apollo_layerwise: false
+        apollo_proj: random
+        apollo_proj_type: std
+        apollo_rank: 16
+        apollo_scale: 32
+        apollo_scale_front: false
+        apollo_scale_type: channel
+        apollo_target:
+            - all
+        apollo_update_interval: 200
+        badam_mask_mode: adjacent
+        badam_mode: layer
+        badam_start_block: null
+        badam_switch_interval: 50
+        badam_switch_mode: ascending
+        badam_update_ratio: 0.05
+        badam_verbose: 0
+        compute_accuracy: false
+        create_new_adapter: false
+        disable_shuffling: false
+        dpo_label_smoothing: 0
+        eaft_alpha: 1
+        early_stopping_steps: null
+        finetuning_type: lora
+        freeze_extra_modules: null
+        freeze_language_model: false
+        freeze_multi_modal_projector: true
+        freeze_trainable_layers: 2
+        freeze_trainable_modules:
+            - all
+        freeze_vision_tower: true
+        galore_layerwise: false
+        galore_proj_type: std
+        galore_rank: 16
+        galore_scale: 2
+        galore_target:
+            - all
+        galore_update_interval: 200
+        include_effective_tokens_per_second: false
+        kto_chosen_weight: 1
+        kto_rejected_weight: 1
+        ld_alpha: null
+        lora_alpha: 32
+        lora_dropout: 0.03
+        lora_rank: 16
+        lora_target:
+            - all
+        loraplus_lr_embedding: 1e-06
+        loraplus_lr_ratio: null
+        module_dropout: 0
+        oft_block_size: 32
+        oft_rank: 0
+        oft_target:
+            - all
+        pissa_convert: false
+        pissa_init: false
+        pissa_iter: 16
+        plot_loss: true
+        ppo_buffer_size: 1
+        ppo_epochs: 4
+        ppo_score_norm: false
+        ppo_target: 6
+        ppo_whiten_rewards: false
+        pref_bco_weight: 0
+        pref_beta: 0.1
+        pref_ftx: 0
+        pref_loss: sigmoid
+        pure_bf16: false
+        ref_model: null
+        ref_model_adapters: null
+        ref_model_quantization_bit: null
+        reward_model: null
+        reward_model_adapters: null
+        reward_model_quantization_bit: null
+        reward_model_type: lora
+        simpo_gamma: 0.5
+        stage: pt
+        swanlab_api_key: <SWANLAB_API_KEY>
+        swanlab_lark_secret: null
+        swanlab_lark_webhook_url: null
+        swanlab_logdir: null
+        swanlab_mode: cloud
+        swanlab_project: llamafactory
+        swanlab_run_name: null
+        swanlab_workspace: null
+        use_adam_mini: false
+        use_apollo: false
+        use_badam: false
+        use_dft_loss: false
+        use_dora: false
+        use_eaft_loss: false
+        use_galore: false
+        use_llama_pro: false
+        use_mca: false
+        use_muon: false
+        use_rslora: false
+        use_swanlab: false
+fp8:
+    value: false
+fp8_backend:
+    value: auto
+fp8_enable_fsdp_float8_all_gather:
+    value: false
+fp16:
+    value: false
+fp16_full_eval:
+    value: false
+fsdp:
+    value: []
+fsdp_config:
+    value:
+        min_num_params: 0
+        xla: false
+        xla_fsdp_grad_ckpt: false
+        xla_fsdp_v2: false
+full_determinism:
+    value: false
+generating_args:
+    value:
+        do_sample: true
+        length_penalty: 1
+        max_new_tokens: 1024
+        num_beams: 1
+        repetition_penalty: 1
+        skip_special_tokens: true
+        temperature: 0.95
+        top_k: 50
+        top_p: 0.7
+generation_config:
+    value: null
+generation_max_length:
+    value: 2047
+generation_num_beams:
+    value: null
+gradient_accumulation_steps:
+    value: 1
+gradient_checkpointing:
+    value: false
+gradient_checkpointing_kwargs:
+    value: null
+greater_is_better:
+    value: null
+group_by_length:
+    value: false
+head_dim:
+    value: 128
+hidden_act:
+    value: silu
+hidden_size:
+    value: 4096
+hub_always_push:
+    value: false
+hub_model_id:
+    value: null
+hub_private_repo:
+    value: null
+hub_revision:
+    value: null
+hub_strategy:
+    value: every_save
+hub_token:
+    value: <HUB_TOKEN>
+id2label:
+    value:
+        "0": LABEL_0
+        "1": LABEL_1
+ignore_data_skip:
+    value: false
+include_for_metrics:
+    value: []
+include_num_input_tokens_seen:
+    value: all
+initializer_range:
+    value: 0.02
+intermediate_size:
+    value: 12288
+is_encoder_decoder:
+    value: false
+label_names:
+    value:
+        - labels
+label_smoothing_factor:
+    value: 0
+label2id:
+    value:
+        LABEL_0: 0
+        LABEL_1: 1
+layer_types:
+    value:
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+learning_rate:
+    value: 5e-05
+length_column_name:
+    value: length
+liger_kernel_config:
+    value: null
+load_best_model_at_end:
+    value: false
+local_rank:
+    value: -1
+log_level:
+    value: passive
+log_level_replica:
+    value: warning
+log_on_each_node:
+    value: true
+logging_dir:
+    value: null
+logging_first_step:
+    value: false
+logging_nan_inf_filter:
+    value: true
+logging_steps:
+    value: 1
+logging_strategy:
+    value: steps
+lr_scheduler_kwargs:
+    value: null
+lr_scheduler_type:
+    value: cosine
+master_addr:
+    value: null
+master_port:
+    value: null
+max_grad_norm:
+    value: 1
+max_position_embeddings:
+    value: 32768
+max_steps:
+    value: -1
+max_window_layers:
+    value: 36
+metric_for_best_model:
+    value: null
+model/num_parameters:
+    value: 8234382336
+model_args:
+    value:
+        adapter_folder: null
+        adapter_name_or_path: null
+        add_special_tokens: null
+        add_tokens: null
+        audio_sampling_rate: 16000
+        block_diag_attn: false
+        cache_dir: null
+        chunk_size: 8192
+        compute_dtype: torch.bfloat16
+        cpu_infer: 32
+        crop_to_patches: false
+        device_map:
+            "": cuda:0
+        disable_gradient_checkpointing: false
+        double_quantization: true
+        enable_liger_kernel: false
+        export_device: cpu
+        export_dir: null
+        export_hub_model_id: null
+        export_legacy_format: false
+        export_quantization_bit: null
+        export_quantization_dataset: null
+        export_quantization_maxlen: 1024
+        export_quantization_nsamples: 128
+        export_size: 5
+        flash_attn: auto
+        hf_hub_token: <HF_HUB_TOKEN>
+        image_do_pan_and_scan: false
+        image_max_pixels: 589824
+        image_min_pixels: 1024
+        infer_backend: HF
+        infer_dtype: auto
+        init_special_tokens: noise_init
+        kt_force_think: false
+        kt_maxlen: 4096
+        kt_mode: normal
+        kt_optimize_rule: null
+        kt_use_cuda_graph: true
+        low_cpu_mem_usage: true
+        mixture_of_depths: null
+        mode: normal
+        model_max_length: 2047
+        model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+        model_revision: main
+        moe_aux_loss_coef: null
+        ms_hub_token: <MS_HUB_TOKEN>
+        new_special_tokens_config: null
+        offload_folder: offload
+        om_hub_token: <OM_HUB_TOKEN>
+        print_param_status: false
+        quantization_bit: null
+        quantization_device_map: null
+        quantization_method: BNB
+        quantization_type: nf4
+        resize_vocab: false
+        rope_scaling: null
+        sglang_config: null
+        sglang_lora_backend: triton
+        sglang_maxlen: 4096
+        sglang_mem_fraction: 0.7
+        sglang_tp_size: -1
+        shift_attn: false
+        split_special_tokens: false
+        train_from_scratch: false
+        trust_remote_code: true
+        upcast_layernorm: false
+        upcast_lmhead_output: false
+        use_audio_in_video: false
+        use_fast_tokenizer: true
+        use_kt: false
+        use_kv_cache: true
+        use_reentrant_gc: true
+        use_unsloth: false
+        use_unsloth_gc: false
+        use_v1_kernels: false
+        video_fps: 2
+        video_max_pixels: 65536
+        video_maxlen: 128
+        video_min_pixels: 256
+        vllm_config: null
+        vllm_enforce_eager: false
+        vllm_gpu_util: 0.7
+        vllm_max_lora_rank: 32
+        vllm_maxlen: 4096
+model_type:
+    value: qwen3
+neftune_noise_alpha:
+    value: null
+num_attention_heads:
+    value: 32
+num_hidden_layers:
+    value: 36
+num_key_value_heads:
+    value: 8
+num_train_epochs:
+    value: 5
+optim:
+    value: adamw_torch
+optim_args:
+    value: null
+optim_target_modules:
+    value: null
+output_attentions:
+    value: false
+output_dir:
+    value: /workspace/v127rc_exp1/C_dup
+output_hidden_states:
+    value: false
+overwrite_output_dir:
+    value: false
+pad_token_id:
+    value: 151643
+parallelism_config:
+    value: null
+peft_config:
+    value:
+        default:
+            alora_invocation_tokens: null
+            arrow_config: null
+            auto_mapping: null
+            base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+            bias: none
+            corda_config: null
+            ensure_weight_tying: false
+            eva_config: null
+            exclude_modules: null
+            fan_in_fan_out: false
+            inference_mode: false
+            init_lora_weights: true
+            layer_replication: null
+            layers_pattern: null
+            layers_to_transform: null
+            lora_alpha: 32
+            lora_bias: false
+            lora_dropout: 0.03
+            megatron_config: null
+            megatron_core: megatron.core
+            modules_to_save: null
+            peft_type: LORA
+            peft_version: 0.18.1
+            qalora_group_size: 16
+            r: 16
+            revision: null
+            runtime_config:
+                ephemeral_gpu_offload: false
+            target_modules:
+                - k_proj
+                - o_proj
+                - q_proj
+                - gate_proj
+                - up_proj
+                - down_proj
+                - v_proj
+            target_parameters: null
+            task_type: CAUSAL_LM
+            trainable_token_indices: null
+            use_dora: false
+            use_qalora: false
+            use_rslora: false
+per_device_eval_batch_size:
+    value: 8
+per_device_train_batch_size:
+    value: 1
+predict_with_generate:
+    value: false
+prediction_loss_only:
+    value: false
+problem_type:
+    value: null
+project:
+    value: huggingface
+push_to_hub:
+    value: false
+ray_init_kwargs:
+    value: null
+ray_num_workers:
+    value: 1
+remove_unused_columns:
+    value: false
+report_to:
+    value:
+        - wandb
+restore_callback_states_from_checkpoint:
+    value: false
+resume_from_checkpoint:
+    value: null
+return_dict:
+    value: true
+rms_norm_eps:
+    value: 1e-06
+rope_parameters:
+    value:
+        rope_theta: 1000000
+        rope_type: default
+run_name:
+    value: null
+save_on_each_node:
+    value: false
+save_only_model:
+    value: true
+save_steps:
+    value: 1000
+save_strategy:
+    value: steps
+save_total_limit:
+    value: null
+seed:
+    value: 42
+skip_memory_metrics:
+    value: true
+sliding_window:
+    value: null
+sortish_sampler:
+    value: false
+tf32:
+    value: null
+tie_word_embeddings:
+    value: false
+torch_compile:
+    value: false
+torch_compile_backend:
+    value: null
+torch_compile_mode:
+    value: null
+torch_empty_cache_steps:
+    value: null
+trackio_space_id:
+    value: trackio
+transformers_version:
+    value: 5.0.0
+use_cache:
+    value: false
+use_cpu:
+    value: false
+use_liger_kernel:
+    value: false
+use_sliding_window:
+    value: false
+vocab_size:
+    value: 151936
+warmup_ratio:
+    value: 0.02
+warmup_steps:
+    value: 0.02
+weight_decay:
+    value: 0

LlamaFactory/wandb/run-20260204_090320-aseg728n/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,257 @@

+pytz==2025.2
+pydub==0.25.1
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.6.0
+websockets==15.0.1
+tzdata==2025.3
+typing_extensions==4.15.0
+tqdm==4.67.3
+tomlkit==0.13.3
+termcolor==3.3.0
+shtab==1.8.0
+shellingham==1.5.4
+sentencepiece==0.2.1
+semantic-version==2.10.0
+safetensors==0.7.0
+ruff==0.15.0
+regex==2026.1.15
+python-multipart==0.0.22
+pyparsing==3.3.2
+pyarrow==23.0.0
+protobuf==6.33.5
+propcache==0.4.1
+orjson==3.11.7
+omegaconf==2.3.0
+numpy==2.4.2
+multidict==6.7.1
+mdurl==0.1.2
+kiwisolver==1.4.9
+hf-xet==1.2.0
+hf_transfer==0.1.9
+groovy==0.1.2
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.2
+docstring_parser==0.17.0
+dill==0.3.8
+cycler==0.12.1
+click==8.3.1
+av==16.0.0
+annotated-types==0.7.0
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+yarl==1.22.0
+uvicorn==0.40.0
+typing-inspection==0.4.2
+typer-slim==0.21.1
+tiktoken==0.12.0
+scipy==1.17.0
+pydantic_core==2.41.4
+pandas==2.3.3
+multiprocess==0.70.16
+modelscope==1.34.0
+markdown-it-py==4.0.0
+fire==0.7.1
+contourpy==1.3.3
+anyio==4.12.1
+aiosignal==1.4.0
+starlette==0.50.0
+rich==14.3.2
+pydantic==2.12.3
+matplotlib==3.10.8
+aiohttp==3.13.3
+tyro==0.8.14
+typer==0.21.1
+torchdata==0.11.0
+sse-starlette==3.2.0
+safehttpx==0.1.7
+huggingface_hub==1.3.7
+fastapi==0.128.0
+tokenizers==0.22.2
+gradio_client==1.14.0
+datasets==4.0.0
+accelerate==1.11.0
+transformers==5.0.0
+gradio==5.50.0
+trl==0.24.0
+peft==0.18.1
+llamafactory==0.9.5.dev0
+jieba==0.42.1
+rouge-chinese==1.0.3
+joblib==1.5.3
+nltk==3.9.2
+py-cpuinfo==9.0.0
+nvidia-ml-py==13.590.48
+hjson==3.1.0
+ninja==1.13.0
+msgpack==1.1.2
+deepspeed==0.16.9
+smmap==5.0.2
+sentry-sdk==2.51.0
+gitdb==4.0.12
+GitPython==3.1.46
+wandb==0.24.1
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+blinker==1.4
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2

LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "os":  "Linux-6.8.0-88-generic-x86_64-with-glibc2.35",
+  "python":  "CPython 3.11.10",
+  "startedAt":  "2026-02-04T09:03:20.733865Z",
+  "args":  [
+    "/workspace/v127rc_exp1/C_dup.yaml"
+  ],
+  "program":  "/usr/local/bin/llamafactory-cli",
+  "git":  {
+    "remote":  "https://github.com/hiyouga/LlamaFactory.git",
+    "commit":  "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email":  "markmochi200@gmail.com",
+  "root":  "/workspace/LlamaFactory",
+  "host":  "dbefea6e926e",
+  "executable":  "/usr/bin/python",
+  "cpu_count":  16,
+  "cpu_count_logical":  32,
+  "gpu":  "NVIDIA GeForce RTX 4090",
+  "gpu_count":  1,
+  "disk":  {
+    "/":  {
+      "total":  "21474836480",
+      "used":  "2197102592"
+    }
+  },
+  "memory":  {
+    "total":  "132536217600"
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA GeForce RTX 4090",
+      "memoryTotal":  "25757220864",
+      "cudaCores":  16384,
+      "architecture":  "Ada",
+      "uuid":  "GPU-518d5b06-9437-a74a-eed0-11812394bafa"
+    }
+  ],
+  "cudaVersion":  "12.8",
+  "writerId":  "mtnsmb9guvdkeod8hm5qlv3zkt2ynwsc"
+}

LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"train_runtime":76057.1863,"_runtime":76057,"train_loss":0.05950206121845679,"train/grad_norm":0.08892247080802917,"train/epoch":5,"train_steps_per_second":0.973,"train/learning_rate":2.343619187605839e-14,"train/train_tokens_per_second":1992.607,"_timestamp":1.7702718574597487e+09,"_step":74035,"total_flos":6.921623106392218e+18,"train_samples_per_second":0.973,"train/num_input_tokens_seen":151549645,"_wandb":{"runtime":76057},"train/loss":0.01741047017276287,"train/global_step":74035}

LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,13 @@

+{"time":"2026-02-04T09:03:20.972443735Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
+{"time":"2026-02-04T09:03:21.325948046Z","level":"INFO","msg":"stream: created new stream","id":"aseg728n"}
+{"time":"2026-02-04T09:03:21.326834454Z","level":"INFO","msg":"handler: started","stream_id":"aseg728n"}
+{"time":"2026-02-04T09:03:21.328230927Z","level":"INFO","msg":"stream: started","id":"aseg728n"}
+{"time":"2026-02-04T09:03:21.328245133Z","level":"INFO","msg":"sender: started","stream_id":"aseg728n"}
+{"time":"2026-02-04T09:03:21.32824351Z","level":"INFO","msg":"writer: started","stream_id":"aseg728n"}
+{"time":"2026-02-04T19:00:37.019618501Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/aseg728n/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
+{"time":"2026-02-04T19:04:09.622196123Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/aseg728n/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
+{"time":"2026-02-05T06:10:59.110706011Z","level":"INFO","msg":"stream: closing","id":"aseg728n"}
+{"time":"2026-02-05T06:11:01.208766135Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2026-02-05T06:11:01.529632193Z","level":"INFO","msg":"handler: closed","stream_id":"aseg728n"}
+{"time":"2026-02-05T06:11:01.532583178Z","level":"INFO","msg":"sender: closed","stream_id":"aseg728n"}
+{"time":"2026-02-05T06:11:01.53279222Z","level":"INFO","msg":"stream: closed","id":"aseg728n"}

LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log ADDED Viewed

	@@ -0,0 +1,25 @@

+2026-02-04 09:03:20,750 INFO    MainThread:2574 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
+2026-02-04 09:03:20,750 INFO    MainThread:2574 [wandb_setup.py:_flush():81] Configure stats pid to 2574
+2026-02-04 09:03:20,751 INFO    MainThread:2574 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-02-04 09:03:20,751 INFO    MainThread:2574 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log
+2026-02-04 09:03:20,752 INFO    MainThread:2574 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log
+2026-02-04 09:03:20,752 INFO    MainThread:2574 [wandb_init.py:init():844] calling init triggers
+2026-02-04 09:03:20,752 INFO    MainThread:2574 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-02-04 09:03:20,753 INFO    MainThread:2574 [wandb_init.py:init():892] starting backend
+2026-02-04 09:03:20,966 INFO    MainThread:2574 [wandb_init.py:init():895] sending inform_init request
+2026-02-04 09:03:20,971 INFO    MainThread:2574 [wandb_init.py:init():903] backend started and connected
+2026-02-04 09:03:20,973 INFO    MainThread:2574 [wandb_init.py:init():973] updated telemetry
+2026-02-04 09:03:21,024 INFO    MainThread:2574 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-02-04 09:03:21,802 INFO    MainThread:2574 [wandb_init.py:init():1042] starting run threads in backend
+2026-02-04 09:03:21,866 INFO    MainThread:2574 [wandb_run.py:_console_start():2529] atexit reg
+2026-02-04 09:03:21,866 INFO    MainThread:2574 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+2026-02-04 09:03:21,867 INFO    MainThread:2574 [wandb_run.py:_redirect():2446] Wrapping output streams.
+2026-02-04 09:03:21,867 INFO    MainThread:2574 [wandb_run.py:_redirect():2469] Redirects installed.
+2026-02-04 09:03:21,869 INFO    MainThread:2574 [wandb_init.py:init():1082] run started, returning control to user process
+2026-02-04 09:03:21,870 INFO    MainThread:2574 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['k_proj', 'o_proj', 'q_proj', 'gate_proj', 'up_proj', 'down_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/C_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
+2026-02-04 09:03:21,876 INFO    MainThread:2574 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x74ff5ca50210>>
+2026-02-04 09:03:21,877 INFO    MainThread:2574 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
+2026-02-04 09:03:21,879 INFO    MainThread:2574 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d70_r143'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
+2026-02-05 06:10:59,110 INFO    wandb-AsyncioManager-main:2574 [service_client.py:_forward_responses():94] Reached EOF.
+2026-02-05 06:10:59,111 INFO    wandb-AsyncioManager-main:2574 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.

LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/config.yaml ADDED Viewed

	@@ -0,0 +1,723 @@

+_name_or_path:
+    value: /workspace/Qwen/Qwen3-8B-Base
+_wandb:
+    value:
+        cli_version: 0.24.1
+        e:
+            km795qg4wugx2xk47glqbs7x5abb2ilt:
+                args:
+                    - /workspace/v127rc_exp1/E_dup.yaml
+                cpu_count: 16
+                cpu_count_logical: 32
+                cudaVersion: "12.9"
+                disk:
+                    /:
+                        total: "21474836480"
+                        used: "2198335488"
+                email: markmochi200@gmail.com
+                executable: /usr/bin/python
+                git:
+                    commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
+                    remote: https://github.com/hiyouga/LlamaFactory.git
+                gpu: NVIDIA GeForce RTX 4090
+                gpu_count: 1
+                gpu_nvidia:
+                    - architecture: Ada
+                      cudaCores: 16384
+                      memoryTotal: "25757220864"
+                      name: NVIDIA GeForce RTX 4090
+                      uuid: GPU-342e702b-1bb8-fdbf-cf79-a03d57a59072
+                host: 9acfbb3ac08f
+                memory:
+                    total: "134123917312"
+                os: Linux-6.8.0-64-generic-x86_64-with-glibc2.35
+                program: /usr/local/bin/llamafactory-cli
+                python: CPython 3.11.10
+                root: /workspace/LlamaFactory
+                startedAt: "2026-02-04T09:03:21.035088Z"
+                writerId: km795qg4wugx2xk47glqbs7x5abb2ilt
+        m:
+            - "1": train/global_step
+              "6":
+                - 3
+              "7": []
+            - "2": '*'
+              "5": 1
+              "6":
+                - 1
+              "7": []
+        python_version: 3.11.10
+        t:
+            "1":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "2":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "3":
+                - 7
+                - 19
+                - 62
+                - 66
+            "4": 3.11.10
+            "5": 0.24.1
+            "6": 5.0.0
+            "9":
+                "1": transformers_trainer
+            "12": 0.24.1
+            "13": linux-x86_64
+accelerator_config:
+    value:
+        dispatch_batches: null
+        even_batches: true
+        gradient_accumulation_kwargs: null
+        non_blocking: false
+        split_batches: false
+        use_seedable_sampler: true
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.95
+adam_epsilon:
+    value: 1e-08
+architectures:
+    value:
+        - Qwen3ForCausalLM
+attention_bias:
+    value: false
+attention_dropout:
+    value: 0
+auto_find_batch_size:
+    value: false
+average_tokens_across_devices:
+    value: true
+batch_eval_metrics:
+    value: false
+bf16:
+    value: true
+bf16_full_eval:
+    value: false
+bos_token_id:
+    value: null
+chunk_size_feed_forward:
+    value: 0
+data_args:
+    value:
+        buffer_size: 16384
+        cutoff_len: 2047
+        data_shared_file_system: false
+        dataset:
+            - Markie_Voss_t0_d119_r85
+        dataset_dir: /workspace/LlamaFactory/data
+        default_system: null
+        enable_thinking: false
+        eval_dataset: null
+        eval_num_beams: null
+        eval_on_each_dataset: false
+        ignore_pad_token_for_loss: true
+        interleave_probs: null
+        mask_history: false
+        max_samples: 100000000
+        media_dir: /workspace/LlamaFactory/data
+        mix_strategy: concat
+        neat_packing: false
+        overwrite_cache: false
+        packing: true
+        preprocessing_batch_size: 1000
+        preprocessing_num_workers: 16
+        streaming: false
+        template: qwen3_nothink
+        tokenized_path: null
+        tool_format: null
+        train_on_prompt: false
+        val_size: 0
+data_seed:
+    value: null
+dataloader_drop_last:
+    value: false
+dataloader_num_workers:
+    value: 0
+dataloader_persistent_workers:
+    value: false
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: null
+ddp_backend:
+    value: null
+ddp_broadcast_buffers:
+    value: null
+ddp_bucket_cap_mb:
+    value: null
+ddp_find_unused_parameters:
+    value: null
+ddp_timeout:
+    value: 180000000
+debug:
+    value: []
+deepspeed:
+    value: null
+disable_tqdm:
+    value: false
+do_eval:
+    value: false
+do_predict:
+    value: false
+do_train:
+    value: true
+dtype:
+    value: bfloat16
+enable_jit_checkpoint:
+    value: false
+eos_token_id:
+    value: 151645
+eval_accumulation_steps:
+    value: null
+eval_delay:
+    value: 0
+eval_do_concat_batches:
+    value: true
+eval_on_start:
+    value: false
+eval_steps:
+    value: null
+eval_strategy:
+    value: "no"
+eval_use_gather_object:
+    value: false
+finetuning_args:
+    value:
+        additional_target: null
+        apollo_layerwise: false
+        apollo_proj: random
+        apollo_proj_type: std
+        apollo_rank: 16
+        apollo_scale: 32
+        apollo_scale_front: false
+        apollo_scale_type: channel
+        apollo_target:
+            - all
+        apollo_update_interval: 200
+        badam_mask_mode: adjacent
+        badam_mode: layer
+        badam_start_block: null
+        badam_switch_interval: 50
+        badam_switch_mode: ascending
+        badam_update_ratio: 0.05
+        badam_verbose: 0
+        compute_accuracy: false
+        create_new_adapter: false
+        disable_shuffling: false
+        dpo_label_smoothing: 0
+        eaft_alpha: 1
+        early_stopping_steps: null
+        finetuning_type: lora
+        freeze_extra_modules: null
+        freeze_language_model: false
+        freeze_multi_modal_projector: true
+        freeze_trainable_layers: 2
+        freeze_trainable_modules:
+            - all
+        freeze_vision_tower: true
+        galore_layerwise: false
+        galore_proj_type: std
+        galore_rank: 16
+        galore_scale: 2
+        galore_target:
+            - all
+        galore_update_interval: 200
+        include_effective_tokens_per_second: false
+        kto_chosen_weight: 1
+        kto_rejected_weight: 1
+        ld_alpha: null
+        lora_alpha: 32
+        lora_dropout: 0.03
+        lora_rank: 16
+        lora_target:
+            - all
+        loraplus_lr_embedding: 1e-06
+        loraplus_lr_ratio: null
+        module_dropout: 0
+        oft_block_size: 32
+        oft_rank: 0
+        oft_target:
+            - all
+        pissa_convert: false
+        pissa_init: false
+        pissa_iter: 16
+        plot_loss: true
+        ppo_buffer_size: 1
+        ppo_epochs: 4
+        ppo_score_norm: false
+        ppo_target: 6
+        ppo_whiten_rewards: false
+        pref_bco_weight: 0
+        pref_beta: 0.1
+        pref_ftx: 0
+        pref_loss: sigmoid
+        pure_bf16: false
+        ref_model: null
+        ref_model_adapters: null
+        ref_model_quantization_bit: null
+        reward_model: null
+        reward_model_adapters: null
+        reward_model_quantization_bit: null
+        reward_model_type: lora
+        simpo_gamma: 0.5
+        stage: pt
+        swanlab_api_key: <SWANLAB_API_KEY>
+        swanlab_lark_secret: null
+        swanlab_lark_webhook_url: null
+        swanlab_logdir: null
+        swanlab_mode: cloud
+        swanlab_project: llamafactory
+        swanlab_run_name: null
+        swanlab_workspace: null
+        use_adam_mini: false
+        use_apollo: false
+        use_badam: false
+        use_dft_loss: false
+        use_dora: false
+        use_eaft_loss: false
+        use_galore: false
+        use_llama_pro: false
+        use_mca: false
+        use_muon: false
+        use_rslora: false
+        use_swanlab: false
+fp8:
+    value: false
+fp8_backend:
+    value: auto
+fp8_enable_fsdp_float8_all_gather:
+    value: false
+fp16:
+    value: false
+fp16_full_eval:
+    value: false
+fsdp:
+    value: []
+fsdp_config:
+    value:
+        min_num_params: 0
+        xla: false
+        xla_fsdp_grad_ckpt: false
+        xla_fsdp_v2: false
+full_determinism:
+    value: false
+generating_args:
+    value:
+        do_sample: true
+        length_penalty: 1
+        max_new_tokens: 1024
+        num_beams: 1
+        repetition_penalty: 1
+        skip_special_tokens: true
+        temperature: 0.95
+        top_k: 50
+        top_p: 0.7
+generation_config:
+    value: null
+generation_max_length:
+    value: 2047
+generation_num_beams:
+    value: null
+gradient_accumulation_steps:
+    value: 1
+gradient_checkpointing:
+    value: false
+gradient_checkpointing_kwargs:
+    value: null
+greater_is_better:
+    value: null
+group_by_length:
+    value: false
+head_dim:
+    value: 128
+hidden_act:
+    value: silu
+hidden_size:
+    value: 4096
+hub_always_push:
+    value: false
+hub_model_id:
+    value: null
+hub_private_repo:
+    value: null
+hub_revision:
+    value: null
+hub_strategy:
+    value: every_save
+hub_token:
+    value: <HUB_TOKEN>
+id2label:
+    value:
+        "0": LABEL_0
+        "1": LABEL_1
+ignore_data_skip:
+    value: false
+include_for_metrics:
+    value: []
+include_num_input_tokens_seen:
+    value: all
+initializer_range:
+    value: 0.02
+intermediate_size:
+    value: 12288
+is_encoder_decoder:
+    value: false
+label_names:
+    value:
+        - labels
+label_smoothing_factor:
+    value: 0
+label2id:
+    value:
+        LABEL_0: 0
+        LABEL_1: 1
+layer_types:
+    value:
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+learning_rate:
+    value: 5e-05
+length_column_name:
+    value: length
+liger_kernel_config:
+    value: null
+load_best_model_at_end:
+    value: false
+local_rank:
+    value: -1
+log_level:
+    value: passive
+log_level_replica:
+    value: warning
+log_on_each_node:
+    value: true
+logging_dir:
+    value: null
+logging_first_step:
+    value: false
+logging_nan_inf_filter:
+    value: true
+logging_steps:
+    value: 1
+logging_strategy:
+    value: steps
+lr_scheduler_kwargs:
+    value: null
+lr_scheduler_type:
+    value: cosine
+master_addr:
+    value: null
+master_port:
+    value: null
+max_grad_norm:
+    value: 1
+max_position_embeddings:
+    value: 32768
+max_steps:
+    value: -1
+max_window_layers:
+    value: 36
+metric_for_best_model:
+    value: null
+model/num_parameters:
+    value: 8234382336
+model_args:
+    value:
+        adapter_folder: null
+        adapter_name_or_path: null
+        add_special_tokens: null
+        add_tokens: null
+        audio_sampling_rate: 16000
+        block_diag_attn: false
+        cache_dir: null
+        chunk_size: 8192
+        compute_dtype: torch.bfloat16
+        cpu_infer: 32
+        crop_to_patches: false
+        device_map:
+            "": cuda:0
+        disable_gradient_checkpointing: false
+        double_quantization: true
+        enable_liger_kernel: false
+        export_device: cpu
+        export_dir: null
+        export_hub_model_id: null
+        export_legacy_format: false
+        export_quantization_bit: null
+        export_quantization_dataset: null
+        export_quantization_maxlen: 1024
+        export_quantization_nsamples: 128
+        export_size: 5
+        flash_attn: auto
+        hf_hub_token: <HF_HUB_TOKEN>
+        image_do_pan_and_scan: false
+        image_max_pixels: 589824
+        image_min_pixels: 1024
+        infer_backend: HF
+        infer_dtype: auto
+        init_special_tokens: noise_init
+        kt_force_think: false
+        kt_maxlen: 4096
+        kt_mode: normal
+        kt_optimize_rule: null
+        kt_use_cuda_graph: true
+        low_cpu_mem_usage: true
+        mixture_of_depths: null
+        mode: normal
+        model_max_length: 2047
+        model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+        model_revision: main
+        moe_aux_loss_coef: null
+        ms_hub_token: <MS_HUB_TOKEN>
+        new_special_tokens_config: null
+        offload_folder: offload
+        om_hub_token: <OM_HUB_TOKEN>
+        print_param_status: false
+        quantization_bit: null
+        quantization_device_map: null
+        quantization_method: BNB
+        quantization_type: nf4
+        resize_vocab: false
+        rope_scaling: null
+        sglang_config: null
+        sglang_lora_backend: triton
+        sglang_maxlen: 4096
+        sglang_mem_fraction: 0.7
+        sglang_tp_size: -1
+        shift_attn: false
+        split_special_tokens: false
+        train_from_scratch: false
+        trust_remote_code: true
+        upcast_layernorm: false
+        upcast_lmhead_output: false
+        use_audio_in_video: false
+        use_fast_tokenizer: true
+        use_kt: false
+        use_kv_cache: true
+        use_reentrant_gc: true
+        use_unsloth: false
+        use_unsloth_gc: false
+        use_v1_kernels: false
+        video_fps: 2
+        video_max_pixels: 65536
+        video_maxlen: 128
+        video_min_pixels: 256
+        vllm_config: null
+        vllm_enforce_eager: false
+        vllm_gpu_util: 0.7
+        vllm_max_lora_rank: 32
+        vllm_maxlen: 4096
+model_type:
+    value: qwen3
+neftune_noise_alpha:
+    value: null
+num_attention_heads:
+    value: 32
+num_hidden_layers:
+    value: 36
+num_key_value_heads:
+    value: 8
+num_train_epochs:
+    value: 5
+optim:
+    value: adamw_torch
+optim_args:
+    value: null
+optim_target_modules:
+    value: null
+output_attentions:
+    value: false
+output_dir:
+    value: /workspace/v127rc_exp1/E_dup
+output_hidden_states:
+    value: false
+overwrite_output_dir:
+    value: false
+pad_token_id:
+    value: 151643
+parallelism_config:
+    value: null
+peft_config:
+    value:
+        default:
+            alora_invocation_tokens: null
+            arrow_config: null
+            auto_mapping: null
+            base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+            bias: none
+            corda_config: null
+            ensure_weight_tying: false
+            eva_config: null
+            exclude_modules: null
+            fan_in_fan_out: false
+            inference_mode: false
+            init_lora_weights: true
+            layer_replication: null
+            layers_pattern: null
+            layers_to_transform: null
+            lora_alpha: 32
+            lora_bias: false
+            lora_dropout: 0.03
+            megatron_config: null
+            megatron_core: megatron.core
+            modules_to_save: null
+            peft_type: LORA
+            peft_version: 0.18.1
+            qalora_group_size: 16
+            r: 16
+            revision: null
+            runtime_config:
+                ephemeral_gpu_offload: false
+            target_modules:
+                - up_proj
+                - q_proj
+                - k_proj
+                - down_proj
+                - gate_proj
+                - o_proj
+                - v_proj
+            target_parameters: null
+            task_type: CAUSAL_LM
+            trainable_token_indices: null
+            use_dora: false
+            use_qalora: false
+            use_rslora: false
+per_device_eval_batch_size:
+    value: 8
+per_device_train_batch_size:
+    value: 1
+predict_with_generate:
+    value: false
+prediction_loss_only:
+    value: false
+problem_type:
+    value: null
+project:
+    value: huggingface
+push_to_hub:
+    value: false
+ray_init_kwargs:
+    value: null
+ray_num_workers:
+    value: 1
+remove_unused_columns:
+    value: false
+report_to:
+    value:
+        - wandb
+restore_callback_states_from_checkpoint:
+    value: false
+resume_from_checkpoint:
+    value: null
+return_dict:
+    value: true
+rms_norm_eps:
+    value: 1e-06
+rope_parameters:
+    value:
+        rope_theta: 1000000
+        rope_type: default
+run_name:
+    value: null
+save_on_each_node:
+    value: false
+save_only_model:
+    value: true
+save_steps:
+    value: 1000
+save_strategy:
+    value: steps
+save_total_limit:
+    value: null
+seed:
+    value: 42
+skip_memory_metrics:
+    value: true
+sliding_window:
+    value: null
+sortish_sampler:
+    value: false
+tf32:
+    value: null
+tie_word_embeddings:
+    value: false
+torch_compile:
+    value: false
+torch_compile_backend:
+    value: null
+torch_compile_mode:
+    value: null
+torch_empty_cache_steps:
+    value: null
+trackio_space_id:
+    value: trackio
+transformers_version:
+    value: 5.0.0
+use_cache:
+    value: false
+use_cpu:
+    value: false
+use_liger_kernel:
+    value: false
+use_sliding_window:
+    value: false
+vocab_size:
+    value: 151936
+warmup_ratio:
+    value: 0.02
+warmup_steps:
+    value: 0.02
+weight_decay:
+    value: 0

LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,257 @@

+pytz==2025.2
+pydub==0.25.1
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.6.0
+websockets==15.0.1
+tzdata==2025.3
+typing_extensions==4.15.0
+tqdm==4.67.3
+tomlkit==0.13.3
+termcolor==3.3.0
+shtab==1.8.0
+shellingham==1.5.4
+sentencepiece==0.2.1
+semantic-version==2.10.0
+safetensors==0.7.0
+ruff==0.15.0
+regex==2026.1.15
+python-multipart==0.0.22
+pyparsing==3.3.2
+pyarrow==23.0.0
+protobuf==6.33.5
+propcache==0.4.1
+orjson==3.11.7
+omegaconf==2.3.0
+numpy==2.4.2
+multidict==6.7.1
+mdurl==0.1.2
+kiwisolver==1.4.9
+hf-xet==1.2.0
+hf_transfer==0.1.9
+groovy==0.1.2
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.2
+docstring_parser==0.17.0
+dill==0.3.8
+cycler==0.12.1
+click==8.3.1
+av==16.0.0
+annotated-types==0.7.0
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+yarl==1.22.0
+uvicorn==0.40.0
+typing-inspection==0.4.2
+typer-slim==0.21.1
+tiktoken==0.12.0
+scipy==1.17.0
+pydantic_core==2.41.4
+pandas==2.3.3
+multiprocess==0.70.16
+modelscope==1.34.0
+markdown-it-py==4.0.0
+fire==0.7.1
+contourpy==1.3.3
+anyio==4.12.1
+aiosignal==1.4.0
+starlette==0.50.0
+rich==14.3.2
+pydantic==2.12.3
+matplotlib==3.10.8
+aiohttp==3.13.3
+tyro==0.8.14
+typer==0.21.1
+torchdata==0.11.0
+sse-starlette==3.2.0
+safehttpx==0.1.7
+huggingface_hub==1.3.7
+fastapi==0.128.0
+tokenizers==0.22.2
+gradio_client==1.14.0
+datasets==4.0.0
+accelerate==1.11.0
+transformers==5.0.0
+gradio==5.50.0
+trl==0.24.0
+peft==0.18.1
+llamafactory==0.9.5.dev0
+jieba==0.42.1
+rouge-chinese==1.0.3
+joblib==1.5.3
+nltk==3.9.2
+py-cpuinfo==9.0.0
+nvidia-ml-py==13.590.48
+hjson==3.1.0
+ninja==1.13.0
+msgpack==1.1.2
+deepspeed==0.16.9
+smmap==5.0.2
+sentry-sdk==2.51.0
+gitdb==4.0.12
+GitPython==3.1.46
+wandb==0.24.1
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
+blinker==1.4
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2

LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "os":  "Linux-6.8.0-64-generic-x86_64-with-glibc2.35",
+  "python":  "CPython 3.11.10",
+  "startedAt":  "2026-02-04T09:03:21.035088Z",
+  "args":  [
+    "/workspace/v127rc_exp1/E_dup.yaml"
+  ],
+  "program":  "/usr/local/bin/llamafactory-cli",
+  "git":  {
+    "remote":  "https://github.com/hiyouga/LlamaFactory.git",
+    "commit":  "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email":  "markmochi200@gmail.com",
+  "root":  "/workspace/LlamaFactory",
+  "host":  "9acfbb3ac08f",
+  "executable":  "/usr/bin/python",
+  "cpu_count":  16,
+  "cpu_count_logical":  32,
+  "gpu":  "NVIDIA GeForce RTX 4090",
+  "gpu_count":  1,
+  "disk":  {
+    "/":  {
+      "total":  "21474836480",
+      "used":  "2198335488"
+    }
+  },
+  "memory":  {
+    "total":  "134123917312"
+  },
+  "gpu_nvidia":  [
+    {
+      "name":  "NVIDIA GeForce RTX 4090",
+      "memoryTotal":  "25757220864",
+      "cudaCores":  16384,
+      "architecture":  "Ada",
+      "uuid":  "GPU-342e702b-1bb8-fdbf-cf79-a03d57a59072"
+    }
+  ],
+  "cudaVersion":  "12.9",
+  "writerId":  "km795qg4wugx2xk47glqbs7x5abb2ilt"
+}

LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"train_runtime":75825.2674,"train/num_input_tokens_seen":151989750,"_timestamp":1.7702716258520179e+09,"train/train_tokens_per_second":2004.516,"total_flos":6.94172372053248e+18,"train/epoch":5,"train/loss":0.02155970223248005,"train_loss":0.048330643215257464,"_runtime":75825,"train_steps_per_second":0.979,"train/global_step":74250,"train/learning_rate":2.3300469886855526e-14,"train/grad_norm":0.11816766858100891,"_step":74250,"_wandb":{"runtime":75825},"train_samples_per_second":0.979}

LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,12 @@

+{"time":"2026-02-04T09:03:21.282329291Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
+{"time":"2026-02-04T09:03:21.632244677Z","level":"INFO","msg":"stream: created new stream","id":"9xr67hqd"}
+{"time":"2026-02-04T09:03:21.632659472Z","level":"INFO","msg":"handler: started","stream_id":"9xr67hqd"}
+{"time":"2026-02-04T09:03:21.634880563Z","level":"INFO","msg":"stream: started","id":"9xr67hqd"}
+{"time":"2026-02-04T09:03:21.634903075Z","level":"INFO","msg":"writer: started","stream_id":"9xr67hqd"}
+{"time":"2026-02-04T09:03:21.634920297Z","level":"INFO","msg":"sender: started","stream_id":"9xr67hqd"}
+{"time":"2026-02-05T00:58:07.192823728Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/9xr67hqd/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
+{"time":"2026-02-05T06:07:07.926217033Z","level":"INFO","msg":"stream: closing","id":"9xr67hqd"}
+{"time":"2026-02-05T06:07:09.870964601Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2026-02-05T06:07:10.109026941Z","level":"INFO","msg":"handler: closed","stream_id":"9xr67hqd"}
+{"time":"2026-02-05T06:07:10.114497568Z","level":"INFO","msg":"sender: closed","stream_id":"9xr67hqd"}
+{"time":"2026-02-05T06:07:10.114763144Z","level":"INFO","msg":"stream: closed","id":"9xr67hqd"}

LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log ADDED Viewed

	@@ -0,0 +1,25 @@

+2026-02-04 09:03:21,055 INFO    MainThread:4473 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
+2026-02-04 09:03:21,056 INFO    MainThread:4473 [wandb_setup.py:_flush():81] Configure stats pid to 4473
+2026-02-04 09:03:21,056 INFO    MainThread:4473 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-02-04 09:03:21,056 INFO    MainThread:4473 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log
+2026-02-04 09:03:21,057 INFO    MainThread:4473 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log
+2026-02-04 09:03:21,058 INFO    MainThread:4473 [wandb_init.py:init():844] calling init triggers
+2026-02-04 09:03:21,058 INFO    MainThread:4473 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-02-04 09:03:21,059 INFO    MainThread:4473 [wandb_init.py:init():892] starting backend
+2026-02-04 09:03:21,273 INFO    MainThread:4473 [wandb_init.py:init():895] sending inform_init request
+2026-02-04 09:03:21,279 INFO    MainThread:4473 [wandb_init.py:init():903] backend started and connected
+2026-02-04 09:03:21,282 INFO    MainThread:4473 [wandb_init.py:init():973] updated telemetry
+2026-02-04 09:03:21,345 INFO    MainThread:4473 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-02-04 09:03:21,944 INFO    MainThread:4473 [wandb_init.py:init():1042] starting run threads in backend
+2026-02-04 09:03:22,035 INFO    MainThread:4473 [wandb_run.py:_console_start():2529] atexit reg
+2026-02-04 09:03:22,035 INFO    MainThread:4473 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+2026-02-04 09:03:22,036 INFO    MainThread:4473 [wandb_run.py:_redirect():2446] Wrapping output streams.
+2026-02-04 09:03:22,036 INFO    MainThread:4473 [wandb_run.py:_redirect():2469] Redirects installed.
+2026-02-04 09:03:22,039 INFO    MainThread:4473 [wandb_init.py:init():1082] run started, returning control to user process
+2026-02-04 09:03:22,040 INFO    MainThread:4473 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['up_proj', 'q_proj', 'k_proj', 'down_proj', 'gate_proj', 'o_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/E_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
+2026-02-04 09:03:22,047 INFO    MainThread:4473 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x79f04a51f450>>
+2026-02-04 09:03:22,048 INFO    MainThread:4473 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
+2026-02-04 09:03:22,050 INFO    MainThread:4473 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d119_r85'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
+2026-02-05 06:07:07,926 INFO    wandb-AsyncioManager-main:4473 [service_client.py:_forward_responses():94] Reached EOF.
+2026-02-05 06:07:07,926 INFO    wandb-AsyncioManager-main:4473 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.

LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/output.log ADDED Viewed

The diff for this file is too large to render. See raw diff

LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,257 @@

+pytz==2025.2
+pydub==0.25.1
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.6.0
+websockets==15.0.1
+tzdata==2025.3
+typing_extensions==4.15.0
+tqdm==4.67.3
+tomlkit==0.13.3
+termcolor==3.3.0
+shtab==1.8.0
+shellingham==1.5.4
+sentencepiece==0.2.1
+semantic-version==2.10.0
+safetensors==0.7.0
+ruff==0.15.0
+regex==2026.1.15
+python-multipart==0.0.22
+pyparsing==3.3.2
+pyarrow==23.0.0
+protobuf==6.33.5
+propcache==0.4.1
+orjson==3.11.7
+omegaconf==2.3.0
+numpy==2.4.2
+multidict==6.7.1
+mdurl==0.1.2
+kiwisolver==1.4.9
+hf-xet==1.2.0
+hf_transfer==0.1.9
+groovy==0.1.2
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.2
+docstring_parser==0.17.0
+dill==0.3.8
+cycler==0.12.1
+click==8.3.1
+av==16.0.0
+annotated-types==0.7.0
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+yarl==1.22.0
+uvicorn==0.40.0
+typing-inspection==0.4.2
+typer-slim==0.21.1
+tiktoken==0.12.0
+scipy==1.17.0
+pydantic_core==2.41.4
+pandas==2.3.3
+multiprocess==0.70.16
+modelscope==1.34.0
+markdown-it-py==4.0.0
+fire==0.7.1
+contourpy==1.3.3
+anyio==4.12.1
+aiosignal==1.4.0
+starlette==0.50.0
+rich==14.3.2
+pydantic==2.12.3
+matplotlib==3.10.8
+aiohttp==3.13.3
+tyro==0.8.14
+typer==0.21.1
+torchdata==0.11.0
+sse-starlette==3.2.0
+safehttpx==0.1.7
+huggingface_hub==1.4.0
+fastapi==0.128.1
+tokenizers==0.22.2
+gradio_client==1.14.0
+datasets==4.0.0
+accelerate==1.11.0
+transformers==5.0.0
+gradio==5.50.0
+trl==0.24.0
+peft==0.18.1
+llamafactory==0.9.5.dev0
+jieba==0.42.1
+rouge-chinese==1.0.3
+joblib==1.5.3
+nltk==3.9.2
+py-cpuinfo==9.0.0
+nvidia-ml-py==13.590.48
+hjson==3.1.0
+ninja==1.13.0
+msgpack==1.1.2
+deepspeed==0.16.9
+smmap==5.0.2
+sentry-sdk==2.52.0
+gitdb==4.0.12
+GitPython==3.1.46
+wandb==0.24.2
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+blinker==1.4
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2

LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "os": "Linux-6.8.0-52-generic-x86_64-with-glibc2.35",
+  "python": "CPython 3.11.10",
+  "startedAt": "2026-02-05T02:37:25.915817Z",
+  "args": [
+    "/workspace/v127rc_exp1/D_mul.yaml"
+  ],
+  "program": "/usr/local/bin/llamafactory-cli",
+  "git": {
+    "remote": "https://github.com/hiyouga/LlamaFactory.git",
+    "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email": "markmochi200@gmail.com",
+  "root": "/workspace/LlamaFactory",
+  "host": "a6086694d22a",
+  "executable": "/usr/bin/python",
+  "cpu_count": 24,
+  "cpu_count_logical": 48,
+  "gpu": "NVIDIA GeForce RTX 4090",
+  "gpu_count": 1,
+  "disk": {
+    "/": {
+      "total": "21474836480",
+      "used": "2604290048"
+    }
+  },
+  "memory": {
+    "total": "269721972736"
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada",
+      "uuid": "GPU-ff8ec606-2734-ef52-4257-850162397ce9"
+    }
+  ],
+  "cudaVersion": "12.7",
+  "writerId": "zh6rt3o374t2f5i8fr2iiq0hoyntbcfj"
+}

LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,6 @@

+{"time":"2026-02-05T02:37:26.155502518Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
+{"time":"2026-02-05T02:37:26.502201724Z","level":"INFO","msg":"stream: created new stream","id":"yz385gxb"}
+{"time":"2026-02-05T02:37:26.506421573Z","level":"INFO","msg":"handler: started","stream_id":"yz385gxb"}
+{"time":"2026-02-05T02:37:26.508247738Z","level":"INFO","msg":"stream: started","id":"yz385gxb"}
+{"time":"2026-02-05T02:37:26.508259425Z","level":"INFO","msg":"writer: started","stream_id":"yz385gxb"}
+{"time":"2026-02-05T02:37:26.508267638Z","level":"INFO","msg":"sender: started","stream_id":"yz385gxb"}

LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log ADDED Viewed

	@@ -0,0 +1,23 @@

+2026-02-05 02:37:25,931 INFO    MainThread:1076 [wandb_setup.py:_flush():81] Current SDK version is 0.24.2
+2026-02-05 02:37:25,932 INFO    MainThread:1076 [wandb_setup.py:_flush():81] Configure stats pid to 1076
+2026-02-05 02:37:25,932 INFO    MainThread:1076 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-02-05 02:37:25,932 INFO    MainThread:1076 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log
+2026-02-05 02:37:25,933 INFO    MainThread:1076 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log
+2026-02-05 02:37:25,933 INFO    MainThread:1076 [wandb_init.py:init():844] calling init triggers
+2026-02-05 02:37:25,933 INFO    MainThread:1076 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-02-05 02:37:25,933 INFO    MainThread:1076 [wandb_init.py:init():892] starting backend
+2026-02-05 02:37:26,147 INFO    MainThread:1076 [wandb_init.py:init():895] sending inform_init request
+2026-02-05 02:37:26,153 INFO    MainThread:1076 [wandb_init.py:init():903] backend started and connected
+2026-02-05 02:37:26,155 INFO    MainThread:1076 [wandb_init.py:init():973] updated telemetry
+2026-02-05 02:37:26,195 INFO    MainThread:1076 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-02-05 02:37:26,815 INFO    MainThread:1076 [wandb_init.py:init():1042] starting run threads in backend
+2026-02-05 02:37:26,893 INFO    MainThread:1076 [wandb_run.py:_console_start():2529] atexit reg
+2026-02-05 02:37:26,893 INFO    MainThread:1076 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+2026-02-05 02:37:26,893 INFO    MainThread:1076 [wandb_run.py:_redirect():2446] Wrapping output streams.
+2026-02-05 02:37:26,894 INFO    MainThread:1076 [wandb_run.py:_redirect():2469] Redirects installed.
+2026-02-05 02:37:26,896 INFO    MainThread:1076 [wandb_init.py:init():1082] run started, returning control to user process
+2026-02-05 02:37:26,897 INFO    MainThread:1076 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['q_proj', 'o_proj', 'gate_proj', 'down_proj', 'k_proj', 'up_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/D_mul', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
+2026-02-05 02:37:26,902 INFO    MainThread:1076 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7e1cb4c97d90>>
+2026-02-05 02:37:26,906 INFO    MainThread:1076 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
+2026-02-05 02:37:26,909 INFO    MainThread:1076 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t100_d0_r101'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}

LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/config.yaml ADDED Viewed

	@@ -0,0 +1,723 @@

+_name_or_path:
+    value: /workspace/Qwen/Qwen3-8B-Base
+_wandb:
+    value:
+        cli_version: 0.24.2
+        e:
+            be8ic28wchhzrbkqsu0bl7jl1lfwezfn:
+                args:
+                    - /workspace/v127rc_exp1/E_mul.yaml
+                cpu_count: 24
+                cpu_count_logical: 48
+                cudaVersion: "12.7"
+                disk:
+                    /:
+                        total: "21474836480"
+                        used: "2594168832"
+                email: markmochi200@gmail.com
+                executable: /usr/bin/python
+                git:
+                    commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
+                    remote: https://github.com/hiyouga/LlamaFactory.git
+                gpu: NVIDIA GeForce RTX 4090
+                gpu_count: 1
+                gpu_nvidia:
+                    - architecture: Ada
+                      cudaCores: 16384
+                      memoryTotal: "25757220864"
+                      name: NVIDIA GeForce RTX 4090
+                      uuid: GPU-f9c17fa7-295e-e688-fe65-f3659fffa9a3
+                host: 682d471c1c72
+                memory:
+                    total: "269721997312"
+                os: Linux-6.8.0-52-generic-x86_64-with-glibc2.35
+                program: /usr/local/bin/llamafactory-cli
+                python: CPython 3.11.10
+                root: /workspace/LlamaFactory
+                startedAt: "2026-02-05T02:37:31.256607Z"
+                writerId: be8ic28wchhzrbkqsu0bl7jl1lfwezfn
+        m:
+            - "1": train/global_step
+              "6":
+                - 3
+              "7": []
+            - "2": '*'
+              "5": 1
+              "6":
+                - 1
+              "7": []
+        python_version: 3.11.10
+        t:
+            "1":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "2":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "3":
+                - 7
+                - 19
+                - 62
+                - 66
+            "4": 3.11.10
+            "5": 0.24.2
+            "6": 5.0.0
+            "9":
+                "1": transformers_trainer
+            "12": 0.24.2
+            "13": linux-x86_64
+accelerator_config:
+    value:
+        dispatch_batches: null
+        even_batches: true
+        gradient_accumulation_kwargs: null
+        non_blocking: false
+        split_batches: false
+        use_seedable_sampler: true
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.95
+adam_epsilon:
+    value: 1e-08
+architectures:
+    value:
+        - Qwen3ForCausalLM
+attention_bias:
+    value: false
+attention_dropout:
+    value: 0
+auto_find_batch_size:
+    value: false
+average_tokens_across_devices:
+    value: true
+batch_eval_metrics:
+    value: false
+bf16:
+    value: true
+bf16_full_eval:
+    value: false
+bos_token_id:
+    value: null
+chunk_size_feed_forward:
+    value: 0
+data_args:
+    value:
+        buffer_size: 16384
+        cutoff_len: 2047
+        data_shared_file_system: false
+        dataset:
+            - Markie_Voss_t119_d0_r85
+        dataset_dir: /workspace/LlamaFactory/data
+        default_system: null
+        enable_thinking: false
+        eval_dataset: null
+        eval_num_beams: null
+        eval_on_each_dataset: false
+        ignore_pad_token_for_loss: true
+        interleave_probs: null
+        mask_history: false
+        max_samples: 100000000
+        media_dir: /workspace/LlamaFactory/data
+        mix_strategy: concat
+        neat_packing: false
+        overwrite_cache: false
+        packing: true
+        preprocessing_batch_size: 1000
+        preprocessing_num_workers: 16
+        streaming: false
+        template: qwen3_nothink
+        tokenized_path: null
+        tool_format: null
+        train_on_prompt: false
+        val_size: 0
+data_seed:
+    value: null
+dataloader_drop_last:
+    value: false
+dataloader_num_workers:
+    value: 0
+dataloader_persistent_workers:
+    value: false
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: null
+ddp_backend:
+    value: null
+ddp_broadcast_buffers:
+    value: null
+ddp_bucket_cap_mb:
+    value: null
+ddp_find_unused_parameters:
+    value: null
+ddp_timeout:
+    value: 180000000
+debug:
+    value: []
+deepspeed:
+    value: null
+disable_tqdm:
+    value: false
+do_eval:
+    value: false
+do_predict:
+    value: false
+do_train:
+    value: true
+dtype:
+    value: bfloat16
+enable_jit_checkpoint:
+    value: false
+eos_token_id:
+    value: 151645
+eval_accumulation_steps:
+    value: null
+eval_delay:
+    value: 0
+eval_do_concat_batches:
+    value: true
+eval_on_start:
+    value: false
+eval_steps:
+    value: null
+eval_strategy:
+    value: "no"
+eval_use_gather_object:
+    value: false
+finetuning_args:
+    value:
+        additional_target: null
+        apollo_layerwise: false
+        apollo_proj: random
+        apollo_proj_type: std
+        apollo_rank: 16
+        apollo_scale: 32
+        apollo_scale_front: false
+        apollo_scale_type: channel
+        apollo_target:
+            - all
+        apollo_update_interval: 200
+        badam_mask_mode: adjacent
+        badam_mode: layer
+        badam_start_block: null
+        badam_switch_interval: 50
+        badam_switch_mode: ascending
+        badam_update_ratio: 0.05
+        badam_verbose: 0
+        compute_accuracy: false
+        create_new_adapter: false
+        disable_shuffling: false
+        dpo_label_smoothing: 0
+        eaft_alpha: 1
+        early_stopping_steps: null
+        finetuning_type: lora
+        freeze_extra_modules: null
+        freeze_language_model: false
+        freeze_multi_modal_projector: true
+        freeze_trainable_layers: 2
+        freeze_trainable_modules:
+            - all
+        freeze_vision_tower: true
+        galore_layerwise: false
+        galore_proj_type: std
+        galore_rank: 16
+        galore_scale: 2
+        galore_target:
+            - all
+        galore_update_interval: 200
+        include_effective_tokens_per_second: false
+        kto_chosen_weight: 1
+        kto_rejected_weight: 1
+        ld_alpha: null
+        lora_alpha: 32
+        lora_dropout: 0.03
+        lora_rank: 16
+        lora_target:
+            - all
+        loraplus_lr_embedding: 1e-06
+        loraplus_lr_ratio: null
+        module_dropout: 0
+        oft_block_size: 32
+        oft_rank: 0
+        oft_target:
+            - all
+        pissa_convert: false
+        pissa_init: false
+        pissa_iter: 16
+        plot_loss: true
+        ppo_buffer_size: 1
+        ppo_epochs: 4
+        ppo_score_norm: false
+        ppo_target: 6
+        ppo_whiten_rewards: false
+        pref_bco_weight: 0
+        pref_beta: 0.1
+        pref_ftx: 0
+        pref_loss: sigmoid
+        pure_bf16: false
+        ref_model: null
+        ref_model_adapters: null
+        ref_model_quantization_bit: null
+        reward_model: null
+        reward_model_adapters: null
+        reward_model_quantization_bit: null
+        reward_model_type: lora
+        simpo_gamma: 0.5
+        stage: pt
+        swanlab_api_key: <SWANLAB_API_KEY>
+        swanlab_lark_secret: null
+        swanlab_lark_webhook_url: null
+        swanlab_logdir: null
+        swanlab_mode: cloud
+        swanlab_project: llamafactory
+        swanlab_run_name: null
+        swanlab_workspace: null
+        use_adam_mini: false
+        use_apollo: false
+        use_badam: false
+        use_dft_loss: false
+        use_dora: false
+        use_eaft_loss: false
+        use_galore: false
+        use_llama_pro: false
+        use_mca: false
+        use_muon: false
+        use_rslora: false
+        use_swanlab: false
+fp8:
+    value: false
+fp8_backend:
+    value: auto
+fp8_enable_fsdp_float8_all_gather:
+    value: false
+fp16:
+    value: false
+fp16_full_eval:
+    value: false
+fsdp:
+    value: []
+fsdp_config:
+    value:
+        min_num_params: 0
+        xla: false
+        xla_fsdp_grad_ckpt: false
+        xla_fsdp_v2: false
+full_determinism:
+    value: false
+generating_args:
+    value:
+        do_sample: true
+        length_penalty: 1
+        max_new_tokens: 1024
+        num_beams: 1
+        repetition_penalty: 1
+        skip_special_tokens: true
+        temperature: 0.95
+        top_k: 50
+        top_p: 0.7
+generation_config:
+    value: null
+generation_max_length:
+    value: 2047
+generation_num_beams:
+    value: null
+gradient_accumulation_steps:
+    value: 1
+gradient_checkpointing:
+    value: false
+gradient_checkpointing_kwargs:
+    value: null
+greater_is_better:
+    value: null
+group_by_length:
+    value: false
+head_dim:
+    value: 128
+hidden_act:
+    value: silu
+hidden_size:
+    value: 4096
+hub_always_push:
+    value: false
+hub_model_id:
+    value: null
+hub_private_repo:
+    value: null
+hub_revision:
+    value: null
+hub_strategy:
+    value: every_save
+hub_token:
+    value: <HUB_TOKEN>
+id2label:
+    value:
+        "0": LABEL_0
+        "1": LABEL_1
+ignore_data_skip:
+    value: false
+include_for_metrics:
+    value: []
+include_num_input_tokens_seen:
+    value: all
+initializer_range:
+    value: 0.02
+intermediate_size:
+    value: 12288
+is_encoder_decoder:
+    value: false
+label_names:
+    value:
+        - labels
+label_smoothing_factor:
+    value: 0
+label2id:
+    value:
+        LABEL_0: 0
+        LABEL_1: 1
+layer_types:
+    value:
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+learning_rate:
+    value: 5e-05
+length_column_name:
+    value: length
+liger_kernel_config:
+    value: null
+load_best_model_at_end:
+    value: false
+local_rank:
+    value: -1
+log_level:
+    value: passive
+log_level_replica:
+    value: warning
+log_on_each_node:
+    value: true
+logging_dir:
+    value: null
+logging_first_step:
+    value: false
+logging_nan_inf_filter:
+    value: true
+logging_steps:
+    value: 1
+logging_strategy:
+    value: steps
+lr_scheduler_kwargs:
+    value: null
+lr_scheduler_type:
+    value: cosine
+master_addr:
+    value: null
+master_port:
+    value: null
+max_grad_norm:
+    value: 1
+max_position_embeddings:
+    value: 32768
+max_steps:
+    value: -1
+max_window_layers:
+    value: 36
+metric_for_best_model:
+    value: null
+model/num_parameters:
+    value: 8234382336
+model_args:
+    value:
+        adapter_folder: null
+        adapter_name_or_path: null
+        add_special_tokens: null
+        add_tokens: null
+        audio_sampling_rate: 16000
+        block_diag_attn: false
+        cache_dir: null
+        chunk_size: 8192
+        compute_dtype: torch.bfloat16
+        cpu_infer: 32
+        crop_to_patches: false
+        device_map:
+            "": cuda:0
+        disable_gradient_checkpointing: false
+        double_quantization: true
+        enable_liger_kernel: false
+        export_device: cpu
+        export_dir: null
+        export_hub_model_id: null
+        export_legacy_format: false
+        export_quantization_bit: null
+        export_quantization_dataset: null
+        export_quantization_maxlen: 1024
+        export_quantization_nsamples: 128
+        export_size: 5
+        flash_attn: auto
+        hf_hub_token: <HF_HUB_TOKEN>
+        image_do_pan_and_scan: false
+        image_max_pixels: 589824
+        image_min_pixels: 1024
+        infer_backend: HF
+        infer_dtype: auto
+        init_special_tokens: noise_init
+        kt_force_think: false
+        kt_maxlen: 4096
+        kt_mode: normal
+        kt_optimize_rule: null
+        kt_use_cuda_graph: true
+        low_cpu_mem_usage: true
+        mixture_of_depths: null
+        mode: normal
+        model_max_length: 2047
+        model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+        model_revision: main
+        moe_aux_loss_coef: null
+        ms_hub_token: <MS_HUB_TOKEN>
+        new_special_tokens_config: null
+        offload_folder: offload
+        om_hub_token: <OM_HUB_TOKEN>
+        print_param_status: false
+        quantization_bit: null
+        quantization_device_map: null
+        quantization_method: BNB
+        quantization_type: nf4
+        resize_vocab: false
+        rope_scaling: null
+        sglang_config: null
+        sglang_lora_backend: triton
+        sglang_maxlen: 4096
+        sglang_mem_fraction: 0.7
+        sglang_tp_size: -1
+        shift_attn: false
+        split_special_tokens: false
+        train_from_scratch: false
+        trust_remote_code: true
+        upcast_layernorm: false
+        upcast_lmhead_output: false
+        use_audio_in_video: false
+        use_fast_tokenizer: true
+        use_kt: false
+        use_kv_cache: true
+        use_reentrant_gc: true
+        use_unsloth: false
+        use_unsloth_gc: false
+        use_v1_kernels: false
+        video_fps: 2
+        video_max_pixels: 65536
+        video_maxlen: 128
+        video_min_pixels: 256
+        vllm_config: null
+        vllm_enforce_eager: false
+        vllm_gpu_util: 0.7
+        vllm_max_lora_rank: 32
+        vllm_maxlen: 4096
+model_type:
+    value: qwen3
+neftune_noise_alpha:
+    value: null
+num_attention_heads:
+    value: 32
+num_hidden_layers:
+    value: 36
+num_key_value_heads:
+    value: 8
+num_train_epochs:
+    value: 5
+optim:
+    value: adamw_torch
+optim_args:
+    value: null
+optim_target_modules:
+    value: null
+output_attentions:
+    value: false
+output_dir:
+    value: /workspace/v127rc_exp1/E_mul
+output_hidden_states:
+    value: false
+overwrite_output_dir:
+    value: false
+pad_token_id:
+    value: 151643
+parallelism_config:
+    value: null
+peft_config:
+    value:
+        default:
+            alora_invocation_tokens: null
+            arrow_config: null
+            auto_mapping: null
+            base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+            bias: none
+            corda_config: null
+            ensure_weight_tying: false
+            eva_config: null
+            exclude_modules: null
+            fan_in_fan_out: false
+            inference_mode: false
+            init_lora_weights: true
+            layer_replication: null
+            layers_pattern: null
+            layers_to_transform: null
+            lora_alpha: 32
+            lora_bias: false
+            lora_dropout: 0.03
+            megatron_config: null
+            megatron_core: megatron.core
+            modules_to_save: null
+            peft_type: LORA
+            peft_version: 0.18.1
+            qalora_group_size: 16
+            r: 16
+            revision: null
+            runtime_config:
+                ephemeral_gpu_offload: false
+            target_modules:
+                - v_proj
+                - gate_proj
+                - o_proj
+                - up_proj
+                - k_proj
+                - down_proj
+                - q_proj
+            target_parameters: null
+            task_type: CAUSAL_LM
+            trainable_token_indices: null
+            use_dora: false
+            use_qalora: false
+            use_rslora: false
+per_device_eval_batch_size:
+    value: 8
+per_device_train_batch_size:
+    value: 1
+predict_with_generate:
+    value: false
+prediction_loss_only:
+    value: false
+problem_type:
+    value: null
+project:
+    value: huggingface
+push_to_hub:
+    value: false
+ray_init_kwargs:
+    value: null
+ray_num_workers:
+    value: 1
+remove_unused_columns:
+    value: false
+report_to:
+    value:
+        - wandb
+restore_callback_states_from_checkpoint:
+    value: false
+resume_from_checkpoint:
+    value: null
+return_dict:
+    value: true
+rms_norm_eps:
+    value: 1e-06
+rope_parameters:
+    value:
+        rope_theta: 1000000
+        rope_type: default
+run_name:
+    value: null
+save_on_each_node:
+    value: false
+save_only_model:
+    value: true
+save_steps:
+    value: 1000
+save_strategy:
+    value: steps
+save_total_limit:
+    value: null
+seed:
+    value: 42
+skip_memory_metrics:
+    value: true
+sliding_window:
+    value: null
+sortish_sampler:
+    value: false
+tf32:
+    value: null
+tie_word_embeddings:
+    value: false
+torch_compile:
+    value: false
+torch_compile_backend:
+    value: null
+torch_compile_mode:
+    value: null
+torch_empty_cache_steps:
+    value: null
+trackio_space_id:
+    value: trackio
+transformers_version:
+    value: 5.0.0
+use_cache:
+    value: false
+use_cpu:
+    value: false
+use_liger_kernel:
+    value: false
+use_sliding_window:
+    value: false
+vocab_size:
+    value: 151936
+warmup_ratio:
+    value: 0.02
+warmup_steps:
+    value: 0.02
+weight_decay:
+    value: 0

LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,257 @@

+pytz==2025.2
+pydub==0.25.1
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.6.0
+websockets==15.0.1
+tzdata==2025.3
+typing_extensions==4.15.0
+tqdm==4.67.3
+tomlkit==0.13.3
+termcolor==3.3.0
+shtab==1.8.0
+shellingham==1.5.4
+sentencepiece==0.2.1
+semantic-version==2.10.0
+safetensors==0.7.0
+ruff==0.15.0
+regex==2026.1.15
+python-multipart==0.0.22
+pyparsing==3.3.2
+pyarrow==23.0.0
+protobuf==6.33.5
+propcache==0.4.1
+orjson==3.11.7
+omegaconf==2.3.0
+numpy==2.4.2
+multidict==6.7.1
+mdurl==0.1.2
+kiwisolver==1.4.9
+hf-xet==1.2.0
+hf_transfer==0.1.9
+groovy==0.1.2
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.2
+docstring_parser==0.17.0
+dill==0.3.8
+cycler==0.12.1
+click==8.3.1
+av==16.0.0
+annotated-types==0.7.0
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+yarl==1.22.0
+uvicorn==0.40.0
+typing-inspection==0.4.2
+typer-slim==0.21.1
+tiktoken==0.12.0
+scipy==1.17.0
+pydantic_core==2.41.4
+pandas==2.3.3
+multiprocess==0.70.16
+modelscope==1.34.0
+markdown-it-py==4.0.0
+fire==0.7.1
+contourpy==1.3.3
+anyio==4.12.1
+aiosignal==1.4.0
+starlette==0.50.0
+rich==14.3.2
+pydantic==2.12.3
+matplotlib==3.10.8
+aiohttp==3.13.3
+tyro==0.8.14
+typer==0.21.1
+torchdata==0.11.0
+sse-starlette==3.2.0
+safehttpx==0.1.7
+huggingface_hub==1.4.0
+fastapi==0.128.1
+tokenizers==0.22.2
+gradio_client==1.14.0
+datasets==4.0.0
+accelerate==1.11.0
+transformers==5.0.0
+gradio==5.50.0
+trl==0.24.0
+peft==0.18.1
+llamafactory==0.9.5.dev0
+jieba==0.42.1
+rouge-chinese==1.0.3
+joblib==1.5.3
+nltk==3.9.2
+py-cpuinfo==9.0.0
+nvidia-ml-py==13.590.48
+hjson==3.1.0
+ninja==1.13.0
+msgpack==1.1.2
+deepspeed==0.16.9
+smmap==5.0.2
+sentry-sdk==2.52.0
+gitdb==4.0.12
+GitPython==3.1.46
+wandb==0.24.2
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+blinker==1.4
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2

LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "os": "Linux-6.8.0-52-generic-x86_64-with-glibc2.35",
+  "python": "CPython 3.11.10",
+  "startedAt": "2026-02-05T02:37:31.256607Z",
+  "args": [
+    "/workspace/v127rc_exp1/E_mul.yaml"
+  ],
+  "program": "/usr/local/bin/llamafactory-cli",
+  "git": {
+    "remote": "https://github.com/hiyouga/LlamaFactory.git",
+    "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email": "markmochi200@gmail.com",
+  "root": "/workspace/LlamaFactory",
+  "host": "682d471c1c72",
+  "executable": "/usr/bin/python",
+  "cpu_count": 24,
+  "cpu_count_logical": 48,
+  "gpu": "NVIDIA GeForce RTX 4090",
+  "gpu_count": 1,
+  "disk": {
+    "/": {
+      "total": "21474836480",
+      "used": "2594168832"
+    }
+  },
+  "memory": {
+    "total": "269721997312"
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada",
+      "uuid": "GPU-f9c17fa7-295e-e688-fe65-f3659fffa9a3"
+    }
+  ],
+  "cudaVersion": "12.7",
+  "writerId": "be8ic28wchhzrbkqsu0bl7jl1lfwezfn"
+}

LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"train_runtime":202598.5168,"train_samples_per_second":0.963,"_timestamp":1.770461649358481e+09,"_step":195010,"train/train_tokens_per_second":1970.359,"train/loss":0.7374985218048096,"train/grad_norm":2.825721025466919,"train/global_step":195010,"_runtime":202601,"_wandb":{"runtime":202601},"train/epoch":5,"total_flos":1.8231724481360794e+19,"train/learning_rate":3.3779062880157087e-15,"train_loss":0.3935867749506399,"train_steps_per_second":0.963,"train/num_input_tokens_seen":399185470}