Linksome commited on 19 days ago

Commit

26ca48f

verified ·

1 Parent(s): 676c99c

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/config.yaml +723 -0
LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/output.log +0 -0
LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/requirements.txt +257 -0
LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/wandb-metadata.json +41 -0
LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/wandb-summary.json +1 -0
LlamaFactory/wandb/run-20260209_081744-xzw1rig3/logs/debug-internal.log +11 -0
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/config.yaml +723 -0
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/output.log +122 -0
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/requirements.txt +257 -0
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/wandb-metadata.json +41 -0
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/wandb-summary.json +1 -0
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/logs/debug-internal.log +11 -0
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/logs/debug.log +25 -0
LlamaFactory/wandb/run-20260209_084647-55tyrmzu/run-55tyrmzu.wandb +0 -0
LlamaFactory/wandb/run-20260209_085051-sxxworn9/files/output.log +0 -0
LlamaFactory/wandb/run-20260209_085051-sxxworn9/files/wandb-metadata.json +41 -0
LlamaFactory/wandb/run-20260209_085051-sxxworn9/logs/debug-internal.log +11 -0
LlamaFactory/wandb/run-20260209_085051-sxxworn9/logs/debug.log +25 -0
v127rc_exp2/B_mup/checkpoint-12200/chat_template.jinja +85 -0
v127rc_exp2/B_mup/checkpoint-12200/tokenizer_config.json +19 -0
v127rc_exp2/B_mup/checkpoint-12300/README.md +208 -0
v127rc_exp2/B_mup/checkpoint-12300/adapter_config.json +46 -0
v127rc_exp2/B_mup/checkpoint-12300/chat_template.jinja +85 -0
v127rc_exp2/B_mup/checkpoint-12300/tokenizer_config.json +19 -0
v127rc_exp2/B_mup/checkpoint-12400/README.md +208 -0
v127rc_exp2/B_mup/checkpoint-12400/adapter_config.json +46 -0
v127rc_exp2/B_mup/checkpoint-12400/chat_template.jinja +85 -0
v127rc_exp2/B_mup/checkpoint-12400/tokenizer_config.json +19 -0
v127rc_exp2/B_mup/checkpoint-12400/trainer_state.json +0 -0
v127rc_exp2/B_mup/checkpoint-12500/README.md +208 -0
v127rc_exp2/B_mup/checkpoint-12500/adapter_config.json +46 -0
v127rc_exp2/B_mup/checkpoint-12500/chat_template.jinja +85 -0
v127rc_exp2/B_mup/checkpoint-12500/tokenizer_config.json +19 -0
v127rc_exp2/B_mup/checkpoint-12500/trainer_state.json +0 -0
v127rc_exp2/B_mup/checkpoint-12600/README.md +208 -0
v127rc_exp2/B_mup/checkpoint-12600/adapter_config.json +46 -0
v127rc_exp2/B_mup/checkpoint-12600/chat_template.jinja +85 -0
v127rc_exp2/B_mup/checkpoint-12600/tokenizer_config.json +19 -0
v127rc_exp2/B_mup/checkpoint-12600/trainer_state.json +0 -0
v127rc_exp2/B_mup/checkpoint-12700/README.md +208 -0
v127rc_exp2/B_mup/checkpoint-12700/adapter_config.json +46 -0
v127rc_exp2/B_mup/checkpoint-12700/chat_template.jinja +85 -0
v127rc_exp2/B_mup/checkpoint-12700/tokenizer_config.json +19 -0
v127rc_exp2/B_mup/checkpoint-12700/trainer_state.json +0 -0
v127rc_exp2/B_mup/checkpoint-12800/README.md +208 -0
v127rc_exp2/B_mup/checkpoint-12800/chat_template.jinja +85 -0
v127rc_exp2/B_mup/checkpoint-12800/trainer_state.json +0 -0
v127rc_exp2/B_mup/checkpoint-12900/adapter_config.json +46 -0
v127rc_exp2/B_mup/checkpoint-13100/README.md +208 -0
v127rc_exp2/B_mup/checkpoint-13100/adapter_config.json +46 -0

LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/config.yaml ADDED Viewed

	@@ -0,0 +1,723 @@

+_name_or_path:
+    value: /workspace/Qwen/Qwen3-8B-Base
+_wandb:
+    value:
+        cli_version: 0.24.2
+        e:
+            3z70993mxg1kv2jk7wj47jktzkvhjf6v:
+                args:
+                    - /workspace/v127rc_exp2/B_rep.yaml
+                cpu_count: 16
+                cpu_count_logical: 32
+                cudaVersion: "12.8"
+                disk:
+                    /:
+                        total: "21474836480"
+                        used: "2122248192"
+                email: markmochi200@gmail.com
+                executable: /usr/bin/python
+                git:
+                    commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
+                    remote: https://github.com/hiyouga/LlamaFactory.git
+                gpu: NVIDIA GeForce RTX 4090
+                gpu_count: 1
+                gpu_nvidia:
+                    - architecture: Ada
+                      cudaCores: 16384
+                      memoryTotal: "25757220864"
+                      name: NVIDIA GeForce RTX 4090
+                      uuid: GPU-26c0ff17-df36-8246-0397-3ffa6e3c714c
+                host: 7606c805827a
+                memory:
+                    total: "201667923968"
+                os: Linux-6.8.0-90-generic-x86_64-with-glibc2.35
+                program: /usr/local/bin/llamafactory-cli
+                python: CPython 3.11.10
+                root: /workspace/LlamaFactory
+                startedAt: "2026-02-09T08:17:44.060660Z"
+                writerId: 3z70993mxg1kv2jk7wj47jktzkvhjf6v
+        m:
+            - "1": train/global_step
+              "6":
+                - 3
+              "7": []
+            - "2": '*'
+              "5": 1
+              "6":
+                - 1
+              "7": []
+        python_version: 3.11.10
+        t:
+            "1":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "2":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "3":
+                - 7
+                - 19
+                - 62
+                - 66
+            "4": 3.11.10
+            "5": 0.24.2
+            "6": 5.0.0
+            "9":
+                "1": transformers_trainer
+            "12": 0.24.2
+            "13": linux-x86_64
+accelerator_config:
+    value:
+        dispatch_batches: null
+        even_batches: true
+        gradient_accumulation_kwargs: null
+        non_blocking: false
+        split_batches: false
+        use_seedable_sampler: true
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.95
+adam_epsilon:
+    value: 1e-08
+architectures:
+    value:
+        - Qwen3ForCausalLM
+attention_bias:
+    value: false
+attention_dropout:
+    value: 0
+auto_find_batch_size:
+    value: false
+average_tokens_across_devices:
+    value: true
+batch_eval_metrics:
+    value: false
+bf16:
+    value: true
+bf16_full_eval:
+    value: false
+bos_token_id:
+    value: null
+chunk_size_feed_forward:
+    value: 0
+data_args:
+    value:
+        buffer_size: 16384
+        cutoff_len: 2047
+        data_shared_file_system: false
+        dataset:
+            - Markie_Voss_d10000
+        dataset_dir: /workspace/LlamaFactory/data
+        default_system: null
+        enable_thinking: false
+        eval_dataset: null
+        eval_num_beams: null
+        eval_on_each_dataset: false
+        ignore_pad_token_for_loss: true
+        interleave_probs: null
+        mask_history: false
+        max_samples: 100000000
+        media_dir: /workspace/LlamaFactory/data
+        mix_strategy: concat
+        neat_packing: false
+        overwrite_cache: false
+        packing: true
+        preprocessing_batch_size: 1000
+        preprocessing_num_workers: 16
+        streaming: false
+        template: qwen3_nothink
+        tokenized_path: null
+        tool_format: null
+        train_on_prompt: false
+        val_size: 0
+data_seed:
+    value: null
+dataloader_drop_last:
+    value: false
+dataloader_num_workers:
+    value: 0
+dataloader_persistent_workers:
+    value: false
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: null
+ddp_backend:
+    value: null
+ddp_broadcast_buffers:
+    value: null
+ddp_bucket_cap_mb:
+    value: null
+ddp_find_unused_parameters:
+    value: null
+ddp_timeout:
+    value: 180000000
+debug:
+    value: []
+deepspeed:
+    value: null
+disable_tqdm:
+    value: false
+do_eval:
+    value: false
+do_predict:
+    value: false
+do_train:
+    value: true
+dtype:
+    value: bfloat16
+enable_jit_checkpoint:
+    value: false
+eos_token_id:
+    value: 151645
+eval_accumulation_steps:
+    value: null
+eval_delay:
+    value: 0
+eval_do_concat_batches:
+    value: true
+eval_on_start:
+    value: false
+eval_steps:
+    value: null
+eval_strategy:
+    value: "no"
+eval_use_gather_object:
+    value: false
+finetuning_args:
+    value:
+        additional_target: null
+        apollo_layerwise: false
+        apollo_proj: random
+        apollo_proj_type: std
+        apollo_rank: 16
+        apollo_scale: 32
+        apollo_scale_front: false
+        apollo_scale_type: channel
+        apollo_target:
+            - all
+        apollo_update_interval: 200
+        badam_mask_mode: adjacent
+        badam_mode: layer
+        badam_start_block: null
+        badam_switch_interval: 50
+        badam_switch_mode: ascending
+        badam_update_ratio: 0.05
+        badam_verbose: 0
+        compute_accuracy: false
+        create_new_adapter: false
+        disable_shuffling: false
+        dpo_label_smoothing: 0
+        eaft_alpha: 1
+        early_stopping_steps: null
+        finetuning_type: lora
+        freeze_extra_modules: null
+        freeze_language_model: false
+        freeze_multi_modal_projector: true
+        freeze_trainable_layers: 2
+        freeze_trainable_modules:
+            - all
+        freeze_vision_tower: true
+        galore_layerwise: false
+        galore_proj_type: std
+        galore_rank: 16
+        galore_scale: 2
+        galore_target:
+            - all
+        galore_update_interval: 200
+        include_effective_tokens_per_second: false
+        kto_chosen_weight: 1
+        kto_rejected_weight: 1
+        ld_alpha: null
+        lora_alpha: 32
+        lora_dropout: 0.03
+        lora_rank: 16
+        lora_target:
+            - all
+        loraplus_lr_embedding: 1e-06
+        loraplus_lr_ratio: null
+        module_dropout: 0
+        oft_block_size: 32
+        oft_rank: 0
+        oft_target:
+            - all
+        pissa_convert: false
+        pissa_init: false
+        pissa_iter: 16
+        plot_loss: true
+        ppo_buffer_size: 1
+        ppo_epochs: 4
+        ppo_score_norm: false
+        ppo_target: 6
+        ppo_whiten_rewards: false
+        pref_bco_weight: 0
+        pref_beta: 0.1
+        pref_ftx: 0
+        pref_loss: sigmoid
+        pure_bf16: false
+        ref_model: null
+        ref_model_adapters: null
+        ref_model_quantization_bit: null
+        reward_model: null
+        reward_model_adapters: null
+        reward_model_quantization_bit: null
+        reward_model_type: lora
+        simpo_gamma: 0.5
+        stage: pt
+        swanlab_api_key: <SWANLAB_API_KEY>
+        swanlab_lark_secret: null
+        swanlab_lark_webhook_url: null
+        swanlab_logdir: null
+        swanlab_mode: cloud
+        swanlab_project: llamafactory
+        swanlab_run_name: null
+        swanlab_workspace: null
+        use_adam_mini: false
+        use_apollo: false
+        use_badam: false
+        use_dft_loss: false
+        use_dora: false
+        use_eaft_loss: false
+        use_galore: false
+        use_llama_pro: false
+        use_mca: false
+        use_muon: false
+        use_rslora: false
+        use_swanlab: false
+fp8:
+    value: false
+fp8_backend:
+    value: auto
+fp8_enable_fsdp_float8_all_gather:
+    value: false
+fp16:
+    value: false
+fp16_full_eval:
+    value: false
+fsdp:
+    value: []
+fsdp_config:
+    value:
+        min_num_params: 0
+        xla: false
+        xla_fsdp_grad_ckpt: false
+        xla_fsdp_v2: false
+full_determinism:
+    value: false
+generating_args:
+    value:
+        do_sample: true
+        length_penalty: 1
+        max_new_tokens: 1024
+        num_beams: 1
+        repetition_penalty: 1
+        skip_special_tokens: true
+        temperature: 0.95
+        top_k: 50
+        top_p: 0.7
+generation_config:
+    value: null
+generation_max_length:
+    value: 2047
+generation_num_beams:
+    value: null
+gradient_accumulation_steps:
+    value: 1
+gradient_checkpointing:
+    value: false
+gradient_checkpointing_kwargs:
+    value: null
+greater_is_better:
+    value: null
+group_by_length:
+    value: false
+head_dim:
+    value: 128
+hidden_act:
+    value: silu
+hidden_size:
+    value: 4096
+hub_always_push:
+    value: false
+hub_model_id:
+    value: null
+hub_private_repo:
+    value: null
+hub_revision:
+    value: null
+hub_strategy:
+    value: every_save
+hub_token:
+    value: <HUB_TOKEN>
+id2label:
+    value:
+        "0": LABEL_0
+        "1": LABEL_1
+ignore_data_skip:
+    value: false
+include_for_metrics:
+    value: []
+include_num_input_tokens_seen:
+    value: all
+initializer_range:
+    value: 0.02
+intermediate_size:
+    value: 12288
+is_encoder_decoder:
+    value: false
+label_names:
+    value:
+        - labels
+label_smoothing_factor:
+    value: 0
+label2id:
+    value:
+        LABEL_0: 0
+        LABEL_1: 1
+layer_types:
+    value:
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+learning_rate:
+    value: 5e-05
+length_column_name:
+    value: length
+liger_kernel_config:
+    value: null
+load_best_model_at_end:
+    value: false
+local_rank:
+    value: -1
+log_level:
+    value: passive
+log_level_replica:
+    value: warning
+log_on_each_node:
+    value: true
+logging_dir:
+    value: null
+logging_first_step:
+    value: false
+logging_nan_inf_filter:
+    value: true
+logging_steps:
+    value: 1
+logging_strategy:
+    value: steps
+lr_scheduler_kwargs:
+    value: null
+lr_scheduler_type:
+    value: cosine
+master_addr:
+    value: null
+master_port:
+    value: null
+max_grad_norm:
+    value: 1
+max_position_embeddings:
+    value: 32768
+max_steps:
+    value: -1
+max_window_layers:
+    value: 36
+metric_for_best_model:
+    value: null
+model/num_parameters:
+    value: 8234382336
+model_args:
+    value:
+        adapter_folder: null
+        adapter_name_or_path: null
+        add_special_tokens: null
+        add_tokens: null
+        audio_sampling_rate: 16000
+        block_diag_attn: false
+        cache_dir: null
+        chunk_size: 8192
+        compute_dtype: torch.bfloat16
+        cpu_infer: 32
+        crop_to_patches: false
+        device_map:
+            "": cuda:0
+        disable_gradient_checkpointing: false
+        double_quantization: true
+        enable_liger_kernel: false
+        export_device: cpu
+        export_dir: null
+        export_hub_model_id: null
+        export_legacy_format: false
+        export_quantization_bit: null
+        export_quantization_dataset: null
+        export_quantization_maxlen: 1024
+        export_quantization_nsamples: 128
+        export_size: 5
+        flash_attn: auto
+        hf_hub_token: <HF_HUB_TOKEN>
+        image_do_pan_and_scan: false
+        image_max_pixels: 589824
+        image_min_pixels: 1024
+        infer_backend: HF
+        infer_dtype: auto
+        init_special_tokens: noise_init
+        kt_force_think: false
+        kt_maxlen: 4096
+        kt_mode: normal
+        kt_optimize_rule: null
+        kt_use_cuda_graph: true
+        low_cpu_mem_usage: true
+        mixture_of_depths: null
+        mode: normal
+        model_max_length: 2047
+        model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+        model_revision: main
+        moe_aux_loss_coef: null
+        ms_hub_token: <MS_HUB_TOKEN>
+        new_special_tokens_config: null
+        offload_folder: offload
+        om_hub_token: <OM_HUB_TOKEN>
+        print_param_status: false
+        quantization_bit: null
+        quantization_device_map: null
+        quantization_method: BNB
+        quantization_type: nf4
+        resize_vocab: false
+        rope_scaling: null
+        sglang_config: null
+        sglang_lora_backend: triton
+        sglang_maxlen: 4096
+        sglang_mem_fraction: 0.7
+        sglang_tp_size: -1
+        shift_attn: false
+        split_special_tokens: false
+        train_from_scratch: false
+        trust_remote_code: true
+        upcast_layernorm: false
+        upcast_lmhead_output: false
+        use_audio_in_video: false
+        use_fast_tokenizer: true
+        use_kt: false
+        use_kv_cache: true
+        use_reentrant_gc: true
+        use_unsloth: false
+        use_unsloth_gc: false
+        use_v1_kernels: false
+        video_fps: 2
+        video_max_pixels: 65536
+        video_maxlen: 128
+        video_min_pixels: 256
+        vllm_config: null
+        vllm_enforce_eager: false
+        vllm_gpu_util: 0.7
+        vllm_max_lora_rank: 32
+        vllm_maxlen: 4096
+model_type:
+    value: qwen3
+neftune_noise_alpha:
+    value: null
+num_attention_heads:
+    value: 32
+num_hidden_layers:
+    value: 36
+num_key_value_heads:
+    value: 8
+num_train_epochs:
+    value: 10
+optim:
+    value: adamw_torch
+optim_args:
+    value: null
+optim_target_modules:
+    value: null
+output_attentions:
+    value: false
+output_dir:
+    value: /workspace/v127rc_exp2/B_rep
+output_hidden_states:
+    value: false
+overwrite_output_dir:
+    value: false
+pad_token_id:
+    value: 151643
+parallelism_config:
+    value: null
+peft_config:
+    value:
+        default:
+            alora_invocation_tokens: null
+            arrow_config: null
+            auto_mapping: null
+            base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+            bias: none
+            corda_config: null
+            ensure_weight_tying: false
+            eva_config: null
+            exclude_modules: null
+            fan_in_fan_out: false
+            inference_mode: false
+            init_lora_weights: true
+            layer_replication: null
+            layers_pattern: null
+            layers_to_transform: null
+            lora_alpha: 32
+            lora_bias: false
+            lora_dropout: 0.03
+            megatron_config: null
+            megatron_core: megatron.core
+            modules_to_save: null
+            peft_type: LORA
+            peft_version: 0.18.1
+            qalora_group_size: 16
+            r: 16
+            revision: null
+            runtime_config:
+                ephemeral_gpu_offload: false
+            target_modules:
+                - q_proj
+                - down_proj
+                - up_proj
+                - gate_proj
+                - k_proj
+                - v_proj
+                - o_proj
+            target_parameters: null
+            task_type: CAUSAL_LM
+            trainable_token_indices: null
+            use_dora: false
+            use_qalora: false
+            use_rslora: false
+per_device_eval_batch_size:
+    value: 8
+per_device_train_batch_size:
+    value: 1
+predict_with_generate:
+    value: false
+prediction_loss_only:
+    value: false
+problem_type:
+    value: null
+project:
+    value: huggingface
+push_to_hub:
+    value: false
+ray_init_kwargs:
+    value: null
+ray_num_workers:
+    value: 1
+remove_unused_columns:
+    value: false
+report_to:
+    value:
+        - wandb
+restore_callback_states_from_checkpoint:
+    value: false
+resume_from_checkpoint:
+    value: null
+return_dict:
+    value: true
+rms_norm_eps:
+    value: 1e-06
+rope_parameters:
+    value:
+        rope_theta: 1000000
+        rope_type: default
+run_name:
+    value: null
+save_on_each_node:
+    value: false
+save_only_model:
+    value: true
+save_steps:
+    value: 1000
+save_strategy:
+    value: steps
+save_total_limit:
+    value: null
+seed:
+    value: 42
+skip_memory_metrics:
+    value: true
+sliding_window:
+    value: null
+sortish_sampler:
+    value: false
+tf32:
+    value: null
+tie_word_embeddings:
+    value: false
+torch_compile:
+    value: false
+torch_compile_backend:
+    value: null
+torch_compile_mode:
+    value: null
+torch_empty_cache_steps:
+    value: null
+trackio_space_id:
+    value: trackio
+transformers_version:
+    value: 5.0.0
+use_cache:
+    value: false
+use_cpu:
+    value: false
+use_liger_kernel:
+    value: false
+use_sliding_window:
+    value: false
+vocab_size:
+    value: 151936
+warmup_ratio:
+    value: 0.01
+warmup_steps:
+    value: 0.01
+weight_decay:
+    value: 0

LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/output.log ADDED Viewed

The diff for this file is too large to render. See raw diff

LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,257 @@

+pytz==2025.2
+pydub==0.25.1
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.6.0
+websockets==15.0.1
+tzdata==2025.3
+typing_extensions==4.15.0
+tqdm==4.67.3
+tomlkit==0.13.3
+termcolor==3.3.0
+shtab==1.8.0
+shellingham==1.5.4
+sentencepiece==0.2.1
+semantic-version==2.10.0
+safetensors==0.7.0
+ruff==0.15.0
+regex==2026.1.15
+python-multipart==0.0.22
+pyparsing==3.3.2
+pyarrow==23.0.0
+protobuf==6.33.5
+propcache==0.4.1
+orjson==3.11.7
+omegaconf==2.3.0
+numpy==2.4.2
+multidict==6.7.1
+mdurl==0.1.2
+kiwisolver==1.4.9
+hf-xet==1.2.0
+hf_transfer==0.1.9
+groovy==0.1.2
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.2
+docstring_parser==0.17.0
+dill==0.3.8
+cycler==0.12.1
+click==8.3.1
+av==16.0.0
+annotated-types==0.7.0
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+yarl==1.22.0
+uvicorn==0.40.0
+typing-inspection==0.4.2
+typer-slim==0.21.1
+tiktoken==0.12.0
+scipy==1.17.0
+pydantic_core==2.41.4
+pandas==2.3.3
+multiprocess==0.70.16
+modelscope==1.34.0
+markdown-it-py==4.0.0
+fire==0.7.1
+contourpy==1.3.3
+anyio==4.12.1
+aiosignal==1.4.0
+starlette==0.52.1
+rich==14.3.2
+pydantic==2.12.3
+matplotlib==3.10.8
+aiohttp==3.13.3
+tyro==0.8.14
+typer==0.21.1
+torchdata==0.11.0
+sse-starlette==3.2.0
+safehttpx==0.1.7
+huggingface_hub==1.4.1
+fastapi==0.128.5
+tokenizers==0.22.2
+gradio_client==1.14.0
+datasets==4.0.0
+accelerate==1.11.0
+transformers==5.0.0
+gradio==5.50.0
+trl==0.24.0
+peft==0.18.1
+llamafactory==0.9.5.dev0
+jieba==0.42.1
+rouge-chinese==1.0.3
+joblib==1.5.3
+nltk==3.9.2
+py-cpuinfo==9.0.0
+nvidia-ml-py==13.590.48
+hjson==3.1.0
+ninja==1.13.0
+msgpack==1.1.2
+deepspeed==0.16.9
+smmap==5.0.2
+sentry-sdk==2.52.0
+gitdb==4.0.12
+GitPython==3.1.46
+wandb==0.24.2
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+blinker==1.4
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2

LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "os": "Linux-6.8.0-90-generic-x86_64-with-glibc2.35",
+  "python": "CPython 3.11.10",
+  "startedAt": "2026-02-09T08:17:44.060660Z",
+  "args": [
+    "/workspace/v127rc_exp2/B_rep.yaml"
+  ],
+  "program": "/usr/local/bin/llamafactory-cli",
+  "git": {
+    "remote": "https://github.com/hiyouga/LlamaFactory.git",
+    "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email": "markmochi200@gmail.com",
+  "root": "/workspace/LlamaFactory",
+  "host": "7606c805827a",
+  "executable": "/usr/bin/python",
+  "cpu_count": 16,
+  "cpu_count_logical": 32,
+  "gpu": "NVIDIA GeForce RTX 4090",
+  "gpu_count": 1,
+  "disk": {
+    "/": {
+      "total": "21474836480",
+      "used": "2122248192"
+    }
+  },
+  "memory": {
+    "total": "201667923968"
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada",
+      "uuid": "GPU-26c0ff17-df36-8246-0397-3ffa6e3c714c"
+    }
+  ],
+  "cudaVersion": "12.8",
+  "writerId": "3z70993mxg1kv2jk7wj47jktzkvhjf6v"
+}

LlamaFactory/wandb/run-20260209_081744-xzw1rig3/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_timestamp":1.7706260576274142e+09,"train/loss":0.025985639542341232,"train/grad_norm":0.4264875650405884,"train/global_step":964,"_step":963,"_runtime":993,"train/train_tokens_per_second":1985.261,"train_runtime":993.9791,"_wandb":{"runtime":993},"train/learning_rate":3.647727272727273e-05,"train/epoch":0.07303030303030303,"train/num_input_tokens_seen":1973308}

LlamaFactory/wandb/run-20260209_081744-xzw1rig3/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,11 @@

+{"time":"2026-02-09T08:17:44.318295701Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
+{"time":"2026-02-09T08:17:44.634393483Z","level":"INFO","msg":"stream: created new stream","id":"xzw1rig3"}
+{"time":"2026-02-09T08:17:44.634955402Z","level":"INFO","msg":"handler: started","stream_id":"xzw1rig3"}
+{"time":"2026-02-09T08:17:44.636745136Z","level":"INFO","msg":"stream: started","id":"xzw1rig3"}
+{"time":"2026-02-09T08:17:44.636766656Z","level":"INFO","msg":"writer: started","stream_id":"xzw1rig3"}
+{"time":"2026-02-09T08:17:44.636798576Z","level":"INFO","msg":"sender: started","stream_id":"xzw1rig3"}
+{"time":"2026-02-09T08:34:18.628826866Z","level":"INFO","msg":"stream: closing","id":"xzw1rig3"}
+{"time":"2026-02-09T08:34:19.727529201Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2026-02-09T08:34:19.965370634Z","level":"INFO","msg":"handler: closed","stream_id":"xzw1rig3"}
+{"time":"2026-02-09T08:34:19.967970411Z","level":"INFO","msg":"sender: closed","stream_id":"xzw1rig3"}
+{"time":"2026-02-09T08:34:19.968897272Z","level":"INFO","msg":"stream: closed","id":"xzw1rig3"}

LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/config.yaml ADDED Viewed

	@@ -0,0 +1,723 @@

+_name_or_path:
+    value: /workspace/Qwen/Qwen3-8B-Base
+_wandb:
+    value:
+        cli_version: 0.24.2
+        e:
+            5l942me186lee9ffmegn06ghne5ypa8s:
+                args:
+                    - /workspace/v127rc_exp2/B_dup.yaml
+                cpu_count: 16
+                cpu_count_logical: 32
+                cudaVersion: "12.9"
+                disk:
+                    /:
+                        total: "21474836480"
+                        used: "2060419072"
+                email: markmochi200@gmail.com
+                executable: /usr/bin/python
+                git:
+                    commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
+                    remote: https://github.com/hiyouga/LlamaFactory.git
+                gpu: NVIDIA GeForce RTX 4090
+                gpu_count: 1
+                gpu_nvidia:
+                    - architecture: Ada
+                      cudaCores: 16384
+                      memoryTotal: "25757220864"
+                      name: NVIDIA GeForce RTX 4090
+                      uuid: GPU-6c1e98c2-1b34-cfd8-5de5-319e272f1d1e
+                host: 3bebe963f251
+                memory:
+                    total: "134156767232"
+                os: Linux-6.8.0-60-generic-x86_64-with-glibc2.35
+                program: /usr/local/bin/llamafactory-cli
+                python: CPython 3.11.10
+                root: /workspace/LlamaFactory
+                startedAt: "2026-02-09T08:46:47.557835Z"
+                writerId: 5l942me186lee9ffmegn06ghne5ypa8s
+        m:
+            - "1": train/global_step
+              "6":
+                - 3
+              "7": []
+            - "2": '*'
+              "5": 1
+              "6":
+                - 1
+              "7": []
+        python_version: 3.11.10
+        t:
+            "1":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "2":
+                - 1
+                - 11
+                - 41
+                - 49
+                - 51
+                - 71
+                - 84
+                - 98
+                - 105
+            "3":
+                - 7
+                - 19
+                - 62
+                - 66
+            "4": 3.11.10
+            "5": 0.24.2
+            "6": 5.0.0
+            "9":
+                "1": transformers_trainer
+            "12": 0.24.2
+            "13": linux-x86_64
+accelerator_config:
+    value:
+        dispatch_batches: null
+        even_batches: true
+        gradient_accumulation_kwargs: null
+        non_blocking: false
+        split_batches: false
+        use_seedable_sampler: true
+adam_beta1:
+    value: 0.9
+adam_beta2:
+    value: 0.95
+adam_epsilon:
+    value: 1e-08
+architectures:
+    value:
+        - Qwen3ForCausalLM
+attention_bias:
+    value: false
+attention_dropout:
+    value: 0
+auto_find_batch_size:
+    value: false
+average_tokens_across_devices:
+    value: true
+batch_eval_metrics:
+    value: false
+bf16:
+    value: true
+bf16_full_eval:
+    value: false
+bos_token_id:
+    value: null
+chunk_size_feed_forward:
+    value: 0
+data_args:
+    value:
+        buffer_size: 16384
+        cutoff_len: 2047
+        data_shared_file_system: false
+        dataset:
+            - Markie_Voss_t0_d34_r300
+        dataset_dir: /workspace/LlamaFactory/data
+        default_system: null
+        enable_thinking: false
+        eval_dataset: null
+        eval_num_beams: null
+        eval_on_each_dataset: false
+        ignore_pad_token_for_loss: true
+        interleave_probs: null
+        mask_history: false
+        max_samples: 100000000
+        media_dir: /workspace/LlamaFactory/data
+        mix_strategy: concat
+        neat_packing: false
+        overwrite_cache: false
+        packing: true
+        preprocessing_batch_size: 1000
+        preprocessing_num_workers: 16
+        streaming: false
+        template: qwen3_nothink
+        tokenized_path: null
+        tool_format: null
+        train_on_prompt: false
+        val_size: 0
+data_seed:
+    value: null
+dataloader_drop_last:
+    value: false
+dataloader_num_workers:
+    value: 0
+dataloader_persistent_workers:
+    value: false
+dataloader_pin_memory:
+    value: true
+dataloader_prefetch_factor:
+    value: null
+ddp_backend:
+    value: null
+ddp_broadcast_buffers:
+    value: null
+ddp_bucket_cap_mb:
+    value: null
+ddp_find_unused_parameters:
+    value: null
+ddp_timeout:
+    value: 180000000
+debug:
+    value: []
+deepspeed:
+    value: null
+disable_tqdm:
+    value: false
+do_eval:
+    value: false
+do_predict:
+    value: false
+do_train:
+    value: true
+dtype:
+    value: bfloat16
+enable_jit_checkpoint:
+    value: false
+eos_token_id:
+    value: 151645
+eval_accumulation_steps:
+    value: null
+eval_delay:
+    value: 0
+eval_do_concat_batches:
+    value: true
+eval_on_start:
+    value: false
+eval_steps:
+    value: null
+eval_strategy:
+    value: "no"
+eval_use_gather_object:
+    value: false
+finetuning_args:
+    value:
+        additional_target: null
+        apollo_layerwise: false
+        apollo_proj: random
+        apollo_proj_type: std
+        apollo_rank: 16
+        apollo_scale: 32
+        apollo_scale_front: false
+        apollo_scale_type: channel
+        apollo_target:
+            - all
+        apollo_update_interval: 200
+        badam_mask_mode: adjacent
+        badam_mode: layer
+        badam_start_block: null
+        badam_switch_interval: 50
+        badam_switch_mode: ascending
+        badam_update_ratio: 0.05
+        badam_verbose: 0
+        compute_accuracy: false
+        create_new_adapter: false
+        disable_shuffling: false
+        dpo_label_smoothing: 0
+        eaft_alpha: 1
+        early_stopping_steps: null
+        finetuning_type: lora
+        freeze_extra_modules: null
+        freeze_language_model: false
+        freeze_multi_modal_projector: true
+        freeze_trainable_layers: 2
+        freeze_trainable_modules:
+            - all
+        freeze_vision_tower: true
+        galore_layerwise: false
+        galore_proj_type: std
+        galore_rank: 16
+        galore_scale: 2
+        galore_target:
+            - all
+        galore_update_interval: 200
+        include_effective_tokens_per_second: false
+        kto_chosen_weight: 1
+        kto_rejected_weight: 1
+        ld_alpha: null
+        lora_alpha: 64
+        lora_dropout: 0.03
+        lora_rank: 32
+        lora_target:
+            - all
+        loraplus_lr_embedding: 1e-06
+        loraplus_lr_ratio: null
+        module_dropout: 0
+        oft_block_size: 32
+        oft_rank: 0
+        oft_target:
+            - all
+        pissa_convert: false
+        pissa_init: false
+        pissa_iter: 16
+        plot_loss: true
+        ppo_buffer_size: 1
+        ppo_epochs: 4
+        ppo_score_norm: false
+        ppo_target: 6
+        ppo_whiten_rewards: false
+        pref_bco_weight: 0
+        pref_beta: 0.1
+        pref_ftx: 0
+        pref_loss: sigmoid
+        pure_bf16: false
+        ref_model: null
+        ref_model_adapters: null
+        ref_model_quantization_bit: null
+        reward_model: null
+        reward_model_adapters: null
+        reward_model_quantization_bit: null
+        reward_model_type: lora
+        simpo_gamma: 0.5
+        stage: pt
+        swanlab_api_key: <SWANLAB_API_KEY>
+        swanlab_lark_secret: null
+        swanlab_lark_webhook_url: null
+        swanlab_logdir: null
+        swanlab_mode: cloud
+        swanlab_project: llamafactory
+        swanlab_run_name: null
+        swanlab_workspace: null
+        use_adam_mini: false
+        use_apollo: false
+        use_badam: false
+        use_dft_loss: false
+        use_dora: false
+        use_eaft_loss: false
+        use_galore: false
+        use_llama_pro: false
+        use_mca: false
+        use_muon: false
+        use_rslora: false
+        use_swanlab: false
+fp8:
+    value: false
+fp8_backend:
+    value: auto
+fp8_enable_fsdp_float8_all_gather:
+    value: false
+fp16:
+    value: false
+fp16_full_eval:
+    value: false
+fsdp:
+    value: []
+fsdp_config:
+    value:
+        min_num_params: 0
+        xla: false
+        xla_fsdp_grad_ckpt: false
+        xla_fsdp_v2: false
+full_determinism:
+    value: false
+generating_args:
+    value:
+        do_sample: true
+        length_penalty: 1
+        max_new_tokens: 1024
+        num_beams: 1
+        repetition_penalty: 1
+        skip_special_tokens: true
+        temperature: 0.95
+        top_k: 50
+        top_p: 0.7
+generation_config:
+    value: null
+generation_max_length:
+    value: 2047
+generation_num_beams:
+    value: null
+gradient_accumulation_steps:
+    value: 8
+gradient_checkpointing:
+    value: false
+gradient_checkpointing_kwargs:
+    value: null
+greater_is_better:
+    value: null
+group_by_length:
+    value: false
+head_dim:
+    value: 128
+hidden_act:
+    value: silu
+hidden_size:
+    value: 4096
+hub_always_push:
+    value: false
+hub_model_id:
+    value: null
+hub_private_repo:
+    value: null
+hub_revision:
+    value: null
+hub_strategy:
+    value: every_save
+hub_token:
+    value: <HUB_TOKEN>
+id2label:
+    value:
+        "0": LABEL_0
+        "1": LABEL_1
+ignore_data_skip:
+    value: false
+include_for_metrics:
+    value: []
+include_num_input_tokens_seen:
+    value: all
+initializer_range:
+    value: 0.02
+intermediate_size:
+    value: 12288
+is_encoder_decoder:
+    value: false
+label_names:
+    value:
+        - labels
+label_smoothing_factor:
+    value: 0
+label2id:
+    value:
+        LABEL_0: 0
+        LABEL_1: 1
+layer_types:
+    value:
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+        - full_attention
+learning_rate:
+    value: 0.0001
+length_column_name:
+    value: length
+liger_kernel_config:
+    value: null
+load_best_model_at_end:
+    value: false
+local_rank:
+    value: -1
+log_level:
+    value: passive
+log_level_replica:
+    value: warning
+log_on_each_node:
+    value: true
+logging_dir:
+    value: null
+logging_first_step:
+    value: false
+logging_nan_inf_filter:
+    value: true
+logging_steps:
+    value: 1
+logging_strategy:
+    value: steps
+lr_scheduler_kwargs:
+    value: null
+lr_scheduler_type:
+    value: cosine
+master_addr:
+    value: null
+master_port:
+    value: null
+max_grad_norm:
+    value: 1
+max_position_embeddings:
+    value: 32768
+max_steps:
+    value: -1
+max_window_layers:
+    value: 36
+metric_for_best_model:
+    value: null
+model/num_parameters:
+    value: 8278029312
+model_args:
+    value:
+        adapter_folder: null
+        adapter_name_or_path: null
+        add_special_tokens: null
+        add_tokens: null
+        audio_sampling_rate: 16000
+        block_diag_attn: false
+        cache_dir: null
+        chunk_size: 8192
+        compute_dtype: torch.bfloat16
+        cpu_infer: 32
+        crop_to_patches: false
+        device_map:
+            "": cuda:0
+        disable_gradient_checkpointing: false
+        double_quantization: true
+        enable_liger_kernel: false
+        export_device: cpu
+        export_dir: null
+        export_hub_model_id: null
+        export_legacy_format: false
+        export_quantization_bit: null
+        export_quantization_dataset: null
+        export_quantization_maxlen: 1024
+        export_quantization_nsamples: 128
+        export_size: 5
+        flash_attn: auto
+        hf_hub_token: <HF_HUB_TOKEN>
+        image_do_pan_and_scan: false
+        image_max_pixels: 589824
+        image_min_pixels: 1024
+        infer_backend: HF
+        infer_dtype: auto
+        init_special_tokens: noise_init
+        kt_force_think: false
+        kt_maxlen: 4096
+        kt_mode: normal
+        kt_optimize_rule: null
+        kt_use_cuda_graph: true
+        low_cpu_mem_usage: true
+        mixture_of_depths: null
+        mode: normal
+        model_max_length: 2047
+        model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+        model_revision: main
+        moe_aux_loss_coef: null
+        ms_hub_token: <MS_HUB_TOKEN>
+        new_special_tokens_config: null
+        offload_folder: offload
+        om_hub_token: <OM_HUB_TOKEN>
+        print_param_status: false
+        quantization_bit: null
+        quantization_device_map: null
+        quantization_method: BNB
+        quantization_type: nf4
+        resize_vocab: false
+        rope_scaling: null
+        sglang_config: null
+        sglang_lora_backend: triton
+        sglang_maxlen: 4096
+        sglang_mem_fraction: 0.7
+        sglang_tp_size: -1
+        shift_attn: false
+        split_special_tokens: false
+        train_from_scratch: false
+        trust_remote_code: true
+        upcast_layernorm: false
+        upcast_lmhead_output: false
+        use_audio_in_video: false
+        use_fast_tokenizer: true
+        use_kt: false
+        use_kv_cache: true
+        use_reentrant_gc: true
+        use_unsloth: false
+        use_unsloth_gc: false
+        use_v1_kernels: false
+        video_fps: 2
+        video_max_pixels: 65536
+        video_maxlen: 128
+        video_min_pixels: 256
+        vllm_config: null
+        vllm_enforce_eager: false
+        vllm_gpu_util: 0.7
+        vllm_max_lora_rank: 32
+        vllm_maxlen: 4096
+model_type:
+    value: qwen3
+neftune_noise_alpha:
+    value: null
+num_attention_heads:
+    value: 32
+num_hidden_layers:
+    value: 36
+num_key_value_heads:
+    value: 8
+num_train_epochs:
+    value: 10
+optim:
+    value: adamw_torch
+optim_args:
+    value: null
+optim_target_modules:
+    value: null
+output_attentions:
+    value: false
+output_dir:
+    value: /workspace/v127rc_exp2/B_dup
+output_hidden_states:
+    value: false
+overwrite_output_dir:
+    value: false
+pad_token_id:
+    value: 151643
+parallelism_config:
+    value: null
+peft_config:
+    value:
+        default:
+            alora_invocation_tokens: null
+            arrow_config: null
+            auto_mapping: null
+            base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
+            bias: none
+            corda_config: null
+            ensure_weight_tying: false
+            eva_config: null
+            exclude_modules: null
+            fan_in_fan_out: false
+            inference_mode: false
+            init_lora_weights: true
+            layer_replication: null
+            layers_pattern: null
+            layers_to_transform: null
+            lora_alpha: 64
+            lora_bias: false
+            lora_dropout: 0.03
+            megatron_config: null
+            megatron_core: megatron.core
+            modules_to_save: null
+            peft_type: LORA
+            peft_version: 0.18.1
+            qalora_group_size: 16
+            r: 32
+            revision: null
+            runtime_config:
+                ephemeral_gpu_offload: false
+            target_modules:
+                - o_proj
+                - gate_proj
+                - q_proj
+                - up_proj
+                - v_proj
+                - down_proj
+                - k_proj
+            target_parameters: null
+            task_type: CAUSAL_LM
+            trainable_token_indices: null
+            use_dora: false
+            use_qalora: false
+            use_rslora: false
+per_device_eval_batch_size:
+    value: 8
+per_device_train_batch_size:
+    value: 1
+predict_with_generate:
+    value: false
+prediction_loss_only:
+    value: false
+problem_type:
+    value: null
+project:
+    value: huggingface
+push_to_hub:
+    value: false
+ray_init_kwargs:
+    value: null
+ray_num_workers:
+    value: 1
+remove_unused_columns:
+    value: false
+report_to:
+    value:
+        - wandb
+restore_callback_states_from_checkpoint:
+    value: false
+resume_from_checkpoint:
+    value: null
+return_dict:
+    value: true
+rms_norm_eps:
+    value: 1e-06
+rope_parameters:
+    value:
+        rope_theta: 1000000
+        rope_type: default
+run_name:
+    value: null
+save_on_each_node:
+    value: false
+save_only_model:
+    value: true
+save_steps:
+    value: 10
+save_strategy:
+    value: steps
+save_total_limit:
+    value: null
+seed:
+    value: 42
+skip_memory_metrics:
+    value: true
+sliding_window:
+    value: null
+sortish_sampler:
+    value: false
+tf32:
+    value: null
+tie_word_embeddings:
+    value: false
+torch_compile:
+    value: false
+torch_compile_backend:
+    value: null
+torch_compile_mode:
+    value: null
+torch_empty_cache_steps:
+    value: null
+trackio_space_id:
+    value: trackio
+transformers_version:
+    value: 5.0.0
+use_cache:
+    value: false
+use_cpu:
+    value: false
+use_liger_kernel:
+    value: false
+use_sliding_window:
+    value: false
+vocab_size:
+    value: 151936
+warmup_ratio:
+    value: 0.01
+warmup_steps:
+    value: 0.01
+weight_decay:
+    value: 0.01

LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/output.log ADDED Viewed

	@@ -0,0 +1,122 @@

+  0%|                                                                                                                                                                                       | 0/13920 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
+  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
+  0%|                                                                                                                                                                           | 10/13920 [01:23<32:01:23,  8.29s/it][INFO|trainer.py:4115] 2026-02-09 08:48:11,523 >> Saving model checkpoint to /workspace/v127rc_exp2/B_dup/checkpoint-10
+{'loss': '1.213', 'grad_norm': '0.1718', 'learning_rate': '0', 'epoch': '0.0007184', 'num_input_tokens_seen': 16376, 'train_runtime': '11.31', 'train_tokens_per_second': '1448'}
+{'loss': '1.384', 'grad_norm': '0.1968', 'learning_rate': '7.143e-07', 'epoch': '0.001437', 'num_input_tokens_seen': 32752, 'train_runtime': '19.54', 'train_tokens_per_second': '1676'}
+{'loss': '1.234', 'grad_norm': '0.1806', 'learning_rate': '1.429e-06', 'epoch': '0.002155', 'num_input_tokens_seen': 49128, 'train_runtime': '27.79', 'train_tokens_per_second': '1768'}
+{'loss': '1.384', 'grad_norm': '0.2031', 'learning_rate': '2.143e-06', 'epoch': '0.002874', 'num_input_tokens_seen': 65504, 'train_runtime': '36.03', 'train_tokens_per_second': '1818'}
+{'loss': '1.48', 'grad_norm': '0.2195', 'learning_rate': '2.857e-06', 'epoch': '0.003592', 'num_input_tokens_seen': 81880, 'train_runtime': '44.3', 'train_tokens_per_second': '1848'}
+{'loss': '1.382', 'grad_norm': '0.2049', 'learning_rate': '3.571e-06', 'epoch': '0.00431', 'num_input_tokens_seen': 98256, 'train_runtime': '52.57', 'train_tokens_per_second': '1869'}
+{'loss': '1.717', 'grad_norm': '0.2322', 'learning_rate': '4.286e-06', 'epoch': '0.005029', 'num_input_tokens_seen': 114632, 'train_runtime': '60.85', 'train_tokens_per_second': '1884'}
+{'loss': '1.608', 'grad_norm': '0.1957', 'learning_rate': '5e-06', 'epoch': '0.005747', 'num_input_tokens_seen': 131008, 'train_runtime': '69.14', 'train_tokens_per_second': '1895'}
+{'loss': '1.435', 'grad_norm': '0.2099', 'learning_rate': '5.714e-06', 'epoch': '0.006466', 'num_input_tokens_seen': 147384, 'train_runtime': '77.43', 'train_tokens_per_second': '1903'}
+{'loss': '1.354', 'grad_norm': '0.185', 'learning_rate': '6.429e-06', 'epoch': '0.007184', 'num_input_tokens_seen': 163760, 'train_runtime': '85.72', 'train_tokens_per_second': '1910'}
+[INFO|configuration_utils.py:665] 2026-02-09 08:48:11,559 >> loading configuration file /workspace/Qwen/Qwen3-8B-Base/config.json
+[INFO|configuration_utils.py:739] 2026-02-09 08:48:11,559 >> Model config Qwen3Config {
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151643,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 12288,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "transformers_version": "5.0.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
+[INFO|tokenization_utils_base.py:3327] 2026-02-09 08:48:12,278 >> chat template saved in /workspace/v127rc_exp2/B_dup/checkpoint-10/chat_template.jinja
+[INFO|tokenization_utils_base.py:2181] 2026-02-09 08:48:12,287 >> tokenizer config file saved in /workspace/v127rc_exp2/B_dup/checkpoint-10/tokenizer_config.json
+  0%|▏                                                                                                                                                                          | 11/13920 [01:32<33:02:21,  8.55s/it]Traceback (most recent call last):
+{'loss': '1.429', 'grad_norm': '0.2128', 'learning_rate': '7.143e-06', 'epoch': '0.007902', 'num_input_tokens_seen': 180136, 'train_runtime': '94.87', 'train_tokens_per_second': '1899'}
+  File "/usr/local/bin/llamafactory-cli", line 8, in <module>
+    sys.exit(main())
+             ^^^^^^
+  File "/workspace/LlamaFactory/src/llamafactory/cli.py", line 24, in main
+    launcher.launch()
+  File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 157, in launch
+    run_exp()
+  File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 125, in run_exp
+    _training_function(config={"args": args, "callbacks": callbacks})
+  File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 91, in _training_function
+    run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
+  File "/workspace/LlamaFactory/src/llamafactory/train/pt/workflow.py", line 63, in run_pt
+    train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2174, in train
+    return inner_training_loop(
+           ^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2536, in _inner_training_loop
+    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
+                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3837, in training_step
+    self.accelerator.backward(loss, **kwargs)
+  File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2740, in backward
+    loss.backward(**kwargs)
+  File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 521, in backward
+    torch.autograd.backward(
+  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 289, in backward
+    _engine_run_backward(
+  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 769, in _engine_run_backward
+    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
+           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+KeyboardInterrupt

LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/requirements.txt ADDED Viewed

	@@ -0,0 +1,257 @@

+pytz==2025.2
+pydub==0.25.1
+brotli==1.2.0
+antlr4-python3-runtime==4.9.3
+xxhash==3.6.0
+websockets==15.0.1
+tzdata==2025.3
+typing_extensions==4.15.0
+tqdm==4.67.3
+tomlkit==0.13.3
+termcolor==3.3.0
+shtab==1.8.0
+shellingham==1.5.4
+sentencepiece==0.2.1
+semantic-version==2.10.0
+safetensors==0.7.0
+ruff==0.15.0
+regex==2026.1.15
+python-multipart==0.0.22
+pyparsing==3.3.2
+pyarrow==23.0.0
+protobuf==6.33.5
+propcache==0.4.1
+orjson==3.11.7
+omegaconf==2.3.0
+numpy==2.4.2
+multidict==6.7.1
+mdurl==0.1.2
+kiwisolver==1.4.9
+hf-xet==1.2.0
+hf_transfer==0.1.9
+groovy==0.1.2
+frozenlist==1.8.0
+fonttools==4.61.1
+ffmpy==1.0.0
+einops==0.8.2
+docstring_parser==0.17.0
+dill==0.3.8
+cycler==0.12.1
+click==8.3.1
+av==16.0.0
+annotated-types==0.7.0
+annotated-doc==0.0.4
+aiohappyeyeballs==2.6.1
+aiofiles==24.1.0
+yarl==1.22.0
+uvicorn==0.40.0
+typing-inspection==0.4.2
+typer-slim==0.21.1
+tiktoken==0.12.0
+scipy==1.17.0
+pydantic_core==2.41.4
+pandas==2.3.3
+multiprocess==0.70.16
+modelscope==1.34.0
+markdown-it-py==4.0.0
+fire==0.7.1
+contourpy==1.3.3
+anyio==4.12.1
+aiosignal==1.4.0
+starlette==0.52.1
+rich==14.3.2
+pydantic==2.12.3
+matplotlib==3.10.8
+aiohttp==3.13.3
+tyro==0.8.14
+typer==0.21.1
+torchdata==0.11.0
+sse-starlette==3.2.0
+safehttpx==0.1.7
+huggingface_hub==1.4.1
+fastapi==0.128.5
+tokenizers==0.22.2
+gradio_client==1.14.0
+datasets==4.0.0
+accelerate==1.11.0
+transformers==5.0.0
+gradio==5.50.0
+trl==0.24.0
+peft==0.18.1
+llamafactory==0.9.5.dev0
+jieba==0.42.1
+rouge-chinese==1.0.3
+joblib==1.5.3
+nltk==3.9.2
+py-cpuinfo==9.0.0
+nvidia-ml-py==13.590.48
+hjson==3.1.0
+ninja==1.13.0
+msgpack==1.1.2
+deepspeed==0.16.9
+smmap==5.0.2
+sentry-sdk==2.52.0
+gitdb==4.0.12
+GitPython==3.1.46
+wandb==0.24.2
+entrypoints==0.4
+jupyter_client==7.4.9
+nbclassic==1.1.0
+notebook==6.5.5
+pyzmq==24.0.1
+PyYAML==6.0.2
+Send2Trash==1.8.3
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.3.2
+comm==0.2.2
+debugpy==1.8.5
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fastjsonschema==2.20.0
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.27.0
+ipython-genutils==0.2.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter-archive==3.4.0
+jupyter_contrib_core==0.4.2
+jupyter_contrib_nbextensions==0.7.0
+jupyter_core==5.7.2
+jupyter-events==0.10.0
+jupyter-highlight-selected-word==0.2.0
+jupyter-lsp==2.2.5
+jupyter_nbextensions_configurator==0.6.4
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+lxml==5.3.0
+matplotlib-inline==0.1.7
+mistune==3.0.2
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook_shim==0.2.4
+overrides==7.7.0
+packaging==24.1
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.47
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.20.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.3.0
+tornado==6.4.1
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20240906
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+filelock==3.13.1
+fsspec==2024.2.0
+mpmath==1.3.0
+networkx==3.2.1
+nvidia-cublas-cu12==12.4.2.65
+nvidia-cuda-cupti-cu12==12.4.99
+nvidia-cuda-nvrtc-cu12==12.4.99
+nvidia-cuda-runtime-cu12==12.4.99
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.0.44
+nvidia-curand-cu12==10.3.5.119
+nvidia-cusolver-cu12==11.6.0.99
+nvidia-cusparse-cu12==12.3.0.142
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.4.99
+nvidia-nvtx-cu12==12.4.99
+pillow==10.2.0
+sympy==1.12
+torch==2.4.1+cu124
+torchaudio==2.4.1+cu124
+torchvision==0.19.1+cu124
+triton==3.0.0
+pip==24.2
+setuptools==75.1.0
+wheel==0.44.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+SecretStorage==3.3.1
+blinker==1.4
+cryptography==3.4.8
+dbus-python==1.2.18
+distro==1.7.0
+httplib2==0.20.2
+importlib-metadata==4.6.4
+jeepney==0.7.1
+keyring==23.5.0
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+more-itertools==8.10.0
+oauthlib==3.2.0
+python-apt==2.4.0+ubuntu4
+six==1.16.0
+wadllib==1.3.6
+zipp==1.0.0
+autocommand==2.2.2
+backports.tarfile==1.2.0
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+inflect==7.3.1
+jaraco.collections==5.1.0
+jaraco.context==5.3.0
+jaraco.functools==4.0.1
+jaraco.text==3.12.1
+more-itertools==10.3.0
+packaging==24.1
+platformdirs==4.2.2
+tomli==2.0.1
+typeguard==4.3.0
+typing_extensions==4.12.2
+wheel==0.43.0
+zipp==3.19.2

LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "os": "Linux-6.8.0-60-generic-x86_64-with-glibc2.35",
+  "python": "CPython 3.11.10",
+  "startedAt": "2026-02-09T08:46:47.557835Z",
+  "args": [
+    "/workspace/v127rc_exp2/B_dup.yaml"
+  ],
+  "program": "/usr/local/bin/llamafactory-cli",
+  "git": {
+    "remote": "https://github.com/hiyouga/LlamaFactory.git",
+    "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email": "markmochi200@gmail.com",
+  "root": "/workspace/LlamaFactory",
+  "host": "3bebe963f251",
+  "executable": "/usr/bin/python",
+  "cpu_count": 16,
+  "cpu_count_logical": 32,
+  "gpu": "NVIDIA GeForce RTX 4090",
+  "gpu_count": 1,
+  "disk": {
+    "/": {
+      "total": "21474836480",
+      "used": "2060419072"
+    }
+  },
+  "memory": {
+    "total": "134156767232"
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada",
+      "uuid": "GPU-6c1e98c2-1b34-cfd8-5de5-319e272f1d1e"
+    }
+  ],
+  "cudaVersion": "12.9",
+  "writerId": "5l942me186lee9ffmegn06ghne5ypa8s"
+}

LlamaFactory/wandb/run-20260209_084647-55tyrmzu/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"_step":10,"train/epoch":0.007902298850574713,"_runtime":94,"train/grad_norm":0.21283617615699768,"_timestamp":1.7706269006685286e+09,"train/global_step":11,"train/num_input_tokens_seen":180136,"_wandb":{"runtime":94},"train_runtime":94.8711,"train/learning_rate":7.142857142857143e-06,"train/loss":1.4294381141662598,"train/train_tokens_per_second":1898.744}

LlamaFactory/wandb/run-20260209_084647-55tyrmzu/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,11 @@

+{"time":"2026-02-09T08:46:47.817513188Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
+{"time":"2026-02-09T08:46:48.121590539Z","level":"INFO","msg":"stream: created new stream","id":"55tyrmzu"}
+{"time":"2026-02-09T08:46:48.12218375Z","level":"INFO","msg":"handler: started","stream_id":"55tyrmzu"}
+{"time":"2026-02-09T08:46:48.123357618Z","level":"INFO","msg":"stream: started","id":"55tyrmzu"}
+{"time":"2026-02-09T08:46:48.123366734Z","level":"INFO","msg":"writer: started","stream_id":"55tyrmzu"}
+{"time":"2026-02-09T08:46:48.123368438Z","level":"INFO","msg":"sender: started","stream_id":"55tyrmzu"}
+{"time":"2026-02-09T08:48:22.69328438Z","level":"INFO","msg":"stream: closing","id":"55tyrmzu"}
+{"time":"2026-02-09T08:48:23.219258177Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2026-02-09T08:48:23.424167617Z","level":"INFO","msg":"handler: closed","stream_id":"55tyrmzu"}
+{"time":"2026-02-09T08:48:23.429177461Z","level":"INFO","msg":"sender: closed","stream_id":"55tyrmzu"}
+{"time":"2026-02-09T08:48:23.429635912Z","level":"INFO","msg":"stream: closed","id":"55tyrmzu"}

LlamaFactory/wandb/run-20260209_084647-55tyrmzu/logs/debug.log ADDED Viewed

	@@ -0,0 +1,25 @@

+2026-02-09 08:46:47,588 INFO    MainThread:4723 [wandb_setup.py:_flush():81] Current SDK version is 0.24.2
+2026-02-09 08:46:47,588 INFO    MainThread:4723 [wandb_setup.py:_flush():81] Configure stats pid to 4723
+2026-02-09 08:46:47,589 INFO    MainThread:4723 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-02-09 08:46:47,590 INFO    MainThread:4723 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260209_084647-55tyrmzu/logs/debug.log
+2026-02-09 08:46:47,591 INFO    MainThread:4723 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260209_084647-55tyrmzu/logs/debug-internal.log
+2026-02-09 08:46:47,592 INFO    MainThread:4723 [wandb_init.py:init():844] calling init triggers
+2026-02-09 08:46:47,592 INFO    MainThread:4723 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-02-09 08:46:47,592 INFO    MainThread:4723 [wandb_init.py:init():892] starting backend
+2026-02-09 08:46:47,806 INFO    MainThread:4723 [wandb_init.py:init():895] sending inform_init request
+2026-02-09 08:46:47,815 INFO    MainThread:4723 [wandb_init.py:init():903] backend started and connected
+2026-02-09 08:46:47,817 INFO    MainThread:4723 [wandb_init.py:init():973] updated telemetry
+2026-02-09 08:46:47,886 INFO    MainThread:4723 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-02-09 08:46:48,406 INFO    MainThread:4723 [wandb_init.py:init():1042] starting run threads in backend
+2026-02-09 08:46:48,473 INFO    MainThread:4723 [wandb_run.py:_console_start():2529] atexit reg
+2026-02-09 08:46:48,474 INFO    MainThread:4723 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+2026-02-09 08:46:48,475 INFO    MainThread:4723 [wandb_run.py:_redirect():2446] Wrapping output streams.
+2026-02-09 08:46:48,475 INFO    MainThread:4723 [wandb_run.py:_redirect():2469] Redirects installed.
+2026-02-09 08:46:48,477 INFO    MainThread:4723 [wandb_init.py:init():1082] run started, returning control to user process
+2026-02-09 08:46:48,478 INFO    MainThread:4723 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': ['o_proj', 'gate_proj', 'q_proj', 'up_proj', 'v_proj', 'down_proj', 'k_proj'], 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 
'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp2/B_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 10, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.01, 'warmup_steps': 0.01, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 10, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 
'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 
'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
+2026-02-09 08:46:48,484 INFO    MainThread:4723 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8278029312 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7771762c0550>>
+2026-02-09 08:46:48,485 INFO    MainThread:4723 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8278029312 None
+2026-02-09 08:46:48,487 INFO    MainThread:4723 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 
'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d34_r300'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 64, 'lora_dropout': 0.03, 'lora_rank': 32, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': 
None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
+2026-02-09 08:48:22,693 INFO    wandb-AsyncioManager-main:4723 [service_client.py:_forward_responses():94] Reached EOF.
+2026-02-09 08:48:22,694 INFO    wandb-AsyncioManager-main:4723 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.

LlamaFactory/wandb/run-20260209_084647-55tyrmzu/run-55tyrmzu.wandb ADDED Viewed

Binary file (49.1 kB). View file

LlamaFactory/wandb/run-20260209_085051-sxxworn9/files/output.log ADDED Viewed

The diff for this file is too large to render. See raw diff

LlamaFactory/wandb/run-20260209_085051-sxxworn9/files/wandb-metadata.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "os": "Linux-6.8.0-60-generic-x86_64-with-glibc2.35",
+  "python": "CPython 3.11.10",
+  "startedAt": "2026-02-09T08:50:51.146337Z",
+  "args": [
+    "/workspace/v127rc_exp2/B_dup.yaml"
+  ],
+  "program": "/usr/local/bin/llamafactory-cli",
+  "git": {
+    "remote": "https://github.com/hiyouga/LlamaFactory.git",
+    "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
+  },
+  "email": "markmochi200@gmail.com",
+  "root": "/workspace/LlamaFactory",
+  "host": "3bebe963f251",
+  "executable": "/usr/bin/python",
+  "cpu_count": 16,
+  "cpu_count_logical": 32,
+  "gpu": "NVIDIA GeForce RTX 4090",
+  "gpu_count": 1,
+  "disk": {
+    "/": {
+      "total": "21474836480",
+      "used": "2060427264"
+    }
+  },
+  "memory": {
+    "total": "134156767232"
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA GeForce RTX 4090",
+      "memoryTotal": "25757220864",
+      "cudaCores": 16384,
+      "architecture": "Ada",
+      "uuid": "GPU-6c1e98c2-1b34-cfd8-5de5-319e272f1d1e"
+    }
+  ],
+  "cudaVersion": "12.9",
+  "writerId": "iuuq28fefy6u1tv2cie29tfnokxlsg0z"
+}

LlamaFactory/wandb/run-20260209_085051-sxxworn9/logs/debug-internal.log ADDED Viewed

	@@ -0,0 +1,11 @@

+{"time":"2026-02-09T08:50:51.398312029Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
+{"time":"2026-02-09T08:50:51.715694946Z","level":"INFO","msg":"stream: created new stream","id":"sxxworn9"}
+{"time":"2026-02-09T08:50:51.716325506Z","level":"INFO","msg":"handler: started","stream_id":"sxxworn9"}
+{"time":"2026-02-09T08:50:51.718352807Z","level":"INFO","msg":"stream: started","id":"sxxworn9"}
+{"time":"2026-02-09T08:50:51.718357797Z","level":"INFO","msg":"writer: started","stream_id":"sxxworn9"}
+{"time":"2026-02-09T08:50:51.718367484Z","level":"INFO","msg":"sender: started","stream_id":"sxxworn9"}
+{"time":"2026-02-10T17:01:06.72420802Z","level":"INFO","msg":"stream: closing","id":"sxxworn9"}
+{"time":"2026-02-10T17:01:08.23332074Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2026-02-10T17:01:08.470443353Z","level":"INFO","msg":"handler: closed","stream_id":"sxxworn9"}
+{"time":"2026-02-10T17:01:08.474505531Z","level":"INFO","msg":"sender: closed","stream_id":"sxxworn9"}
+{"time":"2026-02-10T17:01:08.474851934Z","level":"INFO","msg":"stream: closed","id":"sxxworn9"}

LlamaFactory/wandb/run-20260209_085051-sxxworn9/logs/debug.log ADDED Viewed

	@@ -0,0 +1,25 @@

+2026-02-09 08:50:51,173 INFO    MainThread:5887 [wandb_setup.py:_flush():81] Current SDK version is 0.24.2
+2026-02-09 08:50:51,174 INFO    MainThread:5887 [wandb_setup.py:_flush():81] Configure stats pid to 5887
+2026-02-09 08:50:51,174 INFO    MainThread:5887 [wandb_setup.py:_flush():81] Loading settings from environment variables
+2026-02-09 08:50:51,175 INFO    MainThread:5887 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260209_085051-sxxworn9/logs/debug.log
+2026-02-09 08:50:51,176 INFO    MainThread:5887 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260209_085051-sxxworn9/logs/debug-internal.log
+2026-02-09 08:50:51,176 INFO    MainThread:5887 [wandb_init.py:init():844] calling init triggers
+2026-02-09 08:50:51,176 INFO    MainThread:5887 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2026-02-09 08:50:51,177 INFO    MainThread:5887 [wandb_init.py:init():892] starting backend
+2026-02-09 08:50:51,387 INFO    MainThread:5887 [wandb_init.py:init():895] sending inform_init request
+2026-02-09 08:50:51,395 INFO    MainThread:5887 [wandb_init.py:init():903] backend started and connected
+2026-02-09 08:50:51,397 INFO    MainThread:5887 [wandb_init.py:init():973] updated telemetry
+2026-02-09 08:50:51,476 INFO    MainThread:5887 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
+2026-02-09 08:50:51,992 INFO    MainThread:5887 [wandb_init.py:init():1042] starting run threads in backend
+2026-02-09 08:50:52,060 INFO    MainThread:5887 [wandb_run.py:_console_start():2529] atexit reg
+2026-02-09 08:50:52,061 INFO    MainThread:5887 [wandb_run.py:_redirect():2377] redirect: wrap_raw
+2026-02-09 08:50:52,061 INFO    MainThread:5887 [wandb_run.py:_redirect():2446] Wrapping output streams.
+2026-02-09 08:50:52,062 INFO    MainThread:5887 [wandb_run.py:_redirect():2469] Redirects installed.
+2026-02-09 08:50:52,064 INFO    MainThread:5887 [wandb_init.py:init():1082] run started, returning control to user process
+2026-02-09 08:50:52,065 INFO    MainThread:5887 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': ['q_proj', 'gate_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'k_proj'], 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 
'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp2/B_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 10, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.01, 'warmup_steps': 0.01, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 
'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 
'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
+2026-02-09 08:50:52,071 INFO    MainThread:5887 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8278029312 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7e370842d110>>
+2026-02-09 08:50:52,071 INFO    MainThread:5887 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8278029312 None
+2026-02-09 08:50:52,073 INFO    MainThread:5887 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 
'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d34_r300'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 64, 'lora_dropout': 0.03, 'lora_rank': 32, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': 
None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
+2026-02-10 17:01:06,724 INFO    wandb-AsyncioManager-main:5887 [service_client.py:_forward_responses():94] Reached EOF.
+2026-02-10 17:01:06,725 INFO    wandb-AsyncioManager-main:5887 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.

v127rc_exp2/B_mup/checkpoint-12200/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,85 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

v127rc_exp2/B_mup/checkpoint-12200/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<think>",
+    "</think>"
+  ],
+  "is_local": true,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

v127rc_exp2/B_mup/checkpoint-12300/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: /workspace/Qwen/Qwen3-8B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
+- llama-factory
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

v127rc_exp2/B_mup/checkpoint-12300/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.03,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

v127rc_exp2/B_mup/checkpoint-12300/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,85 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

v127rc_exp2/B_mup/checkpoint-12300/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<think>",
+    "</think>"
+  ],
+  "is_local": true,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

v127rc_exp2/B_mup/checkpoint-12400/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: /workspace/Qwen/Qwen3-8B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
+- llama-factory
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

v127rc_exp2/B_mup/checkpoint-12400/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.03,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

v127rc_exp2/B_mup/checkpoint-12400/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,85 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

v127rc_exp2/B_mup/checkpoint-12400/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<think>",
+    "</think>"
+  ],
+  "is_local": true,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

v127rc_exp2/B_mup/checkpoint-12400/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

v127rc_exp2/B_mup/checkpoint-12500/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: /workspace/Qwen/Qwen3-8B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
+- llama-factory
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

v127rc_exp2/B_mup/checkpoint-12500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.03,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

v127rc_exp2/B_mup/checkpoint-12500/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,85 @@

+{#- Chat template: renders `messages` (plus optional `tools`) as a sequence of <|im_start|>role ... <|im_end|> turns. #}
+{#- Step 1: system turn. With tools, the tool-calling instructions are appended to (or replace) the leading system message. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {#- One JSON-serialized tool signature per line between the <tools> tags. #}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {#- No tools: a leading system message is emitted as its own turn; nothing is emitted otherwise. #}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Step 2: walk the messages in reverse to record the index of the last user message that is not a wrapped <tool_response>. #}
+{#- ns.last_query_index stays at the final message index when no such user message exists. #}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {#- Convert the reversed loop position back to an index into the original list. #}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {#- Clearing the flag makes this the only match, i.e. the last real user query. #}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{#- Step 3: render every message in order. #}
+{%- for message in messages %}
+    {#- A system message in first position was already handled in step 1, so only later ones are rendered here. #}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Separate reasoning from the visible answer: prefer an explicit reasoning_content field, else parse a think block out of content. #}
+        {#- NOTE(review): the `in` test and the concatenations below appear to assume message.content is a string; a tool-call-only assistant message with null content would presumably fail here - confirm against callers. #}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {#- Answer = text after the last closing think tag; reasoning = text between the opening tag and the first closing tag. #}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Only assistant turns after the last user query may carry a think block; earlier turns are rendered with the answer text only. #}
+        {%- if loop.index0 > ns.last_query_index %}
+            {#- The final message always gets a think block (possibly empty); other turns get one only when reasoning is non-empty. #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {#- Tool calls are appended to the same assistant turn, each wrapped in tool_call tags. #}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {#- Newline separator before every call, except before the first call when there is no preceding content. #}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {#- Accept both the nested form (a `function` sub-object) and the flat form with name/arguments at top level. #}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {#- String arguments are emitted verbatim; any other value is serialized with tojson. #}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages are grouped into a single user turn: open it on the first of a run, close it on the last. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Step 4: optionally open an assistant turn for generation. #}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {#- When enable_thinking is explicitly false, an empty think block is pre-filled; if it is undefined or true nothing is added. #}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

v127rc_exp2/B_mup/checkpoint-12500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<think>",
+    "</think>"
+  ],
+  "is_local": true,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

v127rc_exp2/B_mup/checkpoint-12500/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

v127rc_exp2/B_mup/checkpoint-12600/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: /workspace/Qwen/Qwen3-8B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
+- llama-factory
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

v127rc_exp2/B_mup/checkpoint-12600/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.03,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

v127rc_exp2/B_mup/checkpoint-12600/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,85 @@

+{#- Chat template: renders `messages` (plus optional `tools`) as a sequence of <|im_start|>role ... <|im_end|> turns. #}
+{#- Step 1: system turn. With tools, the tool-calling instructions are appended to (or replace) the leading system message. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {#- One JSON-serialized tool signature per line between the <tools> tags. #}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {#- No tools: a leading system message is emitted as its own turn; nothing is emitted otherwise. #}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Step 2: walk the messages in reverse to record the index of the last user message that is not a wrapped <tool_response>. #}
+{#- ns.last_query_index stays at the final message index when no such user message exists. #}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {#- Convert the reversed loop position back to an index into the original list. #}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {#- Clearing the flag makes this the only match, i.e. the last real user query. #}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{#- Step 3: render every message in order. #}
+{%- for message in messages %}
+    {#- A system message in first position was already handled in step 1, so only later ones are rendered here. #}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Separate reasoning from the visible answer: prefer an explicit reasoning_content field, else parse a think block out of content. #}
+        {#- NOTE(review): the `in` test and the concatenations below appear to assume message.content is a string; a tool-call-only assistant message with null content would presumably fail here - confirm against callers. #}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {#- Answer = text after the last closing think tag; reasoning = text between the opening tag and the first closing tag. #}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Only assistant turns after the last user query may carry a think block; earlier turns are rendered with the answer text only. #}
+        {%- if loop.index0 > ns.last_query_index %}
+            {#- The final message always gets a think block (possibly empty); other turns get one only when reasoning is non-empty. #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {#- Tool calls are appended to the same assistant turn, each wrapped in tool_call tags. #}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {#- Newline separator before every call, except before the first call when there is no preceding content. #}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {#- Accept both the nested form (a `function` sub-object) and the flat form with name/arguments at top level. #}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {#- String arguments are emitted verbatim; any other value is serialized with tojson. #}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages are grouped into a single user turn: open it on the first of a run, close it on the last. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Step 4: optionally open an assistant turn for generation. #}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {#- When enable_thinking is explicitly false, an empty think block is pre-filled; if it is undefined or true nothing is added. #}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

v127rc_exp2/B_mup/checkpoint-12600/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<think>",
+    "</think>"
+  ],
+  "is_local": true,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

v127rc_exp2/B_mup/checkpoint-12600/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

v127rc_exp2/B_mup/checkpoint-12700/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: /workspace/Qwen/Qwen3-8B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
+- llama-factory
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

v127rc_exp2/B_mup/checkpoint-12700/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.03,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

v127rc_exp2/B_mup/checkpoint-12700/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,85 @@

+{#- Chat template: renders `messages` (plus optional `tools`) as a sequence of <|im_start|>role ... <|im_end|> turns. #}
+{#- Step 1: system turn. With tools, the tool-calling instructions are appended to (or replace) the leading system message. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {#- One JSON-serialized tool signature per line between the <tools> tags. #}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {#- No tools: a leading system message is emitted as its own turn; nothing is emitted otherwise. #}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Step 2: walk the messages in reverse to record the index of the last user message that is not a wrapped <tool_response>. #}
+{#- ns.last_query_index stays at the final message index when no such user message exists. #}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {#- Convert the reversed loop position back to an index into the original list. #}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {#- Clearing the flag makes this the only match, i.e. the last real user query. #}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{#- Step 3: render every message in order. #}
+{%- for message in messages %}
+    {#- A system message in first position was already handled in step 1, so only later ones are rendered here. #}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Separate reasoning from the visible answer: prefer an explicit reasoning_content field, else parse a think block out of content. #}
+        {#- NOTE(review): the `in` test and the concatenations below appear to assume message.content is a string; a tool-call-only assistant message with null content would presumably fail here - confirm against callers. #}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {#- Answer = text after the last closing think tag; reasoning = text between the opening tag and the first closing tag. #}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Only assistant turns after the last user query may carry a think block; earlier turns are rendered with the answer text only. #}
+        {%- if loop.index0 > ns.last_query_index %}
+            {#- The final message always gets a think block (possibly empty); other turns get one only when reasoning is non-empty. #}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {#- Tool calls are appended to the same assistant turn, each wrapped in tool_call tags. #}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {#- Newline separator before every call, except before the first call when there is no preceding content. #}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {#- Accept both the nested form (a `function` sub-object) and the flat form with name/arguments at top level. #}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {#- String arguments are emitted verbatim; any other value is serialized with tojson. #}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages are grouped into a single user turn: open it on the first of a run, close it on the last. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Step 4: optionally open an assistant turn for generation. #}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {#- When enable_thinking is explicitly false, an empty think block is pre-filled; if it is undefined or true nothing is added. #}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

v127rc_exp2/B_mup/checkpoint-12700/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<think>",
+    "</think>"
+  ],
+  "is_local": true,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

v127rc_exp2/B_mup/checkpoint-12700/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

v127rc_exp2/B_mup/checkpoint-12800/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: /workspace/Qwen/Qwen3-8B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
+- llama-factory
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

v127rc_exp2/B_mup/checkpoint-12800/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,98 @@

+{#- ChatML-style prompt template: renders `messages` (plus optional `tools`) into <|im_start|> ... <|im_end|> turns. #}
+{#- System header: when tools are supplied, the tool-calling instructions follow the optional leading system message. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Walk the conversation backwards to find the last user message that is not a wrapped tool response; its index is kept in ns.last_query_index. #}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Assistant turns that only carry tool_calls may have null or missing content; use '' so the membership test and concatenations below cannot fail. #}
+        {%- if message.content is defined and message.content is not none %}
+            {%- set content = message.content %}
+        {%- else %}
+            {%- set content = '' %}
+        {%- endif %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {#- No explicit reasoning field: split inline think-tagged text out of the content. #}
+            {%- if '</think>' in content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Turns after the last real user query get a think block when they are last or carry reasoning; all other turns are rendered with content only. #}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {#- Accept tool calls wrapped in a function field as well as flat name/arguments objects. #}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages are merged into a single user turn made of tool_response blocks. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Generation prompt: open an assistant turn; when enable_thinking is false, pre-fill an empty think block. #}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

v127rc_exp2/B_mup/checkpoint-12800/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

v127rc_exp2/B_mup/checkpoint-12900/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.03,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

v127rc_exp2/B_mup/checkpoint-13100/README.md ADDED Viewed

	@@ -0,0 +1,208 @@

+---
+base_model: /workspace/Qwen/Qwen3-8B-Base
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/workspace/Qwen/Qwen3-8B-Base
+- llama-factory
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal text generation
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** Qwen3-8B-Base (loaded from the local path `/workspace/Qwen/Qwen3-8B-Base`)
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

v127rc_exp2/B_mup/checkpoint-13100/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/Qwen/Qwen3-8B-Base",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.03,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj",
+    "down_proj",
+    "up_proj",
+    "gate_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}