diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..c4c2d3aedf13d8eec0e1fd4566b10970ebfb66fe 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +swanlog/run-20250628_232758-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text +swanlog/run-20250628_234855-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text +swanlog/run-20250629_082850-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text +swanlog/run-20250629_084639-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text +swanlog/run-20250629_090551-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text +swanlog/run-20250629_092305-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text +swanlog/run-20250629_094305-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text +swanlog/run-20250629_101310-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text +swanlog/run-20250629_111950-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text +swanlog/run-20250629_120036-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text +swanlog/run-20250629_184555-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text +swanlog/run-20250629_190303-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text diff --git a/swanlog/.gitignore b/swanlog/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f59ec20aabf5842d237244ece8c81ab184faeac1 --- /dev/null +++ b/swanlog/.gitignore @@ -0,0 +1 @@ +* \ No newline at end of file diff --git a/swanlog/run-20250628_232758-a3b1799d/backup.swanlab b/swanlog/run-20250628_232758-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..6de3bbb3379afea509607c5ad70ecc57ed55e42b --- /dev/null +++ b/swanlog/run-20250628_232758-a3b1799d/backup.swanlab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48a8adba074b9f59af2ffb69a9df81e9ae8ea09cf3f91acf0aaf0e8ccdd5215a +size 871438 diff --git a/swanlog/run-20250628_232758-a3b1799d/files/config.yaml b/swanlog/run-20250628_232758-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3310367a4608cd8eeae0b60f14ff3cde82c29c3 --- /dev/null +++ b/swanlog/run-20250628_232758-a3b1799d/files/config.yaml @@ -0,0 +1,990 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 74 + value: true +_name_or_path: + desc: '' + sort: 73 + value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct +acc_steps: + desc: '' + sort: 223 + value: 1 +acc_strategy: + desc: '' + sort: 213 + value: token +accelerator_config: + desc: '' + sort: 156 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 161 + value: false +adam_beta1: + desc: '' + sort: 94 + value: 0.9 +adam_beta2: + desc: '' + sort: 95 + value: 0.95 +adam_epsilon: + desc: '' + sort: 96 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 33 + value: false +aligner_lr: + desc: '' + sort: 216 + value: null +architectures: + desc: '' + sort: 60 + value: + - LlamaForCausalLM +attention_bias: + desc: '' + sort: 16 + value: false +attention_dropout: + desc: '' + sort: 17 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 189 + value: false +average_tokens_across_devices: + desc: '' + sort: 205 + value: false +bad_words_ids: + desc: '' + sort: 50 + value: null +batch_eval_metrics: + desc: '' + sort: 201 + value: false +begin_suppress_tokens: + desc: '' + sort: 59 + value: null +bf16: + desc: '' + sort: 126 + value: true +bf16_full_eval: + desc: '' + sort: 130 + value: false +bos_token_id: + desc: '' + sort: 66 + value: 128000 +channels: + desc: '' + sort: 220 + value: null +check_model: + desc: '' + sort: 212 + value: true +chunk_size_feed_forward: + desc: '' + sort: 29 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 32 + value: null +data_seed: + desc: '' + sort: 123 + value: 42 +dataloader_drop_last: + desc: '' + sort: 138 + value: false +dataloader_num_workers: + desc: '' + sort: 140 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 169 + value: false +dataloader_pin_memory: + desc: '' + sort: 168 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 141 + value: 10 +ddp_backend: + desc: '' + sort: 134 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 167 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 166 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 165 + value: null +ddp_timeout: + desc: '' + sort: 193 + value: 18000000 +debug: + desc: '' + sort: 137 + value: [] +decoder_start_token_id: + desc: '' + sort: 70 + value: null +deepspeed: + desc: '' + sort: 157 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 144 + value: false +diversity_penalty: + desc: '' + sort: 41 + value: 0.0 +do_eval: + desc: '' + sort: 80 + value: true +do_predict: + desc: '' + sort: 81 + value: false +do_sample: + desc: '' + sort: 37 + value: false +do_train: + desc: '' + sort: 79 + value: false +early_stopping: + desc: '' + sort: 38 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 49 + value: 0 +eos_token_id: + desc: '' + sort: 68 + value: + - 128001 + - 128008 + - 128009 +eval_accumulation_steps: + desc: '' + sort: 89 + value: null +eval_datasets: + desc: '' + sort: 225 + value: [] +eval_datasets_args: + desc: '' + sort: 227 + value: null +eval_delay: + desc: '' + sort: 90 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 183 + value: true +eval_generation_config: + desc: '' + sort: 228 + value: null +eval_limit: + desc: '' + sort: 226 + value: null +eval_on_start: + desc: '' + sort: 202 + value: false +eval_steps: + desc: '' + sort: 139 + value: null +eval_strategy: + desc: '' + sort: 82 + value: epoch +eval_use_evalscope: + desc: '' + sort: 224 + value: false +eval_use_gather_object: + desc: '' + sort: 204 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 57 + value: null +finetuning_task: + desc: '' + sort: 61 + value: null +forced_bos_token_id: + desc: '' + sort: 54 + value: null +forced_eos_token_id: + desc: '' + sort: 55 + value: null +fp16: + desc: '' + sort: 127 + value: false +fp16_backend: + desc: '' + sort: 184 + value: auto +fp16_full_eval: + desc: '' + sort: 131 + value: false +fp16_opt_level: + desc: '' + sort: 128 + value: O1 +fsdp: + desc: '' + sort: 151 + value: [] +fsdp_config: + desc: '' + sort: 153 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 152 + value: 0 +fsdp_num: + desc: '' + sort: 222 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 155 + value: null +full_determinism: + desc: '' + sort: 190 + value: false +galore_config: + desc: '' + sort: 231 + value: null +generation_config: + desc: '' + sort: 210 + value: null +generation_max_length: + desc: '' + sort: 208 + value: null +generation_num_beams: + desc: '' + sort: 209 + value: null +gradient_accumulation_steps: + desc: '' + sort: 88 + value: 2 +gradient_checkpointing: + desc: '' + sort: 179 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 180 + value: null +greater_is_better: + desc: '' + sort: 149 + value: false +group_by_length: + desc: '' + sort: 162 + value: false +half_precision_backend: + desc: '' + sort: 129 + value: auto +head_dim: + desc: '' + sort: 19 + value: 128 +hidden_act: + desc: '' + sort: 9 + value: silu +hidden_size: + desc: '' + sort: 4 + value: 4096 +hub_always_push: + desc: '' + sort: 178 + value: false +hub_model_id: + desc: '' + sort: 174 + value: null +hub_private_repo: + desc: '' + sort: 177 + value: null +hub_strategy: + desc: '' + sort: 175 + value: every_save +hub_token: + desc: '' + sort: 176 + value: +id2label: + desc: '' + sort: 62 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 150 + value: false +include_for_metrics: + desc: '' + sort: 182 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 181 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 198 + value: false +include_tokens_per_second: + desc: '' + sort: 197 + value: false +initializer_range: + desc: '' + sort: 10 + value: 0.02 +intermediate_size: + desc: '' + sort: 5 + value: 14336 +is_decoder: + desc: '' + sort: 31 + value: false +is_encoder_decoder: + desc: '' + sort: 30 + value: false +jit_mode_eval: + desc: '' + sort: 124 + value: false +label2id: + desc: '' + sort: 63 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 146 + value: null +label_smoothing_factor: + desc: '' + sort: 158 + value: 0.0 +learning_rate: + desc: '' + sort: 92 + value: 1.0e-05 +length_column_name: + desc: '' + sort: 163 + value: length +length_penalty: + desc: '' + sort: 47 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 147 + value: false +local_rank: + desc: '' + sort: 133 + value: 0 +local_repo_path: + desc: '' + sort: 230 + value: null +log_level: + desc: '' + sort: 104 + value: passive +log_level_replica: + desc: '' + sort: 105 + value: warning +log_on_each_node: + desc: '' + sort: 106 + value: true +logging_dir: + desc: '' + sort: 107 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1/v1-20250628-232707/runs +logging_first_step: + desc: '' + sort: 109 + value: true +logging_nan_inf_filter: + desc: '' + sort: 111 + value: true +logging_steps: + desc: '' + sort: 110 + value: 1 +logging_strategy: + desc: '' + sort: 108 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 101 + value: null +lr_scheduler_type: + desc: '' + sort: 100 + value: cosine +max_epochs: + desc: '' + sort: 215 + value: null +max_grad_norm: + desc: '' + sort: 97 + value: 1.0 +max_length: + desc: '' + sort: 35 + value: 20 +max_position_embeddings: + desc: '' + sort: 3 + value: 131072 +max_steps: + desc: '' + sort: 99 + value: -1 +metric_for_best_model: + desc: '' + sort: 148 + value: loss +metric_warmup_step: + desc: '' + sort: 221 + value: 0 +min_length: + desc: '' + sort: 36 + value: 0 +mlp_bias: + desc: '' + sort: 18 + value: false +model_num_parameters: + desc: '' + sort: 232 + value: 0 +model_type: + desc: '' + sort: 76 + value: llama +mp_parameters: + desc: '' + sort: 188 + value: '' +neftune_noise_alpha: + desc: '' + sort: 199 + value: null +no_cuda: + desc: '' + sort: 119 + value: false +no_repeat_ngram_size: + desc: '' + sort: 48 + value: 0 +num_attention_heads: + desc: '' + sort: 7 + value: 32 +num_beam_groups: + desc: '' + sort: 40 + value: 1 +num_beams: + desc: '' + sort: 39 + value: 1 +num_hidden_layers: + desc: '' + sort: 6 + value: 32 +num_key_value_heads: + desc: '' + sort: 8 + value: 8 +num_return_sequences: + desc: '' + sort: 51 + value: 1 +num_train_epochs: + desc: '' + sort: 98 + value: 5.0 +optim: + desc: '' + sort: 159 + value: adamw_torch +optim_args: + desc: '' + sort: 160 + value: null +optim_target_modules: + desc: '' + sort: 200 + value: null +optimizer: + desc: '' + sort: 218 + value: null +output_attentions: + desc: '' + sort: 22 + value: false +output_dir: + desc: '' + sort: 77 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1/v1-20250628-232707 +output_hidden_states: + desc: '' + sort: 21 + value: false +output_scores: + desc: '' + sort: 52 + value: false +overwrite_output_dir: + desc: '' + sort: 78 + value: false +pad_token_id: + desc: '' + sort: 67 + value: 128009 +past_index: + desc: '' + sort: 142 + value: -1 +per_device_eval_batch_size: + desc: '' + sort: 85 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 84 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 87 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 86 + value: null +predict_with_generate: + desc: '' + sort: 207 + value: false +prediction_loss_only: + desc: '' + sort: 83 + value: false +prefix: + desc: '' + sort: 65 + value: null +pretraining_tp: + desc: '' + sort: 12 + value: 1 +problem_type: + desc: '' + sort: 72 + value: null +pruned_heads: + desc: '' + sort: 27 + value: {} +push_to_hub: + desc: '' + sort: 172 + value: false +push_to_hub_model_id: + desc: '' + sort: 185 + value: null +push_to_hub_organization: + desc: '' + sort: 186 + value: null +push_to_hub_token: + desc: '' + sort: 187 + value: +ray_scope: + desc: '' + sort: 192 + value: last +remove_invalid_values: + desc: '' + sort: 56 + value: false +remove_unused_columns: + desc: '' + sort: 145 + value: false +repetition_penalty: + desc: '' + sort: 46 + value: 1.0 +report_to: + desc: '' + sort: 164 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 118 + value: false +resume_from_checkpoint: + desc: '' + sort: 173 + value: null +return_dict: + desc: '' + sort: 20 + value: true +return_dict_in_generate: + desc: '' + sort: 53 + value: false +rms_norm_eps: + desc: '' + sort: 11 + value: 1.0e-05 +rope_scaling: + desc: '' + sort: 15 + value: + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + desc: '' + sort: 14 + value: 500000.0 +run_name: + desc: '' + sort: 143 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1/v1-20250628-232707 +save_on_each_node: + desc: '' + sort: 116 + value: false +save_only_model: + desc: '' + sort: 117 + value: false +save_safetensors: + desc: '' + sort: 115 + value: true +save_steps: + desc: '' + sort: 113 + value: 500 +save_strategy: + desc: '' + sort: 112 + value: steps +save_total_limit: + desc: '' + sort: 114 + value: 1 +seed: + desc: '' + sort: 122 + value: 42 +sep_token_id: + desc: '' + sort: 69 + value: null +skip_memory_metrics: + desc: '' + sort: 170 + value: true +sortish_sampler: + desc: '' + sort: 206 + value: false +suppress_tokens: + desc: '' + sort: 58 + value: null +task_specific_params: + desc: '' + sort: 71 + value: null +temperature: + desc: '' + sort: 42 + value: 1.0 +tf32: + desc: '' + sort: 132 + value: null +tf_legacy_loss: + desc: '' + sort: 26 + value: false +tie_encoder_decoder: + desc: '' + sort: 34 + value: false +tie_word_embeddings: + desc: '' + sort: 28 + value: false +tokenizer_class: + desc: '' + sort: 64 + value: null +top_k: + desc: '' + sort: 43 + value: 50 +top_p: + desc: '' + sort: 44 + value: 1.0 +torch_compile: + desc: '' + sort: 194 + value: false +torch_compile_backend: + desc: '' + sort: 195 + value: null +torch_compile_mode: + desc: '' + sort: 196 + value: null +torch_dtype: + desc: '' + sort: 24 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 91 + value: null +torchdynamo: + desc: '' + sort: 191 + value: null +torchscript: + desc: '' + sort: 23 + value: false +tp_size: + desc: '' + sort: 154 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 136 + value: false +tpu_num_cores: + desc: '' + sort: 135 + value: null +train_dataloader_shuffle: + desc: '' + sort: 214 + value: true +train_type: + desc: '' + sort: 229 + value: full +transformers_version: + desc: '' + sort: 75 + value: 4.51.3 +typical_p: + desc: '' + sort: 45 + value: 1.0 +use_bfloat16: + desc: '' + sort: 25 + value: false +use_cache: + desc: '' + sort: 13 + value: false +use_cpu: + desc: '' + sort: 120 + value: false +use_ipex: + desc: '' + sort: 125 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 171 + value: false +use_liger_kernel: + desc: '' + sort: 203 + value: false +use_logits_to_keep: + desc: '' + sort: 219 + value: null +use_mps_device: + desc: '' + sort: 121 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 211 + value: true +vit_lr: + desc: '' + sort: 217 + value: null +vocab_size: + desc: '' + sort: 2 + value: 128256 +warmup_ratio: + desc: '' + sort: 102 + value: 0.05 +warmup_steps: + desc: '' + sort: 103 + value: 0 +weight_decay: + desc: '' + sort: 93 + value: 0.0001 diff --git a/swanlog/run-20250628_232758-a3b1799d/files/requirements.txt b/swanlog/run-20250628_232758-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250628_232758-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250628_232758-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250628_232758-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..9b1dd0046b4dcabb86557e12a22617fe84ff96d7 --- /dev/null +++ b/swanlog/run-20250628_232758-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 2878259, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset /mnt/data/users/liamding/data/sft_zh_tox/data/train_data/r1_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-5 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 32768 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_r1 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250628_232758-a3b1799d"}} \ No newline at end of file diff --git a/swanlog/run-20250628_234855-a3b1799d/backup.swanlab b/swanlog/run-20250628_234855-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..25927f7d6f587edc7b4d2ba065d38d3db860a2e8 --- /dev/null +++ b/swanlog/run-20250628_234855-a3b1799d/backup.swanlab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45fe04f62801602079a5ec980d332d5832b4cc04800cfde12aad30ccee2eb4c3 +size 871421 diff --git a/swanlog/run-20250628_234855-a3b1799d/files/config.yaml b/swanlog/run-20250628_234855-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fcc5bcb3dd9a30a2d9cd49ea7eb7b48090a9bdc6 --- /dev/null +++ b/swanlog/run-20250628_234855-a3b1799d/files/config.yaml @@ -0,0 +1,990 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 74 + value: true +_name_or_path: + desc: '' + sort: 73 + value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct +acc_steps: + desc: '' + sort: 223 + value: 1 +acc_strategy: + desc: '' + sort: 213 + value: token +accelerator_config: + desc: '' + sort: 156 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 161 + value: false +adam_beta1: + desc: '' + sort: 94 + value: 0.9 +adam_beta2: + desc: '' + sort: 95 + value: 0.95 +adam_epsilon: + desc: '' + sort: 96 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 33 + value: false +aligner_lr: + desc: '' + sort: 216 + value: null +architectures: + desc: '' + sort: 60 + value: + - LlamaForCausalLM +attention_bias: + desc: '' + sort: 16 + value: false +attention_dropout: + desc: '' + sort: 17 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 189 + value: false +average_tokens_across_devices: + desc: '' + sort: 205 + value: false +bad_words_ids: + desc: '' + sort: 50 + value: null +batch_eval_metrics: + desc: '' + sort: 201 + value: false +begin_suppress_tokens: + desc: '' + sort: 59 + value: null +bf16: + desc: '' + sort: 126 + value: true +bf16_full_eval: + desc: '' + sort: 130 + value: false +bos_token_id: + desc: '' + sort: 66 + value: 128000 +channels: + desc: '' + sort: 220 + value: null +check_model: + desc: '' + sort: 212 + value: true +chunk_size_feed_forward: + desc: '' + sort: 29 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 32 + value: null +data_seed: + desc: '' + sort: 123 + value: 42 +dataloader_drop_last: + desc: '' + sort: 138 + value: false +dataloader_num_workers: + desc: '' + sort: 140 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 169 + value: false +dataloader_pin_memory: + desc: '' + sort: 168 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 141 + value: 10 +ddp_backend: + desc: '' + sort: 134 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 167 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 166 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 165 + value: null +ddp_timeout: + desc: '' + sort: 193 + value: 18000000 +debug: + desc: '' + sort: 137 + value: [] +decoder_start_token_id: + desc: '' + sort: 70 + value: null +deepspeed: + desc: '' + sort: 157 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 144 + value: false +diversity_penalty: + desc: '' + sort: 41 + value: 0.0 +do_eval: + desc: '' + sort: 80 + value: true +do_predict: + desc: '' + sort: 81 + value: false +do_sample: + desc: '' + sort: 37 + value: false +do_train: + desc: '' + sort: 79 + value: false +early_stopping: + desc: '' + sort: 38 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 49 + value: 0 +eos_token_id: + desc: '' + sort: 68 + value: + - 128001 + - 128008 + - 128009 +eval_accumulation_steps: + desc: '' + sort: 89 + value: null +eval_datasets: + desc: '' + sort: 225 + value: [] +eval_datasets_args: + desc: '' + sort: 227 + value: null +eval_delay: + desc: '' + sort: 90 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 183 + value: true +eval_generation_config: + desc: '' + sort: 228 + value: null +eval_limit: + desc: '' + sort: 226 + value: null +eval_on_start: + desc: '' + sort: 202 + value: false +eval_steps: + desc: '' + sort: 139 + value: null +eval_strategy: + desc: '' + sort: 82 + value: epoch +eval_use_evalscope: + desc: '' + sort: 224 + value: false +eval_use_gather_object: + desc: '' + sort: 204 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 57 + value: null +finetuning_task: + desc: '' + sort: 61 + value: null +forced_bos_token_id: + desc: '' + sort: 54 + value: null +forced_eos_token_id: + desc: '' + sort: 55 + value: null +fp16: + desc: '' + sort: 127 + value: false +fp16_backend: + desc: '' + sort: 184 + value: auto +fp16_full_eval: + desc: '' + sort: 131 + value: false +fp16_opt_level: + desc: '' + sort: 128 + value: O1 +fsdp: + desc: '' + sort: 151 + value: [] +fsdp_config: + desc: '' + sort: 153 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 152 + value: 0 +fsdp_num: + desc: '' + sort: 222 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 155 + value: null +full_determinism: + desc: '' + sort: 190 + value: false +galore_config: + desc: '' + sort: 231 + value: null +generation_config: + desc: '' + sort: 210 + value: null +generation_max_length: + desc: '' + sort: 208 + value: null +generation_num_beams: + desc: '' + sort: 209 + value: null +gradient_accumulation_steps: + desc: '' + sort: 88 + value: 2 +gradient_checkpointing: + desc: '' + sort: 179 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 180 + value: null +greater_is_better: + desc: '' + sort: 149 + value: false +group_by_length: + desc: '' + sort: 162 + value: false +half_precision_backend: + desc: '' + sort: 129 + value: auto +head_dim: + desc: '' + sort: 19 + value: 128 +hidden_act: + desc: '' + sort: 9 + value: silu +hidden_size: + desc: '' + sort: 4 + value: 4096 +hub_always_push: + desc: '' + sort: 178 + value: false +hub_model_id: + desc: '' + sort: 174 + value: null +hub_private_repo: + desc: '' + sort: 177 + value: null +hub_strategy: + desc: '' + sort: 175 + value: every_save +hub_token: + desc: '' + sort: 176 + value: +id2label: + desc: '' + sort: 62 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 150 + value: false +include_for_metrics: + desc: '' + sort: 182 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 181 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 198 + value: false +include_tokens_per_second: + desc: '' + sort: 197 + value: false +initializer_range: + desc: '' + sort: 10 + value: 0.02 +intermediate_size: + desc: '' + sort: 5 + value: 14336 +is_decoder: + desc: '' + sort: 31 + value: false +is_encoder_decoder: + desc: '' + sort: 30 + value: false +jit_mode_eval: + desc: '' + sort: 124 + value: false +label2id: + desc: '' + sort: 63 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 146 + value: null +label_smoothing_factor: + desc: '' + sort: 158 + value: 0.0 +learning_rate: + desc: '' + sort: 92 + value: 1.0e-05 +length_column_name: + desc: '' + sort: 163 + value: length +length_penalty: + desc: '' + sort: 47 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 147 + value: false +local_rank: + desc: '' + sort: 133 + value: 0 +local_repo_path: + desc: '' + sort: 230 + value: null +log_level: + desc: '' + sort: 104 + value: passive +log_level_replica: + desc: '' + sort: 105 + value: warning +log_on_each_node: + desc: '' + sort: 106 + value: true +logging_dir: + desc: '' + sort: 107 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v0-20250628-234806/runs +logging_first_step: + desc: '' + sort: 109 + value: true +logging_nan_inf_filter: + desc: '' + sort: 111 + value: true +logging_steps: + desc: '' + sort: 110 + value: 1 +logging_strategy: + desc: '' + sort: 108 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 101 + value: null +lr_scheduler_type: + desc: '' + sort: 100 + value: cosine +max_epochs: + desc: '' + sort: 215 + value: null +max_grad_norm: + desc: '' + sort: 97 + value: 1.0 +max_length: + desc: '' + sort: 35 + value: 20 +max_position_embeddings: + desc: '' + sort: 3 + value: 131072 +max_steps: + desc: '' + sort: 99 + value: -1 +metric_for_best_model: + desc: '' + sort: 148 + value: loss +metric_warmup_step: + desc: '' + sort: 221 + value: 0 +min_length: + desc: '' + sort: 36 + value: 0 +mlp_bias: + desc: '' + sort: 18 + value: false +model_num_parameters: + desc: '' + sort: 232 + value: 0 +model_type: + desc: '' + sort: 76 + value: llama +mp_parameters: + desc: '' + sort: 188 + value: '' +neftune_noise_alpha: + desc: '' + sort: 199 + value: null +no_cuda: + desc: '' + sort: 119 + value: false +no_repeat_ngram_size: + desc: '' + sort: 48 + value: 0 +num_attention_heads: + desc: '' + sort: 7 + value: 32 +num_beam_groups: + desc: '' + sort: 40 + value: 1 +num_beams: + desc: '' + sort: 39 + value: 1 +num_hidden_layers: + desc: '' + sort: 6 + value: 32 +num_key_value_heads: + desc: '' + sort: 8 + value: 8 +num_return_sequences: + desc: '' + sort: 51 + value: 1 +num_train_epochs: + desc: '' + sort: 98 + value: 5.0 +optim: + desc: '' + sort: 159 + value: adamw_torch +optim_args: + desc: '' + sort: 160 + value: null +optim_target_modules: + desc: '' + sort: 200 + value: null +optimizer: + desc: '' + sort: 218 + value: null +output_attentions: + desc: '' + sort: 22 + value: false +output_dir: + desc: '' + sort: 77 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v0-20250628-234806 +output_hidden_states: + desc: '' + sort: 21 + value: false +output_scores: + desc: '' + sort: 52 + value: false +overwrite_output_dir: + desc: '' + sort: 78 + value: false +pad_token_id: + desc: '' + sort: 67 + value: 128009 +past_index: + desc: '' + sort: 142 + value: -1 +per_device_eval_batch_size: + desc: '' + sort: 85 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 84 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 87 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 86 + value: null +predict_with_generate: + desc: '' + sort: 207 + value: false +prediction_loss_only: + desc: '' + sort: 83 + value: false +prefix: + desc: '' + sort: 65 + value: null +pretraining_tp: + desc: '' + sort: 12 + value: 1 +problem_type: + desc: '' + sort: 72 + value: null +pruned_heads: + desc: '' + sort: 27 + value: {} +push_to_hub: + desc: '' + sort: 172 + value: false +push_to_hub_model_id: + desc: '' + sort: 185 + value: null +push_to_hub_organization: + desc: '' + sort: 186 + value: null +push_to_hub_token: + desc: '' + sort: 187 + value: +ray_scope: + desc: '' + sort: 192 + value: last +remove_invalid_values: + desc: '' + sort: 56 + value: false +remove_unused_columns: + desc: '' + sort: 145 + value: false +repetition_penalty: + desc: '' + sort: 46 + value: 1.0 +report_to: + desc: '' + sort: 164 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 118 + value: false +resume_from_checkpoint: + desc: '' + sort: 173 + value: null +return_dict: + desc: '' + sort: 20 + value: true +return_dict_in_generate: + desc: '' + sort: 53 + value: false +rms_norm_eps: + desc: '' + sort: 11 + value: 1.0e-05 +rope_scaling: + desc: '' + sort: 15 + value: + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + desc: '' + sort: 14 + value: 500000.0 +run_name: + desc: '' + sort: 143 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v0-20250628-234806 +save_on_each_node: + desc: '' + sort: 116 + value: false +save_only_model: + desc: '' + sort: 117 + value: false +save_safetensors: + desc: '' + sort: 115 + value: true +save_steps: + desc: '' + sort: 113 + value: 500 +save_strategy: + desc: '' + sort: 112 + value: steps +save_total_limit: + desc: '' + sort: 114 + value: 1 +seed: + desc: '' + sort: 122 + value: 42 +sep_token_id: + desc: '' + sort: 69 + value: null +skip_memory_metrics: + desc: '' + sort: 170 + value: true +sortish_sampler: + desc: '' + sort: 206 + value: false +suppress_tokens: + desc: '' + sort: 58 + value: null +task_specific_params: + desc: '' + sort: 71 + value: null +temperature: + desc: '' + sort: 42 + value: 1.0 +tf32: + desc: '' + sort: 132 + value: null +tf_legacy_loss: + desc: '' + sort: 26 + value: false +tie_encoder_decoder: + desc: '' + sort: 34 + value: false +tie_word_embeddings: + desc: '' + sort: 28 + value: false +tokenizer_class: + desc: '' + sort: 64 + value: null +top_k: + desc: '' + sort: 43 + value: 50 +top_p: + desc: '' + sort: 44 + value: 1.0 +torch_compile: + desc: '' + sort: 194 + value: false +torch_compile_backend: + desc: '' + sort: 195 + value: null +torch_compile_mode: + desc: '' + sort: 196 + value: null +torch_dtype: + desc: '' + sort: 24 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 91 + value: null +torchdynamo: + desc: '' + sort: 191 + value: null +torchscript: + desc: '' + sort: 23 + value: false +tp_size: + desc: '' + sort: 154 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 136 + value: false +tpu_num_cores: + desc: '' + sort: 135 + value: null +train_dataloader_shuffle: + desc: '' + sort: 214 + value: true +train_type: + desc: '' + sort: 229 + value: full +transformers_version: + desc: '' + sort: 75 + value: 4.51.3 +typical_p: + desc: '' + sort: 45 + value: 1.0 +use_bfloat16: + desc: '' + sort: 25 + value: false +use_cache: + desc: '' + sort: 13 + value: false +use_cpu: + desc: '' + sort: 120 + value: false +use_ipex: + desc: '' + sort: 125 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 171 + value: false +use_liger_kernel: + desc: '' + sort: 203 + value: false +use_logits_to_keep: + desc: '' + sort: 219 + value: null +use_mps_device: + desc: '' + sort: 121 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 211 + value: true +vit_lr: + desc: '' + sort: 217 + value: null +vocab_size: + desc: '' + sort: 2 + value: 128256 +warmup_ratio: + desc: '' + sort: 102 + value: 0.05 +warmup_steps: + desc: '' + sort: 103 + value: 0 +weight_decay: + desc: '' + sort: 93 + value: 2.0e-05 diff --git a/swanlog/run-20250628_234855-a3b1799d/files/requirements.txt b/swanlog/run-20250628_234855-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250628_234855-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250628_234855-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250628_234855-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..304366e9ea8e7c566464d0521c783c149bea51b0 --- /dev/null +++ b/swanlog/run-20250628_234855-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 3395167, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset /mnt/data/users/liamding/data/sft_zh_tox/data/train_data/r1_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-5 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 32768 --weight_decay 2e-5 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_r1_v2 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250628_234855-a3b1799d"}} \ No newline at end of file diff --git a/swanlog/run-20250629_082850-a3b1799d/backup.swanlab b/swanlog/run-20250629_082850-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..008ae6879ea195122a12f7742716a99dd09a18fd --- /dev/null +++ b/swanlog/run-20250629_082850-a3b1799d/backup.swanlab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca085f4bca3d7e7ee8b97c5bb9167d07e9aa67ebf6bf339ca4700e8022bd4939 +size 804536 diff --git a/swanlog/run-20250629_082850-a3b1799d/files/config.yaml b/swanlog/run-20250629_082850-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1fdb0dbc9ab132e387e78dc28844145f268394d --- /dev/null +++ b/swanlog/run-20250629_082850-a3b1799d/files/config.yaml @@ -0,0 +1,990 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 74 + value: true +_name_or_path: + desc: '' + sort: 73 + value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct +acc_steps: + desc: '' + sort: 223 + value: 1 +acc_strategy: + desc: '' + sort: 213 + value: token +accelerator_config: + desc: '' + sort: 156 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 161 + value: false +adam_beta1: + desc: '' + sort: 94 + value: 0.9 +adam_beta2: + desc: '' + sort: 95 + value: 0.95 +adam_epsilon: + desc: '' + sort: 96 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 33 + value: false +aligner_lr: + desc: '' + sort: 216 + value: null +architectures: + desc: '' + sort: 60 + value: + - LlamaForCausalLM +attention_bias: + desc: '' + sort: 16 + value: false +attention_dropout: + desc: '' + sort: 17 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 189 + value: false +average_tokens_across_devices: + desc: '' + sort: 205 + value: false +bad_words_ids: + desc: '' + sort: 50 + value: null +batch_eval_metrics: + desc: '' + sort: 201 + value: false +begin_suppress_tokens: + desc: '' + sort: 59 + value: null +bf16: + desc: '' + sort: 126 + value: true +bf16_full_eval: + desc: '' + sort: 130 + value: false +bos_token_id: + desc: '' + sort: 66 + value: 128000 +channels: + desc: '' + sort: 220 + value: null +check_model: + desc: '' + sort: 212 + value: true +chunk_size_feed_forward: + desc: '' + sort: 29 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 32 + value: null +data_seed: + desc: '' + sort: 123 + value: 42 +dataloader_drop_last: + desc: '' + sort: 138 + value: false +dataloader_num_workers: + desc: '' + sort: 140 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 169 + value: false +dataloader_pin_memory: + desc: '' + sort: 168 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 141 + value: 10 +ddp_backend: + desc: '' + sort: 134 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 167 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 166 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 165 + value: null +ddp_timeout: + desc: '' + sort: 193 + value: 18000000 +debug: + desc: '' + sort: 137 + value: [] +decoder_start_token_id: + desc: '' + sort: 70 + value: null +deepspeed: + desc: '' + sort: 157 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 144 + value: false +diversity_penalty: + desc: '' + sort: 41 + value: 0.0 +do_eval: + desc: '' + sort: 80 + value: true +do_predict: + desc: '' + sort: 81 + value: false +do_sample: + desc: '' + sort: 37 + value: false +do_train: + desc: '' + sort: 79 + value: false +early_stopping: + desc: '' + sort: 38 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 49 + value: 0 +eos_token_id: + desc: '' + sort: 68 + value: + - 128001 + - 128008 + - 128009 +eval_accumulation_steps: + desc: '' + sort: 89 + value: null +eval_datasets: + desc: '' + sort: 225 + value: [] +eval_datasets_args: + desc: '' + sort: 227 + value: null +eval_delay: + desc: '' + sort: 90 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 183 + value: true +eval_generation_config: + desc: '' + sort: 228 + value: null +eval_limit: + desc: '' + sort: 226 + value: null +eval_on_start: + desc: '' + sort: 202 + value: false +eval_steps: + desc: '' + sort: 139 + value: null +eval_strategy: + desc: '' + sort: 82 + value: epoch +eval_use_evalscope: + desc: '' + sort: 224 + value: false +eval_use_gather_object: + desc: '' + sort: 204 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 57 + value: null +finetuning_task: + desc: '' + sort: 61 + value: null +forced_bos_token_id: + desc: '' + sort: 54 + value: null +forced_eos_token_id: + desc: '' + sort: 55 + value: null +fp16: + desc: '' + sort: 127 + value: false +fp16_backend: + desc: '' + sort: 184 + value: auto +fp16_full_eval: + desc: '' + sort: 131 + value: false +fp16_opt_level: + desc: '' + sort: 128 + value: O1 +fsdp: + desc: '' + sort: 151 + value: [] +fsdp_config: + desc: '' + sort: 153 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 152 + value: 0 +fsdp_num: + desc: '' + sort: 222 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 155 + value: null +full_determinism: + desc: '' + sort: 190 + value: false +galore_config: + desc: '' + sort: 231 + value: null +generation_config: + desc: '' + sort: 210 + value: null +generation_max_length: + desc: '' + sort: 208 + value: null +generation_num_beams: + desc: '' + sort: 209 + value: null +gradient_accumulation_steps: + desc: '' + sort: 88 + value: 2 +gradient_checkpointing: + desc: '' + sort: 179 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 180 + value: null +greater_is_better: + desc: '' + sort: 149 + value: false +group_by_length: + desc: '' + sort: 162 + value: false +half_precision_backend: + desc: '' + sort: 129 + value: auto +head_dim: + desc: '' + sort: 19 + value: 128 +hidden_act: + desc: '' + sort: 9 + value: silu +hidden_size: + desc: '' + sort: 4 + value: 4096 +hub_always_push: + desc: '' + sort: 178 + value: false +hub_model_id: + desc: '' + sort: 174 + value: null +hub_private_repo: + desc: '' + sort: 177 + value: null +hub_strategy: + desc: '' + sort: 175 + value: every_save +hub_token: + desc: '' + sort: 176 + value: +id2label: + desc: '' + sort: 62 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 150 + value: false +include_for_metrics: + desc: '' + sort: 182 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 181 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 198 + value: false +include_tokens_per_second: + desc: '' + sort: 197 + value: false +initializer_range: + desc: '' + sort: 10 + value: 0.02 +intermediate_size: + desc: '' + sort: 5 + value: 14336 +is_decoder: + desc: '' + sort: 31 + value: false +is_encoder_decoder: + desc: '' + sort: 30 + value: false +jit_mode_eval: + desc: '' + sort: 124 + value: false +label2id: + desc: '' + sort: 63 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 146 + value: null +label_smoothing_factor: + desc: '' + sort: 158 + value: 0.0 +learning_rate: + desc: '' + sort: 92 + value: 1.0e-06 +length_column_name: + desc: '' + sort: 163 + value: length +length_penalty: + desc: '' + sort: 47 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 147 + value: false +local_rank: + desc: '' + sort: 133 + value: 0 +local_repo_path: + desc: '' + sort: 230 + value: null +log_level: + desc: '' + sort: 104 + value: passive +log_level_replica: + desc: '' + sort: 105 + value: warning +log_on_each_node: + desc: '' + sort: 106 + value: true +logging_dir: + desc: '' + sort: 107 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v0-20250629-082750/runs +logging_first_step: + desc: '' + sort: 109 + value: true +logging_nan_inf_filter: + desc: '' + sort: 111 + value: true +logging_steps: + desc: '' + sort: 110 + value: 1 +logging_strategy: + desc: '' + sort: 108 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 101 + value: null +lr_scheduler_type: + desc: '' + sort: 100 + value: cosine +max_epochs: + desc: '' + sort: 215 + value: null +max_grad_norm: + desc: '' + sort: 97 + value: 1.0 +max_length: + desc: '' + sort: 35 + value: 20 +max_position_embeddings: + desc: '' + sort: 3 + value: 131072 +max_steps: + desc: '' + sort: 99 + value: -1 +metric_for_best_model: + desc: '' + sort: 148 + value: loss +metric_warmup_step: + desc: '' + sort: 221 + value: 0 +min_length: + desc: '' + sort: 36 + value: 0 +mlp_bias: + desc: '' + sort: 18 + value: false +model_num_parameters: + desc: '' + sort: 232 + value: 0 +model_type: + desc: '' + sort: 76 + value: llama +mp_parameters: + desc: '' + sort: 188 + value: '' +neftune_noise_alpha: + desc: '' + sort: 199 + value: null +no_cuda: + desc: '' + sort: 119 + value: false +no_repeat_ngram_size: + desc: '' + sort: 48 + value: 0 +num_attention_heads: + desc: '' + sort: 7 + value: 32 +num_beam_groups: + desc: '' + sort: 40 + value: 1 +num_beams: + desc: '' + sort: 39 + value: 1 +num_hidden_layers: + desc: '' + sort: 6 + value: 32 +num_key_value_heads: + desc: '' + sort: 8 + value: 8 +num_return_sequences: + desc: '' + sort: 51 + value: 1 +num_train_epochs: + desc: '' + sort: 98 + value: 5.0 +optim: + desc: '' + sort: 159 + value: adamw_torch +optim_args: + desc: '' + sort: 160 + value: null +optim_target_modules: + desc: '' + sort: 200 + value: null +optimizer: + desc: '' + sort: 218 + value: null +output_attentions: + desc: '' + sort: 22 + value: false +output_dir: + desc: '' + sort: 77 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v0-20250629-082750 +output_hidden_states: + desc: '' + sort: 21 + value: false +output_scores: + desc: '' + sort: 52 + value: false +overwrite_output_dir: + desc: '' + sort: 78 + value: false +pad_token_id: + desc: '' + sort: 67 + value: 128009 +past_index: + desc: '' + sort: 142 + value: -1 +per_device_eval_batch_size: + desc: '' + sort: 85 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 84 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 87 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 86 + value: null +predict_with_generate: + desc: '' + sort: 207 + value: false +prediction_loss_only: + desc: '' + sort: 83 + value: false +prefix: + desc: '' + sort: 65 + value: null +pretraining_tp: + desc: '' + sort: 12 + value: 1 +problem_type: + desc: '' + sort: 72 + value: null +pruned_heads: + desc: '' + sort: 27 + value: {} +push_to_hub: + desc: '' + sort: 172 + value: false +push_to_hub_model_id: + desc: '' + sort: 185 + value: null +push_to_hub_organization: + desc: '' + sort: 186 + value: null +push_to_hub_token: + desc: '' + sort: 187 + value: +ray_scope: + desc: '' + sort: 192 + value: last +remove_invalid_values: + desc: '' + sort: 56 + value: false +remove_unused_columns: + desc: '' + sort: 145 + value: false +repetition_penalty: + desc: '' + sort: 46 + value: 1.0 +report_to: + desc: '' + sort: 164 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 118 + value: false +resume_from_checkpoint: + desc: '' + sort: 173 + value: null +return_dict: + desc: '' + sort: 20 + value: true +return_dict_in_generate: + desc: '' + sort: 53 + value: false +rms_norm_eps: + desc: '' + sort: 11 + value: 1.0e-05 +rope_scaling: + desc: '' + sort: 15 + value: + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + desc: '' + sort: 14 + value: 500000.0 +run_name: + desc: '' + sort: 143 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v0-20250629-082750 +save_on_each_node: + desc: '' + sort: 116 + value: false +save_only_model: + desc: '' + sort: 117 + value: false +save_safetensors: + desc: '' + sort: 115 + value: true +save_steps: + desc: '' + sort: 113 + value: 500 +save_strategy: + desc: '' + sort: 112 + value: steps +save_total_limit: + desc: '' + sort: 114 + value: 1 +seed: + desc: '' + sort: 122 + value: 42 +sep_token_id: + desc: '' + sort: 69 + value: null +skip_memory_metrics: + desc: '' + sort: 170 + value: true +sortish_sampler: + desc: '' + sort: 206 + value: false +suppress_tokens: + desc: '' + sort: 58 + value: null +task_specific_params: + desc: '' + sort: 71 + value: null +temperature: + desc: '' + sort: 42 + value: 1.0 +tf32: + desc: '' + sort: 132 + value: null +tf_legacy_loss: + desc: '' + sort: 26 + value: false +tie_encoder_decoder: + desc: '' + sort: 34 + value: false +tie_word_embeddings: + desc: '' + sort: 28 + value: false +tokenizer_class: + desc: '' + sort: 64 + value: null +top_k: + desc: '' + sort: 43 + value: 50 +top_p: + desc: '' + sort: 44 + value: 1.0 +torch_compile: + desc: '' + sort: 194 + value: false +torch_compile_backend: + desc: '' + sort: 195 + value: null +torch_compile_mode: + desc: '' + sort: 196 + value: null +torch_dtype: + desc: '' + sort: 24 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 91 + value: null +torchdynamo: + desc: '' + sort: 191 + value: null +torchscript: + desc: '' + sort: 23 + value: false +tp_size: + desc: '' + sort: 154 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 136 + value: false +tpu_num_cores: + desc: '' + sort: 135 + value: null +train_dataloader_shuffle: + desc: '' + sort: 214 + value: true +train_type: + desc: '' + sort: 229 + value: full +transformers_version: + desc: '' + sort: 75 + value: 4.51.3 +typical_p: + desc: '' + sort: 45 + value: 1.0 +use_bfloat16: + desc: '' + sort: 25 + value: false +use_cache: + desc: '' + sort: 13 + value: false +use_cpu: + desc: '' + sort: 120 + value: false +use_ipex: + desc: '' + sort: 125 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 171 + value: false +use_liger_kernel: + desc: '' + sort: 203 + value: false +use_logits_to_keep: + desc: '' + sort: 219 + value: null +use_mps_device: + desc: '' + sort: 121 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 211 + value: true +vit_lr: + desc: '' + sort: 217 + value: null +vocab_size: + desc: '' + sort: 2 + value: 128256 +warmup_ratio: + desc: '' + sort: 102 + value: 0.05 +warmup_steps: + desc: '' + sort: 103 + value: 0 +weight_decay: + desc: '' + sort: 93 + value: 0.0001 diff --git a/swanlog/run-20250629_082850-a3b1799d/files/requirements.txt b/swanlog/run-20250629_082850-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250629_082850-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250629_082850-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250629_082850-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..3e6a5d242b5c144b319d4d6ec1d150f05dedaf53 --- /dev/null +++ b/swanlog/run-20250629_082850-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 763184, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset /mnt/data/users/liamding/data/sft_zh_tox/data/train_data/r1_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-6 --split_dataset_ratio 0.1 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 32768 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_r1_v2 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_082850-a3b1799d"}} \ No newline at end of file diff --git a/swanlog/run-20250629_084639-a3b1799d/backup.swanlab b/swanlog/run-20250629_084639-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..dae8dd7bc2fd8cb1cf2e34a09a2da2bdc0b4fa79 --- /dev/null +++ b/swanlog/run-20250629_084639-a3b1799d/backup.swanlab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de4f47ced940649ad542ffc1570817bcfc7f985b9b741022cacfdb83d928a45 +size 877654 diff --git a/swanlog/run-20250629_084639-a3b1799d/files/config.yaml b/swanlog/run-20250629_084639-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..37b400ed9b7bf4cce7b0a933f51618452192d4ea --- /dev/null +++ b/swanlog/run-20250629_084639-a3b1799d/files/config.yaml @@ -0,0 +1,990 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 74 + value: true +_name_or_path: + desc: '' + sort: 73 + value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct +acc_steps: + desc: '' + sort: 223 + value: 1 +acc_strategy: + desc: '' + sort: 213 + value: token +accelerator_config: + desc: '' + sort: 156 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 161 + value: false +adam_beta1: + desc: '' + sort: 94 + value: 0.9 +adam_beta2: + desc: '' + sort: 95 + value: 0.95 +adam_epsilon: + desc: '' + sort: 96 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 33 + value: false +aligner_lr: + desc: '' + sort: 216 + value: null +architectures: + desc: '' + sort: 60 + value: + - LlamaForCausalLM +attention_bias: + desc: '' + sort: 16 + value: false +attention_dropout: + desc: '' + sort: 17 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 189 + value: false +average_tokens_across_devices: + desc: '' + sort: 205 + value: false +bad_words_ids: + desc: '' + sort: 50 + value: null +batch_eval_metrics: + desc: '' + sort: 201 + value: false +begin_suppress_tokens: + desc: '' + sort: 59 + value: null +bf16: + desc: '' + sort: 126 + value: true +bf16_full_eval: + desc: '' + sort: 130 + value: false +bos_token_id: + desc: '' + sort: 66 + value: 128000 +channels: + desc: '' + sort: 220 + value: null +check_model: + desc: '' + sort: 212 + value: true +chunk_size_feed_forward: + desc: '' + sort: 29 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 32 + value: null +data_seed: + desc: '' + sort: 123 + value: 42 +dataloader_drop_last: + desc: '' + sort: 138 + value: false +dataloader_num_workers: + desc: '' + sort: 140 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 169 + value: false +dataloader_pin_memory: + desc: '' + sort: 168 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 141 + value: 10 +ddp_backend: + desc: '' + sort: 134 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 167 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 166 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 165 + value: null +ddp_timeout: + desc: '' + sort: 193 + value: 18000000 +debug: + desc: '' + sort: 137 + value: [] +decoder_start_token_id: + desc: '' + sort: 70 + value: null +deepspeed: + desc: '' + sort: 157 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 144 + value: false +diversity_penalty: + desc: '' + sort: 41 + value: 0.0 +do_eval: + desc: '' + sort: 80 + value: true +do_predict: + desc: '' + sort: 81 + value: false +do_sample: + desc: '' + sort: 37 + value: false +do_train: + desc: '' + sort: 79 + value: false +early_stopping: + desc: '' + sort: 38 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 49 + value: 0 +eos_token_id: + desc: '' + sort: 68 + value: + - 128001 + - 128008 + - 128009 +eval_accumulation_steps: + desc: '' + sort: 89 + value: null +eval_datasets: + desc: '' + sort: 225 + value: [] +eval_datasets_args: + desc: '' + sort: 227 + value: null +eval_delay: + desc: '' + sort: 90 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 183 + value: true +eval_generation_config: + desc: '' + sort: 228 + value: null +eval_limit: + desc: '' + sort: 226 + value: null +eval_on_start: + desc: '' + sort: 202 + value: false +eval_steps: + desc: '' + sort: 139 + value: null +eval_strategy: + desc: '' + sort: 82 + value: epoch +eval_use_evalscope: + desc: '' + sort: 224 + value: false +eval_use_gather_object: + desc: '' + sort: 204 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 57 + value: null +finetuning_task: + desc: '' + sort: 61 + value: null +forced_bos_token_id: + desc: '' + sort: 54 + value: null +forced_eos_token_id: + desc: '' + sort: 55 + value: null +fp16: + desc: '' + sort: 127 + value: false +fp16_backend: + desc: '' + sort: 184 + value: auto +fp16_full_eval: + desc: '' + sort: 131 + value: false +fp16_opt_level: + desc: '' + sort: 128 + value: O1 +fsdp: + desc: '' + sort: 151 + value: [] +fsdp_config: + desc: '' + sort: 153 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 152 + value: 0 +fsdp_num: + desc: '' + sort: 222 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 155 + value: null +full_determinism: + desc: '' + sort: 190 + value: false +galore_config: + desc: '' + sort: 231 + value: null +generation_config: + desc: '' + sort: 210 + value: null +generation_max_length: + desc: '' + sort: 208 + value: null +generation_num_beams: + desc: '' + sort: 209 + value: null +gradient_accumulation_steps: + desc: '' + sort: 88 + value: 2 +gradient_checkpointing: + desc: '' + sort: 179 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 180 + value: null +greater_is_better: + desc: '' + sort: 149 + value: false +group_by_length: + desc: '' + sort: 162 + value: false +half_precision_backend: + desc: '' + sort: 129 + value: auto +head_dim: + desc: '' + sort: 19 + value: 128 +hidden_act: + desc: '' + sort: 9 + value: silu +hidden_size: + desc: '' + sort: 4 + value: 4096 +hub_always_push: + desc: '' + sort: 178 + value: false +hub_model_id: + desc: '' + sort: 174 + value: null +hub_private_repo: + desc: '' + sort: 177 + value: null +hub_strategy: + desc: '' + sort: 175 + value: every_save +hub_token: + desc: '' + sort: 176 + value: +id2label: + desc: '' + sort: 62 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 150 + value: false +include_for_metrics: + desc: '' + sort: 182 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 181 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 198 + value: false +include_tokens_per_second: + desc: '' + sort: 197 + value: false +initializer_range: + desc: '' + sort: 10 + value: 0.02 +intermediate_size: + desc: '' + sort: 5 + value: 14336 +is_decoder: + desc: '' + sort: 31 + value: false +is_encoder_decoder: + desc: '' + sort: 30 + value: false +jit_mode_eval: + desc: '' + sort: 124 + value: false +label2id: + desc: '' + sort: 63 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 146 + value: null +label_smoothing_factor: + desc: '' + sort: 158 + value: 0.0 +learning_rate: + desc: '' + sort: 92 + value: 1.0e-06 +length_column_name: + desc: '' + sort: 163 + value: length +length_penalty: + desc: '' + sort: 47 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 147 + value: false +local_rank: + desc: '' + sort: 133 + value: 0 +local_repo_path: + desc: '' + sort: 230 + value: null +log_level: + desc: '' + sort: 104 + value: passive +log_level_replica: + desc: '' + sort: 105 + value: warning +log_on_each_node: + desc: '' + sort: 106 + value: true +logging_dir: + desc: '' + sort: 107 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v1-20250629-084541/runs +logging_first_step: + desc: '' + sort: 109 + value: true +logging_nan_inf_filter: + desc: '' + sort: 111 + value: true +logging_steps: + desc: '' + sort: 110 + value: 1 +logging_strategy: + desc: '' + sort: 108 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 101 + value: null +lr_scheduler_type: + desc: '' + sort: 100 + value: cosine +max_epochs: + desc: '' + sort: 215 + value: null +max_grad_norm: + desc: '' + sort: 97 + value: 1.0 +max_length: + desc: '' + sort: 35 + value: 20 +max_position_embeddings: + desc: '' + sort: 3 + value: 131072 +max_steps: + desc: '' + sort: 99 + value: -1 +metric_for_best_model: + desc: '' + sort: 148 + value: loss +metric_warmup_step: + desc: '' + sort: 221 + value: 0 +min_length: + desc: '' + sort: 36 + value: 0 +mlp_bias: + desc: '' + sort: 18 + value: false +model_num_parameters: + desc: '' + sort: 232 + value: 0 +model_type: + desc: '' + sort: 76 + value: llama +mp_parameters: + desc: '' + sort: 188 + value: '' +neftune_noise_alpha: + desc: '' + sort: 199 + value: null +no_cuda: + desc: '' + sort: 119 + value: false +no_repeat_ngram_size: + desc: '' + sort: 48 + value: 0 +num_attention_heads: + desc: '' + sort: 7 + value: 32 +num_beam_groups: + desc: '' + sort: 40 + value: 1 +num_beams: + desc: '' + sort: 39 + value: 1 +num_hidden_layers: + desc: '' + sort: 6 + value: 32 +num_key_value_heads: + desc: '' + sort: 8 + value: 8 +num_return_sequences: + desc: '' + sort: 51 + value: 1 +num_train_epochs: + desc: '' + sort: 98 + value: 5.0 +optim: + desc: '' + sort: 159 + value: adamw_torch +optim_args: + desc: '' + sort: 160 + value: null +optim_target_modules: + desc: '' + sort: 200 + value: null +optimizer: + desc: '' + sort: 218 + value: null +output_attentions: + desc: '' + sort: 22 + value: false +output_dir: + desc: '' + sort: 77 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v1-20250629-084541 +output_hidden_states: + desc: '' + sort: 21 + value: false +output_scores: + desc: '' + sort: 52 + value: false +overwrite_output_dir: + desc: '' + sort: 78 + value: false +pad_token_id: + desc: '' + sort: 67 + value: 128009 +past_index: + desc: '' + sort: 142 + value: -1 +per_device_eval_batch_size: + desc: '' + sort: 85 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 84 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 87 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 86 + value: null +predict_with_generate: + desc: '' + sort: 207 + value: false +prediction_loss_only: + desc: '' + sort: 83 + value: false +prefix: + desc: '' + sort: 65 + value: null +pretraining_tp: + desc: '' + sort: 12 + value: 1 +problem_type: + desc: '' + sort: 72 + value: null +pruned_heads: + desc: '' + sort: 27 + value: {} +push_to_hub: + desc: '' + sort: 172 + value: false +push_to_hub_model_id: + desc: '' + sort: 185 + value: null +push_to_hub_organization: + desc: '' + sort: 186 + value: null +push_to_hub_token: + desc: '' + sort: 187 + value: +ray_scope: + desc: '' + sort: 192 + value: last +remove_invalid_values: + desc: '' + sort: 56 + value: false +remove_unused_columns: + desc: '' + sort: 145 + value: false +repetition_penalty: + desc: '' + sort: 46 + value: 1.0 +report_to: + desc: '' + sort: 164 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 118 + value: false +resume_from_checkpoint: + desc: '' + sort: 173 + value: null +return_dict: + desc: '' + sort: 20 + value: true +return_dict_in_generate: + desc: '' + sort: 53 + value: false +rms_norm_eps: + desc: '' + sort: 11 + value: 1.0e-05 +rope_scaling: + desc: '' + sort: 15 + value: + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + desc: '' + sort: 14 + value: 500000.0 +run_name: + desc: '' + sort: 143 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v1-20250629-084541 +save_on_each_node: + desc: '' + sort: 116 + value: false +save_only_model: + desc: '' + sort: 117 + value: false +save_safetensors: + desc: '' + sort: 115 + value: true +save_steps: + desc: '' + sort: 113 + value: 500 +save_strategy: + desc: '' + sort: 112 + value: steps +save_total_limit: + desc: '' + sort: 114 + value: 1 +seed: + desc: '' + sort: 122 + value: 42 +sep_token_id: + desc: '' + sort: 69 + value: null +skip_memory_metrics: + desc: '' + sort: 170 + value: true +sortish_sampler: + desc: '' + sort: 206 + value: false +suppress_tokens: + desc: '' + sort: 58 + value: null +task_specific_params: + desc: '' + sort: 71 + value: null +temperature: + desc: '' + sort: 42 + value: 1.0 +tf32: + desc: '' + sort: 132 + value: null +tf_legacy_loss: + desc: '' + sort: 26 + value: false +tie_encoder_decoder: + desc: '' + sort: 34 + value: false +tie_word_embeddings: + desc: '' + sort: 28 + value: false +tokenizer_class: + desc: '' + sort: 64 + value: null +top_k: + desc: '' + sort: 43 + value: 50 +top_p: + desc: '' + sort: 44 + value: 1.0 +torch_compile: + desc: '' + sort: 194 + value: false +torch_compile_backend: + desc: '' + sort: 195 + value: null +torch_compile_mode: + desc: '' + sort: 196 + value: null +torch_dtype: + desc: '' + sort: 24 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 91 + value: null +torchdynamo: + desc: '' + sort: 191 + value: null +torchscript: + desc: '' + sort: 23 + value: false +tp_size: + desc: '' + sort: 154 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 136 + value: false +tpu_num_cores: + desc: '' + sort: 135 + value: null +train_dataloader_shuffle: + desc: '' + sort: 214 + value: true +train_type: + desc: '' + sort: 229 + value: full +transformers_version: + desc: '' + sort: 75 + value: 4.51.3 +typical_p: + desc: '' + sort: 45 + value: 1.0 +use_bfloat16: + desc: '' + sort: 25 + value: false +use_cache: + desc: '' + sort: 13 + value: false +use_cpu: + desc: '' + sort: 120 + value: false +use_ipex: + desc: '' + sort: 125 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 171 + value: false +use_liger_kernel: + desc: '' + sort: 203 + value: false +use_logits_to_keep: + desc: '' + sort: 219 + value: null +use_mps_device: + desc: '' + sort: 121 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 211 + value: true +vit_lr: + desc: '' + sort: 217 + value: null +vocab_size: + desc: '' + sort: 2 + value: 128256 +warmup_ratio: + desc: '' + sort: 102 + value: 0.05 +warmup_steps: + desc: '' + sort: 103 + value: 0 +weight_decay: + desc: '' + sort: 93 + value: 0.0001 diff --git a/swanlog/run-20250629_084639-a3b1799d/files/requirements.txt b/swanlog/run-20250629_084639-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250629_084639-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250629_084639-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250629_084639-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8a19a885bbe1341396cf01ea654591932de278f6 --- /dev/null +++ b/swanlog/run-20250629_084639-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 2000666, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset /mnt/data/users/liamding/data/sft_zh_tox/data/train_data/r1_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 32768 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_r1_v2 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_084639-a3b1799d"}} \ No newline at end of file diff --git a/swanlog/run-20250629_090551-a3b1799d/backup.swanlab b/swanlog/run-20250629_090551-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..7fdb5f3e37358217a819c1c4dcf61177023ff513 --- /dev/null +++ b/swanlog/run-20250629_090551-a3b1799d/backup.swanlab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95d54e71f7495a22679cfd8ebffd788dba92519665b3f36af1291b916036f38d +size 825733 diff --git a/swanlog/run-20250629_090551-a3b1799d/files/config.yaml b/swanlog/run-20250629_090551-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..79f1876037172ef88fca185078b34cf1c5aa307d --- /dev/null +++ b/swanlog/run-20250629_090551-a3b1799d/files/config.yaml @@ -0,0 +1,990 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 74 + value: true +_name_or_path: + desc: '' + sort: 73 + value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct +acc_steps: + desc: '' + sort: 223 + value: 1 +acc_strategy: + desc: '' + sort: 213 + value: token +accelerator_config: + desc: '' + sort: 156 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 161 + value: false +adam_beta1: + desc: '' + sort: 94 + value: 0.9 +adam_beta2: + desc: '' + sort: 95 + value: 0.95 +adam_epsilon: + desc: '' + sort: 96 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 33 + value: false +aligner_lr: + desc: '' + sort: 216 + value: null +architectures: + desc: '' + sort: 60 + value: + - LlamaForCausalLM +attention_bias: + desc: '' + sort: 16 + value: false +attention_dropout: + desc: '' + sort: 17 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 189 + value: false +average_tokens_across_devices: + desc: '' + sort: 205 + value: false +bad_words_ids: + desc: '' + sort: 50 + value: null +batch_eval_metrics: + desc: '' + sort: 201 + value: false +begin_suppress_tokens: + desc: '' + sort: 59 + value: null +bf16: + desc: '' + sort: 126 + value: true +bf16_full_eval: + desc: '' + sort: 130 + value: false +bos_token_id: + desc: '' + sort: 66 + value: 128000 +channels: + desc: '' + sort: 220 + value: null +check_model: + desc: '' + sort: 212 + value: true +chunk_size_feed_forward: + desc: '' + sort: 29 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 32 + value: null +data_seed: + desc: '' + sort: 123 + value: 42 +dataloader_drop_last: + desc: '' + sort: 138 + value: false +dataloader_num_workers: + desc: '' + sort: 140 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 169 + value: false +dataloader_pin_memory: + desc: '' + sort: 168 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 141 + value: 10 +ddp_backend: + desc: '' + sort: 134 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 167 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 166 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 165 + value: null +ddp_timeout: + desc: '' + sort: 193 + value: 18000000 +debug: + desc: '' + sort: 137 + value: [] +decoder_start_token_id: + desc: '' + sort: 70 + value: null +deepspeed: + desc: '' + sort: 157 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 144 + value: false +diversity_penalty: + desc: '' + sort: 41 + value: 0.0 +do_eval: + desc: '' + sort: 80 + value: true +do_predict: + desc: '' + sort: 81 + value: false +do_sample: + desc: '' + sort: 37 + value: false +do_train: + desc: '' + sort: 79 + value: false +early_stopping: + desc: '' + sort: 38 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 49 + value: 0 +eos_token_id: + desc: '' + sort: 68 + value: + - 128001 + - 128008 + - 128009 +eval_accumulation_steps: + desc: '' + sort: 89 + value: null +eval_datasets: + desc: '' + sort: 225 + value: [] +eval_datasets_args: + desc: '' + sort: 227 + value: null +eval_delay: + desc: '' + sort: 90 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 183 + value: true +eval_generation_config: + desc: '' + sort: 228 + value: null +eval_limit: + desc: '' + sort: 226 + value: null +eval_on_start: + desc: '' + sort: 202 + value: false +eval_steps: + desc: '' + sort: 139 + value: null +eval_strategy: + desc: '' + sort: 82 + value: epoch +eval_use_evalscope: + desc: '' + sort: 224 + value: false +eval_use_gather_object: + desc: '' + sort: 204 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 57 + value: null +finetuning_task: + desc: '' + sort: 61 + value: null +forced_bos_token_id: + desc: '' + sort: 54 + value: null +forced_eos_token_id: + desc: '' + sort: 55 + value: null +fp16: + desc: '' + sort: 127 + value: false +fp16_backend: + desc: '' + sort: 184 + value: auto +fp16_full_eval: + desc: '' + sort: 131 + value: false +fp16_opt_level: + desc: '' + sort: 128 + value: O1 +fsdp: + desc: '' + sort: 151 + value: [] +fsdp_config: + desc: '' + sort: 153 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 152 + value: 0 +fsdp_num: + desc: '' + sort: 222 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 155 + value: null +full_determinism: + desc: '' + sort: 190 + value: false +galore_config: + desc: '' + sort: 231 + value: null +generation_config: + desc: '' + sort: 210 + value: null +generation_max_length: + desc: '' + sort: 208 + value: null +generation_num_beams: + desc: '' + sort: 209 + value: null +gradient_accumulation_steps: + desc: '' + sort: 88 + value: 2 +gradient_checkpointing: + desc: '' + sort: 179 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 180 + value: null +greater_is_better: + desc: '' + sort: 149 + value: false +group_by_length: + desc: '' + sort: 162 + value: false +half_precision_backend: + desc: '' + sort: 129 + value: auto +head_dim: + desc: '' + sort: 19 + value: 128 +hidden_act: + desc: '' + sort: 9 + value: silu +hidden_size: + desc: '' + sort: 4 + value: 4096 +hub_always_push: + desc: '' + sort: 178 + value: false +hub_model_id: + desc: '' + sort: 174 + value: null +hub_private_repo: + desc: '' + sort: 177 + value: null +hub_strategy: + desc: '' + sort: 175 + value: every_save +hub_token: + desc: '' + sort: 176 + value: +id2label: + desc: '' + sort: 62 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 150 + value: false +include_for_metrics: + desc: '' + sort: 182 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 181 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 198 + value: false +include_tokens_per_second: + desc: '' + sort: 197 + value: false +initializer_range: + desc: '' + sort: 10 + value: 0.02 +intermediate_size: + desc: '' + sort: 5 + value: 14336 +is_decoder: + desc: '' + sort: 31 + value: false +is_encoder_decoder: + desc: '' + sort: 30 + value: false +jit_mode_eval: + desc: '' + sort: 124 + value: false +label2id: + desc: '' + sort: 63 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 146 + value: null +label_smoothing_factor: + desc: '' + sort: 158 + value: 0.0 +learning_rate: + desc: '' + sort: 92 + value: 1.0e-05 +length_column_name: + desc: '' + sort: 163 + value: length +length_penalty: + desc: '' + sort: 47 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 147 + value: false +local_rank: + desc: '' + sort: 133 + value: 0 +local_repo_path: + desc: '' + sort: 230 + value: null +log_level: + desc: '' + sort: 104 + value: passive +log_level_replica: + desc: '' + sort: 105 + value: warning +log_on_each_node: + desc: '' + sort: 106 + value: true +logging_dir: + desc: '' + sort: 107 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal/v2-20250629-090453/runs +logging_first_step: + desc: '' + sort: 109 + value: true +logging_nan_inf_filter: + desc: '' + sort: 111 + value: true +logging_steps: + desc: '' + sort: 110 + value: 1 +logging_strategy: + desc: '' + sort: 108 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 101 + value: null +lr_scheduler_type: + desc: '' + sort: 100 + value: cosine +max_epochs: + desc: '' + sort: 215 + value: null +max_grad_norm: + desc: '' + sort: 97 + value: 1.0 +max_length: + desc: '' + sort: 35 + value: 20 +max_position_embeddings: + desc: '' + sort: 3 + value: 131072 +max_steps: + desc: '' + sort: 99 + value: -1 +metric_for_best_model: + desc: '' + sort: 148 + value: loss +metric_warmup_step: + desc: '' + sort: 221 + value: 0 +min_length: + desc: '' + sort: 36 + value: 0 +mlp_bias: + desc: '' + sort: 18 + value: false +model_num_parameters: + desc: '' + sort: 232 + value: 0 +model_type: + desc: '' + sort: 76 + value: llama +mp_parameters: + desc: '' + sort: 188 + value: '' +neftune_noise_alpha: + desc: '' + sort: 199 + value: null +no_cuda: + desc: '' + sort: 119 + value: false +no_repeat_ngram_size: + desc: '' + sort: 48 + value: 0 +num_attention_heads: + desc: '' + sort: 7 + value: 32 +num_beam_groups: + desc: '' + sort: 40 + value: 1 +num_beams: + desc: '' + sort: 39 + value: 1 +num_hidden_layers: + desc: '' + sort: 6 + value: 32 +num_key_value_heads: + desc: '' + sort: 8 + value: 8 +num_return_sequences: + desc: '' + sort: 51 + value: 1 +num_train_epochs: + desc: '' + sort: 98 + value: 5.0 +optim: + desc: '' + sort: 159 + value: adamw_torch +optim_args: + desc: '' + sort: 160 + value: null +optim_target_modules: + desc: '' + sort: 200 + value: null +optimizer: + desc: '' + sort: 218 + value: null +output_attentions: + desc: '' + sort: 22 + value: false +output_dir: + desc: '' + sort: 77 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal/v2-20250629-090453 +output_hidden_states: + desc: '' + sort: 21 + value: false +output_scores: + desc: '' + sort: 52 + value: false +overwrite_output_dir: + desc: '' + sort: 78 + value: false +pad_token_id: + desc: '' + sort: 67 + value: 128009 +past_index: + desc: '' + sort: 142 + value: -1 +per_device_eval_batch_size: + desc: '' + sort: 85 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 84 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 87 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 86 + value: null +predict_with_generate: + desc: '' + sort: 207 + value: false +prediction_loss_only: + desc: '' + sort: 83 + value: false +prefix: + desc: '' + sort: 65 + value: null +pretraining_tp: + desc: '' + sort: 12 + value: 1 +problem_type: + desc: '' + sort: 72 + value: null +pruned_heads: + desc: '' + sort: 27 + value: {} +push_to_hub: + desc: '' + sort: 172 + value: false +push_to_hub_model_id: + desc: '' + sort: 185 + value: null +push_to_hub_organization: + desc: '' + sort: 186 + value: null +push_to_hub_token: + desc: '' + sort: 187 + value: +ray_scope: + desc: '' + sort: 192 + value: last +remove_invalid_values: + desc: '' + sort: 56 + value: false +remove_unused_columns: + desc: '' + sort: 145 + value: false +repetition_penalty: + desc: '' + sort: 46 + value: 1.0 +report_to: + desc: '' + sort: 164 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 118 + value: false +resume_from_checkpoint: + desc: '' + sort: 173 + value: null +return_dict: + desc: '' + sort: 20 + value: true +return_dict_in_generate: + desc: '' + sort: 53 + value: false +rms_norm_eps: + desc: '' + sort: 11 + value: 1.0e-05 +rope_scaling: + desc: '' + sort: 15 + value: + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + desc: '' + sort: 14 + value: 500000.0 +run_name: + desc: '' + sort: 143 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal/v2-20250629-090453 +save_on_each_node: + desc: '' + sort: 116 + value: false +save_only_model: + desc: '' + sort: 117 + value: false +save_safetensors: + desc: '' + sort: 115 + value: true +save_steps: + desc: '' + sort: 113 + value: 500 +save_strategy: + desc: '' + sort: 112 + value: steps +save_total_limit: + desc: '' + sort: 114 + value: 1 +seed: + desc: '' + sort: 122 + value: 42 +sep_token_id: + desc: '' + sort: 69 + value: null +skip_memory_metrics: + desc: '' + sort: 170 + value: true +sortish_sampler: + desc: '' + sort: 206 + value: false +suppress_tokens: + desc: '' + sort: 58 + value: null +task_specific_params: + desc: '' + sort: 71 + value: null +temperature: + desc: '' + sort: 42 + value: 1.0 +tf32: + desc: '' + sort: 132 + value: null +tf_legacy_loss: + desc: '' + sort: 26 + value: false +tie_encoder_decoder: + desc: '' + sort: 34 + value: false +tie_word_embeddings: + desc: '' + sort: 28 + value: false +tokenizer_class: + desc: '' + sort: 64 + value: null +top_k: + desc: '' + sort: 43 + value: 50 +top_p: + desc: '' + sort: 44 + value: 1.0 +torch_compile: + desc: '' + sort: 194 + value: false +torch_compile_backend: + desc: '' + sort: 195 + value: null +torch_compile_mode: + desc: '' + sort: 196 + value: null +torch_dtype: + desc: '' + sort: 24 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 91 + value: null +torchdynamo: + desc: '' + sort: 191 + value: null +torchscript: + desc: '' + sort: 23 + value: false +tp_size: + desc: '' + sort: 154 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 136 + value: false +tpu_num_cores: + desc: '' + sort: 135 + value: null +train_dataloader_shuffle: + desc: '' + sort: 214 + value: true +train_type: + desc: '' + sort: 229 + value: full +transformers_version: + desc: '' + sort: 75 + value: 4.51.3 +typical_p: + desc: '' + sort: 45 + value: 1.0 +use_bfloat16: + desc: '' + sort: 25 + value: false +use_cache: + desc: '' + sort: 13 + value: false +use_cpu: + desc: '' + sort: 120 + value: false +use_ipex: + desc: '' + sort: 125 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 171 + value: false +use_liger_kernel: + desc: '' + sort: 203 + value: false +use_logits_to_keep: + desc: '' + sort: 219 + value: null +use_mps_device: + desc: '' + sort: 121 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 211 + value: true +vit_lr: + desc: '' + sort: 217 + value: null +vocab_size: + desc: '' + sort: 2 + value: 128256 +warmup_ratio: + desc: '' + sort: 102 + value: 0.05 +warmup_steps: + desc: '' + sort: 103 + value: 0 +weight_decay: + desc: '' + sort: 93 + value: 0.0001 diff --git a/swanlog/run-20250629_090551-a3b1799d/files/requirements.txt b/swanlog/run-20250629_090551-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250629_090551-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250629_090551-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250629_090551-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..51ddce8ae361f54b0f87914f485a976fb2b29470 --- /dev/null +++ b/swanlog/run-20250629_090551-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 3395381, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-5 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_090551-a3b1799d"}} \ No newline at end of file diff --git a/swanlog/run-20250629_092305-a3b1799d/backup.swanlab b/swanlog/run-20250629_092305-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..22db2d310ebd28196631c61a953fa2fb6ad614f0 --- /dev/null +++ b/swanlog/run-20250629_092305-a3b1799d/backup.swanlab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9946f2b8edee4800761eff6a0659d712d57f79153999431b644f76de3feeb126 +size 825740 diff --git a/swanlog/run-20250629_092305-a3b1799d/files/config.yaml b/swanlog/run-20250629_092305-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1dadaab6d1a6fb6762d9cdf1cbff790491c0b922 --- /dev/null +++ b/swanlog/run-20250629_092305-a3b1799d/files/config.yaml @@ -0,0 +1,990 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 74 + value: true +_name_or_path: + desc: '' + sort: 73 + value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct +acc_steps: + desc: '' + sort: 223 + value: 1 +acc_strategy: + desc: '' + sort: 213 + value: token +accelerator_config: + desc: '' + sort: 156 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 161 + value: false +adam_beta1: + desc: '' + sort: 94 + value: 0.9 +adam_beta2: + desc: '' + sort: 95 + value: 0.95 +adam_epsilon: + desc: '' + sort: 96 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 33 + value: false +aligner_lr: + desc: '' + sort: 216 + value: null +architectures: + desc: '' + sort: 60 + value: + - LlamaForCausalLM +attention_bias: + desc: '' + sort: 16 + value: false +attention_dropout: + desc: '' + sort: 17 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 189 + value: false +average_tokens_across_devices: + desc: '' + sort: 205 + value: false +bad_words_ids: + desc: '' + sort: 50 + value: null +batch_eval_metrics: + desc: '' + sort: 201 + value: false +begin_suppress_tokens: + desc: '' + sort: 59 + value: null +bf16: + desc: '' + sort: 126 + value: true +bf16_full_eval: + desc: '' + sort: 130 + value: false +bos_token_id: + desc: '' + sort: 66 + value: 128000 +channels: + desc: '' + sort: 220 + value: null +check_model: + desc: '' + sort: 212 + value: true +chunk_size_feed_forward: + desc: '' + sort: 29 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 32 + value: null +data_seed: + desc: '' + sort: 123 + value: 42 +dataloader_drop_last: + desc: '' + sort: 138 + value: false +dataloader_num_workers: + desc: '' + sort: 140 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 169 + value: false +dataloader_pin_memory: + desc: '' + sort: 168 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 141 + value: 10 +ddp_backend: + desc: '' + sort: 134 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 167 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 166 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 165 + value: null +ddp_timeout: + desc: '' + sort: 193 + value: 18000000 +debug: + desc: '' + sort: 137 + value: [] +decoder_start_token_id: + desc: '' + sort: 70 + value: null +deepspeed: + desc: '' + sort: 157 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 144 + value: false +diversity_penalty: + desc: '' + sort: 41 + value: 0.0 +do_eval: + desc: '' + sort: 80 + value: true +do_predict: + desc: '' + sort: 81 + value: false +do_sample: + desc: '' + sort: 37 + value: false +do_train: + desc: '' + sort: 79 + value: false +early_stopping: + desc: '' + sort: 38 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 49 + value: 0 +eos_token_id: + desc: '' + sort: 68 + value: + - 128001 + - 128008 + - 128009 +eval_accumulation_steps: + desc: '' + sort: 89 + value: null +eval_datasets: + desc: '' + sort: 225 + value: [] +eval_datasets_args: + desc: '' + sort: 227 + value: null +eval_delay: + desc: '' + sort: 90 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 183 + value: true +eval_generation_config: + desc: '' + sort: 228 + value: null +eval_limit: + desc: '' + sort: 226 + value: null +eval_on_start: + desc: '' + sort: 202 + value: false +eval_steps: + desc: '' + sort: 139 + value: null +eval_strategy: + desc: '' + sort: 82 + value: epoch +eval_use_evalscope: + desc: '' + sort: 224 + value: false +eval_use_gather_object: + desc: '' + sort: 204 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 57 + value: null +finetuning_task: + desc: '' + sort: 61 + value: null +forced_bos_token_id: + desc: '' + sort: 54 + value: null +forced_eos_token_id: + desc: '' + sort: 55 + value: null +fp16: + desc: '' + sort: 127 + value: false +fp16_backend: + desc: '' + sort: 184 + value: auto +fp16_full_eval: + desc: '' + sort: 131 + value: false +fp16_opt_level: + desc: '' + sort: 128 + value: O1 +fsdp: + desc: '' + sort: 151 + value: [] +fsdp_config: + desc: '' + sort: 153 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 152 + value: 0 +fsdp_num: + desc: '' + sort: 222 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 155 + value: null +full_determinism: + desc: '' + sort: 190 + value: false +galore_config: + desc: '' + sort: 231 + value: null +generation_config: + desc: '' + sort: 210 + value: null +generation_max_length: + desc: '' + sort: 208 + value: null +generation_num_beams: + desc: '' + sort: 209 + value: null +gradient_accumulation_steps: + desc: '' + sort: 88 + value: 2 +gradient_checkpointing: + desc: '' + sort: 179 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 180 + value: null +greater_is_better: + desc: '' + sort: 149 + value: false +group_by_length: + desc: '' + sort: 162 + value: false +half_precision_backend: + desc: '' + sort: 129 + value: auto +head_dim: + desc: '' + sort: 19 + value: 128 +hidden_act: + desc: '' + sort: 9 + value: silu +hidden_size: + desc: '' + sort: 4 + value: 4096 +hub_always_push: + desc: '' + sort: 178 + value: false +hub_model_id: + desc: '' + sort: 174 + value: null +hub_private_repo: + desc: '' + sort: 177 + value: null +hub_strategy: + desc: '' + sort: 175 + value: every_save +hub_token: + desc: '' + sort: 176 + value: +id2label: + desc: '' + sort: 62 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 150 + value: false +include_for_metrics: + desc: '' + sort: 182 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 181 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 198 + value: false +include_tokens_per_second: + desc: '' + sort: 197 + value: false +initializer_range: + desc: '' + sort: 10 + value: 0.02 +intermediate_size: + desc: '' + sort: 5 + value: 14336 +is_decoder: + desc: '' + sort: 31 + value: false +is_encoder_decoder: + desc: '' + sort: 30 + value: false +jit_mode_eval: + desc: '' + sort: 124 + value: false +label2id: + desc: '' + sort: 63 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 146 + value: null +label_smoothing_factor: + desc: '' + sort: 158 + value: 0.0 +learning_rate: + desc: '' + sort: 92 + value: 5.0e-06 +length_column_name: + desc: '' + sort: 163 + value: length +length_penalty: + desc: '' + sort: 47 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 147 + value: false +local_rank: + desc: '' + sort: 133 + value: 0 +local_repo_path: + desc: '' + sort: 230 + value: null +log_level: + desc: '' + sort: 104 + value: passive +log_level_replica: + desc: '' + sort: 105 + value: warning +log_on_each_node: + desc: '' + sort: 106 + value: true +logging_dir: + desc: '' + sort: 107 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v2/v0-20250629-092206/runs +logging_first_step: + desc: '' + sort: 109 + value: true +logging_nan_inf_filter: + desc: '' + sort: 111 + value: true +logging_steps: + desc: '' + sort: 110 + value: 1 +logging_strategy: + desc: '' + sort: 108 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 101 + value: null +lr_scheduler_type: + desc: '' + sort: 100 + value: cosine +max_epochs: + desc: '' + sort: 215 + value: null +max_grad_norm: + desc: '' + sort: 97 + value: 1.0 +max_length: + desc: '' + sort: 35 + value: 20 +max_position_embeddings: + desc: '' + sort: 3 + value: 131072 +max_steps: + desc: '' + sort: 99 + value: -1 +metric_for_best_model: + desc: '' + sort: 148 + value: loss +metric_warmup_step: + desc: '' + sort: 221 + value: 0 +min_length: + desc: '' + sort: 36 + value: 0 +mlp_bias: + desc: '' + sort: 18 + value: false +model_num_parameters: + desc: '' + sort: 232 + value: 0 +model_type: + desc: '' + sort: 76 + value: llama +mp_parameters: + desc: '' + sort: 188 + value: '' +neftune_noise_alpha: + desc: '' + sort: 199 + value: null +no_cuda: + desc: '' + sort: 119 + value: false +no_repeat_ngram_size: + desc: '' + sort: 48 + value: 0 +num_attention_heads: + desc: '' + sort: 7 + value: 32 +num_beam_groups: + desc: '' + sort: 40 + value: 1 +num_beams: + desc: '' + sort: 39 + value: 1 +num_hidden_layers: + desc: '' + sort: 6 + value: 32 +num_key_value_heads: + desc: '' + sort: 8 + value: 8 +num_return_sequences: + desc: '' + sort: 51 + value: 1 +num_train_epochs: + desc: '' + sort: 98 + value: 5.0 +optim: + desc: '' + sort: 159 + value: adamw_torch +optim_args: + desc: '' + sort: 160 + value: null +optim_target_modules: + desc: '' + sort: 200 + value: null +optimizer: + desc: '' + sort: 218 + value: null +output_attentions: + desc: '' + sort: 22 + value: false +output_dir: + desc: '' + sort: 77 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v2/v0-20250629-092206 +output_hidden_states: + desc: '' + sort: 21 + value: false +output_scores: + desc: '' + sort: 52 + value: false +overwrite_output_dir: + desc: '' + sort: 78 + value: false +pad_token_id: + desc: '' + sort: 67 + value: 128009 +past_index: + desc: '' + sort: 142 + value: -1 +per_device_eval_batch_size: + desc: '' + sort: 85 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 84 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 87 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 86 + value: null +predict_with_generate: + desc: '' + sort: 207 + value: false +prediction_loss_only: + desc: '' + sort: 83 + value: false +prefix: + desc: '' + sort: 65 + value: null +pretraining_tp: + desc: '' + sort: 12 + value: 1 +problem_type: + desc: '' + sort: 72 + value: null +pruned_heads: + desc: '' + sort: 27 + value: {} +push_to_hub: + desc: '' + sort: 172 + value: false +push_to_hub_model_id: + desc: '' + sort: 185 + value: null +push_to_hub_organization: + desc: '' + sort: 186 + value: null +push_to_hub_token: + desc: '' + sort: 187 + value: +ray_scope: + desc: '' + sort: 192 + value: last +remove_invalid_values: + desc: '' + sort: 56 + value: false +remove_unused_columns: + desc: '' + sort: 145 + value: false +repetition_penalty: + desc: '' + sort: 46 + value: 1.0 +report_to: + desc: '' + sort: 164 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 118 + value: false +resume_from_checkpoint: + desc: '' + sort: 173 + value: null +return_dict: + desc: '' + sort: 20 + value: true +return_dict_in_generate: + desc: '' + sort: 53 + value: false +rms_norm_eps: + desc: '' + sort: 11 + value: 1.0e-05 +rope_scaling: + desc: '' + sort: 15 + value: + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + desc: '' + sort: 14 + value: 500000.0 +run_name: + desc: '' + sort: 143 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v2/v0-20250629-092206 +save_on_each_node: + desc: '' + sort: 116 + value: false +save_only_model: + desc: '' + sort: 117 + value: false +save_safetensors: + desc: '' + sort: 115 + value: true +save_steps: + desc: '' + sort: 113 + value: 500 +save_strategy: + desc: '' + sort: 112 + value: steps +save_total_limit: + desc: '' + sort: 114 + value: 1 +seed: + desc: '' + sort: 122 + value: 42 +sep_token_id: + desc: '' + sort: 69 + value: null +skip_memory_metrics: + desc: '' + sort: 170 + value: true +sortish_sampler: + desc: '' + sort: 206 + value: false +suppress_tokens: + desc: '' + sort: 58 + value: null +task_specific_params: + desc: '' + sort: 71 + value: null +temperature: + desc: '' + sort: 42 + value: 1.0 +tf32: + desc: '' + sort: 132 + value: null +tf_legacy_loss: + desc: '' + sort: 26 + value: false +tie_encoder_decoder: + desc: '' + sort: 34 + value: false +tie_word_embeddings: + desc: '' + sort: 28 + value: false +tokenizer_class: + desc: '' + sort: 64 + value: null +top_k: + desc: '' + sort: 43 + value: 50 +top_p: + desc: '' + sort: 44 + value: 1.0 +torch_compile: + desc: '' + sort: 194 + value: false +torch_compile_backend: + desc: '' + sort: 195 + value: null +torch_compile_mode: + desc: '' + sort: 196 + value: null +torch_dtype: + desc: '' + sort: 24 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 91 + value: null +torchdynamo: + desc: '' + sort: 191 + value: null +torchscript: + desc: '' + sort: 23 + value: false +tp_size: + desc: '' + sort: 154 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 136 + value: false +tpu_num_cores: + desc: '' + sort: 135 + value: null +train_dataloader_shuffle: + desc: '' + sort: 214 + value: true +train_type: + desc: '' + sort: 229 + value: full +transformers_version: + desc: '' + sort: 75 + value: 4.51.3 +typical_p: + desc: '' + sort: 45 + value: 1.0 +use_bfloat16: + desc: '' + sort: 25 + value: false +use_cache: + desc: '' + sort: 13 + value: false +use_cpu: + desc: '' + sort: 120 + value: false +use_ipex: + desc: '' + sort: 125 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 171 + value: false +use_liger_kernel: + desc: '' + sort: 203 + value: false +use_logits_to_keep: + desc: '' + sort: 219 + value: null +use_mps_device: + desc: '' + sort: 121 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 211 + value: true +vit_lr: + desc: '' + sort: 217 + value: null +vocab_size: + desc: '' + sort: 2 + value: 128256 +warmup_ratio: + desc: '' + sort: 102 + value: 0.05 +warmup_steps: + desc: '' + sort: 103 + value: 0 +weight_decay: + desc: '' + sort: 93 + value: 0.0001 diff --git a/swanlog/run-20250629_092305-a3b1799d/files/requirements.txt b/swanlog/run-20250629_092305-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250629_092305-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250629_092305-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250629_092305-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..1a906c52911b71ea4f2ba8dd326c2474555dd19e --- /dev/null +++ b/swanlog/run-20250629_092305-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 344856, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 5e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal_v2 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_092305-a3b1799d"}} \ No newline at end of file diff --git a/swanlog/run-20250629_094047-a3b1799d/backup.swanlab b/swanlog/run-20250629_094047-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..2df13b4af0f1e7a1c45981bb66ecc800bdb5cde8 Binary files /dev/null and b/swanlog/run-20250629_094047-a3b1799d/backup.swanlab differ diff --git a/swanlog/run-20250629_094047-a3b1799d/files/config.yaml b/swanlog/run-20250629_094047-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4604de46f79ae33b8baa1eb55c469979780c04f1 --- /dev/null +++ b/swanlog/run-20250629_094047-a3b1799d/files/config.yaml @@ -0,0 +1,990 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 74 + value: true +_name_or_path: + desc: '' + sort: 73 + value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct +acc_steps: + desc: '' + sort: 223 + value: 1 +acc_strategy: + desc: '' + sort: 213 + value: token +accelerator_config: + desc: '' + sort: 156 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 161 + value: false +adam_beta1: + desc: '' + sort: 94 + value: 0.9 +adam_beta2: + desc: '' + sort: 95 + value: 0.95 +adam_epsilon: + desc: '' + sort: 96 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 33 + value: false +aligner_lr: + desc: '' + sort: 216 + value: null +architectures: + desc: '' + sort: 60 + value: + - LlamaForCausalLM +attention_bias: + desc: '' + sort: 16 + value: false +attention_dropout: + desc: '' + sort: 17 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 189 + value: false +average_tokens_across_devices: + desc: '' + sort: 205 + value: false +bad_words_ids: + desc: '' + sort: 50 + value: null +batch_eval_metrics: + desc: '' + sort: 201 + value: false +begin_suppress_tokens: + desc: '' + sort: 59 + value: null +bf16: + desc: '' + sort: 126 + value: true +bf16_full_eval: + desc: '' + sort: 130 + value: false +bos_token_id: + desc: '' + sort: 66 + value: 128000 +channels: + desc: '' + sort: 220 + value: null +check_model: + desc: '' + sort: 212 + value: true +chunk_size_feed_forward: + desc: '' + sort: 29 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 32 + value: null +data_seed: + desc: '' + sort: 123 + value: 42 +dataloader_drop_last: + desc: '' + sort: 138 + value: false +dataloader_num_workers: + desc: '' + sort: 140 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 169 + value: false +dataloader_pin_memory: + desc: '' + sort: 168 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 141 + value: 10 +ddp_backend: + desc: '' + sort: 134 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 167 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 166 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 165 + value: null +ddp_timeout: + desc: '' + sort: 193 + value: 18000000 +debug: + desc: '' + sort: 137 + value: [] +decoder_start_token_id: + desc: '' + sort: 70 + value: null +deepspeed: + desc: '' + sort: 157 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 144 + value: false +diversity_penalty: + desc: '' + sort: 41 + value: 0.0 +do_eval: + desc: '' + sort: 80 + value: true +do_predict: + desc: '' + sort: 81 + value: false +do_sample: + desc: '' + sort: 37 + value: false +do_train: + desc: '' + sort: 79 + value: false +early_stopping: + desc: '' + sort: 38 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 49 + value: 0 +eos_token_id: + desc: '' + sort: 68 + value: + - 128001 + - 128008 + - 128009 +eval_accumulation_steps: + desc: '' + sort: 89 + value: null +eval_datasets: + desc: '' + sort: 225 + value: [] +eval_datasets_args: + desc: '' + sort: 227 + value: null +eval_delay: + desc: '' + sort: 90 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 183 + value: true +eval_generation_config: + desc: '' + sort: 228 + value: null +eval_limit: + desc: '' + sort: 226 + value: null +eval_on_start: + desc: '' + sort: 202 + value: false +eval_steps: + desc: '' + sort: 139 + value: null +eval_strategy: + desc: '' + sort: 82 + value: epoch +eval_use_evalscope: + desc: '' + sort: 224 + value: false +eval_use_gather_object: + desc: '' + sort: 204 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 57 + value: null +finetuning_task: + desc: '' + sort: 61 + value: null +forced_bos_token_id: + desc: '' + sort: 54 + value: null +forced_eos_token_id: + desc: '' + sort: 55 + value: null +fp16: + desc: '' + sort: 127 + value: false +fp16_backend: + desc: '' + sort: 184 + value: auto +fp16_full_eval: + desc: '' + sort: 131 + value: false +fp16_opt_level: + desc: '' + sort: 128 + value: O1 +fsdp: + desc: '' + sort: 151 + value: [] +fsdp_config: + desc: '' + sort: 153 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 152 + value: 0 +fsdp_num: + desc: '' + sort: 222 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 155 + value: null +full_determinism: + desc: '' + sort: 190 + value: false +galore_config: + desc: '' + sort: 231 + value: null +generation_config: + desc: '' + sort: 210 + value: null +generation_max_length: + desc: '' + sort: 208 + value: null +generation_num_beams: + desc: '' + sort: 209 + value: null +gradient_accumulation_steps: + desc: '' + sort: 88 + value: 2 +gradient_checkpointing: + desc: '' + sort: 179 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 180 + value: null +greater_is_better: + desc: '' + sort: 149 + value: false +group_by_length: + desc: '' + sort: 162 + value: false +half_precision_backend: + desc: '' + sort: 129 + value: auto +head_dim: + desc: '' + sort: 19 + value: 128 +hidden_act: + desc: '' + sort: 9 + value: silu +hidden_size: + desc: '' + sort: 4 + value: 4096 +hub_always_push: + desc: '' + sort: 178 + value: false +hub_model_id: + desc: '' + sort: 174 + value: null +hub_private_repo: + desc: '' + sort: 177 + value: null +hub_strategy: + desc: '' + sort: 175 + value: every_save +hub_token: + desc: '' + sort: 176 + value: +id2label: + desc: '' + sort: 62 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 150 + value: false +include_for_metrics: + desc: '' + sort: 182 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 181 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 198 + value: false +include_tokens_per_second: + desc: '' + sort: 197 + value: false +initializer_range: + desc: '' + sort: 10 + value: 0.02 +intermediate_size: + desc: '' + sort: 5 + value: 14336 +is_decoder: + desc: '' + sort: 31 + value: false +is_encoder_decoder: + desc: '' + sort: 30 + value: false +jit_mode_eval: + desc: '' + sort: 124 + value: false +label2id: + desc: '' + sort: 63 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 146 + value: null +label_smoothing_factor: + desc: '' + sort: 158 + value: 0.0 +learning_rate: + desc: '' + sort: 92 + value: 2.0e-06 +length_column_name: + desc: '' + sort: 163 + value: length +length_penalty: + desc: '' + sort: 47 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 147 + value: false +local_rank: + desc: '' + sort: 133 + value: 0 +local_repo_path: + desc: '' + sort: 230 + value: null +log_level: + desc: '' + sort: 104 + value: passive +log_level_replica: + desc: '' + sort: 105 + value: warning +log_on_each_node: + desc: '' + sort: 106 + value: true +logging_dir: + desc: '' + sort: 107 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v0-20250629-093950/runs +logging_first_step: + desc: '' + sort: 109 + value: true +logging_nan_inf_filter: + desc: '' + sort: 111 + value: true +logging_steps: + desc: '' + sort: 110 + value: 1 +logging_strategy: + desc: '' + sort: 108 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 101 + value: null +lr_scheduler_type: + desc: '' + sort: 100 + value: cosine +max_epochs: + desc: '' + sort: 215 + value: null +max_grad_norm: + desc: '' + sort: 97 + value: 1.0 +max_length: + desc: '' + sort: 35 + value: 20 +max_position_embeddings: + desc: '' + sort: 3 + value: 131072 +max_steps: + desc: '' + sort: 99 + value: -1 +metric_for_best_model: + desc: '' + sort: 148 + value: loss +metric_warmup_step: + desc: '' + sort: 221 + value: 0 +min_length: + desc: '' + sort: 36 + value: 0 +mlp_bias: + desc: '' + sort: 18 + value: false +model_num_parameters: + desc: '' + sort: 232 + value: 0 +model_type: + desc: '' + sort: 76 + value: llama +mp_parameters: + desc: '' + sort: 188 + value: '' +neftune_noise_alpha: + desc: '' + sort: 199 + value: null +no_cuda: + desc: '' + sort: 119 + value: false +no_repeat_ngram_size: + desc: '' + sort: 48 + value: 0 +num_attention_heads: + desc: '' + sort: 7 + value: 32 +num_beam_groups: + desc: '' + sort: 40 + value: 1 +num_beams: + desc: '' + sort: 39 + value: 1 +num_hidden_layers: + desc: '' + sort: 6 + value: 32 +num_key_value_heads: + desc: '' + sort: 8 + value: 8 +num_return_sequences: + desc: '' + sort: 51 + value: 1 +num_train_epochs: + desc: '' + sort: 98 + value: 5.0 +optim: + desc: '' + sort: 159 + value: adamw_torch +optim_args: + desc: '' + sort: 160 + value: null +optim_target_modules: + desc: '' + sort: 200 + value: null +optimizer: + desc: '' + sort: 218 + value: null +output_attentions: + desc: '' + sort: 22 + value: false +output_dir: + desc: '' + sort: 77 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v0-20250629-093950 +output_hidden_states: + desc: '' + sort: 21 + value: false +output_scores: + desc: '' + sort: 52 + value: false +overwrite_output_dir: + desc: '' + sort: 78 + value: false +pad_token_id: + desc: '' + sort: 67 + value: 128009 +past_index: + desc: '' + sort: 142 + value: -1 +per_device_eval_batch_size: + desc: '' + sort: 85 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 84 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 87 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 86 + value: null +predict_with_generate: + desc: '' + sort: 207 + value: false +prediction_loss_only: + desc: '' + sort: 83 + value: false +prefix: + desc: '' + sort: 65 + value: null +pretraining_tp: + desc: '' + sort: 12 + value: 1 +problem_type: + desc: '' + sort: 72 + value: null +pruned_heads: + desc: '' + sort: 27 + value: {} +push_to_hub: + desc: '' + sort: 172 + value: false +push_to_hub_model_id: + desc: '' + sort: 185 + value: null +push_to_hub_organization: + desc: '' + sort: 186 + value: null +push_to_hub_token: + desc: '' + sort: 187 + value: +ray_scope: + desc: '' + sort: 192 + value: last +remove_invalid_values: + desc: '' + sort: 56 + value: false +remove_unused_columns: + desc: '' + sort: 145 + value: false +repetition_penalty: + desc: '' + sort: 46 + value: 1.0 +report_to: + desc: '' + sort: 164 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 118 + value: false +resume_from_checkpoint: + desc: '' + sort: 173 + value: null +return_dict: + desc: '' + sort: 20 + value: true +return_dict_in_generate: + desc: '' + sort: 53 + value: false +rms_norm_eps: + desc: '' + sort: 11 + value: 1.0e-05 +rope_scaling: + desc: '' + sort: 15 + value: + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + desc: '' + sort: 14 + value: 500000.0 +run_name: + desc: '' + sort: 143 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v0-20250629-093950 +save_on_each_node: + desc: '' + sort: 116 + value: false +save_only_model: + desc: '' + sort: 117 + value: false +save_safetensors: + desc: '' + sort: 115 + value: true +save_steps: + desc: '' + sort: 113 + value: 500 +save_strategy: + desc: '' + sort: 112 + value: steps +save_total_limit: + desc: '' + sort: 114 + value: 1 +seed: + desc: '' + sort: 122 + value: 42 +sep_token_id: + desc: '' + sort: 69 + value: null +skip_memory_metrics: + desc: '' + sort: 170 + value: true +sortish_sampler: + desc: '' + sort: 206 + value: false +suppress_tokens: + desc: '' + sort: 58 + value: null +task_specific_params: + desc: '' + sort: 71 + value: null +temperature: + desc: '' + sort: 42 + value: 1.0 +tf32: + desc: '' + sort: 132 + value: null +tf_legacy_loss: + desc: '' + sort: 26 + value: false +tie_encoder_decoder: + desc: '' + sort: 34 + value: false +tie_word_embeddings: + desc: '' + sort: 28 + value: false +tokenizer_class: + desc: '' + sort: 64 + value: null +top_k: + desc: '' + sort: 43 + value: 50 +top_p: + desc: '' + sort: 44 + value: 1.0 +torch_compile: + desc: '' + sort: 194 + value: false +torch_compile_backend: + desc: '' + sort: 195 + value: null +torch_compile_mode: + desc: '' + sort: 196 + value: null +torch_dtype: + desc: '' + sort: 24 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 91 + value: null +torchdynamo: + desc: '' + sort: 191 + value: null +torchscript: + desc: '' + sort: 23 + value: false +tp_size: + desc: '' + sort: 154 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 136 + value: false +tpu_num_cores: + desc: '' + sort: 135 + value: null +train_dataloader_shuffle: + desc: '' + sort: 214 + value: true +train_type: + desc: '' + sort: 229 + value: full +transformers_version: + desc: '' + sort: 75 + value: 4.51.3 +typical_p: + desc: '' + sort: 45 + value: 1.0 +use_bfloat16: + desc: '' + sort: 25 + value: false +use_cache: + desc: '' + sort: 13 + value: false +use_cpu: + desc: '' + sort: 120 + value: false +use_ipex: + desc: '' + sort: 125 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 171 + value: false +use_liger_kernel: + desc: '' + sort: 203 + value: false +use_logits_to_keep: + desc: '' + sort: 219 + value: null +use_mps_device: + desc: '' + sort: 121 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 211 + value: true +vit_lr: + desc: '' + sort: 217 + value: null +vocab_size: + desc: '' + sort: 2 + value: 128256 +warmup_ratio: + desc: '' + sort: 102 + value: 0.05 +warmup_steps: + desc: '' + sort: 103 + value: 0 +weight_decay: + desc: '' + sort: 93 + value: 0.0001 diff --git a/swanlog/run-20250629_094047-a3b1799d/files/requirements.txt b/swanlog/run-20250629_094047-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250629_094047-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250629_094047-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250629_094047-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..ad24bc26d565569d9b953cba5723dd53fb993608 --- /dev/null +++ b/swanlog/run-20250629_094047-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 1533419, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 2e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal_v3 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_094047-a3b1799d"}} \ No newline at end of file diff --git a/swanlog/run-20250629_094305-a3b1799d/backup.swanlab b/swanlog/run-20250629_094305-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..9916c39b5f80540732fd72bdfdcd6bb455191c0c --- /dev/null +++ b/swanlog/run-20250629_094305-a3b1799d/backup.swanlab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69ef5ae04024278da899aa09fd26a6abdb43a69a6c81ca1154dcebf9606e9f74 +size 810013 diff --git a/swanlog/run-20250629_094305-a3b1799d/files/config.yaml b/swanlog/run-20250629_094305-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea16e323a0eefcd3abbca574e979e2821bde59b9 --- /dev/null +++ b/swanlog/run-20250629_094305-a3b1799d/files/config.yaml @@ -0,0 +1,990 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 74 + value: true +_name_or_path: + desc: '' + sort: 73 + value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct +acc_steps: + desc: '' + sort: 223 + value: 1 +acc_strategy: + desc: '' + sort: 213 + value: token +accelerator_config: + desc: '' + sort: 156 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 161 + value: false +adam_beta1: + desc: '' + sort: 94 + value: 0.9 +adam_beta2: + desc: '' + sort: 95 + value: 0.95 +adam_epsilon: + desc: '' + sort: 96 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 33 + value: false +aligner_lr: + desc: '' + sort: 216 + value: null +architectures: + desc: '' + sort: 60 + value: + - LlamaForCausalLM +attention_bias: + desc: '' + sort: 16 + value: false +attention_dropout: + desc: '' + sort: 17 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 189 + value: false +average_tokens_across_devices: + desc: '' + sort: 205 + value: false +bad_words_ids: + desc: '' + sort: 50 + value: null +batch_eval_metrics: + desc: '' + sort: 201 + value: false +begin_suppress_tokens: + desc: '' + sort: 59 + value: null +bf16: + desc: '' + sort: 126 + value: true +bf16_full_eval: + desc: '' + sort: 130 + value: false +bos_token_id: + desc: '' + sort: 66 + value: 128000 +channels: + desc: '' + sort: 220 + value: null +check_model: + desc: '' + sort: 212 + value: true +chunk_size_feed_forward: + desc: '' + sort: 29 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 32 + value: null +data_seed: + desc: '' + sort: 123 + value: 42 +dataloader_drop_last: + desc: '' + sort: 138 + value: false +dataloader_num_workers: + desc: '' + sort: 140 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 169 + value: false +dataloader_pin_memory: + desc: '' + sort: 168 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 141 + value: 10 +ddp_backend: + desc: '' + sort: 134 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 167 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 166 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 165 + value: null +ddp_timeout: + desc: '' + sort: 193 + value: 18000000 +debug: + desc: '' + sort: 137 + value: [] +decoder_start_token_id: + desc: '' + sort: 70 + value: null +deepspeed: + desc: '' + sort: 157 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 144 + value: false +diversity_penalty: + desc: '' + sort: 41 + value: 0.0 +do_eval: + desc: '' + sort: 80 + value: true +do_predict: + desc: '' + sort: 81 + value: false +do_sample: + desc: '' + sort: 37 + value: false +do_train: + desc: '' + sort: 79 + value: false +early_stopping: + desc: '' + sort: 38 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 49 + value: 0 +eos_token_id: + desc: '' + sort: 68 + value: + - 128001 + - 128008 + - 128009 +eval_accumulation_steps: + desc: '' + sort: 89 + value: null +eval_datasets: + desc: '' + sort: 225 + value: [] +eval_datasets_args: + desc: '' + sort: 227 + value: null +eval_delay: + desc: '' + sort: 90 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 183 + value: true +eval_generation_config: + desc: '' + sort: 228 + value: null +eval_limit: + desc: '' + sort: 226 + value: null +eval_on_start: + desc: '' + sort: 202 + value: false +eval_steps: + desc: '' + sort: 139 + value: null +eval_strategy: + desc: '' + sort: 82 + value: epoch +eval_use_evalscope: + desc: '' + sort: 224 + value: false +eval_use_gather_object: + desc: '' + sort: 204 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 57 + value: null +finetuning_task: + desc: '' + sort: 61 + value: null +forced_bos_token_id: + desc: '' + sort: 54 + value: null +forced_eos_token_id: + desc: '' + sort: 55 + value: null +fp16: + desc: '' + sort: 127 + value: false +fp16_backend: + desc: '' + sort: 184 + value: auto +fp16_full_eval: + desc: '' + sort: 131 + value: false +fp16_opt_level: + desc: '' + sort: 128 + value: O1 +fsdp: + desc: '' + sort: 151 + value: [] +fsdp_config: + desc: '' + sort: 153 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 152 + value: 0 +fsdp_num: + desc: '' + sort: 222 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 155 + value: null +full_determinism: + desc: '' + sort: 190 + value: false +galore_config: + desc: '' + sort: 231 + value: null +generation_config: + desc: '' + sort: 210 + value: null +generation_max_length: + desc: '' + sort: 208 + value: null +generation_num_beams: + desc: '' + sort: 209 + value: null +gradient_accumulation_steps: + desc: '' + sort: 88 + value: 2 +gradient_checkpointing: + desc: '' + sort: 179 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 180 + value: null +greater_is_better: + desc: '' + sort: 149 + value: false +group_by_length: + desc: '' + sort: 162 + value: false +half_precision_backend: + desc: '' + sort: 129 + value: auto +head_dim: + desc: '' + sort: 19 + value: 128 +hidden_act: + desc: '' + sort: 9 + value: silu +hidden_size: + desc: '' + sort: 4 + value: 4096 +hub_always_push: + desc: '' + sort: 178 + value: false +hub_model_id: + desc: '' + sort: 174 + value: null +hub_private_repo: + desc: '' + sort: 177 + value: null +hub_strategy: + desc: '' + sort: 175 + value: every_save +hub_token: + desc: '' + sort: 176 + value: +id2label: + desc: '' + sort: 62 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 150 + value: false +include_for_metrics: + desc: '' + sort: 182 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 181 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 198 + value: false +include_tokens_per_second: + desc: '' + sort: 197 + value: false +initializer_range: + desc: '' + sort: 10 + value: 0.02 +intermediate_size: + desc: '' + sort: 5 + value: 14336 +is_decoder: + desc: '' + sort: 31 + value: false +is_encoder_decoder: + desc: '' + sort: 30 + value: false +jit_mode_eval: + desc: '' + sort: 124 + value: false +label2id: + desc: '' + sort: 63 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 146 + value: null +label_smoothing_factor: + desc: '' + sort: 158 + value: 0.0 +learning_rate: + desc: '' + sort: 92 + value: 2.0e-06 +length_column_name: + desc: '' + sort: 163 + value: length +length_penalty: + desc: '' + sort: 47 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 147 + value: false +local_rank: + desc: '' + sort: 133 + value: 0 +local_repo_path: + desc: '' + sort: 230 + value: null +log_level: + desc: '' + sort: 104 + value: passive +log_level_replica: + desc: '' + sort: 105 + value: warning +log_on_each_node: + desc: '' + sort: 106 + value: true +logging_dir: + desc: '' + sort: 107 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v1-20250629-094213/runs +logging_first_step: + desc: '' + sort: 109 + value: true +logging_nan_inf_filter: + desc: '' + sort: 111 + value: true +logging_steps: + desc: '' + sort: 110 + value: 1 +logging_strategy: + desc: '' + sort: 108 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 101 + value: null +lr_scheduler_type: + desc: '' + sort: 100 + value: cosine +max_epochs: + desc: '' + sort: 215 + value: null +max_grad_norm: + desc: '' + sort: 97 + value: 1.0 +max_length: + desc: '' + sort: 35 + value: 20 +max_position_embeddings: + desc: '' + sort: 3 + value: 131072 +max_steps: + desc: '' + sort: 99 + value: -1 +metric_for_best_model: + desc: '' + sort: 148 + value: loss +metric_warmup_step: + desc: '' + sort: 221 + value: 0 +min_length: + desc: '' + sort: 36 + value: 0 +mlp_bias: + desc: '' + sort: 18 + value: false +model_num_parameters: + desc: '' + sort: 232 + value: 0 +model_type: + desc: '' + sort: 76 + value: llama +mp_parameters: + desc: '' + sort: 188 + value: '' +neftune_noise_alpha: + desc: '' + sort: 199 + value: null +no_cuda: + desc: '' + sort: 119 + value: false +no_repeat_ngram_size: + desc: '' + sort: 48 + value: 0 +num_attention_heads: + desc: '' + sort: 7 + value: 32 +num_beam_groups: + desc: '' + sort: 40 + value: 1 +num_beams: + desc: '' + sort: 39 + value: 1 +num_hidden_layers: + desc: '' + sort: 6 + value: 32 +num_key_value_heads: + desc: '' + sort: 8 + value: 8 +num_return_sequences: + desc: '' + sort: 51 + value: 1 +num_train_epochs: + desc: '' + sort: 98 + value: 5.0 +optim: + desc: '' + sort: 159 + value: adamw_torch +optim_args: + desc: '' + sort: 160 + value: null +optim_target_modules: + desc: '' + sort: 200 + value: null +optimizer: + desc: '' + sort: 218 + value: null +output_attentions: + desc: '' + sort: 22 + value: false +output_dir: + desc: '' + sort: 77 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v1-20250629-094213 +output_hidden_states: + desc: '' + sort: 21 + value: false +output_scores: + desc: '' + sort: 52 + value: false +overwrite_output_dir: + desc: '' + sort: 78 + value: false +pad_token_id: + desc: '' + sort: 67 + value: 128009 +past_index: + desc: '' + sort: 142 + value: -1 +per_device_eval_batch_size: + desc: '' + sort: 85 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 84 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 87 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 86 + value: null +predict_with_generate: + desc: '' + sort: 207 + value: false +prediction_loss_only: + desc: '' + sort: 83 + value: false +prefix: + desc: '' + sort: 65 + value: null +pretraining_tp: + desc: '' + sort: 12 + value: 1 +problem_type: + desc: '' + sort: 72 + value: null +pruned_heads: + desc: '' + sort: 27 + value: {} +push_to_hub: + desc: '' + sort: 172 + value: false +push_to_hub_model_id: + desc: '' + sort: 185 + value: null +push_to_hub_organization: + desc: '' + sort: 186 + value: null +push_to_hub_token: + desc: '' + sort: 187 + value: +ray_scope: + desc: '' + sort: 192 + value: last +remove_invalid_values: + desc: '' + sort: 56 + value: false +remove_unused_columns: + desc: '' + sort: 145 + value: false +repetition_penalty: + desc: '' + sort: 46 + value: 1.0 +report_to: + desc: '' + sort: 164 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 118 + value: false +resume_from_checkpoint: + desc: '' + sort: 173 + value: null +return_dict: + desc: '' + sort: 20 + value: true +return_dict_in_generate: + desc: '' + sort: 53 + value: false +rms_norm_eps: + desc: '' + sort: 11 + value: 1.0e-05 +rope_scaling: + desc: '' + sort: 15 + value: + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + desc: '' + sort: 14 + value: 500000.0 +run_name: + desc: '' + sort: 143 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v1-20250629-094213 +save_on_each_node: + desc: '' + sort: 116 + value: false +save_only_model: + desc: '' + sort: 117 + value: false +save_safetensors: + desc: '' + sort: 115 + value: true +save_steps: + desc: '' + sort: 113 + value: 500 +save_strategy: + desc: '' + sort: 112 + value: steps +save_total_limit: + desc: '' + sort: 114 + value: 1 +seed: + desc: '' + sort: 122 + value: 42 +sep_token_id: + desc: '' + sort: 69 + value: null +skip_memory_metrics: + desc: '' + sort: 170 + value: true +sortish_sampler: + desc: '' + sort: 206 + value: false +suppress_tokens: + desc: '' + sort: 58 + value: null +task_specific_params: + desc: '' + sort: 71 + value: null +temperature: + desc: '' + sort: 42 + value: 1.0 +tf32: + desc: '' + sort: 132 + value: null +tf_legacy_loss: + desc: '' + sort: 26 + value: false +tie_encoder_decoder: + desc: '' + sort: 34 + value: false +tie_word_embeddings: + desc: '' + sort: 28 + value: false +tokenizer_class: + desc: '' + sort: 64 + value: null +top_k: + desc: '' + sort: 43 + value: 50 +top_p: + desc: '' + sort: 44 + value: 1.0 +torch_compile: + desc: '' + sort: 194 + value: false +torch_compile_backend: + desc: '' + sort: 195 + value: null +torch_compile_mode: + desc: '' + sort: 196 + value: null +torch_dtype: + desc: '' + sort: 24 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 91 + value: null +torchdynamo: + desc: '' + sort: 191 + value: null +torchscript: + desc: '' + sort: 23 + value: false +tp_size: + desc: '' + sort: 154 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 136 + value: false +tpu_num_cores: + desc: '' + sort: 135 + value: null +train_dataloader_shuffle: + desc: '' + sort: 214 + value: true +train_type: + desc: '' + sort: 229 + value: full +transformers_version: + desc: '' + sort: 75 + value: 4.51.3 +typical_p: + desc: '' + sort: 45 + value: 1.0 +use_bfloat16: + desc: '' + sort: 25 + value: false +use_cache: + desc: '' + sort: 13 + value: false +use_cpu: + desc: '' + sort: 120 + value: false +use_ipex: + desc: '' + sort: 125 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 171 + value: false +use_liger_kernel: + desc: '' + sort: 203 + value: false +use_logits_to_keep: + desc: '' + sort: 219 + value: null +use_mps_device: + desc: '' + sort: 121 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 211 + value: true +vit_lr: + desc: '' + sort: 217 + value: null +vocab_size: + desc: '' + sort: 2 + value: 128256 +warmup_ratio: + desc: '' + sort: 102 + value: 0.05 +warmup_steps: + desc: '' + sort: 103 + value: 0 +weight_decay: + desc: '' + sort: 93 + value: 0.0001 diff --git a/swanlog/run-20250629_094305-a3b1799d/files/requirements.txt b/swanlog/run-20250629_094305-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250629_094305-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250629_094305-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250629_094305-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a91a7af8f421c5920aab5fafcda5969b18f34af0 --- /dev/null +++ b/swanlog/run-20250629_094305-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 1699831, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 2e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal_v3 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_094305-a3b1799d"}} \ No newline at end of file diff --git a/swanlog/run-20250629_101310-a3b1799d/backup.swanlab b/swanlog/run-20250629_101310-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..b17f5728d3a17a363f9c8f67db4fe41bfae7ffaa --- /dev/null +++ b/swanlog/run-20250629_101310-a3b1799d/backup.swanlab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66ec9cb760ff27e221828644ef3beabe41bbc9061fbd4255657361012062c2d1 +size 825310 diff --git a/swanlog/run-20250629_101310-a3b1799d/files/config.yaml b/swanlog/run-20250629_101310-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e48198f430a73b2e8320b6520562c83950c8041a --- /dev/null +++ b/swanlog/run-20250629_101310-a3b1799d/files/config.yaml @@ -0,0 +1,990 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 74 + value: true +_name_or_path: + desc: '' + sort: 73 + value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct +acc_steps: + desc: '' + sort: 223 + value: 1 +acc_strategy: + desc: '' + sort: 213 + value: token +accelerator_config: + desc: '' + sort: 156 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 161 + value: false +adam_beta1: + desc: '' + sort: 94 + value: 0.9 +adam_beta2: + desc: '' + sort: 95 + value: 0.95 +adam_epsilon: + desc: '' + sort: 96 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 33 + value: false +aligner_lr: + desc: '' + sort: 216 + value: null +architectures: + desc: '' + sort: 60 + value: + - LlamaForCausalLM +attention_bias: + desc: '' + sort: 16 + value: false +attention_dropout: + desc: '' + sort: 17 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 189 + value: false +average_tokens_across_devices: + desc: '' + sort: 205 + value: false +bad_words_ids: + desc: '' + sort: 50 + value: null +batch_eval_metrics: + desc: '' + sort: 201 + value: false +begin_suppress_tokens: + desc: '' + sort: 59 + value: null +bf16: + desc: '' + sort: 126 + value: true +bf16_full_eval: + desc: '' + sort: 130 + value: false +bos_token_id: + desc: '' + sort: 66 + value: 128000 +channels: + desc: '' + sort: 220 + value: null +check_model: + desc: '' + sort: 212 + value: true +chunk_size_feed_forward: + desc: '' + sort: 29 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 32 + value: null +data_seed: + desc: '' + sort: 123 + value: 42 +dataloader_drop_last: + desc: '' + sort: 138 + value: false +dataloader_num_workers: + desc: '' + sort: 140 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 169 + value: false +dataloader_pin_memory: + desc: '' + sort: 168 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 141 + value: 10 +ddp_backend: + desc: '' + sort: 134 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 167 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 166 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 165 + value: null +ddp_timeout: + desc: '' + sort: 193 + value: 18000000 +debug: + desc: '' + sort: 137 + value: [] +decoder_start_token_id: + desc: '' + sort: 70 + value: null +deepspeed: + desc: '' + sort: 157 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 144 + value: false +diversity_penalty: + desc: '' + sort: 41 + value: 0.0 +do_eval: + desc: '' + sort: 80 + value: true +do_predict: + desc: '' + sort: 81 + value: false +do_sample: + desc: '' + sort: 37 + value: false +do_train: + desc: '' + sort: 79 + value: false +early_stopping: + desc: '' + sort: 38 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 49 + value: 0 +eos_token_id: + desc: '' + sort: 68 + value: + - 128001 + - 128008 + - 128009 +eval_accumulation_steps: + desc: '' + sort: 89 + value: null +eval_datasets: + desc: '' + sort: 225 + value: [] +eval_datasets_args: + desc: '' + sort: 227 + value: null +eval_delay: + desc: '' + sort: 90 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 183 + value: true +eval_generation_config: + desc: '' + sort: 228 + value: null +eval_limit: + desc: '' + sort: 226 + value: null +eval_on_start: + desc: '' + sort: 202 + value: false +eval_steps: + desc: '' + sort: 139 + value: null +eval_strategy: + desc: '' + sort: 82 + value: epoch +eval_use_evalscope: + desc: '' + sort: 224 + value: false +eval_use_gather_object: + desc: '' + sort: 204 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 57 + value: null +finetuning_task: + desc: '' + sort: 61 + value: null +forced_bos_token_id: + desc: '' + sort: 54 + value: null +forced_eos_token_id: + desc: '' + sort: 55 + value: null +fp16: + desc: '' + sort: 127 + value: false +fp16_backend: + desc: '' + sort: 184 + value: auto +fp16_full_eval: + desc: '' + sort: 131 + value: false +fp16_opt_level: + desc: '' + sort: 128 + value: O1 +fsdp: + desc: '' + sort: 151 + value: [] +fsdp_config: + desc: '' + sort: 153 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 152 + value: 0 +fsdp_num: + desc: '' + sort: 222 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 155 + value: null +full_determinism: + desc: '' + sort: 190 + value: false +galore_config: + desc: '' + sort: 231 + value: null +generation_config: + desc: '' + sort: 210 + value: null +generation_max_length: + desc: '' + sort: 208 + value: null +generation_num_beams: + desc: '' + sort: 209 + value: null +gradient_accumulation_steps: + desc: '' + sort: 88 + value: 2 +gradient_checkpointing: + desc: '' + sort: 179 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 180 + value: null +greater_is_better: + desc: '' + sort: 149 + value: false +group_by_length: + desc: '' + sort: 162 + value: false +half_precision_backend: + desc: '' + sort: 129 + value: auto +head_dim: + desc: '' + sort: 19 + value: 128 +hidden_act: + desc: '' + sort: 9 + value: silu +hidden_size: + desc: '' + sort: 4 + value: 4096 +hub_always_push: + desc: '' + sort: 178 + value: false +hub_model_id: + desc: '' + sort: 174 + value: null +hub_private_repo: + desc: '' + sort: 177 + value: null +hub_strategy: + desc: '' + sort: 175 + value: every_save +hub_token: + desc: '' + sort: 176 + value: +id2label: + desc: '' + sort: 62 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 150 + value: false +include_for_metrics: + desc: '' + sort: 182 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 181 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 198 + value: false +include_tokens_per_second: + desc: '' + sort: 197 + value: false +initializer_range: + desc: '' + sort: 10 + value: 0.02 +intermediate_size: + desc: '' + sort: 5 + value: 14336 +is_decoder: + desc: '' + sort: 31 + value: false +is_encoder_decoder: + desc: '' + sort: 30 + value: false +jit_mode_eval: + desc: '' + sort: 124 + value: false +label2id: + desc: '' + sort: 63 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 146 + value: null +label_smoothing_factor: + desc: '' + sort: 158 + value: 0.0 +learning_rate: + desc: '' + sort: 92 + value: 2.0e-06 +length_column_name: + desc: '' + sort: 163 + value: length +length_penalty: + desc: '' + sort: 47 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 147 + value: false +local_rank: + desc: '' + sort: 133 + value: 0 +local_repo_path: + desc: '' + sort: 230 + value: null +log_level: + desc: '' + sort: 104 + value: passive +log_level_replica: + desc: '' + sort: 105 + value: warning +log_on_each_node: + desc: '' + sort: 106 + value: true +logging_dir: + desc: '' + sort: 107 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v0-20250629-101213/runs +logging_first_step: + desc: '' + sort: 109 + value: true +logging_nan_inf_filter: + desc: '' + sort: 111 + value: true +logging_steps: + desc: '' + sort: 110 + value: 1 +logging_strategy: + desc: '' + sort: 108 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 101 + value: null +lr_scheduler_type: + desc: '' + sort: 100 + value: cosine +max_epochs: + desc: '' + sort: 215 + value: null +max_grad_norm: + desc: '' + sort: 97 + value: 1.0 +max_length: + desc: '' + sort: 35 + value: 20 +max_position_embeddings: + desc: '' + sort: 3 + value: 131072 +max_steps: + desc: '' + sort: 99 + value: -1 +metric_for_best_model: + desc: '' + sort: 148 + value: loss +metric_warmup_step: + desc: '' + sort: 221 + value: 0 +min_length: + desc: '' + sort: 36 + value: 0 +mlp_bias: + desc: '' + sort: 18 + value: false +model_num_parameters: + desc: '' + sort: 232 + value: 0 +model_type: + desc: '' + sort: 76 + value: llama +mp_parameters: + desc: '' + sort: 188 + value: '' +neftune_noise_alpha: + desc: '' + sort: 199 + value: null +no_cuda: + desc: '' + sort: 119 + value: false +no_repeat_ngram_size: + desc: '' + sort: 48 + value: 0 +num_attention_heads: + desc: '' + sort: 7 + value: 32 +num_beam_groups: + desc: '' + sort: 40 + value: 1 +num_beams: + desc: '' + sort: 39 + value: 1 +num_hidden_layers: + desc: '' + sort: 6 + value: 32 +num_key_value_heads: + desc: '' + sort: 8 + value: 8 +num_return_sequences: + desc: '' + sort: 51 + value: 1 +num_train_epochs: + desc: '' + sort: 98 + value: 5.0 +optim: + desc: '' + sort: 159 + value: adamw_torch +optim_args: + desc: '' + sort: 160 + value: null +optim_target_modules: + desc: '' + sort: 200 + value: null +optimizer: + desc: '' + sort: 218 + value: null +output_attentions: + desc: '' + sort: 22 + value: false +output_dir: + desc: '' + sort: 77 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v0-20250629-101213 +output_hidden_states: + desc: '' + sort: 21 + value: false +output_scores: + desc: '' + sort: 52 + value: false +overwrite_output_dir: + desc: '' + sort: 78 + value: false +pad_token_id: + desc: '' + sort: 67 + value: 128009 +past_index: + desc: '' + sort: 142 + value: -1 +per_device_eval_batch_size: + desc: '' + sort: 85 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 84 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 87 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 86 + value: null +predict_with_generate: + desc: '' + sort: 207 + value: false +prediction_loss_only: + desc: '' + sort: 83 + value: false +prefix: + desc: '' + sort: 65 + value: null +pretraining_tp: + desc: '' + sort: 12 + value: 1 +problem_type: + desc: '' + sort: 72 + value: null +pruned_heads: + desc: '' + sort: 27 + value: {} +push_to_hub: + desc: '' + sort: 172 + value: false +push_to_hub_model_id: + desc: '' + sort: 185 + value: null +push_to_hub_organization: + desc: '' + sort: 186 + value: null +push_to_hub_token: + desc: '' + sort: 187 + value: +ray_scope: + desc: '' + sort: 192 + value: last +remove_invalid_values: + desc: '' + sort: 56 + value: false +remove_unused_columns: + desc: '' + sort: 145 + value: false +repetition_penalty: + desc: '' + sort: 46 + value: 1.0 +report_to: + desc: '' + sort: 164 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 118 + value: false +resume_from_checkpoint: + desc: '' + sort: 173 + value: null +return_dict: + desc: '' + sort: 20 + value: true +return_dict_in_generate: + desc: '' + sort: 53 + value: false +rms_norm_eps: + desc: '' + sort: 11 + value: 1.0e-05 +rope_scaling: + desc: '' + sort: 15 + value: + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + desc: '' + sort: 14 + value: 500000.0 +run_name: + desc: '' + sort: 143 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v0-20250629-101213 +save_on_each_node: + desc: '' + sort: 116 + value: false +save_only_model: + desc: '' + sort: 117 + value: false +save_safetensors: + desc: '' + sort: 115 + value: true +save_steps: + desc: '' + sort: 113 + value: 500 +save_strategy: + desc: '' + sort: 112 + value: steps +save_total_limit: + desc: '' + sort: 114 + value: 1 +seed: + desc: '' + sort: 122 + value: 42 +sep_token_id: + desc: '' + sort: 69 + value: null +skip_memory_metrics: + desc: '' + sort: 170 + value: true +sortish_sampler: + desc: '' + sort: 206 + value: false +suppress_tokens: + desc: '' + sort: 58 + value: null +task_specific_params: + desc: '' + sort: 71 + value: null +temperature: + desc: '' + sort: 42 + value: 1.0 +tf32: + desc: '' + sort: 132 + value: null +tf_legacy_loss: + desc: '' + sort: 26 + value: false +tie_encoder_decoder: + desc: '' + sort: 34 + value: false +tie_word_embeddings: + desc: '' + sort: 28 + value: false +tokenizer_class: + desc: '' + sort: 64 + value: null +top_k: + desc: '' + sort: 43 + value: 50 +top_p: + desc: '' + sort: 44 + value: 1.0 +torch_compile: + desc: '' + sort: 194 + value: false +torch_compile_backend: + desc: '' + sort: 195 + value: null +torch_compile_mode: + desc: '' + sort: 196 + value: null +torch_dtype: + desc: '' + sort: 24 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 91 + value: null +torchdynamo: + desc: '' + sort: 191 + value: null +torchscript: + desc: '' + sort: 23 + value: false +tp_size: + desc: '' + sort: 154 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 136 + value: false +tpu_num_cores: + desc: '' + sort: 135 + value: null +train_dataloader_shuffle: + desc: '' + sort: 214 + value: true +train_type: + desc: '' + sort: 229 + value: full +transformers_version: + desc: '' + sort: 75 + value: 4.51.3 +typical_p: + desc: '' + sort: 45 + value: 1.0 +use_bfloat16: + desc: '' + sort: 25 + value: false +use_cache: + desc: '' + sort: 13 + value: false +use_cpu: + desc: '' + sort: 120 + value: false +use_ipex: + desc: '' + sort: 125 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 171 + value: false +use_liger_kernel: + desc: '' + sort: 203 + value: false +use_logits_to_keep: + desc: '' + sort: 219 + value: null +use_mps_device: + desc: '' + sort: 121 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 211 + value: true +vit_lr: + desc: '' + sort: 217 + value: null +vocab_size: + desc: '' + sort: 2 + value: 128256 +warmup_ratio: + desc: '' + sort: 102 + value: 0.05 +warmup_steps: + desc: '' + sort: 103 + value: 0 +weight_decay: + desc: '' + sort: 93 + value: 0.0001 diff --git a/swanlog/run-20250629_101310-a3b1799d/files/requirements.txt b/swanlog/run-20250629_101310-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250629_101310-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250629_101310-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250629_101310-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..a1cee0fb90d668d6bde1bb96747abef8e2fe5593 --- /dev/null +++ b/swanlog/run-20250629_101310-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 3972595, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 2e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal_v3 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_101310-a3b1799d"}} \ No newline at end of file diff --git a/swanlog/run-20250629_111950-a3b1799d/backup.swanlab b/swanlog/run-20250629_111950-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..8645a253e0779249d5e049ce9ffe47b06f2b6a2b --- /dev/null +++ b/swanlog/run-20250629_111950-a3b1799d/backup.swanlab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e515379d1491373474da1d875b6e6f4440a9c74ec435826e691b32a2b80ac67 +size 825904 diff --git a/swanlog/run-20250629_111950-a3b1799d/files/config.yaml b/swanlog/run-20250629_111950-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0584ce93ea3d12387943d338664894dcac29b0fa --- /dev/null +++ b/swanlog/run-20250629_111950-a3b1799d/files/config.yaml @@ -0,0 +1,990 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 74 + value: true +_name_or_path: + desc: '' + sort: 73 + value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct +acc_steps: + desc: '' + sort: 223 + value: 1 +acc_strategy: + desc: '' + sort: 213 + value: token +accelerator_config: + desc: '' + sort: 156 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 161 + value: false +adam_beta1: + desc: '' + sort: 94 + value: 0.9 +adam_beta2: + desc: '' + sort: 95 + value: 0.95 +adam_epsilon: + desc: '' + sort: 96 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 33 + value: false +aligner_lr: + desc: '' + sort: 216 + value: null +architectures: + desc: '' + sort: 60 + value: + - LlamaForCausalLM +attention_bias: + desc: '' + sort: 16 + value: false +attention_dropout: + desc: '' + sort: 17 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 189 + value: false +average_tokens_across_devices: + desc: '' + sort: 205 + value: false +bad_words_ids: + desc: '' + sort: 50 + value: null +batch_eval_metrics: + desc: '' + sort: 201 + value: false +begin_suppress_tokens: + desc: '' + sort: 59 + value: null +bf16: + desc: '' + sort: 126 + value: true +bf16_full_eval: + desc: '' + sort: 130 + value: false +bos_token_id: + desc: '' + sort: 66 + value: 128000 +channels: + desc: '' + sort: 220 + value: null +check_model: + desc: '' + sort: 212 + value: true +chunk_size_feed_forward: + desc: '' + sort: 29 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 32 + value: null +data_seed: + desc: '' + sort: 123 + value: 42 +dataloader_drop_last: + desc: '' + sort: 138 + value: false +dataloader_num_workers: + desc: '' + sort: 140 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 169 + value: false +dataloader_pin_memory: + desc: '' + sort: 168 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 141 + value: 10 +ddp_backend: + desc: '' + sort: 134 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 167 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 166 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 165 + value: null +ddp_timeout: + desc: '' + sort: 193 + value: 18000000 +debug: + desc: '' + sort: 137 + value: [] +decoder_start_token_id: + desc: '' + sort: 70 + value: null +deepspeed: + desc: '' + sort: 157 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 144 + value: false +diversity_penalty: + desc: '' + sort: 41 + value: 0.0 +do_eval: + desc: '' + sort: 80 + value: true +do_predict: + desc: '' + sort: 81 + value: false +do_sample: + desc: '' + sort: 37 + value: false +do_train: + desc: '' + sort: 79 + value: false +early_stopping: + desc: '' + sort: 38 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 49 + value: 0 +eos_token_id: + desc: '' + sort: 68 + value: + - 128001 + - 128008 + - 128009 +eval_accumulation_steps: + desc: '' + sort: 89 + value: null +eval_datasets: + desc: '' + sort: 225 + value: [] +eval_datasets_args: + desc: '' + sort: 227 + value: null +eval_delay: + desc: '' + sort: 90 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 183 + value: true +eval_generation_config: + desc: '' + sort: 228 + value: null +eval_limit: + desc: '' + sort: 226 + value: null +eval_on_start: + desc: '' + sort: 202 + value: false +eval_steps: + desc: '' + sort: 139 + value: null +eval_strategy: + desc: '' + sort: 82 + value: epoch +eval_use_evalscope: + desc: '' + sort: 224 + value: false +eval_use_gather_object: + desc: '' + sort: 204 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 57 + value: null +finetuning_task: + desc: '' + sort: 61 + value: null +forced_bos_token_id: + desc: '' + sort: 54 + value: null +forced_eos_token_id: + desc: '' + sort: 55 + value: null +fp16: + desc: '' + sort: 127 + value: false +fp16_backend: + desc: '' + sort: 184 + value: auto +fp16_full_eval: + desc: '' + sort: 131 + value: false +fp16_opt_level: + desc: '' + sort: 128 + value: O1 +fsdp: + desc: '' + sort: 151 + value: [] +fsdp_config: + desc: '' + sort: 153 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 152 + value: 0 +fsdp_num: + desc: '' + sort: 222 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 155 + value: null +full_determinism: + desc: '' + sort: 190 + value: false +galore_config: + desc: '' + sort: 231 + value: null +generation_config: + desc: '' + sort: 210 + value: null +generation_max_length: + desc: '' + sort: 208 + value: null +generation_num_beams: + desc: '' + sort: 209 + value: null +gradient_accumulation_steps: + desc: '' + sort: 88 + value: 2 +gradient_checkpointing: + desc: '' + sort: 179 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 180 + value: null +greater_is_better: + desc: '' + sort: 149 + value: false +group_by_length: + desc: '' + sort: 162 + value: false +half_precision_backend: + desc: '' + sort: 129 + value: auto +head_dim: + desc: '' + sort: 19 + value: 128 +hidden_act: + desc: '' + sort: 9 + value: silu +hidden_size: + desc: '' + sort: 4 + value: 4096 +hub_always_push: + desc: '' + sort: 178 + value: false +hub_model_id: + desc: '' + sort: 174 + value: null +hub_private_repo: + desc: '' + sort: 177 + value: null +hub_strategy: + desc: '' + sort: 175 + value: every_save +hub_token: + desc: '' + sort: 176 + value: +id2label: + desc: '' + sort: 62 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 150 + value: false +include_for_metrics: + desc: '' + sort: 182 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 181 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 198 + value: false +include_tokens_per_second: + desc: '' + sort: 197 + value: false +initializer_range: + desc: '' + sort: 10 + value: 0.02 +intermediate_size: + desc: '' + sort: 5 + value: 14336 +is_decoder: + desc: '' + sort: 31 + value: false +is_encoder_decoder: + desc: '' + sort: 30 + value: false +jit_mode_eval: + desc: '' + sort: 124 + value: false +label2id: + desc: '' + sort: 63 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 146 + value: null +label_smoothing_factor: + desc: '' + sort: 158 + value: 0.0 +learning_rate: + desc: '' + sort: 92 + value: 1.0e-06 +length_column_name: + desc: '' + sort: 163 + value: length +length_penalty: + desc: '' + sort: 47 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 147 + value: false +local_rank: + desc: '' + sort: 133 + value: 0 +local_repo_path: + desc: '' + sort: 230 + value: null +log_level: + desc: '' + sort: 104 + value: passive +log_level_replica: + desc: '' + sort: 105 + value: warning +log_on_each_node: + desc: '' + sort: 106 + value: true +logging_dir: + desc: '' + sort: 107 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v4/v0-20250629-111856/runs +logging_first_step: + desc: '' + sort: 109 + value: true +logging_nan_inf_filter: + desc: '' + sort: 111 + value: true +logging_steps: + desc: '' + sort: 110 + value: 1 +logging_strategy: + desc: '' + sort: 108 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 101 + value: null +lr_scheduler_type: + desc: '' + sort: 100 + value: cosine +max_epochs: + desc: '' + sort: 215 + value: null +max_grad_norm: + desc: '' + sort: 97 + value: 1.0 +max_length: + desc: '' + sort: 35 + value: 20 +max_position_embeddings: + desc: '' + sort: 3 + value: 131072 +max_steps: + desc: '' + sort: 99 + value: -1 +metric_for_best_model: + desc: '' + sort: 148 + value: loss +metric_warmup_step: + desc: '' + sort: 221 + value: 0 +min_length: + desc: '' + sort: 36 + value: 0 +mlp_bias: + desc: '' + sort: 18 + value: false +model_num_parameters: + desc: '' + sort: 232 + value: 0 +model_type: + desc: '' + sort: 76 + value: llama +mp_parameters: + desc: '' + sort: 188 + value: '' +neftune_noise_alpha: + desc: '' + sort: 199 + value: null +no_cuda: + desc: '' + sort: 119 + value: false +no_repeat_ngram_size: + desc: '' + sort: 48 + value: 0 +num_attention_heads: + desc: '' + sort: 7 + value: 32 +num_beam_groups: + desc: '' + sort: 40 + value: 1 +num_beams: + desc: '' + sort: 39 + value: 1 +num_hidden_layers: + desc: '' + sort: 6 + value: 32 +num_key_value_heads: + desc: '' + sort: 8 + value: 8 +num_return_sequences: + desc: '' + sort: 51 + value: 1 +num_train_epochs: + desc: '' + sort: 98 + value: 5.0 +optim: + desc: '' + sort: 159 + value: adamw_torch +optim_args: + desc: '' + sort: 160 + value: null +optim_target_modules: + desc: '' + sort: 200 + value: null +optimizer: + desc: '' + sort: 218 + value: null +output_attentions: + desc: '' + sort: 22 + value: false +output_dir: + desc: '' + sort: 77 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v4/v0-20250629-111856 +output_hidden_states: + desc: '' + sort: 21 + value: false +output_scores: + desc: '' + sort: 52 + value: false +overwrite_output_dir: + desc: '' + sort: 78 + value: false +pad_token_id: + desc: '' + sort: 67 + value: 128009 +past_index: + desc: '' + sort: 142 + value: -1 +per_device_eval_batch_size: + desc: '' + sort: 85 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 84 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 87 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 86 + value: null +predict_with_generate: + desc: '' + sort: 207 + value: false +prediction_loss_only: + desc: '' + sort: 83 + value: false +prefix: + desc: '' + sort: 65 + value: null +pretraining_tp: + desc: '' + sort: 12 + value: 1 +problem_type: + desc: '' + sort: 72 + value: null +pruned_heads: + desc: '' + sort: 27 + value: {} +push_to_hub: + desc: '' + sort: 172 + value: false +push_to_hub_model_id: + desc: '' + sort: 185 + value: null +push_to_hub_organization: + desc: '' + sort: 186 + value: null +push_to_hub_token: + desc: '' + sort: 187 + value: +ray_scope: + desc: '' + sort: 192 + value: last +remove_invalid_values: + desc: '' + sort: 56 + value: false +remove_unused_columns: + desc: '' + sort: 145 + value: false +repetition_penalty: + desc: '' + sort: 46 + value: 1.0 +report_to: + desc: '' + sort: 164 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 118 + value: false +resume_from_checkpoint: + desc: '' + sort: 173 + value: null +return_dict: + desc: '' + sort: 20 + value: true +return_dict_in_generate: + desc: '' + sort: 53 + value: false +rms_norm_eps: + desc: '' + sort: 11 + value: 1.0e-05 +rope_scaling: + desc: '' + sort: 15 + value: + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + desc: '' + sort: 14 + value: 500000.0 +run_name: + desc: '' + sort: 143 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v4/v0-20250629-111856 +save_on_each_node: + desc: '' + sort: 116 + value: false +save_only_model: + desc: '' + sort: 117 + value: false +save_safetensors: + desc: '' + sort: 115 + value: true +save_steps: + desc: '' + sort: 113 + value: 500 +save_strategy: + desc: '' + sort: 112 + value: steps +save_total_limit: + desc: '' + sort: 114 + value: 1 +seed: + desc: '' + sort: 122 + value: 42 +sep_token_id: + desc: '' + sort: 69 + value: null +skip_memory_metrics: + desc: '' + sort: 170 + value: true +sortish_sampler: + desc: '' + sort: 206 + value: false +suppress_tokens: + desc: '' + sort: 58 + value: null +task_specific_params: + desc: '' + sort: 71 + value: null +temperature: + desc: '' + sort: 42 + value: 1.0 +tf32: + desc: '' + sort: 132 + value: null +tf_legacy_loss: + desc: '' + sort: 26 + value: false +tie_encoder_decoder: + desc: '' + sort: 34 + value: false +tie_word_embeddings: + desc: '' + sort: 28 + value: false +tokenizer_class: + desc: '' + sort: 64 + value: null +top_k: + desc: '' + sort: 43 + value: 50 +top_p: + desc: '' + sort: 44 + value: 1.0 +torch_compile: + desc: '' + sort: 194 + value: false +torch_compile_backend: + desc: '' + sort: 195 + value: null +torch_compile_mode: + desc: '' + sort: 196 + value: null +torch_dtype: + desc: '' + sort: 24 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 91 + value: null +torchdynamo: + desc: '' + sort: 191 + value: null +torchscript: + desc: '' + sort: 23 + value: false +tp_size: + desc: '' + sort: 154 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 136 + value: false +tpu_num_cores: + desc: '' + sort: 135 + value: null +train_dataloader_shuffle: + desc: '' + sort: 214 + value: true +train_type: + desc: '' + sort: 229 + value: full +transformers_version: + desc: '' + sort: 75 + value: 4.51.3 +typical_p: + desc: '' + sort: 45 + value: 1.0 +use_bfloat16: + desc: '' + sort: 25 + value: false +use_cache: + desc: '' + sort: 13 + value: false +use_cpu: + desc: '' + sort: 120 + value: false +use_ipex: + desc: '' + sort: 125 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 171 + value: false +use_liger_kernel: + desc: '' + sort: 203 + value: false +use_logits_to_keep: + desc: '' + sort: 219 + value: null +use_mps_device: + desc: '' + sort: 121 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 211 + value: true +vit_lr: + desc: '' + sort: 217 + value: null +vocab_size: + desc: '' + sort: 2 + value: 128256 +warmup_ratio: + desc: '' + sort: 102 + value: 0.05 +warmup_steps: + desc: '' + sort: 103 + value: 0 +weight_decay: + desc: '' + sort: 93 + value: 0.0001 diff --git a/swanlog/run-20250629_111950-a3b1799d/files/requirements.txt b/swanlog/run-20250629_111950-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250629_111950-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250629_111950-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250629_111950-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5cbaa1d9d4b3dbd6b80ef6475760ebe05dc8746e --- /dev/null +++ b/swanlog/run-20250629_111950-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 3726930, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal_v4 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_111950-a3b1799d"}} \ No newline at end of file diff --git a/swanlog/run-20250629_120036-a3b1799d/backup.swanlab b/swanlog/run-20250629_120036-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..e7573cd938eb911143a7551936929a58c47f9bb7 --- /dev/null +++ b/swanlog/run-20250629_120036-a3b1799d/backup.swanlab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1960e714980575f5e339106625733af3169058f29830a29253dd005ceb3ad64 +size 895306 diff --git a/swanlog/run-20250629_120036-a3b1799d/files/config.yaml b/swanlog/run-20250629_120036-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74965e8d15942e6af5fcde2a275d3d34831f8519 --- /dev/null +++ b/swanlog/run-20250629_120036-a3b1799d/files/config.yaml @@ -0,0 +1,1004 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 75 + value: true +_name_or_path: + desc: '' + sort: 74 + value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct +acc_steps: + desc: '' + sort: 224 + value: 1 +acc_strategy: + desc: '' + sort: 214 + value: token +accelerator_config: + desc: '' + sort: 157 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 162 + value: false +adam_beta1: + desc: '' + sort: 95 + value: 0.9 +adam_beta2: + desc: '' + sort: 96 + value: 0.95 +adam_epsilon: + desc: '' + sort: 97 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 34 + value: false +aligner_lr: + desc: '' + sort: 217 + value: null +architectures: + desc: '' + sort: 61 + value: + - LlamaForCausalLM +attention_bias: + desc: '' + sort: 17 + value: false +attention_dropout: + desc: '' + sort: 18 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 190 + value: false +average_tokens_across_devices: + desc: '' + sort: 206 + value: false +bad_words_ids: + desc: '' + sort: 51 + value: null +batch_eval_metrics: + desc: '' + sort: 202 + value: false +begin_suppress_tokens: + desc: '' + sort: 60 + value: null +bf16: + desc: '' + sort: 127 + value: true +bf16_full_eval: + desc: '' + sort: 131 + value: false +bos_token_id: + desc: '' + sort: 67 + value: 128000 +channels: + desc: '' + sort: 221 + value: null +check_model: + desc: '' + sort: 213 + value: true +chunk_size_feed_forward: + desc: '' + sort: 30 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 33 + value: null +data_seed: + desc: '' + sort: 124 + value: 42 +dataloader_drop_last: + desc: '' + sort: 139 + value: false +dataloader_num_workers: + desc: '' + sort: 141 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 170 + value: false +dataloader_pin_memory: + desc: '' + sort: 169 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 142 + value: 10 +ddp_backend: + desc: '' + sort: 135 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 168 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 167 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 166 + value: null +ddp_timeout: + desc: '' + sort: 194 + value: 18000000 +debug: + desc: '' + sort: 138 + value: [] +decoder_start_token_id: + desc: '' + sort: 71 + value: null +deepspeed: + desc: '' + sort: 158 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 145 + value: false +diversity_penalty: + desc: '' + sort: 42 + value: 0.0 +do_eval: + desc: '' + sort: 81 + value: true +do_predict: + desc: '' + sort: 82 + value: false +do_sample: + desc: '' + sort: 38 + value: false +do_train: + desc: '' + sort: 80 + value: false +early_stopping: + desc: '' + sort: 39 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 50 + value: 0 +eos_token_id: + desc: '' + sort: 69 + value: + - 128001 + - 128008 + - 128009 +eval_accumulation_steps: + desc: '' + sort: 90 + value: null +eval_datasets: + desc: '' + sort: 226 + value: [] +eval_datasets_args: + desc: '' + sort: 228 + value: null +eval_delay: + desc: '' + sort: 91 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 184 + value: true +eval_generation_config: + desc: '' + sort: 229 + value: null +eval_limit: + desc: '' + sort: 227 + value: null +eval_on_start: + desc: '' + sort: 203 + value: false +eval_steps: + desc: '' + sort: 140 + value: null +eval_strategy: + desc: '' + sort: 83 + value: epoch +eval_use_evalscope: + desc: '' + sort: 225 + value: false +eval_use_gather_object: + desc: '' + sort: 205 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 58 + value: null +finetuning_task: + desc: '' + sort: 62 + value: null +forced_bos_token_id: + desc: '' + sort: 55 + value: null +forced_eos_token_id: + desc: '' + sort: 56 + value: null +fp16: + desc: '' + sort: 128 + value: false +fp16_backend: + desc: '' + sort: 185 + value: auto +fp16_full_eval: + desc: '' + sort: 132 + value: false +fp16_opt_level: + desc: '' + sort: 129 + value: O1 +fsdp: + desc: '' + sort: 152 + value: [] +fsdp_config: + desc: '' + sort: 154 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 153 + value: 0 +fsdp_num: + desc: '' + sort: 223 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 156 + value: null +full_determinism: + desc: '' + sort: 191 + value: false +galore_config: + desc: '' + sort: 232 + value: null +generation_config: + desc: '' + sort: 211 + value: null +generation_max_length: + desc: '' + sort: 209 + value: null +generation_num_beams: + desc: '' + sort: 210 + value: null +gradient_accumulation_steps: + desc: '' + sort: 89 + value: 2 +gradient_checkpointing: + desc: '' + sort: 180 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 181 + value: null +greater_is_better: + desc: '' + sort: 150 + value: false +group_by_length: + desc: '' + sort: 163 + value: false +half_precision_backend: + desc: '' + sort: 130 + value: auto +head_dim: + desc: '' + sort: 20 + value: 128 +hidden_act: + desc: '' + sort: 10 + value: silu +hidden_size: + desc: '' + sort: 5 + value: 4096 +hub_always_push: + desc: '' + sort: 179 + value: false +hub_model_id: + desc: '' + sort: 175 + value: null +hub_private_repo: + desc: '' + sort: 178 + value: null +hub_strategy: + desc: '' + sort: 176 + value: every_save +hub_token: + desc: '' + sort: 177 + value: +id2label: + desc: '' + sort: 63 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 151 + value: false +include_for_metrics: + desc: '' + sort: 183 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 182 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 199 + value: false +include_tokens_per_second: + desc: '' + sort: 198 + value: false +initializer_range: + desc: '' + sort: 11 + value: 0.02 +intermediate_size: + desc: '' + sort: 6 + value: 14336 +is_decoder: + desc: '' + sort: 32 + value: false +is_encoder_decoder: + desc: '' + sort: 31 + value: false +jit_mode_eval: + desc: '' + sort: 125 + value: false +label2id: + desc: '' + sort: 64 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 147 + value: null +label_smoothing_factor: + desc: '' + sort: 159 + value: 0.0 +learning_rate: + desc: '' + sort: 93 + value: 1.0e-06 +length_column_name: + desc: '' + sort: 164 + value: length +length_penalty: + desc: '' + sort: 48 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 148 + value: false +local_rank: + desc: '' + sort: 134 + value: 0 +local_repo_path: + desc: '' + sort: 231 + value: null +log_level: + desc: '' + sort: 105 + value: passive +log_level_replica: + desc: '' + sort: 106 + value: warning +log_on_each_node: + desc: '' + sort: 107 + value: true +logging_dir: + desc: '' + sort: 108 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_lora_v4/v0-20250629-115943/runs +logging_first_step: + desc: '' + sort: 110 + value: true +logging_nan_inf_filter: + desc: '' + sort: 112 + value: true +logging_steps: + desc: '' + sort: 111 + value: 1 +logging_strategy: + desc: '' + sort: 109 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 102 + value: null +lr_scheduler_type: + desc: '' + sort: 101 + value: cosine +max_epochs: + desc: '' + sort: 216 + value: null +max_grad_norm: + desc: '' + sort: 98 + value: 1.0 +max_length: + desc: '' + sort: 36 + value: 20 +max_position_embeddings: + desc: '' + sort: 4 + value: 131072 +max_steps: + desc: '' + sort: 100 + value: -1 +metric_for_best_model: + desc: '' + sort: 149 + value: loss +metric_warmup_step: + desc: '' + sort: 222 + value: 0 +min_length: + desc: '' + sort: 37 + value: 0 +mlp_bias: + desc: '' + sort: 19 + value: false +model_num_parameters: + desc: '' + sort: 233 + value: 0 +model_type: + desc: '' + sort: 77 + value: llama +mp_parameters: + desc: '' + sort: 189 + value: '' +neftune_noise_alpha: + desc: '' + sort: 200 + value: null +no_cuda: + desc: '' + sort: 120 + value: false +no_repeat_ngram_size: + desc: '' + sort: 49 + value: 0 +num_attention_heads: + desc: '' + sort: 8 + value: 32 +num_beam_groups: + desc: '' + sort: 41 + value: 1 +num_beams: + desc: '' + sort: 40 + value: 1 +num_hidden_layers: + desc: '' + sort: 7 + value: 32 +num_key_value_heads: + desc: '' + sort: 9 + value: 8 +num_return_sequences: + desc: '' + sort: 52 + value: 1 +num_train_epochs: + desc: '' + sort: 99 + value: 5.0 +optim: + desc: '' + sort: 160 + value: adamw_torch +optim_args: + desc: '' + sort: 161 + value: null +optim_target_modules: + desc: '' + sort: 201 + value: null +optimizer: + desc: '' + sort: 219 + value: null +output_attentions: + desc: '' + sort: 23 + value: false +output_dir: + desc: '' + sort: 78 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_lora_v4/v0-20250629-115943 +output_hidden_states: + desc: '' + sort: 22 + value: false +output_scores: + desc: '' + sort: 53 + value: false +overwrite_output_dir: + desc: '' + sort: 79 + value: false +pad_token_id: + desc: '' + sort: 68 + value: 128009 +past_index: + desc: '' + sort: 143 + value: -1 +peft_config: + desc: '' + sort: 2 + value: + default: 'LoraConfig(task_type=''CAUSAL_LM'', peft_type=, + auto_mapping=None, base_model_name_or_path=''/mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct'', + revision=None, inference_mode=False, r=8, target_modules={''up_proj'', ''o_proj'', + ''v_proj'', ''q_proj'', ''gate_proj'', ''k_proj'', ''down_proj''}, exclude_modules=None, + lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias=''none'', use_rslora=False, + modules_to_save=[], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, + rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core=''megatron.core'', + trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, + use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), + lora_bias=False, lora_dtype=None, lorap_lr_ratio=None, lorap_emb_lr=1e-06)' +per_device_eval_batch_size: + desc: '' + sort: 86 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 85 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 88 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 87 + value: null +predict_with_generate: + desc: '' + sort: 208 + value: false +prediction_loss_only: + desc: '' + sort: 84 + value: false +prefix: + desc: '' + sort: 66 + value: null +pretraining_tp: + desc: '' + sort: 13 + value: 1 +problem_type: + desc: '' + sort: 73 + value: null +pruned_heads: + desc: '' + sort: 28 + value: {} +push_to_hub: + desc: '' + sort: 173 + value: false +push_to_hub_model_id: + desc: '' + sort: 186 + value: null +push_to_hub_organization: + desc: '' + sort: 187 + value: null +push_to_hub_token: + desc: '' + sort: 188 + value: +ray_scope: + desc: '' + sort: 193 + value: last +remove_invalid_values: + desc: '' + sort: 57 + value: false +remove_unused_columns: + desc: '' + sort: 146 + value: false +repetition_penalty: + desc: '' + sort: 47 + value: 1.0 +report_to: + desc: '' + sort: 165 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 119 + value: false +resume_from_checkpoint: + desc: '' + sort: 174 + value: null +return_dict: + desc: '' + sort: 21 + value: true +return_dict_in_generate: + desc: '' + sort: 54 + value: false +rms_norm_eps: + desc: '' + sort: 12 + value: 1.0e-05 +rope_scaling: + desc: '' + sort: 16 + value: + factor: 8.0 + high_freq_factor: 4.0 + low_freq_factor: 1.0 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + desc: '' + sort: 15 + value: 500000.0 +run_name: + desc: '' + sort: 144 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_lora_v4/v0-20250629-115943 +save_on_each_node: + desc: '' + sort: 117 + value: false +save_only_model: + desc: '' + sort: 118 + value: false +save_safetensors: + desc: '' + sort: 116 + value: true +save_steps: + desc: '' + sort: 114 + value: 500 +save_strategy: + desc: '' + sort: 113 + value: steps +save_total_limit: + desc: '' + sort: 115 + value: 1 +seed: + desc: '' + sort: 123 + value: 42 +sep_token_id: + desc: '' + sort: 70 + value: null +skip_memory_metrics: + desc: '' + sort: 171 + value: true +sortish_sampler: + desc: '' + sort: 207 + value: false +suppress_tokens: + desc: '' + sort: 59 + value: null +task_specific_params: + desc: '' + sort: 72 + value: null +temperature: + desc: '' + sort: 43 + value: 1.0 +tf32: + desc: '' + sort: 133 + value: null +tf_legacy_loss: + desc: '' + sort: 27 + value: false +tie_encoder_decoder: + desc: '' + sort: 35 + value: false +tie_word_embeddings: + desc: '' + sort: 29 + value: false +tokenizer_class: + desc: '' + sort: 65 + value: null +top_k: + desc: '' + sort: 44 + value: 50 +top_p: + desc: '' + sort: 45 + value: 1.0 +torch_compile: + desc: '' + sort: 195 + value: false +torch_compile_backend: + desc: '' + sort: 196 + value: null +torch_compile_mode: + desc: '' + sort: 197 + value: null +torch_dtype: + desc: '' + sort: 25 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 92 + value: null +torchdynamo: + desc: '' + sort: 192 + value: null +torchscript: + desc: '' + sort: 24 + value: false +tp_size: + desc: '' + sort: 155 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 137 + value: false +tpu_num_cores: + desc: '' + sort: 136 + value: null +train_dataloader_shuffle: + desc: '' + sort: 215 + value: true +train_type: + desc: '' + sort: 230 + value: lora +transformers_version: + desc: '' + sort: 76 + value: 4.51.3 +typical_p: + desc: '' + sort: 46 + value: 1.0 +use_bfloat16: + desc: '' + sort: 26 + value: false +use_cache: + desc: '' + sort: 14 + value: false +use_cpu: + desc: '' + sort: 121 + value: false +use_ipex: + desc: '' + sort: 126 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 172 + value: false +use_liger_kernel: + desc: '' + sort: 204 + value: false +use_logits_to_keep: + desc: '' + sort: 220 + value: null +use_mps_device: + desc: '' + sort: 122 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 212 + value: true +vit_lr: + desc: '' + sort: 218 + value: null +vocab_size: + desc: '' + sort: 3 + value: 128256 +warmup_ratio: + desc: '' + sort: 103 + value: 0.05 +warmup_steps: + desc: '' + sort: 104 + value: 0 +weight_decay: + desc: '' + sort: 94 + value: 0.0001 diff --git a/swanlog/run-20250629_120036-a3b1799d/files/requirements.txt b/swanlog/run-20250629_120036-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250629_120036-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250629_120036-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250629_120036-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..06a56127b06a31b18419cd1d6eaa6c4caf321499 --- /dev/null +++ b/swanlog/run-20250629_120036-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 430023, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type lora --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal_lora_v4 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_120036-a3b1799d"}} \ No newline at end of file diff --git a/swanlog/run-20250629_184555-a3b1799d/backup.swanlab b/swanlog/run-20250629_184555-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..fba73fae6da7e668227472f2335ab747801f683a --- /dev/null +++ b/swanlog/run-20250629_184555-a3b1799d/backup.swanlab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6366fd34242266b6124cca4db6ae0957dee924eb1088bbb28e409e71cacbf87d +size 877553 diff --git a/swanlog/run-20250629_184555-a3b1799d/files/config.yaml b/swanlog/run-20250629_184555-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5760a81fcccf92040c5d11a6c636aad13697be4 --- /dev/null +++ b/swanlog/run-20250629_184555-a3b1799d/files/config.yaml @@ -0,0 +1,986 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 75 + value: true +_name_or_path: + desc: '' + sort: 74 + value: /mnt/data/users/liamding/data/models/Qwen3-8B +acc_steps: + desc: '' + sort: 224 + value: 1 +acc_strategy: + desc: '' + sort: 214 + value: token +accelerator_config: + desc: '' + sort: 157 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 162 + value: false +adam_beta1: + desc: '' + sort: 95 + value: 0.9 +adam_beta2: + desc: '' + sort: 96 + value: 0.95 +adam_epsilon: + desc: '' + sort: 97 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 34 + value: false +aligner_lr: + desc: '' + sort: 217 + value: null +architectures: + desc: '' + sort: 61 + value: + - Qwen3ForCausalLM +attention_bias: + desc: '' + sort: 19 + value: false +attention_dropout: + desc: '' + sort: 20 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 190 + value: false +average_tokens_across_devices: + desc: '' + sort: 206 + value: false +bad_words_ids: + desc: '' + sort: 51 + value: null +batch_eval_metrics: + desc: '' + sort: 202 + value: false +begin_suppress_tokens: + desc: '' + sort: 60 + value: null +bf16: + desc: '' + sort: 127 + value: true +bf16_full_eval: + desc: '' + sort: 131 + value: false +bos_token_id: + desc: '' + sort: 67 + value: 151643 +channels: + desc: '' + sort: 221 + value: null +check_model: + desc: '' + sort: 213 + value: true +chunk_size_feed_forward: + desc: '' + sort: 30 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 33 + value: null +data_seed: + desc: '' + sort: 124 + value: 42 +dataloader_drop_last: + desc: '' + sort: 139 + value: false +dataloader_num_workers: + desc: '' + sort: 141 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 170 + value: false +dataloader_pin_memory: + desc: '' + sort: 169 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 142 + value: 10 +ddp_backend: + desc: '' + sort: 135 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 168 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 167 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 166 + value: null +ddp_timeout: + desc: '' + sort: 194 + value: 18000000 +debug: + desc: '' + sort: 138 + value: [] +decoder_start_token_id: + desc: '' + sort: 71 + value: null +deepspeed: + desc: '' + sort: 158 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 145 + value: false +diversity_penalty: + desc: '' + sort: 42 + value: 0.0 +do_eval: + desc: '' + sort: 81 + value: true +do_predict: + desc: '' + sort: 82 + value: false +do_sample: + desc: '' + sort: 38 + value: false +do_train: + desc: '' + sort: 80 + value: false +early_stopping: + desc: '' + sort: 39 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 50 + value: 0 +eos_token_id: + desc: '' + sort: 69 + value: 151645 +eval_accumulation_steps: + desc: '' + sort: 90 + value: null +eval_datasets: + desc: '' + sort: 226 + value: [] +eval_datasets_args: + desc: '' + sort: 228 + value: null +eval_delay: + desc: '' + sort: 91 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 184 + value: true +eval_generation_config: + desc: '' + sort: 229 + value: null +eval_limit: + desc: '' + sort: 227 + value: null +eval_on_start: + desc: '' + sort: 203 + value: false +eval_steps: + desc: '' + sort: 140 + value: null +eval_strategy: + desc: '' + sort: 83 + value: epoch +eval_use_evalscope: + desc: '' + sort: 225 + value: false +eval_use_gather_object: + desc: '' + sort: 205 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 58 + value: null +finetuning_task: + desc: '' + sort: 62 + value: null +forced_bos_token_id: + desc: '' + sort: 55 + value: null +forced_eos_token_id: + desc: '' + sort: 56 + value: null +fp16: + desc: '' + sort: 128 + value: false +fp16_backend: + desc: '' + sort: 185 + value: auto +fp16_full_eval: + desc: '' + sort: 132 + value: false +fp16_opt_level: + desc: '' + sort: 129 + value: O1 +fsdp: + desc: '' + sort: 152 + value: [] +fsdp_config: + desc: '' + sort: 154 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 153 + value: 0 +fsdp_num: + desc: '' + sort: 223 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 156 + value: null +full_determinism: + desc: '' + sort: 191 + value: false +galore_config: + desc: '' + sort: 232 + value: null +generation_config: + desc: '' + sort: 211 + value: null +generation_max_length: + desc: '' + sort: 209 + value: null +generation_num_beams: + desc: '' + sort: 210 + value: null +gradient_accumulation_steps: + desc: '' + sort: 89 + value: 2 +gradient_checkpointing: + desc: '' + sort: 180 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 181 + value: null +greater_is_better: + desc: '' + sort: 150 + value: false +group_by_length: + desc: '' + sort: 163 + value: false +half_precision_backend: + desc: '' + sort: 130 + value: auto +head_dim: + desc: '' + sort: 12 + value: 128 +hidden_act: + desc: '' + sort: 13 + value: silu +hidden_size: + desc: '' + sort: 4 + value: 4096 +hub_always_push: + desc: '' + sort: 179 + value: false +hub_model_id: + desc: '' + sort: 175 + value: null +hub_private_repo: + desc: '' + sort: 178 + value: null +hub_strategy: + desc: '' + sort: 176 + value: every_save +hub_token: + desc: '' + sort: 177 + value: +id2label: + desc: '' + sort: 63 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 151 + value: false +include_for_metrics: + desc: '' + sort: 183 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 182 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 199 + value: false +include_tokens_per_second: + desc: '' + sort: 198 + value: false +initializer_range: + desc: '' + sort: 14 + value: 0.02 +intermediate_size: + desc: '' + sort: 5 + value: 12288 +is_decoder: + desc: '' + sort: 32 + value: false +is_encoder_decoder: + desc: '' + sort: 31 + value: false +jit_mode_eval: + desc: '' + sort: 125 + value: false +label2id: + desc: '' + sort: 64 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 147 + value: null +label_smoothing_factor: + desc: '' + sort: 159 + value: 0.0 +learning_rate: + desc: '' + sort: 93 + value: 1.0e-06 +length_column_name: + desc: '' + sort: 164 + value: length +length_penalty: + desc: '' + sort: 48 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 148 + value: false +local_rank: + desc: '' + sort: 134 + value: 0 +local_repo_path: + desc: '' + sort: 231 + value: null +log_level: + desc: '' + sort: 105 + value: passive +log_level_replica: + desc: '' + sort: 106 + value: warning +log_on_each_node: + desc: '' + sort: 107 + value: true +logging_dir: + desc: '' + sort: 108 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/qwen3_8b_r1_v2/v0-20250629-184450/runs +logging_first_step: + desc: '' + sort: 110 + value: true +logging_nan_inf_filter: + desc: '' + sort: 112 + value: true +logging_steps: + desc: '' + sort: 111 + value: 1 +logging_strategy: + desc: '' + sort: 109 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 102 + value: null +lr_scheduler_type: + desc: '' + sort: 101 + value: cosine +max_epochs: + desc: '' + sort: 216 + value: null +max_grad_norm: + desc: '' + sort: 98 + value: 1.0 +max_length: + desc: '' + sort: 36 + value: 20 +max_position_embeddings: + desc: '' + sort: 3 + value: 40960 +max_steps: + desc: '' + sort: 100 + value: -1 +max_window_layers: + desc: '' + sort: 10 + value: 36 +metric_for_best_model: + desc: '' + sort: 149 + value: loss +metric_warmup_step: + desc: '' + sort: 222 + value: 0 +min_length: + desc: '' + sort: 37 + value: 0 +model_num_parameters: + desc: '' + sort: 233 + value: 0 +model_type: + desc: '' + sort: 77 + value: qwen3 +mp_parameters: + desc: '' + sort: 189 + value: '' +neftune_noise_alpha: + desc: '' + sort: 200 + value: null +no_cuda: + desc: '' + sort: 120 + value: false +no_repeat_ngram_size: + desc: '' + sort: 49 + value: 0 +num_attention_heads: + desc: '' + sort: 7 + value: 32 +num_beam_groups: + desc: '' + sort: 41 + value: 1 +num_beams: + desc: '' + sort: 40 + value: 1 +num_hidden_layers: + desc: '' + sort: 6 + value: 36 +num_key_value_heads: + desc: '' + sort: 11 + value: 8 +num_return_sequences: + desc: '' + sort: 52 + value: 1 +num_train_epochs: + desc: '' + sort: 99 + value: 5.0 +optim: + desc: '' + sort: 160 + value: adamw_torch +optim_args: + desc: '' + sort: 161 + value: null +optim_target_modules: + desc: '' + sort: 201 + value: null +optimizer: + desc: '' + sort: 219 + value: null +output_attentions: + desc: '' + sort: 23 + value: false +output_dir: + desc: '' + sort: 78 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/qwen3_8b_r1_v2/v0-20250629-184450 +output_hidden_states: + desc: '' + sort: 22 + value: false +output_scores: + desc: '' + sort: 53 + value: false +overwrite_output_dir: + desc: '' + sort: 79 + value: false +pad_token_id: + desc: '' + sort: 68 + value: 151643 +past_index: + desc: '' + sort: 143 + value: -1 +per_device_eval_batch_size: + desc: '' + sort: 86 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 85 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 88 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 87 + value: null +predict_with_generate: + desc: '' + sort: 208 + value: false +prediction_loss_only: + desc: '' + sort: 84 + value: false +prefix: + desc: '' + sort: 66 + value: null +problem_type: + desc: '' + sort: 73 + value: null +pruned_heads: + desc: '' + sort: 28 + value: {} +push_to_hub: + desc: '' + sort: 173 + value: false +push_to_hub_model_id: + desc: '' + sort: 186 + value: null +push_to_hub_organization: + desc: '' + sort: 187 + value: null +push_to_hub_token: + desc: '' + sort: 188 + value: +ray_scope: + desc: '' + sort: 193 + value: last +remove_invalid_values: + desc: '' + sort: 57 + value: false +remove_unused_columns: + desc: '' + sort: 146 + value: false +repetition_penalty: + desc: '' + sort: 47 + value: 1.0 +report_to: + desc: '' + sort: 165 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 119 + value: false +resume_from_checkpoint: + desc: '' + sort: 174 + value: null +return_dict: + desc: '' + sort: 21 + value: true +return_dict_in_generate: + desc: '' + sort: 54 + value: false +rms_norm_eps: + desc: '' + sort: 15 + value: 1.0e-06 +rope_scaling: + desc: '' + sort: 18 + value: null +rope_theta: + desc: '' + sort: 17 + value: 1000000 +run_name: + desc: '' + sort: 144 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/qwen3_8b_r1_v2/v0-20250629-184450 +save_on_each_node: + desc: '' + sort: 117 + value: false +save_only_model: + desc: '' + sort: 118 + value: false +save_safetensors: + desc: '' + sort: 116 + value: true +save_steps: + desc: '' + sort: 114 + value: 500 +save_strategy: + desc: '' + sort: 113 + value: steps +save_total_limit: + desc: '' + sort: 115 + value: 1 +seed: + desc: '' + sort: 123 + value: 42 +sep_token_id: + desc: '' + sort: 70 + value: null +skip_memory_metrics: + desc: '' + sort: 171 + value: true +sliding_window: + desc: '' + sort: 9 + value: null +sortish_sampler: + desc: '' + sort: 207 + value: false +suppress_tokens: + desc: '' + sort: 59 + value: null +task_specific_params: + desc: '' + sort: 72 + value: null +temperature: + desc: '' + sort: 43 + value: 1.0 +tf32: + desc: '' + sort: 133 + value: null +tf_legacy_loss: + desc: '' + sort: 27 + value: false +tie_encoder_decoder: + desc: '' + sort: 35 + value: false +tie_word_embeddings: + desc: '' + sort: 29 + value: false +tokenizer_class: + desc: '' + sort: 65 + value: null +top_k: + desc: '' + sort: 44 + value: 50 +top_p: + desc: '' + sort: 45 + value: 1.0 +torch_compile: + desc: '' + sort: 195 + value: false +torch_compile_backend: + desc: '' + sort: 196 + value: null +torch_compile_mode: + desc: '' + sort: 197 + value: null +torch_dtype: + desc: '' + sort: 25 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 92 + value: null +torchdynamo: + desc: '' + sort: 192 + value: null +torchscript: + desc: '' + sort: 24 + value: false +tp_size: + desc: '' + sort: 155 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 137 + value: false +tpu_num_cores: + desc: '' + sort: 136 + value: null +train_dataloader_shuffle: + desc: '' + sort: 215 + value: true +train_type: + desc: '' + sort: 230 + value: full +transformers_version: + desc: '' + sort: 76 + value: 4.51.3 +typical_p: + desc: '' + sort: 46 + value: 1.0 +use_bfloat16: + desc: '' + sort: 26 + value: false +use_cache: + desc: '' + sort: 16 + value: false +use_cpu: + desc: '' + sort: 121 + value: false +use_ipex: + desc: '' + sort: 126 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 172 + value: false +use_liger_kernel: + desc: '' + sort: 204 + value: false +use_logits_to_keep: + desc: '' + sort: 220 + value: null +use_mps_device: + desc: '' + sort: 122 + value: false +use_sliding_window: + desc: '' + sort: 8 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 212 + value: true +vit_lr: + desc: '' + sort: 218 + value: null +vocab_size: + desc: '' + sort: 2 + value: 151936 +warmup_ratio: + desc: '' + sort: 103 + value: 0.05 +warmup_steps: + desc: '' + sort: 104 + value: 0 +weight_decay: + desc: '' + sort: 94 + value: 0.0001 diff --git a/swanlog/run-20250629_184555-a3b1799d/files/requirements.txt b/swanlog/run-20250629_184555-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250629_184555-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250629_184555-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250629_184555-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5e6f7043dded21f10e20494b7f70b0a14cae0b01 --- /dev/null +++ b/swanlog/run-20250629_184555-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 1366249, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Qwen3-8B --train_type full --dataset /mnt/data/users/liamding/data/sft_zh_tox/data/train_data/qwen3_r1_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 32768 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/qwen3_8b_r1_v2 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_184555-a3b1799d"}} \ No newline at end of file diff --git a/swanlog/run-20250629_190303-a3b1799d/backup.swanlab b/swanlog/run-20250629_190303-a3b1799d/backup.swanlab new file mode 100644 index 0000000000000000000000000000000000000000..071e8e151b22bbd28dff94922d14c9b24ddfdd66 --- /dev/null +++ b/swanlog/run-20250629_190303-a3b1799d/backup.swanlab @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5fffdce3f5fb3cd25775f5946995b3e870201417e4e01bb47cee71aa2079448 +size 929850 diff --git a/swanlog/run-20250629_190303-a3b1799d/files/config.yaml b/swanlog/run-20250629_190303-a3b1799d/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da2cac2bb506501b73d43a5303f9189064e9bdfe --- /dev/null +++ b/swanlog/run-20250629_190303-a3b1799d/files/config.yaml @@ -0,0 +1,1000 @@ +FRAMEWORK: + desc: '' + sort: 1 + value: 🤗transformers +UPPERFRAME: + desc: '' + sort: 0 + value: 🐦‍⬛ms-swift +_attn_implementation_autoset: + desc: '' + sort: 76 + value: true +_name_or_path: + desc: '' + sort: 75 + value: /mnt/data/users/liamding/data/models/Qwen3-8B +acc_steps: + desc: '' + sort: 225 + value: 1 +acc_strategy: + desc: '' + sort: 215 + value: token +accelerator_config: + desc: '' + sort: 158 + value: + dispatch_batches: false + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +adafactor: + desc: '' + sort: 163 + value: false +adam_beta1: + desc: '' + sort: 96 + value: 0.9 +adam_beta2: + desc: '' + sort: 97 + value: 0.95 +adam_epsilon: + desc: '' + sort: 98 + value: 1.0e-08 +add_cross_attention: + desc: '' + sort: 35 + value: false +aligner_lr: + desc: '' + sort: 218 + value: null +architectures: + desc: '' + sort: 62 + value: + - Qwen3ForCausalLM +attention_bias: + desc: '' + sort: 20 + value: false +attention_dropout: + desc: '' + sort: 21 + value: 0.0 +auto_find_batch_size: + desc: '' + sort: 191 + value: false +average_tokens_across_devices: + desc: '' + sort: 207 + value: false +bad_words_ids: + desc: '' + sort: 52 + value: null +batch_eval_metrics: + desc: '' + sort: 203 + value: false +begin_suppress_tokens: + desc: '' + sort: 61 + value: null +bf16: + desc: '' + sort: 128 + value: true +bf16_full_eval: + desc: '' + sort: 132 + value: false +bos_token_id: + desc: '' + sort: 68 + value: 151643 +channels: + desc: '' + sort: 222 + value: null +check_model: + desc: '' + sort: 214 + value: true +chunk_size_feed_forward: + desc: '' + sort: 31 + value: 0 +cross_attention_hidden_size: + desc: '' + sort: 34 + value: null +data_seed: + desc: '' + sort: 125 + value: 42 +dataloader_drop_last: + desc: '' + sort: 140 + value: false +dataloader_num_workers: + desc: '' + sort: 142 + value: 4 +dataloader_persistent_workers: + desc: '' + sort: 171 + value: false +dataloader_pin_memory: + desc: '' + sort: 170 + value: true +dataloader_prefetch_factor: + desc: '' + sort: 143 + value: 10 +ddp_backend: + desc: '' + sort: 136 + value: null +ddp_broadcast_buffers: + desc: '' + sort: 169 + value: null +ddp_bucket_cap_mb: + desc: '' + sort: 168 + value: null +ddp_find_unused_parameters: + desc: '' + sort: 167 + value: null +ddp_timeout: + desc: '' + sort: 195 + value: 18000000 +debug: + desc: '' + sort: 139 + value: [] +decoder_start_token_id: + desc: '' + sort: 72 + value: null +deepspeed: + desc: '' + sort: 159 + value: + bf16: + enabled: auto + fp16: + enabled: auto + hysteresis: 2 + initial_scale_power: 16 + loss_scale: 0 + loss_scale_window: 1000 + min_loss_scale: 1 + gradient_accumulation_steps: auto + gradient_clipping: auto + steps_per_print: 2000 + train_batch_size: auto + train_micro_batch_size_per_gpu: auto + wall_clock_breakdown: false + zero_optimization: + contiguous_gradients: true + offload_optimizer: + device: none + pin_memory: true + offload_param: + device: none + pin_memory: true + overlap_comm: false + reduce_bucket_size: auto + stage: 3 + stage3_gather_16bit_weights_on_model_save: true + stage3_max_live_parameters: 1000000000.0 + stage3_max_reuse_distance: 1000000000.0 + stage3_param_persistence_threshold: auto + stage3_prefetch_bucket_size: auto + sub_group_size: 1000000000.0 + zero_quantized_gradients: false + zero_quantized_weights: false +disable_tqdm: + desc: '' + sort: 146 + value: false +diversity_penalty: + desc: '' + sort: 43 + value: 0.0 +do_eval: + desc: '' + sort: 82 + value: true +do_predict: + desc: '' + sort: 83 + value: false +do_sample: + desc: '' + sort: 39 + value: false +do_train: + desc: '' + sort: 81 + value: false +early_stopping: + desc: '' + sort: 40 + value: false +encoder_no_repeat_ngram_size: + desc: '' + sort: 51 + value: 0 +eos_token_id: + desc: '' + sort: 70 + value: 151645 +eval_accumulation_steps: + desc: '' + sort: 91 + value: null +eval_datasets: + desc: '' + sort: 227 + value: [] +eval_datasets_args: + desc: '' + sort: 229 + value: null +eval_delay: + desc: '' + sort: 92 + value: 0 +eval_do_concat_batches: + desc: '' + sort: 185 + value: true +eval_generation_config: + desc: '' + sort: 230 + value: null +eval_limit: + desc: '' + sort: 228 + value: null +eval_on_start: + desc: '' + sort: 204 + value: false +eval_steps: + desc: '' + sort: 141 + value: null +eval_strategy: + desc: '' + sort: 84 + value: epoch +eval_use_evalscope: + desc: '' + sort: 226 + value: false +eval_use_gather_object: + desc: '' + sort: 206 + value: false +exponential_decay_length_penalty: + desc: '' + sort: 59 + value: null +finetuning_task: + desc: '' + sort: 63 + value: null +forced_bos_token_id: + desc: '' + sort: 56 + value: null +forced_eos_token_id: + desc: '' + sort: 57 + value: null +fp16: + desc: '' + sort: 129 + value: false +fp16_backend: + desc: '' + sort: 186 + value: auto +fp16_full_eval: + desc: '' + sort: 133 + value: false +fp16_opt_level: + desc: '' + sort: 130 + value: O1 +fsdp: + desc: '' + sort: 153 + value: [] +fsdp_config: + desc: '' + sort: 155 + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + desc: '' + sort: 154 + value: 0 +fsdp_num: + desc: '' + sort: 224 + value: 1 +fsdp_transformer_layer_cls_to_wrap: + desc: '' + sort: 157 + value: null +full_determinism: + desc: '' + sort: 192 + value: false +galore_config: + desc: '' + sort: 233 + value: null +generation_config: + desc: '' + sort: 212 + value: null +generation_max_length: + desc: '' + sort: 210 + value: null +generation_num_beams: + desc: '' + sort: 211 + value: null +gradient_accumulation_steps: + desc: '' + sort: 90 + value: 2 +gradient_checkpointing: + desc: '' + sort: 181 + value: false +gradient_checkpointing_kwargs: + desc: '' + sort: 182 + value: null +greater_is_better: + desc: '' + sort: 151 + value: false +group_by_length: + desc: '' + sort: 164 + value: false +half_precision_backend: + desc: '' + sort: 131 + value: auto +head_dim: + desc: '' + sort: 13 + value: 128 +hidden_act: + desc: '' + sort: 14 + value: silu +hidden_size: + desc: '' + sort: 5 + value: 4096 +hub_always_push: + desc: '' + sort: 180 + value: false +hub_model_id: + desc: '' + sort: 176 + value: null +hub_private_repo: + desc: '' + sort: 179 + value: null +hub_strategy: + desc: '' + sort: 177 + value: every_save +hub_token: + desc: '' + sort: 178 + value: +id2label: + desc: '' + sort: 64 + value: + '0': LABEL_0 + '1': LABEL_1 +ignore_data_skip: + desc: '' + sort: 152 + value: false +include_for_metrics: + desc: '' + sort: 184 + value: [] +include_inputs_for_metrics: + desc: '' + sort: 183 + value: false +include_num_input_tokens_seen: + desc: '' + sort: 200 + value: false +include_tokens_per_second: + desc: '' + sort: 199 + value: false +initializer_range: + desc: '' + sort: 15 + value: 0.02 +intermediate_size: + desc: '' + sort: 6 + value: 12288 +is_decoder: + desc: '' + sort: 33 + value: false +is_encoder_decoder: + desc: '' + sort: 32 + value: false +jit_mode_eval: + desc: '' + sort: 126 + value: false +label2id: + desc: '' + sort: 65 + value: + LABEL_0: 0 + LABEL_1: 1 +label_names: + desc: '' + sort: 148 + value: null +label_smoothing_factor: + desc: '' + sort: 160 + value: 0.0 +learning_rate: + desc: '' + sort: 94 + value: 1.0e-05 +length_column_name: + desc: '' + sort: 165 + value: length +length_penalty: + desc: '' + sort: 49 + value: 1.0 +load_best_model_at_end: + desc: '' + sort: 149 + value: false +local_rank: + desc: '' + sort: 135 + value: 0 +local_repo_path: + desc: '' + sort: 232 + value: null +log_level: + desc: '' + sort: 106 + value: passive +log_level_replica: + desc: '' + sort: 107 + value: warning +log_on_each_node: + desc: '' + sort: 108 + value: true +logging_dir: + desc: '' + sort: 109 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/qwen3_normal_lora_v1/v0-20250629-190209/runs +logging_first_step: + desc: '' + sort: 111 + value: true +logging_nan_inf_filter: + desc: '' + sort: 113 + value: true +logging_steps: + desc: '' + sort: 112 + value: 1 +logging_strategy: + desc: '' + sort: 110 + value: steps +lr_scheduler_kwargs: + desc: '' + sort: 103 + value: null +lr_scheduler_type: + desc: '' + sort: 102 + value: cosine +max_epochs: + desc: '' + sort: 217 + value: null +max_grad_norm: + desc: '' + sort: 99 + value: 1.0 +max_length: + desc: '' + sort: 37 + value: 20 +max_position_embeddings: + desc: '' + sort: 4 + value: 40960 +max_steps: + desc: '' + sort: 101 + value: -1 +max_window_layers: + desc: '' + sort: 11 + value: 36 +metric_for_best_model: + desc: '' + sort: 150 + value: loss +metric_warmup_step: + desc: '' + sort: 223 + value: 0 +min_length: + desc: '' + sort: 38 + value: 0 +model_num_parameters: + desc: '' + sort: 234 + value: 0 +model_type: + desc: '' + sort: 78 + value: qwen3 +mp_parameters: + desc: '' + sort: 190 + value: '' +neftune_noise_alpha: + desc: '' + sort: 201 + value: null +no_cuda: + desc: '' + sort: 121 + value: false +no_repeat_ngram_size: + desc: '' + sort: 50 + value: 0 +num_attention_heads: + desc: '' + sort: 8 + value: 32 +num_beam_groups: + desc: '' + sort: 42 + value: 1 +num_beams: + desc: '' + sort: 41 + value: 1 +num_hidden_layers: + desc: '' + sort: 7 + value: 36 +num_key_value_heads: + desc: '' + sort: 12 + value: 8 +num_return_sequences: + desc: '' + sort: 53 + value: 1 +num_train_epochs: + desc: '' + sort: 100 + value: 5.0 +optim: + desc: '' + sort: 161 + value: adamw_torch +optim_args: + desc: '' + sort: 162 + value: null +optim_target_modules: + desc: '' + sort: 202 + value: null +optimizer: + desc: '' + sort: 220 + value: null +output_attentions: + desc: '' + sort: 24 + value: false +output_dir: + desc: '' + sort: 79 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/qwen3_normal_lora_v1/v0-20250629-190209 +output_hidden_states: + desc: '' + sort: 23 + value: false +output_scores: + desc: '' + sort: 54 + value: false +overwrite_output_dir: + desc: '' + sort: 80 + value: false +pad_token_id: + desc: '' + sort: 69 + value: 151643 +past_index: + desc: '' + sort: 144 + value: -1 +peft_config: + desc: '' + sort: 2 + value: + default: 'LoraConfig(task_type=''CAUSAL_LM'', peft_type=, + auto_mapping=None, base_model_name_or_path=''/mnt/data/users/liamding/data/models/Qwen3-8B'', + revision=None, inference_mode=False, r=8, target_modules={''v_proj'', ''down_proj'', + ''q_proj'', ''up_proj'', ''o_proj'', ''gate_proj'', ''k_proj''}, exclude_modules=None, + lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias=''none'', use_rslora=False, + modules_to_save=[], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, + rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core=''megatron.core'', + trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, + use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), + lora_bias=False, lora_dtype=None, lorap_lr_ratio=None, lorap_emb_lr=1e-06)' +per_device_eval_batch_size: + desc: '' + sort: 87 + value: 2 +per_device_train_batch_size: + desc: '' + sort: 86 + value: 2 +per_gpu_eval_batch_size: + desc: '' + sort: 89 + value: null +per_gpu_train_batch_size: + desc: '' + sort: 88 + value: null +predict_with_generate: + desc: '' + sort: 209 + value: false +prediction_loss_only: + desc: '' + sort: 85 + value: false +prefix: + desc: '' + sort: 67 + value: null +problem_type: + desc: '' + sort: 74 + value: null +pruned_heads: + desc: '' + sort: 29 + value: {} +push_to_hub: + desc: '' + sort: 174 + value: false +push_to_hub_model_id: + desc: '' + sort: 187 + value: null +push_to_hub_organization: + desc: '' + sort: 188 + value: null +push_to_hub_token: + desc: '' + sort: 189 + value: +ray_scope: + desc: '' + sort: 194 + value: last +remove_invalid_values: + desc: '' + sort: 58 + value: false +remove_unused_columns: + desc: '' + sort: 147 + value: false +repetition_penalty: + desc: '' + sort: 48 + value: 1.0 +report_to: + desc: '' + sort: 166 + value: + - swanlab +restore_callback_states_from_checkpoint: + desc: '' + sort: 120 + value: false +resume_from_checkpoint: + desc: '' + sort: 175 + value: null +return_dict: + desc: '' + sort: 22 + value: true +return_dict_in_generate: + desc: '' + sort: 55 + value: false +rms_norm_eps: + desc: '' + sort: 16 + value: 1.0e-06 +rope_scaling: + desc: '' + sort: 19 + value: null +rope_theta: + desc: '' + sort: 18 + value: 1000000 +run_name: + desc: '' + sort: 145 + value: /mnt/data/users/liamding/data/sft_zh_tox/output/qwen3_normal_lora_v1/v0-20250629-190209 +save_on_each_node: + desc: '' + sort: 118 + value: false +save_only_model: + desc: '' + sort: 119 + value: false +save_safetensors: + desc: '' + sort: 117 + value: true +save_steps: + desc: '' + sort: 115 + value: 500 +save_strategy: + desc: '' + sort: 114 + value: steps +save_total_limit: + desc: '' + sort: 116 + value: 1 +seed: + desc: '' + sort: 124 + value: 42 +sep_token_id: + desc: '' + sort: 71 + value: null +skip_memory_metrics: + desc: '' + sort: 172 + value: true +sliding_window: + desc: '' + sort: 10 + value: null +sortish_sampler: + desc: '' + sort: 208 + value: false +suppress_tokens: + desc: '' + sort: 60 + value: null +task_specific_params: + desc: '' + sort: 73 + value: null +temperature: + desc: '' + sort: 44 + value: 1.0 +tf32: + desc: '' + sort: 134 + value: null +tf_legacy_loss: + desc: '' + sort: 28 + value: false +tie_encoder_decoder: + desc: '' + sort: 36 + value: false +tie_word_embeddings: + desc: '' + sort: 30 + value: false +tokenizer_class: + desc: '' + sort: 66 + value: null +top_k: + desc: '' + sort: 45 + value: 50 +top_p: + desc: '' + sort: 46 + value: 1.0 +torch_compile: + desc: '' + sort: 196 + value: false +torch_compile_backend: + desc: '' + sort: 197 + value: null +torch_compile_mode: + desc: '' + sort: 198 + value: null +torch_dtype: + desc: '' + sort: 26 + value: bfloat16 +torch_empty_cache_steps: + desc: '' + sort: 93 + value: null +torchdynamo: + desc: '' + sort: 193 + value: null +torchscript: + desc: '' + sort: 25 + value: false +tp_size: + desc: '' + sort: 156 + value: 0 +tpu_metrics_debug: + desc: '' + sort: 138 + value: false +tpu_num_cores: + desc: '' + sort: 137 + value: null +train_dataloader_shuffle: + desc: '' + sort: 216 + value: true +train_type: + desc: '' + sort: 231 + value: lora +transformers_version: + desc: '' + sort: 77 + value: 4.51.3 +typical_p: + desc: '' + sort: 47 + value: 1.0 +use_bfloat16: + desc: '' + sort: 27 + value: false +use_cache: + desc: '' + sort: 17 + value: false +use_cpu: + desc: '' + sort: 122 + value: false +use_ipex: + desc: '' + sort: 127 + value: false +use_legacy_prediction_loop: + desc: '' + sort: 173 + value: false +use_liger_kernel: + desc: '' + sort: 205 + value: false +use_logits_to_keep: + desc: '' + sort: 221 + value: null +use_mps_device: + desc: '' + sort: 123 + value: false +use_sliding_window: + desc: '' + sort: 9 + value: false +vit_gradient_checkpointing: + desc: '' + sort: 213 + value: true +vit_lr: + desc: '' + sort: 219 + value: null +vocab_size: + desc: '' + sort: 3 + value: 151936 +warmup_ratio: + desc: '' + sort: 104 + value: 0.05 +warmup_steps: + desc: '' + sort: 105 + value: 0 +weight_decay: + desc: '' + sort: 95 + value: 0.0001 diff --git a/swanlog/run-20250629_190303-a3b1799d/files/requirements.txt b/swanlog/run-20250629_190303-a3b1799d/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6da5c11b0f7957abaa72f0ca6ce7a3c05da9d5 --- /dev/null +++ b/swanlog/run-20250629_190303-a3b1799d/files/requirements.txt @@ -0,0 +1,296 @@ +absl-py==2.2.1 +accelerate==1.6.0 +addict==2.4.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.14 +aiosignal==1.3.2 +airportsdata==20250224 +aliyun-python-sdk-core==2.16.0 +aliyun-python-sdk-kms==2.16.5 +altair==5.5.0 +annotated-types==0.7.0 +antlr4-python3-runtime==4.13.2 +anyio==4.9.0 +astor==0.8.1 +async-timeout==5.0.1 +attrdict==2.0.1 +attrs==25.3.0 +av==14.3.0 +beautifulsoup4==4.13.3 +binpacking==1.5.2 +bitsandbytes==0.45.5 +blake3==1.0.4 +blinker==1.9.0 +boto3==1.38.46 +botocore==1.38.46 +cachetools==5.5.2 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +colorama==0.4.6 +compressed-tensors==0.9.4 +contourpy==1.3.1 +cpm-kernels==1.0.11 +crcmod==1.7 +cryptography==44.0.3 +cupy-cuda12x==13.4.1 +cycler==0.12.1 +dacite==1.9.2 +dashscope==1.23.3 +datasets==3.2.0 +decord==0.6.0 +deepspeed==0.16.5 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.3.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +docker-pycreds==0.4.0 +einops==0.6.1 +einops-exts==0.0.4 +email_validator==2.2.0 +entmax==1.3 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +ffmpy==0.5.0 +filelock==3.18.0 +flash_attn==2.7.4.post1 +fonttools==4.56.0 +frozenlist==1.5.0 +fsspec==2024.9.0 +future==1.0.0 +gdown==5.2.0 +gguf==0.16.3 +gitdb==4.0.12 +GitPython==3.1.44 +googleapis-common-protos==1.70.0 +gradio==5.29.0 +gradio_client==1.10.0 +groovy==0.1.2 +grpcio==1.71.0 +h11==0.16.0 +hf-xet==1.1.2 +hjson==3.1.0 +httpcore==1.0.9 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.32.2 +idna==3.10 +imageio==2.37.0 +importlib_metadata==8.0.0 +interegular==0.3.3 +jieba==0.42.1 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==0.10.0 +joblib==1.4.2 +jsonargparse==3.13.1 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +kiwisolver==1.4.8 +lark==1.2.2 +latex2mathml==3.77.0 +latex2sympy2_extended==1.10.1 +lightning-utilities==0.14.3 +linkify-it-py==2.0.3 +llguidance==0.7.19 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +lxml==5.4.0 +Markdown==3.7 +markdown-it-py==2.2.0 +markdown2==2.5.3 +MarkupSafe==3.0.2 +math-verify==0.7.0 +matplotlib==3.10.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mistral_common==1.5.4 +mmcls==0.25.0 +mmcv==2.2.0 +mmcv-full==1.6.2 +mmengine==0.10.7 +mmsegmentation==0.30.0 +model-index==0.1.11 +modelscope==1.25.0 +mpmath==1.3.0 +ms_swift==3.5.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.2.0 +multiprocess==0.70.16 +narwhals==1.32.0 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +nltk==3.9.1 +numba==0.61.2 +numpy==1.26.4 +nvidia-cublas-cu12==12.6.4.1 +nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cudnn-cu12==9.5.1.17 +nvidia-cufft-cu12==11.3.0.4 +nvidia-cufile-cu12==1.11.1.6 +nvidia-curand-cu12==10.3.7.77 +nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparselt-cu12==0.6.3 +nvidia-ml-py==12.575.51 +nvidia-nccl-cu12==2.26.2 +nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvtx-cu12==12.6.77 +openai==1.77.0 +opencv-python==4.11.0.86 +opencv-python-headless==4.11.0.86 +opendatalab==0.0.10 +openmim==0.3.9 +openpyxl==3.1.5 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.6 +openxlab==0.0.11 +ordered-set==4.1.0 +orjson==3.10.16 +oss2==2.19.1 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==24.2 +pandas==2.2.3 +partial-json-parser==0.2.1.1.post5 +peft==0.15.2 +pillow==11.1.0 +pip==25.0 +platformdirs==4.3.7 +portalocker==3.1.1 +prettytable==3.16.0 +prometheus_client==0.21.1 +prometheus-fastapi-instrumentator==7.1.0 +propcache==0.3.1 +protobuf==4.25.7 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pyarrow==19.0.1 +pycocoevalcap==1.2 +pycocotools==2.0.8 +pycountry==24.6.1 +pycparser==2.22 +pycryptodome==3.22.0 +pydantic==2.11.1 +pydantic_core==2.33.0 +pydeck==0.9.1 +pydub==0.25.1 +pyecharts==2.0.8 +Pygments==2.19.1 +pynvml==12.0.0 +pyparsing==3.2.3 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +pytorch-lightning==2.5.1.post0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==26.4.0 +qwen-vl-utils==0.0.11 +ray==2.45.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rich-toolkit==0.14.5 +rouge==1.0.1 +rpds-py==0.24.0 +ruff==0.11.8 +s3transfer==0.13.0 +sacrebleu==2.5.1 +safehttpx==0.1.6 +safetensors==0.5.3 +scikit-learn==1.6.1 +scipy==1.15.2 +semantic-version==2.10.0 +sentencepiece==0.2.0 +sentry-sdk==2.27.0 +setproctitle==1.3.6 +setuptools==69.5.1 +shellingham==1.5.4 +shortuuid==1.0.13 +simplejson==3.20.1 +six==1.17.0 +smmap==5.0.2 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +starlette==0.46.1 +streamlit==1.44.0 +streamlit-image-select==0.6.0 +svgwrite==1.4.3 +swankit==0.2.4 +swanlab==0.6.4 +sympy==1.14.0 +tabulate==0.9.0 +tenacity==9.0.0 +tensorboard==2.19.0 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +termcolor==2.5.0 +threadpoolctl==3.6.0 +tiktoken==0.9.0 +timm==0.9.12 +tokenizers==0.21.1 +toml==0.10.2 +tomli==2.2.1 +tomlkit==0.13.2 +torch==2.7.0 +torchaudio==2.7.0 +torchmetrics==0.10.3 +torchvision==0.22.0 +tornado==6.4.2 +tqdm==4.67.1 +transformers==4.51.3 +transformers-stream-generator==0.0.5 +triton==3.3.0 +trl==0.17.0 +typer==0.15.3 +typing_extensions==4.13.0 +typing-inspection==0.4.0 +tzdata==2025.2 +uc-micro-py==1.0.3 +unbabel-comet==2.2.6 +urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +vllm==0.9.0 +wandb==0.20.1 +watchdog==6.0.0 +watchfiles==1.0.5 +wavedrom==2.0.3.post3 +wcwidth==0.2.13 +websocket-client==1.8.0 +websockets==15.0.1 +Werkzeug==3.1.3 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.30 +xgrammar==0.1.19 +xxhash==3.5.0 +yacs==0.1.8 +yapf==0.40.1 +yarl==1.18.3 +zipp==3.21.0 +zstandard==0.23.0 diff --git a/swanlog/run-20250629_190303-a3b1799d/files/swanlab-metadata.json b/swanlog/run-20250629_190303-a3b1799d/files/swanlab-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..b663a53d4145a3b19daa2bbb2868638e12c4ad1e --- /dev/null +++ b/swanlog/run-20250629_190303-a3b1799d/files/swanlab-metadata.json @@ -0,0 +1 @@ +{"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 2607237, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Qwen3-8B --train_type lora --dataset data/train_data/qwen3_normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-5 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --loss_scale ignore_empty_think --output_dir output/qwen3_normal_lora_v1 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_190303-a3b1799d"}} \ No newline at end of file