Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/args.json +364 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/config.json +29 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/generation_config.json +14 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/merges.txt +0 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/model.safetensors.index.json +346 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/special_tokens_map.json +31 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/tokenizer_config.json +208 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/vocab.json +0 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/README.md +202 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/adapter_config.json +39 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/additional_config.json +1 -0
- BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/args.json +364 -0
- EasyR1-new/.gitignore +181 -0
- EasyR1-new/.pre-commit-config.yaml +22 -0
- EasyR1-new/Dockerfile +65 -0
- EasyR1-new/Dockerfile.legacy +72 -0
- EasyR1-new/LICENSE +201 -0
- EasyR1-new/Makefile +24 -0
- EasyR1-new/README.md +223 -0
- EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/media/table/val/generations_0_2feffd203d182aadef48.table.json +1 -0
- EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/requirements.txt +216 -0
- EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/wandb-metadata.json +91 -0
- EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/wandb-summary.json +1 -0
- EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug-internal.log +16 -0
- EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug.log +28 -0
- EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/config.yaml +230 -0
- EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/media/table/val/generations_0_2feffd203d182aadef48.table.json +1 -0
- EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/media/table/val/generations_104_ccf2e4d7f5a8bd440fdc.table.json +0 -0
- EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/media/table/val/generations_109_dfde4027f4468ecb4bdb.table.json +0 -0
- EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/output.log +0 -0
- EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/requirements.txt +216 -0
- EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/wandb-metadata.json +91 -0
- EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/wandb-summary.json +1 -0
- EasyR1/verl.egg-info/PKG-INFO +264 -0
- EasyR1/verl.egg-info/SOURCES.txt +71 -0
- EasyR1/verl.egg-info/dependency_links.txt +1 -0
- EasyR1/verl.egg-info/requires.txt +23 -0
- EasyR1/verl.egg-info/top_level.txt +1 -0
- EasyR1/verl/workers/sharding_manager/__pycache__/__init__.cpython-310.pyc +0 -0
- EasyR1/verl/workers/sharding_manager/__pycache__/base.cpython-310.pyc +0 -0
- EasyR1/verl/workers/sharding_manager/__pycache__/fsdp_ulysses.cpython-310.pyc +0 -0
- EasyR1/verl/workers/sharding_manager/__pycache__/fsdp_vllm.cpython-310.pyc +0 -0
- EasyR1/verl/workers/sharding_manager/fsdp_ulysses.py +65 -0
- EasyR1/verl/workers/sharding_manager/fsdp_vllm.py +149 -0
- LAVIS-main/lavis/models/alpro_models/alpro_retrieval.py +422 -0
- LAVIS-main/lavis/models/beats/BEATs.py +180 -0
- LAVIS-main/lavis/models/beats/LICENSE_BEATs.txt +21 -0
- LAVIS-main/lavis/models/beats/README.md +127 -0
- LAVIS-main/lavis/models/beats/Tokenizers.py +173 -0
- LAVIS-main/lavis/models/beats/backbone.py +783 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/args.json
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
|
| 3 |
+
"model_type": "qwen2_5",
|
| 4 |
+
"model_revision": null,
|
| 5 |
+
"task_type": "causal_lm",
|
| 6 |
+
"torch_dtype": "bfloat16",
|
| 7 |
+
"attn_impl": null,
|
| 8 |
+
"num_labels": null,
|
| 9 |
+
"problem_type": null,
|
| 10 |
+
"rope_scaling": null,
|
| 11 |
+
"device_map": null,
|
| 12 |
+
"max_memory": {},
|
| 13 |
+
"local_repo_path": null,
|
| 14 |
+
"template": "qwen2_5",
|
| 15 |
+
"system": null,
|
| 16 |
+
"max_length": 8192,
|
| 17 |
+
"truncation_strategy": "delete",
|
| 18 |
+
"max_pixels": null,
|
| 19 |
+
"tools_prompt": "react_en",
|
| 20 |
+
"norm_bbox": null,
|
| 21 |
+
"response_prefix": null,
|
| 22 |
+
"padding_side": "right",
|
| 23 |
+
"loss_scale": "default",
|
| 24 |
+
"sequence_parallel_size": 1,
|
| 25 |
+
"use_chat_template": true,
|
| 26 |
+
"template_backend": "swift",
|
| 27 |
+
"dataset": [
|
| 28 |
+
"/nas/shared/kilab/wangyujia/material_production_train.jsonl"
|
| 29 |
+
],
|
| 30 |
+
"val_dataset": [],
|
| 31 |
+
"split_dataset_ratio": 0.01,
|
| 32 |
+
"data_seed": 42,
|
| 33 |
+
"dataset_num_proc": 128,
|
| 34 |
+
"dataset_shuffle": true,
|
| 35 |
+
"val_dataset_shuffle": false,
|
| 36 |
+
"streaming": false,
|
| 37 |
+
"interleave_prob": null,
|
| 38 |
+
"stopping_strategy": "first_exhausted",
|
| 39 |
+
"shuffle_buffer_size": 1000,
|
| 40 |
+
"enable_cache": false,
|
| 41 |
+
"download_mode": "reuse_dataset_if_exists",
|
| 42 |
+
"columns": {},
|
| 43 |
+
"strict": false,
|
| 44 |
+
"remove_unused_columns": true,
|
| 45 |
+
"model_name": [
|
| 46 |
+
"qwen_bio_sft_deeplocbinary-08022035"
|
| 47 |
+
],
|
| 48 |
+
"model_author": [
|
| 49 |
+
"swift"
|
| 50 |
+
],
|
| 51 |
+
"custom_dataset_info": [],
|
| 52 |
+
"quant_method": null,
|
| 53 |
+
"quant_bits": null,
|
| 54 |
+
"hqq_axis": null,
|
| 55 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 56 |
+
"bnb_4bit_quant_type": "nf4",
|
| 57 |
+
"bnb_4bit_use_double_quant": true,
|
| 58 |
+
"bnb_4bit_quant_storage": null,
|
| 59 |
+
"max_new_tokens": 64,
|
| 60 |
+
"temperature": 0.0,
|
| 61 |
+
"top_k": null,
|
| 62 |
+
"top_p": null,
|
| 63 |
+
"repetition_penalty": null,
|
| 64 |
+
"num_beams": 1,
|
| 65 |
+
"stream": false,
|
| 66 |
+
"stop_words": [],
|
| 67 |
+
"logprobs": false,
|
| 68 |
+
"top_logprobs": null,
|
| 69 |
+
"ckpt_dir": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
|
| 70 |
+
"load_dataset_config": null,
|
| 71 |
+
"lora_modules": [],
|
| 72 |
+
"tuner_backend": "peft",
|
| 73 |
+
"train_type": "lora",
|
| 74 |
+
"adapters": [],
|
| 75 |
+
"external_plugins": [],
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"model_kwargs": {},
|
| 78 |
+
"load_args": false,
|
| 79 |
+
"load_data_args": false,
|
| 80 |
+
"use_hf": false,
|
| 81 |
+
"hub_token": null,
|
| 82 |
+
"custom_register_path": [],
|
| 83 |
+
"ignore_args_error": false,
|
| 84 |
+
"use_swift_lora": false,
|
| 85 |
+
"output_dir": "/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250",
|
| 86 |
+
"overwrite_output_dir": false,
|
| 87 |
+
"do_train": false,
|
| 88 |
+
"do_eval": false,
|
| 89 |
+
"do_predict": false,
|
| 90 |
+
"eval_strategy": "steps",
|
| 91 |
+
"prediction_loss_only": false,
|
| 92 |
+
"per_device_train_batch_size": 2,
|
| 93 |
+
"per_device_eval_batch_size": 2,
|
| 94 |
+
"per_gpu_train_batch_size": null,
|
| 95 |
+
"per_gpu_eval_batch_size": null,
|
| 96 |
+
"gradient_accumulation_steps": 4,
|
| 97 |
+
"eval_accumulation_steps": null,
|
| 98 |
+
"eval_delay": 0,
|
| 99 |
+
"torch_empty_cache_steps": null,
|
| 100 |
+
"learning_rate": 1e-05,
|
| 101 |
+
"weight_decay": 0.1,
|
| 102 |
+
"adam_beta1": 0.9,
|
| 103 |
+
"adam_beta2": 0.95,
|
| 104 |
+
"adam_epsilon": 1e-08,
|
| 105 |
+
"max_grad_norm": 1.0,
|
| 106 |
+
"num_train_epochs": 3.0,
|
| 107 |
+
"max_steps": -1,
|
| 108 |
+
"lr_scheduler_type": "cosine",
|
| 109 |
+
"lr_scheduler_kwargs": null,
|
| 110 |
+
"warmup_ratio": 0.05,
|
| 111 |
+
"warmup_steps": 0,
|
| 112 |
+
"log_level": "passive",
|
| 113 |
+
"log_level_replica": "warning",
|
| 114 |
+
"log_on_each_node": true,
|
| 115 |
+
"logging_dir": "/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250/runs",
|
| 116 |
+
"logging_strategy": "steps",
|
| 117 |
+
"logging_first_step": true,
|
| 118 |
+
"logging_steps": 1,
|
| 119 |
+
"logging_nan_inf_filter": true,
|
| 120 |
+
"save_strategy": "steps",
|
| 121 |
+
"save_steps": 5.0,
|
| 122 |
+
"save_total_limit": 5,
|
| 123 |
+
"save_safetensors": true,
|
| 124 |
+
"save_on_each_node": false,
|
| 125 |
+
"save_only_model": true,
|
| 126 |
+
"restore_callback_states_from_checkpoint": false,
|
| 127 |
+
"no_cuda": false,
|
| 128 |
+
"use_cpu": false,
|
| 129 |
+
"use_mps_device": false,
|
| 130 |
+
"jit_mode_eval": false,
|
| 131 |
+
"use_ipex": false,
|
| 132 |
+
"bf16": true,
|
| 133 |
+
"fp16": false,
|
| 134 |
+
"fp16_opt_level": "O1",
|
| 135 |
+
"half_precision_backend": "auto",
|
| 136 |
+
"bf16_full_eval": false,
|
| 137 |
+
"fp16_full_eval": false,
|
| 138 |
+
"tf32": null,
|
| 139 |
+
"local_rank": 0,
|
| 140 |
+
"ddp_backend": null,
|
| 141 |
+
"tpu_num_cores": null,
|
| 142 |
+
"tpu_metrics_debug": false,
|
| 143 |
+
"debug": null,
|
| 144 |
+
"dataloader_drop_last": false,
|
| 145 |
+
"eval_steps": 5.0,
|
| 146 |
+
"dataloader_num_workers": 1,
|
| 147 |
+
"dataloader_prefetch_factor": null,
|
| 148 |
+
"past_index": -1,
|
| 149 |
+
"run_name": "construct",
|
| 150 |
+
"disable_tqdm": null,
|
| 151 |
+
"label_names": null,
|
| 152 |
+
"load_best_model_at_end": false,
|
| 153 |
+
"metric_for_best_model": "loss",
|
| 154 |
+
"greater_is_better": false,
|
| 155 |
+
"ignore_data_skip": false,
|
| 156 |
+
"fsdp": "",
|
| 157 |
+
"fsdp_min_num_params": 0,
|
| 158 |
+
"fsdp_config": null,
|
| 159 |
+
"tp_size": 0,
|
| 160 |
+
"fsdp_transformer_layer_cls_to_wrap": null,
|
| 161 |
+
"accelerator_config": {
|
| 162 |
+
"dispatch_batches": false
|
| 163 |
+
},
|
| 164 |
+
"deepspeed": {
|
| 165 |
+
"fp16": {
|
| 166 |
+
"enabled": "auto",
|
| 167 |
+
"loss_scale": 0,
|
| 168 |
+
"loss_scale_window": 1000,
|
| 169 |
+
"initial_scale_power": 16,
|
| 170 |
+
"hysteresis": 2,
|
| 171 |
+
"min_loss_scale": 1
|
| 172 |
+
},
|
| 173 |
+
"bf16": {
|
| 174 |
+
"enabled": "auto"
|
| 175 |
+
},
|
| 176 |
+
"zero_optimization": {
|
| 177 |
+
"stage": 3,
|
| 178 |
+
"offload_optimizer": {
|
| 179 |
+
"device": "none",
|
| 180 |
+
"pin_memory": true
|
| 181 |
+
},
|
| 182 |
+
"offload_param": {
|
| 183 |
+
"device": "none",
|
| 184 |
+
"pin_memory": true
|
| 185 |
+
},
|
| 186 |
+
"overlap_comm": false,
|
| 187 |
+
"contiguous_gradients": true,
|
| 188 |
+
"sub_group_size": 1000000000.0,
|
| 189 |
+
"reduce_bucket_size": "auto",
|
| 190 |
+
"zero_quantized_weights": false,
|
| 191 |
+
"zero_quantized_gradients": false,
|
| 192 |
+
"stage3_prefetch_bucket_size": "auto",
|
| 193 |
+
"stage3_param_persistence_threshold": "auto",
|
| 194 |
+
"stage3_max_live_parameters": 1000000000.0,
|
| 195 |
+
"stage3_max_reuse_distance": 1000000000.0,
|
| 196 |
+
"stage3_gather_16bit_weights_on_model_save": true
|
| 197 |
+
},
|
| 198 |
+
"gradient_accumulation_steps": "auto",
|
| 199 |
+
"gradient_clipping": "auto",
|
| 200 |
+
"steps_per_print": 2000,
|
| 201 |
+
"train_batch_size": "auto",
|
| 202 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 203 |
+
"wall_clock_breakdown": false
|
| 204 |
+
},
|
| 205 |
+
"label_smoothing_factor": 0.0,
|
| 206 |
+
"optim": "adamw_torch",
|
| 207 |
+
"optim_args": null,
|
| 208 |
+
"adafactor": false,
|
| 209 |
+
"group_by_length": false,
|
| 210 |
+
"length_column_name": "length",
|
| 211 |
+
"report_to": [
|
| 212 |
+
"tensorboard"
|
| 213 |
+
],
|
| 214 |
+
"ddp_find_unused_parameters": null,
|
| 215 |
+
"ddp_bucket_cap_mb": null,
|
| 216 |
+
"ddp_broadcast_buffers": null,
|
| 217 |
+
"dataloader_pin_memory": true,
|
| 218 |
+
"dataloader_persistent_workers": false,
|
| 219 |
+
"skip_memory_metrics": true,
|
| 220 |
+
"use_legacy_prediction_loop": false,
|
| 221 |
+
"push_to_hub": false,
|
| 222 |
+
"resume_from_checkpoint": null,
|
| 223 |
+
"hub_model_id": null,
|
| 224 |
+
"hub_strategy": "every_save",
|
| 225 |
+
"hub_private_repo": null,
|
| 226 |
+
"hub_always_push": false,
|
| 227 |
+
"gradient_checkpointing": true,
|
| 228 |
+
"gradient_checkpointing_kwargs": null,
|
| 229 |
+
"include_inputs_for_metrics": false,
|
| 230 |
+
"include_for_metrics": [],
|
| 231 |
+
"eval_do_concat_batches": true,
|
| 232 |
+
"fp16_backend": "auto",
|
| 233 |
+
"push_to_hub_model_id": null,
|
| 234 |
+
"push_to_hub_organization": null,
|
| 235 |
+
"push_to_hub_token": null,
|
| 236 |
+
"mp_parameters": "",
|
| 237 |
+
"auto_find_batch_size": false,
|
| 238 |
+
"full_determinism": false,
|
| 239 |
+
"torchdynamo": null,
|
| 240 |
+
"ray_scope": "last",
|
| 241 |
+
"ddp_timeout": 1800,
|
| 242 |
+
"torch_compile": false,
|
| 243 |
+
"torch_compile_backend": null,
|
| 244 |
+
"torch_compile_mode": null,
|
| 245 |
+
"include_tokens_per_second": false,
|
| 246 |
+
"include_num_input_tokens_seen": false,
|
| 247 |
+
"neftune_noise_alpha": null,
|
| 248 |
+
"optim_target_modules": null,
|
| 249 |
+
"batch_eval_metrics": false,
|
| 250 |
+
"eval_on_start": false,
|
| 251 |
+
"use_liger_kernel": false,
|
| 252 |
+
"eval_use_gather_object": false,
|
| 253 |
+
"average_tokens_across_devices": false,
|
| 254 |
+
"sortish_sampler": false,
|
| 255 |
+
"predict_with_generate": false,
|
| 256 |
+
"generation_max_length": null,
|
| 257 |
+
"generation_num_beams": null,
|
| 258 |
+
"generation_config": null,
|
| 259 |
+
"check_model": true,
|
| 260 |
+
"acc_strategy": "token",
|
| 261 |
+
"train_dataloader_shuffle": true,
|
| 262 |
+
"metric_warmup_step": 0,
|
| 263 |
+
"fsdp_num": 1,
|
| 264 |
+
"acc_steps": 1,
|
| 265 |
+
"eval_use_evalscope": false,
|
| 266 |
+
"eval_datasets": [],
|
| 267 |
+
"eval_limit": null,
|
| 268 |
+
"eval_datasets_args": null,
|
| 269 |
+
"eval_generation_config": null,
|
| 270 |
+
"freeze_parameters": [],
|
| 271 |
+
"freeze_parameters_ratio": 0.0,
|
| 272 |
+
"trainable_parameters": [],
|
| 273 |
+
"freeze_llm": false,
|
| 274 |
+
"freeze_vit": true,
|
| 275 |
+
"freeze_aligner": true,
|
| 276 |
+
"target_modules": [
|
| 277 |
+
"all-linear"
|
| 278 |
+
],
|
| 279 |
+
"target_regex": null,
|
| 280 |
+
"modules_to_save": [],
|
| 281 |
+
"lora_rank": 8,
|
| 282 |
+
"lora_alpha": 32,
|
| 283 |
+
"lora_dropout": 0.05,
|
| 284 |
+
"lora_bias": "none",
|
| 285 |
+
"lora_dtype": null,
|
| 286 |
+
"lorap_lr_ratio": null,
|
| 287 |
+
"use_rslora": false,
|
| 288 |
+
"use_dora": false,
|
| 289 |
+
"lora_ga_batch_size": 2,
|
| 290 |
+
"lora_ga_iters": 2,
|
| 291 |
+
"lora_ga_max_length": 1024,
|
| 292 |
+
"lora_ga_direction": "ArB2r",
|
| 293 |
+
"lora_ga_scale": "stable",
|
| 294 |
+
"lora_ga_stable_gamma": 16,
|
| 295 |
+
"init_weights": true,
|
| 296 |
+
"fourier_n_frequency": 2000,
|
| 297 |
+
"fourier_scaling": 300.0,
|
| 298 |
+
"boft_block_size": 4,
|
| 299 |
+
"boft_block_num": 0,
|
| 300 |
+
"boft_n_butterfly_factor": 1,
|
| 301 |
+
"boft_dropout": 0.0,
|
| 302 |
+
"vera_rank": 256,
|
| 303 |
+
"vera_projection_prng_key": 0,
|
| 304 |
+
"vera_dropout": 0.0,
|
| 305 |
+
"vera_d_initial": 0.1,
|
| 306 |
+
"adapter_act": "gelu",
|
| 307 |
+
"adapter_length": 128,
|
| 308 |
+
"use_galore": false,
|
| 309 |
+
"galore_target_modules": null,
|
| 310 |
+
"galore_rank": 128,
|
| 311 |
+
"galore_update_proj_gap": 50,
|
| 312 |
+
"galore_scale": 1.0,
|
| 313 |
+
"galore_proj_type": "std",
|
| 314 |
+
"galore_optim_per_parameter": false,
|
| 315 |
+
"galore_with_embedding": false,
|
| 316 |
+
"galore_quantization": false,
|
| 317 |
+
"galore_proj_quant": false,
|
| 318 |
+
"galore_proj_bits": 4,
|
| 319 |
+
"galore_proj_group_size": 256,
|
| 320 |
+
"galore_cos_threshold": 0.4,
|
| 321 |
+
"galore_gamma_proj": 2,
|
| 322 |
+
"galore_queue_size": 5,
|
| 323 |
+
"adalora_target_r": 8,
|
| 324 |
+
"adalora_init_r": 12,
|
| 325 |
+
"adalora_tinit": 0,
|
| 326 |
+
"adalora_tfinal": 0,
|
| 327 |
+
"adalora_deltaT": 1,
|
| 328 |
+
"adalora_beta1": 0.85,
|
| 329 |
+
"adalora_beta2": 0.85,
|
| 330 |
+
"adalora_orth_reg_weight": 0.5,
|
| 331 |
+
"llamapro_num_new_blocks": 4,
|
| 332 |
+
"llamapro_num_groups": null,
|
| 333 |
+
"lisa_activated_layers": 0,
|
| 334 |
+
"lisa_step_interval": 20,
|
| 335 |
+
"reft_layer_key": null,
|
| 336 |
+
"reft_layers": null,
|
| 337 |
+
"reft_rank": 4,
|
| 338 |
+
"reft_intervention_type": "LoreftIntervention",
|
| 339 |
+
"reft_args": null,
|
| 340 |
+
"swanlab_token": null,
|
| 341 |
+
"swanlab_project": null,
|
| 342 |
+
"swanlab_workspace": null,
|
| 343 |
+
"swanlab_exp_name": null,
|
| 344 |
+
"swanlab_mode": "cloud",
|
| 345 |
+
"add_version": true,
|
| 346 |
+
"resume_only_model": false,
|
| 347 |
+
"create_checkpoint_symlink": false,
|
| 348 |
+
"packing": false,
|
| 349 |
+
"lazy_tokenize": false,
|
| 350 |
+
"loss_type": null,
|
| 351 |
+
"optimizer": null,
|
| 352 |
+
"metric": null,
|
| 353 |
+
"zero_hpz_partition_size": null,
|
| 354 |
+
"rank": 0,
|
| 355 |
+
"global_world_size": 8,
|
| 356 |
+
"local_world_size": 8,
|
| 357 |
+
"model_suffix": "checkpoint-50-merged",
|
| 358 |
+
"model_info": "ModelInfo(model_type='qwen2_5', model_dir='/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)",
|
| 359 |
+
"model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', 
hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', 
model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', 
hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, 
hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding'])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f22ea5f5e10>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=['transformers>=4.37'], tags=[])",
|
| 360 |
+
"model_dir": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
|
| 361 |
+
"hub": "<class 'swift.hub.hub.MSHub'>",
|
| 362 |
+
"evaluation_strategy": "steps",
|
| 363 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=5, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=True, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=5, dataloader_num_workers=1, dataloader_prefetch_factor=10, past_index=-1, run_name='/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], 
fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None)"
|
| 364 |
+
}
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen2ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_dropout": 0.0,
|
| 6 |
+
"bos_token_id": 151643,
|
| 7 |
+
"eos_token_id": 151645,
|
| 8 |
+
"hidden_act": "silu",
|
| 9 |
+
"hidden_size": 3584,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 18944,
|
| 12 |
+
"max_position_embeddings": 32768,
|
| 13 |
+
"max_window_layers": 28,
|
| 14 |
+
"model_type": "qwen2",
|
| 15 |
+
"num_attention_heads": 28,
|
| 16 |
+
"num_hidden_layers": 28,
|
| 17 |
+
"num_key_value_heads": 4,
|
| 18 |
+
"pad_token_id": 151643,
|
| 19 |
+
"rms_norm_eps": 1e-06,
|
| 20 |
+
"rope_scaling": null,
|
| 21 |
+
"rope_theta": 1000000.0,
|
| 22 |
+
"sliding_window": 131072,
|
| 23 |
+
"tie_word_embeddings": false,
|
| 24 |
+
"torch_dtype": "bfloat16",
|
| 25 |
+
"transformers_version": "4.51.3",
|
| 26 |
+
"use_cache": false,
|
| 27 |
+
"use_sliding_window": false,
|
| 28 |
+
"vocab_size": 152064
|
| 29 |
+
}
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/generation_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 151643,
|
| 3 |
+
"do_sample": true,
|
| 4 |
+
"eos_token_id": [
|
| 5 |
+
151645,
|
| 6 |
+
151643
|
| 7 |
+
],
|
| 8 |
+
"pad_token_id": 151643,
|
| 9 |
+
"repetition_penalty": 1.05,
|
| 10 |
+
"temperature": 0.7,
|
| 11 |
+
"top_k": 20,
|
| 12 |
+
"top_p": 0.8,
|
| 13 |
+
"transformers_version": "4.51.3"
|
| 14 |
+
}
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/model.safetensors.index.json
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_size": 15231233024
|
| 4 |
+
},
|
| 5 |
+
"weight_map": {
|
| 6 |
+
"lm_head.weight": "model-00004-of-00004.safetensors",
|
| 7 |
+
"model.embed_tokens.weight": "model-00001-of-00004.safetensors",
|
| 8 |
+
"model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 9 |
+
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 10 |
+
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 11 |
+
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 12 |
+
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 13 |
+
"model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 14 |
+
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 15 |
+
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 16 |
+
"model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 17 |
+
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 18 |
+
"model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 19 |
+
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 20 |
+
"model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 21 |
+
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 22 |
+
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 23 |
+
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 24 |
+
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 25 |
+
"model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 26 |
+
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 27 |
+
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 28 |
+
"model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 29 |
+
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 30 |
+
"model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 31 |
+
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 32 |
+
"model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 33 |
+
"model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 34 |
+
"model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 35 |
+
"model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 36 |
+
"model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 37 |
+
"model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 38 |
+
"model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 39 |
+
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 40 |
+
"model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 41 |
+
"model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 42 |
+
"model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 43 |
+
"model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 44 |
+
"model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 45 |
+
"model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 46 |
+
"model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 47 |
+
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 48 |
+
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 49 |
+
"model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 50 |
+
"model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 51 |
+
"model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 52 |
+
"model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 53 |
+
"model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 54 |
+
"model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 55 |
+
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 56 |
+
"model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 57 |
+
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 58 |
+
"model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 59 |
+
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 60 |
+
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 61 |
+
"model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 62 |
+
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 63 |
+
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 64 |
+
"model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 65 |
+
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 66 |
+
"model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 67 |
+
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 68 |
+
"model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 69 |
+
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 70 |
+
"model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 71 |
+
"model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 72 |
+
"model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 73 |
+
"model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 74 |
+
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 75 |
+
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 76 |
+
"model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 77 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 78 |
+
"model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 79 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 80 |
+
"model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 81 |
+
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 82 |
+
"model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 83 |
+
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 84 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 85 |
+
"model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 86 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 87 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 88 |
+
"model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 89 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 90 |
+
"model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 91 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 92 |
+
"model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 93 |
+
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 94 |
+
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 95 |
+
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 96 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 97 |
+
"model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 98 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 99 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 100 |
+
"model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 101 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 102 |
+
"model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 103 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 104 |
+
"model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 105 |
+
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 106 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 107 |
+
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 108 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 109 |
+
"model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 110 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 111 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 112 |
+
"model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 113 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 114 |
+
"model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 115 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 116 |
+
"model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 117 |
+
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 118 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 119 |
+
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 120 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 121 |
+
"model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 122 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 123 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 124 |
+
"model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 125 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 126 |
+
"model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 127 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 128 |
+
"model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 129 |
+
"model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 130 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 131 |
+
"model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 132 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 133 |
+
"model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 134 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 135 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 136 |
+
"model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 137 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 138 |
+
"model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 139 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 140 |
+
"model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 141 |
+
"model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 142 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 143 |
+
"model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 144 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 145 |
+
"model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 146 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 147 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 148 |
+
"model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 149 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 150 |
+
"model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 151 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 152 |
+
"model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 153 |
+
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 154 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 155 |
+
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 156 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 157 |
+
"model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 158 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 159 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 160 |
+
"model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 161 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 162 |
+
"model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 163 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 164 |
+
"model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 165 |
+
"model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 166 |
+
"model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 167 |
+
"model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 168 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 169 |
+
"model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 170 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 171 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 172 |
+
"model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 173 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 174 |
+
"model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 175 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 176 |
+
"model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 177 |
+
"model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 178 |
+
"model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 179 |
+
"model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 180 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 181 |
+
"model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 182 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 183 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 184 |
+
"model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 185 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 186 |
+
"model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 187 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 188 |
+
"model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 189 |
+
"model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 190 |
+
"model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 191 |
+
"model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 192 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 193 |
+
"model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 194 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 195 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 196 |
+
"model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 197 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 198 |
+
"model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 199 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 200 |
+
"model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 201 |
+
"model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 202 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 203 |
+
"model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 204 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 205 |
+
"model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 206 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 207 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 208 |
+
"model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 209 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 210 |
+
"model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 211 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 212 |
+
"model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 213 |
+
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 214 |
+
"model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 215 |
+
"model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 216 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 217 |
+
"model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 218 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 219 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 220 |
+
"model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 221 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 222 |
+
"model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 223 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 224 |
+
"model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 225 |
+
"model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 226 |
+
"model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 227 |
+
"model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 228 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 229 |
+
"model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 230 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 231 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 232 |
+
"model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 233 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 234 |
+
"model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 235 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 236 |
+
"model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 237 |
+
"model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 238 |
+
"model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 239 |
+
"model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 240 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 241 |
+
"model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 242 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 243 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 244 |
+
"model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 245 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 246 |
+
"model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 247 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 248 |
+
"model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 249 |
+
"model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
| 250 |
+
"model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
| 251 |
+
"model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
| 252 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
| 253 |
+
"model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
|
| 254 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
| 255 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
| 256 |
+
"model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
|
| 257 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
| 258 |
+
"model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
|
| 259 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
| 260 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 261 |
+
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 262 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 263 |
+
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 264 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 265 |
+
"model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 266 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 267 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 268 |
+
"model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 269 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 270 |
+
"model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 271 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 272 |
+
"model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 273 |
+
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 274 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 275 |
+
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 276 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 277 |
+
"model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 278 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 279 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 280 |
+
"model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 281 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 282 |
+
"model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 283 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 284 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 285 |
+
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 286 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 287 |
+
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 288 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 289 |
+
"model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 290 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 291 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 292 |
+
"model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 293 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 294 |
+
"model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 295 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 296 |
+
"model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 297 |
+
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 298 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 299 |
+
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 300 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 301 |
+
"model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 302 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 303 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 304 |
+
"model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 305 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 306 |
+
"model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 307 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 308 |
+
"model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 309 |
+
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
| 310 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
| 311 |
+
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
| 312 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
| 313 |
+
"model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 314 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 315 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 316 |
+
"model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 317 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 318 |
+
"model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 319 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 320 |
+
"model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 321 |
+
"model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 322 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 323 |
+
"model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 324 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 325 |
+
"model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
|
| 326 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
| 327 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
| 328 |
+
"model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
|
| 329 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
| 330 |
+
"model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
|
| 331 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
| 332 |
+
"model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 333 |
+
"model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
| 334 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
| 335 |
+
"model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
| 336 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
| 337 |
+
"model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
|
| 338 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
| 339 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
| 340 |
+
"model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
|
| 341 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
| 342 |
+
"model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
|
| 343 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
| 344 |
+
"model.norm.weight": "model-00003-of-00004.safetensors"
|
| 345 |
+
}
|
| 346 |
+
}
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/special_tokens_map.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
+
}
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/tokenizer_config.json
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
"additional_special_tokens": [
|
| 183 |
+
"<|im_start|>",
|
| 184 |
+
"<|im_end|>",
|
| 185 |
+
"<|object_ref_start|>",
|
| 186 |
+
"<|object_ref_end|>",
|
| 187 |
+
"<|box_start|>",
|
| 188 |
+
"<|box_end|>",
|
| 189 |
+
"<|quad_start|>",
|
| 190 |
+
"<|quad_end|>",
|
| 191 |
+
"<|vision_start|>",
|
| 192 |
+
"<|vision_end|>",
|
| 193 |
+
"<|vision_pad|>",
|
| 194 |
+
"<|image_pad|>",
|
| 195 |
+
"<|video_pad|>"
|
| 196 |
+
],
|
| 197 |
+
"bos_token": null,
|
| 198 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
|
| 199 |
+
"clean_up_tokenization_spaces": false,
|
| 200 |
+
"eos_token": "<|im_end|>",
|
| 201 |
+
"errors": "replace",
|
| 202 |
+
"extra_special_tokens": {},
|
| 203 |
+
"model_max_length": 131072,
|
| 204 |
+
"pad_token": "<|endoftext|>",
|
| 205 |
+
"split_special_tokens": false,
|
| 206 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 207 |
+
"unk_token": null
|
| 208 |
+
}
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: /oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged
|
| 3 |
+
library_name: peft
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# Model Card for Model ID
|
| 7 |
+
|
| 8 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## Model Details
|
| 13 |
+
|
| 14 |
+
### Model Description
|
| 15 |
+
|
| 16 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
- **Developed by:** [More Information Needed]
|
| 21 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 22 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 23 |
+
- **Model type:** [More Information Needed]
|
| 24 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 25 |
+
- **License:** [More Information Needed]
|
| 26 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 27 |
+
|
| 28 |
+
### Model Sources [optional]
|
| 29 |
+
|
| 30 |
+
<!-- Provide the basic links for the model. -->
|
| 31 |
+
|
| 32 |
+
- **Repository:** [More Information Needed]
|
| 33 |
+
- **Paper [optional]:** [More Information Needed]
|
| 34 |
+
- **Demo [optional]:** [More Information Needed]
|
| 35 |
+
|
| 36 |
+
## Uses
|
| 37 |
+
|
| 38 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 39 |
+
|
| 40 |
+
### Direct Use
|
| 41 |
+
|
| 42 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 43 |
+
|
| 44 |
+
[More Information Needed]
|
| 45 |
+
|
| 46 |
+
### Downstream Use [optional]
|
| 47 |
+
|
| 48 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 49 |
+
|
| 50 |
+
[More Information Needed]
|
| 51 |
+
|
| 52 |
+
### Out-of-Scope Use
|
| 53 |
+
|
| 54 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 55 |
+
|
| 56 |
+
[More Information Needed]
|
| 57 |
+
|
| 58 |
+
## Bias, Risks, and Limitations
|
| 59 |
+
|
| 60 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 61 |
+
|
| 62 |
+
[More Information Needed]
|
| 63 |
+
|
| 64 |
+
### Recommendations
|
| 65 |
+
|
| 66 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 67 |
+
|
| 68 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 69 |
+
|
| 70 |
+
## How to Get Started with the Model
|
| 71 |
+
|
| 72 |
+
Use the code below to get started with the model.
|
| 73 |
+
|
| 74 |
+
[More Information Needed]
|
| 75 |
+
|
| 76 |
+
## Training Details
|
| 77 |
+
|
| 78 |
+
### Training Data
|
| 79 |
+
|
| 80 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 81 |
+
|
| 82 |
+
[More Information Needed]
|
| 83 |
+
|
| 84 |
+
### Training Procedure
|
| 85 |
+
|
| 86 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 87 |
+
|
| 88 |
+
#### Preprocessing [optional]
|
| 89 |
+
|
| 90 |
+
[More Information Needed]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
#### Training Hyperparameters
|
| 94 |
+
|
| 95 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 96 |
+
|
| 97 |
+
#### Speeds, Sizes, Times [optional]
|
| 98 |
+
|
| 99 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 100 |
+
|
| 101 |
+
[More Information Needed]
|
| 102 |
+
|
| 103 |
+
## Evaluation
|
| 104 |
+
|
| 105 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 106 |
+
|
| 107 |
+
### Testing Data, Factors & Metrics
|
| 108 |
+
|
| 109 |
+
#### Testing Data
|
| 110 |
+
|
| 111 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 112 |
+
|
| 113 |
+
[More Information Needed]
|
| 114 |
+
|
| 115 |
+
#### Factors
|
| 116 |
+
|
| 117 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 118 |
+
|
| 119 |
+
[More Information Needed]
|
| 120 |
+
|
| 121 |
+
#### Metrics
|
| 122 |
+
|
| 123 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 124 |
+
|
| 125 |
+
[More Information Needed]
|
| 126 |
+
|
| 127 |
+
### Results
|
| 128 |
+
|
| 129 |
+
[More Information Needed]
|
| 130 |
+
|
| 131 |
+
#### Summary
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
## Model Examination [optional]
|
| 136 |
+
|
| 137 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 138 |
+
|
| 139 |
+
[More Information Needed]
|
| 140 |
+
|
| 141 |
+
## Environmental Impact
|
| 142 |
+
|
| 143 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 144 |
+
|
| 145 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 146 |
+
|
| 147 |
+
- **Hardware Type:** [More Information Needed]
|
| 148 |
+
- **Hours used:** [More Information Needed]
|
| 149 |
+
- **Cloud Provider:** [More Information Needed]
|
| 150 |
+
- **Compute Region:** [More Information Needed]
|
| 151 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 152 |
+
|
| 153 |
+
## Technical Specifications [optional]
|
| 154 |
+
|
| 155 |
+
### Model Architecture and Objective
|
| 156 |
+
|
| 157 |
+
[More Information Needed]
|
| 158 |
+
|
| 159 |
+
### Compute Infrastructure
|
| 160 |
+
|
| 161 |
+
[More Information Needed]
|
| 162 |
+
|
| 163 |
+
#### Hardware
|
| 164 |
+
|
| 165 |
+
[More Information Needed]
|
| 166 |
+
|
| 167 |
+
#### Software
|
| 168 |
+
|
| 169 |
+
[More Information Needed]
|
| 170 |
+
|
| 171 |
+
## Citation [optional]
|
| 172 |
+
|
| 173 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 174 |
+
|
| 175 |
+
**BibTeX:**
|
| 176 |
+
|
| 177 |
+
[More Information Needed]
|
| 178 |
+
|
| 179 |
+
**APA:**
|
| 180 |
+
|
| 181 |
+
[More Information Needed]
|
| 182 |
+
|
| 183 |
+
## Glossary [optional]
|
| 184 |
+
|
| 185 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 186 |
+
|
| 187 |
+
[More Information Needed]
|
| 188 |
+
|
| 189 |
+
## More Information [optional]
|
| 190 |
+
|
| 191 |
+
[More Information Needed]
|
| 192 |
+
|
| 193 |
+
## Model Card Authors [optional]
|
| 194 |
+
|
| 195 |
+
[More Information Needed]
|
| 196 |
+
|
| 197 |
+
## Model Card Contact
|
| 198 |
+
|
| 199 |
+
[More Information Needed]
|
| 200 |
+
### Framework versions
|
| 201 |
+
|
| 202 |
+
- PEFT 0.15.2
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/adapter_config.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": false,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 32,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.05,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": [],
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"r": 8,
|
| 24 |
+
"rank_pattern": {},
|
| 25 |
+
"revision": null,
|
| 26 |
+
"target_modules": [
|
| 27 |
+
"v_proj",
|
| 28 |
+
"down_proj",
|
| 29 |
+
"k_proj",
|
| 30 |
+
"up_proj",
|
| 31 |
+
"gate_proj",
|
| 32 |
+
"q_proj",
|
| 33 |
+
"o_proj"
|
| 34 |
+
],
|
| 35 |
+
"task_type": "CAUSAL_LM",
|
| 36 |
+
"trainable_token_indices": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_rslora": false
|
| 39 |
+
}
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/additional_config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
|
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/args.json
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
|
| 3 |
+
"model_type": "qwen2_5",
|
| 4 |
+
"model_revision": null,
|
| 5 |
+
"task_type": "causal_lm",
|
| 6 |
+
"torch_dtype": "bfloat16",
|
| 7 |
+
"attn_impl": null,
|
| 8 |
+
"num_labels": null,
|
| 9 |
+
"problem_type": null,
|
| 10 |
+
"rope_scaling": null,
|
| 11 |
+
"device_map": null,
|
| 12 |
+
"max_memory": {},
|
| 13 |
+
"local_repo_path": null,
|
| 14 |
+
"template": "qwen2_5",
|
| 15 |
+
"system": null,
|
| 16 |
+
"max_length": 8192,
|
| 17 |
+
"truncation_strategy": "delete",
|
| 18 |
+
"max_pixels": null,
|
| 19 |
+
"tools_prompt": "react_en",
|
| 20 |
+
"norm_bbox": null,
|
| 21 |
+
"response_prefix": null,
|
| 22 |
+
"padding_side": "right",
|
| 23 |
+
"loss_scale": "default",
|
| 24 |
+
"sequence_parallel_size": 1,
|
| 25 |
+
"use_chat_template": true,
|
| 26 |
+
"template_backend": "swift",
|
| 27 |
+
"dataset": [
|
| 28 |
+
"/nas/shared/kilab/wangyujia/material_production_train.jsonl"
|
| 29 |
+
],
|
| 30 |
+
"val_dataset": [],
|
| 31 |
+
"split_dataset_ratio": 0.01,
|
| 32 |
+
"data_seed": 42,
|
| 33 |
+
"dataset_num_proc": 128,
|
| 34 |
+
"dataset_shuffle": true,
|
| 35 |
+
"val_dataset_shuffle": false,
|
| 36 |
+
"streaming": false,
|
| 37 |
+
"interleave_prob": null,
|
| 38 |
+
"stopping_strategy": "first_exhausted",
|
| 39 |
+
"shuffle_buffer_size": 1000,
|
| 40 |
+
"enable_cache": false,
|
| 41 |
+
"download_mode": "reuse_dataset_if_exists",
|
| 42 |
+
"columns": {},
|
| 43 |
+
"strict": false,
|
| 44 |
+
"remove_unused_columns": true,
|
| 45 |
+
"model_name": [
|
| 46 |
+
"qwen_bio_sft_deeplocbinary-08022035"
|
| 47 |
+
],
|
| 48 |
+
"model_author": [
|
| 49 |
+
"swift"
|
| 50 |
+
],
|
| 51 |
+
"custom_dataset_info": [],
|
| 52 |
+
"quant_method": null,
|
| 53 |
+
"quant_bits": null,
|
| 54 |
+
"hqq_axis": null,
|
| 55 |
+
"bnb_4bit_compute_dtype": "bfloat16",
|
| 56 |
+
"bnb_4bit_quant_type": "nf4",
|
| 57 |
+
"bnb_4bit_use_double_quant": true,
|
| 58 |
+
"bnb_4bit_quant_storage": null,
|
| 59 |
+
"max_new_tokens": 64,
|
| 60 |
+
"temperature": 0.0,
|
| 61 |
+
"top_k": null,
|
| 62 |
+
"top_p": null,
|
| 63 |
+
"repetition_penalty": null,
|
| 64 |
+
"num_beams": 1,
|
| 65 |
+
"stream": false,
|
| 66 |
+
"stop_words": [],
|
| 67 |
+
"logprobs": false,
|
| 68 |
+
"top_logprobs": null,
|
| 69 |
+
"ckpt_dir": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
|
| 70 |
+
"load_dataset_config": null,
|
| 71 |
+
"lora_modules": [],
|
| 72 |
+
"tuner_backend": "peft",
|
| 73 |
+
"train_type": "lora",
|
| 74 |
+
"adapters": [],
|
| 75 |
+
"external_plugins": [],
|
| 76 |
+
"seed": 42,
|
| 77 |
+
"model_kwargs": {},
|
| 78 |
+
"load_args": false,
|
| 79 |
+
"load_data_args": false,
|
| 80 |
+
"use_hf": false,
|
| 81 |
+
"hub_token": null,
|
| 82 |
+
"custom_register_path": [],
|
| 83 |
+
"ignore_args_error": false,
|
| 84 |
+
"use_swift_lora": false,
|
| 85 |
+
"output_dir": "/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250",
|
| 86 |
+
"overwrite_output_dir": false,
|
| 87 |
+
"do_train": false,
|
| 88 |
+
"do_eval": false,
|
| 89 |
+
"do_predict": false,
|
| 90 |
+
"eval_strategy": "steps",
|
| 91 |
+
"prediction_loss_only": false,
|
| 92 |
+
"per_device_train_batch_size": 2,
|
| 93 |
+
"per_device_eval_batch_size": 2,
|
| 94 |
+
"per_gpu_train_batch_size": null,
|
| 95 |
+
"per_gpu_eval_batch_size": null,
|
| 96 |
+
"gradient_accumulation_steps": 4,
|
| 97 |
+
"eval_accumulation_steps": null,
|
| 98 |
+
"eval_delay": 0,
|
| 99 |
+
"torch_empty_cache_steps": null,
|
| 100 |
+
"learning_rate": 1e-05,
|
| 101 |
+
"weight_decay": 0.1,
|
| 102 |
+
"adam_beta1": 0.9,
|
| 103 |
+
"adam_beta2": 0.95,
|
| 104 |
+
"adam_epsilon": 1e-08,
|
| 105 |
+
"max_grad_norm": 1.0,
|
| 106 |
+
"num_train_epochs": 3.0,
|
| 107 |
+
"max_steps": -1,
|
| 108 |
+
"lr_scheduler_type": "cosine",
|
| 109 |
+
"lr_scheduler_kwargs": null,
|
| 110 |
+
"warmup_ratio": 0.05,
|
| 111 |
+
"warmup_steps": 0,
|
| 112 |
+
"log_level": "passive",
|
| 113 |
+
"log_level_replica": "warning",
|
| 114 |
+
"log_on_each_node": true,
|
| 115 |
+
"logging_dir": "/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250/runs",
|
| 116 |
+
"logging_strategy": "steps",
|
| 117 |
+
"logging_first_step": true,
|
| 118 |
+
"logging_steps": 1,
|
| 119 |
+
"logging_nan_inf_filter": true,
|
| 120 |
+
"save_strategy": "steps",
|
| 121 |
+
"save_steps": 5.0,
|
| 122 |
+
"save_total_limit": 5,
|
| 123 |
+
"save_safetensors": true,
|
| 124 |
+
"save_on_each_node": false,
|
| 125 |
+
"save_only_model": true,
|
| 126 |
+
"restore_callback_states_from_checkpoint": false,
|
| 127 |
+
"no_cuda": false,
|
| 128 |
+
"use_cpu": false,
|
| 129 |
+
"use_mps_device": false,
|
| 130 |
+
"jit_mode_eval": false,
|
| 131 |
+
"use_ipex": false,
|
| 132 |
+
"bf16": true,
|
| 133 |
+
"fp16": false,
|
| 134 |
+
"fp16_opt_level": "O1",
|
| 135 |
+
"half_precision_backend": "auto",
|
| 136 |
+
"bf16_full_eval": false,
|
| 137 |
+
"fp16_full_eval": false,
|
| 138 |
+
"tf32": null,
|
| 139 |
+
"local_rank": 0,
|
| 140 |
+
"ddp_backend": null,
|
| 141 |
+
"tpu_num_cores": null,
|
| 142 |
+
"tpu_metrics_debug": false,
|
| 143 |
+
"debug": null,
|
| 144 |
+
"dataloader_drop_last": false,
|
| 145 |
+
"eval_steps": 5.0,
|
| 146 |
+
"dataloader_num_workers": 1,
|
| 147 |
+
"dataloader_prefetch_factor": null,
|
| 148 |
+
"past_index": -1,
|
| 149 |
+
"run_name": "construct",
|
| 150 |
+
"disable_tqdm": null,
|
| 151 |
+
"label_names": null,
|
| 152 |
+
"load_best_model_at_end": false,
|
| 153 |
+
"metric_for_best_model": "loss",
|
| 154 |
+
"greater_is_better": false,
|
| 155 |
+
"ignore_data_skip": false,
|
| 156 |
+
"fsdp": "",
|
| 157 |
+
"fsdp_min_num_params": 0,
|
| 158 |
+
"fsdp_config": null,
|
| 159 |
+
"tp_size": 0,
|
| 160 |
+
"fsdp_transformer_layer_cls_to_wrap": null,
|
| 161 |
+
"accelerator_config": {
|
| 162 |
+
"dispatch_batches": false
|
| 163 |
+
},
|
| 164 |
+
"deepspeed": {
|
| 165 |
+
"fp16": {
|
| 166 |
+
"enabled": "auto",
|
| 167 |
+
"loss_scale": 0,
|
| 168 |
+
"loss_scale_window": 1000,
|
| 169 |
+
"initial_scale_power": 16,
|
| 170 |
+
"hysteresis": 2,
|
| 171 |
+
"min_loss_scale": 1
|
| 172 |
+
},
|
| 173 |
+
"bf16": {
|
| 174 |
+
"enabled": "auto"
|
| 175 |
+
},
|
| 176 |
+
"zero_optimization": {
|
| 177 |
+
"stage": 3,
|
| 178 |
+
"offload_optimizer": {
|
| 179 |
+
"device": "none",
|
| 180 |
+
"pin_memory": true
|
| 181 |
+
},
|
| 182 |
+
"offload_param": {
|
| 183 |
+
"device": "none",
|
| 184 |
+
"pin_memory": true
|
| 185 |
+
},
|
| 186 |
+
"overlap_comm": false,
|
| 187 |
+
"contiguous_gradients": true,
|
| 188 |
+
"sub_group_size": 1000000000.0,
|
| 189 |
+
"reduce_bucket_size": "auto",
|
| 190 |
+
"zero_quantized_weights": false,
|
| 191 |
+
"zero_quantized_gradients": false,
|
| 192 |
+
"stage3_prefetch_bucket_size": "auto",
|
| 193 |
+
"stage3_param_persistence_threshold": "auto",
|
| 194 |
+
"stage3_max_live_parameters": 1000000000.0,
|
| 195 |
+
"stage3_max_reuse_distance": 1000000000.0,
|
| 196 |
+
"stage3_gather_16bit_weights_on_model_save": true
|
| 197 |
+
},
|
| 198 |
+
"gradient_accumulation_steps": "auto",
|
| 199 |
+
"gradient_clipping": "auto",
|
| 200 |
+
"steps_per_print": 2000,
|
| 201 |
+
"train_batch_size": "auto",
|
| 202 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 203 |
+
"wall_clock_breakdown": false
|
| 204 |
+
},
|
| 205 |
+
"label_smoothing_factor": 0.0,
|
| 206 |
+
"optim": "adamw_torch",
|
| 207 |
+
"optim_args": null,
|
| 208 |
+
"adafactor": false,
|
| 209 |
+
"group_by_length": false,
|
| 210 |
+
"length_column_name": "length",
|
| 211 |
+
"report_to": [
|
| 212 |
+
"tensorboard"
|
| 213 |
+
],
|
| 214 |
+
"ddp_find_unused_parameters": null,
|
| 215 |
+
"ddp_bucket_cap_mb": null,
|
| 216 |
+
"ddp_broadcast_buffers": null,
|
| 217 |
+
"dataloader_pin_memory": true,
|
| 218 |
+
"dataloader_persistent_workers": false,
|
| 219 |
+
"skip_memory_metrics": true,
|
| 220 |
+
"use_legacy_prediction_loop": false,
|
| 221 |
+
"push_to_hub": false,
|
| 222 |
+
"resume_from_checkpoint": null,
|
| 223 |
+
"hub_model_id": null,
|
| 224 |
+
"hub_strategy": "every_save",
|
| 225 |
+
"hub_private_repo": null,
|
| 226 |
+
"hub_always_push": false,
|
| 227 |
+
"gradient_checkpointing": true,
|
| 228 |
+
"gradient_checkpointing_kwargs": null,
|
| 229 |
+
"include_inputs_for_metrics": false,
|
| 230 |
+
"include_for_metrics": [],
|
| 231 |
+
"eval_do_concat_batches": true,
|
| 232 |
+
"fp16_backend": "auto",
|
| 233 |
+
"push_to_hub_model_id": null,
|
| 234 |
+
"push_to_hub_organization": null,
|
| 235 |
+
"push_to_hub_token": null,
|
| 236 |
+
"mp_parameters": "",
|
| 237 |
+
"auto_find_batch_size": false,
|
| 238 |
+
"full_determinism": false,
|
| 239 |
+
"torchdynamo": null,
|
| 240 |
+
"ray_scope": "last",
|
| 241 |
+
"ddp_timeout": 1800,
|
| 242 |
+
"torch_compile": false,
|
| 243 |
+
"torch_compile_backend": null,
|
| 244 |
+
"torch_compile_mode": null,
|
| 245 |
+
"include_tokens_per_second": false,
|
| 246 |
+
"include_num_input_tokens_seen": false,
|
| 247 |
+
"neftune_noise_alpha": null,
|
| 248 |
+
"optim_target_modules": null,
|
| 249 |
+
"batch_eval_metrics": false,
|
| 250 |
+
"eval_on_start": false,
|
| 251 |
+
"use_liger_kernel": false,
|
| 252 |
+
"eval_use_gather_object": false,
|
| 253 |
+
"average_tokens_across_devices": false,
|
| 254 |
+
"sortish_sampler": false,
|
| 255 |
+
"predict_with_generate": false,
|
| 256 |
+
"generation_max_length": null,
|
| 257 |
+
"generation_num_beams": null,
|
| 258 |
+
"generation_config": null,
|
| 259 |
+
"check_model": true,
|
| 260 |
+
"acc_strategy": "token",
|
| 261 |
+
"train_dataloader_shuffle": true,
|
| 262 |
+
"metric_warmup_step": 0,
|
| 263 |
+
"fsdp_num": 1,
|
| 264 |
+
"acc_steps": 1,
|
| 265 |
+
"eval_use_evalscope": false,
|
| 266 |
+
"eval_datasets": [],
|
| 267 |
+
"eval_limit": null,
|
| 268 |
+
"eval_datasets_args": null,
|
| 269 |
+
"eval_generation_config": null,
|
| 270 |
+
"freeze_parameters": [],
|
| 271 |
+
"freeze_parameters_ratio": 0.0,
|
| 272 |
+
"trainable_parameters": [],
|
| 273 |
+
"freeze_llm": false,
|
| 274 |
+
"freeze_vit": true,
|
| 275 |
+
"freeze_aligner": true,
|
| 276 |
+
"target_modules": [
|
| 277 |
+
"all-linear"
|
| 278 |
+
],
|
| 279 |
+
"target_regex": null,
|
| 280 |
+
"modules_to_save": [],
|
| 281 |
+
"lora_rank": 8,
|
| 282 |
+
"lora_alpha": 32,
|
| 283 |
+
"lora_dropout": 0.05,
|
| 284 |
+
"lora_bias": "none",
|
| 285 |
+
"lora_dtype": null,
|
| 286 |
+
"lorap_lr_ratio": null,
|
| 287 |
+
"use_rslora": false,
|
| 288 |
+
"use_dora": false,
|
| 289 |
+
"lora_ga_batch_size": 2,
|
| 290 |
+
"lora_ga_iters": 2,
|
| 291 |
+
"lora_ga_max_length": 1024,
|
| 292 |
+
"lora_ga_direction": "ArB2r",
|
| 293 |
+
"lora_ga_scale": "stable",
|
| 294 |
+
"lora_ga_stable_gamma": 16,
|
| 295 |
+
"init_weights": true,
|
| 296 |
+
"fourier_n_frequency": 2000,
|
| 297 |
+
"fourier_scaling": 300.0,
|
| 298 |
+
"boft_block_size": 4,
|
| 299 |
+
"boft_block_num": 0,
|
| 300 |
+
"boft_n_butterfly_factor": 1,
|
| 301 |
+
"boft_dropout": 0.0,
|
| 302 |
+
"vera_rank": 256,
|
| 303 |
+
"vera_projection_prng_key": 0,
|
| 304 |
+
"vera_dropout": 0.0,
|
| 305 |
+
"vera_d_initial": 0.1,
|
| 306 |
+
"adapter_act": "gelu",
|
| 307 |
+
"adapter_length": 128,
|
| 308 |
+
"use_galore": false,
|
| 309 |
+
"galore_target_modules": null,
|
| 310 |
+
"galore_rank": 128,
|
| 311 |
+
"galore_update_proj_gap": 50,
|
| 312 |
+
"galore_scale": 1.0,
|
| 313 |
+
"galore_proj_type": "std",
|
| 314 |
+
"galore_optim_per_parameter": false,
|
| 315 |
+
"galore_with_embedding": false,
|
| 316 |
+
"galore_quantization": false,
|
| 317 |
+
"galore_proj_quant": false,
|
| 318 |
+
"galore_proj_bits": 4,
|
| 319 |
+
"galore_proj_group_size": 256,
|
| 320 |
+
"galore_cos_threshold": 0.4,
|
| 321 |
+
"galore_gamma_proj": 2,
|
| 322 |
+
"galore_queue_size": 5,
|
| 323 |
+
"adalora_target_r": 8,
|
| 324 |
+
"adalora_init_r": 12,
|
| 325 |
+
"adalora_tinit": 0,
|
| 326 |
+
"adalora_tfinal": 0,
|
| 327 |
+
"adalora_deltaT": 1,
|
| 328 |
+
"adalora_beta1": 0.85,
|
| 329 |
+
"adalora_beta2": 0.85,
|
| 330 |
+
"adalora_orth_reg_weight": 0.5,
|
| 331 |
+
"llamapro_num_new_blocks": 4,
|
| 332 |
+
"llamapro_num_groups": null,
|
| 333 |
+
"lisa_activated_layers": 0,
|
| 334 |
+
"lisa_step_interval": 20,
|
| 335 |
+
"reft_layer_key": null,
|
| 336 |
+
"reft_layers": null,
|
| 337 |
+
"reft_rank": 4,
|
| 338 |
+
"reft_intervention_type": "LoreftIntervention",
|
| 339 |
+
"reft_args": null,
|
| 340 |
+
"swanlab_token": null,
|
| 341 |
+
"swanlab_project": null,
|
| 342 |
+
"swanlab_workspace": null,
|
| 343 |
+
"swanlab_exp_name": null,
|
| 344 |
+
"swanlab_mode": "cloud",
|
| 345 |
+
"add_version": true,
|
| 346 |
+
"resume_only_model": false,
|
| 347 |
+
"create_checkpoint_symlink": false,
|
| 348 |
+
"packing": false,
|
| 349 |
+
"lazy_tokenize": false,
|
| 350 |
+
"loss_type": null,
|
| 351 |
+
"optimizer": null,
|
| 352 |
+
"metric": null,
|
| 353 |
+
"zero_hpz_partition_size": null,
|
| 354 |
+
"rank": 0,
|
| 355 |
+
"global_world_size": 8,
|
| 356 |
+
"local_world_size": 8,
|
| 357 |
+
"model_suffix": "checkpoint-50-merged",
|
| 358 |
+
"model_info": "ModelInfo(model_type='qwen2_5', model_dir='/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)",
|
| 359 |
+
"model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', 
hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', 
model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', 
hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, 
hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding'])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f22ea5f5e10>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=['transformers>=4.37'], tags=[])",
|
| 360 |
+
"model_dir": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
|
| 361 |
+
"hub": "<class 'swift.hub.hub.MSHub'>",
|
| 362 |
+
"evaluation_strategy": "steps",
|
| 363 |
+
"training_args": "Seq2SeqTrainingArguments(output_dir='/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=5, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=True, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=5, dataloader_num_workers=1, dataloader_prefetch_factor=10, past_index=-1, run_name='/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], 
fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, 
include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None)"
|
| 364 |
+
}
|
EasyR1-new/.gitignore
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
|
| 110 |
+
# pdm
|
| 111 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 112 |
+
#pdm.lock
|
| 113 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 114 |
+
# in version control.
|
| 115 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
| 116 |
+
.pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 121 |
+
__pypackages__/
|
| 122 |
+
|
| 123 |
+
# Celery stuff
|
| 124 |
+
celerybeat-schedule
|
| 125 |
+
celerybeat.pid
|
| 126 |
+
|
| 127 |
+
# SageMath parsed files
|
| 128 |
+
*.sage.py
|
| 129 |
+
|
| 130 |
+
# Environments
|
| 131 |
+
.env
|
| 132 |
+
.venv
|
| 133 |
+
env/
|
| 134 |
+
venv/
|
| 135 |
+
ENV/
|
| 136 |
+
env.bak/
|
| 137 |
+
venv.bak/
|
| 138 |
+
|
| 139 |
+
# Spyder project settings
|
| 140 |
+
.spyderproject
|
| 141 |
+
.spyproject
|
| 142 |
+
|
| 143 |
+
# Rope project settings
|
| 144 |
+
.ropeproject
|
| 145 |
+
|
| 146 |
+
# mkdocs documentation
|
| 147 |
+
/site
|
| 148 |
+
|
| 149 |
+
# mypy
|
| 150 |
+
.mypy_cache/
|
| 151 |
+
.dmypy.json
|
| 152 |
+
dmypy.json
|
| 153 |
+
|
| 154 |
+
# Pyre type checker
|
| 155 |
+
.pyre/
|
| 156 |
+
|
| 157 |
+
# pytype static type analyzer
|
| 158 |
+
.pytype/
|
| 159 |
+
|
| 160 |
+
# Cython debug symbols
|
| 161 |
+
cython_debug/
|
| 162 |
+
|
| 163 |
+
# PyCharm
|
| 164 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 165 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 166 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 167 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 168 |
+
#.idea/
|
| 169 |
+
|
| 170 |
+
# PyPI configuration file
|
| 171 |
+
.pypirc
|
| 172 |
+
|
| 173 |
+
# outputs
|
| 174 |
+
outputs/
|
| 175 |
+
checkpoints/
|
| 176 |
+
wandb/
|
| 177 |
+
tensorboard_log/
|
| 178 |
+
|
| 179 |
+
# data
|
| 180 |
+
images/
|
| 181 |
+
images*
|
EasyR1-new/.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
repos:
|
| 2 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
| 3 |
+
rev: v5.0.0
|
| 4 |
+
hooks:
|
| 5 |
+
- id: check-ast
|
| 6 |
+
- id: check-added-large-files
|
| 7 |
+
args: ['--maxkb=25000']
|
| 8 |
+
- id: check-merge-conflict
|
| 9 |
+
- id: check-yaml
|
| 10 |
+
- id: debug-statements
|
| 11 |
+
- id: end-of-file-fixer
|
| 12 |
+
- id: requirements-txt-fixer
|
| 13 |
+
- id: trailing-whitespace
|
| 14 |
+
args: [--markdown-linebreak-ext=md]
|
| 15 |
+
- id: no-commit-to-branch
|
| 16 |
+
args: ['--branch', 'main']
|
| 17 |
+
|
| 18 |
+
- repo: https://github.com/asottile/pyupgrade
|
| 19 |
+
rev: v3.17.0
|
| 20 |
+
hooks:
|
| 21 |
+
- id: pyupgrade
|
| 22 |
+
args: [--py38-plus]
|
EasyR1-new/Dockerfile
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
|
| 2 |
+
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
|
| 3 |
+
FROM nvcr.io/nvidia/pytorch:24.08-py3
|
| 4 |
+
|
| 5 |
+
# Define environments
|
| 6 |
+
ENV MAX_JOBS=32
|
| 7 |
+
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
| 8 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 9 |
+
ENV NODE_OPTIONS=""
|
| 10 |
+
ENV PIP_ROOT_USER_ACTION=ignore
|
| 11 |
+
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
| 12 |
+
|
| 13 |
+
# Define installation arguments
|
| 14 |
+
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
|
| 15 |
+
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
| 16 |
+
|
| 17 |
+
# Set apt source
|
| 18 |
+
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
|
| 19 |
+
{ \
|
| 20 |
+
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
|
| 21 |
+
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
|
| 22 |
+
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
|
| 23 |
+
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
|
| 24 |
+
} > /etc/apt/sources.list
|
| 25 |
+
|
| 26 |
+
# Install systemctl
|
| 27 |
+
RUN apt-get update && \
|
| 28 |
+
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
|
| 29 |
+
apt-get clean
|
| 30 |
+
|
| 31 |
+
# Install tini
|
| 32 |
+
RUN apt-get update && \
|
| 33 |
+
apt-get install -y tini && \
|
| 34 |
+
apt-get clean
|
| 35 |
+
|
| 36 |
+
# Change pip source
|
| 37 |
+
RUN pip config set global.index-url "${PIP_INDEX}" && \
|
| 38 |
+
pip config set global.extra-index-url "${PIP_INDEX}" && \
|
| 39 |
+
python -m pip install --upgrade pip
|
| 40 |
+
|
| 41 |
+
# Uninstall nv-pytorch fork
|
| 42 |
+
RUN pip uninstall -y torch torchvision torchaudio \
|
| 43 |
+
pytorch-quantization pytorch-triton torch-tensorrt \
|
| 44 |
+
transformer-engine flash-attn apex megatron-core \
|
| 45 |
+
xgboost opencv grpcio
|
| 46 |
+
|
| 47 |
+
# Fix cv2
|
| 48 |
+
RUN rm -rf /usr/local/lib/python3.10/dist-packages/cv2
|
| 49 |
+
|
| 50 |
+
# Install torch-2.7.0+cu126 + vllm-0.9.1
|
| 51 |
+
RUN pip install --no-cache-dir "vllm==0.9.1" "torch==2.7.0" "torchvision==0.22.0" "torchaudio==2.7.0" tensordict torchdata \
|
| 52 |
+
"transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
|
| 53 |
+
"numpy<2.0.0" "pyarrow>=15.0.0" "grpcio>=1.62.1" "optree>=0.13.0" pandas \
|
| 54 |
+
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb liger-kernel mathruler \
|
| 55 |
+
pytest yapf py-spy pyext pre-commit ruff
|
| 56 |
+
|
| 57 |
+
# Install flash-attn-2.8.0.post2
|
| 58 |
+
RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
|
| 59 |
+
URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
|
| 60 |
+
wget -nv -P /opt/tiger "${URL}" && \
|
| 61 |
+
pip install --no-cache-dir "/opt/tiger/$(basename ${URL})"
|
| 62 |
+
|
| 63 |
+
# Reset pip config
|
| 64 |
+
RUN pip config unset global.index-url && \
|
| 65 |
+
pip config unset global.extra-index-url
|
EasyR1-new/Dockerfile.legacy
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
|
| 2 |
+
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
|
| 3 |
+
FROM nvcr.io/nvidia/pytorch:24.08-py3
|
| 4 |
+
|
| 5 |
+
# Define environments
|
| 6 |
+
ENV MAX_JOBS=32
|
| 7 |
+
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
|
| 8 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 9 |
+
ENV NODE_OPTIONS=""
|
| 10 |
+
ENV PIP_ROOT_USER_ACTION=ignore
|
| 11 |
+
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
| 12 |
+
|
| 13 |
+
# Define installation arguments
|
| 14 |
+
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
|
| 15 |
+
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
| 16 |
+
|
| 17 |
+
# Set apt source
|
| 18 |
+
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
|
| 19 |
+
{ \
|
| 20 |
+
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
|
| 21 |
+
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
|
| 22 |
+
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
|
| 23 |
+
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
|
| 24 |
+
} > /etc/apt/sources.list
|
| 25 |
+
|
| 26 |
+
# Install systemctl
|
| 27 |
+
RUN apt-get update && \
|
| 28 |
+
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
|
| 29 |
+
apt-get clean
|
| 30 |
+
|
| 31 |
+
# Install tini
|
| 32 |
+
RUN apt-get update && \
|
| 33 |
+
apt-get install -y tini && \
|
| 34 |
+
apt-get clean
|
| 35 |
+
|
| 36 |
+
# Change pip source
|
| 37 |
+
RUN pip config set global.index-url "${PIP_INDEX}" && \
|
| 38 |
+
pip config set global.extra-index-url "${PIP_INDEX}" && \
|
| 39 |
+
python -m pip install --upgrade pip
|
| 40 |
+
|
| 41 |
+
# Uninstall nv-pytorch fork
|
| 42 |
+
RUN pip uninstall -y torch torchvision torchaudio \
|
| 43 |
+
pytorch-quantization pytorch-triton torch-tensorrt \
|
| 44 |
+
transformer_engine flash_attn apex megatron-core \
|
| 45 |
+
xgboost opencv grpcio
|
| 46 |
+
|
| 47 |
+
# Fix cv2
|
| 48 |
+
RUN rm -rf /usr/local/lib/python3.10/dist-packages/cv2
|
| 49 |
+
|
| 50 |
+
# Install torch-2.6.0+cu124 + vllm-0.8.4
|
| 51 |
+
# torch-2.6.0+cu124: cxx11abi=False
|
| 52 |
+
# torch-2.6.0+cu126: cxx11abi=True
|
| 53 |
+
# see https://github.com/flashinfer-ai/flashinfer/issues/911
|
| 54 |
+
RUN pip install --no-cache-dir "vllm==0.8.4" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" tensordict torchdata \
|
| 55 |
+
"transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
|
| 56 |
+
"numpy<2.0.0" "pyarrow>=15.0.0" "grpcio>=1.62.1" "optree>=0.13.0" pandas \
|
| 57 |
+
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb liger-kernel mathruler \
|
| 58 |
+
pytest yapf py-spy pyext pre-commit ruff
|
| 59 |
+
|
| 60 |
+
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
|
| 61 |
+
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
|
| 62 |
+
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
| 63 |
+
|
| 64 |
+
# Install flashinfer-0.2.2.post1+cu124 (cxx11abi=False)
|
| 65 |
+
# vllm-0.8.3 does not support flashinfer>=0.2.3
|
| 66 |
+
# see https://github.com/vllm-project/vllm/pull/15777
|
| 67 |
+
RUN wget -nv https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
|
| 68 |
+
pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
|
| 69 |
+
|
| 70 |
+
# Reset pip config
|
| 71 |
+
RUN pip config unset global.index-url && \
|
| 72 |
+
pip config unset global.extra-index-url
|
EasyR1-new/LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright [yyyy] [name of copyright owner]
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
EasyR1-new/Makefile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.PHONY: build commit license quality style test
|
| 2 |
+
|
| 3 |
+
check_dirs := examples scripts tests verl setup.py
|
| 4 |
+
|
| 5 |
+
build:
|
| 6 |
+
python3 setup.py sdist bdist_wheel
|
| 7 |
+
|
| 8 |
+
commit:
|
| 9 |
+
pre-commit install
|
| 10 |
+
pre-commit run --all-files
|
| 11 |
+
|
| 12 |
+
license:
|
| 13 |
+
python3 tests/check_license.py $(check_dirs)
|
| 14 |
+
|
| 15 |
+
quality:
|
| 16 |
+
ruff check $(check_dirs)
|
| 17 |
+
ruff format --check $(check_dirs)
|
| 18 |
+
|
| 19 |
+
style:
|
| 20 |
+
ruff check $(check_dirs) --fix
|
| 21 |
+
ruff format $(check_dirs)
|
| 22 |
+
|
| 23 |
+
test:
|
| 24 |
+
pytest -vv tests/
|
EasyR1-new/README.md
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# EasyR1: An Efficient, Scalable, Multi-Modality RL Training Framework
|
| 2 |
+
|
| 3 |
+
[](https://github.com/hiyouga/EasyR1/stargazers)
|
| 4 |
+
[](https://twitter.com/llamafactory_ai)
|
| 5 |
+
|
| 6 |
+
### Used by [Amazon Web Services](https://aws.amazon.com/cn/blogs/china/building-llm-model-hub-based-on-llamafactory-and-easyr1/)
|
| 7 |
+
|
| 8 |
+
This project is a clean fork of the original [veRL](https://github.com/volcengine/verl) project to support vision language models, we thank all the authors for providing such a high-performance RL training framework.
|
| 9 |
+
|
| 10 |
+
EasyR1 is efficient and scalable due to the design of **[HybirdEngine](https://arxiv.org/abs/2409.19256)** and the latest release of **[vLLM](https://github.com/vllm-project/vllm)**'s SPMD mode.
|
| 11 |
+
|
| 12 |
+
## Features
|
| 13 |
+
|
| 14 |
+
- Supported models
|
| 15 |
+
- Llama3/Qwen2/Qwen2.5/Qwen3 language models
|
| 16 |
+
- Qwen2/Qwen2.5-VL vision language models
|
| 17 |
+
- DeepSeek-R1 distill models
|
| 18 |
+
|
| 19 |
+
- Supported algorithms
|
| 20 |
+
- GRPO
|
| 21 |
+
- DAPO
|
| 22 |
+
- Reinforce++
|
| 23 |
+
- ReMax
|
| 24 |
+
- RLOO
|
| 25 |
+
|
| 26 |
+
- Supported datasets
|
| 27 |
+
- Any text, vision-text dataset in a [specific format](#custom-dataset)
|
| 28 |
+
|
| 29 |
+
- Supported tricks
|
| 30 |
+
- Padding-free training
|
| 31 |
+
- Resuming from checkpoint
|
| 32 |
+
- Wandb & SwanLab & Mlflow & Tensorboard tracking
|
| 33 |
+
|
| 34 |
+
## Requirements
|
| 35 |
+
|
| 36 |
+
### Software Requirements
|
| 37 |
+
|
| 38 |
+
- Python 3.9+
|
| 39 |
+
- transformers>=4.51.0
|
| 40 |
+
- flash-attn>=2.4.3
|
| 41 |
+
- vllm>=0.8.3
|
| 42 |
+
|
| 43 |
+
We provide a [Dockerfile](./Dockerfile) to easily build environments.
|
| 44 |
+
|
| 45 |
+
We recommend using the [pre-built docker image](https://hub.docker.com/r/hiyouga/verl) in EasyR1.
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
docker pull hiyouga/verl:ngc-th2.7.0-cu12.6-vllm0.9.1
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
### Hardware Requirements
|
| 52 |
+
|
| 53 |
+
\* *estimated*
|
| 54 |
+
|
| 55 |
+
| Method | Bits | 1.5B | 3B | 7B | 32B | 72B |
|
| 56 |
+
| ------------------------ | ---- | ------ | ------ | ------ | ------- | ------- |
|
| 57 |
+
| GRPO Full Fine-Tuning | AMP | 2*24GB | 4*40GB | 8*40GB | 16*80GB | 32*80GB |
|
| 58 |
+
| GRPO Full Fine-Tuning | BF16 | 1*24GB | 1*40GB | 4*40GB | 8*80GB | 16*80GB |
|
| 59 |
+
|
| 60 |
+
> [!NOTE]
|
| 61 |
+
> Use `worker.actor.fsdp.torch_dtype=bf16` and `worker.actor.optim.strategy=adamw_bf16` to enable bf16 training.
|
| 62 |
+
>
|
| 63 |
+
> We are working hard to reduce the VRAM in RL training, LoRA support will be integrated in next updates.
|
| 64 |
+
|
| 65 |
+
## Tutorial: Run Qwen2.5-VL GRPO on [Geometry3K](https://huggingface.co/datasets/hiyouga/geometry3k) Dataset in Just 3 Steps
|
| 66 |
+
|
| 67 |
+

|
| 68 |
+
|
| 69 |
+
### Installation
|
| 70 |
+
|
| 71 |
+
```bash
|
| 72 |
+
git clone https://github.com/hiyouga/EasyR1.git
|
| 73 |
+
cd EasyR1
|
| 74 |
+
pip install -e .
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
### GRPO Training
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
bash examples/qwen2_5_vl_7b_geo3k_grpo.sh
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
### Merge Checkpoint in Hugging Face Format
|
| 84 |
+
|
| 85 |
+
```bash
|
| 86 |
+
python3 scripts/model_merger.py --local_dir checkpoints/easy_r1/exp_name/global_step_1/actor
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
> [!TIP]
|
| 90 |
+
> If you encounter issues with connecting to Hugging Face, consider using `export HF_ENDPOINT=https://hf-mirror.com`.
|
| 91 |
+
>
|
| 92 |
+
> If you want to use SwanLab logger, consider using `bash examples/qwen2_5_vl_7b_geo3k_swanlab.sh`.
|
| 93 |
+
|
| 94 |
+
## Custom Dataset
|
| 95 |
+
|
| 96 |
+
Please refer to the example datasets to prepare your own dataset.
|
| 97 |
+
|
| 98 |
+
- Text dataset: https://huggingface.co/datasets/hiyouga/math12k
|
| 99 |
+
- Image-text dataset: https://huggingface.co/datasets/hiyouga/geometry3k
|
| 100 |
+
- Multi-image-text dataset: https://huggingface.co/datasets/hiyouga/journeybench-multi-image-vqa
|
| 101 |
+
- Text-image mixed dataset: https://huggingface.co/datasets/hiyouga/rl-mixed-dataset
|
| 102 |
+
|
| 103 |
+
## How to Understand GRPO in EasyR1
|
| 104 |
+
|
| 105 |
+

|
| 106 |
+
|
| 107 |
+
- To learn about the GRPO algorithm, you can refer to [Hugging Face's blog](https://huggingface.co/docs/trl/v0.16.1/en/grpo_trainer).
|
| 108 |
+
|
| 109 |
+
## How to Run 70B+ Model in Multi-node Environment
|
| 110 |
+
|
| 111 |
+
1. Start the Ray head node.
|
| 112 |
+
|
| 113 |
+
```bash
|
| 114 |
+
ray start --head --port=6379 --dashboard-host=0.0.0.0
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
2. Start the Ray worker node and connect to the head node.
|
| 118 |
+
|
| 119 |
+
```bash
|
| 120 |
+
ray start --address=<head_node_ip>:6379
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
3. Check the Ray resource pool.
|
| 124 |
+
|
| 125 |
+
```bash
|
| 126 |
+
ray status
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
4. Run training script on the Ray head node only.
|
| 130 |
+
|
| 131 |
+
```bash
|
| 132 |
+
bash examples/qwen2_5_vl_7b_geo3k_grpo.sh
|
| 133 |
+
```
|
| 134 |
+
|
| 135 |
+
See the **[veRL's official doc](https://verl.readthedocs.io/en/latest/start/multinode.html)** for more details about multi-node training and Ray debugger.
|
| 136 |
+
|
| 137 |
+
## Other Baselines
|
| 138 |
+
|
| 139 |
+
We also reproduced the following two baselines of the [R1-V](https://github.com/deep-agent/R1-V) project.
|
| 140 |
+
- [CLEVR-70k-Counting](examples/baselines/qwen2_5_vl_3b_clevr.sh): Train the Qwen2.5-VL-3B-Instruct model on counting problem.
|
| 141 |
+
- [GeoQA-8k](examples/baselines/qwen2_5_vl_3b_geoqa8k.sh): Train the Qwen2.5-VL-3B-Instruct model on GeoQA problem.
|
| 142 |
+
|
| 143 |
+
## Performance Baselines
|
| 144 |
+
|
| 145 |
+
See [baselines.md](assets/baselines.md).
|
| 146 |
+
|
| 147 |
+
## Awesome Work using EasyR1
|
| 148 |
+
|
| 149 |
+
- **MMR1**: Advancing the Frontiers of Multimodal Reasoning. [![[code]](https://img.shields.io/github/stars/LengSicong/MMR1)](https://github.com/LengSicong/MMR1)
|
| 150 |
+
- **Vision-R1**: Incentivizing Reasoning Capability in Multimodal Large Language Models. [![[code]](https://img.shields.io/github/stars/Osilly/Vision-R1)](https://github.com/Osilly/Vision-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.06749-blue)](https://arxiv.org/abs/2503.06749)
|
| 151 |
+
- **Seg-Zero**: Reasoning-Chain Guided Segmentation via Cognitive Reinforcement. [![[code]](https://img.shields.io/github/stars/dvlab-research/Seg-Zero)](https://github.com/dvlab-research/Seg-Zero) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.06520-blue)](https://arxiv.org/abs/2503.06520)
|
| 152 |
+
- **MetaSpatial**: Reinforcing 3D Spatial Reasoning in VLMs for the Metaverse. [![[code]](https://img.shields.io/github/stars/PzySeere/MetaSpatial)](https://github.com/PzySeere/MetaSpatial) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.18470-blue)](https://arxiv.org/abs/2503.18470)
|
| 153 |
+
- **Temporal-R1**: Envolving Temporal Reasoning Capability into LMMs via Temporal Consistent Reward. [![[code]](https://img.shields.io/github/stars/appletea233/Temporal-R1)](https://github.com/appletea233/Temporal-R1)
|
| 154 |
+
- **NoisyRollout**: Reinforcing Visual Reasoning with Data Augmentation. [![[code]](https://img.shields.io/github/stars/John-AI-Lab/NoisyRollout)](https://github.com/John-AI-Lab/NoisyRollout) [![[arxiv]](https://img.shields.io/badge/arxiv-2504.13055-blue)](https://arxiv.org/pdf/2504.13055)
|
| 155 |
+
- **GUI-R1**: A Generalist R1-Style Vision-Language Action Model For GUI Agents. [![[code]](https://img.shields.io/github/stars/ritzz-ai/GUI-R1)](https://github.com/ritzz-ai/GUI-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2504.10458-blue)](https://arxiv.org/abs/2504.10458)
|
| 156 |
+
- **R1-Track**: Direct Application of MLLMs to Visual Object Tracking via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/Wangbiao2/R1-Track)](https://github.com/Wangbiao2/R1-Track)
|
| 157 |
+
- **VisionReasoner**: Unified Visual Perception and Reasoning via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/dvlab-research/VisionReasoner)](https://github.com/dvlab-research/VisionReasoner) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.12081-blue)](https://arxiv.org/abs/2505.12081)
|
| 158 |
+
- **MM-UPT**: Unsupervised Post-Training for Multi-Modal LLM Reasoning via GRPO. [![[code]](https://img.shields.io/github/stars/waltonfuture/MM-UPT)](https://github.com/waltonfuture/MM-UPT) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22453-blue)](https://arxiv.org/pdf/2505.22453)
|
| 159 |
+
- **RL-with-Cold-Start**: Advancing Multimodal Reasoning via Reinforcement Learning with Cold Start. [![[code]](https://img.shields.io/github/stars/waltonfuture/RL-with-Cold-Start)](https://github.com/waltonfuture/RL-with-Cold-Start) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22334-blue)](https://arxiv.org/pdf/2505.22334)
|
| 160 |
+
- **ViGoRL**: Grounded Reinforcement Learning for Visual Reasoning. [![[code]](https://img.shields.io/github/stars/Gabesarch/grounded-rl)](https://github.com/Gabesarch/grounded-rl) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22334-blue)](https://arxiv.org/abs/2505.23678)
|
| 161 |
+
- **Revisual-R1**: Advancing Multimodal Reasoning: From Optimized Cold Start to Staged Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/CSfufu/Revisual-R1)](https://github.com/CSfufu/Revisual-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2506.04207-blue)](https://arxiv.org/abs/2506.04207)
|
| 162 |
+
- **SophiaVL-R1**: Reinforcing MLLMs Reasoning with Thinking Reward. [![[code]](https://img.shields.io/github/stars/kxfan2002/SophiaVL-R1)](https://github.com/kxfan2002/SophiaVL-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.17018-blue)](https://arxiv.org/abs/2505.17018)
|
| 163 |
+
- **Vision-Matters**: Simple Visual Perturbations Can Boost Multimodal Math Reasoning. [![[code]](https://img.shields.io/github/stars/YutingLi0606/Vision-Matters)](https://github.com/YutingLi0606/Vision-Matters) [![[arxiv]](https://img.shields.io/badge/arxiv-2506.09736-blue)](https://arxiv.org/abs/2506.09736)
|
| 164 |
+
- **VTool-R1**: VLMs Learn to Think with Images via Reinforcement Learning on Multimodal Tool Use. [![[code]](https://img.shields.io/github/stars/VTOOL-R1/vtool-r1)](https://github.com/VTOOL-R1/vtool-r1) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.19255-blue)](https://arxiv.org/abs/2505.19255)
|
| 165 |
+
|
| 166 |
+
## TODO
|
| 167 |
+
|
| 168 |
+
- Support LoRA (high priority).
|
| 169 |
+
- Support ulysses parallelism for VLMs (middle priority).
|
| 170 |
+
- Support more VLM architectures.
|
| 171 |
+
|
| 172 |
+
> [!NOTE]
|
| 173 |
+
> We will not provide scripts for supervised fine-tuning and inference in this project. If you have such requirements, we recommend using [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory).
|
| 174 |
+
|
| 175 |
+
### Known bugs
|
| 176 |
+
|
| 177 |
+
These features are temporarily disabled for now, we plan to fix them one-by-one in the future updates.
|
| 178 |
+
|
| 179 |
+
- Vision language models are not compatible with ulysses parallelism yet.
|
| 180 |
+
|
| 181 |
+
## Discussion Group
|
| 182 |
+
|
| 183 |
+
👋 Join our [WeChat group](assets/wechat.jpg).
|
| 184 |
+
|
| 185 |
+
## FAQs
|
| 186 |
+
|
| 187 |
+
> ValueError: Image features and image tokens do not match: tokens: 8192, features 9800
|
| 188 |
+
|
| 189 |
+
Increase the `data.max_prompt_length` or reduce the `data.max_pixels`.
|
| 190 |
+
|
| 191 |
+
> RuntimeError: CUDA Error: out of memory at /workspace/csrc/cumem_allocator.cpp:62
|
| 192 |
+
|
| 193 |
+
Reduce the `worker.rollout.gpu_memory_utilization` and enable `worker.actor.offload.offload_params`.
|
| 194 |
+
|
| 195 |
+
> RuntimeError: 0 active drivers ([]). There should only be one.
|
| 196 |
+
|
| 197 |
+
Uninstall `deepspeed` from the current python environment.
|
| 198 |
+
|
| 199 |
+
## Citation
|
| 200 |
+
|
| 201 |
+
Core contributors: [Yaowei Zheng](https://github.com/hiyouga), [Junting Lu](https://github.com/AL-377), [Shenzhi Wang](https://github.com/Shenzhi-Wang), [Zhangchi Feng](https://github.com/BUAADreamer), [Dongdong Kuang](https://github.com/Kuangdd01) and Yuwen Xiong
|
| 202 |
+
|
| 203 |
+
We also thank Guangming Sheng and Chi Zhang for helpful discussions.
|
| 204 |
+
|
| 205 |
+
```bibtex
|
| 206 |
+
@misc{zheng2025easyr1,
|
| 207 |
+
title = {EasyR1: An Efficient, Scalable, Multi-Modality RL Training Framework},
|
| 208 |
+
author = {Yaowei Zheng, Junting Lu, Shenzhi Wang, Zhangchi Feng, Dongdong Kuang, Yuwen Xiong},
|
| 209 |
+
howpublished = {\url{https://github.com/hiyouga/EasyR1}},
|
| 210 |
+
year = {2025}
|
| 211 |
+
}
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
We also recommend citing the original work.
|
| 215 |
+
|
| 216 |
+
```bibtex
|
| 217 |
+
@article{sheng2024hybridflow,
|
| 218 |
+
title = {HybridFlow: A Flexible and Efficient RLHF Framework},
|
| 219 |
+
author = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
|
| 220 |
+
year = {2024},
|
| 221 |
+
journal = {arXiv preprint arXiv: 2409.19256}
|
| 222 |
+
}
|
| 223 |
+
```
|
EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/media/table/val/generations_0_2feffd203d182aadef48.table.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"columns": ["step", "input_1", "output_1", "label_1", "score_1", "input_2", "output_2", "label_2", "score_2", "input_3", "output_3", "label_3", "score_3"], "data": [[0, "system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.\nuser\nGiven the protein amino acid sequence: PNAIVNRSIPEGVDVFEISGPLFFGAASKFKDAMHVVEKAPSIRILRMRKVMSIDATGLNMLKELFNDCRKSGTTLILSGVHTQPLFAMQQYGLADEIGEENIFGNIDDALDRARSLLGLPVQGRPAGF, does the protein have the property of solubility?\nOptions:\n0. Insoluble.\n1. Soluble. You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE output within <answer> </answer> tag.\nassistant\n", "<think>\nThe protein solubility can be predicted by the grand average of hydropathicity (GRAVY) scale. The GRAVY scale is based on the assumption that a protein with a large number of hydrophilic residues will have a negative value and a protein with a large number of hydrophobic residues will have a positive value. The GRAVY scale is calculated by summing the hydropathy values of all amino acid residues in a protein sequence and dividing by the number of residues in the sequence. The hydropathy value of an amino acid residue is a measure of its hydrophobicity, which is based on its chemical properties. The hydropathy value of an amino acid residue can be calculated using a hydrophobicity scale, which is a table that assigns a numerical value to each amino acid residue based on its hydrophobicity. The amino acid sequence of the protein is: PNAIVNRSIPEGVDVFEISGPLFFGAASKFKDAMHVVEKAPSIRILRMRKVMSIDATGLNMLKELFNDCRKSGTTLILSGVHTQPLFAMQQYGLADEIGEENIFGNIDDALDRARSLLGLPVQGRPAGF. 
The hydropathy value of each amino acid residue in the sequence is: 0.057, 0.069, 0.08, 0.077, 0.038, 0.05, 0.065, 0.07, 0.057, 0.056, 0.08, 0.059, 0.06, 0.057, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 
0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, ", 0, 0.0, "system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.\nuser\nGiven the protein amino acid sequence: MTYVLAESPEFYQDNVTDYTGQISSSDITNIQAAIDDVKASEQKVIFVVFLSSFDGVDPETWTQQALQANGGGNVLIYALAPEERQYGIQGGTQWTDAELDAANNAAFQALSQEDWAGSALALAESVGSSSSSSLEHHHHHH, does the protein have the property of solubility?\nOptions:\n0. Insoluble.\n1. Soluble. You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE output within <answer> </answer> tag.\nassistant\n", "<think>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n</think>\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. 
Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. 
A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. 
The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. 
In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. 
The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. 
Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. 
The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. 
Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydroph", 0, 0.9835715293884277, "system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.\nuser\nGiven the protein amino acid sequence: MGHHHHHHSHMDSKAIAERIKKLQAQQEHVRIGGKGTPRRKKKVIHKTAAADDKKLQSNLKKLSVTNIPGIEEVNMIKDDGTVIHFNNPKVQTSVPANTFSVTGSADNKQITEMLPGILNQLGPESLTHLKKLANNVTKLGPDGKGEDEDVPELVGDFDAASKNETKADEQ, does the protein have the property of solubility?\nOptions:\n0. Insoluble.\n1. Soluble. You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. 
The final answer MUST BE output within <answer> </answer> tag.\nassistant\n", "<think>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins tend to be more soluble than hydrophobic proteins. The presence of a large number of hydrophobic amino acids, such as phenylalanine, tryptophan, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 11 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. 
The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. 
The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. 
In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. 
The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. 
In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. 
The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. 
The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine", 0, 0.0]]}
|
EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/requirements.txt
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
setproctitle==1.2.2
|
| 2 |
+
psutil==7.0.0
|
| 3 |
+
colorama==0.4.6
|
| 4 |
+
scipy==1.15.3
|
| 5 |
+
protobuf==4.25.8
|
| 6 |
+
xxhash==3.5.0
|
| 7 |
+
tiktoken==0.9.0
|
| 8 |
+
setproctitle==1.3.6
|
| 9 |
+
opentelemetry-exporter-otlp-proto-grpc==1.26.0
|
| 10 |
+
pyasn1==0.6.1
|
| 11 |
+
typing_extensions==4.14.0
|
| 12 |
+
xgrammar==0.1.18
|
| 13 |
+
frozenlist==1.7.0
|
| 14 |
+
gguf==0.17.0
|
| 15 |
+
tzdata==2025.2
|
| 16 |
+
interegular==0.3.3
|
| 17 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 18 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 19 |
+
filelock==3.18.0
|
| 20 |
+
wrapt==1.17.2
|
| 21 |
+
multiprocess==0.70.16
|
| 22 |
+
certifi==2025.4.26
|
| 23 |
+
email_validator==2.2.0
|
| 24 |
+
ninja==1.11.1.4
|
| 25 |
+
cupy-cuda12x==13.4.1
|
| 26 |
+
fsspec==2025.3.0
|
| 27 |
+
grpcio==1.73.0
|
| 28 |
+
referencing==0.36.2
|
| 29 |
+
exceptiongroup==1.3.0
|
| 30 |
+
httpcore==1.0.9
|
| 31 |
+
tqdm==4.67.1
|
| 32 |
+
torchvision==0.21.0
|
| 33 |
+
omegaconf==2.3.0
|
| 34 |
+
tensordict==0.8.3
|
| 35 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 36 |
+
pycountry==24.6.1
|
| 37 |
+
aiohappyeyeballs==2.6.1
|
| 38 |
+
aiohttp-cors==0.8.1
|
| 39 |
+
pydantic==2.11.6
|
| 40 |
+
python-dotenv==1.1.0
|
| 41 |
+
propcache==0.3.2
|
| 42 |
+
platformdirs==4.3.8
|
| 43 |
+
googleapis-common-protos==1.70.0
|
| 44 |
+
vllm==0.8.5.post1
|
| 45 |
+
modelscope==1.27.0
|
| 46 |
+
GitPython==3.1.44
|
| 47 |
+
packaging==25.0
|
| 48 |
+
fastapi==0.115.12
|
| 49 |
+
smmap==5.0.2
|
| 50 |
+
fastapi-cli==0.0.7
|
| 51 |
+
huggingface-hub==0.33.0
|
| 52 |
+
peft==0.15.2
|
| 53 |
+
wandb==0.20.1
|
| 54 |
+
cachetools==5.5.2
|
| 55 |
+
aiosignal==1.3.2
|
| 56 |
+
astor==0.8.1
|
| 57 |
+
msgpack==1.1.1
|
| 58 |
+
cloudpickle==3.1.1
|
| 59 |
+
attrs==25.3.0
|
| 60 |
+
google-auth==2.40.3
|
| 61 |
+
ray==2.47.0
|
| 62 |
+
opentelemetry-exporter-otlp-proto-common==1.26.0
|
| 63 |
+
flash-attn==2.7.1.post1
|
| 64 |
+
codetiming==1.4.0
|
| 65 |
+
typing-inspection==0.4.1
|
| 66 |
+
rich-toolkit==0.14.7
|
| 67 |
+
regex==2024.11.6
|
| 68 |
+
einops==0.8.1
|
| 69 |
+
nvidia-nvtx-cu12==12.4.127
|
| 70 |
+
torchdata==0.11.0
|
| 71 |
+
orjson==3.10.18
|
| 72 |
+
opentelemetry-exporter-otlp==1.26.0
|
| 73 |
+
requests==2.32.4
|
| 74 |
+
yarl==1.20.1
|
| 75 |
+
six==1.17.0
|
| 76 |
+
compressed-tensors==0.9.3
|
| 77 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 78 |
+
pydantic_core==2.33.2
|
| 79 |
+
rsa==4.9.1
|
| 80 |
+
fastrlock==0.8.3
|
| 81 |
+
prometheus-fastapi-instrumentator==7.1.0
|
| 82 |
+
psutil==7.0.0
|
| 83 |
+
async-timeout==5.0.1
|
| 84 |
+
msgspec==0.19.0
|
| 85 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 86 |
+
sentencepiece==0.2.0
|
| 87 |
+
rpds-py==0.25.1
|
| 88 |
+
mathruler==0.1.0
|
| 89 |
+
opentelemetry-proto==1.26.0
|
| 90 |
+
pyarrow==20.0.0
|
| 91 |
+
mdurl==0.1.2
|
| 92 |
+
python-json-logger==3.3.0
|
| 93 |
+
python-dateutil==2.9.0.post0
|
| 94 |
+
numpy==2.2.6
|
| 95 |
+
markdown-it-py==3.0.0
|
| 96 |
+
tokenizers==0.21.1
|
| 97 |
+
wheel==0.45.1
|
| 98 |
+
gitdb==4.0.12
|
| 99 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 100 |
+
urllib3==2.4.0
|
| 101 |
+
airportsdata==20250523
|
| 102 |
+
prometheus_client==0.22.1
|
| 103 |
+
safetensors==0.5.3
|
| 104 |
+
setuptools==78.1.1
|
| 105 |
+
opentelemetry-api==1.26.0
|
| 106 |
+
liger_kernel==0.5.10
|
| 107 |
+
click==8.2.1
|
| 108 |
+
mpmath==1.3.0
|
| 109 |
+
dill==0.3.8
|
| 110 |
+
PyYAML==6.0.2
|
| 111 |
+
outlines_core==0.1.26
|
| 112 |
+
jsonschema-specifications==2025.4.1
|
| 113 |
+
nvidia-curand-cu12==10.3.5.147
|
| 114 |
+
zipp==3.23.0
|
| 115 |
+
triton==3.2.0
|
| 116 |
+
python-multipart==0.0.20
|
| 117 |
+
distlib==0.3.9
|
| 118 |
+
transformers==4.52.4
|
| 119 |
+
pandas==2.3.0
|
| 120 |
+
uvloop==0.21.0
|
| 121 |
+
opencensus==0.11.4
|
| 122 |
+
opencensus-context==0.1.3
|
| 123 |
+
numba==0.61.2
|
| 124 |
+
dnspython==2.7.0
|
| 125 |
+
sympy==1.13.1
|
| 126 |
+
pillow==11.2.1
|
| 127 |
+
idna==3.10
|
| 128 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 129 |
+
sniffio==1.3.1
|
| 130 |
+
antlr4-python3-runtime==4.9.3
|
| 131 |
+
annotated-types==0.7.0
|
| 132 |
+
uvicorn==0.34.3
|
| 133 |
+
Pygments==2.19.1
|
| 134 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 135 |
+
opentelemetry-exporter-prometheus==0.55b1
|
| 136 |
+
h11==0.16.0
|
| 137 |
+
depyf==0.18.0
|
| 138 |
+
colorful==0.5.6
|
| 139 |
+
Jinja2==3.1.6
|
| 140 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 141 |
+
shellingham==1.5.4
|
| 142 |
+
mistral_common==1.6.2
|
| 143 |
+
partial-json-parser==0.2.1.1.post5
|
| 144 |
+
lm-format-enforcer==0.10.11
|
| 145 |
+
pyzmq==26.4.0
|
| 146 |
+
datasets==3.6.0
|
| 147 |
+
httpx==0.28.1
|
| 148 |
+
charset-normalizer==3.4.2
|
| 149 |
+
hf-xet==1.1.3
|
| 150 |
+
av==14.4.0
|
| 151 |
+
qwen-vl-utils==0.0.11
|
| 152 |
+
opencv-python-headless==4.11.0.86
|
| 153 |
+
nvidia-nccl-cu12==2.21.5
|
| 154 |
+
opentelemetry-semantic-conventions-ai==0.4.9
|
| 155 |
+
py-cpuinfo==9.0.0
|
| 156 |
+
nest-asyncio==1.6.0
|
| 157 |
+
blake3==1.0.5
|
| 158 |
+
pytz==2025.2
|
| 159 |
+
proto-plus==1.26.1
|
| 160 |
+
pylatexenc==2.10
|
| 161 |
+
websockets==15.0.1
|
| 162 |
+
starlette==0.46.2
|
| 163 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 164 |
+
importlib_metadata==8.0.0
|
| 165 |
+
aiohttp==3.12.12
|
| 166 |
+
anyio==4.9.0
|
| 167 |
+
llvmlite==0.44.0
|
| 168 |
+
sentry-sdk==2.30.0
|
| 169 |
+
multidict==6.4.4
|
| 170 |
+
verl==0.3.1.dev0
|
| 171 |
+
distro==1.9.0
|
| 172 |
+
torch==2.6.0
|
| 173 |
+
Deprecated==1.2.18
|
| 174 |
+
MarkupSafe==3.0.2
|
| 175 |
+
torchaudio==2.6.0
|
| 176 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 177 |
+
networkx==3.4.2
|
| 178 |
+
httptools==0.6.4
|
| 179 |
+
opentelemetry-sdk==1.26.0
|
| 180 |
+
pyasn1_modules==0.4.2
|
| 181 |
+
opentelemetry-exporter-otlp-proto-http==1.26.0
|
| 182 |
+
google-api-core==2.25.1
|
| 183 |
+
diskcache==5.6.3
|
| 184 |
+
rich==14.0.0
|
| 185 |
+
virtualenv==20.31.2
|
| 186 |
+
outlines==0.1.11
|
| 187 |
+
llguidance==0.7.29
|
| 188 |
+
py-spy==0.4.0
|
| 189 |
+
lark==1.2.2
|
| 190 |
+
typer==0.16.0
|
| 191 |
+
jiter==0.10.0
|
| 192 |
+
opentelemetry-semantic-conventions==0.47b0
|
| 193 |
+
jsonschema==4.24.0
|
| 194 |
+
watchfiles==1.0.5
|
| 195 |
+
pip==25.1
|
| 196 |
+
smart-open==7.1.0
|
| 197 |
+
openai==1.86.0
|
| 198 |
+
accelerate==1.7.0
|
| 199 |
+
xformers==0.0.29.post2
|
| 200 |
+
verl==0.3.1.dev0
|
| 201 |
+
autocommand==2.2.2
|
| 202 |
+
jaraco.functools==4.0.1
|
| 203 |
+
typeguard==4.3.0
|
| 204 |
+
platformdirs==4.2.2
|
| 205 |
+
jaraco.text==3.12.1
|
| 206 |
+
wheel==0.45.1
|
| 207 |
+
inflect==7.3.1
|
| 208 |
+
jaraco.context==5.3.0
|
| 209 |
+
jaraco.collections==5.1.0
|
| 210 |
+
packaging==24.2
|
| 211 |
+
more-itertools==10.3.0
|
| 212 |
+
typing_extensions==4.12.2
|
| 213 |
+
importlib_metadata==8.0.0
|
| 214 |
+
backports.tarfile==1.2.0
|
| 215 |
+
zipp==3.19.2
|
| 216 |
+
tomli==2.0.1
|
EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-06-14T11:33:20.797563Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--node-ip-address=10.1.0.34",
|
| 7 |
+
"--node-manager-port=44395",
|
| 8 |
+
"--object-store-name=/tmp/ray/session_2025-06-14_19-26-10_939959_377/sockets/plasma_store",
|
| 9 |
+
"--raylet-name=/tmp/ray/session_2025-06-14_19-26-10_939959_377/sockets/raylet",
|
| 10 |
+
"--redis-address=None",
|
| 11 |
+
"--metrics-agent-port=53248",
|
| 12 |
+
"--logging-rotate-bytes=536870912",
|
| 13 |
+
"--logging-rotate-backup-count=5",
|
| 14 |
+
"--runtime-env-agent-port=62639",
|
| 15 |
+
"--gcs-address=10.1.0.34:54500",
|
| 16 |
+
"--session-name=session_2025-06-14_19-26-10_939959_377",
|
| 17 |
+
"--temp-dir=/tmp/ray",
|
| 18 |
+
"--webui=127.0.0.1:8265",
|
| 19 |
+
"--cluster-id=87252e5ee41fad91c76da7e57bdcecbe022b796f41d09458b8a10f7c",
|
| 20 |
+
"--startup-token=24",
|
| 21 |
+
"--worker-launch-time-ms=1749900374194",
|
| 22 |
+
"--node-id=55695a579dfa13036c97ee61aacacb1a3bcb84cbe0636297d5a4b61b",
|
| 23 |
+
"--runtime-env-hash=-115784934",
|
| 24 |
+
"--enable-resource-isolation=false"
|
| 25 |
+
],
|
| 26 |
+
"program": "/root/miniconda3/envs/easyr1/lib/python3.10/site-packages/ray/_private/workers/default_worker.py",
|
| 27 |
+
"email": "gia0603yucca@gmail.com",
|
| 28 |
+
"root": "/nas/shared/kilab/wangyujia/EasyR1/examples",
|
| 29 |
+
"host": "dlcjqxpfs58ebbif-master-0",
|
| 30 |
+
"executable": "/root/miniconda3/envs/easyr1/bin/python3",
|
| 31 |
+
"cpu_count": 24,
|
| 32 |
+
"cpu_count_logical": 24,
|
| 33 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 34 |
+
"gpu_count": 8,
|
| 35 |
+
"disk": {
|
| 36 |
+
"/": {
|
| 37 |
+
"total": "1623302262784",
|
| 38 |
+
"used": "93401088"
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"memory": {
|
| 42 |
+
"total": "549755813888"
|
| 43 |
+
},
|
| 44 |
+
"cpu": {
|
| 45 |
+
"count": 24,
|
| 46 |
+
"countLogical": 24
|
| 47 |
+
},
|
| 48 |
+
"gpu_nvidia": [
|
| 49 |
+
{
|
| 50 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 51 |
+
"architecture": "Ampere",
|
| 52 |
+
"uuid": "GPU-6074818b-3292-5382-c329-f6ea4933c000"
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 56 |
+
"architecture": "Ampere",
|
| 57 |
+
"uuid": "GPU-9153a1c2-42e3-a4bf-5b99-c511c2dd1cd7"
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 61 |
+
"architecture": "Ampere",
|
| 62 |
+
"uuid": "GPU-d3362bbd-73d7-9f93-9e24-20b1dde2c73d"
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 66 |
+
"architecture": "Ampere",
|
| 67 |
+
"uuid": "GPU-0a7ad2e1-b610-b966-bb15-97dee8beda1d"
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 71 |
+
"architecture": "Ampere",
|
| 72 |
+
"uuid": "GPU-71868de1-0c70-4e5f-ff0f-c6c77d6ff52a"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 76 |
+
"architecture": "Ampere",
|
| 77 |
+
"uuid": "GPU-91ff7b89-a1a9-22b7-e7a6-d280378f4c22"
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 81 |
+
"architecture": "Ampere",
|
| 82 |
+
"uuid": "GPU-6da63e94-6ad8-9210-fcc7-9803f04b4499"
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 86 |
+
"architecture": "Ampere",
|
| 87 |
+
"uuid": "GPU-17618b8c-49d9-1302-2d8b-8e2d3323f951"
|
| 88 |
+
}
|
| 89 |
+
],
|
| 90 |
+
"cudaVersion": "12.1"
|
| 91 |
+
}
|
EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"val/reward_score":0.42200788855552673,"_runtime":2222.63040867,"val/accuracy_reward":0,"val/overall_reward":0.4220078922653015,"_timestamp":1.7499027736309972e+09,"val/repeat_reward":0.4220078922653015,"_step":0,"_wandb":{"runtime":2222},"val/format_reward":0,"val/generations":{"artifact_path":"wandb-client-artifact://ijyj2h6rsizx63o43gkuzc4ss8t0t749zor61i1pqle3t4c3ibf0xivv691aymsesb6ig7a9fuappbvthpwjudrcl39qby5t4etler4em69qzxjpizsh8ela68r5ocgs/val/generations.table.json","sha256":"2feffd203d182aadef48f52d1efd60785cdc0e27cfd8e96c5ed06b0f422b9a10","_latest_artifact_path":"wandb-client-artifact://l4v9x108g3wt2q2co8z0amg8u4xglpreu8janqpe0gu6vsl28ihxry8uc140267pdit8ni6m32ykh8gkouuhluxdqm3n1ipk3vzgp4x0kh75u6xtedujh1a0uh4vonfq:latest/val/generations.table.json","path":"media/table/val/generations_0_2feffd203d182aadef48.table.json","ncols":13,"log_mode":"IMMUTABLE","_type":"table-file","nrows":1,"size":37088}}
|
EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-06-14T19:33:20.8065764+08:00","level":"INFO","msg":"stream: starting","core version":"0.20.1","symlink path":"/nas/shared/kilab/wangyujia/EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug-core.log"}
|
| 2 |
+
{"time":"2025-06-14T19:33:22.123634875+08:00","level":"INFO","msg":"stream: created new stream","id":"9rxy2gyp"}
|
| 3 |
+
{"time":"2025-06-14T19:33:22.123667201+08:00","level":"INFO","msg":"stream: started","id":"9rxy2gyp"}
|
| 4 |
+
{"time":"2025-06-14T19:33:22.123732778+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"9rxy2gyp"}
|
| 5 |
+
{"time":"2025-06-14T19:33:22.123739338+08:00","level":"INFO","msg":"sender: started","stream_id":"9rxy2gyp"}
|
| 6 |
+
{"time":"2025-06-14T19:33:22.123765948+08:00","level":"INFO","msg":"handler: started","stream_id":"9rxy2gyp"}
|
| 7 |
+
{"time":"2025-06-14T19:33:22.624115243+08:00","level":"INFO","msg":"Starting system monitor"}
|
| 8 |
+
{"time":"2025-06-14T20:10:23.427993609+08:00","level":"INFO","msg":"Stopping system monitor"}
|
| 9 |
+
{"time":"2025-06-14T20:10:23.429125637+08:00","level":"INFO","msg":"Stopped system monitor"}
|
| 10 |
+
{"time":"2025-06-14T20:10:24.39935884+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading wandb-summary.json","runtime_seconds":0.523990922},{"desc":"uploading output.log","runtime_seconds":0.523981291},{"desc":"uploading history steps 0-0, summary, console lines 137-204","runtime_seconds":0.048062105}],"total_operations":3}}
|
| 11 |
+
{"time":"2025-06-14T20:10:26.867530799+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 12 |
+
{"time":"2025-06-14T20:10:29.822474057+08:00","level":"INFO","msg":"stream: closing","id":"9rxy2gyp"}
|
| 13 |
+
{"time":"2025-06-14T20:10:29.822488409+08:00","level":"INFO","msg":"handler: closed","stream_id":"9rxy2gyp"}
|
| 14 |
+
{"time":"2025-06-14T20:10:29.822495399+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"9rxy2gyp"}
|
| 15 |
+
{"time":"2025-06-14T20:10:29.822500164+08:00","level":"INFO","msg":"sender: closed","stream_id":"9rxy2gyp"}
|
| 16 |
+
{"time":"2025-06-14T20:10:29.824705513+08:00","level":"INFO","msg":"stream: closed","id":"9rxy2gyp"}
|
EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug.log
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_setup.py:_flush():81] Current SDK version is 0.20.1
|
| 2 |
+
2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_setup.py:_flush():81] Configure stats pid to 2538
|
| 3 |
+
2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_setup.py:_flush():81] Loading settings from /nas/shared/kilab/wangyujia/EasyR1/examples/wandb/settings
|
| 5 |
+
2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 6 |
+
2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /nas/shared/kilab/wangyujia/EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug.log
|
| 7 |
+
2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /nas/shared/kilab/wangyujia/EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug-internal.log
|
| 8 |
+
2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_init.py:init():831] calling init triggers
|
| 9 |
+
2025-06-14 19:33:20,581 INFO MainThread:2538 [wandb_init.py:init():836] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'data': {'train_files': '/nas/shared/kilab/wangyujia/rl_data@train', 'val_files': '/nas/shared/kilab/wangyujia/rl_data@validation', 'prompt_key': 'question', 'answer_key': 'answer', 'image_key': 'images', 'image_dir': None, 'max_prompt_length': 8192, 'max_response_length': 8192, 'rollout_batch_size': 256, 'val_batch_size': 1024, 'format_prompt': '/nas/shared/kilab/wangyujia/EasyR1/examples/format_prompt/bio_format.jinja', 'override_chat_template': None, 'shuffle': True, 'seed': 1, 'min_pixels': 262144, 'max_pixels': 4194304, 'filter_overlong_prompts': True}, 'worker': {'hybrid_engine': True, 'actor': {'strategy': 'fsdp', 'global_batch_size': 128, 'micro_batch_size_per_device_for_update': 2, 'micro_batch_size_per_device_for_experience': 64, 'max_grad_norm': 1.0, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.3, 'clip_ratio_dual': 3.0, 'ppo_epochs': 1, 'padding_free': True, 'ulysses_sequence_parallel_size': 1, 'use_torch_compile': True, 'model': {'model_path': '/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300', 'tokenizer_path': '/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300', 'override_config': {}, 'enable_gradient_checkpointing': True, 'trust_remote_code': False, 'freeze_vision_tower': False}, 'optim': {'lr': 1e-06, 'betas': [0.9, 0.999], 'weight_decay': 0.01, 'strategy': 'adamw', 'lr_warmup_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'training_steps': 1610}, 'fsdp': {'enable_full_shard': True, 'enable_cpu_offload': False, 'enable_rank0_init': True, 'use_orig_params': False, 'torch_dtype': None, 'fsdp_size': -1, 'mp_param_dtype': 'bf16', 'mp_reduce_dtype': 'fp32', 'mp_buffer_dtype': 'fp32'}, 'offload': {'offload_params': True, 'offload_optimizer': True}, 'global_batch_size_per_device': -1, 'disable_kl': False, 'use_kl_loss': True, 'kl_penalty': 'low_var_kl', 'kl_coef': 0.01}, 'critic': {'strategy': 'fsdp', 'global_batch_size': 256, 
'micro_batch_size_per_device_for_update': 4, 'micro_batch_size_per_device_for_experience': 16, 'max_grad_norm': 1.0, 'cliprange_value': 0.5, 'ppo_epochs': 1, 'padding_free': False, 'ulysses_sequence_parallel_size': 1, 'model': {'model_path': None, 'tokenizer_path': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'trust_remote_code': True, 'freeze_vision_tower': False}, 'optim': {'lr': 1e-06, 'betas': [0.9, 0.999], 'weight_decay': 0.01, 'strategy': 'adamw', 'lr_warmup_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'training_steps': 1610}, 'fsdp': {'enable_full_shard': True, 'enable_cpu_offload': False, 'enable_rank0_init': True, 'use_orig_params': False, 'torch_dtype': None, 'fsdp_size': -1, 'mp_param_dtype': 'bf16', 'mp_reduce_dtype': 'fp32', 'mp_buffer_dtype': 'fp32'}, 'offload': {'offload_params': False, 'offload_optimizer': False}, 'global_batch_size_per_device': -1}, 'ref': {'strategy': 'fsdp', 'fsdp': {'enable_full_shard': True, 'enable_cpu_offload': True, 'enable_rank0_init': True, 'use_orig_params': False, 'torch_dtype': None, 'fsdp_size': -1, 'mp_param_dtype': 'bf16', 'mp_reduce_dtype': 'fp32', 'mp_buffer_dtype': 'fp32'}, 'offload': {'offload_params': False, 'offload_optimizer': False}, 'micro_batch_size_per_device_for_experience': 64, 'padding_free': True, 'ulysses_sequence_parallel_size': 1, 'use_torch_compile': True}, 'reward': {'reward_type': 'batch', 'reward_function': '/nas/shared/kilab/wangyujia/EasyR1/examples/reward_function/bio.py', 'reward_function_kwargs': {}, 'skip_special_tokens': True, 'num_cpus': 1, 'reward_function_name': 'compute_score'}, 'rollout': {'name': 'vllm', 'n': 8, 'temperature': 1.0, 'top_p': 0.99, 'top_k': -1, 'seed': 1, 'limit_images': 0, 'dtype': 'bf16', 'gpu_memory_utilization': 0.75, 'ignore_eos': False, 'enforce_eager': False, 'enable_chunked_prefill': False, 'tensor_parallel_size': 1, 'max_model_len': None, 'max_num_batched_tokens': 16384, 'disable_log_stats': True, 
'val_override_config': {'temperature': 0.5, 'n': 1}, 'prompt_length': 8192, 'response_length': 8192, 'trust_remote_code': False}}, 'algorithm': {'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'disable_kl': False, 'use_kl_loss': True, 'kl_penalty': 'low_var_kl', 'kl_coef': 0.01, 'kl_type': 'fixed', 'kl_horizon': 0.0, 'kl_target': 0.0}, 'trainer': {'total_epochs': 5, 'max_steps': None, 'project_name': 'easy_r1', 'experiment_name': 'qwen2_5_bio_grpo', 'logger': ['console', 'wandb'], 'nnodes': 1, 'n_gpus_per_node': 8, 'critic_warmup': 0, 'val_freq': 5, 'val_before_train': True, 'val_only': False, 'val_generations_to_log': 3, 'save_freq': 5, 'save_limit': 3, 'save_checkpoint_path': '/oss/wangyujia/BIO/rl/qwen2.5_7b_bio', 'load_checkpoint_path': None}, '_wandb': {}}
|
| 11 |
+
2025-06-14 19:33:20,581 INFO MainThread:2538 [wandb_init.py:init():872] starting backend
|
| 12 |
+
2025-06-14 19:33:20,793 INFO MainThread:2538 [wandb_init.py:init():875] sending inform_init request
|
| 13 |
+
2025-06-14 19:33:20,796 INFO MainThread:2538 [wandb_init.py:init():883] backend started and connected
|
| 14 |
+
2025-06-14 19:33:20,799 INFO MainThread:2538 [wandb_init.py:init():956] updated telemetry
|
| 15 |
+
2025-06-14 19:33:20,799 INFO MainThread:2538 [wandb_init.py:init():980] communicating run to backend with 90.0 second timeout
|
| 16 |
+
2025-06-14 19:33:22,613 INFO MainThread:2538 [wandb_init.py:init():1032] starting run threads in backend
|
| 17 |
+
2025-06-14 19:33:22,807 INFO MainThread:2538 [wandb_run.py:_console_start():2453] atexit reg
|
| 18 |
+
2025-06-14 19:33:22,807 INFO MainThread:2538 [wandb_run.py:_redirect():2301] redirect: wrap_raw
|
| 19 |
+
2025-06-14 19:33:22,807 INFO MainThread:2538 [wandb_run.py:_redirect():2370] Wrapping output streams.
|
| 20 |
+
2025-06-14 19:33:22,807 INFO MainThread:2538 [wandb_run.py:_redirect():2393] Redirects installed.
|
| 21 |
+
2025-06-14 19:33:22,810 INFO MainThread:2538 [wandb_init.py:init():1078] run started, returning control to user process
|
| 22 |
+
2025-06-14 20:10:23,389 INFO MainThread:2538 [wandb_run.py:_finish():2219] finishing run gia0603yucca/easy_r1/9rxy2gyp
|
| 23 |
+
2025-06-14 20:10:23,395 INFO MainThread:2538 [wandb_run.py:_atexit_cleanup():2418] got exitcode: 0
|
| 24 |
+
2025-06-14 20:10:23,397 INFO MainThread:2538 [wandb_run.py:_restore():2400] restore
|
| 25 |
+
2025-06-14 20:10:23,397 INFO MainThread:2538 [wandb_run.py:_restore():2406] restore done
|
| 26 |
+
2025-06-14 20:10:29,817 INFO MainThread:2538 [wandb_run.py:_footer_history_summary_info():4000] rendering history
|
| 27 |
+
2025-06-14 20:10:29,818 INFO MainThread:2538 [wandb_run.py:_footer_history_summary_info():4032] rendering summary
|
| 28 |
+
2025-06-14 20:10:29,818 INFO MainThread:2538 [wandb_run.py:_footer_sync_info():3961] logging synced files
|
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/config.yaml
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.20.1
|
| 4 |
+
m: []
|
| 5 |
+
python_version: 3.10.0
|
| 6 |
+
t:
|
| 7 |
+
"1":
|
| 8 |
+
- 1
|
| 9 |
+
- 11
|
| 10 |
+
- 30
|
| 11 |
+
- 41
|
| 12 |
+
- 49
|
| 13 |
+
- 51
|
| 14 |
+
- 71
|
| 15 |
+
- 95
|
| 16 |
+
- 98
|
| 17 |
+
- 105
|
| 18 |
+
"2":
|
| 19 |
+
- 1
|
| 20 |
+
- 11
|
| 21 |
+
- 30
|
| 22 |
+
- 41
|
| 23 |
+
- 49
|
| 24 |
+
- 51
|
| 25 |
+
- 71
|
| 26 |
+
- 95
|
| 27 |
+
- 98
|
| 28 |
+
- 105
|
| 29 |
+
"3":
|
| 30 |
+
- 2
|
| 31 |
+
- 13
|
| 32 |
+
- 16
|
| 33 |
+
- 55
|
| 34 |
+
- 61
|
| 35 |
+
"4": 3.10.0
|
| 36 |
+
"5": 0.20.1
|
| 37 |
+
"6": 4.52.4
|
| 38 |
+
"12": 0.20.1
|
| 39 |
+
"13": linux-x86_64
|
| 40 |
+
algorithm:
|
| 41 |
+
value:
|
| 42 |
+
adv_estimator: grpo
|
| 43 |
+
disable_kl: false
|
| 44 |
+
gamma: 1
|
| 45 |
+
kl_coef: 0.01
|
| 46 |
+
kl_horizon: 0
|
| 47 |
+
kl_penalty: low_var_kl
|
| 48 |
+
kl_target: 0
|
| 49 |
+
kl_type: fixed
|
| 50 |
+
lam: 1
|
| 51 |
+
use_kl_loss: true
|
| 52 |
+
data:
|
| 53 |
+
value:
|
| 54 |
+
answer_key: answer
|
| 55 |
+
filter_overlong_prompts: true
|
| 56 |
+
format_prompt: /nas/shared/kilab/wangyujia/EasyR1/examples/format_prompt/bio_format.jinja
|
| 57 |
+
image_dir: null
|
| 58 |
+
image_key: images
|
| 59 |
+
max_pixels: 4194304
|
| 60 |
+
max_prompt_length: 8192
|
| 61 |
+
max_response_length: 8192
|
| 62 |
+
min_pixels: 262144
|
| 63 |
+
override_chat_template: null
|
| 64 |
+
prompt_key: question
|
| 65 |
+
rollout_batch_size: 256
|
| 66 |
+
seed: 1
|
| 67 |
+
shuffle: true
|
| 68 |
+
train_files: /nas/shared/kilab/wangyujia/rl_data@train
|
| 69 |
+
val_batch_size: 1024
|
| 70 |
+
val_files: /nas/shared/kilab/wangyujia/rl_data@validation
|
| 71 |
+
trainer:
|
| 72 |
+
value:
|
| 73 |
+
critic_warmup: 0
|
| 74 |
+
experiment_name: qwen2_5_bio_grpo
|
| 75 |
+
load_checkpoint_path: null
|
| 76 |
+
logger:
|
| 77 |
+
- console
|
| 78 |
+
- wandb
|
| 79 |
+
max_steps: null
|
| 80 |
+
n_gpus_per_node: 8
|
| 81 |
+
nnodes: 1
|
| 82 |
+
project_name: easy_r1
|
| 83 |
+
save_checkpoint_path: /oss/wangyujia/BIO/rl/qwen2.5_7b_bio
|
| 84 |
+
save_freq: 5
|
| 85 |
+
save_limit: 3
|
| 86 |
+
total_epochs: 5
|
| 87 |
+
val_before_train: true
|
| 88 |
+
val_freq: 5
|
| 89 |
+
val_generations_to_log: 3
|
| 90 |
+
val_only: false
|
| 91 |
+
worker:
|
| 92 |
+
value:
|
| 93 |
+
actor:
|
| 94 |
+
clip_ratio_dual: 3
|
| 95 |
+
clip_ratio_high: 0.3
|
| 96 |
+
clip_ratio_low: 0.2
|
| 97 |
+
disable_kl: false
|
| 98 |
+
fsdp:
|
| 99 |
+
enable_cpu_offload: false
|
| 100 |
+
enable_full_shard: true
|
| 101 |
+
enable_rank0_init: true
|
| 102 |
+
fsdp_size: -1
|
| 103 |
+
mp_buffer_dtype: fp32
|
| 104 |
+
mp_param_dtype: bf16
|
| 105 |
+
mp_reduce_dtype: fp32
|
| 106 |
+
torch_dtype: null
|
| 107 |
+
use_orig_params: false
|
| 108 |
+
global_batch_size: 128
|
| 109 |
+
global_batch_size_per_device: -1
|
| 110 |
+
kl_coef: 0.01
|
| 111 |
+
kl_penalty: low_var_kl
|
| 112 |
+
max_grad_norm: 1
|
| 113 |
+
micro_batch_size_per_device_for_experience: 16
|
| 114 |
+
micro_batch_size_per_device_for_update: 2
|
| 115 |
+
model:
|
| 116 |
+
enable_gradient_checkpointing: true
|
| 117 |
+
freeze_vision_tower: false
|
| 118 |
+
model_path: /oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300
|
| 119 |
+
tokenizer_path: /oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300
|
| 120 |
+
trust_remote_code: false
|
| 121 |
+
offload:
|
| 122 |
+
offload_optimizer: true
|
| 123 |
+
offload_params: true
|
| 124 |
+
optim:
|
| 125 |
+
betas:
|
| 126 |
+
- 0.9
|
| 127 |
+
- 0.999
|
| 128 |
+
lr: 1e-06
|
| 129 |
+
lr_warmup_ratio: 0
|
| 130 |
+
min_lr_ratio: null
|
| 131 |
+
strategy: adamw
|
| 132 |
+
training_steps: 1610
|
| 133 |
+
warmup_style: constant
|
| 134 |
+
weight_decay: 0.01
|
| 135 |
+
padding_free: true
|
| 136 |
+
ppo_epochs: 1
|
| 137 |
+
strategy: fsdp
|
| 138 |
+
ulysses_sequence_parallel_size: 1
|
| 139 |
+
use_kl_loss: true
|
| 140 |
+
use_torch_compile: true
|
| 141 |
+
critic:
|
| 142 |
+
cliprange_value: 0.5
|
| 143 |
+
fsdp:
|
| 144 |
+
enable_cpu_offload: false
|
| 145 |
+
enable_full_shard: true
|
| 146 |
+
enable_rank0_init: true
|
| 147 |
+
fsdp_size: -1
|
| 148 |
+
mp_buffer_dtype: fp32
|
| 149 |
+
mp_param_dtype: bf16
|
| 150 |
+
mp_reduce_dtype: fp32
|
| 151 |
+
torch_dtype: null
|
| 152 |
+
use_orig_params: false
|
| 153 |
+
global_batch_size: 256
|
| 154 |
+
global_batch_size_per_device: -1
|
| 155 |
+
max_grad_norm: 1
|
| 156 |
+
micro_batch_size_per_device_for_experience: 16
|
| 157 |
+
micro_batch_size_per_device_for_update: 4
|
| 158 |
+
model:
|
| 159 |
+
enable_gradient_checkpointing: true
|
| 160 |
+
freeze_vision_tower: false
|
| 161 |
+
model_path: null
|
| 162 |
+
tokenizer_path: null
|
| 163 |
+
trust_remote_code: true
|
| 164 |
+
offload:
|
| 165 |
+
offload_optimizer: false
|
| 166 |
+
offload_params: false
|
| 167 |
+
optim:
|
| 168 |
+
betas:
|
| 169 |
+
- 0.9
|
| 170 |
+
- 0.999
|
| 171 |
+
lr: 1e-06
|
| 172 |
+
lr_warmup_ratio: 0
|
| 173 |
+
min_lr_ratio: null
|
| 174 |
+
strategy: adamw
|
| 175 |
+
training_steps: 1610
|
| 176 |
+
warmup_style: constant
|
| 177 |
+
weight_decay: 0.01
|
| 178 |
+
padding_free: false
|
| 179 |
+
ppo_epochs: 1
|
| 180 |
+
strategy: fsdp
|
| 181 |
+
ulysses_sequence_parallel_size: 1
|
| 182 |
+
hybrid_engine: true
|
| 183 |
+
ref:
|
| 184 |
+
fsdp:
|
| 185 |
+
enable_cpu_offload: true
|
| 186 |
+
enable_full_shard: true
|
| 187 |
+
enable_rank0_init: true
|
| 188 |
+
fsdp_size: -1
|
| 189 |
+
mp_buffer_dtype: fp32
|
| 190 |
+
mp_param_dtype: bf16
|
| 191 |
+
mp_reduce_dtype: fp32
|
| 192 |
+
torch_dtype: null
|
| 193 |
+
use_orig_params: false
|
| 194 |
+
micro_batch_size_per_device_for_experience: 16
|
| 195 |
+
offload:
|
| 196 |
+
offload_optimizer: false
|
| 197 |
+
offload_params: false
|
| 198 |
+
padding_free: true
|
| 199 |
+
strategy: fsdp
|
| 200 |
+
ulysses_sequence_parallel_size: 1
|
| 201 |
+
use_torch_compile: true
|
| 202 |
+
reward:
|
| 203 |
+
num_cpus: 1
|
| 204 |
+
reward_function: /nas/shared/kilab/wangyujia/EasyR1/examples/reward_function/bio.py
|
| 205 |
+
reward_function_name: compute_score
|
| 206 |
+
reward_type: batch
|
| 207 |
+
skip_special_tokens: true
|
| 208 |
+
rollout:
|
| 209 |
+
disable_log_stats: true
|
| 210 |
+
dtype: bf16
|
| 211 |
+
enable_chunked_prefill: false
|
| 212 |
+
enforce_eager: false
|
| 213 |
+
gpu_memory_utilization: 0.75
|
| 214 |
+
ignore_eos: false
|
| 215 |
+
limit_images: 0
|
| 216 |
+
max_model_len: null
|
| 217 |
+
max_num_batched_tokens: 16384
|
| 218 |
+
"n": 8
|
| 219 |
+
name: vllm
|
| 220 |
+
prompt_length: 8192
|
| 221 |
+
response_length: 8192
|
| 222 |
+
seed: 1
|
| 223 |
+
temperature: 1
|
| 224 |
+
tensor_parallel_size: 1
|
| 225 |
+
top_k: -1
|
| 226 |
+
top_p: 0.99
|
| 227 |
+
trust_remote_code: false
|
| 228 |
+
val_override_config:
|
| 229 |
+
"n": 1
|
| 230 |
+
temperature: 0.5
|
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/media/table/val/generations_0_2feffd203d182aadef48.table.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"columns": ["step", "input_1", "output_1", "label_1", "score_1", "input_2", "output_2", "label_2", "score_2", "input_3", "output_3", "label_3", "score_3"], "data": [[0, "system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.\nuser\nGiven the protein amino acid sequence: PNAIVNRSIPEGVDVFEISGPLFFGAASKFKDAMHVVEKAPSIRILRMRKVMSIDATGLNMLKELFNDCRKSGTTLILSGVHTQPLFAMQQYGLADEIGEENIFGNIDDALDRARSLLGLPVQGRPAGF, does the protein have the property of solubility?\nOptions:\n0. Insoluble.\n1. Soluble. You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE output within <answer> </answer> tag.\nassistant\n", "<think>\nThe protein solubility can be predicted by the grand average of hydropathicity (GRAVY) scale. The GRAVY scale is based on the assumption that a protein with a large number of hydrophilic residues will have a negative value and a protein with a large number of hydrophobic residues will have a positive value. The GRAVY scale is calculated by summing the hydropathy values of all amino acid residues in a protein sequence and dividing by the number of residues in the sequence. The hydropathy value of an amino acid residue is a measure of its hydrophobicity, which is based on its chemical properties. The hydropathy value of an amino acid residue can be calculated using a hydrophobicity scale, which is a table that assigns a numerical value to each amino acid residue based on its hydrophobicity. The amino acid sequence of the protein is: PNAIVNRSIPEGVDVFEISGPLFFGAASKFKDAMHVVEKAPSIRILRMRKVMSIDATGLNMLKELFNDCRKSGTTLILSGVHTQPLFAMQQYGLADEIGEENIFGNIDDALDRARSLLGLPVQGRPAGF. 
The hydropathy value of each amino acid residue in the sequence is: 0.057, 0.069, 0.08, 0.077, 0.038, 0.05, 0.065, 0.07, 0.057, 0.056, 0.08, 0.059, 0.06, 0.057, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 
0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, ", 0, 0.0, "system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.\nuser\nGiven the protein amino acid sequence: MTYVLAESPEFYQDNVTDYTGQISSSDITNIQAAIDDVKASEQKVIFVVFLSSFDGVDPETWTQQALQANGGGNVLIYALAPEERQYGIQGGTQWTDAELDAANNAAFQALSQEDWAGSALALAESVGSSSSSSLEHHHHHH, does the protein have the property of solubility?\nOptions:\n0. Insoluble.\n1. Soluble. You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE output within <answer> </answer> tag.\nassistant\n", "<think>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n</think>\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. 
Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. 
A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. 
The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. 
In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. 
The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. 
Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. 
The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. 
Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydroph", 0, 0.9835715293884277, "system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.\nuser\nGiven the protein amino acid sequence: MGHHHHHHSHMDSKAIAERIKKLQAQQEHVRIGGKGTPRRKKKVIHKTAAADDKKLQSNLKKLSVTNIPGIEEVNMIKDDGTVIHFNNPKVQTSVPANTFSVTGSADNKQITEMLPGILNQLGPESLTHLKKLANNVTKLGPDGKGEDEDVPELVGDFDAASKNETKADEQ, does the protein have the property of solubility?\nOptions:\n0. Insoluble.\n1. Soluble. You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. 
The final answer MUST BE output within <answer> </answer> tag.\nassistant\n", "<think>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins tend to be more soluble than hydrophobic proteins. The presence of a large number of hydrophobic amino acids, such as phenylalanine, tryptophan, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 11 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. 
The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. 
The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. 
In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. 
The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. 
In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. 
The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. 
The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine", 0, 0.0]]}
|
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/media/table/val/generations_104_ccf2e4d7f5a8bd440fdc.table.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/media/table/val/generations_109_dfde4027f4468ecb4bdb.table.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/output.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/requirements.txt
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
setproctitle==1.2.2
|
| 2 |
+
colorama==0.4.6
|
| 3 |
+
psutil==7.0.0
|
| 4 |
+
numpy==2.2.6
|
| 5 |
+
pylatexenc==2.10
|
| 6 |
+
pyzmq==26.4.0
|
| 7 |
+
email_validator==2.2.0
|
| 8 |
+
pyasn1==0.6.1
|
| 9 |
+
requests==2.32.4
|
| 10 |
+
omegaconf==2.3.0
|
| 11 |
+
tzdata==2025.2
|
| 12 |
+
yarl==1.20.1
|
| 13 |
+
watchfiles==1.0.5
|
| 14 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 15 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 16 |
+
proto-plus==1.26.1
|
| 17 |
+
wandb==0.20.1
|
| 18 |
+
scipy==1.15.3
|
| 19 |
+
googleapis-common-protos==1.70.0
|
| 20 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 21 |
+
opentelemetry-exporter-otlp-proto-http==1.26.0
|
| 22 |
+
vllm==0.8.5.post1
|
| 23 |
+
sniffio==1.3.1
|
| 24 |
+
ray==2.47.0
|
| 25 |
+
python-dateutil==2.9.0.post0
|
| 26 |
+
Deprecated==1.2.18
|
| 27 |
+
setuptools==78.1.1
|
| 28 |
+
aiosignal==1.3.2
|
| 29 |
+
platformdirs==4.3.8
|
| 30 |
+
regex==2024.11.6
|
| 31 |
+
aiohappyeyeballs==2.6.1
|
| 32 |
+
uvloop==0.21.0
|
| 33 |
+
opentelemetry-semantic-conventions-ai==0.4.9
|
| 34 |
+
virtualenv==20.31.2
|
| 35 |
+
rich==14.0.0
|
| 36 |
+
accelerate==1.7.0
|
| 37 |
+
datasets==3.6.0
|
| 38 |
+
python-dotenv==1.1.0
|
| 39 |
+
antlr4-python3-runtime==4.9.3
|
| 40 |
+
gitdb==4.0.12
|
| 41 |
+
six==1.17.0
|
| 42 |
+
smmap==5.0.2
|
| 43 |
+
opencensus==0.11.4
|
| 44 |
+
annotated-types==0.7.0
|
| 45 |
+
xxhash==3.5.0
|
| 46 |
+
frozenlist==1.7.0
|
| 47 |
+
Jinja2==3.1.6
|
| 48 |
+
interegular==0.3.3
|
| 49 |
+
opentelemetry-semantic-conventions==0.47b0
|
| 50 |
+
jiter==0.10.0
|
| 51 |
+
idna==3.10
|
| 52 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 53 |
+
propcache==0.3.2
|
| 54 |
+
nest-asyncio==1.6.0
|
| 55 |
+
sentencepiece==0.2.0
|
| 56 |
+
fastapi==0.115.12
|
| 57 |
+
verl==0.3.1.dev0
|
| 58 |
+
compressed-tensors==0.9.3
|
| 59 |
+
typing-inspection==0.4.1
|
| 60 |
+
gguf==0.17.0
|
| 61 |
+
dnspython==2.7.0
|
| 62 |
+
wheel==0.45.1
|
| 63 |
+
python-multipart==0.0.20
|
| 64 |
+
cupy-cuda12x==13.4.1
|
| 65 |
+
xgrammar==0.1.18
|
| 66 |
+
starlette==0.46.2
|
| 67 |
+
peft==0.15.2
|
| 68 |
+
blake3==1.0.5
|
| 69 |
+
torchdata==0.11.0
|
| 70 |
+
qwen-vl-utils==0.0.11
|
| 71 |
+
sentry-sdk==2.30.0
|
| 72 |
+
Pygments==2.19.1
|
| 73 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 74 |
+
aiohttp-cors==0.8.1
|
| 75 |
+
outlines_core==0.1.26
|
| 76 |
+
partial-json-parser==0.2.1.1.post5
|
| 77 |
+
filelock==3.18.0
|
| 78 |
+
tensordict==0.8.3
|
| 79 |
+
cloudpickle==3.1.1
|
| 80 |
+
torchaudio==2.6.0
|
| 81 |
+
pandas==2.3.0
|
| 82 |
+
tiktoken==0.9.0
|
| 83 |
+
av==14.4.0
|
| 84 |
+
flash-attn==2.7.1.post1
|
| 85 |
+
certifi==2025.4.26
|
| 86 |
+
nvidia-nvtx-cu12==12.4.127
|
| 87 |
+
numba==0.61.2
|
| 88 |
+
networkx==3.4.2
|
| 89 |
+
sympy==1.13.1
|
| 90 |
+
pyarrow==20.0.0
|
| 91 |
+
modelscope==1.27.0
|
| 92 |
+
mpmath==1.3.0
|
| 93 |
+
codetiming==1.4.0
|
| 94 |
+
triton==3.2.0
|
| 95 |
+
referencing==0.36.2
|
| 96 |
+
dill==0.3.8
|
| 97 |
+
opencv-python-headless==4.11.0.86
|
| 98 |
+
mathruler==0.1.0
|
| 99 |
+
jsonschema-specifications==2025.4.1
|
| 100 |
+
tokenizers==0.21.1
|
| 101 |
+
huggingface-hub==0.33.0
|
| 102 |
+
rich-toolkit==0.14.7
|
| 103 |
+
fastapi-cli==0.0.7
|
| 104 |
+
python-json-logger==3.3.0
|
| 105 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 106 |
+
httptools==0.6.4
|
| 107 |
+
mdurl==0.1.2
|
| 108 |
+
h11==0.16.0
|
| 109 |
+
distlib==0.3.9
|
| 110 |
+
airportsdata==20250523
|
| 111 |
+
transformers==4.52.4
|
| 112 |
+
opentelemetry-exporter-prometheus==0.55b1
|
| 113 |
+
fsspec==2025.3.0
|
| 114 |
+
diskcache==5.6.3
|
| 115 |
+
click==8.2.1
|
| 116 |
+
websockets==15.0.1
|
| 117 |
+
lark==1.2.2
|
| 118 |
+
uvicorn==0.34.3
|
| 119 |
+
grpcio==1.73.0
|
| 120 |
+
pillow==11.2.1
|
| 121 |
+
anyio==4.9.0
|
| 122 |
+
opentelemetry-exporter-otlp-proto-grpc==1.26.0
|
| 123 |
+
pydantic==2.11.6
|
| 124 |
+
wrapt==1.17.2
|
| 125 |
+
opentelemetry-api==1.26.0
|
| 126 |
+
nvidia-curand-cu12==10.3.5.147
|
| 127 |
+
rpds-py==0.25.1
|
| 128 |
+
exceptiongroup==1.3.0
|
| 129 |
+
msgpack==1.1.1
|
| 130 |
+
async-timeout==5.0.1
|
| 131 |
+
protobuf==4.25.8
|
| 132 |
+
httpx==0.28.1
|
| 133 |
+
opentelemetry-proto==1.26.0
|
| 134 |
+
nvidia-nccl-cu12==2.21.5
|
| 135 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 136 |
+
aiohttp==3.12.12
|
| 137 |
+
urllib3==2.4.0
|
| 138 |
+
smart-open==7.1.0
|
| 139 |
+
markdown-it-py==3.0.0
|
| 140 |
+
packaging==25.0
|
| 141 |
+
charset-normalizer==3.4.2
|
| 142 |
+
py-spy==0.4.0
|
| 143 |
+
setproctitle==1.3.6
|
| 144 |
+
safetensors==0.5.3
|
| 145 |
+
pyasn1_modules==0.4.2
|
| 146 |
+
jsonschema==4.24.0
|
| 147 |
+
astor==0.8.1
|
| 148 |
+
shellingham==1.5.4
|
| 149 |
+
pytz==2025.2
|
| 150 |
+
distro==1.9.0
|
| 151 |
+
google-api-core==2.25.1
|
| 152 |
+
rsa==4.9.1
|
| 153 |
+
multiprocess==0.70.16
|
| 154 |
+
colorful==0.5.6
|
| 155 |
+
einops==0.8.1
|
| 156 |
+
lm-format-enforcer==0.10.11
|
| 157 |
+
pydantic_core==2.33.2
|
| 158 |
+
mistral_common==1.6.2
|
| 159 |
+
opentelemetry-sdk==1.26.0
|
| 160 |
+
ninja==1.11.1.4
|
| 161 |
+
typing_extensions==4.14.0
|
| 162 |
+
depyf==0.18.0
|
| 163 |
+
attrs==25.3.0
|
| 164 |
+
tqdm==4.67.1
|
| 165 |
+
openai==1.86.0
|
| 166 |
+
xformers==0.0.29.post2
|
| 167 |
+
prometheus-fastapi-instrumentator==7.1.0
|
| 168 |
+
torch==2.6.0
|
| 169 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 170 |
+
opencensus-context==0.1.3
|
| 171 |
+
importlib_metadata==8.0.0
|
| 172 |
+
orjson==3.10.18
|
| 173 |
+
psutil==7.0.0
|
| 174 |
+
liger_kernel==0.5.10
|
| 175 |
+
pycountry==24.6.1
|
| 176 |
+
zipp==3.23.0
|
| 177 |
+
pip==25.1
|
| 178 |
+
MarkupSafe==3.0.2
|
| 179 |
+
opentelemetry-exporter-otlp-proto-common==1.26.0
|
| 180 |
+
llguidance==0.7.29
|
| 181 |
+
llvmlite==0.44.0
|
| 182 |
+
multidict==6.4.4
|
| 183 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 184 |
+
GitPython==3.1.44
|
| 185 |
+
fastrlock==0.8.3
|
| 186 |
+
hf-xet==1.1.3
|
| 187 |
+
PyYAML==6.0.2
|
| 188 |
+
opentelemetry-exporter-otlp==1.26.0
|
| 189 |
+
typer==0.16.0
|
| 190 |
+
msgspec==0.19.0
|
| 191 |
+
google-auth==2.40.3
|
| 192 |
+
cachetools==5.5.2
|
| 193 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 194 |
+
torchvision==0.21.0
|
| 195 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 196 |
+
outlines==0.1.11
|
| 197 |
+
prometheus_client==0.22.1
|
| 198 |
+
httpcore==1.0.9
|
| 199 |
+
py-cpuinfo==9.0.0
|
| 200 |
+
verl==0.3.1.dev0
|
| 201 |
+
jaraco.functools==4.0.1
|
| 202 |
+
inflect==7.3.1
|
| 203 |
+
jaraco.collections==5.1.0
|
| 204 |
+
packaging==24.2
|
| 205 |
+
wheel==0.45.1
|
| 206 |
+
tomli==2.0.1
|
| 207 |
+
platformdirs==4.2.2
|
| 208 |
+
typing_extensions==4.12.2
|
| 209 |
+
more-itertools==10.3.0
|
| 210 |
+
autocommand==2.2.2
|
| 211 |
+
jaraco.text==3.12.1
|
| 212 |
+
importlib_metadata==8.0.0
|
| 213 |
+
jaraco.context==5.3.0
|
| 214 |
+
zipp==3.19.2
|
| 215 |
+
backports.tarfile==1.2.0
|
| 216 |
+
typeguard==4.3.0
|
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.0",
|
| 4 |
+
"startedAt": "2025-06-14T11:47:44.287213Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--node-ip-address=10.1.5.10",
|
| 7 |
+
"--node-manager-port=45027",
|
| 8 |
+
"--object-store-name=/tmp/ray/session_2025-06-14_19-45-36_987346_61061/sockets/plasma_store",
|
| 9 |
+
"--raylet-name=/tmp/ray/session_2025-06-14_19-45-36_987346_61061/sockets/raylet",
|
| 10 |
+
"--redis-address=None",
|
| 11 |
+
"--metrics-agent-port=63241",
|
| 12 |
+
"--logging-rotate-bytes=536870912",
|
| 13 |
+
"--logging-rotate-backup-count=5",
|
| 14 |
+
"--runtime-env-agent-port=58728",
|
| 15 |
+
"--gcs-address=10.1.5.10:60836",
|
| 16 |
+
"--session-name=session_2025-06-14_19-45-36_987346_61061",
|
| 17 |
+
"--temp-dir=/tmp/ray",
|
| 18 |
+
"--webui=127.0.0.1:8265",
|
| 19 |
+
"--cluster-id=a6a80f53697854b083283d963e7ac5cff199d6477d3b4c5f6f0ddfec",
|
| 20 |
+
"--startup-token=64",
|
| 21 |
+
"--worker-launch-time-ms=1749901539376",
|
| 22 |
+
"--node-id=abb2963d276b09969a5a74d875c4ca780bcf729fc5d17badc158c62f",
|
| 23 |
+
"--runtime-env-hash=-115784934",
|
| 24 |
+
"--enable-resource-isolation=false"
|
| 25 |
+
],
|
| 26 |
+
"program": "/root/miniconda3/envs/easyr1/lib/python3.10/site-packages/ray/_private/workers/default_worker.py",
|
| 27 |
+
"email": "gia0603yucca@gmail.com",
|
| 28 |
+
"root": "/nas/shared/kilab/wangyujia/EasyR1/examples",
|
| 29 |
+
"host": "dsw-251511-7876c679d4-fpqq8",
|
| 30 |
+
"executable": "/root/miniconda3/envs/easyr1/bin/python3",
|
| 31 |
+
"cpu_count": 64,
|
| 32 |
+
"cpu_count_logical": 64,
|
| 33 |
+
"gpu": "NVIDIA A800-SXM4-80GB",
|
| 34 |
+
"gpu_count": 8,
|
| 35 |
+
"disk": {
|
| 36 |
+
"/": {
|
| 37 |
+
"total": "1623302262784",
|
| 38 |
+
"used": "799809536"
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"memory": {
|
| 42 |
+
"total": "549755813888"
|
| 43 |
+
},
|
| 44 |
+
"cpu": {
|
| 45 |
+
"count": 64,
|
| 46 |
+
"countLogical": 64
|
| 47 |
+
},
|
| 48 |
+
"gpu_nvidia": [
|
| 49 |
+
{
|
| 50 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 51 |
+
"architecture": "Ampere",
|
| 52 |
+
"uuid": "GPU-b6d61b63-2b46-d2f7-d450-38c1353782cb"
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 56 |
+
"architecture": "Ampere",
|
| 57 |
+
"uuid": "GPU-7a6b18cf-311a-f939-3dcc-25b0f1fc898c"
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 61 |
+
"architecture": "Ampere",
|
| 62 |
+
"uuid": "GPU-17cbe6f8-0339-60ce-23bb-dbd33795c1ad"
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 66 |
+
"architecture": "Ampere",
|
| 67 |
+
"uuid": "GPU-799578dd-bc6a-3b56-26b8-3941889669c9"
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 71 |
+
"architecture": "Ampere",
|
| 72 |
+
"uuid": "GPU-c783413d-e4e1-22c5-7c48-9296c28b08a0"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 76 |
+
"architecture": "Ampere",
|
| 77 |
+
"uuid": "GPU-0ad82850-a679-fa6b-9200-a26edb1bb8a4"
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 81 |
+
"architecture": "Ampere",
|
| 82 |
+
"uuid": "GPU-e73b7d7b-4455-62ee-ec7e-a2eb1d845e07"
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"name": "NVIDIA A800-SXM4-80GB",
|
| 86 |
+
"architecture": "Ampere",
|
| 87 |
+
"uuid": "GPU-71ee45de-57b2-ac7c-13c1-08a1f197eb20"
|
| 88 |
+
}
|
| 89 |
+
],
|
| 90 |
+
"cudaVersion": "12.1"
|
| 91 |
+
}
|
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"critic/rewards/max":0.9997167587280273,"response_length/max":8192,"_runtime":145264.000918934,"timing_s/step":1183.4929371809994,"global_seqlen/max":1146817,"reward/format":0,"perf/cpu_memory_used_gb":407.5828285217285,"timing_s/old":102.70515622899984,"critic/returns/min":-2.4748668670654297,"actor/kl_loss":0.037933181738480926,"actor/pg_loss":-0.00841897885722176,"critic/advantages/min":-2.4748668670654297,"prompt_length/mean":314.70703125,"val/repeat_reward":0.0010451679389658478,"reward/overall":0.9458340392351747,"perf/max_memory_reserved_gb":75.796875,"prompt_length/clip_ratio":0,"perf/total_num_tokens":9002567,"prompt_length/max":1134,"perf/throughput":950.8471403981832,"critic/returns/max":2.3680503368377686,"actor/pg_clipfrac_higher":1.9632523688528636e-05,"val/generations":{"path":"media/table/val/generations_119_4e3f574f210c177c5437.table.json","nrows":25,"sha256":"4e3f574f210c177c54379d563cfe05fd219c32811853206cd37aa24896b5186c","ncols":13,"_type":"table-file","artifact_path":"wandb-client-artifact://76aqt3ola7mmf1zjvk79h2cs59rqhaslo8w0fqq45cu2m3evdeejs118qyegtnhx46a5806ogvbs8vt9zt1p7edrxhkmnyp31ey47kna9056mczftfqho4mygicga34i/val/generations.table.json","size":1223506,"log_mode":"IMMUTABLE","_latest_artifact_path":"wandb-client-artifact://h7flxr8fshgu5a89hy9khafst185p799hj8ttotsx8jkk8kyp037hrhnb8iekl0mne5kwynukkv20uhj63vd9zodj22j0nbm6o6w0bwwcqj2b8giffq60c52ebj5qneu:latest/val/generations.table.json"},"timing_s/gen":335.0029417749902,"critic/score/mean":0.9458340406417847,"_wandb":{"runtime":145264},"actor/pg_clipfrac_lower":0,"_step":124,"val/overall_reward":0.0010451679389658478,"critic/advantages/max":2.3680503368377686,"timing_per_token_ms/adv":0.02330009541756115,"timing_s/save_checkpoint":122.09420545600005,"reward/repeat":0.9458340392351747,"actor/ppo_kl":-3.353993864774196e-05,"critic/score/max":0.9997167587280273,"actor/lr":1e-06,"timing_per_token_ms/reward":0.00020019505992198455,"critic/rewards/mean":0.9458340406417847,"response_length/clip_
ratio":0.01806640625,"critic/returns/mean":0.011055756360292435,"actor/kl_coef":0.01,"timing_per_token_ms/gen":0.04008148575558264,"perf/time_per_step":1183.4929371809994,"prompt_length/min":138,"global_seqlen/min":1109211,"global_seqlen/balanced_max":1125321,"_timestamp":1.7500469249759648e+09,"val/accuracy_reward":0,"critic/score/min":0,"timing_s/ref":98.26504057302373,"response_length/min":1542,"global_seqlen/balanced_min":1125320,"critic/rewards/min":0,"timing_s/update_actor":435.6776136950066,"perf/max_memory_allocated_gb":38.11449068784714,"perf/mfu_actor":0.42861793360235734,"response_length/mean":4081.07763671875,"reward/accuracy":0,"val/format_reward":0,"actor/entropy_loss":4.0293768817791715,"timing_s/reward":1.6732397199957632,"timing_s/adv":209.7606701029872,"timing_per_token_ms/update_actor":0.04839482046565236,"timing_per_token_ms/ref":0.01091522457683722,"timing_per_token_ms/old":0.011408430087662756,"timing_s/validation":2072.6338901569834,"val/reward_score":0.001045167911797762,"global_seqlen/minmax_diff":37606,"critic/advantages/mean":0.011055756360292435,"actor/grad_norm":0.12824900448322296,"global_seqlen/mean":1.125320875e+06}
|
EasyR1/verl.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: verl
|
| 3 |
+
Version: 0.3.1.dev0
|
| 4 |
+
Summary: An Efficient, Scalable, Multi-Modality RL Training Framework based on veRL
|
| 5 |
+
Home-page: https://github.com/volcengine/verl
|
| 6 |
+
Author: verl
|
| 7 |
+
Author-email: zhangchi.usc1992@bytedance.com, gmsheng@connect.hku.hk, hiyouga@buaa.edu.cn
|
| 8 |
+
License: Apache 2.0 License
|
| 9 |
+
Platform: UNKNOWN
|
| 10 |
+
Requires-Python: >=3.9.0
|
| 11 |
+
Description-Content-Type: text/markdown
|
| 12 |
+
License-File: LICENSE
|
| 13 |
+
Requires-Dist: accelerate
|
| 14 |
+
Requires-Dist: codetiming
|
| 15 |
+
Requires-Dist: datasets
|
| 16 |
+
Requires-Dist: liger-kernel
|
| 17 |
+
Requires-Dist: mathruler
|
| 18 |
+
Requires-Dist: numpy
|
| 19 |
+
Requires-Dist: omegaconf
|
| 20 |
+
Requires-Dist: pandas
|
| 21 |
+
Requires-Dist: peft
|
| 22 |
+
Requires-Dist: pillow
|
| 23 |
+
Requires-Dist: pyarrow>=15.0.0
|
| 24 |
+
Requires-Dist: pylatexenc
|
| 25 |
+
Requires-Dist: qwen-vl-utils
|
| 26 |
+
Requires-Dist: ray[default]
|
| 27 |
+
Requires-Dist: tensordict
|
| 28 |
+
Requires-Dist: torchdata
|
| 29 |
+
Requires-Dist: transformers>=4.51.0
|
| 30 |
+
Requires-Dist: vllm>=0.7.3
|
| 31 |
+
Requires-Dist: wandb
|
| 32 |
+
Provides-Extra: dev
|
| 33 |
+
Requires-Dist: pre-commit; extra == "dev"
|
| 34 |
+
Requires-Dist: ruff; extra == "dev"
|
| 35 |
+
Dynamic: author
|
| 36 |
+
Dynamic: author-email
|
| 37 |
+
Dynamic: description
|
| 38 |
+
Dynamic: description-content-type
|
| 39 |
+
Dynamic: home-page
|
| 40 |
+
Dynamic: license
|
| 41 |
+
Dynamic: license-file
|
| 42 |
+
Dynamic: provides-extra
|
| 43 |
+
Dynamic: requires-dist
|
| 44 |
+
Dynamic: requires-python
|
| 45 |
+
Dynamic: summary
|
| 46 |
+
|
| 47 |
+
# EasyR1: An Efficient, Scalable, Multi-Modality RL Training Framework
|
| 48 |
+
|
| 49 |
+
[](https://github.com/hiyouga/EasyR1/stargazers)
|
| 50 |
+
[](https://twitter.com/llamafactory_ai)
|
| 51 |
+
|
| 52 |
+
This project is a clean fork of the original [veRL](https://github.com/volcengine/verl) project to support vision language models, we thank all the authors for providing such a high-performance RL training framework.
|
| 53 |
+
|
| 54 |
+
EasyR1 is efficient and scalable due to the design of **[HybirdEngine](https://arxiv.org/abs/2409.19256)** and the latest release of **[vLLM](https://github.com/vllm-project/vllm)**'s SPMD mode.
|
| 55 |
+
|
| 56 |
+
## Features
|
| 57 |
+
|
| 58 |
+
- Supported models
|
| 59 |
+
- Llama3/Qwen2/Qwen2.5/Qwen3 language models
|
| 60 |
+
- Qwen2/Qwen2.5-VL vision language models
|
| 61 |
+
- DeepSeek-R1 distill models
|
| 62 |
+
|
| 63 |
+
- Supported algorithms
|
| 64 |
+
- GRPO
|
| 65 |
+
- Reinforce++
|
| 66 |
+
- ReMax
|
| 67 |
+
- RLOO
|
| 68 |
+
|
| 69 |
+
- Supported datasets
|
| 70 |
+
- Any text, vision-text dataset in a [specific format](#custom-dataset)
|
| 71 |
+
|
| 72 |
+
- Supported tricks
|
| 73 |
+
- Padding-free training
|
| 74 |
+
- Resuming from checkpoint
|
| 75 |
+
- Wandb & SwanLab & Mlflow & Tensorboard tracking
|
| 76 |
+
|
| 77 |
+
## Requirements
|
| 78 |
+
|
| 79 |
+
### Software Requirements
|
| 80 |
+
|
| 81 |
+
- Python 3.9+
|
| 82 |
+
- transformers>=4.51.0
|
| 83 |
+
- flash-attn>=2.4.3
|
| 84 |
+
- vllm>=0.8.3
|
| 85 |
+
|
| 86 |
+
We provide a [Dockerfile](./Dockerfile) to easily build environments.
|
| 87 |
+
|
| 88 |
+
We recommend using the [pre-built docker image](https://hub.docker.com/r/hiyouga/verl) in EasyR1.
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
docker pull hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
### Hardware Requirements
|
| 95 |
+
|
| 96 |
+
\* *estimated*
|
| 97 |
+
|
| 98 |
+
| Method | Bits | 1.5B | 3B | 7B | 32B |
|
| 99 |
+
| ------------------------ | ---- | ------ | ------ | ------ | ------- |
|
| 100 |
+
| GRPO Full Fine-Tuning | AMP | 2*24GB | 4*40GB | 8*40GB | 16*80GB |
|
| 101 |
+
| GRPO Full Fine-Tuning | BF16 | 1*24GB | 1*40GB | 4*40GB | 8*80GB |
|
| 102 |
+
|
| 103 |
+
> [!NOTE]
|
| 104 |
+
> Use `worker.actor.fsdp.torch_dtype=bf16` and `worker.actor.optim.strategy=adamw_bf16` to enable bf16 training.
|
| 105 |
+
>
|
| 106 |
+
> We are working hard to reduce the VRAM in RL training, LoRA support will be integrated in next updates.
|
| 107 |
+
|
| 108 |
+
## Tutorial: Run Qwen2.5-VL GRPO on [Geometry3K](https://huggingface.co/datasets/hiyouga/geometry3k) Dataset in Just 3 Steps
|
| 109 |
+
|
| 110 |
+

|
| 111 |
+
|
| 112 |
+
### Installation
|
| 113 |
+
|
| 114 |
+
```bash
|
| 115 |
+
git clone https://github.com/hiyouga/EasyR1.git
|
| 116 |
+
cd EasyR1
|
| 117 |
+
pip install -e .
|
| 118 |
+
```
|
| 119 |
+
|
| 120 |
+
### GRPO Training
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
bash examples/qwen2_5_vl_7b_geo3k_grpo.sh
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
### Merge Checkpoint in Hugging Face Format
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
python3 scripts/model_merger.py --local_dir checkpoints/easy_r1/exp_name/global_step_1/actor
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
> [!TIP]
|
| 133 |
+
> If you encounter issues with connecting to Hugging Face, consider using `export HF_ENDPOINT=https://hf-mirror.com`.
|
| 134 |
+
>
|
| 135 |
+
> If you want to use SwanLab logger, consider using `bash examples/qwen2_5_vl_7b_geo3k_swanlab.sh`.
|
| 136 |
+
|
| 137 |
+
## Custom Dataset
|
| 138 |
+
|
| 139 |
+
Please refer to the example datasets to prepare your own dataset.
|
| 140 |
+
|
| 141 |
+
- Text dataset: https://huggingface.co/datasets/hiyouga/math12k
|
| 142 |
+
- Image-text dataset: https://huggingface.co/datasets/hiyouga/geometry3k
|
| 143 |
+
- Multi-image-text dataset: https://huggingface.co/datasets/hiyouga/journeybench-multi-image-vqa
|
| 144 |
+
|
| 145 |
+
## How to Understand GRPO in EasyR1
|
| 146 |
+
|
| 147 |
+

|
| 148 |
+
|
| 149 |
+
- To learn about the GRPO algorithm, you can refer to [Hugging Face's blog](https://huggingface.co/docs/trl/v0.16.1/en/grpo_trainer).
|
| 150 |
+
|
| 151 |
+
## How to Run 70B+ Model in Multi-node Environment
|
| 152 |
+
|
| 153 |
+
1. Start the Ray head node.
|
| 154 |
+
|
| 155 |
+
```bash
|
| 156 |
+
ray start --head --port=6379 --dashboard-host=0.0.0.0
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
2. Start the Ray worker node and connect to the head node.
|
| 160 |
+
|
| 161 |
+
```bash
|
| 162 |
+
ray start --address=<head_node_ip>:6379
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
3. Check the Ray resource pool.
|
| 166 |
+
|
| 167 |
+
```bash
|
| 168 |
+
ray status
|
| 169 |
+
```
|
| 170 |
+
|
| 171 |
+
4. Run training script on the Ray head node only.
|
| 172 |
+
|
| 173 |
+
```bash
|
| 174 |
+
bash examples/qwen2_5_vl_7b_geo3k_grpo.sh
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
See the **[veRL's official doc](https://verl.readthedocs.io/en/latest/start/multinode.html)** for more details about multi-node training and Ray debugger.
|
| 178 |
+
|
| 179 |
+
## Other Baselines
|
| 180 |
+
|
| 181 |
+
We also reproduced the following two baselines of the [R1-V](https://github.com/deep-agent/R1-V) project.
|
| 182 |
+
- [CLEVR-70k-Counting](examples/baselines/qwen2_5_vl_3b_clevr.sh): Train the Qwen2.5-VL-3B-Instruct model on counting problem.
|
| 183 |
+
- [GeoQA-8k](examples/baselines/qwen2_5_vl_3b_geoqa8k.sh): Train the Qwen2.5-VL-3B-Instruct model on GeoQA problem.
|
| 184 |
+
|
| 185 |
+
## Performance Baselines
|
| 186 |
+
|
| 187 |
+
See [baselines.md](assets/baselines.md).
|
| 188 |
+
|
| 189 |
+
## Awesome Work using EasyR1
|
| 190 |
+
|
| 191 |
+
- **MMR1**: Advancing the Frontiers of Multimodal Reasoning. [![[code]](https://img.shields.io/github/stars/LengSicong/MMR1)](https://github.com/LengSicong/MMR1)
|
| 192 |
+
- **Vision-R1**: Incentivizing Reasoning Capability in Multimodal Large Language Models. [![[code]](https://img.shields.io/github/stars/Osilly/Vision-R1)](https://github.com/Osilly/Vision-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.06749-blue)](https://arxiv.org/abs/2503.06749)
|
| 193 |
+
- **Seg-Zero**: Reasoning-Chain Guided Segmentation via Cognitive Reinforcement. [![[code]](https://img.shields.io/github/stars/dvlab-research/Seg-Zero)](https://github.com/dvlab-research/Seg-Zero) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.06520-blue)](https://arxiv.org/abs/2503.06520)
|
| 194 |
+
- **MetaSpatial**: Reinforcing 3D Spatial Reasoning in VLMs for the Metaverse. [![[code]](https://img.shields.io/github/stars/PzySeere/MetaSpatial)](https://github.com/PzySeere/MetaSpatial) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.18470-blue)](https://arxiv.org/abs/2503.18470)
|
| 195 |
+
- **Temporal-R1**: Envolving Temporal Reasoning Capability into LMMs via Temporal Consistent Reward. [![[code]](https://img.shields.io/github/stars/appletea233/Temporal-R1)](https://github.com/appletea233/Temporal-R1)
|
| 196 |
+
- **NoisyRollout**: Reinforcing Visual Reasoning with Data Augmentation. [![[code]](https://img.shields.io/github/stars/John-AI-Lab/NoisyRollout)](https://github.com/John-AI-Lab/NoisyRollout) [![[arxiv]](https://img.shields.io/badge/arxiv-2504.13055-blue)](https://arxiv.org/pdf/2504.13055)
|
| 197 |
+
- **GUI-R1**: A Generalist R1-Style Vision-Language Action Model For GUI Agents. [![[code]](https://img.shields.io/github/stars/ritzz-ai/GUI-R1)](https://github.com/ritzz-ai/GUI-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2504.10458-blue)](https://arxiv.org/abs/2504.10458)
|
| 198 |
+
- **R1-Track**: Direct Application of MLLMs to Visual Object Tracking via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/Wangbiao2/R1-Track)](https://github.com/Wangbiao2/R1-Track)
|
| 199 |
+
- **VisionReasoner**: Unified Visual Perception and Reasoning via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/dvlab-research/VisionReasoner)](https://github.com/dvlab-research/VisionReasoner) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.12081-blue)](https://arxiv.org/abs/2505.12081)
|
| 200 |
+
- **MM-UPT**: Unsupervised Post-Training for Multi-Modal LLM Reasoning via GRPO. [![[code]](https://img.shields.io/github/stars/waltonfuture/MM-UPT)](https://github.com/waltonfuture/MM-UPT) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22453-blue)](https://arxiv.org/pdf/2505.22453)
|
| 201 |
+
- **RL-with-Cold-Start**: Advancing Multimodal Reasoning via Reinforcement Learning with Cold Start. [![[code]](https://img.shields.io/github/stars/waltonfuture/RL-with-Cold-Start)](https://github.com/waltonfuture/RL-with-Cold-Start) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22334-blue)](https://arxiv.org/pdf/2505.22334)
|
| 202 |
+
- **ViGoRL**: Grounded Reinforcement Learning for Visual Reasoning. [![[code]](https://img.shields.io/github/stars/Gabesarch/grounded-rl)](https://github.com/Gabesarch/grounded-rl) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22334-blue)](https://arxiv.org/abs/2505.23678)
|
| 203 |
+
- **Revisual-R1**: Advancing Multimodal Reasoning: From Optimized Cold Start to Staged Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/CSfufu/Revisual-R1)](https://github.com/CSfufu/Revisual-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2506.04207-blue)](https://arxiv.org/abs/2506.04207)
|
| 204 |
+
- **SophiaVL-R1**: Reinforcing MLLMs Reasoning with Thinking Reward. [![[code]](https://img.shields.io/github/stars/kxfan2002/SophiaVL-R1)](https://github.com/kxfan2002/SophiaVL-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.17018-blue)](https://arxiv.org/abs/2505.17018)
|
| 205 |
+
- **Vision-Matters**: Simple Visual Perturbations Can Boost Multimodal Math Reasoning. [![[code]](https://img.shields.io/github/stars/YutingLi0606/Vision-Matters)](https://github.com/YutingLi0606/Vision-Matters) [![[arxiv]](https://img.shields.io/badge/arxiv-2506.09736-blue)](https://arxiv.org/abs/2506.09736)
|
| 206 |
+
|
| 207 |
+
## TODO
|
| 208 |
+
|
| 209 |
+
- Support LoRA (high priority).
|
| 210 |
+
- Support ulysses parallelism for VLMs (middle priority).
|
| 211 |
+
- Support more VLM architectures.
|
| 212 |
+
|
| 213 |
+
> [!NOTE]
|
| 214 |
+
> We will not provide scripts for supervised fine-tuning and inference in this project. If you have such requirements, we recommend using [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory).
|
| 215 |
+
|
| 216 |
+
### Known bugs
|
| 217 |
+
|
| 218 |
+
These features are temporarily disabled for now, we plan to fix them one-by-one in the future updates.
|
| 219 |
+
|
| 220 |
+
- Vision language models are not compatible with ulysses parallelism yet.
|
| 221 |
+
|
| 222 |
+
## Discussion Group
|
| 223 |
+
|
| 224 |
+
👋 Join our [WeChat group](assets/wechat.jpg).
|
| 225 |
+
|
| 226 |
+
## FAQs
|
| 227 |
+
|
| 228 |
+
> ValueError: Image features and image tokens do not match: tokens: 8192, features 9800
|
| 229 |
+
|
| 230 |
+
Increase the `data.max_prompt_length` or reduce the `data.max_pixels`.
|
| 231 |
+
|
| 232 |
+
> RuntimeError: CUDA Error: out of memory at /workspace/csrc/cumem_allocator.cpp:62
|
| 233 |
+
|
| 234 |
+
Reduce the `worker.rollout.gpu_memory_utilization` and enable `worker.actor.offload.offload_params`.
|
| 235 |
+
|
| 236 |
+
> RuntimeError: 0 active drivers ([]). There should only be one.
|
| 237 |
+
|
| 238 |
+
Uninstall `deepspeed` from the current python environment.
|
| 239 |
+
|
| 240 |
+
## Citation
|
| 241 |
+
|
| 242 |
+
Core contributors: [Yaowei Zheng](https://github.com/hiyouga), [Junting Lu](https://github.com/AL-377), [Shenzhi Wang](https://github.com/Shenzhi-Wang), [Zhangchi Feng](https://github.com/BUAADreamer), [Dongdong Kuang](https://github.com/Kuangdd01) and Yuwen Xiong
|
| 243 |
+
|
| 244 |
+
We also thank Guangming Sheng and Chi Zhang for helpful discussions.
|
| 245 |
+
|
| 246 |
+
```bibtex
|
| 247 |
+
@misc{zheng2025easyr1,
|
| 248 |
+
title = {EasyR1: An Efficient, Scalable, Multi-Modality RL Training Framework},
|
| 249 |
+
author = {Yaowei Zheng, Junting Lu, Shenzhi Wang, Zhangchi Feng, Dongdong Kuang, Yuwen Xiong},
|
| 250 |
+
howpublished = {\url{https://github.com/hiyouga/EasyR1}},
|
| 251 |
+
year = {2025}
|
| 252 |
+
}
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
We also recommend citing the original work.
|
| 256 |
+
|
| 257 |
+
```bibtex
|
| 258 |
+
@article{sheng2024hybridflow,
|
| 259 |
+
title = {HybridFlow: A Flexible and Efficient RLHF Framework},
|
| 260 |
+
author = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
|
| 261 |
+
year = {2024},
|
| 262 |
+
journal = {arXiv preprint arXiv: 2409.19256}
|
| 263 |
+
}
|
| 264 |
+
```
|
EasyR1/verl.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
LICENSE
|
| 2 |
+
README.md
|
| 3 |
+
pyproject.toml
|
| 4 |
+
setup.py
|
| 5 |
+
./verl/__init__.py
|
| 6 |
+
./verl/protocol.py
|
| 7 |
+
./verl/models/__init__.py
|
| 8 |
+
./verl/models/monkey_patch.py
|
| 9 |
+
./verl/models/transformers/__init__.py
|
| 10 |
+
./verl/models/transformers/flash_attention_utils.py
|
| 11 |
+
./verl/models/transformers/qwen2_vl.py
|
| 12 |
+
./verl/single_controller/__init__.py
|
| 13 |
+
./verl/single_controller/base/__init__.py
|
| 14 |
+
./verl/single_controller/base/decorator.py
|
| 15 |
+
./verl/single_controller/base/worker.py
|
| 16 |
+
./verl/single_controller/base/worker_group.py
|
| 17 |
+
./verl/single_controller/base/register_center/__init__.py
|
| 18 |
+
./verl/single_controller/base/register_center/ray.py
|
| 19 |
+
./verl/single_controller/ray/__init__.py
|
| 20 |
+
./verl/single_controller/ray/base.py
|
| 21 |
+
./verl/trainer/__init__.py
|
| 22 |
+
./verl/trainer/config.py
|
| 23 |
+
./verl/trainer/core_algos.py
|
| 24 |
+
./verl/trainer/data_loader.py
|
| 25 |
+
./verl/trainer/main.py
|
| 26 |
+
./verl/trainer/metrics.py
|
| 27 |
+
./verl/trainer/ray_trainer.py
|
| 28 |
+
./verl/utils/__init__.py
|
| 29 |
+
./verl/utils/dataset.py
|
| 30 |
+
./verl/utils/flops_counter.py
|
| 31 |
+
./verl/utils/fsdp_utils.py
|
| 32 |
+
./verl/utils/model_utils.py
|
| 33 |
+
./verl/utils/py_functional.py
|
| 34 |
+
./verl/utils/seqlen_balancing.py
|
| 35 |
+
./verl/utils/tokenizer.py
|
| 36 |
+
./verl/utils/torch_dtypes.py
|
| 37 |
+
./verl/utils/torch_functional.py
|
| 38 |
+
./verl/utils/ulysses.py
|
| 39 |
+
./verl/utils/checkpoint/__init__.py
|
| 40 |
+
./verl/utils/checkpoint/checkpoint_manager.py
|
| 41 |
+
./verl/utils/checkpoint/fsdp_checkpoint_manager.py
|
| 42 |
+
./verl/utils/logger/__init__.py
|
| 43 |
+
./verl/utils/logger/gen_logger.py
|
| 44 |
+
./verl/utils/logger/logger.py
|
| 45 |
+
./verl/workers/__init__.py
|
| 46 |
+
./verl/workers/config.py
|
| 47 |
+
./verl/workers/fsdp_workers.py
|
| 48 |
+
./verl/workers/actor/__init__.py
|
| 49 |
+
./verl/workers/actor/base.py
|
| 50 |
+
./verl/workers/actor/config.py
|
| 51 |
+
./verl/workers/actor/dp_actor.py
|
| 52 |
+
./verl/workers/critic/__init__.py
|
| 53 |
+
./verl/workers/critic/base.py
|
| 54 |
+
./verl/workers/critic/config.py
|
| 55 |
+
./verl/workers/critic/dp_critic.py
|
| 56 |
+
./verl/workers/reward/__init__.py
|
| 57 |
+
./verl/workers/reward/config.py
|
| 58 |
+
./verl/workers/reward/function.py
|
| 59 |
+
./verl/workers/rollout/__init__.py
|
| 60 |
+
./verl/workers/rollout/base.py
|
| 61 |
+
./verl/workers/rollout/config.py
|
| 62 |
+
./verl/workers/rollout/vllm_rollout_spmd.py
|
| 63 |
+
./verl/workers/sharding_manager/__init__.py
|
| 64 |
+
./verl/workers/sharding_manager/base.py
|
| 65 |
+
./verl/workers/sharding_manager/fsdp_ulysses.py
|
| 66 |
+
./verl/workers/sharding_manager/fsdp_vllm.py
|
| 67 |
+
verl.egg-info/PKG-INFO
|
| 68 |
+
verl.egg-info/SOURCES.txt
|
| 69 |
+
verl.egg-info/dependency_links.txt
|
| 70 |
+
verl.egg-info/requires.txt
|
| 71 |
+
verl.egg-info/top_level.txt
|
EasyR1/verl.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
EasyR1/verl.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accelerate
|
| 2 |
+
codetiming
|
| 3 |
+
datasets
|
| 4 |
+
liger-kernel
|
| 5 |
+
mathruler
|
| 6 |
+
numpy
|
| 7 |
+
omegaconf
|
| 8 |
+
pandas
|
| 9 |
+
peft
|
| 10 |
+
pillow
|
| 11 |
+
pyarrow>=15.0.0
|
| 12 |
+
pylatexenc
|
| 13 |
+
qwen-vl-utils
|
| 14 |
+
ray[default]
|
| 15 |
+
tensordict
|
| 16 |
+
torchdata
|
| 17 |
+
transformers>=4.51.0
|
| 18 |
+
vllm>=0.7.3
|
| 19 |
+
wandb
|
| 20 |
+
|
| 21 |
+
[dev]
|
| 22 |
+
pre-commit
|
| 23 |
+
ruff
|
EasyR1/verl.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
verl
|
EasyR1/verl/workers/sharding_manager/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (373 Bytes). View file
|
|
|
EasyR1/verl/workers/sharding_manager/__pycache__/base.cpython-310.pyc
ADDED
|
Binary file (1.05 kB). View file
|
|
|
EasyR1/verl/workers/sharding_manager/__pycache__/fsdp_ulysses.cpython-310.pyc
ADDED
|
Binary file (2.4 kB). View file
|
|
|
EasyR1/verl/workers/sharding_manager/__pycache__/fsdp_vllm.cpython-310.pyc
ADDED
|
Binary file (5 kB). View file
|
|
|
EasyR1/verl/workers/sharding_manager/fsdp_ulysses.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
"""
|
| 15 |
+
Contains a resharding manager that binds weights from FSDP zero3 to XPerfGPT
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from torch.distributed.device_mesh import DeviceMesh
|
| 19 |
+
|
| 20 |
+
from ...protocol import DataProto, all_gather_data_proto
|
| 21 |
+
from ...utils.ulysses import get_ulysses_sequence_parallel_group, set_ulysses_sequence_parallel_group
|
| 22 |
+
from .base import BaseShardingManager
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class FSDPUlyssesShardingManager(BaseShardingManager):
    """Reshard batches between the FSDP data-parallel layout and Ulysses sequence parallelism.

    Used as a context manager: on entry the process-global Ulysses
    sequence-parallel group is switched to this manager's ``sp`` mesh group,
    and on exit the previous group is restored. When constructed with
    ``device_mesh=None`` every hook is a no-op.
    """

    def __init__(self, device_mesh: DeviceMesh):
        super().__init__()
        # May be None; all methods below become pass-throughs in that case.
        self.device_mesh = device_mesh

    def __enter__(self):
        if self.device_mesh is None:
            return

        # Stash the currently-active SP group so __exit__ can restore it.
        self.prev_sp_group = get_ulysses_sequence_parallel_group()
        set_ulysses_sequence_parallel_group(self.device_mesh["sp"].get_group())

    def __exit__(self, exc_type, exc_value, traceback):
        if self.device_mesh is None:
            return

        set_ulysses_sequence_parallel_group(self.prev_sp_group)

    def preprocess_data(self, data: DataProto) -> DataProto:
        """All-gather ``data`` across the sequence-parallel group.

        The batch arrives sharded along the FSDP (data-parallel) dimension;
        Ulysses requires every rank within an SP group to hold identical data.
        """
        if self.device_mesh is None:
            return data

        sp_mesh = self.device_mesh["sp"]
        all_gather_data_proto(data, size=sp_mesh.size(), group=sp_mesh.get_group())
        return data

    def postprocess_data(self, data: DataProto) -> DataProto:
        """Keep only this rank's chunk so the batch follows the FSDP partition again."""
        if self.device_mesh is None:
            return data

        sp_mesh = self.device_mesh["sp"]
        return data.chunk(chunks=sp_mesh.size())[sp_mesh.get_local_rank()]
|
EasyR1/verl/workers/sharding_manager/fsdp_vllm.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
import inspect
|
| 16 |
+
import re
|
| 17 |
+
from typing import Dict, Iterable, Tuple, Union
|
| 18 |
+
|
| 19 |
+
import torch
|
| 20 |
+
import torch.distributed as dist
|
| 21 |
+
from torch.distributed._tensor import DTensor
|
| 22 |
+
from torch.distributed.checkpoint.state_dict import get_model_state_dict
|
| 23 |
+
from torch.distributed.device_mesh import DeviceMesh
|
| 24 |
+
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
|
| 25 |
+
from transformers import PreTrainedModel
|
| 26 |
+
from vllm import LLM
|
| 27 |
+
from vllm.distributed import parallel_state as vllm_ps
|
| 28 |
+
|
| 29 |
+
from ...protocol import DataProto, all_gather_data_proto
|
| 30 |
+
from ...utils.model_utils import print_gpu_memory_usage
|
| 31 |
+
from .base import BaseShardingManager
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class FSDPVLLMShardingManager(BaseShardingManager):
    """Context manager that syncs FSDP-trained weights into a colocated vLLM engine.

    On entry: gathers the FSDP state dict, wakes the vLLM engine, and loads
    the (renamed) weights into the rollout model. On exit: puts vLLM to
    sleep, records how much GPU memory that freed, and restores training
    mode and CUDA RNG state. The ``preprocess_data``/``postprocess_data``
    pair makes every TP rank see the full batch and then keeps only its own
    chunk afterwards.
    """

    def __init__(
        self,
        module: FSDP,
        inference_engine: LLM,
        device_mesh: DeviceMesh,
    ):
        self.module = module
        self.inference_engine = inference_engine
        self.device_mesh = device_mesh

        # Cache tensor-parallel topology from vLLM's parallel state once.
        self.world_size = dist.get_world_size()
        self.tp_size = vllm_ps.get_tensor_model_parallel_world_size()
        self.tp_rank = vllm_ps.get_tensor_model_parallel_rank()
        self.tp_group = vllm_ps.get_tensor_model_parallel_group().device_group

        # Record freed bytes to estimate memory usage correctly
        # https://github.com/vllm-project/vllm/pull/11743#issuecomment-2754338119
        self.freed_bytes = 0

        # Note that torch_random_states may be different on each dp rank
        self.torch_random_states = torch.cuda.get_rng_state()
        # get a random rng states
        gen_dp_rank = self.device_mesh["dp"].get_local_rank()
        torch.cuda.manual_seed(gen_dp_rank + 1000)  # make sure all tp ranks have the same random states
        self.gen_random_states = torch.cuda.get_rng_state()
        torch.cuda.set_rng_state(self.torch_random_states)

    def _rename_weight_keys(self, actor_weights: Dict[str, Union[torch.Tensor, DTensor]], model: PreTrainedModel):
        # Map HF's converted checkpoint keys back to the original naming that
        # vLLM's load_weights expects.
        # convert state dict keys: https://github.com/huggingface/transformers/pull/38385
        if not hasattr(model, "_checkpoint_conversion_mapping"):
            return actor_weights

        reverse_key_mapping = {v: k for k, v in model._checkpoint_conversion_mapping.items()}
        original_weights = {}
        for key, value in actor_weights.items():
            for pattern, replacement in reverse_key_mapping.items():
                replacement = replacement.lstrip("^")  # strip off un-needed chars and patterns
                replacement = re.sub(r"\(.*\)", "", replacement)
                key, n_replace = re.subn(pattern, replacement, key)
                # Early exit of the loop
                if n_replace > 0:
                    break

            original_weights[key] = value

        return original_weights

    def _make_weight_iterator(
        self, actor_weights: Dict[str, Union[torch.Tensor, DTensor]]
    ) -> Iterable[Tuple[str, torch.Tensor]]:
        # Under multi-rank FSDP the state dict holds DTensors; materialize the
        # full tensor before handing it to vLLM. Single-rank tensors pass through.
        for name, tensor in actor_weights.items():
            yield name, tensor.full_tensor() if self.world_size != 1 else tensor

    def __enter__(self):
        # NOTE: Basically, we only need `torch.cuda.empty_cache()` before vllm wake_up and
        # after vllm sleep, since vllm has its own caching memory allocator CuMemAllocator.
        # Out of vllm scope, we should avoid empty cache to let pytorch using caching memory
        # to speed up memory allocations.
        #
        # pytorch: https://pytorch.org/docs/stable/notes/cuda.html#memory-management
        # vllm: https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/device_allocator/cumem.py#L103
        torch.cuda.empty_cache()
        print_gpu_memory_usage("Before state_dict() in sharding manager")
        actor_weights = get_model_state_dict(self.module)
        actor_weights = self._rename_weight_keys(actor_weights, self.module._fsdp_wrapped_module)
        print_gpu_memory_usage("After state_dict() in sharding manager")

        # Newer vLLM supports staged wake-up ("weights" first, "kv_cache" later)
        # so weights can be loaded before the KV cache is re-allocated.
        if "tags" in inspect.signature(self.inference_engine.wake_up).parameters:
            self.inference_engine.wake_up(tags=["weights"])
        else:
            self.inference_engine.wake_up()

        model = self.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
        model.load_weights(self._make_weight_iterator(actor_weights))
        print_gpu_memory_usage("After sync model weights in sharding manager")

        del actor_weights
        torch.cuda.empty_cache()

        if "tags" in inspect.signature(self.inference_engine.wake_up).parameters:
            self.inference_engine.wake_up(tags=["kv_cache"])

        print_gpu_memory_usage("After del state_dict and empty_cache in sharding manager")
        # important: need to manually set the random states of each tp to be identical.
        if self.device_mesh is not None:
            self.torch_random_states = torch.cuda.get_rng_state()
            torch.cuda.set_rng_state(self.gen_random_states)

    def __exit__(self, exc_type, exc_value, traceback):
        print_gpu_memory_usage("Before vllm offload in sharding manager")
        # Measure free memory around sleep() so freed_bytes reflects what the
        # engine actually released (see link in __init__).
        free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
        self.inference_engine.sleep(level=1)
        free_bytes_after_sleep = torch.cuda.mem_get_info()[0]
        self.freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
        print_gpu_memory_usage("After vllm offload in sharding manager")

        self.module.train()
        torch.cuda.empty_cache()  # add empty cache after each compute

        # restore random states
        if self.device_mesh is not None:
            self.gen_random_states = torch.cuda.get_rng_state()
            torch.cuda.set_rng_state(self.torch_random_states)

    def preprocess_data(self, data: DataProto) -> DataProto:
        """All gather across tp group to make each rank has identical input."""
        all_gather_data_proto(data, size=self.tp_size, group=self.tp_group)
        return data

    def postprocess_data(self, data: DataProto) -> DataProto:
        """Get chunk data of this tp rank since we do all gather in preprocess."""
        if self.tp_size > 1:
            data = data.chunk(chunks=self.tp_size)[self.tp_rank]

        return data
|
LAVIS-main/lavis/models/alpro_models/alpro_retrieval.py
ADDED
|
@@ -0,0 +1,422 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Copyright (c) 2022, salesforce.com, inc.
|
| 3 |
+
All rights reserved.
|
| 4 |
+
SPDX-License-Identifier: BSD-3-Clause
|
| 5 |
+
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import datetime
|
| 9 |
+
import logging
|
| 10 |
+
import time
|
| 11 |
+
|
| 12 |
+
import lavis.common.dist_utils as dist_utils
|
| 13 |
+
import numpy as np
|
| 14 |
+
import torch
|
| 15 |
+
import torch.distributed as dist
|
| 16 |
+
import torch.nn.functional as F
|
| 17 |
+
from lavis.common.config import node_to_dict
|
| 18 |
+
from lavis.common.dist_utils import get_rank
|
| 19 |
+
from lavis.common.logger import MetricLogger
|
| 20 |
+
from lavis.common.registry import registry
|
| 21 |
+
from lavis.models.alpro_models import AlproBase
|
| 22 |
+
from lavis.models.alpro_models.alpro_outputs import AlproIntermediateOutput, AlproOutput
|
| 23 |
+
from lavis.models.base_model import all_gather_with_grad
|
| 24 |
+
from lavis.models.med import XBertEncoder
|
| 25 |
+
from lavis.models.timesformer.vit import TimeSformer
|
| 26 |
+
from torch import nn
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@registry.register_model("alpro_retrieval")
|
| 30 |
+
class AlproRetrieval(AlproBase):
|
| 31 |
+
PRETRAINED_MODEL_CONFIG_DICT = {
|
| 32 |
+
"msrvtt": "configs/models/alpro_retrieval_msrvtt.yaml",
|
| 33 |
+
"didemo": "configs/models/alpro_retrieval_didemo.yaml",
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
def __init__(
    self,
    visual_encoder,
    text_encoder,
    vision_width=768,
    text_width=768,
    embed_dim=256,
    max_txt_len=35,
    temp=0.07,
):
    """Build the ALPRO retrieval model.

    Args:
        visual_encoder: video backbone; its `forward_features` output is used.
        text_encoder: text/fusion encoder with a `forward_text` method.
        vision_width: hidden size of the visual encoder output (default 768).
        text_width: hidden size of the text encoder output (default 768).
        embed_dim: dimension of the shared contrastive embedding space.
        max_txt_len: max token length used when tokenizing captions.
        temp: initial value of the learnable contrastive temperature.
    """
    super().__init__()

    # Learnable softmax temperature; clamped to [0.001, 0.5] in forward().
    self.temp = nn.Parameter(torch.ones([]) * temp)

    self.tokenizer = self.init_tokenizer()

    self.visual_encoder = visual_encoder
    self.text_encoder = text_encoder

    # NOTE: the original code contained no-op self-assignments
    # (`vision_width = vision_width`) here; they were dead code and removed.
    self.vision_proj = nn.Linear(vision_width, embed_dim)
    self.text_proj = nn.Linear(text_width, embed_dim)

    # Binary video-text matching head applied to the fused [CLS] embedding.
    self.itm_head = nn.Linear(text_width, 2)

    self.max_txt_len = max_txt_len
|
| 64 |
+
|
| 65 |
+
def forward(self, samples):
    """Compute the ALPRO training loss (contrastive + video-text matching).

    Args:
        samples: dict with "video" (shape unpacked as (b, t, c, h, w)) and
            "text_input" (list of caption strings).

    Returns:
        AlproOutput carrying the total loss, the VTC and VTM components, and
        intermediate embeddings/logits for logging.
    """
    # Keep the learnable temperature in a sane range without tracking grads.
    with torch.no_grad():
        self.temp.clamp_(0.001, 0.5)

    visual_inputs = samples["video"]
    caption = samples["text_input"]

    b, t, c, h, w = visual_inputs.shape

    # forward text
    text = self.tokenizer(
        caption,
        padding="max_length",
        truncation=True,
        max_length=self.max_txt_len,
        return_tensors="pt",
    ).to(self.device)

    text_output = self.text_encoder.forward_text(
        text,
        token_type_ids=torch.zeros(
            text.input_ids.shape, dtype=torch.long, device=self.device
        ),
    )
    text_embeds = text_output.last_hidden_state
    # Project the [CLS] token into the shared embedding space and L2-normalize.
    text_feat = F.normalize(self.text_proj(text_embeds[:, 0, :]), dim=-1)

    # forward visual
    # timeSformer asks for (b, c, t, h, w) as input.
    video_embeds = self.visual_encoder.forward_features(visual_inputs)
    video_feat = F.normalize(self.vision_proj(video_embeds[:, 0, :]), dim=-1)
    video_atts = torch.ones(video_embeds.size()[:-1], dtype=torch.long).to(
        self.device
    )

    # ========== (in-batch) ITC loss ==========
    # Gather features from all ranks while keeping gradients for the local ones.
    gathered_video_feats = all_gather_with_grad(video_feat)
    gathered_text_feats = all_gather_with_grad(text_feat)

    sim_v2t = video_feat @ gathered_text_feats.t() / self.temp
    sim_t2v = text_feat @ gathered_video_feats.t() / self.temp

    sim_targets = torch.zeros_like(sim_v2t)

    # The positives for this rank occupy a contiguous diagonal block offset
    # by rank * batch_size in the gathered similarity matrix.
    local_rank = get_rank()
    b_start, b_end = b * local_rank, b * (local_rank + 1)
    sim_targets[:, b_start:b_end] = torch.eye(b)

    # Cross-entropy against the (one-hot) targets in both directions.
    loss_v2t = -torch.sum(F.log_softmax(sim_v2t, dim=1) * sim_targets, dim=1).mean()
    loss_t2v = -torch.sum(F.log_softmax(sim_t2v, dim=1) * sim_targets, dim=1).mean()

    vtc_loss = (loss_v2t + loss_t2v) / 2

    (
        vtm_loss,
        vtm_logits,
        vtm_labels,
        encoder_output,
        encoder_output_neg,
    ) = self.compute_vtm(
        text_embeds=text_embeds,
        text_atts=text.attention_mask,
        image_embeds=video_embeds,
        image_atts=video_atts,
        sim_i2t=sim_v2t.clone(),  # for hard mining
        sim_t2i=sim_t2v.clone(),  # for hard mining
    )

    loss = vtc_loss + vtm_loss

    # return {"loss": loss}
    return AlproOutput(
        loss=loss,
        loss_vtc=vtc_loss,
        loss_vtm=vtm_loss,
        intermediate_output=AlproIntermediateOutput(
            video_embeds=video_embeds,
            text_embeds=text_embeds,
            encoder_output=encoder_output,
            encoder_output_neg=encoder_output_neg,
            vtm_logits=vtm_logits,
            vtm_labels=vtm_labels,
        ),
    )
|
| 149 |
+
|
| 150 |
+
def compute_vtm(
    self, text_embeds, text_atts, image_embeds, image_atts, sim_i2t, sim_t2i
):
    """Compute the video-text matching (VTM) loss with hard-negative mining.

    Positives are the aligned (text, video) pairs; one hard negative video
    per text and one hard negative text per video are sampled from the
    similarity distributions (higher similarity -> more likely negative).

    Args:
        text_embeds / text_atts: text token embeddings and attention mask.
        image_embeds / image_atts: video token embeddings and attention mask.
        sim_i2t / sim_t2i: gathered similarity matrices used for mining.

    Returns:
        Tuple of (vtm_loss, vtm_logits, vtm_labels, encoder_outputs_pos,
        encoder_outputs_neg).
    """
    device = self.device

    # ====== positive pairs =======
    attention_mask = torch.cat([text_atts, image_atts], dim=1)
    embedding_output_pos = torch.cat([text_embeds, image_embeds], dim=1)

    encoder_outputs_pos = self.text_encoder(
        encoder_embeds=embedding_output_pos,
        attention_mask=attention_mask,
        return_dict=True,
        mode="fusion",
    )

    # ====== negative pairs =======
    bs = text_embeds.shape[0]

    # This rank's block of columns inside the gathered similarity matrices.
    local_rank = get_rank()
    b_start, b_end = bs * local_rank, bs * (local_rank + 1)

    with torch.no_grad():
        weights_v2t = sim_i2t[:, b_start:b_end]
        weights_t2v = sim_t2i[:, b_start:b_end]

        # never select self as negative
        # NOTE: use np.inf, not the np.Inf alias removed in NumPy 2.0.
        weights_v2t.fill_diagonal_(-np.inf)
        weights_t2v.fill_diagonal_(-np.inf)

        weights_v2t = F.softmax(weights_v2t, dim=1)
        weights_t2v = F.softmax(weights_t2v, dim=1)

    # select a negative image for each text
    # FIXME to optimize using indexing operations
    image_embeds_neg = []
    for b in range(bs):
        neg_idx = torch.multinomial(weights_t2v[b], 1).item()
        image_embeds_neg.append(image_embeds[neg_idx])
    image_embeds_neg = torch.stack(image_embeds_neg, dim=0)

    # select a negative text for each image
    text_embeds_neg = []
    text_atts_neg = []
    for b in range(bs):
        neg_idx = torch.multinomial(weights_v2t[b], 1).item()
        text_embeds_neg.append(text_embeds[neg_idx])
        text_atts_neg.append(text_atts[neg_idx])

    text_embeds_neg = torch.stack(text_embeds_neg, dim=0)
    text_atts_neg = torch.stack(text_atts_neg, dim=0)

    # Batch layout: [pos text + neg video] followed by [neg text + pos video].
    text_embeds_all = torch.cat([text_embeds, text_embeds_neg], dim=0)
    text_atts_all = torch.cat([text_atts, text_atts_neg], dim=0)

    video_embeds_all = torch.cat([image_embeds_neg, image_embeds], dim=0)
    video_atts_all = torch.cat([image_atts, image_atts], dim=0)

    attention_mask_all = torch.cat([text_atts_all, video_atts_all], dim=1)
    embedding_output_all = torch.cat([text_embeds_all, video_embeds_all], dim=1)

    # forward negative pairs via cross encoder
    encoder_outputs_neg = self.text_encoder(
        encoder_embeds=embedding_output_all,
        attention_mask=attention_mask_all,
        return_dict=True,
        mode="fusion",
    )

    vl_embeddings = torch.cat(
        [
            encoder_outputs_pos.last_hidden_state[:, 0, :],
            encoder_outputs_neg.last_hidden_state[:, 0, :],
        ],
        dim=0,
    )
    vtm_logits = self.itm_head(vl_embeddings)

    # First bs rows are positives (label 1), remaining 2*bs are negatives (0).
    vtm_labels = torch.cat(
        [torch.ones(bs, dtype=torch.long), torch.zeros(2 * bs, dtype=torch.long)],
        dim=0,
    ).to(device)
    vtm_loss = F.cross_entropy(vtm_logits, vtm_labels)

    return (
        vtm_loss,
        vtm_logits,
        vtm_labels,
        encoder_outputs_pos,
        encoder_outputs_neg,
    )
|
| 241 |
+
|
| 242 |
+
def compute_sim_matrix(self, data_loader, task_cfg):
    """Compute video-text similarity matrices for retrieval evaluation.

    Two-stage ranking: (1) encode every text and every video independently
    and score all pairs with a cheap dot product; (2) re-score only the
    ``k_test`` best candidates per query with the expensive cross-modal
    fusion encoder + ITM head. Work is sharded by row across distributed
    ranks and merged with an all_reduce at the end.

    Args:
        data_loader: loader over the eval dataset; its dataset must expose
            ``.text`` (list of captions) and ``.image`` (list of videos).
        task_cfg: config object providing ``k_test``.

    Returns:
        Tuple of numpy arrays ``(score_matrix_v2t, score_matrix_t2v)`` with
        shapes (num_videos, num_texts) and (num_texts, num_videos).
    """
    k_test = task_cfg.get("k_test")

    metric_logger = MetricLogger(delimiter="  ")
    header = "Evaluation:"

    logging.info("Computing features for evaluation...")
    start_time = time.time()

    # ---- stage 1a: encode all texts in batches of text_bs ----
    texts = data_loader.dataset.text
    num_text = len(texts)
    text_bs = 256
    text_ids = []
    text_embeds = []
    text_feats = []
    text_atts = []
    for i in range(0, num_text, text_bs):
        text = texts[i : min(num_text, i + text_bs)]
        text_input = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_txt_len,
            return_tensors="pt",
        ).to(self.device)
        text_output = self.text_encoder.forward_text(
            text_input,
            token_type_ids=torch.zeros(
                text_input.input_ids.shape, dtype=torch.long, device=self.device
            ),
        )
        # keep full token-level features on CPU to bound GPU memory
        text_feats.append(text_output.last_hidden_state.cpu())
        # CLS token projected + normalized -> vector used for coarse ranking
        text_embed = F.normalize(
            self.text_proj(text_output.last_hidden_state[:, 0, :])
        )
        text_embeds.append(text_embed)
        text_ids.append(text_input.input_ids)
        text_atts.append(text_input.attention_mask)

    text_embeds = torch.cat(text_embeds, dim=0)
    # NOTE(review): text_ids is collected but not used below in this method
    text_ids = torch.cat(text_ids, dim=0)
    text_atts = torch.cat(text_atts, dim=0)
    text_feats = torch.cat(text_feats, dim=0)

    # ---- stage 1b: encode all videos ----
    video_feats = []
    video_embeds = []
    for samples in data_loader:
        video = samples["video"]

        video = video.to(self.device)
        video_feat = self.visual_encoder.forward_features(video)
        video_embed = self.vision_proj(video_feat[:, 0, :])
        video_embed = F.normalize(video_embed, dim=-1)

        # token-level features kept on CPU; pooled embeddings stay on GPU
        video_feats.append(video_feat.cpu())
        video_embeds.append(video_embed)

    video_feats = torch.cat(video_feats, dim=0)
    video_embeds = torch.cat(video_embeds, dim=0)

    # coarse similarity between every (video, text) pair
    sims_matrix = video_embeds @ text_embeds.t()
    # -100.0 marks pairs that the fusion encoder never re-scores
    score_matrix_v2t = torch.full(
        (len(data_loader.dataset.image), len(texts)), -100.0
    ).to(self.device)

    # shard rows across workers; partial matrices are summed by all_reduce
    num_tasks = dist_utils.get_world_size()
    rank = dist_utils.get_rank()
    step = sims_matrix.size(0) // num_tasks + 1
    start = rank * step
    end = min(sims_matrix.size(0), start + step)

    # ---- stage 2: video-to-text re-ranking ----
    for i, sims in enumerate(
        metric_logger.log_every(sims_matrix[start:end], 50, header)
    ):
        topk_sim, topk_idx = sims.topk(k=k_test, dim=0)

        # repeat this video's features once per text candidate
        video_feats_repeat = (
            video_feats[start + i].repeat(k_test, 1, 1).to(self.device)
        )
        video_atts_repeat = torch.ones(
            video_feats_repeat.size()[:-1], dtype=torch.long
        ).to(self.device)

        attention_mask = torch.cat([text_atts[topk_idx], video_atts_repeat], dim=1)
        embedding_output = torch.cat(
            [text_feats[topk_idx].to(self.device), video_feats_repeat], dim=1
        )

        output = self.text_encoder(
            encoder_embeds=embedding_output,
            attention_mask=attention_mask,
            return_dict=True,
            mode="fusion",
        )

        # itm_head column 1 = "matched" logit; add coarse sim as a prior
        score = self.itm_head(output.last_hidden_state[:, 0, :])[:, 1]
        score_matrix_v2t[start + i, topk_idx] = score + topk_sim

    # ---- stage 2: text-to-video re-ranking ----
    sims_matrix = sims_matrix.t()
    score_matrix_t2v = torch.full(
        (len(texts), len(data_loader.dataset.image)), -100.0
    ).to(self.device)

    # re-shard: the transposed matrix has a different number of rows
    step = sims_matrix.size(0) // num_tasks + 1
    start = rank * step
    end = min(sims_matrix.size(0), start + step)

    for i, sims in enumerate(
        metric_logger.log_every(sims_matrix[start:end], 50, header)
    ):

        topk_sim, topk_idx = sims.topk(k=k_test, dim=0)

        text_feats_repeat = (
            text_feats[start + i].repeat(k_test, 1, 1).to(self.device)
        )
        text_atts_repeat = text_atts[start + i].repeat(k_test, 1).to(self.device)

        video_atts = torch.ones(
            video_feats[topk_idx].size()[:-1], dtype=torch.long
        ).to(self.device)

        embedding_output = torch.cat(
            [text_feats_repeat, video_feats[topk_idx].to(self.device)], dim=1
        )
        attention_mask = torch.cat([text_atts_repeat, video_atts], dim=1)

        output = self.text_encoder(
            encoder_embeds=embedding_output,
            attention_mask=attention_mask,
            return_dict=True,
            mode="fusion",
        )

        score = self.itm_head(output.last_hidden_state[:, 0, :])[:, 1]
        score_matrix_t2v[start + i, topk_idx] = score + topk_sim

    # merge the per-rank partial score matrices (untouched rows hold -100)
    if dist_utils.is_dist_avail_and_initialized():
        dist.barrier()
        torch.distributed.all_reduce(
            score_matrix_v2t, op=torch.distributed.ReduceOp.SUM
        )
        torch.distributed.all_reduce(
            score_matrix_t2v, op=torch.distributed.ReduceOp.SUM
        )

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    logging.info("Evaluation time {}".format(total_time_str))

    return score_matrix_v2t.cpu().numpy(), score_matrix_t2v.cpu().numpy()
|
| 395 |
+
|
| 396 |
+
@classmethod
def from_config(cls, cfg):
    """Instantiate the model from a config node and load its checkpoint.

    Builds the TimeSformer vision backbone and the BERT-based text encoder
    from ``cfg``, then loads pretrained weights, passing the patch/frame
    geometry needed to interpolate positional embeddings.
    """
    vit_cfg = node_to_dict(cfg.timesformer)

    model = cls(
        visual_encoder=TimeSformer(**vit_cfg),
        text_encoder=XBertEncoder.from_config(cfg),
        max_txt_len=cfg.get("max_txt_len", 35),
    )

    # spatial patches per frame = (image_size / patch_size)^2
    patches_per_frame = (vit_cfg["image_size"] // vit_cfg["patch_size"]) ** 2
    model.load_checkpoint_from_config(
        cfg, num_frames=vit_cfg["n_frms"], num_patches=patches_per_frame
    )

    return model
|
LAVIS-main/lavis/models/beats/BEATs.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --------------------------------------------------------
|
| 2 |
+
# BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
|
| 3 |
+
# Github source: https://github.com/microsoft/unilm/tree/master/beats
|
| 4 |
+
# Copyright (c) 2022 Microsoft
|
| 5 |
+
# Licensed under The MIT License [see LICENSE for details]
|
| 6 |
+
# Based on fairseq code bases
|
| 7 |
+
# https://github.com/pytorch/fairseq
|
| 8 |
+
# --------------------------------------------------------
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
import torch
|
| 12 |
+
import torch.nn as nn
|
| 13 |
+
from torch.nn import LayerNorm
|
| 14 |
+
import torchaudio.compliance.kaldi as ta_kaldi
|
| 15 |
+
|
| 16 |
+
from lavis.models.beats.backbone import (
|
| 17 |
+
TransformerEncoder,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
import logging
|
| 21 |
+
from typing import Optional
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class BEATsConfig:
    """Hyper-parameter container for the BEATs audio encoder.

    All fields start from the base (12-layer) architecture defaults and can
    be overridden by passing a dict to the constructor or to :meth:`update`
    (typically ``checkpoint['cfg']``).
    """

    # Defaults applied to every instance before user overrides.
    _DEFAULTS = dict(
        # patch embedding
        input_patch_size=-1,               # patch size of the patch embedding
        embed_dim=512,                     # patch embedding dimension
        conv_bias=False,                   # include bias in the conv encoder
        # transformer encoder
        encoder_layers=12,                 # number of encoder layers
        encoder_embed_dim=768,             # encoder embedding dimension
        encoder_ffn_embed_dim=3072,        # FFN inner dimension
        encoder_attention_heads=12,        # number of attention heads
        activation_fn="gelu",              # activation function
        layer_wise_gradient_decay_ratio=1.0,  # layer-wise gradient decay ratio
        layer_norm_first=False,            # apply layernorm first (pre-norm)
        deep_norm=False,                   # apply deep_norm instead
        # dropouts
        dropout=0.1,                       # transformer dropout
        attention_dropout=0.1,             # dropout on attention weights
        activation_dropout=0.0,            # dropout after FFN activation
        encoder_layerdrop=0.0,             # probability of dropping a transformer layer
        dropout_input=0.0,                 # dropout on the input features
        # convolutional positional embeddings
        conv_pos=128,                      # number of filters
        conv_pos_groups=16,                # number of groups
        # relative position embedding
        relative_position_embedding=False,  # enable relative position embedding
        num_buckets=320,                   # number of relative-position buckets
        max_distance=1280,                 # maximum relative distance
        gru_rel_pos=False,                 # gated relative position embedding
        # label predictor (present only in fine-tuned checkpoints)
        finetuned_model=False,             # whether this is a fine-tuned model
        predictor_dropout=0.1,             # dropout before the predictor
        predictor_class=527,               # number of target classes
    )

    def __init__(self, cfg=None):
        for field, default in self._DEFAULTS.items():
            setattr(self, field, default)
        if cfg is not None:
            self.update(cfg)

    def update(self, cfg: dict):
        """Overwrite config fields with the entries of ``cfg``."""
        self.__dict__.update(cfg)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class BEATs(nn.Module):
    """BEATs audio encoder (Audio Pre-Training with Acoustic Tokenizers).

    Converts batches of log-mel fbank features into 2D patch embeddings,
    runs them through a Transformer encoder and, for fine-tuned
    checkpoints (``cfg.finetuned_model``), through a linear classifier
    whose sigmoid outputs are per-class probabilities.
    """

    def __init__(
        self,
        cfg: BEATsConfig,
    ) -> None:
        super().__init__()
        logger.info(f"BEATs Config: {cfg.__dict__}")

        self.cfg = cfg

        # patch-embedding width; projected up when the encoder is wider
        self.embed = cfg.embed_dim
        self.post_extract_proj = (
            nn.Linear(self.embed, cfg.encoder_embed_dim)
            if self.embed != cfg.encoder_embed_dim
            else None
        )

        # non-overlapping square patches over the (time, mel-bin) plane;
        # kernel == stride, so each input cell belongs to exactly one patch
        self.input_patch_size = cfg.input_patch_size
        self.patch_embedding = nn.Conv2d(1, self.embed, kernel_size=self.input_patch_size, stride=self.input_patch_size,
                                         bias=cfg.conv_bias)

        self.dropout_input = nn.Dropout(cfg.dropout_input)

        # deep_norm and layer_norm_first are mutually exclusive
        assert not cfg.deep_norm or not cfg.layer_norm_first
        self.encoder = TransformerEncoder(cfg)
        self.layer_norm = LayerNorm(self.embed)

        # classifier head exists only for fine-tuned checkpoints
        if cfg.finetuned_model:
            self.predictor_dropout = nn.Dropout(cfg.predictor_dropout)
            self.predictor = nn.Linear(cfg.encoder_embed_dim, cfg.predictor_class)
        else:
            self.predictor = None

    def forward_padding_mask(
        self,
        features: torch.Tensor,
        padding_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Downsample a per-frame padding mask to the feature time axis.

        A feature step counts as padding only when *all* input frames that
        map onto it are padding.
        """
        # drop trailing frames that do not fill a whole feature step
        extra = padding_mask.size(1) % features.size(1)
        if extra > 0:
            padding_mask = padding_mask[:, :-extra]
        padding_mask = padding_mask.view(
            padding_mask.size(0), features.size(1), -1
        )
        padding_mask = padding_mask.all(-1)
        return padding_mask

    def preprocess(
        self,
        source: torch.Tensor,
        fbank_mean: float = 15.41663,
        fbank_std: float = 6.55582,
    ) -> torch.Tensor:
        """Turn raw 16 kHz waveforms into normalized 128-bin log-mel fbanks."""
        fbanks = []
        for waveform in source:
            # kaldi fbank expects 16-bit-range samples, hence the 2**15 scale
            waveform = waveform.unsqueeze(0) * 2 ** 15
            fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25, frame_shift=10)
            fbanks.append(fbank)
        fbank = torch.stack(fbanks, dim=0)
        fbank = (fbank - fbank_mean) / (2 * fbank_std)
        return fbank

    def extract_features(
        self,
        fbank: torch.Tensor,
        padding_mask: Optional[torch.Tensor] = None,
        fbank_mean: float = 15.41663,
        fbank_std: float = 6.55582,
    ):
        """Encode pre-computed fbank features.

        Returns ``(probs, padding_mask)`` for fine-tuned checkpoints
        (sigmoid class probabilities pooled over time) or
        ``(hidden_states, padding_mask)`` for pre-trained ones.
        ``fbank_mean``/``fbank_std`` are unused here — normalization is
        done by the audio processor (see NOTE below).
        """
        ## NOTE: preprocessing is done separately in lavis.processsors.audio_processors.BeatsAudioProcessor
        # fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std)

        if padding_mask is not None:
            padding_mask = self.forward_padding_mask(fbank, padding_mask)

        # (B, T, M) -> (B, 1, T, M): Conv2d patching sees one input channel
        fbank = fbank.unsqueeze(1)
        features = self.patch_embedding(fbank)
        # flatten the 2D patch grid into a single sequence axis
        features = features.reshape(features.shape[0], features.shape[1], -1)
        features = features.transpose(1, 2)
        features = self.layer_norm(features)

        if padding_mask is not None:
            # patching shortened the time axis; re-align the mask to it
            padding_mask = self.forward_padding_mask(features, padding_mask)

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        x = self.dropout_input(features)

        x, layer_results = self.encoder(
            x,
            padding_mask=padding_mask,
        )

        if self.predictor is not None:
            x = self.predictor_dropout(x)
            logits = self.predictor(x)

            if padding_mask is not None and padding_mask.any():
                # mean-pool logits over non-padded time steps only
                logits[padding_mask] = 0
                logits = logits.sum(dim=1)
                logits = logits / (~padding_mask).sum(dim=1).unsqueeze(-1).expand_as(logits)
            else:
                logits = logits.mean(dim=1)

            # multi-label setup: independent sigmoid per class
            lprobs = torch.sigmoid(logits)

            return lprobs, padding_mask
        else:
            return x, padding_mask
|
LAVIS-main/lavis/models/beats/LICENSE_BEATs.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The MIT License (MIT)
|
| 2 |
+
|
| 3 |
+
Copyright (c) Microsoft Corporation
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
LAVIS-main/lavis/models/beats/README.md
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# BEATs
|
| 3 |
+
|
| 4 |
+
[**BEATs**](https://arxiv.org/abs/2212.09058): **Audio Pre-Training with Acoustic Tokenizers**
|
| 5 |
+
|
| 6 |
+
Official PyTorch implementation and pretrained models of BEATs
|
| 7 |
+
|
| 8 |
+
## Pre-Trained and Fine-Tuned Tokenizers and Models
|
| 9 |
+
Iterations | Tokenizer | Pre-Trained Model | AudioSet Fine-Tuned Model 1 | AudioSet Fine-Tuned Model 2
|
| 10 |
+
|---|---|---|---|---
|
| 11 |
+
Iter1 | Random Projection | [BEATs_iter1](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter1 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter1 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
|
| 12 |
+
Iter2 | [Tokenizer_iter2](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter2](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter2 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter2 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
|
| 13 |
+
Iter3 | [Tokenizer_iter3](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
|
| 14 |
+
Iter3+ | [Tokenizer_iter3+ (AS20K)](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3_plus_AS20K.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3+ (AS20K)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS20K) (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS20K) (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
|
| 15 |
+
Iter3+ | [Tokenizer_iter3+ (AS2M)](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3_plus_AS2M.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3+ (AS2M)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS2M) (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS2M) (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
### Load Tokenizers
|
| 19 |
+
|
| 20 |
+
```python
|
| 21 |
+
import torch
|
| 22 |
+
from Tokenizers import TokenizersConfig, Tokenizers
|
| 23 |
+
|
| 24 |
+
# load the pre-trained checkpoints
|
| 25 |
+
checkpoint = torch.load('/path/to/tokenizer.pt')
|
| 26 |
+
|
| 27 |
+
cfg = TokenizersConfig(checkpoint['cfg'])
|
| 28 |
+
BEATs_tokenizer = Tokenizers(cfg)
|
| 29 |
+
BEATs_tokenizer.load_state_dict(checkpoint['model'])
|
| 30 |
+
BEATs_tokenizer.eval()
|
| 31 |
+
|
| 32 |
+
# tokenize the audio and generate the labels
|
| 33 |
+
audio_input_16khz = torch.randn(1, 10000)
|
| 34 |
+
padding_mask = torch.zeros(1, 10000).bool()
|
| 35 |
+
|
| 36 |
+
labels = BEATs_tokenizer.extract_labels(audio_input_16khz, padding_mask=padding_mask)
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
### Load Pre-Trained Models
|
| 41 |
+
|
| 42 |
+
```python
|
| 43 |
+
import torch
|
| 44 |
+
from BEATs import BEATs, BEATsConfig
|
| 45 |
+
|
| 46 |
+
# load the pre-trained checkpoints
|
| 47 |
+
checkpoint = torch.load('/path/to/model.pt')
|
| 48 |
+
|
| 49 |
+
cfg = BEATsConfig(checkpoint['cfg'])
|
| 50 |
+
BEATs_model = BEATs(cfg)
|
| 51 |
+
BEATs_model.load_state_dict(checkpoint['model'])
|
| 52 |
+
BEATs_model.eval()
|
| 53 |
+
|
| 54 |
+
# extract the audio representation
|
| 55 |
+
audio_input_16khz = torch.randn(1, 10000)
|
| 56 |
+
padding_mask = torch.zeros(1, 10000).bool()
|
| 57 |
+
|
| 58 |
+
representation = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)[0]
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
### Load Fine-tuned Models
|
| 63 |
+
|
| 64 |
+
```python
|
| 65 |
+
import torch
|
| 66 |
+
from BEATs import BEATs, BEATsConfig
|
| 67 |
+
|
| 68 |
+
# load the fine-tuned checkpoints
|
| 69 |
+
checkpoint = torch.load('/path/to/model.pt')
|
| 70 |
+
|
| 71 |
+
cfg = BEATsConfig(checkpoint['cfg'])
|
| 72 |
+
BEATs_model = BEATs(cfg)
|
| 73 |
+
BEATs_model.load_state_dict(checkpoint['model'])
|
| 74 |
+
BEATs_model.eval()
|
| 75 |
+
|
| 76 |
+
# predict the classification probability of each class
|
| 77 |
+
audio_input_16khz = torch.randn(3, 10000)
|
| 78 |
+
padding_mask = torch.zeros(3, 10000).bool()
|
| 79 |
+
|
| 80 |
+
probs = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)[0]
|
| 81 |
+
|
| 82 |
+
for i, (top5_label_prob, top5_label_idx) in enumerate(zip(*probs.topk(k=5))):
|
| 83 |
+
top5_label = [checkpoint['label_dict'][label_idx.item()] for label_idx in top5_label_idx]
|
| 84 |
+
print(f'Top 5 predicted labels of the {i}th audio are {top5_label} with probability of {top5_label_prob}')
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
## Evaluation Results
|
| 88 |
+
|
| 89 |
+
### Comparing with the SOTA Single Models
|
| 90 |
+

|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
### Comparing with the SOTA Ensemble Models
|
| 94 |
+

|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
### Comparing Different BEATs Tokenizers
|
| 98 |
+

|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
### Comparing Different Pre-Training Targets
|
| 102 |
+

|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
## License
|
| 106 |
+
This project is licensed under the license found in the LICENSE file in the root directory of this source tree.
|
| 107 |
+
Portions of the source code are based on the [FAIRSEQ](https://github.com/pytorch/fairseq) and [VQGAN](https://github.com/CompVis/taming-transformers) project.
|
| 108 |
+
|
| 109 |
+
[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
### Reference
|
| 113 |
+
If you find our work is useful in your research, please cite the following paper:
|
| 114 |
+
``` latex
|
| 115 |
+
@article{Chen2022beats,
|
| 116 |
+
title = {BEATs: Audio Pre-Training with Acoustic Tokenizers},
|
| 117 |
+
author = {Sanyuan Chen and Yu Wu and Chengyi Wang and Shujie Liu and Daniel Tompkins and Zhuo Chen and Furu Wei},
|
| 118 |
+
eprint={2212.09058},
|
| 119 |
+
archivePrefix={arXiv},
|
| 120 |
+
year={2022}
|
| 121 |
+
}
|
| 122 |
+
```
|
| 123 |
+
### Contact Information
|
| 124 |
+
|
| 125 |
+
For help or issues using BEATs models, please submit a GitHub issue.
|
| 126 |
+
|
| 127 |
+
For other communications related to BEATs, please contact Yu Wu (`yuwu1@microsoft.com`).
|
LAVIS-main/lavis/models/beats/Tokenizers.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --------------------------------------------------------
|
| 2 |
+
# BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
|
| 3 |
+
# Github source: https://github.com/microsoft/unilm/tree/master/beats
|
| 4 |
+
# Copyright (c) 2022 Microsoft
|
| 5 |
+
# Licensed under The MIT License [see LICENSE for details]
|
| 6 |
+
# Based on fairseq code bases
|
| 7 |
+
# https://github.com/pytorch/fairseq
|
| 8 |
+
# --------------------------------------------------------
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
import torch
|
| 12 |
+
import torch.nn as nn
|
| 13 |
+
from torch.nn import LayerNorm
|
| 14 |
+
import torchaudio.compliance.kaldi as ta_kaldi
|
| 15 |
+
|
| 16 |
+
from lavis.models.beats.backbone import (
|
| 17 |
+
TransformerEncoder,
|
| 18 |
+
)
|
| 19 |
+
from lavis.models.beats.quantizer import (
|
| 20 |
+
NormEMAVectorQuantizer,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
import logging
|
| 24 |
+
from typing import Optional
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class TokenizersConfig:
|
| 30 |
+
def __init__(self, cfg=None):
|
| 31 |
+
self.input_patch_size: int = -1 # path size of patch embedding
|
| 32 |
+
self.embed_dim: int = 512 # patch embedding dimension
|
| 33 |
+
self.conv_bias: bool = False # include bias in conv encoder
|
| 34 |
+
|
| 35 |
+
self.encoder_layers: int = 12 # num encoder layers in the transformer
|
| 36 |
+
self.encoder_embed_dim: int = 768 # encoder embedding dimension
|
| 37 |
+
self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
|
| 38 |
+
self.encoder_attention_heads: int = 12 # num encoder attention heads
|
| 39 |
+
self.activation_fn: str = "gelu" # activation function to use
|
| 40 |
+
|
| 41 |
+
self.layer_norm_first: bool = False # apply layernorm first in the transformer
|
| 42 |
+
self.deep_norm: bool = False # apply deep_norm first in the transformer
|
| 43 |
+
|
| 44 |
+
# dropouts
|
| 45 |
+
self.dropout: float = 0.1 # dropout probability for the transformer
|
| 46 |
+
self.attention_dropout: float = 0.1 # dropout probability for attention weights
|
| 47 |
+
self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
|
| 48 |
+
self.encoder_layerdrop: float = 0.0 # probability of dropping a tarnsformer layer
|
| 49 |
+
self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
|
| 50 |
+
|
| 51 |
+
# positional embeddings
|
| 52 |
+
self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
|
| 53 |
+
self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
|
| 54 |
+
|
| 55 |
+
# relative position embedding
|
| 56 |
+
self.relative_position_embedding: bool = False # apply relative position embedding
|
| 57 |
+
self.num_buckets: int = 320 # number of buckets for relative position embedding
|
| 58 |
+
self.max_distance: int = 1280 # maximum distance for relative position embedding
|
| 59 |
+
self.gru_rel_pos: bool = False # apply gated relative position embedding
|
| 60 |
+
|
| 61 |
+
# quantizer
|
| 62 |
+
self.quant_n: int = 1024 # codebook number in quantizer
|
| 63 |
+
self.quant_dim: int = 256 # codebook dimension in quantizer
|
| 64 |
+
|
| 65 |
+
if cfg is not None:
|
| 66 |
+
self.update(cfg)
|
| 67 |
+
|
| 68 |
+
def update(self, cfg: dict):
    """Overwrite this config's attributes in place from *cfg*.

    Keys are applied verbatim onto the instance dict — no validation or
    filtering against known fields is performed.
    """
    vars(self).update(cfg)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class Tokenizers(nn.Module):
    """BEATs acoustic tokenizer: waveform -> log-mel fbank -> patch embedding ->
    transformer encoder -> vector quantizer, producing discrete codebook indices.

    NOTE(review): depends on TransformerEncoder, NormEMAVectorQuantizer, logger,
    ta_kaldi and TokenizersConfig defined/imported elsewhere in this module.
    """

    def __init__(
        self,
        cfg: TokenizersConfig,
    ) -> None:
        super().__init__()
        logger.info(f"Tokenizers Config: {cfg.__dict__}")

        self.cfg = cfg

        self.embed = cfg.embed_dim
        # Project patch features up/down to the encoder width only when they differ.
        self.post_extract_proj = (
            nn.Linear(self.embed, cfg.encoder_embed_dim)
            if self.embed != cfg.encoder_embed_dim
            else None
        )

        self.input_patch_size = cfg.input_patch_size
        # Non-overlapping patch embedding over the single-channel fbank "image"
        # (kernel == stride, so each patch is embedded exactly once).
        self.patch_embedding = nn.Conv2d(1, self.embed, kernel_size=self.input_patch_size, stride=self.input_patch_size,
                                         bias=cfg.conv_bias)

        self.dropout_input = nn.Dropout(cfg.dropout_input)

        # deep_norm and layer_norm_first are mutually exclusive configurations.
        assert not cfg.deep_norm or not cfg.layer_norm_first
        self.encoder = TransformerEncoder(cfg)
        self.layer_norm = LayerNorm(self.embed)

        # EMA-updated codebook that discretizes encoder outputs into quant_n codes.
        self.quantize = NormEMAVectorQuantizer(
            n_embed=cfg.quant_n, embedding_dim=cfg.quant_dim, beta=1.0, kmeans_init=True, decay=0.99,
        )
        self.quant_n = cfg.quant_n
        # Small MLP projecting encoder features into the codebook dimension.
        self.quantize_layer = nn.Sequential(
            nn.Linear(cfg.encoder_embed_dim, cfg.encoder_embed_dim),
            nn.Tanh(),
            nn.Linear(cfg.encoder_embed_dim, cfg.quant_dim)  # for quantize
        )

    def forward_padding_mask(
        self,
        features: torch.Tensor,
        padding_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Downsample a per-timestep padding mask to the feature frame rate.

        Trailing timesteps that do not fill a whole feature frame are dropped;
        a frame counts as padding only if *all* timesteps mapped to it were
        padding (.all(-1)).
        """
        extra = padding_mask.size(1) % features.size(1)
        if extra > 0:
            padding_mask = padding_mask[:, :-extra]
        padding_mask = padding_mask.view(
            padding_mask.size(0), features.size(1), -1
        )
        padding_mask = padding_mask.all(-1)
        return padding_mask

    def preprocess(
        self,
        source: torch.Tensor,
        fbank_mean: float = 15.41663,
        fbank_std: float = 6.55582,
    ) -> torch.Tensor:
        """Convert raw waveforms to normalized 128-bin Kaldi-style filterbanks.

        The ``* 2 ** 15`` rescale maps [-1, 1] float audio into int16 range, as
        expected by Kaldi fbank extraction.
        NOTE(review): assumes 16 kHz input (sample_frequency=16000) — confirm
        with callers.
        """
        fbanks = []
        for waveform in source:
            waveform = waveform.unsqueeze(0) * 2 ** 15
            fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25, frame_shift=10)
            fbanks.append(fbank)
        fbank = torch.stack(fbanks, dim=0)
        # Normalize with dataset-level statistics; the 2x std follows the
        # defaults baked into this module.
        fbank = (fbank - fbank_mean) / (2 * fbank_std)
        return fbank

    def extract_labels(
        self,
        source: torch.Tensor,
        padding_mask: Optional[torch.Tensor] = None,
        fbank_mean: float = 15.41663,
        fbank_std: float = 6.55582,
    ):
        """Return discrete codebook indices ("labels") for a batch of waveforms.

        Pipeline: fbank -> patch embed -> layer norm -> (optional projection)
        -> transformer encoder -> quantizer; only the code indices are returned.
        """
        fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std)

        if padding_mask is not None:
            padding_mask = self.forward_padding_mask(fbank, padding_mask)

        # Add the channel dimension expected by Conv2d: (B, 1, T, mel).
        fbank = fbank.unsqueeze(1)
        features = self.patch_embedding(fbank)
        # Flatten the 2D patch grid into a sequence: (B, C, H*W) -> (B, H*W, C).
        features = features.reshape(features.shape[0], features.shape[1], -1)
        features = features.transpose(1, 2)
        features = self.layer_norm(features)

        # Re-derive the mask at the (coarser) patch-sequence resolution.
        if padding_mask is not None:
            padding_mask = self.forward_padding_mask(features, padding_mask)

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        x = self.dropout_input(features)

        x, layer_results = self.encoder(
            x,
            padding_mask=padding_mask,
        )

        quantize_input = self.quantize_layer(x)
        # quantize_feature and embed_loss are discarded: only indices are needed here.
        quantize_feature, embed_loss, embed_ind = self.quantize(quantize_input)

        return embed_ind
|
| 173 |
+
|
LAVIS-main/lavis/models/beats/backbone.py
ADDED
|
@@ -0,0 +1,783 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --------------------------------------------------------
|
| 2 |
+
# BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
|
| 3 |
+
# Github source: https://github.com/microsoft/unilm/tree/master/beats
|
| 4 |
+
# Copyright (c) 2022 Microsoft
|
| 5 |
+
# Licensed under The MIT License [see LICENSE for details]
|
| 6 |
+
# Based on fairseq code bases
|
| 7 |
+
# https://github.com/pytorch/fairseq
|
| 8 |
+
# --------------------------------------------------------
|
| 9 |
+
|
| 10 |
+
import math
|
| 11 |
+
import numpy as np
|
| 12 |
+
from typing import Dict, Optional, Tuple
|
| 13 |
+
import torch
|
| 14 |
+
from torch import Tensor, nn
|
| 15 |
+
import torch.nn.functional as F
|
| 16 |
+
from torch.nn import LayerNorm, Parameter
|
| 17 |
+
from lavis.models.beats.modules import (
|
| 18 |
+
GradMultiply,
|
| 19 |
+
SamePad,
|
| 20 |
+
get_activation_fn,
|
| 21 |
+
GLU_Linear,
|
| 22 |
+
quant_noise,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class TransformerEncoder(nn.Module):
    """BEATs transformer encoder: a convolutional positional embedding followed
    by a stack of ``TransformerSentenceEncoderLayer`` blocks (fairseq-style).
    """

    def __init__(self, args):
        super().__init__()

        self.dropout = args.dropout
        self.embedding_dim = args.encoder_embed_dim

        # Convolutional positional embedding: grouped 1D conv over time whose
        # output is added to the input sequence (wav2vec 2.0 style).
        self.pos_conv = nn.Conv1d(
            self.embedding_dim,
            self.embedding_dim,
            kernel_size=args.conv_pos,
            padding=args.conv_pos // 2,
            groups=args.conv_pos_groups,
        )
        # Variance-preserving init for the positional conv (dropout term kept at
        # 0 here, matching the formula's general form).
        dropout = 0
        std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
        nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
        nn.init.constant_(self.pos_conv.bias, 0)

        self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
        # SamePad trims the extra frame produced by even kernel sizes.
        self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())

        # Relative position embedding settings are optional on the config.
        if hasattr(args, "relative_position_embedding"):
            self.relative_position_embedding = args.relative_position_embedding
            self.num_buckets = args.num_buckets
            self.max_distance = args.max_distance
        else:
            self.relative_position_embedding = False
            self.num_buckets = 0
            self.max_distance = 0

        self.layers = nn.ModuleList(
            [
                TransformerSentenceEncoderLayer(
                    embedding_dim=self.embedding_dim,
                    ffn_embedding_dim=args.encoder_ffn_embed_dim,
                    num_attention_heads=args.encoder_attention_heads,
                    dropout=self.dropout,
                    attention_dropout=args.attention_dropout,
                    activation_dropout=args.activation_dropout,
                    activation_fn=args.activation_fn,
                    layer_norm_first=args.layer_norm_first,
                    deep_norm=args.deep_norm,
                    has_relative_attention_bias=self.relative_position_embedding,
                    num_buckets=self.num_buckets,
                    max_distance=self.max_distance,
                    gru_rel_pos=args.gru_rel_pos,
                    encoder_layers=args.encoder_layers,
                )
                for i in range(args.encoder_layers)
            ]
        )
        # Share layer 0's relative-attention-bias table with all other layers
        # (delete each layer's own table first, then alias layer 0's).
        if self.relative_position_embedding:
            for i in range(1, args.encoder_layers):
                del self.layers[i].self_attn.relative_attention_bias
                self.layers[i].self_attn.relative_attention_bias = self.layers[0].self_attn.relative_attention_bias

        self.layer_norm_first = args.layer_norm_first
        self.layer_norm = LayerNorm(self.embedding_dim)
        self.layerdrop = args.encoder_layerdrop

        # init_bert_params is presumably defined elsewhere in this module
        # (fairseq-style BERT initialization) — not in the visible imports.
        self.apply(init_bert_params)

        # DeepNet-style rescaled init for deep networks (beta = (8N)^(-1/4)).
        if args.deep_norm:
            deep_norm_beta = math.pow(8 * args.encoder_layers, -1 / 4)
            for i in range(args.encoder_layers):
                nn.init.xavier_normal_(self.layers[i].self_attn.k_proj.weight, gain=1)
                nn.init.xavier_normal_(self.layers[i].self_attn.v_proj.weight, gain=deep_norm_beta)
                nn.init.xavier_normal_(self.layers[i].self_attn.q_proj.weight, gain=1)
                nn.init.xavier_normal_(self.layers[i].self_attn.out_proj.weight, gain=deep_norm_beta)
                nn.init.xavier_normal_(self.layers[i].fc1.weight, gain=deep_norm_beta)
                nn.init.xavier_normal_(self.layers[i].fc2.weight, gain=deep_norm_beta)

        # Optional per-layer gradient attenuation (1 = disabled).
        self.layer_wise_gradient_decay_ratio = getattr(args, "layer_wise_gradient_decay_ratio", 1)

    def forward(self, x, padding_mask=None, layer=None):
        """Run the encoder; returns (features, per-layer results).

        When ``layer`` is given, features are taken from that layer and the
        final pre-norm is skipped.
        """
        x, layer_results = self.extract_features(x, padding_mask, layer)

        if self.layer_norm_first and layer is None:
            x = self.layer_norm(x)

        return x, layer_results

    def extract_features(self, x, padding_mask=None, tgt_layer=None):

        # Zero out padded positions before the positional conv sees them.
        if padding_mask is not None:
            x[padding_mask] = 0

        x_conv = self.pos_conv(x.transpose(1, 2))
        x_conv = x_conv.transpose(1, 2)
        x = x + x_conv

        # Post-norm variant normalizes once at the input instead of at the end.
        if not self.layer_norm_first:
            x = self.layer_norm(x)

        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        layer_results = []
        z = None
        if tgt_layer is not None:
            layer_results.append((x, z))
        r = None
        pos_bias = None
        for i, layer in enumerate(self.layers):
            if self.layer_wise_gradient_decay_ratio != 1.0:
                x = GradMultiply.apply(x, self.layer_wise_gradient_decay_ratio)
            # LayerDrop: randomly skip whole layers during training.
            dropout_probability = np.random.random()
            if not self.training or (dropout_probability > self.layerdrop):
                # pos_bias is computed by the first layer and reused by the rest.
                x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False, pos_bias=pos_bias)
            if tgt_layer is not None:
                layer_results.append((x, z))
            if i == tgt_layer:
                r = x
                break

        if r is not None:
            x = r

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        return x, layer_results
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class TransformerSentenceEncoderLayer(nn.Module):
    """Single transformer encoder layer (self-attention + FFN) supporting
    pre-norm, post-norm and DeepNorm residual scaling, plus an optional GLU FFN.
    """

    def __init__(
        self,
        embedding_dim: float = 768,
        ffn_embedding_dim: float = 3072,
        num_attention_heads: float = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = "relu",
        layer_norm_first: bool = False,
        deep_norm: bool = False,
        has_relative_attention_bias: bool = False,
        num_buckets: int = 0,
        max_distance: int = 0,
        rescale_init: bool = False,
        gru_rel_pos: bool = False,
        encoder_layers: int = 0,
    ) -> None:

        super().__init__()
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.activation_dropout = activation_dropout

        # Keep the name around: "glu" selects a different FFN path in forward().
        self.activation_name = activation_fn
        self.activation_fn = get_activation_fn(activation_fn)
        self.self_attn = MultiheadAttention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            self_attention=True,
            has_relative_attention_bias=has_relative_attention_bias,
            num_buckets=num_buckets,
            max_distance=max_distance,
            rescale_init=rescale_init,
            gru_rel_pos=gru_rel_pos,
        )

        # dropout1/3: residual dropouts; dropout2: post-activation FFN dropout.
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(self.activation_dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.layer_norm_first = layer_norm_first

        self.self_attn_layer_norm = LayerNorm(self.embedding_dim)

        # GLU FFN fuses the activation into fc1 ("swish" gate).
        if self.activation_name == "glu":
            self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish")
        else:
            self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)

        self.final_layer_norm = LayerNorm(self.embedding_dim)

        # DeepNorm scales the residual branch by alpha = (2N)^(1/4);
        # alpha = 1 reduces to standard post-norm.
        self.deep_norm = deep_norm
        if self.deep_norm:
            self.deep_norm_alpha = math.pow(2 * encoder_layers, 1 / 4)
        else:
            self.deep_norm_alpha = 1

    def forward(
        self,
        x: torch.Tensor,
        self_attn_mask: torch.Tensor = None,
        self_attn_padding_mask: torch.Tensor = None,
        need_weights: bool = False,
        pos_bias=None
    ):
        """Apply attention + FFN to ``x`` (T x B x C layout, per the encoder).

        Returns (x, attn_weights, pos_bias); pos_bias is passed back in by the
        encoder so subsequent layers reuse the same relative-position bias.
        """
        residual = x

        if self.layer_norm_first:
            # Pre-norm: normalize, attend, add residual; same pattern for FFN.
            x = self.self_attn_layer_norm(x)
            x, attn, pos_bias = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=self_attn_padding_mask,
                need_weights=False,
                attn_mask=self_attn_mask,
                position_bias=pos_bias
            )
            x = self.dropout1(x)
            x = residual + x

            residual = x
            x = self.final_layer_norm(x)
            if self.activation_name == "glu":
                x = self.fc1(x)
            else:
                x = self.activation_fn(self.fc1(x))
            x = self.dropout2(x)
            x = self.fc2(x)
            x = self.dropout3(x)
            x = residual + x
        else:
            # Post-norm (optionally DeepNorm-scaled residual): attend, add,
            # then normalize; same pattern for FFN.
            x, attn, pos_bias = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=self_attn_padding_mask,
                need_weights=need_weights,
                attn_mask=self_attn_mask,
                position_bias=pos_bias
            )

            x = self.dropout1(x)
            x = residual * self.deep_norm_alpha + x

            x = self.self_attn_layer_norm(x)

            residual = x
            if self.activation_name == "glu":
                x = self.fc1(x)
            else:
                x = self.activation_fn(self.fc1(x))
            x = self.dropout2(x)
            x = self.fc2(x)
            x = self.dropout3(x)
            x = residual * self.deep_norm_alpha + x
            x = self.final_layer_norm(x)

        return x, attn, pos_bias
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
class MultiheadAttention(nn.Module):
|
| 279 |
+
"""Multi-headed attention.
|
| 280 |
+
|
| 281 |
+
See "Attention Is All You Need" for more details.
|
| 282 |
+
"""
|
| 283 |
+
|
| 284 |
+
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
    has_relative_attention_bias=False,
    num_buckets=32,
    max_distance=128,
    gru_rel_pos=False,
    rescale_init=False,
):
    """Build the attention projections plus optional relative-position bias
    and gated relative position ("grep") parameters.
    """
    super().__init__()
    self.embed_dim = embed_dim
    # key/value dims default to embed_dim when not given separately.
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

    self.num_heads = num_heads
    self.dropout_module = nn.Dropout(dropout)

    # T5-style bucketed relative position bias: one learned scalar per
    # (bucket, head).
    self.has_relative_attention_bias = has_relative_attention_bias
    self.num_buckets = num_buckets
    self.max_distance = max_distance
    if self.has_relative_attention_bias:
        self.relative_attention_bias = nn.Embedding(num_buckets, num_heads)

    self.head_dim = embed_dim // num_heads
    self.q_head_dim = self.head_dim
    self.k_head_dim = self.head_dim
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and " "value to be of the same size"
    )

    # rescale_init drops the key-projection bias.
    k_bias = True
    if rescale_init:
        k_bias = False

    k_embed_dim = embed_dim
    q_embed_dim = embed_dim

    # quant_noise wraps each projection for quantization-aware noise
    # (a no-op when q_noise == 0).
    self.k_proj = quant_noise(
        nn.Linear(self.kdim, k_embed_dim, bias=k_bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, q_embed_dim, bias=bias), q_noise, qn_block_size
    )

    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    # Optional learned bias vectors appended to keys/values.
    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn

    # Gated relative position embedding parameters (used when gru_rel_pos).
    self.gru_rel_pos = gru_rel_pos
    if self.gru_rel_pos:
        self.grep_linear = nn.Linear(self.q_head_dim, 8)
        self.grep_a = nn.Parameter(torch.ones(1, num_heads, 1, 1))

    self.reset_parameters()
|
| 369 |
+
|
| 370 |
+
def reset_parameters(self):
    """Re-initialize every projection and optional bias of the attention module.

    When q/k/v share one embedding dim, their projections get Xavier-uniform
    init scaled by 1/sqrt(2) — empirically observed to converge much better.
    """
    # Same order as the attributes are defined: k, v, q.
    shared_gain = 1 / math.sqrt(2) if self.qkv_same_dim else 1.0
    for proj in (self.k_proj, self.v_proj, self.q_proj):
        nn.init.xavier_uniform_(proj.weight, gain=shared_gain)

    nn.init.xavier_uniform_(self.out_proj.weight)
    if self.out_proj.bias is not None:
        nn.init.constant_(self.out_proj.bias, 0.0)

    # Optional appended key/value biases, when add_bias_kv was set.
    for extra_bias in (self.bias_k, self.bias_v):
        if extra_bias is not None:
            nn.init.xavier_normal_(extra_bias)

    if self.has_relative_attention_bias:
        nn.init.xavier_normal_(self.relative_attention_bias.weight)
|
| 391 |
+
|
| 392 |
+
def _relative_positions_bucket(self, relative_positions, bidirectional=True):
    """Map signed relative positions to T5-style bucket indices.

    Half the buckets (per direction, when bidirectional) cover exact small
    distances; the rest cover larger distances on a log scale, clamped to
    the last bucket at self.max_distance.
    """
    n_buckets = self.num_buckets
    max_dist = self.max_distance

    if bidirectional:
        # Split the bucket range between the two directions; forward
        # positions land in the upper half via this offset.
        n_buckets //= 2
        bucket_offset = (relative_positions > 0).to(torch.long) * n_buckets
        distance = torch.abs(relative_positions)
    else:
        bucket_offset = 0
        # Unidirectional: only non-positive offsets are meaningful.
        distance = -torch.min(relative_positions, torch.zeros_like(relative_positions))

    exact_limit = n_buckets // 2
    in_exact_range = distance < exact_limit

    # Logarithmic bucketing for distances beyond the exact range.
    log_bucket = exact_limit + (
        torch.log(distance.float() / exact_limit)
        / math.log(max_dist / exact_limit)
        * (n_buckets - exact_limit)
    ).to(torch.long)
    log_bucket = torch.min(log_bucket, torch.full_like(log_bucket, n_buckets - 1))

    return bucket_offset + torch.where(in_exact_range, distance, log_bucket)
|
| 418 |
+
|
| 419 |
+
def compute_bias(self, query_length, key_length):
    """Return the relative position bias, shape (num_heads, query_length, key_length).

    Builds the (query, key) relative-offset grid, buckets it, and looks each
    bucket up in the learned relative_attention_bias embedding table.
    """
    context_position = torch.arange(query_length, dtype=torch.long)[:, None]
    memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
    # Broadcasts to (query_length, key_length) signed offsets.
    relative_position = memory_position - context_position
    relative_position_bucket = self._relative_positions_bucket(
        relative_position,
        bidirectional=True
    )
    # Move indices to the embedding's device before the lookup.
    relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device)
    values = self.relative_attention_bias(relative_position_bucket)
    # (q, k, heads) -> (heads, q, k)
    values = values.permute([2, 0, 1])
    return values
|
| 431 |
+
|
| 432 |
+
def forward(
|
| 433 |
+
self,
|
| 434 |
+
query,
|
| 435 |
+
key: Optional[Tensor],
|
| 436 |
+
value: Optional[Tensor],
|
| 437 |
+
key_padding_mask: Optional[Tensor] = None,
|
| 438 |
+
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
|
| 439 |
+
need_weights: bool = True,
|
| 440 |
+
static_kv: bool = False,
|
| 441 |
+
attn_mask: Optional[Tensor] = None,
|
| 442 |
+
before_softmax: bool = False,
|
| 443 |
+
need_head_weights: bool = False,
|
| 444 |
+
position_bias: Optional[Tensor] = None
|
| 445 |
+
) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
|
| 446 |
+
"""Input shape: Time x Batch x Channel
|
| 447 |
+
|
| 448 |
+
Args:
|
| 449 |
+
key_padding_mask (ByteTensor, optional): mask to exclude
|
| 450 |
+
keys that are pads, of shape `(batch, src_len)`, where
|
| 451 |
+
padding elements are indicated by 1s.
|
| 452 |
+
need_weights (bool, optional): return the attention weights,
|
| 453 |
+
averaged over heads (default: False).
|
| 454 |
+
attn_mask (ByteTensor, optional): typically used to
|
| 455 |
+
implement causal attention, where the mask prevents the
|
| 456 |
+
attention from looking forward in time (default: None).
|
| 457 |
+
before_softmax (bool, optional): return the raw attention
|
| 458 |
+
weights and values before the attention softmax.
|
| 459 |
+
need_head_weights (bool, optional): return the attention
|
| 460 |
+
weights for each head. Implies *need_weights*. Default:
|
| 461 |
+
return the average attention weights over all heads.
|
| 462 |
+
"""
|
| 463 |
+
if need_head_weights:
|
| 464 |
+
need_weights = True
|
| 465 |
+
|
| 466 |
+
is_tpu = query.device.type == "xla"
|
| 467 |
+
|
| 468 |
+
tgt_len, bsz, embed_dim = query.size()
|
| 469 |
+
src_len = tgt_len
|
| 470 |
+
assert embed_dim == self.embed_dim
|
| 471 |
+
assert list(query.size()) == [tgt_len, bsz, embed_dim]
|
| 472 |
+
if key is not None:
|
| 473 |
+
src_len, key_bsz, _ = key.size()
|
| 474 |
+
if not torch.jit.is_scripting():
|
| 475 |
+
assert key_bsz == bsz
|
| 476 |
+
assert value is not None
|
| 477 |
+
assert src_len, bsz == value.shape[:2]
|
| 478 |
+
|
| 479 |
+
if self.has_relative_attention_bias and position_bias is None:
|
| 480 |
+
position_bias = self.compute_bias(tgt_len, src_len)
|
| 481 |
+
position_bias = position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, src_len)
|
| 482 |
+
|
| 483 |
+
if incremental_state is not None:
|
| 484 |
+
saved_state = self._get_input_buffer(incremental_state)
|
| 485 |
+
if saved_state is not None and "prev_key" in saved_state:
|
| 486 |
+
# previous time steps are cached - no need to recompute
|
| 487 |
+
# key and value if they are static
|
| 488 |
+
if static_kv:
|
| 489 |
+
assert self.encoder_decoder_attention and not self.self_attention
|
| 490 |
+
key = value = None
|
| 491 |
+
else:
|
| 492 |
+
saved_state = None
|
| 493 |
+
|
| 494 |
+
if self.self_attention:
|
| 495 |
+
q = self.q_proj(query)
|
| 496 |
+
k = self.k_proj(query)
|
| 497 |
+
v = self.v_proj(query)
|
| 498 |
+
elif self.encoder_decoder_attention:
|
| 499 |
+
# encoder-decoder attention
|
| 500 |
+
q = self.q_proj(query)
|
| 501 |
+
if key is None:
|
| 502 |
+
assert value is None
|
| 503 |
+
k = v = None
|
| 504 |
+
else:
|
| 505 |
+
k = self.k_proj(key)
|
| 506 |
+
v = self.v_proj(key)
|
| 507 |
+
|
| 508 |
+
else:
|
| 509 |
+
assert key is not None and value is not None
|
| 510 |
+
q = self.q_proj(query)
|
| 511 |
+
k = self.k_proj(key)
|
| 512 |
+
v = self.v_proj(value)
|
| 513 |
+
q *= self.scaling
|
| 514 |
+
alpha = 32
|
| 515 |
+
q *= 1 / alpha
|
| 516 |
+
|
| 517 |
+
if self.bias_k is not None:
|
| 518 |
+
assert self.bias_v is not None
|
| 519 |
+
k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
|
| 520 |
+
v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
|
| 521 |
+
if attn_mask is not None:
|
| 522 |
+
attn_mask = torch.cat(
|
| 523 |
+
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
|
| 524 |
+
)
|
| 525 |
+
if key_padding_mask is not None:
|
| 526 |
+
key_padding_mask = torch.cat(
|
| 527 |
+
[
|
| 528 |
+
key_padding_mask,
|
| 529 |
+
key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
|
| 530 |
+
],
|
| 531 |
+
dim=1,
|
| 532 |
+
)
|
| 533 |
+
|
| 534 |
+
q = (
|
| 535 |
+
q.contiguous()
|
| 536 |
+
.view(tgt_len, bsz * self.num_heads, self.q_head_dim)
|
| 537 |
+
.transpose(0, 1)
|
| 538 |
+
)
|
| 539 |
+
if k is not None:
|
| 540 |
+
k = (
|
| 541 |
+
k.contiguous()
|
| 542 |
+
.view(-1, bsz * self.num_heads, self.k_head_dim)
|
| 543 |
+
.transpose(0, 1)
|
| 544 |
+
)
|
| 545 |
+
if v is not None:
|
| 546 |
+
v = (
|
| 547 |
+
v.contiguous()
|
| 548 |
+
.view(-1, bsz * self.num_heads, self.head_dim)
|
| 549 |
+
.transpose(0, 1)
|
| 550 |
+
)
|
| 551 |
+
|
| 552 |
+
if saved_state is not None:
|
| 553 |
+
# saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
|
| 554 |
+
if "prev_key" in saved_state:
|
| 555 |
+
_prev_key = saved_state["prev_key"]
|
| 556 |
+
assert _prev_key is not None
|
| 557 |
+
prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
|
| 558 |
+
if static_kv:
|
| 559 |
+
k = prev_key
|
| 560 |
+
else:
|
| 561 |
+
assert k is not None
|
| 562 |
+
k = torch.cat([prev_key, k], dim=1)
|
| 563 |
+
src_len = k.size(1)
|
| 564 |
+
if "prev_value" in saved_state:
|
| 565 |
+
_prev_value = saved_state["prev_value"]
|
| 566 |
+
assert _prev_value is not None
|
| 567 |
+
prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
|
| 568 |
+
if static_kv:
|
| 569 |
+
v = prev_value
|
| 570 |
+
else:
|
| 571 |
+
assert v is not None
|
| 572 |
+
v = torch.cat([prev_value, v], dim=1)
|
| 573 |
+
prev_key_padding_mask: Optional[Tensor] = None
|
| 574 |
+
if "prev_key_padding_mask" in saved_state:
|
| 575 |
+
prev_key_padding_mask = saved_state["prev_key_padding_mask"]
|
| 576 |
+
assert k is not None and v is not None
|
| 577 |
+
key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
|
| 578 |
+
key_padding_mask=key_padding_mask,
|
| 579 |
+
prev_key_padding_mask=prev_key_padding_mask,
|
| 580 |
+
batch_size=bsz,
|
| 581 |
+
src_len=k.size(1),
|
| 582 |
+
static_kv=static_kv,
|
| 583 |
+
)
|
| 584 |
+
|
| 585 |
+
saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
|
| 586 |
+
saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
|
| 587 |
+
saved_state["prev_key_padding_mask"] = key_padding_mask
|
| 588 |
+
# In this branch incremental_state is never None
|
| 589 |
+
assert incremental_state is not None
|
| 590 |
+
incremental_state = self._set_input_buffer(incremental_state, saved_state)
|
| 591 |
+
assert k is not None
|
| 592 |
+
assert k.size(1) == src_len
|
| 593 |
+
|
| 594 |
+
# This is part of a workaround to get around fork/join parallelism
|
| 595 |
+
# not supporting Optional types.
|
| 596 |
+
if key_padding_mask is not None and key_padding_mask.dim() == 0:
|
| 597 |
+
key_padding_mask = None
|
| 598 |
+
|
| 599 |
+
if key_padding_mask is not None:
|
| 600 |
+
assert key_padding_mask.size(0) == bsz
|
| 601 |
+
assert key_padding_mask.size(1) == src_len
|
| 602 |
+
|
| 603 |
+
if self.add_zero_attn:
|
| 604 |
+
assert v is not None
|
| 605 |
+
src_len += 1
|
| 606 |
+
k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
|
| 607 |
+
v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
|
| 608 |
+
if attn_mask is not None:
|
| 609 |
+
attn_mask = torch.cat(
|
| 610 |
+
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
|
| 611 |
+
)
|
| 612 |
+
if key_padding_mask is not None:
|
| 613 |
+
key_padding_mask = torch.cat(
|
| 614 |
+
[
|
| 615 |
+
key_padding_mask,
|
| 616 |
+
torch.zeros(key_padding_mask.size(0), 1).type_as(
|
| 617 |
+
key_padding_mask
|
| 618 |
+
),
|
| 619 |
+
],
|
| 620 |
+
dim=1,
|
| 621 |
+
)
|
| 622 |
+
|
| 623 |
+
attn_weights = torch.bmm(q, k.transpose(1, 2))
|
| 624 |
+
attn_weights = (attn_weights - attn_weights.max(dim=-1, keepdim=True)[0]) * alpha
|
| 625 |
+
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
|
| 626 |
+
|
| 627 |
+
assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
|
| 628 |
+
|
| 629 |
+
if attn_mask is not None:
|
| 630 |
+
attn_mask = attn_mask.unsqueeze(0)
|
| 631 |
+
attn_weights += attn_mask
|
| 632 |
+
|
| 633 |
+
if key_padding_mask is not None:
|
| 634 |
+
# don't attend to padding symbols
|
| 635 |
+
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
|
| 636 |
+
if not is_tpu:
|
| 637 |
+
attn_weights = attn_weights.masked_fill(
|
| 638 |
+
key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
|
| 639 |
+
float("-inf"),
|
| 640 |
+
)
|
| 641 |
+
else:
|
| 642 |
+
attn_weights = attn_weights.transpose(0, 2)
|
| 643 |
+
attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
|
| 644 |
+
attn_weights = attn_weights.transpose(0, 2)
|
| 645 |
+
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
| 646 |
+
|
| 647 |
+
if before_softmax:
|
| 648 |
+
return attn_weights, v, position_bias
|
| 649 |
+
|
| 650 |
+
if position_bias is not None:
|
| 651 |
+
attn_mask_rel_pos = position_bias
|
| 652 |
+
if self.gru_rel_pos == 1:
|
| 653 |
+
query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim) * alpha / self.scaling
|
| 654 |
+
_B, _H, _L, __ = query_layer.size()
|
| 655 |
+
gate_a, gate_b = torch.sigmoid(self.grep_linear(query_layer).view(
|
| 656 |
+
_B, _H, _L, 2, 4).sum(-1, keepdim=False)).chunk(2, dim=-1)
|
| 657 |
+
gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
|
| 658 |
+
attn_mask_rel_pos = gate_a_1.view(bsz * self.num_heads, tgt_len, 1) * position_bias
|
| 659 |
+
|
| 660 |
+
attn_mask_rel_pos = attn_mask_rel_pos.view(attn_weights.size())
|
| 661 |
+
|
| 662 |
+
attn_weights = attn_weights + attn_mask_rel_pos
|
| 663 |
+
|
| 664 |
+
attn_weights_float = F.softmax(
|
| 665 |
+
attn_weights, dim=-1
|
| 666 |
+
)
|
| 667 |
+
attn_weights = attn_weights_float.type_as(attn_weights)
|
| 668 |
+
attn_probs = self.dropout_module(attn_weights)
|
| 669 |
+
|
| 670 |
+
assert v is not None
|
| 671 |
+
attn = torch.bmm(attn_probs, v)
|
| 672 |
+
assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
|
| 673 |
+
attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
|
| 674 |
+
attn = self.out_proj(attn)
|
| 675 |
+
attn_weights: Optional[Tensor] = None
|
| 676 |
+
if need_weights:
|
| 677 |
+
attn_weights = attn_weights_float.view(
|
| 678 |
+
bsz, self.num_heads, tgt_len, src_len
|
| 679 |
+
).transpose(1, 0)
|
| 680 |
+
if not need_head_weights:
|
| 681 |
+
# average attention weights over heads
|
| 682 |
+
attn_weights = attn_weights.mean(dim=0)
|
| 683 |
+
|
| 684 |
+
return attn, attn_weights, position_bias
|
| 685 |
+
|
| 686 |
+
@staticmethod
|
| 687 |
+
def _append_prev_key_padding_mask(
|
| 688 |
+
key_padding_mask: Optional[Tensor],
|
| 689 |
+
prev_key_padding_mask: Optional[Tensor],
|
| 690 |
+
batch_size: int,
|
| 691 |
+
src_len: int,
|
| 692 |
+
static_kv: bool,
|
| 693 |
+
) -> Optional[Tensor]:
|
| 694 |
+
# saved key padding masks have shape (bsz, seq_len)
|
| 695 |
+
if prev_key_padding_mask is not None and static_kv:
|
| 696 |
+
new_key_padding_mask = prev_key_padding_mask
|
| 697 |
+
elif prev_key_padding_mask is not None and key_padding_mask is not None:
|
| 698 |
+
new_key_padding_mask = torch.cat(
|
| 699 |
+
[prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
|
| 700 |
+
)
|
| 701 |
+
# During incremental decoding, as the padding token enters and
|
| 702 |
+
# leaves the frame, there will be a time when prev or current
|
| 703 |
+
# is None
|
| 704 |
+
elif prev_key_padding_mask is not None:
|
| 705 |
+
if src_len > prev_key_padding_mask.size(1):
|
| 706 |
+
filler = torch.zeros(
|
| 707 |
+
(batch_size, src_len - prev_key_padding_mask.size(1)),
|
| 708 |
+
device=prev_key_padding_mask.device,
|
| 709 |
+
)
|
| 710 |
+
new_key_padding_mask = torch.cat(
|
| 711 |
+
[prev_key_padding_mask.float(), filler.float()], dim=1
|
| 712 |
+
)
|
| 713 |
+
else:
|
| 714 |
+
new_key_padding_mask = prev_key_padding_mask.float()
|
| 715 |
+
elif key_padding_mask is not None:
|
| 716 |
+
if src_len > key_padding_mask.size(1):
|
| 717 |
+
filler = torch.zeros(
|
| 718 |
+
(batch_size, src_len - key_padding_mask.size(1)),
|
| 719 |
+
device=key_padding_mask.device,
|
| 720 |
+
)
|
| 721 |
+
new_key_padding_mask = torch.cat(
|
| 722 |
+
[filler.float(), key_padding_mask.float()], dim=1
|
| 723 |
+
)
|
| 724 |
+
else:
|
| 725 |
+
new_key_padding_mask = key_padding_mask.float()
|
| 726 |
+
else:
|
| 727 |
+
new_key_padding_mask = prev_key_padding_mask
|
| 728 |
+
return new_key_padding_mask
|
| 729 |
+
|
| 730 |
+
def _get_input_buffer(
|
| 731 |
+
self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
|
| 732 |
+
) -> Dict[str, Optional[Tensor]]:
|
| 733 |
+
result = self.get_incremental_state(incremental_state, "attn_state")
|
| 734 |
+
if result is not None:
|
| 735 |
+
return result
|
| 736 |
+
else:
|
| 737 |
+
empty_result: Dict[str, Optional[Tensor]] = {}
|
| 738 |
+
return empty_result
|
| 739 |
+
|
| 740 |
+
def _set_input_buffer(
|
| 741 |
+
self,
|
| 742 |
+
incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
|
| 743 |
+
buffer: Dict[str, Optional[Tensor]],
|
| 744 |
+
):
|
| 745 |
+
return self.set_incremental_state(incremental_state, "attn_state", buffer)
|
| 746 |
+
|
| 747 |
+
def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
    """Hook for sparse-attention variants; the dense base implementation is a no-op.

    Subclasses may override this to zero out disallowed attention positions;
    here the weights pass through unchanged.
    """
    return attn_weights
|
| 749 |
+
|
| 750 |
+
|
| 751 |
+
def init_bert_params(module):
    """
    Initialize the weights specific to the BERT model.

    Overrides the default initializations depending on the module type:
    1. ``nn.Linear`` weights are drawn from N(0, 0.02) and the bias, when
       present, is zeroed.
    2. ``nn.Embedding`` weights are drawn from N(0, 0.02) and the padding
       row, when a padding index is set, is zeroed.
    3. ``MultiheadAttention`` q/k/v projection weights are drawn from
       N(0, 0.02).
    """

    def normal_(data):
        # With FSDP, module params may live on CUDA; sample on CPU so the
        # RNG stream is identical with and without FSDP, then copy back.
        cpu_sample = data.cpu().normal_(mean=0.0, std=0.02)
        data.copy_(cpu_sample.to(data.device))

    if isinstance(module, nn.Linear):
        normal_(module.weight.data)
        if module.bias is not None:
            module.bias.data.zero_()
    if isinstance(module, nn.Embedding):
        normal_(module.weight.data)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    if isinstance(module, MultiheadAttention):
        normal_(module.q_proj.weight.data)
        normal_(module.k_proj.weight.data)
        normal_(module.v_proj.weight.data)
|