yuccaaa committed
Commit 9627ce0 · verified · 1 Parent(s): 31ec239

Add files using upload-large-folder tool
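The upload-large-folder tool named in the commit message is the resumable large-upload utility from huggingface_hub. As a minimal sketch of the equivalent Python call (the repo id and local folder below are hypothetical placeholders, not values taken from this commit):

# Sketch: resumable upload of a large local folder, as a series of commits.
from huggingface_hub import HfApi

api = HfApi()
api.upload_large_folder(
    repo_id="yuccaaa/example-repo",   # hypothetical; the target repo id is not shown in this view
    folder_path="./local_checkpoints",  # hypothetical local path
    repo_type="model",
)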

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/args.json +364 -0
  2. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/config.json +29 -0
  3. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/generation_config.json +14 -0
  4. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/merges.txt +0 -0
  5. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/model.safetensors.index.json +346 -0
  6. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/special_tokens_map.json +31 -0
  7. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/tokenizer_config.json +208 -0
  8. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/vocab.json +0 -0
  9. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/README.md +202 -0
  10. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/adapter_config.json +39 -0
  11. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/additional_config.json +1 -0
  12. BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/args.json +364 -0
  13. EasyR1-new/.gitignore +181 -0
  14. EasyR1-new/.pre-commit-config.yaml +22 -0
  15. EasyR1-new/Dockerfile +65 -0
  16. EasyR1-new/Dockerfile.legacy +72 -0
  17. EasyR1-new/LICENSE +201 -0
  18. EasyR1-new/Makefile +24 -0
  19. EasyR1-new/README.md +223 -0
  20. EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/media/table/val/generations_0_2feffd203d182aadef48.table.json +1 -0
  21. EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/requirements.txt +216 -0
  22. EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/wandb-metadata.json +91 -0
  23. EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/wandb-summary.json +1 -0
  24. EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug-internal.log +16 -0
  25. EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug.log +28 -0
  26. EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/config.yaml +230 -0
  27. EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/media/table/val/generations_0_2feffd203d182aadef48.table.json +1 -0
  28. EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/media/table/val/generations_104_ccf2e4d7f5a8bd440fdc.table.json +0 -0
  29. EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/media/table/val/generations_109_dfde4027f4468ecb4bdb.table.json +0 -0
  30. EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/output.log +0 -0
  31. EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/requirements.txt +216 -0
  32. EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/wandb-metadata.json +91 -0
  33. EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/wandb-summary.json +1 -0
  34. EasyR1/verl.egg-info/PKG-INFO +264 -0
  35. EasyR1/verl.egg-info/SOURCES.txt +71 -0
  36. EasyR1/verl.egg-info/dependency_links.txt +1 -0
  37. EasyR1/verl.egg-info/requires.txt +23 -0
  38. EasyR1/verl.egg-info/top_level.txt +1 -0
  39. EasyR1/verl/workers/sharding_manager/__pycache__/__init__.cpython-310.pyc +0 -0
  40. EasyR1/verl/workers/sharding_manager/__pycache__/base.cpython-310.pyc +0 -0
  41. EasyR1/verl/workers/sharding_manager/__pycache__/fsdp_ulysses.cpython-310.pyc +0 -0
  42. EasyR1/verl/workers/sharding_manager/__pycache__/fsdp_vllm.cpython-310.pyc +0 -0
  43. EasyR1/verl/workers/sharding_manager/fsdp_ulysses.py +65 -0
  44. EasyR1/verl/workers/sharding_manager/fsdp_vllm.py +149 -0
  45. LAVIS-main/lavis/models/alpro_models/alpro_retrieval.py +422 -0
  46. LAVIS-main/lavis/models/beats/BEATs.py +180 -0
  47. LAVIS-main/lavis/models/beats/LICENSE_BEATs.txt +21 -0
  48. LAVIS-main/lavis/models/beats/README.md +127 -0
  49. LAVIS-main/lavis/models/beats/Tokenizers.py +173 -0
  50. LAVIS-main/lavis/models/beats/backbone.py +783 -0
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/args.json ADDED
@@ -0,0 +1,364 @@
+{
+  "model": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
+  "model_type": "qwen2_5",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "num_labels": null,
+  "problem_type": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "max_memory": {},
+  "local_repo_path": null,
+  "template": "qwen2_5",
+  "system": null,
+  "max_length": 8192,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "tools_prompt": "react_en",
+  "norm_bbox": null,
+  "response_prefix": null,
+  "padding_side": "right",
+  "loss_scale": "default",
+  "sequence_parallel_size": 1,
+  "use_chat_template": true,
+  "template_backend": "swift",
+  "dataset": [
+    "/nas/shared/kilab/wangyujia/material_production_train.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.01,
+  "data_seed": 42,
+  "dataset_num_proc": 128,
+  "dataset_shuffle": true,
+  "val_dataset_shuffle": false,
+  "streaming": false,
+  "interleave_prob": null,
+  "stopping_strategy": "first_exhausted",
+  "shuffle_buffer_size": 1000,
+  "enable_cache": false,
+  "download_mode": "reuse_dataset_if_exists",
+  "columns": {},
+  "strict": false,
+  "remove_unused_columns": true,
+  "model_name": [
+    "qwen_bio_sft_deeplocbinary-08022035"
+  ],
+  "model_author": [
+    "swift"
+  ],
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": 64,
+  "temperature": 0.0,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
+  "load_dataset_config": null,
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "lora",
+  "adapters": [],
+  "external_plugins": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": false,
+  "load_data_args": false,
+  "use_hf": false,
+  "hub_token": null,
+  "custom_register_path": [],
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "output_dir": "/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250",
+  "overwrite_output_dir": false,
+  "do_train": false,
+  "do_eval": false,
+  "do_predict": false,
+  "eval_strategy": "steps",
+  "prediction_loss_only": false,
+  "per_device_train_batch_size": 2,
+  "per_device_eval_batch_size": 2,
+  "per_gpu_train_batch_size": null,
+  "per_gpu_eval_batch_size": null,
+  "gradient_accumulation_steps": 4,
+  "eval_accumulation_steps": null,
+  "eval_delay": 0,
+  "torch_empty_cache_steps": null,
+  "learning_rate": 1e-05,
+  "weight_decay": 0.1,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.95,
+  "adam_epsilon": 1e-08,
+  "max_grad_norm": 1.0,
+  "num_train_epochs": 3.0,
+  "max_steps": -1,
+  "lr_scheduler_type": "cosine",
+  "lr_scheduler_kwargs": null,
+  "warmup_ratio": 0.05,
+  "warmup_steps": 0,
+  "log_level": "passive",
+  "log_level_replica": "warning",
+  "log_on_each_node": true,
+  "logging_dir": "/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250/runs",
+  "logging_strategy": "steps",
+  "logging_first_step": true,
+  "logging_steps": 1,
+  "logging_nan_inf_filter": true,
+  "save_strategy": "steps",
+  "save_steps": 5.0,
+  "save_total_limit": 5,
+  "save_safetensors": true,
+  "save_on_each_node": false,
+  "save_only_model": true,
+  "restore_callback_states_from_checkpoint": false,
+  "no_cuda": false,
+  "use_cpu": false,
+  "use_mps_device": false,
+  "jit_mode_eval": false,
+  "use_ipex": false,
+  "bf16": true,
+  "fp16": false,
+  "fp16_opt_level": "O1",
+  "half_precision_backend": "auto",
+  "bf16_full_eval": false,
+  "fp16_full_eval": false,
+  "tf32": null,
+  "local_rank": 0,
+  "ddp_backend": null,
+  "tpu_num_cores": null,
+  "tpu_metrics_debug": false,
+  "debug": null,
+  "dataloader_drop_last": false,
+  "eval_steps": 5.0,
+  "dataloader_num_workers": 1,
+  "dataloader_prefetch_factor": null,
+  "past_index": -1,
+  "run_name": "construct",
+  "disable_tqdm": null,
+  "label_names": null,
+  "load_best_model_at_end": false,
+  "metric_for_best_model": "loss",
+  "greater_is_better": false,
+  "ignore_data_skip": false,
+  "fsdp": "",
+  "fsdp_min_num_params": 0,
+  "fsdp_config": null,
+  "tp_size": 0,
+  "fsdp_transformer_layer_cls_to_wrap": null,
+  "accelerator_config": {
+    "dispatch_batches": false
+  },
+  "deepspeed": {
+    "fp16": {
+      "enabled": "auto",
+      "loss_scale": 0,
+      "loss_scale_window": 1000,
+      "initial_scale_power": 16,
+      "hysteresis": 2,
+      "min_loss_scale": 1
+    },
+    "bf16": {
+      "enabled": "auto"
+    },
+    "zero_optimization": {
+      "stage": 3,
+      "offload_optimizer": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "offload_param": {
+        "device": "none",
+        "pin_memory": true
+      },
+      "overlap_comm": false,
+      "contiguous_gradients": true,
+      "sub_group_size": 1000000000.0,
+      "reduce_bucket_size": "auto",
+      "zero_quantized_weights": false,
+      "zero_quantized_gradients": false,
+      "stage3_prefetch_bucket_size": "auto",
+      "stage3_param_persistence_threshold": "auto",
+      "stage3_max_live_parameters": 1000000000.0,
+      "stage3_max_reuse_distance": 1000000000.0,
+      "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+  },
+  "label_smoothing_factor": 0.0,
+  "optim": "adamw_torch",
+  "optim_args": null,
+  "adafactor": false,
+  "group_by_length": false,
+  "length_column_name": "length",
+  "report_to": [
+    "tensorboard"
+  ],
+  "ddp_find_unused_parameters": null,
+  "ddp_bucket_cap_mb": null,
+  "ddp_broadcast_buffers": null,
+  "dataloader_pin_memory": true,
+  "dataloader_persistent_workers": false,
+  "skip_memory_metrics": true,
+  "use_legacy_prediction_loop": false,
+  "push_to_hub": false,
+  "resume_from_checkpoint": null,
+  "hub_model_id": null,
+  "hub_strategy": "every_save",
+  "hub_private_repo": null,
+  "hub_always_push": false,
+  "gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "include_inputs_for_metrics": false,
+  "include_for_metrics": [],
+  "eval_do_concat_batches": true,
+  "fp16_backend": "auto",
+  "push_to_hub_model_id": null,
+  "push_to_hub_organization": null,
+  "push_to_hub_token": null,
+  "mp_parameters": "",
+  "auto_find_batch_size": false,
+  "full_determinism": false,
+  "torchdynamo": null,
+  "ray_scope": "last",
+  "ddp_timeout": 1800,
+  "torch_compile": false,
+  "torch_compile_backend": null,
+  "torch_compile_mode": null,
+  "include_tokens_per_second": false,
+  "include_num_input_tokens_seen": false,
+  "neftune_noise_alpha": null,
+  "optim_target_modules": null,
+  "batch_eval_metrics": false,
+  "eval_on_start": false,
+  "use_liger_kernel": false,
+  "eval_use_gather_object": false,
+  "average_tokens_across_devices": false,
+  "sortish_sampler": false,
+  "predict_with_generate": false,
+  "generation_max_length": null,
+  "generation_num_beams": null,
+  "generation_config": null,
+  "check_model": true,
+  "acc_strategy": "token",
+  "train_dataloader_shuffle": true,
+  "metric_warmup_step": 0,
+  "fsdp_num": 1,
+  "acc_steps": 1,
+  "eval_use_evalscope": false,
+  "eval_datasets": [],
+  "eval_limit": null,
+  "eval_datasets_args": null,
+  "eval_generation_config": null,
+  "freeze_parameters": [],
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 8,
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "lorap_lr_ratio": null,
+  "use_rslora": false,
+  "use_dora": false,
+  "lora_ga_batch_size": 2,
+  "lora_ga_iters": 2,
+  "lora_ga_max_length": 1024,
+  "lora_ga_direction": "ArB2r",
+  "lora_ga_scale": "stable",
+  "lora_ga_stable_gamma": 16,
+  "init_weights": true,
+  "fourier_n_frequency": 2000,
+  "fourier_scaling": 300.0,
+  "boft_block_size": 4,
+  "boft_block_num": 0,
+  "boft_n_butterfly_factor": 1,
+  "boft_dropout": 0.0,
+  "vera_rank": 256,
+  "vera_projection_prng_key": 0,
+  "vera_dropout": 0.0,
+  "vera_d_initial": 0.1,
+  "adapter_act": "gelu",
+  "adapter_length": 128,
+  "use_galore": false,
+  "galore_target_modules": null,
+  "galore_rank": 128,
+  "galore_update_proj_gap": 50,
+  "galore_scale": 1.0,
+  "galore_proj_type": "std",
+  "galore_optim_per_parameter": false,
+  "galore_with_embedding": false,
+  "galore_quantization": false,
+  "galore_proj_quant": false,
+  "galore_proj_bits": 4,
+  "galore_proj_group_size": 256,
+  "galore_cos_threshold": 0.4,
+  "galore_gamma_proj": 2,
+  "galore_queue_size": 5,
+  "adalora_target_r": 8,
+  "adalora_init_r": 12,
+  "adalora_tinit": 0,
+  "adalora_tfinal": 0,
+  "adalora_deltaT": 1,
+  "adalora_beta1": 0.85,
+  "adalora_beta2": 0.85,
+  "adalora_orth_reg_weight": 0.5,
+  "llamapro_num_new_blocks": 4,
+  "llamapro_num_groups": null,
+  "lisa_activated_layers": 0,
+  "lisa_step_interval": 20,
+  "reft_layer_key": null,
+  "reft_layers": null,
+  "reft_rank": 4,
+  "reft_intervention_type": "LoreftIntervention",
+  "reft_args": null,
+  "swanlab_token": null,
+  "swanlab_project": null,
+  "swanlab_workspace": null,
+  "swanlab_exp_name": null,
+  "swanlab_mode": "cloud",
+  "add_version": true,
+  "resume_only_model": false,
+  "create_checkpoint_symlink": false,
+  "packing": false,
+  "lazy_tokenize": false,
+  "loss_type": null,
+  "optimizer": null,
+  "metric": null,
+  "zero_hpz_partition_size": null,
+  "rank": 0,
+  "global_world_size": 8,
+  "local_world_size": 8,
+  "model_suffix": "checkpoint-50-merged",
+  "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding'])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f22ea5f5e10>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=['transformers>=4.37'], tags=[])",
+  "model_dir": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "evaluation_strategy": "steps",
+  "training_args": "Seq2SeqTrainingArguments(output_dir='/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=5, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=True, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=5, dataloader_num_workers=1, dataloader_prefetch_factor=10, past_index=-1, run_name='/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None)"
+}
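The hyperparameters above imply an effective global batch size of per_device_train_batch_size x gradient_accumulation_steps x global_world_size = 2 x 4 x 8 = 64 sequences per optimizer step. A minimal stdlib sketch that recomputes this from the dump (the file path is a placeholder):

# Sketch: derive the effective global batch size from an ms-swift args.json dump.
import json

with open("checkpoint-1029-merged/args.json") as f:  # placeholder path
    args = json.load(f)

effective_batch = (
    args["per_device_train_batch_size"]    # 2
    * args["gradient_accumulation_steps"]  # 4
    * args["global_world_size"]            # 8 ranks
)
print(effective_batch)  # 64 for this run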
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/config.json ADDED
@@ -0,0 +1,29 @@
+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "model_type": "qwen2",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": 131072,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}
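Because the merged checkpoint advertises the stock Qwen2ForCausalLM architecture in bfloat16, it should load with plain transformers (the model metadata above requires transformers>=4.37; 4.51.3 wrote these files). A minimal sketch, with the checkpoint directory as a placeholder local path:

# Sketch: load the merged LoRA checkpoint as a regular Qwen2 causal LM.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = "BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(
    ckpt,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
    device_map="auto",           # requires accelerate; drop for single-device CPU/GPU loading
)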
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/generation_config.json ADDED
@@ -0,0 +1,14 @@
+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.05,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.51.3"
+}
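from_pretrained picks up generation_config.json automatically, so model.generate() already samples with temperature 0.7, top_p 0.8, top_k 20, and repetition penalty 1.05; passing them explicitly, as in this sketch continuing from the loading example above, is equivalent (the prompt is hypothetical):

# Sketch: generate with the sampling parameters recorded in generation_config.json.
messages = [{"role": "user", "content": "Where is this protein localized?"}]  # hypothetical prompt
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

out = model.generate(
    inputs,
    max_new_tokens=64,    # matches "max_new_tokens" in args.json
    do_sample=True,
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    repetition_penalty=1.05,
)
print(tokenizer.decode(out[0, inputs.shape[1]:], skip_special_tokens=True))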
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/model.safetensors.index.json ADDED
@@ -0,0 +1,346 @@
+{
+  "metadata": {
+    "total_size": 15231233024
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00004-of-00004.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
316
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
318
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
320
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
321
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
327
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
328
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
330
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
332
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
333
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
344
+ "model.norm.weight": "model-00003-of-00004.safetensors"
345
+ }
346
+ }
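A note on consuming this index: `model.safetensors.index.json` maps every tensor name to the shard that stores it, so a layer may legitimately straddle two shards (e.g. layer 8's attention projections in shard 1 while its MLP and layernorms sit in shard 2 above). A minimal sketch of loading the merged checkpoint — the path mirrors this commit's directory layout, and `device_map="auto"` assumes `accelerate` is installed:

```python
# Minimal sketch: load the merged, sharded checkpoint recorded above.
# transformers reads model.safetensors.index.json and fetches each tensor
# from whichever shard this index points it to.
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = "BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged"
model = AutoModelForCausalLM.from_pretrained(
    ckpt,
    torch_dtype="bfloat16",   # matches the training dtype recorded in args.json
    device_map="auto",        # assumes accelerate is available
)
tokenizer = AutoTokenizer.from_pretrained(ckpt)
```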
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
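The map above pins the chat-turn terminator and the padding token; note that `pad_token` (`<|endoftext|>`) is deliberately distinct from `eos_token` (`<|im_end|>`), so padded batches are not confused with end-of-turn. A quick sanity check, reusing `ckpt` from the previous sketch:

```python
# Sketch: verify the special-token setup recorded in special_tokens_map.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(ckpt)  # ckpt as defined in the previous snippet
assert tok.eos_token == "<|im_end|>"       # chat-turn terminator
assert tok.pad_token == "<|endoftext|>"    # padding token, distinct from EOS
print(tok.convert_tokens_to_ids(["<|im_start|>", "<|im_end|>"]))  # [151644, 151645]
```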
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|endoftext|>",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
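The `chat_template` above is the standard Qwen2.5 Jinja template: it injects the default system prompt when none is supplied, wraps each turn in `<|im_start|>…<|im_end|>`, and serializes tool calls into `<tool_call>` blocks. A sketch of rendering a prompt with it, reusing `tok` from the previous sketch:

```python
# Sketch: render a prompt with the chat_template from tokenizer_config.json.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# -> "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
#    "<|im_start|>user\nHello!<|im_end|>\n<|im_start|>assistant\n"
```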
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-1029-merged/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ base_model: /oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.2
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/adapter_config.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": [],
22
+ "peft_type": "LORA",
23
+ "r": 8,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "v_proj",
28
+ "down_proj",
29
+ "k_proj",
30
+ "up_proj",
31
+ "gate_proj",
32
+ "q_proj",
33
+ "o_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
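Unlike the merged checkpoint-1029 above, checkpoint-835 is an unmerged LoRA adapter (r=8, alpha=32, dropout 0.05 on all seven linear projections). A sketch of attaching and merging it — the base-model path is the one recorded in `base_model_name_or_path`, and the adapter path follows this commit's layout:

```python
# Sketch: attach the checkpoint-835 LoRA adapter to its base model, then merge.
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained(
    "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
    torch_dtype="bfloat16",
)
model = PeftModel.from_pretrained(
    base, "BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835"
)
model = model.merge_and_unload()  # fold the LoRA deltas into the base weights
```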
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/additional_config.json ADDED
@@ -0,0 +1 @@
1
+ {"lora_dtype": null, "lorap_lr_ratio": null, "lorap_emb_lr": 1e-06}
BIO/sft/qwen-production-08022302/v0-20250802-230250/checkpoint-835/args.json ADDED
@@ -0,0 +1,364 @@
1
+ {
2
+ "model": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
3
+ "model_type": "qwen2_5",
4
+ "model_revision": null,
5
+ "task_type": "causal_lm",
6
+ "torch_dtype": "bfloat16",
7
+ "attn_impl": null,
8
+ "num_labels": null,
9
+ "problem_type": null,
10
+ "rope_scaling": null,
11
+ "device_map": null,
12
+ "max_memory": {},
13
+ "local_repo_path": null,
14
+ "template": "qwen2_5",
15
+ "system": null,
16
+ "max_length": 8192,
17
+ "truncation_strategy": "delete",
18
+ "max_pixels": null,
19
+ "tools_prompt": "react_en",
20
+ "norm_bbox": null,
21
+ "response_prefix": null,
22
+ "padding_side": "right",
23
+ "loss_scale": "default",
24
+ "sequence_parallel_size": 1,
25
+ "use_chat_template": true,
26
+ "template_backend": "swift",
27
+ "dataset": [
28
+ "/nas/shared/kilab/wangyujia/material_production_train.jsonl"
29
+ ],
30
+ "val_dataset": [],
31
+ "split_dataset_ratio": 0.01,
32
+ "data_seed": 42,
33
+ "dataset_num_proc": 128,
34
+ "dataset_shuffle": true,
35
+ "val_dataset_shuffle": false,
36
+ "streaming": false,
37
+ "interleave_prob": null,
38
+ "stopping_strategy": "first_exhausted",
39
+ "shuffle_buffer_size": 1000,
40
+ "enable_cache": false,
41
+ "download_mode": "reuse_dataset_if_exists",
42
+ "columns": {},
43
+ "strict": false,
44
+ "remove_unused_columns": true,
45
+ "model_name": [
46
+ "qwen_bio_sft_deeplocbinary-08022035"
47
+ ],
48
+ "model_author": [
49
+ "swift"
50
+ ],
51
+ "custom_dataset_info": [],
52
+ "quant_method": null,
53
+ "quant_bits": null,
54
+ "hqq_axis": null,
55
+ "bnb_4bit_compute_dtype": "bfloat16",
56
+ "bnb_4bit_quant_type": "nf4",
57
+ "bnb_4bit_use_double_quant": true,
58
+ "bnb_4bit_quant_storage": null,
59
+ "max_new_tokens": 64,
60
+ "temperature": 0.0,
61
+ "top_k": null,
62
+ "top_p": null,
63
+ "repetition_penalty": null,
64
+ "num_beams": 1,
65
+ "stream": false,
66
+ "stop_words": [],
67
+ "logprobs": false,
68
+ "top_logprobs": null,
69
+ "ckpt_dir": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
70
+ "load_dataset_config": null,
71
+ "lora_modules": [],
72
+ "tuner_backend": "peft",
73
+ "train_type": "lora",
74
+ "adapters": [],
75
+ "external_plugins": [],
76
+ "seed": 42,
77
+ "model_kwargs": {},
78
+ "load_args": false,
79
+ "load_data_args": false,
80
+ "use_hf": false,
81
+ "hub_token": null,
82
+ "custom_register_path": [],
83
+ "ignore_args_error": false,
84
+ "use_swift_lora": false,
85
+ "output_dir": "/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250",
86
+ "overwrite_output_dir": false,
87
+ "do_train": false,
88
+ "do_eval": false,
89
+ "do_predict": false,
90
+ "eval_strategy": "steps",
91
+ "prediction_loss_only": false,
92
+ "per_device_train_batch_size": 2,
93
+ "per_device_eval_batch_size": 2,
94
+ "per_gpu_train_batch_size": null,
95
+ "per_gpu_eval_batch_size": null,
96
+ "gradient_accumulation_steps": 4,
97
+ "eval_accumulation_steps": null,
98
+ "eval_delay": 0,
99
+ "torch_empty_cache_steps": null,
100
+ "learning_rate": 1e-05,
101
+ "weight_decay": 0.1,
102
+ "adam_beta1": 0.9,
103
+ "adam_beta2": 0.95,
104
+ "adam_epsilon": 1e-08,
105
+ "max_grad_norm": 1.0,
106
+ "num_train_epochs": 3.0,
107
+ "max_steps": -1,
108
+ "lr_scheduler_type": "cosine",
109
+ "lr_scheduler_kwargs": null,
110
+ "warmup_ratio": 0.05,
111
+ "warmup_steps": 0,
112
+ "log_level": "passive",
113
+ "log_level_replica": "warning",
114
+ "log_on_each_node": true,
115
+ "logging_dir": "/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250/runs",
116
+ "logging_strategy": "steps",
117
+ "logging_first_step": true,
118
+ "logging_steps": 1,
119
+ "logging_nan_inf_filter": true,
120
+ "save_strategy": "steps",
121
+ "save_steps": 5.0,
122
+ "save_total_limit": 5,
123
+ "save_safetensors": true,
124
+ "save_on_each_node": false,
125
+ "save_only_model": true,
126
+ "restore_callback_states_from_checkpoint": false,
127
+ "no_cuda": false,
128
+ "use_cpu": false,
129
+ "use_mps_device": false,
130
+ "jit_mode_eval": false,
131
+ "use_ipex": false,
132
+ "bf16": true,
133
+ "fp16": false,
134
+ "fp16_opt_level": "O1",
135
+ "half_precision_backend": "auto",
136
+ "bf16_full_eval": false,
137
+ "fp16_full_eval": false,
138
+ "tf32": null,
139
+ "local_rank": 0,
140
+ "ddp_backend": null,
141
+ "tpu_num_cores": null,
142
+ "tpu_metrics_debug": false,
143
+ "debug": null,
144
+ "dataloader_drop_last": false,
145
+ "eval_steps": 5.0,
146
+ "dataloader_num_workers": 1,
147
+ "dataloader_prefetch_factor": null,
148
+ "past_index": -1,
149
+ "run_name": "construct",
150
+ "disable_tqdm": null,
151
+ "label_names": null,
152
+ "load_best_model_at_end": false,
153
+ "metric_for_best_model": "loss",
154
+ "greater_is_better": false,
155
+ "ignore_data_skip": false,
156
+ "fsdp": "",
157
+ "fsdp_min_num_params": 0,
158
+ "fsdp_config": null,
159
+ "tp_size": 0,
160
+ "fsdp_transformer_layer_cls_to_wrap": null,
161
+ "accelerator_config": {
162
+ "dispatch_batches": false
163
+ },
164
+ "deepspeed": {
165
+ "fp16": {
166
+ "enabled": "auto",
167
+ "loss_scale": 0,
168
+ "loss_scale_window": 1000,
169
+ "initial_scale_power": 16,
170
+ "hysteresis": 2,
171
+ "min_loss_scale": 1
172
+ },
173
+ "bf16": {
174
+ "enabled": "auto"
175
+ },
176
+ "zero_optimization": {
177
+ "stage": 3,
178
+ "offload_optimizer": {
179
+ "device": "none",
180
+ "pin_memory": true
181
+ },
182
+ "offload_param": {
183
+ "device": "none",
184
+ "pin_memory": true
185
+ },
186
+ "overlap_comm": false,
187
+ "contiguous_gradients": true,
188
+ "sub_group_size": 1000000000.0,
189
+ "reduce_bucket_size": "auto",
190
+ "zero_quantized_weights": false,
191
+ "zero_quantized_gradients": false,
192
+ "stage3_prefetch_bucket_size": "auto",
193
+ "stage3_param_persistence_threshold": "auto",
194
+ "stage3_max_live_parameters": 1000000000.0,
195
+ "stage3_max_reuse_distance": 1000000000.0,
196
+ "stage3_gather_16bit_weights_on_model_save": true
197
+ },
198
+ "gradient_accumulation_steps": "auto",
199
+ "gradient_clipping": "auto",
200
+ "steps_per_print": 2000,
201
+ "train_batch_size": "auto",
202
+ "train_micro_batch_size_per_gpu": "auto",
203
+ "wall_clock_breakdown": false
204
+ },
205
+ "label_smoothing_factor": 0.0,
206
+ "optim": "adamw_torch",
207
+ "optim_args": null,
208
+ "adafactor": false,
209
+ "group_by_length": false,
210
+ "length_column_name": "length",
211
+ "report_to": [
212
+ "tensorboard"
213
+ ],
214
+ "ddp_find_unused_parameters": null,
215
+ "ddp_bucket_cap_mb": null,
216
+ "ddp_broadcast_buffers": null,
217
+ "dataloader_pin_memory": true,
218
+ "dataloader_persistent_workers": false,
219
+ "skip_memory_metrics": true,
220
+ "use_legacy_prediction_loop": false,
221
+ "push_to_hub": false,
222
+ "resume_from_checkpoint": null,
223
+ "hub_model_id": null,
224
+ "hub_strategy": "every_save",
225
+ "hub_private_repo": null,
226
+ "hub_always_push": false,
227
+ "gradient_checkpointing": true,
228
+ "gradient_checkpointing_kwargs": null,
229
+ "include_inputs_for_metrics": false,
230
+ "include_for_metrics": [],
231
+ "eval_do_concat_batches": true,
232
+ "fp16_backend": "auto",
233
+ "push_to_hub_model_id": null,
234
+ "push_to_hub_organization": null,
235
+ "push_to_hub_token": null,
236
+ "mp_parameters": "",
237
+ "auto_find_batch_size": false,
238
+ "full_determinism": false,
239
+ "torchdynamo": null,
240
+ "ray_scope": "last",
241
+ "ddp_timeout": 1800,
242
+ "torch_compile": false,
243
+ "torch_compile_backend": null,
244
+ "torch_compile_mode": null,
245
+ "include_tokens_per_second": false,
246
+ "include_num_input_tokens_seen": false,
247
+ "neftune_noise_alpha": null,
248
+ "optim_target_modules": null,
249
+ "batch_eval_metrics": false,
250
+ "eval_on_start": false,
251
+ "use_liger_kernel": false,
252
+ "eval_use_gather_object": false,
253
+ "average_tokens_across_devices": false,
254
+ "sortish_sampler": false,
255
+ "predict_with_generate": false,
256
+ "generation_max_length": null,
257
+ "generation_num_beams": null,
258
+ "generation_config": null,
259
+ "check_model": true,
260
+ "acc_strategy": "token",
261
+ "train_dataloader_shuffle": true,
262
+ "metric_warmup_step": 0,
263
+ "fsdp_num": 1,
264
+ "acc_steps": 1,
265
+ "eval_use_evalscope": false,
266
+ "eval_datasets": [],
267
+ "eval_limit": null,
268
+ "eval_datasets_args": null,
269
+ "eval_generation_config": null,
270
+ "freeze_parameters": [],
271
+ "freeze_parameters_ratio": 0.0,
272
+ "trainable_parameters": [],
273
+ "freeze_llm": false,
274
+ "freeze_vit": true,
275
+ "freeze_aligner": true,
276
+ "target_modules": [
277
+ "all-linear"
278
+ ],
279
+ "target_regex": null,
280
+ "modules_to_save": [],
281
+ "lora_rank": 8,
282
+ "lora_alpha": 32,
283
+ "lora_dropout": 0.05,
284
+ "lora_bias": "none",
285
+ "lora_dtype": null,
286
+ "lorap_lr_ratio": null,
287
+ "use_rslora": false,
288
+ "use_dora": false,
289
+ "lora_ga_batch_size": 2,
290
+ "lora_ga_iters": 2,
291
+ "lora_ga_max_length": 1024,
292
+ "lora_ga_direction": "ArB2r",
293
+ "lora_ga_scale": "stable",
294
+ "lora_ga_stable_gamma": 16,
295
+ "init_weights": true,
296
+ "fourier_n_frequency": 2000,
297
+ "fourier_scaling": 300.0,
298
+ "boft_block_size": 4,
299
+ "boft_block_num": 0,
300
+ "boft_n_butterfly_factor": 1,
301
+ "boft_dropout": 0.0,
302
+ "vera_rank": 256,
303
+ "vera_projection_prng_key": 0,
304
+ "vera_dropout": 0.0,
305
+ "vera_d_initial": 0.1,
306
+ "adapter_act": "gelu",
307
+ "adapter_length": 128,
308
+ "use_galore": false,
309
+ "galore_target_modules": null,
310
+ "galore_rank": 128,
311
+ "galore_update_proj_gap": 50,
312
+ "galore_scale": 1.0,
313
+ "galore_proj_type": "std",
314
+ "galore_optim_per_parameter": false,
315
+ "galore_with_embedding": false,
316
+ "galore_quantization": false,
317
+ "galore_proj_quant": false,
318
+ "galore_proj_bits": 4,
319
+ "galore_proj_group_size": 256,
320
+ "galore_cos_threshold": 0.4,
321
+ "galore_gamma_proj": 2,
322
+ "galore_queue_size": 5,
323
+ "adalora_target_r": 8,
324
+ "adalora_init_r": 12,
325
+ "adalora_tinit": 0,
326
+ "adalora_tfinal": 0,
327
+ "adalora_deltaT": 1,
328
+ "adalora_beta1": 0.85,
329
+ "adalora_beta2": 0.85,
330
+ "adalora_orth_reg_weight": 0.5,
331
+ "llamapro_num_new_blocks": 4,
332
+ "llamapro_num_groups": null,
333
+ "lisa_activated_layers": 0,
334
+ "lisa_step_interval": 20,
335
+ "reft_layer_key": null,
336
+ "reft_layers": null,
337
+ "reft_rank": 4,
338
+ "reft_intervention_type": "LoreftIntervention",
339
+ "reft_args": null,
340
+ "swanlab_token": null,
341
+ "swanlab_project": null,
342
+ "swanlab_workspace": null,
343
+ "swanlab_exp_name": null,
344
+ "swanlab_mode": "cloud",
345
+ "add_version": true,
346
+ "resume_only_model": false,
347
+ "create_checkpoint_symlink": false,
348
+ "packing": false,
349
+ "lazy_tokenize": false,
350
+ "loss_type": null,
351
+ "optimizer": null,
352
+ "metric": null,
353
+ "zero_hpz_partition_size": null,
354
+ "rank": 0,
355
+ "global_world_size": 8,
356
+ "local_world_size": 8,
357
+ "model_suffix": "checkpoint-50-merged",
358
+ "model_info": "ModelInfo(model_type='qwen2_5', model_dir='/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, config=None, task_type='causal_lm', num_labels=None)",
359
+ "model_meta": "ModelMeta(model_type='qwen2_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct', hf_model_id='Qwen/Qwen2.5-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct', hf_model_id='Qwen/Qwen2.5-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct', hf_model_id='Qwen/Qwen2.5-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct', hf_model_id='Qwen/Qwen2.5-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct', hf_model_id='Qwen/Qwen2.5-72B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B', hf_model_id='Qwen/Qwen2.5-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B', hf_model_id='Qwen/Qwen2.5-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B', hf_model_id='Qwen/Qwen2.5-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B', hf_model_id='Qwen/Qwen2.5-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B', hf_model_id='Qwen/Qwen2.5-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B', hf_model_id='Qwen/Qwen2.5-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B', hf_model_id='Qwen/Qwen2.5-72B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, 
hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-72B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B', hf_model_id='Qwen/Qwen2.5-Coder-0.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B', hf_model_id='Qwen/Qwen2.5-Coder-1.5B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B', hf_model_id='Qwen/Qwen2.5-Coder-3B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B', hf_model_id='Qwen/Qwen2.5-Coder-7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B', hf_model_id='Qwen/Qwen2.5-Coder-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B', hf_model_id='Qwen/Qwen2.5-Coder-32B', model_path=None, ms_revision=None, hf_revision=None), 
Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', hf_model_id='Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=['coding'])], template='qwen2_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x7f22ea5f5e10>, model_arch='llama', architectures=['Qwen2ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, task_type=None, ignore_patterns=[], requires=['transformers>=4.37'], tags=[])",
360
+ "model_dir": "/oss/wangyujia/BIO/construction_finetuning/alpaca/v1-20250609-141541/checkpoint-50-merged",
361
+ "hub": "<class 'swift.hub.hub.MSHub'>",
362
+ "evaluation_strategy": "steps",
363
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=2, per_device_eval_batch_size=2, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=5, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=True, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=5, dataloader_num_workers=1, dataloader_prefetch_factor=10, past_index=-1, run_name='/nas/shared/kilab/wangyujia/BIO/sft/qwen-production-08022302/v0-20250802-230250', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, tp_size=0, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'zero_quantized_weights': False, 'zero_quantized_gradients': False, 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['tensorboard'], ddp_find_unused_parameters=None, 
ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, include_tokens_per_second=None, include_num_input_tokens_seen=None, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=None, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, check_model=True, acc_strategy='token', train_dataloader_shuffle=True, metric_warmup_step=0, fsdp_num=1, acc_steps=1, eval_use_evalscope=False, eval_datasets=[], eval_limit=None, eval_datasets_args=None, eval_generation_config=None, train_type='lora', optimizer=None, local_repo_path=None, galore_config=None)"
364
+ }
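`args.json` is the full ms-swift training record, so the effective setup can be read straight out of it: LoRA (r=8, alpha=32) on `all-linear`, lr 1e-5 with cosine decay and 5% warmup, 3 epochs, bf16 under DeepSpeed ZeRO-3 across 8 GPUs. A sketch using only the standard library:

```python
# Sketch: recover the effective training setup from args.json.
import json

with open("checkpoint-835/args.json") as f:  # path relative to the checkpoint dir
    args = json.load(f)

effective_batch = (
    args["per_device_train_batch_size"]      # 2
    * args["gradient_accumulation_steps"]    # 4
    * args["global_world_size"]              # 8 GPUs
)  # -> 64 sequences per optimizer step
print(args["train_type"], args["lora_rank"], args["lora_alpha"], args["learning_rate"])
# -> lora 8 32 1e-05
```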
EasyR1-new/.gitignore ADDED
@@ -0,0 +1,181 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # PyPI configuration file
171
+ .pypirc
172
+
173
+ # outputs
174
+ outputs/
175
+ checkpoints/
176
+ wandb/
177
+ tensorboard_log/
178
+
179
+ # data
180
+ images/
181
+ images*
EasyR1-new/.pre-commit-config.yaml ADDED
@@ -0,0 +1,22 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v5.0.0
4
+ hooks:
5
+ - id: check-ast
6
+ - id: check-added-large-files
7
+ args: ['--maxkb=25000']
8
+ - id: check-merge-conflict
9
+ - id: check-yaml
10
+ - id: debug-statements
11
+ - id: end-of-file-fixer
12
+ - id: requirements-txt-fixer
13
+ - id: trailing-whitespace
14
+ args: [--markdown-linebreak-ext=md]
15
+ - id: no-commit-to-branch
16
+ args: ['--branch', 'main']
17
+
18
+ - repo: https://github.com/asottile/pyupgrade
19
+ rev: v3.17.0
20
+ hooks:
21
+ - id: pyupgrade
22
+ args: [--py38-plus]
EasyR1-new/Dockerfile ADDED
@@ -0,0 +1,65 @@
1
+ # Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
2
+ # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
3
+ FROM nvcr.io/nvidia/pytorch:24.08-py3
4
+
5
+ # Define environments
6
+ ENV MAX_JOBS=32
7
+ ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
8
+ ENV DEBIAN_FRONTEND=noninteractive
9
+ ENV NODE_OPTIONS=""
10
+ ENV PIP_ROOT_USER_ACTION=ignore
11
+ ENV HF_HUB_ENABLE_HF_TRANSFER="1"
12
+
13
+ # Define installation arguments
14
+ ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
15
+ ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
16
+
17
+ # Set apt source
18
+ RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
19
+ { \
20
+ echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
21
+ echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
22
+ echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
23
+ echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
24
+ } > /etc/apt/sources.list
25
+
26
+ # Install systemctl
27
+ RUN apt-get update && \
28
+ apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
29
+ apt-get clean
30
+
31
+ # Install tini
32
+ RUN apt-get update && \
33
+ apt-get install -y tini && \
34
+ apt-get clean
35
+
36
+ # Change pip source
37
+ RUN pip config set global.index-url "${PIP_INDEX}" && \
38
+ pip config set global.extra-index-url "${PIP_INDEX}" && \
39
+ python -m pip install --upgrade pip
40
+
41
+ # Uninstall nv-pytorch fork
42
+ RUN pip uninstall -y torch torchvision torchaudio \
43
+ pytorch-quantization pytorch-triton torch-tensorrt \
44
+ transformer-engine flash-attn apex megatron-core \
45
+ xgboost opencv grpcio
46
+
47
+ # Fix cv2
48
+ RUN rm -rf /usr/local/lib/python3.10/dist-packages/cv2
49
+
50
+ # Install torch-2.7.0+cu126 + vllm-0.9.1
51
+ RUN pip install --no-cache-dir "vllm==0.9.1" "torch==2.7.0" "torchvision==0.22.0" "torchaudio==2.7.0" tensordict torchdata \
52
+ "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
53
+ "numpy<2.0.0" "pyarrow>=15.0.0" "grpcio>=1.62.1" "optree>=0.13.0" pandas \
54
+ ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb liger-kernel mathruler \
55
+ pytest yapf py-spy pyext pre-commit ruff
56
+
57
+ # Install flash-attn-2.8.0.post2
58
+ RUN ABI_FLAG=$(python -c "import torch; print('TRUE' if torch._C._GLIBCXX_USE_CXX11_ABI else 'FALSE')") && \
59
+ URL="https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.0.post2/flash_attn-2.8.0.post2+cu12torch2.7cxx11abi${ABI_FLAG}-cp310-cp310-linux_x86_64.whl" && \
60
+ wget -nv -P /opt/tiger "${URL}" && \
61
+ pip install --no-cache-dir "/opt/tiger/$(basename ${URL})"
62
+
63
+ # Reset pip config
64
+ RUN pip config unset global.index-url && \
65
+ pip config unset global.extra-index-url
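
A minimal build sketch for the Dockerfile above; the image tag is arbitrary and the build-arg values are illustrative alternatives to the default Tsinghua mirrors:

```bash
docker build -t easyr1:latest .

# Or override the apt/pip mirrors defined as ARGs in the Dockerfile
docker build \
  --build-arg APT_SOURCE=http://archive.ubuntu.com/ubuntu/ \
  --build-arg PIP_INDEX=https://pypi.org/simple \
  -t easyr1:latest .
```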
EasyR1-new/Dockerfile.legacy ADDED
@@ -0,0 +1,72 @@
+ # Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
+ # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
+ FROM nvcr.io/nvidia/pytorch:24.08-py3
+
+ # Define environments
+ ENV MAX_JOBS=32
+ ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV NODE_OPTIONS=""
+ ENV PIP_ROOT_USER_ACTION=ignore
+ ENV HF_HUB_ENABLE_HF_TRANSFER="1"
+
+ # Define installation arguments
+ ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
+ ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+ # Set apt source
+ RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
+     { \
+       echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
+       echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
+       echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
+       echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
+     } > /etc/apt/sources.list
+
+ # Install systemctl
+ RUN apt-get update && \
+     apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
+     apt-get clean
+
+ # Install tini
+ RUN apt-get update && \
+     apt-get install -y tini && \
+     apt-get clean
+
+ # Change pip source
+ RUN pip config set global.index-url "${PIP_INDEX}" && \
+     pip config set global.extra-index-url "${PIP_INDEX}" && \
+     python -m pip install --upgrade pip
+
+ # Uninstall nv-pytorch fork
+ RUN pip uninstall -y torch torchvision torchaudio \
+     pytorch-quantization pytorch-triton torch-tensorrt \
+     transformer_engine flash_attn apex megatron-core \
+     xgboost opencv grpcio
+
+ # Fix cv2
+ RUN rm -rf /usr/local/lib/python3.10/dist-packages/cv2
+
+ # Install torch-2.6.0+cu124 + vllm-0.8.4
+ # torch-2.6.0+cu124: cxx11abi=False
+ # torch-2.6.0+cu126: cxx11abi=True
+ # see https://github.com/flashinfer-ai/flashinfer/issues/911
+ RUN pip install --no-cache-dir "vllm==0.8.4" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" tensordict torchdata \
+     "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
+     "numpy<2.0.0" "pyarrow>=15.0.0" "grpcio>=1.62.1" "optree>=0.13.0" pandas \
+     ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb liger-kernel mathruler \
+     pytest yapf py-spy pyext pre-commit ruff
+
+ # Install flash-attn-2.7.4.post1 (cxx11abi=False)
+ RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
+     pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+
+ # Install flashinfer-0.2.2.post1+cu124 (cxx11abi=False)
+ # vllm-0.8.3 does not support flashinfer>=0.2.3
+ # see https://github.com/vllm-project/vllm/pull/15777
+ RUN wget -nv https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
+     pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
+
+ # Reset pip config
+ RUN pip config unset global.index-url && \
+     pip config unset global.extra-index-url
EasyR1-new/LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
EasyR1-new/Makefile ADDED
@@ -0,0 +1,24 @@
+ .PHONY: build commit license quality style test
+
+ check_dirs := examples scripts tests verl setup.py
+
+ build:
+ 	python3 setup.py sdist bdist_wheel
+
+ commit:
+ 	pre-commit install
+ 	pre-commit run --all-files
+
+ license:
+ 	python3 tests/check_license.py $(check_dirs)
+
+ quality:
+ 	ruff check $(check_dirs)
+ 	ruff format --check $(check_dirs)
+
+ style:
+ 	ruff check $(check_dirs) --fix
+ 	ruff format $(check_dirs)
+
+ test:
+ 	pytest -vv tests/
EasyR1-new/README.md ADDED
@@ -0,0 +1,223 @@
+ # EasyR1: An Efficient, Scalable, Multi-Modality RL Training Framework
+
+ [![GitHub Repo stars](https://img.shields.io/github/stars/hiyouga/EasyR1)](https://github.com/hiyouga/EasyR1/stargazers)
+ [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
+
+ ### Used by [Amazon Web Services](https://aws.amazon.com/cn/blogs/china/building-llm-model-hub-based-on-llamafactory-and-easyr1/)
+
+ This project is a clean fork of the original [veRL](https://github.com/volcengine/verl) project to support vision language models; we thank all the authors for providing such a high-performance RL training framework.
+
+ EasyR1 is efficient and scalable due to the design of **[HybridEngine](https://arxiv.org/abs/2409.19256)** and the latest release of **[vLLM](https://github.com/vllm-project/vllm)**'s SPMD mode.
+
+ ## Features
+
+ - Supported models
+   - Llama3/Qwen2/Qwen2.5/Qwen3 language models
+   - Qwen2/Qwen2.5-VL vision language models
+   - DeepSeek-R1 distill models
+
+ - Supported algorithms
+   - GRPO
+   - DAPO
+   - Reinforce++
+   - ReMax
+   - RLOO
+
+ - Supported datasets
+   - Any text or vision-text dataset in a [specific format](#custom-dataset)
+
+ - Supported tricks
+   - Padding-free training
+   - Resuming from checkpoint
+   - Wandb & SwanLab & MLflow & TensorBoard tracking
+
+ ## Requirements
+
+ ### Software Requirements
+
+ - Python 3.9+
+ - transformers>=4.51.0
+ - flash-attn>=2.4.3
+ - vllm>=0.8.3
+
+ We provide a [Dockerfile](./Dockerfile) to easily build environments.
+
+ We recommend using the [pre-built docker image](https://hub.docker.com/r/hiyouga/verl) in EasyR1.
+
+ ```bash
+ docker pull hiyouga/verl:ngc-th2.7.0-cu12.6-vllm0.9.1
+ ```
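+
+ A minimal, untested sketch of launching a container from this image (the GPU, IPC, and mount flags below are common choices, not EasyR1-specific requirements):
+
+ ```bash
+ docker run -it --gpus all --ipc=host \
+   -v "$PWD":/workspace/EasyR1 \
+   hiyouga/verl:ngc-th2.7.0-cu12.6-vllm0.9.1
+ ```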
+
+ ### Hardware Requirements
+
+ \* *estimated*
+
+ | Method                | Bits | 1.5B   | 3B     | 7B     | 32B     | 72B     |
+ | --------------------- | ---- | ------ | ------ | ------ | ------- | ------- |
+ | GRPO Full Fine-Tuning | AMP  | 2*24GB | 4*40GB | 8*40GB | 16*80GB | 32*80GB |
+ | GRPO Full Fine-Tuning | BF16 | 1*24GB | 1*40GB | 4*40GB | 8*80GB  | 16*80GB |
+
+ > [!NOTE]
+ > Use `worker.actor.fsdp.torch_dtype=bf16` and `worker.actor.optim.strategy=adamw_bf16` to enable bf16 training.
+ >
+ > We are working hard to reduce the VRAM usage in RL training; LoRA support will be integrated in the next updates.
+
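+ A minimal sketch of the bf16 overrides as they might appear on a training command; the entry point and config path here are illustrative, and only the two override keys come from the note above:
+
+ ```bash
+ # Illustrative entry point and config path; adjust to your setup
+ python3 -m verl.trainer.main \
+     config=examples/config.yaml \
+     worker.actor.fsdp.torch_dtype=bf16 \
+     worker.actor.optim.strategy=adamw_bf16
+ ```
+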
+ ## Tutorial: Run Qwen2.5-VL GRPO on the [Geometry3K](https://huggingface.co/datasets/hiyouga/geometry3k) Dataset in Just 3 Steps
+
+ ![image](assets/qwen2_5_vl_7b_geo.png)
+
+ ### Installation
+
+ ```bash
+ git clone https://github.com/hiyouga/EasyR1.git
+ cd EasyR1
+ pip install -e .
+ ```
+
+ ### GRPO Training
+
+ ```bash
+ bash examples/qwen2_5_vl_7b_geo3k_grpo.sh
+ ```
+
+ ### Merge Checkpoint in Hugging Face Format
+
+ ```bash
+ python3 scripts/model_merger.py --local_dir checkpoints/easy_r1/exp_name/global_step_1/actor
+ ```
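+
+ After merging, the result can be loaded like any Hugging Face checkpoint. A minimal, untested sketch for a language-model checkpoint (the path is a placeholder for wherever `model_merger.py` wrote the merged weights; vision-language models need their corresponding model classes instead):
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ path = "path/to/merged_checkpoint"  # placeholder: the merged output directory
+ tokenizer = AutoTokenizer.from_pretrained(path)
+ model = AutoModelForCausalLM.from_pretrained(path, torch_dtype="auto", device_map="auto")
+
+ inputs = tokenizer("1 + 1 =", return_tensors="pt").to(model.device)
+ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0]))
+ ```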
+
+ > [!TIP]
+ > If you encounter issues with connecting to Hugging Face, consider using `export HF_ENDPOINT=https://hf-mirror.com`.
+ >
+ > If you want to use the SwanLab logger, consider using `bash examples/qwen2_5_vl_7b_geo3k_swanlab.sh`.
+
+ ## Custom Dataset
+
+ Please refer to the example datasets below to prepare your own dataset (see the loading sketch after this list).
+
+ - Text dataset: https://huggingface.co/datasets/hiyouga/math12k
+ - Image-text dataset: https://huggingface.co/datasets/hiyouga/geometry3k
+ - Multi-image-text dataset: https://huggingface.co/datasets/hiyouga/journeybench-multi-image-vqa
+ - Text-image mixed dataset: https://huggingface.co/datasets/hiyouga/rl-mixed-dataset
+
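+ A quick way to see the expected fields is to load one of the reference datasets and inspect a sample (a minimal sketch using the `datasets` library):
+
+ ```python
+ from datasets import load_dataset
+
+ # Inspect the image-text reference dataset listed above
+ dataset = load_dataset("hiyouga/geometry3k", split="train")
+ print(dataset.features)  # column names and types
+ print(dataset[0])        # one example record
+ ```
+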
+ ## How to Understand GRPO in EasyR1
+
+ ![image](assets/easyr1_grpo.png)
+
+ - To learn about the GRPO algorithm, you can refer to [Hugging Face's blog](https://huggingface.co/docs/trl/v0.16.1/en/grpo_trainer).
+
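+ As a rough schematic (not EasyR1's exact implementation), GRPO replaces the learned value model with group-normalized rewards: several responses are sampled per prompt, and each response's standardized reward within its group serves as the advantage for all of its tokens:
+
+ ```python
+ import torch
+
+ def grpo_advantages(rewards: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
+     """rewards: (num_prompts, group_size) scalar reward per sampled response."""
+     mean = rewards.mean(dim=-1, keepdim=True)
+     std = rewards.std(dim=-1, keepdim=True)
+     return (rewards - mean) / (std + eps)
+
+ # e.g. 4 rollouts for one prompt, two of them rewarded
+ print(grpo_advantages(torch.tensor([[0.0, 1.0, 1.0, 0.0]])))
+ ```
+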
+ ## How to Run 70B+ Model in Multi-node Environment
+
+ 1. Start the Ray head node.
+
+    ```bash
+    ray start --head --port=6379 --dashboard-host=0.0.0.0
+    ```
+
+ 2. Start the Ray worker node and connect to the head node.
+
+    ```bash
+    ray start --address=<head_node_ip>:6379
+    ```
+
+ 3. Check the Ray resource pool.
+
+    ```bash
+    ray status
+    ```
+
+ 4. Run the training script on the Ray head node only.
+
+    ```bash
+    bash examples/qwen2_5_vl_7b_geo3k_grpo.sh
+    ```
+
+ See **[veRL's official doc](https://verl.readthedocs.io/en/latest/start/multinode.html)** for more details about multi-node training and the Ray debugger.
+
+ ## Other Baselines
+
+ We also reproduced the following two baselines of the [R1-V](https://github.com/deep-agent/R1-V) project.
+ - [CLEVR-70k-Counting](examples/baselines/qwen2_5_vl_3b_clevr.sh): Train the Qwen2.5-VL-3B-Instruct model on the counting problem.
+ - [GeoQA-8k](examples/baselines/qwen2_5_vl_3b_geoqa8k.sh): Train the Qwen2.5-VL-3B-Instruct model on the GeoQA problem.
+
+ ## Performance Baselines
+
+ See [baselines.md](assets/baselines.md).
+
+ ## Awesome Work using EasyR1
+
+ - **MMR1**: Advancing the Frontiers of Multimodal Reasoning. [![[code]](https://img.shields.io/github/stars/LengSicong/MMR1)](https://github.com/LengSicong/MMR1)
+ - **Vision-R1**: Incentivizing Reasoning Capability in Multimodal Large Language Models. [![[code]](https://img.shields.io/github/stars/Osilly/Vision-R1)](https://github.com/Osilly/Vision-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.06749-blue)](https://arxiv.org/abs/2503.06749)
+ - **Seg-Zero**: Reasoning-Chain Guided Segmentation via Cognitive Reinforcement. [![[code]](https://img.shields.io/github/stars/dvlab-research/Seg-Zero)](https://github.com/dvlab-research/Seg-Zero) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.06520-blue)](https://arxiv.org/abs/2503.06520)
+ - **MetaSpatial**: Reinforcing 3D Spatial Reasoning in VLMs for the Metaverse. [![[code]](https://img.shields.io/github/stars/PzySeere/MetaSpatial)](https://github.com/PzySeere/MetaSpatial) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.18470-blue)](https://arxiv.org/abs/2503.18470)
+ - **Temporal-R1**: Evolving Temporal Reasoning Capability into LMMs via Temporal Consistent Reward. [![[code]](https://img.shields.io/github/stars/appletea233/Temporal-R1)](https://github.com/appletea233/Temporal-R1)
+ - **NoisyRollout**: Reinforcing Visual Reasoning with Data Augmentation. [![[code]](https://img.shields.io/github/stars/John-AI-Lab/NoisyRollout)](https://github.com/John-AI-Lab/NoisyRollout) [![[arxiv]](https://img.shields.io/badge/arxiv-2504.13055-blue)](https://arxiv.org/pdf/2504.13055)
+ - **GUI-R1**: A Generalist R1-Style Vision-Language Action Model For GUI Agents. [![[code]](https://img.shields.io/github/stars/ritzz-ai/GUI-R1)](https://github.com/ritzz-ai/GUI-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2504.10458-blue)](https://arxiv.org/abs/2504.10458)
+ - **R1-Track**: Direct Application of MLLMs to Visual Object Tracking via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/Wangbiao2/R1-Track)](https://github.com/Wangbiao2/R1-Track)
+ - **VisionReasoner**: Unified Visual Perception and Reasoning via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/dvlab-research/VisionReasoner)](https://github.com/dvlab-research/VisionReasoner) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.12081-blue)](https://arxiv.org/abs/2505.12081)
+ - **MM-UPT**: Unsupervised Post-Training for Multi-Modal LLM Reasoning via GRPO. [![[code]](https://img.shields.io/github/stars/waltonfuture/MM-UPT)](https://github.com/waltonfuture/MM-UPT) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22453-blue)](https://arxiv.org/pdf/2505.22453)
+ - **RL-with-Cold-Start**: Advancing Multimodal Reasoning via Reinforcement Learning with Cold Start. [![[code]](https://img.shields.io/github/stars/waltonfuture/RL-with-Cold-Start)](https://github.com/waltonfuture/RL-with-Cold-Start) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22334-blue)](https://arxiv.org/pdf/2505.22334)
+ - **ViGoRL**: Grounded Reinforcement Learning for Visual Reasoning. [![[code]](https://img.shields.io/github/stars/Gabesarch/grounded-rl)](https://github.com/Gabesarch/grounded-rl) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.23678-blue)](https://arxiv.org/abs/2505.23678)
+ - **Revisual-R1**: Advancing Multimodal Reasoning: From Optimized Cold Start to Staged Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/CSfufu/Revisual-R1)](https://github.com/CSfufu/Revisual-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2506.04207-blue)](https://arxiv.org/abs/2506.04207)
+ - **SophiaVL-R1**: Reinforcing MLLMs Reasoning with Thinking Reward. [![[code]](https://img.shields.io/github/stars/kxfan2002/SophiaVL-R1)](https://github.com/kxfan2002/SophiaVL-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.17018-blue)](https://arxiv.org/abs/2505.17018)
+ - **Vision-Matters**: Simple Visual Perturbations Can Boost Multimodal Math Reasoning. [![[code]](https://img.shields.io/github/stars/YutingLi0606/Vision-Matters)](https://github.com/YutingLi0606/Vision-Matters) [![[arxiv]](https://img.shields.io/badge/arxiv-2506.09736-blue)](https://arxiv.org/abs/2506.09736)
+ - **VTool-R1**: VLMs Learn to Think with Images via Reinforcement Learning on Multimodal Tool Use. [![[code]](https://img.shields.io/github/stars/VTOOL-R1/vtool-r1)](https://github.com/VTOOL-R1/vtool-r1) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.19255-blue)](https://arxiv.org/abs/2505.19255)
+
+ ## TODO
+
+ - Support LoRA (high priority).
+ - Support ulysses parallelism for VLMs (medium priority).
+ - Support more VLM architectures.
+
+ > [!NOTE]
+ > We will not provide scripts for supervised fine-tuning and inference in this project. If you have such requirements, we recommend using [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory).
+
+ ### Known bugs
+
+ These features are temporarily disabled; we plan to fix them one by one in future updates.
+
+ - Vision language models are not compatible with ulysses parallelism yet.
+
+ ## Discussion Group
+
+ 👋 Join our [WeChat group](assets/wechat.jpg).
+
+ ## FAQs
+
+ > ValueError: Image features and image tokens do not match: tokens: 8192, features 9800
+
+ Increase the `data.max_prompt_length` or reduce the `data.max_pixels`.
+
+ > RuntimeError: CUDA Error: out of memory at /workspace/csrc/cumem_allocator.cpp:62
+
+ Reduce the `worker.rollout.gpu_memory_utilization` and enable `worker.actor.offload.offload_params`.
+
+ > RuntimeError: 0 active drivers ([]). There should only be one.
+
+ Uninstall `deepspeed` from the current Python environment.
+
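+ For the out-of-memory case, a sketch of how those two overrides might be passed on the command line (the entry point, config path, and the 0.5 value are illustrative):
+
+ ```bash
+ python3 -m verl.trainer.main \
+     config=examples/config.yaml \
+     worker.rollout.gpu_memory_utilization=0.5 \
+     worker.actor.offload.offload_params=true
+ ```
+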
+ ## Citation
+
+ Core contributors: [Yaowei Zheng](https://github.com/hiyouga), [Junting Lu](https://github.com/AL-377), [Shenzhi Wang](https://github.com/Shenzhi-Wang), [Zhangchi Feng](https://github.com/BUAADreamer), [Dongdong Kuang](https://github.com/Kuangdd01) and Yuwen Xiong.
+
+ We also thank Guangming Sheng and Chi Zhang for helpful discussions.
+
+ ```bibtex
+ @misc{zheng2025easyr1,
+   title = {EasyR1: An Efficient, Scalable, Multi-Modality RL Training Framework},
+   author = {Yaowei Zheng and Junting Lu and Shenzhi Wang and Zhangchi Feng and Dongdong Kuang and Yuwen Xiong},
+   howpublished = {\url{https://github.com/hiyouga/EasyR1}},
+   year = {2025}
+ }
+ ```
+
+ We also recommend citing the original work.
+
+ ```bibtex
+ @article{sheng2024hybridflow,
+   title = {HybridFlow: A Flexible and Efficient RLHF Framework},
+   author = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
+   year = {2024},
+   journal = {arXiv preprint arXiv: 2409.19256}
+ }
+ ```
EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/media/table/val/generations_0_2feffd203d182aadef48.table.json ADDED
@@ -0,0 +1 @@
+ {"columns": ["step", "input_1", "output_1", "label_1", "score_1", "input_2", "output_2", "label_2", "score_2", "input_3", "output_3", "label_3", "score_3"], "data": [[0, "system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.\nuser\nGiven the protein amino acid sequence: PNAIVNRSIPEGVDVFEISGPLFFGAASKFKDAMHVVEKAPSIRILRMRKVMSIDATGLNMLKELFNDCRKSGTTLILSGVHTQPLFAMQQYGLADEIGEENIFGNIDDALDRARSLLGLPVQGRPAGF, does the protein have the property of solubility?\nOptions:\n0. Insoluble.\n1. Soluble. You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE output within <answer> </answer> tag.\nassistant\n", "<think>\nThe protein solubility can be predicted by the grand average of hydropathicity (GRAVY) scale. The GRAVY scale is based on the assumption that a protein with a large number of hydrophilic residues will have a negative value and a protein with a large number of hydrophobic residues will have a positive value. The GRAVY scale is calculated by summing the hydropathy values of all amino acid residues in a protein sequence and dividing by the number of residues in the sequence. The hydropathy value of an amino acid residue is a measure of its hydrophobicity, which is based on its chemical properties. The hydropathy value of an amino acid residue can be calculated using a hydrophobicity scale, which is a table that assigns a numerical value to each amino acid residue based on its hydrophobicity. The amino acid sequence of the protein is: PNAIVNRSIPEGVDVFEISGPLFFGAASKFKDAMHVVEKAPSIRILRMRKVMSIDATGLNMLKELFNDCRKSGTTLILSGVHTQPLFAMQQYGLADEIGEENIFGNIDDALDRARSLLGLPVQGRPAGF. The hydropathy value of each amino acid residue in the sequence is: 0.057, 0.069, 0.08, 0.077, 0.038, 0.05, 0.065, 0.07, 0.057, 0.056, 0.08, 0.059, 0.06, 0.057, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 
0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, ", 0, 0.0, "system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.\nuser\nGiven the protein amino acid sequence: MTYVLAESPEFYQDNVTDYTGQISSSDITNIQAAIDDVKASEQKVIFVVFLSSFDGVDPETWTQQALQANGGGNVLIYALAPEERQYGIQGGTQWTDAELDAANNAAFQALSQEDWAGSALALAESVGSSSSSSLEHHHHHH, does the protein have the property of solubility?\nOptions:\n0. Insoluble.\n1. Soluble. You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE output within <answer> </answer> tag.\nassistant\n", "<think>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n</think>\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. 
The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. 
Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. 
In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. 
The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. 
Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydroph", 0, 0.9835715293884277, "system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.\nuser\nGiven the protein amino acid sequence: MGHHHHHHSHMDSKAIAERIKKLQAQQEHVRIGGKGTPRRKKKVIHKTAAADDKKLQSNLKKLSVTNIPGIEEVNMIKDDGTVIHFNNPKVQTSVPANTFSVTGSADNKQITEMLPGILNQLGPESLTHLKKLANNVTKLGPDGKGEDEDVPELVGDFDAASKNETKADEQ, does the protein have the property of solubility?\nOptions:\n0. Insoluble.\n1. Soluble. You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE output within <answer> </answer> tag.\nassistant\n", "<think>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins tend to be more soluble than hydrophobic proteins. The presence of a large number of hydrophobic amino acids, such as phenylalanine, tryptophan, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 11 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. 
The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. 
The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. 
The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. 
The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine", 0, 0.0]]}
EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/requirements.txt ADDED
@@ -0,0 +1,216 @@
+ setproctitle==1.2.2
+ psutil==7.0.0
+ colorama==0.4.6
+ scipy==1.15.3
+ protobuf==4.25.8
+ xxhash==3.5.0
+ tiktoken==0.9.0
+ setproctitle==1.3.6
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
+ pyasn1==0.6.1
+ typing_extensions==4.14.0
+ xgrammar==0.1.18
+ frozenlist==1.7.0
+ gguf==0.17.0
+ tzdata==2025.2
+ interegular==0.3.3
+ nvidia-cusparselt-cu12==0.6.2
+ nvidia-cusparse-cu12==12.3.1.170
+ filelock==3.18.0
+ wrapt==1.17.2
+ multiprocess==0.70.16
+ certifi==2025.4.26
+ email_validator==2.2.0
+ ninja==1.11.1.4
+ cupy-cuda12x==13.4.1
+ fsspec==2025.3.0
+ grpcio==1.73.0
+ referencing==0.36.2
+ exceptiongroup==1.3.0
+ httpcore==1.0.9
+ tqdm==4.67.1
+ torchvision==0.21.0
+ omegaconf==2.3.0
+ tensordict==0.8.3
+ nvidia-cusolver-cu12==11.6.1.9
+ pycountry==24.6.1
+ aiohappyeyeballs==2.6.1
+ aiohttp-cors==0.8.1
+ pydantic==2.11.6
+ python-dotenv==1.1.0
+ propcache==0.3.2
+ platformdirs==4.3.8
+ googleapis-common-protos==1.70.0
+ vllm==0.8.5.post1
+ modelscope==1.27.0
+ GitPython==3.1.44
+ packaging==25.0
+ fastapi==0.115.12
+ smmap==5.0.2
+ fastapi-cli==0.0.7
+ huggingface-hub==0.33.0
+ peft==0.15.2
+ wandb==0.20.1
+ cachetools==5.5.2
+ aiosignal==1.3.2
+ astor==0.8.1
+ msgpack==1.1.1
+ cloudpickle==3.1.1
+ attrs==25.3.0
+ google-auth==2.40.3
+ ray==2.47.0
+ opentelemetry-exporter-otlp-proto-common==1.26.0
+ flash-attn==2.7.1.post1
+ codetiming==1.4.0
+ typing-inspection==0.4.1
+ rich-toolkit==0.14.7
+ regex==2024.11.6
+ einops==0.8.1
+ nvidia-nvtx-cu12==12.4.127
+ torchdata==0.11.0
+ orjson==3.10.18
+ opentelemetry-exporter-otlp==1.26.0
+ requests==2.32.4
+ yarl==1.20.1
+ six==1.17.0
+ compressed-tensors==0.9.3
+ nvidia-cudnn-cu12==9.1.0.70
+ pydantic_core==2.33.2
+ rsa==4.9.1
+ fastrlock==0.8.3
+ prometheus-fastapi-instrumentator==7.1.0
+ psutil==7.0.0
+ async-timeout==5.0.1
+ msgspec==0.19.0
+ nvidia-cufile-cu12==1.11.1.6
+ sentencepiece==0.2.0
+ rpds-py==0.25.1
+ mathruler==0.1.0
+ opentelemetry-proto==1.26.0
+ pyarrow==20.0.0
+ mdurl==0.1.2
+ python-json-logger==3.3.0
+ python-dateutil==2.9.0.post0
+ numpy==2.2.6
+ markdown-it-py==3.0.0
+ tokenizers==0.21.1
+ wheel==0.45.1
+ gitdb==4.0.12
+ nvidia-nvjitlink-cu12==12.4.127
+ urllib3==2.4.0
+ airportsdata==20250523
+ prometheus_client==0.22.1
+ safetensors==0.5.3
+ setuptools==78.1.1
+ opentelemetry-api==1.26.0
+ liger_kernel==0.5.10
+ click==8.2.1
+ mpmath==1.3.0
+ dill==0.3.8
+ PyYAML==6.0.2
+ outlines_core==0.1.26
+ jsonschema-specifications==2025.4.1
+ nvidia-curand-cu12==10.3.5.147
+ zipp==3.23.0
+ triton==3.2.0
+ python-multipart==0.0.20
+ distlib==0.3.9
+ transformers==4.52.4
+ pandas==2.3.0
+ uvloop==0.21.0
+ opencensus==0.11.4
+ opencensus-context==0.1.3
+ numba==0.61.2
+ dnspython==2.7.0
+ sympy==1.13.1
+ pillow==11.2.1
+ idna==3.10
+ nvidia-cuda-runtime-cu12==12.4.127
+ sniffio==1.3.1
+ antlr4-python3-runtime==4.9.3
+ annotated-types==0.7.0
+ uvicorn==0.34.3
+ Pygments==2.19.1
+ nvidia-cuda-cupti-cu12==12.4.127
+ opentelemetry-exporter-prometheus==0.55b1
+ h11==0.16.0
+ depyf==0.18.0
+ colorful==0.5.6
+ Jinja2==3.1.6
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ shellingham==1.5.4
+ mistral_common==1.6.2
+ partial-json-parser==0.2.1.1.post5
+ lm-format-enforcer==0.10.11
+ pyzmq==26.4.0
+ datasets==3.6.0
+ httpx==0.28.1
+ charset-normalizer==3.4.2
+ hf-xet==1.1.3
+ av==14.4.0
+ qwen-vl-utils==0.0.11
+ opencv-python-headless==4.11.0.86
+ nvidia-nccl-cu12==2.21.5
+ opentelemetry-semantic-conventions-ai==0.4.9
+ py-cpuinfo==9.0.0
+ nest-asyncio==1.6.0
+ blake3==1.0.5
+ pytz==2025.2
+ proto-plus==1.26.1
+ pylatexenc==2.10
+ websockets==15.0.1
+ starlette==0.46.2
+ nvidia-cublas-cu12==12.4.5.8
+ importlib_metadata==8.0.0
+ aiohttp==3.12.12
+ anyio==4.9.0
+ llvmlite==0.44.0
+ sentry-sdk==2.30.0
+ multidict==6.4.4
+ verl==0.3.1.dev0
+ distro==1.9.0
+ torch==2.6.0
+ Deprecated==1.2.18
+ MarkupSafe==3.0.2
+ torchaudio==2.6.0
+ nvidia-cufft-cu12==11.2.1.3
+ networkx==3.4.2
+ httptools==0.6.4
+ opentelemetry-sdk==1.26.0
+ pyasn1_modules==0.4.2
+ opentelemetry-exporter-otlp-proto-http==1.26.0
+ google-api-core==2.25.1
+ diskcache==5.6.3
+ rich==14.0.0
+ virtualenv==20.31.2
+ outlines==0.1.11
+ llguidance==0.7.29
+ py-spy==0.4.0
+ lark==1.2.2
+ typer==0.16.0
+ jiter==0.10.0
+ opentelemetry-semantic-conventions==0.47b0
+ jsonschema==4.24.0
+ watchfiles==1.0.5
+ pip==25.1
+ smart-open==7.1.0
+ openai==1.86.0
+ accelerate==1.7.0
+ xformers==0.0.29.post2
+ verl==0.3.1.dev0
+ autocommand==2.2.2
+ jaraco.functools==4.0.1
+ typeguard==4.3.0
+ platformdirs==4.2.2
+ jaraco.text==3.12.1
+ wheel==0.45.1
+ inflect==7.3.1
+ jaraco.context==5.3.0
+ jaraco.collections==5.1.0
+ packaging==24.2
+ more-itertools==10.3.0
+ typing_extensions==4.12.2
+ importlib_metadata==8.0.0
+ backports.tarfile==1.2.0
+ zipp==3.19.2
+ tomli==2.0.1
EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/wandb-metadata.json ADDED
@@ -0,0 +1,91 @@
+ {
+ "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
+ "python": "CPython 3.10.0",
+ "startedAt": "2025-06-14T11:33:20.797563Z",
+ "args": [
+ "--node-ip-address=10.1.0.34",
+ "--node-manager-port=44395",
+ "--object-store-name=/tmp/ray/session_2025-06-14_19-26-10_939959_377/sockets/plasma_store",
+ "--raylet-name=/tmp/ray/session_2025-06-14_19-26-10_939959_377/sockets/raylet",
+ "--redis-address=None",
+ "--metrics-agent-port=53248",
+ "--logging-rotate-bytes=536870912",
+ "--logging-rotate-backup-count=5",
+ "--runtime-env-agent-port=62639",
+ "--gcs-address=10.1.0.34:54500",
+ "--session-name=session_2025-06-14_19-26-10_939959_377",
+ "--temp-dir=/tmp/ray",
+ "--webui=127.0.0.1:8265",
+ "--cluster-id=87252e5ee41fad91c76da7e57bdcecbe022b796f41d09458b8a10f7c",
+ "--startup-token=24",
+ "--worker-launch-time-ms=1749900374194",
+ "--node-id=55695a579dfa13036c97ee61aacacb1a3bcb84cbe0636297d5a4b61b",
+ "--runtime-env-hash=-115784934",
+ "--enable-resource-isolation=false"
+ ],
+ "program": "/root/miniconda3/envs/easyr1/lib/python3.10/site-packages/ray/_private/workers/default_worker.py",
+ "email": "gia0603yucca@gmail.com",
+ "root": "/nas/shared/kilab/wangyujia/EasyR1/examples",
+ "host": "dlcjqxpfs58ebbif-master-0",
+ "executable": "/root/miniconda3/envs/easyr1/bin/python3",
+ "cpu_count": 24,
+ "cpu_count_logical": 24,
+ "gpu": "NVIDIA A800-SXM4-80GB",
+ "gpu_count": 8,
+ "disk": {
+ "/": {
+ "total": "1623302262784",
+ "used": "93401088"
+ }
+ },
+ "memory": {
+ "total": "549755813888"
+ },
+ "cpu": {
+ "count": 24,
+ "countLogical": 24
+ },
+ "gpu_nvidia": [
+ {
+ "name": "NVIDIA A800-SXM4-80GB",
+ "architecture": "Ampere",
+ "uuid": "GPU-6074818b-3292-5382-c329-f6ea4933c000"
+ },
+ {
+ "name": "NVIDIA A800-SXM4-80GB",
+ "architecture": "Ampere",
+ "uuid": "GPU-9153a1c2-42e3-a4bf-5b99-c511c2dd1cd7"
+ },
+ {
+ "name": "NVIDIA A800-SXM4-80GB",
+ "architecture": "Ampere",
+ "uuid": "GPU-d3362bbd-73d7-9f93-9e24-20b1dde2c73d"
+ },
+ {
+ "name": "NVIDIA A800-SXM4-80GB",
+ "architecture": "Ampere",
+ "uuid": "GPU-0a7ad2e1-b610-b966-bb15-97dee8beda1d"
+ },
+ {
+ "name": "NVIDIA A800-SXM4-80GB",
+ "architecture": "Ampere",
+ "uuid": "GPU-71868de1-0c70-4e5f-ff0f-c6c77d6ff52a"
+ },
+ {
+ "name": "NVIDIA A800-SXM4-80GB",
+ "architecture": "Ampere",
+ "uuid": "GPU-91ff7b89-a1a9-22b7-e7a6-d280378f4c22"
+ },
+ {
+ "name": "NVIDIA A800-SXM4-80GB",
+ "architecture": "Ampere",
+ "uuid": "GPU-6da63e94-6ad8-9210-fcc7-9803f04b4499"
+ },
+ {
+ "name": "NVIDIA A800-SXM4-80GB",
+ "architecture": "Ampere",
+ "uuid": "GPU-17618b8c-49d9-1302-2d8b-8e2d3323f951"
+ }
+ ],
+ "cudaVersion": "12.1"
+ }
EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"val/reward_score":0.42200788855552673,"_runtime":2222.63040867,"val/accuracy_reward":0,"val/overall_reward":0.4220078922653015,"_timestamp":1.7499027736309972e+09,"val/repeat_reward":0.4220078922653015,"_step":0,"_wandb":{"runtime":2222},"val/format_reward":0,"val/generations":{"artifact_path":"wandb-client-artifact://ijyj2h6rsizx63o43gkuzc4ss8t0t749zor61i1pqle3t4c3ibf0xivv691aymsesb6ig7a9fuappbvthpwjudrcl39qby5t4etler4em69qzxjpizsh8ela68r5ocgs/val/generations.table.json","sha256":"2feffd203d182aadef48f52d1efd60785cdc0e27cfd8e96c5ed06b0f422b9a10","_latest_artifact_path":"wandb-client-artifact://l4v9x108g3wt2q2co8z0amg8u4xglpreu8janqpe0gu6vsl28ihxry8uc140267pdit8ni6m32ykh8gkouuhluxdqm3n1ipk3vzgp4x0kh75u6xtedujh1a0uh4vonfq:latest/val/generations.table.json","path":"media/table/val/generations_0_2feffd203d182aadef48.table.json","ncols":13,"log_mode":"IMMUTABLE","_type":"table-file","nrows":1,"size":37088}}
EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
+ {"time":"2025-06-14T19:33:20.8065764+08:00","level":"INFO","msg":"stream: starting","core version":"0.20.1","symlink path":"/nas/shared/kilab/wangyujia/EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug-core.log"}
+ {"time":"2025-06-14T19:33:22.123634875+08:00","level":"INFO","msg":"stream: created new stream","id":"9rxy2gyp"}
+ {"time":"2025-06-14T19:33:22.123667201+08:00","level":"INFO","msg":"stream: started","id":"9rxy2gyp"}
+ {"time":"2025-06-14T19:33:22.123732778+08:00","level":"INFO","msg":"writer: Do: started","stream_id":"9rxy2gyp"}
+ {"time":"2025-06-14T19:33:22.123739338+08:00","level":"INFO","msg":"sender: started","stream_id":"9rxy2gyp"}
+ {"time":"2025-06-14T19:33:22.123765948+08:00","level":"INFO","msg":"handler: started","stream_id":"9rxy2gyp"}
+ {"time":"2025-06-14T19:33:22.624115243+08:00","level":"INFO","msg":"Starting system monitor"}
+ {"time":"2025-06-14T20:10:23.427993609+08:00","level":"INFO","msg":"Stopping system monitor"}
+ {"time":"2025-06-14T20:10:23.429125637+08:00","level":"INFO","msg":"Stopped system monitor"}
+ {"time":"2025-06-14T20:10:24.39935884+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"uploading wandb-summary.json","runtime_seconds":0.523990922},{"desc":"uploading output.log","runtime_seconds":0.523981291},{"desc":"uploading history steps 0-0, summary, console lines 137-204","runtime_seconds":0.048062105}],"total_operations":3}}
+ {"time":"2025-06-14T20:10:26.867530799+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+ {"time":"2025-06-14T20:10:29.822474057+08:00","level":"INFO","msg":"stream: closing","id":"9rxy2gyp"}
+ {"time":"2025-06-14T20:10:29.822488409+08:00","level":"INFO","msg":"handler: closed","stream_id":"9rxy2gyp"}
+ {"time":"2025-06-14T20:10:29.822495399+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":"9rxy2gyp"}
+ {"time":"2025-06-14T20:10:29.822500164+08:00","level":"INFO","msg":"sender: closed","stream_id":"9rxy2gyp"}
+ {"time":"2025-06-14T20:10:29.824705513+08:00","level":"INFO","msg":"stream: closed","id":"9rxy2gyp"}
EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug.log ADDED
@@ -0,0 +1,28 @@
+ 2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_setup.py:_flush():81] Current SDK version is 0.20.1
+ 2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_setup.py:_flush():81] Configure stats pid to 2538
+ 2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings
+ 2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_setup.py:_flush():81] Loading settings from /nas/shared/kilab/wangyujia/EasyR1/examples/wandb/settings
+ 2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_setup.py:_flush():81] Loading settings from environment variables
+ 2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /nas/shared/kilab/wangyujia/EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug.log
+ 2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /nas/shared/kilab/wangyujia/EasyR1/examples/wandb/run-20250614_193320-9rxy2gyp/logs/debug-internal.log
+ 2025-06-14 19:33:20,580 INFO MainThread:2538 [wandb_init.py:init():831] calling init triggers
+ 2025-06-14 19:33:20,581 INFO MainThread:2538 [wandb_init.py:init():836] wandb.init called with sweep_config: {}
+ config: {'data': {'train_files': '/nas/shared/kilab/wangyujia/rl_data@train', 'val_files': '/nas/shared/kilab/wangyujia/rl_data@validation', 'prompt_key': 'question', 'answer_key': 'answer', 'image_key': 'images', 'image_dir': None, 'max_prompt_length': 8192, 'max_response_length': 8192, 'rollout_batch_size': 256, 'val_batch_size': 1024, 'format_prompt': '/nas/shared/kilab/wangyujia/EasyR1/examples/format_prompt/bio_format.jinja', 'override_chat_template': None, 'shuffle': True, 'seed': 1, 'min_pixels': 262144, 'max_pixels': 4194304, 'filter_overlong_prompts': True}, 'worker': {'hybrid_engine': True, 'actor': {'strategy': 'fsdp', 'global_batch_size': 128, 'micro_batch_size_per_device_for_update': 2, 'micro_batch_size_per_device_for_experience': 64, 'max_grad_norm': 1.0, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.3, 'clip_ratio_dual': 3.0, 'ppo_epochs': 1, 'padding_free': True, 'ulysses_sequence_parallel_size': 1, 'use_torch_compile': True, 'model': {'model_path': '/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300', 'tokenizer_path': '/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300', 'override_config': {}, 'enable_gradient_checkpointing': True, 'trust_remote_code': False, 'freeze_vision_tower': False}, 'optim': {'lr': 1e-06, 'betas': [0.9, 0.999], 'weight_decay': 0.01, 'strategy': 'adamw', 'lr_warmup_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'training_steps': 1610}, 'fsdp': {'enable_full_shard': True, 'enable_cpu_offload': False, 'enable_rank0_init': True, 'use_orig_params': False, 'torch_dtype': None, 'fsdp_size': -1, 'mp_param_dtype': 'bf16', 'mp_reduce_dtype': 'fp32', 'mp_buffer_dtype': 'fp32'}, 'offload': {'offload_params': True, 'offload_optimizer': True}, 'global_batch_size_per_device': -1, 'disable_kl': False, 'use_kl_loss': True, 'kl_penalty': 'low_var_kl', 'kl_coef': 0.01}, 'critic': {'strategy': 'fsdp', 'global_batch_size': 256, 'micro_batch_size_per_device_for_update': 4, 'micro_batch_size_per_device_for_experience': 16, 'max_grad_norm': 1.0, 'cliprange_value': 0.5, 'ppo_epochs': 1, 'padding_free': False, 'ulysses_sequence_parallel_size': 1, 'model': {'model_path': None, 'tokenizer_path': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'trust_remote_code': True, 'freeze_vision_tower': False}, 'optim': {'lr': 1e-06, 'betas': [0.9, 0.999], 'weight_decay': 0.01, 'strategy': 'adamw', 'lr_warmup_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'training_steps': 1610}, 'fsdp': {'enable_full_shard': True, 'enable_cpu_offload': False, 'enable_rank0_init': True, 'use_orig_params': False, 'torch_dtype': None, 'fsdp_size': -1, 'mp_param_dtype': 'bf16', 'mp_reduce_dtype': 'fp32', 'mp_buffer_dtype': 'fp32'}, 'offload': {'offload_params': False, 'offload_optimizer': False}, 'global_batch_size_per_device': -1}, 'ref': {'strategy': 'fsdp', 'fsdp': {'enable_full_shard': True, 'enable_cpu_offload': True, 'enable_rank0_init': True, 'use_orig_params': False, 'torch_dtype': None, 'fsdp_size': -1, 'mp_param_dtype': 'bf16', 'mp_reduce_dtype': 'fp32', 'mp_buffer_dtype': 'fp32'}, 'offload': {'offload_params': False, 'offload_optimizer': False}, 'micro_batch_size_per_device_for_experience': 64, 'padding_free': True, 'ulysses_sequence_parallel_size': 1, 'use_torch_compile': True}, 'reward': {'reward_type': 'batch', 'reward_function': '/nas/shared/kilab/wangyujia/EasyR1/examples/reward_function/bio.py', 'reward_function_kwargs': {}, 'skip_special_tokens': True, 'num_cpus': 1, 'reward_function_name': 'compute_score'}, 'rollout': {'name': 'vllm', 'n': 8, 'temperature': 1.0, 'top_p': 0.99, 'top_k': -1, 'seed': 1, 'limit_images': 0, 'dtype': 'bf16', 'gpu_memory_utilization': 0.75, 'ignore_eos': False, 'enforce_eager': False, 'enable_chunked_prefill': False, 'tensor_parallel_size': 1, 'max_model_len': None, 'max_num_batched_tokens': 16384, 'disable_log_stats': True, 'val_override_config': {'temperature': 0.5, 'n': 1}, 'prompt_length': 8192, 'response_length': 8192, 'trust_remote_code': False}}, 'algorithm': {'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'disable_kl': False, 'use_kl_loss': True, 'kl_penalty': 'low_var_kl', 'kl_coef': 0.01, 'kl_type': 'fixed', 'kl_horizon': 0.0, 'kl_target': 0.0}, 'trainer': {'total_epochs': 5, 'max_steps': None, 'project_name': 'easy_r1', 'experiment_name': 'qwen2_5_bio_grpo', 'logger': ['console', 'wandb'], 'nnodes': 1, 'n_gpus_per_node': 8, 'critic_warmup': 0, 'val_freq': 5, 'val_before_train': True, 'val_only': False, 'val_generations_to_log': 3, 'save_freq': 5, 'save_limit': 3, 'save_checkpoint_path': '/oss/wangyujia/BIO/rl/qwen2.5_7b_bio', 'load_checkpoint_path': None}, '_wandb': {}}
+ 2025-06-14 19:33:20,581 INFO MainThread:2538 [wandb_init.py:init():872] starting backend
+ 2025-06-14 19:33:20,793 INFO MainThread:2538 [wandb_init.py:init():875] sending inform_init request
+ 2025-06-14 19:33:20,796 INFO MainThread:2538 [wandb_init.py:init():883] backend started and connected
+ 2025-06-14 19:33:20,799 INFO MainThread:2538 [wandb_init.py:init():956] updated telemetry
+ 2025-06-14 19:33:20,799 INFO MainThread:2538 [wandb_init.py:init():980] communicating run to backend with 90.0 second timeout
+ 2025-06-14 19:33:22,613 INFO MainThread:2538 [wandb_init.py:init():1032] starting run threads in backend
+ 2025-06-14 19:33:22,807 INFO MainThread:2538 [wandb_run.py:_console_start():2453] atexit reg
+ 2025-06-14 19:33:22,807 INFO MainThread:2538 [wandb_run.py:_redirect():2301] redirect: wrap_raw
+ 2025-06-14 19:33:22,807 INFO MainThread:2538 [wandb_run.py:_redirect():2370] Wrapping output streams.
+ 2025-06-14 19:33:22,807 INFO MainThread:2538 [wandb_run.py:_redirect():2393] Redirects installed.
+ 2025-06-14 19:33:22,810 INFO MainThread:2538 [wandb_init.py:init():1078] run started, returning control to user process
+ 2025-06-14 20:10:23,389 INFO MainThread:2538 [wandb_run.py:_finish():2219] finishing run gia0603yucca/easy_r1/9rxy2gyp
+ 2025-06-14 20:10:23,395 INFO MainThread:2538 [wandb_run.py:_atexit_cleanup():2418] got exitcode: 0
+ 2025-06-14 20:10:23,397 INFO MainThread:2538 [wandb_run.py:_restore():2400] restore
+ 2025-06-14 20:10:23,397 INFO MainThread:2538 [wandb_run.py:_restore():2406] restore done
+ 2025-06-14 20:10:29,817 INFO MainThread:2538 [wandb_run.py:_footer_history_summary_info():4000] rendering history
+ 2025-06-14 20:10:29,818 INFO MainThread:2538 [wandb_run.py:_footer_history_summary_info():4032] rendering summary
+ 2025-06-14 20:10:29,818 INFO MainThread:2538 [wandb_run.py:_footer_sync_info():3961] logging synced files
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/config.yaml ADDED
@@ -0,0 +1,230 @@
+ _wandb:
+   value:
+     cli_version: 0.20.1
+     m: []
+     python_version: 3.10.0
+     t:
+       "1":
+         - 1
+         - 11
+         - 30
+         - 41
+         - 49
+         - 51
+         - 71
+         - 95
+         - 98
+         - 105
+       "2":
+         - 1
+         - 11
+         - 30
+         - 41
+         - 49
+         - 51
+         - 71
+         - 95
+         - 98
+         - 105
+       "3":
+         - 2
+         - 13
+         - 16
+         - 55
+         - 61
+       "4": 3.10.0
+       "5": 0.20.1
+       "6": 4.52.4
+       "12": 0.20.1
+       "13": linux-x86_64
+ algorithm:
+   value:
+     adv_estimator: grpo
+     disable_kl: false
+     gamma: 1
+     kl_coef: 0.01
+     kl_horizon: 0
+     kl_penalty: low_var_kl
+     kl_target: 0
+     kl_type: fixed
+     lam: 1
+     use_kl_loss: true
+ data:
+   value:
+     answer_key: answer
+     filter_overlong_prompts: true
+     format_prompt: /nas/shared/kilab/wangyujia/EasyR1/examples/format_prompt/bio_format.jinja
+     image_dir: null
+     image_key: images
+     max_pixels: 4194304
+     max_prompt_length: 8192
+     max_response_length: 8192
+     min_pixels: 262144
+     override_chat_template: null
+     prompt_key: question
+     rollout_batch_size: 256
+     seed: 1
+     shuffle: true
+     train_files: /nas/shared/kilab/wangyujia/rl_data@train
+     val_batch_size: 1024
+     val_files: /nas/shared/kilab/wangyujia/rl_data@validation
+ trainer:
+   value:
+     critic_warmup: 0
+     experiment_name: qwen2_5_bio_grpo
+     load_checkpoint_path: null
+     logger:
+       - console
+       - wandb
+     max_steps: null
+     n_gpus_per_node: 8
+     nnodes: 1
+     project_name: easy_r1
+     save_checkpoint_path: /oss/wangyujia/BIO/rl/qwen2.5_7b_bio
+     save_freq: 5
+     save_limit: 3
+     total_epochs: 5
+     val_before_train: true
+     val_freq: 5
+     val_generations_to_log: 3
+     val_only: false
+ worker:
+   value:
+     actor:
+       clip_ratio_dual: 3
+       clip_ratio_high: 0.3
+       clip_ratio_low: 0.2
+       disable_kl: false
+       fsdp:
+         enable_cpu_offload: false
+         enable_full_shard: true
+         enable_rank0_init: true
+         fsdp_size: -1
+         mp_buffer_dtype: fp32
+         mp_param_dtype: bf16
+         mp_reduce_dtype: fp32
+         torch_dtype: null
+         use_orig_params: false
+       global_batch_size: 128
+       global_batch_size_per_device: -1
+       kl_coef: 0.01
+       kl_penalty: low_var_kl
+       max_grad_norm: 1
+       micro_batch_size_per_device_for_experience: 16
+       micro_batch_size_per_device_for_update: 2
+       model:
+         enable_gradient_checkpointing: true
+         freeze_vision_tower: false
+         model_path: /oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300
+         tokenizer_path: /oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300
+         trust_remote_code: false
+       offload:
+         offload_optimizer: true
+         offload_params: true
+       optim:
+         betas:
+           - 0.9
+           - 0.999
+         lr: 1e-06
+         lr_warmup_ratio: 0
+         min_lr_ratio: null
+         strategy: adamw
+         training_steps: 1610
+         warmup_style: constant
+         weight_decay: 0.01
+       padding_free: true
+       ppo_epochs: 1
+       strategy: fsdp
+       ulysses_sequence_parallel_size: 1
+       use_kl_loss: true
+       use_torch_compile: true
+     critic:
+       cliprange_value: 0.5
+       fsdp:
+         enable_cpu_offload: false
+         enable_full_shard: true
+         enable_rank0_init: true
+         fsdp_size: -1
+         mp_buffer_dtype: fp32
+         mp_param_dtype: bf16
+         mp_reduce_dtype: fp32
+         torch_dtype: null
+         use_orig_params: false
+       global_batch_size: 256
+       global_batch_size_per_device: -1
+       max_grad_norm: 1
+       micro_batch_size_per_device_for_experience: 16
+       micro_batch_size_per_device_for_update: 4
+       model:
+         enable_gradient_checkpointing: true
+         freeze_vision_tower: false
+         model_path: null
+         tokenizer_path: null
+         trust_remote_code: true
+       offload:
+         offload_optimizer: false
+         offload_params: false
+       optim:
+         betas:
+           - 0.9
+           - 0.999
+         lr: 1e-06
+         lr_warmup_ratio: 0
+         min_lr_ratio: null
+         strategy: adamw
+         training_steps: 1610
+         warmup_style: constant
+         weight_decay: 0.01
+       padding_free: false
+       ppo_epochs: 1
+       strategy: fsdp
+       ulysses_sequence_parallel_size: 1
+     hybrid_engine: true
+     ref:
+       fsdp:
+         enable_cpu_offload: true
+         enable_full_shard: true
+         enable_rank0_init: true
+         fsdp_size: -1
+         mp_buffer_dtype: fp32
+         mp_param_dtype: bf16
+         mp_reduce_dtype: fp32
+         torch_dtype: null
+         use_orig_params: false
+       micro_batch_size_per_device_for_experience: 16
+       offload:
+         offload_optimizer: false
+         offload_params: false
+       padding_free: true
+       strategy: fsdp
+       ulysses_sequence_parallel_size: 1
+       use_torch_compile: true
+     reward:
+       num_cpus: 1
+       reward_function: /nas/shared/kilab/wangyujia/EasyR1/examples/reward_function/bio.py
+       reward_function_name: compute_score
+       reward_type: batch
+       skip_special_tokens: true
+     rollout:
+       disable_log_stats: true
+       dtype: bf16
+       enable_chunked_prefill: false
+       enforce_eager: false
+       gpu_memory_utilization: 0.75
+       ignore_eos: false
+       limit_images: 0
+       max_model_len: null
+       max_num_batched_tokens: 16384
+       "n": 8
+       name: vllm
+       prompt_length: 8192
+       response_length: 8192
+       seed: 1
+       temperature: 1
+       tensor_parallel_size: 1
+       top_k: -1
+       top_p: 0.99
+       trust_remote_code: false
+       val_override_config:
+         "n": 1
+         temperature: 0.5
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/media/table/val/generations_0_2feffd203d182aadef48.table.json ADDED
@@ -0,0 +1 @@
+ {"columns": ["step", "input_1", "output_1", "label_1", "score_1", "input_2", "output_2", "label_2", "score_2", "input_3", "output_3", "label_3", "score_3"], "data": [[0, "system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.\nuser\nGiven the protein amino acid sequence: PNAIVNRSIPEGVDVFEISGPLFFGAASKFKDAMHVVEKAPSIRILRMRKVMSIDATGLNMLKELFNDCRKSGTTLILSGVHTQPLFAMQQYGLADEIGEENIFGNIDDALDRARSLLGLPVQGRPAGF, does the protein have the property of solubility?\nOptions:\n0. Insoluble.\n1. Soluble. You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE output within <answer> </answer> tag.\nassistant\n", "<think>\nThe protein solubility can be predicted by the grand average of hydropathicity (GRAVY) scale. The GRAVY scale is based on the assumption that a protein with a large number of hydrophilic residues will have a negative value and a protein with a large number of hydrophobic residues will have a positive value. The GRAVY scale is calculated by summing the hydropathy values of all amino acid residues in a protein sequence and dividing by the number of residues in the sequence. The hydropathy value of an amino acid residue is a measure of its hydrophobicity, which is based on its chemical properties. The hydropathy value of an amino acid residue can be calculated using a hydrophobicity scale, which is a table that assigns a numerical value to each amino acid residue based on its hydrophobicity. The amino acid sequence of the protein is: PNAIVNRSIPEGVDVFEISGPLFFGAASKFKDAMHVVEKAPSIRILRMRKVMSIDATGLNMLKELFNDCRKSGTTLILSGVHTQPLFAMQQYGLADEIGEENIFGNIDDALDRARSLLGLPVQGRPAGF. The hydropathy value of each amino acid residue in the sequence is: 0.057, 0.069, 0.08, 0.077, 0.038, 0.05, 0.065, 0.07, 0.057, 0.056, 0.08, 0.059, 0.06, 0.057, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 
0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, 0.065, 0.07, 0.05, ", 0, 0.0, "system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.\nuser\nGiven the protein amino acid sequence: MTYVLAESPEFYQDNVTDYTGQISSSDITNIQAAIDDVKASEQKVIFVVFLSSFDGVDPETWTQQALQANGGGNVLIYALAPEERQYGIQGGTQWTDAELDAANNAAFQALSQEDWAGSALALAESVGSSSSSSLEHHHHHH, does the protein have the property of solubility?\nOptions:\n0. Insoluble.\n1. Soluble. You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE output within <answer> </answer> tag.\nassistant\n", "<think>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n</think>\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. 
The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. 
Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. 
In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins are more likely to be soluble than hydrophobic proteins. Therefore, the protein is more likely to be soluble.\n<answer>\n1. Soluble.\n</answer>", 0, 0.9835715293884277, "system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.\nuser\nGiven the protein amino acid sequence: MGHHHHHHSHMDSKAIAERIKKLQAQQEHVRIGGKGTPRRKKKVIHKTAAADDKKLQSNLKKLSVTNIPGIEEVNMIKDDGTVIHFNNPKVQTSVPANTFSVTGSADNKQITEMLPGILNQLGPESLTHLKKLANNVTKLGPDGKGEDEDVPELVGDFDAASKNETKADEQ, does the protein have the property of solubility?\nOptions:\n0. Insoluble.\n1. Soluble. You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE output within <answer> </answer> tag.\nassistant\n", "<think>\nThe protein has the property of solubility. The solubility of a protein is determined by its amino acid composition and sequence. The solubility of a protein can be affected by various factors, such as pH, temperature, ionic strength, and the presence of other proteins or ligands. The solubility of a protein can be estimated by its net charge and hydrophobicity. A protein with a high net charge or a high hydrophobicity is more likely to be soluble. In this case, the protein has a net charge of 0 and a hydrophobicity of 0.494, which indicates that it is a hydrophilic protein. Hydrophilic proteins tend to be more soluble than hydrophobic proteins. 
The presence of a large number of hydrophobic amino acids, such as phenylalanine, tryptophan, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 11 hydrophobic amino acids, which is a moderate number. The presence of a large number of charged amino acids, such as arginine, lysine, and glutamic acid, can also increase the solubility of a protein. In this case, the protein contains 10 charged amino acids, which is a moderate number. The presence of a large number of proline and glycine residues can also increase the solubility of a protein. In this case, the protein contains 10 proline residues and 1 glycine residue, which is a moderate number. The presence of a large number of hydrophilic amino acids, such as serine, threonine, and tyrosine, can also increase the solubility of a protein. In this case, the protein contains 14 hydrophilic amino acids, which is a moderate number. The presence of a large number of small amino acids, such as alanine, glycine, and proline, can also increase the solubility of a protein. In this case, the protein contains 14 small amino acids, which is a moderate number. The presence of a large number of aromatic amino acids, such as phenylalanine, tyrosine, and tryptophan, can also increase the solubility of a protein. In this case, the protein contains 10 aromatic amino acids, which is a moderate number. The presence of a large number of hydrophobic amino acids, such as leucine, isoleucine, and valine, can also increase the solubility of a protein. In this case, the protein contains 10 hydrophobic amino acids, which is a moderate number.", 0, 0.0]]}
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/media/table/val/generations_104_ccf2e4d7f5a8bd440fdc.table.json ADDED
The diff for this file is too large to render. See raw diff
 
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/media/table/val/generations_109_dfde4027f4468ecb4bdb.table.json ADDED
The diff for this file is too large to render. See raw diff
 
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/requirements.txt ADDED
@@ -0,0 +1,216 @@
+ setproctitle==1.2.2
+ colorama==0.4.6
+ psutil==7.0.0
+ numpy==2.2.6
+ pylatexenc==2.10
+ pyzmq==26.4.0
+ email_validator==2.2.0
+ pyasn1==0.6.1
+ requests==2.32.4
+ omegaconf==2.3.0
+ tzdata==2025.2
+ yarl==1.20.1
+ watchfiles==1.0.5
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cublas-cu12==12.4.5.8
+ proto-plus==1.26.1
+ wandb==0.20.1
+ scipy==1.15.3
+ googleapis-common-protos==1.70.0
+ nvidia-cufile-cu12==1.11.1.6
+ opentelemetry-exporter-otlp-proto-http==1.26.0
+ vllm==0.8.5.post1
+ sniffio==1.3.1
+ ray==2.47.0
+ python-dateutil==2.9.0.post0
+ Deprecated==1.2.18
+ setuptools==78.1.1
+ aiosignal==1.3.2
+ platformdirs==4.3.8
+ regex==2024.11.6
+ aiohappyeyeballs==2.6.1
+ uvloop==0.21.0
+ opentelemetry-semantic-conventions-ai==0.4.9
+ virtualenv==20.31.2
+ rich==14.0.0
+ accelerate==1.7.0
+ datasets==3.6.0
+ python-dotenv==1.1.0
+ antlr4-python3-runtime==4.9.3
+ gitdb==4.0.12
+ six==1.17.0
+ smmap==5.0.2
+ opencensus==0.11.4
+ annotated-types==0.7.0
+ xxhash==3.5.0
+ frozenlist==1.7.0
+ Jinja2==3.1.6
+ interegular==0.3.3
+ opentelemetry-semantic-conventions==0.47b0
+ jiter==0.10.0
+ idna==3.10
+ nvidia-cusolver-cu12==11.6.1.9
+ propcache==0.3.2
+ nest-asyncio==1.6.0
+ sentencepiece==0.2.0
+ fastapi==0.115.12
+ verl==0.3.1.dev0
+ compressed-tensors==0.9.3
+ typing-inspection==0.4.1
+ gguf==0.17.0
+ dnspython==2.7.0
+ wheel==0.45.1
+ python-multipart==0.0.20
+ cupy-cuda12x==13.4.1
+ xgrammar==0.1.18
+ starlette==0.46.2
+ peft==0.15.2
+ blake3==1.0.5
+ torchdata==0.11.0
+ qwen-vl-utils==0.0.11
+ sentry-sdk==2.30.0
+ Pygments==2.19.1
+ nvidia-nvjitlink-cu12==12.4.127
+ aiohttp-cors==0.8.1
+ outlines_core==0.1.26
+ partial-json-parser==0.2.1.1.post5
+ filelock==3.18.0
+ tensordict==0.8.3
+ cloudpickle==3.1.1
+ torchaudio==2.6.0
+ pandas==2.3.0
+ tiktoken==0.9.0
+ av==14.4.0
+ flash-attn==2.7.1.post1
+ certifi==2025.4.26
+ nvidia-nvtx-cu12==12.4.127
+ numba==0.61.2
+ networkx==3.4.2
+ sympy==1.13.1
+ pyarrow==20.0.0
+ modelscope==1.27.0
+ mpmath==1.3.0
+ codetiming==1.4.0
+ triton==3.2.0
+ referencing==0.36.2
+ dill==0.3.8
+ opencv-python-headless==4.11.0.86
+ mathruler==0.1.0
+ jsonschema-specifications==2025.4.1
+ tokenizers==0.21.1
+ huggingface-hub==0.33.0
+ rich-toolkit==0.14.7
+ fastapi-cli==0.0.7
+ python-json-logger==3.3.0
+ nvidia-cuda-cupti-cu12==12.4.127
+ httptools==0.6.4
+ mdurl==0.1.2
+ h11==0.16.0
+ distlib==0.3.9
+ airportsdata==20250523
+ transformers==4.52.4
+ opentelemetry-exporter-prometheus==0.55b1
+ fsspec==2025.3.0
+ diskcache==5.6.3
+ click==8.2.1
+ websockets==15.0.1
+ lark==1.2.2
+ uvicorn==0.34.3
+ grpcio==1.73.0
+ pillow==11.2.1
+ anyio==4.9.0
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
+ pydantic==2.11.6
+ wrapt==1.17.2
+ opentelemetry-api==1.26.0
+ nvidia-curand-cu12==10.3.5.147
+ rpds-py==0.25.1
+ exceptiongroup==1.3.0
+ msgpack==1.1.1
+ async-timeout==5.0.1
+ protobuf==4.25.8
+ httpx==0.28.1
+ opentelemetry-proto==1.26.0
+ nvidia-nccl-cu12==2.21.5
+ nvidia-cusparselt-cu12==0.6.2
+ aiohttp==3.12.12
+ urllib3==2.4.0
+ smart-open==7.1.0
+ markdown-it-py==3.0.0
+ packaging==25.0
+ charset-normalizer==3.4.2
+ py-spy==0.4.0
+ setproctitle==1.3.6
+ safetensors==0.5.3
+ pyasn1_modules==0.4.2
+ jsonschema==4.24.0
+ astor==0.8.1
+ shellingham==1.5.4
+ pytz==2025.2
+ distro==1.9.0
+ google-api-core==2.25.1
+ rsa==4.9.1
+ multiprocess==0.70.16
+ colorful==0.5.6
+ einops==0.8.1
+ lm-format-enforcer==0.10.11
+ pydantic_core==2.33.2
+ mistral_common==1.6.2
+ opentelemetry-sdk==1.26.0
+ ninja==1.11.1.4
+ typing_extensions==4.14.0
+ depyf==0.18.0
+ attrs==25.3.0
+ tqdm==4.67.1
+ openai==1.86.0
+ xformers==0.0.29.post2
+ prometheus-fastapi-instrumentator==7.1.0
+ torch==2.6.0
+ nvidia-cudnn-cu12==9.1.0.70
+ opencensus-context==0.1.3
+ importlib_metadata==8.0.0
+ orjson==3.10.18
+ psutil==7.0.0
+ liger_kernel==0.5.10
+ pycountry==24.6.1
+ zipp==3.23.0
+ pip==25.1
+ MarkupSafe==3.0.2
+ opentelemetry-exporter-otlp-proto-common==1.26.0
+ llguidance==0.7.29
+ llvmlite==0.44.0
+ multidict==6.4.4
+ nvidia-cufft-cu12==11.2.1.3
+ GitPython==3.1.44
+ fastrlock==0.8.3
+ hf-xet==1.1.3
+ PyYAML==6.0.2
+ opentelemetry-exporter-otlp==1.26.0
+ typer==0.16.0
+ msgspec==0.19.0
+ google-auth==2.40.3
+ cachetools==5.5.2
+ nvidia-cuda-runtime-cu12==12.4.127
+ torchvision==0.21.0
+ nvidia-cusparse-cu12==12.3.1.170
+ outlines==0.1.11
+ prometheus_client==0.22.1
+ httpcore==1.0.9
+ py-cpuinfo==9.0.0
+ verl==0.3.1.dev0
+ jaraco.functools==4.0.1
+ inflect==7.3.1
+ jaraco.collections==5.1.0
+ packaging==24.2
+ wheel==0.45.1
+ tomli==2.0.1
+ platformdirs==4.2.2
+ typing_extensions==4.12.2
+ more-itertools==10.3.0
+ autocommand==2.2.2
+ jaraco.text==3.12.1
+ importlib_metadata==8.0.0
+ jaraco.context==5.3.0
+ zipp==3.19.2
+ backports.tarfile==1.2.0
+ typeguard==4.3.0
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/wandb-metadata.json ADDED
@@ -0,0 +1,91 @@
+ {
+   "os": "Linux-5.10.134-008.16.kangaroo.al8.x86_64-x86_64-with-glibc2.35",
+   "python": "CPython 3.10.0",
+   "startedAt": "2025-06-14T11:47:44.287213Z",
+   "args": [
+     "--node-ip-address=10.1.5.10",
+     "--node-manager-port=45027",
+     "--object-store-name=/tmp/ray/session_2025-06-14_19-45-36_987346_61061/sockets/plasma_store",
+     "--raylet-name=/tmp/ray/session_2025-06-14_19-45-36_987346_61061/sockets/raylet",
+     "--redis-address=None",
+     "--metrics-agent-port=63241",
+     "--logging-rotate-bytes=536870912",
+     "--logging-rotate-backup-count=5",
+     "--runtime-env-agent-port=58728",
+     "--gcs-address=10.1.5.10:60836",
+     "--session-name=session_2025-06-14_19-45-36_987346_61061",
+     "--temp-dir=/tmp/ray",
+     "--webui=127.0.0.1:8265",
+     "--cluster-id=a6a80f53697854b083283d963e7ac5cff199d6477d3b4c5f6f0ddfec",
+     "--startup-token=64",
+     "--worker-launch-time-ms=1749901539376",
+     "--node-id=abb2963d276b09969a5a74d875c4ca780bcf729fc5d17badc158c62f",
+     "--runtime-env-hash=-115784934",
+     "--enable-resource-isolation=false"
+   ],
+   "program": "/root/miniconda3/envs/easyr1/lib/python3.10/site-packages/ray/_private/workers/default_worker.py",
+   "email": "gia0603yucca@gmail.com",
+   "root": "/nas/shared/kilab/wangyujia/EasyR1/examples",
+   "host": "dsw-251511-7876c679d4-fpqq8",
+   "executable": "/root/miniconda3/envs/easyr1/bin/python3",
+   "cpu_count": 64,
+   "cpu_count_logical": 64,
+   "gpu": "NVIDIA A800-SXM4-80GB",
+   "gpu_count": 8,
+   "disk": {
+     "/": {
+       "total": "1623302262784",
+       "used": "799809536"
+     }
+   },
+   "memory": {
+     "total": "549755813888"
+   },
+   "cpu": {
+     "count": 64,
+     "countLogical": 64
+   },
+   "gpu_nvidia": [
+     {
+       "name": "NVIDIA A800-SXM4-80GB",
+       "architecture": "Ampere",
+       "uuid": "GPU-b6d61b63-2b46-d2f7-d450-38c1353782cb"
+     },
+     {
+       "name": "NVIDIA A800-SXM4-80GB",
+       "architecture": "Ampere",
+       "uuid": "GPU-7a6b18cf-311a-f939-3dcc-25b0f1fc898c"
+     },
+     {
+       "name": "NVIDIA A800-SXM4-80GB",
+       "architecture": "Ampere",
+       "uuid": "GPU-17cbe6f8-0339-60ce-23bb-dbd33795c1ad"
+     },
+     {
+       "name": "NVIDIA A800-SXM4-80GB",
+       "architecture": "Ampere",
+       "uuid": "GPU-799578dd-bc6a-3b56-26b8-3941889669c9"
+     },
+     {
+       "name": "NVIDIA A800-SXM4-80GB",
+       "architecture": "Ampere",
+       "uuid": "GPU-c783413d-e4e1-22c5-7c48-9296c28b08a0"
+     },
+     {
+       "name": "NVIDIA A800-SXM4-80GB",
+       "architecture": "Ampere",
+       "uuid": "GPU-0ad82850-a679-fa6b-9200-a26edb1bb8a4"
+     },
+     {
+       "name": "NVIDIA A800-SXM4-80GB",
+       "architecture": "Ampere",
+       "uuid": "GPU-e73b7d7b-4455-62ee-ec7e-a2eb1d845e07"
+     },
+     {
+       "name": "NVIDIA A800-SXM4-80GB",
+       "architecture": "Ampere",
+       "uuid": "GPU-71ee45de-57b2-ac7c-13c1-08a1f197eb20"
+     }
+   ],
+   "cudaVersion": "12.1"
+ }
EasyR1/examples/wandb/run-20250614_194744-cdibom67/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"critic/rewards/max":0.9997167587280273,"response_length/max":8192,"_runtime":145264.000918934,"timing_s/step":1183.4929371809994,"global_seqlen/max":1146817,"reward/format":0,"perf/cpu_memory_used_gb":407.5828285217285,"timing_s/old":102.70515622899984,"critic/returns/min":-2.4748668670654297,"actor/kl_loss":0.037933181738480926,"actor/pg_loss":-0.00841897885722176,"critic/advantages/min":-2.4748668670654297,"prompt_length/mean":314.70703125,"val/repeat_reward":0.0010451679389658478,"reward/overall":0.9458340392351747,"perf/max_memory_reserved_gb":75.796875,"prompt_length/clip_ratio":0,"perf/total_num_tokens":9002567,"prompt_length/max":1134,"perf/throughput":950.8471403981832,"critic/returns/max":2.3680503368377686,"actor/pg_clipfrac_higher":1.9632523688528636e-05,"val/generations":{"path":"media/table/val/generations_119_4e3f574f210c177c5437.table.json","nrows":25,"sha256":"4e3f574f210c177c54379d563cfe05fd219c32811853206cd37aa24896b5186c","ncols":13,"_type":"table-file","artifact_path":"wandb-client-artifact://76aqt3ola7mmf1zjvk79h2cs59rqhaslo8w0fqq45cu2m3evdeejs118qyegtnhx46a5806ogvbs8vt9zt1p7edrxhkmnyp31ey47kna9056mczftfqho4mygicga34i/val/generations.table.json","size":1223506,"log_mode":"IMMUTABLE","_latest_artifact_path":"wandb-client-artifact://h7flxr8fshgu5a89hy9khafst185p799hj8ttotsx8jkk8kyp037hrhnb8iekl0mne5kwynukkv20uhj63vd9zodj22j0nbm6o6w0bwwcqj2b8giffq60c52ebj5qneu:latest/val/generations.table.json"},"timing_s/gen":335.0029417749902,"critic/score/mean":0.9458340406417847,"_wandb":{"runtime":145264},"actor/pg_clipfrac_lower":0,"_step":124,"val/overall_reward":0.0010451679389658478,"critic/advantages/max":2.3680503368377686,"timing_per_token_ms/adv":0.02330009541756115,"timing_s/save_checkpoint":122.09420545600005,"reward/repeat":0.9458340392351747,"actor/ppo_kl":-3.353993864774196e-05,"critic/score/max":0.9997167587280273,"actor/lr":1e-06,"timing_per_token_ms/reward":0.00020019505992198455,"critic/rewards/mean":0.9458340406417847,"response_length/clip_ratio":0.01806640625,"critic/returns/mean":0.011055756360292435,"actor/kl_coef":0.01,"timing_per_token_ms/gen":0.04008148575558264,"perf/time_per_step":1183.4929371809994,"prompt_length/min":138,"global_seqlen/min":1109211,"global_seqlen/balanced_max":1125321,"_timestamp":1.7500469249759648e+09,"val/accuracy_reward":0,"critic/score/min":0,"timing_s/ref":98.26504057302373,"response_length/min":1542,"global_seqlen/balanced_min":1125320,"critic/rewards/min":0,"timing_s/update_actor":435.6776136950066,"perf/max_memory_allocated_gb":38.11449068784714,"perf/mfu_actor":0.42861793360235734,"response_length/mean":4081.07763671875,"reward/accuracy":0,"val/format_reward":0,"actor/entropy_loss":4.0293768817791715,"timing_s/reward":1.6732397199957632,"timing_s/adv":209.7606701029872,"timing_per_token_ms/update_actor":0.04839482046565236,"timing_per_token_ms/ref":0.01091522457683722,"timing_per_token_ms/old":0.011408430087662756,"timing_s/validation":2072.6338901569834,"val/reward_score":0.001045167911797762,"global_seqlen/minmax_diff":37606,"critic/advantages/mean":0.011055756360292435,"actor/grad_norm":0.12824900448322296,"global_seqlen/mean":1.125320875e+06}
EasyR1/verl.egg-info/PKG-INFO ADDED
@@ -0,0 +1,264 @@
+ Metadata-Version: 2.4
+ Name: verl
+ Version: 0.3.1.dev0
+ Summary: An Efficient, Scalable, Multi-Modality RL Training Framework based on veRL
+ Home-page: https://github.com/volcengine/verl
+ Author: verl
+ Author-email: zhangchi.usc1992@bytedance.com, gmsheng@connect.hku.hk, hiyouga@buaa.edu.cn
+ License: Apache 2.0 License
+ Platform: UNKNOWN
+ Requires-Python: >=3.9.0
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: accelerate
+ Requires-Dist: codetiming
+ Requires-Dist: datasets
+ Requires-Dist: liger-kernel
+ Requires-Dist: mathruler
+ Requires-Dist: numpy
+ Requires-Dist: omegaconf
+ Requires-Dist: pandas
+ Requires-Dist: peft
+ Requires-Dist: pillow
+ Requires-Dist: pyarrow>=15.0.0
+ Requires-Dist: pylatexenc
+ Requires-Dist: qwen-vl-utils
+ Requires-Dist: ray[default]
+ Requires-Dist: tensordict
+ Requires-Dist: torchdata
+ Requires-Dist: transformers>=4.51.0
+ Requires-Dist: vllm>=0.7.3
+ Requires-Dist: wandb
+ Provides-Extra: dev
+ Requires-Dist: pre-commit; extra == "dev"
+ Requires-Dist: ruff; extra == "dev"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # EasyR1: An Efficient, Scalable, Multi-Modality RL Training Framework
+
+ [![GitHub Repo stars](https://img.shields.io/github/stars/hiyouga/EasyR1)](https://github.com/hiyouga/EasyR1/stargazers)
+ [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
+
+ This project is a clean fork of the original [veRL](https://github.com/volcengine/verl) project to support vision language models; we thank all the authors for providing such a high-performance RL training framework.
+
+ EasyR1 is efficient and scalable due to the design of **[HybridEngine](https://arxiv.org/abs/2409.19256)** and the latest release of **[vLLM](https://github.com/vllm-project/vllm)**'s SPMD mode.
+
+ ## Features
+
+ - Supported models
+   - Llama3/Qwen2/Qwen2.5/Qwen3 language models
+   - Qwen2/Qwen2.5-VL vision language models
+   - DeepSeek-R1 distill models
+
+ - Supported algorithms
+   - GRPO
+   - Reinforce++
+   - ReMax
+   - RLOO
+
+ - Supported datasets
+   - Any text, vision-text dataset in a [specific format](#custom-dataset)
+
+ - Supported tricks
+   - Padding-free training
+   - Resuming from checkpoint
+   - Wandb & SwanLab & Mlflow & Tensorboard tracking
+
+ ## Requirements
+
+ ### Software Requirements
+
+ - Python 3.9+
+ - transformers>=4.51.0
+ - flash-attn>=2.4.3
+ - vllm>=0.8.3
+
+ We provide a [Dockerfile](./Dockerfile) to easily build environments.
+
+ We recommend using the [pre-built docker image](https://hub.docker.com/r/hiyouga/verl) in EasyR1.
+
+ ```bash
+ docker pull hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0
+ ```
+
+ ### Hardware Requirements
+
+ \* *estimated*
+
+ | Method                   | Bits | 1.5B   | 3B     | 7B     | 32B     |
+ | ------------------------ | ---- | ------ | ------ | ------ | ------- |
+ | GRPO Full Fine-Tuning    | AMP  | 2*24GB | 4*40GB | 8*40GB | 16*80GB |
+ | GRPO Full Fine-Tuning    | BF16 | 1*24GB | 1*40GB | 4*40GB | 8*80GB  |
+
+ > [!NOTE]
+ > Use `worker.actor.fsdp.torch_dtype=bf16` and `worker.actor.optim.strategy=adamw_bf16` to enable bf16 training.
+ >
+ > We are working hard to reduce the VRAM usage in RL training; LoRA support will be integrated in the next updates.
+
+ ## Tutorial: Run Qwen2.5-VL GRPO on [Geometry3K](https://huggingface.co/datasets/hiyouga/geometry3k) Dataset in Just 3 Steps
+
+ ![image](assets/qwen2_5_vl_7b_geo.png)
+
+ ### Installation
+
+ ```bash
+ git clone https://github.com/hiyouga/EasyR1.git
+ cd EasyR1
+ pip install -e .
+ ```
+
+ ### GRPO Training
+
+ ```bash
+ bash examples/qwen2_5_vl_7b_geo3k_grpo.sh
+ ```
+
+ ### Merge Checkpoint in Hugging Face Format
+
+ ```bash
+ python3 scripts/model_merger.py --local_dir checkpoints/easy_r1/exp_name/global_step_1/actor
+ ```
+
+ > [!TIP]
+ > If you encounter issues with connecting to Hugging Face, consider using `export HF_ENDPOINT=https://hf-mirror.com`.
+ >
+ > If you want to use the SwanLab logger, consider using `bash examples/qwen2_5_vl_7b_geo3k_swanlab.sh`.
+
+ ## Custom Dataset
+
+ Please refer to the example datasets to prepare your own dataset.
+
+ - Text dataset: https://huggingface.co/datasets/hiyouga/math12k
+ - Image-text dataset: https://huggingface.co/datasets/hiyouga/geometry3k
+ - Multi-image-text dataset: https://huggingface.co/datasets/hiyouga/journeybench-multi-image-vqa
+
+ ## How to Understand GRPO in EasyR1
+
+ ![image](assets/easyr1_grpo.png)
+
+ - To learn about the GRPO algorithm, you can refer to [Hugging Face's blog](https://huggingface.co/docs/trl/v0.16.1/en/grpo_trainer).
+
+ ## How to Run 70B+ Model in Multi-node Environment
+
+ 1. Start the Ray head node.
+
+ ```bash
+ ray start --head --port=6379 --dashboard-host=0.0.0.0
+ ```
+
+ 2. Start the Ray worker node and connect to the head node.
+
+ ```bash
+ ray start --address=<head_node_ip>:6379
+ ```
+
+ 3. Check the Ray resource pool.
+
+ ```bash
+ ray status
+ ```
+
+ 4. Run the training script on the Ray head node only.
+
+ ```bash
+ bash examples/qwen2_5_vl_7b_geo3k_grpo.sh
+ ```
+
+ See the **[veRL's official doc](https://verl.readthedocs.io/en/latest/start/multinode.html)** for more details about multi-node training and the Ray debugger.
+
+ ## Other Baselines
+
+ We also reproduced the following two baselines of the [R1-V](https://github.com/deep-agent/R1-V) project.
+ - [CLEVR-70k-Counting](examples/baselines/qwen2_5_vl_3b_clevr.sh): Train the Qwen2.5-VL-3B-Instruct model on the counting problem.
+ - [GeoQA-8k](examples/baselines/qwen2_5_vl_3b_geoqa8k.sh): Train the Qwen2.5-VL-3B-Instruct model on the GeoQA problem.
+
+ ## Performance Baselines
+
+ See [baselines.md](assets/baselines.md).
+
+ ## Awesome Work using EasyR1
+
+ - **MMR1**: Advancing the Frontiers of Multimodal Reasoning. [![[code]](https://img.shields.io/github/stars/LengSicong/MMR1)](https://github.com/LengSicong/MMR1)
+ - **Vision-R1**: Incentivizing Reasoning Capability in Multimodal Large Language Models. [![[code]](https://img.shields.io/github/stars/Osilly/Vision-R1)](https://github.com/Osilly/Vision-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.06749-blue)](https://arxiv.org/abs/2503.06749)
+ - **Seg-Zero**: Reasoning-Chain Guided Segmentation via Cognitive Reinforcement. [![[code]](https://img.shields.io/github/stars/dvlab-research/Seg-Zero)](https://github.com/dvlab-research/Seg-Zero) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.06520-blue)](https://arxiv.org/abs/2503.06520)
+ - **MetaSpatial**: Reinforcing 3D Spatial Reasoning in VLMs for the Metaverse. [![[code]](https://img.shields.io/github/stars/PzySeere/MetaSpatial)](https://github.com/PzySeere/MetaSpatial) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.18470-blue)](https://arxiv.org/abs/2503.18470)
+ - **Temporal-R1**: Envolving Temporal Reasoning Capability into LMMs via Temporal Consistent Reward. [![[code]](https://img.shields.io/github/stars/appletea233/Temporal-R1)](https://github.com/appletea233/Temporal-R1)
+ - **NoisyRollout**: Reinforcing Visual Reasoning with Data Augmentation. [![[code]](https://img.shields.io/github/stars/John-AI-Lab/NoisyRollout)](https://github.com/John-AI-Lab/NoisyRollout) [![[arxiv]](https://img.shields.io/badge/arxiv-2504.13055-blue)](https://arxiv.org/pdf/2504.13055)
+ - **GUI-R1**: A Generalist R1-Style Vision-Language Action Model For GUI Agents. [![[code]](https://img.shields.io/github/stars/ritzz-ai/GUI-R1)](https://github.com/ritzz-ai/GUI-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2504.10458-blue)](https://arxiv.org/abs/2504.10458)
+ - **R1-Track**: Direct Application of MLLMs to Visual Object Tracking via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/Wangbiao2/R1-Track)](https://github.com/Wangbiao2/R1-Track)
+ - **VisionReasoner**: Unified Visual Perception and Reasoning via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/dvlab-research/VisionReasoner)](https://github.com/dvlab-research/VisionReasoner) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.12081-blue)](https://arxiv.org/abs/2505.12081)
+ - **MM-UPT**: Unsupervised Post-Training for Multi-Modal LLM Reasoning via GRPO. [![[code]](https://img.shields.io/github/stars/waltonfuture/MM-UPT)](https://github.com/waltonfuture/MM-UPT) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22453-blue)](https://arxiv.org/pdf/2505.22453)
+ - **RL-with-Cold-Start**: Advancing Multimodal Reasoning via Reinforcement Learning with Cold Start. [![[code]](https://img.shields.io/github/stars/waltonfuture/RL-with-Cold-Start)](https://github.com/waltonfuture/RL-with-Cold-Start) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22334-blue)](https://arxiv.org/pdf/2505.22334)
+ - **ViGoRL**: Grounded Reinforcement Learning for Visual Reasoning. [![[code]](https://img.shields.io/github/stars/Gabesarch/grounded-rl)](https://github.com/Gabesarch/grounded-rl) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22334-blue)](https://arxiv.org/abs/2505.23678)
+ - **Revisual-R1**: Advancing Multimodal Reasoning: From Optimized Cold Start to Staged Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/CSfufu/Revisual-R1)](https://github.com/CSfufu/Revisual-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2506.04207-blue)](https://arxiv.org/abs/2506.04207)
+ - **SophiaVL-R1**: Reinforcing MLLMs Reasoning with Thinking Reward. [![[code]](https://img.shields.io/github/stars/kxfan2002/SophiaVL-R1)](https://github.com/kxfan2002/SophiaVL-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.17018-blue)](https://arxiv.org/abs/2505.17018)
+ - **Vision-Matters**: Simple Visual Perturbations Can Boost Multimodal Math Reasoning. [![[code]](https://img.shields.io/github/stars/YutingLi0606/Vision-Matters)](https://github.com/YutingLi0606/Vision-Matters) [![[arxiv]](https://img.shields.io/badge/arxiv-2506.09736-blue)](https://arxiv.org/abs/2506.09736)
+
+ ## TODO
+
+ - Support LoRA (high priority).
+ - Support ulysses parallelism for VLMs (middle priority).
+ - Support more VLM architectures.
+
+ > [!NOTE]
+ > We will not provide scripts for supervised fine-tuning and inference in this project. If you have such requirements, we recommend using [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory).
+
+ ### Known bugs
+
+ These features are temporarily disabled for now; we plan to fix them one by one in future updates.
+
+ - Vision language models are not compatible with ulysses parallelism yet.
+
+ ## Discussion Group
+
+ 👋 Join our [WeChat group](assets/wechat.jpg).
+
+ ## FAQs
+
+ > ValueError: Image features and image tokens do not match: tokens: 8192, features 9800
+
+ Increase the `data.max_prompt_length` or reduce the `data.max_pixels`.
+
+ > RuntimeError: CUDA Error: out of memory at /workspace/csrc/cumem_allocator.cpp:62
+
+ Reduce the `worker.rollout.gpu_memory_utilization` and enable `worker.actor.offload.offload_params`.
+
+ > RuntimeError: 0 active drivers ([]). There should only be one.
+
+ Uninstall `deepspeed` from the current python environment.
+
+ ## Citation
+
+ Core contributors: [Yaowei Zheng](https://github.com/hiyouga), [Junting Lu](https://github.com/AL-377), [Shenzhi Wang](https://github.com/Shenzhi-Wang), [Zhangchi Feng](https://github.com/BUAADreamer), [Dongdong Kuang](https://github.com/Kuangdd01) and Yuwen Xiong
+
+ We also thank Guangming Sheng and Chi Zhang for helpful discussions.
+
+ ```bibtex
+ @misc{zheng2025easyr1,
+   title = {EasyR1: An Efficient, Scalable, Multi-Modality RL Training Framework},
+   author = {Yaowei Zheng, Junting Lu, Shenzhi Wang, Zhangchi Feng, Dongdong Kuang, Yuwen Xiong},
+   howpublished = {\url{https://github.com/hiyouga/EasyR1}},
+   year = {2025}
+ }
+ ```
+
+ We also recommend citing the original work.
+
+ ```bibtex
+ @article{sheng2024hybridflow,
+   title = {HybridFlow: A Flexible and Efficient RLHF Framework},
+   author = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
+   year = {2024},
+   journal = {arXiv preprint arXiv: 2409.19256}
+ }
+ ```
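As a quick aside on the GRPO algorithm the README above points to: the core idea is to sample a group of responses per prompt, score each one, and use the group-normalized reward as the advantage. Below is a minimal illustrative sketch of that normalization step only; it is not EasyR1's actual implementation (which lives in `verl/trainer/core_algos.py`), and the function name and tensor shapes are chosen here for illustration.

```python
# Sketch of GRPO's group-relative advantage: normalize rewards within each
# group of responses sampled for the same prompt.
import torch

def grpo_advantages(rewards: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """rewards: (num_prompts, group_size) scalar reward per sampled response."""
    mean = rewards.mean(dim=-1, keepdim=True)
    std = rewards.std(dim=-1, keepdim=True)
    return (rewards - mean) / (std + eps)

rewards = torch.tensor([[0.0, 1.0, 1.0, 0.0],   # mixed group: nonzero advantages
                        [1.0, 1.0, 1.0, 1.0]])  # all-equal group: ~zero advantage
print(grpo_advantages(rewards))
```

Note how a group where every response gets the same reward yields (near-)zero advantages, which is why reward functions that discriminate within a group matter for GRPO.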
EasyR1/verl.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,71 @@
+ LICENSE
+ README.md
+ pyproject.toml
+ setup.py
+ ./verl/__init__.py
+ ./verl/protocol.py
+ ./verl/models/__init__.py
+ ./verl/models/monkey_patch.py
+ ./verl/models/transformers/__init__.py
+ ./verl/models/transformers/flash_attention_utils.py
+ ./verl/models/transformers/qwen2_vl.py
+ ./verl/single_controller/__init__.py
+ ./verl/single_controller/base/__init__.py
+ ./verl/single_controller/base/decorator.py
+ ./verl/single_controller/base/worker.py
+ ./verl/single_controller/base/worker_group.py
+ ./verl/single_controller/base/register_center/__init__.py
+ ./verl/single_controller/base/register_center/ray.py
+ ./verl/single_controller/ray/__init__.py
+ ./verl/single_controller/ray/base.py
+ ./verl/trainer/__init__.py
+ ./verl/trainer/config.py
+ ./verl/trainer/core_algos.py
+ ./verl/trainer/data_loader.py
+ ./verl/trainer/main.py
+ ./verl/trainer/metrics.py
+ ./verl/trainer/ray_trainer.py
+ ./verl/utils/__init__.py
+ ./verl/utils/dataset.py
+ ./verl/utils/flops_counter.py
+ ./verl/utils/fsdp_utils.py
+ ./verl/utils/model_utils.py
+ ./verl/utils/py_functional.py
+ ./verl/utils/seqlen_balancing.py
+ ./verl/utils/tokenizer.py
+ ./verl/utils/torch_dtypes.py
+ ./verl/utils/torch_functional.py
+ ./verl/utils/ulysses.py
+ ./verl/utils/checkpoint/__init__.py
+ ./verl/utils/checkpoint/checkpoint_manager.py
+ ./verl/utils/checkpoint/fsdp_checkpoint_manager.py
+ ./verl/utils/logger/__init__.py
+ ./verl/utils/logger/gen_logger.py
+ ./verl/utils/logger/logger.py
+ ./verl/workers/__init__.py
+ ./verl/workers/config.py
+ ./verl/workers/fsdp_workers.py
+ ./verl/workers/actor/__init__.py
+ ./verl/workers/actor/base.py
+ ./verl/workers/actor/config.py
+ ./verl/workers/actor/dp_actor.py
+ ./verl/workers/critic/__init__.py
+ ./verl/workers/critic/base.py
+ ./verl/workers/critic/config.py
+ ./verl/workers/critic/dp_critic.py
+ ./verl/workers/reward/__init__.py
+ ./verl/workers/reward/config.py
+ ./verl/workers/reward/function.py
+ ./verl/workers/rollout/__init__.py
+ ./verl/workers/rollout/base.py
+ ./verl/workers/rollout/config.py
+ ./verl/workers/rollout/vllm_rollout_spmd.py
+ ./verl/workers/sharding_manager/__init__.py
+ ./verl/workers/sharding_manager/base.py
+ ./verl/workers/sharding_manager/fsdp_ulysses.py
+ ./verl/workers/sharding_manager/fsdp_vllm.py
+ verl.egg-info/PKG-INFO
+ verl.egg-info/SOURCES.txt
+ verl.egg-info/dependency_links.txt
+ verl.egg-info/requires.txt
+ verl.egg-info/top_level.txt
EasyR1/verl.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
EasyR1/verl.egg-info/requires.txt ADDED
@@ -0,0 +1,23 @@
+ accelerate
+ codetiming
+ datasets
+ liger-kernel
+ mathruler
+ numpy
+ omegaconf
+ pandas
+ peft
+ pillow
+ pyarrow>=15.0.0
+ pylatexenc
+ qwen-vl-utils
+ ray[default]
+ tensordict
+ torchdata
+ transformers>=4.51.0
+ vllm>=0.7.3
+ wandb
+
+ [dev]
+ pre-commit
+ ruff
EasyR1/verl.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ verl
EasyR1/verl/workers/sharding_manager/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (373 Bytes). View file
 
EasyR1/verl/workers/sharding_manager/__pycache__/base.cpython-310.pyc ADDED
Binary file (1.05 kB). View file
 
EasyR1/verl/workers/sharding_manager/__pycache__/fsdp_ulysses.cpython-310.pyc ADDED
Binary file (2.4 kB). View file
 
EasyR1/verl/workers/sharding_manager/__pycache__/fsdp_vllm.cpython-310.pyc ADDED
Binary file (5 kB). View file
 
EasyR1/verl/workers/sharding_manager/fsdp_ulysses.py ADDED
@@ -0,0 +1,65 @@
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ Contains a resharding manager that binds weights from FSDP zero3 to XPerfGPT
+ """
+
+ from torch.distributed.device_mesh import DeviceMesh
+
+ from ...protocol import DataProto, all_gather_data_proto
+ from ...utils.ulysses import get_ulysses_sequence_parallel_group, set_ulysses_sequence_parallel_group
+ from .base import BaseShardingManager
+
+
+ class FSDPUlyssesShardingManager(BaseShardingManager):
+     """
+     Sharding manager to support data resharding when using FSDP + Ulysses
+     """
+
+     def __init__(self, device_mesh: DeviceMesh):
+         super().__init__()
+         self.device_mesh = device_mesh
+
+     def __enter__(self):
+         if self.device_mesh is not None:
+             self.prev_sp_group = get_ulysses_sequence_parallel_group()
+             set_ulysses_sequence_parallel_group(self.device_mesh["sp"].get_group())
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         if self.device_mesh is not None:
+             set_ulysses_sequence_parallel_group(self.prev_sp_group)
+
+     def preprocess_data(self, data: DataProto) -> DataProto:
+         """
+         AllGather data from sp region
+         This is because the data is first sharded along the FSDP dimension as we utilize the DP_COMPUTE
+         In Ulysses, we need to make sure the same data is used across a SP group
+         """
+         if self.device_mesh is not None:
+             sp_size = self.device_mesh["sp"].size()
+             sp_group = self.device_mesh["sp"].get_group()
+             all_gather_data_proto(data, size=sp_size, group=sp_group)
+
+         return data
+
+     def postprocess_data(self, data: DataProto) -> DataProto:
+         """
+         Split the data to follow FSDP partition
+         """
+         if self.device_mesh is not None:
+             sp_size = self.device_mesh["sp"].size()
+             sp_rank = self.device_mesh["sp"].get_local_rank()
+             data = data.chunk(chunks=sp_size)[sp_rank]
+
+         return data
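To make the data flow of this manager concrete, here is a hypothetical usage sketch: inside the context, the Ulysses sequence-parallel group is taken from the mesh's `"sp"` sub-mesh, so `preprocess_data`/`postprocess_data` gather and re-chunk batches around a sequence-parallel compute step. The mesh shape, `batch`, and `compute_log_probs` are illustrative placeholders, not taken from a specific EasyR1 script.

```python
# Hypothetical usage of FSDPUlyssesShardingManager with a (dp, sp) mesh.
from torch.distributed.device_mesh import init_device_mesh

device_mesh = init_device_mesh("cuda", mesh_shape=(4, 2), mesh_dim_names=("dp", "sp"))
manager = FSDPUlyssesShardingManager(device_mesh)

with manager:  # __enter__ swaps in the "sp" process group
    batch = manager.preprocess_data(batch)      # all-gather so every sp rank sees the same data
    output = compute_log_probs(batch)           # hypothetical sequence-parallel compute step
    output = manager.postprocess_data(output)   # keep only this rank's FSDP chunk
# __exit__ restores the previous sequence-parallel group
```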
EasyR1/verl/workers/sharding_manager/fsdp_vllm.py ADDED
@@ -0,0 +1,149 @@
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import inspect
+ import re
+ from typing import Dict, Iterable, Tuple, Union
+
+ import torch
+ import torch.distributed as dist
+ from torch.distributed._tensor import DTensor
+ from torch.distributed.checkpoint.state_dict import get_model_state_dict
+ from torch.distributed.device_mesh import DeviceMesh
+ from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
+ from transformers import PreTrainedModel
+ from vllm import LLM
+ from vllm.distributed import parallel_state as vllm_ps
+
+ from ...protocol import DataProto, all_gather_data_proto
+ from ...utils.model_utils import print_gpu_memory_usage
+ from .base import BaseShardingManager
+
+
+ class FSDPVLLMShardingManager(BaseShardingManager):
+     def __init__(
+         self,
+         module: FSDP,
+         inference_engine: LLM,
+         device_mesh: DeviceMesh,
+     ):
+         self.module = module
+         self.inference_engine = inference_engine
+         self.device_mesh = device_mesh
+
+         self.world_size = dist.get_world_size()
+         self.tp_size = vllm_ps.get_tensor_model_parallel_world_size()
+         self.tp_rank = vllm_ps.get_tensor_model_parallel_rank()
+         self.tp_group = vllm_ps.get_tensor_model_parallel_group().device_group
+
+         # Record freed bytes to estimate memory usage correctly
+         # https://github.com/vllm-project/vllm/pull/11743#issuecomment-2754338119
+         self.freed_bytes = 0
+
+         # Note that torch_random_states may be different on each dp rank
+         self.torch_random_states = torch.cuda.get_rng_state()
+         # get a random rng states
+         gen_dp_rank = self.device_mesh["dp"].get_local_rank()
+         torch.cuda.manual_seed(gen_dp_rank + 1000)  # make sure all tp ranks have the same random states
+         self.gen_random_states = torch.cuda.get_rng_state()
+         torch.cuda.set_rng_state(self.torch_random_states)
+
+     def _rename_weight_keys(self, actor_weights: Dict[str, Union[torch.Tensor, DTensor]], model: PreTrainedModel):
+         # convert state dict keys: https://github.com/huggingface/transformers/pull/38385
+         if not hasattr(model, "_checkpoint_conversion_mapping"):
+             return actor_weights
+
+         reverse_key_mapping = {v: k for k, v in model._checkpoint_conversion_mapping.items()}
+         original_weights = {}
+         for key, value in actor_weights.items():
+             for pattern, replacement in reverse_key_mapping.items():
+                 replacement = replacement.lstrip("^")  # strip off un-needed chars and patterns
+                 replacement = re.sub(r"\(.*\)", "", replacement)
+                 key, n_replace = re.subn(pattern, replacement, key)
+                 # Early exit of the loop
+                 if n_replace > 0:
+                     break
+
+             original_weights[key] = value
+
+         return original_weights
+
+     def _make_weight_iterator(
+         self, actor_weights: Dict[str, Union[torch.Tensor, DTensor]]
+     ) -> Iterable[Tuple[str, torch.Tensor]]:
+         for name, tensor in actor_weights.items():
+             yield name, tensor.full_tensor() if self.world_size != 1 else tensor
+
+     def __enter__(self):
+         # NOTE: Basically, we only need `torch.cuda.empty_cache()` before vllm wake_up and
+         # after vllm sleep, since vllm has its own caching memory allocator CuMemAllocator.
+         # Out of vllm scope, we should avoid empty cache to let pytorch using caching memory
+         # to speed up memory allocations.
+         #
+         # pytorch: https://pytorch.org/docs/stable/notes/cuda.html#memory-management
+         # vllm: https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/device_allocator/cumem.py#L103
+         torch.cuda.empty_cache()
+         print_gpu_memory_usage("Before state_dict() in sharding manager")
+         actor_weights = get_model_state_dict(self.module)
+         actor_weights = self._rename_weight_keys(actor_weights, self.module._fsdp_wrapped_module)
+         print_gpu_memory_usage("After state_dict() in sharding manager")
+
+         if "tags" in inspect.signature(self.inference_engine.wake_up).parameters:
+             self.inference_engine.wake_up(tags=["weights"])
+         else:
+             self.inference_engine.wake_up()
+
+         model = self.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
+         model.load_weights(self._make_weight_iterator(actor_weights))
+         print_gpu_memory_usage("After sync model weights in sharding manager")
+
+         del actor_weights
+         torch.cuda.empty_cache()
+
+         if "tags" in inspect.signature(self.inference_engine.wake_up).parameters:
+             self.inference_engine.wake_up(tags=["kv_cache"])
+
+         print_gpu_memory_usage("After del state_dict and empty_cache in sharding manager")
+         # important: need to manually set the random states of each tp to be identical.
+         if self.device_mesh is not None:
+             self.torch_random_states = torch.cuda.get_rng_state()
+             torch.cuda.set_rng_state(self.gen_random_states)
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         print_gpu_memory_usage("Before vllm offload in sharding manager")
+         free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
+         self.inference_engine.sleep(level=1)
+         free_bytes_after_sleep = torch.cuda.mem_get_info()[0]
+         self.freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
+         print_gpu_memory_usage("After vllm offload in sharding manager")
+
+         self.module.train()
+         torch.cuda.empty_cache()  # add empty cache after each compute
+
+         # restore random states
+         if self.device_mesh is not None:
+             self.gen_random_states = torch.cuda.get_rng_state()
+             torch.cuda.set_rng_state(self.torch_random_states)
+
+     def preprocess_data(self, data: DataProto) -> DataProto:
+         """All gather across tp group to make each rank has identical input."""
+         all_gather_data_proto(data, size=self.tp_size, group=self.tp_group)
+         return data
+
+     def postprocess_data(self, data: DataProto) -> DataProto:
+         """Get chunk data of this tp rank since we do all gather in preprocess."""
+         if self.tp_size > 1:
+             data = data.chunk(chunks=self.tp_size)[self.tp_rank]
+
+         return data
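A hypothetical usage sketch of the manager above, to make the enter/exit flow concrete: entering the context copies the FSDP state dict into the vLLM engine and wakes it up, and exiting puts vLLM back to sleep and restores the RNG state. Here `actor_module`, `llm`, `rollout_mesh`, `batch`, and `prompts` are placeholders for objects an EasyR1 worker would already hold; this is not a complete worker.

```python
# Hypothetical rollout step wrapped in the FSDP->vLLM weight-sync context.
manager = FSDPVLLMShardingManager(
    module=actor_module,      # FSDP-wrapped policy model
    inference_engine=llm,     # vllm.LLM instance (created with enable_sleep_mode=True)
    device_mesh=rollout_mesh, # mesh with a "dp" dimension, used for RNG seeding
)

with manager:                                   # __enter__: sync weights, wake_up()
    batch = manager.preprocess_data(batch)      # all-gather across tp ranks
    sequences = llm.generate(prompts)           # rollout with the freshly synced weights
    batch = manager.postprocess_data(batch)     # take this tp rank's chunk back
# __exit__: llm.sleep(level=1), module back to train mode, RNG restored
```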
LAVIS-main/lavis/models/alpro_models/alpro_retrieval.py ADDED
@@ -0,0 +1,422 @@
1
+ """
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+
8
+ import datetime
9
+ import logging
10
+ import time
11
+
12
+ import lavis.common.dist_utils as dist_utils
13
+ import numpy as np
14
+ import torch
15
+ import torch.distributed as dist
16
+ import torch.nn.functional as F
17
+ from lavis.common.config import node_to_dict
18
+ from lavis.common.dist_utils import get_rank
19
+ from lavis.common.logger import MetricLogger
20
+ from lavis.common.registry import registry
21
+ from lavis.models.alpro_models import AlproBase
22
+ from lavis.models.alpro_models.alpro_outputs import AlproIntermediateOutput, AlproOutput
23
+ from lavis.models.base_model import all_gather_with_grad
24
+ from lavis.models.med import XBertEncoder
25
+ from lavis.models.timesformer.vit import TimeSformer
26
+ from torch import nn
27
+
28
+
29
+ @registry.register_model("alpro_retrieval")
30
+ class AlproRetrieval(AlproBase):
31
+ PRETRAINED_MODEL_CONFIG_DICT = {
32
+ "msrvtt": "configs/models/alpro_retrieval_msrvtt.yaml",
33
+ "didemo": "configs/models/alpro_retrieval_didemo.yaml",
34
+ }
35
+
36
+ def __init__(
37
+ self,
38
+ visual_encoder,
39
+ text_encoder,
40
+ vision_width=768,
41
+ text_width=768,
42
+ embed_dim=256,
43
+ max_txt_len=35,
44
+ temp=0.07,
45
+ ):
46
+ super().__init__()
47
+
48
+ self.temp = nn.Parameter(torch.ones([]) * temp)
49
+
50
+ self.tokenizer = self.init_tokenizer()
51
+
52
+ self.visual_encoder = visual_encoder
53
+ self.text_encoder = text_encoder
54
+
55
+ vision_width = vision_width
56
+ text_width = text_width
57
+
58
+ self.vision_proj = nn.Linear(vision_width, embed_dim)
59
+ self.text_proj = nn.Linear(text_width, embed_dim)
60
+
61
+ self.itm_head = nn.Linear(text_width, 2)
62
+
63
+ self.max_txt_len = max_txt_len
64
+
65
+ def forward(self, samples):
66
+ with torch.no_grad():
67
+ self.temp.clamp_(0.001, 0.5)
68
+
69
+ visual_inputs = samples["video"]
70
+ caption = samples["text_input"]
71
+
72
+ b, t, c, h, w = visual_inputs.shape
73
+
74
+ # forward text
75
+ text = self.tokenizer(
76
+ caption,
77
+ padding="max_length",
78
+ truncation=True,
79
+ max_length=self.max_txt_len,
80
+ return_tensors="pt",
81
+ ).to(self.device)
82
+
83
+ text_output = self.text_encoder.forward_text(
84
+ text,
85
+ token_type_ids=torch.zeros(
86
+ text.input_ids.shape, dtype=torch.long, device=self.device
87
+ ),
88
+ )
89
+ text_embeds = text_output.last_hidden_state
90
+ text_feat = F.normalize(self.text_proj(text_embeds[:, 0, :]), dim=-1)
91
+
92
+ # forward visual
93
+ # timeSformer asks for (b, c, t, h, w) as input.
94
+ video_embeds = self.visual_encoder.forward_features(visual_inputs)
95
+ video_feat = F.normalize(self.vision_proj(video_embeds[:, 0, :]), dim=-1)
96
+ video_atts = torch.ones(video_embeds.size()[:-1], dtype=torch.long).to(
97
+ self.device
98
+ )
99
+
100
+ # ========== (in-batch) ITC loss ==========
101
+ gathered_video_feats = all_gather_with_grad(video_feat)
102
+ gathered_text_feats = all_gather_with_grad(text_feat)
103
+
104
+ sim_v2t = video_feat @ gathered_text_feats.t() / self.temp
105
+ sim_t2v = text_feat @ gathered_video_feats.t() / self.temp
106
+
107
+ sim_targets = torch.zeros_like(sim_v2t)
108
+
109
+ local_rank = get_rank()
110
+ b_start, b_end = b * local_rank, b * (local_rank + 1)
111
+ sim_targets[:, b_start:b_end] = torch.eye(b)
112
+
113
+ loss_v2t = -torch.sum(F.log_softmax(sim_v2t, dim=1) * sim_targets, dim=1).mean()
114
+ loss_t2v = -torch.sum(F.log_softmax(sim_t2v, dim=1) * sim_targets, dim=1).mean()
115
+
116
+ vtc_loss = (loss_v2t + loss_t2v) / 2
117
+
118
+ (
119
+ vtm_loss,
120
+ vtm_logits,
121
+ vtm_labels,
122
+ encoder_output,
123
+ encoder_output_neg,
124
+ ) = self.compute_vtm(
125
+ text_embeds=text_embeds,
126
+ text_atts=text.attention_mask,
127
+ image_embeds=video_embeds,
128
+ image_atts=video_atts,
129
+ sim_i2t=sim_v2t.clone(), # for hard mining
130
+ sim_t2i=sim_t2v.clone(), # for hard mining
131
+ )
132
+
133
+ loss = vtc_loss + vtm_loss
134
+
135
+ # return {"loss": loss}
136
+ return AlproOutput(
137
+ loss=loss,
138
+ loss_vtc=vtc_loss,
139
+ loss_vtm=vtm_loss,
140
+ intermediate_output=AlproIntermediateOutput(
141
+ video_embeds=video_embeds,
142
+ text_embeds=text_embeds,
143
+ encoder_output=encoder_output,
144
+ encoder_output_neg=encoder_output_neg,
145
+ vtm_logits=vtm_logits,
146
+ vtm_labels=vtm_labels,
147
+ ),
148
+ )
149
+
150
+ def compute_vtm(
151
+ self, text_embeds, text_atts, image_embeds, image_atts, sim_i2t, sim_t2i
152
+ ):
153
+ device = self.device
154
+
155
+ # ====== positive pairs =======
156
+ attention_mask = torch.cat([text_atts, image_atts], dim=1)
157
+ embedding_output_pos = torch.cat([text_embeds, image_embeds], dim=1)
158
+
159
+ encoder_outputs_pos = self.text_encoder(
160
+ encoder_embeds=embedding_output_pos,
161
+ attention_mask=attention_mask,
162
+ return_dict=True,
163
+ mode="fusion",
164
+ )
165
+
166
+ # ====== negative pairs =======
167
+ bs = text_embeds.shape[0]
168
+
169
+ local_rank = get_rank()
170
+ b_start, b_end = bs * local_rank, bs * (local_rank + 1)
171
+
172
+ with torch.no_grad():
173
+ weights_v2t = sim_i2t[:, b_start:b_end]
174
+ weights_t2v = sim_t2i[:, b_start:b_end]
175
+
176
+ # never select self as negative
177
+ weights_v2t.fill_diagonal_(-np.Inf)
178
+ weights_t2v.fill_diagonal_(-np.Inf)
179
+
180
+ weights_v2t = F.softmax(weights_v2t, dim=1)
181
+ weights_t2v = F.softmax(weights_t2v, dim=1)
182
+
183
+ # select a negative image for each text
184
+ # FIXME to optimize using indexing operations
185
+ image_embeds_neg = []
186
+ for b in range(bs):
187
+ neg_idx = torch.multinomial(weights_t2v[b], 1).item()
188
+ image_embeds_neg.append(image_embeds[neg_idx])
189
+ image_embeds_neg = torch.stack(image_embeds_neg, dim=0)
190
+
191
+ # select a negative text for each image
192
+ text_embeds_neg = []
193
+ text_atts_neg = []
194
+ for b in range(bs):
195
+ neg_idx = torch.multinomial(weights_v2t[b], 1).item()
196
+ text_embeds_neg.append(text_embeds[neg_idx])
197
+ text_atts_neg.append(text_atts[neg_idx])
198
+
199
+ text_embeds_neg = torch.stack(text_embeds_neg, dim=0)
200
+ text_atts_neg = torch.stack(text_atts_neg, dim=0)
201
+
202
+ text_embeds_all = torch.cat([text_embeds, text_embeds_neg], dim=0)
203
+ text_atts_all = torch.cat([text_atts, text_atts_neg], dim=0)
204
+
205
+ video_embeds_all = torch.cat([image_embeds_neg, image_embeds], dim=0)
206
+ video_atts_all = torch.cat([image_atts, image_atts], dim=0)
207
+
208
+ attention_mask_all = torch.cat([text_atts_all, video_atts_all], dim=1)
209
+ embedding_output_all = torch.cat([text_embeds_all, video_embeds_all], dim=1)
210
+
211
+ # forward negative pairs via cross encoder
212
+ encoder_outputs_neg = self.text_encoder(
213
+ encoder_embeds=embedding_output_all,
214
+ attention_mask=attention_mask_all,
215
+ return_dict=True,
216
+ mode="fusion",
217
+ )
218
+
219
+ vl_embeddings = torch.cat(
220
+ [
221
+ encoder_outputs_pos.last_hidden_state[:, 0, :],
222
+ encoder_outputs_neg.last_hidden_state[:, 0, :],
223
+ ],
224
+ dim=0,
225
+ )
226
+ vtm_logits = self.itm_head(vl_embeddings)
227
+
228
+ vtm_labels = torch.cat(
229
+ [torch.ones(bs, dtype=torch.long), torch.zeros(2 * bs, dtype=torch.long)],
230
+ dim=0,
231
+ ).to(device)
232
+ vtm_loss = F.cross_entropy(vtm_logits, vtm_labels)
233
+
234
+ return (
235
+ vtm_loss,
236
+ vtm_logits,
237
+ vtm_labels,
238
+ encoder_outputs_pos,
239
+ encoder_outputs_neg,
240
+ )
241
+
242
+ def compute_sim_matrix(self, data_loader, task_cfg):
243
+ k_test = task_cfg.get("k_test")
244
+
245
+ metric_logger = MetricLogger(delimiter=" ")
246
+ header = "Evaluation:"
247
+
248
+ logging.info("Computing features for evaluation...")
249
+ start_time = time.time()
250
+
251
+ texts = data_loader.dataset.text
252
+ num_text = len(texts)
253
+ text_bs = 256
254
+ text_ids = []
255
+ text_embeds = []
256
+ text_feats = []
257
+ text_atts = []
258
+ for i in range(0, num_text, text_bs):
259
+ text = texts[i : min(num_text, i + text_bs)]
260
+ text_input = self.tokenizer(
261
+ text,
262
+ padding="max_length",
263
+ truncation=True,
264
+ max_length=self.max_txt_len,
265
+ return_tensors="pt",
266
+ ).to(self.device)
267
+ text_output = self.text_encoder.forward_text(
268
+ text_input,
269
+ token_type_ids=torch.zeros(
270
+ text_input.input_ids.shape, dtype=torch.long, device=self.device
271
+ ),
272
+ )
273
+ text_feats.append(text_output.last_hidden_state.cpu())
274
+ text_embed = F.normalize(
275
+ self.text_proj(text_output.last_hidden_state[:, 0, :])
276
+ )
277
+ text_embeds.append(text_embed)
278
+ text_ids.append(text_input.input_ids)
279
+ text_atts.append(text_input.attention_mask)
280
+
281
+ text_embeds = torch.cat(text_embeds, dim=0)
282
+ text_ids = torch.cat(text_ids, dim=0)
283
+ text_atts = torch.cat(text_atts, dim=0)
284
+ text_feats = torch.cat(text_feats, dim=0)
285
+
286
+ video_feats = []
287
+ video_embeds = []
288
+ for samples in data_loader:
289
+ video = samples["video"]
290
+
291
+ video = video.to(self.device)
292
+ video_feat = self.visual_encoder.forward_features(video)
293
+ video_embed = self.vision_proj(video_feat[:, 0, :])
294
+ video_embed = F.normalize(video_embed, dim=-1)
295
+
296
+ video_feats.append(video_feat.cpu())
297
+ video_embeds.append(video_embed)
298
+
299
+ video_feats = torch.cat(video_feats, dim=0)
300
+ video_embeds = torch.cat(video_embeds, dim=0)
301
+
302
+ sims_matrix = video_embeds @ text_embeds.t()
303
+ score_matrix_v2t = torch.full(
304
+ (len(data_loader.dataset.image), len(texts)), -100.0
305
+ ).to(self.device)
306
+
307
+ num_tasks = dist_utils.get_world_size()
308
+ rank = dist_utils.get_rank()
309
+ step = sims_matrix.size(0) // num_tasks + 1
310
+ start = rank * step
311
+ end = min(sims_matrix.size(0), start + step)
312
+
313
+ # video-to-text
314
+ for i, sims in enumerate(
315
+ metric_logger.log_every(sims_matrix[start:end], 50, header)
316
+ ):
317
+ topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
318
+
319
+ video_feats_repeat = (
320
+ video_feats[start + i].repeat(k_test, 1, 1).to(self.device)
321
+ )
322
+ video_atts_repeat = torch.ones(
323
+ video_feats_repeat.size()[:-1], dtype=torch.long
324
+ ).to(self.device)
325
+
326
+ attention_mask = torch.cat([text_atts[topk_idx], video_atts_repeat], dim=1)
327
+ embedding_output = torch.cat(
328
+ [text_feats[topk_idx].to(self.device), video_feats_repeat], dim=1
329
+ )
330
+
331
+ output = self.text_encoder(
332
+ encoder_embeds=embedding_output,
333
+ attention_mask=attention_mask,
334
+ return_dict=True,
335
+ mode="fusion",
336
+ )
337
+
338
+ score = self.itm_head(output.last_hidden_state[:, 0, :])[:, 1]
339
+ score_matrix_v2t[start + i, topk_idx] = score + topk_sim
340
+
341
+ # text-to-video
342
+ sims_matrix = sims_matrix.t()
343
+ score_matrix_t2v = torch.full(
344
+ (len(texts), len(data_loader.dataset.image)), -100.0
345
+ ).to(self.device)
346
+
347
+ step = sims_matrix.size(0) // num_tasks + 1
348
+ start = rank * step
349
+ end = min(sims_matrix.size(0), start + step)
350
+
351
+ for i, sims in enumerate(
352
+ metric_logger.log_every(sims_matrix[start:end], 50, header)
353
+ ):
354
+
355
+ topk_sim, topk_idx = sims.topk(k=k_test, dim=0)
356
+
357
+ text_feats_repeat = (
358
+ text_feats[start + i].repeat(k_test, 1, 1).to(self.device)
359
+ )
360
+ text_atts_repeat = text_atts[start + i].repeat(k_test, 1).to(self.device)
361
+
362
+ video_atts = torch.ones(
363
+ video_feats[topk_idx].size()[:-1], dtype=torch.long
364
+ ).to(self.device)
365
+
366
+ embedding_output = torch.cat(
367
+ [text_feats_repeat, video_feats[topk_idx].to(self.device)], dim=1
368
+ )
369
+ attention_mask = torch.cat([text_atts_repeat, video_atts], dim=1)
370
+
371
+ output = self.text_encoder(
372
+ encoder_embeds=embedding_output,
373
+ attention_mask=attention_mask,
374
+ return_dict=True,
375
+ mode="fusion",
376
+ )
377
+
378
+ score = self.itm_head(output.last_hidden_state[:, 0, :])[:, 1]
379
+ score_matrix_t2v[start + i, topk_idx] = score + topk_sim
380
+
381
+ if dist_utils.is_dist_avail_and_initialized():
382
+ dist.barrier()
383
+ torch.distributed.all_reduce(
384
+ score_matrix_v2t, op=torch.distributed.ReduceOp.SUM
385
+ )
386
+ torch.distributed.all_reduce(
387
+ score_matrix_t2v, op=torch.distributed.ReduceOp.SUM
388
+ )
389
+
390
+ total_time = time.time() - start_time
391
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
392
+ logging.info("Evaluation time {}".format(total_time_str))
393
+
394
+ return score_matrix_v2t.cpu().numpy(), score_matrix_t2v.cpu().numpy()
395
+
396
+ @classmethod
397
+ def from_config(cls, cfg):
398
+ # vision encoder
399
+ visual_encoder_config = node_to_dict(cfg.timesformer)
400
+ visual_encoder = TimeSformer(**visual_encoder_config)
401
+
402
+ # text encoder
403
+ text_encoder = XBertEncoder.from_config(cfg)
404
+
405
+ max_txt_len = cfg.get("max_txt_len", 35)
406
+
407
+ model = cls(
408
+ visual_encoder=visual_encoder,
409
+ text_encoder=text_encoder,
410
+ max_txt_len=max_txt_len,
411
+ )
412
+
413
+ num_patches = (
414
+ visual_encoder_config["image_size"] // visual_encoder_config["patch_size"]
415
+ ) ** 2
416
+ num_frames = visual_encoder_config["n_frms"]
417
+
418
+ model.load_checkpoint_from_config(
419
+ cfg, num_frames=num_frames, num_patches=num_patches
420
+ )
421
+
422
+ return model
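A minimal loading sketch for the model registered above, assuming the standard LAVIS entry point `lavis.models.load_model` and the `"msrvtt"` config listed in `PRETRAINED_MODEL_CONFIG_DICT`:

```python
import torch
from lavis.models import load_model  # assumed LAVIS convenience loader

device = "cuda" if torch.cuda.is_available() else "cpu"
model = load_model("alpro_retrieval", model_type="msrvtt", is_eval=True, device=device)

# forward() expects samples["video"] of shape (b, t, c, h, w) and
# samples["text_input"] as a list of caption strings.
```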
LAVIS-main/lavis/models/beats/BEATs.py ADDED
@@ -0,0 +1,180 @@
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch.nn import LayerNorm
14
+ import torchaudio.compliance.kaldi as ta_kaldi
15
+
16
+ from lavis.models.beats.backbone import (
17
+ TransformerEncoder,
18
+ )
19
+
20
+ import logging
21
+ from typing import Optional
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class BEATsConfig:
27
+ def __init__(self, cfg=None):
28
+ self.input_patch_size: int = -1 # patch size of patch embedding
29
+ self.embed_dim: int = 512 # patch embedding dimension
30
+ self.conv_bias: bool = False # include bias in conv encoder
31
+
32
+ self.encoder_layers: int = 12 # num encoder layers in the transformer
33
+ self.encoder_embed_dim: int = 768 # encoder embedding dimension
34
+ self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
35
+ self.encoder_attention_heads: int = 12 # num encoder attention heads
36
+ self.activation_fn: str = "gelu" # activation function to use
37
+
38
+ self.layer_wise_gradient_decay_ratio: float = 1.0 # ratio for layer-wise gradient decay
39
+ self.layer_norm_first: bool = False # apply layernorm first in the transformer
40
+ self.deep_norm: bool = False # apply deep_norm first in the transformer
41
+
42
+ # dropouts
43
+ self.dropout: float = 0.1 # dropout probability for the transformer
44
+ self.attention_dropout: float = 0.1 # dropout probability for attention weights
45
+ self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
46
+ self.encoder_layerdrop: float = 0.0 # probability of dropping a transformer layer
47
+ self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
48
+
49
+ # positional embeddings
50
+ self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
51
+ self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
52
+
53
+ # relative position embedding
54
+ self.relative_position_embedding: bool = False # apply relative position embedding
55
+ self.num_buckets: int = 320 # number of buckets for relative position embedding
56
+ self.max_distance: int = 1280 # maximum distance for relative position embedding
57
+ self.gru_rel_pos: bool = False # apply gated relative position embedding
58
+
59
+ # label predictor
60
+ self.finetuned_model: bool = False # whether the model is a fine-tuned model.
61
+ self.predictor_dropout: float = 0.1 # dropout probability for the predictor
62
+ self.predictor_class: int = 527 # target class number for the predictor
63
+
64
+ if cfg is not None:
65
+ self.update(cfg)
66
+
67
+ def update(self, cfg: dict):
68
+ self.__dict__.update(cfg)
69
+
70
+
71
+ class BEATs(nn.Module):
72
+ def __init__(
73
+ self,
74
+ cfg: BEATsConfig,
75
+ ) -> None:
76
+ super().__init__()
77
+ logger.info(f"BEATs Config: {cfg.__dict__}")
78
+
79
+ self.cfg = cfg
80
+
81
+ self.embed = cfg.embed_dim
82
+ self.post_extract_proj = (
83
+ nn.Linear(self.embed, cfg.encoder_embed_dim)
84
+ if self.embed != cfg.encoder_embed_dim
85
+ else None
86
+ )
87
+
88
+ self.input_patch_size = cfg.input_patch_size
89
+ self.patch_embedding = nn.Conv2d(1, self.embed, kernel_size=self.input_patch_size, stride=self.input_patch_size,
90
+ bias=cfg.conv_bias)
91
+
92
+ self.dropout_input = nn.Dropout(cfg.dropout_input)
93
+
94
+ assert not cfg.deep_norm or not cfg.layer_norm_first
95
+ self.encoder = TransformerEncoder(cfg)
96
+ self.layer_norm = LayerNorm(self.embed)
97
+
98
+ if cfg.finetuned_model:
99
+ self.predictor_dropout = nn.Dropout(cfg.predictor_dropout)
100
+ self.predictor = nn.Linear(cfg.encoder_embed_dim, cfg.predictor_class)
101
+ else:
102
+ self.predictor = None
103
+
104
+ def forward_padding_mask(
105
+ self,
106
+ features: torch.Tensor,
107
+ padding_mask: torch.Tensor,
108
+ ) -> torch.Tensor:
109
+ extra = padding_mask.size(1) % features.size(1)
110
+ if extra > 0:
111
+ padding_mask = padding_mask[:, :-extra]
112
+ padding_mask = padding_mask.view(
113
+ padding_mask.size(0), features.size(1), -1
114
+ )
115
+ padding_mask = padding_mask.all(-1)
116
+ return padding_mask
117
+
118
+ def preprocess(
119
+ self,
120
+ source: torch.Tensor,
121
+ fbank_mean: float = 15.41663,
122
+ fbank_std: float = 6.55582,
123
+ ) -> torch.Tensor:
124
+ fbanks = []
125
+ for waveform in source:
126
+ waveform = waveform.unsqueeze(0) * 2 ** 15
127
+ fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25, frame_shift=10)
128
+ fbanks.append(fbank)
129
+ fbank = torch.stack(fbanks, dim=0)
130
+ fbank = (fbank - fbank_mean) / (2 * fbank_std)
131
+ return fbank
132
+
133
+ def extract_features(
134
+ self,
135
+ fbank: torch.Tensor,
136
+ padding_mask: Optional[torch.Tensor] = None,
137
+ fbank_mean: float = 15.41663,
138
+ fbank_std: float = 6.55582,
139
+ ):
140
+ ## NOTE: preprocessing is done separately in lavis.processors.audio_processors.BeatsAudioProcessor
141
+ # fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std)
142
+
143
+ if padding_mask is not None:
144
+ padding_mask = self.forward_padding_mask(fbank, padding_mask)
145
+
146
+ fbank = fbank.unsqueeze(1)
147
+ features = self.patch_embedding(fbank)
148
+ features = features.reshape(features.shape[0], features.shape[1], -1)
149
+ features = features.transpose(1, 2)
150
+ features = self.layer_norm(features)
151
+
152
+ if padding_mask is not None:
153
+ padding_mask = self.forward_padding_mask(features, padding_mask)
154
+
155
+ if self.post_extract_proj is not None:
156
+ features = self.post_extract_proj(features)
157
+
158
+ x = self.dropout_input(features)
159
+
160
+ x, layer_results = self.encoder(
161
+ x,
162
+ padding_mask=padding_mask,
163
+ )
164
+
165
+ if self.predictor is not None:
166
+ x = self.predictor_dropout(x)
167
+ logits = self.predictor(x)
168
+
169
+ if padding_mask is not None and padding_mask.any():
170
+ logits[padding_mask] = 0
171
+ logits = logits.sum(dim=1)
172
+ logits = logits / (~padding_mask).sum(dim=1).unsqueeze(-1).expand_as(logits)
173
+ else:
174
+ logits = logits.mean(dim=1)
175
+
176
+ lprobs = torch.sigmoid(logits)
177
+
178
+ return lprobs, padding_mask
179
+ else:
180
+ return x, padding_mask
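To make the tensor flow above concrete, here is a shape sketch of the fbank-to-patch-token pipeline in `extract_features` (the batch size, clip length, and `input_patch_size=16` are illustrative assumptions):

```python
import torch
import torch.nn as nn

patch, embed_dim = 16, 512                       # cfg.input_patch_size, cfg.embed_dim
fbank = torch.randn(2, 998, 128)                 # (B, frames, mel bins): ~10 s at a 10 ms shift
patch_embedding = nn.Conv2d(1, embed_dim, kernel_size=patch, stride=patch)

x = patch_embedding(fbank.unsqueeze(1))          # (2, 512, 62, 8): 62 time patches x 8 freq patches
x = x.reshape(x.shape[0], x.shape[1], -1).transpose(1, 2)
print(x.shape)                                   # torch.Size([2, 496, 512]): tokens fed to the encoder
```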
LAVIS-main/lavis/models/beats/LICENSE_BEATs.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) Microsoft Corporation
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
LAVIS-main/lavis/models/beats/README.md ADDED
@@ -0,0 +1,127 @@
1
+
2
+ # BEATs
3
+
4
+ [**BEATs**](https://arxiv.org/abs/2212.09058): **Audio Pre-Training with Acoustic Tokenizers**
5
+
6
+ Official PyTorch implementation and pretrained models of BEATs
7
+
8
+ ## Pre-Trained and Fine-Tuned Tokenizers and Models
9
+ Iterations | Tokenizer | Pre-Trained Model | AudioSet Fine-Tuned Model 1 | AudioSet Fine-Tuned Model 2
10
+ |---|---|---|---|---
11
+ Iter1 | Random Projection | [BEATs_iter1](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter1 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter1 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter1_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
12
+ Iter2 | [Tokenizer_iter2](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter2](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter2 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter2 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter2_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
13
+ Iter3 | [Tokenizer_iter3](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3 (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3 (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
14
+ Iter3+ | [Tokenizer_iter3+ (AS20K)](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3_plus_AS20K.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3+ (AS20K)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS20K) (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS20K) (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS20K_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
15
+ Iter3+ | [Tokenizer_iter3+ (AS2M)](https://valle.blob.core.windows.net/share/BEATs/Tokenizer_iter3_plus_AS2M.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D)| [BEATs_iter3+ (AS2M)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS2M) (cpt1)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt1.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) | [Fine-tuned BEATs_iter3+ (AS2M) (cpt2)](https://valle.blob.core.windows.net/share/BEATs/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D) |
16
+
17
+
18
+ ### Load Tokenizers
19
+
20
+ ```python
21
+ import torch
22
+ from Tokenizers import TokenizersConfig, Tokenizers
23
+
24
+ # load the pre-trained checkpoints
25
+ checkpoint = torch.load('/path/to/tokenizer.pt')
26
+
27
+ cfg = TokenizersConfig(checkpoint['cfg'])
28
+ BEATs_tokenizer = Tokenizers(cfg)
29
+ BEATs_tokenizer.load_state_dict(checkpoint['model'])
30
+ BEATs_tokenizer.eval()
31
+
32
+ # tokenize the audio and generate the labels
33
+ audio_input_16khz = torch.randn(1, 10000)
34
+ padding_mask = torch.zeros(1, 10000).bool()
35
+
36
+ labels = BEATs_tokenizer.extract_labels(audio_input_16khz, padding_mask=padding_mask)
37
+ ```
38
+
39
+
40
+ ### Load Pre-Trained Models
41
+
42
+ ```python
43
+ import torch
44
+ from BEATs import BEATs, BEATsConfig
45
+
46
+ # load the pre-trained checkpoints
47
+ checkpoint = torch.load('/path/to/model.pt')
48
+
49
+ cfg = BEATsConfig(checkpoint['cfg'])
50
+ BEATs_model = BEATs(cfg)
51
+ BEATs_model.load_state_dict(checkpoint['model'])
52
+ BEATs_model.eval()
53
+
54
+ # extract the audio representation
55
+ audio_input_16khz = torch.randn(1, 10000)
56
+ padding_mask = torch.zeros(1, 10000).bool()
57
+
58
+ representation = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)[0]
59
+ ```
60
+
61
+
62
+ ### Load Fine-tuned Models
63
+
64
+ ```python
65
+ import torch
66
+ from BEATs import BEATs, BEATsConfig
67
+
68
+ # load the fine-tuned checkpoints
69
+ checkpoint = torch.load('/path/to/model.pt')
70
+
71
+ cfg = BEATsConfig(checkpoint['cfg'])
72
+ BEATs_model = BEATs(cfg)
73
+ BEATs_model.load_state_dict(checkpoint['model'])
74
+ BEATs_model.eval()
75
+
76
+ # predict the classification probability of each class
77
+ audio_input_16khz = torch.randn(3, 10000)
78
+ padding_mask = torch.zeros(3, 10000).bool()
79
+
80
+ probs = BEATs_model.extract_features(audio_input_16khz, padding_mask=padding_mask)[0]
81
+
82
+ for i, (top5_label_prob, top5_label_idx) in enumerate(zip(*probs.topk(k=5))):
83
+ top5_label = [checkpoint['label_dict'][label_idx.item()] for label_idx in top5_label_idx]
84
+ print(f'Top 5 predicted labels of the {i}th audio are {top5_label} with probability of {top5_label_prob}')
85
+ ```
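Note that in the LAVIS copy of `BEATs.py` above, `extract_features` takes precomputed filterbank features rather than raw audio (preprocessing is delegated to `lavis.processors.audio_processors.BeatsAudioProcessor`). To reuse the upstream snippets with that copy, run preprocessing explicitly first; a hedged adaptation:

```python
# Adaptation for the LAVIS variant, whose extract_features expects fbanks.
fbank = BEATs_model.preprocess(audio_input_16khz)                 # (B, frames, 128) normalized fbanks
outputs = BEATs_model.extract_features(fbank, padding_mask=padding_mask)[0]
```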
86
+
87
+ ## Evaluation Results
88
+
89
+ ### Comparing with the SOTA Single Models
90
+ ![alt text](Evaluation_Results/Comparing_with_the_SOTA_Single_Models.png)
91
+
92
+
93
+ ### Comparing with the SOTA Ensemble Models
94
+ ![alt text](Evaluation_Results/Comparing_with_the_SOTA_Ensemble_Models.png)
95
+
96
+
97
+ ### Comparing Different BEATs Tokenizers
98
+ ![alt text](Evaluation_Results/Comparing_Different_BEATS_Tokenizers.png)
99
+
100
+
101
+ ### Comparing Different Pre-Training Targets
102
+ ![alt text](Evaluation_Results/Comparing_Different_Pre-Training_Targets.png)
103
+
104
+
105
+ ## License
106
+ This project is licensed under the license found in the LICENSE file in the root directory of this source tree.
107
+ Portions of the source code are based on the [FAIRSEQ](https://github.com/pytorch/fairseq) and [VQGAN](https://github.com/CompVis/taming-transformers) project.
108
+
109
+ [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
110
+
111
+
112
+ ### Reference
113
+ If you find our work useful in your research, please cite the following paper:
114
+ ``` latex
115
+ @article{Chen2022beats,
116
+ title = {BEATs: Audio Pre-Training with Acoustic Tokenizers},
117
+ author = {Sanyuan Chen and Yu Wu and Chengyi Wang and Shujie Liu and Daniel Tompkins and Zhuo Chen and Furu Wei},
118
+ eprint={2212.09058},
119
+ archivePrefix={arXiv},
120
+ year={2022}
121
+ }
122
+ ```
123
+ ### Contact Information
124
+
125
+ For help or issues using BEATs models, please submit a GitHub issue.
126
+
127
+ For other communications related to BEATs, please contact Yu Wu (`yuwu1@microsoft.com`).
LAVIS-main/lavis/models/beats/Tokenizers.py ADDED
@@ -0,0 +1,173 @@
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch.nn import LayerNorm
14
+ import torchaudio.compliance.kaldi as ta_kaldi
15
+
16
+ from lavis.models.beats.backbone import (
17
+ TransformerEncoder,
18
+ )
19
+ from lavis.models.beats.quantizer import (
20
+ NormEMAVectorQuantizer,
21
+ )
22
+
23
+ import logging
24
+ from typing import Optional
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ class TokenizersConfig:
30
+ def __init__(self, cfg=None):
31
+ self.input_patch_size: int = -1 # patch size of patch embedding
32
+ self.embed_dim: int = 512 # patch embedding dimension
33
+ self.conv_bias: bool = False # include bias in conv encoder
34
+
35
+ self.encoder_layers: int = 12 # num encoder layers in the transformer
36
+ self.encoder_embed_dim: int = 768 # encoder embedding dimension
37
+ self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
38
+ self.encoder_attention_heads: int = 12 # num encoder attention heads
39
+ self.activation_fn: str = "gelu" # activation function to use
40
+
41
+ self.layer_norm_first: bool = False # apply layernorm first in the transformer
42
+ self.deep_norm: bool = False # apply deep_norm first in the transformer
43
+
44
+ # dropouts
45
+ self.dropout: float = 0.1 # dropout probability for the transformer
46
+ self.attention_dropout: float = 0.1 # dropout probability for attention weights
47
+ self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
48
+ self.encoder_layerdrop: float = 0.0 # probability of dropping a transformer layer
49
+ self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
50
+
51
+ # positional embeddings
52
+ self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
53
+ self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
54
+
55
+ # relative position embedding
56
+ self.relative_position_embedding: bool = False # apply relative position embedding
57
+ self.num_buckets: int = 320 # number of buckets for relative position embedding
58
+ self.max_distance: int = 1280 # maximum distance for relative position embedding
59
+ self.gru_rel_pos: bool = False # apply gated relative position embedding
60
+
61
+ # quantizer
62
+ self.quant_n: int = 1024 # codebook number in quantizer
63
+ self.quant_dim: int = 256 # codebook dimension in quantizer
64
+
65
+ if cfg is not None:
66
+ self.update(cfg)
67
+
68
+ def update(self, cfg: dict):
69
+ self.__dict__.update(cfg)
70
+
71
+
72
+ class Tokenizers(nn.Module):
73
+ def __init__(
74
+ self,
75
+ cfg: TokenizersConfig,
76
+ ) -> None:
77
+ super().__init__()
78
+ logger.info(f"Tokenizers Config: {cfg.__dict__}")
79
+
80
+ self.cfg = cfg
81
+
82
+ self.embed = cfg.embed_dim
83
+ self.post_extract_proj = (
84
+ nn.Linear(self.embed, cfg.encoder_embed_dim)
85
+ if self.embed != cfg.encoder_embed_dim
86
+ else None
87
+ )
88
+
89
+ self.input_patch_size = cfg.input_patch_size
90
+ self.patch_embedding = nn.Conv2d(1, self.embed, kernel_size=self.input_patch_size, stride=self.input_patch_size,
91
+ bias=cfg.conv_bias)
92
+
93
+ self.dropout_input = nn.Dropout(cfg.dropout_input)
94
+
95
+ assert not cfg.deep_norm or not cfg.layer_norm_first
96
+ self.encoder = TransformerEncoder(cfg)
97
+ self.layer_norm = LayerNorm(self.embed)
98
+
99
+ self.quantize = NormEMAVectorQuantizer(
100
+ n_embed=cfg.quant_n, embedding_dim=cfg.quant_dim, beta=1.0, kmeans_init=True, decay=0.99,
101
+ )
102
+ self.quant_n = cfg.quant_n
103
+ self.quantize_layer = nn.Sequential(
104
+ nn.Linear(cfg.encoder_embed_dim, cfg.encoder_embed_dim),
105
+ nn.Tanh(),
106
+ nn.Linear(cfg.encoder_embed_dim, cfg.quant_dim) # for quantize
107
+ )
108
+
109
+ def forward_padding_mask(
110
+ self,
111
+ features: torch.Tensor,
112
+ padding_mask: torch.Tensor,
113
+ ) -> torch.Tensor:
114
+ extra = padding_mask.size(1) % features.size(1)
115
+ if extra > 0:
116
+ padding_mask = padding_mask[:, :-extra]
117
+ padding_mask = padding_mask.view(
118
+ padding_mask.size(0), features.size(1), -1
119
+ )
120
+ padding_mask = padding_mask.all(-1)
121
+ return padding_mask
122
+
123
+ def preprocess(
124
+ self,
125
+ source: torch.Tensor,
126
+ fbank_mean: float = 15.41663,
127
+ fbank_std: float = 6.55582,
128
+ ) -> torch.Tensor:
129
+ fbanks = []
130
+ for waveform in source:
131
+ waveform = waveform.unsqueeze(0) * 2 ** 15
132
+ fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25, frame_shift=10)
133
+ fbanks.append(fbank)
134
+ fbank = torch.stack(fbanks, dim=0)
135
+ fbank = (fbank - fbank_mean) / (2 * fbank_std)
136
+ return fbank
137
+
138
+ def extract_labels(
139
+ self,
140
+ source: torch.Tensor,
141
+ padding_mask: Optional[torch.Tensor] = None,
142
+ fbank_mean: float = 15.41663,
143
+ fbank_std: float = 6.55582,
144
+ ):
145
+ fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std)
146
+
147
+ if padding_mask is not None:
148
+ padding_mask = self.forward_padding_mask(fbank, padding_mask)
149
+
150
+ fbank = fbank.unsqueeze(1)
151
+ features = self.patch_embedding(fbank)
152
+ features = features.reshape(features.shape[0], features.shape[1], -1)
153
+ features = features.transpose(1, 2)
154
+ features = self.layer_norm(features)
155
+
156
+ if padding_mask is not None:
157
+ padding_mask = self.forward_padding_mask(features, padding_mask)
158
+
159
+ if self.post_extract_proj is not None:
160
+ features = self.post_extract_proj(features)
161
+
162
+ x = self.dropout_input(features)
163
+
164
+ x, layer_results = self.encoder(
165
+ x,
166
+ padding_mask=padding_mask,
167
+ )
168
+
169
+ quantize_input = self.quantize_layer(x)
170
+ quantize_feature, embed_loss, embed_ind = self.quantize(quantize_input)
171
+
172
+ return embed_ind
173
+
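The tokenizer above quantizes the encoder outputs against a `quant_n=1024` EMA codebook, so `extract_labels` returns one discrete code id per patch token. A shape sketch, assuming a checkpoint has been loaded as in the README (the exact output shape depends on `NormEMAVectorQuantizer`, which flattens the token dimension):

```python
import torch

audio = torch.randn(1, 160000)                   # 10 s of 16 kHz audio
labels = BEATs_tokenizer.extract_labels(audio)   # code ids in [0, 1024), one per patch token
print(labels.shape, labels.dtype)                # e.g. torch.Size([496]) torch.int64
```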
LAVIS-main/lavis/models/beats/backbone.py ADDED
@@ -0,0 +1,783 @@
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+
10
+ import math
11
+ import numpy as np
12
+ from typing import Dict, Optional, Tuple
13
+ import torch
14
+ from torch import Tensor, nn
15
+ import torch.nn.functional as F
16
+ from torch.nn import LayerNorm, Parameter
17
+ from lavis.models.beats.modules import (
18
+ GradMultiply,
19
+ SamePad,
20
+ get_activation_fn,
21
+ GLU_Linear,
22
+ quant_noise,
23
+ )
24
+
25
+
26
+ class TransformerEncoder(nn.Module):
27
+ def __init__(self, args):
28
+ super().__init__()
29
+
30
+ self.dropout = args.dropout
31
+ self.embedding_dim = args.encoder_embed_dim
32
+
33
+ self.pos_conv = nn.Conv1d(
34
+ self.embedding_dim,
35
+ self.embedding_dim,
36
+ kernel_size=args.conv_pos,
37
+ padding=args.conv_pos // 2,
38
+ groups=args.conv_pos_groups,
39
+ )
40
+ dropout = 0
41
+ std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
42
+ nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
43
+ nn.init.constant_(self.pos_conv.bias, 0)
44
+
45
+ self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
46
+ self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())
47
+
48
+ if hasattr(args, "relative_position_embedding"):
49
+ self.relative_position_embedding = args.relative_position_embedding
50
+ self.num_buckets = args.num_buckets
51
+ self.max_distance = args.max_distance
52
+ else:
53
+ self.relative_position_embedding = False
54
+ self.num_buckets = 0
55
+ self.max_distance = 0
56
+
57
+ self.layers = nn.ModuleList(
58
+ [
59
+ TransformerSentenceEncoderLayer(
60
+ embedding_dim=self.embedding_dim,
61
+ ffn_embedding_dim=args.encoder_ffn_embed_dim,
62
+ num_attention_heads=args.encoder_attention_heads,
63
+ dropout=self.dropout,
64
+ attention_dropout=args.attention_dropout,
65
+ activation_dropout=args.activation_dropout,
66
+ activation_fn=args.activation_fn,
67
+ layer_norm_first=args.layer_norm_first,
68
+ deep_norm=args.deep_norm,
69
+ has_relative_attention_bias=self.relative_position_embedding,
70
+ num_buckets=self.num_buckets,
71
+ max_distance=self.max_distance,
72
+ gru_rel_pos=args.gru_rel_pos,
73
+ encoder_layers=args.encoder_layers,
74
+ )
75
+ for i in range(args.encoder_layers)
76
+ ]
77
+ )
78
+ if self.relative_position_embedding:
79
+ for i in range(1, args.encoder_layers):
80
+ del self.layers[i].self_attn.relative_attention_bias
81
+ self.layers[i].self_attn.relative_attention_bias = self.layers[0].self_attn.relative_attention_bias
82
+
83
+ self.layer_norm_first = args.layer_norm_first
84
+ self.layer_norm = LayerNorm(self.embedding_dim)
85
+ self.layerdrop = args.encoder_layerdrop
86
+
87
+ self.apply(init_bert_params)
88
+
89
+ if args.deep_norm:
90
+ deep_norm_beta = math.pow(8 * args.encoder_layers, -1 / 4)
91
+ for i in range(args.encoder_layers):
92
+ nn.init.xavier_normal_(self.layers[i].self_attn.k_proj.weight, gain=1)
93
+ nn.init.xavier_normal_(self.layers[i].self_attn.v_proj.weight, gain=deep_norm_beta)
94
+ nn.init.xavier_normal_(self.layers[i].self_attn.q_proj.weight, gain=1)
95
+ nn.init.xavier_normal_(self.layers[i].self_attn.out_proj.weight, gain=deep_norm_beta)
96
+ nn.init.xavier_normal_(self.layers[i].fc1.weight, gain=deep_norm_beta)
97
+ nn.init.xavier_normal_(self.layers[i].fc2.weight, gain=deep_norm_beta)
98
+
99
+ self.layer_wise_gradient_decay_ratio = getattr(args, "layer_wise_gradient_decay_ratio", 1)
100
+
101
+ def forward(self, x, padding_mask=None, layer=None):
102
+ x, layer_results = self.extract_features(x, padding_mask, layer)
103
+
104
+ if self.layer_norm_first and layer is None:
105
+ x = self.layer_norm(x)
106
+
107
+ return x, layer_results
108
+
109
+ def extract_features(self, x, padding_mask=None, tgt_layer=None):
110
+
111
+ if padding_mask is not None:
112
+ x[padding_mask] = 0
113
+
114
+ x_conv = self.pos_conv(x.transpose(1, 2))
115
+ x_conv = x_conv.transpose(1, 2)
116
+ x = x + x_conv
117
+
118
+ if not self.layer_norm_first:
119
+ x = self.layer_norm(x)
120
+
121
+ x = F.dropout(x, p=self.dropout, training=self.training)
122
+
123
+ # B x T x C -> T x B x C
124
+ x = x.transpose(0, 1)
125
+
126
+ layer_results = []
127
+ z = None
128
+ if tgt_layer is not None:
129
+ layer_results.append((x, z))
130
+ r = None
131
+ pos_bias = None
132
+ for i, layer in enumerate(self.layers):
133
+ if self.layer_wise_gradient_decay_ratio != 1.0:
134
+ x = GradMultiply.apply(x, self.layer_wise_gradient_decay_ratio)
135
+ dropout_probability = np.random.random()
136
+ if not self.training or (dropout_probability > self.layerdrop):
137
+ x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False, pos_bias=pos_bias)
138
+ if tgt_layer is not None:
139
+ layer_results.append((x, z))
140
+ if i == tgt_layer:
141
+ r = x
142
+ break
143
+
144
+ if r is not None:
145
+ x = r
146
+
147
+ # T x B x C -> B x T x C
148
+ x = x.transpose(0, 1)
149
+
150
+ return x, layer_results
151
+
152
+
153
+ class TransformerSentenceEncoderLayer(nn.Module):
154
+ def __init__(
155
+ self,
156
+ embedding_dim: float = 768,
157
+ ffn_embedding_dim: float = 3072,
158
+ num_attention_heads: float = 8,
159
+ dropout: float = 0.1,
160
+ attention_dropout: float = 0.1,
161
+ activation_dropout: float = 0.1,
162
+ activation_fn: str = "relu",
163
+ layer_norm_first: bool = False,
164
+ deep_norm: bool = False,
165
+ has_relative_attention_bias: bool = False,
166
+ num_buckets: int = 0,
167
+ max_distance: int = 0,
168
+ rescale_init: bool = False,
169
+ gru_rel_pos: bool = False,
170
+ encoder_layers: int = 0,
171
+ ) -> None:
172
+
173
+ super().__init__()
174
+ self.embedding_dim = embedding_dim
175
+ self.dropout = dropout
176
+ self.activation_dropout = activation_dropout
177
+
178
+ self.activation_name = activation_fn
179
+ self.activation_fn = get_activation_fn(activation_fn)
180
+ self.self_attn = MultiheadAttention(
181
+ self.embedding_dim,
182
+ num_attention_heads,
183
+ dropout=attention_dropout,
184
+ self_attention=True,
185
+ has_relative_attention_bias=has_relative_attention_bias,
186
+ num_buckets=num_buckets,
187
+ max_distance=max_distance,
188
+ rescale_init=rescale_init,
189
+ gru_rel_pos=gru_rel_pos,
190
+ )
191
+
192
+ self.dropout1 = nn.Dropout(dropout)
193
+ self.dropout2 = nn.Dropout(self.activation_dropout)
194
+ self.dropout3 = nn.Dropout(dropout)
195
+
196
+ self.layer_norm_first = layer_norm_first
197
+
198
+ self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
199
+
200
+ if self.activation_name == "glu":
201
+ self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish")
202
+ else:
203
+ self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
204
+ self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
205
+
206
+ self.final_layer_norm = LayerNorm(self.embedding_dim)
207
+
208
+ self.deep_norm = deep_norm
209
+ if self.deep_norm:
210
+ self.deep_norm_alpha = math.pow(2 * encoder_layers, 1 / 4)
211
+ else:
212
+ self.deep_norm_alpha = 1
213
+
214
+ def forward(
215
+ self,
216
+ x: torch.Tensor,
217
+ self_attn_mask: torch.Tensor = None,
218
+ self_attn_padding_mask: torch.Tensor = None,
219
+ need_weights: bool = False,
220
+ pos_bias=None
221
+ ):
222
+ residual = x
223
+
224
+ if self.layer_norm_first:
225
+ x = self.self_attn_layer_norm(x)
226
+ x, attn, pos_bias = self.self_attn(
227
+ query=x,
228
+ key=x,
229
+ value=x,
230
+ key_padding_mask=self_attn_padding_mask,
231
+ need_weights=False,
232
+ attn_mask=self_attn_mask,
233
+ position_bias=pos_bias
234
+ )
235
+ x = self.dropout1(x)
236
+ x = residual + x
237
+
238
+ residual = x
239
+ x = self.final_layer_norm(x)
240
+ if self.activation_name == "glu":
241
+ x = self.fc1(x)
242
+ else:
243
+ x = self.activation_fn(self.fc1(x))
244
+ x = self.dropout2(x)
245
+ x = self.fc2(x)
246
+ x = self.dropout3(x)
247
+ x = residual + x
248
+ else:
249
+ x, attn, pos_bias = self.self_attn(
250
+ query=x,
251
+ key=x,
252
+ value=x,
253
+ key_padding_mask=self_attn_padding_mask,
254
+ need_weights=need_weights,
255
+ attn_mask=self_attn_mask,
256
+ position_bias=pos_bias
257
+ )
258
+
259
+ x = self.dropout1(x)
260
+ x = residual * self.deep_norm_alpha + x
261
+
262
+ x = self.self_attn_layer_norm(x)
263
+
264
+ residual = x
265
+ if self.activation_name == "glu":
266
+ x = self.fc1(x)
267
+ else:
268
+ x = self.activation_fn(self.fc1(x))
269
+ x = self.dropout2(x)
270
+ x = self.fc2(x)
271
+ x = self.dropout3(x)
272
+ x = residual * self.deep_norm_alpha + x
273
+ x = self.final_layer_norm(x)
274
+
275
+ return x, attn, pos_bias
276
+
277
+
278
+ class MultiheadAttention(nn.Module):
279
+ """Multi-headed attention.
280
+
281
+ See "Attention Is All You Need" for more details.
282
+ """
283
+
284
+ def __init__(
285
+ self,
286
+ embed_dim,
287
+ num_heads,
288
+ kdim=None,
289
+ vdim=None,
290
+ dropout=0.0,
291
+ bias=True,
292
+ add_bias_kv=False,
293
+ add_zero_attn=False,
294
+ self_attention=False,
295
+ encoder_decoder_attention=False,
296
+ q_noise=0.0,
297
+ qn_block_size=8,
298
+ has_relative_attention_bias=False,
299
+ num_buckets=32,
300
+ max_distance=128,
301
+ gru_rel_pos=False,
302
+ rescale_init=False,
303
+ ):
304
+ super().__init__()
305
+ self.embed_dim = embed_dim
306
+ self.kdim = kdim if kdim is not None else embed_dim
307
+ self.vdim = vdim if vdim is not None else embed_dim
308
+ self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
309
+
310
+ self.num_heads = num_heads
311
+ self.dropout_module = nn.Dropout(dropout)
312
+
313
+ self.has_relative_attention_bias = has_relative_attention_bias
314
+ self.num_buckets = num_buckets
315
+ self.max_distance = max_distance
316
+ if self.has_relative_attention_bias:
317
+ self.relative_attention_bias = nn.Embedding(num_buckets, num_heads)
318
+
319
+ self.head_dim = embed_dim // num_heads
320
+ self.q_head_dim = self.head_dim
321
+ self.k_head_dim = self.head_dim
322
+ assert (
323
+ self.head_dim * num_heads == self.embed_dim
324
+ ), "embed_dim must be divisible by num_heads"
325
+ self.scaling = self.head_dim ** -0.5
326
+
327
+ self.self_attention = self_attention
328
+ self.encoder_decoder_attention = encoder_decoder_attention
329
+
330
+ assert not self.self_attention or self.qkv_same_dim, (
331
+ "Self-attention requires query, key and " "value to be of the same size"
332
+ )
333
+
334
+ k_bias = True
335
+ if rescale_init:
336
+ k_bias = False
337
+
338
+ k_embed_dim = embed_dim
339
+ q_embed_dim = embed_dim
340
+
341
+ self.k_proj = quant_noise(
342
+ nn.Linear(self.kdim, k_embed_dim, bias=k_bias), q_noise, qn_block_size
343
+ )
344
+ self.v_proj = quant_noise(
345
+ nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
346
+ )
347
+ self.q_proj = quant_noise(
348
+ nn.Linear(embed_dim, q_embed_dim, bias=bias), q_noise, qn_block_size
349
+ )
350
+
351
+ self.out_proj = quant_noise(
352
+ nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
353
+ )
354
+
355
+ if add_bias_kv:
356
+ self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
357
+ self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
358
+ else:
359
+ self.bias_k = self.bias_v = None
360
+
361
+ self.add_zero_attn = add_zero_attn
362
+
363
+ self.gru_rel_pos = gru_rel_pos
364
+ if self.gru_rel_pos:
365
+ self.grep_linear = nn.Linear(self.q_head_dim, 8)
366
+ self.grep_a = nn.Parameter(torch.ones(1, num_heads, 1, 1))
367
+
368
+ self.reset_parameters()
369
+
370
+ def reset_parameters(self):
371
+ if self.qkv_same_dim:
372
+ # Empirically observed the convergence to be much better with
373
+ # the scaled initialization
374
+ nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
375
+ nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
376
+ nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
377
+ else:
378
+ nn.init.xavier_uniform_(self.k_proj.weight)
379
+ nn.init.xavier_uniform_(self.v_proj.weight)
380
+ nn.init.xavier_uniform_(self.q_proj.weight)
381
+
382
+ nn.init.xavier_uniform_(self.out_proj.weight)
383
+ if self.out_proj.bias is not None:
384
+ nn.init.constant_(self.out_proj.bias, 0.0)
385
+ if self.bias_k is not None:
386
+ nn.init.xavier_normal_(self.bias_k)
387
+ if self.bias_v is not None:
388
+ nn.init.xavier_normal_(self.bias_v)
389
+ if self.has_relative_attention_bias:
390
+ nn.init.xavier_normal_(self.relative_attention_bias.weight)
391
+
392
+ def _relative_positions_bucket(self, relative_positions, bidirectional=True):
393
+ num_buckets = self.num_buckets
394
+ max_distance = self.max_distance
395
+ relative_buckets = 0
396
+
397
+ if bidirectional:
398
+ num_buckets = num_buckets // 2
399
+ relative_buckets += (relative_positions > 0).to(torch.long) * num_buckets
400
+ relative_positions = torch.abs(relative_positions)
401
+ else:
402
+ relative_positions = -torch.min(relative_positions, torch.zeros_like(relative_positions))
403
+
404
+ max_exact = num_buckets // 2
405
+ is_small = relative_positions < max_exact
406
+
407
+ relative_position_if_large = max_exact + (
408
+ torch.log(relative_positions.float() / max_exact)
409
+ / math.log(max_distance / max_exact)
410
+ * (num_buckets - max_exact)
411
+ ).to(torch.long)
412
+ relative_position_if_large = torch.min(
413
+ relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
414
+ )
415
+
416
+ relative_buckets += torch.where(is_small, relative_positions, relative_position_if_large)
417
+ return relative_buckets
418
+
419
+ def compute_bias(self, query_length, key_length):
420
+ context_position = torch.arange(query_length, dtype=torch.long)[:, None]
421
+ memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
422
+ relative_position = memory_position - context_position
423
+ relative_position_bucket = self._relative_positions_bucket(
424
+ relative_position,
425
+ bidirectional=True
426
+ )
427
+ relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device)
428
+ values = self.relative_attention_bias(relative_position_bucket)
429
+ values = values.permute([2, 0, 1])
430
+ return values
431
+
432
+ def forward(
433
+ self,
434
+ query,
435
+ key: Optional[Tensor],
436
+ value: Optional[Tensor],
437
+ key_padding_mask: Optional[Tensor] = None,
438
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
439
+ need_weights: bool = True,
440
+ static_kv: bool = False,
441
+ attn_mask: Optional[Tensor] = None,
442
+ before_softmax: bool = False,
443
+ need_head_weights: bool = False,
444
+ position_bias: Optional[Tensor] = None
445
+ ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
446
+ """Input shape: Time x Batch x Channel
447
+
448
+ Args:
449
+ key_padding_mask (ByteTensor, optional): mask to exclude
450
+ keys that are pads, of shape `(batch, src_len)`, where
451
+ padding elements are indicated by 1s.
452
+ need_weights (bool, optional): return the attention weights,
453
+ averaged over heads (default: False).
454
+ attn_mask (ByteTensor, optional): typically used to
455
+ implement causal attention, where the mask prevents the
456
+ attention from looking forward in time (default: None).
457
+ before_softmax (bool, optional): return the raw attention
458
+ weights and values before the attention softmax.
459
+ need_head_weights (bool, optional): return the attention
460
+ weights for each head. Implies *need_weights*. Default:
461
+ return the average attention weights over all heads.
462
+ """
463
+ if need_head_weights:
464
+ need_weights = True
465
+
466
+ is_tpu = query.device.type == "xla"
467
+
468
+ tgt_len, bsz, embed_dim = query.size()
469
+ src_len = tgt_len
470
+ assert embed_dim == self.embed_dim
471
+ assert list(query.size()) == [tgt_len, bsz, embed_dim]
472
+ if key is not None:
473
+ src_len, key_bsz, _ = key.size()
474
+ if not torch.jit.is_scripting():
475
+ assert key_bsz == bsz
476
+ assert value is not None
477
+ assert (src_len, bsz) == value.shape[:2]
478
+
479
+ if self.has_relative_attention_bias and position_bias is None:
480
+ position_bias = self.compute_bias(tgt_len, src_len)
481
+ position_bias = position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, src_len)
482
+
483
+ if incremental_state is not None:
484
+ saved_state = self._get_input_buffer(incremental_state)
485
+ if saved_state is not None and "prev_key" in saved_state:
486
+ # previous time steps are cached - no need to recompute
487
+ # key and value if they are static
488
+ if static_kv:
489
+ assert self.encoder_decoder_attention and not self.self_attention
490
+ key = value = None
491
+ else:
492
+ saved_state = None
493
+
494
+ if self.self_attention:
495
+ q = self.q_proj(query)
496
+ k = self.k_proj(query)
497
+ v = self.v_proj(query)
498
+ elif self.encoder_decoder_attention:
499
+ # encoder-decoder attention
500
+ q = self.q_proj(query)
501
+ if key is None:
502
+ assert value is None
503
+ k = v = None
504
+ else:
505
+ k = self.k_proj(key)
506
+ v = self.v_proj(key)
507
+
508
+ else:
509
+ assert key is not None and value is not None
510
+ q = self.q_proj(query)
511
+ k = self.k_proj(key)
512
+ v = self.v_proj(value)
513
+ q *= self.scaling
514
+ alpha = 32
515
+ q *= 1 / alpha
516
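+        # Stability trick: on top of the 1/sqrt(head_dim) scaling, q is divided
+        # by alpha = 32 before the QK^T matmul; after torch.bmm the logits have
+        # their row-wise max subtracted and are multiplied back by alpha.
+        # softmax(x) == softmax(x - c), so the output is unchanged while the
+        # intermediate logits stay small enough to avoid fp16 overflow.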
+
+        if self.bias_k is not None:
+            assert self.bias_v is not None
+            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
+            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
+            if attn_mask is not None:
+                attn_mask = torch.cat(
+                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
+                )
+            if key_padding_mask is not None:
+                key_padding_mask = torch.cat(
+                    [
+                        key_padding_mask,
+                        key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
+                    ],
+                    dim=1,
+                )
+
+        q = (
+            q.contiguous()
+            .view(tgt_len, bsz * self.num_heads, self.q_head_dim)
+            .transpose(0, 1)
+        )
+        if k is not None:
+            k = (
+                k.contiguous()
+                .view(-1, bsz * self.num_heads, self.k_head_dim)
+                .transpose(0, 1)
+            )
+        if v is not None:
+            v = (
+                v.contiguous()
+                .view(-1, bsz * self.num_heads, self.head_dim)
+                .transpose(0, 1)
+            )
+
+        if saved_state is not None:
+            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
+            if "prev_key" in saved_state:
+                _prev_key = saved_state["prev_key"]
+                assert _prev_key is not None
+                prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
+                if static_kv:
+                    k = prev_key
+                else:
+                    assert k is not None
+                    k = torch.cat([prev_key, k], dim=1)
+                src_len = k.size(1)
+            if "prev_value" in saved_state:
+                _prev_value = saved_state["prev_value"]
+                assert _prev_value is not None
+                prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
+                if static_kv:
+                    v = prev_value
+                else:
+                    assert v is not None
+                    v = torch.cat([prev_value, v], dim=1)
+            prev_key_padding_mask: Optional[Tensor] = None
+            if "prev_key_padding_mask" in saved_state:
+                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
+            assert k is not None and v is not None
+            key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
+                key_padding_mask=key_padding_mask,
+                prev_key_padding_mask=prev_key_padding_mask,
+                batch_size=bsz,
+                src_len=k.size(1),
+                static_kv=static_kv,
+            )
+
+            saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
+            saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
+            saved_state["prev_key_padding_mask"] = key_padding_mask
+            # In this branch incremental_state is never None
+            assert incremental_state is not None
+            incremental_state = self._set_input_buffer(incremental_state, saved_state)
+        assert k is not None
+        assert k.size(1) == src_len
+
+        # This is part of a workaround to get around fork/join parallelism
+        # not supporting Optional types.
+        if key_padding_mask is not None and key_padding_mask.dim() == 0:
+            key_padding_mask = None
+
+        if key_padding_mask is not None:
+            assert key_padding_mask.size(0) == bsz
+            assert key_padding_mask.size(1) == src_len
+
+        if self.add_zero_attn:
+            assert v is not None
+            src_len += 1
+            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
+            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
+            if attn_mask is not None:
+                attn_mask = torch.cat(
+                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
+                )
+            if key_padding_mask is not None:
+                key_padding_mask = torch.cat(
+                    [
+                        key_padding_mask,
+                        torch.zeros(key_padding_mask.size(0), 1).type_as(
+                            key_padding_mask
+                        ),
+                    ],
+                    dim=1,
+                )
+
+        attn_weights = torch.bmm(q, k.transpose(1, 2))
+        # undo the 1/alpha pre-scaling after subtracting the row-wise max;
+        # softmax is shift-invariant, so the result matches the unscaled logits
+        attn_weights = (attn_weights - attn_weights.max(dim=-1, keepdim=True)[0]) * alpha
+        attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
+
+        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
+
+        if attn_mask is not None:
+            attn_mask = attn_mask.unsqueeze(0)
+            attn_weights += attn_mask
+
+        if key_padding_mask is not None:
+            # don't attend to padding symbols
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            if not is_tpu:
+                attn_weights = attn_weights.masked_fill(
+                    key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
+                    float("-inf"),
+                )
+            else:
+                attn_weights = attn_weights.transpose(0, 2)
+                attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
+                attn_weights = attn_weights.transpose(0, 2)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if before_softmax:
+            return attn_weights, v, position_bias
+
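+        # Gated relative position bias: when self.gru_rel_pos == 1, a per-head,
+        # per-query-position gate is computed from q (via self.grep_linear and
+        # self.grep_a, defined in the constructor, not shown here) and rescales
+        # position_bias before it is added to the logits; otherwise the bias is
+        # added as-is.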
+        if position_bias is not None:
+            attn_mask_rel_pos = position_bias
+            if self.gru_rel_pos == 1:
+                query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim) * alpha / self.scaling
+                _B, _H, _L, __ = query_layer.size()
+                gate_a, gate_b = torch.sigmoid(self.grep_linear(query_layer).view(
+                    _B, _H, _L, 2, 4).sum(-1, keepdim=False)).chunk(2, dim=-1)
+                gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
+                attn_mask_rel_pos = gate_a_1.view(bsz * self.num_heads, tgt_len, 1) * position_bias
+
+            attn_mask_rel_pos = attn_mask_rel_pos.view(attn_weights.size())
+
+            attn_weights = attn_weights + attn_mask_rel_pos
+
+        attn_weights_float = F.softmax(attn_weights, dim=-1)
+        attn_weights = attn_weights_float.type_as(attn_weights)
+        attn_probs = self.dropout_module(attn_weights)
+
+        assert v is not None
+        attn = torch.bmm(attn_probs, v)
+        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
+        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
+        attn = self.out_proj(attn)
+        attn_weights: Optional[Tensor] = None
+        if need_weights:
+            attn_weights = attn_weights_float.view(
+                bsz, self.num_heads, tgt_len, src_len
+            ).transpose(1, 0)
+            if not need_head_weights:
+                # average attention weights over heads
+                attn_weights = attn_weights.mean(dim=0)
+
+        return attn, attn_weights, position_bias
+
+    @staticmethod
+    def _append_prev_key_padding_mask(
+        key_padding_mask: Optional[Tensor],
+        prev_key_padding_mask: Optional[Tensor],
+        batch_size: int,
+        src_len: int,
+        static_kv: bool,
+    ) -> Optional[Tensor]:
+        # saved key padding masks have shape (bsz, seq_len)
+        if prev_key_padding_mask is not None and static_kv:
+            new_key_padding_mask = prev_key_padding_mask
+        elif prev_key_padding_mask is not None and key_padding_mask is not None:
+            new_key_padding_mask = torch.cat(
+                [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
+            )
+        # During incremental decoding, as the padding token enters and
+        # leaves the frame, there will be a time when prev or current
+        # is None
+        elif prev_key_padding_mask is not None:
+            if src_len > prev_key_padding_mask.size(1):
+                filler = torch.zeros(
+                    (batch_size, src_len - prev_key_padding_mask.size(1)),
+                    device=prev_key_padding_mask.device,
+                )
+                new_key_padding_mask = torch.cat(
+                    [prev_key_padding_mask.float(), filler.float()], dim=1
+                )
+            else:
+                new_key_padding_mask = prev_key_padding_mask.float()
+        elif key_padding_mask is not None:
+            if src_len > key_padding_mask.size(1):
+                filler = torch.zeros(
+                    (batch_size, src_len - key_padding_mask.size(1)),
+                    device=key_padding_mask.device,
+                )
+                new_key_padding_mask = torch.cat(
+                    [filler.float(), key_padding_mask.float()], dim=1
+                )
+            else:
+                new_key_padding_mask = key_padding_mask.float()
+        else:
+            new_key_padding_mask = prev_key_padding_mask
+        return new_key_padding_mask
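+
+    # Illustrative behaviour (a sketch): with a cached prefix of length 4 and one
+    # new decoding step, prev_key_padding_mask is (bsz, 4) and key_padding_mask
+    # is (bsz, 1), and the concatenation above returns a (bsz, 5) float mask.
+    # When one side is None, zeros ("not padding") are used as filler so the
+    # result still covers all src_len positions.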
+
+    def _get_input_buffer(
+        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
+    ) -> Dict[str, Optional[Tensor]]:
+        result = self.get_incremental_state(incremental_state, "attn_state")
+        if result is not None:
+            return result
+        else:
+            empty_result: Dict[str, Optional[Tensor]] = {}
+            return empty_result
+
+    def _set_input_buffer(
+        self,
+        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+        buffer: Dict[str, Optional[Tensor]],
+    ):
+        return self.set_incremental_state(incremental_state, "attn_state", buffer)
+
+    def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
+        # identity hook; subclasses can override to apply sparse attention masks
+        return attn_weights
+
+
+def init_bert_params(module):
+    """
+    Initialize the weights specific to the BERT Model.
+    This overrides the default initializations depending on the specified arguments.
+    1. If normal_init_linear_weights is set then the weights of linear
+       layers will be initialized using the normal distribution and the
+       bias will be set to the specified value.
+    2. If normal_init_embed_weights is set then the weights of embedding
+       layers will be initialized using the normal distribution.
+    3. If normal_init_proj_weights is set then the weights of
+       in_project_weight for MultiheadAttention will be initialized using
+       the normal distribution (to be validated).
+    """
+
+    def normal_(data):
+        # with FSDP, module params will be on CUDA, so we cast them back to CPU
+        # so that the RNG is consistent with and without FSDP
+        data.copy_(
+            data.cpu().normal_(mean=0.0, std=0.02).to(data.device)
+        )
+
+    if isinstance(module, nn.Linear):
+        normal_(module.weight.data)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    if isinstance(module, nn.Embedding):
+        normal_(module.weight.data)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+    if isinstance(module, MultiheadAttention):
+        normal_(module.q_proj.weight.data)
+        normal_(module.k_proj.weight.data)
+        normal_(module.v_proj.weight.data)
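+
+
+# Usage sketch (illustrative; `model` is hypothetical, e.g. an encoder built
+# from the modules in this file): nn.Module.apply() recurses over all
+# submodules, so a single call re-initializes every nn.Linear, nn.Embedding
+# and MultiheadAttention in the tree.
+#
+#     model.apply(init_bert_params)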