diff --git "a/logs/20250527_235509/train.log" "b/logs/20250527_235509/train.log" new file mode 100644--- /dev/null +++ "b/logs/20250527_235509/train.log" @@ -0,0 +1,4035 @@ +2025-05-27 23:55:29,745 INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_e25cfeb4a729bbcc.zip. +2025-05-27 23:55:29,746 INFO packaging.py:575 -- Creating a file package for local module '/mnt/petrelfs/luyiting/MultiAgentEval/lmm-r1'. +2025-05-27 23:55:28,765 INFO cli.py:39 -- Job submission server address: http://127.0.0.1:2989 +2025-05-27 23:55:34,243 SUCC cli.py:63 -- ------------------------------------------------------- +2025-05-27 23:55:34,243 SUCC cli.py:64 -- Job 'raysubmit_tdQtQ5pUh6YdmvqQ' submitted successfully +2025-05-27 23:55:34,243 SUCC cli.py:65 -- ------------------------------------------------------- +2025-05-27 23:55:34,243 INFO cli.py:289 -- Next steps +2025-05-27 23:55:34,243 INFO cli.py:290 -- Query the logs of the job: +2025-05-27 23:55:34,243 INFO cli.py:292 -- ray job logs raysubmit_tdQtQ5pUh6YdmvqQ +2025-05-27 23:55:34,243 INFO cli.py:294 -- Query the status of the job: +2025-05-27 23:55:34,243 INFO cli.py:296 -- ray job status raysubmit_tdQtQ5pUh6YdmvqQ +2025-05-27 23:55:34,244 INFO cli.py:298 -- Request the job to be stopped: +2025-05-27 23:55:34,244 INFO cli.py:300 -- ray job stop raysubmit_tdQtQ5pUh6YdmvqQ +2025-05-27 23:55:34,246 INFO cli.py:307 -- Tailing logs until the job exits (disable with --no-wait): +2025-05-27 23:55:33,741 INFO job_manager.py:531 -- Runtime env is setting up. +[2025-05-27 23:55:52,909] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +INFO 05-27 23:55:57 [__init__.py:239] Automatically detected platform cuda. +2025-05-27 23:55:58,401 INFO worker.py:1520 -- Using address 10.140.0.151:6239 set in the environment variable RAY_ADDRESS +2025-05-27 23:55:58,402 INFO worker.py:1660 -- Connecting to existing Ray cluster at address: 10.140.0.151:6239... +2025-05-27 23:55:58,424 INFO worker.py:1843 -- Connected to Ray cluster. View the dashboard at 10.140.0.151:2989  +(pid=425166) INFO 05-27 23:56:17 [__init__.py:239] Automatically detected platform cuda. +(LLMRayActor pid=425166) INFO 05-27 23:56:43 [config.py:585] This model supports multiple tasks: {'classify', 'reward', 'embed', 'generate', 'score'}. Defaulting to 'generate'. +(LLMRayActor pid=425166) WARNING 05-27 23:56:43 [arg_utils.py:1846] VLLM_ATTENTION_BACKEND=triton is not supported by the V1 Engine. Falling back to V0. We recommend to remove VLLM_ATTENTION_BACKEND=triton from your config in favor of the V1 Engine. +(LLMRayActor pid=425166) WARNING 05-27 23:56:43 [arg_utils.py:1745] --enable-prefix-caching is not supported for multimodal models in V0 and has been disabled. +(LLMRayActor pid=425166) INFO 05-27 23:56:43 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.2.dev76+gf68cce8) with config: model='/mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/', speculative_config=None, tokenizer='/mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=42, served_model_name=/mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False, +(pid=425165) INFO 05-27 23:56:17 [__init__.py:239] Automatically detected platform cuda. [repeated 7x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.) +(LLMRayActor pid=425160) INFO 05-27 23:56:43 [config.py:585] This model supports multiple tasks: {'classify', 'embed', 'reward', 'score', 'generate'}. Defaulting to 'generate'. +(LLMRayActor pid=425162) INFO 05-27 23:56:43 [config.py:585] This model supports multiple tasks: {'generate', 'classify', 'score', 'reward', 'embed'}. Defaulting to 'generate'. +(LLMRayActor pid=425161) INFO 05-27 23:56:43 [config.py:585] This model supports multiple tasks: {'reward', 'generate', 'embed', 'classify', 'score'}. Defaulting to 'generate'. +(LLMRayActor pid=425159) INFO 05-27 23:56:43 [config.py:585] This model supports multiple tasks: {'generate', 'score', 'classify', 'embed', 'reward'}. Defaulting to 'generate'. +(LLMRayActor pid=425163) INFO 05-27 23:56:43 [config.py:585] This model supports multiple tasks: {'embed', 'score', 'reward', 'generate', 'classify'}. Defaulting to 'generate'. +(LLMRayActor pid=425164) INFO 05-27 23:56:43 [config.py:585] This model supports multiple tasks: {'embed', 'classify', 'score', 'reward', 'generate'}. Defaulting to 'generate'. +(LLMRayActor pid=425165) INFO 05-27 23:56:43 [config.py:585] This model supports multiple tasks: {'generate', 'reward', 'embed', 'classify', 'score'}. Defaulting to 'generate'. +(LLMRayActor pid=425166) [2025-05-27 23:56:46,863] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +(LLMRayActor pid=425166) INFO 05-27 23:56:51 [cuda.py:293] Using Flash Attention backend. +(LLMRayActor pid=425165) WARNING 05-27 23:56:43 [arg_utils.py:1846] VLLM_ATTENTION_BACKEND=triton is not supported by the V1 Engine. Falling back to V0. We recommend to remove VLLM_ATTENTION_BACKEND=triton from your config in favor of the V1 Engine. [repeated 7x across cluster] +(LLMRayActor pid=425165) WARNING 05-27 23:56:43 [arg_utils.py:1745] --enable-prefix-caching is not supported for multimodal models in V0 and has been disabled. [repeated 7x across cluster] +(LLMRayActor pid=425165) INFO 05-27 23:56:43 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.2.dev76+gf68cce8) with config: model='/mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/', speculative_config=None, tokenizer='/mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=49, served_model_name=/mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False,  [repeated 7x across cluster] +(LLMRayActor pid=425166) INFO 05-27 23:56:54 [parallel_state.py:967] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0 +(LLMRayActor pid=425166) INFO 05-27 23:56:54 [model_runner.py:1110] Starting to load model /mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/... +(LLMRayActor pid=425165) [2025-05-27 23:56:46,860] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [repeated 7x across cluster] +(LLMRayActor pid=425166) INFO 05-27 23:56:55 [config.py:3229] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256] is overridden by config [256, 128, 2, 1, 4, 136, 8, 144, 16, 152, 24, 160, 32, 168, 40, 176, 48, 184, 56, 192, 64, 200, 72, 208, 80, 216, 88, 120, 224, 96, 232, 104, 240, 112, 248] +(LLMRayActor pid=425166) +Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,433] [INFO] [config.py:1005:print] communication_data_type ...... None +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,433] [INFO] [config.py:1005:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,433] [INFO] [config.py:1005:print] curriculum_enabled_legacy .... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,433] [INFO] [config.py:1005:print] curriculum_params_legacy ..... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,433] [INFO] [config.py:1005:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,433] [INFO] [config.py:1005:print] data_efficiency_enabled ...... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,433] [INFO] [config.py:1005:print] dataloader_drop_last ......... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,433] [INFO] [config.py:1005:print] disable_allgather ............ False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,433] [INFO] [config.py:1005:print] dump_state ................... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,433] [INFO] [config.py:1005:print] dynamic_loss_scale_args ...... None +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,433] [INFO] [config.py:1005:print] eigenvalue_enabled ........... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] eigenvalue_gas_boundary_resolution 1 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] eigenvalue_layer_name ........ bert.encoder.layer +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] eigenvalue_layer_num ......... 0 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] eigenvalue_max_iter .......... 100 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] eigenvalue_stability ......... 1e-06 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] eigenvalue_tol ............... 0.01 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] eigenvalue_verbose ........... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] elasticity_enabled ........... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] flops_profiler_config ........ { +(ReferenceModelRayActor pid=431335) "enabled": false, +(ReferenceModelRayActor pid=431335) "recompute_fwd_factor": 0.0, +(ReferenceModelRayActor pid=431335) "profile_step": 1, +(ReferenceModelRayActor pid=431335) "module_depth": -1, +(ReferenceModelRayActor pid=431335) "top_modules": 1, +(ReferenceModelRayActor pid=431335) "detailed": true, +(ReferenceModelRayActor pid=431335) "output_file": null +(ReferenceModelRayActor pid=431335) } +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] fp16_auto_cast ............... None +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] fp16_enabled ................. False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] fp16_master_weights_and_gradients False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] global_rank .................. 0 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] grad_accum_dtype ............. None +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] gradient_accumulation_steps .. 8 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] gradient_clipping ............ 1.0 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] gradient_predivide_factor .... 1.0 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] graph_harvesting ............. False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] initial_dynamic_scale ........ 1 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,434] [INFO] [config.py:1005:print] load_universal_checkpoint .... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] loss_scale ................... 1.0 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] memory_breakdown ............. False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] mics_hierarchial_params_gather False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] mics_shard_size .............. -1 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] nebula_config ................ { +(ReferenceModelRayActor pid=431335) "enabled": false, +(ReferenceModelRayActor pid=431335) "persistent_storage_path": null, +(ReferenceModelRayActor pid=431335) "persistent_time_interval": 100, +(ReferenceModelRayActor pid=431335) "num_of_version_in_retention": 2, +(ReferenceModelRayActor pid=431335) "enable_nebula_load": true, +(ReferenceModelRayActor pid=431335) "load_path": null +(ReferenceModelRayActor pid=431335) } +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] optimizer_legacy_fusion ...... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] optimizer_name ............... None +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] optimizer_params ............. None +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] pld_enabled .................. False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] pld_params ................... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] prescale_gradients ........... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] scheduler_name ............... None +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] scheduler_params ............. None +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] seq_parallel_communication_data_type torch.float32 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] sparse_attention ............. None +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] sparse_gradients_enabled ..... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] steps_per_print .............. 100 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] tensor_parallel_config ....... dtype=torch.float16 autotp_size=0 tensor_parallel=TPConfig(tp_size=1, tp_grain_size=1, mpu=None, tp_group=None) injection_policy_tuple=None keep_module_on_host=False replace_with_kernel_inject=False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] timers_config ................ enabled=True synchronized=True +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,435] [INFO] [config.py:1005:print] train_batch_size ............. 128 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,436] [INFO] [config.py:1005:print] train_micro_batch_size_per_gpu 2 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,436] [INFO] [config.py:1005:print] use_data_before_expert_parallel_ False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,436] [INFO] [config.py:1005:print] use_node_local_storage ....... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,436] [INFO] [config.py:1005:print] wall_clock_breakdown ......... False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,436] [INFO] [config.py:1005:print] weight_quantization_config ... None +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,436] [INFO] [config.py:1005:print] world_size ................... 8 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,436] [INFO] [config.py:1005:print] zero_allow_untested_optimizer False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,436] [INFO] [config.py:1005:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=True) offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True log_trace_cache_warnings=False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,436] [INFO] [config.py:1005:print] zero_enabled ................. True +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,436] [INFO] [config.py:1005:print] zero_force_ds_cpu_optimizer .. True +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,436] [INFO] [config.py:1005:print] zero_optimization_stage ...... 3 +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,436] [INFO] [config.py:991:print_user_config] json = { +(ReferenceModelRayActor pid=431335) "steps_per_print": 100, +(ReferenceModelRayActor pid=431335) "zero_optimization": { +(ReferenceModelRayActor pid=431335) "stage": 3, +(ReferenceModelRayActor pid=431335) "stage3_max_live_parameters": "auto", +(ReferenceModelRayActor pid=431335) "stage3_max_reuse_distance": "auto", +(ReferenceModelRayActor pid=431335) "stage3_param_persistence_threshold": "auto", +(ReferenceModelRayActor pid=431335) "stage3_prefetch_bucket_size": "auto", +(ReferenceModelRayActor pid=431335) "offload_param": { +(ReferenceModelRayActor pid=431335) "device": "none", +(ReferenceModelRayActor pid=431335) "pin_memory": true +(ReferenceModelRayActor pid=431335) } +(ReferenceModelRayActor pid=431335) }, +(ReferenceModelRayActor pid=431335) "bf16": { +(ReferenceModelRayActor pid=431335) "enabled": true +(ReferenceModelRayActor pid=431335) }, +(ReferenceModelRayActor pid=431335) "gradient_clipping": 1.0, +(ReferenceModelRayActor pid=431335) "prescale_gradients": false, +(ReferenceModelRayActor pid=431335) "wall_clock_breakdown": false, +(ReferenceModelRayActor pid=431335) "train_micro_batch_size_per_gpu": 2, +(ReferenceModelRayActor pid=431335) "train_batch_size": 128 +(ReferenceModelRayActor pid=431335) } +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ActorModelRayActor pid=431329) in preprocess_data None False +(ReferenceModelRayActor pid=431335) [2025-05-27 23:59:44,009] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8 [repeated 6x across cluster] +(ActorModelRayActor pid=430510) loaded /mnt/petrelfs/luyiting/MultiAgentEval/data_process_v1/train_ava_mini_evalmuse_koniq_llavastyle_openrlhf_merged.jsonl with data_files=/mnt/petrelfs/luyiting/MultiAgentEval/data_process_v1/train_ava_mini_evalmuse_koniq_llavastyle_openrlhf_merged.jsonl +(ActorModelRayActor pid=430510) [Dataset({ +(ActorModelRayActor pid=430510) features: ['message', 'answer'], +(ActorModelRayActor pid=430510) num_rows: 24000 +(ActorModelRayActor pid=430510) })] +(ActorModelRayActor pid=430510) +Preprocessing data: 0%| | 0/24000 [00:00 +(ActorModelRayActor pid=430510) [2025-05-27 23:59:48,501] [INFO] [logging.py:128:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False +(ActorModelRayActor pid=430510) [2025-05-27 23:59:48,501] [INFO] [logging.py:128:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer +(ActorModelRayActor pid=430510) [2025-05-27 23:59:48,722] [INFO] [utils.py:781:see_memory_usage] Stage 3 initialize beginning +(ActorModelRayActor pid=430510) [2025-05-27 23:59:48,723] [INFO] [utils.py:782:see_memory_usage] MA 1.94 GB Max_MA 3.98 GB CA 4.04 GB Max_CA 4 GB +(ActorModelRayActor pid=430510) [2025-05-27 23:59:48,723] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 445.88 GB, percent = 44.3% +(ActorModelRayActor pid=430510) [2025-05-27 23:59:48,726] [INFO] [stage3.py:170:__init__] Reduce bucket size 500000000 +(ActorModelRayActor pid=430510) [2025-05-27 23:59:48,726] [INFO] [stage3.py:171:__init__] Prefetch bucket size 50000000 +(ActorModelRayActor pid=430510) [2025-05-27 23:59:48,906] [INFO] [utils.py:781:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] +(ActorModelRayActor pid=430510) [2025-05-27 23:59:48,907] [INFO] [utils.py:782:see_memory_usage] MA 1.94 GB Max_MA 1.94 GB CA 4.04 GB Max_CA 4 GB +(ActorModelRayActor pid=430510) [2025-05-27 23:59:48,907] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 445.88 GB, percent = 44.3% +(ActorModelRayActor pid=430510) Parameter Offload: Total persistent parameters: 848896 in 368 params +(ActorModelRayActor pid=430510) [2025-05-27 23:59:49,112] [INFO] [utils.py:781:see_memory_usage] DeepSpeedZeRoOffload initialize [end] +(ActorModelRayActor pid=430510) [2025-05-27 23:59:49,113] [INFO] [utils.py:782:see_memory_usage] MA 1.94 GB Max_MA 1.94 GB CA 4.04 GB Max_CA 4 GB +(ActorModelRayActor pid=430510) [2025-05-27 23:59:49,113] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 445.88 GB, percent = 44.3% +(ActorModelRayActor pid=430510) [2025-05-27 23:59:49,289] [INFO] [utils.py:781:see_memory_usage] Before creating fp16 partitions +(ActorModelRayActor pid=430510) [2025-05-27 23:59:49,290] [INFO] [utils.py:782:see_memory_usage] MA 1.94 GB Max_MA 1.94 GB CA 4.04 GB Max_CA 4 GB +(ActorModelRayActor pid=430510) [2025-05-27 23:59:49,291] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 445.88 GB, percent = 44.3% +(ActorModelRayActor pid=430510) [2025-05-27 23:59:51,659] [INFO] [utils.py:781:see_memory_usage] After creating fp16 partitions: 2 +(ActorModelRayActor pid=430510) [2025-05-27 23:59:51,660] [INFO] [utils.py:782:see_memory_usage] MA 1.93 GB Max_MA 1.94 GB CA 1.94 GB Max_CA 4 GB +(ActorModelRayActor pid=430510) [2025-05-27 23:59:51,661] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 449.43 GB, percent = 44.6% +(ActorModelRayActor pid=430510) [2025-05-27 23:59:51,872] [INFO] [utils.py:781:see_memory_usage] Before creating fp32 partitions +(ActorModelRayActor pid=430510) [2025-05-27 23:59:51,873] [INFO] [utils.py:782:see_memory_usage] MA 1.93 GB Max_MA 1.93 GB CA 1.94 GB Max_CA 2 GB +(ActorModelRayActor pid=430510) [2025-05-27 23:59:51,873] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 451.83 GB, percent = 44.9% +(ActorModelRayActor pid=430510) [2025-05-27 23:59:55,698] [INFO] [utils.py:781:see_memory_usage] After creating fp32 partitions +(ActorModelRayActor pid=430510) [2025-05-27 23:59:55,698] [INFO] [utils.py:782:see_memory_usage] MA 1.93 GB Max_MA 1.93 GB CA 1.94 GB Max_CA 2 GB +(ActorModelRayActor pid=430510) [2025-05-27 23:59:55,699] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 478.68 GB, percent = 47.5% +(ActorModelRayActor pid=430510) [2025-05-27 23:59:48,443] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8 [repeated 8x across cluster] +(ActorModelRayActor pid=430510) [2025-05-27 23:59:56,077] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states +(ActorModelRayActor pid=430510) [2025-05-27 23:59:56,078] [INFO] [utils.py:782:see_memory_usage] MA 1.93 GB Max_MA 1.93 GB CA 1.94 GB Max_CA 2 GB +(ActorModelRayActor pid=430510) [2025-05-27 23:59:56,078] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 481.26 GB, percent = 47.8% +(ActorModelRayActor pid=430510) [2025-05-28 00:00:04,652] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states +(ActorModelRayActor pid=430510) [2025-05-28 00:00:04,652] [INFO] [utils.py:782:see_memory_usage] MA 1.93 GB Max_MA 1.93 GB CA 1.94 GB Max_CA 2 GB +(ActorModelRayActor pid=430510) [2025-05-28 00:00:04,654] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 509.36 GB, percent = 50.6% +(ActorModelRayActor pid=430510) [2025-05-28 00:00:04,655] [INFO] [stage3.py:534:_setup_for_real_optimizer] optimizer state initialized +(ActorModelRayActor pid=430510) in preprocess_data None False [repeated 3000x across cluster] +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,264] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,265] [INFO] [utils.py:782:see_memory_usage] MA 2.86 GB Max_MA 4.89 GB CA 5.02 GB Max_CA 5 GB +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,266] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 525.83 GB, percent = 52.2% +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,266] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer_Stage3 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,266] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client LR scheduler +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,266] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,266] [INFO] [logging.py:128:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)] +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,268] [INFO] [config.py:1001:print] DeepSpeedEngine configuration: +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,269] [INFO] [config.py:1005:print] activation_checkpointing_config { +(ActorModelRayActor pid=430510) "partition_activations": false, +(ActorModelRayActor pid=430510) "contiguous_memory_optimization": false, +(ActorModelRayActor pid=430510) "cpu_checkpointing": false, +(ActorModelRayActor pid=430510) "number_checkpoints": null, +(ActorModelRayActor pid=430510) "synchronize_checkpoint_boundary": false, +(ActorModelRayActor pid=430510) "profile": false +(ActorModelRayActor pid=430510) } +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,269] [INFO] [config.py:1005:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'intra_op_parallelism': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,269] [INFO] [config.py:1005:print] amp_enabled .................. False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,269] [INFO] [config.py:1005:print] amp_params ................... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,269] [INFO] [config.py:1005:print] autotuning_config ............ { +(ActorModelRayActor pid=430510) "enabled": false, +(ActorModelRayActor pid=430510) "start_step": null, +(ActorModelRayActor pid=430510) "end_step": null, +(ActorModelRayActor pid=430510) "metric_path": null, +(ActorModelRayActor pid=430510) "arg_mappings": null, +(ActorModelRayActor pid=430510) "metric": "throughput", +(ActorModelRayActor pid=430510) "model_info": null, +(ActorModelRayActor pid=430510) "results_dir": "autotuning_results", +(ActorModelRayActor pid=430510) "exps_dir": "autotuning_exps", +(ActorModelRayActor pid=430510) "overwrite": true, +(ActorModelRayActor pid=430510) "fast": true, +(ActorModelRayActor pid=430510) "start_profile_step": 3, +(ActorModelRayActor pid=430510) "end_profile_step": 5, +(ActorModelRayActor pid=430510) "tuner_type": "gridsearch", +(ActorModelRayActor pid=430510) "tuner_early_stopping": 5, +(ActorModelRayActor pid=430510) "tuner_num_trials": 50, +(ActorModelRayActor pid=430510) "model_info_path": null, +(ActorModelRayActor pid=430510) "mp_size": 1, +(ActorModelRayActor pid=430510) "max_train_batch_size": null, +(ActorModelRayActor pid=430510) "min_train_batch_size": 1, +(ActorModelRayActor pid=430510) "max_train_micro_batch_size_per_gpu": 1.024000e+03, +(ActorModelRayActor pid=430510) "min_train_micro_batch_size_per_gpu": 1, +(ActorModelRayActor pid=430510) "num_tuning_micro_batch_sizes": 3 +(ActorModelRayActor pid=430510) } +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,269] [INFO] [config.py:1005:print] bfloat16_enabled ............. True +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,269] [INFO] [config.py:1005:print] bfloat16_immediate_grad_update False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,269] [INFO] [config.py:1005:print] checkpoint_parallel_write_pipeline False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,269] [INFO] [config.py:1005:print] checkpoint_tag_validation_enabled True +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,269] [INFO] [config.py:1005:print] checkpoint_tag_validation_fail False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,269] [INFO] [config.py:1005:print] comms_config ................. +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,269] [INFO] [config.py:1005:print] communication_data_type ...... None +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] curriculum_enabled_legacy .... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] curriculum_params_legacy ..... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] data_efficiency_enabled ...... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] dataloader_drop_last ......... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] disable_allgather ............ False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] dump_state ................... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] dynamic_loss_scale_args ...... None +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] eigenvalue_enabled ........... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] eigenvalue_gas_boundary_resolution 1 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] eigenvalue_layer_name ........ bert.encoder.layer +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] eigenvalue_layer_num ......... 0 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] eigenvalue_max_iter .......... 100 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] eigenvalue_stability ......... 1e-06 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] eigenvalue_tol ............... 0.01 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] eigenvalue_verbose ........... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] elasticity_enabled ........... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,270] [INFO] [config.py:1005:print] flops_profiler_config ........ { +(ActorModelRayActor pid=430510) "enabled": false, +(ActorModelRayActor pid=430510) "recompute_fwd_factor": 0.0, +(ActorModelRayActor pid=430510) "profile_step": 1, +(ActorModelRayActor pid=430510) "module_depth": -1, +(ActorModelRayActor pid=430510) "top_modules": 1, +(ActorModelRayActor pid=430510) "detailed": true, +(ActorModelRayActor pid=430510) "output_file": null +(ActorModelRayActor pid=430510) } +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] fp16_auto_cast ............... None +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] fp16_enabled ................. False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] fp16_master_weights_and_gradients False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] global_rank .................. 0 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] grad_accum_dtype ............. None +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] gradient_accumulation_steps .. 8 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] gradient_clipping ............ 1.0 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] gradient_predivide_factor .... 1.0 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] graph_harvesting ............. False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] initial_dynamic_scale ........ 1 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] load_universal_checkpoint .... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] loss_scale ................... 1.0 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] memory_breakdown ............. False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] mics_hierarchial_params_gather False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] mics_shard_size .............. -1 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] nebula_config ................ { +(ActorModelRayActor pid=430510) "enabled": false, +(ActorModelRayActor pid=430510) "persistent_storage_path": null, +(ActorModelRayActor pid=430510) "persistent_time_interval": 100, +(ActorModelRayActor pid=430510) "num_of_version_in_retention": 2, +(ActorModelRayActor pid=430510) "enable_nebula_load": true, +(ActorModelRayActor pid=430510) "load_path": null +(ActorModelRayActor pid=430510) } +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] optimizer_legacy_fusion ...... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,271] [INFO] [config.py:1005:print] optimizer_name ............... None +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] optimizer_params ............. None +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] pld_enabled .................. False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] pld_params ................... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] prescale_gradients ........... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] scheduler_name ............... None +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] scheduler_params ............. None +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] seq_parallel_communication_data_type torch.float32 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] sparse_attention ............. None +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] sparse_gradients_enabled ..... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] steps_per_print .............. 100 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] tensor_parallel_config ....... dtype=torch.float16 autotp_size=0 tensor_parallel=TPConfig(tp_size=1, tp_grain_size=1, mpu=None, tp_group=None) injection_policy_tuple=None keep_module_on_host=False replace_with_kernel_inject=False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] timers_config ................ enabled=True synchronized=True +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] train_batch_size ............. 128 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] train_micro_batch_size_per_gpu 2 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] use_data_before_expert_parallel_ False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] use_node_local_storage ....... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] wall_clock_breakdown ......... False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] weight_quantization_config ... None +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] world_size ................... 8 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,272] [INFO] [config.py:1005:print] zero_allow_untested_optimizer False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,273] [INFO] [config.py:1005:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True log_trace_cache_warnings=False +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,273] [INFO] [config.py:1005:print] zero_enabled ................. True +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,273] [INFO] [config.py:1005:print] zero_force_ds_cpu_optimizer .. True +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,273] [INFO] [config.py:1005:print] zero_optimization_stage ...... 3 +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,273] [INFO] [config.py:991:print_user_config] json = { +(ActorModelRayActor pid=430510) "steps_per_print": 100, +(ActorModelRayActor pid=430510) "zero_optimization": { +(ActorModelRayActor pid=430510) "stage": 3, +(ActorModelRayActor pid=430510) "offload_param": { +(ActorModelRayActor pid=430510) "device": "none" +(ActorModelRayActor pid=430510) }, +(ActorModelRayActor pid=430510) "offload_optimizer": { +(ActorModelRayActor pid=430510) "device": "cpu", +(ActorModelRayActor pid=430510) "pin_memory": true +(ActorModelRayActor pid=430510) }, +(ActorModelRayActor pid=430510) "sub_group_size": "auto", +(ActorModelRayActor pid=430510) "stage3_max_live_parameters": "auto", +(ActorModelRayActor pid=430510) "stage3_max_reuse_distance": "auto", +(ActorModelRayActor pid=430510) "stage3_param_persistence_threshold": "auto", +(ActorModelRayActor pid=430510) "stage3_prefetch_bucket_size": "auto", +(ActorModelRayActor pid=430510) "reduce_bucket_size": "auto", +(ActorModelRayActor pid=430510) "zero_hpz_partition_size": 1, +(ActorModelRayActor pid=430510) "zero_quantized_weights": false, +(ActorModelRayActor pid=430510) "zero_quantized_gradients": false, +(ActorModelRayActor pid=430510) "reduce_scatter": true +(ActorModelRayActor pid=430510) }, +(ActorModelRayActor pid=430510) "bf16": { +(ActorModelRayActor pid=430510) "enabled": true +(ActorModelRayActor pid=430510) }, +(ActorModelRayActor pid=430510) "gradient_clipping": 1.0, +(ActorModelRayActor pid=430510) "prescale_gradients": false, +(ActorModelRayActor pid=430510) "wall_clock_breakdown": false, +(ActorModelRayActor pid=430510) "data_types": { +(ActorModelRayActor pid=430510) "grad_accum_dtype": null +(ActorModelRayActor pid=430510) }, +(ActorModelRayActor pid=430510) "checkpoint": { +(ActorModelRayActor pid=430510) "load_universal": false +(ActorModelRayActor pid=430510) }, +(ActorModelRayActor pid=430510) "train_micro_batch_size_per_gpu": 2, +(ActorModelRayActor pid=430510) "train_batch_size": 128 +(ActorModelRayActor pid=430510) } +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,284] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt... +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,313] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt. +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,314] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt... +(ActorModelRayActor pid=430510) [2025-05-28 00:00:07,338] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt. +(ActorModelRayActor pid=431333) [2025-05-28 00:00:33,844] [INFO] [engine.py:3185:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 1 +(ActorModelRayActor pid=431330) [2025-05-28 00:00:11,542] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... [repeated 22x across cluster] +(ActorModelRayActor pid=431333) [2025-05-28 00:00:33,844] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. [repeated 15x across cluster] +(ActorModelRayActor pid=431334) [2025-05-28 00:00:36,557] [INFO] [engine.py:3135:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 4 +(ActorModelRayActor pid=431328) [2025-05-28 00:00:40,539] [INFO] [engine.py:3185:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 3 [repeated 2x across cluster] +(ActorModelRayActor pid=431328) [2025-05-28 00:00:40,538] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. [repeated 2x across cluster] +(ActorModelRayActor pid=431333) [2025-05-28 00:00:36,757] [INFO] [engine.py:3135:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 1 +(ActorModelRayActor pid=431328) [2025-05-28 00:00:43,509] [INFO] [engine.py:3135:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 3 +(ActorModelRayActor pid=431332) [2025-05-28 00:00:44,685] [INFO] [engine.py:3185:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 5 [repeated 5x across cluster] +(ActorModelRayActor pid=431332) [2025-05-28 00:00:44,685] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. [repeated 5x across cluster] +(ActorModelRayActor pid=430510) Loaded the checkpoint: /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor, consumed_samples: 7680 +(ActorModelRayActor pid=430510) wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. +(ActorModelRayActor pid=430510) wandb: Tracking run with wandb version 0.19.8 +(ActorModelRayActor pid=430510) wandb: W&B syncing is set to `offline` in this directory. +(ActorModelRayActor pid=430510) wandb: Run `wandb online` or set WANDB_MODE=online to enable cloud syncing. +(LLMRayActor pid=425166) init_process_group: master_address=10.140.0.151, master_port=6188, rank=1, world_size=9, group_name=openrlhf +(ActorModelRayActor pid=431332) [2025-05-28 00:00:47,458] [INFO] [engine.py:3135:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 5 [repeated 5x across cluster] +(LLMRayActor pid=425159) INFO 05-28 00:00:52 [executor_base.py:219] It took 1.357901 seconds to wake up. +(LLMRayActor pid=425166) update weight: visual.patch_embed.proj.weight, dtype: torch.bfloat16, shape: torch.Size([1280, 3, 2, 14, 14]) +(LLMRayActor pid=425166) update weight: visual.blocks.0.norm1.weight, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.0.norm2.weight, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.0.attn.qkv.weight, dtype: torch.bfloat16, shape: torch.Size([3840, 1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.0.attn.qkv.bias, dtype: torch.bfloat16, shape: torch.Size([3840]) +(LLMRayActor pid=425166) update weight: visual.blocks.0.attn.proj.weight, dtype: torch.bfloat16, shape: torch.Size([1280, 1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.0.attn.proj.bias, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.0.mlp.gate_proj.weight, dtype: torch.bfloat16, shape: torch.Size([3420, 1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.0.mlp.gate_proj.bias, dtype: torch.bfloat16, shape: torch.Size([3420]) +(LLMRayActor pid=425166) update weight: visual.blocks.0.mlp.up_proj.weight, dtype: torch.bfloat16, shape: torch.Size([3420, 1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.0.mlp.up_proj.bias, dtype: torch.bfloat16, shape: torch.Size([3420]) +(LLMRayActor pid=425166) update weight: visual.blocks.0.mlp.down_proj.weight, dtype: torch.bfloat16, shape: torch.Size([1280, 3420]) +(LLMRayActor pid=425166) update weight: visual.blocks.0.mlp.down_proj.bias, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.1.norm1.weight, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.1.norm2.weight, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.1.attn.qkv.weight, dtype: torch.bfloat16, shape: torch.Size([3840, 1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.1.attn.qkv.bias, dtype: torch.bfloat16, shape: torch.Size([3840]) +(LLMRayActor pid=425166) update weight: visual.blocks.1.attn.proj.weight, dtype: torch.bfloat16, shape: torch.Size([1280, 1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.1.attn.proj.bias, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.1.mlp.gate_proj.weight, dtype: torch.bfloat16, shape: torch.Size([3420, 1280]) +(LLMRayActor pid=425166) update weight: visual.blocks.1.mlp.gate_proj.bias, dtype: torch.bfloat16, shape: torch.Size([3420]) +(LLMRayActor pid=425166) update weight: visual.blocks.1.mlp.up_proj.weight, dtype: torch.bfloat16, shape: torch.Size([3420, 1280]) +(LLMRayActor pid=425165) init_process_group: master_address=10.140.0.151, master_port=6188, rank=8, world_size=9, group_name=openrlhf [repeated 7x across cluster] +(LLMRayActor pid=425166) update weight: visual.merger.ln_q.weight, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=425166) update weight: model.embed_tokens.weight, dtype: torch.bfloat16, shape: torch.Size([152064, 3584]) +(LLMRayActor pid=425165) INFO 05-28 00:00:54 [executor_base.py:219] It took 3.127167 seconds to wake up. [repeated 7x across cluster] +(LLMRayActor pid=425165) update weight: visual.patch_embed.proj.weight, dtype: torch.bfloat16, shape: torch.Size([1280, 3, 2, 14, 14]) [repeated 7x across cluster] +(LLMRayActor pid=425166) update weight: model.layers.0.mlp.gate_proj.weight, dtype: torch.bfloat16, shape: torch.Size([18944, 3584]) [repeated 3140x across cluster] +(LLMRayActor pid=425165) update weight: visual.merger.ln_q.weight, dtype: torch.bfloat16, shape: torch.Size([1280]) [repeated 7x across cluster] +(LLMRayActor pid=425165) update weight: model.embed_tokens.weight, dtype: torch.bfloat16, shape: torch.Size([152064, 3584]) [repeated 7x across cluster] +(LLMRayActor pid=425166) update weight: model.layers.10.mlp.gate_proj.weight, dtype: torch.bfloat16, shape: torch.Size([18944, 3584]) [repeated 897x across cluster] +(LLMRayActor pid=425166) update weight: model.layers.21.mlp.down_proj.weight, dtype: torch.bfloat16, shape: torch.Size([3584, 18944]) [repeated 1135x across cluster] +(LLMRayActor pid=425166) update weight: model.norm.weight, dtype: torch.bfloat16, shape: torch.Size([3584]) +(LLMRayActor pid=425166) update weight: lm_head.weight, dtype: torch.bfloat16, shape: torch.Size([152064, 3584]) +(LLMRayActor pid=425166) INFO 05-28 00:01:14 [worker.py:133] Sleep mode freed 38.53 GiB memory, 16.25 GiB memory is still in use. +(LLMRayActor pid=425166) INFO 05-28 00:01:14 [executor_base.py:208] It took 1.393784 seconds to fall asleep. +(LLMRayActor pid=425164) update weight: model.layers.27.post_attention_layernorm.weight, dtype: torch.bfloat16, shape: torch.Size([3584]) [repeated 599x across cluster] +(ActorModelRayActor pid=430510) +Episode [1/2]: 0%| | 0/127 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nWhat is your overall rating of this image’s techniqual quality? Provide a float between 0 and 5, rounded to two decimal places. A rating of 0 represents very poor quality, while 5 represents excellent quality.\nassistant\nThe image showcases a dynamic and creative light painting technique, where the light sources are deliberately moved to capture the shapes and patterns in the photograph. The exposure time is long enough to capture the light trails of the light sources, which create a visually appealing and artistic effect. The image appears slightly grainy, which might be due to the long exposure time, and the sharpness is not high, as the image is intentionally designed to give a sense of motion and light. The composition is balanced, with the light trails forming a coherent visual narrative. However, the image lacks detail in the darkness, which might be due to the focus and resolution limitations of the photography process used here. The colors are vibrant, particularly the warm tones of the light, which add to the visual appeal but reduce the image to a primarily aesthetic portrayal rather than a detailed technical one.\n3.75'] +(ActorModelRayActor pid=431336) ele.get("min_pixels" 3136 [repeated 47x across cluster] +(ActorModelRayActor pid=431336) ele.get("max_pixels" 1254400 [repeated 47x across cluster] +(LLMRayActor pid=425165) INFO 05-28 00:03:11 [worker.py:133] Sleep mode freed 38.49 GiB memory, 21.46 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=425165) INFO 05-28 00:03:11 [executor_base.py:208] It took 1.673811 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=430510) +(ActorModelRayActor pid=430510) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nHow visually aligned is this with the prompt: "half black character, illustration, portrait"?\nPlease evaluate how well the image matches each element of provided prompt.\n\n And answer with the final alignment rating.\nRate it from 0 to 5 (float, 2 decimals). A rating of 0 represents very poor alignment level, while 5 represents excellent alignment level.\'\nassistant\n The image depicts a character with dark skin, which aligns with the element of "half black character." The style is an illustration, and the composition is a portrait. The elements of the prompt are well-represented, but the term "half black character" might be slightly misleading as the image does not provide any indication of the character\'s ethnicity beyond the skin tone. The focus is on the character\'s face and upper body, making it a portrait, but there is no indication of the character being "half" anything. The image is visually engaging and styled appropriately for an illustration.\n3.50'] +(ActorModelRayActor pid=431330) ele.get("min_pixels" 3136 [repeated 347x across cluster] +(ActorModelRayActor pid=431330) ele.get("max_pixels" 1254400 [repeated 347x across cluster] +(LLMRayActor pid=425165) INFO 05-28 00:16:38 [worker.py:133] Sleep mode freed 38.22 GiB memory, 22.00 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=425165) INFO 05-28 00:16:38 [executor_base.py:208] It took 1.715809 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=430510) +(ActorModelRayActor pid=430510) +Train epoch [1/1]: 0%| | 0/128 [00:00: Failed to establish a new connection: [Errno 111] Connection refused + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/requests/adapters.py", line 667, in send + resp = conn.urlopen( + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/urllib3/connectionpool.py", line 841, in urlopen + retries = retries.increment( + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment + raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type] +urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=2989): Max retries exceeded with url: /api/jobs/raysubmit_tdQtQ5pUh6YdmvqQ (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused')) + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/bin/ray", line 8, in + sys.exit(main()) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/ray/scripts/scripts.py", line 2690, in main + return cli() + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/click/core.py", line 1161, in __call__