diff --git "a/logs/20250528_110535/train.log" "b/logs/20250528_110535/train.log" new file mode 100644--- /dev/null +++ "b/logs/20250528_110535/train.log" @@ -0,0 +1,27362 @@ +2025-05-28 11:05:55,258 INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_8afcbc808386ae61.zip. +2025-05-28 11:05:55,259 INFO packaging.py:575 -- Creating a file package for local module '/mnt/petrelfs/luyiting/MultiAgentEval/lmm-r1'. +2025-05-28 11:05:54,313 INFO cli.py:39 -- Job submission server address: http://127.0.0.1:2989 +2025-05-28 11:05:59,914 SUCC cli.py:63 -- ------------------------------------------------------- +2025-05-28 11:05:59,914 SUCC cli.py:64 -- Job 'raysubmit_YfLa8tqbvrqPbydr' submitted successfully +2025-05-28 11:05:59,914 SUCC cli.py:65 -- ------------------------------------------------------- +2025-05-28 11:05:59,914 INFO cli.py:289 -- Next steps +2025-05-28 11:05:59,914 INFO cli.py:290 -- Query the logs of the job: +2025-05-28 11:05:59,914 INFO cli.py:292 -- ray job logs raysubmit_YfLa8tqbvrqPbydr +2025-05-28 11:05:59,914 INFO cli.py:294 -- Query the status of the job: +2025-05-28 11:05:59,914 INFO cli.py:296 -- ray job status raysubmit_YfLa8tqbvrqPbydr +2025-05-28 11:05:59,914 INFO cli.py:298 -- Request the job to be stopped: +2025-05-28 11:05:59,914 INFO cli.py:300 -- ray job stop raysubmit_YfLa8tqbvrqPbydr +2025-05-28 11:05:59,917 INFO cli.py:307 -- Tailing logs until the job exits (disable with --no-wait): +2025-05-28 11:05:59,429 INFO job_manager.py:531 -- Runtime env is setting up. +[2025-05-28 11:06:18,303] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +INFO 05-28 11:06:22 [__init__.py:239] Automatically detected platform cuda. +2025-05-28 11:06:23,452 INFO worker.py:1520 -- Using address 10.140.0.151:6239 set in the environment variable RAY_ADDRESS +2025-05-28 11:06:23,453 INFO worker.py:1660 -- Connecting to existing Ray cluster at address: 10.140.0.151:6239... +2025-05-28 11:06:23,474 INFO worker.py:1843 -- Connected to Ray cluster. View the dashboard at 10.140.0.151:2989  +(pid=279375) INFO 05-28 11:06:42 [__init__.py:239] Automatically detected platform cuda. +(LLMRayActor pid=279375) INFO 05-28 11:07:38 [config.py:585] This model supports multiple tasks: {'reward', 'generate', 'score', 'embed', 'classify'}. Defaulting to 'generate'. +(LLMRayActor pid=279375) WARNING 05-28 11:07:38 [arg_utils.py:1846] VLLM_ATTENTION_BACKEND=triton is not supported by the V1 Engine. Falling back to V0. We recommend to remove VLLM_ATTENTION_BACKEND=triton from your config in favor of the V1 Engine. +(LLMRayActor pid=279375) WARNING 05-28 11:07:38 [arg_utils.py:1745] --enable-prefix-caching is not supported for multimodal models in V0 and has been disabled. +(LLMRayActor pid=279375) INFO 05-28 11:07:38 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.2.dev76+gf68cce8) with config: model='/mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/', speculative_config=None, tokenizer='/mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=44, served_model_name=/mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False, +(pid=279380) INFO 05-28 11:06:42 [__init__.py:239] Automatically detected platform cuda. [repeated 7x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.) +(LLMRayActor pid=279379) INFO 05-28 11:07:38 [config.py:585] This model supports multiple tasks: {'score', 'reward', 'classify', 'generate', 'embed'}. Defaulting to 'generate'. +(LLMRayActor pid=279382) INFO 05-28 11:07:38 [config.py:585] This model supports multiple tasks: {'reward', 'generate', 'embed', 'score', 'classify'}. Defaulting to 'generate'. +(LLMRayActor pid=279381) INFO 05-28 11:07:38 [config.py:585] This model supports multiple tasks: {'generate', 'embed', 'reward', 'classify', 'score'}. Defaulting to 'generate'. +(LLMRayActor pid=279378) INFO 05-28 11:07:38 [config.py:585] This model supports multiple tasks: {'generate', 'reward', 'embed', 'classify', 'score'}. Defaulting to 'generate'. +(LLMRayActor pid=279377) INFO 05-28 11:07:38 [config.py:585] This model supports multiple tasks: {'reward', 'embed', 'generate', 'score', 'classify'}. Defaulting to 'generate'. +(LLMRayActor pid=279374) INFO 05-28 11:07:38 [config.py:585] This model supports multiple tasks: {'classify', 'embed', 'score', 'reward', 'generate'}. Defaulting to 'generate'. +(LLMRayActor pid=279380) INFO 05-28 11:07:38 [config.py:585] This model supports multiple tasks: {'classify', 'score', 'generate', 'embed', 'reward'}. Defaulting to 'generate'. +(LLMRayActor pid=279375) [2025-05-28 11:07:43,846] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +(LLMRayActor pid=279380) WARNING 05-28 11:07:38 [arg_utils.py:1846] VLLM_ATTENTION_BACKEND=triton is not supported by the V1 Engine. Falling back to V0. We recommend to remove VLLM_ATTENTION_BACKEND=triton from your config in favor of the V1 Engine. [repeated 7x across cluster] +(LLMRayActor pid=279380) WARNING 05-28 11:07:38 [arg_utils.py:1745] --enable-prefix-caching is not supported for multimodal models in V0 and has been disabled. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 11:07:38 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.2.dev76+gf68cce8) with config: model='/mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/', speculative_config=None, tokenizer='/mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=47, served_model_name=/mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False,  [repeated 7x across cluster] +(LLMRayActor pid=279375) INFO 05-28 11:07:54 [cuda.py:293] Using Flash Attention backend. +(LLMRayActor pid=279380) [2025-05-28 11:07:43,846] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) [repeated 7x across cluster] +(LLMRayActor pid=279375) INFO 05-28 11:07:58 [parallel_state.py:967] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0 +(LLMRayActor pid=279375) INFO 05-28 11:07:58 [model_runner.py:1110] Starting to load model /mnt/petrelfs/luyiting/ckt/Qwen2.5-VL-7B-Instruct/Qwen2.5-VL-7B-Instruct/... +(LLMRayActor pid=279380) INFO 05-28 11:07:54 [cuda.py:293] Using Flash Attention backend. [repeated 7x across cluster] +(LLMRayActor pid=279375) INFO 05-28 11:07:59 [config.py:3229] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256] is overridden by config [256, 128, 2, 1, 4, 136, 8, 144, 16, 152, 24, 160, 32, 168, 40, 176, 48, 184, 56, 192, 64, 200, 72, 208, 80, 216, 88, 120, 224, 96, 232, 104, 240, 112, 248] +(LLMRayActor pid=279382) +Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,048] [INFO] [config.py:1005:print] communication_data_type ...... None +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,048] [INFO] [config.py:1005:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] curriculum_enabled_legacy .... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] curriculum_params_legacy ..... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] data_efficiency_enabled ...... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] dataloader_drop_last ......... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] disable_allgather ............ False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] dump_state ................... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] dynamic_loss_scale_args ...... None +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] eigenvalue_enabled ........... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] eigenvalue_gas_boundary_resolution 1 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] eigenvalue_layer_name ........ bert.encoder.layer +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] eigenvalue_layer_num ......... 0 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] eigenvalue_max_iter .......... 100 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] eigenvalue_stability ......... 1e-06 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] eigenvalue_tol ............... 0.01 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] eigenvalue_verbose ........... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] elasticity_enabled ........... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] flops_profiler_config ........ { +(ReferenceModelRayActor pid=287373) "enabled": false, +(ReferenceModelRayActor pid=287373) "recompute_fwd_factor": 0.0, +(ReferenceModelRayActor pid=287373) "profile_step": 1, +(ReferenceModelRayActor pid=287373) "module_depth": -1, +(ReferenceModelRayActor pid=287373) "top_modules": 1, +(ReferenceModelRayActor pid=287373) "detailed": true, +(ReferenceModelRayActor pid=287373) "output_file": null +(ReferenceModelRayActor pid=287373) } +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] fp16_auto_cast ............... None +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] fp16_enabled ................. False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] fp16_master_weights_and_gradients False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,049] [INFO] [config.py:1005:print] global_rank .................. 0 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] grad_accum_dtype ............. None +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] gradient_accumulation_steps .. 8 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] gradient_clipping ............ 1.0 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] gradient_predivide_factor .... 1.0 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] graph_harvesting ............. False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] initial_dynamic_scale ........ 1 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] load_universal_checkpoint .... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] loss_scale ................... 1.0 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] memory_breakdown ............. False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] mics_hierarchial_params_gather False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] mics_shard_size .............. -1 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] nebula_config ................ { +(ReferenceModelRayActor pid=287373) "enabled": false, +(ReferenceModelRayActor pid=287373) "persistent_storage_path": null, +(ReferenceModelRayActor pid=287373) "persistent_time_interval": 100, +(ReferenceModelRayActor pid=287373) "num_of_version_in_retention": 2, +(ReferenceModelRayActor pid=287373) "enable_nebula_load": true, +(ReferenceModelRayActor pid=287373) "load_path": null +(ReferenceModelRayActor pid=287373) } +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] optimizer_legacy_fusion ...... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] optimizer_name ............... None +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] optimizer_params ............. None +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] pld_enabled .................. False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] pld_params ................... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] prescale_gradients ........... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,050] [INFO] [config.py:1005:print] scheduler_name ............... None +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] scheduler_params ............. None +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] seq_parallel_communication_data_type torch.float32 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] sparse_attention ............. None +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] sparse_gradients_enabled ..... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] steps_per_print .............. 100 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] tensor_parallel_config ....... dtype=torch.float16 autotp_size=0 tensor_parallel=TPConfig(tp_size=1, tp_grain_size=1, mpu=None, tp_group=None) injection_policy_tuple=None keep_module_on_host=False replace_with_kernel_inject=False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] timers_config ................ enabled=True synchronized=True +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] train_batch_size ............. 128 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] train_micro_batch_size_per_gpu 2 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] use_data_before_expert_parallel_ False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] use_node_local_storage ....... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] wall_clock_breakdown ......... False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] weight_quantization_config ... None +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] world_size ................... 8 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] zero_allow_untested_optimizer False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=True) offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True log_trace_cache_warnings=False +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] zero_enabled ................. True +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] zero_force_ds_cpu_optimizer .. True +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:1005:print] zero_optimization_stage ...... 3 +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:17,051] [INFO] [config.py:991:print_user_config] json = { +(ReferenceModelRayActor pid=287373) "steps_per_print": 100, +(ReferenceModelRayActor pid=287373) "zero_optimization": { +(ReferenceModelRayActor pid=287373) "stage": 3, +(ReferenceModelRayActor pid=287373) "stage3_max_live_parameters": "auto", +(ReferenceModelRayActor pid=287373) "stage3_max_reuse_distance": "auto", +(ReferenceModelRayActor pid=287373) "stage3_param_persistence_threshold": "auto", +(ReferenceModelRayActor pid=287373) "stage3_prefetch_bucket_size": "auto", +(ReferenceModelRayActor pid=287373) "offload_param": { +(ReferenceModelRayActor pid=287373) "device": "none", +(ReferenceModelRayActor pid=287373) "pin_memory": true +(ReferenceModelRayActor pid=287373) } +(ReferenceModelRayActor pid=287373) }, +(ReferenceModelRayActor pid=287373) "bf16": { +(ReferenceModelRayActor pid=287373) "enabled": true +(ReferenceModelRayActor pid=287373) }, +(ReferenceModelRayActor pid=287373) "gradient_clipping": 1.0, +(ReferenceModelRayActor pid=287373) "prescale_gradients": false, +(ReferenceModelRayActor pid=287373) "wall_clock_breakdown": false, +(ReferenceModelRayActor pid=287373) "train_micro_batch_size_per_gpu": 2, +(ReferenceModelRayActor pid=287373) "train_batch_size": 128 +(ReferenceModelRayActor pid=287373) } +(ActorModelRayActor pid=286523) loaded /mnt/petrelfs/luyiting/MultiAgentEval/data_process_v1/train_ava_mini_evalmuse_koniq_llavastyle_openrlhf_merged.jsonl with data_files=/mnt/petrelfs/luyiting/MultiAgentEval/data_process_v1/train_ava_mini_evalmuse_koniq_llavastyle_openrlhf_merged.jsonl +(ActorModelRayActor pid=286523) [Dataset({ +(ActorModelRayActor pid=286523) features: ['message', 'answer'], +(ActorModelRayActor pid=286523) num_rows: 24000 +(ActorModelRayActor pid=286523) })] +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data +(ActorModelRayActor pid=287378) None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None +(ActorModelRayActor pid=287378) False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None +(ActorModelRayActor pid=287378) False +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data None +(ActorModelRayActor pid=287378) False +(ActorModelRayActor pid=287378) +(ActorModelRayActor pid=287378) in preprocess_data None False +(ActorModelRayActor pid=287378) in preprocess_data +(ActorModelRayActor pid=287378) None +(ActorModelRayActor pid=287378) False +(ActorModelRayActor pid=287378) in preprocess_data +(ActorModelRayActor pid=287378) None False +(ActorModelRayActor pid=287378) in preprocess_data +(ActorModelRayActor pid=287378) +(ActorModelRayActor pid=287378) None False +(ActorModelRayActor pid=287378) in preprocess_data +(ActorModelRayActor pid=287378) None +(ActorModelRayActor pid=287378) False +(ActorModelRayActor pid=287378) in preprocess_data +(ActorModelRayActor pid=287378) None +(ActorModelRayActor pid=287378) False +(ActorModelRayActor pid=287378) in preprocess_data +(ActorModelRayActor pid=287378) None +(ActorModelRayActor pid=287378) False +(ActorModelRayActor pid=287378) in preprocess_data +(ActorModelRayActor pid=287378) +(ActorModelRayActor pid=287378) None +(ActorModelRayActor pid=287378) False +(ActorModelRayActor pid=287378) +(ActorModelRayActor pid=287378) in preprocess_data +(ReferenceModelRayActor pid=287373) [2025-05-28 11:11:16,616] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8 [repeated 6x across cluster] +(ActorModelRayActor pid=287378) None False +(ActorModelRayActor pid=286523) +Preprocessing data: 0%| | 0/24000 [00:00 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:21,521] [INFO] [logging.py:128:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:21,521] [INFO] [logging.py:128:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer +(ActorModelRayActor pid=286523) [2025-05-28 11:11:21,736] [INFO] [utils.py:781:see_memory_usage] Stage 3 initialize beginning +(ActorModelRayActor pid=286523) [2025-05-28 11:11:21,737] [INFO] [utils.py:782:see_memory_usage] MA 1.94 GB Max_MA 3.98 GB CA 4.04 GB Max_CA 4 GB +(ActorModelRayActor pid=286523) [2025-05-28 11:11:21,738] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 445.79 GB, percent = 44.3% +(ActorModelRayActor pid=286523) [2025-05-28 11:11:21,740] [INFO] [stage3.py:170:__init__] Reduce bucket size 500000000 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:21,740] [INFO] [stage3.py:171:__init__] Prefetch bucket size 50000000 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:21,917] [INFO] [utils.py:781:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] +(ActorModelRayActor pid=286523) [2025-05-28 11:11:21,917] [INFO] [utils.py:782:see_memory_usage] MA 1.94 GB Max_MA 1.94 GB CA 4.04 GB Max_CA 4 GB +(ActorModelRayActor pid=286523) [2025-05-28 11:11:21,918] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 445.79 GB, percent = 44.3% +(ActorModelRayActor pid=286523) Parameter Offload: Total persistent parameters: 848896 in 368 params +(ActorModelRayActor pid=286523) [2025-05-28 11:11:22,137] [INFO] [utils.py:781:see_memory_usage] DeepSpeedZeRoOffload initialize [end] +(ActorModelRayActor pid=286523) [2025-05-28 11:11:22,138] [INFO] [utils.py:782:see_memory_usage] MA 1.94 GB Max_MA 1.94 GB CA 4.04 GB Max_CA 4 GB +(ActorModelRayActor pid=286523) [2025-05-28 11:11:22,138] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 445.79 GB, percent = 44.3% +(ActorModelRayActor pid=286523) [2025-05-28 11:11:22,320] [INFO] [utils.py:781:see_memory_usage] Before creating fp16 partitions +(ActorModelRayActor pid=286523) [2025-05-28 11:11:22,321] [INFO] [utils.py:782:see_memory_usage] MA 1.94 GB Max_MA 1.94 GB CA 4.04 GB Max_CA 4 GB +(ActorModelRayActor pid=286523) [2025-05-28 11:11:22,321] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 445.8 GB, percent = 44.3% +(ActorModelRayActor pid=286523) [2025-05-28 11:11:24,642] [INFO] [utils.py:781:see_memory_usage] After creating fp16 partitions: 2 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:24,643] [INFO] [utils.py:782:see_memory_usage] MA 1.93 GB Max_MA 1.94 GB CA 1.94 GB Max_CA 4 GB +(ActorModelRayActor pid=286523) [2025-05-28 11:11:24,644] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 448.65 GB, percent = 44.5% +(ActorModelRayActor pid=286523) [2025-05-28 11:11:24,846] [INFO] [utils.py:781:see_memory_usage] Before creating fp32 partitions +(ActorModelRayActor pid=286523) [2025-05-28 11:11:24,847] [INFO] [utils.py:782:see_memory_usage] MA 1.93 GB Max_MA 1.93 GB CA 1.94 GB Max_CA 2 GB +(ActorModelRayActor pid=286523) [2025-05-28 11:11:24,848] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 451.4 GB, percent = 44.8% +(ActorModelRayActor pid=286523) [2025-05-28 11:11:28,581] [INFO] [utils.py:781:see_memory_usage] After creating fp32 partitions +(ActorModelRayActor pid=286523) [2025-05-28 11:11:28,582] [INFO] [utils.py:782:see_memory_usage] MA 1.93 GB Max_MA 1.93 GB CA 1.94 GB Max_CA 2 GB +(ActorModelRayActor pid=286523) [2025-05-28 11:11:28,583] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 476.75 GB, percent = 47.3% +(ActorModelRayActor pid=286523) [2025-05-28 11:11:28,807] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states +(ActorModelRayActor pid=286523) [2025-05-28 11:11:28,807] [INFO] [utils.py:782:see_memory_usage] MA 1.93 GB Max_MA 1.93 GB CA 1.94 GB Max_CA 2 GB +(ActorModelRayActor pid=286523) [2025-05-28 11:11:28,808] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 481.13 GB, percent = 47.8% +(ActorModelRayActor pid=286523) in preprocess_data None False [repeated 191874x across cluster] +(ActorModelRayActor pid=286523) [2025-05-28 11:11:21,465] [INFO] [config.py:734:__init__] Config mesh_device None world_size = 8 [repeated 8x across cluster] +(ActorModelRayActor pid=286523) [2025-05-28 11:11:36,537] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states +(ActorModelRayActor pid=286523) [2025-05-28 11:11:36,537] [INFO] [utils.py:782:see_memory_usage] MA 1.93 GB Max_MA 1.93 GB CA 1.94 GB Max_CA 2 GB +(ActorModelRayActor pid=286523) [2025-05-28 11:11:36,538] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 511.42 GB, percent = 50.8% +(ActorModelRayActor pid=286523) [2025-05-28 11:11:36,579] [INFO] [stage3.py:534:_setup_for_real_optimizer] optimizer state initialized +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,325] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,326] [INFO] [utils.py:782:see_memory_usage] MA 2.86 GB Max_MA 4.89 GB CA 5.02 GB Max_CA 5 GB +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,327] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 525.92 GB, percent = 52.2% +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,327] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer_Stage3 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,327] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client LR scheduler +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,327] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,327] [INFO] [logging.py:128:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)] +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,329] [INFO] [config.py:1001:print] DeepSpeedEngine configuration: +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,329] [INFO] [config.py:1005:print] activation_checkpointing_config { +(ActorModelRayActor pid=286523) "partition_activations": false, +(ActorModelRayActor pid=286523) "contiguous_memory_optimization": false, +(ActorModelRayActor pid=286523) "cpu_checkpointing": false, +(ActorModelRayActor pid=286523) "number_checkpoints": null, +(ActorModelRayActor pid=286523) "synchronize_checkpoint_boundary": false, +(ActorModelRayActor pid=286523) "profile": false +(ActorModelRayActor pid=286523) } +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,329] [INFO] [config.py:1005:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'intra_op_parallelism': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] amp_enabled .................. False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] amp_params ................... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] autotuning_config ............ { +(ActorModelRayActor pid=286523) "enabled": false, +(ActorModelRayActor pid=286523) "start_step": null, +(ActorModelRayActor pid=286523) "end_step": null, +(ActorModelRayActor pid=286523) "metric_path": null, +(ActorModelRayActor pid=286523) "arg_mappings": null, +(ActorModelRayActor pid=286523) "metric": "throughput", +(ActorModelRayActor pid=286523) "model_info": null, +(ActorModelRayActor pid=286523) "results_dir": "autotuning_results", +(ActorModelRayActor pid=286523) "exps_dir": "autotuning_exps", +(ActorModelRayActor pid=286523) "overwrite": true, +(ActorModelRayActor pid=286523) "fast": true, +(ActorModelRayActor pid=286523) "start_profile_step": 3, +(ActorModelRayActor pid=286523) "end_profile_step": 5, +(ActorModelRayActor pid=286523) "tuner_type": "gridsearch", +(ActorModelRayActor pid=286523) "tuner_early_stopping": 5, +(ActorModelRayActor pid=286523) "tuner_num_trials": 50, +(ActorModelRayActor pid=286523) "model_info_path": null, +(ActorModelRayActor pid=286523) "mp_size": 1, +(ActorModelRayActor pid=286523) "max_train_batch_size": null, +(ActorModelRayActor pid=286523) "min_train_batch_size": 1, +(ActorModelRayActor pid=286523) "max_train_micro_batch_size_per_gpu": 1.024000e+03, +(ActorModelRayActor pid=286523) "min_train_micro_batch_size_per_gpu": 1, +(ActorModelRayActor pid=286523) "num_tuning_micro_batch_sizes": 3 +(ActorModelRayActor pid=286523) } +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] bfloat16_enabled ............. True +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] bfloat16_immediate_grad_update False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] checkpoint_parallel_write_pipeline False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] checkpoint_tag_validation_enabled True +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] checkpoint_tag_validation_fail False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] comms_config ................. +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] communication_data_type ...... None +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] curriculum_enabled_legacy .... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] curriculum_params_legacy ..... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] data_efficiency_enabled ...... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] dataloader_drop_last ......... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] disable_allgather ............ False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] dump_state ................... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] dynamic_loss_scale_args ...... None +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,330] [INFO] [config.py:1005:print] eigenvalue_enabled ........... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] eigenvalue_gas_boundary_resolution 1 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] eigenvalue_layer_name ........ bert.encoder.layer +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] eigenvalue_layer_num ......... 0 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] eigenvalue_max_iter .......... 100 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] eigenvalue_stability ......... 1e-06 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] eigenvalue_tol ............... 0.01 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] eigenvalue_verbose ........... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] elasticity_enabled ........... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] flops_profiler_config ........ { +(ActorModelRayActor pid=286523) "enabled": false, +(ActorModelRayActor pid=286523) "recompute_fwd_factor": 0.0, +(ActorModelRayActor pid=286523) "profile_step": 1, +(ActorModelRayActor pid=286523) "module_depth": -1, +(ActorModelRayActor pid=286523) "top_modules": 1, +(ActorModelRayActor pid=286523) "detailed": true, +(ActorModelRayActor pid=286523) "output_file": null +(ActorModelRayActor pid=286523) } +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] fp16_auto_cast ............... None +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] fp16_enabled ................. False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] fp16_master_weights_and_gradients False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] global_rank .................. 0 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] grad_accum_dtype ............. None +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] gradient_accumulation_steps .. 8 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] gradient_clipping ............ 1.0 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] gradient_predivide_factor .... 1.0 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] graph_harvesting ............. False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] initial_dynamic_scale ........ 1 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] load_universal_checkpoint .... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,331] [INFO] [config.py:1005:print] loss_scale ................... 1.0 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] memory_breakdown ............. False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] mics_hierarchial_params_gather False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] mics_shard_size .............. -1 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] nebula_config ................ { +(ActorModelRayActor pid=286523) "enabled": false, +(ActorModelRayActor pid=286523) "persistent_storage_path": null, +(ActorModelRayActor pid=286523) "persistent_time_interval": 100, +(ActorModelRayActor pid=286523) "num_of_version_in_retention": 2, +(ActorModelRayActor pid=286523) "enable_nebula_load": true, +(ActorModelRayActor pid=286523) "load_path": null +(ActorModelRayActor pid=286523) } +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] optimizer_legacy_fusion ...... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] optimizer_name ............... None +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] optimizer_params ............. None +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] pld_enabled .................. False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] pld_params ................... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] prescale_gradients ........... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] scheduler_name ............... None +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] scheduler_params ............. None +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] seq_parallel_communication_data_type torch.float32 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] sparse_attention ............. None +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] sparse_gradients_enabled ..... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] steps_per_print .............. 100 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] tensor_parallel_config ....... dtype=torch.float16 autotp_size=0 tensor_parallel=TPConfig(tp_size=1, tp_grain_size=1, mpu=None, tp_group=None) injection_policy_tuple=None keep_module_on_host=False replace_with_kernel_inject=False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] timers_config ................ enabled=True synchronized=True +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,332] [INFO] [config.py:1005:print] train_batch_size ............. 128 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,333] [INFO] [config.py:1005:print] train_micro_batch_size_per_gpu 2 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,333] [INFO] [config.py:1005:print] use_data_before_expert_parallel_ False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,333] [INFO] [config.py:1005:print] use_node_local_storage ....... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,333] [INFO] [config.py:1005:print] wall_clock_breakdown ......... False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,333] [INFO] [config.py:1005:print] weight_quantization_config ... None +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,333] [INFO] [config.py:1005:print] world_size ................... 8 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,333] [INFO] [config.py:1005:print] zero_allow_untested_optimizer False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,333] [INFO] [config.py:1005:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True log_trace_cache_warnings=False +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,333] [INFO] [config.py:1005:print] zero_enabled ................. True +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,333] [INFO] [config.py:1005:print] zero_force_ds_cpu_optimizer .. True +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,333] [INFO] [config.py:1005:print] zero_optimization_stage ...... 3 +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,333] [INFO] [config.py:991:print_user_config] json = { +(ActorModelRayActor pid=286523) "steps_per_print": 100, +(ActorModelRayActor pid=286523) "zero_optimization": { +(ActorModelRayActor pid=286523) "stage": 3, +(ActorModelRayActor pid=286523) "offload_param": { +(ActorModelRayActor pid=286523) "device": "none" +(ActorModelRayActor pid=286523) }, +(ActorModelRayActor pid=286523) "offload_optimizer": { +(ActorModelRayActor pid=286523) "device": "cpu", +(ActorModelRayActor pid=286523) "pin_memory": true +(ActorModelRayActor pid=286523) }, +(ActorModelRayActor pid=286523) "sub_group_size": "auto", +(ActorModelRayActor pid=286523) "stage3_max_live_parameters": "auto", +(ActorModelRayActor pid=286523) "stage3_max_reuse_distance": "auto", +(ActorModelRayActor pid=286523) "stage3_param_persistence_threshold": "auto", +(ActorModelRayActor pid=286523) "stage3_prefetch_bucket_size": "auto", +(ActorModelRayActor pid=286523) "reduce_bucket_size": "auto", +(ActorModelRayActor pid=286523) "zero_hpz_partition_size": 1, +(ActorModelRayActor pid=286523) "zero_quantized_weights": false, +(ActorModelRayActor pid=286523) "zero_quantized_gradients": false, +(ActorModelRayActor pid=286523) "reduce_scatter": true +(ActorModelRayActor pid=286523) }, +(ActorModelRayActor pid=286523) "bf16": { +(ActorModelRayActor pid=286523) "enabled": true +(ActorModelRayActor pid=286523) }, +(ActorModelRayActor pid=286523) "gradient_clipping": 1.0, +(ActorModelRayActor pid=286523) "prescale_gradients": false, +(ActorModelRayActor pid=286523) "wall_clock_breakdown": false, +(ActorModelRayActor pid=286523) "data_types": { +(ActorModelRayActor pid=286523) "grad_accum_dtype": null +(ActorModelRayActor pid=286523) }, +(ActorModelRayActor pid=286523) "checkpoint": { +(ActorModelRayActor pid=286523) "load_universal": false +(ActorModelRayActor pid=286523) }, +(ActorModelRayActor pid=286523) "train_micro_batch_size_per_gpu": 2, +(ActorModelRayActor pid=286523) "train_batch_size": 128 +(ActorModelRayActor pid=286523) } +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,346] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt... +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,373] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt. +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,374] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt... +(ActorModelRayActor pid=286523) [2025-05-28 11:11:39,398] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/zero_pp_rank_0_mp_rank_00_model_states.pt. +(ActorModelRayActor pid=286523) [2025-05-28 11:12:08,056] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... [repeated 21x across cluster] +(ActorModelRayActor pid=287374) [2025-05-28 11:11:39,141] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/zero_pp_rank_6_mp_rank_00_model_states.pt. [repeated 14x across cluster] +(ActorModelRayActor pid=287372) [2025-05-28 11:12:22,493] [INFO] [engine.py:3185:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 3 +(ActorModelRayActor pid=287376) [2025-05-28 11:12:08,056] [INFO] [torch_checkpoint_engine.py:27:load] [Torch] Loading checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... +(ActorModelRayActor pid=287372) [2025-05-28 11:12:22,493] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. +(ActorModelRayActor pid=287374) [2025-05-28 11:12:22,450] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. +(ActorModelRayActor pid=287372) [2025-05-28 11:12:25,593] [INFO] [engine.py:3135:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 3 +(ActorModelRayActor pid=287377) [2025-05-28 11:12:25,372] [INFO] [engine.py:3185:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 7 [repeated 5x across cluster] +(ActorModelRayActor pid=287377) [2025-05-28 11:12:25,372] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. [repeated 4x across cluster] +(ActorModelRayActor pid=287377) [2025-05-28 11:12:28,489] [INFO] [engine.py:3135:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 7 [repeated 5x across cluster] +(ActorModelRayActor pid=287376) [2025-05-28 11:12:41,250] [INFO] [engine.py:3185:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 4 +(ActorModelRayActor pid=287376) [2025-05-28 11:12:41,250] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. +(ActorModelRayActor pid=286523) [2025-05-28 11:12:41,929] [INFO] [torch_checkpoint_engine.py:29:load] [Torch] Loaded checkpoint from /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor/global_step60/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +(ActorModelRayActor pid=286523) [2025-05-28 11:12:41,930] [INFO] [engine.py:3185:_get_all_zero_checkpoint_state_dicts] successfully read 8 ZeRO state_dicts for rank 0 +(ActorModelRayActor pid=286523) Loaded the checkpoint: /mnt/petrelfs/luyiting/MultiAgentEval/checkpoints_lmmr1//lmm-r1-ava-evalmuse-koniq-grpo-score-7B/ckpt/_actor, consumed_samples: 7680 +(ActorModelRayActor pid=286523) wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. +(ActorModelRayActor pid=286523) wandb: Tracking run with wandb version 0.19.8 +(ActorModelRayActor pid=286523) wandb: W&B syncing is set to `offline` in this directory. +(ActorModelRayActor pid=286523) wandb: Run `wandb online` or set WANDB_MODE=online to enable cloud syncing. +(LLMRayActor pid=279375) init_process_group: master_address=10.140.0.151, master_port=28092, rank=3, world_size=9, group_name=openrlhf +(ActorModelRayActor pid=286523) [2025-05-28 11:12:44,920] [INFO] [engine.py:3135:_load_zero_checkpoint] loading 8 zero partition checkpoints for rank 0 [repeated 2x across cluster] +(LLMRayActor pid=279375) INFO 05-28 11:12:50 [executor_base.py:219] It took 1.475051 seconds to wake up. +(LLMRayActor pid=279375) update weight: visual.patch_embed.proj.weight, dtype: torch.bfloat16, shape: torch.Size([1280, 3, 2, 14, 14]) +(LLMRayActor pid=279375) update weight: visual.blocks.0.norm1.weight, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=279375) update weight: visual.blocks.0.norm2.weight, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=279375) update weight: visual.blocks.0.attn.qkv.weight, dtype: torch.bfloat16, shape: torch.Size([3840, 1280]) +(LLMRayActor pid=279375) update weight: visual.blocks.0.attn.qkv.bias, dtype: torch.bfloat16, shape: torch.Size([3840]) +(LLMRayActor pid=279375) update weight: visual.blocks.0.attn.proj.weight, dtype: torch.bfloat16, shape: torch.Size([1280, 1280]) +(LLMRayActor pid=279375) update weight: visual.blocks.0.attn.proj.bias, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=279375) update weight: visual.blocks.0.mlp.gate_proj.weight, dtype: torch.bfloat16, shape: torch.Size([3420, 1280]) +(LLMRayActor pid=279375) update weight: visual.blocks.0.mlp.gate_proj.bias, dtype: torch.bfloat16, shape: torch.Size([3420]) +(LLMRayActor pid=279375) update weight: visual.blocks.0.mlp.up_proj.weight, dtype: torch.bfloat16, shape: torch.Size([3420, 1280]) +(LLMRayActor pid=279375) update weight: visual.blocks.0.mlp.up_proj.bias, dtype: torch.bfloat16, shape: torch.Size([3420]) +(LLMRayActor pid=279375) update weight: visual.blocks.0.mlp.down_proj.weight, dtype: torch.bfloat16, shape: torch.Size([1280, 3420]) +(LLMRayActor pid=279375) update weight: visual.blocks.0.mlp.down_proj.bias, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=279375) update weight: visual.blocks.1.norm1.weight, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=279375) update weight: visual.blocks.1.norm2.weight, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=279380) init_process_group: master_address=10.140.0.151, master_port=28092, rank=6, world_size=9, group_name=openrlhf [repeated 7x across cluster] +(LLMRayActor pid=279375) update weight: visual.merger.ln_q.weight, dtype: torch.bfloat16, shape: torch.Size([1280]) +(LLMRayActor pid=279375) update weight: model.embed_tokens.weight, dtype: torch.bfloat16, shape: torch.Size([152064, 3584]) +(LLMRayActor pid=279380) INFO 05-28 11:12:52 [executor_base.py:219] It took 3.291262 seconds to wake up. [repeated 7x across cluster] +(LLMRayActor pid=279380) update weight: visual.patch_embed.proj.weight, dtype: torch.bfloat16, shape: torch.Size([1280, 3, 2, 14, 14]) [repeated 7x across cluster] +(LLMRayActor pid=279375) update weight: model.layers.0.mlp.gate_proj.weight, dtype: torch.bfloat16, shape: torch.Size([18944, 3584]) [repeated 3098x across cluster] +(LLMRayActor pid=279380) update weight: visual.merger.ln_q.weight, dtype: torch.bfloat16, shape: torch.Size([1280]) [repeated 7x across cluster] +(LLMRayActor pid=279380) update weight: model.embed_tokens.weight, dtype: torch.bfloat16, shape: torch.Size([152064, 3584]) [repeated 7x across cluster] +(LLMRayActor pid=279375) update weight: model.layers.8.mlp.down_proj.weight, dtype: torch.bfloat16, shape: torch.Size([3584, 18944]) [repeated 826x across cluster] +(LLMRayActor pid=279375) update weight: model.layers.17.mlp.gate_proj.weight, dtype: torch.bfloat16, shape: torch.Size([18944, 3584]) [repeated 792x across cluster] +(LLMRayActor pid=279375) update weight: model.layers.26.mlp.gate_proj.weight, dtype: torch.bfloat16, shape: torch.Size([18944, 3584]) [repeated 859x across cluster] +(LLMRayActor pid=279375) update weight: model.norm.weight, dtype: torch.bfloat16, shape: torch.Size([3584]) +(LLMRayActor pid=279375) update weight: lm_head.weight, dtype: torch.bfloat16, shape: torch.Size([152064, 3584]) +(LLMRayActor pid=279375) INFO 05-28 11:13:17 [worker.py:133] Sleep mode freed 38.53 GiB memory, 17.54 GiB memory is still in use. +(LLMRayActor pid=279375) INFO 05-28 11:13:17 [executor_base.py:208] It took 1.477016 seconds to fall asleep. +(ActorModelRayActor pid=286523) +Episode [1/2]: 0%| | 0/127 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nWhat is your overall rating of this image’s techniqual quality? Provide a float between 0 and 5, rounded to two decimal places. A rating of 0 represents very poor quality, while 5 represents excellent quality.\nassistant\nThe image showcases a dynamic and creative light painting technique, where the light sources are deliberately moved to capture the shapes and patterns in the photograph. The exposure time is long enough to capture the light trails of the light sources, which create a visually appealing and artistic effect. The image appears slightly grainy, which might be due to the long exposure time, and the sharpness is not high, as the image is intentionally designed to give a sense of motion and light. The composition is balanced, with the light trails forming a coherent visual narrative. However, the image lacks detail in the darkness, which might be due to the focus and resolution limitations of the photography process used here. The colors are vibrant, particularly the warm tones of the light, which add to the visual appeal but reduce the image to a primarily aesthetic portrayal rather than a detailed technical one.\n3.75'] +(ActorModelRayActor pid=287374) ele.get("min_pixels" 3136 [repeated 48x across cluster] +(ActorModelRayActor pid=287374) ele.get("max_pixels" 1254400 [repeated 48x across cluster] +(LLMRayActor pid=279380) INFO 05-28 11:15:21 [worker.py:133] Sleep mode freed 38.53 GiB memory, 21.62 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 11:15:21 [executor_base.py:208] It took 1.653984 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nHow visually aligned is this with the prompt: "half black character, illustration, portrait"?\nPlease evaluate how well the image matches each element of provided prompt.\n\n And answer with the final alignment rating.\nRate it from 0 to 5 (float, 2 decimals). A rating of 0 represents very poor alignment level, while 5 represents excellent alignment level.\'\nassistant\n The image depicts a character with dark skin, which aligns with the element of "half black character." The style is an illustration, and the composition is a portrait. The elements of the prompt are well-represented, but the term "half black character" might be slightly misleading if the image implies a full representation of a black individual. However, the overall alignment with the provided prompt is quite close.\n4.00'] +(LLMRayActor pid=279380) INFO 05-28 11:28:02 [worker.py:133] Sleep mode freed 38.53 GiB memory, 21.27 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 11:28:02 [executor_base.py:208] It took 1.666023 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287377) ele.get("min_pixels" 3136 [repeated 4x across cluster] +(ActorModelRayActor pid=287377) ele.get("max_pixels" 1254400 [repeated 4x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nPlease rate the aesthetics of this picture on a scale from 0 to 5, as a float rounded to two decimal places. A rating of 0 represents very poor aesthetic quality, while 5 represents excellent aesthetic quality.\nassistant\nThe image showcases a structure with classical architectural elements, including columns and arched windows, which suggest it may be a significant or governmental building. The dome, particularly the golden underside, adds a sense of grandeur and historical significance to the structure. The lighting in the image is somewhat artificial, possibly due to the time of day, as it does not appear to be direct sunlight, and this might give the image a slightly artificial look. The composition is vertical, focusing on the height of the structure, but the image is not particularly dynamic due to the lack of movement or action. The tree on the right adds a natural element to the image, but it is somewhat overwhelming and directs the viewer's focus away from the building. The image does not seem to have a strong or clear message or narrative, which can make it feel somewhat flat or static in terms of storytelling or visual communication.\n3.00"] +(ActorModelRayActor pid=287376) ele.get("min_pixels" 3136 [repeated 267x across cluster] +(ActorModelRayActor pid=287376) ele.get("max_pixels" 1254400 [repeated 267x across cluster] +(LLMRayActor pid=279380) INFO 05-28 11:39:58 [worker.py:133] Sleep mode freed 38.19 GiB memory, 22.30 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 11:39:58 [executor_base.py:208] It took 1.701456 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nGive an overall aesthetic rating for this image, between 0 and 5, rounded to two decimal points. A rating of 0 represents very poor aesthetic quality, while 5 represents excellent aesthetic quality.\nassistant\nThe image showcases a cruise ship in the background, with a foreground featuring a bollard and a chain, suggesting a dock or pier setting. The composition is balanced, with the ship centered in the image, providing a clear subject. The lighting appears natural, indicating the photo was taken during the day, possibly with some level of overcast sky given the soft shadows. The colors are vivid, with the deep blue of the water and the contrasting tones of the ship against the soft orange platform. However, the image seems a bit flat and lacks the dynamic energy that might come with more action or a sense of movement. The composition is straightforward, not particularly evocative or artistic, which may contribute to a slightly flat aesthetic. The image might be improved with a stronger composition or a focus on a more dynamic element.\n3.00'] +(LLMRayActor pid=279380) INFO 05-28 11:51:53 [worker.py:133] Sleep mode freed 38.53 GiB memory, 21.69 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 11:51:53 [executor_base.py:208] It took 1.656377 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287377) ele.get("min_pixels" 3136 [repeated 20x across cluster] +(ActorModelRayActor pid=287377) ele.get("max_pixels" 1254400 [repeated 20x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nHow faithful is the image to this prompt: "A large quantity Segway and 3 Lipstick"?\nPlease evaluate how well the image matches each element of provided prompt.\n\n And answer with the final alignment rating.\nRate it from 0 to 5 (float, 2 decimals). A rating of 0 represents very poor alignment level, while 5 represents excellent alignment level.\'\nassistant\n The image shows a total of six scooter-like vehicles, which do not appear to be Segways but could be interpreted as electric scooters. However, the prompt only asks for a "large quantity" of Segway, which is not clear from the image as it does not define Segway specifically. Regarding the lipsticks, there are indeed three, which matches the prompt. However, the number of lipsticks is not in a "large quantity" as defined in the prompt.\n3.00'] +(LLMRayActor pid=279380) INFO 05-28 12:03:41 [worker.py:133] Sleep mode freed 38.53 GiB memory, 21.39 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 12:03:41 [executor_base.py:208] It took 1.692146 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287372) ele.get("min_pixels" 3136 [repeated 32x across cluster] +(ActorModelRayActor pid=287372) ele.get("max_pixels" 1254400 [repeated 32x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nGive an overall aesthetic rating for this image, between 0 and 5, rounded to two decimal points. A rating of 0 represents very poor aesthetic quality, while 5 represents excellent aesthetic quality.\nassistant\nThe image presents a serene and natural scene, likely of a coastal area with shallow water. The color palette is dominated by blues and browns, which are harmonious and give the image a calm, peaceful feeling. The composition is straightforward, with no apparent focal point or strong visual hierarchy. The image lacks any human presence or activity, which might give it a feeling of tranquility and solitude. However, the scene could be considered somewhat monotone and lacks the dynamic nature that might make the image more engaging or interesting. The lighting is even, without any harsh shadows or highlights, which keeps the image soft and mild. Overall, the image has a pleasing aesthetic, but it might not be particularly striking or memorable due to its simplicity and lack of contrast or detail.\n2.50'] +(ActorModelRayActor pid=287375) ele.get("min_pixels" 3136 [repeated 48x across cluster] +(ActorModelRayActor pid=287375) ele.get("max_pixels" 1254400 [repeated 48x across cluster] +(LLMRayActor pid=279380) INFO 05-28 12:15:49 [worker.py:133] Sleep mode freed 38.53 GiB memory, 21.22 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 12:15:49 [executor_base.py:208] It took 1.653215 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nEvaluate the visual aesthetics of this photo using a float score from 0.00 to 5.00. A rating of 0 represents very poor aesthetic quality, while 5 represents excellent aesthetic quality.\nassistant\nThe image showcases a male mallard duck in flight, with its wings spread and a vibrant mix of colors, particularly the presence of its metallic green head and chest. The lighting highlights the duck's features against a soft-focused background of autumn foliage, creating a depth of field that draws attention to the subject. The composition is well-balanced, with the duck positioned centrally and the natural backdrop contributing to a serene and naturalistic atmosphere. The photo captures a moment of the duck's life, which adds a dynamic and lively element to the image. The color contrast between the duck and the greenery is pleasing to the eye, enhancing the visual appeal of the photograph.\n3.25"] +(ActorModelRayActor pid=286523) ele.get("min_pixels" 3136 [repeated 8x across cluster] +(ActorModelRayActor pid=286523) ele.get("max_pixels" 1254400 [repeated 8x across cluster] +(LLMRayActor pid=279380) INFO 05-28 12:27:49 [worker.py:133] Sleep mode freed 38.53 GiB memory, 21.63 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 12:27:49 [executor_base.py:208] It took 1.618947 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nGive an overall aesthetic rating for this image, between 0 and 5, rounded to two decimal points. A rating of 0 represents very poor aesthetic quality, while 5 represents excellent aesthetic quality.\nassistant\nThe image depicts what appears to be a close-up of a handwashing sink, with visible water droplets and reflections on the wet surface. The composition is somewhat abstract due to the distorted and angular perspective, which might be intentional for artistic effect. The use of light and shadow adds depth to the image, creating a sense of texture and movement. However, the image lacks a clear subject or narrative, and the abstract nature may make it difficult for the viewer to immediately understand the intended message. The color palette is limited, with cool tones being dominant, which gives the image a somewhat sterile or clinical feel. While the image has an intriguing and somewhat artistic quality, it might not be appealing to everyone due to its abstract and somewhat cold aesthetic.\n2.25'] +(LLMRayActor pid=279380) INFO 05-28 12:39:41 [worker.py:133] Sleep mode freed 38.53 GiB memory, 21.81 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 12:39:41 [executor_base.py:208] It took 1.622974 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nOn a scale of 0 to 5, how would you rate the overall techniqual quality of this image? Use a float with two decimal places. A rating of 0 represents very poor quality, while 5 represents excellent quality.\nassistant\nThe image shows a close-up of two crocheted or woven items with a focus on texture and color. The lighting is even, highlighting the details of the materials used. The image is clear, with no apparent blurring or noise, suggesting the image was taken with a camera capable of producing high-resolution images. The composition is simple, with the objects placed on a wooden surface that adds a natural texture to the image. The colors are vibrant, which, combined with the clarity, gives the image a pleasant and inviting appearance. However, the image does not seem to be overly aesthetically focused, and it does not contain any dramatic lighting or strong compositional elements that might elevate it to a particularly artistic image. It is a clear and detailed image, but it does not necessarily convey a high level of technical skill or artistic vision above the level of practical image-taking.\n4.20'] +(LLMRayActor pid=279374) INFO 05-28 12:51:26 [worker.py:133] Sleep mode freed 38.53 GiB memory, 22.22 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279374) INFO 05-28 12:51:26 [executor_base.py:208] It took 1.693672 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287376) ele.get("min_pixels" 3136 [repeated 6x across cluster] +(ActorModelRayActor pid=287376) ele.get("max_pixels" 1254400 [repeated 6x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nWhat is your overall rating of this image’s techniqual quality? Provide a float between 0 and 5, rounded to two decimal places. A rating of 0 represents very poor quality, while 5 represents excellent quality.\nassistant\nThe image appears to be clear and well-lit, with no apparent noise or blurring. The lighting is consistent, and the details of the objects in the room are visible. The colors are natural and true to life, suggesting that the image was taken with a camera capable of capturing good quality photographs. The composition is also well-balanced, with the main subject (the room) being the focus of the image. There are no signs of image manipulation or poor exposure.\n4.38'] +(LLMRayActor pid=279380) INFO 05-28 13:03:09 [worker.py:133] Sleep mode freed 38.18 GiB memory, 21.58 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 13:03:09 [executor_base.py:208] It took 1.802933 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287377) ele.get("min_pixels" 3136 [repeated 118x across cluster] +(ActorModelRayActor pid=287377) ele.get("max_pixels" 1254400 [repeated 118x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nAssess the image-prompt alignment: "compressed flowers, morphed earth, ridiculed bandages, changed eye color, subliminal time travel, impossible food, concept love, valued at 2 million yen, museum - quality oil painting, traditional art"\nPlease evaluate how well the image matches each element of provided prompt.\n\n And answer with the final alignment rating.\nRate it from 0 to 5 (float, 2 decimals). A rating of 0 represents very poor alignment level, while 5 represents excellent alignment level.\'\nassistant\n The image depicts a vibrant and colorful floral arrangement with a whimsical, dreamlike quality. It does not resemble "compressed flowers" or "morphed earth." The "ridiculed bandages" and "changed eye color" are unrelated to the content of the image. The concept of "subliminal time travel" is not represented. The image does not feature any "impossible food." There is a "concept love" element if one interprets the floral arrangement as symbolizing love, but it is not explicitly stated. The valuation of "2 million yen" is also not indicated in the image. The image appears to be a traditional art piece, but it does not align with the other elements of the prompt.\n1.00'] +(LLMRayActor pid=279380) INFO 05-28 13:16:41 [worker.py:133] Sleep mode freed 38.28 GiB memory, 22.32 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 13:16:41 [executor_base.py:208] It took 1.724628 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287376) ele.get("min_pixels" 3136 [repeated 6x across cluster] +(ActorModelRayActor pid=287376) ele.get("max_pixels" 1254400 [repeated 6x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nDoes this image correspond accurately to the prompt: "somebody once told me the world was gonna roll me, i ain\'t the sharpest tool in the shed"?\nPlease evaluate how well the image matches each element of provided prompt.\n\n And answer with the final alignment rating.\nRate it from 0 to 5 (float, 2 decimals). A rating of 0 represents very poor alignment level, while 5 represents excellent alignment level.\'\nassistant\n The image depicts a person seated at a desk with books and other cluttered objects surrounding them, which could symbolize a world of knowledge and complexity. However, the quote "somebody once told me the world was gonna roll me, i ain\'t the sharpest tool in the shed" implies a sense of being overwhelmed or-led by circumstances, possibly feeling lost or not in control. The image does not visually represent the feeling of being overwhelmed or out of control, nor does it suggest that the person is not capable or knowledgeable. The scene is more about exploration and learning, rather than a sense of being led or overwhelmed by the world.\n1.00'] +(LLMRayActor pid=279380) INFO 05-28 13:28:48 [worker.py:133] Sleep mode freed 38.21 GiB memory, 22.67 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 13:28:48 [executor_base.py:208] It took 1.651580 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287371) ele.get("min_pixels" 3136 [repeated 18x across cluster] +(ActorModelRayActor pid=287371) ele.get("max_pixels" 1254400 [repeated 18x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nEvaluate the visual aesthetics of this photo using a float score from 0.00 to 5.00. A rating of 0 represents very poor aesthetic quality, while 5 represents excellent aesthetic quality.\nassistant\nThe image showcases a black dog lying on a grassy field with a tree in the background. The photo has a shallow depth of field, focusing on the dog while the surrounding elements are slightly out of focus. This gives the image a sense of depth but may also suggest a loss of detail in the background. The lighting appears natural, possibly taken during the day, but the overcast sky gives the scene a soft, diffused look without strong shadows. The composition is straightforward, with the dog centered in the frame, which might make the image feel a bit static. While the photo captures a moment of calm, it may lack the vibrancy or emotion that could enhance its appeal. The colors are muted, with the green of the grass and the black of the dog providing a simple palette without much visual excitement.\n2.75'] +(LLMRayActor pid=279380) INFO 05-28 13:41:05 [worker.py:133] Sleep mode freed 38.44 GiB memory, 21.45 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 13:41:05 [executor_base.py:208] It took 1.803200 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287377) ele.get("min_pixels" 3136 [repeated 179x across cluster] +(ActorModelRayActor pid=287377) ele.get("max_pixels" 1254400 [repeated 179x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nHow faithful is the image to this prompt: "A clerk at a farm, 3D CGI, during a sunny spring morning"?\nPlease evaluate how well the image matches each element of provided prompt.\n\n And answer with the final alignment rating.\nRate it from 0 to 5 (float, 2 decimals). A rating of 0 represents very poor alignment level, while 5 represents excellent alignment level.\'\nassistant\n The image depicts a 3D CGI character dressed as a farmer, not a clerk. The setting is indeed a farm with a barn and cows, which aligns with the prompt of a farm environment. However, the time of day is not specified in the prompt, and there is no indication that it is a "sunny spring morning." The image does not provide enough context to determine the season, and the lighting suggests it could be any time of day. The character\'s role as a clerk is not reflected in the image, as the character is dressed more like a farmer. The overall scene, although not faithful to the specific details of the provided prompt, does represent a farm setting in a 3D CGI style.\n1.50'] +(LLMRayActor pid=279380) INFO 05-28 13:52:36 [worker.py:133] Sleep mode freed 38.50 GiB memory, 21.41 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 13:52:36 [executor_base.py:208] It took 1.778450 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287372) ele.get("min_pixels" 3136 [repeated 180x across cluster] +(ActorModelRayActor pid=287372) ele.get("max_pixels" 1254400 [repeated 180x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nGive an overall aesthetic rating for this image, between 0 and 5, rounded to two decimal points. A rating of 0 represents very poor aesthetic quality, while 5 represents excellent aesthetic quality.\nassistant\nThe image depicts a still life arrangement with pears and apples on a checkered surface accompanied by a pipe. The lighting is focused, creating a dramatic effect by emphasizing the textures and contours of the fruits. The composition is balanced, with the subjects in the center of the frame. However, the image's aesthetic could be enhanced by a stronger, more dynamic color contrast or by adjusting the mood through the selection of lighting. The atmosphere appears somewhat somber due to the subdued lighting, which might not be a deliberate choice for the mood the artist intended to convey. The image lacks the playful or whimsical quality that could elevate it to a more engaging or emotionally appealing composition.\n2.92"] +(ActorModelRayActor pid=287378) ele.get("min_pixels" 3136 [repeated 259x across cluster] +(ActorModelRayActor pid=287378) ele.get("max_pixels" 1254400 [repeated 259x across cluster] +(LLMRayActor pid=279380) INFO 05-28 14:04:46 [worker.py:133] Sleep mode freed 38.53 GiB memory, 21.52 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 14:04:46 [executor_base.py:208] It took 1.647691 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nGive a techniqual quality score for this picture between 0 and 5 (float, two decimal places). A rating of 0 represents very poor quality, while 5 represents excellent quality.\nassistant\nThe image shows a clear and well-lit scene with a glass globe placed on a rocky surface, reflecting the sky and part of the landscape. The colors are vibrant, with the blue of the sky and water contrasting nicely with the natural tones of the rocks. The lighting appears to be natural and bright, suggesting the photo was taken during the day with good weather. There are no apparent noise or blurring issues, indicating that the image is sharp. The composition is simple and direct, focusing on the subject without any distracting elements. The technical quality of the image is good, with the ability to showcase the details of the scene effectively.\n4.20'] +(ActorModelRayActor pid=287378) ele.get("min_pixels" 3136 [repeated 25x across cluster] +(ActorModelRayActor pid=287378) ele.get("max_pixels" 1254400 [repeated 25x across cluster] +(LLMRayActor pid=279380) INFO 05-28 14:16:28 [worker.py:133] Sleep mode freed 38.53 GiB memory, 21.22 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 14:16:28 [executor_base.py:208] It took 1.643565 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nWhat is your overall rating of this image’s techniqual quality? Provide a float between 0 and 5, rounded to two decimal places. A rating of 0 represents very poor quality, while 5 represents excellent quality.\nassistant\nThe image displays a vertical sign with the word "SUEY" in uppercase letters, mounted on a building\'s facade. The photograph is clear, with the text and the building\'s structure well-defined. The lighting is natural, suggesting the photo was taken during the day. The image does not show any apparent focus issues, motion blur, or excessive noise that would degrade the quality. The colors are also vibrant, with the red of the sign standing out against the neutral tones of the building. Overall, the image appears to be of good quality, with a focus on the subject (the sign) and adequate lighting.\n4.20'] +(ActorModelRayActor pid=287376) ele.get("min_pixels" 3136 [repeated 222x across cluster] +(ActorModelRayActor pid=287376) ele.get("max_pixels" 1254400 [repeated 222x across cluster] +(LLMRayActor pid=279380) INFO 05-28 14:28:01 [worker.py:133] Sleep mode freed 38.53 GiB memory, 21.49 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 14:28:02 [executor_base.py:208] It took 1.715720 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nHow close is the image content to the prompt: "a bathroom exploding"?\nPlease evaluate how well the image matches each element of provided prompt.\n\n And answer with the final alignment rating.\nRate it from 0 to 5 (float, 2 decimals). A rating of 0 represents very poor alignment level, while 5 represents excellent alignment level.\'\nassistant\n The image depicts a bathroom with a toilet, but there is no indication of an explosion. The damage to the wall appears to be from some other form of physical stress or impact, not an explosion. The elements provided in the prompt, such as the presence of an explosion, are not supported by the image content. The image seems to show a scene of damage, possibly due to some other reason, but not specifically due to an explosion.\n1.00'] +(ActorModelRayActor pid=287374) ele.get("min_pixels" 3136 [repeated 214x across cluster] +(ActorModelRayActor pid=287374) ele.get("max_pixels" 1254400 [repeated 214x across cluster] +(LLMRayActor pid=279380) INFO 05-28 14:39:28 [worker.py:133] Sleep mode freed 38.19 GiB memory, 21.92 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 14:39:28 [executor_base.py:208] It took 1.687068 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nGive an overall aesthetic rating for this image, between 0 and 5, rounded to two decimal points. A rating of 0 represents very poor aesthetic quality, while 5 represents excellent aesthetic quality.\nassistant\nWhen assessing the image, one may consider the composition, color balance, and the subject matter. The image shows two black swans swimming in a body of water, which could be seen as a natural and serene scene. However, the water appears a bit muddy and the image does not have a strong, intentional composition that guides the viewer's attention. The lighting seems natural, but it does not enhance the vibrancy of the scene. The swans, being the main subjects, are not prominently featured, and the image lacks any dynamic or engaging element. Overall, the image conveys a peaceful moment in nature but it does not stand out for its artistic qualities or aesthetic appeal.\n2.50"] +(LLMRayActor pid=279380) INFO 05-28 14:51:06 [worker.py:133] Sleep mode freed 38.53 GiB memory, 21.42 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 14:51:06 [executor_base.py:208] It took 1.694729 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287372) ele.get("min_pixels" 3136 [repeated 3x across cluster] +(ActorModelRayActor pid=287372) ele.get("max_pixels" 1254400 [repeated 3x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nPlease rate the aesthetics of this picture on a scale from 0 to 5, as a float rounded to two decimal places. A rating of 0 represents very poor aesthetic quality, while 5 represents excellent aesthetic quality.\nassistant\nThe image shows a close-up of a clenched fist, with a focus on the texture and detail of the skin. The monochromatic color scheme adds a sense of simplicity and raw emotion to the image. The composition is straightforward, with the fist taking up most of the frame, which could be seen as emphasizing the subject's strength or determination. The image might be interpreted as conveying a sense of power or intent. However, it lacks context, which could diminish the image's ability to evoke a full story or emotion. The photograph could be more engaging if it provided a bit more context or a different angle that showcases the subject in a broader sense. Overall, the image has a gritty, raw quality that could be considered 2.5 on the aesthetic scale, due to its textual focus and immediate emotive impact, but it could benefit from a slightly more dynamic composition or context to enhance its visual appeal.\n2.50"] +(LLMRayActor pid=279380) INFO 05-28 15:02:24 [worker.py:133] Sleep mode freed 38.22 GiB memory, 22.54 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 15:02:24 [executor_base.py:208] It took 1.674366 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287372) ele.get("min_pixels" 3136 [repeated 20x across cluster] +(ActorModelRayActor pid=287372) ele.get("max_pixels" 1254400 [repeated 20x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nIs the content visually consistent with the prompt: "a bowl of reds"?\nPlease evaluate how well the image matches each element of provided prompt.\n\n And answer with the final alignment rating.\nRate it from 0 to 5 (float, 2 decimals). A rating of 0 represents very poor alignment level, while 5 represents excellent alignment level.\'\nassistant\n The image depicts a bowl filled with red objects that appear to be cherries or marinated olives, which aligns with the prompt "a bowl of reds". The color of the contents is predominantly red, which satisfies the requirement for the content to be red. However, the image does not provide any context that emphasizes the contrast or diversity of the reds, which could make the image appear somewhat monotonous. The visual consistency is good due to the uniform color of the contents, but it might be slightly lacking in terms of visual interest or diversity of colors.\n4.00'] +(LLMRayActor pid=279380) INFO 05-28 15:15:57 [worker.py:133] Sleep mode freed 38.53 GiB memory, 21.62 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 15:15:57 [executor_base.py:208] It took 1.664031 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287374) ele.get("min_pixels" 3136 [repeated 28x across cluster] +(ActorModelRayActor pid=287374) ele.get("max_pixels" 1254400 [repeated 28x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00 and tags, respectively, i.e., reasoning process here answer here \nuser\nRate how well the visual matches the prompt: "cinematic areal shot of a fighter jet exploding from the energy wave concept art"\nPlease evaluate how well the image matches each element of provided prompt.\n\n And answer with the final alignment rating.\nRate it from 0 to 5 (float, 2 decimals). A rating of 0 represents very poor alignment level, while 5 represents excellent alignment level.\'\nassistant\n The image depicts a fighter jet in a state of action, with what appears to be a large burst of fire or energy being directed towards it. The image has a strong cinematic feel, with a dynamic and intense scene. However, the concept of an "energy wave" does not align with the visual elements shown in the image. The explosion and the overall action suggest a more realistic and dynamic scenario, rather than a conceptual art piece. The image does not provide clear evidence of "concept art," as it seems to be a more realistic depiction of a scenario that could be part of a movie or a game.\n3.00'] +(LLMRayActor pid=279380) INFO 05-28 15:27:32 [worker.py:133] Sleep mode freed 38.49 GiB memory, 21.88 GiB memory is still in use. [repeated 7x across cluster] +(LLMRayActor pid=279380) INFO 05-28 15:27:32 [executor_base.py:208] It took 1.679020 seconds to fall asleep. [repeated 7x across cluster] +(ActorModelRayActor pid=287374) ele.get("min_pixels" 3136 [repeated 8x across cluster] +(ActorModelRayActor pid=287374) ele.get("max_pixels" 1254400 [repeated 8x across cluster] +(ActorModelRayActor pid=286523) +(ActorModelRayActor pid=286523) +Train epoch [1/1]: 0%| | 0/128 [00:00: Failed to establish a new connection: [Errno 111] Connection refused + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/requests/adapters.py", line 667, in send + resp = conn.urlopen( + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/urllib3/connectionpool.py", line 841, in urlopen + retries = retries.increment( + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/urllib3/util/retry.py", line 519, in increment + raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type] +urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=2989): Max retries exceeded with url: /api/jobs/raysubmit_YfLa8tqbvrqPbydr (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused')) + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/bin/ray", line 8, in + sys.exit(main()) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/ray/scripts/scripts.py", line 2690, in main + return cli() + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/click/core.py", line 1161, in __call__ + return self.main(*args, **kwargs) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/click/core.py", line 1082, in main + rv = self.invoke(ctx) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/click/core.py", line 1697, in invoke + return _process_result(sub_ctx.command.invoke(sub_ctx)) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/click/core.py", line 1697, in invoke + return _process_result(sub_ctx.command.invoke(sub_ctx)) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/click/core.py", line 1443, in invoke + return ctx.invoke(self.callback, **ctx.params) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/click/core.py", line 788, in invoke + return __callback(*args, **kwargs) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/ray/dashboard/modules/job/cli_utils.py", line 54, in wrapper + return func(*args, **kwargs) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/ray/autoscaler/_private/cli_logger.py", line 823, in wrapper + return f(*args, **kwargs) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/ray/dashboard/modules/job/cli.py", line 310, in submit + job_status = get_or_create_event_loop().run_until_complete( + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete + return future.result() + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/ray/dashboard/modules/job/cli.py", line 99, in _tail_logs + return _log_job_status(client, job_id) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/ray/dashboard/modules/job/cli.py", line 78, in _log_job_status + info = client.get_job_info(job_id) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/ray/dashboard/modules/job/sdk.py", line 352, in get_job_info + r = self._do_request("GET", f"/api/jobs/{job_id}") + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/ray/dashboard/modules/dashboard_sdk.py", line 303, in _do_request + return requests.request( + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/requests/api.py", line 59, in request + return session.request(method=method, url=url, **kwargs) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/requests/sessions.py", line 589, in request + resp = self.send(prep, **send_kwargs) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/requests/sessions.py", line 703, in send + r = adapter.send(request, **kwargs) + File "/mnt/petrelfs/luyiting/anaconda3/envs/lmmr1/lib/python3.10/site-packages/requests/adapters.py", line 700, in send + raise ConnectionError(e, request=request) +requests.exceptions.ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=2989): Max retries exceeded with url: /api/jobs/raysubmit_YfLa8tqbvrqPbydr (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 111] Connection refused'))