File size: 4,395 Bytes
6c9122e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
{
  "loss_mode": "adaptive_std",
  "model_type": "film_hybrid",
  "dataset_type": "mixed_condition",
  "condition_mode": "embedding",
  "cond_dim": 256,
  "freeze_cap_encoder": false,
  "pretrained_cap_encoder": null,
  "adaptive_use_predicted_cond": true,
  "predicted_cond_supervision_weight": 1.5,
  "predicted_cond_supervision_loss": "smooth_l1",
  "metadata_path": "/pfs/lyj/dataset/t2i_rm/stage2_rollout/stage2_metadata.json",
  "rl_iter_metadata_path": "/pfs/lyj/dataset/t2i_rm/stage2_rollout/rl_iter_v2/combined_sdxl_flux_metadata.json",
  "ogd_std_path": "/pfs/lyj/t2i/HPSv3/data/ogd_std_qwen3vl.json",
  "tiers": [
    "sd15",
    "sdxl",
    "qwen_image"
  ],
  "rl_tier": null,
  "max_rl_step": 1000,
  "val_size": 500,
  "max_images_per_group": 6,
  "labeled_json_list": [
    "/pfs/lyj/dataset/t2i_rm/dataset/2.24/2.24aesthetic_clean_filter_both.json"
  ],
  "sup_weight": 5.0,
  "labeled_loss_type": "uncertainty",
  "sup_warmup_steps": 0,
  "rank_weight": 2.0,
  "ranking_loss_type": "bt",
  "pairs_per_prompt": 3,
  "ranking_pair_mode": "cross_model_only",
  "loss_type": "max_std_unsup",
  "std_weight": 1.0,
  "std_floor": -2.0,
  "adaptive_weight": 2.0,
  "adaptive_margin": 0.35,
  "adaptive_priority_mode": "strong_cap_high_iter",
  "warmup_steps": 500,
  "adaptive_eps": 1e-06,
  "std_constraint_enable": true,
  "std_constraint_mode": "ratio",
  "std_constraint_weight": 0.5,
  "std_constraint_warmup_steps": 500,
  "std_bound_min": 0.05,
  "std_bound_min_gap": 0.06,
  "std_bound_lower_base": 0.08,
  "std_bound_lower_cap_coef": 0.08,
  "std_bound_lower_iter_coef": 0.6,
  "std_bound_lower_ogd_coef": 0.5,
  "std_bound_upper_base": 0.35,
  "std_bound_upper_cap_coef": 0.15,
  "std_bound_upper_iter_coef": 0.6,
  "std_bound_upper_ogd_coef": 1.25,
  "std_target_enable": false,
  "std_target_weight": 0.0,
  "std_ratio_constraint_enable": true,
  "std_ratio_constraint_weight": 3.0,
  "std_ratio_constraint_warmup_steps": 300,
  "std_ratio_lower_base": 0.85,
  "std_ratio_lower_cap_coef": 0.0,
  "std_ratio_lower_iter_coef": 0.25,
  "std_ratio_upper_base": 1.15,
  "std_ratio_upper_cap_coef": 0.0,
  "std_ratio_upper_iter_coef": 0.45,
  "std_ratio_min": 0.3,
  "std_ratio_min_gap": 0.05,
  "std_ratio_space": "raw",
  "std_ratio_target_enable": true,
  "std_ratio_target_weight": 5.0,
  "std_ratio_target_base": 1.0,
  "std_ratio_target_cap_coef": 0.0,
  "std_ratio_target_iter_coef": 0.4,
  "std_ratio_target_margin_base": 0.1,
  "std_ratio_target_margin_cap_coef": 0.0,
  "std_ratio_target_margin_iter_coef": 0.05,
  "rm_head_type": "ranknet",
  "lora_enable": false,
  "vision_lora": false,
  "freeze_vision_tower": true,
  "freeze_llm": true,
  "tune_merger": true,
  "train_rm_head": true,
  "model_name_or_path": "/pfs/lyj/models/Qwen3-VL-8B-Instruct",
  "load_from_pretrained": "/pfs/lyj/t2i/HPSv3/output_models/HPSv3_8B_Qwen3VL_ogd_train_20260325_195334",
  "load_from_pretrained_step": 2992,
  "max_pixels": 200704,
  "min_pixels": 200704,
  "with_instruction": true,
  "use_special_tokens": true,
  "reward_token": "special",
  "output_dim": 2,
  "output_dir": "output_models",
  "per_device_train_batch_size": 4,
  "per_device_eval_batch_size": 8,
  "gradient_accumulation_steps": 1,
  "num_train_epochs": 3,
  "learning_rate": 1e-05,
  "warmup_ratio": 0.05,
  "lr_scheduler_type": "cosine",
  "gradient_checkpointing": true,
  "gradient_checkpointing_kwargs": {
    "use_reentrant": false
  },
  "vision_lr": null,
  "merger_lr": 5e-06,
  "rm_head_lr": null,
  "special_token_lr": null,
  "reward_l2_coef": 0.03,
  "kl_coef": 0.0,
  "test_json_list": [
    [
      "HPDv3 Test",
      [
        "/pfs/lyj/dataset/HPDv3/test.abs.json"
      ]
    ],
    [
      "Own Aesthetic Test",
      [
        "/pfs/lyj/dataset/t2i_rm/dataset/2.24/2.24aesthetic_test.json"
      ]
    ],
    [
      "Own PromptFollow Test",
      [
        "/pfs/lyj/dataset/t2i_rm/dataset/2.24/2.24promptfollow_test.json"
      ]
    ]
  ],
  "eval_train_json": "/pfs/lyj/dataset/t2i_rm/dataset/2.24/2.24aesthetic_clean_filter_both.json",
  "eval_train_size": 10000,
  "eval_strategy": "steps",
  "logging_steps": 10,
  "eval_steps": 500,
  "save_steps": 1000,
  "vis_steps": 200,
  "report_to": "none",
  "bf16": true,
  "torch_dtype": "bfloat16",
  "deepspeed": "hpsv3/config/ds_config/zero2.json",
  "save_only_model": true,
  "save_full_model": true,
  "dataloader_num_workers": 2
}