File size: 4,085 Bytes
2f0e115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
[
  "F:\\project_python\\nlp_project\\.venv\\Scripts\\python.exe",
  "scripts/train_rich.py",
  "--train_file",
  "data\\rich_cmgui\\processed\\train_rich_teacher7000_natural_qwen8000.jsonl",
  "--valid_file",
  "data\\rich_cmgui\\processed\\valid_rich_teacher500_natural_qwen8000.jsonl",
  "--output_dir",
  "F:\\project_python\\nlp_project\\runs\\rich_cmgui_20260512_titlefix_s1e2\\stage3_vision_adapter",
  "--vision_model",
  "models/siglip2-base-patch16-224",
  "--decoder_model",
  "models/mt5-large",
  "--image_size",
  "224",
  "--num_vertical_crops",
  "0",
  "--max_visual_tokens",
  "64",
  "--max_elements",
  "48",
  "--max_element_tokens",
  "16",
  "--max_context_tokens",
  "384",
  "--context_mode",
  "tokens_direct_encoder",
  "--context_text_format",
  "text_only",
  "--context_include_screen_text",
  "true",
  "--context_screen_text_items",
  "16",
  "--context_screen_text_dropout_rate",
  "0.15",
  "--max_target_tokens",
  "384",
  "--eval_max_new_tokens",
  "384",
  "--batch_size",
  "2",
  "--eval_batch_size",
  "1",
  "--grad_accum",
  "10",
  "--epochs",
  "1",
  "--scheduler_epochs",
  "1",
  "--weight_decay",
  "0.01",
  "--optimizer_name",
  "adafactor",
  "--warmup_ratio",
  "0.05",
  "--fp16",
  "false",
  "--amp_dtype",
  "fp32",
  "--generation_loss_chunk_size",
  "8",
  "--cuda_empty_cache_steps",
  "1",
  "--cuda_memory_fraction",
  "0.0",
  "--data_parallel",
  "true",
  "--save_every_steps",
  "100",
  "--save_checkpoints",
  "true",
  "--eval_every_steps",
  "0",
  "--target_schema",
  "summary_visible_zh",
  "--grad_clip_strategy",
  "global",
  "--max_grad_norm",
  "1.0",
  "--model_selection_metric",
  "grounded_quality_score",
  "--model_selection_mode",
  "max",
  "--early_stopping_patience",
  "2",
  "--early_stopping_min_delta",
  "0.001",
  "--max_train_samples",
  "0",
  "--max_valid_samples",
  "100",
  "--strict_data_checks",
  "true",
  "--num_beams",
  "1",
  "--generation_no_repeat_ngram_size",
  "3",
  "--generation_repetition_penalty",
  "1.1",
  "--generation_min_new_tokens",
  "0",
  "--generation_block_extra_ids",
  "true",
  "--generation_block_title_prefix",
  "true",
  "--generation_force_json_start",
  "false",
  "--context_summary_repair",
  "false",
  "--canonicalize_targets",
  "false",
  "--task_intent_context",
  "false",
  "--drop_bare_search_functions",
  "false",
  "--structured_function_mode",
  "heads",
  "--structured_function_threshold",
  "0.5",
  "--structured_search_threshold",
  "0.5",
  "--structured_max_functions",
  "8",
  "--structured_evidence_mode",
  "heads",
  "--structured_evidence_threshold",
  "0.5",
  "--structured_max_evidence",
  "8",
  "--structured_evidence_fallback_top1",
  "false",
  "--evidence_loss_weight",
  "0.2",
  "--ui_function_loss_weight",
  "0.05",
  "--search_function_loss_weight",
  "0.02",
  "--function_signal_to_decoder",
  "false",
  "--search_signal_to_decoder",
  "false",
  "--pooled_memory_scale",
  "0.02",
  "--decoder_memory_scale",
  "1.0",
  "--seed",
  "20260509",
  "--num_workers",
  "0",
  "--bottleneck_queries",
  "4",
  "--init_checkpoint",
  "F:\\project_python\\nlp_project\\runs\\rich_cmgui_20260512_titlefix_s1e2\\stage2_layout_adapter\\checkpoint-best",
  "--model_variant",
  "late_fusion",
  "--native_context_forward",
  "false",
  "--disable_vision",
  "false",
  "--freeze_decoder",
  "true",
  "--freeze_vision",
  "true",
  "--unfreeze_vision_last_ratio",
  "0.0",
  "--direct_visual_tokens",
  "false",
  "--direct_element_tokens",
  "false",
  "--direct_context_passthrough",
  "true",
  "--include_pooled_memory",
  "true",
  "--activation_checkpointing",
  "true",
  "--decoder_gradient_checkpointing",
  "false",
  "--vision_gradient_checkpointing",
  "false",
  "--lr_new",
  "1e-05",
  "--lr_fusion",
  "0.0",
  "--lr_decoder",
  "0.0",
  "--visual_memory_scale",
  "0.1",
  "--element_memory_scale",
  "0.5"
]