Upload CMGUI stage3 screen-grounded summarizer checkpoint

2f0e115 verified 1 day ago

152 kB

	#!/usr/bin/env python
	"""Train rich Chinese screenshot summarization models on CMGUI-style data."""

	from __future__ import annotations

	import argparse
	import gc
	import json
	import math
	import os
	import random
	import re
	import shutil
	import time
	from collections import deque
	from dataclasses import asdict, dataclass
	from pathlib import Path
	from typing import Any, Dict, Iterable, List, Optional, Tuple

	import numpy as np
	import torch
	import torch.distributed as dist
	import torch.nn as nn
	import torch.nn.functional as F
	from PIL import Image
	from torch.nn.parallel import DistributedDataParallel
	from torch.utils.checkpoint import checkpoint
	from torch.utils.data import DataLoader, Dataset, DistributedSampler
	from tqdm import tqdm
	from transformers import Adafactor, AutoImageProcessor, AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer, T5Tokenizer
	from transformers.modeling_outputs import BaseModelOutput


	DEFAULT_CONFIG = {
	# 数据文件。正式训练使用 train_rich/valid_rich，先用 smoke 文件调通。
	"train_file": "data/rich_cmgui/processed/smoke_train_rich.jsonl",
	"valid_file": "data/rich_cmgui/processed/smoke_valid_rich.jsonl",
	"output_dir": "runs/rich_cmgui_20260502/rich_grounded_siglip2_mt5_v1",
	"init_checkpoint": "",
	# 模型。V100 32GB x2 默认用 mT5-base；显存紧张再换 google/mt5-small。
	"model_variant": "full",
	"vision_model": "models/siglip2-base-patch16-224",
	"decoder_model": "google/mt5-base",
	# 输入长度。384 + 纵向 crop 更适合中文小字；如果 SigLIP2 位置插值失败，先降到 224。
	"image_size": 384,
	"num_vertical_crops": 3,
	# 0 keeps all visual patch tokens. Set a cap for high-resolution/crop runs
	# to borrow Pix2Struct-style richer screenshots without quadratic fusion blowup.
	"max_visual_tokens": 0,
	"max_elements": 80,
	"max_element_tokens": 16,
	"max_context_tokens": 64,
	"context_text_format": "rich",
	"context_include_screen_text": False,
	"context_screen_text_items": 32,
	"context_screen_text_dropout_rate": 0.0,
	# mean preserves old checkpoints; direct modes expose task/app tokens directly to cross-attention.
	"context_mode": "mean",
	# max_target_tokens 只截断训练标签；eval_max_new_tokens 只控制验证生成长度。
	"max_target_tokens": 384,
	"eval_max_new_tokens": 384,
	# 训练。两张 V100 32GB: torchrun --nproc_per_node=2 train_rich.py
	"batch_size": 4,
	"eval_batch_size": 0,
	"grad_accum": 8,
	"epochs": 6,
	# 0 means use epochs. Set this higher for short early-stop runs so the LR
	# schedule matches the longer run whose early checkpoint is being reproduced.
	"scheduler_epochs": 0,
	"lr_new": 1e-4,
	"lr_fusion": 5e-5,
	"lr_decoder": 1e-5,
	"lr_ui_function_head": 0.0,
	"weight_decay": 0.01,
	"optimizer_name": "adamw",
	"lr_scheduler_type": "linear",
	"warmup_ratio": 0.05,
	"fp16": True,
	"amp_dtype": "auto",
	"generation_loss_chunk_size": 32,
	"activation_checkpointing": False,
	"cuda_empty_cache_steps": 0,
	"cuda_memory_fraction": 0.0,
	"decoder_gradient_checkpointing": False,
	"vision_gradient_checkpointing": False,
	"freeze_decoder": False,
	"freeze_vision": True,
	"unfreeze_vision_last_ratio": 0.3,
	# loss 权重。generation 是主目标，其他 loss 用来约束证据和结构。
	"evidence_loss_weight": 0.2,
	"section_loss_weight": 0.1,
	"numeric_loss_weight": 0.1,
	"ui_function_loss_weight": 0.0,
	"search_function_loss_weight": 0.0,
	"search_function_pos_weight": 1.0,
	# checkpoint 和评估。
	"save_every_steps": 1000,
	"save_checkpoints": True,
	"eval_every_steps": 0,
	"model_selection_metric": "rich_quality_score",
	"model_selection_mode": "max",
	"early_stopping_patience": 0,
	"early_stopping_min_delta": 0.0,
	"max_train_samples": 0,
	"max_valid_samples": 800,
	"num_beams": 4,
	"generation_no_repeat_ngram_size": 0,
	"generation_repetition_penalty": 1.0,
	"generation_min_new_tokens": 0,
	"generation_block_extra_ids": False,
	"generation_block_title_prefix": False,
	"generation_force_json_start": False,
	"context_summary_repair": False,
	"canonicalize_targets": False,
	"target_schema": "zh",
	"task_intent_context": False,
	"drop_bare_search_functions": False,
	"structured_function_mode": "decoder",
	"structured_function_threshold": 0.5,
	"structured_search_threshold": 0.5,
	"structured_max_functions": 12,
	"structured_strict_search_candidates": False,
	"structured_evidence_mode": "decoder",
	"structured_evidence_threshold": 0.5,
	"structured_max_evidence": 8,
	"structured_evidence_fallback_top1": True,
	# Vision-memory options. Defaults preserve old checkpoints/experiments.
	"direct_visual_tokens": False,
	"direct_element_tokens": False,
	"direct_context_passthrough": False,
	"include_pooled_memory": True,
	"native_context_forward": False,
	"disable_vision": False,
	"init_resize_mismatched_non_decoder": False,
	"grad_clip_strategy": "global",
	"max_grad_norm": 1.0,
	"function_signal_to_decoder": False,
	"function_signal_scale": 1.0,
	"search_signal_to_decoder": False,
	"search_signal_scale": 1.0,
	"visual_memory_scale": 1.0,
	"element_memory_scale": 1.0,
	"pooled_memory_scale": 1.0,
	"decoder_memory_scale": 1.0,
	"data_parallel": False,
	"strict_data_checks": True,
	"max_target_truncation_rate": 0.01,
	"seed": 20260502,
	"num_workers": 4,
	}

	SECTION_NAMES = ["visible_text", "interaction_data", "ui_functions", "key_ui_clues"]


	def read_jsonl(path: Path) -> Iterable[Dict[str, Any]]:
	with path.open("r", encoding="utf-8") as f:
	for line in f:
	line = line.strip()
	if line:
	yield json.loads(line)


	def write_json(path: Path, obj: Dict[str, Any]) -> None:
	path.parent.mkdir(parents=True, exist_ok=True)
	path.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")


	def append_jsonl(path: Path, obj: Dict[str, Any]) -> None:
	path.parent.mkdir(parents=True, exist_ok=True)
	with path.open("a", encoding="utf-8", newline="\n") as f:
	f.write(json.dumps(obj, ensure_ascii=False) + "\n")


	def set_seed(seed: int) -> None:
	random.seed(seed)
	np.random.seed(seed)
	torch.manual_seed(seed)
	torch.cuda.manual_seed_all(seed)


	def normalize_model_reference(value: Any) -> str:
	normalized = str(value or "").replace("\\", "/")
	if "://" not in normalized:
	normalized = re.sub(r"/+", "/", normalized)
	return normalized


	def init_distributed() -> Tuple[bool, int, int, int]:
	if "RANK" not in os.environ or "WORLD_SIZE" not in os.environ:
	return False, 0, 1, 0
	rank = int(os.environ["RANK"])
	world_size = int(os.environ["WORLD_SIZE"])
	local_rank = int(os.environ.get("LOCAL_RANK", 0))
	dist.init_process_group(backend="nccl")
	torch.cuda.set_device(local_rank)
	return True, rank, world_size, local_rank


	def is_main(rank: int) -> bool:
	return rank == 0


	def unwrap_parallel_model(model: nn.Module) -> nn.Module:
	if isinstance(model, (DistributedDataParallel, nn.DataParallel)):
	return model.module
	return model


	def safe_text(value: Any) -> str:
	if value is None:
	return ""
	return re.sub(r"\s+", " ", str(value)).strip()


	def target_to_text(target: Dict[str, Any], target_schema: str = "zh") -> str:
	schema = str(target_schema or "zh").lower()
	if target_schema_is_summary(schema):
	return safe_text(target.get("summary_zh"))
	if target_schema_is_summary_visible(schema):
	return target_to_summary_visible_text(target)
	if target_schema_is_natural_text(schema):
	return target_to_natural_text(target)
	if schema in {"alias", "aliases", "en", "english"}:
	payload = {
	"summary_zh": safe_text(target.get("summary_zh")),
	"visible_text": target.get("visible_text", [])[:16],
	"interaction_data": target.get("interaction_data", [])[:12],
	"ui_functions": target.get("ui_functions", [])[:12],
	"key_ui_clues": target.get("key_ui_clues", [])[:12],
	}
	else:
	payload = {
	"画面总结": safe_text(target.get("summary_zh")),
	"可见文字": target.get("visible_text", [])[:16],
	"互动数据": target.get("interaction_data", [])[:12],
	"功能入口": target.get("ui_functions", [])[:12],
	"关键证据": target.get("key_ui_clues", [])[:12],
	}
	return json.dumps(payload, ensure_ascii=False, separators=(",", ":"))


	def target_schema_is_summary(target_schema: str) -> bool:
	return str(target_schema or "").lower() in {"summary", "summary_zh", "summary-only", "summary_only"}


	def target_schema_is_natural_text(target_schema: str) -> bool:
	return str(target_schema or "").lower() in {
	"natural_zh",
	"rich_text_zh",
	"zh_text",
	"text_zh",
	"summary_visible_zh",
	"natural_summary_visible_zh",
	}


	def target_schema_is_summary_visible(target_schema: str) -> bool:
	return str(target_schema or "").lower() in {"summary_visible_zh", "natural_summary_visible_zh"}


	def format_evidence_suffix(values: Any) -> str:
	evidence_ids = dedupe_texts(values or [], max_items=8)
	if not evidence_ids:
	return ""
	return f"（证据：{'、'.join(evidence_ids)}）"


	def format_natural_entry(value: Any) -> str:
	if isinstance(value, dict):
	name = safe_text(value.get("name") or value.get("text") or value.get("label"))
	detail = safe_text(value.get("value"))
	if name and detail:
	name = f"{name}={detail}"
	elif detail and not name:
	name = detail
	return safe_text(name + format_evidence_suffix(value.get("evidence_ids", [])))
	return safe_text(value)


	def format_natural_list(values: Any, max_items: int = 16) -> str:
	if not isinstance(values, list):
	return "无"
	entries = []
	seen = set()
	for value in values[:max_items]:
	text = format_natural_entry(value)
	if text and text not in seen:
	entries.append(text)
	seen.add(text)
	return "、".join(entries) if entries else "无"


	def target_to_natural_text(target: Dict[str, Any]) -> str:
	summary = safe_text(target.get("summary_zh"))
	visible = format_natural_list(target.get("visible_text", []), max_items=16)
	interactions = format_natural_list(target.get("interaction_data", []), max_items=12)
	functions = format_natural_list(target.get("ui_functions", []), max_items=12)
	evidence = format_natural_list(target.get("key_ui_clues", []), max_items=12)
	return "\n".join(
	[
	f"画面总结：{summary}",
	f"可见文字：{visible}",
	f"互动数据：{interactions}",
	f"功能入口：{functions}",
	f"关键证据：{evidence}",
	]
	)


	def target_to_summary_visible_text(target: Dict[str, Any]) -> str:
	summary = safe_text(target.get("summary_zh"))
	visible = format_natural_list(target.get("visible_text", []), max_items=16)
	interactions = format_natural_list(target.get("interaction_data", []), max_items=12)
	return "\n".join(
	[
	f"画面总结：{summary}",
	f"可见文字：{visible}",
	f"互动数据：{interactions}",
	]
	)


	JSONISH_OUTPUT_KEYS = ("画面总结", "可见文字", "互动数据", "功能入口", "关键证据")
	JSONISH_ALL_KEYS = JSONISH_OUTPUT_KEYS + ("name", "value", "evidence_ids")


	def normalize_evidence_id(value: Any) -> str:
	text = safe_text(value)
	match = re.fullmatch(r"[Ee]\s((?:\d\s){2,8})", text)
	if match:
	digits = re.sub(r"\s+", "", match.group(1))
	return f"E{digits}"
	return text


	def repair_generated_json_text(text: str) -> str:
	candidate = safe_text(text)
	start = candidate.find("{")
	if start >= 0:
	candidate = candidate[start:]
	candidate = re.sub(r"evidence\s_\sids", "evidence_ids", candidate)
	candidate = re.sub(
	r"\b[Ee]\s((?:\d\s){2,8})\b",
	lambda match: "E" + re.sub(r"\s+", "", match.group(1)),
	candidate,
	)
	for key in JSONISH_ALL_KEYS:
	candidate = re.sub(
	r"([\{\[,])\s\"?\s" + re.escape(key) + r"\s\"\s:",
	lambda match, key=key: f'{match.group(1)}"{key}":',
	candidate,
	)
	candidate = re.sub(
	r"([\{\[,])\s\"?\s" + re.escape(key) + r"\s*:",
	lambda match, key=key: f'{match.group(1)}"{key}":',
	candidate,
	)
	return candidate


	def json_object_from_text(text: str) -> Tuple[Optional[Dict[str, Any]], bool]:
	text = text.strip()
	if not text:
	return None, False
	try:
	obj = json.loads(text)
	return (obj, True) if isinstance(obj, dict) else (None, False)
	except json.JSONDecodeError:
	start = text.find("{")
	end = text.rfind("}")
	if start >= 0 and end > start:
	try:
	obj = json.loads(text[start : end + 1])
	return (obj, True) if isinstance(obj, dict) else (None, False)
	except json.JSONDecodeError:
	pass
	return None, False


	def jsonish_field_payload(text: str, key: str, following_keys: Iterable[str]) -> str:
	match = re.search(r'"' + re.escape(key) + r'"\s*:', text)
	if not match:
	return ""
	tail = text[match.end() :]
	end = len(tail)
	for following_key in following_keys:
	next_match = re.search(r',?\s"' + re.escape(following_key) + r'"\s:', tail)
	if next_match:
	end = min(end, next_match.start())
	return tail[:end].strip().rstrip(",")


	def clean_jsonish_string(value: Any) -> str:
	text = safe_text(value).strip()
	while text.startswith('"'):
	text = text[1:].strip()
	while text.endswith('"'):
	text = text[:-1].strip()
	return safe_text(text.replace('\\"', '"'))


	def dedupe_texts(values: Iterable[Any], max_items: int = 64) -> List[str]:
	output: List[str] = []
	seen = set()
	for value in values:
	text = clean_jsonish_string(normalize_evidence_id(value))
	if not text or text in seen:
	continue
	output.append(text)
	seen.add(text)
	if len(output) >= max_items:
	break
	return output


	def jsonish_string_list(payload: str, max_items: int = 64) -> List[str]:
	body = payload.strip()
	if "[" in body:
	body = body[body.find("[") + 1 :]
	if "]" in body:
	body = body[: body.rfind("]")]
	values = [match.group(1) for match in re.finditer(r'"([^"\\](?:\\.[^"\\])*)"', body)]
	if not values:
	values = re.findall(r"\bE\d{2,8}\b", body)
	return dedupe_texts(values, max_items=max_items)


	def normalize_function_entries(values: Any) -> List[Any]:
	if not isinstance(values, list):
	return []
	output: List[Any] = []
	for value in values:
	if isinstance(value, dict):
	name = clean_jsonish_string(value.get("name"))
	evidence_ids = dedupe_texts(value.get("evidence_ids", []) or [], max_items=8)
	if name:
	output.append({"name": name, "evidence_ids": evidence_ids})
	else:
	name = clean_jsonish_string(value)
	if name:
	output.append(name)
	return output


	def jsonish_function_entries(payload: str) -> List[Any]:
	body = payload.strip()
	if body.startswith("[") and body.endswith("]"):
	try:
	parsed = json.loads(body)
	normalized = normalize_function_entries(parsed)
	if normalized:
	return normalized
	except json.JSONDecodeError:
	pass
	output: List[Any] = []
	object_bodies = [match.group(1) for match in re.finditer(r"\{([^{}]*)\}", body)] or [body]
	for object_body in object_bodies:
	name_match = re.search(r'"name"\s:\s"([^"\\](?:\\.[^"\\])*)"', object_body)
	if not name_match:
	continue
	name = clean_jsonish_string(name_match.group(1))
	evidence_ids = dedupe_texts(re.findall(r"\bE\d{2,8}\b", object_body), max_items=8)
	if name:
	output.append({"name": name, "evidence_ids": evidence_ids})
	return output


	def jsonish_prediction_object(text: str) -> Optional[Dict[str, Any]]:
	repaired = repair_generated_json_text(text)
	summary = clean_jsonish_string(
	jsonish_field_payload(repaired, "画面总结", ("可见文字", "互动数据", "功能入口", "关键证据"))
	)
	visible_text = jsonish_string_list(
	jsonish_field_payload(repaired, "可见文字", ("互动数据", "功能入口", "关键证据")),
	max_items=32,
	)
	interaction_data = jsonish_string_list(
	jsonish_field_payload(repaired, "互动数据", ("功能入口", "关键证据")),
	max_items=16,
	)
	functions = jsonish_function_entries(jsonish_field_payload(repaired, "功能入口", ("关键证据",)))
	evidence_ids = jsonish_string_list(jsonish_field_payload(repaired, "关键证据", ()), max_items=16)
	if not (summary or visible_text or interaction_data or functions or evidence_ids):
	return None
	return {
	"画面总结": summary,
	"可见文字": visible_text,
	"互动数据": interaction_data,
	"功能入口": functions,
	"关键证据": evidence_ids,
	}


	NATURAL_OUTPUT_KEYS = ("画面总结", "可见文字", "互动数据", "功能入口", "关键证据")


	def has_natural_title_prefix(text: str) -> bool:
	return re.search(r"^\s(?:Title\|title\|标题)(?:\s[:：]\|\s+)", str(text or "")) is not None


	def strip_natural_title_prefix(text: str) -> Tuple[str, bool]:
	raw = str(text or "").strip().replace("\r\n", "\n").replace("\r", "\n")
	if not raw or not has_natural_title_prefix(raw):
	return raw, False
	prefix_match = re.match(r"^\s(?:Title\|title\|标题)(?:\s[:：]\|\s+)", raw)
	if not prefix_match:
	return raw, False
	key_pattern = r"(" + "\|".join(re.escape(key) for key in NATURAL_OUTPUT_KEYS) + r")\s*[:：]"
	key_matches = list(re.finditer(key_pattern, raw))
	if not key_matches:
	return re.sub(r"^\s(?:Title\|title\|标题)(?:\s[:：]\|\s+)", "", raw).strip(), True
	first_key = key_matches[0]
	prefix_body = safe_text(raw[prefix_match.end() : first_key.start()])
	rest = raw[first_key.start() :].strip()
	if first_key.group(1) == "画面总结":
	return rest, True
	if prefix_body:
	return f"画面总结：{prefix_body}\n{rest}", True
	return rest, True


	def natural_field_payload(text: str, key: str, following_keys: Iterable[str]) -> str:
	match = re.search(re.escape(key) + r"\s*[:：]", text)
	if not match:
	return ""
	tail = text[match.end() :]
	end = len(tail)
	for following_key in following_keys:
	next_match = re.search(re.escape(following_key) + r"\s*[:：]", tail)
	if next_match:
	end = min(end, next_match.start())
	return tail[:end].strip()


	def split_natural_items(payload: str, max_items: int = 64) -> List[str]:
	payload = safe_text(payload)
	if not payload or payload in {"无", "暂无", "没有", "[]"}:
	return []
	parts = re.split(r"[\n；;\|]+\|(?<!\d)、", payload)
	if len(parts) == 1:
	parts = re.split(r"\s,\s", payload)
	return dedupe_texts(parts, max_items=max_items)


	def natural_function_entries(payload: str) -> List[Any]:
	output: List[Any] = []
	for item in split_natural_items(payload, max_items=32):
	evidence_ids = dedupe_texts(re.findall(r"\bE\d{2,8}\b", normalize_evidence_id(item)), max_items=8)
	name = re.sub(r"（?证据[:：].*?）?$", "", item).strip()
	name = re.sub(r"$\s证据[:：].?$\s*$", "", name).strip()
	if name:
	output.append({"name": name, "evidence_ids": evidence_ids})
	return output


	def natural_prediction_from_text(text: str) -> Dict[str, Any]:
	text, _ = strip_natural_title_prefix(text)
	fields = {
	key: natural_field_payload(text, key, NATURAL_OUTPUT_KEYS[index + 1 :])
	for index, key in enumerate(NATURAL_OUTPUT_KEYS)
	}
	summary = safe_text(fields.get("画面总结"))
	if not summary:
	first_line = re.split(r"[\n。]", text, maxsplit=1)[0]
	summary = safe_text(first_line)
	visible_text = split_natural_items(fields.get("可见文字", ""), max_items=32)
	interaction_data = split_natural_items(fields.get("互动数据", ""), max_items=16)
	functions = natural_function_entries(fields.get("功能入口", ""))
	evidence_ids = dedupe_texts(re.findall(r"\bE\d{2,8}\b", normalize_evidence_id(fields.get("关键证据", ""))), max_items=16)
	if not evidence_ids:
	evidence_ids = dedupe_texts(
	eid for func in functions if isinstance(func, dict) for eid in func.get("evidence_ids", [])
	)
	return {
	"画面总结": summary,
	"可见文字": visible_text,
	"互动数据": interaction_data,
	"功能入口": functions,
	"关键证据": evidence_ids,
	}


	def safe_json_loads_with_repair(text: str) -> Tuple[Optional[Dict[str, Any]], bool, bool, bool]:
	obj, strict_ok = json_object_from_text(text)
	if strict_ok:
	return obj, True, False, True
	repaired_text = repair_generated_json_text(text)
	obj, repaired_json_ok = json_object_from_text(repaired_text)
	if repaired_json_ok:
	return obj, True, repaired_text.strip() != str(text or "").strip(), False
	obj = jsonish_prediction_object(repaired_text)
	if obj is not None:
	return obj, True, True, False
	return None, False, False, False


	def safe_json_loads(text: str) -> Tuple[Optional[Dict[str, Any]], bool]:
	obj, ok, _, _ = safe_json_loads_with_repair(text)
	return obj, ok


	def load_seq2seq_tokenizer(model_name_or_path: str, model_hint: str = ""):
	name = f"{model_name_or_path} {model_hint}".lower()
	load_path = str(model_name_or_path)
	path = Path(load_path)
	has_spiece = path.is_dir() and (path / "spiece.model").exists()
	if has_spiece or "mt5" in name or "t5" in name:
	hint_path = Path(str(model_hint)) if model_hint else None
	if path.is_dir() and not (path / "spiece.model").exists() and hint_path and (hint_path / "spiece.model").exists():
	load_path = str(hint_path)
	return T5Tokenizer.from_pretrained(load_path, fix_mistral_regex=True)
	return AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)


	def char_lcs(a: str, b: str) -> int:
	if not a or not b:
	return 0
	prev = [0] * (len(b) + 1)
	for ca in a:
	cur = [0]
	for j, cb in enumerate(b, start=1):
	if ca == cb:
	cur.append(prev[j - 1] + 1)
	else:
	cur.append(max(cur[-1], prev[j]))
	prev = cur
	return prev[-1]


	def rouge_l_char(pred: str, ref: str) -> float:
	if not pred or not ref:
	return 0.0
	lcs = char_lcs(pred, ref)
	prec = lcs / max(1, len(pred))
	rec = lcs / max(1, len(ref))
	if prec + rec == 0:
	return 0.0
	return 2 * prec * rec / (prec + rec)


	def extract_summary(obj: Optional[Dict[str, Any]]) -> str:
	if not obj:
	return ""
	return safe_text(obj.get("画面总结") or obj.get("summary_zh") or obj.get("summary"))


	def extract_evidence_ids(obj: Optional[Dict[str, Any]]) -> List[str]:
	if not obj:
	return []
	values = obj.get("关键证据") or obj.get("key_ui_clues") or obj.get("evidence") or []
	out: List[str] = []
	if isinstance(values, list):
	for value in values:
	if isinstance(value, str):
	normalized = normalize_evidence_id(value)
	if normalized:
	out.append(normalized)
	elif isinstance(value, dict):
	out.extend(normalize_evidence_id(x) for x in value.get("evidence_ids", []) if normalize_evidence_id(x))
	return out


	def extract_function_entries(obj: Optional[Dict[str, Any]]) -> List[Any]:
	if not isinstance(obj, dict):
	return []
	values = obj.get("功能入口") or obj.get("ui_functions") or []
	return values if isinstance(values, list) else []


	def extract_function_names(obj: Optional[Dict[str, Any]]) -> List[str]:
	names = []
	for value in extract_function_entries(obj):
	if isinstance(value, dict):
	names.append(safe_text(value.get("name")))
	else:
	names.append(safe_text(value))
	return [name for name in names if name]


	def extract_function_evidence_ids(target: Dict[str, Any]) -> List[str]:
	ids: List[str] = []
	for value in extract_function_entries(target):
	if isinstance(value, dict):
	ids.extend(normalize_evidence_id(eid) for eid in value.get("evidence_ids", []) if normalize_evidence_id(eid))
	return ids


	def extract_named_function_evidence_ids(target: Dict[str, Any], keyword: str) -> List[str]:
	ids: List[str] = []
	for value in extract_function_entries(target):
	if isinstance(value, dict) and keyword in safe_text(value.get("name")):
	ids.extend(normalize_evidence_id(eid) for eid in value.get("evidence_ids", []) if normalize_evidence_id(eid))
	return ids


	def has_search_function(obj: Optional[Dict[str, Any]]) -> bool:
	return any("搜索" in name for name in extract_function_names(obj))


	def count_search_functions(obj: Optional[Dict[str, Any]]) -> int:
	return sum(1 for name in extract_function_names(obj) if "搜索" in name)


	def count_bare_search_functions(obj: Optional[Dict[str, Any]]) -> int:
	return sum(1 for name in extract_function_names(obj) if is_bare_search_function_name(name))


	def normalized_prediction_obj(obj: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
	if not isinstance(obj, dict):
	return None
	return {
	"画面总结": safe_text(obj.get("画面总结") or obj.get("summary_zh") or obj.get("summary")),
	"可见文字": obj.get("可见文字") or obj.get("visible_text") or [],
	"互动数据": obj.get("互动数据") or obj.get("interaction_data") or [],
	"功能入口": obj.get("功能入口") or obj.get("ui_functions") or [],
	"关键证据": obj.get("关键证据") or obj.get("key_ui_clues") or obj.get("evidence") or [],
	}


	def element_id(item: Dict[str, Any]) -> str:
	return safe_text(item.get("id") or item.get("ocr_id"))


	def visible_text_from_row(row: Dict[str, Any], max_items: int = 16) -> List[str]:
	values: List[str] = []
	seen = set()
	for group_name in ("ocr_items", "ui_items"):
	for item in row.get(group_name, []) or []:
	text = safe_text(item.get("text"))
	if not text or text in seen or re.fullmatch(r"E\d+", text):
	continue
	if len(text) == 1 and not re.search(r"[\u4e00-\u9fffA-Za-z0-9]", text):
	continue
	if re.fullmatch(r"[\s\-_.:：/]+", text):
	continue
	values.append(text[:80])
	seen.add(text)
	if len(values) >= max_items:
	return values
	return values


	def visible_texts_from_items(items: Iterable[Dict[str, Any]], max_items: int = 16) -> List[str]:
	values: List[str] = []
	seen = set()
	for item in items or []:
	text = safe_text(item.get("text"))
	if not text or text in seen or re.fullmatch(r"E\d+", text):
	continue
	if len(text) == 1 and not re.search(r"[\u4e00-\u9fffA-Za-z0-9]", text):
	continue
	if re.fullmatch(r"[\s\-_.:：/]+", text):
	continue
	values.append(text[:80])
	seen.add(text)
	if len(values) >= max_items:
	break
	return values


	def maybe_dropout_texts(values: List[str], dropout_rate: float) -> List[str]:
	if dropout_rate <= 0.0 or not values:
	return values
	kept = [value for value in values if random.random() >= dropout_rate]
	if kept:
	return kept
	return [random.choice(values)]


	def build_context_text(row: Dict[str, Any], args: argparse.Namespace, screen_text_dropout_rate: float = 0.0) -> str:
	app_text = safe_text(row.get("app"))
	instruction_text = safe_text(row.get("instruction"))
	context_format = str(getattr(args, "context_text_format", "rich") or "rich").lower()
	if context_format == "text_only":
	parts = [f"app: {app_text}", f"task: {instruction_text}"]
	if bool(getattr(args, "context_include_screen_text", False)):
	max_screen_items = int(getattr(args, "context_screen_text_items", 32) or 32)
	ocr_text = visible_texts_from_items(row.get("ocr_items") or [], max_items=max_screen_items)
	ui_text = visible_texts_from_items(row.get("ui_items") or [], max_items=max_screen_items)
	ocr_text = maybe_dropout_texts(ocr_text, screen_text_dropout_rate)
	ui_text = maybe_dropout_texts(ui_text, screen_text_dropout_rate)
	if ocr_text:
	parts.append("ocr: " + " \| ".join(ocr_text))
	if ui_text:
	parts.append("ui: " + " \| ".join(ui_text))
	return "\n".join(parts)

	context_text = f"应用:{app_text} 任务:{instruction_text}"
	if bool(getattr(args, "context_include_screen_text", False)):
	screen_text = visible_text_from_row(row, max_items=int(getattr(args, "context_screen_text_items", 32) or 32))
	screen_text = maybe_dropout_texts(screen_text, screen_text_dropout_rate)
	if screen_text:
	context_text = f"{context_text} 屏幕文字:{' \| '.join(screen_text)}"
	return context_text


	def prediction_from_summary(row: Dict[str, Any], summary_text: str) -> Dict[str, Any]:
	# decoder 在 summary 模式下只生成 "画面总结"，其它字段必须留空，
	# 任何来自输入行（OCR/UI 项）的回填都会污染评估指标，让 rouge 之外的字段失真。
	# 结构化字段如需填充，必须由后续 apply_structured_*_predictions（来自辅助 head）显式写入。
	return {
	"画面总结": safe_text(summary_text),
	"可见文字": [],
	"互动数据": [],
	"功能入口": [],
	"关键证据": [],
	}


	def repair_prediction_with_context(row: Dict[str, Any], pred_obj: Optional[Dict[str, Any]]) -> Tuple[Dict[str, Any], bool]:
	app = safe_text(row.get("app")) or "移动应用"
	instruction = safe_text(row.get("instruction"))
	has_search_task = row_has_search_task(row)
	has_search_evidence = row_has_visible_search_evidence(row)
	repaired = normalized_prediction_obj(pred_obj)
	changed = repaired is None
	if repaired is None:
	repaired = {"画面总结": "", "可见文字": [], "互动数据": [], "功能入口": [], "关键证据": []}

	summary = safe_text(repaired.get("画面总结"))
	search_overuse = ("搜索页面" in summary or "搜索结果" in summary) and "搜索" not in instruction
	generic_app = "App的App" in summary or "手机App的App" in summary or summary.count("App") >= 3
	needs_summary = not summary or app not in summary or generic_app or search_overuse
	if needs_summary:
	if instruction:
	summary = f"这是一个{app}界面，页面包含若干文字内容和可操作的 UI 入口。当前任务语境是：{instruction}。"
	else:
	summary = f"这是一个{app}界面，页面包含若干文字内容和可操作的 UI 入口。"
	repaired["画面总结"] = summary
	changed = True

	visible = repaired.get("可见文字") or []
	if isinstance(visible, list):
	cleaned_visible = [value for value in visible if not re.fullmatch(r"E\d+", safe_text(value))][:16]
	if cleaned_visible != visible:
	repaired["可见文字"] = cleaned_visible
	changed = True
	else:
	repaired["可见文字"] = []
	changed = True

	if not isinstance(repaired.get("互动数据"), list):
	repaired["互动数据"] = []
	changed = True
	functions = repaired.get("功能入口") or []
	if isinstance(functions, list):
	cleaned_functions = []
	for function in functions:
	name = safe_text(function.get("name") if isinstance(function, dict) else function)
	if is_generic_function_name(name, row):
	changed = True
	continue
	if "搜索" in name and not (has_search_task and has_search_evidence):
	changed = True
	continue
	cleaned_functions.append(function)
	if cleaned_functions != functions:
	repaired["功能入口"] = cleaned_functions
	else:
	repaired["功能入口"] = []
	changed = True
	evidence = repaired.get("关键证据") or []
	if not isinstance(evidence, list):
	evidence = []
	if not evidence:
	evidence = list(row.get("weak_evidence_ids") or [])[:8]
	repaired["关键证据"] = evidence
	changed = True
	return repaired, changed


	def row_has_visible_search_evidence(row: Dict[str, Any]) -> bool:
	for item in list(row.get("ui_items", [])) + list(row.get("ocr_items", [])):
	text = safe_text(item.get("text"))
	item_type = safe_text(item.get("type")).lower()
	if "搜索" in text or "search" in item_type:
	return True
	if text in {"搜", "搜索框"} or "输入" in text:
	return True
	return False


	def row_has_search_context(row: Dict[str, Any]) -> bool:
	instruction = safe_text(row.get("instruction"))
	screen_text = "".join(safe_text(item.get("text")) for item in (row.get("ui_items") or []))
	screen_text += "".join(safe_text(item.get("text")) for item in (row.get("ocr_items") or []))
	return "搜索" in instruction or "搜索" in screen_text


	def row_has_search_task(row: Dict[str, Any]) -> bool:
	instruction = safe_text(row.get("instruction"))
	return "搜索" in instruction or "查找" in instruction or "搜" in instruction


	def is_generic_function_name(name: str, row: Dict[str, Any]) -> bool:
	text = safe_text(name)
	app = safe_text(row.get("app"))
	if not text:
	return True
	generic_names = {
	"入口",
	"功能入口",
	"功能反馈",
	"打开应用",
	"进入应用",
	"打开App",
	"进入App",
	}
	if text in generic_names:
	return True
	if app and text in {f"{app}入口", f"进入{app}", f"打开{app}", f"进入{app}App", f"打开{app}App"}:
	return True
	if re.fullmatch(r"(进入\|打开).{1,12}(App\|应用)?", text):
	return True
	return False


	def is_bare_search_function_name(name: str) -> bool:
	return safe_text(name) == "搜索"


	def is_search_ui_item(item: Dict[str, Any]) -> bool:
	text = safe_text(item.get("text"))
	item_type = safe_text(item.get("type")).lower()
	if "搜索" in text or "search" in item_type:
	return True
	return text in {"搜", "搜索框"} or "输入" in text


	def is_structured_search_candidate_item(item: Dict[str, Any], strict: bool = False) -> bool:
	if not is_search_ui_item(item):
	return False
	if not strict:
	return True
	text = safe_text(item.get("text"))
	if text in {"搜索", "搜", "搜索框"}:
	return True
	passive_terms = [
	"历史搜索",
	"搜索历史",
	"搜索发现",
	"深度搜索",
	"AI搜索",
	"热门搜索",
	"最近搜索",
	"相关搜索",
	"搜索记录",
	"清除搜索记录",
	"搜索结果",
	"搜索来源",
	"搜索推荐",
	"根据你的搜索",
	]
	if any(term in text for term in passive_terms):
	return False
	control_terms = [
	"搜索框",
	"搜索、提问",
	"搜索地点",
	"搜索店内",
	"搜索商品",
	"搜索景点",
	"搜索机票",
	"附近搜索",
	"请输入",
	"输入",
	]
	if any(term in text for term in control_terms):
	return True
	return text.startswith("搜索") and len(text) <= 8


	def structured_function_name(item: Dict[str, Any], row: Dict[str, Any]) -> str:
	text = safe_text(item.get("text"))
	item_type = safe_text(item.get("type")).lower()
	if is_search_ui_item(item):
	return "搜索功能入口"
	if text:
	if len(text) <= 12:
	return text
	if "购物车" in text:
	return "购物车"
	if "消息" in text:
	return "消息入口"
	if "设置" in text:
	return "设置入口"
	return text[:12]
	if item.get("is_action_target"):
	instruction = safe_text(row.get("instruction"))
	for keyword in ["商城", "购物车", "消息", "订单", "收藏", "关注", "分享", "返回"]:
	if keyword in instruction:
	return f"{keyword}入口" if keyword not in {"收藏", "关注", "分享", "返回"} else keyword
	if "button" in item_type:
	return "按钮入口"
	return "功能入口"


	def apply_structured_function_predictions(
	row: Dict[str, Any],
	pred_obj: Optional[Dict[str, Any]],
	function_scores: torch.Tensor,
	search_scores: torch.Tensor,
	args: argparse.Namespace,
	) -> Dict[str, Any]:
	mode = str(getattr(args, "structured_function_mode", "decoder") or "decoder").lower()
	if mode == "decoder":
	return pred_obj if isinstance(pred_obj, dict) else normalized_prediction_obj(pred_obj)
	structured = normalized_prediction_obj(pred_obj)
	if structured is None:
	structured = {"画面总结": "", "可见文字": [], "互动数据": [], "功能入口": [], "关键证据": []}
	function_threshold = float(getattr(args, "structured_function_threshold", 0.5) or 0.5)
	search_threshold = float(getattr(args, "structured_search_threshold", function_threshold) or function_threshold)
	max_functions = int(getattr(args, "structured_max_functions", 12) or 12)
	has_search_task = row_has_search_task(row)
	has_search_evidence = row_has_visible_search_evidence(row)
	strict_search_candidates = bool(getattr(args, "structured_strict_search_candidates", False))
	items = (row.get("ui_items") or [])[: int(getattr(args, "max_elements", len(function_scores)))]
	candidates: List[Tuple[float, Dict[str, Any]]] = []
	for idx, item in enumerate(items[: len(function_scores)]):
	function_score = float(function_scores[idx])
	search_score = float(search_scores[idx]) if idx < len(search_scores) else 0.0
	raw_item_is_search = is_search_ui_item(item)
	item_is_search = is_structured_search_candidate_item(item, strict=strict_search_candidates)
	if raw_item_is_search and not item_is_search:
	continue
	if item_is_search:
	keep = has_search_task and has_search_evidence and search_score >= search_threshold
	else:
	keep = function_score >= function_threshold
	if not keep:
	continue
	evidence_id = safe_text(item.get("id") or item.get("ocr_id"))
	if not evidence_id:
	continue
	name = structured_function_name(item, row)
	if is_generic_function_name(name, row):
	continue
	if "搜索" in name and not (has_search_task and has_search_evidence):
	continue
	score = search_score if item_is_search else function_score
	candidates.append((score, {"name": name, "evidence_ids": [evidence_id]}))
	candidates.sort(key=lambda value: value[0], reverse=True)
	functions: List[Dict[str, Any]] = []
	seen = set()
	for _, function in candidates:
	key = (function["name"], tuple(function.get("evidence_ids", [])))
	if key in seen:
	continue
	seen.add(key)
	functions.append(function)
	if len(functions) >= max_functions:
	break
	structured["功能入口"] = functions
	return structured


	def apply_structured_evidence_predictions(
	row: Dict[str, Any],
	pred_obj: Optional[Dict[str, Any]],
	evidence_scores: torch.Tensor,
	args: argparse.Namespace,
	) -> Dict[str, Any]:
	mode = str(getattr(args, "structured_evidence_mode", "decoder") or "decoder").lower()
	structured = normalized_prediction_obj(pred_obj)
	if structured is None:
	structured = {"画面总结": "", "可见文字": [], "互动数据": [], "功能入口": [], "关键证据": []}
	if mode == "decoder":
	return structured
	if mode != "heads":
	raise ValueError("structured_evidence_mode must be one of: decoder, heads")
	threshold = float(getattr(args, "structured_evidence_threshold", 0.5) or 0.5)
	max_evidence = int(getattr(args, "structured_max_evidence", 8) or 8)
	fallback_top1 = bool(getattr(args, "structured_evidence_fallback_top1", True))
	items = (row.get("ui_items") or [])[: int(getattr(args, "max_elements", len(evidence_scores)))]
	candidates: List[Tuple[float, str]] = []
	for idx, item in enumerate(items[: len(evidence_scores)]):
	eid = element_id(item)
	if not eid:
	continue
	score = float(evidence_scores[idx])
	candidates.append((score, eid))
	candidates.sort(key=lambda value: value[0], reverse=True)
	selected: List[str] = []
	seen = set()
	for score, eid in candidates:
	if score < threshold:
	continue
	if eid in seen:
	continue
	selected.append(eid)
	seen.add(eid)
	if len(selected) >= max_evidence:
	break
	if not selected and fallback_top1 and candidates:
	selected.append(candidates[0][1])
	structured["关键证据"] = selected
	return structured


	def build_context_summary(row: Dict[str, Any]) -> str:
	app = safe_text(row.get("app")) or "移动应用"
	instruction = safe_text(row.get("instruction"))
	if instruction:
	return f"这是一个{app}界面，页面包含若干文字内容和可操作的 UI 入口。当前任务语境是：{instruction}。"
	return f"这是一个{app}界面，页面包含若干文字内容和可操作的 UI 入口。"


	def canonicalize_target_with_context(
	row: Dict[str, Any],
	target: Dict[str, Any],
	drop_bare_search_functions: bool = False,
	) -> Dict[str, Any]:
	canonical = dict(target or {})
	# 只在 summary_zh 缺失时回填模板，绝对不要覆盖已有标签——
	# 之前无条件覆盖会把任何真实摘要替换成 "这是一个X界面...当前任务语境是..." 模板，
	# 是导致全量训练目标 100% 同质化、模型只学到模板的根因之一。
	existing_summary = safe_text(canonical.get("summary_zh"))
	if not existing_summary:
	canonical["summary_zh"] = build_context_summary(row)

	visible = canonical.get("visible_text") or []
	if isinstance(visible, list):
	canonical["visible_text"] = [value for value in visible if not re.fullmatch(r"E\d+", safe_text(value))][:16]
	else:
	canonical["visible_text"] = []

	functions = canonical.get("ui_functions") or []
	if isinstance(functions, list):
	has_search_task = row_has_search_task(row)
	has_search_evidence = row_has_visible_search_evidence(row)
	cleaned_functions = []
	for function in functions:
	name = safe_text(function.get("name") if isinstance(function, dict) else function)
	if is_generic_function_name(name, row):
	continue
	if drop_bare_search_functions and is_bare_search_function_name(name):
	continue
	if "搜索" in name and not (has_search_task and has_search_evidence):
	continue
	cleaned_functions.append(function)
	canonical["ui_functions"] = cleaned_functions[:12]
	else:
	canonical["ui_functions"] = []

	if not isinstance(canonical.get("interaction_data"), list):
	canonical["interaction_data"] = []
	evidence = canonical.get("key_ui_clues") or []
	if not isinstance(evidence, list):
	evidence = []
	if not evidence:
	evidence = list(row.get("weak_evidence_ids") or [])[:8]
	canonical["key_ui_clues"] = evidence[:12]
	return canonical


	class RichScreenshotDataset(Dataset):
	def __init__(self, path: str, max_samples: int = 0, sample_seed: Optional[int] = None):
	self.path = Path(path)
	self.rows = list(read_jsonl(self.path))
	if max_samples and max_samples < len(self.rows):
	if sample_seed is None:
	self.rows = self.rows[:max_samples]
	else:
	rng = random.Random(sample_seed)
	indices = sorted(rng.sample(range(len(self.rows)), max_samples))
	self.rows = [self.rows[index] for index in indices]
	if not self.rows:
	raise ValueError(f"No rows loaded from {path}")

	def __len__(self) -> int:
	return len(self.rows)

	def __getitem__(self, idx: int) -> Dict[str, Any]:
	return self.rows[idx]


	def dataset_diagnostics(rows: List[Dict[str, Any]]) -> Dict[str, Any]:
	ocr_counts: List[int] = []
	ui_counts: List[int] = []
	target_chars: List[int] = []
	missing_target = 0
	short_summary = 0
	missing_image_path = 0
	missing_image_file = 0
	missing_ocr_items = 0
	missing_ui_items = 0
	for row in rows:
	target = row.get("target")
	if not isinstance(target, dict):
	missing_target += 1
	summary = ""
	else:
	summary = safe_text(target.get("summary_zh"))
	target_chars.append(len(summary))
	if len(summary) < 8:
	short_summary += 1
	image_path = safe_text(row.get("image_path"))
	if not image_path:
	missing_image_path += 1
	elif not Path(image_path).exists():
	missing_image_file += 1
	ocr_count = len(row.get("ocr_items") or [])
	ui_count = len(row.get("ui_items") or [])
	ocr_counts.append(ocr_count)
	ui_counts.append(ui_count)
	if ocr_count <= 0:
	missing_ocr_items += 1
	if ui_count <= 0:
	missing_ui_items += 1

	row_count = len(rows)

	def mean(values: List[int]) -> float:
	return float(np.mean(values)) if values else 0.0

	return {
	"rows": row_count,
	"missing_target": missing_target,
	"short_summary": short_summary,
	"missing_image_path": missing_image_path,
	"missing_image_file": missing_image_file,
	"missing_ocr_items": missing_ocr_items,
	"missing_ui_items": missing_ui_items,
	"missing_ocr_rate": missing_ocr_items / max(1, row_count),
	"missing_ui_rate": missing_ui_items / max(1, row_count),
	"ocr_items_mean": mean(ocr_counts),
	"ocr_items_max": int(max(ocr_counts)) if ocr_counts else 0,
	"ui_items_mean": mean(ui_counts),
	"ui_items_max": int(max(ui_counts)) if ui_counts else 0,
	"summary_chars_mean": mean(target_chars),
	"summary_chars_max": int(max(target_chars)) if target_chars else 0,
	}


	def validate_dataset_for_training(split_name: str, diagnostics: Dict[str, Any], args: argparse.Namespace) -> None:
	if not bool(getattr(args, "strict_data_checks", True)):
	return
	errors: List[str] = []
	if diagnostics["missing_target"]:
	errors.append(f"{diagnostics['missing_target']} rows are missing target")
	if diagnostics["short_summary"]:
	errors.append(f"{diagnostics['short_summary']} rows have summary_zh shorter than 8 chars")
	if diagnostics["missing_image_path"]:
	errors.append(f"{diagnostics['missing_image_path']} rows are missing image_path")
	vision_enabled = not bool(getattr(args, "disable_vision", False)) and str(getattr(args, "model_variant", "")) != "annotation_only"
	if vision_enabled and diagnostics["missing_image_file"]:
	errors.append(f"{diagnostics['missing_image_file']} image files do not exist")
	needs_ui_context = bool(getattr(args, "direct_element_tokens", False)) or int(getattr(args, "max_elements", 0) or 0) > 0
	needs_screen_text = bool(getattr(args, "context_include_screen_text", False))
	if needs_ui_context and diagnostics["missing_ui_rate"] > 0.05:
	errors.append(f"{diagnostics['missing_ui_items']} rows are missing ui_items")
	if needs_screen_text and diagnostics["missing_ocr_rate"] > 0.05:
	errors.append(f"{diagnostics['missing_ocr_items']} rows are missing ocr_items")
	if errors:
	raise ValueError(f"{split_name} dataset failed strict data checks: " + "; ".join(errors))


	def token_length_summary(values: List[int], max_length: int) -> Dict[str, Any]:
	if not values:
	return {
	"count": 0,
	"mean": 0.0,
	"p50": 0,
	"p90": 0,
	"p95": 0,
	"p99": 0,
	"max": 0,
	"over_max": 0,
	"over_max_rate": 0.0,
	"at_or_over_max": 0,
	"at_or_over_max_rate": 0.0,
	}
	ordered = sorted(values)

	def pct(percent: float) -> int:
	index = int(round((len(ordered) - 1) * percent / 100.0))
	return int(ordered[max(0, min(len(ordered) - 1, index))])

	over_max = sum(value > max_length for value in values)
	at_or_over_max = sum(value >= max_length for value in values)
	return {
	"count": len(values),
	"mean": float(np.mean(values)),
	"p50": pct(50),
	"p90": pct(90),
	"p95": pct(95),
	"p99": pct(99),
	"max": int(max(values)),
	"configured_max": int(max_length),
	"over_max": int(over_max),
	"over_max_rate": over_max / max(1, len(values)),
	"at_or_over_max": int(at_or_over_max),
	"at_or_over_max_rate": at_or_over_max / max(1, len(values)),
	}


	def tokenizer_diagnostics(rows: List[Dict[str, Any]], tokenizer, args: argparse.Namespace) -> Dict[str, Any]:
	target_lengths: List[int] = []
	context_lengths: List[int] = []
	target_schema = getattr(args, "target_schema", "zh")
	max_target_tokens = int(getattr(args, "max_target_tokens", 384) or 384)
	max_context_tokens = int(getattr(args, "max_context_tokens", 64) or 64)
	for row in rows:
	target = row.get("target") or {}
	if bool(getattr(args, "canonicalize_targets", False)):
	target = canonicalize_target_with_context(
	row,
	target,
	drop_bare_search_functions=bool(getattr(args, "drop_bare_search_functions", False)),
	)
	target_text = target_to_text(target, target_schema)
	context_text = build_context_text(row, args)
	target_lengths.append(len(tokenizer(target_text, add_special_tokens=True).input_ids))
	context_lengths.append(len(tokenizer(context_text, add_special_tokens=True).input_ids))
	return {
	"target_tokens": token_length_summary(target_lengths, max_target_tokens),
	"context_tokens": token_length_summary(context_lengths, max_context_tokens),
	}


	def validate_token_lengths(split_name: str, diagnostics: Dict[str, Any], args: argparse.Namespace) -> None:
	if not bool(getattr(args, "strict_data_checks", True)):
	return
	max_allowed_rate = float(getattr(args, "max_target_truncation_rate", 0.01) or 0.0)
	target_stats = diagnostics.get("target_tokens") or {}
	over_rate = float(target_stats.get("over_max_rate", 0.0) or 0.0)
	if over_rate > max_allowed_rate:
	raise ValueError(
	f"{split_name} target token truncation rate {over_rate:.2%} exceeds "
	f"max_target_truncation_rate={max_allowed_rate:.2%}; "
	f"max_target_tokens={target_stats.get('configured_max')}, "
	f"p95={target_stats.get('p95')}, p99={target_stats.get('p99')}, max={target_stats.get('max')}"
	)
	if int(getattr(args, "eval_max_new_tokens", 0) or 0) < int(getattr(args, "max_target_tokens", 0) or 0):
	raise ValueError("eval_max_new_tokens must be >= max_target_tokens for natural_zh validation.")


	class RichCollator:
	def __init__(self, tokenizer, image_processor, args: argparse.Namespace, is_training: bool = False):
	self.tokenizer = tokenizer
	self.image_processor = image_processor
	self.args = args
	self.is_training = is_training
	self.use_vision = not bool(getattr(args, "disable_vision", False)) and str(getattr(args, "model_variant", "")) != "annotation_only"

	def _load_images(self, image_path: str) -> List[Image.Image]:
	path = Path(image_path)
	try:
	image = Image.open(path).convert("RGB")
	except Exception:
	image = Image.new("RGB", (self.args.image_size, self.args.image_size), color=(245, 245, 245))
	images = [image]
	if self.args.num_vertical_crops > 0:
	width, height = image.size
	crops = self.args.num_vertical_crops
	for i in range(crops):
	top = int(height * i / crops)
	bottom = int(height * (i + 1) / crops)
	images.append(image.crop((0, top, width, bottom)))
	return images

	def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
	bsz = len(batch)
	if self.use_vision:
	flat_images: List[Image.Image] = []
	crop_counts: List[int] = []
	for row in batch:
	images = self._load_images(row.get("image_path", ""))
	crop_counts.append(len(images))
	flat_images.extend(images)
	image_kwargs = {
	"return_tensors": "pt",
	"do_resize": True,
	"size": {"height": self.args.image_size, "width": self.args.image_size},
	}
	pixel_values = self.image_processor(images=flat_images, **image_kwargs)["pixel_values"]
	crops = crop_counts[0]
	pixel_values = pixel_values.view(bsz, crops, *pixel_values.shape[1:])
	else:
	pixel_values = torch.empty(bsz, 0, 3, self.args.image_size, self.args.image_size, dtype=torch.float32)

	all_elements: List[List[Dict[str, Any]]] = []
	flat_element_texts: List[str] = []
	element_mask = torch.zeros(bsz, self.args.max_elements, dtype=torch.bool)
	bboxes = torch.zeros(bsz, self.args.max_elements, 4, dtype=torch.float32)
	type_ids = torch.zeros(bsz, self.args.max_elements, dtype=torch.long)
	source_ids = torch.zeros(bsz, self.args.max_elements, dtype=torch.long)
	loc_ids = torch.zeros(bsz, self.args.max_elements, dtype=torch.long)
	confs = torch.zeros(bsz, self.args.max_elements, 1, dtype=torch.float32)
	action_flags = torch.zeros(bsz, self.args.max_elements, 1, dtype=torch.float32)
	evidence_labels = torch.zeros(bsz, self.args.max_elements, dtype=torch.float32)
	numeric_labels = torch.zeros(bsz, self.args.max_elements, dtype=torch.float32)
	ui_function_labels = torch.zeros(bsz, self.args.max_elements, dtype=torch.float32)
	search_function_labels = torch.zeros(bsz, self.args.max_elements, dtype=torch.float32)

	type_vocab: Dict[str, int] = {"visual": 0, "text": 1, "button": 2, "label": 3, "text_number": 4}
	source_vocab: Dict[str, int] = {"unknown": 0, "cmgui": 1, "ocr": 2, "rule": 3}
	loc_vocab: Dict[str, int] = {
	"unknown": 0,
	"top-left": 1,
	"top-center": 2,
	"top-right": 3,
	"middle-left": 4,
	"middle-center": 5,
	"middle-right": 6,
	"bottom-left": 7,
	"bottom-center": 8,
	"bottom-right": 9,
	}

	target_texts: List[str] = []
	context_texts: List[str] = []
	section_labels = torch.zeros(bsz, len(SECTION_NAMES), dtype=torch.float32)

	for row_idx, row in enumerate(batch):
	target = row.get("target") or {}
	if bool(getattr(self.args, "canonicalize_targets", False)):
	target = canonicalize_target_with_context(
	row,
	target,
	drop_bare_search_functions=bool(getattr(self.args, "drop_bare_search_functions", False)),
	)
	target_texts.append(target_to_text(target, getattr(self.args, "target_schema", "zh")))
	screen_text_dropout_rate = (
	float(getattr(self.args, "context_screen_text_dropout_rate", 0.0) or 0.0) if self.is_training else 0.0
	)
	context_texts.append(build_context_text(row, self.args, screen_text_dropout_rate=screen_text_dropout_rate))
	evidence_ids = set(target.get("key_ui_clues", []) or row.get("weak_evidence_ids", []))
	function_evidence = set(extract_function_evidence_ids(target))
	search_function_evidence = set(extract_named_function_evidence_ids(target, "搜索"))
	interaction_evidence = set()
	for interaction in target.get("interaction_data", []) or []:
	interaction_evidence.update(interaction.get("evidence_ids", []) or [])
	for sec_idx, name in enumerate(SECTION_NAMES):
	if target.get(name):
	section_labels[row_idx, sec_idx] = 1.0
	if bool(getattr(self.args, "task_intent_context", False)):
	task_type = "检索" if row_has_search_task(row) else "常规"
	context_texts[-1] = f"{context_texts[-1]} 任务类别:{task_type}"
	elements = (row.get("ui_items") or [])[: self.args.max_elements]
	all_elements.append(elements)
	for elem_idx in range(self.args.max_elements):
	if elem_idx < len(elements):
	elem = elements[elem_idx]
	text = safe_text(elem.get("text"))
	flat_element_texts.append(f"{elem.get('type','text')} {elem.get('source','')} {text}")
	element_mask[row_idx, elem_idx] = True
	bbox = elem.get("bbox") or [0, 0, 1, 1]
	if len(bbox) == 4:
	bboxes[row_idx, elem_idx] = torch.tensor(bbox, dtype=torch.float32)
	type_ids[row_idx, elem_idx] = type_vocab.get(safe_text(elem.get("type")), 1)
	source_ids[row_idx, elem_idx] = source_vocab.get(safe_text(elem.get("source")), 0)
	loc_ids[row_idx, elem_idx] = loc_vocab.get(safe_text(elem.get("location")), 0)
	confs[row_idx, elem_idx, 0] = float(elem.get("ocr_conf", elem.get("conf", 1.0)) or 1.0)
	action_flags[row_idx, elem_idx, 0] = 1.0 if elem.get("is_action_target") else 0.0
	if elem.get("id") in evidence_ids or elem.get("ocr_id") in evidence_ids:
	evidence_labels[row_idx, elem_idx] = 1.0
	if elem.get("id") in function_evidence or elem.get("ocr_id") in function_evidence:
	ui_function_labels[row_idx, elem_idx] = 1.0
	if elem.get("id") in search_function_evidence or elem.get("ocr_id") in search_function_evidence:
	search_function_labels[row_idx, elem_idx] = 1.0
	if elem.get("id") in interaction_evidence or re.search(r"\d", text):
	numeric_labels[row_idx, elem_idx] = 1.0
	else:
	flat_element_texts.append("")

	elem_tokens = self.tokenizer(
	flat_element_texts,
	padding=True,
	truncation=True,
	max_length=self.args.max_element_tokens,
	return_tensors="pt",
	)
	elem_input_ids = elem_tokens.input_ids.view(bsz, self.args.max_elements, -1)
	elem_attention_mask = elem_tokens.attention_mask.view(bsz, self.args.max_elements, -1)

	context_tokens = self.tokenizer(
	context_texts,
	padding=True,
	truncation=True,
	max_length=self.args.max_context_tokens,
	return_tensors="pt",
	)

	target_tokens = self.tokenizer(
	target_texts,
	padding=True,
	truncation=True,
	max_length=self.args.max_target_tokens,
	return_tensors="pt",
	)
	labels = target_tokens.input_ids
	labels[labels == self.tokenizer.pad_token_id] = -100

	return {
	"rows": batch,
	"pixel_values": pixel_values,
	"element_input_ids": elem_input_ids,
	"element_attention_mask": elem_attention_mask,
	"element_mask": element_mask,
	"bboxes": bboxes,
	"type_ids": type_ids,
	"source_ids": source_ids,
	"loc_ids": loc_ids,
	"confs": confs,
	"action_flags": action_flags,
	"context_input_ids": context_tokens.input_ids,
	"context_attention_mask": context_tokens.attention_mask,
	"labels": labels,
	"evidence_labels": evidence_labels,
	"ui_function_labels": ui_function_labels,
	"search_function_labels": search_function_labels,
	"section_labels": section_labels,
	"numeric_labels": numeric_labels,
	}


	class BottleneckPooler(nn.Module):
	def __init__(self, hidden_size: int, num_queries: int = 64, num_heads: int = 8):
	super().__init__()
	self.queries = nn.Parameter(torch.randn(num_queries, hidden_size) * 0.02)
	self.attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
	self.norm = nn.LayerNorm(hidden_size)

	def forward(self, memory: torch.Tensor, key_padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
	bsz = memory.size(0)
	queries = self.queries.unsqueeze(0).expand(bsz, -1, -1)
	pooled, _ = self.attn(queries, memory, memory, key_padding_mask=key_padding_mask)
	return self.norm(pooled + queries)


	class RichGroundedModel(nn.Module):
	def __init__(self, args: argparse.Namespace):
	super().__init__()
	self.args = args
	self.model_variant = args.model_variant
	self.use_vision = not bool(getattr(args, "disable_vision", False)) and self.model_variant != "annotation_only"
	self.vision = AutoModel.from_pretrained(args.vision_model) if self.use_vision else None
	self.decoder = AutoModelForSeq2SeqLM.from_pretrained(args.decoder_model)
	if bool(getattr(args, "decoder_gradient_checkpointing", False)) and hasattr(self.decoder, "gradient_checkpointing_enable"):
	self.decoder.gradient_checkpointing_enable()
	if hasattr(self.decoder.config, "use_cache"):
	self.decoder.config.use_cache = False
	if bool(getattr(args, "freeze_decoder", False)):
	for param in self.decoder.parameters():
	param.requires_grad = False
	if self.vision is not None and bool(getattr(args, "vision_gradient_checkpointing", False)) and hasattr(self.vision, "gradient_checkpointing_enable"):
	self.vision.gradient_checkpointing_enable()
	self.hidden_size = self.decoder.config.d_model
	vision_hidden = self._vision_hidden_size() if self.vision is not None else self.hidden_size
	self.visual_proj = nn.Linear(vision_hidden, self.hidden_size)
	self.elem_text_proj = nn.Linear(self.hidden_size, self.hidden_size)
	self.type_emb = nn.Embedding(16, self.hidden_size)
	self.source_emb = nn.Embedding(8, self.hidden_size)
	self.loc_emb = nn.Embedding(16, self.hidden_size)
	self.bbox_proj = nn.Sequential(nn.Linear(6, self.hidden_size), nn.GELU(), nn.Linear(self.hidden_size, self.hidden_size))
	self.roi_proj = nn.Linear(vision_hidden, self.hidden_size)
	self.context_proj = nn.Linear(self.hidden_size, self.hidden_size)
	enc_layer = nn.TransformerEncoderLayer(
	d_model=self.hidden_size,
	nhead=8,
	dim_feedforward=self.hidden_size * 4,
	dropout=0.1,
	batch_first=True,
	norm_first=False,
	)
	self.layout_encoder = nn.TransformerEncoder(enc_layer, num_layers=2)
	fusion_layer = nn.TransformerEncoderLayer(
	d_model=self.hidden_size,
	nhead=8,
	dim_feedforward=self.hidden_size * 4,
	dropout=0.1,
	batch_first=True,
	norm_first=False,
	)
	self.fusion = nn.TransformerEncoder(fusion_layer, num_layers=2)
	self.pooler = BottleneckPooler(self.hidden_size, num_queries=args.bottleneck_queries, num_heads=8)
	self.evidence_head = nn.Linear(self.hidden_size, 1)
	self.ui_function_head = nn.Linear(self.hidden_size, 1)
	self.search_function_head = nn.Linear(self.hidden_size, 1)
	self.function_signal_proj = nn.Linear(1, self.hidden_size)
	self.search_signal_proj = nn.Linear(1, self.hidden_size)
	nn.init.zeros_(self.function_signal_proj.weight)
	nn.init.zeros_(self.function_signal_proj.bias)
	nn.init.zeros_(self.search_signal_proj.weight)
	nn.init.zeros_(self.search_signal_proj.bias)
	self.numeric_head = nn.Linear(self.hidden_size, 1)
	self.section_head = nn.Linear(self.hidden_size, len(SECTION_NAMES))
	if self.vision is not None and args.freeze_vision:
	self.freeze_vision(args.unfreeze_vision_last_ratio)

	def _vision_hidden_size(self) -> int:
	if self.vision is None:
	return self.hidden_size
	config = self.vision.config
	if hasattr(config, "vision_config"):
	return int(config.vision_config.hidden_size)
	return int(getattr(config, "hidden_size"))

	def freeze_vision(self, unfreeze_last_ratio: float) -> None:
	if self.vision is None:
	return
	for param in self.vision.parameters():
	param.requires_grad = False
	if unfreeze_last_ratio <= 0:
	return
	layers = None
	vision_model = getattr(self.vision, "vision_model", self.vision)
	encoder = getattr(vision_model, "encoder", None)
	if encoder is not None:
	layers = getattr(encoder, "layers", None) or getattr(encoder, "layer", None)
	if layers:
	keep = max(1, int(len(layers) * unfreeze_last_ratio))
	for layer in layers[-keep:]:
	for param in layer.parameters():
	param.requires_grad = True

	def use_native_context_forward(self) -> bool:
	return bool(getattr(self.args, "native_context_forward", False))

	def mean_embed(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
	emb = self.decoder.get_input_embeddings()(input_ids)
	mask = attention_mask.unsqueeze(-1).float()
	return (emb * mask).sum(dim=-2) / mask.sum(dim=-2).clamp_min(1.0)

	def encode_vision(self, pixel_values: torch.Tensor) -> torch.Tensor:
	bsz, crops, channels, height, width = pixel_values.shape
	flat = pixel_values.view(bsz * crops, channels, height, width)
	vision_model = getattr(self.vision, "vision_model", self.vision)
	vision_trainable = any(param.requires_grad for param in self.vision.parameters())
	with torch.set_grad_enabled(vision_trainable):
	try:
	out = vision_model(pixel_values=flat, interpolate_pos_encoding=True)
	except TypeError:
	out = vision_model(pixel_values=flat)
	tokens = out.last_hidden_state
	tokens = tokens.view(bsz, crops, tokens.size(1), tokens.size(2))
	return tokens

	def reduce_visual_tokens(self, visual: torch.Tensor) -> torch.Tensor:
	bsz, crops, num_tokens, hidden = visual.shape
	max_visual_tokens = int(getattr(self.args, "max_visual_tokens", 0) or 0)
	if max_visual_tokens <= 0 or crops * num_tokens <= max_visual_tokens:
	return self.visual_proj(visual.flatten(1, 2))

	per_crop = max(1, max_visual_tokens // max(1, crops))
	reduced_crops: List[torch.Tensor] = []
	for crop_idx in range(crops):
	crop_tokens = visual[:, crop_idx, :, :]
	grid = int(math.sqrt(num_tokens))
	cls_token = None
	patch_tokens = crop_tokens
	if grid * grid != num_tokens:
	maybe_grid = int(math.sqrt(num_tokens - 1))
	if maybe_grid * maybe_grid == num_tokens - 1:
	cls_token = crop_tokens[:, :1, :]
	patch_tokens = crop_tokens[:, 1:, :]
	grid = maybe_grid
	else:
	target = min(per_crop, num_tokens)
	indices = torch.linspace(0, num_tokens - 1, steps=target, device=visual.device).round().long()
	reduced_crops.append(crop_tokens.index_select(1, indices))
	continue
	cls_budget = 1 if cls_token is not None and per_crop > 1 else 0
	patch_budget = max(1, per_crop - cls_budget)
	out_grid = max(1, int(math.sqrt(patch_budget)))
	patch_tokens = patch_tokens.view(bsz, grid, grid, hidden).permute(0, 3, 1, 2)
	pooled = F.adaptive_avg_pool2d(patch_tokens, (out_grid, out_grid))
	pooled = pooled.permute(0, 2, 3, 1).reshape(bsz, out_grid * out_grid, hidden)
	if cls_budget:
	pooled = torch.cat([cls_token, pooled], dim=1)
	reduced_crops.append(pooled)
	return self.visual_proj(torch.cat(reduced_crops, dim=1))

	def roi_pool(self, full_tokens: torch.Tensor, bboxes: torch.Tensor) -> torch.Tensor:
	bsz, num_tokens, hidden = full_tokens.shape
	grid = int(math.sqrt(num_tokens))
	if grid * grid != num_tokens:
	patch_tokens = full_tokens
	grid = int(math.sqrt(num_tokens - 1))
	if grid * grid == num_tokens - 1:
	patch_tokens = full_tokens[:, 1:, :]
	else:
	pooled = full_tokens.mean(dim=1, keepdim=True).expand(-1, bboxes.size(1), -1)
	return pooled
	else:
	patch_tokens = full_tokens
	patch_tokens = patch_tokens.view(bsz, grid, grid, hidden)
	outputs = []
	for b in range(bsz):
	elem_vecs = []
	for bbox in bboxes[b]:
	x1, y1, x2, y2 = bbox.tolist()
	ix1 = max(0, min(grid - 1, int(x1 * grid)))
	iy1 = max(0, min(grid - 1, int(y1 * grid)))
	ix2 = max(ix1 + 1, min(grid, int(math.ceil(x2 * grid))))
	iy2 = max(iy1 + 1, min(grid, int(math.ceil(y2 * grid))))
	region = patch_tokens[b, iy1:iy2, ix1:ix2, :]
	elem_vecs.append(region.reshape(-1, hidden).mean(dim=0))
	outputs.append(torch.stack(elem_vecs, dim=0))
	return torch.stack(outputs, dim=0)

	def build_memory(self, batch: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
	use_activation_checkpointing = self.training and bool(getattr(self.args, "activation_checkpointing", False))
	bsz = batch["element_mask"].size(0)
	if self.vision is not None:
	pixel_values = batch["pixel_values"]
	visual = self.encode_vision(pixel_values)
	full_visual = visual[:, 0, :, :]
	visual_tokens = self.reduce_visual_tokens(visual)
	else:
	full_visual = None
	visual_tokens = batch["element_mask"].new_zeros((bsz, 0, self.hidden_size), dtype=self.decoder.get_input_embeddings().weight.dtype)

	elem_ids = batch["element_input_ids"]
	elem_mask_tok = batch["element_attention_mask"]
	bsz, max_elements, elem_len = elem_ids.shape
	elem_text = self.mean_embed(elem_ids.view(bsz * max_elements, elem_len), elem_mask_tok.view(bsz * max_elements, elem_len))
	elem_text = self.elem_text_proj(elem_text.view(bsz, max_elements, -1))
	bbox = batch["bboxes"]
	wh = (bbox[..., 2:] - bbox[..., :2]).clamp_min(0)
	bbox_feat = torch.cat([bbox, wh], dim=-1)
	elem_tokens = (
	elem_text
	+ self.type_emb(batch["type_ids"])
	+ self.source_emb(batch["source_ids"])
	+ self.loc_emb(batch["loc_ids"])
	+ self.bbox_proj(bbox_feat)
	+ batch["confs"] * 0.1
	+ batch["action_flags"] * 0.1
	)
	if self.vision is not None and self.model_variant in {"full", "late_fusion"}:
	roi = self.roi_proj(self.roi_pool(full_visual, bbox))
	elem_tokens = elem_tokens + roi
	elem_key_padding = ~batch["element_mask"]
	if use_activation_checkpointing:
	elem_tokens = checkpoint(
	lambda tokens, padding: self.layout_encoder(tokens, src_key_padding_mask=padding),
	elem_tokens,
	elem_key_padding,
	use_reentrant=False,
	)
	else:
	elem_tokens = self.layout_encoder(elem_tokens, src_key_padding_mask=elem_key_padding)
	head_elem_tokens = elem_tokens
	decoder_elem_tokens = elem_tokens
	if bool(getattr(self.args, "function_signal_to_decoder", False)):
	function_signal = torch.sigmoid(self.ui_function_head(head_elem_tokens)).detach()
	function_signal = function_signal.masked_fill(elem_key_padding.unsqueeze(-1), 0.0)
	signal_scale = float(getattr(self.args, "function_signal_scale", 1.0) or 1.0)
	decoder_elem_tokens = decoder_elem_tokens + signal_scale * self.function_signal_proj(function_signal)
	if bool(getattr(self.args, "search_signal_to_decoder", False)):
	search_signal = torch.sigmoid(self.search_function_head(head_elem_tokens)).detach()
	search_signal = search_signal.masked_fill(elem_key_padding.unsqueeze(-1), 0.0)
	search_signal_scale = float(getattr(self.args, "search_signal_scale", 1.0) or 1.0)
	decoder_elem_tokens = decoder_elem_tokens + search_signal_scale * self.search_signal_proj(search_signal)

	context_mode = str(getattr(self.args, "context_mode", "mean") or "mean").lower()
	direct_context_tokens = None
	if context_mode in {"tokens_encoder", "tokens_direct_encoder"}:
	context_encoded = self.decoder.get_encoder()(input_ids=batch["context_input_ids"], attention_mask=batch["context_attention_mask"], return_dict=True)
	direct_context_tokens = context_encoded.last_hidden_state
	context_tokens = self.context_proj(context_encoded.last_hidden_state)
	context_padding = ~batch["context_attention_mask"].bool()
	elif context_mode in {"tokens", "tokens_direct"}:
	context_emb = self.decoder.get_input_embeddings()(batch["context_input_ids"])
	direct_context_tokens = context_emb
	context_tokens = self.context_proj(context_emb)
	context_padding = ~batch["context_attention_mask"].bool()
	else:
	context = self.mean_embed(batch["context_input_ids"], batch["context_attention_mask"])
	context_tokens = self.context_proj(context).unsqueeze(1)
	context_padding = torch.zeros(bsz, 1, dtype=torch.bool, device=context_tokens.device)

	context_len = context_tokens.size(1)
	visual_start = -1
	visual_len = 0
	elem_start = -1
	elem_len_for_fusion = 0
	if self.model_variant == "annotation_only":
	elem_start = context_len
	elem_len_for_fusion = decoder_elem_tokens.size(1)
	fusion_input = torch.cat([context_tokens, decoder_elem_tokens], dim=1)
	fusion_padding = torch.cat([context_padding, elem_key_padding], dim=1)
	elif self.model_variant == "image_only":
	if self.vision is None:
	raise ValueError("image_only requires vision; do not set disable_vision=true")
	visual_start = 0
	visual_len = visual_tokens.size(1)
	fusion_input = visual_tokens
	fusion_padding = torch.zeros(bsz, fusion_input.size(1), dtype=torch.bool, device=fusion_input.device)
	elif self.model_variant == "late_fusion":
	visual_summary = visual_tokens.mean(dim=1, keepdim=True)
	visual_start = context_len
	visual_len = 1
	elem_start = context_len + 1
	elem_len_for_fusion = decoder_elem_tokens.size(1)
	fusion_input = torch.cat([context_tokens, visual_summary, decoder_elem_tokens], dim=1)
	fusion_padding = torch.cat(
	[context_padding, torch.zeros(bsz, 1, dtype=torch.bool, device=elem_key_padding.device), elem_key_padding],
	dim=1,
	)
	else:
	visual_start = context_len
	visual_len = visual_tokens.size(1)
	elem_start = context_len + visual_len
	elem_len_for_fusion = decoder_elem_tokens.size(1)
	fusion_input = torch.cat([context_tokens, visual_tokens, decoder_elem_tokens], dim=1)
	fusion_padding = torch.cat(
	[
	context_padding,
	torch.zeros(bsz, visual_tokens.size(1), dtype=torch.bool, device=elem_key_padding.device),
	elem_key_padding,
	],
	dim=1,
	)
	if use_activation_checkpointing:
	fused = checkpoint(
	lambda tokens, padding: self.fusion(tokens, src_key_padding_mask=padding),
	fusion_input,
	fusion_padding,
	use_reentrant=False,
	)
	else:
	fused = self.fusion(fusion_input, src_key_padding_mask=fusion_padding)
	if elem_start >= 0 and elem_len_for_fusion > 0:
	head_elem_tokens = fused[:, elem_start : elem_start + elem_len_for_fusion, :]
	if use_activation_checkpointing:
	pooled = checkpoint(
	lambda tokens, padding: self.pooler(tokens, key_padding_mask=padding),
	fused,
	fusion_padding,
	use_reentrant=False,
	)
	else:
	pooled = self.pooler(fused, key_padding_mask=fusion_padding)
	pooled_padding = torch.zeros(bsz, pooled.size(1), dtype=torch.bool, device=pooled.device)
	memory_parts = []
	memory_padding_parts = []
	if context_mode in {"tokens_direct", "tokens_direct_encoder"} and self.model_variant != "image_only":
	if bool(getattr(self.args, "direct_context_passthrough", False)) and direct_context_tokens is not None:
	memory_parts.append(direct_context_tokens)
	else:
	memory_parts.append(fused[:, :context_len, :])
	memory_padding_parts.append(context_padding)
	if bool(getattr(self.args, "direct_visual_tokens", False)) and visual_start >= 0 and visual_len > 0:
	visual_scale = float(getattr(self.args, "visual_memory_scale", 1.0) or 1.0)
	memory_parts.append(fused[:, visual_start : visual_start + visual_len, :] * visual_scale)
	memory_padding_parts.append(torch.zeros(bsz, visual_len, dtype=torch.bool, device=pooled.device))
	if bool(getattr(self.args, "direct_element_tokens", False)) and elem_start >= 0 and elem_len_for_fusion > 0:
	element_scale = float(getattr(self.args, "element_memory_scale", 1.0) or 1.0)
	memory_parts.append(fused[:, elem_start : elem_start + elem_len_for_fusion, :] * element_scale)
	memory_padding_parts.append(elem_key_padding)
	if bool(getattr(self.args, "include_pooled_memory", True)):
	pooled_scale = float(getattr(self.args, "pooled_memory_scale", 1.0) or 1.0)
	memory_parts.append(pooled * pooled_scale)
	memory_padding_parts.append(pooled_padding)
	if not memory_parts:
	pooled_scale = float(getattr(self.args, "pooled_memory_scale", 1.0) or 1.0)
	memory_parts.append(pooled * pooled_scale)
	memory_padding_parts.append(pooled_padding)
	memory = torch.cat(memory_parts, dim=1)
	memory_scale = float(getattr(self.args, "decoder_memory_scale", 1.0) or 1.0)
	if memory_scale != 1.0:
	memory = memory * memory_scale
	memory_padding = torch.cat(memory_padding_parts, dim=1)
	memory_attention_mask = (~memory_padding).long()
	return memory, memory_attention_mask, head_elem_tokens, elem_key_padding

	def forward(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]:
	if self.use_native_context_forward():
	labels = batch["labels"]
	decoder_out = self.decoder(
	input_ids=batch["context_input_ids"],
	attention_mask=batch["context_attention_mask"],
	labels=labels,
	use_cache=False,
	)
	gen_loss = decoder_out.loss
	element_shape = batch["element_mask"].shape
	empty_logits = labels.new_full(element_shape, -20.0, dtype=torch.float32)
	section_logits = labels.new_zeros((labels.size(0), int(batch["section_labels"].shape[-1])), dtype=torch.float32)
	zero_loss = gen_loss.detach().new_zeros(())
	return {
	"loss": gen_loss,
	"generation_loss": gen_loss.detach(),
	"evidence_loss": zero_loss,
	"ui_function_loss": zero_loss,
	"search_function_loss": zero_loss,
	"section_loss": zero_loss,
	"numeric_loss": zero_loss,
	"evidence_logits": empty_logits,
	"ui_function_logits": empty_logits,
	"search_function_logits": empty_logits,
	"section_logits": section_logits,
	}
	memory, memory_attention_mask, elem_tokens, elem_key_padding = self.build_memory(batch)
	encoder_outputs = BaseModelOutput(last_hidden_state=memory)
	labels = batch["labels"]
	if hasattr(self.decoder, "prepare_decoder_input_ids_from_labels"):
	decoder_input_ids = self.decoder.prepare_decoder_input_ids_from_labels(labels=labels)
	elif hasattr(self.decoder, "_shift_right"):
	decoder_input_ids = self.decoder._shift_right(labels)
	else:
	pad_token_id = int(getattr(self.decoder.config, "pad_token_id", 0) or 0)
	start_token_id = int(getattr(self.decoder.config, "decoder_start_token_id", pad_token_id) or pad_token_id)
	decoder_input_ids = labels.new_full(labels.shape, pad_token_id)
	decoder_input_ids[:, 0] = start_token_id
	decoder_input_ids[:, 1:] = labels[:, :-1].masked_fill(labels[:, :-1] == -100, pad_token_id)
	decoder_out = self.decoder(
	encoder_outputs=encoder_outputs,
	attention_mask=memory_attention_mask,
	decoder_input_ids=decoder_input_ids,
	use_cache=False,
	)
	logits = decoder_out.logits
	flat_logits = logits.reshape(-1, logits.size(-1))
	flat_labels = labels.reshape(-1)
	valid_token_count = flat_labels.ne(-100).sum().clamp_min(1)
	chunk_size = max(1, int(getattr(self.args, "generation_loss_chunk_size", 32) or 32))
	gen_loss_sum = logits.new_zeros((), dtype=torch.float32)
	for start in range(0, flat_logits.size(0), chunk_size):
	end = min(start + chunk_size, flat_logits.size(0))
	gen_loss_sum = gen_loss_sum + F.cross_entropy(
	flat_logits[start:end].float(),
	flat_labels[start:end],
	ignore_index=-100,
	reduction="sum",
	)
	gen_loss = gen_loss_sum / valid_token_count
	evidence_logits = self.evidence_head(elem_tokens).squeeze(-1)
	ui_function_logits = self.ui_function_head(elem_tokens).squeeze(-1)
	search_function_logits = self.search_function_head(elem_tokens).squeeze(-1)
	numeric_logits = self.numeric_head(elem_tokens).squeeze(-1)
	valid = (~elem_key_padding).float()
	evidence_loss = F.binary_cross_entropy_with_logits(
	evidence_logits, batch["evidence_labels"], reduction="none"
	)
	evidence_loss = (evidence_loss * valid).sum() / valid.sum().clamp_min(1.0)
	ui_function_loss = F.binary_cross_entropy_with_logits(
	ui_function_logits, batch["ui_function_labels"], reduction="none"
	)
	ui_function_loss = (ui_function_loss * valid).sum() / valid.sum().clamp_min(1.0)
	search_pos_weight = float(getattr(self.args, "search_function_pos_weight", 1.0) or 1.0)
	search_loss_kwargs: Dict[str, Any] = {"reduction": "none"}
	if search_pos_weight != 1.0:
	search_loss_kwargs["pos_weight"] = torch.tensor(search_pos_weight, device=search_function_logits.device)
	search_function_loss = F.binary_cross_entropy_with_logits(
	search_function_logits, batch["search_function_labels"], **search_loss_kwargs
	)
	search_function_loss = (search_function_loss * valid).sum() / valid.sum().clamp_min(1.0)
	numeric_loss = F.binary_cross_entropy_with_logits(
	numeric_logits, batch["numeric_labels"], reduction="none"
	)
	numeric_loss = (numeric_loss * valid).sum() / valid.sum().clamp_min(1.0)
	memory_mask = memory_attention_mask.unsqueeze(-1).float()
	memory_summary = (memory * memory_mask).sum(dim=1) / memory_mask.sum(dim=1).clamp_min(1.0)
	section_logits = self.section_head(memory_summary)
	section_loss = F.binary_cross_entropy_with_logits(section_logits, batch["section_labels"])
	total = (
	gen_loss
	+ self.args.evidence_loss_weight * evidence_loss
	+ float(getattr(self.args, "ui_function_loss_weight", 0.0) or 0.0) * ui_function_loss
	+ float(getattr(self.args, "search_function_loss_weight", 0.0) or 0.0) * search_function_loss
	+ self.args.section_loss_weight * section_loss
	+ self.args.numeric_loss_weight * numeric_loss
	)
	return {
	"loss": total,
	"generation_loss": gen_loss.detach(),
	"evidence_loss": evidence_loss.detach(),
	"ui_function_loss": ui_function_loss.detach(),
	"search_function_loss": search_function_loss.detach(),
	"section_loss": section_loss.detach(),
	"numeric_loss": numeric_loss.detach(),
	"evidence_logits": evidence_logits.detach(),
	"ui_function_logits": ui_function_logits.detach(),
	"search_function_logits": search_function_logits.detach(),
	"section_logits": section_logits.detach(),
	}

	@torch.no_grad()
	def generate_text(self, batch: Dict[str, torch.Tensor], tokenizer, num_beams: int = 4, max_new_tokens: int = 384) -> List[str]:
	# 评估时也按训练 amp_dtype 走 autocast，避免 mt5-large 在 valid 阶段以 fp32
	# 跑生成时显存峰值远高于训练而 OOM，同时也明显加快评估速度。
	amp_dtype_name = str(getattr(self.args, "amp_dtype", "auto") or "auto").lower()
	if amp_dtype_name == "auto":
	amp_dtype_name = "fp16" if bool(getattr(self.args, "fp16", False)) else "fp32"
	device_is_cuda = batch["pixel_values"].is_cuda if torch.is_tensor(batch.get("pixel_values")) else False
	amp_enabled = device_is_cuda and amp_dtype_name in {"fp16", "bf16"}
	amp_dtype = torch.float16 if amp_dtype_name == "fp16" else torch.bfloat16
	with torch.amp.autocast("cuda", enabled=amp_enabled, dtype=amp_dtype):
	generation_kwargs = build_generation_kwargs(self.args, tokenizer)
	if self.use_native_context_forward():
	generated = self.decoder.generate(
	input_ids=batch["context_input_ids"],
	attention_mask=batch["context_attention_mask"],
	num_beams=num_beams,
	max_new_tokens=max_new_tokens,
	**generation_kwargs,
	)
	else:
	memory, memory_attention_mask, _, _ = self.build_memory(batch)
	encoder_outputs = BaseModelOutput(last_hidden_state=memory)
	generated = self.decoder.generate(
	encoder_outputs=encoder_outputs,
	attention_mask=memory_attention_mask,
	num_beams=num_beams,
	max_new_tokens=max_new_tokens,
	**generation_kwargs,
	)
	return tokenizer.batch_decode(generated, skip_special_tokens=True)


	def move_batch(batch: Dict[str, Any], device: torch.device) -> Dict[str, Any]:
	out = {}
	for key, value in batch.items():
	if key == "rows":
	out[key] = value
	elif torch.is_tensor(value):
	out[key] = value.to(device, non_blocking=True)
	else:
	out[key] = value
	return out


	def trainable_parameter_stats(model: nn.Module) -> Dict[str, Any]:
	stats = {
	"trainable_params_total": 0,
	"trainable_params_vision": 0,
	"trainable_params_decoder": 0,
	"trainable_params_other": 0,
	}
	for name, param in model.named_parameters():
	if not param.requires_grad:
	continue
	count = int(param.numel())
	stats["trainable_params_total"] += count
	if name.startswith("vision."):
	stats["trainable_params_vision"] += count
	elif name.startswith("decoder."):
	stats["trainable_params_decoder"] += count
	else:
	stats["trainable_params_other"] += count
	stats["vision_trainable"] = stats["trainable_params_vision"] > 0
	return stats


	def reduce_parallel_losses(output: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
	for key in [
	"loss",
	"generation_loss",
	"evidence_loss",
	"ui_function_loss",
	"search_function_loss",
	"section_loss",
	"numeric_loss",
	]:
	value = output.get(key)
	if torch.is_tensor(value) and value.ndim > 0:
	output[key] = value.mean()
	return output


	def build_optimizer(model: nn.Module, args: argparse.Namespace) -> torch.optim.Optimizer:
	decoder_params = []
	vision_params = []
	ui_function_params = []
	other_params = []
	lr_ui_function_head = float(getattr(args, "lr_ui_function_head", 0.0) or 0.0)
	for name, param in model.named_parameters():
	if not param.requires_grad:
	continue
	if lr_ui_function_head > 0 and (
	name.startswith("ui_function_head.")
	or name.startswith("search_function_head.")
	or name.startswith("function_signal_proj.")
	or name.startswith("search_signal_proj.")
	):
	ui_function_params.append(param)
	elif name.startswith("decoder."):
	decoder_params.append(param)
	elif name.startswith("vision."):
	vision_params.append(param)
	else:
	other_params.append(param)
	groups = [
	{"params": other_params, "lr": args.lr_new},
	{"params": vision_params, "lr": args.lr_fusion},
	{"params": decoder_params, "lr": args.lr_decoder},
	]
	if ui_function_params:
	groups.append({"params": ui_function_params, "lr": lr_ui_function_head})
	optimizer_name = str(getattr(args, "optimizer_name", "adamw") or "adamw").lower()
	if optimizer_name == "adamw":
	return torch.optim.AdamW(groups, weight_decay=args.weight_decay)
	if optimizer_name == "adafactor":
	return Adafactor(
	groups,
	scale_parameter=False,
	relative_step=False,
	warmup_init=False,
	weight_decay=args.weight_decay,
	)
	raise ValueError("optimizer_name must be one of: adamw, adafactor")


	def clip_gradients(model: nn.Module, optimizer: torch.optim.Optimizer, args: argparse.Namespace) -> torch.Tensor:
	max_grad_norm = float(getattr(args, "max_grad_norm", 1.0) or 0.0)
	first_param = next(model.parameters())
	if max_grad_norm <= 0:
	return first_param.detach().new_tensor(0.0)
	strategy = str(getattr(args, "grad_clip_strategy", "global") or "global").lower()
	if strategy == "per_group":
	norms: List[torch.Tensor] = []
	for group in optimizer.param_groups:
	params = [param for param in group["params"] if param.grad is not None]
	if not params:
	continue
	group_norm = torch.nn.utils.clip_grad_norm_(params, max_grad_norm, foreach=False)
	norms.append(group_norm.detach().to(device=first_param.device, dtype=torch.float32))
	if not norms:
	return first_param.detach().new_tensor(0.0)
	return torch.stack(norms).max()
	if strategy != "global":
	raise ValueError("grad_clip_strategy must be one of: global, per_group")
	return torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm, foreach=False)


	def get_lr_schedule(optimizer, total_steps: int, warmup_ratio: float, scheduler_type: str = "linear"):
	warmup = int(total_steps * warmup_ratio)
	scheduler_type = str(scheduler_type or "linear").lower()

	def lr_lambda(step: int) -> float:
	if step < warmup:
	return max(1e-8, step / max(1, warmup))
	progress = (step - warmup) / max(1, total_steps - warmup)
	if scheduler_type == "cosine":
	return max(0.0, 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress))))
	return max(0.0, 1.0 - progress)

	return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


	def save_checkpoint(
	output_dir: Path,
	name: str,
	model: nn.Module,
	tokenizer,
	image_processor,
	args: argparse.Namespace,
	metrics: Dict[str, Any],
	) -> None:
	ckpt_dir = output_dir / name
	ckpt_dir.mkdir(parents=True, exist_ok=True)
	raw_model = unwrap_parallel_model(model)
	torch.save(raw_model.state_dict(), ckpt_dir / "pytorch_model.bin")
	write_json(ckpt_dir / "rich_config.json", vars(args))
	write_json(ckpt_dir / "metrics.json", metrics)
	tokenizer_dir = ckpt_dir / "decoder_tokenizer"
	tokenizer.save_pretrained(tokenizer_dir)
	vocab_file = getattr(tokenizer, "vocab_file", None)
	if vocab_file and Path(vocab_file).exists() and not (tokenizer_dir / "spiece.model").exists():
	shutil.copyfile(vocab_file, tokenizer_dir / "spiece.model")
	decoder_model_dir = Path(str(getattr(args, "decoder_model", "") or ""))
	if decoder_model_dir.is_dir():
	for tokenizer_asset in ("spiece.model", "config.json"):
	source_asset = decoder_model_dir / tokenizer_asset
	target_asset = tokenizer_dir / tokenizer_asset
	if source_asset.exists() and not target_asset.exists():
	shutil.copyfile(source_asset, target_asset)
	image_processor.save_pretrained(ckpt_dir / "image_processor")


	def resize_checkpoint_tensor(value: torch.Tensor, target: torch.Tensor) -> Optional[torch.Tensor]:
	if value.ndim != target.ndim or value.ndim == 0:
	return None
	resized = value.detach().cpu()
	for dim, target_size in enumerate(target.shape):
	current_size = resized.shape[dim]
	if current_size == target_size:
	continue
	if current_size <= 0 or target_size <= 0:
	return None
	if current_size > target_size:
	resized = resized.narrow(dim, 0, target_size)
	continue
	repeats = [1] * resized.ndim
	repeats[dim] = math.ceil(target_size / current_size)
	resized = resized.repeat(*repeats).narrow(dim, 0, target_size)
	return resized.to(dtype=target.dtype)


	def load_compatible_model_state(
	model: nn.Module,
	state: Dict[str, torch.Tensor],
	allow_missing_prefixes: Tuple[str, ...] = (),
	resize_mismatched_non_decoder: bool = False,
	) -> Tuple[List[str], List[str], List[str], List[str]]:
	model_state = model.state_dict()
	compatible_state: Dict[str, torch.Tensor] = {}
	resized_incompatible: List[str] = []
	skipped_incompatible: List[str] = []
	skipped_unexpected: List[str] = []
	for key, value in state.items():
	if key not in model_state:
	skipped_unexpected.append(key)
	continue
	if tuple(model_state[key].shape) != tuple(value.shape):
	if resize_mismatched_non_decoder and not key.startswith("decoder."):
	resized_value = resize_checkpoint_tensor(value, model_state[key])
	if resized_value is not None and tuple(resized_value.shape) == tuple(model_state[key].shape):
	compatible_state[key] = resized_value
	resized_incompatible.append(key)
	continue
	skipped_incompatible.append(key)
	continue
	compatible_state[key] = value

	load_result = model.load_state_dict(compatible_state, strict=False)
	allowed_missing = [
	key
	for key in load_result.missing_keys
	if key.startswith("ui_function_head.")
	or key.startswith("search_function_head.")
	or key.startswith("function_signal_proj.")
	or key.startswith("search_signal_proj.")
	or key in skipped_incompatible
	or key.startswith(allow_missing_prefixes)
	]
	bad_missing = [key for key in load_result.missing_keys if key not in allowed_missing]
	if bad_missing or load_result.unexpected_keys or skipped_unexpected:
	raise RuntimeError(
	f"Checkpoint mismatch. missing={bad_missing}, unexpected={list(load_result.unexpected_keys) + skipped_unexpected}"
	)
	return list(load_result.missing_keys), list(load_result.unexpected_keys), skipped_incompatible, resized_incompatible


	def load_rich_checkpoint(checkpoint: str, device: torch.device) -> Tuple[RichGroundedModel, Any, Any, argparse.Namespace]:
	ckpt_dir = Path(checkpoint)
	config = json.loads((ckpt_dir / "rich_config.json").read_text(encoding="utf-8"))
	merged_config = dict(DEFAULT_CONFIG)
	merged_config.update(config)
	args = argparse.Namespace(**merged_config)
	tokenizer = load_seq2seq_tokenizer(str(ckpt_dir / "decoder_tokenizer"), str(args.decoder_model))
	image_processor = AutoImageProcessor.from_pretrained(ckpt_dir / "image_processor")
	model = RichGroundedModel(args)
	state = torch.load(ckpt_dir / "pytorch_model.bin", map_location="cpu")
	load_compatible_model_state(model, state)
	model.to(device)
	model.eval()
	return model, tokenizer, image_processor, args


	def get_eval_max_new_tokens(args: argparse.Namespace) -> int:
	return int(getattr(args, "eval_max_new_tokens", getattr(args, "max_target_tokens", 384)))


	def build_generation_kwargs(args: argparse.Namespace, tokenizer) -> Dict[str, Any]:
	kwargs: Dict[str, Any] = {}
	no_repeat_ngram_size = int(getattr(args, "generation_no_repeat_ngram_size", 0) or 0)
	if no_repeat_ngram_size > 0:
	kwargs["no_repeat_ngram_size"] = no_repeat_ngram_size
	repetition_penalty = float(getattr(args, "generation_repetition_penalty", 1.0) or 1.0)
	if repetition_penalty != 1.0:
	kwargs["repetition_penalty"] = repetition_penalty
	min_new_tokens = int(getattr(args, "generation_min_new_tokens", 0) or 0)
	if min_new_tokens > 0:
	kwargs["min_new_tokens"] = min_new_tokens
	bad_words_ids = []
	if bool(getattr(args, "generation_block_extra_ids", False)):
	vocab_len = len(tokenizer)
	for token_id in range(max(0, vocab_len - 256), vocab_len):
	token = tokenizer.convert_ids_to_tokens(token_id)
	if isinstance(token, str) and "<extra_id_" in token:
	bad_words_ids.append([token_id])
	if bool(getattr(args, "generation_block_title_prefix", False)):
	for phrase in ("Title", "title", "Title:", "title:", "Title :", "title :"):
	token_ids = tokenizer.encode(phrase, add_special_tokens=False)
	if token_ids:
	bad_words_ids.append(token_ids)
	if bad_words_ids:
	deduped_bad_words = []
	seen_bad_words = set()
	for token_ids in bad_words_ids:
	key = tuple(int(token_id) for token_id in token_ids)
	if key in seen_bad_words:
	continue
	seen_bad_words.add(key)
	deduped_bad_words.append(list(key))
	kwargs["bad_words_ids"] = deduped_bad_words
	if bool(getattr(args, "generation_force_json_start", False)):
	json_start_ids = tokenizer.encode("{", add_special_tokens=False)
	if len(json_start_ids) == 1:
	kwargs["forced_bos_token_id"] = json_start_ids[0]
	return kwargs


	@torch.no_grad()
	def evaluate(
	model: nn.Module,
	loader: DataLoader,
	tokenizer,
	device: torch.device,
	args: argparse.Namespace,
	rank: int = 0,
	) -> Dict[str, Any]:
	raw_model = unwrap_parallel_model(model)
	raw_model.eval()
	losses: List[float] = []
	rouges: List[float] = []
	json_valid = 0
	generation_json_valid = 0
	generation_json_strict_valid = 0
	generation_json_repair_applied = 0
	total = 0
	evidence_precisions: List[float] = []
	ui_function_tp = 0
	ui_function_fp = 0
	ui_function_fn = 0
	ui_function_pred_positive = 0
	ui_function_ref_positive = 0
	ui_function_valid_elements = 0
	generation_char_lengths: List[int] = []
	pred_summary_char_lengths: List[int] = []
	ref_summary_char_lengths: List[int] = []
	empty_summary_count = 0
	extra_id_count = 0
	title_prefix_count = 0
	natural_title_prefix_stripped_count = 0
	whitespace_only_count = 0
	search_function_tp = 0
	search_function_fp = 0
	search_function_fn = 0
	search_function_pred_positive = 0
	search_function_ref_positive = 0
	search_tp = 0
	search_fp = 0
	search_fn = 0
	pred_search_count = 0
	ref_search_count = 0
	pred_function_count = 0
	ref_function_count = 0
	pred_bare_search_count = 0
	predictions: List[Dict[str, Any]] = []
	eval_max_new_tokens = get_eval_max_new_tokens(args)
	context_summary_repair = bool(getattr(args, "context_summary_repair", False))
	structured_mode = str(getattr(args, "structured_function_mode", "decoder") or "decoder").lower()
	target_schema = str(getattr(args, "target_schema", "zh") or "zh")
	summary_output_mode = target_schema_is_summary(target_schema)
	natural_output_mode = target_schema_is_natural_text(target_schema)
	function_metric_threshold = 0.5
	search_metric_threshold = 0.5
	if structured_mode == "heads":
	function_metric_threshold = float(getattr(args, "structured_function_threshold", 0.5) or 0.5)
	search_metric_threshold = float(getattr(args, "structured_search_threshold", function_metric_threshold) or function_metric_threshold)
	repair_count = 0
	# 诊断指标：记录预测摘要是否与“这是一个{app}界面...当前任务语境是：{instruction}。”模板完全一致。
	# 在 context_mode=tokens_direct 下 decoder 能直接看到 app+instruction token，只要模型能复制这些 token 到
	# 模板位置，就能不看屏幕拿到高 ROUGE。template_match_rate 偏高表示训练信号实际是“拼模板”而非屏幕理解。
	template_exact_match = 0
	template_app_in_pred = 0
	template_instruction_in_pred = 0
	for batch in tqdm(loader, desc="valid", disable=not is_main(rank)):
	rows = batch["rows"]
	batch = move_batch(batch, device)
	out = raw_model(**{k: v for k, v in batch.items() if k != "rows"})
	losses.append(float(out["loss"].detach().cpu()))
	valid_elements = batch["element_mask"].bool()
	function_labels = batch["ui_function_labels"] > 0.5
	function_preds = torch.sigmoid(out["ui_function_logits"]) >= function_metric_threshold
	search_function_labels = batch["search_function_labels"] > 0.5
	search_function_preds = torch.sigmoid(out["search_function_logits"]) >= search_metric_threshold
	evidence_scores_batch = torch.sigmoid(out["evidence_logits"]).detach().cpu()
	function_scores_batch = torch.sigmoid(out["ui_function_logits"]).detach().cpu()
	search_scores_batch = torch.sigmoid(out["search_function_logits"]).detach().cpu()
	ui_function_tp += int((function_preds & function_labels & valid_elements).sum().detach().cpu())
	ui_function_fp += int((function_preds & ~function_labels & valid_elements).sum().detach().cpu())
	ui_function_fn += int((~function_preds & function_labels & valid_elements).sum().detach().cpu())
	ui_function_pred_positive += int((function_preds & valid_elements).sum().detach().cpu())
	ui_function_ref_positive += int((function_labels & valid_elements).sum().detach().cpu())
	search_function_tp += int((search_function_preds & search_function_labels & valid_elements).sum().detach().cpu())
	search_function_fp += int((search_function_preds & ~search_function_labels & valid_elements).sum().detach().cpu())
	search_function_fn += int((~search_function_preds & search_function_labels & valid_elements).sum().detach().cpu())
	search_function_pred_positive += int((search_function_preds & valid_elements).sum().detach().cpu())
	search_function_ref_positive += int((search_function_labels & valid_elements).sum().detach().cpu())
	ui_function_valid_elements += int(valid_elements.sum().detach().cpu())
	texts = raw_model.generate_text(batch, tokenizer, num_beams=args.num_beams, max_new_tokens=eval_max_new_tokens)
	for row_idx, (row, text) in enumerate(zip(rows, texts)):
	generation_char_lengths.append(len(text))
	if text and not text.strip():
	whitespace_only_count += 1
	if "<extra_id_" in text:
	extra_id_count += 1
	if has_natural_title_prefix(text):
	title_prefix_count += 1
	if summary_output_mode:
	pred_obj = prediction_from_summary(row, text)
	ok = True
	elif natural_output_mode:
	parse_text, stripped_title_prefix = strip_natural_title_prefix(text)
	natural_title_prefix_stripped_count += int(stripped_title_prefix)
	pred_obj = natural_prediction_from_text(parse_text)
	ok = bool(extract_summary(pred_obj))
	generation_json_valid += int(ok)
	else:
	pred_obj, ok, repair_applied, strict_ok = safe_json_loads_with_repair(text)
	generation_json_valid += int(ok)
	generation_json_strict_valid += int(strict_ok)
	generation_json_repair_applied += int(repair_applied)
	if context_summary_repair and not summary_output_mode:
	pred_obj, repaired = repair_prediction_with_context(row, pred_obj)
	ok = True
	repair_count += int(repaired)
	pred_obj = apply_structured_function_predictions(
	row,
	pred_obj,
	function_scores_batch[row_idx],
	search_scores_batch[row_idx],
	args,
	)
	pred_obj = apply_structured_evidence_predictions(row, pred_obj, evidence_scores_batch[row_idx], args)
	ref_target = row.get("target") or {}
	if bool(getattr(args, "canonicalize_targets", False)):
	ref_target = canonicalize_target_with_context(
	row,
	ref_target,
	drop_bare_search_functions=bool(getattr(args, "drop_bare_search_functions", False)),
	)
	ref_obj = json.loads(target_to_text(ref_target, "zh"))
	pred_has_search = has_search_function(pred_obj)
	ref_has_search = has_search_function(ref_obj)
	search_tp += int(pred_has_search and ref_has_search)
	search_fp += int(pred_has_search and not ref_has_search)
	search_fn += int((not pred_has_search) and ref_has_search)
	pred_search_count += count_search_functions(pred_obj)
	ref_search_count += count_search_functions(ref_obj)
	pred_function_count += len(extract_function_entries(pred_obj))
	ref_function_count += len(extract_function_entries(ref_obj))
	pred_bare_search_count += count_bare_search_functions(pred_obj)
	pred_summary = extract_summary(pred_obj)
	ref_summary = extract_summary(ref_obj)
	pred_summary_char_lengths.append(len(pred_summary))
	ref_summary_char_lengths.append(len(ref_summary))
	if not pred_summary:
	empty_summary_count += 1
	rouges.append(rouge_l_char(pred_summary, ref_summary))
	# 诊断：预测是否与 build_context_summary(row) 完全一致（拼模板）。
	template_summary = build_context_summary(row)
	if pred_summary and pred_summary == template_summary:
	template_exact_match += 1
	row_app = safe_text(row.get("app"))
	row_instruction = safe_text(row.get("instruction"))
	if row_app and pred_summary and row_app in pred_summary:
	template_app_in_pred += 1
	if row_instruction and pred_summary and row_instruction in pred_summary:
	template_instruction_in_pred += 1
	json_valid += int(ok)
	total += 1
	pred_evidence = set(extract_evidence_ids(pred_obj))
	ref_evidence = set(ref_target.get("key_ui_clues", []) or row.get("weak_evidence_ids", []))
	if pred_evidence:
	evidence_precisions.append(len(pred_evidence & ref_evidence) / len(pred_evidence))
	else:
	evidence_precisions.append(0.0)
	if len(predictions) < 50:
	predictions.append(
	{
	"screen_id": row.get("screen_id"),
	"prediction_raw": text,
	"prediction": pred_obj,
	"reference": ref_obj,
	}
	)
	metrics = {
	"loss": float(np.mean(losses)) if losses else 0.0,
	"rouge_l_char": float(np.mean(rouges)) if rouges else 0.0,
	"json_valid_rate": json_valid / max(1, total),
	"generation_json_valid_rate": generation_json_valid / max(1, total),
	"generation_json_strict_valid_rate": generation_json_strict_valid / max(1, total),
	"generation_json_repair_rate": generation_json_repair_applied / max(1, total),
	"generation_char_len_mean": float(np.mean(generation_char_lengths)) if generation_char_lengths else 0.0,
	"generation_char_len_max": int(max(generation_char_lengths)) if generation_char_lengths else 0,
	"pred_summary_char_len_mean": float(np.mean(pred_summary_char_lengths)) if pred_summary_char_lengths else 0.0,
	"ref_summary_char_len_mean": float(np.mean(ref_summary_char_lengths)) if ref_summary_char_lengths else 0.0,
	"empty_summary_rate": empty_summary_count / max(1, total),
	"whitespace_only_rate": whitespace_only_count / max(1, total),
	"extra_id_rate": extra_id_count / max(1, total),
	"title_prefix_rate": title_prefix_count / max(1, total),
	"natural_title_prefix_stripped_rate": natural_title_prefix_stripped_count / max(1, total),
	"evidence_precision": float(np.mean(evidence_precisions)) if evidence_precisions else 0.0,
	"ui_function_precision": ui_function_tp / max(1, ui_function_tp + ui_function_fp),
	"ui_function_recall": ui_function_tp / max(1, ui_function_tp + ui_function_fn),
	"ui_function_f1": (2 * ui_function_tp) / max(1, 2 * ui_function_tp + ui_function_fp + ui_function_fn),
	"ui_function_pred_positive_rate": ui_function_pred_positive / max(1, ui_function_valid_elements),
	"ui_function_ref_positive_rate": ui_function_ref_positive / max(1, ui_function_valid_elements),
	"search_function_precision": search_function_tp / max(1, search_function_tp + search_function_fp),
	"search_function_recall": search_function_tp / max(1, search_function_tp + search_function_fn),
	"search_function_f1": (2 * search_function_tp) / max(1, 2 * search_function_tp + search_function_fp + search_function_fn),
	"search_function_pred_positive_rate": search_function_pred_positive / max(1, ui_function_valid_elements),
	"search_function_ref_positive_rate": search_function_ref_positive / max(1, ui_function_valid_elements),
	"search_precision": search_tp / max(1, search_tp + search_fp),
	"search_recall": search_tp / max(1, search_tp + search_fn),
	"search_f1": (2 * search_tp) / max(1, 2 * search_tp + search_fp + search_fn),
	"search_tp": search_tp,
	"search_fp": search_fp,
	"search_fn": search_fn,
	"pred_search_count": pred_search_count,
	"ref_search_count": ref_search_count,
	"pred_function_count": pred_function_count,
	"ref_function_count": ref_function_count,
	"function_overgen_rate": max(0, pred_function_count - ref_function_count) / max(1, ref_function_count),
	"function_count_ratio": pred_function_count / max(1, ref_function_count),
	"search_overgen_rate": max(0, pred_search_count - ref_search_count) / max(1, total),
	"pred_bare_search_count": pred_bare_search_count,
	"bare_search_rate": pred_bare_search_count / max(1, pred_function_count),
	"max_target_tokens": int(args.max_target_tokens),
	"eval_max_new_tokens": eval_max_new_tokens,
	"num_beams": int(args.num_beams),
	"scheduler_epochs": int(getattr(args, "scheduler_epochs", 0) or 0),
	"lr_scheduler_type": str(getattr(args, "lr_scheduler_type", "linear") or "linear"),
	"context_mode": str(getattr(args, "context_mode", "mean") or "mean"),
	"context_text_format": str(getattr(args, "context_text_format", "rich") or "rich"),
	"context_include_screen_text": bool(getattr(args, "context_include_screen_text", False)),
	"context_screen_text_items": int(getattr(args, "context_screen_text_items", 32) or 32),
	"context_screen_text_dropout_rate": float(getattr(args, "context_screen_text_dropout_rate", 0.0) or 0.0),
	"generation_block_extra_ids": bool(getattr(args, "generation_block_extra_ids", False)),
	"generation_block_title_prefix": bool(getattr(args, "generation_block_title_prefix", False)),
	"generation_no_repeat_ngram_size": int(getattr(args, "generation_no_repeat_ngram_size", 0) or 0),
	"generation_repetition_penalty": float(getattr(args, "generation_repetition_penalty", 1.0) or 1.0),
	"generation_min_new_tokens": int(getattr(args, "generation_min_new_tokens", 0) or 0),
	"generation_force_json_start": bool(getattr(args, "generation_force_json_start", False)),
	"context_summary_repair": context_summary_repair,
	"context_repair_applied_rate": repair_count / max(1, total),
	"canonicalize_targets": bool(getattr(args, "canonicalize_targets", False)),
	"target_schema": str(getattr(args, "target_schema", "zh") or "zh"),
	"decoder_output_mode": "summary" if summary_output_mode else ("natural_text" if natural_output_mode else "json"),
	"task_intent_context": bool(getattr(args, "task_intent_context", False)),
	"drop_bare_search_functions": bool(getattr(args, "drop_bare_search_functions", False)),
	"structured_function_mode": str(getattr(args, "structured_function_mode", "decoder") or "decoder"),
	"structured_function_threshold": float(getattr(args, "structured_function_threshold", 0.5) or 0.5),
	"structured_search_threshold": float(getattr(args, "structured_search_threshold", 0.5) or 0.5),
	"structured_max_functions": int(getattr(args, "structured_max_functions", 12) or 12),
	"structured_strict_search_candidates": bool(getattr(args, "structured_strict_search_candidates", False)),
	"structured_evidence_mode": str(getattr(args, "structured_evidence_mode", "decoder") or "decoder"),
	"structured_evidence_threshold": float(getattr(args, "structured_evidence_threshold", 0.5) or 0.5),
	"structured_max_evidence": int(getattr(args, "structured_max_evidence", 8) or 8),
	"structured_evidence_fallback_top1": bool(getattr(args, "structured_evidence_fallback_top1", True)),
	"ui_function_loss_weight": float(getattr(args, "ui_function_loss_weight", 0.0) or 0.0),
	"search_function_loss_weight": float(getattr(args, "search_function_loss_weight", 0.0) or 0.0),
	"search_function_pos_weight": float(getattr(args, "search_function_pos_weight", 1.0) or 1.0),
	"lr_ui_function_head": float(getattr(args, "lr_ui_function_head", 0.0) or 0.0),
	"max_visual_tokens": int(getattr(args, "max_visual_tokens", 0) or 0),
	"model_variant": str(getattr(args, "model_variant", "")),
	"vision_enabled": not bool(getattr(args, "disable_vision", False)) and str(getattr(args, "model_variant", "")) != "annotation_only",
	"image_size": int(getattr(args, "image_size", 0) or 0),
	"direct_visual_tokens": bool(getattr(args, "direct_visual_tokens", False)),
	"direct_element_tokens": bool(getattr(args, "direct_element_tokens", False)),
	"direct_context_passthrough": bool(getattr(args, "direct_context_passthrough", False)),
	"include_pooled_memory": bool(getattr(args, "include_pooled_memory", True)),
	"native_context_forward": bool(getattr(args, "native_context_forward", False)),
	"disable_vision": bool(getattr(args, "disable_vision", False)),
	"freeze_decoder": bool(getattr(args, "freeze_decoder", False)),
	"init_resize_mismatched_non_decoder": bool(getattr(args, "init_resize_mismatched_non_decoder", False)),
	"grad_clip_strategy": str(getattr(args, "grad_clip_strategy", "global") or "global"),
	"max_grad_norm": float(getattr(args, "max_grad_norm", 1.0) or 0.0),
	"function_signal_to_decoder": bool(getattr(args, "function_signal_to_decoder", False)),
	"function_signal_scale": float(getattr(args, "function_signal_scale", 1.0) or 1.0),
	"search_signal_to_decoder": bool(getattr(args, "search_signal_to_decoder", False)),
	"search_signal_scale": float(getattr(args, "search_signal_scale", 1.0) or 1.0),
	"visual_memory_scale": float(getattr(args, "visual_memory_scale", 1.0) or 1.0),
	"element_memory_scale": float(getattr(args, "element_memory_scale", 1.0) or 1.0),
	"pooled_memory_scale": float(getattr(args, "pooled_memory_scale", 1.0) or 1.0),
	"decoder_gradient_checkpointing": bool(getattr(args, "decoder_gradient_checkpointing", False)),
	"vision_gradient_checkpointing": bool(getattr(args, "vision_gradient_checkpointing", False)),
	"cuda_memory_fraction": float(getattr(args, "cuda_memory_fraction", 0.0) or 0.0),
	"max_train_samples": int(getattr(args, "max_train_samples", 0) or 0),
	}
	# summary 模式下 decoder 只输出摘要：
	# - json_valid_rate 在 prediction_from_summary 里被无条件置 True，恒为 1.0，
	# 给 rich_quality_score 贡献白送的 0.25，掩盖真实 rouge；
	# - evidence_precision 来自辅助 head（apply_structured_evidence_predictions），
	# 与正在 selection 的 decoder 输出无关，混入会让 checkpoint 选择被 head 拉偏。
	# 因此 summary 模式下 rich_quality_score 折叠为纯 rouge_l_char，并显式暴露 summary_quality_score；
	# JSON 模式保留原加权，那时三项才都来自 decoder 输出本身。
	metrics["summary_quality_score"] = float(metrics["rouge_l_char"])
	# 诊断指标输出：
	# - summary_template_match_rate 越高表示预测越是在拼“这是一个X界面...当前任务语境是...”模板，
	# 该值接近 1 + 高 ROUGE 应读作“模型未学到屏幕理解，只是复制 context token”。
	# - summary_app_recall_rate / summary_instruction_recall_rate 反映 app/instruction 是否被原样复制进去，
	# 两者同时接近 1 指向同一拼接行为。
	metrics["summary_template_match_rate"] = template_exact_match / max(1, total)
	metrics["summary_app_recall_rate"] = template_app_in_pred / max(1, total)
	metrics["summary_instruction_recall_rate"] = template_instruction_in_pred / max(1, total)
	if summary_output_mode:
	metrics["rich_quality_score"] = float(metrics["rouge_l_char"])
	else:
	metrics["rich_quality_score"] = (
	0.45 * metrics["rouge_l_char"]
	+ 0.25 * metrics["json_valid_rate"]
	+ 0.30 * metrics["evidence_precision"]
	)
	metrics["rich_function_score"] = (
	0.70 * metrics["rich_quality_score"]
	+ 0.20 * metrics["search_f1"]
	+ 0.10 * metrics["ui_function_f1"]
	)
	overgen_penalty = min(0.12, 0.04 * float(metrics["function_overgen_rate"]))
	search_penalty = min(0.06, 0.02 * float(metrics["search_overgen_rate"]))
	bare_search_penalty = min(0.03, 0.03 * float(metrics["bare_search_rate"]))
	metrics["grounded_quality_score"] = max(
	0.0,
	float(metrics["summary_quality_score"]) + 0.05 * float(metrics["evidence_precision"]) - overgen_penalty - search_penalty - bare_search_penalty,
	)
	metrics["grounded_overgen_penalty"] = overgen_penalty + search_penalty + bare_search_penalty
	if is_main(rank):
	write_json(Path(args.output_dir) / "val_preview.json", {"metrics": metrics, "predictions": predictions})
	raw_model.train()
	return metrics


	def selection_metric_name(args: argparse.Namespace) -> str:
	return str(getattr(args, "model_selection_metric", "rich_quality_score") or "rich_quality_score")


	def selection_metric_value(metrics: Dict[str, Any], args: argparse.Namespace) -> float:
	metric_name = selection_metric_name(args)
	value = metrics.get(metric_name)
	if value is None:
	value = metrics.get("rich_quality_score", 0.0)
	return float(value)


	def metric_is_better(current: float, best: float, args: argparse.Namespace) -> bool:
	mode = str(getattr(args, "model_selection_mode", "max") or "max").lower()
	min_delta = float(getattr(args, "early_stopping_min_delta", 0.0) or 0.0)
	if mode == "min":
	return current < best - min_delta
	return current > best + min_delta


	def train(args: argparse.Namespace) -> None:
	set_seed(args.seed)
	distributed, rank, world_size, local_rank = init_distributed()
	device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu")
	if device.type == "cuda":
	cuda_memory_fraction = float(getattr(args, "cuda_memory_fraction", 0.0) or 0.0)
	if cuda_memory_fraction < 0.0 or cuda_memory_fraction > 1.0:
	raise ValueError("cuda_memory_fraction must be in [0, 1]. Use 0 to disable the limit.")
	if cuda_memory_fraction > 0.0:
	torch.cuda.set_per_process_memory_fraction(cuda_memory_fraction, device=device)
	if is_main(rank):
	total_gb = torch.cuda.get_device_properties(device).total_memory / 1024**3
	print(f"CUDA memory fraction limit: {cuda_memory_fraction:.3f} (~{total_gb * cuda_memory_fraction:.2f}GB of {total_gb:.2f}GB)")
	torch.backends.cuda.matmul.allow_tf32 = True
	torch.backends.cudnn.allow_tf32 = True
	if hasattr(torch, "set_float32_matmul_precision"):
	torch.set_float32_matmul_precision("high")
	output_dir = Path(args.output_dir)
	if is_main(rank):
	output_dir.mkdir(parents=True, exist_ok=True)
	write_json(output_dir / "config.json", vars(args))

	max_train_samples = int(getattr(args, "max_train_samples", 0) or 0)
	train_dataset = RichScreenshotDataset(args.train_file, max_samples=max_train_samples, sample_seed=args.seed if max_train_samples else None)
	valid_dataset = RichScreenshotDataset(args.valid_file, max_samples=args.max_valid_samples)
	train_data_diagnostics = dataset_diagnostics(train_dataset.rows)
	valid_data_diagnostics = dataset_diagnostics(valid_dataset.rows)
	validate_dataset_for_training("train", train_data_diagnostics, args)
	validate_dataset_for_training("valid", valid_data_diagnostics, args)
	tokenizer = load_seq2seq_tokenizer(args.decoder_model)
	train_token_diagnostics = tokenizer_diagnostics(train_dataset.rows, tokenizer, args)
	valid_token_diagnostics = tokenizer_diagnostics(valid_dataset.rows, tokenizer, args)
	validate_token_lengths("train", train_token_diagnostics, args)
	validate_token_lengths("valid", valid_token_diagnostics, args)
	if is_main(rank):
	diagnostics_payload = {
	"train_file": args.train_file,
	"valid_file": args.valid_file,
	"strict_data_checks": bool(getattr(args, "strict_data_checks", True)),
	"train": train_data_diagnostics,
	"valid": valid_data_diagnostics,
	"tokenizer": args.decoder_model,
	"max_target_tokens": int(getattr(args, "max_target_tokens", 0) or 0),
	"eval_max_new_tokens": int(getattr(args, "eval_max_new_tokens", 0) or 0),
	"max_target_truncation_rate": float(getattr(args, "max_target_truncation_rate", 0.01) or 0.0),
	"train_token_lengths": train_token_diagnostics,
	"valid_token_lengths": valid_token_diagnostics,
	}
	write_json(output_dir / "data_diagnostics.json", diagnostics_payload)
	print("Data diagnostics:")
	print(json.dumps(diagnostics_payload, ensure_ascii=False, indent=2))
	image_processor = AutoImageProcessor.from_pretrained(args.vision_model)
	train_collator = RichCollator(tokenizer, image_processor, args, is_training=True)
	valid_collator = RichCollator(tokenizer, image_processor, args, is_training=False)
	train_sampler = DistributedSampler(train_dataset, shuffle=True, seed=args.seed) if distributed else None
	valid_sampler = DistributedSampler(valid_dataset, shuffle=False) if distributed else None
	train_loader = DataLoader(
	train_dataset,
	batch_size=args.batch_size,
	shuffle=train_sampler is None,
	sampler=train_sampler,
	collate_fn=train_collator,
	num_workers=args.num_workers,
	pin_memory=torch.cuda.is_available(),
	)
	valid_loader = DataLoader(
	valid_dataset,
	batch_size=args.eval_batch_size if getattr(args, "eval_batch_size", 0) else max(1, args.batch_size // 2),
	shuffle=False,
	sampler=valid_sampler,
	collate_fn=valid_collator,
	num_workers=args.num_workers,
	pin_memory=torch.cuda.is_available(),
	)
	model = RichGroundedModel(args)
	init_checkpoint = str(getattr(args, "init_checkpoint", "") or "")
	if init_checkpoint:
	checkpoint_decoder_model = ""
	init_config_path = Path(init_checkpoint) / "rich_config.json"
	init_config: Dict[str, Any] = {}
	if init_config_path.exists():
	init_config = json.loads(init_config_path.read_text(encoding="utf-8"))
	checkpoint_decoder_model = str(init_config.get("decoder_model", "") or "")
	allow_missing_prefixes_list: List[str] = []
	if checkpoint_decoder_model and checkpoint_decoder_model != str(getattr(args, "decoder_model", "") or ""):
	allow_missing_prefixes_list.append("decoder.")
	if bool(init_config.get("disable_vision", False)) and not bool(getattr(args, "disable_vision", False)):
	allow_missing_prefixes_list.append("vision.")
	state = torch.load(Path(init_checkpoint) / "pytorch_model.bin", map_location="cpu")
	missing_keys, _, skipped_incompatible, resized_incompatible = load_compatible_model_state(
	model,
	state,
	allow_missing_prefixes=tuple(allow_missing_prefixes_list),
	resize_mismatched_non_decoder=bool(getattr(args, "init_resize_mismatched_non_decoder", False)),
	)
	if is_main(rank):
	missing_preview = missing_keys[:12]
	skipped_preview = skipped_incompatible[:12]
	resized_preview = resized_incompatible[:12]
	print(
	f"Loaded init checkpoint: {init_checkpoint}; "
	f"missing_count={len(missing_keys)} preview={missing_preview}; "
	f"skipped_incompatible_count={len(skipped_incompatible)} preview={skipped_preview}; "
	f"resized_incompatible_count={len(resized_incompatible)} preview={resized_preview}"
	)
	model = model.to(device)
	trainable_stats = trainable_parameter_stats(model)
	if is_main(rank):
	print(
	"Trainable parameters: "
	f"total={trainable_stats['trainable_params_total']:,}, "
	f"vision={trainable_stats['trainable_params_vision']:,}, "
	f"decoder={trainable_stats['trainable_params_decoder']:,}, "
	f"other={trainable_stats['trainable_params_other']:,}",
	flush=True,
	)
	if distributed:
	model = DistributedDataParallel(model, device_ids=[local_rank], find_unused_parameters=True)
	elif bool(getattr(args, "data_parallel", False)) and device.type == "cuda" and torch.cuda.device_count() > 1:
	model = nn.DataParallel(model)
	if is_main(rank):
	print(f"Using nn.DataParallel on {torch.cuda.device_count()} CUDA devices.", flush=True)
	optimizer = build_optimizer(unwrap_parallel_model(model), args)
	scheduler_epochs = int(getattr(args, "scheduler_epochs", 0) or args.epochs)
	scheduler_epochs = max(int(args.epochs), scheduler_epochs)
	total_update_steps = math.ceil(len(train_loader) / args.grad_accum) * scheduler_epochs
	scheduler = get_lr_schedule(optimizer, total_update_steps, args.warmup_ratio, getattr(args, "lr_scheduler_type", "linear"))
	amp_dtype_name = str(getattr(args, "amp_dtype", "auto") or "auto").lower()
	if amp_dtype_name == "auto":
	amp_dtype_name = "fp16" if bool(getattr(args, "fp16", False)) else "fp32"
	if amp_dtype_name not in {"fp32", "fp16", "bf16"}:
	raise ValueError("amp_dtype must be one of: auto, fp32, fp16, bf16")
	amp_enabled = device.type == "cuda" and amp_dtype_name in {"fp16", "bf16"}
	amp_dtype = torch.float16 if amp_dtype_name == "fp16" else torch.bfloat16
	scaler = torch.amp.GradScaler("cuda", enabled=amp_enabled and amp_dtype_name == "fp16")
	model_selection_mode = str(getattr(args, "model_selection_mode", "max") or "max").lower()
	best_score = math.inf if model_selection_mode == "min" else -math.inf
	epochs_without_improvement = 0
	global_step = 0
	train_start_time = time.time()
	recent_losses: deque[float] = deque(maxlen=100)
	optimizer.zero_grad(set_to_none=True)
	if device.type == "cuda":
	gc.collect()
	torch.cuda.empty_cache()
	torch.cuda.reset_peak_memory_stats(device)
	skip_optimizer_window = False
	skipped_nonfinite_windows = 0

	for epoch in range(args.epochs):
	if train_sampler is not None:
	train_sampler.set_epoch(epoch)
	progress = tqdm(train_loader, desc=f"epoch {epoch + 1}/{args.epochs}", disable=not is_main(rank))
	for step, batch in enumerate(progress, start=1):
	batch = move_batch(batch, device)
	with torch.amp.autocast("cuda", enabled=amp_enabled, dtype=amp_dtype):
	out = model(**{k: v for k, v in batch.items() if k != "rows"})
	out = reduce_parallel_losses(out)
	loss = out["loss"] / args.grad_accum
	loss_is_finite = bool(torch.isfinite(loss.detach()).all().item())
	if not loss_is_finite:
	skip_optimizer_window = True
	optimizer.zero_grad(set_to_none=True)
	if is_main(rank):
	append_jsonl(
	output_dir / "metrics.jsonl",
	{
	"event": "skip_nonfinite_microbatch",
	"epoch": epoch + 1,
	"micro_step": step,
	"loss": float(out["loss"].detach().cpu()),
	"generation_loss": float(out["generation_loss"].detach().cpu()),
	"evidence_loss": float(out["evidence_loss"].detach().cpu()),
	"ui_function_loss": float(out["ui_function_loss"].detach().cpu()),
	"search_function_loss": float(out["search_function_loss"].detach().cpu()),
	"section_loss": float(out["section_loss"].detach().cpu()),
	"numeric_loss": float(out["numeric_loss"].detach().cpu()),
	},
	)
	elif not skip_optimizer_window:
	scaler.scale(loss).backward()
	step_log = None
	if step % args.grad_accum == 0:
	if skip_optimizer_window:
	skipped_nonfinite_windows += 1
	optimizer.zero_grad(set_to_none=True)
	skip_optimizer_window = False
	if is_main(rank):
	append_jsonl(
	output_dir / "metrics.jsonl",
	{
	"event": "skip_nonfinite_optimizer_window",
	"epoch": epoch + 1,
	"micro_step": step,
	"skipped_nonfinite_windows": skipped_nonfinite_windows,
	"lr": max(scheduler.get_last_lr()),
	},
	)
	else:
	lrs_used = list(scheduler.get_last_lr())
	scaler.unscale_(optimizer)
	grad_norm = clip_gradients(model, optimizer, args)
	grad_is_finite = bool(torch.isfinite(grad_norm.detach()).all().item()) if torch.is_tensor(grad_norm) else math.isfinite(float(grad_norm))
	if not grad_is_finite:
	skipped_nonfinite_windows += 1
	optimizer.zero_grad(set_to_none=True)
	if scaler.is_enabled():
	scaler.update()
	if is_main(rank):
	append_jsonl(
	output_dir / "metrics.jsonl",
	{
	"event": "skip_nonfinite_grad",
	"epoch": epoch + 1,
	"micro_step": step,
	"grad_norm": float(grad_norm.detach().cpu()) if torch.is_tensor(grad_norm) else float(grad_norm),
	"skipped_nonfinite_windows": skipped_nonfinite_windows,
	"lr": max(scheduler.get_last_lr()),
	},
	)
	else:
	scaler.step(optimizer)
	scaler.update()
	optimizer.zero_grad(set_to_none=True)
	scheduler.step()
	global_step += 1
	if is_main(rank):
	loss_value = float(out["loss"].detach().cpu())
	recent_losses.append(loss_value)
	label_counts = (batch["labels"] != -100).sum(dim=1).detach().float().cpu()
	context_counts = batch["context_attention_mask"].sum(dim=1).detach().float().cpu()
	element_counts = batch["element_mask"].sum(dim=1).detach().float().cpu()
	loss_window = list(recent_losses)
	loss_window_20 = loss_window[-20:]
	elapsed_seconds = max(time.time() - train_start_time, 1e-6)
	next_lrs = list(scheduler.get_last_lr())
	step_log = {
	"step": global_step,
	"epoch": epoch + 1,
	"loss": loss_value,
	"loss_ma20": float(np.mean(loss_window_20)) if loss_window_20 else loss_value,
	"loss_ma100": float(np.mean(loss_window)) if loss_window else loss_value,
	"generation_loss": float(out["generation_loss"].cpu()),
	"evidence_loss": float(out["evidence_loss"].cpu()),
	"ui_function_loss": float(out["ui_function_loss"].cpu()),
	"search_function_loss": float(out["search_function_loss"].cpu()),
	"section_loss": float(out["section_loss"].cpu()),
	"numeric_loss": float(out["numeric_loss"].cpu()),
	"lr": max(lrs_used) if lrs_used else 0.0,
	"lr_other": lrs_used[0] if len(lrs_used) > 0 else 0.0,
	"lr_vision": lrs_used[1] if len(lrs_used) > 1 else 0.0,
	"lr_decoder": lrs_used[2] if len(lrs_used) > 2 else 0.0,
	"lr_ui_function_head": lrs_used[3] if len(lrs_used) > 3 else 0.0,
	"lr_next": max(next_lrs) if next_lrs else 0.0,
	"lr_scheduler_type": str(getattr(args, "lr_scheduler_type", "linear") or "linear"),
	"grad_norm": float(grad_norm.detach().cpu()) if torch.is_tensor(grad_norm) else float(grad_norm),
	"skipped_nonfinite_windows": skipped_nonfinite_windows,
	"optimizer_steps_total": total_update_steps,
	"train_progress_pct": 100.0 * global_step / max(1, total_update_steps),
	"elapsed_seconds": elapsed_seconds,
	"optimizer_steps_per_sec": global_step / elapsed_seconds,
	"target_tokens_mean": float(label_counts.mean().item()) if label_counts.numel() else 0.0,
	"target_tokens_max": int(label_counts.max().item()) if label_counts.numel() else 0,
	"target_at_max_rate": float((label_counts >= int(args.max_target_tokens)).float().mean().item()) if label_counts.numel() else 0.0,
	"context_tokens_mean": float(context_counts.mean().item()) if context_counts.numel() else 0.0,
	"context_tokens_max": int(context_counts.max().item()) if context_counts.numel() else 0,
	"context_at_max_rate": float((context_counts >= int(args.max_context_tokens)).float().mean().item()) if context_counts.numel() else 0.0,
	"element_count_mean": float(element_counts.mean().item()) if element_counts.numel() else 0.0,
	"element_count_max": int(element_counts.max().item()) if element_counts.numel() else 0,
	"model_variant": str(getattr(args, "model_variant", "")),
	"vision_enabled": not bool(getattr(args, "disable_vision", False)) and str(getattr(args, "model_variant", "")) != "annotation_only",
	"native_context_forward": bool(getattr(args, "native_context_forward", False)),
	"freeze_decoder": bool(getattr(args, "freeze_decoder", False)),
	"image_size": int(getattr(args, "image_size", 0) or 0),
	"image_crops": int(batch["pixel_values"].shape[1]) if torch.is_tensor(batch.get("pixel_values")) and batch["pixel_values"].ndim >= 5 else 0,
	"max_visual_tokens": int(getattr(args, "max_visual_tokens", 0) or 0),
	"direct_visual_tokens": bool(getattr(args, "direct_visual_tokens", False)),
	"direct_element_tokens": bool(getattr(args, "direct_element_tokens", False)),
	"visual_memory_scale": float(getattr(args, "visual_memory_scale", 1.0) or 1.0),
	"element_memory_scale": float(getattr(args, "element_memory_scale", 1.0) or 1.0),
	"pooled_memory_scale": float(getattr(args, "pooled_memory_scale", 1.0) or 1.0),
	"cuda_memory_fraction": float(getattr(args, "cuda_memory_fraction", 0.0) or 0.0),
	"data_parallel": isinstance(model, nn.DataParallel),
	"distributed": bool(distributed),
	**trainable_stats,
	}
	del loss
	del out
	del batch
	empty_cache_steps = int(getattr(args, "cuda_empty_cache_steps", 0) or 0)
	if (
	device.type == "cuda"
	and empty_cache_steps > 0
	and step % args.grad_accum == 0
	and global_step % empty_cache_steps == 0
	):
	gc.collect()
	torch.cuda.empty_cache()
	if step_log is not None and is_main(rank):
	mem = {}
	if torch.cuda.is_available():
	mem = {
	"gpu_allocated_gb": round(torch.cuda.memory_allocated(device) / 1024**3, 3),
	"gpu_reserved_gb": round(torch.cuda.memory_reserved(device) / 1024**3, 3),
	"gpu_peak_allocated_gb": round(torch.cuda.max_memory_allocated(device) / 1024**3, 3),
	"gpu_peak_reserved_gb": round(torch.cuda.max_memory_reserved(device) / 1024**3, 3),
	}
	log = {
	**step_log,
	**mem,
	}
	append_jsonl(output_dir / "metrics.jsonl", log)
	if device.type == "cuda":
	torch.cuda.reset_peak_memory_stats(device)
	progress.set_postfix(loss=f"{log['loss']:.3f}", ma20=f"{log['loss_ma20']:.3f}", pct=f"{log['train_progress_pct']:.1f}%")
	if (
	step_log is not None
	and args.save_checkpoints
	and args.save_every_steps
	and global_step > 0
	and global_step % args.save_every_steps == 0
	and is_main(rank)
	):
	save_checkpoint(output_dir, "checkpoint-last", model, tokenizer, image_processor, args, {"step": global_step})
	if step_log is not None and args.eval_every_steps and global_step > 0 and global_step % args.eval_every_steps == 0:
	if is_main(rank) and args.save_checkpoints:
	save_checkpoint(output_dir, "checkpoint-last", model, tokenizer, image_processor, args, {"step": global_step, "pre_eval": True})
	metrics = evaluate(model, valid_loader, tokenizer, device, args, rank=rank)
	current_score = selection_metric_value(metrics, args)
	metrics["selection_metric"] = selection_metric_name(args)
	metrics["selection_score"] = current_score
	if metric_is_better(current_score, best_score, args):
	best_score = current_score
	if is_main(rank) and args.save_checkpoints:
	save_checkpoint(output_dir, "checkpoint-best", model, tokenizer, image_processor, args, metrics)
	if device.type == "cuda" and empty_cache_steps > 0:
	gc.collect()
	torch.cuda.empty_cache()
	if is_main(rank):
	append_jsonl(output_dir / "metrics.jsonl", {"step_eval": global_step, **metrics})
	if is_main(rank) and args.save_checkpoints:
	save_checkpoint(output_dir, "checkpoint-last", model, tokenizer, image_processor, args, {"epoch": epoch + 1, "step": global_step, "pre_epoch_eval": True})
	metrics = evaluate(model, valid_loader, tokenizer, device, args, rank=rank)
	current_score = selection_metric_value(metrics, args)
	metrics["selection_metric"] = selection_metric_name(args)
	metrics["selection_score"] = current_score
	improved = metric_is_better(current_score, best_score, args)
	if improved:
	best_score = current_score
	epochs_without_improvement = 0
	else:
	epochs_without_improvement += 1
	if is_main(rank):
	append_jsonl(output_dir / "metrics.jsonl", {"epoch_eval": epoch + 1, **metrics})
	if args.save_checkpoints:
	save_checkpoint(output_dir, "checkpoint-last", model, tokenizer, image_processor, args, metrics)
	if args.save_checkpoints and improved:
	save_checkpoint(output_dir, "checkpoint-best", model, tokenizer, image_processor, args, metrics)
	patience = int(getattr(args, "early_stopping_patience", 0) or 0)
	if patience > 0:
	print(
	f"selection_metric={metrics['selection_metric']} score={current_score:.6f} "
	f"best={best_score:.6f} no_improve={epochs_without_improvement}/{patience}"
	)
	patience = int(getattr(args, "early_stopping_patience", 0) or 0)
	if patience > 0 and epochs_without_improvement >= patience:
	if is_main(rank):
	print(f"Early stopping at epoch {epoch + 1}: no improvement for {patience} evals.")
	break
	if distributed:
	dist.destroy_process_group()


	def parse_args() -> argparse.Namespace:
	parser = argparse.ArgumentParser(description="Train rich CMGUI screenshot summarization models.")
	for key, value in DEFAULT_CONFIG.items():
	arg_type = type(value)
	if isinstance(value, bool):
	parser.add_argument(f"--{key}", type=lambda x: str(x).lower() in {"1", "true", "yes"}, default=value)
	else:
	parser.add_argument(f"--{key}", type=arg_type, default=value)
	parser.add_argument("--bottleneck_queries", type=int, default=64)
	args = parser.parse_args()
	args.decoder_model = normalize_model_reference(args.decoder_model)
	args.vision_model = normalize_model_reference(args.vision_model)
	if args.model_variant not in {"annotation_only", "image_only", "late_fusion", "full"}:
	raise ValueError("model_variant must be one of annotation_only, image_only, late_fusion, full")
	args.context_mode = str(getattr(args, "context_mode", "mean") or "mean").lower()
	if args.context_mode not in {"mean", "tokens", "tokens_direct", "tokens_encoder", "tokens_direct_encoder"}:
	raise ValueError("context_mode must be one of: mean, tokens, tokens_direct, tokens_encoder, tokens_direct_encoder")
	args.lr_scheduler_type = str(getattr(args, "lr_scheduler_type", "linear") or "linear").lower()
	if args.lr_scheduler_type not in {"linear", "cosine"}:
	raise ValueError("lr_scheduler_type must be one of: linear, cosine")
	args.target_schema = str(getattr(args, "target_schema", "zh") or "zh").lower()
	if args.target_schema not in {
	"zh",
	"alias",
	"aliases",
	"en",
	"english",
	"summary",
	"summary_zh",
	"summary-only",
	"summary_only",
	"natural_zh",
	"rich_text_zh",
	"zh_text",
	"text_zh",
	"summary_visible_zh",
	"natural_summary_visible_zh",
	}:
	raise ValueError("target_schema must be one of: zh, aliases, summary_zh, natural_zh, summary_visible_zh")
	args.grad_clip_strategy = str(getattr(args, "grad_clip_strategy", "global") or "global").lower()
	if args.grad_clip_strategy not in {"global", "per_group"}:
	raise ValueError("grad_clip_strategy must be one of: global, per_group")
	args.structured_function_mode = str(getattr(args, "structured_function_mode", "decoder") or "decoder").lower()
	if args.structured_function_mode not in {"decoder", "heads"}:
	raise ValueError("structured_function_mode must be one of: decoder, heads")
	args.structured_evidence_mode = str(getattr(args, "structured_evidence_mode", "decoder") or "decoder").lower()
	if args.structured_evidence_mode not in {"decoder", "heads"}:
	raise ValueError("structured_evidence_mode must be one of: decoder, heads")
	return args


	if __name__ == "__main__":
	train(parse_args())