{ "backbone_arch": "Qwen/Qwen2.5-Math-7B-Instruct", "hidden_size": 3584, "objective": "bce", "use_rank": false, "margin": "m_t = head(h_t) = -logit(H_t); reward r_t = m_t - m_{t-1}", "step": 1000 }