| Check QUESTION_TEMPLATE! | |
| kl: 0.01 | |
| lr: 1e-6 | |
| epoch: 1 | |
| with cold-start data | |
| # Perturb the advantages with zero-mean Gaussian noise scaled by 0.02 (same shape as `advantages`). | |
| noise = torch.randn_like(advantages) * 0.02 | |
| advantages = advantages + noise | |
| # Format reward: 0.5 for a well-formed think/answer completion, plus a length bonus of up to 0.5. | |
| # Returns a list with one float per entry of `completions`; extra keyword arguments are accepted and ignored. | |
| # NOTE(review): indentation was lost in this table, so it is unclear whether the length bonus (the two `think` lines) sits inside the `if` or runs for every completion - confirm against the training script. | |
| def format_reward(completions, **kwargs): | |
| # The whole completion must be a think block, optional whitespace, then an answer block. | |
| pattern = r"<think>.*?</think>\s*<answer>.*?</answer>" | |
| # Each completion is indexed as completion[0]["content"] to obtain its text. | |
| completion_contents = [completion[0]["content"] for completion in completions] | |
| reward = [] | |
| for content in completion_contents: | |
| for_re = 0.0 | |
| # re.DOTALL lets `.` span newlines; recheck_format (defined elsewhere) must also return a truthy value. | |
| if re.fullmatch(pattern, content, re.DOTALL) and recheck_format(content): | |
| for_re += 0.5 | |
| # extract_first_think_answer is defined elsewhere; presumably it returns the text of the first think block - verify. | |
| think = extract_first_think_answer(content) | |
| # Bonus grows linearly with len(think) and saturates at 0.5 once len(think) reaches 1200. | |
| for_re += min(len(think) / 1200, 1) * 0.5 | |
| reward.append(for_re) | |
| return reward |