File size: 678 Bytes
3162f59 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
Check QUESTION_TEMPLATE!
kl: 0.01
lr: 1e-6
epoch: 1
with cold-start data
# Perturb the advantages with random noise: torch.randn_like draws a tensor
# shaped like `advantages`, which is scaled by 0.02 before being added.
noise = torch.randn_like(advantages) * 0.02
advantages = advantages + noise
def format_reward(completions, **kwargs):
    """Score each completion on output format and reasoning length.

    A completion earns 0.5 when its text fully matches the
    ``<think>...</think><answer>...</answer>`` layout and also passes
    ``recheck_format``.  It then earns up to a further 0.5, proportional to
    ``len()`` of the value returned by ``extract_first_think_answer`` and
    saturating at 1200.  Every other completion scores 0.0.

    NOTE(review): the snippet this was taken from had lost its indentation;
    the length bonus is assumed to apply only to well-formed completions --
    confirm against the original file.

    Args:
        completions: Iterable of completions; the text of each one is read
            from ``completion[0]["content"]``.
        **kwargs: Accepted but not used here.

    Returns:
        A list with one float score per completion, in input order.
    """
    pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
    texts = [completion[0]["content"] for completion in completions]

    scores = []
    for text in texts:
        # Guard clause: anything that is not well-formed gets no reward.
        if not (re.fullmatch(pattern, text, re.DOTALL) and recheck_format(text)):
            scores.append(0.0)
            continue

        # 0.5 for the format itself, plus a length bonus capped at 0.5.
        thinking = extract_first_think_answer(text)
        length_bonus = min(len(thinking) / 1200, 1) * 0.5
        scores.append(0.5 + length_bonus)
    return scores