File size: 3,373 Bytes
3e81abb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
{
  "experiment": "EXP-TRAIN-CDG",
  "total_iterations": 300,
  "training_time_hours": 19.159403832289907,
  "eval_interval": 100,
  "rolling_checkpoint_interval": 50,
  "pending_evaluations": 3,
  "eval_queue_file": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/metrics/pending_eval_checkpoints.jsonl",
  "best_model": {
    "iteration": 226,
    "training_reward": 0.9533333333333334,
    "path": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/best_model",
    "note": "Best by training reward; run EuroEval for validation score"
  },
  "final_model": {
    "path": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg/final_model"
  },
  "cdg_metrics": {
    "total_episodes": 3200,
    "wrong_to_right": 723,
    "right_to_wrong": 317,
    "right_to_right": 1140,
    "wrong_to_wrong": 1020,
    "filtered_episodes": 1494,
    "avg_episode_quality": 0.78125
  },
  "eval_history": [],
  "config": {
    "cdg_config": {
      "enabled": true,
      "max_turns": 2,
      "helpful_critic_ratio": 0.5,
      "prover_correct_bonus": 0.3,
      "prover_weight": 1.0,
      "helpful_critic_weight": 0.5,
      "misleading_critic_weight": 0.3,
      "min_episode_quality": 0.2,
      "filter_trivial_episodes": true,
      "critique_max_tokens": 128,
      "revision_max_tokens": 256,
      "prover_temperature": 0.7,
      "critic_temperature": 0.9,
      "log_role_rewards": true,
      "log_state_transitions": true,
      "rest_filtering": {
        "enabled": true,
        "top_k_ratio": 0.8,
        "min_episodes_per_batch": 4
      }
    },
    "rl_config": {
      "output_dir": "/proj/berzelius-aiics-real/users/x_anbue/frodi_data/exp_checkpoints/exp_train_cdg",
      "iterations": 300,
      "batch_size": 8,
      "learning_rate": 5e-06,
      "gradient_accumulation_steps": 4,
      "max_length": 512,
      "grpo_group_size": 4,
      "ppo_epochs": 4,
      "adap_kl_ctrl": true,
      "init_kl_coef": 0.1,
      "target": 0.1,
      "horizon": 10000,
      "kl_coef_min": 0.05,
      "kl_coef_max": 0.5,
      "advantage_clip": 3.0,
      "advantage_epsilon": 1e-06,
      "kl_decay": 0.95,
      "kl_growth": 1.2,
      "ref_update_interval": 1,
      "reward_weights": {
        "accuracy": 1.0,
        "fluency": 0.0,
        "reconstruction": 0.0,
        "novelty": 0.0,
        "grammar": 0.0,
        "grammar_enabled": false
      },
      "adaptive_weights": true,
      "weight_update_interval": 100,
      "kl_penalty_coef": 0.08,
      "kl_penalty_warmup_steps": 500,
      "max_kl_divergence": 5.0,
      "entropy_penalty_weight": 0.01,
      "min_response_tokens": 10,
      "qa_min_response_tokens": 3,
      "qa_short_response_advantage_margin": -0.5,
      "skip_reference_kl": false,
      "pretraining_weight": 0.3,
      "pretraining_samples_per_batch": 2,
      "reward_clip_min": -1.0,
      "reward_clip_max": 1.0,
      "reward_norm_momentum": 0.1,
      "reward_downscale": 0.5,
      "generation": {
        "max_new_tokens": 256,
        "min_new_tokens": 5,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
        "do_sample": true,
        "repetition_penalty": 1.1
      }
    }
  },
  "post_training_eval_required": true,
  "storage_note": "Only best_model and final_model saved (storage-efficient)"
}