---
model:
  name: meta-llama/Llama-3.2-1B
  learning_rate: 1.2e-05
  ppo_epochs: 4
  init_kl_coef: 0.3
  target: 3
  cliprange: 0.2
  cliprange_value: 0.3
  vf_coef: 0.15
  adap_kl_ctrl: true
  use_score_norm: true
  ratio_threshold: 10.0
  batch_size: 64
  mini_batch_size: 8
  forward_batch_size: 2
  gradient_accumulation_steps: 8
  reward_model: s-nlp/roberta_toxicity_classifier
  use_raw_logits: true
generation:
  min_length: 5
  max_new_tokens: 64
  output_min_length: 15
  output_max_length: 20
  do_sample: true
  top_k: 0.0
  top_p: 0.85
now: 2025-11-22_15-09-36
training:
  num_train_epochs: 100
  save_freq: 20
  eval_freq: 20
  seed: 42
  fast_start: true
dataset:
  name: allenai/real-toxicity-prompts
  toxicity_threshold: 0.8
  filter_metric: profanity
  input_min_text_length: 15
  input_max_text_length: 20
  test_size: 0.1
  original_dataset_path: null
  detoxified_dataset_path: null
output:
  push_to_hub: true
  push_checkpoints_to_hub: true
  checkpoint_push_freq: 20
  organization: null
  repository_name: SequentialLR001_2000samples
  private: false
wandb:
  project: irl_llms
  entity: null
  name: Llama-3.2-1B-2025-11-22_15-09-36
irl:
  posterior_dir: re_irl_min_stratified_plots/round_5
  global_norm_dir: re_irl_min_stratified_plots
  base_model_name: null
  use_round: 5
  sample_theta_each_step: true
  n_samples: 100
  feature_max_length: 256
  feature_batch_size: 16
  use_platt: false
  platt_a: 1.0
  platt_b: 0.0
  features_on_cpu: false
  reward_scale: 8
  reward_clip: 4