| AdamW.lr: 0.0001 |
| AdamW.weight_decay: 0.0 |
|
|
| DecoderTransformer.depth: 8 |
| DecoderTransformer.dim: 512 |
| DecoderTransformer.dropout: 0.1 |
| DecoderTransformer.heads: 8 |
| DecoderTransformer.max_seq_len: 514 |
|
|
| accelerator: auto |
|
|
| actor_init_on_gpu: false |
|
|
| actor_learning_rate: 5.0e-06 |
|
|
| adam_betas: |
| - 0.9 |
| - 0.95 |
|
|
| adam_offload: false |
|
|
| advantage_estimator: gae_interleave |
|
|
| anchor_model_path: logs/enc_dec_base_chord_3_datasets/step=13000.ckpt |
|
|
| args.debug: 0 |
| args.load: configs/single_agent_rl/gapt.yml |
| args.save: null |
| args.unknown: [] |
|
|
| aux_loss_coef: 0.0 |
|
|
| batch_size: 64 |
|
|
| bf16: true |
|
|
| buffer_cpu_offload: false |
|
|
| cache_dir: data/cache |
|
|
| checkpoint_interval: 1000 |
|
|
| checkpoint_metric: val/loss |
|
|
| checkpoint_mode: min |
|
|
| checkpoint_top_k: -1 |
|
|
| chord_names_path: data/hooktheory/chord_names.json |
|
|
| compile: true |
|
|
| contrastive_reward_model_path: |
| - logs/contrastive_reward_3_datasets/step=8000.ckpt |
| - logs/contrastive_reward_2_3_datasets/step=8000.ckpt |
|
|
| contrastive_reward_rhythm_model_path: |
| - logs/contrastive_reward_no_augmentation_rhythm_3_datasets/step=2500.ckpt |
| - logs/contrastive_reward_no_augmentation_rhythm_2_3_datasets/step=2500.ckpt |
|
|
| contrastive_reward_rhythm_weight: 1.0 |
|
|
| counterpart_vram_swap: false |
|
|
| critic_learning_rate: 9.0e-05 |
|
|
| data_augmentation: true |
|
|
| data_path: data/hooktheory/Hooktheory.json.gz |
|
|
| dataloader_pin_memory: false |
|
|
| datasets: |
| - hooktheory |
| - pop909 |
| - nottingham |
|
|
| devices: auto |
|
|
| disable_trace_cache: false |
|
|
| discriminative_reward_model_path: |
| - logs/discriminative_reward_128_bs_3_datasets/step=3000.ckpt |
| - logs/discriminative_reward_128_bs_2_3_datasets/step=3000.ckpt |
|
|
| discriminative_reward_rhythm_model_path: |
| - logs/discriminative_reward_no_augmentation_rhythm_3_datasets/step=3000.ckpt |
| - logs/discriminative_reward_no_augmentation_rhythm_2_3_datasets/step=3000.ckpt |
|
|
| discriminative_reward_rhythm_weight: 1.0 |
|
|
| enable_reward_label_smoothing: true |
|
|
| entropy_loss_coef: 0.01 |
|
|
| eps_clip: 0.2 |
|
|
| eval_steps: 200 |
|
|
| flash_attn: true |
|
|
| freezing_actor_steps: 0 |
|
|
| gail_discriminative_model_configs: |


|   depth: 8 |


|   dim: 512 |


|   dropout: 0.1 |


|   heads: 8 |
|
|
| gail_reward_formulation: logits_prob_log |
|
|
| gail_reward_learning_rate: 9.0e-05 |
|
|
| gamma: 1 |
|
|
| grad_accum_dtype: null |
|
|
| gradient_checkpointing: false |
|
|
| gradient_checkpointing_use_reentrant: false |
|
|
| init_kl_coef: 0.001 |
|
|
| invalid_output_penalty_weight: 1.0 |
|
|
| kl_estimator: k3 |
|
|
| kl_horizon: 10000 |
|
|
| kl_target: null |
|
|
| l2: 0.0 |
|
|
| lambd: 0.95 |
|
|
| limit_eval_batches: 4 |
|
|
| lit_module_override_args: |


|   chord_names_path: data/cache/chord_names_augmented.json |


|   data_path: data/hooktheory/Hooktheory.json.gz |


|   num_workers: 4 |
|
|
| local_rank: -1 |
|
|
| log_every_n_steps: 1 |
|
|
| logging_steps: 1 |
|
|
| logits_vram_swap: true |
|
|
| max_epochs: 1 |
|
|
| max_len: 512 |
|
|
| max_log_examples: 8 |
|
|
| max_norm: 1.0 |
|
|
| max_samples: 1000000 |
|
|
| micro_rollout_batch_size: 384 |
|
|
| micro_train_batch_size: 48 |
|
|
| model_part: chord |
|
|
| model_type: decoder_only |
|
|
| n_samples_per_prompt: 1 |
|
|
| normalize_reward: true |
|
|
| num_nodes: 1 |
|
|
| num_steps: 1000 |
|
|
| num_workers: 8 |
|
|
| overfit_batches: 0 |
|
|
| packing_samples: false |
|
|
| precision: bf16-mixed |
|
|
| pretrain_data: null |
|
|
| pretrain_model_path: logs/decoder_only_online_chord_3_datasets/step=11000.ckpt |
|
|
| repetition_penalty_threshold: 4 |
|
|
| repetition_penalty_weight: 1.0 |
|
|
| reward_apply_threshold_after_steps: 200 |
|
|
| reward_average_steps: 3 |
|
|
| reward_clip_range: null |
|
|
| reward_update_early_stop_steps: 500 |
|
|
| reward_update_steps: 5 |
|
|
| reward_update_strategy: average |
|
|
| reward_update_threshold: 1.0 |
|
|
| reward_vram_swap: false |
|
|
| rollout_batch_size: 384 |
|
|
| sample_interval: 5000 |
|
|
| save_dir: logs/gapt |
|
|
| save_eval_gen: false |
|
|
| save_steps: -1 |
|
|
| save_value_network: false |
|
|
| seed: 42 |
|
|
| strategy: auto |
|
|
| temperature: 0.99 |
|
|
| top_p: 1.0 |
|
|
| train_batch_size: 48 |
|
|
| train_steps: 30000 |
|
|
| trainer_empty_cache: false |
|
|
| use_full_kl: true |
|
|
| use_kl_estimator_k3: false |
|
|
| use_kl_loss: false |
|
|
| use_reverse_kl: false |
|
|
| use_tensorboard: false |
|
|
| use_wandb: 'true' |
|
|
| val_interval: 1000 |
|
|
| value_clip: null |
|
|
| wandb_group: null |
|
|
| wandb_org: null |
|
|
| wandb_project: realchords |
|
|
| wandb_run_name: null |
|
|
| warmup_steps: 100 |
|
|
| weights: |
| - 0.6 |
| - 0.3 |
| - 0.1 |
|
|
| zero_stage: 0 |
|
|
| zpg: 1 |
|
|
|
|