| { |
| "output_dir": "/home/infidea/suwon/omni-voice-advance/exp/omnivoice_0.6B-FT", |
| "data_config": "/home/infidea/suwon/omni-voice-advance/examples/config/data_config_finetune.json", |
| "llm_name_or_path": "Qwen/Qwen3-0.6B", |
| "tokenizer_name_or_path": null, |
| "expected_llm_model_type": null, |
| "expected_llm_hidden_size": null, |
| "expected_llm_intermediate_size": null, |
| "expected_llm_num_hidden_layers": null, |
| "expected_llm_num_attention_heads": null, |
| "expected_llm_num_key_value_heads": null, |
| "expected_llm_vocab_size": null, |
| "audio_vocab_size": 1025, |
| "audio_mask_id": 1024, |
| "num_audio_codebook": 8, |
| "audio_codebook_weights": [ |
| 8, |
| 8, |
| 6, |
| 6, |
| 4, |
| 4, |
| 2, |
| 2 |
| ], |
| "drop_cond_ratio": 0.1, |
| "prompt_ratio_range": [ |
| 0.05, |
| 0.3 |
| ], |
| "mask_ratio_range": [ |
| 0.3, |
| 0.9 |
| ], |
| "min_masked_audio_tokens": 8, |
| "language_ratio": 0.0, |
| "use_pinyin_ratio": 0.0, |
| "instruct_ratio": 0.7, |
| "only_instruct_ratio": 0.3, |
| "log_codebook_losses": true, |
| "loss_label_smoothing": 0.01, |
| "resume_from_checkpoint": null, |
| "init_from_checkpoint": "k2-fsa/OmniVoice", |
| "learning_rate": 2e-05, |
| "weight_decay": 0.01, |
| "max_grad_norm": 1.0, |
| "steps": 1000000, |
| "seed": 42, |
| "lr_scheduler_type": "cosine", |
| "warmup_type": "steps", |
| "warmup_ratio": 0.03, |
| "warmup_steps": 10000, |
| "batch_tokens": 8192, |
| "gradient_accumulation_steps": 4, |
| "num_workers": 2, |
| "mixed_precision": "bf16", |
| "allow_tf32": true, |
| "require_cuda": true, |
| "use_deepspeed": false, |
| "deepspeed_config": null, |
| "compile_flex_attention_mask": true, |
| "validate_audio_token_range": false, |
| "skip_bad_batches": true, |
| "max_consecutive_batch_skips": 50, |
| "attn_implementation": "flex_attention", |
| "logging_steps": 100, |
| "eval_steps": 500, |
| "save_steps": 500, |
| "keep_last_n_checkpoints": 3, |
| "use_wandb": true, |
| "wandb_project": "omnivoice_0.6B-FT", |
| "wandb_entity": null, |
| "wandb_run_name": null, |
| "wandb_group": null, |
| "wandb_tags": [], |
| "wandb_mode": null, |
| "inference_logging_steps": 1000, |
| "inference_logging_text": null, |
| "inference_logging_language": null, |
| "inference_logging_ref_audio": null, |
| "inference_logging_ref_text": null, |
| "inference_logging_num_step": 16, |
| "inference_logging_guidance_scale": 2.0, |
| "inference_logging_speed": 1.0, |
| "inference_logging_duration": null, |
| "inference_audio_tokenizer_path": null, |
| "inference_logging_jsonl_dir": [ |
| "/home/infidea/tts-data/suwon/OmniVoice_data/server_data/txts", |
| "/home/infidea/tts-data/suwon/OmniVoice_data/ml-tts-data-others/txts", |
| "/home/infidea/tts-data/suwon/OmniVoice_data/voice_design_all/txts" |
| ], |
| "inference_logging_voice_design_jsonl_dir": [ |
| "/home/infidea/tts-data/suwon/OmniVoice_data/voice_design_all/txts" |
| ], |
| "inference_logging_save_eval_artifacts": true, |
| "inference_logging_eval_dir": null |
| } |