File size: 1,423 Bytes
da45747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
{
    "output_dir": "exp/omnivoice_vietnamese_4kh",
    "data_config": "/vast/tts/robert/OmniVoice/data/data_config_vietnamese.json",
    "llm_name_or_path": "Qwen/Qwen3-0.6B",
    "audio_vocab_size": 1025,
    "audio_mask_id": 1024,
    "num_audio_codebook": 8,
    "audio_codebook_weights": [
        8,
        8,
        6,
        6,
        4,
        4,
        2,
        2
    ],
    "drop_cond_ratio": 0.1,
    "prompt_ratio_range": [
        0.0,
        0.3
    ],
    "mask_ratio_range": [
        0.0,
        1.0
    ],
    "language_ratio": 0.8,
    "use_pinyin_ratio": 0.0,
    "instruct_ratio": 0.0,
    "only_instruct_ratio": 0.0,
    "resume_from_checkpoint": null,
    "init_from_checkpoint": "k2-fsa/OmniVoice",
    "learning_rate": 1e-05,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,
    "steps": 100000,
    "epochs": null,
    "seed": 42,
    "lr_scheduler_type": "cosine",
    "warmup_type": "ratio",
    "warmup_ratio": 0.01,
    "warmup_steps": 0,
    "batch_tokens": 8192,
    "gradient_accumulation_steps": 1,
    "num_workers": 2,
    "mixed_precision": "bf16",
    "allow_tf32": true,
    "use_deepspeed": false,
    "deepspeed_config": null,
    "attn_implementation": "flex_attention",
    "max_sample_tokens": 2000,
    "min_sample_tokens": 50,
    "max_batch_size": 64,
    "logging_steps": 50,
    "eval_steps": 10000,
    "save_steps": 10000,
    "keep_last_n_checkpoints": -1
}