# Top-level component selection for inference.
# NOTE(review): `model` and `index_ds` look like class/registry names resolved
# by the loading code — confirm against the consumer.
model: FunCineForgeInferModel
index_ds: FunCineForgeDS
# Relative path to an ONNX file.
# NOTE(review): presumably resolved against the working directory — confirm.
xvec_model: pretrained_models/funcineforge_zh_en/camplus.onnx
# Explicit empty mapping (not null): no model-level options are set here.
model_conf: {}

dataset_conf:
  # Comma-separated metadata fields to load. "face" comes from the video,
  # "vocal" is the reference audio, and the speaker ID plus start/end
  # timestamps are extracted from "dialogue".
  load_meta_data_key: "text,clue,face,dialogue,vocal,video"

  # Special token IDs.
  sos: 6561
  eos: 6562
  turn_of_speech: 6563
  fill_token: 6564
  ignore_id: -100

  # Token IDs delimiting the "clue" segment.
  startofclue_token: 151646
  endofclue_token: 151647

  # Timing.
  frame_shift: 25  # ms
  timebook_size: 1500  # 60 * 25 = 1500

  # Category IDs. They begin at 1500, the same value as timebook_size.
  # NOTE(review): presumably appended right after the time entries — confirm.
  # NOTE(review): the first four names look like pinyin (narration, monologue,
  # dialogue, multi-speaker) — confirm.
  pangbai: 1500
  dubai: 1501
  duihua: 1502
  duoren: 1503

  # Gender IDs.
  male: 1504
  female: 1505

  # Age-group IDs.
  child: 1506
  youth: 1507
  adult: 1508
  middle: 1509
  elderly: 1510

  # First ID after the category/gender/age entries above.
  speaker_id_start: 1511


# Top-level inference settings.
# NOTE(review): `ras` is a sampling-strategy name interpreted by the consumer —
# confirm the accepted values there.
sampling: ras
# Prompt usage toggles.
# NOTE(review): the lm_/fm_ prefixes presumably match the llm_/fm_ dtype keys
# below — confirm.
lm_use_prompt: true
fm_use_prompt: true
use_llm_cache: true
seed: 0
# Length bounds, counted in frames per the inline arithmetic.
# NOTE(review): dataset_conf.frame_shift is annotated as 25 ms (i.e. 40
# frames/s), while the comments here assume 25 fps — confirm which unit holds.
max_length: 1500  # 60s * 25 fps
min_length: 50   # 2s * 25 fps
# Per-component dtype names.
# NOTE(review): presumably llm / fm / vocoder stages — confirm.
llm_dtype: fp32
fm_dtype: fp32
voc_dtype: fp32
batch_size: 1