# Inference configuration for FunCineForgeInferModel.
#
# NOTE(review): this file was previously collapsed onto a single line, which
# made everything after the first "#" a comment and left the rest unparseable.
# The nesting below was reconstructed from key order; confirm that
# `dataset_conf` ends at `speaker_id_start` and that the keys after it are
# meant to be top-level.

model: FunCineForgeInferModel
index_ds: FunCineForgeDS
xvec_model: pretrained_models/funcineforge_zh_en/camplus.onnx
model_conf: {}

dataset_conf:
  # face is from the video, vocal is the reference audio; the speaker ID and
  # the start-end timestamps are extracted from dialogue.
  load_meta_data_key: "text,clue,face,dialogue,vocal,video"

  # Special token IDs.
  sos: 6561
  eos: 6562
  turn_of_speech: 6563
  fill_token: 6564
  ignore_id: -100
  startofclue_token: 151646
  endofclue_token: 151647

  # NOTE(review): the unit here is given as ms, but the other comments in this
  # file treat 25 as frames per second (60 s * 25 = 1500) - confirm the unit.
  frame_shift: 25  # ms
  timebook_size: 1500  # 60 * 25 = 1500

  # The IDs below start at timebook_size (1500) and run contiguously up to
  # speaker_id_start (1511).
  # Presumably pinyin scene-type labels (narration / monologue / dialogue /
  # multi-speaker) - TODO confirm.
  pangbai: 1500
  dubai: 1501
  duihua: 1502
  duoren: 1503
  # Gender labels.
  male: 1504
  female: 1505
  # Age-group labels.
  child: 1506
  youth: 1507
  adult: 1508
  middle: 1509
  elderly: 1510
  # First ID of the speaker-ID range.
  speaker_id_start: 1511

# Runtime / decoding options.
sampling: ras
lm_use_prompt: true
fm_use_prompt: true
use_llm_cache: true
seed: 0
max_length: 1500  # 60s * 25 fps
min_length: 50  # 2s * 25 fps
llm_dtype: fp32
fm_dtype: fp32
voc_dtype: fp32
batch_size: 1