Spaces:
Build error
Build error
Deploy Bengali TTS app
Browse files. This view is limited to 50 files because it contains too many changes. See the raw diff.
- .gitattributes +5 -0
- app.py +254 -0
- data/Bengali/vocab.txt +2545 -0
- requirements.txt +15 -0
- src/f5_tts/api.py +164 -0
- src/f5_tts/configs/E2TTS_Base.yaml +49 -0
- src/f5_tts/configs/E2TTS_Small.yaml +49 -0
- src/f5_tts/configs/F5TTS_Base.yaml +54 -0
- src/f5_tts/configs/F5TTS_Small.yaml +54 -0
- src/f5_tts/configs/F5TTS_v1_Base.yaml +55 -0
- src/f5_tts/eval/README.md +63 -0
- src/f5_tts/eval/__pycache__/compare_checkpoints.cpython-311.pyc +0 -0
- src/f5_tts/eval/__pycache__/eval_bengali.cpython-311.pyc +0 -0
- src/f5_tts/eval/__pycache__/gen_bengali_batch.cpython-311.pyc +0 -0
- src/f5_tts/eval/__pycache__/gen_elevenlabs_batch.cpython-311.pyc +0 -0
- src/f5_tts/eval/__pycache__/gen_indicf5_batch.cpython-311.pyc +0 -0
- src/f5_tts/eval/compare_checkpoints.py +150 -0
- src/f5_tts/eval/ecapa_tdnn.py +331 -0
- src/f5_tts/eval/eval_bengali.py +215 -0
- src/f5_tts/eval/eval_gemini.py +160 -0
- src/f5_tts/eval/eval_infer_batch.py +221 -0
- src/f5_tts/eval/eval_infer_batch.sh +116 -0
- src/f5_tts/eval/eval_infer_batch_example.sh +18 -0
- src/f5_tts/eval/eval_librispeech_test_clean.py +105 -0
- src/f5_tts/eval/eval_seedtts_testset.py +104 -0
- src/f5_tts/eval/eval_utmos.py +42 -0
- src/f5_tts/eval/gen_bengali_batch.py +159 -0
- src/f5_tts/eval/gen_elevenlabs_batch.py +110 -0
- src/f5_tts/eval/gen_gemini_batch.py +121 -0
- src/f5_tts/eval/utils_eval.py +444 -0
- src/f5_tts/infer/README.md +177 -0
- src/f5_tts/infer/SHARED.md +193 -0
- src/f5_tts/infer/__pycache__/infer_cli.cpython-311.pyc +0 -0
- src/f5_tts/infer/__pycache__/infer_cli_emotion.cpython-311.pyc +0 -0
- src/f5_tts/infer/__pycache__/infer_elevenlabs.cpython-311.pyc +0 -0
- src/f5_tts/infer/__pycache__/infer_emotion.cpython-311.pyc +0 -0
- src/f5_tts/infer/__pycache__/utils_infer.cpython-311.pyc +0 -0
- src/f5_tts/infer/examples/basic/basic.toml +11 -0
- src/f5_tts/infer/examples/basic/basic_ref_en.wav +3 -0
- src/f5_tts/infer/examples/basic/basic_ref_zh.wav +3 -0
- src/f5_tts/infer/examples/multi/country.flac +3 -0
- src/f5_tts/infer/examples/multi/main.flac +3 -0
- src/f5_tts/infer/examples/multi/story.toml +20 -0
- src/f5_tts/infer/examples/multi/story.txt +1 -0
- src/f5_tts/infer/examples/multi/town.flac +3 -0
- src/f5_tts/infer/examples/vocab.txt +2545 -0
- src/f5_tts/infer/infer_cli.py +388 -0
- src/f5_tts/infer/infer_cli_emotion.py +287 -0
- src/f5_tts/infer/infer_elevenlabs.py +71 -0
- src/f5_tts/infer/infer_emotion.py +265 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
src/f5_tts/infer/examples/basic/basic_ref_en.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
src/f5_tts/infer/examples/basic/basic_ref_zh.wav filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
src/f5_tts/infer/examples/multi/country.flac filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
src/f5_tts/infer/examples/multi/main.flac filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
src/f5_tts/infer/examples/multi/town.flac filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import tempfile
|
| 3 |
+
|
| 4 |
+
import gradio as gr
|
| 5 |
+
import numpy as np
|
| 6 |
+
import torch
|
| 7 |
+
from huggingface_hub import hf_hub_download
|
| 8 |
+
|
# Optional HF Spaces GPU decorator: on Hugging Face Spaces the `spaces`
# package provides spaces.GPU (requests a GPU for the decorated function);
# elsewhere we fall back to a no-op so the app also runs locally.
try:
    import spaces
    gpu_decorator = spaces.GPU
except ImportError:
    def gpu_decorator(fn):
        # Identity decorator used when `spaces` is unavailable.
        return fn
# NOTE(review): these imports sit below the try/except; conventionally they
# belong at the top of the file with the other imports.
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, GenerationConfig

from f5_tts.model import CFM, DiT
from f5_tts.infer.utils_infer import (
    device,
    load_checkpoint,
    load_vocoder,
    preprocess_ref_audio_text,
    infer_process,
    target_sample_rate,
    hop_length,
    n_fft,
    win_length,
    n_mel_channels,
)
from f5_tts.model.utils import get_tokenizer

# Config
# The checkpoint is fetched from the HF Hub at import time (cached by
# huggingface_hub after the first download). VOCAB_FILE is resolved
# relative to the process working directory.
MODEL_CKPT = hf_hub_download("Umong/bengali-f5-tts", "model_50000.pt")
VOCAB_FILE = "data/Bengali/vocab.txt"
# Whisper checkpoint fine-tuned for Bengali, used by transcribe_bengali()
# to auto-transcribe the reference audio.
WHISPER_MODEL = "bengaliAI/tugstugi_bengaliai-asr_whisper-medium"

# Model architecture (same as F5TTS_v1_Base)
model_cfg = dict(
    dim=1024,
    depth=22,
    heads=16,
    ff_mult=2,
    text_dim=512,
    text_mask_padding=True,
    qk_norm=None,
    conv_layers=4,
    pe_attn_head=None,
)

# Globals
# Lazily initialised by load_models() / init_bengali_asr() so each model is
# loaded at most once per process, on first use rather than at import.
ema_model = None
vocoder = None
bn_asr_model = None
bn_asr_processor = None
def load_models():
    """Load the Bengali F5-TTS model and the vocos vocoder, once per process.

    Populates the module-level ``ema_model`` and ``vocoder`` globals; calling
    again after a successful load is a no-op.
    """
    global ema_model, vocoder
    if ema_model is not None:
        return  # already initialised

    print("Loading Bengali TTS model...")
    vocab_char_map, vocab_size = get_tokenizer(VOCAB_FILE, "custom")

    # Mel-spectrogram settings shared with the vocos vocoder.
    mel_kwargs = {
        "n_fft": n_fft,
        "hop_length": hop_length,
        "win_length": win_length,
        "n_mel_channels": n_mel_channels,
        "target_sample_rate": target_sample_rate,
        "mel_spec_type": "vocos",
    }
    transformer = DiT(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels)
    cfm = CFM(
        transformer=transformer,
        mel_spec_kwargs=mel_kwargs,
        odeint_kwargs={"method": "euler"},
        vocab_char_map=vocab_char_map,
    ).to(device)
    ema_model = load_checkpoint(cfm, MODEL_CKPT, device, use_ema=True)

    print("Loading vocoder...")
    vocoder = load_vocoder(vocoder_name="vocos", is_local=False, device=device)
    print("Models loaded.")
def init_bengali_asr():
    """Lazily load the Bengali Whisper ASR model and processor.

    Fills the module-level ``bn_asr_model`` / ``bn_asr_processor`` globals;
    subsequent calls return immediately.
    """
    global bn_asr_model, bn_asr_processor
    if bn_asr_model is not None:
        return  # already loaded

    print("Loading Bengali ASR...")
    bn_asr_processor = WhisperProcessor.from_pretrained(WHISPER_MODEL)
    asr = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL)
    bn_asr_model = asr.to(device)
    # Fix outdated generation config: replace the checkpoint's stale config
    # with the upstream openai/whisper-medium one.
    bn_asr_model.generation_config = GenerationConfig.from_pretrained("openai/whisper-medium")
    print("Bengali ASR loaded.")
+
|
| 98 |
+
|
| 99 |
+
def transcribe_bengali(audio_path: str) -> str:
|
| 100 |
+
init_bengali_asr()
|
| 101 |
+
waveform, sr = torchaudio.load(audio_path)
|
| 102 |
+
if sr != 16000:
|
| 103 |
+
waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
|
| 104 |
+
if waveform.shape[0] > 1:
|
| 105 |
+
waveform = waveform.mean(dim=0, keepdim=True)
|
| 106 |
+
|
| 107 |
+
input_features = bn_asr_processor(
|
| 108 |
+
waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
|
| 109 |
+
).input_features.to(device)
|
| 110 |
+
predicted_ids = bn_asr_model.generate(input_features, language="bn", task="transcribe")
|
| 111 |
+
text = bn_asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
|
| 112 |
+
return text.strip()
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def preprocess_ref_audio_text_bn(ref_audio, ref_text, show_info=print):
|
| 116 |
+
"""Wrapper that uses Bengali ASR instead of default whisper."""
|
| 117 |
+
# Use original preprocessing for audio clipping/silence
|
| 118 |
+
from f5_tts.infer.utils_infer import (
|
| 119 |
+
_ref_audio_cache,
|
| 120 |
+
remove_silence_edges,
|
| 121 |
+
)
|
| 122 |
+
from pydub import AudioSegment, silence
|
| 123 |
+
import hashlib
|
| 124 |
+
|
| 125 |
+
show_info("Converting audio...")
|
| 126 |
+
|
| 127 |
+
with open(ref_audio, "rb") as f:
|
| 128 |
+
audio_hash = hashlib.md5(f.read()).hexdigest()
|
| 129 |
+
|
| 130 |
+
if audio_hash in _ref_audio_cache:
|
| 131 |
+
processed_audio = _ref_audio_cache[audio_hash]
|
| 132 |
+
else:
|
| 133 |
+
tempfile_kwargs = {"delete": False, "suffix": ".wav"}
|
| 134 |
+
with tempfile.NamedTemporaryFile(**tempfile_kwargs) as f:
|
| 135 |
+
temp_path = f.name
|
| 136 |
+
|
| 137 |
+
aseg = AudioSegment.from_file(ref_audio)
|
| 138 |
+
|
| 139 |
+
# Clip to 15s using silence detection
|
| 140 |
+
non_silent_segs = silence.split_on_silence(
|
| 141 |
+
aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
|
| 142 |
+
)
|
| 143 |
+
non_silent_wave = AudioSegment.silent(duration=0)
|
| 144 |
+
for seg in non_silent_segs:
|
| 145 |
+
if len(non_silent_wave) > 6000 and len(non_silent_wave + seg) > 15000:
|
| 146 |
+
show_info("Audio over 15s, clipping.")
|
| 147 |
+
break
|
| 148 |
+
non_silent_wave += seg
|
| 149 |
+
|
| 150 |
+
if len(non_silent_wave) > 15000:
|
| 151 |
+
non_silent_segs = silence.split_on_silence(
|
| 152 |
+
aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
|
| 153 |
+
)
|
| 154 |
+
non_silent_wave = AudioSegment.silent(duration=0)
|
| 155 |
+
for seg in non_silent_segs:
|
| 156 |
+
if len(non_silent_wave) > 6000 and len(non_silent_wave + seg) > 15000:
|
| 157 |
+
break
|
| 158 |
+
non_silent_wave += seg
|
| 159 |
+
|
| 160 |
+
aseg = non_silent_wave
|
| 161 |
+
if len(aseg) > 15000:
|
| 162 |
+
aseg = aseg[:15000]
|
| 163 |
+
show_info("Audio over 15s, hard clip.")
|
| 164 |
+
|
| 165 |
+
aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
|
| 166 |
+
aseg.export(temp_path, format="wav")
|
| 167 |
+
processed_audio = temp_path
|
| 168 |
+
_ref_audio_cache[audio_hash] = processed_audio
|
| 169 |
+
|
| 170 |
+
# Bengali transcription if no ref_text
|
| 171 |
+
if not ref_text.strip():
|
| 172 |
+
show_info("Transcribing with Bengali ASR...")
|
| 173 |
+
ref_text = transcribe_bengali(processed_audio)
|
| 174 |
+
|
| 175 |
+
# Ensure proper ending punctuation
|
| 176 |
+
if not ref_text.endswith(". ") and not ref_text.endswith("।"):
|
| 177 |
+
if ref_text.endswith("."):
|
| 178 |
+
ref_text += " "
|
| 179 |
+
else:
|
| 180 |
+
ref_text += "। "
|
| 181 |
+
|
| 182 |
+
print("ref_text:", ref_text)
|
| 183 |
+
return processed_audio, ref_text
|
| 184 |
+
|
@gpu_decorator
def generate_tts(ref_audio, gen_text, speed):
    """Gradio callback: synthesise Bengali speech in the reference voice.

    Args:
        ref_audio: filepath of the reference recording, or None.
        gen_text: Bengali text to synthesise.
        speed: speed factor forwarded to inference.

    Returns:
        ``((sample_rate, waveform), status_message)`` on success, or
        ``(None, error_message)`` on invalid input or failure.
    """
    # Guard clauses for missing inputs.
    if ref_audio is None:
        return None, "Please provide reference audio."
    if not gen_text.strip():
        return None, "Please enter text to generate."

    load_models()

    try:
        # Empty ref text forces auto-transcription via the Bengali ASR.
        ref_audio_processed, ref_text_processed = preprocess_ref_audio_text_bn(ref_audio, "")

        audio, sr, _ = infer_process(
            ref_audio_processed,
            ref_text_processed,
            gen_text,
            ema_model,
            vocoder,
            mel_spec_type="vocos",
            speed=speed,
            device=device,
        )
    except Exception as e:
        # Surface the failure in the UI instead of crashing the worker.
        return None, f"Error: {str(e)}"

    return (sr, audio), f"Generated with ref: '{ref_text_processed[:50]}...'"
+
|
| 217 |
+
# Gradio UI
|
| 218 |
+
with gr.Blocks(title="Bengali TTS") as demo:
|
| 219 |
+
gr.Markdown("# Bengali Text-to-Speech")
|
| 220 |
+
gr.Markdown("Upload or record Bengali audio (max 15s) as reference, then generate speech.")
|
| 221 |
+
|
| 222 |
+
with gr.Row():
|
| 223 |
+
with gr.Column():
|
| 224 |
+
ref_audio = gr.Audio(
|
| 225 |
+
label="Reference Audio (record or upload)",
|
| 226 |
+
type="filepath",
|
| 227 |
+
max_length=15,
|
| 228 |
+
)
|
| 229 |
+
gen_text = gr.Textbox(
|
| 230 |
+
label="Text to Generate (Bengali)",
|
| 231 |
+
placeholder="Enter Bengali text here...",
|
| 232 |
+
lines=3,
|
| 233 |
+
)
|
| 234 |
+
speed = gr.Slider(
|
| 235 |
+
minimum=0.5,
|
| 236 |
+
maximum=2.0,
|
| 237 |
+
value=1.0,
|
| 238 |
+
step=0.1,
|
| 239 |
+
label="Speed",
|
| 240 |
+
)
|
| 241 |
+
generate_btn = gr.Button("Generate", variant="primary")
|
| 242 |
+
|
| 243 |
+
with gr.Column():
|
| 244 |
+
output_audio = gr.Audio(label="Generated Audio", type="numpy")
|
| 245 |
+
status = gr.Textbox(label="Status", interactive=False)
|
| 246 |
+
|
| 247 |
+
generate_btn.click(
|
| 248 |
+
fn=generate_tts,
|
| 249 |
+
inputs=[ref_audio, gen_text, speed],
|
| 250 |
+
outputs=[output_audio, status],
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
if __name__ == "__main__":
|
| 254 |
+
demo.launch()
|
data/Bengali/vocab.txt
ADDED
|
@@ -0,0 +1,2545 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
!
|
| 3 |
+
"
|
| 4 |
+
#
|
| 5 |
+
$
|
| 6 |
+
%
|
| 7 |
+
&
|
| 8 |
+
'
|
| 9 |
+
(
|
| 10 |
+
)
|
| 11 |
+
*
|
| 12 |
+
+
|
| 13 |
+
,
|
| 14 |
+
-
|
| 15 |
+
.
|
| 16 |
+
/
|
| 17 |
+
0
|
| 18 |
+
1
|
| 19 |
+
2
|
| 20 |
+
3
|
| 21 |
+
4
|
| 22 |
+
5
|
| 23 |
+
6
|
| 24 |
+
7
|
| 25 |
+
8
|
| 26 |
+
9
|
| 27 |
+
:
|
| 28 |
+
;
|
| 29 |
+
=
|
| 30 |
+
>
|
| 31 |
+
?
|
| 32 |
+
@
|
| 33 |
+
A
|
| 34 |
+
B
|
| 35 |
+
C
|
| 36 |
+
D
|
| 37 |
+
E
|
| 38 |
+
F
|
| 39 |
+
G
|
| 40 |
+
H
|
| 41 |
+
I
|
| 42 |
+
J
|
| 43 |
+
K
|
| 44 |
+
L
|
| 45 |
+
M
|
| 46 |
+
N
|
| 47 |
+
O
|
| 48 |
+
P
|
| 49 |
+
Q
|
| 50 |
+
R
|
| 51 |
+
S
|
| 52 |
+
T
|
| 53 |
+
U
|
| 54 |
+
V
|
| 55 |
+
W
|
| 56 |
+
X
|
| 57 |
+
Y
|
| 58 |
+
Z
|
| 59 |
+
[
|
| 60 |
+
\
|
| 61 |
+
]
|
| 62 |
+
_
|
| 63 |
+
a
|
| 64 |
+
a1
|
| 65 |
+
ai1
|
| 66 |
+
ai2
|
| 67 |
+
ai3
|
| 68 |
+
ai4
|
| 69 |
+
an1
|
| 70 |
+
an3
|
| 71 |
+
an4
|
| 72 |
+
ang1
|
| 73 |
+
ang2
|
| 74 |
+
ang4
|
| 75 |
+
ao1
|
| 76 |
+
ao2
|
| 77 |
+
ao3
|
| 78 |
+
ao4
|
| 79 |
+
b
|
| 80 |
+
ba
|
| 81 |
+
ba1
|
| 82 |
+
ba2
|
| 83 |
+
ba3
|
| 84 |
+
ba4
|
| 85 |
+
bai1
|
| 86 |
+
bai2
|
| 87 |
+
bai3
|
| 88 |
+
bai4
|
| 89 |
+
ban1
|
| 90 |
+
ban2
|
| 91 |
+
ban3
|
| 92 |
+
ban4
|
| 93 |
+
bang1
|
| 94 |
+
bang2
|
| 95 |
+
bang3
|
| 96 |
+
bang4
|
| 97 |
+
bao1
|
| 98 |
+
bao2
|
| 99 |
+
bao3
|
| 100 |
+
bao4
|
| 101 |
+
bei
|
| 102 |
+
bei1
|
| 103 |
+
bei2
|
| 104 |
+
bei3
|
| 105 |
+
bei4
|
| 106 |
+
ben1
|
| 107 |
+
ben2
|
| 108 |
+
ben3
|
| 109 |
+
ben4
|
| 110 |
+
beng
|
| 111 |
+
beng1
|
| 112 |
+
beng2
|
| 113 |
+
beng3
|
| 114 |
+
beng4
|
| 115 |
+
bi1
|
| 116 |
+
bi2
|
| 117 |
+
bi3
|
| 118 |
+
bi4
|
| 119 |
+
bian1
|
| 120 |
+
bian2
|
| 121 |
+
bian3
|
| 122 |
+
bian4
|
| 123 |
+
biao1
|
| 124 |
+
biao2
|
| 125 |
+
biao3
|
| 126 |
+
bie1
|
| 127 |
+
bie2
|
| 128 |
+
bie3
|
| 129 |
+
bie4
|
| 130 |
+
bin1
|
| 131 |
+
bin4
|
| 132 |
+
bing1
|
| 133 |
+
bing2
|
| 134 |
+
bing3
|
| 135 |
+
bing4
|
| 136 |
+
bo
|
| 137 |
+
bo1
|
| 138 |
+
bo2
|
| 139 |
+
bo3
|
| 140 |
+
bo4
|
| 141 |
+
bu2
|
| 142 |
+
bu3
|
| 143 |
+
bu4
|
| 144 |
+
c
|
| 145 |
+
ca1
|
| 146 |
+
cai1
|
| 147 |
+
cai2
|
| 148 |
+
cai3
|
| 149 |
+
cai4
|
| 150 |
+
can1
|
| 151 |
+
can2
|
| 152 |
+
can3
|
| 153 |
+
can4
|
| 154 |
+
cang1
|
| 155 |
+
cang2
|
| 156 |
+
cao1
|
| 157 |
+
cao2
|
| 158 |
+
cao3
|
| 159 |
+
ce4
|
| 160 |
+
cen1
|
| 161 |
+
cen2
|
| 162 |
+
ceng1
|
| 163 |
+
ceng2
|
| 164 |
+
ceng4
|
| 165 |
+
cha1
|
| 166 |
+
cha2
|
| 167 |
+
cha3
|
| 168 |
+
cha4
|
| 169 |
+
chai1
|
| 170 |
+
chai2
|
| 171 |
+
chan1
|
| 172 |
+
chan2
|
| 173 |
+
chan3
|
| 174 |
+
chan4
|
| 175 |
+
chang1
|
| 176 |
+
chang2
|
| 177 |
+
chang3
|
| 178 |
+
chang4
|
| 179 |
+
chao1
|
| 180 |
+
chao2
|
| 181 |
+
chao3
|
| 182 |
+
che1
|
| 183 |
+
che2
|
| 184 |
+
che3
|
| 185 |
+
che4
|
| 186 |
+
chen1
|
| 187 |
+
chen2
|
| 188 |
+
chen3
|
| 189 |
+
chen4
|
| 190 |
+
cheng1
|
| 191 |
+
cheng2
|
| 192 |
+
cheng3
|
| 193 |
+
cheng4
|
| 194 |
+
chi1
|
| 195 |
+
chi2
|
| 196 |
+
chi3
|
| 197 |
+
chi4
|
| 198 |
+
chong1
|
| 199 |
+
chong2
|
| 200 |
+
chong3
|
| 201 |
+
chong4
|
| 202 |
+
chou1
|
| 203 |
+
chou2
|
| 204 |
+
chou3
|
| 205 |
+
chou4
|
| 206 |
+
chu1
|
| 207 |
+
chu2
|
| 208 |
+
chu3
|
| 209 |
+
chu4
|
| 210 |
+
chua1
|
| 211 |
+
chuai1
|
| 212 |
+
chuai2
|
| 213 |
+
chuai3
|
| 214 |
+
chuai4
|
| 215 |
+
chuan1
|
| 216 |
+
chuan2
|
| 217 |
+
chuan3
|
| 218 |
+
chuan4
|
| 219 |
+
chuang1
|
| 220 |
+
chuang2
|
| 221 |
+
chuang3
|
| 222 |
+
chuang4
|
| 223 |
+
chui1
|
| 224 |
+
chui2
|
| 225 |
+
chun1
|
| 226 |
+
chun2
|
| 227 |
+
chun3
|
| 228 |
+
chuo1
|
| 229 |
+
chuo4
|
| 230 |
+
ci1
|
| 231 |
+
ci2
|
| 232 |
+
ci3
|
| 233 |
+
ci4
|
| 234 |
+
cong1
|
| 235 |
+
cong2
|
| 236 |
+
cou4
|
| 237 |
+
cu1
|
| 238 |
+
cu4
|
| 239 |
+
cuan1
|
| 240 |
+
cuan2
|
| 241 |
+
cuan4
|
| 242 |
+
cui1
|
| 243 |
+
cui3
|
| 244 |
+
cui4
|
| 245 |
+
cun1
|
| 246 |
+
cun2
|
| 247 |
+
cun4
|
| 248 |
+
cuo1
|
| 249 |
+
cuo2
|
| 250 |
+
cuo4
|
| 251 |
+
d
|
| 252 |
+
da
|
| 253 |
+
da1
|
| 254 |
+
da2
|
| 255 |
+
da3
|
| 256 |
+
da4
|
| 257 |
+
dai1
|
| 258 |
+
dai2
|
| 259 |
+
dai3
|
| 260 |
+
dai4
|
| 261 |
+
dan1
|
| 262 |
+
dan2
|
| 263 |
+
dan3
|
| 264 |
+
dan4
|
| 265 |
+
dang1
|
| 266 |
+
dang2
|
| 267 |
+
dang3
|
| 268 |
+
dang4
|
| 269 |
+
dao1
|
| 270 |
+
dao2
|
| 271 |
+
dao3
|
| 272 |
+
dao4
|
| 273 |
+
de
|
| 274 |
+
de1
|
| 275 |
+
de2
|
| 276 |
+
dei3
|
| 277 |
+
den4
|
| 278 |
+
deng1
|
| 279 |
+
deng2
|
| 280 |
+
deng3
|
| 281 |
+
deng4
|
| 282 |
+
di1
|
| 283 |
+
di2
|
| 284 |
+
di3
|
| 285 |
+
di4
|
| 286 |
+
dia3
|
| 287 |
+
dian1
|
| 288 |
+
dian2
|
| 289 |
+
dian3
|
| 290 |
+
dian4
|
| 291 |
+
diao1
|
| 292 |
+
diao3
|
| 293 |
+
diao4
|
| 294 |
+
die1
|
| 295 |
+
die2
|
| 296 |
+
die4
|
| 297 |
+
ding1
|
| 298 |
+
ding2
|
| 299 |
+
ding3
|
| 300 |
+
ding4
|
| 301 |
+
diu1
|
| 302 |
+
dong1
|
| 303 |
+
dong3
|
| 304 |
+
dong4
|
| 305 |
+
dou1
|
| 306 |
+
dou2
|
| 307 |
+
dou3
|
| 308 |
+
dou4
|
| 309 |
+
du1
|
| 310 |
+
du2
|
| 311 |
+
du3
|
| 312 |
+
du4
|
| 313 |
+
duan1
|
| 314 |
+
duan2
|
| 315 |
+
duan3
|
| 316 |
+
duan4
|
| 317 |
+
dui1
|
| 318 |
+
dui4
|
| 319 |
+
dun1
|
| 320 |
+
dun3
|
| 321 |
+
dun4
|
| 322 |
+
duo1
|
| 323 |
+
duo2
|
| 324 |
+
duo3
|
| 325 |
+
duo4
|
| 326 |
+
e
|
| 327 |
+
e1
|
| 328 |
+
e2
|
| 329 |
+
e3
|
| 330 |
+
e4
|
| 331 |
+
ei2
|
| 332 |
+
en1
|
| 333 |
+
en4
|
| 334 |
+
er
|
| 335 |
+
er2
|
| 336 |
+
er3
|
| 337 |
+
er4
|
| 338 |
+
f
|
| 339 |
+
fa1
|
| 340 |
+
fa2
|
| 341 |
+
fa3
|
| 342 |
+
fa4
|
| 343 |
+
fan1
|
| 344 |
+
fan2
|
| 345 |
+
fan3
|
| 346 |
+
fan4
|
| 347 |
+
fang1
|
| 348 |
+
fang2
|
| 349 |
+
fang3
|
| 350 |
+
fang4
|
| 351 |
+
fei1
|
| 352 |
+
fei2
|
| 353 |
+
fei3
|
| 354 |
+
fei4
|
| 355 |
+
fen1
|
| 356 |
+
fen2
|
| 357 |
+
fen3
|
| 358 |
+
fen4
|
| 359 |
+
feng1
|
| 360 |
+
feng2
|
| 361 |
+
feng3
|
| 362 |
+
feng4
|
| 363 |
+
fo2
|
| 364 |
+
fou2
|
| 365 |
+
fou3
|
| 366 |
+
fu1
|
| 367 |
+
fu2
|
| 368 |
+
fu3
|
| 369 |
+
fu4
|
| 370 |
+
g
|
| 371 |
+
ga1
|
| 372 |
+
ga2
|
| 373 |
+
ga3
|
| 374 |
+
ga4
|
| 375 |
+
gai1
|
| 376 |
+
gai2
|
| 377 |
+
gai3
|
| 378 |
+
gai4
|
| 379 |
+
gan1
|
| 380 |
+
gan2
|
| 381 |
+
gan3
|
| 382 |
+
gan4
|
| 383 |
+
gang1
|
| 384 |
+
gang2
|
| 385 |
+
gang3
|
| 386 |
+
gang4
|
| 387 |
+
gao1
|
| 388 |
+
gao2
|
| 389 |
+
gao3
|
| 390 |
+
gao4
|
| 391 |
+
ge1
|
| 392 |
+
ge2
|
| 393 |
+
ge3
|
| 394 |
+
ge4
|
| 395 |
+
gei2
|
| 396 |
+
gei3
|
| 397 |
+
gen1
|
| 398 |
+
gen2
|
| 399 |
+
gen3
|
| 400 |
+
gen4
|
| 401 |
+
geng1
|
| 402 |
+
geng3
|
| 403 |
+
geng4
|
| 404 |
+
gong1
|
| 405 |
+
gong3
|
| 406 |
+
gong4
|
| 407 |
+
gou1
|
| 408 |
+
gou2
|
| 409 |
+
gou3
|
| 410 |
+
gou4
|
| 411 |
+
gu
|
| 412 |
+
gu1
|
| 413 |
+
gu2
|
| 414 |
+
gu3
|
| 415 |
+
gu4
|
| 416 |
+
gua1
|
| 417 |
+
gua2
|
| 418 |
+
gua3
|
| 419 |
+
gua4
|
| 420 |
+
guai1
|
| 421 |
+
guai2
|
| 422 |
+
guai3
|
| 423 |
+
guai4
|
| 424 |
+
guan1
|
| 425 |
+
guan2
|
| 426 |
+
guan3
|
| 427 |
+
guan4
|
| 428 |
+
guang1
|
| 429 |
+
guang2
|
| 430 |
+
guang3
|
| 431 |
+
guang4
|
| 432 |
+
gui1
|
| 433 |
+
gui2
|
| 434 |
+
gui3
|
| 435 |
+
gui4
|
| 436 |
+
gun3
|
| 437 |
+
gun4
|
| 438 |
+
guo1
|
| 439 |
+
guo2
|
| 440 |
+
guo3
|
| 441 |
+
guo4
|
| 442 |
+
h
|
| 443 |
+
ha1
|
| 444 |
+
ha2
|
| 445 |
+
ha3
|
| 446 |
+
hai1
|
| 447 |
+
hai2
|
| 448 |
+
hai3
|
| 449 |
+
hai4
|
| 450 |
+
han1
|
| 451 |
+
han2
|
| 452 |
+
han3
|
| 453 |
+
han4
|
| 454 |
+
hang1
|
| 455 |
+
hang2
|
| 456 |
+
hang4
|
| 457 |
+
hao1
|
| 458 |
+
hao2
|
| 459 |
+
hao3
|
| 460 |
+
hao4
|
| 461 |
+
he1
|
| 462 |
+
he2
|
| 463 |
+
he4
|
| 464 |
+
hei1
|
| 465 |
+
hen2
|
| 466 |
+
hen3
|
| 467 |
+
hen4
|
| 468 |
+
heng1
|
| 469 |
+
heng2
|
| 470 |
+
heng4
|
| 471 |
+
hong1
|
| 472 |
+
hong2
|
| 473 |
+
hong3
|
| 474 |
+
hong4
|
| 475 |
+
hou1
|
| 476 |
+
hou2
|
| 477 |
+
hou3
|
| 478 |
+
hou4
|
| 479 |
+
hu1
|
| 480 |
+
hu2
|
| 481 |
+
hu3
|
| 482 |
+
hu4
|
| 483 |
+
hua1
|
| 484 |
+
hua2
|
| 485 |
+
hua4
|
| 486 |
+
huai2
|
| 487 |
+
huai4
|
| 488 |
+
huan1
|
| 489 |
+
huan2
|
| 490 |
+
huan3
|
| 491 |
+
huan4
|
| 492 |
+
huang1
|
| 493 |
+
huang2
|
| 494 |
+
huang3
|
| 495 |
+
huang4
|
| 496 |
+
hui1
|
| 497 |
+
hui2
|
| 498 |
+
hui3
|
| 499 |
+
hui4
|
| 500 |
+
hun1
|
| 501 |
+
hun2
|
| 502 |
+
hun4
|
| 503 |
+
huo
|
| 504 |
+
huo1
|
| 505 |
+
huo2
|
| 506 |
+
huo3
|
| 507 |
+
huo4
|
| 508 |
+
i
|
| 509 |
+
j
|
| 510 |
+
ji1
|
| 511 |
+
ji2
|
| 512 |
+
ji3
|
| 513 |
+
ji4
|
| 514 |
+
jia
|
| 515 |
+
jia1
|
| 516 |
+
jia2
|
| 517 |
+
jia3
|
| 518 |
+
jia4
|
| 519 |
+
jian1
|
| 520 |
+
jian2
|
| 521 |
+
jian3
|
| 522 |
+
jian4
|
| 523 |
+
jiang1
|
| 524 |
+
jiang2
|
| 525 |
+
jiang3
|
| 526 |
+
jiang4
|
| 527 |
+
jiao1
|
| 528 |
+
jiao2
|
| 529 |
+
jiao3
|
| 530 |
+
jiao4
|
| 531 |
+
jie1
|
| 532 |
+
jie2
|
| 533 |
+
jie3
|
| 534 |
+
jie4
|
| 535 |
+
jin1
|
| 536 |
+
jin2
|
| 537 |
+
jin3
|
| 538 |
+
jin4
|
| 539 |
+
jing1
|
| 540 |
+
jing2
|
| 541 |
+
jing3
|
| 542 |
+
jing4
|
| 543 |
+
jiong3
|
| 544 |
+
jiu1
|
| 545 |
+
jiu2
|
| 546 |
+
jiu3
|
| 547 |
+
jiu4
|
| 548 |
+
ju1
|
| 549 |
+
ju2
|
| 550 |
+
ju3
|
| 551 |
+
ju4
|
| 552 |
+
juan1
|
| 553 |
+
juan2
|
| 554 |
+
juan3
|
| 555 |
+
juan4
|
| 556 |
+
jue1
|
| 557 |
+
jue2
|
| 558 |
+
jue4
|
| 559 |
+
jun1
|
| 560 |
+
jun4
|
| 561 |
+
k
|
| 562 |
+
ka1
|
| 563 |
+
ka2
|
| 564 |
+
ka3
|
| 565 |
+
kai1
|
| 566 |
+
kai2
|
| 567 |
+
kai3
|
| 568 |
+
kai4
|
| 569 |
+
kan1
|
| 570 |
+
kan2
|
| 571 |
+
kan3
|
| 572 |
+
kan4
|
| 573 |
+
kang1
|
| 574 |
+
kang2
|
| 575 |
+
kang4
|
| 576 |
+
kao1
|
| 577 |
+
kao2
|
| 578 |
+
kao3
|
| 579 |
+
kao4
|
| 580 |
+
ke1
|
| 581 |
+
ke2
|
| 582 |
+
ke3
|
| 583 |
+
ke4
|
| 584 |
+
ken3
|
| 585 |
+
keng1
|
| 586 |
+
kong1
|
| 587 |
+
kong3
|
| 588 |
+
kong4
|
| 589 |
+
kou1
|
| 590 |
+
kou2
|
| 591 |
+
kou3
|
| 592 |
+
kou4
|
| 593 |
+
ku1
|
| 594 |
+
ku2
|
| 595 |
+
ku3
|
| 596 |
+
ku4
|
| 597 |
+
kua1
|
| 598 |
+
kua3
|
| 599 |
+
kua4
|
| 600 |
+
kuai3
|
| 601 |
+
kuai4
|
| 602 |
+
kuan1
|
| 603 |
+
kuan2
|
| 604 |
+
kuan3
|
| 605 |
+
kuang1
|
| 606 |
+
kuang2
|
| 607 |
+
kuang4
|
| 608 |
+
kui1
|
| 609 |
+
kui2
|
| 610 |
+
kui3
|
| 611 |
+
kui4
|
| 612 |
+
kun1
|
| 613 |
+
kun3
|
| 614 |
+
kun4
|
| 615 |
+
kuo4
|
| 616 |
+
l
|
| 617 |
+
la
|
| 618 |
+
la1
|
| 619 |
+
la2
|
| 620 |
+
la3
|
| 621 |
+
la4
|
| 622 |
+
lai2
|
| 623 |
+
lai4
|
| 624 |
+
lan2
|
| 625 |
+
lan3
|
| 626 |
+
lan4
|
| 627 |
+
lang1
|
| 628 |
+
lang2
|
| 629 |
+
lang3
|
| 630 |
+
lang4
|
| 631 |
+
lao1
|
| 632 |
+
lao2
|
| 633 |
+
lao3
|
| 634 |
+
lao4
|
| 635 |
+
le
|
| 636 |
+
le1
|
| 637 |
+
le4
|
| 638 |
+
lei
|
| 639 |
+
lei1
|
| 640 |
+
lei2
|
| 641 |
+
lei3
|
| 642 |
+
lei4
|
| 643 |
+
leng1
|
| 644 |
+
leng2
|
| 645 |
+
leng3
|
| 646 |
+
leng4
|
| 647 |
+
li
|
| 648 |
+
li1
|
| 649 |
+
li2
|
| 650 |
+
li3
|
| 651 |
+
li4
|
| 652 |
+
lia3
|
| 653 |
+
lian2
|
| 654 |
+
lian3
|
| 655 |
+
lian4
|
| 656 |
+
liang2
|
| 657 |
+
liang3
|
| 658 |
+
liang4
|
| 659 |
+
liao1
|
| 660 |
+
liao2
|
| 661 |
+
liao3
|
| 662 |
+
liao4
|
| 663 |
+
lie1
|
| 664 |
+
lie2
|
| 665 |
+
lie3
|
| 666 |
+
lie4
|
| 667 |
+
lin1
|
| 668 |
+
lin2
|
| 669 |
+
lin3
|
| 670 |
+
lin4
|
| 671 |
+
ling2
|
| 672 |
+
ling3
|
| 673 |
+
ling4
|
| 674 |
+
liu1
|
| 675 |
+
liu2
|
| 676 |
+
liu3
|
| 677 |
+
liu4
|
| 678 |
+
long1
|
| 679 |
+
long2
|
| 680 |
+
long3
|
| 681 |
+
long4
|
| 682 |
+
lou1
|
| 683 |
+
lou2
|
| 684 |
+
lou3
|
| 685 |
+
lou4
|
| 686 |
+
lu1
|
| 687 |
+
lu2
|
| 688 |
+
lu3
|
| 689 |
+
lu4
|
| 690 |
+
luan2
|
| 691 |
+
luan3
|
| 692 |
+
luan4
|
| 693 |
+
lun1
|
| 694 |
+
lun2
|
| 695 |
+
lun4
|
| 696 |
+
luo1
|
| 697 |
+
luo2
|
| 698 |
+
luo3
|
| 699 |
+
luo4
|
| 700 |
+
lv2
|
| 701 |
+
lv3
|
| 702 |
+
lv4
|
| 703 |
+
lve3
|
| 704 |
+
lve4
|
| 705 |
+
m
|
| 706 |
+
ma
|
| 707 |
+
ma1
|
| 708 |
+
ma2
|
| 709 |
+
ma3
|
| 710 |
+
ma4
|
| 711 |
+
mai2
|
| 712 |
+
mai3
|
| 713 |
+
mai4
|
| 714 |
+
man1
|
| 715 |
+
man2
|
| 716 |
+
man3
|
| 717 |
+
man4
|
| 718 |
+
mang2
|
| 719 |
+
mang3
|
| 720 |
+
mao1
|
| 721 |
+
mao2
|
| 722 |
+
mao3
|
| 723 |
+
mao4
|
| 724 |
+
me
|
| 725 |
+
mei2
|
| 726 |
+
mei3
|
| 727 |
+
mei4
|
| 728 |
+
men
|
| 729 |
+
men1
|
| 730 |
+
men2
|
| 731 |
+
men4
|
| 732 |
+
meng
|
| 733 |
+
meng1
|
| 734 |
+
meng2
|
| 735 |
+
meng3
|
| 736 |
+
meng4
|
| 737 |
+
mi1
|
| 738 |
+
mi2
|
| 739 |
+
mi3
|
| 740 |
+
mi4
|
| 741 |
+
mian2
|
| 742 |
+
mian3
|
| 743 |
+
mian4
|
| 744 |
+
miao1
|
| 745 |
+
miao2
|
| 746 |
+
miao3
|
| 747 |
+
miao4
|
| 748 |
+
mie1
|
| 749 |
+
mie4
|
| 750 |
+
min2
|
| 751 |
+
min3
|
| 752 |
+
ming2
|
| 753 |
+
ming3
|
| 754 |
+
ming4
|
| 755 |
+
miu4
|
| 756 |
+
mo1
|
| 757 |
+
mo2
|
| 758 |
+
mo3
|
| 759 |
+
mo4
|
| 760 |
+
mou1
|
| 761 |
+
mou2
|
| 762 |
+
mou3
|
| 763 |
+
mu2
|
| 764 |
+
mu3
|
| 765 |
+
mu4
|
| 766 |
+
n
|
| 767 |
+
n2
|
| 768 |
+
na1
|
| 769 |
+
na2
|
| 770 |
+
na3
|
| 771 |
+
na4
|
| 772 |
+
nai2
|
| 773 |
+
nai3
|
| 774 |
+
nai4
|
| 775 |
+
nan1
|
| 776 |
+
nan2
|
| 777 |
+
nan3
|
| 778 |
+
nan4
|
| 779 |
+
nang1
|
| 780 |
+
nang2
|
| 781 |
+
nang3
|
| 782 |
+
nao1
|
| 783 |
+
nao2
|
| 784 |
+
nao3
|
| 785 |
+
nao4
|
| 786 |
+
ne
|
| 787 |
+
ne2
|
| 788 |
+
ne4
|
| 789 |
+
nei3
|
| 790 |
+
nei4
|
| 791 |
+
nen4
|
| 792 |
+
neng2
|
| 793 |
+
ni1
|
| 794 |
+
ni2
|
| 795 |
+
ni3
|
| 796 |
+
ni4
|
| 797 |
+
nian1
|
| 798 |
+
nian2
|
| 799 |
+
nian3
|
| 800 |
+
nian4
|
| 801 |
+
niang2
|
| 802 |
+
niang4
|
| 803 |
+
niao2
|
| 804 |
+
niao3
|
| 805 |
+
niao4
|
| 806 |
+
nie1
|
| 807 |
+
nie4
|
| 808 |
+
nin2
|
| 809 |
+
ning2
|
| 810 |
+
ning3
|
| 811 |
+
ning4
|
| 812 |
+
niu1
|
| 813 |
+
niu2
|
| 814 |
+
niu3
|
| 815 |
+
niu4
|
| 816 |
+
nong2
|
| 817 |
+
nong4
|
| 818 |
+
nou4
|
| 819 |
+
nu2
|
| 820 |
+
nu3
|
| 821 |
+
nu4
|
| 822 |
+
nuan3
|
| 823 |
+
nuo2
|
| 824 |
+
nuo4
|
| 825 |
+
nv2
|
| 826 |
+
nv3
|
| 827 |
+
nve4
|
| 828 |
+
o
|
| 829 |
+
o1
|
| 830 |
+
o2
|
| 831 |
+
ou1
|
| 832 |
+
ou2
|
| 833 |
+
ou3
|
| 834 |
+
ou4
|
| 835 |
+
p
|
| 836 |
+
pa1
|
| 837 |
+
pa2
|
| 838 |
+
pa4
|
| 839 |
+
pai1
|
| 840 |
+
pai2
|
| 841 |
+
pai3
|
| 842 |
+
pai4
|
| 843 |
+
pan1
|
| 844 |
+
pan2
|
| 845 |
+
pan4
|
| 846 |
+
pang1
|
| 847 |
+
pang2
|
| 848 |
+
pang4
|
| 849 |
+
pao1
|
| 850 |
+
pao2
|
| 851 |
+
pao3
|
| 852 |
+
pao4
|
| 853 |
+
pei1
|
| 854 |
+
pei2
|
| 855 |
+
pei4
|
| 856 |
+
pen1
|
| 857 |
+
pen2
|
| 858 |
+
pen4
|
| 859 |
+
peng1
|
| 860 |
+
peng2
|
| 861 |
+
peng3
|
| 862 |
+
peng4
|
| 863 |
+
pi1
|
| 864 |
+
pi2
|
| 865 |
+
pi3
|
| 866 |
+
pi4
|
| 867 |
+
pian1
|
| 868 |
+
pian2
|
| 869 |
+
pian4
|
| 870 |
+
piao1
|
| 871 |
+
piao2
|
| 872 |
+
piao3
|
| 873 |
+
piao4
|
| 874 |
+
pie1
|
| 875 |
+
pie2
|
| 876 |
+
pie3
|
| 877 |
+
pin1
|
| 878 |
+
pin2
|
| 879 |
+
pin3
|
| 880 |
+
pin4
|
| 881 |
+
ping1
|
| 882 |
+
ping2
|
| 883 |
+
po1
|
| 884 |
+
po2
|
| 885 |
+
po3
|
| 886 |
+
po4
|
| 887 |
+
pou1
|
| 888 |
+
pu1
|
| 889 |
+
pu2
|
| 890 |
+
pu3
|
| 891 |
+
pu4
|
| 892 |
+
q
|
| 893 |
+
qi1
|
| 894 |
+
qi2
|
| 895 |
+
qi3
|
| 896 |
+
qi4
|
| 897 |
+
qia1
|
| 898 |
+
qia3
|
| 899 |
+
qia4
|
| 900 |
+
qian1
|
| 901 |
+
qian2
|
| 902 |
+
qian3
|
| 903 |
+
qian4
|
| 904 |
+
qiang1
|
| 905 |
+
qiang2
|
| 906 |
+
qiang3
|
| 907 |
+
qiang4
|
| 908 |
+
qiao1
|
| 909 |
+
qiao2
|
| 910 |
+
qiao3
|
| 911 |
+
qiao4
|
| 912 |
+
qie1
|
| 913 |
+
qie2
|
| 914 |
+
qie3
|
| 915 |
+
qie4
|
| 916 |
+
qin1
|
| 917 |
+
qin2
|
| 918 |
+
qin3
|
| 919 |
+
qin4
|
| 920 |
+
qing1
|
| 921 |
+
qing2
|
| 922 |
+
qing3
|
| 923 |
+
qing4
|
| 924 |
+
qiong1
|
| 925 |
+
qiong2
|
| 926 |
+
qiu1
|
| 927 |
+
qiu2
|
| 928 |
+
qiu3
|
| 929 |
+
qu1
|
| 930 |
+
qu2
|
| 931 |
+
qu3
|
| 932 |
+
qu4
|
| 933 |
+
quan1
|
| 934 |
+
quan2
|
| 935 |
+
quan3
|
| 936 |
+
quan4
|
| 937 |
+
que1
|
| 938 |
+
que2
|
| 939 |
+
que4
|
| 940 |
+
qun2
|
| 941 |
+
r
|
| 942 |
+
ran2
|
| 943 |
+
ran3
|
| 944 |
+
rang1
|
| 945 |
+
rang2
|
| 946 |
+
rang3
|
| 947 |
+
rang4
|
| 948 |
+
rao2
|
| 949 |
+
rao3
|
| 950 |
+
rao4
|
| 951 |
+
re2
|
| 952 |
+
re3
|
| 953 |
+
re4
|
| 954 |
+
ren2
|
| 955 |
+
ren3
|
| 956 |
+
ren4
|
| 957 |
+
reng1
|
| 958 |
+
reng2
|
| 959 |
+
ri4
|
| 960 |
+
rong1
|
| 961 |
+
rong2
|
| 962 |
+
rong3
|
| 963 |
+
rou2
|
| 964 |
+
rou4
|
| 965 |
+
ru2
|
| 966 |
+
ru3
|
| 967 |
+
ru4
|
| 968 |
+
ruan2
|
| 969 |
+
ruan3
|
| 970 |
+
rui3
|
| 971 |
+
rui4
|
| 972 |
+
run4
|
| 973 |
+
ruo4
|
| 974 |
+
s
|
| 975 |
+
sa1
|
| 976 |
+
sa2
|
| 977 |
+
sa3
|
| 978 |
+
sa4
|
| 979 |
+
sai1
|
| 980 |
+
sai4
|
| 981 |
+
san1
|
| 982 |
+
san2
|
| 983 |
+
san3
|
| 984 |
+
san4
|
| 985 |
+
sang1
|
| 986 |
+
sang3
|
| 987 |
+
sang4
|
| 988 |
+
sao1
|
| 989 |
+
sao2
|
| 990 |
+
sao3
|
| 991 |
+
sao4
|
| 992 |
+
se4
|
| 993 |
+
sen1
|
| 994 |
+
seng1
|
| 995 |
+
sha1
|
| 996 |
+
sha2
|
| 997 |
+
sha3
|
| 998 |
+
sha4
|
| 999 |
+
shai1
|
| 1000 |
+
shai2
|
| 1001 |
+
shai3
|
| 1002 |
+
shai4
|
| 1003 |
+
shan1
|
| 1004 |
+
shan3
|
| 1005 |
+
shan4
|
| 1006 |
+
shang
|
| 1007 |
+
shang1
|
| 1008 |
+
shang3
|
| 1009 |
+
shang4
|
| 1010 |
+
shao1
|
| 1011 |
+
shao2
|
| 1012 |
+
shao3
|
| 1013 |
+
shao4
|
| 1014 |
+
she1
|
| 1015 |
+
she2
|
| 1016 |
+
she3
|
| 1017 |
+
she4
|
| 1018 |
+
shei2
|
| 1019 |
+
shen1
|
| 1020 |
+
shen2
|
| 1021 |
+
shen3
|
| 1022 |
+
shen4
|
| 1023 |
+
sheng1
|
| 1024 |
+
sheng2
|
| 1025 |
+
sheng3
|
| 1026 |
+
sheng4
|
| 1027 |
+
shi
|
| 1028 |
+
shi1
|
| 1029 |
+
shi2
|
| 1030 |
+
shi3
|
| 1031 |
+
shi4
|
| 1032 |
+
shou1
|
| 1033 |
+
shou2
|
| 1034 |
+
shou3
|
| 1035 |
+
shou4
|
| 1036 |
+
shu1
|
| 1037 |
+
shu2
|
| 1038 |
+
shu3
|
| 1039 |
+
shu4
|
| 1040 |
+
shua1
|
| 1041 |
+
shua2
|
| 1042 |
+
shua3
|
| 1043 |
+
shua4
|
| 1044 |
+
shuai1
|
| 1045 |
+
shuai3
|
| 1046 |
+
shuai4
|
| 1047 |
+
shuan1
|
| 1048 |
+
shuan4
|
| 1049 |
+
shuang1
|
| 1050 |
+
shuang3
|
| 1051 |
+
shui2
|
| 1052 |
+
shui3
|
| 1053 |
+
shui4
|
| 1054 |
+
shun3
|
| 1055 |
+
shun4
|
| 1056 |
+
shuo1
|
| 1057 |
+
shuo4
|
| 1058 |
+
si1
|
| 1059 |
+
si2
|
| 1060 |
+
si3
|
| 1061 |
+
si4
|
| 1062 |
+
song1
|
| 1063 |
+
song3
|
| 1064 |
+
song4
|
| 1065 |
+
sou1
|
| 1066 |
+
sou3
|
| 1067 |
+
sou4
|
| 1068 |
+
su1
|
| 1069 |
+
su2
|
| 1070 |
+
su4
|
| 1071 |
+
suan1
|
| 1072 |
+
suan4
|
| 1073 |
+
sui1
|
| 1074 |
+
sui2
|
| 1075 |
+
sui3
|
| 1076 |
+
sui4
|
| 1077 |
+
sun1
|
| 1078 |
+
sun3
|
| 1079 |
+
suo
|
| 1080 |
+
suo1
|
| 1081 |
+
suo2
|
| 1082 |
+
suo3
|
| 1083 |
+
t
|
| 1084 |
+
ta1
|
| 1085 |
+
ta2
|
| 1086 |
+
ta3
|
| 1087 |
+
ta4
|
| 1088 |
+
tai1
|
| 1089 |
+
tai2
|
| 1090 |
+
tai4
|
| 1091 |
+
tan1
|
| 1092 |
+
tan2
|
| 1093 |
+
tan3
|
| 1094 |
+
tan4
|
| 1095 |
+
tang1
|
| 1096 |
+
tang2
|
| 1097 |
+
tang3
|
| 1098 |
+
tang4
|
| 1099 |
+
tao1
|
| 1100 |
+
tao2
|
| 1101 |
+
tao3
|
| 1102 |
+
tao4
|
| 1103 |
+
te4
|
| 1104 |
+
teng2
|
| 1105 |
+
ti1
|
| 1106 |
+
ti2
|
| 1107 |
+
ti3
|
| 1108 |
+
ti4
|
| 1109 |
+
tian1
|
| 1110 |
+
tian2
|
| 1111 |
+
tian3
|
| 1112 |
+
tiao1
|
| 1113 |
+
tiao2
|
| 1114 |
+
tiao3
|
| 1115 |
+
tiao4
|
| 1116 |
+
tie1
|
| 1117 |
+
tie2
|
| 1118 |
+
tie3
|
| 1119 |
+
tie4
|
| 1120 |
+
ting1
|
| 1121 |
+
ting2
|
| 1122 |
+
ting3
|
| 1123 |
+
tong1
|
| 1124 |
+
tong2
|
| 1125 |
+
tong3
|
| 1126 |
+
tong4
|
| 1127 |
+
tou
|
| 1128 |
+
tou1
|
| 1129 |
+
tou2
|
| 1130 |
+
tou4
|
| 1131 |
+
tu1
|
| 1132 |
+
tu2
|
| 1133 |
+
tu3
|
| 1134 |
+
tu4
|
| 1135 |
+
tuan1
|
| 1136 |
+
tuan2
|
| 1137 |
+
tui1
|
| 1138 |
+
tui2
|
| 1139 |
+
tui3
|
| 1140 |
+
tui4
|
| 1141 |
+
tun1
|
| 1142 |
+
tun2
|
| 1143 |
+
tun4
|
| 1144 |
+
tuo1
|
| 1145 |
+
tuo2
|
| 1146 |
+
tuo3
|
| 1147 |
+
tuo4
|
| 1148 |
+
u
|
| 1149 |
+
v
|
| 1150 |
+
w
|
| 1151 |
+
wa
|
| 1152 |
+
wa1
|
| 1153 |
+
wa2
|
| 1154 |
+
wa3
|
| 1155 |
+
wa4
|
| 1156 |
+
wai1
|
| 1157 |
+
wai3
|
| 1158 |
+
wai4
|
| 1159 |
+
wan1
|
| 1160 |
+
wan2
|
| 1161 |
+
wan3
|
| 1162 |
+
wan4
|
| 1163 |
+
wang1
|
| 1164 |
+
wang2
|
| 1165 |
+
wang3
|
| 1166 |
+
wang4
|
| 1167 |
+
wei1
|
| 1168 |
+
wei2
|
| 1169 |
+
wei3
|
| 1170 |
+
wei4
|
| 1171 |
+
wen1
|
| 1172 |
+
wen2
|
| 1173 |
+
wen3
|
| 1174 |
+
wen4
|
| 1175 |
+
weng1
|
| 1176 |
+
weng4
|
| 1177 |
+
wo1
|
| 1178 |
+
wo2
|
| 1179 |
+
wo3
|
| 1180 |
+
wo4
|
| 1181 |
+
wu1
|
| 1182 |
+
wu2
|
| 1183 |
+
wu3
|
| 1184 |
+
wu4
|
| 1185 |
+
x
|
| 1186 |
+
xi1
|
| 1187 |
+
xi2
|
| 1188 |
+
xi3
|
| 1189 |
+
xi4
|
| 1190 |
+
xia1
|
| 1191 |
+
xia2
|
| 1192 |
+
xia4
|
| 1193 |
+
xian1
|
| 1194 |
+
xian2
|
| 1195 |
+
xian3
|
| 1196 |
+
xian4
|
| 1197 |
+
xiang1
|
| 1198 |
+
xiang2
|
| 1199 |
+
xiang3
|
| 1200 |
+
xiang4
|
| 1201 |
+
xiao1
|
| 1202 |
+
xiao2
|
| 1203 |
+
xiao3
|
| 1204 |
+
xiao4
|
| 1205 |
+
xie1
|
| 1206 |
+
xie2
|
| 1207 |
+
xie3
|
| 1208 |
+
xie4
|
| 1209 |
+
xin1
|
| 1210 |
+
xin2
|
| 1211 |
+
xin4
|
| 1212 |
+
xing1
|
| 1213 |
+
xing2
|
| 1214 |
+
xing3
|
| 1215 |
+
xing4
|
| 1216 |
+
xiong1
|
| 1217 |
+
xiong2
|
| 1218 |
+
xiu1
|
| 1219 |
+
xiu3
|
| 1220 |
+
xiu4
|
| 1221 |
+
xu
|
| 1222 |
+
xu1
|
| 1223 |
+
xu2
|
| 1224 |
+
xu3
|
| 1225 |
+
xu4
|
| 1226 |
+
xuan1
|
| 1227 |
+
xuan2
|
| 1228 |
+
xuan3
|
| 1229 |
+
xuan4
|
| 1230 |
+
xue1
|
| 1231 |
+
xue2
|
| 1232 |
+
xue3
|
| 1233 |
+
xue4
|
| 1234 |
+
xun1
|
| 1235 |
+
xun2
|
| 1236 |
+
xun4
|
| 1237 |
+
y
|
| 1238 |
+
ya
|
| 1239 |
+
ya1
|
| 1240 |
+
ya2
|
| 1241 |
+
ya3
|
| 1242 |
+
ya4
|
| 1243 |
+
yan1
|
| 1244 |
+
yan2
|
| 1245 |
+
yan3
|
| 1246 |
+
yan4
|
| 1247 |
+
yang1
|
| 1248 |
+
yang2
|
| 1249 |
+
yang3
|
| 1250 |
+
yang4
|
| 1251 |
+
yao1
|
| 1252 |
+
yao2
|
| 1253 |
+
yao3
|
| 1254 |
+
yao4
|
| 1255 |
+
ye1
|
| 1256 |
+
ye2
|
| 1257 |
+
ye3
|
| 1258 |
+
ye4
|
| 1259 |
+
yi
|
| 1260 |
+
yi1
|
| 1261 |
+
yi2
|
| 1262 |
+
yi3
|
| 1263 |
+
yi4
|
| 1264 |
+
yin1
|
| 1265 |
+
yin2
|
| 1266 |
+
yin3
|
| 1267 |
+
yin4
|
| 1268 |
+
ying1
|
| 1269 |
+
ying2
|
| 1270 |
+
ying3
|
| 1271 |
+
ying4
|
| 1272 |
+
yo1
|
| 1273 |
+
yong1
|
| 1274 |
+
yong2
|
| 1275 |
+
yong3
|
| 1276 |
+
yong4
|
| 1277 |
+
you1
|
| 1278 |
+
you2
|
| 1279 |
+
you3
|
| 1280 |
+
you4
|
| 1281 |
+
yu1
|
| 1282 |
+
yu2
|
| 1283 |
+
yu3
|
| 1284 |
+
yu4
|
| 1285 |
+
yuan1
|
| 1286 |
+
yuan2
|
| 1287 |
+
yuan3
|
| 1288 |
+
yuan4
|
| 1289 |
+
yue1
|
| 1290 |
+
yue4
|
| 1291 |
+
yun1
|
| 1292 |
+
yun2
|
| 1293 |
+
yun3
|
| 1294 |
+
yun4
|
| 1295 |
+
z
|
| 1296 |
+
za1
|
| 1297 |
+
za2
|
| 1298 |
+
za3
|
| 1299 |
+
zai1
|
| 1300 |
+
zai3
|
| 1301 |
+
zai4
|
| 1302 |
+
zan1
|
| 1303 |
+
zan2
|
| 1304 |
+
zan3
|
| 1305 |
+
zan4
|
| 1306 |
+
zang1
|
| 1307 |
+
zang4
|
| 1308 |
+
zao1
|
| 1309 |
+
zao2
|
| 1310 |
+
zao3
|
| 1311 |
+
zao4
|
| 1312 |
+
ze2
|
| 1313 |
+
ze4
|
| 1314 |
+
zei2
|
| 1315 |
+
zen3
|
| 1316 |
+
zeng1
|
| 1317 |
+
zeng4
|
| 1318 |
+
zha1
|
| 1319 |
+
zha2
|
| 1320 |
+
zha3
|
| 1321 |
+
zha4
|
| 1322 |
+
zhai1
|
| 1323 |
+
zhai2
|
| 1324 |
+
zhai3
|
| 1325 |
+
zhai4
|
| 1326 |
+
zhan1
|
| 1327 |
+
zhan2
|
| 1328 |
+
zhan3
|
| 1329 |
+
zhan4
|
| 1330 |
+
zhang1
|
| 1331 |
+
zhang2
|
| 1332 |
+
zhang3
|
| 1333 |
+
zhang4
|
| 1334 |
+
zhao1
|
| 1335 |
+
zhao2
|
| 1336 |
+
zhao3
|
| 1337 |
+
zhao4
|
| 1338 |
+
zhe
|
| 1339 |
+
zhe1
|
| 1340 |
+
zhe2
|
| 1341 |
+
zhe3
|
| 1342 |
+
zhe4
|
| 1343 |
+
zhen1
|
| 1344 |
+
zhen2
|
| 1345 |
+
zhen3
|
| 1346 |
+
zhen4
|
| 1347 |
+
zheng1
|
| 1348 |
+
zheng2
|
| 1349 |
+
zheng3
|
| 1350 |
+
zheng4
|
| 1351 |
+
zhi1
|
| 1352 |
+
zhi2
|
| 1353 |
+
zhi3
|
| 1354 |
+
zhi4
|
| 1355 |
+
zhong1
|
| 1356 |
+
zhong2
|
| 1357 |
+
zhong3
|
| 1358 |
+
zhong4
|
| 1359 |
+
zhou1
|
| 1360 |
+
zhou2
|
| 1361 |
+
zhou3
|
| 1362 |
+
zhou4
|
| 1363 |
+
zhu1
|
| 1364 |
+
zhu2
|
| 1365 |
+
zhu3
|
| 1366 |
+
zhu4
|
| 1367 |
+
zhua1
|
| 1368 |
+
zhua2
|
| 1369 |
+
zhua3
|
| 1370 |
+
zhuai1
|
| 1371 |
+
zhuai3
|
| 1372 |
+
zhuai4
|
| 1373 |
+
zhuan1
|
| 1374 |
+
zhuan2
|
| 1375 |
+
zhuan3
|
| 1376 |
+
zhuan4
|
| 1377 |
+
zhuang1
|
| 1378 |
+
zhuang4
|
| 1379 |
+
zhui1
|
| 1380 |
+
zhui4
|
| 1381 |
+
zhun1
|
| 1382 |
+
zhun2
|
| 1383 |
+
zhun3
|
| 1384 |
+
zhuo1
|
| 1385 |
+
zhuo2
|
| 1386 |
+
zi
|
| 1387 |
+
zi1
|
| 1388 |
+
zi2
|
| 1389 |
+
zi3
|
| 1390 |
+
zi4
|
| 1391 |
+
zong1
|
| 1392 |
+
zong2
|
| 1393 |
+
zong3
|
| 1394 |
+
zong4
|
| 1395 |
+
zou1
|
| 1396 |
+
zou2
|
| 1397 |
+
zou3
|
| 1398 |
+
zou4
|
| 1399 |
+
zu1
|
| 1400 |
+
zu2
|
| 1401 |
+
zu3
|
| 1402 |
+
zuan1
|
| 1403 |
+
zuan3
|
| 1404 |
+
zuan4
|
| 1405 |
+
zui2
|
| 1406 |
+
zui3
|
| 1407 |
+
zui4
|
| 1408 |
+
zun1
|
| 1409 |
+
zuo
|
| 1410 |
+
zuo1
|
| 1411 |
+
zuo2
|
| 1412 |
+
zuo3
|
| 1413 |
+
zuo4
|
| 1414 |
+
{
|
| 1415 |
+
~
|
| 1416 |
+
¡
|
| 1417 |
+
¢
|
| 1418 |
+
£
|
| 1419 |
+
¥
|
| 1420 |
+
§
|
| 1421 |
+
¨
|
| 1422 |
+
©
|
| 1423 |
+
«
|
| 1424 |
+
®
|
| 1425 |
+
¯
|
| 1426 |
+
°
|
| 1427 |
+
±
|
| 1428 |
+
²
|
| 1429 |
+
³
|
| 1430 |
+
´
|
| 1431 |
+
µ
|
| 1432 |
+
·
|
| 1433 |
+
¹
|
| 1434 |
+
º
|
| 1435 |
+
»
|
| 1436 |
+
¼
|
| 1437 |
+
½
|
| 1438 |
+
¾
|
| 1439 |
+
¿
|
| 1440 |
+
À
|
| 1441 |
+
Á
|
| 1442 |
+
Â
|
| 1443 |
+
Ã
|
| 1444 |
+
Ä
|
| 1445 |
+
Å
|
| 1446 |
+
Æ
|
| 1447 |
+
Ç
|
| 1448 |
+
È
|
| 1449 |
+
É
|
| 1450 |
+
Ê
|
| 1451 |
+
Í
|
| 1452 |
+
Î
|
| 1453 |
+
Ñ
|
| 1454 |
+
Ó
|
| 1455 |
+
Ö
|
| 1456 |
+
×
|
| 1457 |
+
Ø
|
| 1458 |
+
Ú
|
| 1459 |
+
Ü
|
| 1460 |
+
Ý
|
| 1461 |
+
Þ
|
| 1462 |
+
ß
|
| 1463 |
+
à
|
| 1464 |
+
á
|
| 1465 |
+
â
|
| 1466 |
+
ã
|
| 1467 |
+
ä
|
| 1468 |
+
å
|
| 1469 |
+
æ
|
| 1470 |
+
ç
|
| 1471 |
+
è
|
| 1472 |
+
é
|
| 1473 |
+
ê
|
| 1474 |
+
ë
|
| 1475 |
+
ì
|
| 1476 |
+
í
|
| 1477 |
+
î
|
| 1478 |
+
ï
|
| 1479 |
+
ð
|
| 1480 |
+
ñ
|
| 1481 |
+
ò
|
| 1482 |
+
ó
|
| 1483 |
+
ô
|
| 1484 |
+
õ
|
| 1485 |
+
ö
|
| 1486 |
+
ø
|
| 1487 |
+
ù
|
| 1488 |
+
ú
|
| 1489 |
+
û
|
| 1490 |
+
ü
|
| 1491 |
+
ý
|
| 1492 |
+
Ā
|
| 1493 |
+
ā
|
| 1494 |
+
ă
|
| 1495 |
+
ą
|
| 1496 |
+
ć
|
| 1497 |
+
Č
|
| 1498 |
+
č
|
| 1499 |
+
Đ
|
| 1500 |
+
đ
|
| 1501 |
+
ē
|
| 1502 |
+
ė
|
| 1503 |
+
ę
|
| 1504 |
+
ě
|
| 1505 |
+
ĝ
|
| 1506 |
+
ğ
|
| 1507 |
+
ħ
|
| 1508 |
+
ī
|
| 1509 |
+
į
|
| 1510 |
+
İ
|
| 1511 |
+
ı
|
| 1512 |
+
Ł
|
| 1513 |
+
ł
|
| 1514 |
+
ń
|
| 1515 |
+
ņ
|
| 1516 |
+
ň
|
| 1517 |
+
ŋ
|
| 1518 |
+
Ō
|
| 1519 |
+
ō
|
| 1520 |
+
ő
|
| 1521 |
+
œ
|
| 1522 |
+
ř
|
| 1523 |
+
Ś
|
| 1524 |
+
ś
|
| 1525 |
+
Ş
|
| 1526 |
+
ş
|
| 1527 |
+
Š
|
| 1528 |
+
š
|
| 1529 |
+
Ť
|
| 1530 |
+
ť
|
| 1531 |
+
ũ
|
| 1532 |
+
ū
|
| 1533 |
+
ź
|
| 1534 |
+
Ż
|
| 1535 |
+
ż
|
| 1536 |
+
Ž
|
| 1537 |
+
ž
|
| 1538 |
+
ơ
|
| 1539 |
+
ư
|
| 1540 |
+
ǎ
|
| 1541 |
+
ǐ
|
| 1542 |
+
ǒ
|
| 1543 |
+
ǔ
|
| 1544 |
+
ǚ
|
| 1545 |
+
ș
|
| 1546 |
+
ț
|
| 1547 |
+
ɑ
|
| 1548 |
+
ɔ
|
| 1549 |
+
ɕ
|
| 1550 |
+
ə
|
| 1551 |
+
ɛ
|
| 1552 |
+
ɜ
|
| 1553 |
+
ɡ
|
| 1554 |
+
ɣ
|
| 1555 |
+
ɪ
|
| 1556 |
+
ɫ
|
| 1557 |
+
ɴ
|
| 1558 |
+
ɹ
|
| 1559 |
+
ɾ
|
| 1560 |
+
ʃ
|
| 1561 |
+
ʊ
|
| 1562 |
+
ʌ
|
| 1563 |
+
ʒ
|
| 1564 |
+
ʔ
|
| 1565 |
+
ʰ
|
| 1566 |
+
ʷ
|
| 1567 |
+
ʻ
|
| 1568 |
+
ʾ
|
| 1569 |
+
ʿ
|
| 1570 |
+
ˈ
|
| 1571 |
+
ː
|
| 1572 |
+
˙
|
| 1573 |
+
˜
|
| 1574 |
+
ˢ
|
| 1575 |
+
́
|
| 1576 |
+
̅
|
| 1577 |
+
Α
|
| 1578 |
+
Β
|
| 1579 |
+
Δ
|
| 1580 |
+
Ε
|
| 1581 |
+
Θ
|
| 1582 |
+
Κ
|
| 1583 |
+
Λ
|
| 1584 |
+
Μ
|
| 1585 |
+
Ξ
|
| 1586 |
+
Π
|
| 1587 |
+
Σ
|
| 1588 |
+
Τ
|
| 1589 |
+
Φ
|
| 1590 |
+
Χ
|
| 1591 |
+
Ψ
|
| 1592 |
+
Ω
|
| 1593 |
+
ά
|
| 1594 |
+
έ
|
| 1595 |
+
ή
|
| 1596 |
+
ί
|
| 1597 |
+
α
|
| 1598 |
+
β
|
| 1599 |
+
γ
|
| 1600 |
+
δ
|
| 1601 |
+
ε
|
| 1602 |
+
ζ
|
| 1603 |
+
η
|
| 1604 |
+
θ
|
| 1605 |
+
ι
|
| 1606 |
+
κ
|
| 1607 |
+
λ
|
| 1608 |
+
μ
|
| 1609 |
+
ν
|
| 1610 |
+
ξ
|
| 1611 |
+
ο
|
| 1612 |
+
π
|
| 1613 |
+
ρ
|
| 1614 |
+
ς
|
| 1615 |
+
σ
|
| 1616 |
+
τ
|
| 1617 |
+
υ
|
| 1618 |
+
φ
|
| 1619 |
+
χ
|
| 1620 |
+
ψ
|
| 1621 |
+
ω
|
| 1622 |
+
ϊ
|
| 1623 |
+
ό
|
| 1624 |
+
ύ
|
| 1625 |
+
ώ
|
| 1626 |
+
ϕ
|
| 1627 |
+
ϵ
|
| 1628 |
+
Ё
|
| 1629 |
+
А
|
| 1630 |
+
Б
|
| 1631 |
+
В
|
| 1632 |
+
Г
|
| 1633 |
+
Д
|
| 1634 |
+
Е
|
| 1635 |
+
Ж
|
| 1636 |
+
З
|
| 1637 |
+
И
|
| 1638 |
+
Й
|
| 1639 |
+
К
|
| 1640 |
+
Л
|
| 1641 |
+
М
|
| 1642 |
+
Н
|
| 1643 |
+
О
|
| 1644 |
+
П
|
| 1645 |
+
Р
|
| 1646 |
+
С
|
| 1647 |
+
Т
|
| 1648 |
+
У
|
| 1649 |
+
Ф
|
| 1650 |
+
Х
|
| 1651 |
+
Ц
|
| 1652 |
+
Ч
|
| 1653 |
+
Ш
|
| 1654 |
+
Щ
|
| 1655 |
+
Ы
|
| 1656 |
+
Ь
|
| 1657 |
+
Э
|
| 1658 |
+
Ю
|
| 1659 |
+
Я
|
| 1660 |
+
а
|
| 1661 |
+
б
|
| 1662 |
+
в
|
| 1663 |
+
г
|
| 1664 |
+
д
|
| 1665 |
+
е
|
| 1666 |
+
ж
|
| 1667 |
+
з
|
| 1668 |
+
и
|
| 1669 |
+
й
|
| 1670 |
+
к
|
| 1671 |
+
л
|
| 1672 |
+
м
|
| 1673 |
+
н
|
| 1674 |
+
о
|
| 1675 |
+
п
|
| 1676 |
+
р
|
| 1677 |
+
с
|
| 1678 |
+
т
|
| 1679 |
+
у
|
| 1680 |
+
ф
|
| 1681 |
+
х
|
| 1682 |
+
ц
|
| 1683 |
+
ч
|
| 1684 |
+
ш
|
| 1685 |
+
щ
|
| 1686 |
+
ъ
|
| 1687 |
+
ы
|
| 1688 |
+
ь
|
| 1689 |
+
э
|
| 1690 |
+
ю
|
| 1691 |
+
я
|
| 1692 |
+
ё
|
| 1693 |
+
і
|
| 1694 |
+
ְ
|
| 1695 |
+
ִ
|
| 1696 |
+
ֵ
|
| 1697 |
+
ֶ
|
| 1698 |
+
ַ
|
| 1699 |
+
ָ
|
| 1700 |
+
ֹ
|
| 1701 |
+
|
| 1702 |
+
|
| 1703 |
+
|
| 1704 |
+
|
| 1705 |
+
|
| 1706 |
+
<
|
| 1707 |
+
^
|
| 1708 |
+
`
|
| 1709 |
+
|
|
| 1710 |
+
}
|
| 1711 |
+
|
| 1712 |
+
ʼ
|
| 1713 |
+
̮
|
| 1714 |
+
ँ
|
| 1715 |
+
ं
|
| 1716 |
+
ः
|
| 1717 |
+
अ
|
| 1718 |
+
आ
|
| 1719 |
+
इ
|
| 1720 |
+
ई
|
| 1721 |
+
उ
|
| 1722 |
+
ऊ
|
| 1723 |
+
ऋ
|
| 1724 |
+
ऍ
|
| 1725 |
+
ऎ
|
| 1726 |
+
ए
|
| 1727 |
+
ऐ
|
| 1728 |
+
ऑ
|
| 1729 |
+
ऒ
|
| 1730 |
+
ओ
|
| 1731 |
+
औ
|
| 1732 |
+
क
|
| 1733 |
+
ख
|
| 1734 |
+
ग
|
| 1735 |
+
घ
|
| 1736 |
+
ङ
|
| 1737 |
+
च
|
| 1738 |
+
छ
|
| 1739 |
+
ज
|
| 1740 |
+
झ
|
| 1741 |
+
ञ
|
| 1742 |
+
ट
|
| 1743 |
+
ठ
|
| 1744 |
+
ड
|
| 1745 |
+
ढ
|
| 1746 |
+
ण
|
| 1747 |
+
त
|
| 1748 |
+
थ
|
| 1749 |
+
द
|
| 1750 |
+
ध
|
| 1751 |
+
न
|
| 1752 |
+
ऩ
|
| 1753 |
+
प
|
| 1754 |
+
फ
|
| 1755 |
+
ब
|
| 1756 |
+
भ
|
| 1757 |
+
म
|
| 1758 |
+
य
|
| 1759 |
+
र
|
| 1760 |
+
ऱ
|
| 1761 |
+
ल
|
| 1762 |
+
ळ
|
| 1763 |
+
ऴ
|
| 1764 |
+
व
|
| 1765 |
+
श
|
| 1766 |
+
ष
|
| 1767 |
+
स
|
| 1768 |
+
ह
|
| 1769 |
+
ऺ
|
| 1770 |
+
ऻ
|
| 1771 |
+
़
|
| 1772 |
+
ऽ
|
| 1773 |
+
ा
|
| 1774 |
+
ि
|
| 1775 |
+
ी
|
| 1776 |
+
ु
|
| 1777 |
+
ू
|
| 1778 |
+
ृ
|
| 1779 |
+
ॄ
|
| 1780 |
+
ॅ
|
| 1781 |
+
ॆ
|
| 1782 |
+
े
|
| 1783 |
+
ै
|
| 1784 |
+
ॉ
|
| 1785 |
+
ॊ
|
| 1786 |
+
ो
|
| 1787 |
+
ौ
|
| 1788 |
+
्
|
| 1789 |
+
ॐ
|
| 1790 |
+
॑
|
| 1791 |
+
॒
|
| 1792 |
+
॔
|
| 1793 |
+
ॕ
|
| 1794 |
+
ॖ
|
| 1795 |
+
क़
|
| 1796 |
+
ख़
|
| 1797 |
+
ग़
|
| 1798 |
+
ज़
|
| 1799 |
+
ड़
|
| 1800 |
+
ढ़
|
| 1801 |
+
फ़
|
| 1802 |
+
य़
|
| 1803 |
+
ॠ
|
| 1804 |
+
।
|
| 1805 |
+
॥
|
| 1806 |
+
०
|
| 1807 |
+
१
|
| 1808 |
+
२
|
| 1809 |
+
३
|
| 1810 |
+
४
|
| 1811 |
+
५
|
| 1812 |
+
६
|
| 1813 |
+
७
|
| 1814 |
+
८
|
| 1815 |
+
९
|
| 1816 |
+
॰
|
| 1817 |
+
ॲ
|
| 1818 |
+
ঁ
|
| 1819 |
+
ং
|
| 1820 |
+
ঃ
|
| 1821 |
+
অ
|
| 1822 |
+
আ
|
| 1823 |
+
ই
|
| 1824 |
+
ঈ
|
| 1825 |
+
উ
|
| 1826 |
+
ঊ
|
| 1827 |
+
ঋ
|
| 1828 |
+
ঌ
|
| 1829 |
+
|
| 1830 |
+
এ
|
| 1831 |
+
ঐ
|
| 1832 |
+
ও
|
| 1833 |
+
ঔ
|
| 1834 |
+
ক
|
| 1835 |
+
খ
|
| 1836 |
+
গ
|
| 1837 |
+
ঘ
|
| 1838 |
+
ঙ
|
| 1839 |
+
চ
|
| 1840 |
+
ছ
|
| 1841 |
+
জ
|
| 1842 |
+
ঝ
|
| 1843 |
+
ঞ
|
| 1844 |
+
ট
|
| 1845 |
+
ঠ
|
| 1846 |
+
ড
|
| 1847 |
+
ঢ
|
| 1848 |
+
ণ
|
| 1849 |
+
ত
|
| 1850 |
+
থ
|
| 1851 |
+
দ
|
| 1852 |
+
ধ
|
| 1853 |
+
ন
|
| 1854 |
+
প
|
| 1855 |
+
ফ
|
| 1856 |
+
ব
|
| 1857 |
+
ভ
|
| 1858 |
+
ম
|
| 1859 |
+
য
|
| 1860 |
+
র
|
| 1861 |
+
ল
|
| 1862 |
+
|
| 1863 |
+
শ
|
| 1864 |
+
ষ
|
| 1865 |
+
স
|
| 1866 |
+
হ
|
| 1867 |
+
়
|
| 1868 |
+
ঽ
|
| 1869 |
+
া
|
| 1870 |
+
ি
|
| 1871 |
+
ী
|
| 1872 |
+
ু
|
| 1873 |
+
ূ
|
| 1874 |
+
ৃ
|
| 1875 |
+
ৄ
|
| 1876 |
+
|
| 1877 |
+
ে
|
| 1878 |
+
ৈ
|
| 1879 |
+
ো
|
| 1880 |
+
ৌ
|
| 1881 |
+
্
|
| 1882 |
+
ৎ
|
| 1883 |
+
ৗ
|
| 1884 |
+
ড়
|
| 1885 |
+
ঢ়
|
| 1886 |
+
য়
|
| 1887 |
+
০
|
| 1888 |
+
১
|
| 1889 |
+
২
|
| 1890 |
+
৩
|
| 1891 |
+
৪
|
| 1892 |
+
৫
|
| 1893 |
+
৬
|
| 1894 |
+
৭
|
| 1895 |
+
৮
|
| 1896 |
+
৯
|
| 1897 |
+
ৰ
|
| 1898 |
+
ৱ
|
| 1899 |
+
৲
|
| 1900 |
+
৷
|
| 1901 |
+
৹
|
| 1902 |
+
৻
|
| 1903 |
+
ਂ
|
| 1904 |
+
ਃ
|
| 1905 |
+
ਅ
|
| 1906 |
+
ਆ
|
| 1907 |
+
ਇ
|
| 1908 |
+
ਈ
|
| 1909 |
+
ਉ
|
| 1910 |
+
ਊ
|
| 1911 |
+
ਏ
|
| 1912 |
+
ਐ
|
| 1913 |
+
ਓ
|
| 1914 |
+
ਔ
|
| 1915 |
+
ਕ
|
| 1916 |
+
ਖ
|
| 1917 |
+
ਗ
|
| 1918 |
+
ਘ
|
| 1919 |
+
ਙ
|
| 1920 |
+
ਚ
|
| 1921 |
+
ਛ
|
| 1922 |
+
ਜ
|
| 1923 |
+
ਝ
|
| 1924 |
+
ਞ
|
| 1925 |
+
ਟ
|
| 1926 |
+
ਠ
|
| 1927 |
+
ਡ
|
| 1928 |
+
ਢ
|
| 1929 |
+
ਣ
|
| 1930 |
+
ਤ
|
| 1931 |
+
ਥ
|
| 1932 |
+
ਦ
|
| 1933 |
+
ਧ
|
| 1934 |
+
ਨ
|
| 1935 |
+
ਪ
|
| 1936 |
+
ਫ
|
| 1937 |
+
ਬ
|
| 1938 |
+
ਭ
|
| 1939 |
+
ਮ
|
| 1940 |
+
ਯ
|
| 1941 |
+
ਰ
|
| 1942 |
+
ਲ
|
| 1943 |
+
ਲ਼
|
| 1944 |
+
ਵ
|
| 1945 |
+
ਸ਼
|
| 1946 |
+
ਸ
|
| 1947 |
+
ਹ
|
| 1948 |
+
਼
|
| 1949 |
+
ਾ
|
| 1950 |
+
ਿ
|
| 1951 |
+
ੀ
|
| 1952 |
+
ੁ
|
| 1953 |
+
ੂ
|
| 1954 |
+
ੇ
|
| 1955 |
+
ੈ
|
| 1956 |
+
ੋ
|
| 1957 |
+
ੌ
|
| 1958 |
+
੍
|
| 1959 |
+
ੑ
|
| 1960 |
+
ਖ਼
|
| 1961 |
+
ਗ਼
|
| 1962 |
+
ਜ਼
|
| 1963 |
+
ੜ
|
| 1964 |
+
ਫ਼
|
| 1965 |
+
ੰ
|
| 1966 |
+
ੱ
|
| 1967 |
+
ੲ
|
| 1968 |
+
ੳ
|
| 1969 |
+
ઁ
|
| 1970 |
+
ં
|
| 1971 |
+
ઃ
|
| 1972 |
+
અ
|
| 1973 |
+
આ
|
| 1974 |
+
ઇ
|
| 1975 |
+
ઈ
|
| 1976 |
+
ઉ
|
| 1977 |
+
ઊ
|
| 1978 |
+
ઋ
|
| 1979 |
+
ઍ
|
| 1980 |
+
એ
|
| 1981 |
+
ઐ
|
| 1982 |
+
ઑ
|
| 1983 |
+
ઓ
|
| 1984 |
+
ઔ
|
| 1985 |
+
ક
|
| 1986 |
+
ખ
|
| 1987 |
+
ગ
|
| 1988 |
+
ઘ
|
| 1989 |
+
ચ
|
| 1990 |
+
છ
|
| 1991 |
+
જ
|
| 1992 |
+
ઝ
|
| 1993 |
+
ઞ
|
| 1994 |
+
ટ
|
| 1995 |
+
ઠ
|
| 1996 |
+
ડ
|
| 1997 |
+
ઢ
|
| 1998 |
+
ણ
|
| 1999 |
+
ત
|
| 2000 |
+
થ
|
| 2001 |
+
દ
|
| 2002 |
+
ધ
|
| 2003 |
+
ન
|
| 2004 |
+
પ
|
| 2005 |
+
ફ
|
| 2006 |
+
બ
|
| 2007 |
+
ભ
|
| 2008 |
+
મ
|
| 2009 |
+
ય
|
| 2010 |
+
ર
|
| 2011 |
+
લ
|
| 2012 |
+
ળ
|
| 2013 |
+
વ
|
| 2014 |
+
શ
|
| 2015 |
+
ષ
|
| 2016 |
+
સ
|
| 2017 |
+
હ
|
| 2018 |
+
઼
|
| 2019 |
+
ા
|
| 2020 |
+
િ
|
| 2021 |
+
ી
|
| 2022 |
+
ુ
|
| 2023 |
+
ૂ
|
| 2024 |
+
ૃ
|
| 2025 |
+
ૄ
|
| 2026 |
+
ૅ
|
| 2027 |
+
ે
|
| 2028 |
+
ૈ
|
| 2029 |
+
ૉ
|
| 2030 |
+
ો
|
| 2031 |
+
ૌ
|
| 2032 |
+
્
|
| 2033 |
+
ૐ
|
| 2034 |
+
ૠ
|
| 2035 |
+
૧
|
| 2036 |
+
૨
|
| 2037 |
+
૪
|
| 2038 |
+
૫
|
| 2039 |
+
ଁ
|
| 2040 |
+
ଂ
|
| 2041 |
+
ଃ
|
| 2042 |
+
ଅ
|
| 2043 |
+
ଆ
|
| 2044 |
+
ଇ
|
| 2045 |
+
ଈ
|
| 2046 |
+
ଉ
|
| 2047 |
+
ଊ
|
| 2048 |
+
ଋ
|
| 2049 |
+
ଏ
|
| 2050 |
+
ଐ
|
| 2051 |
+
ଓ
|
| 2052 |
+
ଔ
|
| 2053 |
+
କ
|
| 2054 |
+
ଖ
|
| 2055 |
+
ଗ
|
| 2056 |
+
ଘ
|
| 2057 |
+
ଙ
|
| 2058 |
+
ଚ
|
| 2059 |
+
ଛ
|
| 2060 |
+
ଜ
|
| 2061 |
+
ଝ
|
| 2062 |
+
ଞ
|
| 2063 |
+
ଟ
|
| 2064 |
+
ଠ
|
| 2065 |
+
ଡ
|
| 2066 |
+
ଢ
|
| 2067 |
+
ଣ
|
| 2068 |
+
ତ
|
| 2069 |
+
ଥ
|
| 2070 |
+
ଦ
|
| 2071 |
+
ଧ
|
| 2072 |
+
ନ
|
| 2073 |
+
ପ
|
| 2074 |
+
ଫ
|
| 2075 |
+
ବ
|
| 2076 |
+
ଭ
|
| 2077 |
+
ମ
|
| 2078 |
+
ଯ
|
| 2079 |
+
ର
|
| 2080 |
+
ଲ
|
| 2081 |
+
ଳ
|
| 2082 |
+
ଵ
|
| 2083 |
+
ଶ
|
| 2084 |
+
ଷ
|
| 2085 |
+
ସ
|
| 2086 |
+
ହ
|
| 2087 |
+
଼
|
| 2088 |
+
ା
|
| 2089 |
+
ି
|
| 2090 |
+
ୀ
|
| 2091 |
+
ୁ
|
| 2092 |
+
ୂ
|
| 2093 |
+
ୃ
|
| 2094 |
+
ୄ
|
| 2095 |
+
େ
|
| 2096 |
+
ୈ
|
| 2097 |
+
ୋ
|
| 2098 |
+
ୌ
|
| 2099 |
+
୍
|
| 2100 |
+
ୖ
|
| 2101 |
+
ୗ
|
| 2102 |
+
ଡ଼
|
| 2103 |
+
ଢ଼
|
| 2104 |
+
ୟ
|
| 2105 |
+
ୠ
|
| 2106 |
+
୦
|
| 2107 |
+
୧
|
| 2108 |
+
୨
|
| 2109 |
+
୪
|
| 2110 |
+
୫
|
| 2111 |
+
୬
|
| 2112 |
+
୮
|
| 2113 |
+
ୱ
|
| 2114 |
+
ஃ
|
| 2115 |
+
அ
|
| 2116 |
+
ஆ
|
| 2117 |
+
இ
|
| 2118 |
+
ஈ
|
| 2119 |
+
உ
|
| 2120 |
+
ஊ
|
| 2121 |
+
எ
|
| 2122 |
+
ஏ
|
| 2123 |
+
ஐ
|
| 2124 |
+
ஒ
|
| 2125 |
+
ஓ
|
| 2126 |
+
ஔ
|
| 2127 |
+
க
|
| 2128 |
+
ங
|
| 2129 |
+
ச
|
| 2130 |
+
ஜ
|
| 2131 |
+
ஞ
|
| 2132 |
+
ட
|
| 2133 |
+
ண
|
| 2134 |
+
த
|
| 2135 |
+
ந
|
| 2136 |
+
ன
|
| 2137 |
+
ப
|
| 2138 |
+
ம
|
| 2139 |
+
ய
|
| 2140 |
+
ர
|
| 2141 |
+
ற
|
| 2142 |
+
ல
|
| 2143 |
+
ள
|
| 2144 |
+
ழ
|
| 2145 |
+
வ
|
| 2146 |
+
ஷ
|
| 2147 |
+
ஸ
|
| 2148 |
+
ஹ
|
| 2149 |
+
ா
|
| 2150 |
+
ி
|
| 2151 |
+
ீ
|
| 2152 |
+
ு
|
| 2153 |
+
ூ
|
| 2154 |
+
ெ
|
| 2155 |
+
ே
|
| 2156 |
+
ை
|
| 2157 |
+
ொ
|
| 2158 |
+
ோ
|
| 2159 |
+
ௌ
|
| 2160 |
+
்
|
| 2161 |
+
ௗ
|
| 2162 |
+
௦
|
| 2163 |
+
ఁ
|
| 2164 |
+
ం
|
| 2165 |
+
ః
|
| 2166 |
+
అ
|
| 2167 |
+
ఆ
|
| 2168 |
+
ఇ
|
| 2169 |
+
ఈ
|
| 2170 |
+
ఉ
|
| 2171 |
+
ఊ
|
| 2172 |
+
ఋ
|
| 2173 |
+
ఎ
|
| 2174 |
+
ఏ
|
| 2175 |
+
ఐ
|
| 2176 |
+
ఒ
|
| 2177 |
+
ఓ
|
| 2178 |
+
ఔ
|
| 2179 |
+
క
|
| 2180 |
+
ఖ
|
| 2181 |
+
గ
|
| 2182 |
+
ఘ
|
| 2183 |
+
ఙ
|
| 2184 |
+
చ
|
| 2185 |
+
ఛ
|
| 2186 |
+
జ
|
| 2187 |
+
ఝ
|
| 2188 |
+
ఞ
|
| 2189 |
+
ట
|
| 2190 |
+
ఠ
|
| 2191 |
+
డ
|
| 2192 |
+
ఢ
|
| 2193 |
+
ణ
|
| 2194 |
+
త
|
| 2195 |
+
థ
|
| 2196 |
+
ద
|
| 2197 |
+
ధ
|
| 2198 |
+
న
|
| 2199 |
+
ప
|
| 2200 |
+
ఫ
|
| 2201 |
+
బ
|
| 2202 |
+
భ
|
| 2203 |
+
మ
|
| 2204 |
+
య
|
| 2205 |
+
ర
|
| 2206 |
+
ఱ
|
| 2207 |
+
ల
|
| 2208 |
+
ళ
|
| 2209 |
+
వ
|
| 2210 |
+
శ
|
| 2211 |
+
ష
|
| 2212 |
+
స
|
| 2213 |
+
హ
|
| 2214 |
+
ఽ
|
| 2215 |
+
ా
|
| 2216 |
+
ి
|
| 2217 |
+
ీ
|
| 2218 |
+
ు
|
| 2219 |
+
ూ
|
| 2220 |
+
ృ
|
| 2221 |
+
ౄ
|
| 2222 |
+
ె
|
| 2223 |
+
ే
|
| 2224 |
+
ై
|
| 2225 |
+
ొ
|
| 2226 |
+
ో
|
| 2227 |
+
ౌ
|
| 2228 |
+
్
|
| 2229 |
+
ౕ
|
| 2230 |
+
ౖ
|
| 2231 |
+
ౙ
|
| 2232 |
+
ౠ
|
| 2233 |
+
౦
|
| 2234 |
+
౩
|
| 2235 |
+
ಂ
|
| 2236 |
+
ಃ
|
| 2237 |
+
ಅ
|
| 2238 |
+
ಆ
|
| 2239 |
+
ಇ
|
| 2240 |
+
ಈ
|
| 2241 |
+
ಉ
|
| 2242 |
+
ಊ
|
| 2243 |
+
ಋ
|
| 2244 |
+
ಎ
|
| 2245 |
+
ಏ
|
| 2246 |
+
ಐ
|
| 2247 |
+
ಒ
|
| 2248 |
+
ಓ
|
| 2249 |
+
ಔ
|
| 2250 |
+
ಕ
|
| 2251 |
+
ಖ
|
| 2252 |
+
ಗ
|
| 2253 |
+
ಘ
|
| 2254 |
+
ಙ
|
| 2255 |
+
ಚ
|
| 2256 |
+
ಛ
|
| 2257 |
+
ಜ
|
| 2258 |
+
ಝ
|
| 2259 |
+
ಞ
|
| 2260 |
+
ಟ
|
| 2261 |
+
ಠ
|
| 2262 |
+
ಡ
|
| 2263 |
+
ಢ
|
| 2264 |
+
ಣ
|
| 2265 |
+
ತ
|
| 2266 |
+
ಥ
|
| 2267 |
+
ದ
|
| 2268 |
+
ಧ
|
| 2269 |
+
ನ
|
| 2270 |
+
ಪ
|
| 2271 |
+
ಫ
|
| 2272 |
+
ಬ
|
| 2273 |
+
ಭ
|
| 2274 |
+
ಮ
|
| 2275 |
+
ಯ
|
| 2276 |
+
ರ
|
| 2277 |
+
ಱ
|
| 2278 |
+
ಲ
|
| 2279 |
+
ಳ
|
| 2280 |
+
ವ
|
| 2281 |
+
ಶ
|
| 2282 |
+
ಷ
|
| 2283 |
+
ಸ
|
| 2284 |
+
ಹ
|
| 2285 |
+
಼
|
| 2286 |
+
ಽ
|
| 2287 |
+
ಾ
|
| 2288 |
+
ಿ
|
| 2289 |
+
ೀ
|
| 2290 |
+
ು
|
| 2291 |
+
ೂ
|
| 2292 |
+
ೃ
|
| 2293 |
+
ೆ
|
| 2294 |
+
ೇ
|
| 2295 |
+
ೈ
|
| 2296 |
+
ೊ
|
| 2297 |
+
ೋ
|
| 2298 |
+
ೌ
|
| 2299 |
+
್
|
| 2300 |
+
ೕ
|
| 2301 |
+
ೖ
|
| 2302 |
+
ೞ
|
| 2303 |
+
ೠ
|
| 2304 |
+
೦
|
| 2305 |
+
೧
|
| 2306 |
+
೨
|
| 2307 |
+
೩
|
| 2308 |
+
೪
|
| 2309 |
+
೫
|
| 2310 |
+
೬
|
| 2311 |
+
೭
|
| 2312 |
+
೮
|
| 2313 |
+
೯
|
| 2314 |
+
ം
|
| 2315 |
+
ഃ
|
| 2316 |
+
അ
|
| 2317 |
+
ആ
|
| 2318 |
+
ഇ
|
| 2319 |
+
ഈ
|
| 2320 |
+
ഉ
|
| 2321 |
+
ഊ
|
| 2322 |
+
ഋ
|
| 2323 |
+
എ
|
| 2324 |
+
ഏ
|
| 2325 |
+
ഐ
|
| 2326 |
+
ഒ
|
| 2327 |
+
ഓ
|
| 2328 |
+
ഔ
|
| 2329 |
+
ക
|
| 2330 |
+
ഖ
|
| 2331 |
+
ഗ
|
| 2332 |
+
ഘ
|
| 2333 |
+
ങ
|
| 2334 |
+
ച
|
| 2335 |
+
ഛ
|
| 2336 |
+
ജ
|
| 2337 |
+
ഝ
|
| 2338 |
+
ഞ
|
| 2339 |
+
ട
|
| 2340 |
+
ഠ
|
| 2341 |
+
ഡ
|
| 2342 |
+
ഢ
|
| 2343 |
+
ണ
|
| 2344 |
+
ത
|
| 2345 |
+
ഥ
|
| 2346 |
+
ദ
|
| 2347 |
+
ധ
|
| 2348 |
+
ന
|
| 2349 |
+
പ
|
| 2350 |
+
ഫ
|
| 2351 |
+
ബ
|
| 2352 |
+
ഭ
|
| 2353 |
+
മ
|
| 2354 |
+
യ
|
| 2355 |
+
ര
|
| 2356 |
+
റ
|
| 2357 |
+
ല
|
| 2358 |
+
ള
|
| 2359 |
+
ഴ
|
| 2360 |
+
വ
|
| 2361 |
+
ശ
|
| 2362 |
+
ഷ
|
| 2363 |
+
സ
|
| 2364 |
+
ഹ
|
| 2365 |
+
ാ
|
| 2366 |
+
ി
|
| 2367 |
+
ീ
|
| 2368 |
+
ു
|
| 2369 |
+
ൂ
|
| 2370 |
+
ൃ
|
| 2371 |
+
െ
|
| 2372 |
+
േ
|
| 2373 |
+
ൈ
|
| 2374 |
+
ൊ
|
| 2375 |
+
ോ
|
| 2376 |
+
ൌ
|
| 2377 |
+
്
|
| 2378 |
+
ൎ
|
| 2379 |
+
ൗ
|
| 2380 |
+
ൟ
|
| 2381 |
+
ൺ
|
| 2382 |
+
ൻ
|
| 2383 |
+
ർ
|
| 2384 |
+
ൽ
|
| 2385 |
+
ൾ
|
| 2386 |
+
ൿ
|
| 2387 |
+
|
| 2388 |
+
|
| 2389 |
+
|
| 2390 |
+
|
| 2391 |
+
|
| 2392 |
+
|
| 2393 |
+
›
|
| 2394 |
+
⇒
|
| 2395 |
+
|
| 2396 |
+
|
| 2397 |
+
|
| 2398 |
+
📯
|
| 2399 |
+
읽
|
| 2400 |
+
임
|
| 2401 |
+
입
|
| 2402 |
+
있
|
| 2403 |
+
자
|
| 2404 |
+
작
|
| 2405 |
+
잔
|
| 2406 |
+
잖
|
| 2407 |
+
잘
|
| 2408 |
+
잡
|
| 2409 |
+
잤
|
| 2410 |
+
장
|
| 2411 |
+
재
|
| 2412 |
+
저
|
| 2413 |
+
전
|
| 2414 |
+
점
|
| 2415 |
+
정
|
| 2416 |
+
제
|
| 2417 |
+
져
|
| 2418 |
+
졌
|
| 2419 |
+
조
|
| 2420 |
+
족
|
| 2421 |
+
좀
|
| 2422 |
+
종
|
| 2423 |
+
좋
|
| 2424 |
+
죠
|
| 2425 |
+
주
|
| 2426 |
+
준
|
| 2427 |
+
줄
|
| 2428 |
+
중
|
| 2429 |
+
줘
|
| 2430 |
+
즈
|
| 2431 |
+
즐
|
| 2432 |
+
즘
|
| 2433 |
+
지
|
| 2434 |
+
진
|
| 2435 |
+
집
|
| 2436 |
+
짜
|
| 2437 |
+
짝
|
| 2438 |
+
쩌
|
| 2439 |
+
쪼
|
| 2440 |
+
쪽
|
| 2441 |
+
쫌
|
| 2442 |
+
쭈
|
| 2443 |
+
쯔
|
| 2444 |
+
찌
|
| 2445 |
+
찍
|
| 2446 |
+
차
|
| 2447 |
+
착
|
| 2448 |
+
찾
|
| 2449 |
+
책
|
| 2450 |
+
처
|
| 2451 |
+
천
|
| 2452 |
+
철
|
| 2453 |
+
체
|
| 2454 |
+
쳐
|
| 2455 |
+
쳤
|
| 2456 |
+
초
|
| 2457 |
+
촌
|
| 2458 |
+
추
|
| 2459 |
+
출
|
| 2460 |
+
춤
|
| 2461 |
+
춥
|
| 2462 |
+
춰
|
| 2463 |
+
치
|
| 2464 |
+
친
|
| 2465 |
+
칠
|
| 2466 |
+
침
|
| 2467 |
+
칩
|
| 2468 |
+
칼
|
| 2469 |
+
커
|
| 2470 |
+
켓
|
| 2471 |
+
코
|
| 2472 |
+
콩
|
| 2473 |
+
쿠
|
| 2474 |
+
퀴
|
| 2475 |
+
크
|
| 2476 |
+
큰
|
| 2477 |
+
큽
|
| 2478 |
+
키
|
| 2479 |
+
킨
|
| 2480 |
+
타
|
| 2481 |
+
태
|
| 2482 |
+
터
|
| 2483 |
+
턴
|
| 2484 |
+
털
|
| 2485 |
+
테
|
| 2486 |
+
토
|
| 2487 |
+
통
|
| 2488 |
+
투
|
| 2489 |
+
트
|
| 2490 |
+
특
|
| 2491 |
+
튼
|
| 2492 |
+
틀
|
| 2493 |
+
티
|
| 2494 |
+
팀
|
| 2495 |
+
파
|
| 2496 |
+
팔
|
| 2497 |
+
패
|
| 2498 |
+
페
|
| 2499 |
+
펜
|
| 2500 |
+
펭
|
| 2501 |
+
평
|
| 2502 |
+
포
|
| 2503 |
+
폭
|
| 2504 |
+
표
|
| 2505 |
+
품
|
| 2506 |
+
풍
|
| 2507 |
+
프
|
| 2508 |
+
플
|
| 2509 |
+
피
|
| 2510 |
+
필
|
| 2511 |
+
하
|
| 2512 |
+
학
|
| 2513 |
+
한
|
| 2514 |
+
할
|
| 2515 |
+
함
|
| 2516 |
+
합
|
| 2517 |
+
항
|
| 2518 |
+
해
|
| 2519 |
+
햇
|
| 2520 |
+
했
|
| 2521 |
+
행
|
| 2522 |
+
허
|
| 2523 |
+
험
|
| 2524 |
+
형
|
| 2525 |
+
혜
|
| 2526 |
+
호
|
| 2527 |
+
혼
|
| 2528 |
+
홀
|
| 2529 |
+
화
|
| 2530 |
+
회
|
| 2531 |
+
획
|
| 2532 |
+
후
|
| 2533 |
+
휴
|
| 2534 |
+
흐
|
| 2535 |
+
흔
|
| 2536 |
+
희
|
| 2537 |
+
히
|
| 2538 |
+
힘
|
| 2539 |
+
ﷺ
|
| 2540 |
+
ﷻ
|
| 2541 |
+
!
|
| 2542 |
+
,
|
| 2543 |
+
?
|
| 2544 |
+
�
|
| 2545 |
+
𠮶
|
requirements.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Runtime dependencies for the Bengali F5-TTS Space.
# hydra-core and omegaconf are required by src/f5_tts/api.py
# (hydra.utils.get_class, OmegaConf.load); matplotlib by
# src/f5_tts/eval/compare_checkpoints.py. Their absence breaks the build.
accelerate>=0.33.0
cached_path
ema_pytorch>=0.5.2
gradio>=5.0.0
hydra-core
librosa
matplotlib
omegaconf
pydub
safetensors
soundfile
torch>=2.0.0
torchaudio>=2.0.0
torchdiffeq
tqdm>=4.65.0
transformers
vocos
x_transformers>=1.31.14
|
src/f5_tts/api.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import sys
|
| 3 |
+
from importlib.resources import files
|
| 4 |
+
|
| 5 |
+
import soundfile as sf
|
| 6 |
+
import tqdm
|
| 7 |
+
from cached_path import cached_path
|
| 8 |
+
from hydra.utils import get_class
|
| 9 |
+
from omegaconf import OmegaConf
|
| 10 |
+
|
| 11 |
+
from f5_tts.infer.utils_infer import (
|
| 12 |
+
infer_process,
|
| 13 |
+
load_model,
|
| 14 |
+
load_vocoder,
|
| 15 |
+
preprocess_ref_audio_text,
|
| 16 |
+
remove_silence_for_generated_wav,
|
| 17 |
+
save_spectrogram,
|
| 18 |
+
transcribe,
|
| 19 |
+
)
|
| 20 |
+
from f5_tts.model.utils import seed_everything
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class F5TTS:
    """High-level inference wrapper around an F5-TTS / E2-TTS checkpoint.

    Resolves the model architecture from the packaged YAML config, loads the
    vocoder and (auto-downloaded, cached) model checkpoint once at
    construction, and exposes ``infer()`` for zero-shot voice-cloned
    synthesis from a reference audio + transcript pair.
    """

    def __init__(
        self,
        model="F5TTS_v1_Base",
        ckpt_file="",
        vocab_file="",
        ode_method="euler",
        use_ema=True,
        vocoder_local_path=None,
        device=None,
        hf_cache_dir=None,
    ):
        """Load config, vocoder, and model weights.

        Args:
            model: config name matching a ``configs/<model>.yaml`` in the package.
            ckpt_file: local checkpoint path; empty string triggers an HF hub download.
            vocab_file: custom vocab path, or empty for the checkpoint default.
            ode_method: ODE solver used at sampling time (e.g. "euler").
            use_ema: load EMA weights instead of raw model weights.
            vocoder_local_path: local vocoder checkpoint dir; None downloads it.
            device: explicit torch device string; None autodetects.
            hf_cache_dir: optional cache directory for hub downloads.
        """
        model_cfg = OmegaConf.load(str(files("f5_tts").joinpath(f"configs/{model}.yaml")))
        model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
        model_arc = model_cfg.model.arch

        self.mel_spec_type = model_cfg.model.mel_spec.mel_spec_type
        self.target_sample_rate = model_cfg.model.mel_spec.target_sample_rate

        self.ode_method = ode_method
        self.use_ema = use_ema

        if device is not None:
            self.device = device
        else:
            import torch

            # Pick the best available backend. Guard optional backends with
            # hasattr: torch.xpu only exists on recent torch builds, while
            # requirements.txt allows torch>=2.0.0 where the attribute is
            # absent and the unguarded call raised AttributeError.
            if torch.cuda.is_available():
                self.device = "cuda"
            elif hasattr(torch, "xpu") and torch.xpu.is_available():
                self.device = "xpu"
            elif torch.backends.mps.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"

        # Load models
        self.vocoder = load_vocoder(
            self.mel_spec_type, vocoder_local_path is not None, vocoder_local_path, self.device, hf_cache_dir
        )

        repo_name, ckpt_step, ckpt_type = "F5-TTS", 1250000, "safetensors"

        # override for previous models hosted under different repos/steps/formats
        if model == "F5TTS_Base":
            if self.mel_spec_type == "vocos":
                ckpt_step = 1200000
            elif self.mel_spec_type == "bigvgan":
                model = "F5TTS_Base_bigvgan"
                ckpt_type = "pt"
        elif model == "E2TTS_Base":
            repo_name = "E2-TTS"
            ckpt_step = 1200000

        if not ckpt_file:
            ckpt_file = str(
                cached_path(f"hf://SWivid/{repo_name}/{model}/model_{ckpt_step}.{ckpt_type}", cache_dir=hf_cache_dir)
            )
        self.ema_model = load_model(
            model_cls, model_arc, ckpt_file, self.mel_spec_type, vocab_file, self.ode_method, self.use_ema, self.device
        )

    def transcribe(self, ref_audio, language=None):
        """Transcribe ``ref_audio`` via the bundled ASR helper; ``language`` hints the ASR model."""
        return transcribe(ref_audio, language)

    def export_wav(self, wav, file_wave, remove_silence=False):
        """Write ``wav`` to ``file_wave`` at the model sample rate, optionally trimming silence in place."""
        sf.write(file_wave, wav, self.target_sample_rate)

        if remove_silence:
            remove_silence_for_generated_wav(file_wave)

    def export_spectrogram(self, spec, file_spec):
        """Save a spectrogram image of ``spec`` to ``file_spec``."""
        save_spectrogram(spec, file_spec)

    def infer(
        self,
        ref_file,
        ref_text,
        gen_text,
        show_info=print,
        progress=tqdm,
        target_rms=0.1,
        cross_fade_duration=0.15,
        sway_sampling_coef=-1,
        cfg_strength=2,
        nfe_step=32,
        speed=1.0,
        fix_duration=None,
        remove_silence=False,
        file_wave=None,
        file_spec=None,
        seed=None,
    ):
        """Synthesize ``gen_text`` in the voice of ``ref_file``/``ref_text``.

        Returns ``(wav, sr, spec)``. Optionally writes the waveform and/or
        spectrogram to disk when ``file_wave``/``file_spec`` are given.
        A random seed is drawn when ``seed`` is None; the seed actually used
        is stored on ``self.seed`` for reproducibility.
        """
        if seed is None:
            seed = random.randint(0, sys.maxsize)
        seed_everything(seed)
        self.seed = seed

        # Normalize/clip reference audio and auto-transcribe if ref_text is empty.
        ref_file, ref_text = preprocess_ref_audio_text(ref_file, ref_text)

        wav, sr, spec = infer_process(
            ref_file,
            ref_text,
            gen_text,
            self.ema_model,
            self.vocoder,
            self.mel_spec_type,
            show_info=show_info,
            progress=progress,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            speed=speed,
            fix_duration=fix_duration,
            device=self.device,
        )

        if file_wave is not None:
            self.export_wav(wav, file_wave, remove_silence)

        if file_spec is not None:
            self.export_spectrogram(spec, file_spec)

        return wav, sr, spec
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
if __name__ == "__main__":
    # Smoke test: load the default model, clone the bundled reference voice,
    # and write the synthesized sample plus spectrogram under tests/.
    f5tts = F5TTS()

    pkg_root = files("f5_tts")
    wav, sr, spec = f5tts.infer(
        ref_file=str(pkg_root.joinpath("infer/examples/basic/basic_ref_en.wav")),
        ref_text="Some call me nature, others call me mother nature.",
        gen_text="I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring.",
        file_wave=str(pkg_root.joinpath("../../tests/api_out.wav")),
        file_spec=str(pkg_root.joinpath("../../tests/api_out.png")),
        seed=None,
    )

    print("seed :", f5tts.seed)
|
src/f5_tts/configs/E2TTS_Base.yaml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
|
| 5 |
+
datasets:
|
| 6 |
+
name: Emilia_ZH_EN # dataset name
|
| 7 |
+
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
| 8 |
+
batch_size_type: frame # frame | sample
|
| 9 |
+
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
| 10 |
+
num_workers: 16
|
| 11 |
+
|
| 12 |
+
optim:
|
| 13 |
+
epochs: 11
|
| 14 |
+
learning_rate: 7.5e-5
|
| 15 |
+
num_warmup_updates: 20000 # warmup updates
|
| 16 |
+
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
| 17 |
+
max_grad_norm: 1.0 # gradient clipping
|
| 18 |
+
bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
|
| 19 |
+
|
| 20 |
+
model:
|
| 21 |
+
name: E2TTS_Base
|
| 22 |
+
tokenizer: pinyin
|
| 23 |
+
tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)
|
| 24 |
+
backbone: UNetT
|
| 25 |
+
arch:
|
| 26 |
+
dim: 1024
|
| 27 |
+
depth: 24
|
| 28 |
+
heads: 16
|
| 29 |
+
ff_mult: 4
|
| 30 |
+
text_mask_padding: False
|
| 31 |
+
pe_attn_head: 1
|
| 32 |
+
mel_spec:
|
| 33 |
+
target_sample_rate: 24000
|
| 34 |
+
n_mel_channels: 100
|
| 35 |
+
hop_length: 256
|
| 36 |
+
win_length: 1024
|
| 37 |
+
n_fft: 1024
|
| 38 |
+
mel_spec_type: vocos # vocos | bigvgan
|
| 39 |
+
vocoder:
|
| 40 |
+
is_local: False # use local offline ckpt or not
|
| 41 |
+
local_path: null # local vocoder path
|
| 42 |
+
|
| 43 |
+
ckpts:
|
| 44 |
+
logger: wandb # wandb | tensorboard | null
|
| 45 |
+
log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
|
| 46 |
+
save_per_updates: 50000 # save checkpoint per updates
|
| 47 |
+
keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
|
| 48 |
+
last_per_updates: 5000 # save last checkpoint per updates
|
| 49 |
+
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
|
src/f5_tts/configs/E2TTS_Small.yaml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
|
| 5 |
+
datasets:
|
| 6 |
+
name: Emilia_ZH_EN
|
| 7 |
+
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
| 8 |
+
batch_size_type: frame # frame | sample
|
| 9 |
+
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
| 10 |
+
num_workers: 16
|
| 11 |
+
|
| 12 |
+
optim:
|
| 13 |
+
epochs: 11
|
| 14 |
+
learning_rate: 7.5e-5
|
| 15 |
+
num_warmup_updates: 20000 # warmup updates
|
| 16 |
+
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
| 17 |
+
max_grad_norm: 1.0
|
| 18 |
+
bnb_optimizer: False
|
| 19 |
+
|
| 20 |
+
model:
|
| 21 |
+
name: E2TTS_Small
|
| 22 |
+
tokenizer: pinyin
|
| 23 |
+
tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)
|
| 24 |
+
backbone: UNetT
|
| 25 |
+
arch:
|
| 26 |
+
dim: 768
|
| 27 |
+
depth: 20
|
| 28 |
+
heads: 12
|
| 29 |
+
ff_mult: 4
|
| 30 |
+
text_mask_padding: False
|
| 31 |
+
pe_attn_head: 1
|
| 32 |
+
mel_spec:
|
| 33 |
+
target_sample_rate: 24000
|
| 34 |
+
n_mel_channels: 100
|
| 35 |
+
hop_length: 256
|
| 36 |
+
win_length: 1024
|
| 37 |
+
n_fft: 1024
|
| 38 |
+
mel_spec_type: vocos # vocos | bigvgan
|
| 39 |
+
vocoder:
|
| 40 |
+
is_local: False # use local offline ckpt or not
|
| 41 |
+
local_path: null # local vocoder path
|
| 42 |
+
|
| 43 |
+
ckpts:
|
| 44 |
+
logger: wandb # wandb | tensorboard | null
|
| 45 |
+
log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
|
| 46 |
+
save_per_updates: 50000 # save checkpoint per updates
|
| 47 |
+
keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
|
| 48 |
+
last_per_updates: 5000 # save last checkpoint per updates
|
| 49 |
+
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
|
src/f5_tts/configs/F5TTS_Base.yaml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
|
| 5 |
+
datasets:
|
| 6 |
+
name: Emilia_ZH_EN # dataset name
|
| 7 |
+
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
| 8 |
+
batch_size_type: frame # frame | sample
|
| 9 |
+
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
| 10 |
+
num_workers: 16
|
| 11 |
+
|
| 12 |
+
optim:
|
| 13 |
+
epochs: 11
|
| 14 |
+
learning_rate: 7.5e-5
|
| 15 |
+
num_warmup_updates: 20000 # warmup updates
|
| 16 |
+
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
| 17 |
+
max_grad_norm: 1.0 # gradient clipping
|
| 18 |
+
bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
|
| 19 |
+
|
| 20 |
+
model:
|
| 21 |
+
name: F5TTS_Base # model name
|
| 22 |
+
tokenizer: pinyin # tokenizer type
|
| 23 |
+
tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)
|
| 24 |
+
backbone: DiT
|
| 25 |
+
arch:
|
| 26 |
+
dim: 1024
|
| 27 |
+
depth: 22
|
| 28 |
+
heads: 16
|
| 29 |
+
ff_mult: 2
|
| 30 |
+
text_dim: 512
|
| 31 |
+
text_mask_padding: False
|
| 32 |
+
conv_layers: 4
|
| 33 |
+
pe_attn_head: 1
|
| 34 |
+
attn_backend: torch # torch | flash_attn
|
| 35 |
+
attn_mask_enabled: False
|
| 36 |
+
checkpoint_activations: False # recompute activations and save memory for extra compute
|
| 37 |
+
mel_spec:
|
| 38 |
+
target_sample_rate: 24000
|
| 39 |
+
n_mel_channels: 100
|
| 40 |
+
hop_length: 256
|
| 41 |
+
win_length: 1024
|
| 42 |
+
n_fft: 1024
|
| 43 |
+
mel_spec_type: vocos # vocos | bigvgan
|
| 44 |
+
vocoder:
|
| 45 |
+
is_local: False # use local offline ckpt or not
|
| 46 |
+
local_path: null # local vocoder path
|
| 47 |
+
|
| 48 |
+
ckpts:
|
| 49 |
+
logger: wandb # wandb | tensorboard | null
|
| 50 |
+
log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
|
| 51 |
+
save_per_updates: 50000 # save checkpoint per updates
|
| 52 |
+
keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
|
| 53 |
+
last_per_updates: 5000 # save last checkpoint per updates
|
| 54 |
+
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
|
src/f5_tts/configs/F5TTS_Small.yaml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
|
| 5 |
+
datasets:
|
| 6 |
+
name: Emilia_ZH_EN
|
| 7 |
+
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
| 8 |
+
batch_size_type: frame # frame | sample
|
| 9 |
+
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
| 10 |
+
num_workers: 16
|
| 11 |
+
|
| 12 |
+
optim:
|
| 13 |
+
epochs: 11 # only suitable for Emilia, if you want to train it on LibriTTS, set epoch 686
|
| 14 |
+
learning_rate: 7.5e-5
|
| 15 |
+
num_warmup_updates: 20000 # warmup updates
|
| 16 |
+
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
| 17 |
+
max_grad_norm: 1.0 # gradient clipping
|
| 18 |
+
bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
|
| 19 |
+
|
| 20 |
+
model:
|
| 21 |
+
name: F5TTS_Small
|
| 22 |
+
tokenizer: pinyin
|
| 23 |
+
tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)
|
| 24 |
+
backbone: DiT
|
| 25 |
+
arch:
|
| 26 |
+
dim: 768
|
| 27 |
+
depth: 18
|
| 28 |
+
heads: 12
|
| 29 |
+
ff_mult: 2
|
| 30 |
+
text_dim: 512
|
| 31 |
+
text_mask_padding: False
|
| 32 |
+
conv_layers: 4
|
| 33 |
+
pe_attn_head: 1
|
| 34 |
+
attn_backend: torch # torch | flash_attn
|
| 35 |
+
attn_mask_enabled: False
|
| 36 |
+
checkpoint_activations: False # recompute activations and save memory for extra compute
|
| 37 |
+
mel_spec:
|
| 38 |
+
target_sample_rate: 24000
|
| 39 |
+
n_mel_channels: 100
|
| 40 |
+
hop_length: 256
|
| 41 |
+
win_length: 1024
|
| 42 |
+
n_fft: 1024
|
| 43 |
+
mel_spec_type: vocos # vocos | bigvgan
|
| 44 |
+
vocoder:
|
| 45 |
+
is_local: False # use local offline ckpt or not
|
| 46 |
+
local_path: null # local vocoder path
|
| 47 |
+
|
| 48 |
+
ckpts:
|
| 49 |
+
logger: wandb # wandb | tensorboard | null
|
| 50 |
+
log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
|
| 51 |
+
save_per_updates: 50000 # save checkpoint per updates
|
| 52 |
+
keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
|
| 53 |
+
last_per_updates: 5000 # save last checkpoint per updates
|
| 54 |
+
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
|
src/f5_tts/configs/F5TTS_v1_Base.yaml
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
|
| 5 |
+
datasets:
|
| 6 |
+
name: Emilia_ZH_EN # dataset name
|
| 7 |
+
batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
|
| 8 |
+
batch_size_type: frame # frame | sample
|
| 9 |
+
max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
|
| 10 |
+
num_workers: 16
|
| 11 |
+
|
| 12 |
+
optim:
|
| 13 |
+
epochs: 11
|
| 14 |
+
learning_rate: 7.5e-5
|
| 15 |
+
num_warmup_updates: 20000 # warmup updates
|
| 16 |
+
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
| 17 |
+
max_grad_norm: 1.0 # gradient clipping
|
| 18 |
+
bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
|
| 19 |
+
|
| 20 |
+
model:
|
| 21 |
+
name: F5TTS_v1_Base # model name
|
| 22 |
+
tokenizer: pinyin # tokenizer type
|
| 23 |
+
tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)
|
| 24 |
+
backbone: DiT
|
| 25 |
+
arch:
|
| 26 |
+
dim: 1024
|
| 27 |
+
depth: 22
|
| 28 |
+
heads: 16
|
| 29 |
+
ff_mult: 2
|
| 30 |
+
text_dim: 512
|
| 31 |
+
text_mask_padding: True
|
| 32 |
+
qk_norm: null # null | rms_norm
|
| 33 |
+
conv_layers: 4
|
| 34 |
+
pe_attn_head: null
|
| 35 |
+
attn_backend: torch # torch | flash_attn
|
| 36 |
+
attn_mask_enabled: False
|
| 37 |
+
checkpoint_activations: False # recompute activations and save memory for extra compute
|
| 38 |
+
mel_spec:
|
| 39 |
+
target_sample_rate: 24000
|
| 40 |
+
n_mel_channels: 100
|
| 41 |
+
hop_length: 256
|
| 42 |
+
win_length: 1024
|
| 43 |
+
n_fft: 1024
|
| 44 |
+
mel_spec_type: vocos # vocos | bigvgan
|
| 45 |
+
vocoder:
|
| 46 |
+
is_local: False # use local offline ckpt or not
|
| 47 |
+
local_path: null # local vocoder path
|
| 48 |
+
|
| 49 |
+
ckpts:
|
| 50 |
+
logger: wandb # wandb | tensorboard | null
|
| 51 |
+
log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
|
| 52 |
+
save_per_updates: 50000 # save checkpoint per updates
|
| 53 |
+
keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
|
| 54 |
+
last_per_updates: 5000 # save last checkpoint per updates
|
| 55 |
+
save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
|
src/f5_tts/eval/README.md
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Evaluation
|
| 3 |
+
|
| 4 |
+
Install packages for evaluation:
|
| 5 |
+
|
| 6 |
+
```bash
|
| 7 |
+
pip install -e .[eval]
|
| 8 |
+
```
|
| 9 |
+
|
| 10 |
+
## Generating Samples for Evaluation
|
| 11 |
+
|
| 12 |
+
### Prepare Test Datasets
|
| 13 |
+
|
| 14 |
+
1. *Seed-TTS testset*: Download from [seed-tts-eval](https://github.com/BytedanceSpeech/seed-tts-eval).
|
| 15 |
+
2. *LibriSpeech test-clean*: Download from [OpenSLR](http://www.openslr.org/12/).
|
| 16 |
+
3. Unzip the downloaded datasets and place them in the `data/` directory.
|
| 17 |
+
4. Our filtered LibriSpeech-PC 4-10s subset: `data/librispeech_pc_test_clean_cross_sentence.lst`
|
| 18 |
+
|
| 19 |
+
### Batch Inference for Test Set
|
| 20 |
+
|
| 21 |
+
To run batch inference for evaluations, execute the following commands:
|
| 22 |
+
|
| 23 |
+
```bash
|
| 24 |
+
# if not setup accelerate config yet
|
| 25 |
+
accelerate config
|
| 26 |
+
|
| 27 |
+
# if only perform inference
|
| 28 |
+
bash src/f5_tts/eval/eval_infer_batch.sh --infer-only
|
| 29 |
+
|
| 30 |
+
# if inference and with corresponding evaluation, setup the following tools first
|
| 31 |
+
bash src/f5_tts/eval/eval_infer_batch.sh
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
## Objective Evaluation on Generated Results
|
| 35 |
+
|
| 36 |
+
### Download Evaluation Model Checkpoints
|
| 37 |
+
|
| 38 |
+
1. Chinese ASR Model: [Paraformer-zh](https://huggingface.co/funasr/paraformer-zh)
|
| 39 |
+
2. English ASR Model: [Faster-Whisper](https://huggingface.co/Systran/faster-whisper-large-v3)
|
| 40 |
+
3. WavLM Model: Download from [Google Drive](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view).
|
| 41 |
+
|
| 42 |
+
> [!NOTE]
|
| 43 |
+
> ASR model will be automatically downloaded if `--local` not set for evaluation scripts.
|
| 44 |
+
> Otherwise, you should update the `asr_ckpt_dir` path values in `eval_librispeech_test_clean.py` or `eval_seedtts_testset.py`.
|
| 45 |
+
>
|
| 46 |
+
> WavLM model must be downloaded and your `wavlm_ckpt_dir` path updated in `eval_librispeech_test_clean.py` and `eval_seedtts_testset.py`.
|
| 47 |
+
|
| 48 |
+
### Objective Evaluation Examples
|
| 49 |
+
|
| 50 |
+
Update the path with your batch-inferenced results, and carry out WER / SIM / UTMOS evaluations:
|
| 51 |
+
```bash
|
| 52 |
+
# Evaluation [WER] for Seed-TTS test [ZH] set
|
| 53 |
+
python src/f5_tts/eval/eval_seedtts_testset.py --eval_task wer --lang zh --gen_wav_dir <GEN_WAV_DIR> --gpu_nums 8
|
| 54 |
+
|
| 55 |
+
# Evaluation [SIM] for LibriSpeech-PC test-clean (cross-sentence)
|
| 56 |
+
python src/f5_tts/eval/eval_librispeech_test_clean.py --eval_task sim --gen_wav_dir <GEN_WAV_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH>
|
| 57 |
+
|
| 58 |
+
# Evaluation [UTMOS]. --ext: Audio extension
|
| 59 |
+
python src/f5_tts/eval/eval_utmos.py --audio_dir <WAV_DIR> --ext wav
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
> [!NOTE]
|
| 63 |
+
> Evaluation results can also be found in `_*_results.jsonl` files saved in `<GEN_WAV_DIR>`/`<WAV_DIR>`.
|
src/f5_tts/eval/__pycache__/compare_checkpoints.cpython-311.pyc
ADDED
|
Binary file (9.67 kB). View file
|
|
|
src/f5_tts/eval/__pycache__/eval_bengali.cpython-311.pyc
ADDED
|
Binary file (15.1 kB). View file
|
|
|
src/f5_tts/eval/__pycache__/gen_bengali_batch.cpython-311.pyc
ADDED
|
Binary file (6.8 kB). View file
|
|
|
src/f5_tts/eval/__pycache__/gen_elevenlabs_batch.cpython-311.pyc
ADDED
|
Binary file (5.58 kB). View file
|
|
|
src/f5_tts/eval/__pycache__/gen_indicf5_batch.cpython-311.pyc
ADDED
|
Binary file (8.08 kB). View file
|
|
|
src/f5_tts/eval/compare_checkpoints.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compare checkpoints on stimulai53 and plot results."""
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import csv
|
| 5 |
+
import json
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
import matplotlib.pyplot as plt
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def run_gen_eval(ckpt_file, output_dir, testset="stimulai53"):
    """Run batch generation with one checkpoint, then evaluate the outputs.

    The generation script always writes into examples/<testset>, so the wavs
    are moved into `output_dir` afterwards to keep one folder per checkpoint.
    The evaluation result JSON is renamed to carry the output directory name.
    """
    staging_dir = Path(f"examples/{testset}")
    target_dir = Path(output_dir)

    # Remove wavs left over from a previous run so generation starts clean.
    if staging_dir.exists():
        for wav in staging_dir.glob("*.wav"):
            wav.unlink()

    generation_cmd = [
        sys.executable, "-m", "src.f5_tts.eval.gen_bengali_batch",
        "--testset", testset,
        "--ckpt_file", ckpt_file,
        "--use_ema",
    ]
    subprocess.run(generation_cmd, check=True)

    # Collect the freshly generated wavs under the checkpoint-specific folder.
    target_dir.mkdir(parents=True, exist_ok=True)
    for wav in staging_dir.glob("*.wav"):
        wav.rename(target_dir / wav.name)

    evaluation_cmd = [
        sys.executable, "-m", "src.f5_tts.eval.eval_bengali",
        "--testset", testset,
        "--gen_dir", str(target_dir),
    ]
    subprocess.run(evaluation_cmd, check=True)

    # Tag the result file with the output dir name (which encodes the step).
    produced = Path(f"results/bengali_eval_{testset}.json")
    renamed = Path(f"results/bengali_eval_{target_dir.name}.json")
    if produced.exists():
        produced.rename(renamed)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def save_csv(results, output_path="results/checkpoint_comparison.csv"):
    """Write per-step CER/SIM/UTMOS metrics to a CSV file, sorted by step.

    Args:
        results: Mapping of training step -> dict with "cer", "sim", "utmos".
        output_path: Destination CSV path.
    """
    with open(output_path, 'w', newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(['step', 'cer', 'sim', 'utmos'])
        for step in sorted(results):
            metrics = results[step]
            writer.writerow([step, metrics['cer'], metrics['sim'], metrics['utmos']])
    print(f"CSV saved to {output_path}")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def plot_results(results, output_path="results/checkpoint_comparison.png"):
    """Plot CER, SIM, and UTMOS across checkpoints as three stacked subplots.

    Fix: the original duplicated the identical plot/label/annotate sequence
    three times; the metric descriptions are now data and the plotting runs
    in a single loop, so the three panels cannot drift apart.

    Args:
        results: Mapping of training step -> dict with "cer", "sim", "utmos".
        output_path: Destination PNG path.
    """
    steps = sorted(results.keys())
    # (metric key, subplot title, line color) — one subplot per entry.
    metrics = [
        ("cer", "Character Error Rate (lower=better)", "tab:red"),
        ("sim", "Speaker Similarity (higher=better)", "tab:blue"),
        ("utmos", "Audio Quality (higher=better)", "tab:green"),
    ]

    fig, axes = plt.subplots(3, 1, figsize=(8, 10))
    for ax, (key, title, color) in zip(axes, metrics):
        values = [results[s][key] for s in steps]
        ax.plot(steps, values, 'o-', color=color, linewidth=2, markersize=8)
        ax.set_xlabel('Training Steps')
        ax.set_ylabel(key.upper())
        ax.set_title(title)
        ax.grid(True, alpha=0.3)
        # Annotate each point with its value, slightly above the marker.
        for x, y in zip(steps, values):
            ax.annotate(f'{y:.4f}', (x, y), textcoords="offset points", xytext=(0, 10), ha='center')

    plt.tight_layout()
    plt.savefig(output_path, dpi=150)
    print(f"Plot saved to {output_path}")
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def main():
    """Generate + evaluate every requested checkpoint, then summarize.

    For each training step, runs generation/evaluation via run_gen_eval,
    loads the produced JSON, prints a summary table, and writes the CSV
    and comparison plot.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ckpt_dir", default="ckpts/bengali_300h")
    parser.add_argument("--testset", default="stimulai53")
    parser.add_argument("--steps", nargs="+", type=int, default=[10000, 20000, 30000, 40000, 50000])
    args = parser.parse_args()

    metrics_by_step = {}

    for step in args.steps:
        checkpoint = f"{args.ckpt_dir}/model_{step}.pt"
        wav_dir = f"examples/{args.testset}_{step}"
        result_path = f"results/bengali_eval_{args.testset}_{step}.json"

        print(f"\n{'='*50}")
        print(f"Processing step {step}")
        print(f"{'='*50}")

        run_gen_eval(checkpoint, wav_dir, args.testset)

        # run_gen_eval renames the eval output to include the step suffix.
        with open(result_path) as handle:
            summary = json.load(handle)
        metrics_by_step[step] = {
            "cer": summary["avg_cer"],
            "sim": summary["avg_sim"],
            "utmos": summary["avg_utmos"],
        }

        print(f"Step {step}: CER={summary['avg_cer']:.4f}, SIM={summary['avg_sim']:.4f}, UTMOS={summary['avg_utmos']:.4f}")

    # Summary table across all evaluated checkpoints.
    print(f"\n{'='*50}")
    print("Summary")
    print(f"{'='*50}")
    print(f"{'Step':>10} {'CER':>10} {'SIM':>10} {'UTMOS':>10}")
    for step in sorted(metrics_by_step):
        row = metrics_by_step[step]
        print(f"{step:>10} {row['cer']:>10.4f} {row['sim']:>10.4f} {row['utmos']:>10.4f}")

    save_csv(metrics_by_step)
    plot_results(metrics_by_step)


if __name__ == "__main__":
    main()
|
src/f5_tts/eval/ecapa_tdnn.py
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# just for speaker similarity evaluation, third-party code
|
| 2 |
+
|
| 3 |
+
# From https://github.com/microsoft/UniSpeech/blob/main/downstreams/speaker_verification/models/
|
| 4 |
+
# part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
""" Res2Conv1d + BatchNorm1d + ReLU
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class Res2Conv1dReluBn(nn.Module):
    """Res2Net-style grouped Conv1d -> ReLU -> BatchNorm1d block.

    Expects in_channels == out_channels == channels. The channel dimension is
    split into `scale` equal chunks which are processed hierarchically: each
    branch convolves its chunk plus the previous branch's output.
    """

    def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True, scale=4):
        super().__init__()
        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
        self.scale = scale
        self.width = channels // scale
        # With scale > 1 the last chunk is passed through untouched.
        self.nums = scale if scale == 1 else scale - 1

        self.convs = nn.ModuleList(
            nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias)
            for _ in range(self.nums)
        )
        self.bns = nn.ModuleList(nn.BatchNorm1d(self.width) for _ in range(self.nums))

    def forward(self, x):
        chunks = torch.split(x, self.width, 1)
        processed = []
        sp = None
        for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
            # First branch sees its raw chunk; later ones add the previous output.
            sp = chunks[i] if i == 0 else sp + chunks[i]
            # Order: conv -> relu -> bn
            sp = bn(F.relu(conv(sp)))
            processed.append(sp)
        if self.scale != 1:
            processed.append(chunks[self.nums])
        return torch.cat(processed, dim=1)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
""" Conv1d + BatchNorm1d + ReLU
|
| 57 |
+
"""
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class Conv1dReluBn(nn.Module):
    """Conv1d followed by ReLU activation and BatchNorm1d."""

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True):
        super().__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
        self.bn = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        activated = F.relu(self.conv(x))
        return self.bn(activated)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
""" The SE connection of 1D case.
|
| 71 |
+
"""
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class SE_Connect(nn.Module):
    """Squeeze-and-excitation gating for 1D feature maps (B, C, T)."""

    def __init__(self, channels, se_bottleneck_dim=128):
        super().__init__()
        self.linear1 = nn.Linear(channels, se_bottleneck_dim)
        self.linear2 = nn.Linear(se_bottleneck_dim, channels)

    def forward(self, x):
        # Squeeze: global average over the time axis.
        squeezed = x.mean(dim=2)
        # Excite: bottleneck MLP producing per-channel gates in (0, 1).
        gates = torch.sigmoid(self.linear2(F.relu(self.linear1(squeezed))))
        # Rescale the input channel-wise.
        return x * gates.unsqueeze(2)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
""" SE-Res2Block of the ECAPA-TDNN architecture.
|
| 90 |
+
"""
|
| 91 |
+
|
| 92 |
+
# def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
|
| 93 |
+
# return nn.Sequential(
|
| 94 |
+
# Conv1dReluBn(channels, 512, kernel_size=1, stride=1, padding=0),
|
| 95 |
+
# Res2Conv1dReluBn(512, kernel_size, stride, padding, dilation, scale=scale),
|
| 96 |
+
# Conv1dReluBn(512, channels, kernel_size=1, stride=1, padding=0),
|
| 97 |
+
# SE_Connect(channels)
|
| 98 |
+
# )
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
class SE_Res2Block(nn.Module):
    """SE-Res2Block of the ECAPA-TDNN architecture.

    1x1 conv -> Res2 conv -> 1x1 conv -> SE gating, with a residual
    connection (projected by a 1x1 conv when the channel count changes).
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, scale, se_bottleneck_dim):
        super().__init__()
        self.Conv1dReluBn1 = Conv1dReluBn(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.Res2Conv1dReluBn = Res2Conv1dReluBn(out_channels, kernel_size, stride, padding, dilation, scale=scale)
        self.Conv1dReluBn2 = Conv1dReluBn(out_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)

        # Project the residual path only when channel counts differ.
        self.shortcut = None
        if in_channels != out_channels:
            self.shortcut = nn.Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
            )

    def forward(self, x):
        residual = x if self.shortcut is None else self.shortcut(x)
        out = self.Conv1dReluBn1(x)
        out = self.Res2Conv1dReluBn(out)
        out = self.Conv1dReluBn2(out)
        out = self.SE_Connect(out)
        return out + residual
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
""" Attentive weighted mean and standard deviation pooling.
|
| 131 |
+
"""
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
class AttentiveStatsPool(nn.Module):
    """Attentive weighted mean and standard deviation pooling.

    Produces a 2*in_dim utterance vector: attention-weighted mean and
    attention-weighted std over the time axis.
    """

    def __init__(self, in_dim, attention_channels=128, global_context_att=False):
        super().__init__()
        self.global_context_att = global_context_att

        # Conv1d with kernel_size 1 acts like a Linear layer without needing
        # to transpose the (B, C, T) input.
        if global_context_att:
            self.linear1 = nn.Conv1d(in_dim * 3, attention_channels, kernel_size=1)  # equals W and b in the paper
        else:
            self.linear1 = nn.Conv1d(in_dim, attention_channels, kernel_size=1)  # equals W and b in the paper
        self.linear2 = nn.Conv1d(attention_channels, in_dim, kernel_size=1)  # equals V and k in the paper

    def forward(self, x):
        if self.global_context_att:
            # Append utterance-level mean/std as extra context channels.
            context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
            context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
            attn_input = torch.cat((x, context_mean, context_std), dim=1)
        else:
            attn_input = x

        # tanh (not ReLU) here: the original authors found ReLU hard to converge.
        scores = torch.tanh(self.linear1(attn_input))
        alpha = torch.softmax(self.linear2(scores), dim=2)
        mean = torch.sum(alpha * x, dim=2)
        # Weighted variance via E[x^2] - E[x]^2; clamp guards the sqrt.
        variance = torch.sum(alpha * (x**2), dim=2) - mean**2
        std = torch.sqrt(variance.clamp(min=1e-9))
        return torch.cat([mean, std], dim=1)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
class ECAPA_TDNN(nn.Module):
    """ECAPA-TDNN speaker-embedding network on top of an s3prl feature extractor.

    Loads an upstream feature model (default WavLM-Large) via torch.hub,
    learns a softmax-weighted combination of its hidden layers, and maps the
    features through TDNN/SE-Res2 layers and attentive stats pooling to a
    fixed-size speaker embedding.

    Args:
        feat_dim: Channel dimension of the extracted features fed to layer1.
        channels: Width of the TDNN layers (the final cat layer is fixed at 1536).
        emb_dim: Output embedding dimension.
        global_context_att: Whether the pooling uses global mean/std context.
        feat_type: s3prl upstream name, or "fbank"/"mfcc" for classic features.
        sr: Sample rate assumed for the dummy probe in get_feat_num.
        feature_selection: Key selecting which s3prl output to use.
        update_extract: If False, the upstream extractor is frozen.
        config_path: Optional config forwarded to a local s3prl checkout.
    """

    def __init__(
        self,
        feat_dim=80,
        channels=512,
        emb_dim=192,
        global_context_att=False,
        feat_type="wavlm_large",
        sr=16000,
        feature_selection="hidden_states",
        update_extract=False,
        config_path=None,
    ):
        super().__init__()

        self.feat_type = feat_type
        self.feature_selection = feature_selection
        self.update_extract = update_extract
        self.sr = sr

        # Bypass torch.hub's forked-repo check so a cached local s3prl loads.
        torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
        try:
            # Prefer a local s3prl checkout; fall back to fetching from GitHub.
            local_s3prl_path = os.path.expanduser("~/.cache/torch/hub/s3prl_s3prl_main")
            self.feature_extract = torch.hub.load(local_s3prl_path, feat_type, source="local", config_path=config_path)
        except:  # noqa: E722
            self.feature_extract = torch.hub.load("s3prl/s3prl", feat_type)

        # On 24-layer upstreams, disable fp32 attention where the attribute
        # exists (layers 23 and 11) — presumably for speed/memory; third-party
        # code, exact motivation not documented here.
        if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
            self.feature_extract.model.encoder.layers[23].self_attn, "fp32_attention"
        ):
            self.feature_extract.model.encoder.layers[23].self_attn.fp32_attention = False
        if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
            self.feature_extract.model.encoder.layers[11].self_attn, "fp32_attention"
        ):
            self.feature_extract.model.encoder.layers[11].self_attn.fp32_attention = False

        # Learnable per-layer weights over the upstream's hidden states.
        self.feat_num = self.get_feat_num()
        self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))

        if feat_type != "fbank" and feat_type != "mfcc":
            # Always freeze pretraining-specific heads of the upstream model.
            freeze_list = ["final_proj", "label_embs_concat", "mask_emb", "project_q", "quantizer"]
            for name, param in self.feature_extract.named_parameters():
                for freeze_val in freeze_list:
                    if freeze_val in name:
                        param.requires_grad = False
                        break

        if not self.update_extract:
            # Freeze the whole extractor unless fine-tuning is requested.
            for param in self.feature_extract.parameters():
                param.requires_grad = False

        self.instance_norm = nn.InstanceNorm1d(feat_dim)
        # self.channels = [channels] * 4 + [channels * 3]
        self.channels = [channels] * 4 + [1536]

        self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
        self.layer2 = SE_Res2Block(
            self.channels[0],
            self.channels[1],
            kernel_size=3,
            stride=1,
            padding=2,
            dilation=2,
            scale=8,
            se_bottleneck_dim=128,
        )
        self.layer3 = SE_Res2Block(
            self.channels[1],
            self.channels[2],
            kernel_size=3,
            stride=1,
            padding=3,
            dilation=3,
            scale=8,
            se_bottleneck_dim=128,
        )
        self.layer4 = SE_Res2Block(
            self.channels[2],
            self.channels[3],
            kernel_size=3,
            stride=1,
            padding=4,
            dilation=4,
            scale=8,
            se_bottleneck_dim=128,
        )

        # self.conv = nn.Conv1d(self.channels[-1], self.channels[-1], kernel_size=1)
        # 1x1 conv fusing the concatenated outputs of layers 2-4.
        cat_channels = channels * 3
        self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
        self.pooling = AttentiveStatsPool(
            self.channels[-1], attention_channels=128, global_context_att=global_context_att
        )
        self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
        self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)

    def get_feat_num(self):
        """Probe the extractor with 1s of noise to count its hidden layers."""
        self.feature_extract.eval()
        wav = [torch.randn(self.sr).to(next(self.feature_extract.parameters()).device)]
        with torch.no_grad():
            features = self.feature_extract(wav)
        select_feature = features[self.feature_selection]
        if isinstance(select_feature, (list, tuple)):
            return len(select_feature)
        else:
            return 1

    def get_feat(self, x):
        """Extract features for a batch of waveforms and instance-normalize.

        Returns a (B, feat_dim, T) tensor; hidden states are combined with
        softmax-normalized learnable weights when the upstream returns several.
        """
        if self.update_extract:
            x = self.feature_extract([sample for sample in x])
        else:
            with torch.no_grad():
                if self.feat_type == "fbank" or self.feat_type == "mfcc":
                    x = self.feature_extract(x) + 1e-6  # B x feat_dim x time_len
                else:
                    x = self.feature_extract([sample for sample in x])

        if self.feat_type == "fbank":
            x = x.log()

        if self.feat_type != "fbank" and self.feat_type != "mfcc":
            x = x[self.feature_selection]
            if isinstance(x, (list, tuple)):
                x = torch.stack(x, dim=0)
            else:
                x = x.unsqueeze(0)
            # Softmax-weighted sum over layers; +1e-6 avoids exact zeros.
            norm_weights = F.softmax(self.feature_weight, dim=-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
            x = (norm_weights * x).sum(dim=0)
            x = torch.transpose(x, 1, 2) + 1e-6

        x = self.instance_norm(x)
        return x

    def forward(self, x):
        """Map raw waveforms to speaker embeddings of shape (B, emb_dim)."""
        x = self.get_feat(x)

        out1 = self.layer1(x)
        out2 = self.layer2(out1)
        out3 = self.layer3(out2)
        out4 = self.layer4(out3)

        # Multi-layer feature aggregation: concat layers 2-4, fuse, pool.
        out = torch.cat([out2, out3, out4], dim=1)
        out = F.relu(self.conv(out))
        out = self.bn(self.pooling(out))
        out = self.linear(out)

        return out
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
def ECAPA_TDNN_SMALL(
    feat_dim,
    emb_dim=256,
    feat_type="wavlm_large",
    sr=16000,
    feature_selection="hidden_states",
    update_extract=False,
    config_path=None,
):
    """Factory for the standard "small" ECAPA-TDNN (512-channel) configuration."""
    model_kwargs = dict(
        feat_dim=feat_dim,
        channels=512,
        emb_dim=emb_dim,
        feat_type=feat_type,
        sr=sr,
        feature_selection=feature_selection,
        update_extract=update_extract,
        config_path=config_path,
    )
    return ECAPA_TDNN(**model_kwargs)
|
src/f5_tts/eval/eval_bengali.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import librosa
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
import torchaudio
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
|
| 12 |
+
from huggingface_hub import hf_hub_download
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Test-set registry: each entry names the default directory of generated wavs,
# the ground-truth transcript file (one line per sample), and the sample count.
TESTSET_CONFIG = {
    "stimulai53": {
        "gen_dir": "examples/stimulai53",
        "text_file": "examples/BengaliStimulai53.txt",
        "n_samples": 53,
    },
    "ne200": {
        "gen_dir": "examples/ne200",
        "text_file": "examples/BengaliNamedEntity200.txt",
        "n_samples": 200,
    },
    "st200": {
        "gen_dir": "examples/st200",
        "text_file": "examples/ShortText200.txt",
        "n_samples": 200,
    },
}

# Directory of reference speaker wavs (slr_<i>.wav) used for similarity scoring.
REF_DIR = "examples/slr37"
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def load_texts(text_file):
    """Read a transcript file and return one stripped line per entry."""
    with open(text_file, "r", encoding="utf-8") as handle:
        return [line.strip() for line in handle]
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def normalize_bengali_text(text):
    """Strip punctuation (incl. the Bengali danda) and collapse whitespace."""
    import re

    without_punct = re.sub(r"[।,\.!?;:\"\'\-\(\)]", "", text)
    return " ".join(without_punct.split())
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def compute_cer(reference, hypothesis):
    """Character error rate between normalized reference and hypothesis texts."""
    from jiwer import cer

    ref_clean = normalize_bengali_text(reference)
    hyp_clean = normalize_bengali_text(hypothesis)
    # An empty reference makes CER undefined; treat it as a perfect match.
    if not ref_clean:
        return 0.0
    return cer(ref_clean, hyp_clean)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def run_asr_bengali(audio_paths, device="cuda", model_id="bengaliAI/tugstugi_bengaliai-asr_whisper-medium"):
    """Transcribe Bengali audio files with a fine-tuned Whisper model.

    Args:
        audio_paths: Iterable of wav paths to transcribe.
        device: Torch device for inference.
        model_id: HuggingFace id of the Bengali Whisper checkpoint.

    Returns:
        List of transcriptions, one per input path, in order.
    """
    from transformers import WhisperProcessor, WhisperForConditionalGeneration, GenerationConfig
    import librosa

    processor = WhisperProcessor.from_pretrained(model_id)
    model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)

    # The fine-tuned checkpoint ships an outdated generation config; replace
    # it with the upstream whisper-medium one.
    model.generation_config = GenerationConfig.from_pretrained("openai/whisper-medium")

    texts = []
    for wav_path in tqdm(audio_paths, desc="ASR"):
        # Whisper expects 16 kHz mono input.
        waveform, _sr = librosa.load(str(wav_path), sr=16000)
        features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        token_ids = model.generate(features, language="bn", task="transcribe")
        texts.append(processor.batch_decode(token_ids, skip_special_tokens=True)[0])
    return texts
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def run_sim_bengali(gen_paths, ref_paths, ckpt_dir=None, device="cuda"):
    """Cosine speaker similarity between generated and reference wavs.

    Uses the ECAPA2 speaker embedder fetched from the HuggingFace hub.
    `ckpt_dir` is accepted for interface compatibility but is unused here.
    """
    # Load the TorchScript ECAPA2 model from HuggingFace.
    model_file = hf_hub_download(repo_id="Jenthe/ECAPA2", filename="ecapa2.pt")
    embedder = torch.jit.load(model_file, map_location=device)

    def _load_16k(path):
        # ECAPA2 expects 16 kHz input; resample when needed.
        wav, sr = torchaudio.load(path)
        if sr != 16000:
            wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(wav)
        return wav.to(device)

    scores = []
    for gen_path, ref_path in tqdm(zip(gen_paths, ref_paths), desc="SIM", total=len(gen_paths)):
        gen_wav = _load_16k(gen_path)
        ref_wav = _load_16k(ref_path)

        with torch.jit.optimized_execution(False):
            gen_emb = embedder(gen_wav)
            ref_emb = embedder(ref_wav)

        scores.append(F.cosine_similarity(gen_emb, ref_emb).item())

    return scores
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def run_utmos_bengali(audio_paths, device="cuda"):
    """Predict UTMOS (speech quality MOS) for each audio file via SpeechMOS."""
    scorer = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)
    scorer = scorer.to(device)

    scores = []
    for wav_path in tqdm(audio_paths, desc="UTMOS"):
        samples, sr = librosa.load(wav_path, sr=None, mono=True)
        batch = torch.from_numpy(samples).to(device).unsqueeze(0)
        scores.append(scorer(batch, sr).item())

    return scores
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def evaluate_testset(testset_name, base_dir, gen_dir=None, device="cuda"):
    """Run CER, speaker-similarity, and UTMOS evaluation for one test set.

    Fix: if every expected generated wav was missing, the averaging below
    raised an opaque ZeroDivisionError; this now fails fast with a clear
    FileNotFoundError instead.

    Args:
        testset_name: Key into TESTSET_CONFIG.
        base_dir: Project root that config paths are relative to.
        gen_dir: Optional override for the generated-audio directory.
        device: Torch device for the metric models.

    Returns:
        Dict with average metrics plus per-sample details.

    Raises:
        FileNotFoundError: If none of the expected generated wavs exist.
    """
    config = TESTSET_CONFIG[testset_name]
    gen_dir = Path(gen_dir) if gen_dir else Path(base_dir) / config["gen_dir"]
    text_file = Path(base_dir) / config["text_file"]
    ref_dir = Path(base_dir) / REF_DIR
    n_samples = config["n_samples"]

    gen_paths = [gen_dir / f"output_{i}.wav" for i in range(n_samples)]
    ref_paths = [ref_dir / f"slr_{i}.wav" for i in range(n_samples)]
    gt_texts = load_texts(text_file)

    # Keep only samples whose generated audio actually exists, so a partially
    # finished generation run can still be evaluated.
    missing = [p for p in gen_paths if not p.exists()]
    if missing:
        print(f"Warning: {len(missing)} generated files missing")
        existing_idx = [i for i, p in enumerate(gen_paths) if p.exists()]
        gen_paths = [gen_paths[i] for i in existing_idx]
        ref_paths = [ref_paths[i] for i in existing_idx]
        gt_texts = [gt_texts[i] for i in existing_idx]

    # Guard: the averages below divide by the sample count.
    if not gen_paths:
        raise FileNotFoundError(
            f"No generated audio found in {gen_dir} for test set '{testset_name}'"
        )

    print(f"\n=== Evaluating {testset_name} ({len(gen_paths)} samples) ===")

    # ASR + CER
    print("Running ASR...")
    transcriptions = run_asr_bengali(gen_paths, device)
    cer_scores = [compute_cer(gt, hyp) for gt, hyp in zip(gt_texts, transcriptions)]

    # Speaker Similarity
    print("Running Speaker Similarity...")
    sim_scores = run_sim_bengali(gen_paths, ref_paths, device=device)

    # UTMOS
    print("Running UTMOS...")
    utmos_scores = run_utmos_bengali(gen_paths, device)

    results = {
        "testset": testset_name,
        "n_samples": len(gen_paths),
        "avg_cer": sum(cer_scores) / len(cer_scores),
        "avg_sim": sum(sim_scores) / len(sim_scores),
        "avg_utmos": sum(utmos_scores) / len(utmos_scores),
        "per_sample": [
            {
                "idx": i,
                "gt_text": gt_texts[i],
                "hyp_text": transcriptions[i],
                "cer": cer_scores[i],
                "sim": sim_scores[i],
                "utmos": utmos_scores[i],
            }
            for i in range(len(gen_paths))
        ],
    }

    return results
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def main():
    """CLI entry point: evaluate one or all Bengali test sets, save JSON results."""
    parser = argparse.ArgumentParser(description="Bengali TTS Evaluation")
    parser.add_argument("--testset", type=str, default="all",
                        choices=["stimulai53", "ne200", "st200", "all"])
    parser.add_argument("--base_dir", type=str, default=".")
    parser.add_argument("--gen_dir", type=str, default=None,
                        help="Override generated audio directory")
    parser.add_argument("--output_dir", type=str, default="results")
    parser.add_argument("--device", type=str, default="cuda")
    args = parser.parse_args()

    output_dir = Path(args.base_dir) / args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    selected = list(TESTSET_CONFIG.keys()) if args.testset == "all" else [args.testset]

    all_results = []
    for testset in selected:
        results = evaluate_testset(testset, args.base_dir, args.gen_dir, args.device)
        all_results.append(results)

        output_file = output_dir / f"bengali_eval_{testset}.json"
        with open(output_file, "w", encoding="utf-8") as handle:
            json.dump(results, handle, ensure_ascii=False, indent=2)
        print(f"Results saved to {output_file}")

        print(f"\n{testset}: CER={results['avg_cer']:.4f}, SIM={results['avg_sim']:.4f}, UTMOS={results['avg_utmos']:.4f}")

    if len(all_results) > 1:
        # Sample-weighted averages across all evaluated test sets.
        total_samples = sum(r["n_samples"] for r in all_results)
        avg_cer = sum(r["avg_cer"] * r["n_samples"] for r in all_results) / total_samples
        avg_sim = sum(r["avg_sim"] * r["n_samples"] for r in all_results) / total_samples
        avg_utmos = sum(r["avg_utmos"] * r["n_samples"] for r in all_results) / total_samples
        print(f"\n=== Overall ({total_samples} samples) ===")
        print(f"CER={avg_cer:.4f}, SIM={avg_sim:.4f}, UTMOS={avg_utmos:.4f}")


if __name__ == "__main__":
    main()
|
src/f5_tts/eval/eval_gemini.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Gemini TTS Evaluation Script for Bengali
|
| 3 |
+
|
| 4 |
+
Computes CER (via Whisper ASR) and UTMOS for Gemini-generated audio.
|
| 5 |
+
No SIM computation (Gemini doesn't do voice cloning).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import argparse
|
| 9 |
+
import json
|
| 10 |
+
import re
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
import librosa
|
| 14 |
+
import torch
|
| 15 |
+
from tqdm import tqdm
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Test-set registry: maps a short testset name to where the Gemini-generated
# wavs live, the ground-truth transcript file, and the expected sample count.
# Generated audio is assumed to be named output_{i}.wav, with i aligned to
# the line index of the transcript file.
TESTSET_CONFIG = {
    "stimulai53": {
        "gen_dir": "examples/stimulai53_gemini",        # generated audio directory
        "text_file": "examples/BengaliStimulai53.txt",  # one transcript per line
        "n_samples": 53,
    },
    "ne200": {
        "gen_dir": "examples/ne200_gemini",
        "text_file": "examples/BengaliNamedEntity200.txt",
        "n_samples": 200,
    },
    "st200": {
        "gen_dir": "examples/st200_gemini",
        "text_file": "examples/ShortText200.txt",
        "n_samples": 200,
    },
}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def load_texts(text_file):
    """Read ground-truth transcripts, one per line.

    Returns a list of stripped lines. Order is preserved (including blank
    lines, which become empty strings) so that index ``i`` stays aligned
    with the generated ``output_{i}.wav`` file.
    """
    with open(text_file, "r", encoding="utf-8") as f:
        # Iterate the file object directly instead of materializing
        # f.readlines() first — same result, no intermediate list.
        return [line.strip() for line in f]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def normalize_bengali_text(text):
    """Remove punctuation (including the Bengali danda) and collapse
    whitespace runs to single spaces, for fairer CER comparison."""
    # Punctuation first, then whitespace normalization.
    stripped = re.sub(r"[।,\.!?;:\"\'\-\(\)]", "", text)
    return " ".join(stripped.split())
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def compute_cer(reference, hypothesis):
    """Character error rate between normalized reference and hypothesis.

    Returns 0.0 when the normalized reference is empty (CER would divide
    by zero on an empty reference).
    """
    from jiwer import cer

    ref_norm, hyp_norm = (
        normalize_bengali_text(reference),
        normalize_bengali_text(hypothesis),
    )
    if not ref_norm:
        return 0.0
    return cer(ref_norm, hyp_norm)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def run_asr_bengali(audio_paths, device="cuda", model_id="bengaliAI/tugstugi_bengaliai-asr_whisper-medium"):
    """Transcribe a list of wav paths with a Bengali-finetuned Whisper model.

    Returns one transcription string per input path, in input order.
    """
    from transformers import WhisperProcessor, WhisperForConditionalGeneration, GenerationConfig

    processor = WhisperProcessor.from_pretrained(model_id)
    model = WhisperForConditionalGeneration.from_pretrained(model_id).to(device)
    # Replace the checkpoint's generation config with the base
    # openai/whisper-medium one — presumably the finetuned checkpoint's own
    # config is missing or unusable for generation (TODO confirm).
    model.generation_config = GenerationConfig.from_pretrained("openai/whisper-medium")

    transcriptions = []
    for audio_path in tqdm(audio_paths, desc="ASR"):
        # Whisper's feature extractor expects 16 kHz input.
        audio, sr = librosa.load(str(audio_path), sr=16000)
        input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        # Force Bengali transcription (no translation).
        predicted_ids = model.generate(input_features, language="bn", task="transcribe")
        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        transcriptions.append(text)
    return transcriptions
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def run_utmos(audio_paths, device="cuda"):
    """Score each wav with the UTMOS22-strong MOS predictor (via torch.hub).

    Returns one float score per input path, in input order.
    """
    predictor = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)
    predictor = predictor.to(device)

    utmos_scores = []
    for audio_path in tqdm(audio_paths, desc="UTMOS"):
        # Keep the native sample rate; the predictor takes (wav, sr).
        wav, sr = librosa.load(audio_path, sr=None, mono=True)
        wav_tensor = torch.from_numpy(wav).to(device).unsqueeze(0)  # (1, T)
        score = predictor(wav_tensor, sr)
        utmos_scores.append(score.item())

    return utmos_scores
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def evaluate_testset(testset_name, base_dir, gen_dir=None, device="cuda"):
    """Evaluate one Gemini test set: ASR-based CER plus UTMOS.

    Args:
        testset_name: key into TESTSET_CONFIG.
        base_dir: root under which the config's relative paths are resolved.
        gen_dir: optional override for the generated-audio directory.
        device: torch device string for the ASR and UTMOS models.

    Returns a dict with aggregate averages and per-sample details.
    """
    config = TESTSET_CONFIG[testset_name]
    gen_dir = Path(gen_dir) if gen_dir else Path(base_dir) / config["gen_dir"]
    text_file = Path(base_dir) / config["text_file"]
    n_samples = config["n_samples"]

    # output_{i}.wav is index-aligned with line i of the transcript file.
    gen_paths = [gen_dir / f"output_{i}.wav" for i in range(n_samples)]
    gt_texts = load_texts(text_file)

    # Skip missing generations but keep wav/text index alignment.
    missing = [p for p in gen_paths if not p.exists()]
    if missing:
        print(f"Warning: {len(missing)} generated files missing")
        existing_idx = [i for i, p in enumerate(gen_paths) if p.exists()]
        gen_paths = [gen_paths[i] for i in existing_idx]
        gt_texts = [gt_texts[i] for i in existing_idx]

    print(f"\n=== Evaluating {testset_name} ({len(gen_paths)} samples) ===")

    # ASR + CER
    print("Running ASR...")
    transcriptions = run_asr_bengali(gen_paths, device)
    cer_scores = [compute_cer(gt, hyp) for gt, hyp in zip(gt_texts, transcriptions)]

    # UTMOS
    print("Running UTMOS...")
    utmos_scores = run_utmos(gen_paths, device)

    # NOTE(review): if every generated file is missing, the averages below
    # divide by zero — confirm upstream generation always produces >= 1 file.
    results = {
        "testset": testset_name,
        "n_samples": len(gen_paths),
        "avg_cer": sum(cer_scores) / len(cer_scores),
        "avg_utmos": sum(utmos_scores) / len(utmos_scores),
        "per_sample": [
            {
                "idx": i,
                "gt_text": gt_texts[i],
                "hyp_text": transcriptions[i],
                "cer": cer_scores[i],
                "utmos": utmos_scores[i],
            }
            for i in range(len(gen_paths))
        ],
    }

    return results
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def main():
    """CLI entry: evaluate one Gemini test set and write a JSON report."""
    parser = argparse.ArgumentParser(description="Gemini TTS Evaluation (CER + UTMOS)")
    parser.add_argument("--testset", type=str, default="stimulai53",
                        choices=list(TESTSET_CONFIG.keys()))
    parser.add_argument("--base_dir", type=str, default=".")
    parser.add_argument("--gen_dir", type=str, default=None,
                        help="Override generated audio directory")
    parser.add_argument("--output_dir", type=str, default="results")
    parser.add_argument("--device", type=str, default="cuda")
    args = parser.parse_args()

    output_dir = Path(args.base_dir) / args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    results = evaluate_testset(args.testset, args.base_dir, args.gen_dir, args.device)

    # Persist the full per-sample report (non-ASCII kept readable for Bengali).
    output_file = output_dir / f"bengali_eval_{args.testset}_gemini.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"Results saved to {output_file}")

    print(f"\n{args.testset}: CER={results['avg_cer']:.4f}, UTMOS={results['avg_utmos']:.4f}")


if __name__ == "__main__":
    main()
|
src/f5_tts/eval/eval_infer_batch.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
sys.path.append(os.getcwd())
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
import time
|
| 9 |
+
from importlib.resources import files
|
| 10 |
+
|
| 11 |
+
import torch
|
| 12 |
+
import torchaudio
|
| 13 |
+
from accelerate import Accelerator
|
| 14 |
+
from hydra.utils import get_class
|
| 15 |
+
from omegaconf import OmegaConf
|
| 16 |
+
from tqdm import tqdm
|
| 17 |
+
|
| 18 |
+
from f5_tts.eval.utils_eval import (
|
| 19 |
+
get_inference_prompt,
|
| 20 |
+
get_librispeech_test_clean_metainfo,
|
| 21 |
+
get_seedtts_testset_metainfo,
|
| 22 |
+
)
|
| 23 |
+
from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder
|
| 24 |
+
from f5_tts.model import CFM
|
| 25 |
+
from f5_tts.model.utils import get_tokenizer
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# One process per GPU under `accelerate launch`; each process pins itself
# to its own CUDA device by process index.
accelerator = Accelerator()
device = f"cuda:{accelerator.process_index}"


# Inference-wide defaults.
use_ema = True     # load EMA weights from the checkpoint
target_rms = 0.1   # reference loudness used for RMS matching at save time


# Repository root, resolved relative to the installed f5_tts package.
rel_path = str(files("f5_tts").joinpath("../../"))
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def main():
    """Distributed batch inference over a benchmark test set.

    Loads the model/vocoder described by the experiment's YAML config,
    splits the prompt list across accelerate processes, synthesizes each
    utterance, and writes one wav per utterance into a results directory
    whose name encodes all sampling hyperparameters.
    """
    parser = argparse.ArgumentParser(description="batch inference")

    parser.add_argument("-s", "--seed", default=None, type=int)
    parser.add_argument("-n", "--expname", required=True)
    parser.add_argument("-c", "--ckptstep", default=1250000, type=int)

    parser.add_argument("-nfe", "--nfestep", default=32, type=int)
    parser.add_argument("-o", "--odemethod", default="euler")
    parser.add_argument("-ss", "--swaysampling", default=-1, type=float)

    parser.add_argument("-t", "--testset", required=True)
    parser.add_argument(
        "-p", "--librispeech_test_clean_path", default=f"{rel_path}/data/LibriSpeech/test-clean", type=str
    )

    parser.add_argument("--local", action="store_true", help="Use local vocoder checkpoint directory")

    args = parser.parse_args()

    seed = args.seed
    exp_name = args.expname
    ckpt_step = args.ckptstep

    nfe_step = args.nfestep
    ode_method = args.odemethod
    sway_sampling_coef = args.swaysampling

    testset = args.testset

    # Fixed evaluation settings (not exposed on the CLI).
    infer_batch_size = 1  # max frames. 1 for ddp single inference (recommended)
    cfg_strength = 2.0
    speed = 1.0
    use_truth_duration = False
    no_ref_audio = False

    # Model architecture / dataset / mel settings all come from the
    # experiment's YAML config (configs/<expname>.yaml).
    model_cfg = OmegaConf.load(str(files("f5_tts").joinpath(f"configs/{exp_name}.yaml")))
    model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
    model_arc = model_cfg.model.arch

    dataset_name = model_cfg.datasets.name
    tokenizer = model_cfg.model.tokenizer

    mel_spec_type = model_cfg.model.mel_spec.mel_spec_type
    target_sample_rate = model_cfg.model.mel_spec.target_sample_rate
    n_mel_channels = model_cfg.model.mel_spec.n_mel_channels
    hop_length = model_cfg.model.mel_spec.hop_length
    win_length = model_cfg.model.mel_spec.win_length
    n_fft = model_cfg.model.mel_spec.n_fft

    # NOTE(review): an unrecognized --testset falls through all branches and
    # leaves `metainfo` unbound, surfacing later as a NameError — consider an
    # explicit else/raise.
    if testset == "ls_pc_test_clean":
        metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
        librispeech_test_clean_path = args.librispeech_test_clean_path
        metainfo = get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path)

    elif testset == "seedtts_test_zh":
        metalst = rel_path + "/data/seedtts_testset/zh/meta.lst"
        metainfo = get_seedtts_testset_metainfo(metalst)

    elif testset == "seedtts_test_en":
        metalst = rel_path + "/data/seedtts_testset/en/meta.lst"
        metainfo = get_seedtts_testset_metainfo(metalst)

    # path to save genereted wavs
    # NOTE(review): the `_ss...` suffix is dropped whenever the coefficient is
    # falsy, i.e. exactly 0.0 — confirm that is intended for -ss 0 runs.
    output_dir = (
        f"{rel_path}/"
        f"results/{exp_name}_{ckpt_step}/{testset}/"
        f"seed{seed}_{ode_method}_nfe{nfe_step}_{mel_spec_type}"
        f"{f'_ss{sway_sampling_coef}' if sway_sampling_coef else ''}"
        f"_cfg{cfg_strength}_speed{speed}"
        f"{'_gt-dur' if use_truth_duration else ''}"
        f"{'_no-ref-audio' if no_ref_audio else ''}"
    )

    # -------------------------------------------------#

    # Pre-batch reference mels / texts / target durations for every utterance.
    prompts_all = get_inference_prompt(
        metainfo,
        speed=speed,
        tokenizer=tokenizer,
        target_sample_rate=target_sample_rate,
        n_mel_channels=n_mel_channels,
        hop_length=hop_length,
        mel_spec_type=mel_spec_type,
        target_rms=target_rms,
        use_truth_duration=use_truth_duration,
        infer_batch_size=infer_batch_size,
    )

    # Vocoder model
    local = args.local
    if mel_spec_type == "vocos":
        vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
    elif mel_spec_type == "bigvgan":
        vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
    vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path)

    # Tokenizer
    vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)

    # Model
    model = CFM(
        transformer=model_cls(**model_arc, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
        mel_spec_kwargs=dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            n_mel_channels=n_mel_channels,
            target_sample_rate=target_sample_rate,
            mel_spec_type=mel_spec_type,
        ),
        odeint_kwargs=dict(
            method=ode_method,
        ),
        vocab_char_map=vocab_char_map,
    ).to(device)

    # Checkpoint lookup: first the released location (ckpts/<exp>/), then the
    # training save_dir from the config; .pt preferred over .safetensors.
    ckpt_prefix = rel_path + f"/ckpts/{exp_name}/model_{ckpt_step}"
    if os.path.exists(ckpt_prefix + ".pt"):
        ckpt_path = ckpt_prefix + ".pt"
    elif os.path.exists(ckpt_prefix + ".safetensors"):
        ckpt_path = ckpt_prefix + ".safetensors"
    else:
        print("Loading from self-organized training checkpoints rather than released pretrained.")
        ckpt_prefix = rel_path + f"/{model_cfg.ckpts.save_dir}/model_{ckpt_step}"
        if os.path.exists(ckpt_prefix + ".pt"):
            ckpt_path = ckpt_prefix + ".pt"
        elif os.path.exists(ckpt_prefix + ".safetensors"):
            ckpt_path = ckpt_prefix + ".safetensors"
        else:
            raise ValueError("The checkpoint does not exist or cannot be found in given location.")

    # bigvgan path keeps float32; otherwise the loader's default dtype is used.
    dtype = torch.float32 if mel_spec_type == "bigvgan" else None
    model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)

    if not os.path.exists(output_dir) and accelerator.is_main_process:
        os.makedirs(output_dir)

    # start batch inference
    accelerator.wait_for_everyone()
    start = time.time()

    # Each process handles its own shard of the prompt list.
    with accelerator.split_between_processes(prompts_all) as prompts:
        for prompt in tqdm(prompts, disable=not accelerator.is_local_main_process):
            utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = prompt
            ref_mels = ref_mels.to(device)
            ref_mel_lens = torch.tensor(ref_mel_lens, dtype=torch.long).to(device)
            total_mel_lens = torch.tensor(total_mel_lens, dtype=torch.long).to(device)

            # Inference
            with torch.inference_mode():
                generated, _ = model.sample(
                    cond=ref_mels,
                    text=final_text_list,
                    duration=total_mel_lens,
                    lens=ref_mel_lens,
                    steps=nfe_step,
                    cfg_strength=cfg_strength,
                    sway_sampling_coef=sway_sampling_coef,
                    no_ref_audio=no_ref_audio,
                    seed=seed,
                )
            # Final result
            for i, gen in enumerate(generated):
                # Drop the reference-prompt frames; keep only the new speech.
                gen = gen[ref_mel_lens[i] : total_mel_lens[i], :].unsqueeze(0)
                gen_mel_spec = gen.permute(0, 2, 1).to(torch.float32)
                if mel_spec_type == "vocos":
                    generated_wave = vocoder.decode(gen_mel_spec).cpu()
                elif mel_spec_type == "bigvgan":
                    generated_wave = vocoder(gen_mel_spec).squeeze(0).cpu()

                # Undo RMS normalization for quiet references.
                if ref_rms_list[i] < target_rms:
                    generated_wave = generated_wave * ref_rms_list[i] / target_rms
                torchaudio.save(f"{output_dir}/{utts[i]}.wav", generated_wave, target_sample_rate)

    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        timediff = time.time() - start
        print(f"Done batch inference in {timediff / 60:.2f} minutes.")


if __name__ == "__main__":
    main()
|
src/f5_tts/eval/eval_infer_batch.sh
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Batch inference + evaluation driver for F5-TTS: for each checkpoint step
# and seed, run inference on every task, then launch the matching eval jobs
# (WER, SIM, UTMOS) in the background while the next inference proceeds.
set -e
export PYTHONWARNINGS="ignore::UserWarning,ignore::FutureWarning"

# Configuration parameters
MODEL_NAME="F5TTS_v1_Base"
SEEDS=(0 1 2)
CKPTSTEPS=(1250000)
TASKS=("seedtts_test_zh" "seedtts_test_en" "ls_pc_test_clean")
LS_TEST_CLEAN_PATH="data/LibriSpeech/test-clean"
GPUS="[0,1,2,3,4,5,6,7]"
OFFLINE_MODE=false

# Parse arguments
if [ $OFFLINE_MODE = true ]; then
    LOCAL="--local"
else
    LOCAL=""
fi
INFER_ONLY=false
while [[ $# -gt 0 ]]; do
    case $1 in
        --infer-only)
            INFER_ONLY=true
            shift
            ;;
        *)
            echo "======== Unknown parameter: $1"
            exit 1
            ;;
    esac
done

echo "======== Starting F5-TTS batch evaluation task..."
if [ "$INFER_ONLY" = true ]; then
    echo "======== Mode: Execute infer tasks only"
else
    echo "======== Mode: Execute full pipeline (infer + eval)"
fi

# Function: Execute eval tasks
# $1 = checkpoint step, $2 = seed, $3 = task name.
# NOTE: the directory suffix below must stay in sync with the naming scheme
# in eval_infer_batch.py (seed/ode/nfe/mel/ss/cfg/speed).
execute_eval_tasks() {
    local ckptstep=$1
    local seed=$2
    local task_name=$3

    local gen_wav_dir="results/${MODEL_NAME}_${ckptstep}/${task_name}/seed${seed}_euler_nfe32_vocos_ss-1_cfg2.0_speed1.0"

    echo ">>>>>>>> Starting eval task: ckptstep=${ckptstep}, seed=${seed}, task=${task_name}"

    case $task_name in
        "seedtts_test_zh")
            python src/f5_tts/eval/eval_seedtts_testset.py -e wer -l zh -g "$gen_wav_dir" -n "$GPUS" $LOCAL
            python src/f5_tts/eval/eval_seedtts_testset.py -e sim -l zh -g "$gen_wav_dir" -n "$GPUS" $LOCAL
            python src/f5_tts/eval/eval_utmos.py --audio_dir "$gen_wav_dir"
            ;;
        "seedtts_test_en")
            python src/f5_tts/eval/eval_seedtts_testset.py -e wer -l en -g "$gen_wav_dir" -n "$GPUS" $LOCAL
            python src/f5_tts/eval/eval_seedtts_testset.py -e sim -l en -g "$gen_wav_dir" -n "$GPUS" $LOCAL
            python src/f5_tts/eval/eval_utmos.py --audio_dir "$gen_wav_dir"
            ;;
        "ls_pc_test_clean")
            python src/f5_tts/eval/eval_librispeech_test_clean.py -e wer -g "$gen_wav_dir" -n "$GPUS" -p "$LS_TEST_CLEAN_PATH" $LOCAL
            python src/f5_tts/eval/eval_librispeech_test_clean.py -e sim -g "$gen_wav_dir" -n "$GPUS" -p "$LS_TEST_CLEAN_PATH" $LOCAL
            python src/f5_tts/eval/eval_utmos.py --audio_dir "$gen_wav_dir"
            ;;
    esac

    echo ">>>>>>>> Completed eval task: ckptstep=${ckptstep}, seed=${seed}, task=${task_name}"
}

# Main execution loop
for ckptstep in "${CKPTSTEPS[@]}"; do
    echo "======== Processing ckptstep: ${ckptstep}"

    for seed in "${SEEDS[@]}"; do
        echo "-------- Processing seed: ${seed}"

        # Store eval task PIDs for current seed (if not infer-only mode)
        if [ "$INFER_ONLY" = false ]; then
            declare -a eval_pids
        fi

        # Execute each infer task sequentially
        for task in "${TASKS[@]}"; do
            echo ">>>>>>>> Executing infer task: accelerate launch src/f5_tts/eval/eval_infer_batch.py -s ${seed} -n \"${MODEL_NAME}\" -t \"${task}\" -c ${ckptstep} $LOCAL"

            # Execute infer task (foreground execution, wait for completion)
            accelerate launch src/f5_tts/eval/eval_infer_batch.py -s ${seed} -n "${MODEL_NAME}" -t "${task}" -c ${ckptstep} -p "${LS_TEST_CLEAN_PATH}" $LOCAL

            # If not infer-only mode, launch corresponding eval task
            if [ "$INFER_ONLY" = false ]; then
                # Launch corresponding eval task (background execution, non-blocking for next infer)
                execute_eval_tasks $ckptstep $seed $task &
                eval_pids+=($!)
            fi
        done

        # If not infer-only mode, wait for all eval tasks of current seed to complete
        if [ "$INFER_ONLY" = false ]; then
            echo ">>>>>>>> All infer tasks for seed ${seed} completed, waiting for corresponding eval tasks to finish..."

            for pid in "${eval_pids[@]}"; do
                wait $pid
            done

            unset eval_pids  # Clean up array
        fi
        echo "-------- All eval tasks for seed ${seed} completed"
    done

    echo "======== Completed ckptstep: ${ckptstep}"
    echo
done

echo "======== All tasks completed!"
|
src/f5_tts/eval/eval_infer_batch_example.sh
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Worked examples of the batch-inference / evaluation commands; copy and
# adapt rather than running this file as-is.

# e.g. F5-TTS, 16 NFE
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_v1_Base" -t "seedtts_test_zh" -nfe 16
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_v1_Base" -t "seedtts_test_en" -nfe 16
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_v1_Base" -t "ls_pc_test_clean" -nfe 16 -p data/LibriSpeech/test-clean

# e.g. Vanilla E2 TTS, 32 NFE
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -c 1200000 -t "seedtts_test_zh" -o "midpoint" -ss 0
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -c 1200000 -t "seedtts_test_en" -o "midpoint" -ss 0
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -c 1200000 -t "ls_pc_test_clean" -o "midpoint" -ss 0 -p data/LibriSpeech/test-clean

# e.g. evaluate F5-TTS 16 NFE result on Seed-TTS test-zh
python src/f5_tts/eval/eval_seedtts_testset.py -e wer -l zh --gen_wav_dir results/F5TTS_v1_Base_1250000/seedtts_test_zh/seed0_euler_nfe16_vocos_ss-1_cfg2.0_speed1.0 --gpu_nums 8
python src/f5_tts/eval/eval_seedtts_testset.py -e sim -l zh --gen_wav_dir results/F5TTS_v1_Base_1250000/seedtts_test_zh/seed0_euler_nfe16_vocos_ss-1_cfg2.0_speed1.0 --gpu_nums 8
python src/f5_tts/eval/eval_utmos.py --audio_dir results/F5TTS_v1_Base_1250000/seedtts_test_zh/seed0_euler_nfe16_vocos_ss-1_cfg2.0_speed1.0

# etc.
|
src/f5_tts/eval/eval_librispeech_test_clean.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluate with Librispeech test-clean, ~3s prompt to generate 4-10s audio (the way of valle/voicebox evaluation)
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import ast
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
sys.path.append(os.getcwd())
|
| 11 |
+
|
| 12 |
+
import multiprocessing as mp
|
| 13 |
+
from importlib.resources import files
|
| 14 |
+
|
| 15 |
+
import numpy as np
|
| 16 |
+
|
| 17 |
+
from f5_tts.eval.utils_eval import get_librispeech_test, run_asr_wer, run_sim
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
rel_path = str(files("f5_tts").joinpath("../../"))
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def get_args():
    """Parse CLI options for the LibriSpeech test-clean evaluation."""
    p = argparse.ArgumentParser()
    p.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
    p.add_argument("-l", "--lang", type=str, default="en")
    p.add_argument("-g", "--gen_wav_dir", type=str, required=True)
    p.add_argument("-p", "--librispeech_test_clean_path", type=str, required=True)
    p.add_argument(
        "-n",
        "--gpu_nums",
        type=str,
        default="8",
        help="Number of GPUs to use (e.g., 8) or GPU list (e.g., [0,1,2,3])",
    )
    p.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
    return p.parse_args()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def parse_gpu_nums(gpu_nums_str):
    """Turn "8" into [0..7] and "[0,1,2,3]" into that literal GPU-id list."""
    try:
        looks_like_list = gpu_nums_str.startswith("[") and gpu_nums_str.endswith("]")
        if looks_like_list:
            parsed = ast.literal_eval(gpu_nums_str)
            if isinstance(parsed, list):
                return parsed
        # Otherwise (or if the literal wasn't a list) treat it as a GPU count.
        return list(range(int(gpu_nums_str)))
    except (ValueError, SyntaxError):
        raise argparse.ArgumentTypeError(
            f"Invalid GPU specification: {gpu_nums_str}. Use a number (e.g., 8) or a list (e.g., [0,1,2,3])"
        )
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def main():
    """Evaluate generated wavs against LibriSpeech test-clean: WER or SIM.

    Shards the test set over the requested GPUs via multiprocessing, then
    writes per-sample results plus the mean metric to a .jsonl next to the
    generated wavs.
    """
    args = get_args()
    eval_task = args.eval_task
    lang = args.lang
    librispeech_test_clean_path = args.librispeech_test_clean_path  # test-clean path
    gen_wav_dir = args.gen_wav_dir
    metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"

    gpus = parse_gpu_nums(args.gpu_nums)
    # test_set is a list of (gpu_rank, sub_test_set) shards.
    test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path)

    ## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
    ## leading to a low similarity for the ground truth in some cases.
    # test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth = True) # eval ground truth

    local = args.local
    if local:  # use local custom checkpoint dir
        asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
    else:
        asr_ckpt_dir = ""  # auto download to cache dir
    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"

    # --------------------------------------------------------------------------

    full_results = []
    metrics = []

    # NOTE(review): `args` (the argparse namespace) is shadowed below by the
    # worker-argument tuples — rename if this function grows.
    if eval_task == "wer":
        with mp.Pool(processes=len(gpus)) as pool:
            args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
            results = pool.map(run_asr_wer, args)
            for r in results:
                full_results.extend(r)
    elif eval_task == "sim":
        with mp.Pool(processes=len(gpus)) as pool:
            args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
            results = pool.map(run_sim, args)
            for r in results:
                full_results.extend(r)
    else:
        raise ValueError(f"Unknown metric type: {eval_task}")

    # Per-sample lines, then the aggregate metric appended at the end.
    result_path = f"{gen_wav_dir}/_{eval_task}_results.jsonl"
    with open(result_path, "w") as f:
        for line in full_results:
            metrics.append(line[eval_task])
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
        metric = round(np.mean(metrics), 5)
        f.write(f"\n{eval_task.upper()}: {metric}\n")

    print(f"\nTotal {len(metrics)} samples")
    print(f"{eval_task.upper()}: {metric}")
    print(f"{eval_task.upper()} results saved to {result_path}")


if __name__ == "__main__":
    main()
|
src/f5_tts/eval/eval_seedtts_testset.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluate with Seed-TTS testset
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import ast
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
sys.path.append(os.getcwd())
|
| 11 |
+
|
| 12 |
+
import multiprocessing as mp
|
| 13 |
+
from importlib.resources import files
|
| 14 |
+
|
| 15 |
+
import numpy as np
|
| 16 |
+
|
| 17 |
+
from f5_tts.eval.utils_eval import get_seed_tts_test, run_asr_wer, run_sim
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
rel_path = str(files("f5_tts").joinpath("../../"))
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def get_args():
    """Parse CLI options for the Seed-TTS testset evaluation run."""
    ap = argparse.ArgumentParser()
    ap.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
    ap.add_argument("-l", "--lang", type=str, default="en", choices=["zh", "en"])
    ap.add_argument("-g", "--gen_wav_dir", type=str, required=True)
    ap.add_argument(
        "-n", "--gpu_nums", type=str, default="8", help="Number of GPUs to use (e.g., 8) or GPU list (e.g., [0,1,2,3])"
    )
    ap.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
    return ap.parse_args()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def parse_gpu_nums(gpu_nums_str):
    """Turn a GPU spec string into a list of GPU indices.

    Accepts either a count ("8" -> [0..7]) or an explicit bracketed
    list ("[0,1,2,3]" -> [0, 1, 2, 3]).
    """
    try:
        looks_like_list = gpu_nums_str.startswith("[") and gpu_nums_str.endswith("]")
        if looks_like_list:
            parsed = ast.literal_eval(gpu_nums_str)
            if isinstance(parsed, list):
                return parsed
        # Fall back to interpreting the spec as a plain GPU count.
        return list(range(int(gpu_nums_str)))
    except (ValueError, SyntaxError):
        raise argparse.ArgumentTypeError(
            f"Invalid GPU specification: {gpu_nums_str}. Use a number (e.g., 8) or a list (e.g., [0,1,2,3])"
        )
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def main():
    """Run WER or SIM evaluation over a directory of generated Seed-TTS testset wavs.

    One worker process per GPU computes the per-utterance metric; results are
    aggregated, written as JSONL next to the generated audio, and the mean
    metric is appended to the file and printed.
    """
    args = get_args()
    eval_task = args.eval_task
    lang = args.lang
    gen_wav_dir = args.gen_wav_dir
    metalst = rel_path + f"/data/seedtts_testset/{lang}/meta.lst"  # seed-tts testset

    # NOTE. paraformer-zh result will be slightly different according to the number of gpus, cuz batchsize is different
    # zh 1.254 seems a result of 4 workers wer_seed_tts
    gpus = parse_gpu_nums(args.gpu_nums)
    test_set = get_seed_tts_test(metalst, gen_wav_dir, gpus)

    local = args.local
    if local:  # use local custom checkpoint dir
        if lang == "zh":
            asr_ckpt_dir = "../checkpoints/funasr"  # paraformer-zh dir under funasr
        elif lang == "en":
            asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
    else:
        asr_ckpt_dir = ""  # auto download to cache dir
    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"

    # --------------------------------------------------------------------------

    full_results = []  # per-utterance result dicts from all workers
    metrics = []  # per-utterance metric values, for the mean

    if eval_task == "wer":
        with mp.Pool(processes=len(gpus)) as pool:
            # NOTE(review): rebinding `args` here shadows the argparse namespace above.
            args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
            results = pool.map(run_asr_wer, args)
            for r in results:
                full_results.extend(r)
    elif eval_task == "sim":
        with mp.Pool(processes=len(gpus)) as pool:
            args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
            results = pool.map(run_sim, args)
            for r in results:
                full_results.extend(r)
    else:
        raise ValueError(f"Unknown metric type: {eval_task}")

    # One JSON object per line, then a trailing summary line with the mean.
    result_path = f"{gen_wav_dir}/_{eval_task}_results.jsonl"
    with open(result_path, "w") as f:
        for line in full_results:
            metrics.append(line[eval_task])
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
        metric = round(np.mean(metrics), 5)
        f.write(f"\n{eval_task.upper()}: {metric}\n")

    print(f"\nTotal {len(metrics)} samples")
    print(f"{eval_task.upper()}: {metric}")
    print(f"{eval_task.upper()} results saved to {result_path}")
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
if __name__ == "__main__":
|
| 104 |
+
main()
|
src/f5_tts/eval/eval_utmos.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import librosa
|
| 6 |
+
import torch
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def main():
    """Score every audio file under --audio_dir with the UTMOS22 MOS predictor.

    Writes per-file scores to ``_utmos_results.jsonl`` inside the audio dir,
    appends the average as a summary line, and prints it.
    """
    parser = argparse.ArgumentParser(description="UTMOS Evaluation")
    parser.add_argument("--audio_dir", type=str, required=True, help="Audio file path.")
    parser.add_argument("--ext", type=str, default="wav", help="Audio extension.")
    args = parser.parse_args()

    # Device preference: CUDA, then Intel XPU, then CPU.
    device = "cuda" if torch.cuda.is_available() else "xpu" if torch.xpu.is_available() else "cpu"

    # Pretrained UTMOS strong model via torch.hub (downloads on first run).
    predictor = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)
    predictor = predictor.to(device)

    audio_paths = list(Path(args.audio_dir).rglob(f"*.{args.ext}"))
    utmos_score = 0  # running sum of per-file scores

    utmos_result_path = Path(args.audio_dir) / "_utmos_results.jsonl"
    with open(utmos_result_path, "w", encoding="utf-8") as f:
        for audio_path in tqdm(audio_paths, desc="Processing"):
            # Load at native sample rate; sr is handed to the predictor with the waveform.
            wav, sr = librosa.load(audio_path, sr=None, mono=True)
            wav_tensor = torch.from_numpy(wav).to(device).unsqueeze(0)
            score = predictor(wav_tensor, sr)
            line = {}
            line["wav"], line["utmos"] = str(audio_path.stem), score.item()
            utmos_score += score.item()
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
        # Guard against an empty directory to avoid ZeroDivisionError.
        avg_score = utmos_score / len(audio_paths) if len(audio_paths) > 0 else 0
        f.write(f"\nUTMOS: {avg_score:.4f}\n")

    print(f"UTMOS: {avg_score:.4f}")
    print(f"UTMOS results saved to {utmos_result_path}")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
if __name__ == "__main__":
|
| 42 |
+
main()
|
src/f5_tts/eval/gen_bengali_batch.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Batch inference script for Bengali TTS evaluation.
|
| 3 |
+
Generates all test audios efficiently by loading model once and processing in batches.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
import torchaudio
|
| 12 |
+
from tqdm import tqdm
|
| 13 |
+
|
| 14 |
+
from f5_tts.infer.utils_infer import (
|
| 15 |
+
load_model,
|
| 16 |
+
load_vocoder,
|
| 17 |
+
infer_process,
|
| 18 |
+
preprocess_ref_audio_text,
|
| 19 |
+
)
|
| 20 |
+
from f5_tts.model import DiT
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Per-testset generation settings: source text file, output directory,
# number of samples to synthesize, and inference speed factor.
TESTSET_CONFIG = {
    "stimulai53": {
        "text_file": "examples/BengaliStimulai53.txt",
        "output_dir": "examples/stimulai53",
        "n_samples": 53,
        "speed": 1.0,
    },
    "ne200": {
        "text_file": "examples/BengaliNamedEntity200.txt",
        "output_dir": "examples/ne200",
        "n_samples": 200,
        "speed": 0.9,  # slower speech for these testsets
    },
    "st200": {
        "text_file": "examples/ShortText200.txt",
        "output_dir": "examples/st200",
        "n_samples": 200,
        "speed": 0.9,
    },
}

# Reference speaker audio directory and its line-aligned transcripts
# (presumably OpenSLR SLR37 Bengali data — confirm against the repo's examples).
REF_DIR = "examples/slr37"
REF_TEXT_FILE = "examples/slr37/slr37_texts.txt"
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def load_texts(text_file):
    """Read *text_file* (UTF-8) and return one whitespace-stripped string per line."""
    with open(text_file, "r", encoding="utf-8") as f:
        return [raw.strip() for raw in f]
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def generate_testset(
    testset_name,
    model,
    vocoder,
    vocab_file,
    base_dir,
    device,
    use_ema=True,
):
    """Synthesize every sample of one testset with an already-loaded F5-TTS model.

    Pairs reference audio ``slr_{i}.wav`` (and its transcript line i) with the
    i-th generation text, and writes ``output_{i}.wav`` into the testset's
    output dir. Existing outputs are skipped, so interrupted runs resume.

    NOTE(review): ``vocab_file`` and ``use_ema`` are accepted but unused here;
    the model passed in is already fully loaded by the caller.
    """
    config = TESTSET_CONFIG[testset_name]
    text_file = Path(base_dir) / config["text_file"]
    output_dir = Path(base_dir) / config["output_dir"]
    ref_dir = Path(base_dir) / REF_DIR
    ref_text_file = Path(base_dir) / REF_TEXT_FILE
    n_samples = config["n_samples"]
    speed = config["speed"]

    output_dir.mkdir(parents=True, exist_ok=True)

    gen_texts = load_texts(text_file)
    ref_texts = load_texts(ref_text_file)

    print(f"\n=== Generating {testset_name} ({n_samples} samples) ===")

    for i in tqdm(range(n_samples), desc=testset_name):
        ref_audio_path = ref_dir / f"slr_{i}.wav"
        output_path = output_dir / f"output_{i}.wav"

        # Resume support: never regenerate an existing output.
        if output_path.exists():
            continue

        ref_text = ref_texts[i]
        gen_text = gen_texts[i]

        try:
            # show_info silenced to keep the tqdm bar clean.
            ref_audio, ref_text_processed = preprocess_ref_audio_text(
                str(ref_audio_path), ref_text, show_info=lambda x: None
            )

            audio, sr, _ = infer_process(
                ref_audio,
                ref_text_processed,
                gen_text,
                model,
                vocoder,
                device=device,
                speed=speed,
            )

            audio_tensor = torch.tensor(audio).unsqueeze(0)
            torchaudio.save(str(output_path), audio_tensor, sr)

        except Exception as e:
            # Best-effort batch: log the failure and keep generating the rest.
            print(f"Failed {i}: {e}")
            continue

    print(f"Done! Generated {n_samples} files in {output_dir}")
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def main():
    """Load the Bengali F5-TTS checkpoint and vocoder once, then generate the
    selected testset(s) in one pass."""
    parser = argparse.ArgumentParser(description="Batch Bengali TTS Generation")
    parser.add_argument("--testset", type=str, default="all",
                        choices=["stimulai53", "ne200", "st200", "all"])
    parser.add_argument("--base_dir", type=str, default=".")
    parser.add_argument("--ckpt_file", type=str,
                        default="ckpts/bengali_300h/model_50000.pt")
    parser.add_argument("--vocab_file", type=str,
                        default="data/Bengali/vocab.txt")
    # NOTE(review): store_true with default=True means this flag can never
    # disable EMA from the command line — passing it is a no-op.
    parser.add_argument("--use_ema", action="store_true", default=True)
    parser.add_argument("--device", type=str, default=None)
    args = parser.parse_args()

    device = args.device or ("cuda" if torch.cuda.is_available() else "cpu")

    print("Loading model...")
    # F5-TTS Base DiT hyperparameters.
    F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
    model = load_model(
        model_cls=DiT,
        model_cfg=F5TTS_model_cfg,
        ckpt_path=args.ckpt_file,
        mel_spec_type="vocos",
        vocab_file=args.vocab_file,
        device=device,
        use_ema=args.use_ema,
    )

    print("Loading vocoder...")
    vocoder = load_vocoder(vocoder_name="vocos", is_local=False)

    testsets = list(TESTSET_CONFIG.keys()) if args.testset == "all" else [args.testset]

    for testset in testsets:
        generate_testset(
            testset,
            model,
            vocoder,
            args.vocab_file,
            args.base_dir,
            device,
            args.use_ema,
        )

    print("\n=== All generation complete! ===")
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
if __name__ == "__main__":
|
| 159 |
+
main()
|
src/f5_tts/eval/gen_elevenlabs_batch.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ElevenLabs Batch Generation Script for Bengali Evaluation
|
| 3 |
+
|
| 4 |
+
Generates audio for evaluation testsets using ElevenLabs API with voice cloning.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
import os
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from dotenv import load_dotenv
|
| 12 |
+
from elevenlabs import ElevenLabs
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# Per-testset settings: source text file and number of samples to synthesize.
TESTSET_CONFIG = {
    "stimulai53": {
        "text_file": "examples/BengaliStimulai53.txt",
        "n_samples": 53,
    },
    "ne200": {
        "text_file": "examples/BengaliNamedEntity200.txt",
        "n_samples": 200,
    },
    "st200": {
        "text_file": "examples/ShortText200.txt",
        "n_samples": 200,
    },
}

# Reference speaker audio used for per-sample voice cloning.
REF_DIR = "examples/slr37"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def load_texts(text_file):
    """Read *text_file* (UTF-8) and return one whitespace-stripped string per line."""
    with open(text_file, "r", encoding="utf-8") as f:
        return [raw.strip() for raw in f]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def main():
    """Generate one testset through the ElevenLabs API with per-sample voice cloning.

    For each sample i: clone a temporary voice from ``slr_{i}.wav``, synthesize
    the i-th text with it, save ``output_{i}.wav``, and delete the clone.
    Existing outputs are skipped so interrupted runs resume.
    """
    load_dotenv()

    parser = argparse.ArgumentParser(description="ElevenLabs Batch Generation")
    parser.add_argument("--testset", type=str, required=True,
                        choices=list(TESTSET_CONFIG.keys()))
    parser.add_argument("--model", type=str, default="eleven_v3",
                        help="ElevenLabs model ID")
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Output directory (default: examples/{testset}_elevenlabs)")
    parser.add_argument("--base_dir", type=str, default=".")
    args = parser.parse_args()

    api_key = os.getenv("ELEVENLABS_API_KEY")
    if not api_key:
        raise ValueError("ELEVENLABS_API_KEY not found. Add to .env file.")

    client = ElevenLabs(api_key=api_key)

    config = TESTSET_CONFIG[args.testset]
    n_samples = config["n_samples"]
    text_file = Path(args.base_dir) / config["text_file"]
    ref_dir = Path(args.base_dir) / REF_DIR

    output_dir = Path(args.output_dir) if args.output_dir else Path(args.base_dir) / f"examples/{args.testset}_elevenlabs"
    output_dir.mkdir(parents=True, exist_ok=True)

    gen_texts = load_texts(text_file)

    print(f"Generating {n_samples} samples for {args.testset}")
    print(f"Model: {args.model}")
    print(f"Output: {output_dir}")

    for i in tqdm(range(n_samples), desc="Generating"):
        output_path = output_dir / f"output_{i}.wav"

        # Skip existing files (resume support)
        if output_path.exists():
            continue

        ref_audio = ref_dir / f"slr_{i}.wav"
        gen_text = gen_texts[i]

        # Create voice clone (instant voice cloning from one reference wav)
        with open(ref_audio, "rb") as audio_file:
            voice = client.voices.ivc.create(
                name=f"temp_clone_{i}",
                files=[audio_file],
            )

        try:
            # Generate speech
            audio_generator = client.text_to_speech.convert(
                voice_id=voice.voice_id,
                text=gen_text,
                model_id=args.model,
            )

            # Save output (API streams the audio in chunks)
            with open(output_path, "wb") as f:
                for chunk in audio_generator:
                    f.write(chunk)

        finally:
            # Cleanup voice so temporary clones don't accumulate on the account
            client.voices.delete(voice.voice_id)

    print(f"Done! Generated files in {output_dir}")
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
if __name__ == "__main__":
|
| 110 |
+
main()
|
src/f5_tts/eval/gen_gemini_batch.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Gemini TTS Batch Generation Script for Bengali Evaluation
|
| 3 |
+
|
| 4 |
+
Generates audio for evaluation testsets using Gemini 2.5 TTS API.
|
| 5 |
+
No voice cloning - Gemini auto-detects language from Bengali text.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import argparse
|
| 9 |
+
import os
|
| 10 |
+
import time
|
| 11 |
+
import wave
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
from dotenv import load_dotenv
|
| 15 |
+
from google import genai
|
| 16 |
+
from google.genai import types
|
| 17 |
+
from google.genai.errors import ClientError
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Per-testset settings: source text file and number of samples to synthesize.
TESTSET_CONFIG = {
    "stimulai53": {
        "text_file": "examples/BengaliStimulai53.txt",
        "n_samples": 53,
    },
    "ne200": {
        "text_file": "examples/BengaliNamedEntity200.txt",
        "n_samples": 200,
    },
    "st200": {
        "text_file": "examples/ShortText200.txt",
        "n_samples": 200,
    },
}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def load_texts(text_file):
    """Read *text_file* (UTF-8) and return one whitespace-stripped string per line."""
    with open(text_file, "r", encoding="utf-8") as f:
        return [raw.strip() for raw in f]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def main():
    """Generate one testset through the Gemini TTS API.

    No voice cloning is involved; each text line is sent as-is. Responses are
    PCM audio written out as 16-bit mono 24 kHz WAV files. Rate-limited calls
    are retried up to 5 times with linear backoff; existing outputs are
    skipped so interrupted runs resume.
    """
    load_dotenv()

    parser = argparse.ArgumentParser(description="Gemini TTS Batch Generation")
    parser.add_argument("--testset", type=str, required=True,
                        choices=list(TESTSET_CONFIG.keys()))
    parser.add_argument("--model", type=str, default="gemini-2.5-flash-preview-tts",
                        help="Gemini TTS model")
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Output directory (default: examples/{testset}_gemini)")
    parser.add_argument("--base_dir", type=str, default=".")
    args = parser.parse_args()

    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY not found. Add to .env file.")

    client = genai.Client(api_key=api_key)

    config = TESTSET_CONFIG[args.testset]
    n_samples = config["n_samples"]
    text_file = Path(args.base_dir) / config["text_file"]

    output_dir = Path(args.output_dir) if args.output_dir else Path(args.base_dir) / f"examples/{args.testset}_gemini"
    output_dir.mkdir(parents=True, exist_ok=True)

    gen_texts = load_texts(text_file)

    print(f"Generating {n_samples} samples for {args.testset}")
    print(f"Model: {args.model}")
    print(f"Output: {output_dir}")

    for i in tqdm(range(n_samples), desc="Generating"):
        output_path = output_dir / f"output_{i}.wav"

        # Skip existing files (resume support)
        if output_path.exists():
            continue

        gen_text = gen_texts[i]

        # Retry with backoff for rate limits
        response = None
        for attempt in range(5):
            try:
                response = client.models.generate_content(
                    model=args.model,
                    contents=gen_text,
                    config=types.GenerateContentConfig(
                        response_modalities=["AUDIO"],
                    ),
                )
                break
            except ClientError as e:
                # Only retry quota errors; anything else is a real failure.
                if "429" in str(e) or "RESOURCE_EXHAUSTED" in str(e):
                    wait = 20 * (attempt + 1)
                    print(f"\nRate limit hit, waiting {wait}s...")
                    time.sleep(wait)
                else:
                    raise

        if response is None:
            print(f"\nSkipping sample {i} after 5 failed attempts")
            continue

        # Extract audio data
        audio_data = response.candidates[0].content.parts[0].inline_data.data

        # Write as WAV file (Gemini returns PCM audio at 24kHz)
        with wave.open(str(output_path), "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(24000)
            wav_file.writeframes(audio_data)

    print(f"Done! Generated files in {output_dir}")
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
if __name__ == "__main__":
|
| 121 |
+
main()
|
src/f5_tts/eval/utils_eval.py
ADDED
|
@@ -0,0 +1,444 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import os
|
| 3 |
+
import random
|
| 4 |
+
import string
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
import torchaudio
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
|
| 12 |
+
from f5_tts.eval.ecapa_tdnn import ECAPA_TDNN_SMALL
|
| 13 |
+
from f5_tts.model.modules import MelSpec
|
| 14 |
+
from f5_tts.model.utils import convert_char_to_pinyin
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# seedtts testset metainfo: utt, prompt_text, prompt_wav, gt_text, gt_wav
|
| 18 |
+
def get_seedtts_testset_metainfo(metalst):
    """Parse a Seed-TTS testset meta list.

    Each non-empty line is pipe-separated with either 5 fields
    (utt|prompt_text|prompt_wav|gt_text|gt_wav) or 4 fields, in which case the
    ground-truth wav is assumed at ``<metalst dir>/wavs/<utt>.wav``. Relative
    prompt_wav paths are resolved against the meta list's directory.

    Args:
        metalst: path to the meta list file.

    Returns:
        list of (utt, prompt_text, prompt_wav, gt_text, gt_wav) tuples.

    Raises:
        ValueError: if a non-empty line has neither 4 nor 5 fields
            (the original silently reused stale fields or raised NameError).
    """
    base_dir = os.path.dirname(metalst)
    metainfo = []
    with open(metalst) as f:  # context manager so the handle is closed on error
        for line in f:
            stripped = line.strip()
            if not stripped:  # tolerate blank lines
                continue
            fields = stripped.split("|")
            if len(fields) == 5:
                utt, prompt_text, prompt_wav, gt_text, gt_wav = fields
            elif len(fields) == 4:
                utt, prompt_text, prompt_wav, gt_text = fields
                gt_wav = os.path.join(base_dir, "wavs", utt + ".wav")
            else:
                raise ValueError(f"Malformed meta line (expected 4 or 5 fields): {line!r}")
            if not os.path.isabs(prompt_wav):
                prompt_wav = os.path.join(base_dir, prompt_wav)
            metainfo.append((utt, prompt_text, prompt_wav, gt_text, gt_wav))
    return metainfo
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# librispeech test-clean metainfo: gen_utt, ref_txt, ref_wav, gen_txt, gen_wav
|
| 36 |
+
def get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path):
    """Build (gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav) tuples from a
    tab-separated LibriSpeech cross-sentence meta list.

    Utterance ids look like ``<speaker>-<chapter>-<idx>`` and map to
    ``<root>/<speaker>/<chapter>/<utt>.flac``.
    """

    def flac_path(utt):
        # LibriSpeech directory layout: <root>/<speaker>/<chapter>/<utt>.flac
        spk_id, chaptr_id, _ = utt.split("-")
        return os.path.join(librispeech_test_clean_path, spk_id, chaptr_id, utt + ".flac")

    with open(metalst) as f:
        raw_lines = f.readlines()

    metainfo = []
    for raw in raw_lines:
        ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt = raw.strip().split("\t")

        # ref_txt = ref_txt[0] + ref_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
        ref_wav = flac_path(ref_utt)

        # gen_txt = gen_txt[0] + gen_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
        gen_wav = flac_path(gen_utt)

        # Leading space on gen_txt keeps prompt/gen text separated downstream.
        metainfo.append((gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav))

    return metainfo
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# padded to max length mel batch
|
| 58 |
+
def padded_mel_batch(ref_mels):
    """Right-pad each (n_mel, T_i) spectrogram with zeros to the longest T_i,
    stack, and return a (batch, T_max, n_mel) tensor."""
    longest = max(mel.shape[-1] for mel in ref_mels)
    padded = [F.pad(mel, (0, longest - mel.shape[-1]), value=0) for mel in ref_mels]
    # Stack to (batch, n_mel, T_max) then move time to dim 1.
    return torch.stack(padded).permute(0, 2, 1)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# get prompts from metainfo containing: utt, prompt_text, prompt_wav, gt_text, gt_wav
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def get_inference_prompt(
    metainfo,
    speed=1.0,
    tokenizer="pinyin",
    polyphone=True,
    target_sample_rate=24000,
    n_fft=1024,
    win_length=1024,
    n_mel_channels=100,
    hop_length=256,
    mel_spec_type="vocos",
    target_rms=0.1,
    use_truth_duration=False,
    infer_batch_size=1,
    num_buckets=200,
    min_secs=3,
    max_secs=40,
):
    """Convert testset metainfo into length-bucketed inference batches.

    Each metainfo entry (utt, prompt_text, prompt_wav, gt_text, gt_wav) is
    loaded, RMS-normalized up to ``target_rms``, resampled to
    ``target_sample_rate``, and turned into a reference mel spectrogram. The
    total mel length (reference + estimated generated part) assigns the sample
    to one of ``num_buckets`` duration buckets; a bucket is flushed into
    ``prompts_all`` once its accumulated frames reach ``infer_batch_size``.

    Returns a shuffled list of
    (utts, ref_rms_list, padded_ref_mels, ref_mel_lens, total_mel_lens, final_text_list)
    batch tuples.
    """
    prompts_all = []

    # Duration limits expressed in mel frames.
    min_tokens = min_secs * target_sample_rate // hop_length
    max_tokens = max_secs * target_sample_rate // hop_length

    batch_accum = [0] * num_buckets
    utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = (
        [[] for _ in range(num_buckets)] for _ in range(6)
    )

    mel_spectrogram = MelSpec(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mel_channels=n_mel_channels,
        target_sample_rate=target_sample_rate,
        mel_spec_type=mel_spec_type,
    )

    for utt, prompt_text, prompt_wav, gt_text, gt_wav in tqdm(metainfo, desc="Processing prompts..."):
        # Audio: boost quiet prompts up to target_rms (never attenuate).
        ref_audio, ref_sr = torchaudio.load(prompt_wav)
        ref_rms = torch.sqrt(torch.mean(torch.square(ref_audio)))
        if ref_rms < target_rms:
            ref_audio = ref_audio * target_rms / ref_rms
        assert ref_audio.shape[-1] > 5000, f"Empty prompt wav: {prompt_wav}, or torchaudio backend issue."
        if ref_sr != target_sample_rate:
            resampler = torchaudio.transforms.Resample(ref_sr, target_sample_rate)
            ref_audio = resampler(ref_audio)

        # Text: ensure a trailing space after a single-byte (ASCII) final char
        # before concatenating prompt and target text.
        if len(prompt_text[-1].encode("utf-8")) == 1:
            prompt_text = prompt_text + " "
        text = [prompt_text + gt_text]
        if tokenizer == "pinyin":
            text_list = convert_char_to_pinyin(text, polyphone=polyphone)
        else:
            text_list = text

        # to mel spectrogram
        ref_mel = mel_spectrogram(ref_audio)
        ref_mel = ref_mel.squeeze(0)

        # Duration, mel frame length
        ref_mel_len = ref_mel.shape[-1]

        if use_truth_duration:
            # Use the ground-truth audio's actual length for the generated part.
            gt_audio, gt_sr = torchaudio.load(gt_wav)
            if gt_sr != target_sample_rate:
                resampler = torchaudio.transforms.Resample(gt_sr, target_sample_rate)
                gt_audio = resampler(gt_audio)
            total_mel_len = ref_mel_len + int(gt_audio.shape[-1] / hop_length / speed)

            # # test vocoder resynthesis
            # ref_audio = gt_audio
        else:
            # Estimate generated duration from the byte-length ratio of texts.
            ref_text_len = len(prompt_text.encode("utf-8"))
            gen_text_len = len(gt_text.encode("utf-8"))
            total_mel_len = ref_mel_len + int(ref_mel_len / ref_text_len * gen_text_len / speed)

        # deal with batch
        assert infer_batch_size > 0, "infer_batch_size should be greater than 0."
        assert min_tokens <= total_mel_len <= max_tokens, (
            f"Audio {utt} has duration {total_mel_len * hop_length // target_sample_rate}s out of range [{min_secs}, {max_secs}]."
        )
        # Linear mapping of total duration onto [0, num_buckets).
        bucket_i = math.floor((total_mel_len - min_tokens) / (max_tokens - min_tokens + 1) * num_buckets)

        utts[bucket_i].append(utt)
        ref_rms_list[bucket_i].append(ref_rms)
        ref_mels[bucket_i].append(ref_mel)
        ref_mel_lens[bucket_i].append(ref_mel_len)
        total_mel_lens[bucket_i].append(total_mel_len)
        final_text_list[bucket_i].extend(text_list)

        batch_accum[bucket_i] += total_mel_len

        # Flush the bucket once enough mel frames have accumulated.
        if batch_accum[bucket_i] >= infer_batch_size:
            # print(f"\n{len(ref_mels[bucket_i][0][0])}\n{ref_mel_lens[bucket_i]}\n{total_mel_lens[bucket_i]}")
            prompts_all.append(
                (
                    utts[bucket_i],
                    ref_rms_list[bucket_i],
                    padded_mel_batch(ref_mels[bucket_i]),
                    ref_mel_lens[bucket_i],
                    total_mel_lens[bucket_i],
                    final_text_list[bucket_i],
                )
            )
            batch_accum[bucket_i] = 0
            (
                utts[bucket_i],
                ref_rms_list[bucket_i],
                ref_mels[bucket_i],
                ref_mel_lens[bucket_i],
                total_mel_lens[bucket_i],
                final_text_list[bucket_i],
            ) = [], [], [], [], [], []

    # add residual
    for bucket_i, bucket_frames in enumerate(batch_accum):
        if bucket_frames > 0:
            prompts_all.append(
                (
                    utts[bucket_i],
                    ref_rms_list[bucket_i],
                    padded_mel_batch(ref_mels[bucket_i]),
                    ref_mel_lens[bucket_i],
                    total_mel_lens[bucket_i],
                    final_text_list[bucket_i],
                )
            )
    # not only leave easy work for last workers
    random.seed(666)
    random.shuffle(prompts_all)

    return prompts_all
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
# get wav_res_ref_text of seed-tts test metalst
|
| 209 |
+
# https://github.com/BytedanceSpeech/seed-tts-eval
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def get_seed_tts_test(metalst, gen_wav_dir, gpus):
|
| 213 |
+
f = open(metalst)
|
| 214 |
+
lines = f.readlines()
|
| 215 |
+
f.close()
|
| 216 |
+
|
| 217 |
+
test_set_ = []
|
| 218 |
+
for line in tqdm(lines):
|
| 219 |
+
if len(line.strip().split("|")) == 5:
|
| 220 |
+
utt, prompt_text, prompt_wav, gt_text, gt_wav = line.strip().split("|")
|
| 221 |
+
elif len(line.strip().split("|")) == 4:
|
| 222 |
+
utt, prompt_text, prompt_wav, gt_text = line.strip().split("|")
|
| 223 |
+
|
| 224 |
+
if not os.path.exists(os.path.join(gen_wav_dir, utt + ".wav")):
|
| 225 |
+
continue
|
| 226 |
+
gen_wav = os.path.join(gen_wav_dir, utt + ".wav")
|
| 227 |
+
if not os.path.isabs(prompt_wav):
|
| 228 |
+
prompt_wav = os.path.join(os.path.dirname(metalst), prompt_wav)
|
| 229 |
+
|
| 230 |
+
test_set_.append((gen_wav, prompt_wav, gt_text))
|
| 231 |
+
|
| 232 |
+
num_jobs = len(gpus)
|
| 233 |
+
if num_jobs == 1:
|
| 234 |
+
return [(gpus[0], test_set_)]
|
| 235 |
+
|
| 236 |
+
wav_per_job = len(test_set_) // num_jobs + 1
|
| 237 |
+
test_set = []
|
| 238 |
+
for i in range(num_jobs):
|
| 239 |
+
test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))
|
| 240 |
+
|
| 241 |
+
return test_set
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
# get librispeech test-clean cross sentence test
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth=False):
|
| 248 |
+
f = open(metalst)
|
| 249 |
+
lines = f.readlines()
|
| 250 |
+
f.close()
|
| 251 |
+
|
| 252 |
+
test_set_ = []
|
| 253 |
+
for line in tqdm(lines):
|
| 254 |
+
ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt = line.strip().split("\t")
|
| 255 |
+
|
| 256 |
+
if eval_ground_truth:
|
| 257 |
+
gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
|
| 258 |
+
gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")
|
| 259 |
+
else:
|
| 260 |
+
if not os.path.exists(os.path.join(gen_wav_dir, gen_utt + ".wav")):
|
| 261 |
+
raise FileNotFoundError(f"Generated wav not found: {gen_utt}")
|
| 262 |
+
gen_wav = os.path.join(gen_wav_dir, gen_utt + ".wav")
|
| 263 |
+
|
| 264 |
+
ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
|
| 265 |
+
ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")
|
| 266 |
+
|
| 267 |
+
test_set_.append((gen_wav, ref_wav, gen_txt))
|
| 268 |
+
|
| 269 |
+
num_jobs = len(gpus)
|
| 270 |
+
if num_jobs == 1:
|
| 271 |
+
return [(gpus[0], test_set_)]
|
| 272 |
+
|
| 273 |
+
wav_per_job = len(test_set_) // num_jobs + 1
|
| 274 |
+
test_set = []
|
| 275 |
+
for i in range(num_jobs):
|
| 276 |
+
test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))
|
| 277 |
+
|
| 278 |
+
return test_set
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
# load asr model
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def load_asr_model(lang, ckpt_dir=""):
|
| 285 |
+
if lang == "zh":
|
| 286 |
+
from funasr import AutoModel
|
| 287 |
+
|
| 288 |
+
model = AutoModel(
|
| 289 |
+
model=os.path.join(ckpt_dir, "paraformer-zh"),
|
| 290 |
+
# vad_model = os.path.join(ckpt_dir, "fsmn-vad"),
|
| 291 |
+
# punc_model = os.path.join(ckpt_dir, "ct-punc"),
|
| 292 |
+
# spk_model = os.path.join(ckpt_dir, "cam++"),
|
| 293 |
+
disable_update=True,
|
| 294 |
+
) # following seed-tts setting
|
| 295 |
+
elif lang == "en":
|
| 296 |
+
from faster_whisper import WhisperModel
|
| 297 |
+
|
| 298 |
+
model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
|
| 299 |
+
model = WhisperModel(model_size, device="cuda", compute_type="float16")
|
| 300 |
+
elif lang == "bn":
|
| 301 |
+
from faster_whisper import WhisperModel
|
| 302 |
+
|
| 303 |
+
model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
|
| 304 |
+
model = WhisperModel(model_size, device="cuda", compute_type="float16")
|
| 305 |
+
return model
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
# WER Evaluation, the way Seed-TTS does
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
def run_asr_wer(args):
|
| 312 |
+
rank, lang, test_set, ckpt_dir = args
|
| 313 |
+
|
| 314 |
+
if lang == "zh":
|
| 315 |
+
import zhconv
|
| 316 |
+
|
| 317 |
+
torch.cuda.set_device(rank)
|
| 318 |
+
elif lang == "en":
|
| 319 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
|
| 320 |
+
elif lang == "bn":
|
| 321 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
|
| 322 |
+
else:
|
| 323 |
+
raise NotImplementedError(
|
| 324 |
+
"lang support only 'zh' (funasr paraformer-zh), 'en'/'bn' (faster-whisper-large-v3), for now."
|
| 325 |
+
)
|
| 326 |
+
|
| 327 |
+
asr_model = load_asr_model(lang, ckpt_dir=ckpt_dir)
|
| 328 |
+
|
| 329 |
+
from zhon.hanzi import punctuation
|
| 330 |
+
|
| 331 |
+
punctuation_all = punctuation + string.punctuation
|
| 332 |
+
wer_results = []
|
| 333 |
+
|
| 334 |
+
from jiwer import compute_measures
|
| 335 |
+
|
| 336 |
+
for gen_wav, prompt_wav, truth in tqdm(test_set):
|
| 337 |
+
if lang == "zh":
|
| 338 |
+
res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
|
| 339 |
+
hypo = res[0]["text"]
|
| 340 |
+
hypo = zhconv.convert(hypo, "zh-cn")
|
| 341 |
+
elif lang == "en":
|
| 342 |
+
segments, _ = asr_model.transcribe(gen_wav, beam_size=5, language="en")
|
| 343 |
+
hypo = ""
|
| 344 |
+
for segment in segments:
|
| 345 |
+
hypo = hypo + " " + segment.text
|
| 346 |
+
elif lang == "bn":
|
| 347 |
+
segments, _ = asr_model.transcribe(gen_wav, beam_size=5, language="bn")
|
| 348 |
+
hypo = ""
|
| 349 |
+
for segment in segments:
|
| 350 |
+
hypo = hypo + segment.text
|
| 351 |
+
|
| 352 |
+
raw_truth = truth
|
| 353 |
+
raw_hypo = hypo
|
| 354 |
+
|
| 355 |
+
for x in punctuation_all:
|
| 356 |
+
truth = truth.replace(x, "")
|
| 357 |
+
hypo = hypo.replace(x, "")
|
| 358 |
+
|
| 359 |
+
truth = truth.replace(" ", " ")
|
| 360 |
+
hypo = hypo.replace(" ", " ")
|
| 361 |
+
|
| 362 |
+
if lang == "zh":
|
| 363 |
+
truth = " ".join([x for x in truth])
|
| 364 |
+
hypo = " ".join([x for x in hypo])
|
| 365 |
+
elif lang == "en":
|
| 366 |
+
truth = truth.lower()
|
| 367 |
+
hypo = hypo.lower()
|
| 368 |
+
elif lang == "bn":
|
| 369 |
+
# Bengali: character-level for CER, remove Bangla danda
|
| 370 |
+
truth = truth.replace("।", "")
|
| 371 |
+
hypo = hypo.replace("।", "")
|
| 372 |
+
truth = " ".join([x for x in truth if x.strip()])
|
| 373 |
+
hypo = " ".join([x for x in hypo if x.strip()])
|
| 374 |
+
|
| 375 |
+
measures = compute_measures(truth, hypo)
|
| 376 |
+
wer = measures["wer"]
|
| 377 |
+
|
| 378 |
+
# ref_list = truth.split(" ")
|
| 379 |
+
# subs = measures["substitutions"] / len(ref_list)
|
| 380 |
+
# dele = measures["deletions"] / len(ref_list)
|
| 381 |
+
# inse = measures["insertions"] / len(ref_list)
|
| 382 |
+
|
| 383 |
+
wer_results.append(
|
| 384 |
+
{
|
| 385 |
+
"wav": Path(gen_wav).stem,
|
| 386 |
+
"truth": raw_truth,
|
| 387 |
+
"hypo": raw_hypo,
|
| 388 |
+
"wer": wer,
|
| 389 |
+
}
|
| 390 |
+
)
|
| 391 |
+
|
| 392 |
+
return wer_results
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
# SIM Evaluation
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
def run_sim(args):
|
| 399 |
+
rank, test_set, ckpt_dir = args
|
| 400 |
+
device = f"cuda:{rank}"
|
| 401 |
+
|
| 402 |
+
model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type="wavlm_large", config_path=None)
|
| 403 |
+
state_dict = torch.load(ckpt_dir, weights_only=True, map_location=lambda storage, loc: storage)
|
| 404 |
+
model.load_state_dict(state_dict["model"], strict=False)
|
| 405 |
+
|
| 406 |
+
use_gpu = True if torch.cuda.is_available() else False
|
| 407 |
+
if use_gpu:
|
| 408 |
+
model = model.cuda(device)
|
| 409 |
+
model.eval()
|
| 410 |
+
|
| 411 |
+
sim_results = []
|
| 412 |
+
for gen_wav, prompt_wav, truth in tqdm(test_set):
|
| 413 |
+
wav1, sr1 = torchaudio.load(gen_wav)
|
| 414 |
+
wav2, sr2 = torchaudio.load(prompt_wav)
|
| 415 |
+
|
| 416 |
+
if use_gpu:
|
| 417 |
+
wav1 = wav1.cuda(device)
|
| 418 |
+
wav2 = wav2.cuda(device)
|
| 419 |
+
|
| 420 |
+
if sr1 != 16000:
|
| 421 |
+
resample1 = torchaudio.transforms.Resample(orig_freq=sr1, new_freq=16000)
|
| 422 |
+
if use_gpu:
|
| 423 |
+
resample1 = resample1.cuda(device)
|
| 424 |
+
wav1 = resample1(wav1)
|
| 425 |
+
if sr2 != 16000:
|
| 426 |
+
resample2 = torchaudio.transforms.Resample(orig_freq=sr2, new_freq=16000)
|
| 427 |
+
if use_gpu:
|
| 428 |
+
resample2 = resample2.cuda(device)
|
| 429 |
+
wav2 = resample2(wav2)
|
| 430 |
+
|
| 431 |
+
with torch.no_grad():
|
| 432 |
+
emb1 = model(wav1)
|
| 433 |
+
emb2 = model(wav2)
|
| 434 |
+
|
| 435 |
+
sim = F.cosine_similarity(emb1, emb2)[0].item()
|
| 436 |
+
# print(f"VSim score between two audios: {sim:.4f} (-1.0, 1.0).")
|
| 437 |
+
sim_results.append(
|
| 438 |
+
{
|
| 439 |
+
"wav": Path(gen_wav).stem,
|
| 440 |
+
"sim": sim,
|
| 441 |
+
}
|
| 442 |
+
)
|
| 443 |
+
|
| 444 |
+
return sim_results
|
src/f5_tts/infer/README.md
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Inference
|
| 2 |
+
|
| 3 |
+
The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be automatically downloaded when running inference scripts.
|
| 4 |
+
|
| 5 |
+
**More checkpoints with whole community efforts can be found in [SHARED.md](SHARED.md), supporting more languages.**
|
| 6 |
+
|
| 7 |
+
Currently support **30s for a single** generation, which is the **total length** (same logic if `fix_duration`) including both prompt and output audio. However, `infer_cli` and `infer_gradio` will automatically do chunk generation for longer text. Long reference audio will be **clip short to ~12s**.
|
| 8 |
+
|
| 9 |
+
To avoid possible inference failures, make sure you have seen through the following instructions.
|
| 10 |
+
|
| 11 |
+
- Use reference audio <12s and leave proper silence space (e.g. 1s) at the end. Otherwise there is a risk of truncating in the middle of word, leading to suboptimal generation.
|
| 12 |
+
- <ins>Uppercased letters</ins> (best with form like K.F.C.) will be uttered letter by letter, and lowercased letters used for common words.
|
| 13 |
+
- Add some spaces (blank: " ") or punctuations (e.g. "," ".") <ins>to explicitly introduce some pauses</ins>.
|
| 14 |
+
- If English punctuation marks the end of a sentence, make sure there is a space " " after it. Otherwise not regarded as when chunk.
|
| 15 |
+
- <ins>Preprocess numbers</ins> to Chinese letters if you want to have them read in Chinese, otherwise in English.
|
| 16 |
+
- If the generation output is blank (pure silence), <ins>check for FFmpeg installation</ins>.
|
| 17 |
+
- Try <ins>turn off `use_ema` if using an early-stage</ins> finetuned checkpoint (which goes just few updates).
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
## Gradio App
|
| 21 |
+
|
| 22 |
+
Currently supported features:
|
| 23 |
+
|
| 24 |
+
- Basic TTS with Chunk Inference
|
| 25 |
+
- Multi-Style / Multi-Speaker Generation
|
| 26 |
+
- Voice Chat powered by Qwen2.5-3B-Instruct
|
| 27 |
+
- [Custom inference with more language support](SHARED.md)
|
| 28 |
+
|
| 29 |
+
The cli command `f5-tts_infer-gradio` equals to `python src/f5_tts/infer/infer_gradio.py`, which launches a Gradio APP (web interface) for inference.
|
| 30 |
+
|
| 31 |
+
The script will load model checkpoints from Huggingface. You can also manually download files and update the path to `load_model()` in `infer_gradio.py`. Currently only load TTS models first, will load ASR model to do transcription if `ref_text` not provided, will load LLM model if use Voice Chat.
|
| 32 |
+
|
| 33 |
+
More flags options:
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
# Automatically launch the interface in the default web browser
|
| 37 |
+
f5-tts_infer-gradio --inbrowser
|
| 38 |
+
|
| 39 |
+
# Set the root path of the application, if it's not served from the root ("/") of the domain
|
| 40 |
+
# For example, if the application is served at "https://example.com/myapp"
|
| 41 |
+
f5-tts_infer-gradio --root_path "/myapp"
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
Could also be used as a component for larger application:
|
| 45 |
+
```python
|
| 46 |
+
import gradio as gr
|
| 47 |
+
from f5_tts.infer.infer_gradio import app
|
| 48 |
+
|
| 49 |
+
with gr.Blocks() as main_app:
|
| 50 |
+
gr.Markdown("# This is an example of using F5-TTS within a bigger Gradio app")
|
| 51 |
+
|
| 52 |
+
# ... other Gradio components
|
| 53 |
+
|
| 54 |
+
app.render()
|
| 55 |
+
|
| 56 |
+
main_app.launch()
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
## CLI Inference
|
| 61 |
+
|
| 62 |
+
The cli command `f5-tts_infer-cli` equals to `python src/f5_tts/infer/infer_cli.py`, which is a command line tool for inference.
|
| 63 |
+
|
| 64 |
+
The script will load model checkpoints from Huggingface. You can also manually download files and use `--ckpt_file` to specify the model you want to load, or directly update in `infer_cli.py`.
|
| 65 |
+
|
| 66 |
+
For change vocab.txt use `--vocab_file` to provide your `vocab.txt` file.
|
| 67 |
+
|
| 68 |
+
Basically you can inference with flags:
|
| 69 |
+
```bash
|
| 70 |
+
# Leave --ref_text "" will have ASR model transcribe (extra GPU memory usage)
|
| 71 |
+
f5-tts_infer-cli \
|
| 72 |
+
--model F5TTS_v1_Base \
|
| 73 |
+
--ref_audio "ref_audio.wav" \
|
| 74 |
+
--ref_text "The content, subtitle or transcription of reference audio." \
|
| 75 |
+
--gen_text "Some text you want TTS model generate for you."
|
| 76 |
+
|
| 77 |
+
# Use BigVGAN as vocoder. Currently only support F5TTS_Base.
|
| 78 |
+
f5-tts_infer-cli --model F5TTS_Base --vocoder_name bigvgan --load_vocoder_from_local
|
| 79 |
+
|
| 80 |
+
# Use custom path checkpoint, e.g.
|
| 81 |
+
f5-tts_infer-cli --ckpt_file ckpts/F5TTS_v1_Base/model_1250000.safetensors
|
| 82 |
+
|
| 83 |
+
# More instructions
|
| 84 |
+
f5-tts_infer-cli --help
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
And a `.toml` file would help with more flexible usage.
|
| 88 |
+
|
| 89 |
+
```bash
|
| 90 |
+
f5-tts_infer-cli -c custom.toml
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
For example, you can use `.toml` to pass in variables, refer to `src/f5_tts/infer/examples/basic/basic.toml`:
|
| 94 |
+
|
| 95 |
+
```toml
|
| 96 |
+
# F5TTS_v1_Base | E2TTS_Base
|
| 97 |
+
model = "F5TTS_v1_Base"
|
| 98 |
+
ref_audio = "infer/examples/basic/basic_ref_en.wav"
|
| 99 |
+
# If an empty "", transcribes the reference audio automatically.
|
| 100 |
+
ref_text = "Some call me nature, others call me mother nature."
|
| 101 |
+
gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
|
| 102 |
+
# File with text to generate. Ignores the text above.
|
| 103 |
+
gen_file = ""
|
| 104 |
+
remove_silence = false
|
| 105 |
+
output_dir = "tests"
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
You can also leverage `.toml` file to do multi-style generation, refer to `src/f5_tts/infer/examples/multi/story.toml`.
|
| 109 |
+
|
| 110 |
+
```toml
|
| 111 |
+
# F5TTS_v1_Base | E2TTS_Base
|
| 112 |
+
model = "F5TTS_v1_Base"
|
| 113 |
+
ref_audio = "infer/examples/multi/main.flac"
|
| 114 |
+
# If an empty "", transcribes the reference audio automatically.
|
| 115 |
+
ref_text = ""
|
| 116 |
+
gen_text = ""
|
| 117 |
+
# File with text to generate. Ignores the text above.
|
| 118 |
+
gen_file = "infer/examples/multi/story.txt"
|
| 119 |
+
remove_silence = true
|
| 120 |
+
output_dir = "tests"
|
| 121 |
+
|
| 122 |
+
[voices.town]
|
| 123 |
+
ref_audio = "infer/examples/multi/town.flac"
|
| 124 |
+
ref_text = ""
|
| 125 |
+
|
| 126 |
+
[voices.country]
|
| 127 |
+
ref_audio = "infer/examples/multi/country.flac"
|
| 128 |
+
ref_text = ""
|
| 129 |
+
```
|
| 130 |
+
You should mark the voice with `[main]` `[town]` `[country]` whenever you want to change voice, refer to `src/f5_tts/infer/examples/multi/story.txt`.
|
| 131 |
+
|
| 132 |
+
## API Usage
|
| 133 |
+
|
| 134 |
+
```python
|
| 135 |
+
from importlib.resources import files
|
| 136 |
+
from f5_tts.api import F5TTS
|
| 137 |
+
|
| 138 |
+
f5tts = F5TTS()
|
| 139 |
+
wav, sr, spec = f5tts.infer(
|
| 140 |
+
ref_file=str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")),
|
| 141 |
+
ref_text="some call me nature, others call me mother nature.",
|
| 142 |
+
gen_text="""I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences.""",
|
| 143 |
+
file_wave=str(files("f5_tts").joinpath("../../tests/api_out.wav")),
|
| 144 |
+
file_spec=str(files("f5_tts").joinpath("../../tests/api_out.png")),
|
| 145 |
+
seed=None,
|
| 146 |
+
)
|
| 147 |
+
```
|
| 148 |
+
Check [api.py](../api.py) for more details.
|
| 149 |
+
|
| 150 |
+
## TensorRT-LLM Deployment
|
| 151 |
+
|
| 152 |
+
See [detailed instructions](../runtime/triton_trtllm/README.md) for more information.
|
| 153 |
+
|
| 154 |
+
## Socket Real-time Service
|
| 155 |
+
|
| 156 |
+
Real-time voice output with chunk stream:
|
| 157 |
+
|
| 158 |
+
```bash
|
| 159 |
+
# Start socket server
|
| 160 |
+
python src/f5_tts/socket_server.py
|
| 161 |
+
|
| 162 |
+
# If PyAudio not installed
|
| 163 |
+
sudo apt-get install portaudio19-dev
|
| 164 |
+
pip install pyaudio
|
| 165 |
+
|
| 166 |
+
# Communicate with socket client
|
| 167 |
+
python src/f5_tts/socket_client.py
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
## Speech Editing
|
| 171 |
+
|
| 172 |
+
To test speech editing capabilities, use the following command:
|
| 173 |
+
|
| 174 |
+
```bash
|
| 175 |
+
python src/f5_tts/infer/speech_edit.py
|
| 176 |
+
```
|
| 177 |
+
|
src/f5_tts/infer/SHARED.md
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- omit in toc -->
|
| 2 |
+
# Shared Model Cards
|
| 3 |
+
|
| 4 |
+
<!-- omit in toc -->
|
| 5 |
+
### **Prerequisites of using**
|
| 6 |
+
- This document is serving as a quick lookup table for the community training/finetuning result, with various language support.
|
| 7 |
+
- The models in this repository are open source and are based on voluntary contributions from contributors.
|
| 8 |
+
- The use of models must be conditioned on respect for the respective creators. The convenience brought comes from their efforts.
|
| 9 |
+
|
| 10 |
+
<!-- omit in toc -->
|
| 11 |
+
### **Welcome to share here**
|
| 12 |
+
- Have a pretrained/finetuned result: model checkpoint (pruned best to facilitate inference, i.e. leave only `ema_model_state_dict`) and corresponding vocab file (for tokenization).
|
| 13 |
+
- Host a public [huggingface model repository](https://huggingface.co/new) and upload the model related files.
|
| 14 |
+
- Make a pull request adding a model card to the current page, i.e. `src\f5_tts\infer\SHARED.md`.
|
| 15 |
+
|
| 16 |
+
<!-- omit in toc -->
|
| 17 |
+
### Supported Languages
|
| 18 |
+
- [Multilingual](#multilingual)
|
| 19 |
+
- [F5-TTS v1 v0 Base @ zh \& en @ F5-TTS](#f5-tts-v1-v0-base--zh--en--f5-tts)
|
| 20 |
+
- [English](#english)
|
| 21 |
+
- [Finnish](#finnish)
|
| 22 |
+
- [F5-TTS Base @ fi @ AsmoKoskinen](#f5-tts-base--fi--asmokoskinen)
|
| 23 |
+
- [French](#french)
|
| 24 |
+
- [F5-TTS Base @ fr @ RASPIAUDIO](#f5-tts-base--fr--raspiaudio)
|
| 25 |
+
- [German](#german)
|
| 26 |
+
- [F5-TTS Base @ de @ hvoss-techfak](#f5-tts-base--de--hvoss-techfak)
|
| 27 |
+
- [Hindi](#hindi)
|
| 28 |
+
- [F5-TTS Small @ hi @ SPRINGLab](#f5-tts-small--hi--springlab)
|
| 29 |
+
- [Italian](#italian)
|
| 30 |
+
- [F5-TTS Base @ it @ alien79](#f5-tts-base--it--alien79)
|
| 31 |
+
- [Japanese](#japanese)
|
| 32 |
+
- [F5-TTS Base @ ja @ Jmica](#f5-tts-base--ja--jmica)
|
| 33 |
+
- [Mandarin](#mandarin)
|
| 34 |
+
- [Russian](#russian)
|
| 35 |
+
- [F5-TTS Base @ ru @ HotDro4illa](#f5-tts-base--ru--hotdro4illa)
|
| 36 |
+
- [Spanish](#spanish)
|
| 37 |
+
- [F5-TTS Base @ es @ jpgallegoar](#f5-tts-base--es--jpgallegoar)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
## Multilingual
|
| 41 |
+
|
| 42 |
+
#### F5-TTS v1 v0 Base @ zh & en @ F5-TTS
|
| 43 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
| 44 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 45 |
+
|F5-TTS v1 Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_v1_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
Model: hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors
|
| 49 |
+
# A Variant Model: hf://SWivid/F5-TTS/F5TTS_v1_Base_no_zero_init/model_1250000.safetensors
|
| 50 |
+
Vocab: hf://SWivid/F5-TTS/F5TTS_v1_Base/vocab.txt
|
| 51 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
| 55 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 56 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
Model: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
|
| 60 |
+
Vocab: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
|
| 61 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
*Other infos, e.g. Author info, Github repo, Link to some sampled results, Usage instruction, Tutorial (Blog, Video, etc.) ...*
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
## English
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
## Finnish
|
| 71 |
+
|
| 72 |
+
#### F5-TTS Base @ fi @ AsmoKoskinen
|
| 73 |
+
|Model|🤗Hugging Face|Data|Model License|
|
| 74 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 75 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/AsmoKoskinen/F5-TTS_Finnish_Model)|[Common Voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0), [Vox Populi](https://huggingface.co/datasets/facebook/voxpopuli)|cc-by-nc-4.0|
|
| 76 |
+
|
| 77 |
+
```bash
|
| 78 |
+
Model: hf://AsmoKoskinen/F5-TTS_Finnish_Model/model_common_voice_fi_vox_populi_fi_20241206.safetensors
|
| 79 |
+
Vocab: hf://AsmoKoskinen/F5-TTS_Finnish_Model/vocab.txt
|
| 80 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
## French
|
| 85 |
+
|
| 86 |
+
#### F5-TTS Base @ fr @ RASPIAUDIO
|
| 87 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
| 88 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 89 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/RASPIAUDIO/F5-French-MixedSpeakers-reduced)|[LibriVox](https://librivox.org/)|cc-by-nc-4.0|
|
| 90 |
+
|
| 91 |
+
```bash
|
| 92 |
+
Model: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/model_last_reduced.pt
|
| 93 |
+
Vocab: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/vocab.txt
|
| 94 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
- [Online Inference with Hugging Face Space](https://huggingface.co/spaces/RASPIAUDIO/f5-tts_french).
|
| 98 |
+
- [Tutorial video to train a new language model](https://www.youtube.com/watch?v=UO4usaOojys).
|
| 99 |
+
- [Discussion about this training can be found here](https://github.com/SWivid/F5-TTS/issues/434).
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
## German
|
| 103 |
+
|
| 104 |
+
#### F5-TTS Base @ de @ hvoss-techfak
|
| 105 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
| 106 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 107 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/hvoss-techfak/F5-TTS-German)|[Mozilla Common Voice 19.0](https://commonvoice.mozilla.org/en/datasets) & 800 hours Crowdsourced |cc-by-nc-4.0|
|
| 108 |
+
|
| 109 |
+
```bash
|
| 110 |
+
Model: hf://hvoss-techfak/F5-TTS-German/model_f5tts_german.pt
|
| 111 |
+
Vocab: hf://hvoss-techfak/F5-TTS-German/vocab.txt
|
| 112 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
- Finetuned by [@hvoss-techfak](https://github.com/hvoss-techfak)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
## Hindi
|
| 119 |
+
|
| 120 |
+
#### F5-TTS Small @ hi @ SPRINGLab
|
| 121 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
| 122 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 123 |
+
|F5-TTS Small|[ckpt & vocab](https://huggingface.co/SPRINGLab/F5-Hindi-24KHz)|[IndicTTS Hi](https://huggingface.co/datasets/SPRINGLab/IndicTTS-Hindi) & [IndicVoices-R Hi](https://huggingface.co/datasets/SPRINGLab/IndicVoices-R_Hindi) |cc-by-4.0|
|
| 124 |
+
|
| 125 |
+
```bash
|
| 126 |
+
Model: hf://SPRINGLab/F5-Hindi-24KHz/model_2500000.safetensors
|
| 127 |
+
Vocab: hf://SPRINGLab/F5-Hindi-24KHz/vocab.txt
|
| 128 |
+
Config: {"dim": 768, "depth": 18, "heads": 12, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
- Authors: SPRING Lab, Indian Institute of Technology, Madras
|
| 132 |
+
- Website: https://asr.iitm.ac.in/
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
## Italian
|
| 136 |
+
|
| 137 |
+
#### F5-TTS Base @ it @ alien79
|
| 138 |
+
|Model|🤗Hugging Face|Data|Model License|
|
| 139 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 140 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/alien79/F5-TTS-italian)|[ylacombe/cml-tts](https://huggingface.co/datasets/ylacombe/cml-tts) |cc-by-nc-4.0|
|
| 141 |
+
|
| 142 |
+
```bash
|
| 143 |
+
Model: hf://alien79/F5-TTS-italian/model_159600.safetensors
|
| 144 |
+
Vocab: hf://alien79/F5-TTS-italian/vocab.txt
|
| 145 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
- Trained by [Mithril Man](https://github.com/MithrilMan)
|
| 149 |
+
- Model details on [hf project home](https://huggingface.co/alien79/F5-TTS-italian)
|
| 150 |
+
- Open to collaborations to further improve the model
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
## Japanese
|
| 154 |
+
|
| 155 |
+
#### F5-TTS Base @ ja @ Jmica
|
| 156 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
| 157 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 158 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/Jmica/F5TTS/tree/main/JA_21999120)|[Emilia 1.7k JA](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07) & [Galgame Dataset 5.4k](https://huggingface.co/datasets/OOPPEENN/Galgame_Dataset)|cc-by-nc-4.0|
|
| 159 |
+
|
| 160 |
+
```bash
|
| 161 |
+
Model: hf://Jmica/F5TTS/JA_21999120/model_21999120.pt
|
| 162 |
+
Vocab: hf://Jmica/F5TTS/JA_21999120/vocab_japanese.txt
|
| 163 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
## Mandarin
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
## Russian
|
| 171 |
+
|
| 172 |
+
#### F5-TTS Base @ ru @ HotDro4illa
|
| 173 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
| 174 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 175 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/hotstone228/F5-TTS-Russian)|[Common voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0)|cc-by-nc-4.0|
|
| 176 |
+
|
| 177 |
+
```bash
|
| 178 |
+
Model: hf://hotstone228/F5-TTS-Russian/model_last.safetensors
|
| 179 |
+
Vocab: hf://hotstone228/F5-TTS-Russian/vocab.txt
|
| 180 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
|
| 181 |
+
```
|
| 182 |
+
- Finetuned by [HotDro4illa](https://github.com/HotDro4illa)
|
| 183 |
+
- Any improvements are welcome
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
## Spanish
|
| 187 |
+
|
| 188 |
+
#### F5-TTS Base @ es @ jpgallegoar
|
| 189 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
| 190 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 191 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/jpgallegoar/F5-Spanish)|[Voxpopuli](https://huggingface.co/datasets/facebook/voxpopuli) & Crowdsourced & TEDx, 218 hours|cc0-1.0|
|
| 192 |
+
|
| 193 |
+
- @jpgallegoar [GitHub repo](https://github.com/jpgallegoar/Spanish-F5), Jupyter Notebook and Gradio usage for Spanish model.
|
src/f5_tts/infer/__pycache__/infer_cli.cpython-311.pyc
ADDED
|
Binary file (15.2 kB). View file
|
|
|
src/f5_tts/infer/__pycache__/infer_cli_emotion.cpython-311.pyc
ADDED
|
Binary file (9.47 kB). View file
|
|
|
src/f5_tts/infer/__pycache__/infer_elevenlabs.cpython-311.pyc
ADDED
|
Binary file (4.53 kB). View file
|
|
|
src/f5_tts/infer/__pycache__/infer_emotion.cpython-311.pyc
ADDED
|
Binary file (11 kB). View file
|
|
|
src/f5_tts/infer/__pycache__/utils_infer.cpython-311.pyc
ADDED
|
Binary file (24.6 kB). View file
|
|
|
src/f5_tts/infer/examples/basic/basic.toml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# F5TTS_v1_Base | E2TTS_Base
|
| 2 |
+
model = "F5TTS_v1_Base"
|
| 3 |
+
ref_audio = "infer/examples/basic/basic_ref_en.wav"
|
| 4 |
+
# If an empty "", transcribes the reference audio automatically.
|
| 5 |
+
ref_text = "Some call me nature, others call me mother nature."
|
| 6 |
+
gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
|
| 7 |
+
# File with text to generate. Ignores the text above.
|
| 8 |
+
gen_file = ""
|
| 9 |
+
remove_silence = false
|
| 10 |
+
output_dir = "tests"
|
| 11 |
+
output_file = "infer_cli_basic.wav"
|
src/f5_tts/infer/examples/basic/basic_ref_en.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0e22048e72414fcc1e6b6342e47a774d748a195ed34e4a5b3fcf416707f2b71
|
| 3 |
+
size 256018
|
src/f5_tts/infer/examples/basic/basic_ref_zh.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:96724a113240d1f82c6ded1334122f0176b96c9226ccd3c919e625bcfd2a3ede
|
| 3 |
+
size 324558
|
src/f5_tts/infer/examples/multi/country.flac
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb15708b4b3875e37beec46591a5d89e1a9a63fdad3b8fe4a5c8738f4f554400
|
| 3 |
+
size 180321
|
src/f5_tts/infer/examples/multi/main.flac
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4abb1107771ce7e14926fde879b959dde6db6e572476b98684f04e45e978ab19
|
| 3 |
+
size 279219
|
src/f5_tts/infer/examples/multi/story.toml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# F5TTS_v1_Base | E2TTS_Base
|
| 2 |
+
model = "F5TTS_v1_Base"
|
| 3 |
+
ref_audio = "infer/examples/multi/main.flac"
|
| 4 |
+
# If an empty "", transcribes the reference audio automatically.
|
| 5 |
+
ref_text = ""
|
| 6 |
+
gen_text = ""
|
| 7 |
+
# File with text to generate. Ignores the text above.
|
| 8 |
+
gen_file = "infer/examples/multi/story.txt"
|
| 9 |
+
remove_silence = true
|
| 10 |
+
output_dir = "tests"
|
| 11 |
+
output_file = "infer_cli_story.wav"
|
| 12 |
+
|
| 13 |
+
[voices.town]
|
| 14 |
+
ref_audio = "infer/examples/multi/town.flac"
|
| 15 |
+
ref_text = ""
|
| 16 |
+
speed = 0.8 # will ignore global speed
|
| 17 |
+
|
| 18 |
+
[voices.country]
|
| 19 |
+
ref_audio = "infer/examples/multi/country.flac"
|
| 20 |
+
ref_text = ""
|
src/f5_tts/infer/examples/multi/story.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
A Town Mouse and a Country Mouse were acquaintances, and the Country Mouse one day invited his friend to come and see him at his home in the fields. The Town Mouse came, and they sat down to a dinner of barleycorns and roots, the latter of which had a distinctly earthy flavour. The fare was not much to the taste of the guest, and presently he broke out with [town] "My poor dear friend, you live here no better than the ants! Now, you should just see how I fare! My larder is a regular horn of plenty. You must come and stay with me, and I promise you you shall live on the fat of the land." [main] So when he returned to town he took the Country Mouse with him, and showed him into a larder containing flour and oatmeal and figs and honey and dates. The Country Mouse had never seen anything like it, and sat down to enjoy the luxuries his friend provided: but before they had well begun, the door of the larder opened and someone came in. The two Mice scampered off and hid themselves in a narrow and exceedingly uncomfortable hole. Presently, when all was quiet, they ventured out again; but someone else came in, and off they scuttled again. This was too much for the visitor. [country] "Goodbye," [main] said he, [country] "I'm off. You live in the lap of luxury, I can see, but you are surrounded by dangers; whereas at home I can enjoy my simple dinner of roots and corn in peace."
|
src/f5_tts/infer/examples/multi/town.flac
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7d069b8ebd5180c3b30fde5d378f0a1ddac96722d62cf43537efc3c3f3a3ce8
|
| 3 |
+
size 229383
|
src/f5_tts/infer/examples/vocab.txt
ADDED
|
@@ -0,0 +1,2545 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
!
|
| 3 |
+
"
|
| 4 |
+
#
|
| 5 |
+
$
|
| 6 |
+
%
|
| 7 |
+
&
|
| 8 |
+
'
|
| 9 |
+
(
|
| 10 |
+
)
|
| 11 |
+
*
|
| 12 |
+
+
|
| 13 |
+
,
|
| 14 |
+
-
|
| 15 |
+
.
|
| 16 |
+
/
|
| 17 |
+
0
|
| 18 |
+
1
|
| 19 |
+
2
|
| 20 |
+
3
|
| 21 |
+
4
|
| 22 |
+
5
|
| 23 |
+
6
|
| 24 |
+
7
|
| 25 |
+
8
|
| 26 |
+
9
|
| 27 |
+
:
|
| 28 |
+
;
|
| 29 |
+
=
|
| 30 |
+
>
|
| 31 |
+
?
|
| 32 |
+
@
|
| 33 |
+
A
|
| 34 |
+
B
|
| 35 |
+
C
|
| 36 |
+
D
|
| 37 |
+
E
|
| 38 |
+
F
|
| 39 |
+
G
|
| 40 |
+
H
|
| 41 |
+
I
|
| 42 |
+
J
|
| 43 |
+
K
|
| 44 |
+
L
|
| 45 |
+
M
|
| 46 |
+
N
|
| 47 |
+
O
|
| 48 |
+
P
|
| 49 |
+
Q
|
| 50 |
+
R
|
| 51 |
+
S
|
| 52 |
+
T
|
| 53 |
+
U
|
| 54 |
+
V
|
| 55 |
+
W
|
| 56 |
+
X
|
| 57 |
+
Y
|
| 58 |
+
Z
|
| 59 |
+
[
|
| 60 |
+
\
|
| 61 |
+
]
|
| 62 |
+
_
|
| 63 |
+
a
|
| 64 |
+
a1
|
| 65 |
+
ai1
|
| 66 |
+
ai2
|
| 67 |
+
ai3
|
| 68 |
+
ai4
|
| 69 |
+
an1
|
| 70 |
+
an3
|
| 71 |
+
an4
|
| 72 |
+
ang1
|
| 73 |
+
ang2
|
| 74 |
+
ang4
|
| 75 |
+
ao1
|
| 76 |
+
ao2
|
| 77 |
+
ao3
|
| 78 |
+
ao4
|
| 79 |
+
b
|
| 80 |
+
ba
|
| 81 |
+
ba1
|
| 82 |
+
ba2
|
| 83 |
+
ba3
|
| 84 |
+
ba4
|
| 85 |
+
bai1
|
| 86 |
+
bai2
|
| 87 |
+
bai3
|
| 88 |
+
bai4
|
| 89 |
+
ban1
|
| 90 |
+
ban2
|
| 91 |
+
ban3
|
| 92 |
+
ban4
|
| 93 |
+
bang1
|
| 94 |
+
bang2
|
| 95 |
+
bang3
|
| 96 |
+
bang4
|
| 97 |
+
bao1
|
| 98 |
+
bao2
|
| 99 |
+
bao3
|
| 100 |
+
bao4
|
| 101 |
+
bei
|
| 102 |
+
bei1
|
| 103 |
+
bei2
|
| 104 |
+
bei3
|
| 105 |
+
bei4
|
| 106 |
+
ben1
|
| 107 |
+
ben2
|
| 108 |
+
ben3
|
| 109 |
+
ben4
|
| 110 |
+
beng
|
| 111 |
+
beng1
|
| 112 |
+
beng2
|
| 113 |
+
beng3
|
| 114 |
+
beng4
|
| 115 |
+
bi1
|
| 116 |
+
bi2
|
| 117 |
+
bi3
|
| 118 |
+
bi4
|
| 119 |
+
bian1
|
| 120 |
+
bian2
|
| 121 |
+
bian3
|
| 122 |
+
bian4
|
| 123 |
+
biao1
|
| 124 |
+
biao2
|
| 125 |
+
biao3
|
| 126 |
+
bie1
|
| 127 |
+
bie2
|
| 128 |
+
bie3
|
| 129 |
+
bie4
|
| 130 |
+
bin1
|
| 131 |
+
bin4
|
| 132 |
+
bing1
|
| 133 |
+
bing2
|
| 134 |
+
bing3
|
| 135 |
+
bing4
|
| 136 |
+
bo
|
| 137 |
+
bo1
|
| 138 |
+
bo2
|
| 139 |
+
bo3
|
| 140 |
+
bo4
|
| 141 |
+
bu2
|
| 142 |
+
bu3
|
| 143 |
+
bu4
|
| 144 |
+
c
|
| 145 |
+
ca1
|
| 146 |
+
cai1
|
| 147 |
+
cai2
|
| 148 |
+
cai3
|
| 149 |
+
cai4
|
| 150 |
+
can1
|
| 151 |
+
can2
|
| 152 |
+
can3
|
| 153 |
+
can4
|
| 154 |
+
cang1
|
| 155 |
+
cang2
|
| 156 |
+
cao1
|
| 157 |
+
cao2
|
| 158 |
+
cao3
|
| 159 |
+
ce4
|
| 160 |
+
cen1
|
| 161 |
+
cen2
|
| 162 |
+
ceng1
|
| 163 |
+
ceng2
|
| 164 |
+
ceng4
|
| 165 |
+
cha1
|
| 166 |
+
cha2
|
| 167 |
+
cha3
|
| 168 |
+
cha4
|
| 169 |
+
chai1
|
| 170 |
+
chai2
|
| 171 |
+
chan1
|
| 172 |
+
chan2
|
| 173 |
+
chan3
|
| 174 |
+
chan4
|
| 175 |
+
chang1
|
| 176 |
+
chang2
|
| 177 |
+
chang3
|
| 178 |
+
chang4
|
| 179 |
+
chao1
|
| 180 |
+
chao2
|
| 181 |
+
chao3
|
| 182 |
+
che1
|
| 183 |
+
che2
|
| 184 |
+
che3
|
| 185 |
+
che4
|
| 186 |
+
chen1
|
| 187 |
+
chen2
|
| 188 |
+
chen3
|
| 189 |
+
chen4
|
| 190 |
+
cheng1
|
| 191 |
+
cheng2
|
| 192 |
+
cheng3
|
| 193 |
+
cheng4
|
| 194 |
+
chi1
|
| 195 |
+
chi2
|
| 196 |
+
chi3
|
| 197 |
+
chi4
|
| 198 |
+
chong1
|
| 199 |
+
chong2
|
| 200 |
+
chong3
|
| 201 |
+
chong4
|
| 202 |
+
chou1
|
| 203 |
+
chou2
|
| 204 |
+
chou3
|
| 205 |
+
chou4
|
| 206 |
+
chu1
|
| 207 |
+
chu2
|
| 208 |
+
chu3
|
| 209 |
+
chu4
|
| 210 |
+
chua1
|
| 211 |
+
chuai1
|
| 212 |
+
chuai2
|
| 213 |
+
chuai3
|
| 214 |
+
chuai4
|
| 215 |
+
chuan1
|
| 216 |
+
chuan2
|
| 217 |
+
chuan3
|
| 218 |
+
chuan4
|
| 219 |
+
chuang1
|
| 220 |
+
chuang2
|
| 221 |
+
chuang3
|
| 222 |
+
chuang4
|
| 223 |
+
chui1
|
| 224 |
+
chui2
|
| 225 |
+
chun1
|
| 226 |
+
chun2
|
| 227 |
+
chun3
|
| 228 |
+
chuo1
|
| 229 |
+
chuo4
|
| 230 |
+
ci1
|
| 231 |
+
ci2
|
| 232 |
+
ci3
|
| 233 |
+
ci4
|
| 234 |
+
cong1
|
| 235 |
+
cong2
|
| 236 |
+
cou4
|
| 237 |
+
cu1
|
| 238 |
+
cu4
|
| 239 |
+
cuan1
|
| 240 |
+
cuan2
|
| 241 |
+
cuan4
|
| 242 |
+
cui1
|
| 243 |
+
cui3
|
| 244 |
+
cui4
|
| 245 |
+
cun1
|
| 246 |
+
cun2
|
| 247 |
+
cun4
|
| 248 |
+
cuo1
|
| 249 |
+
cuo2
|
| 250 |
+
cuo4
|
| 251 |
+
d
|
| 252 |
+
da
|
| 253 |
+
da1
|
| 254 |
+
da2
|
| 255 |
+
da3
|
| 256 |
+
da4
|
| 257 |
+
dai1
|
| 258 |
+
dai2
|
| 259 |
+
dai3
|
| 260 |
+
dai4
|
| 261 |
+
dan1
|
| 262 |
+
dan2
|
| 263 |
+
dan3
|
| 264 |
+
dan4
|
| 265 |
+
dang1
|
| 266 |
+
dang2
|
| 267 |
+
dang3
|
| 268 |
+
dang4
|
| 269 |
+
dao1
|
| 270 |
+
dao2
|
| 271 |
+
dao3
|
| 272 |
+
dao4
|
| 273 |
+
de
|
| 274 |
+
de1
|
| 275 |
+
de2
|
| 276 |
+
dei3
|
| 277 |
+
den4
|
| 278 |
+
deng1
|
| 279 |
+
deng2
|
| 280 |
+
deng3
|
| 281 |
+
deng4
|
| 282 |
+
di1
|
| 283 |
+
di2
|
| 284 |
+
di3
|
| 285 |
+
di4
|
| 286 |
+
dia3
|
| 287 |
+
dian1
|
| 288 |
+
dian2
|
| 289 |
+
dian3
|
| 290 |
+
dian4
|
| 291 |
+
diao1
|
| 292 |
+
diao3
|
| 293 |
+
diao4
|
| 294 |
+
die1
|
| 295 |
+
die2
|
| 296 |
+
die4
|
| 297 |
+
ding1
|
| 298 |
+
ding2
|
| 299 |
+
ding3
|
| 300 |
+
ding4
|
| 301 |
+
diu1
|
| 302 |
+
dong1
|
| 303 |
+
dong3
|
| 304 |
+
dong4
|
| 305 |
+
dou1
|
| 306 |
+
dou2
|
| 307 |
+
dou3
|
| 308 |
+
dou4
|
| 309 |
+
du1
|
| 310 |
+
du2
|
| 311 |
+
du3
|
| 312 |
+
du4
|
| 313 |
+
duan1
|
| 314 |
+
duan2
|
| 315 |
+
duan3
|
| 316 |
+
duan4
|
| 317 |
+
dui1
|
| 318 |
+
dui4
|
| 319 |
+
dun1
|
| 320 |
+
dun3
|
| 321 |
+
dun4
|
| 322 |
+
duo1
|
| 323 |
+
duo2
|
| 324 |
+
duo3
|
| 325 |
+
duo4
|
| 326 |
+
e
|
| 327 |
+
e1
|
| 328 |
+
e2
|
| 329 |
+
e3
|
| 330 |
+
e4
|
| 331 |
+
ei2
|
| 332 |
+
en1
|
| 333 |
+
en4
|
| 334 |
+
er
|
| 335 |
+
er2
|
| 336 |
+
er3
|
| 337 |
+
er4
|
| 338 |
+
f
|
| 339 |
+
fa1
|
| 340 |
+
fa2
|
| 341 |
+
fa3
|
| 342 |
+
fa4
|
| 343 |
+
fan1
|
| 344 |
+
fan2
|
| 345 |
+
fan3
|
| 346 |
+
fan4
|
| 347 |
+
fang1
|
| 348 |
+
fang2
|
| 349 |
+
fang3
|
| 350 |
+
fang4
|
| 351 |
+
fei1
|
| 352 |
+
fei2
|
| 353 |
+
fei3
|
| 354 |
+
fei4
|
| 355 |
+
fen1
|
| 356 |
+
fen2
|
| 357 |
+
fen3
|
| 358 |
+
fen4
|
| 359 |
+
feng1
|
| 360 |
+
feng2
|
| 361 |
+
feng3
|
| 362 |
+
feng4
|
| 363 |
+
fo2
|
| 364 |
+
fou2
|
| 365 |
+
fou3
|
| 366 |
+
fu1
|
| 367 |
+
fu2
|
| 368 |
+
fu3
|
| 369 |
+
fu4
|
| 370 |
+
g
|
| 371 |
+
ga1
|
| 372 |
+
ga2
|
| 373 |
+
ga3
|
| 374 |
+
ga4
|
| 375 |
+
gai1
|
| 376 |
+
gai2
|
| 377 |
+
gai3
|
| 378 |
+
gai4
|
| 379 |
+
gan1
|
| 380 |
+
gan2
|
| 381 |
+
gan3
|
| 382 |
+
gan4
|
| 383 |
+
gang1
|
| 384 |
+
gang2
|
| 385 |
+
gang3
|
| 386 |
+
gang4
|
| 387 |
+
gao1
|
| 388 |
+
gao2
|
| 389 |
+
gao3
|
| 390 |
+
gao4
|
| 391 |
+
ge1
|
| 392 |
+
ge2
|
| 393 |
+
ge3
|
| 394 |
+
ge4
|
| 395 |
+
gei2
|
| 396 |
+
gei3
|
| 397 |
+
gen1
|
| 398 |
+
gen2
|
| 399 |
+
gen3
|
| 400 |
+
gen4
|
| 401 |
+
geng1
|
| 402 |
+
geng3
|
| 403 |
+
geng4
|
| 404 |
+
gong1
|
| 405 |
+
gong3
|
| 406 |
+
gong4
|
| 407 |
+
gou1
|
| 408 |
+
gou2
|
| 409 |
+
gou3
|
| 410 |
+
gou4
|
| 411 |
+
gu
|
| 412 |
+
gu1
|
| 413 |
+
gu2
|
| 414 |
+
gu3
|
| 415 |
+
gu4
|
| 416 |
+
gua1
|
| 417 |
+
gua2
|
| 418 |
+
gua3
|
| 419 |
+
gua4
|
| 420 |
+
guai1
|
| 421 |
+
guai2
|
| 422 |
+
guai3
|
| 423 |
+
guai4
|
| 424 |
+
guan1
|
| 425 |
+
guan2
|
| 426 |
+
guan3
|
| 427 |
+
guan4
|
| 428 |
+
guang1
|
| 429 |
+
guang2
|
| 430 |
+
guang3
|
| 431 |
+
guang4
|
| 432 |
+
gui1
|
| 433 |
+
gui2
|
| 434 |
+
gui3
|
| 435 |
+
gui4
|
| 436 |
+
gun3
|
| 437 |
+
gun4
|
| 438 |
+
guo1
|
| 439 |
+
guo2
|
| 440 |
+
guo3
|
| 441 |
+
guo4
|
| 442 |
+
h
|
| 443 |
+
ha1
|
| 444 |
+
ha2
|
| 445 |
+
ha3
|
| 446 |
+
hai1
|
| 447 |
+
hai2
|
| 448 |
+
hai3
|
| 449 |
+
hai4
|
| 450 |
+
han1
|
| 451 |
+
han2
|
| 452 |
+
han3
|
| 453 |
+
han4
|
| 454 |
+
hang1
|
| 455 |
+
hang2
|
| 456 |
+
hang4
|
| 457 |
+
hao1
|
| 458 |
+
hao2
|
| 459 |
+
hao3
|
| 460 |
+
hao4
|
| 461 |
+
he1
|
| 462 |
+
he2
|
| 463 |
+
he4
|
| 464 |
+
hei1
|
| 465 |
+
hen2
|
| 466 |
+
hen3
|
| 467 |
+
hen4
|
| 468 |
+
heng1
|
| 469 |
+
heng2
|
| 470 |
+
heng4
|
| 471 |
+
hong1
|
| 472 |
+
hong2
|
| 473 |
+
hong3
|
| 474 |
+
hong4
|
| 475 |
+
hou1
|
| 476 |
+
hou2
|
| 477 |
+
hou3
|
| 478 |
+
hou4
|
| 479 |
+
hu1
|
| 480 |
+
hu2
|
| 481 |
+
hu3
|
| 482 |
+
hu4
|
| 483 |
+
hua1
|
| 484 |
+
hua2
|
| 485 |
+
hua4
|
| 486 |
+
huai2
|
| 487 |
+
huai4
|
| 488 |
+
huan1
|
| 489 |
+
huan2
|
| 490 |
+
huan3
|
| 491 |
+
huan4
|
| 492 |
+
huang1
|
| 493 |
+
huang2
|
| 494 |
+
huang3
|
| 495 |
+
huang4
|
| 496 |
+
hui1
|
| 497 |
+
hui2
|
| 498 |
+
hui3
|
| 499 |
+
hui4
|
| 500 |
+
hun1
|
| 501 |
+
hun2
|
| 502 |
+
hun4
|
| 503 |
+
huo
|
| 504 |
+
huo1
|
| 505 |
+
huo2
|
| 506 |
+
huo3
|
| 507 |
+
huo4
|
| 508 |
+
i
|
| 509 |
+
j
|
| 510 |
+
ji1
|
| 511 |
+
ji2
|
| 512 |
+
ji3
|
| 513 |
+
ji4
|
| 514 |
+
jia
|
| 515 |
+
jia1
|
| 516 |
+
jia2
|
| 517 |
+
jia3
|
| 518 |
+
jia4
|
| 519 |
+
jian1
|
| 520 |
+
jian2
|
| 521 |
+
jian3
|
| 522 |
+
jian4
|
| 523 |
+
jiang1
|
| 524 |
+
jiang2
|
| 525 |
+
jiang3
|
| 526 |
+
jiang4
|
| 527 |
+
jiao1
|
| 528 |
+
jiao2
|
| 529 |
+
jiao3
|
| 530 |
+
jiao4
|
| 531 |
+
jie1
|
| 532 |
+
jie2
|
| 533 |
+
jie3
|
| 534 |
+
jie4
|
| 535 |
+
jin1
|
| 536 |
+
jin2
|
| 537 |
+
jin3
|
| 538 |
+
jin4
|
| 539 |
+
jing1
|
| 540 |
+
jing2
|
| 541 |
+
jing3
|
| 542 |
+
jing4
|
| 543 |
+
jiong3
|
| 544 |
+
jiu1
|
| 545 |
+
jiu2
|
| 546 |
+
jiu3
|
| 547 |
+
jiu4
|
| 548 |
+
ju1
|
| 549 |
+
ju2
|
| 550 |
+
ju3
|
| 551 |
+
ju4
|
| 552 |
+
juan1
|
| 553 |
+
juan2
|
| 554 |
+
juan3
|
| 555 |
+
juan4
|
| 556 |
+
jue1
|
| 557 |
+
jue2
|
| 558 |
+
jue4
|
| 559 |
+
jun1
|
| 560 |
+
jun4
|
| 561 |
+
k
|
| 562 |
+
ka1
|
| 563 |
+
ka2
|
| 564 |
+
ka3
|
| 565 |
+
kai1
|
| 566 |
+
kai2
|
| 567 |
+
kai3
|
| 568 |
+
kai4
|
| 569 |
+
kan1
|
| 570 |
+
kan2
|
| 571 |
+
kan3
|
| 572 |
+
kan4
|
| 573 |
+
kang1
|
| 574 |
+
kang2
|
| 575 |
+
kang4
|
| 576 |
+
kao1
|
| 577 |
+
kao2
|
| 578 |
+
kao3
|
| 579 |
+
kao4
|
| 580 |
+
ke1
|
| 581 |
+
ke2
|
| 582 |
+
ke3
|
| 583 |
+
ke4
|
| 584 |
+
ken3
|
| 585 |
+
keng1
|
| 586 |
+
kong1
|
| 587 |
+
kong3
|
| 588 |
+
kong4
|
| 589 |
+
kou1
|
| 590 |
+
kou2
|
| 591 |
+
kou3
|
| 592 |
+
kou4
|
| 593 |
+
ku1
|
| 594 |
+
ku2
|
| 595 |
+
ku3
|
| 596 |
+
ku4
|
| 597 |
+
kua1
|
| 598 |
+
kua3
|
| 599 |
+
kua4
|
| 600 |
+
kuai3
|
| 601 |
+
kuai4
|
| 602 |
+
kuan1
|
| 603 |
+
kuan2
|
| 604 |
+
kuan3
|
| 605 |
+
kuang1
|
| 606 |
+
kuang2
|
| 607 |
+
kuang4
|
| 608 |
+
kui1
|
| 609 |
+
kui2
|
| 610 |
+
kui3
|
| 611 |
+
kui4
|
| 612 |
+
kun1
|
| 613 |
+
kun3
|
| 614 |
+
kun4
|
| 615 |
+
kuo4
|
| 616 |
+
l
|
| 617 |
+
la
|
| 618 |
+
la1
|
| 619 |
+
la2
|
| 620 |
+
la3
|
| 621 |
+
la4
|
| 622 |
+
lai2
|
| 623 |
+
lai4
|
| 624 |
+
lan2
|
| 625 |
+
lan3
|
| 626 |
+
lan4
|
| 627 |
+
lang1
|
| 628 |
+
lang2
|
| 629 |
+
lang3
|
| 630 |
+
lang4
|
| 631 |
+
lao1
|
| 632 |
+
lao2
|
| 633 |
+
lao3
|
| 634 |
+
lao4
|
| 635 |
+
le
|
| 636 |
+
le1
|
| 637 |
+
le4
|
| 638 |
+
lei
|
| 639 |
+
lei1
|
| 640 |
+
lei2
|
| 641 |
+
lei3
|
| 642 |
+
lei4
|
| 643 |
+
leng1
|
| 644 |
+
leng2
|
| 645 |
+
leng3
|
| 646 |
+
leng4
|
| 647 |
+
li
|
| 648 |
+
li1
|
| 649 |
+
li2
|
| 650 |
+
li3
|
| 651 |
+
li4
|
| 652 |
+
lia3
|
| 653 |
+
lian2
|
| 654 |
+
lian3
|
| 655 |
+
lian4
|
| 656 |
+
liang2
|
| 657 |
+
liang3
|
| 658 |
+
liang4
|
| 659 |
+
liao1
|
| 660 |
+
liao2
|
| 661 |
+
liao3
|
| 662 |
+
liao4
|
| 663 |
+
lie1
|
| 664 |
+
lie2
|
| 665 |
+
lie3
|
| 666 |
+
lie4
|
| 667 |
+
lin1
|
| 668 |
+
lin2
|
| 669 |
+
lin3
|
| 670 |
+
lin4
|
| 671 |
+
ling2
|
| 672 |
+
ling3
|
| 673 |
+
ling4
|
| 674 |
+
liu1
|
| 675 |
+
liu2
|
| 676 |
+
liu3
|
| 677 |
+
liu4
|
| 678 |
+
long1
|
| 679 |
+
long2
|
| 680 |
+
long3
|
| 681 |
+
long4
|
| 682 |
+
lou1
|
| 683 |
+
lou2
|
| 684 |
+
lou3
|
| 685 |
+
lou4
|
| 686 |
+
lu1
|
| 687 |
+
lu2
|
| 688 |
+
lu3
|
| 689 |
+
lu4
|
| 690 |
+
luan2
|
| 691 |
+
luan3
|
| 692 |
+
luan4
|
| 693 |
+
lun1
|
| 694 |
+
lun2
|
| 695 |
+
lun4
|
| 696 |
+
luo1
|
| 697 |
+
luo2
|
| 698 |
+
luo3
|
| 699 |
+
luo4
|
| 700 |
+
lv2
|
| 701 |
+
lv3
|
| 702 |
+
lv4
|
| 703 |
+
lve3
|
| 704 |
+
lve4
|
| 705 |
+
m
|
| 706 |
+
ma
|
| 707 |
+
ma1
|
| 708 |
+
ma2
|
| 709 |
+
ma3
|
| 710 |
+
ma4
|
| 711 |
+
mai2
|
| 712 |
+
mai3
|
| 713 |
+
mai4
|
| 714 |
+
man1
|
| 715 |
+
man2
|
| 716 |
+
man3
|
| 717 |
+
man4
|
| 718 |
+
mang2
|
| 719 |
+
mang3
|
| 720 |
+
mao1
|
| 721 |
+
mao2
|
| 722 |
+
mao3
|
| 723 |
+
mao4
|
| 724 |
+
me
|
| 725 |
+
mei2
|
| 726 |
+
mei3
|
| 727 |
+
mei4
|
| 728 |
+
men
|
| 729 |
+
men1
|
| 730 |
+
men2
|
| 731 |
+
men4
|
| 732 |
+
meng
|
| 733 |
+
meng1
|
| 734 |
+
meng2
|
| 735 |
+
meng3
|
| 736 |
+
meng4
|
| 737 |
+
mi1
|
| 738 |
+
mi2
|
| 739 |
+
mi3
|
| 740 |
+
mi4
|
| 741 |
+
mian2
|
| 742 |
+
mian3
|
| 743 |
+
mian4
|
| 744 |
+
miao1
|
| 745 |
+
miao2
|
| 746 |
+
miao3
|
| 747 |
+
miao4
|
| 748 |
+
mie1
|
| 749 |
+
mie4
|
| 750 |
+
min2
|
| 751 |
+
min3
|
| 752 |
+
ming2
|
| 753 |
+
ming3
|
| 754 |
+
ming4
|
| 755 |
+
miu4
|
| 756 |
+
mo1
|
| 757 |
+
mo2
|
| 758 |
+
mo3
|
| 759 |
+
mo4
|
| 760 |
+
mou1
|
| 761 |
+
mou2
|
| 762 |
+
mou3
|
| 763 |
+
mu2
|
| 764 |
+
mu3
|
| 765 |
+
mu4
|
| 766 |
+
n
|
| 767 |
+
n2
|
| 768 |
+
na1
|
| 769 |
+
na2
|
| 770 |
+
na3
|
| 771 |
+
na4
|
| 772 |
+
nai2
|
| 773 |
+
nai3
|
| 774 |
+
nai4
|
| 775 |
+
nan1
|
| 776 |
+
nan2
|
| 777 |
+
nan3
|
| 778 |
+
nan4
|
| 779 |
+
nang1
|
| 780 |
+
nang2
|
| 781 |
+
nang3
|
| 782 |
+
nao1
|
| 783 |
+
nao2
|
| 784 |
+
nao3
|
| 785 |
+
nao4
|
| 786 |
+
ne
|
| 787 |
+
ne2
|
| 788 |
+
ne4
|
| 789 |
+
nei3
|
| 790 |
+
nei4
|
| 791 |
+
nen4
|
| 792 |
+
neng2
|
| 793 |
+
ni1
|
| 794 |
+
ni2
|
| 795 |
+
ni3
|
| 796 |
+
ni4
|
| 797 |
+
nian1
|
| 798 |
+
nian2
|
| 799 |
+
nian3
|
| 800 |
+
nian4
|
| 801 |
+
niang2
|
| 802 |
+
niang4
|
| 803 |
+
niao2
|
| 804 |
+
niao3
|
| 805 |
+
niao4
|
| 806 |
+
nie1
|
| 807 |
+
nie4
|
| 808 |
+
nin2
|
| 809 |
+
ning2
|
| 810 |
+
ning3
|
| 811 |
+
ning4
|
| 812 |
+
niu1
|
| 813 |
+
niu2
|
| 814 |
+
niu3
|
| 815 |
+
niu4
|
| 816 |
+
nong2
|
| 817 |
+
nong4
|
| 818 |
+
nou4
|
| 819 |
+
nu2
|
| 820 |
+
nu3
|
| 821 |
+
nu4
|
| 822 |
+
nuan3
|
| 823 |
+
nuo2
|
| 824 |
+
nuo4
|
| 825 |
+
nv2
|
| 826 |
+
nv3
|
| 827 |
+
nve4
|
| 828 |
+
o
|
| 829 |
+
o1
|
| 830 |
+
o2
|
| 831 |
+
ou1
|
| 832 |
+
ou2
|
| 833 |
+
ou3
|
| 834 |
+
ou4
|
| 835 |
+
p
|
| 836 |
+
pa1
|
| 837 |
+
pa2
|
| 838 |
+
pa4
|
| 839 |
+
pai1
|
| 840 |
+
pai2
|
| 841 |
+
pai3
|
| 842 |
+
pai4
|
| 843 |
+
pan1
|
| 844 |
+
pan2
|
| 845 |
+
pan4
|
| 846 |
+
pang1
|
| 847 |
+
pang2
|
| 848 |
+
pang4
|
| 849 |
+
pao1
|
| 850 |
+
pao2
|
| 851 |
+
pao3
|
| 852 |
+
pao4
|
| 853 |
+
pei1
|
| 854 |
+
pei2
|
| 855 |
+
pei4
|
| 856 |
+
pen1
|
| 857 |
+
pen2
|
| 858 |
+
pen4
|
| 859 |
+
peng1
|
| 860 |
+
peng2
|
| 861 |
+
peng3
|
| 862 |
+
peng4
|
| 863 |
+
pi1
|
| 864 |
+
pi2
|
| 865 |
+
pi3
|
| 866 |
+
pi4
|
| 867 |
+
pian1
|
| 868 |
+
pian2
|
| 869 |
+
pian4
|
| 870 |
+
piao1
|
| 871 |
+
piao2
|
| 872 |
+
piao3
|
| 873 |
+
piao4
|
| 874 |
+
pie1
|
| 875 |
+
pie2
|
| 876 |
+
pie3
|
| 877 |
+
pin1
|
| 878 |
+
pin2
|
| 879 |
+
pin3
|
| 880 |
+
pin4
|
| 881 |
+
ping1
|
| 882 |
+
ping2
|
| 883 |
+
po1
|
| 884 |
+
po2
|
| 885 |
+
po3
|
| 886 |
+
po4
|
| 887 |
+
pou1
|
| 888 |
+
pu1
|
| 889 |
+
pu2
|
| 890 |
+
pu3
|
| 891 |
+
pu4
|
| 892 |
+
q
|
| 893 |
+
qi1
|
| 894 |
+
qi2
|
| 895 |
+
qi3
|
| 896 |
+
qi4
|
| 897 |
+
qia1
|
| 898 |
+
qia3
|
| 899 |
+
qia4
|
| 900 |
+
qian1
|
| 901 |
+
qian2
|
| 902 |
+
qian3
|
| 903 |
+
qian4
|
| 904 |
+
qiang1
|
| 905 |
+
qiang2
|
| 906 |
+
qiang3
|
| 907 |
+
qiang4
|
| 908 |
+
qiao1
|
| 909 |
+
qiao2
|
| 910 |
+
qiao3
|
| 911 |
+
qiao4
|
| 912 |
+
qie1
|
| 913 |
+
qie2
|
| 914 |
+
qie3
|
| 915 |
+
qie4
|
| 916 |
+
qin1
|
| 917 |
+
qin2
|
| 918 |
+
qin3
|
| 919 |
+
qin4
|
| 920 |
+
qing1
|
| 921 |
+
qing2
|
| 922 |
+
qing3
|
| 923 |
+
qing4
|
| 924 |
+
qiong1
|
| 925 |
+
qiong2
|
| 926 |
+
qiu1
|
| 927 |
+
qiu2
|
| 928 |
+
qiu3
|
| 929 |
+
qu1
|
| 930 |
+
qu2
|
| 931 |
+
qu3
|
| 932 |
+
qu4
|
| 933 |
+
quan1
|
| 934 |
+
quan2
|
| 935 |
+
quan3
|
| 936 |
+
quan4
|
| 937 |
+
que1
|
| 938 |
+
que2
|
| 939 |
+
que4
|
| 940 |
+
qun2
|
| 941 |
+
r
|
| 942 |
+
ran2
|
| 943 |
+
ran3
|
| 944 |
+
rang1
|
| 945 |
+
rang2
|
| 946 |
+
rang3
|
| 947 |
+
rang4
|
| 948 |
+
rao2
|
| 949 |
+
rao3
|
| 950 |
+
rao4
|
| 951 |
+
re2
|
| 952 |
+
re3
|
| 953 |
+
re4
|
| 954 |
+
ren2
|
| 955 |
+
ren3
|
| 956 |
+
ren4
|
| 957 |
+
reng1
|
| 958 |
+
reng2
|
| 959 |
+
ri4
|
| 960 |
+
rong1
|
| 961 |
+
rong2
|
| 962 |
+
rong3
|
| 963 |
+
rou2
|
| 964 |
+
rou4
|
| 965 |
+
ru2
|
| 966 |
+
ru3
|
| 967 |
+
ru4
|
| 968 |
+
ruan2
|
| 969 |
+
ruan3
|
| 970 |
+
rui3
|
| 971 |
+
rui4
|
| 972 |
+
run4
|
| 973 |
+
ruo4
|
| 974 |
+
s
|
| 975 |
+
sa1
|
| 976 |
+
sa2
|
| 977 |
+
sa3
|
| 978 |
+
sa4
|
| 979 |
+
sai1
|
| 980 |
+
sai4
|
| 981 |
+
san1
|
| 982 |
+
san2
|
| 983 |
+
san3
|
| 984 |
+
san4
|
| 985 |
+
sang1
|
| 986 |
+
sang3
|
| 987 |
+
sang4
|
| 988 |
+
sao1
|
| 989 |
+
sao2
|
| 990 |
+
sao3
|
| 991 |
+
sao4
|
| 992 |
+
se4
|
| 993 |
+
sen1
|
| 994 |
+
seng1
|
| 995 |
+
sha1
|
| 996 |
+
sha2
|
| 997 |
+
sha3
|
| 998 |
+
sha4
|
| 999 |
+
shai1
|
| 1000 |
+
shai2
|
| 1001 |
+
shai3
|
| 1002 |
+
shai4
|
| 1003 |
+
shan1
|
| 1004 |
+
shan3
|
| 1005 |
+
shan4
|
| 1006 |
+
shang
|
| 1007 |
+
shang1
|
| 1008 |
+
shang3
|
| 1009 |
+
shang4
|
| 1010 |
+
shao1
|
| 1011 |
+
shao2
|
| 1012 |
+
shao3
|
| 1013 |
+
shao4
|
| 1014 |
+
she1
|
| 1015 |
+
she2
|
| 1016 |
+
she3
|
| 1017 |
+
she4
|
| 1018 |
+
shei2
|
| 1019 |
+
shen1
|
| 1020 |
+
shen2
|
| 1021 |
+
shen3
|
| 1022 |
+
shen4
|
| 1023 |
+
sheng1
|
| 1024 |
+
sheng2
|
| 1025 |
+
sheng3
|
| 1026 |
+
sheng4
|
| 1027 |
+
shi
|
| 1028 |
+
shi1
|
| 1029 |
+
shi2
|
| 1030 |
+
shi3
|
| 1031 |
+
shi4
|
| 1032 |
+
shou1
|
| 1033 |
+
shou2
|
| 1034 |
+
shou3
|
| 1035 |
+
shou4
|
| 1036 |
+
shu1
|
| 1037 |
+
shu2
|
| 1038 |
+
shu3
|
| 1039 |
+
shu4
|
| 1040 |
+
shua1
|
| 1041 |
+
shua2
|
| 1042 |
+
shua3
|
| 1043 |
+
shua4
|
| 1044 |
+
shuai1
|
| 1045 |
+
shuai3
|
| 1046 |
+
shuai4
|
| 1047 |
+
shuan1
|
| 1048 |
+
shuan4
|
| 1049 |
+
shuang1
|
| 1050 |
+
shuang3
|
| 1051 |
+
shui2
|
| 1052 |
+
shui3
|
| 1053 |
+
shui4
|
| 1054 |
+
shun3
|
| 1055 |
+
shun4
|
| 1056 |
+
shuo1
|
| 1057 |
+
shuo4
|
| 1058 |
+
si1
|
| 1059 |
+
si2
|
| 1060 |
+
si3
|
| 1061 |
+
si4
|
| 1062 |
+
song1
|
| 1063 |
+
song3
|
| 1064 |
+
song4
|
| 1065 |
+
sou1
|
| 1066 |
+
sou3
|
| 1067 |
+
sou4
|
| 1068 |
+
su1
|
| 1069 |
+
su2
|
| 1070 |
+
su4
|
| 1071 |
+
suan1
|
| 1072 |
+
suan4
|
| 1073 |
+
sui1
|
| 1074 |
+
sui2
|
| 1075 |
+
sui3
|
| 1076 |
+
sui4
|
| 1077 |
+
sun1
|
| 1078 |
+
sun3
|
| 1079 |
+
suo
|
| 1080 |
+
suo1
|
| 1081 |
+
suo2
|
| 1082 |
+
suo3
|
| 1083 |
+
t
|
| 1084 |
+
ta1
|
| 1085 |
+
ta2
|
| 1086 |
+
ta3
|
| 1087 |
+
ta4
|
| 1088 |
+
tai1
|
| 1089 |
+
tai2
|
| 1090 |
+
tai4
|
| 1091 |
+
tan1
|
| 1092 |
+
tan2
|
| 1093 |
+
tan3
|
| 1094 |
+
tan4
|
| 1095 |
+
tang1
|
| 1096 |
+
tang2
|
| 1097 |
+
tang3
|
| 1098 |
+
tang4
|
| 1099 |
+
tao1
|
| 1100 |
+
tao2
|
| 1101 |
+
tao3
|
| 1102 |
+
tao4
|
| 1103 |
+
te4
|
| 1104 |
+
teng2
|
| 1105 |
+
ti1
|
| 1106 |
+
ti2
|
| 1107 |
+
ti3
|
| 1108 |
+
ti4
|
| 1109 |
+
tian1
|
| 1110 |
+
tian2
|
| 1111 |
+
tian3
|
| 1112 |
+
tiao1
|
| 1113 |
+
tiao2
|
| 1114 |
+
tiao3
|
| 1115 |
+
tiao4
|
| 1116 |
+
tie1
|
| 1117 |
+
tie2
|
| 1118 |
+
tie3
|
| 1119 |
+
tie4
|
| 1120 |
+
ting1
|
| 1121 |
+
ting2
|
| 1122 |
+
ting3
|
| 1123 |
+
tong1
|
| 1124 |
+
tong2
|
| 1125 |
+
tong3
|
| 1126 |
+
tong4
|
| 1127 |
+
tou
|
| 1128 |
+
tou1
|
| 1129 |
+
tou2
|
| 1130 |
+
tou4
|
| 1131 |
+
tu1
|
| 1132 |
+
tu2
|
| 1133 |
+
tu3
|
| 1134 |
+
tu4
|
| 1135 |
+
tuan1
|
| 1136 |
+
tuan2
|
| 1137 |
+
tui1
|
| 1138 |
+
tui2
|
| 1139 |
+
tui3
|
| 1140 |
+
tui4
|
| 1141 |
+
tun1
|
| 1142 |
+
tun2
|
| 1143 |
+
tun4
|
| 1144 |
+
tuo1
|
| 1145 |
+
tuo2
|
| 1146 |
+
tuo3
|
| 1147 |
+
tuo4
|
| 1148 |
+
u
|
| 1149 |
+
v
|
| 1150 |
+
w
|
| 1151 |
+
wa
|
| 1152 |
+
wa1
|
| 1153 |
+
wa2
|
| 1154 |
+
wa3
|
| 1155 |
+
wa4
|
| 1156 |
+
wai1
|
| 1157 |
+
wai3
|
| 1158 |
+
wai4
|
| 1159 |
+
wan1
|
| 1160 |
+
wan2
|
| 1161 |
+
wan3
|
| 1162 |
+
wan4
|
| 1163 |
+
wang1
|
| 1164 |
+
wang2
|
| 1165 |
+
wang3
|
| 1166 |
+
wang4
|
| 1167 |
+
wei1
|
| 1168 |
+
wei2
|
| 1169 |
+
wei3
|
| 1170 |
+
wei4
|
| 1171 |
+
wen1
|
| 1172 |
+
wen2
|
| 1173 |
+
wen3
|
| 1174 |
+
wen4
|
| 1175 |
+
weng1
|
| 1176 |
+
weng4
|
| 1177 |
+
wo1
|
| 1178 |
+
wo2
|
| 1179 |
+
wo3
|
| 1180 |
+
wo4
|
| 1181 |
+
wu1
|
| 1182 |
+
wu2
|
| 1183 |
+
wu3
|
| 1184 |
+
wu4
|
| 1185 |
+
x
|
| 1186 |
+
xi1
|
| 1187 |
+
xi2
|
| 1188 |
+
xi3
|
| 1189 |
+
xi4
|
| 1190 |
+
xia1
|
| 1191 |
+
xia2
|
| 1192 |
+
xia4
|
| 1193 |
+
xian1
|
| 1194 |
+
xian2
|
| 1195 |
+
xian3
|
| 1196 |
+
xian4
|
| 1197 |
+
xiang1
|
| 1198 |
+
xiang2
|
| 1199 |
+
xiang3
|
| 1200 |
+
xiang4
|
| 1201 |
+
xiao1
|
| 1202 |
+
xiao2
|
| 1203 |
+
xiao3
|
| 1204 |
+
xiao4
|
| 1205 |
+
xie1
|
| 1206 |
+
xie2
|
| 1207 |
+
xie3
|
| 1208 |
+
xie4
|
| 1209 |
+
xin1
|
| 1210 |
+
xin2
|
| 1211 |
+
xin4
|
| 1212 |
+
xing1
|
| 1213 |
+
xing2
|
| 1214 |
+
xing3
|
| 1215 |
+
xing4
|
| 1216 |
+
xiong1
|
| 1217 |
+
xiong2
|
| 1218 |
+
xiu1
|
| 1219 |
+
xiu3
|
| 1220 |
+
xiu4
|
| 1221 |
+
xu
|
| 1222 |
+
xu1
|
| 1223 |
+
xu2
|
| 1224 |
+
xu3
|
| 1225 |
+
xu4
|
| 1226 |
+
xuan1
|
| 1227 |
+
xuan2
|
| 1228 |
+
xuan3
|
| 1229 |
+
xuan4
|
| 1230 |
+
xue1
|
| 1231 |
+
xue2
|
| 1232 |
+
xue3
|
| 1233 |
+
xue4
|
| 1234 |
+
xun1
|
| 1235 |
+
xun2
|
| 1236 |
+
xun4
|
| 1237 |
+
y
|
| 1238 |
+
ya
|
| 1239 |
+
ya1
|
| 1240 |
+
ya2
|
| 1241 |
+
ya3
|
| 1242 |
+
ya4
|
| 1243 |
+
yan1
|
| 1244 |
+
yan2
|
| 1245 |
+
yan3
|
| 1246 |
+
yan4
|
| 1247 |
+
yang1
|
| 1248 |
+
yang2
|
| 1249 |
+
yang3
|
| 1250 |
+
yang4
|
| 1251 |
+
yao1
|
| 1252 |
+
yao2
|
| 1253 |
+
yao3
|
| 1254 |
+
yao4
|
| 1255 |
+
ye1
|
| 1256 |
+
ye2
|
| 1257 |
+
ye3
|
| 1258 |
+
ye4
|
| 1259 |
+
yi
|
| 1260 |
+
yi1
|
| 1261 |
+
yi2
|
| 1262 |
+
yi3
|
| 1263 |
+
yi4
|
| 1264 |
+
yin1
|
| 1265 |
+
yin2
|
| 1266 |
+
yin3
|
| 1267 |
+
yin4
|
| 1268 |
+
ying1
|
| 1269 |
+
ying2
|
| 1270 |
+
ying3
|
| 1271 |
+
ying4
|
| 1272 |
+
yo1
|
| 1273 |
+
yong1
|
| 1274 |
+
yong2
|
| 1275 |
+
yong3
|
| 1276 |
+
yong4
|
| 1277 |
+
you1
|
| 1278 |
+
you2
|
| 1279 |
+
you3
|
| 1280 |
+
you4
|
| 1281 |
+
yu1
|
| 1282 |
+
yu2
|
| 1283 |
+
yu3
|
| 1284 |
+
yu4
|
| 1285 |
+
yuan1
|
| 1286 |
+
yuan2
|
| 1287 |
+
yuan3
|
| 1288 |
+
yuan4
|
| 1289 |
+
yue1
|
| 1290 |
+
yue4
|
| 1291 |
+
yun1
|
| 1292 |
+
yun2
|
| 1293 |
+
yun3
|
| 1294 |
+
yun4
|
| 1295 |
+
z
|
| 1296 |
+
za1
|
| 1297 |
+
za2
|
| 1298 |
+
za3
|
| 1299 |
+
zai1
|
| 1300 |
+
zai3
|
| 1301 |
+
zai4
|
| 1302 |
+
zan1
|
| 1303 |
+
zan2
|
| 1304 |
+
zan3
|
| 1305 |
+
zan4
|
| 1306 |
+
zang1
|
| 1307 |
+
zang4
|
| 1308 |
+
zao1
|
| 1309 |
+
zao2
|
| 1310 |
+
zao3
|
| 1311 |
+
zao4
|
| 1312 |
+
ze2
|
| 1313 |
+
ze4
|
| 1314 |
+
zei2
|
| 1315 |
+
zen3
|
| 1316 |
+
zeng1
|
| 1317 |
+
zeng4
|
| 1318 |
+
zha1
|
| 1319 |
+
zha2
|
| 1320 |
+
zha3
|
| 1321 |
+
zha4
|
| 1322 |
+
zhai1
|
| 1323 |
+
zhai2
|
| 1324 |
+
zhai3
|
| 1325 |
+
zhai4
|
| 1326 |
+
zhan1
|
| 1327 |
+
zhan2
|
| 1328 |
+
zhan3
|
| 1329 |
+
zhan4
|
| 1330 |
+
zhang1
|
| 1331 |
+
zhang2
|
| 1332 |
+
zhang3
|
| 1333 |
+
zhang4
|
| 1334 |
+
zhao1
|
| 1335 |
+
zhao2
|
| 1336 |
+
zhao3
|
| 1337 |
+
zhao4
|
| 1338 |
+
zhe
|
| 1339 |
+
zhe1
|
| 1340 |
+
zhe2
|
| 1341 |
+
zhe3
|
| 1342 |
+
zhe4
|
| 1343 |
+
zhen1
|
| 1344 |
+
zhen2
|
| 1345 |
+
zhen3
|
| 1346 |
+
zhen4
|
| 1347 |
+
zheng1
|
| 1348 |
+
zheng2
|
| 1349 |
+
zheng3
|
| 1350 |
+
zheng4
|
| 1351 |
+
zhi1
|
| 1352 |
+
zhi2
|
| 1353 |
+
zhi3
|
| 1354 |
+
zhi4
|
| 1355 |
+
zhong1
|
| 1356 |
+
zhong2
|
| 1357 |
+
zhong3
|
| 1358 |
+
zhong4
|
| 1359 |
+
zhou1
|
| 1360 |
+
zhou2
|
| 1361 |
+
zhou3
|
| 1362 |
+
zhou4
|
| 1363 |
+
zhu1
|
| 1364 |
+
zhu2
|
| 1365 |
+
zhu3
|
| 1366 |
+
zhu4
|
| 1367 |
+
zhua1
|
| 1368 |
+
zhua2
|
| 1369 |
+
zhua3
|
| 1370 |
+
zhuai1
|
| 1371 |
+
zhuai3
|
| 1372 |
+
zhuai4
|
| 1373 |
+
zhuan1
|
| 1374 |
+
zhuan2
|
| 1375 |
+
zhuan3
|
| 1376 |
+
zhuan4
|
| 1377 |
+
zhuang1
|
| 1378 |
+
zhuang4
|
| 1379 |
+
zhui1
|
| 1380 |
+
zhui4
|
| 1381 |
+
zhun1
|
| 1382 |
+
zhun2
|
| 1383 |
+
zhun3
|
| 1384 |
+
zhuo1
|
| 1385 |
+
zhuo2
|
| 1386 |
+
zi
|
| 1387 |
+
zi1
|
| 1388 |
+
zi2
|
| 1389 |
+
zi3
|
| 1390 |
+
zi4
|
| 1391 |
+
zong1
|
| 1392 |
+
zong2
|
| 1393 |
+
zong3
|
| 1394 |
+
zong4
|
| 1395 |
+
zou1
|
| 1396 |
+
zou2
|
| 1397 |
+
zou3
|
| 1398 |
+
zou4
|
| 1399 |
+
zu1
|
| 1400 |
+
zu2
|
| 1401 |
+
zu3
|
| 1402 |
+
zuan1
|
| 1403 |
+
zuan3
|
| 1404 |
+
zuan4
|
| 1405 |
+
zui2
|
| 1406 |
+
zui3
|
| 1407 |
+
zui4
|
| 1408 |
+
zun1
|
| 1409 |
+
zuo
|
| 1410 |
+
zuo1
|
| 1411 |
+
zuo2
|
| 1412 |
+
zuo3
|
| 1413 |
+
zuo4
|
| 1414 |
+
{
|
| 1415 |
+
~
|
| 1416 |
+
¡
|
| 1417 |
+
¢
|
| 1418 |
+
£
|
| 1419 |
+
¥
|
| 1420 |
+
§
|
| 1421 |
+
¨
|
| 1422 |
+
©
|
| 1423 |
+
«
|
| 1424 |
+
®
|
| 1425 |
+
¯
|
| 1426 |
+
°
|
| 1427 |
+
±
|
| 1428 |
+
²
|
| 1429 |
+
³
|
| 1430 |
+
´
|
| 1431 |
+
µ
|
| 1432 |
+
·
|
| 1433 |
+
¹
|
| 1434 |
+
º
|
| 1435 |
+
»
|
| 1436 |
+
¼
|
| 1437 |
+
½
|
| 1438 |
+
¾
|
| 1439 |
+
¿
|
| 1440 |
+
À
|
| 1441 |
+
Á
|
| 1442 |
+
Â
|
| 1443 |
+
Ã
|
| 1444 |
+
Ä
|
| 1445 |
+
Å
|
| 1446 |
+
Æ
|
| 1447 |
+
Ç
|
| 1448 |
+
È
|
| 1449 |
+
É
|
| 1450 |
+
Ê
|
| 1451 |
+
Í
|
| 1452 |
+
Î
|
| 1453 |
+
Ñ
|
| 1454 |
+
Ó
|
| 1455 |
+
Ö
|
| 1456 |
+
×
|
| 1457 |
+
Ø
|
| 1458 |
+
Ú
|
| 1459 |
+
Ü
|
| 1460 |
+
Ý
|
| 1461 |
+
Þ
|
| 1462 |
+
ß
|
| 1463 |
+
à
|
| 1464 |
+
á
|
| 1465 |
+
â
|
| 1466 |
+
ã
|
| 1467 |
+
ä
|
| 1468 |
+
å
|
| 1469 |
+
æ
|
| 1470 |
+
ç
|
| 1471 |
+
è
|
| 1472 |
+
é
|
| 1473 |
+
ê
|
| 1474 |
+
ë
|
| 1475 |
+
ì
|
| 1476 |
+
í
|
| 1477 |
+
î
|
| 1478 |
+
ï
|
| 1479 |
+
ð
|
| 1480 |
+
ñ
|
| 1481 |
+
ò
|
| 1482 |
+
ó
|
| 1483 |
+
ô
|
| 1484 |
+
õ
|
| 1485 |
+
ö
|
| 1486 |
+
ø
|
| 1487 |
+
ù
|
| 1488 |
+
ú
|
| 1489 |
+
û
|
| 1490 |
+
ü
|
| 1491 |
+
ý
|
| 1492 |
+
Ā
|
| 1493 |
+
ā
|
| 1494 |
+
ă
|
| 1495 |
+
ą
|
| 1496 |
+
ć
|
| 1497 |
+
Č
|
| 1498 |
+
č
|
| 1499 |
+
Đ
|
| 1500 |
+
đ
|
| 1501 |
+
ē
|
| 1502 |
+
ė
|
| 1503 |
+
ę
|
| 1504 |
+
ě
|
| 1505 |
+
ĝ
|
| 1506 |
+
ğ
|
| 1507 |
+
ħ
|
| 1508 |
+
ī
|
| 1509 |
+
į
|
| 1510 |
+
İ
|
| 1511 |
+
ı
|
| 1512 |
+
Ł
|
| 1513 |
+
ł
|
| 1514 |
+
ń
|
| 1515 |
+
ņ
|
| 1516 |
+
ň
|
| 1517 |
+
ŋ
|
| 1518 |
+
Ō
|
| 1519 |
+
ō
|
| 1520 |
+
ő
|
| 1521 |
+
œ
|
| 1522 |
+
ř
|
| 1523 |
+
Ś
|
| 1524 |
+
ś
|
| 1525 |
+
Ş
|
| 1526 |
+
ş
|
| 1527 |
+
Š
|
| 1528 |
+
š
|
| 1529 |
+
Ť
|
| 1530 |
+
ť
|
| 1531 |
+
ũ
|
| 1532 |
+
ū
|
| 1533 |
+
ź
|
| 1534 |
+
Ż
|
| 1535 |
+
ż
|
| 1536 |
+
Ž
|
| 1537 |
+
ž
|
| 1538 |
+
ơ
|
| 1539 |
+
ư
|
| 1540 |
+
ǎ
|
| 1541 |
+
ǐ
|
| 1542 |
+
ǒ
|
| 1543 |
+
ǔ
|
| 1544 |
+
ǚ
|
| 1545 |
+
ș
|
| 1546 |
+
ț
|
| 1547 |
+
ɑ
|
| 1548 |
+
ɔ
|
| 1549 |
+
ɕ
|
| 1550 |
+
ə
|
| 1551 |
+
ɛ
|
| 1552 |
+
ɜ
|
| 1553 |
+
ɡ
|
| 1554 |
+
ɣ
|
| 1555 |
+
ɪ
|
| 1556 |
+
ɫ
|
| 1557 |
+
ɴ
|
| 1558 |
+
ɹ
|
| 1559 |
+
ɾ
|
| 1560 |
+
ʃ
|
| 1561 |
+
ʊ
|
| 1562 |
+
ʌ
|
| 1563 |
+
ʒ
|
| 1564 |
+
ʔ
|
| 1565 |
+
ʰ
|
| 1566 |
+
ʷ
|
| 1567 |
+
ʻ
|
| 1568 |
+
ʾ
|
| 1569 |
+
ʿ
|
| 1570 |
+
ˈ
|
| 1571 |
+
ː
|
| 1572 |
+
˙
|
| 1573 |
+
˜
|
| 1574 |
+
ˢ
|
| 1575 |
+
́
|
| 1576 |
+
̅
|
| 1577 |
+
Α
|
| 1578 |
+
Β
|
| 1579 |
+
Δ
|
| 1580 |
+
Ε
|
| 1581 |
+
Θ
|
| 1582 |
+
Κ
|
| 1583 |
+
Λ
|
| 1584 |
+
Μ
|
| 1585 |
+
Ξ
|
| 1586 |
+
Π
|
| 1587 |
+
Σ
|
| 1588 |
+
Τ
|
| 1589 |
+
Φ
|
| 1590 |
+
Χ
|
| 1591 |
+
Ψ
|
| 1592 |
+
Ω
|
| 1593 |
+
ά
|
| 1594 |
+
έ
|
| 1595 |
+
ή
|
| 1596 |
+
ί
|
| 1597 |
+
α
|
| 1598 |
+
β
|
| 1599 |
+
γ
|
| 1600 |
+
δ
|
| 1601 |
+
ε
|
| 1602 |
+
ζ
|
| 1603 |
+
η
|
| 1604 |
+
θ
|
| 1605 |
+
ι
|
| 1606 |
+
κ
|
| 1607 |
+
λ
|
| 1608 |
+
μ
|
| 1609 |
+
ν
|
| 1610 |
+
ξ
|
| 1611 |
+
ο
|
| 1612 |
+
π
|
| 1613 |
+
ρ
|
| 1614 |
+
ς
|
| 1615 |
+
σ
|
| 1616 |
+
τ
|
| 1617 |
+
υ
|
| 1618 |
+
φ
|
| 1619 |
+
χ
|
| 1620 |
+
ψ
|
| 1621 |
+
ω
|
| 1622 |
+
ϊ
|
| 1623 |
+
ό
|
| 1624 |
+
ύ
|
| 1625 |
+
ώ
|
| 1626 |
+
ϕ
|
| 1627 |
+
ϵ
|
| 1628 |
+
Ё
|
| 1629 |
+
А
|
| 1630 |
+
Б
|
| 1631 |
+
В
|
| 1632 |
+
Г
|
| 1633 |
+
Д
|
| 1634 |
+
Е
|
| 1635 |
+
Ж
|
| 1636 |
+
З
|
| 1637 |
+
И
|
| 1638 |
+
Й
|
| 1639 |
+
К
|
| 1640 |
+
Л
|
| 1641 |
+
М
|
| 1642 |
+
Н
|
| 1643 |
+
О
|
| 1644 |
+
П
|
| 1645 |
+
Р
|
| 1646 |
+
С
|
| 1647 |
+
Т
|
| 1648 |
+
У
|
| 1649 |
+
Ф
|
| 1650 |
+
Х
|
| 1651 |
+
Ц
|
| 1652 |
+
Ч
|
| 1653 |
+
Ш
|
| 1654 |
+
Щ
|
| 1655 |
+
Ы
|
| 1656 |
+
Ь
|
| 1657 |
+
Э
|
| 1658 |
+
Ю
|
| 1659 |
+
Я
|
| 1660 |
+
а
|
| 1661 |
+
б
|
| 1662 |
+
в
|
| 1663 |
+
г
|
| 1664 |
+
д
|
| 1665 |
+
е
|
| 1666 |
+
ж
|
| 1667 |
+
з
|
| 1668 |
+
и
|
| 1669 |
+
й
|
| 1670 |
+
к
|
| 1671 |
+
л
|
| 1672 |
+
м
|
| 1673 |
+
н
|
| 1674 |
+
о
|
| 1675 |
+
п
|
| 1676 |
+
р
|
| 1677 |
+
с
|
| 1678 |
+
т
|
| 1679 |
+
у
|
| 1680 |
+
ф
|
| 1681 |
+
х
|
| 1682 |
+
ц
|
| 1683 |
+
ч
|
| 1684 |
+
ш
|
| 1685 |
+
щ
|
| 1686 |
+
ъ
|
| 1687 |
+
ы
|
| 1688 |
+
ь
|
| 1689 |
+
э
|
| 1690 |
+
ю
|
| 1691 |
+
я
|
| 1692 |
+
ё
|
| 1693 |
+
і
|
| 1694 |
+
ְ
|
| 1695 |
+
ִ
|
| 1696 |
+
ֵ
|
| 1697 |
+
ֶ
|
| 1698 |
+
ַ
|
| 1699 |
+
ָ
|
| 1700 |
+
ֹ
|
| 1701 |
+
ּ
|
| 1702 |
+
־
|
| 1703 |
+
ׁ
|
| 1704 |
+
א
|
| 1705 |
+
ב
|
| 1706 |
+
ג
|
| 1707 |
+
ד
|
| 1708 |
+
ה
|
| 1709 |
+
ו
|
| 1710 |
+
ז
|
| 1711 |
+
ח
|
| 1712 |
+
ט
|
| 1713 |
+
י
|
| 1714 |
+
כ
|
| 1715 |
+
ל
|
| 1716 |
+
ם
|
| 1717 |
+
מ
|
| 1718 |
+
ן
|
| 1719 |
+
נ
|
| 1720 |
+
ס
|
| 1721 |
+
ע
|
| 1722 |
+
פ
|
| 1723 |
+
ק
|
| 1724 |
+
ר
|
| 1725 |
+
ש
|
| 1726 |
+
ת
|
| 1727 |
+
أ
|
| 1728 |
+
ب
|
| 1729 |
+
ة
|
| 1730 |
+
ت
|
| 1731 |
+
ج
|
| 1732 |
+
ح
|
| 1733 |
+
د
|
| 1734 |
+
ر
|
| 1735 |
+
ز
|
| 1736 |
+
س
|
| 1737 |
+
ص
|
| 1738 |
+
ط
|
| 1739 |
+
ع
|
| 1740 |
+
ق
|
| 1741 |
+
ك
|
| 1742 |
+
ل
|
| 1743 |
+
م
|
| 1744 |
+
ن
|
| 1745 |
+
ه
|
| 1746 |
+
و
|
| 1747 |
+
ي
|
| 1748 |
+
َ
|
| 1749 |
+
ُ
|
| 1750 |
+
ِ
|
| 1751 |
+
ْ
|
| 1752 |
+
ก
|
| 1753 |
+
ข
|
| 1754 |
+
ง
|
| 1755 |
+
จ
|
| 1756 |
+
ต
|
| 1757 |
+
ท
|
| 1758 |
+
น
|
| 1759 |
+
ป
|
| 1760 |
+
ย
|
| 1761 |
+
ร
|
| 1762 |
+
ว
|
| 1763 |
+
ส
|
| 1764 |
+
ห
|
| 1765 |
+
อ
|
| 1766 |
+
ฮ
|
| 1767 |
+
ั
|
| 1768 |
+
า
|
| 1769 |
+
ี
|
| 1770 |
+
ึ
|
| 1771 |
+
โ
|
| 1772 |
+
ใ
|
| 1773 |
+
ไ
|
| 1774 |
+
่
|
| 1775 |
+
้
|
| 1776 |
+
์
|
| 1777 |
+
ḍ
|
| 1778 |
+
Ḥ
|
| 1779 |
+
ḥ
|
| 1780 |
+
ṁ
|
| 1781 |
+
ṃ
|
| 1782 |
+
ṅ
|
| 1783 |
+
ṇ
|
| 1784 |
+
Ṛ
|
| 1785 |
+
ṛ
|
| 1786 |
+
Ṣ
|
| 1787 |
+
ṣ
|
| 1788 |
+
Ṭ
|
| 1789 |
+
ṭ
|
| 1790 |
+
ạ
|
| 1791 |
+
ả
|
| 1792 |
+
Ấ
|
| 1793 |
+
ấ
|
| 1794 |
+
ầ
|
| 1795 |
+
ậ
|
| 1796 |
+
ắ
|
| 1797 |
+
ằ
|
| 1798 |
+
ẻ
|
| 1799 |
+
ẽ
|
| 1800 |
+
ế
|
| 1801 |
+
ề
|
| 1802 |
+
ể
|
| 1803 |
+
ễ
|
| 1804 |
+
ệ
|
| 1805 |
+
ị
|
| 1806 |
+
ọ
|
| 1807 |
+
ỏ
|
| 1808 |
+
ố
|
| 1809 |
+
ồ
|
| 1810 |
+
ộ
|
| 1811 |
+
ớ
|
| 1812 |
+
ờ
|
| 1813 |
+
ở
|
| 1814 |
+
ụ
|
| 1815 |
+
ủ
|
| 1816 |
+
ứ
|
| 1817 |
+
ữ
|
| 1818 |
+
ἀ
|
| 1819 |
+
ἁ
|
| 1820 |
+
Ἀ
|
| 1821 |
+
ἐ
|
| 1822 |
+
ἔ
|
| 1823 |
+
ἰ
|
| 1824 |
+
ἱ
|
| 1825 |
+
ὀ
|
| 1826 |
+
ὁ
|
| 1827 |
+
ὐ
|
| 1828 |
+
ὲ
|
| 1829 |
+
ὸ
|
| 1830 |
+
���
|
| 1831 |
+
᾽
|
| 1832 |
+
ῆ
|
| 1833 |
+
ῇ
|
| 1834 |
+
ῶ
|
| 1835 |
+
|
| 1836 |
+
‑
|
| 1837 |
+
‒
|
| 1838 |
+
–
|
| 1839 |
+
—
|
| 1840 |
+
―
|
| 1841 |
+
‖
|
| 1842 |
+
†
|
| 1843 |
+
‡
|
| 1844 |
+
•
|
| 1845 |
+
…
|
| 1846 |
+
‧
|
| 1847 |
+
|
| 1848 |
+
′
|
| 1849 |
+
″
|
| 1850 |
+
⁄
|
| 1851 |
+
|
| 1852 |
+
⁰
|
| 1853 |
+
⁴
|
| 1854 |
+
⁵
|
| 1855 |
+
⁶
|
| 1856 |
+
⁷
|
| 1857 |
+
⁸
|
| 1858 |
+
⁹
|
| 1859 |
+
₁
|
| 1860 |
+
₂
|
| 1861 |
+
₃
|
| 1862 |
+
€
|
| 1863 |
+
₱
|
| 1864 |
+
₹
|
| 1865 |
+
₽
|
| 1866 |
+
℃
|
| 1867 |
+
ℏ
|
| 1868 |
+
ℓ
|
| 1869 |
+
№
|
| 1870 |
+
ℝ
|
| 1871 |
+
™
|
| 1872 |
+
⅓
|
| 1873 |
+
⅔
|
| 1874 |
+
⅛
|
| 1875 |
+
→
|
| 1876 |
+
∂
|
| 1877 |
+
∈
|
| 1878 |
+
∑
|
| 1879 |
+
−
|
| 1880 |
+
∗
|
| 1881 |
+
√
|
| 1882 |
+
∞
|
| 1883 |
+
∫
|
| 1884 |
+
≈
|
| 1885 |
+
≠
|
| 1886 |
+
≡
|
| 1887 |
+
≤
|
| 1888 |
+
≥
|
| 1889 |
+
⋅
|
| 1890 |
+
⋯
|
| 1891 |
+
█
|
| 1892 |
+
♪
|
| 1893 |
+
⟨
|
| 1894 |
+
⟩
|
| 1895 |
+
、
|
| 1896 |
+
。
|
| 1897 |
+
《
|
| 1898 |
+
》
|
| 1899 |
+
「
|
| 1900 |
+
」
|
| 1901 |
+
【
|
| 1902 |
+
】
|
| 1903 |
+
あ
|
| 1904 |
+
う
|
| 1905 |
+
え
|
| 1906 |
+
お
|
| 1907 |
+
か
|
| 1908 |
+
が
|
| 1909 |
+
き
|
| 1910 |
+
ぎ
|
| 1911 |
+
く
|
| 1912 |
+
ぐ
|
| 1913 |
+
け
|
| 1914 |
+
げ
|
| 1915 |
+
こ
|
| 1916 |
+
ご
|
| 1917 |
+
さ
|
| 1918 |
+
し
|
| 1919 |
+
じ
|
| 1920 |
+
す
|
| 1921 |
+
ず
|
| 1922 |
+
せ
|
| 1923 |
+
ぜ
|
| 1924 |
+
そ
|
| 1925 |
+
ぞ
|
| 1926 |
+
た
|
| 1927 |
+
だ
|
| 1928 |
+
ち
|
| 1929 |
+
っ
|
| 1930 |
+
つ
|
| 1931 |
+
で
|
| 1932 |
+
と
|
| 1933 |
+
ど
|
| 1934 |
+
な
|
| 1935 |
+
に
|
| 1936 |
+
ね
|
| 1937 |
+
の
|
| 1938 |
+
は
|
| 1939 |
+
ば
|
| 1940 |
+
ひ
|
| 1941 |
+
ぶ
|
| 1942 |
+
へ
|
| 1943 |
+
べ
|
| 1944 |
+
ま
|
| 1945 |
+
み
|
| 1946 |
+
む
|
| 1947 |
+
め
|
| 1948 |
+
も
|
| 1949 |
+
ゃ
|
| 1950 |
+
や
|
| 1951 |
+
ゆ
|
| 1952 |
+
ょ
|
| 1953 |
+
よ
|
| 1954 |
+
ら
|
| 1955 |
+
り
|
| 1956 |
+
る
|
| 1957 |
+
れ
|
| 1958 |
+
ろ
|
| 1959 |
+
わ
|
| 1960 |
+
を
|
| 1961 |
+
ん
|
| 1962 |
+
ァ
|
| 1963 |
+
ア
|
| 1964 |
+
ィ
|
| 1965 |
+
イ
|
| 1966 |
+
ウ
|
| 1967 |
+
ェ
|
| 1968 |
+
エ
|
| 1969 |
+
オ
|
| 1970 |
+
カ
|
| 1971 |
+
ガ
|
| 1972 |
+
キ
|
| 1973 |
+
ク
|
| 1974 |
+
ケ
|
| 1975 |
+
ゲ
|
| 1976 |
+
コ
|
| 1977 |
+
ゴ
|
| 1978 |
+
サ
|
| 1979 |
+
ザ
|
| 1980 |
+
シ
|
| 1981 |
+
ジ
|
| 1982 |
+
ス
|
| 1983 |
+
ズ
|
| 1984 |
+
セ
|
| 1985 |
+
ゾ
|
| 1986 |
+
タ
|
| 1987 |
+
ダ
|
| 1988 |
+
チ
|
| 1989 |
+
ッ
|
| 1990 |
+
ツ
|
| 1991 |
+
テ
|
| 1992 |
+
デ
|
| 1993 |
+
ト
|
| 1994 |
+
ド
|
| 1995 |
+
ナ
|
| 1996 |
+
ニ
|
| 1997 |
+
ネ
|
| 1998 |
+
ノ
|
| 1999 |
+
バ
|
| 2000 |
+
パ
|
| 2001 |
+
ビ
|
| 2002 |
+
ピ
|
| 2003 |
+
フ
|
| 2004 |
+
プ
|
| 2005 |
+
ヘ
|
| 2006 |
+
ベ
|
| 2007 |
+
ペ
|
| 2008 |
+
ホ
|
| 2009 |
+
ボ
|
| 2010 |
+
ポ
|
| 2011 |
+
マ
|
| 2012 |
+
ミ
|
| 2013 |
+
ム
|
| 2014 |
+
メ
|
| 2015 |
+
モ
|
| 2016 |
+
ャ
|
| 2017 |
+
ヤ
|
| 2018 |
+
ュ
|
| 2019 |
+
ユ
|
| 2020 |
+
ョ
|
| 2021 |
+
ヨ
|
| 2022 |
+
ラ
|
| 2023 |
+
リ
|
| 2024 |
+
ル
|
| 2025 |
+
レ
|
| 2026 |
+
ロ
|
| 2027 |
+
ワ
|
| 2028 |
+
ン
|
| 2029 |
+
・
|
| 2030 |
+
ー
|
| 2031 |
+
ㄋ
|
| 2032 |
+
ㄍ
|
| 2033 |
+
ㄎ
|
| 2034 |
+
ㄏ
|
| 2035 |
+
ㄓ
|
| 2036 |
+
ㄕ
|
| 2037 |
+
ㄚ
|
| 2038 |
+
ㄜ
|
| 2039 |
+
ㄟ
|
| 2040 |
+
ㄤ
|
| 2041 |
+
ㄥ
|
| 2042 |
+
ㄧ
|
| 2043 |
+
ㄱ
|
| 2044 |
+
ㄴ
|
| 2045 |
+
ㄷ
|
| 2046 |
+
ㄹ
|
| 2047 |
+
ㅁ
|
| 2048 |
+
ㅂ
|
| 2049 |
+
ㅅ
|
| 2050 |
+
ㅈ
|
| 2051 |
+
ㅍ
|
| 2052 |
+
ㅎ
|
| 2053 |
+
ㅏ
|
| 2054 |
+
ㅓ
|
| 2055 |
+
ㅗ
|
| 2056 |
+
ㅜ
|
| 2057 |
+
ㅡ
|
| 2058 |
+
ㅣ
|
| 2059 |
+
㗎
|
| 2060 |
+
가
|
| 2061 |
+
각
|
| 2062 |
+
간
|
| 2063 |
+
갈
|
| 2064 |
+
감
|
| 2065 |
+
갑
|
| 2066 |
+
갓
|
| 2067 |
+
갔
|
| 2068 |
+
강
|
| 2069 |
+
같
|
| 2070 |
+
개
|
| 2071 |
+
거
|
| 2072 |
+
건
|
| 2073 |
+
걸
|
| 2074 |
+
겁
|
| 2075 |
+
것
|
| 2076 |
+
겉
|
| 2077 |
+
게
|
| 2078 |
+
겠
|
| 2079 |
+
겨
|
| 2080 |
+
결
|
| 2081 |
+
겼
|
| 2082 |
+
경
|
| 2083 |
+
계
|
| 2084 |
+
고
|
| 2085 |
+
곤
|
| 2086 |
+
골
|
| 2087 |
+
곱
|
| 2088 |
+
공
|
| 2089 |
+
과
|
| 2090 |
+
관
|
| 2091 |
+
광
|
| 2092 |
+
교
|
| 2093 |
+
구
|
| 2094 |
+
국
|
| 2095 |
+
굴
|
| 2096 |
+
귀
|
| 2097 |
+
귄
|
| 2098 |
+
그
|
| 2099 |
+
근
|
| 2100 |
+
글
|
| 2101 |
+
금
|
| 2102 |
+
기
|
| 2103 |
+
긴
|
| 2104 |
+
길
|
| 2105 |
+
까
|
| 2106 |
+
깍
|
| 2107 |
+
깔
|
| 2108 |
+
깜
|
| 2109 |
+
깨
|
| 2110 |
+
께
|
| 2111 |
+
꼬
|
| 2112 |
+
꼭
|
| 2113 |
+
꽃
|
| 2114 |
+
꾸
|
| 2115 |
+
꿔
|
| 2116 |
+
끔
|
| 2117 |
+
끗
|
| 2118 |
+
끝
|
| 2119 |
+
끼
|
| 2120 |
+
나
|
| 2121 |
+
난
|
| 2122 |
+
날
|
| 2123 |
+
남
|
| 2124 |
+
납
|
| 2125 |
+
내
|
| 2126 |
+
냐
|
| 2127 |
+
냥
|
| 2128 |
+
너
|
| 2129 |
+
넘
|
| 2130 |
+
넣
|
| 2131 |
+
네
|
| 2132 |
+
녁
|
| 2133 |
+
년
|
| 2134 |
+
녕
|
| 2135 |
+
노
|
| 2136 |
+
녹
|
| 2137 |
+
놀
|
| 2138 |
+
누
|
| 2139 |
+
눈
|
| 2140 |
+
느
|
| 2141 |
+
는
|
| 2142 |
+
늘
|
| 2143 |
+
니
|
| 2144 |
+
님
|
| 2145 |
+
닙
|
| 2146 |
+
다
|
| 2147 |
+
닥
|
| 2148 |
+
단
|
| 2149 |
+
달
|
| 2150 |
+
닭
|
| 2151 |
+
당
|
| 2152 |
+
대
|
| 2153 |
+
더
|
| 2154 |
+
덕
|
| 2155 |
+
던
|
| 2156 |
+
덥
|
| 2157 |
+
데
|
| 2158 |
+
도
|
| 2159 |
+
독
|
| 2160 |
+
동
|
| 2161 |
+
돼
|
| 2162 |
+
됐
|
| 2163 |
+
되
|
| 2164 |
+
된
|
| 2165 |
+
될
|
| 2166 |
+
두
|
| 2167 |
+
둑
|
| 2168 |
+
둥
|
| 2169 |
+
드
|
| 2170 |
+
들
|
| 2171 |
+
등
|
| 2172 |
+
디
|
| 2173 |
+
따
|
| 2174 |
+
딱
|
| 2175 |
+
딸
|
| 2176 |
+
땅
|
| 2177 |
+
때
|
| 2178 |
+
떤
|
| 2179 |
+
떨
|
| 2180 |
+
떻
|
| 2181 |
+
또
|
| 2182 |
+
똑
|
| 2183 |
+
뚱
|
| 2184 |
+
뛰
|
| 2185 |
+
뜻
|
| 2186 |
+
띠
|
| 2187 |
+
라
|
| 2188 |
+
락
|
| 2189 |
+
란
|
| 2190 |
+
람
|
| 2191 |
+
랍
|
| 2192 |
+
랑
|
| 2193 |
+
래
|
| 2194 |
+
랜
|
| 2195 |
+
러
|
| 2196 |
+
런
|
| 2197 |
+
럼
|
| 2198 |
+
렇
|
| 2199 |
+
레
|
| 2200 |
+
려
|
| 2201 |
+
력
|
| 2202 |
+
렵
|
| 2203 |
+
렸
|
| 2204 |
+
로
|
| 2205 |
+
록
|
| 2206 |
+
롬
|
| 2207 |
+
루
|
| 2208 |
+
르
|
| 2209 |
+
른
|
| 2210 |
+
를
|
| 2211 |
+
름
|
| 2212 |
+
릉
|
| 2213 |
+
리
|
| 2214 |
+
릴
|
| 2215 |
+
림
|
| 2216 |
+
마
|
| 2217 |
+
막
|
| 2218 |
+
만
|
| 2219 |
+
많
|
| 2220 |
+
말
|
| 2221 |
+
맑
|
| 2222 |
+
맙
|
| 2223 |
+
맛
|
| 2224 |
+
매
|
| 2225 |
+
머
|
| 2226 |
+
먹
|
| 2227 |
+
멍
|
| 2228 |
+
메
|
| 2229 |
+
면
|
| 2230 |
+
명
|
| 2231 |
+
몇
|
| 2232 |
+
모
|
| 2233 |
+
목
|
| 2234 |
+
몸
|
| 2235 |
+
못
|
| 2236 |
+
무
|
| 2237 |
+
문
|
| 2238 |
+
물
|
| 2239 |
+
뭐
|
| 2240 |
+
뭘
|
| 2241 |
+
미
|
| 2242 |
+
민
|
| 2243 |
+
밌
|
| 2244 |
+
밑
|
| 2245 |
+
바
|
| 2246 |
+
박
|
| 2247 |
+
밖
|
| 2248 |
+
반
|
| 2249 |
+
받
|
| 2250 |
+
발
|
| 2251 |
+
밤
|
| 2252 |
+
밥
|
| 2253 |
+
방
|
| 2254 |
+
배
|
| 2255 |
+
백
|
| 2256 |
+
밸
|
| 2257 |
+
뱀
|
| 2258 |
+
버
|
| 2259 |
+
번
|
| 2260 |
+
벌
|
| 2261 |
+
벚
|
| 2262 |
+
베
|
| 2263 |
+
벼
|
| 2264 |
+
벽
|
| 2265 |
+
별
|
| 2266 |
+
병
|
| 2267 |
+
보
|
| 2268 |
+
복
|
| 2269 |
+
본
|
| 2270 |
+
볼
|
| 2271 |
+
봐
|
| 2272 |
+
봤
|
| 2273 |
+
부
|
| 2274 |
+
분
|
| 2275 |
+
불
|
| 2276 |
+
비
|
| 2277 |
+
빔
|
| 2278 |
+
빛
|
| 2279 |
+
빠
|
| 2280 |
+
빨
|
| 2281 |
+
뼈
|
| 2282 |
+
뽀
|
| 2283 |
+
뿅
|
| 2284 |
+
쁘
|
| 2285 |
+
사
|
| 2286 |
+
산
|
| 2287 |
+
살
|
| 2288 |
+
삼
|
| 2289 |
+
샀
|
| 2290 |
+
상
|
| 2291 |
+
새
|
| 2292 |
+
색
|
| 2293 |
+
생
|
| 2294 |
+
서
|
| 2295 |
+
선
|
| 2296 |
+
설
|
| 2297 |
+
섭
|
| 2298 |
+
섰
|
| 2299 |
+
성
|
| 2300 |
+
세
|
| 2301 |
+
셔
|
| 2302 |
+
션
|
| 2303 |
+
셨
|
| 2304 |
+
소
|
| 2305 |
+
속
|
| 2306 |
+
손
|
| 2307 |
+
송
|
| 2308 |
+
수
|
| 2309 |
+
숙
|
| 2310 |
+
순
|
| 2311 |
+
술
|
| 2312 |
+
숫
|
| 2313 |
+
숭
|
| 2314 |
+
숲
|
| 2315 |
+
쉬
|
| 2316 |
+
쉽
|
| 2317 |
+
스
|
| 2318 |
+
슨
|
| 2319 |
+
습
|
| 2320 |
+
슷
|
| 2321 |
+
시
|
| 2322 |
+
식
|
| 2323 |
+
신
|
| 2324 |
+
실
|
| 2325 |
+
싫
|
| 2326 |
+
심
|
| 2327 |
+
십
|
| 2328 |
+
싶
|
| 2329 |
+
싸
|
| 2330 |
+
써
|
| 2331 |
+
쓰
|
| 2332 |
+
쓴
|
| 2333 |
+
씌
|
| 2334 |
+
씨
|
| 2335 |
+
씩
|
| 2336 |
+
씬
|
| 2337 |
+
아
|
| 2338 |
+
악
|
| 2339 |
+
안
|
| 2340 |
+
않
|
| 2341 |
+
알
|
| 2342 |
+
야
|
| 2343 |
+
약
|
| 2344 |
+
얀
|
| 2345 |
+
양
|
| 2346 |
+
얘
|
| 2347 |
+
어
|
| 2348 |
+
언
|
| 2349 |
+
얼
|
| 2350 |
+
엄
|
| 2351 |
+
업
|
| 2352 |
+
없
|
| 2353 |
+
었
|
| 2354 |
+
엉
|
| 2355 |
+
에
|
| 2356 |
+
여
|
| 2357 |
+
역
|
| 2358 |
+
연
|
| 2359 |
+
염
|
| 2360 |
+
엽
|
| 2361 |
+
영
|
| 2362 |
+
옆
|
| 2363 |
+
예
|
| 2364 |
+
옛
|
| 2365 |
+
오
|
| 2366 |
+
온
|
| 2367 |
+
올
|
| 2368 |
+
옷
|
| 2369 |
+
옹
|
| 2370 |
+
와
|
| 2371 |
+
왔
|
| 2372 |
+
왜
|
| 2373 |
+
요
|
| 2374 |
+
욕
|
| 2375 |
+
용
|
| 2376 |
+
우
|
| 2377 |
+
운
|
| 2378 |
+
울
|
| 2379 |
+
웃
|
| 2380 |
+
워
|
| 2381 |
+
원
|
| 2382 |
+
월
|
| 2383 |
+
웠
|
| 2384 |
+
위
|
| 2385 |
+
윙
|
| 2386 |
+
유
|
| 2387 |
+
육
|
| 2388 |
+
윤
|
| 2389 |
+
으
|
| 2390 |
+
은
|
| 2391 |
+
을
|
| 2392 |
+
음
|
| 2393 |
+
응
|
| 2394 |
+
의
|
| 2395 |
+
이
|
| 2396 |
+
익
|
| 2397 |
+
인
|
| 2398 |
+
일
|
| 2399 |
+
읽
|
| 2400 |
+
임
|
| 2401 |
+
입
|
| 2402 |
+
있
|
| 2403 |
+
자
|
| 2404 |
+
작
|
| 2405 |
+
잔
|
| 2406 |
+
잖
|
| 2407 |
+
잘
|
| 2408 |
+
잡
|
| 2409 |
+
잤
|
| 2410 |
+
장
|
| 2411 |
+
재
|
| 2412 |
+
저
|
| 2413 |
+
전
|
| 2414 |
+
점
|
| 2415 |
+
정
|
| 2416 |
+
제
|
| 2417 |
+
져
|
| 2418 |
+
졌
|
| 2419 |
+
조
|
| 2420 |
+
족
|
| 2421 |
+
좀
|
| 2422 |
+
종
|
| 2423 |
+
좋
|
| 2424 |
+
죠
|
| 2425 |
+
주
|
| 2426 |
+
준
|
| 2427 |
+
줄
|
| 2428 |
+
중
|
| 2429 |
+
줘
|
| 2430 |
+
즈
|
| 2431 |
+
즐
|
| 2432 |
+
즘
|
| 2433 |
+
지
|
| 2434 |
+
진
|
| 2435 |
+
집
|
| 2436 |
+
짜
|
| 2437 |
+
짝
|
| 2438 |
+
쩌
|
| 2439 |
+
쪼
|
| 2440 |
+
쪽
|
| 2441 |
+
쫌
|
| 2442 |
+
쭈
|
| 2443 |
+
쯔
|
| 2444 |
+
찌
|
| 2445 |
+
찍
|
| 2446 |
+
차
|
| 2447 |
+
착
|
| 2448 |
+
찾
|
| 2449 |
+
책
|
| 2450 |
+
처
|
| 2451 |
+
천
|
| 2452 |
+
철
|
| 2453 |
+
체
|
| 2454 |
+
쳐
|
| 2455 |
+
쳤
|
| 2456 |
+
초
|
| 2457 |
+
촌
|
| 2458 |
+
추
|
| 2459 |
+
출
|
| 2460 |
+
춤
|
| 2461 |
+
춥
|
| 2462 |
+
춰
|
| 2463 |
+
치
|
| 2464 |
+
친
|
| 2465 |
+
칠
|
| 2466 |
+
침
|
| 2467 |
+
칩
|
| 2468 |
+
칼
|
| 2469 |
+
커
|
| 2470 |
+
켓
|
| 2471 |
+
코
|
| 2472 |
+
콩
|
| 2473 |
+
쿠
|
| 2474 |
+
퀴
|
| 2475 |
+
크
|
| 2476 |
+
큰
|
| 2477 |
+
큽
|
| 2478 |
+
키
|
| 2479 |
+
킨
|
| 2480 |
+
타
|
| 2481 |
+
태
|
| 2482 |
+
터
|
| 2483 |
+
턴
|
| 2484 |
+
털
|
| 2485 |
+
테
|
| 2486 |
+
토
|
| 2487 |
+
통
|
| 2488 |
+
투
|
| 2489 |
+
트
|
| 2490 |
+
특
|
| 2491 |
+
튼
|
| 2492 |
+
틀
|
| 2493 |
+
티
|
| 2494 |
+
팀
|
| 2495 |
+
파
|
| 2496 |
+
팔
|
| 2497 |
+
패
|
| 2498 |
+
페
|
| 2499 |
+
펜
|
| 2500 |
+
펭
|
| 2501 |
+
평
|
| 2502 |
+
포
|
| 2503 |
+
폭
|
| 2504 |
+
표
|
| 2505 |
+
품
|
| 2506 |
+
풍
|
| 2507 |
+
프
|
| 2508 |
+
플
|
| 2509 |
+
피
|
| 2510 |
+
필
|
| 2511 |
+
하
|
| 2512 |
+
학
|
| 2513 |
+
한
|
| 2514 |
+
할
|
| 2515 |
+
함
|
| 2516 |
+
합
|
| 2517 |
+
항
|
| 2518 |
+
해
|
| 2519 |
+
햇
|
| 2520 |
+
했
|
| 2521 |
+
행
|
| 2522 |
+
허
|
| 2523 |
+
험
|
| 2524 |
+
형
|
| 2525 |
+
혜
|
| 2526 |
+
호
|
| 2527 |
+
혼
|
| 2528 |
+
홀
|
| 2529 |
+
화
|
| 2530 |
+
회
|
| 2531 |
+
획
|
| 2532 |
+
후
|
| 2533 |
+
휴
|
| 2534 |
+
흐
|
| 2535 |
+
흔
|
| 2536 |
+
희
|
| 2537 |
+
히
|
| 2538 |
+
힘
|
| 2539 |
+
ﷺ
|
| 2540 |
+
ﷻ
|
| 2541 |
+
!
|
| 2542 |
+
,
|
| 2543 |
+
?
|
| 2544 |
+
�
|
| 2545 |
+
𠮶
|
src/f5_tts/infer/infer_cli.py
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import codecs
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from importlib.resources import files
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
import soundfile as sf
|
| 11 |
+
import tomli
|
| 12 |
+
from cached_path import cached_path
|
| 13 |
+
from hydra.utils import get_class
|
| 14 |
+
from omegaconf import OmegaConf
|
| 15 |
+
from unidecode import unidecode
|
| 16 |
+
|
| 17 |
+
from f5_tts.infer.utils_infer import (
|
| 18 |
+
cfg_strength,
|
| 19 |
+
cross_fade_duration,
|
| 20 |
+
device,
|
| 21 |
+
fix_duration,
|
| 22 |
+
infer_process,
|
| 23 |
+
load_model,
|
| 24 |
+
load_vocoder,
|
| 25 |
+
mel_spec_type,
|
| 26 |
+
nfe_step,
|
| 27 |
+
preprocess_ref_audio_text,
|
| 28 |
+
remove_silence_for_generated_wav,
|
| 29 |
+
speed,
|
| 30 |
+
sway_sampling_coef,
|
| 31 |
+
target_rms,
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ---------------------------------------------------------------------------
# Command-line options for the F5/E2 TTS batch inference CLI.
# Apart from --config, no defaults are set here on purpose: an unset CLI
# option falls through to the value found in the TOML config file.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(
    prog="python3 infer-cli.py",
    description="Commandline interface for E2/F5 TTS with Advanced Batch Processing.",
    epilog="Specify options above to override one or more settings from config.",
)
parser.add_argument(
    "-c",
    "--config",
    type=str,
    default=os.path.join(files("f5_tts").joinpath("infer/examples/basic"), "basic.toml"),
    help="The configuration file, default see infer/examples/basic/basic.toml",
)

# Note. Not to provide default value here in order to read default from config file

# Model / checkpoint selection
parser.add_argument("-m", "--model", type=str, help="The model name: F5TTS_v1_Base | F5TTS_Base | E2TTS_Base | etc.")
parser.add_argument("-mc", "--model_cfg", type=str, help="The path to F5-TTS model config file .yaml")
parser.add_argument("-p", "--ckpt_file", type=str, help="The path to model checkpoint .pt, leave blank to use default")
parser.add_argument("--use_ema", action="store_true", help="To use ema model")
parser.add_argument("-v", "--vocab_file", type=str, help="The path to vocab file .txt, leave blank to use default")

# Reference / generation text
parser.add_argument("-r", "--ref_audio", type=str, help="The reference audio file.")
parser.add_argument("-s", "--ref_text", type=str, help="The transcript/subtitle for the reference audio")
parser.add_argument("-t", "--gen_text", type=str, help="The text to make model synthesize a speech")
parser.add_argument("-f", "--gen_file", type=str, help="The file with text to generate, will ignore --gen_text")

# Output options
parser.add_argument("-o", "--output_dir", type=str, help="The path to output folder")
parser.add_argument("-w", "--output_file", type=str, help="The name of output file")
parser.add_argument("--save_chunk", action="store_true", help="To save each audio chunks during inference")
parser.add_argument(
    "--no_legacy_text",
    action="store_false",  # i.e. args.no_legacy_text is True unless the flag is passed
    help="Not to use lossy ASCII transliterations of unicode text in saved file names.",
)
parser.add_argument("--remove_silence", action="store_true", help="To remove long silence found in ouput")

# Vocoder options
parser.add_argument(
    "--load_vocoder_from_local",
    action="store_true",
    help="To load vocoder from local dir, default to ../checkpoints/vocos-mel-24khz",
)
parser.add_argument(
    "--vocoder_name",
    type=str,
    choices=["vocos", "bigvgan"],
    help=f"Used vocoder name: vocos | bigvgan, default {mel_spec_type}",
)

# Sampling / synthesis knobs
parser.add_argument(
    "--target_rms", type=float, help=f"Target output speech loudness normalization value, default {target_rms}"
)
parser.add_argument(
    "--cross_fade_duration",
    type=float,
    help=f"Duration of cross-fade between audio segments in seconds, default {cross_fade_duration}",
)
parser.add_argument("--nfe_step", type=int, help=f"The number of function evaluation (denoising steps), default {nfe_step}")
parser.add_argument("--cfg_strength", type=float, help=f"Classifier-free guidance strength, default {cfg_strength}")
parser.add_argument("--sway_sampling_coef", type=float, help=f"Sway Sampling coefficient, default {sway_sampling_coef}")
parser.add_argument("--speed", type=float, help=f"The speed of the generated audio, default {speed}")
parser.add_argument(
    "--fix_duration",
    type=float,
    help=f"Fix the total duration (ref and gen audios) in seconds, default {fix_duration}",
)
parser.add_argument("--device", type=str, help="Specify the device to run on")
args = parser.parse_args()
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# config file
|
| 186 |
+
|
| 187 |
+
# ---------------------------------------------------------------------------
# Load the TOML config file, then resolve every runtime setting with
# CLI-over-config precedence: an explicitly passed CLI option wins, otherwise
# the config value, otherwise a hard-coded fallback.
# ---------------------------------------------------------------------------
with open(args.config, "rb") as config_fp:  # context manager: don't leak the handle
    config = tomli.load(config_fp)


# command-line interface parameters

model = args.model or config.get("model", "F5TTS_v1_Base")
ckpt_file = args.ckpt_file or config.get("ckpt_file", "")
vocab_file = args.vocab_file or config.get("vocab_file", "")

ref_audio = args.ref_audio or config.get("ref_audio", "infer/examples/basic/basic_ref_en.wav")
# ref_text may legitimately be an empty string (auto-transcribe downstream),
# so compare against None instead of relying on truthiness.
ref_text = (
    args.ref_text
    if args.ref_text is not None
    else config.get("ref_text", "Some call me nature, others call me mother nature.")
)
gen_text = args.gen_text or config.get("gen_text", "Here we generate something just for test.")
gen_file = args.gen_file or config.get("gen_file", "")

output_dir = args.output_dir or config.get("output_dir", "tests")
output_file = args.output_file or config.get(
    "output_file", f"infer_cli_{datetime.now().strftime(r'%Y%m%d_%H%M%S')}.wav"
)

save_chunk = args.save_chunk or config.get("save_chunk", False)
# no_legacy_text is a store_false arg: args.no_legacy_text defaults to True,
# so the config key is only consulted when the flag is passed on the CLI.
use_legacy_text = args.no_legacy_text or config.get("no_legacy_text", False)
if save_chunk and use_legacy_text:
    print(
        "\nWarning to --save_chunk: lossy ASCII transliterations of unicode text for legacy (.wav) file names, --no_legacy_text to disable.\n"
    )

remove_silence = args.remove_silence or config.get("remove_silence", False)
load_vocoder_from_local = args.load_vocoder_from_local or config.get("load_vocoder_from_local", False)

vocoder_name = args.vocoder_name or config.get("vocoder_name", mel_spec_type)
target_rms = args.target_rms or config.get("target_rms", target_rms)
cross_fade_duration = args.cross_fade_duration or config.get("cross_fade_duration", cross_fade_duration)
nfe_step = args.nfe_step or config.get("nfe_step", nfe_step)
cfg_strength = args.cfg_strength or config.get("cfg_strength", cfg_strength)
sway_sampling_coef = args.sway_sampling_coef or config.get("sway_sampling_coef", sway_sampling_coef)
speed = args.speed or config.get("speed", speed)
fix_duration = args.fix_duration or config.get("fix_duration", fix_duration)
device = args.device or config.get("device", device)


# Patches for pip package users: bundled example assets live inside the
# installed f5_tts package, so rewrite relative example paths to package paths.
if "infer/examples/" in ref_audio:
    ref_audio = str(files("f5_tts").joinpath(f"{ref_audio}"))
if "infer/examples/" in gen_file:
    gen_file = str(files("f5_tts").joinpath(f"{gen_file}"))
if "voices" in config:
    for voice in config["voices"]:
        voice_ref_audio = config["voices"][voice]["ref_audio"]
        if "infer/examples/" in voice_ref_audio:
            config["voices"][voice]["ref_audio"] = str(files("f5_tts").joinpath(f"{voice_ref_audio}"))


# If a text file is supplied, its content replaces --gen_text entirely.
if gen_file:
    with open(gen_file, "r", encoding="utf-8") as gen_fp:  # was a leaked codecs.open handle
        gen_text = gen_fp.read()


# output path

wave_path = Path(output_dir) / output_file
# spectrogram_path = Path(output_dir) / "infer_cli_out.png"
if save_chunk:
    output_chunk_dir = os.path.join(output_dir, f"{Path(output_file).stem}_chunks")
    os.makedirs(output_chunk_dir, exist_ok=True)  # idempotent, no exists/makedirs race
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# load vocoder
|
| 260 |
+
|
| 261 |
+
# ---------------------------------------------------------------------------
# Load vocoder
# ---------------------------------------------------------------------------
if vocoder_name == "vocos":
    vocoder_local_path = "../checkpoints/vocos-mel-24khz"
elif vocoder_name == "bigvgan":
    vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
else:
    # vocoder_name can come from the config file (argparse choices only
    # constrain the CLI), so fail clearly instead of hitting a NameError
    # on vocoder_local_path below.
    raise ValueError(f"Unknown vocoder_name: {vocoder_name!r} (expected 'vocos' or 'bigvgan')")

vocoder = load_vocoder(
    vocoder_name=vocoder_name, is_local=load_vocoder_from_local, local_path=vocoder_local_path, device=device
)


# ---------------------------------------------------------------------------
# Load TTS model: resolve model config, backbone class, and checkpoint.
# ---------------------------------------------------------------------------
model_cfg = OmegaConf.load(
    args.model_cfg or config.get("model_cfg", str(files("f5_tts").joinpath(f"configs/{model}.yaml")))
)
model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
model_arc = model_cfg.model.arch

# Defaults used when downloading a released checkpoint from the HF hub.
repo_name, ckpt_step, ckpt_type = "F5-TTS", 1250000, "safetensors"

if model != "F5TTS_Base":
    # Newer models are tied to a single mel spec type; the config must agree.
    assert vocoder_name == model_cfg.model.mel_spec.mel_spec_type

# Overrides for previous model releases (older step counts / repos / formats).
if model == "F5TTS_Base":
    if vocoder_name == "vocos":
        ckpt_step = 1200000
    elif vocoder_name == "bigvgan":
        model = "F5TTS_Base_bigvgan"
        ckpt_type = "pt"
elif model == "E2TTS_Base":
    repo_name = "E2-TTS"
    ckpt_step = 1200000

if not ckpt_file:
    ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{model}/model_{ckpt_step}.{ckpt_type}"))

print(f"Using {model}...")
ema_model = load_model(
    model_cls,
    model_arc,
    ckpt_file,
    mel_spec_type=vocoder_name,
    vocab_file=vocab_file,
    device=device,
    use_ema=args.use_ema,
)
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
# inference process
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
# inference process


def main():
    """Run batch inference.

    Splits gen_text on ``[voice]`` tags, synthesizes each chunk with the
    matching reference voice, optionally saves per-chunk wavs, then
    concatenates everything into a single output wav (optionally with long
    silences removed).
    """
    main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
    if "voices" not in config:
        voices = {"main": main_voice}
    else:
        voices = config["voices"]
        voices["main"] = main_voice
    # Normalize every reference clip up front (clip/transcribe as needed).
    for voice in voices:
        print("Voice:", voice)
        print("ref_audio ", voices[voice]["ref_audio"])
        voices[voice]["ref_audio"], voices[voice]["ref_text"] = preprocess_ref_audio_text(
            voices[voice]["ref_audio"], voices[voice]["ref_text"]
        )
        print("ref_audio_", voices[voice]["ref_audio"], "\n\n")

    generated_audio_segments = []
    reg1 = r"(?=\[\w+\])"  # lookahead split: keep the [voice] tag with its chunk
    chunks = re.split(reg1, gen_text)
    reg2 = r"\[(\w+)\]"  # extracts the voice name from a [voice] tag
    for text in chunks:
        if not text.strip():
            continue
        match = re.match(reg2, text)
        if match:
            voice = match[1]
        else:
            print("No voice tag found, using main.")
            voice = "main"
        if voice not in voices:
            print(f"Voice {voice} not found, using main.")
            voice = "main"
        text = re.sub(reg2, "", text)
        ref_audio_ = voices[voice]["ref_audio"]
        ref_text_ = voices[voice]["ref_text"]
        local_speed = voices[voice].get("speed", speed)  # per-voice speed override
        gen_text_ = text.strip()
        print(f"Voice: {voice}")
        audio_segment, final_sample_rate, spectrogram = infer_process(
            ref_audio_,
            ref_text_,
            gen_text_,
            ema_model,
            vocoder,
            mel_spec_type=vocoder_name,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            speed=local_speed,
            fix_duration=fix_duration,
            device=device,
        )
        generated_audio_segments.append(audio_segment)

        if save_chunk:
            # Truncate very long chunk text so the filename stays manageable.
            if len(gen_text_) > 200:
                gen_text_ = gen_text_[:200] + " ... "
            if use_legacy_text:
                gen_text_ = unidecode(gen_text_)
            sf.write(
                os.path.join(output_chunk_dir, f"{len(generated_audio_segments) - 1}_{gen_text_}.wav"),
                audio_segment,
                final_sample_rate,
            )

    if generated_audio_segments:
        final_wave = np.concatenate(generated_audio_segments)

        os.makedirs(output_dir, exist_ok=True)

        # Write directly by path: the previous open(wave_path, "wb") followed
        # by sf.write(f.name, ...) opened the same file twice (truncating
        # handle still held), which fails under Windows file locking.
        sf.write(str(wave_path), final_wave, final_sample_rate)
        # Remove silence
        if remove_silence:
            remove_silence_for_generated_wav(str(wave_path))
        print(wave_path)


if __name__ == "__main__":
    main()
|
src/f5_tts/infer/infer_cli_emotion.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI emotion inference using F5-TTS-Emotional-CFG (emotion-conditioned)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import time
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torchaudio
|
| 11 |
+
|
| 12 |
+
from f5_tts.infer.infer_emotion import (
|
| 13 |
+
CFM,
|
| 14 |
+
CFMConditioned,
|
| 15 |
+
DiT,
|
| 16 |
+
DiTConditioned,
|
| 17 |
+
TTSModel,
|
| 18 |
+
compute_mel_from_wav,
|
| 19 |
+
cfg_strength,
|
| 20 |
+
hop_length,
|
| 21 |
+
mel_spec_type,
|
| 22 |
+
n_fft,
|
| 23 |
+
n_mel_channels,
|
| 24 |
+
nfe_step,
|
| 25 |
+
sway_sampling_coef,
|
| 26 |
+
target_sample_rate,
|
| 27 |
+
tokenizer,
|
| 28 |
+
win_length,
|
| 29 |
+
)
|
| 30 |
+
from f5_tts.model.utils import get_tokenizer
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def build_arg_parser() -> argparse.ArgumentParser:
    """Assemble the CLI argument parser for emotion-conditioned F5-TTS inference."""
    p = argparse.ArgumentParser(
        description="CLI emotion inference using F5-TTS-Emotional-CFG (emotion-conditioned)."
    )

    emotion_choices = ["Angry", "Surprise", "Neutral", "Sad", "Happy"]

    # --- Reference inputs ---
    p.add_argument(
        "-ref", "--ref-audio-path", type=str, required=True, help="Path to reference .wav for voice cloning."
    )
    p.add_argument("-rt", "--ref-text", type=str, required=True, help="Transcription text for the reference audio.")
    p.add_argument(
        "-re",
        "--ref-emotion",
        type=str,
        default="Neutral",
        choices=emotion_choices,
        help="Reference emotion label (emotion in the reference audio).",
    )

    # --- Inference target ---
    p.add_argument(
        "-it",
        "--inference-text",
        type=str,
        required=True,
        help="New text to synthesize (will be appended after ref_text).",
    )
    p.add_argument(
        "-ie",
        "--inference-emotion",
        type=str,
        required=True,
        choices=emotion_choices,
        help="Target emotion label for the new speech.",
    )

    # --- Output ---
    p.add_argument(
        "-o", "--output-path", type=str, default="data/output.wav", help="Path to the generated audio (.wav)."
    )

    # --- Checkpoints ---
    p.add_argument(
        "--checkpoint-path-emotion",
        type=str,
        default="ckpts/model_emo.pt",
        help="Path to the trained emotion-conditioned model checkpoint (.pt).",
    )

    # --- Tokenizer / vocab ---
    p.add_argument(
        "--vocab-dataset-name",
        type=str,
        default="EmiliaPetite_dataset_ZH_EN",
        help="Dataset name used for tokenizer building.",
    )
    p.add_argument(
        "--tokenizer", type=str, default=tokenizer, choices=["pinyin", "char", "custom"], help="Tokenizer type."
    )
    p.add_argument(
        "--tokenizer-path",
        type=str,
        default=None,
        help="Path to custom tokenizer vocab.txt (if tokenizer='custom').",
    )

    # --- Sampling / guidance params ---
    p.add_argument(
        "--nfe", type=int, default=nfe_step, help="# function evaluations (steps). Lower = faster, lower quality."
    )
    p.add_argument(
        "--cfg-strength", type=float, default=cfg_strength, help="Classifier-free guidance for content/text."
    )
    p.add_argument(
        "--cfg-strength2",
        type=float,
        default=10.0,
        help="Emotion guidance strength; higher = stronger emotion, less natural.",
    )
    p.add_argument(
        "--sway-sampling-coef", type=float, default=sway_sampling_coef, help="Sway sampling coefficient."
    )

    # --- Emotion conditioning block ---
    p.add_argument(
        "--emotion-condition-type",
        type=str,
        default="text_mirror",
        choices=["text_mirror", "cross_attention", "text_early_fusion"],
        help="How emotion is injected into the transformer.",
    )
    p.add_argument("--emotion-dim", type=int, default=128, help="Dimension of emotion embedding.")
    p.add_argument("--emotion-conv-layers", type=int, default=4, help="# of conv layers used in emotion path.")
    p.add_argument(
        "--init-type",
        type=str,
        default="xavier_reduced",
        help="(text_mirror only) initialization method for new emotion weights.",
    )
    p.add_argument(
        "--weight-reduction-scale",
        type=float,
        default=1.0,
        help="(text_mirror only) scale for reduced Xavier init.",
    )

    # --- Audio & mel spec ---
    p.add_argument(
        "--mel-spec-type", type=str, default=mel_spec_type, choices=["vocos", "bigvgan"], help="Vocoder/mel type."
    )
    p.add_argument("--target-sr", type=int, default=target_sample_rate)
    p.add_argument("--n-mel", type=int, default=n_mel_channels)
    p.add_argument("--n-fft", type=int, default=n_fft)
    p.add_argument("--hop-length", type=int, default=hop_length)
    p.add_argument("--win-length", type=int, default=win_length)

    # --- Device ---
    p.add_argument("--device", type=str, default="cuda", choices=["cuda", "mps", "cpu"], help="Inference device.")

    return p
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def main():
    """Parse CLI args, build the emotion-conditioned model, run inference, save audio."""
    args = build_arg_parser().parse_args()

    mel_spec_kwargs = dict(
        n_fft=args.n_fft,
        hop_length=args.hop_length,
        win_length=args.win_length,
        n_mel_channels=args.n_mel,
        target_sample_rate=args.target_sr,
        mel_spec_type=args.mel_spec_type,
    )

    # Resolve the tokenizer vocabulary source, then build the char map.
    if args.tokenizer == "custom":
        if not args.tokenizer_path:
            raise ValueError("tokenizer='custom' requires --tokenizer-path (vocab.txt).")
        vocab_source = args.tokenizer_path
    else:
        vocab_source = args.vocab_dataset_name
    vocab_char_map, vocab_size = get_tokenizer(vocab_source, args.tokenizer)

    emotion_conditioning_parameters = {
        "emotion_condition_type": args.emotion_condition_type,
        "init_type": args.init_type,
        "weight_reduction_scale": args.weight_reduction_scale,
        "emotion_dim": args.emotion_dim,
        "emotion_conv_layers": args.emotion_conv_layers,
        "load_emotion_weights": False,
    }

    # Backbone hyper-parameters for the emotion-conditioned DiT.
    backbone_cfg = dict(
        dim=1024,
        depth=22,
        heads=16,
        ff_mult=2,
        text_dim=512,
        emotion_dim=args.emotion_dim,
        conv_layers=args.emotion_conv_layers,
    )

    backbone = DiTConditioned(
        **backbone_cfg,
        text_num_embeds=vocab_size,
        mel_dim=args.n_mel,
        emotion_conditioning=emotion_conditioning_parameters,
    )

    cfm = CFMConditioned(
        transformer=backbone,
        mel_spec_kwargs=mel_spec_kwargs,
        vocab_char_map=vocab_char_map,
    )

    tts = TTSModel(
        model=cfm,
        vocoder_name=args.mel_spec_type,
        checkpoint_path=args.checkpoint_path_emotion,
        emotion_conditioning_parameters=emotion_conditioning_parameters,
        device=args.device,
    )

    # Mel spectrogram of the reference clip drives the voice cloning.
    ref_mel = compute_mel_from_wav(args.ref_audio_path, mel_spec_kwargs, device=args.device)

    t0 = time.perf_counter()
    gen_mel, gen_audio = tts.infer(
        inference_text=args.inference_text,
        inference_emotion=args.inference_emotion,
        ref_mel=ref_mel,
        ref_text=args.ref_text,
        ref_emotion=args.ref_emotion,
        steps=args.nfe,
        cfg_strength=args.cfg_strength,
        cfg_strength2=args.cfg_strength2,
        sway_sampling_coef=args.sway_sampling_coef,
    )
    elapsed = time.perf_counter() - t0

    out_path = Path(args.output_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    torchaudio.save(str(out_path), gen_audio.cpu(), args.target_sr)

    print(f"[OK] Saved: {out_path} | duration: {elapsed:.2f}s")
    print(f"  Inference emotion: {args.inference_emotion}")
    print(f"  Steps (nfe): {args.nfe} | cfg_strength: {args.cfg_strength} | cfg_strength2: {args.cfg_strength2}")


if __name__ == "__main__":
    main()
|
src/f5_tts/infer/infer_elevenlabs.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ElevenLabs Voice Cloning Inference Script
|
| 3 |
+
|
| 4 |
+
Uses ElevenLabs API to clone a voice from reference audio and generate speech.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
import os
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from dotenv import load_dotenv
|
| 12 |
+
from elevenlabs import ElevenLabs
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def main():
    """Clone a voice from a reference clip via the ElevenLabs API, synthesize
    the requested text with it, and (unless --keep_voice) delete the clone."""
    load_dotenv()

    parser = argparse.ArgumentParser(description="ElevenLabs Voice Cloning Inference")
    parser.add_argument("--ref_audio", type=str, required=True, help="Path to reference audio file")
    parser.add_argument("--ref_text", type=str, default=None, help="Reference text (unused, for API compatibility)")
    parser.add_argument("--gen_text", type=str, required=True, help="Text to generate speech for")
    parser.add_argument("--output_file", type=str, required=True, help="Output filename")
    parser.add_argument("--output_dir", type=str, default=".", help="Output directory")
    parser.add_argument("--model", type=str, default="eleven_multilingual_v2", help="ElevenLabs model ID")
    parser.add_argument("--keep_voice", action="store_true", help="Keep cloned voice after generation")
    args = parser.parse_args()

    api_key = os.getenv("ELEVENLABS_API_KEY")
    if not api_key:
        raise ValueError("ELEVENLABS_API_KEY not found in environment. Add it to .env file.")

    client = ElevenLabs(api_key=api_key)

    # Instant voice clone from the reference clip.
    print(f"Creating voice clone from: {args.ref_audio}")
    with open(args.ref_audio, "rb") as ref_fp:
        voice = client.voices.ivc.create(
            name="temp_clone_voice",
            files=[ref_fp],
        )
    print(f"Voice created with ID: {voice.voice_id}")

    try:
        # Synthesize with the freshly cloned voice.
        print(f"Generating speech for text: {args.gen_text[:50]}...")
        audio_stream = client.text_to_speech.convert(
            voice_id=voice.voice_id,
            text=args.gen_text,
            model_id=args.model,
        )

        # Stream the returned chunks straight to disk.
        out_path = Path(args.output_dir) / args.output_file
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with open(out_path, "wb") as out_fp:
            for chunk in audio_stream:
                out_fp.write(chunk)

        print(f"Audio saved to: {out_path}")

    finally:
        # Always clean up the temporary clone unless the user asked to keep it.
        if not args.keep_voice:
            print(f"Deleting cloned voice: {voice.voice_id}")
            client.voices.delete(voice.voice_id)
            print("Voice deleted.")


if __name__ == "__main__":
    main()
|
src/f5_tts/infer/infer_emotion.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Inference script for emotion-conditioned F5-TTS."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import time
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torchaudio
|
| 9 |
+
|
| 10 |
+
from f5_tts.model import CFM
|
| 11 |
+
from f5_tts.model.cfm_emotion import CFMConditioned
|
| 12 |
+
from f5_tts.model.backbones.dit import DiT
|
| 13 |
+
from f5_tts.model.backbones.dit_emotion import DiTConditioned
|
| 14 |
+
from f5_tts.model.backbones.unett import UNetT
|
| 15 |
+
from f5_tts.model.modules import MelSpec
|
| 16 |
+
from f5_tts.model.utils import get_tokenizer
|
| 17 |
+
from f5_tts.infer.utils_infer import cfg_strength, load_vocoder, nfe_step, sway_sampling_coef
|
| 18 |
+
|
| 19 |
+
# Dataset Settings
|
| 20 |
+
# --- Dataset / mel-spectrogram settings shared by the emotion scripts ---
target_sample_rate = 24000
n_mel_channels = 100
hop_length = 256
win_length = 1024
n_fft = 1024
mel_spec_type = "vocos"

# Tokenizer defaults (a custom vocab path may override these via the CLI).
tokenizer = "pinyin"
tokenizer_path = None

# Integer ids assigned to the supported emotion labels.
emotion_dict = {
    "Angry": 1,
    "Neutral": 2,
    "Sad": 3,
    "Surprise": 4,
    "Happy": 5,
}
|
| 37 |
+
|
| 38 |
+
# --- Model classes and backbone hyper-parameters ---
model_cls_emotion = DiTConditioned
model_cls_pretrained = DiT
model_cfg_pretrained = dict(
    dim=1024,
    depth=22,
    heads=16,
    ff_mult=2,
    text_dim=512,
    conv_layers=4,
)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def compute_mel_from_wav(
    audio_path: str,
    mel_spec_kwargs: dict,
    device: str = "cpu",
) -> torch.Tensor:
    """Load a .wav file and return its mel spectrogram as a (frames, mel_bins) tensor.

    The audio is downmixed to mono and resampled to
    ``mel_spec_kwargs["target_sample_rate"]`` before the mel transform is applied.
    """
    waveform, src_rate = torchaudio.load(audio_path)

    # Downmix any multi-channel recording to a single channel.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Bring the signal to the model's expected sample rate when needed.
    tgt_rate = mel_spec_kwargs["target_sample_rate"]
    if src_rate != tgt_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=src_rate, new_freq=tgt_rate)
        waveform = resampler(waveform)

    extractor = MelSpec(**mel_spec_kwargs).to(device)
    mel = extractor(waveform.to(device))
    # (1, mel_bins, frames) -> (frames, mel_bins)
    return mel.squeeze(0).permute(1, 0)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class TTSModel:
    """Wrapper around a (possibly emotion-conditioned) CFM model plus a vocoder.

    Handles checkpoint loading, mel-spectrogram sampling, and waveform decoding.
    """

    def __init__(self, model, vocoder_name, checkpoint_path: str, emotion_conditioning_parameters, device: str = "cuda"):
        self.device = device
        self.model = model
        self._load_checkpoint(checkpoint_path)
        self.emotion_conditioning_parameters = emotion_conditioning_parameters
        self.vocoder_name = vocoder_name
        self.vocoder = load_vocoder(vocoder_name=self.vocoder_name)

    def _load_checkpoint(self, path: str):
        """Load weights from either a training checkpoint or an EMA-only checkpoint.

        Training checkpoints are identified by a top-level "step" key; EMA-only
        checkpoints store weights under "ema_model_state_dict" with an
        "ema_model." prefix on every key.
        """
        checkpoint = torch.load(path, weights_only=True, map_location="cpu")

        if "step" in checkpoint:
            # Drop mel filterbank/window buffers whose saved shapes may not match
            # the freshly constructed MelSpec, then load the rest non-strictly.
            for key in ["mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"]:
                if key in checkpoint["model_state_dict"]:
                    del checkpoint["model_state_dict"][key]
            self.model.load_state_dict(checkpoint["model_state_dict"], strict=False)
        else:
            # EMA-only checkpoint: strip the "ema_model." prefix and EMA bookkeeping keys.
            checkpoint["model_state_dict"] = {
                k.replace("ema_model.", ""): v
                for k, v in checkpoint["ema_model_state_dict"].items()
                if k not in ["initted", "step"]
            }
            self.model.load_state_dict(checkpoint["model_state_dict"], strict=False)

        self.model = self.model.to(self.device)
        self.model.eval()
        del checkpoint  # release the CPU copy of the weights

    def remove_leading_value(self, spec, value=0.0):
        """Remove leading frames whose entries all equal `value` from a melspectrogram.

        `spec` is expected to be (batch=1, frames, mel_bins). Only the leading run
        of constant-`value` frames is trimmed; matching frames appearing later in
        the spectrogram are preserved. (The previous implementation counted ALL
        matching frames and trimmed that many from the front, which over-trims
        when a constant frame occurs mid-spectrogram.)
        """
        frame_matches = torch.all(spec[0] == value, dim=1)
        non_matching = torch.nonzero(~frame_matches)
        # If every frame matches, the whole spectrogram is trimmed.
        num_leading = non_matching[0].item() if non_matching.numel() > 0 else frame_matches.numel()
        return spec[:, num_leading:, :]

    @torch.inference_mode()
    def infer(
        self,
        inference_text: str,
        inference_emotion: str,
        ref_mel: torch.Tensor,
        ref_text: str,
        ref_emotion: str,
        steps: int,
        cfg_strength,
        cfg_strength2,
        sway_sampling_coef,
        seed: int = 50,
    ) -> torch.Tensor:
        """Generate speech for `inference_text`, cloning the voice in `ref_mel`/`ref_text`.

        When `inference_emotion` is None the plain (non-conditioned) sampling path
        is used and `cfg_strength2` is ignored. Returns a tuple
        (generated_mel_2nd_half, generated_audio) — only the newly generated
        portion after the reference prompt.
        """
        # The model generates a continuation: reference text first, new text after.
        text_input = [ref_text + " " + inference_text]
        emotion_input = [[ref_emotion, inference_emotion]]
        first_phrase_length = [len(ref_text)]

        mel_lengths = torch.LongTensor([ref_mel.shape[0]])
        ref_audio_len = mel_lengths.item()
        # Heuristic: output duration scales with the character-length ratio of the texts.
        estimated_duration = ref_audio_len + int(ref_audio_len * len(inference_text) / len(ref_text))

        start = time.perf_counter()
        if inference_emotion is not None:
            generated_melspec, _ = self.model.sample(
                cond=ref_mel.to(self.device).unsqueeze(0),
                text=text_input,
                emotion=emotion_input,
                first_phrase_length=first_phrase_length,
                duration=estimated_duration,
                steps=steps,
                cfg_strength=cfg_strength,
                cfg_strength2=cfg_strength2,
                sway_sampling_coef=sway_sampling_coef,
                seed=seed,
            )
        else:
            generated_melspec, _ = self.model.sample(
                cond=ref_mel.to(self.device).unsqueeze(0),
                text=text_input,
                duration=estimated_duration,
                steps=steps,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
                seed=seed,
            )
        end = time.perf_counter()
        # Previously measured but never reported; print symmetrically with vocoder time.
        print(f"TIME sampling ({len(text_input[0])}): ", end - start)

        # Strip any leading padding frames, then keep only the newly generated part.
        generated_melspec = self.remove_leading_value(generated_melspec)
        generated_melspec_2ndhalf = generated_melspec[:, ref_mel.shape[0] :, :]

        start = time.perf_counter()
        generated_audio = self.vocode(generated_melspec_2ndhalf)
        end = time.perf_counter()
        print(f"TIME vocoder ({len(text_input[0])}): ", end - start)

        return generated_melspec_2ndhalf, generated_audio

    def vocode(self, mel: torch.Tensor) -> torch.Tensor:
        """Decode a (frames, mel) or (batch, frames, mel) melspectrogram to a waveform."""
        mel = mel.unsqueeze(0) if mel.ndim == 2 else mel
        # Vocoder expects (batch, mel_bins, frames).
        return self.vocoder.decode(mel.float().permute(0, 2, 1).to(self.device))
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
if __name__ == "__main__":
    # Reference clip whose voice (and emotion) is cloned.
    ref_audio_path = "data/0011_angry.wav"
    ref_emotion = "Angry"
    ref_text = "The nine, the eggs, I keep."

    # Text / target emotion to synthesize, and the output file stem.
    inference_text = "Hello, this is a text to check emotion."
    inference_emotion = "Surprise"
    output_path = "data/output.wav"

    nfe = nfe_step  # number of function evaluations (sampling steps)
    cfg_strength2 = 10  # secondary CFG strength for the emotion condition

    emotion_conditioning_parameters = {
        "emotion_condition_type": "text_mirror",
        "init_type": "xavier_reduced",
        "weight_reduction_scale": 1,
        "emotion_dim": 128,
        "emotion_conv_layers": 4,
        "load_emotion_weights": False,
    }

    # NOTE(review): tokenizer_path is assigned but get_tokenizer below is keyed
    # by dataset name — confirm which vocab file is actually loaded.
    tokenizer_path = "ckpts/vocab.txt"
    vocab_char_map, vocab_size = get_tokenizer("EmiliaPetite_dataset_ZH_EN", "pinyin")
    device = "cuda"

    checkpoint_path_emotion = "ckpts/model_emo.pt"
    checkpoint_path_pretrained = "ckpts/model_0.pt"

    mel_spec_kwargs = dict(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mel_channels=n_mel_channels,
        target_sample_rate=target_sample_rate,
        mel_spec_type=mel_spec_type,
    )

    # Backbone config for the emotion-conditioned model.
    # NOTE(review): conv_layers is taken from "emotion_conv_layers" — confirm this
    # is intentional rather than the text-embedding conv layer count (4 in both).
    model_cfg_emotion = dict(
        dim=1024,
        depth=22,
        heads=16,
        ff_mult=2,
        text_dim=512,
        emotion_dim=emotion_conditioning_parameters["emotion_dim"],
        conv_layers=emotion_conditioning_parameters["emotion_conv_layers"],
    )

    # Build the emotion-conditioned model and wrap it with its checkpoint + vocoder.
    model_emotion = CFMConditioned(
        transformer=model_cls_emotion(
            **model_cfg_emotion,
            text_num_embeds=vocab_size,
            mel_dim=n_mel_channels,
            emotion_conditioning=emotion_conditioning_parameters,
        ),
        mel_spec_kwargs=mel_spec_kwargs,
        vocab_char_map=vocab_char_map,
    )
    model_wrapper_emotion = TTSModel(
        model_emotion, mel_spec_type, checkpoint_path_emotion, emotion_conditioning_parameters, device
    )

    # Baseline: the pretrained, non-conditioned model for A/B comparison.
    model_pretrained = CFM(
        transformer=model_cls_pretrained(**model_cfg_pretrained, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
        mel_spec_kwargs=mel_spec_kwargs,
        vocab_char_map=vocab_char_map,
    )
    model_wrapper_pretrained = TTSModel(
        model_pretrained, mel_spec_type, checkpoint_path_pretrained, emotion_conditioning_parameters, device
    )

    # Reference mel spectrogram shared by both generations.
    mel = compute_mel_from_wav(ref_audio_path, mel_spec_kwargs, device="cuda")

    # Emotion-conditioned generation (cfg_strength/sway_sampling_coef come from
    # f5_tts.infer.utils_infer module-level defaults).
    generated_melspec, generated_audio = model_wrapper_emotion.infer(
        inference_text=inference_text,
        inference_emotion=inference_emotion,
        ref_mel=mel,
        ref_text=ref_text,
        ref_emotion=ref_emotion,
        steps=nfe,
        cfg_strength=cfg_strength,
        cfg_strength2=cfg_strength2,
        sway_sampling_coef=sway_sampling_coef,
    )

    torchaudio.save(output_path.replace(".wav", f"_{inference_emotion}.wav"), generated_audio.cpu(), target_sample_rate)

    # Baseline generation without emotion conditioning.
    generated_melspec, generated_audio = model_wrapper_pretrained.infer(
        inference_text=inference_text,
        inference_emotion=None,
        ref_mel=mel,
        ref_text=ref_text,
        ref_emotion=None,
        steps=nfe,
        cfg_strength=cfg_strength,
        cfg_strength2=None,
        sway_sampling_coef=sway_sampling_coef,
    )

    torchaudio.save(output_path.replace(".wav", "_NOemotion.wav"), generated_audio.cpu(), target_sample_rate)
|