File size: 2,694 Bytes
a8fdab7
 
1b20d8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8fdab7
 
 
 
 
1b20d8b
 
 
 
 
 
 
 
 
a8fdab7
 
1b20d8b
 
 
 
 
 
a8fdab7
1b20d8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8fdab7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import os

from config import ROOT_DIR, get_tts_voice, is_running_in_spaces

# Lookup table from the friendly voice names used in configuration to the
# voice identifiers that edge-tts expects.
EDGE_TTS_VOICES = dict(
    Jasper="en-US-GuyNeural",
    Bella="en-US-JennyNeural",
    Luna="en-GB-SoniaNeural",
    Bruno="en-US-ChristopherNeural",
    Rosie="en-AU-NatashaNeural",
    Hugo="en-GB-RyanNeural",
    Kiki="en-US-AriaNeural",
    Leo="en-US-DavisNeural",
)


def _use_edge_tts() -> bool:
    """Decide whether speech should be produced with edge-tts.

    Returns:
        True when ``is_running_in_spaces()`` reports a Spaces environment or
        when ``kittentts`` cannot be imported; False when KittenTTS is
        importable outside of Spaces.
    """
    if is_running_in_spaces():
        return True

    # Probe for the optional local backend; only a failed import counts as
    # "not available".
    try:
        from kittentts import KittenTTS  # noqa: F401
    except ImportError:
        kitten_missing = True
    else:
        kitten_missing = False
    return kitten_missing


class TTS:
    """Render text to an audio file with KittenTTS or edge-tts.

    The backend is picked once, at construction time, by ``_use_edge_tts()``.
    The voice name is read from ``get_tts_voice()``; for edge-tts it is
    translated through ``EDGE_TTS_VOICES``.
    """

    # edge-tts voice used when the configured name is not in EDGE_TTS_VOICES.
    _DEFAULT_EDGE_VOICE = "en-US-GuyNeural"

    def __init__(self) -> None:
        """Read the configured voice and load the selected backend.

        Raises:
            ImportError: If the KittenTTS backend is selected but
                ``soundfile`` or ``kittentts`` cannot be imported.
        """
        self._voice = get_tts_voice()
        self._use_edge = _use_edge_tts()
        # Sample rate handed to soundfile when writing KittenTTS audio. Set
        # unconditionally so the attribute exists whichever backend is used.
        self._sample_rate = 24000
        self._model = None

        if not self._use_edge:
            import soundfile  # noqa: F401 — fail early if it is missing
            from kittentts import KittenTTS as KittenModel

            self._model = KittenModel("KittenML/kitten-tts-mini-0.8")

    def synthesize(
        self,
        text: str,
        output_file: str = os.path.join(ROOT_DIR, ".mp", "audio.wav"),
    ) -> str:
        """Synthesize ``text`` into ``output_file`` using the active backend.

        Args:
            text: The text to speak.
            output_file: Destination path of the audio file. Its parent
                directory is created when it does not exist yet.

        Returns:
            ``output_file``, unchanged.
        """
        # Both backends write straight to this path, so make sure the
        # directory is there instead of failing deep inside a library call.
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if self._use_edge:
            return self._synthesize_edge(text, output_file)
        return self._synthesize_kitten(text, output_file)

    def _synthesize_kitten(self, text: str, output_file: str) -> str:
        """Generate audio with the KittenTTS model and write it via soundfile.

        Returns:
            ``output_file``, unchanged.
        """
        import soundfile as sf

        audio = self._model.generate(text, voice=self._voice)
        sf.write(output_file, audio, self._sample_rate)
        return output_file

    def _synthesize_edge(self, text: str, output_file: str) -> str:
        """Generate audio with edge-tts and store it at ``output_file``.

        edge-tts produces mp3 data. Unless the caller asked for an ``.mp3``
        file, the mp3 is written to a scratch file next to ``output_file``
        and converted to wav with pydub. When pydub is not installed the mp3
        data is moved to ``output_file`` as-is (best effort).

        Returns:
            ``output_file``, unchanged.
        """
        import asyncio

        import edge_tts

        voice_id = EDGE_TTS_VOICES.get(self._voice, self._DEFAULT_EDGE_VOICE)

        # splitext only looks at the final path component, so a dotted
        # directory such as ".mp" cannot be mistaken for the file extension.
        stem, ext = os.path.splitext(output_file)
        wants_mp3 = ext.lower() == ".mp3"
        # If the caller already wants an mp3, write it in place; otherwise the
        # scratch file is guaranteed to differ from output_file.
        mp3_path = output_file if wants_mp3 else stem + ".mp3"

        async def _generate() -> None:
            communicate = edge_tts.Communicate(text, voice_id)
            await communicate.save(mp3_path)

        asyncio.run(_generate())

        if wants_mp3:
            # Nothing to convert, and nothing to clean up: the scratch file
            # IS the requested output.
            return output_file

        # Only the optional import is guarded, so genuine conversion errors
        # are not confused with "pydub is not installed".
        try:
            from pydub import AudioSegment
        except ImportError:
            # Best effort without pydub: hand over the mp3 data under the
            # expected name. os.replace overwrites an existing target.
            os.replace(mp3_path, output_file)
            return output_file

        try:
            AudioSegment.from_mp3(mp3_path).export(output_file, format="wav")
        finally:
            # Drop the scratch file whether or not the conversion succeeded.
            os.remove(mp3_path)

        return output_file