SeaWolf-AI committed on
Commit
1b20d8b
·
verified ·
1 Parent(s): 400d937

Fix build: replace KittenTTS with edge-tts, pin Python 3.12, add packages.txt for imagemagick

Browse files
Files changed (4) hide show
  1. README.md +2 -1
  2. packages.txt +2 -0
  3. requirements.txt +3 -2
  4. src/classes/Tts.py +71 -7
README.md CHANGED
@@ -5,6 +5,7 @@ colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: "4.44.1"
 
8
  app_file: app.py
9
  pinned: false
10
  ---
@@ -16,7 +17,7 @@ AI-powered YouTube Shorts video generation pipeline:
16
  1. **Topic Generation** — LLM generates a video topic from your niche
17
  2. **Script Writing** — LLM writes a short video script
18
  3. **Image Generation** — Gemini API creates visuals from AI prompts
19
- 4. **Text-to-Speech** — KittenTTS converts script to natural speech
20
  5. **Subtitle Generation** — faster-whisper creates synced subtitles
21
  6. **Video Assembly** — MoviePy combines everything into a downloadable MP4
22
 
 
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: "4.44.1"
8
+ python_version: "3.12"
9
  app_file: app.py
10
  pinned: false
11
  ---
 
17
  1. **Topic Generation** — LLM generates a video topic from your niche
18
  2. **Script Writing** — LLM writes a short video script
19
  3. **Image Generation** — Gemini API creates visuals from AI prompts
20
+ 4. **Text-to-Speech** — Edge TTS converts script to natural speech
21
  5. **Subtitle Generation** — faster-whisper creates synced subtitles
22
  6. **Video Assembly** — MoviePy combines everything into a downloadable MP4
23
 
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ imagemagick
2
+ ffmpeg
requirements.txt CHANGED
@@ -2,9 +2,10 @@ gradio>=4.0
2
  huggingface_hub>=0.20.0
3
  termcolor
4
  requests
5
- kittentts @ https://github.com/KittenML/KittenTTS/releases/download/0.8.1/kittentts-0.8.1-py3-none-any.whl
 
6
  soundfile
7
- moviepy
8
  Pillow>=10.0.0
9
  faster-whisper
10
  srt_equalizer
 
2
  huggingface_hub>=0.20.0
3
  termcolor
4
  requests
5
+ edge-tts
6
+ pydub
7
  soundfile
8
+ moviepy<2
9
  Pillow>=10.0.0
10
  faster-whisper
11
  srt_equalizer
src/classes/Tts.py CHANGED
@@ -1,18 +1,82 @@
1
  import os
2
- import soundfile as sf
3
- from kittentts import KittenTTS as KittenModel
4
 
5
- from config import ROOT_DIR, get_tts_voice
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- KITTEN_MODEL = "KittenML/kitten-tts-mini-0.8"
8
- KITTEN_SAMPLE_RATE = 24000
9
 
10
  class TTS:
11
  def __init__(self) -> None:
12
- self._model = KittenModel(KITTEN_MODEL)
13
  self._voice = get_tts_voice()
 
 
 
 
 
 
 
 
 
14
 
15
  def synthesize(self, text, output_file=os.path.join(ROOT_DIR, ".mp", "audio.wav")):
 
 
 
 
 
 
16
  audio = self._model.generate(text, voice=self._voice)
17
- sf.write(output_file, audio, KITTEN_SAMPLE_RATE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  return output_file
 
1
  import os
 
 
2
 
3
+ from config import ROOT_DIR, get_tts_voice, is_running_in_spaces
4
+
5
+ # Voice mapping: friendly name -> edge-tts voice ID
6
+ EDGE_TTS_VOICES = {
7
+ "Jasper": "en-US-GuyNeural",
8
+ "Bella": "en-US-JennyNeural",
9
+ "Luna": "en-GB-SoniaNeural",
10
+ "Bruno": "en-US-ChristopherNeural",
11
+ "Rosie": "en-AU-NatashaNeural",
12
+ "Hugo": "en-GB-RyanNeural",
13
+ "Kiki": "en-US-AriaNeural",
14
+ "Leo": "en-US-DavisNeural",
15
+ }
16
+
17
+
18
+ def _use_edge_tts() -> bool:
19
+ """Use edge-tts when KittenTTS is not available (e.g. on HF Spaces)."""
20
+ if is_running_in_spaces():
21
+ return True
22
+ try:
23
+ from kittentts import KittenTTS # noqa: F401
24
+ return False
25
+ except ImportError:
26
+ return True
27
 
 
 
28
 
29
  class TTS:
30
  def __init__(self) -> None:
 
31
  self._voice = get_tts_voice()
32
+ self._use_edge = _use_edge_tts()
33
+
34
+ if not self._use_edge:
35
+ import soundfile # noqa: F401 — ensure available
36
+ from kittentts import KittenTTS as KittenModel
37
+ self._model = KittenModel("KittenML/kitten-tts-mini-0.8")
38
+ self._sample_rate = 24000
39
+ else:
40
+ self._model = None
41
 
42
  def synthesize(self, text, output_file=os.path.join(ROOT_DIR, ".mp", "audio.wav")):
43
+ if self._use_edge:
44
+ return self._synthesize_edge(text, output_file)
45
+ return self._synthesize_kitten(text, output_file)
46
+
47
+ def _synthesize_kitten(self, text, output_file):
48
+ import soundfile as sf
49
  audio = self._model.generate(text, voice=self._voice)
50
+ sf.write(output_file, audio, self._sample_rate)
51
+ return output_file
52
+
53
+ def _synthesize_edge(self, text, output_file):
54
+ import asyncio
55
+ import edge_tts
56
+
57
+ voice_id = EDGE_TTS_VOICES.get(self._voice, "en-US-GuyNeural")
58
+
59
+ # edge-tts outputs mp3; write it to a sibling .mp3 path first
60
+ # (converted to wav below; MoviePy can also read mp3 via ffmpeg)
61
+ mp3_path = output_file.rsplit(".", 1)[0] + ".mp3"
62
+
63
+ async def _generate():
64
+ communicate = edge_tts.Communicate(text, voice_id)
65
+ await communicate.save(mp3_path)
66
+
67
+ asyncio.run(_generate())
68
+
69
+ # Convert mp3 to wav for compatibility with the rest of the pipeline
70
+ try:
71
+ from pydub import AudioSegment
72
+ audio = AudioSegment.from_mp3(mp3_path)
73
+ audio.export(output_file, format="wav")
74
+ os.remove(mp3_path)
75
+ except ImportError:
76
+ # If pydub not available, just use the mp3 directly
77
+ # Rename mp3 to the expected output path
78
+ if os.path.exists(output_file):
79
+ os.remove(output_file)
80
+ os.rename(mp3_path, output_file)
81
+
82
  return output_file