FarmerlineML committed on
Commit
30123cd
·
verified ·
1 Parent(s): e3c585f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -21
app.py CHANGED
@@ -8,7 +8,6 @@ from pydub import AudioSegment
8
  from transformers import VitsModel, AutoTokenizer
9
 
10
  # ---------- Configuration --------------------------------------------------
11
- # Define available TTS models here. Add new entries as needed.
12
  TTS_MODELS = {
13
  "Ewe": {
14
  "tokenizer": "FarmerlineML/Ewe-tts-2025_v3",
@@ -30,13 +29,25 @@ TTS_MODELS = {
30
  "tokenizer": "FarmerlineML/luganda_TTS_v1",
31
  "checkpoint": "FarmerlineML/luganda_TTS_v1"
32
  },
33
-
34
  "Yoruba": {
35
  "tokenizer": "FarmerlineML/yoruba_tts-2025",
36
  "checkpoint": "FarmerlineML/yoruba_tts-2025"
37
  },
38
  }
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  device = "cuda" if torch.cuda.is_available() else "cpu"
41
 
42
  # ---------- Load all models & tokenizers -----------------------------------
@@ -46,17 +57,16 @@ for name, paths in TTS_MODELS.items():
46
  print(f"Loading {name} model...")
47
  model = VitsModel.from_pretrained(paths["checkpoint"]).to(device)
48
  model.eval()
49
- # Apply clear-speech inference parameters (tweak per model if desired)
50
- model.noise_scale = 0.667
51
- model.noise_scale_duration = 0.7
52
- model.speaking_rate = 0.85
53
- models[name] = model
54
  tokenizers[name] = AutoTokenizer.from_pretrained(paths["tokenizer"])
55
 
56
  # ---------- Utility: WAV ➔ MP3 Conversion -----------------------------------
57
  def _wav_to_mp3(wave_np: np.ndarray, sr: int) -> str:
58
  """Convert int16 numpy waveform to an MP3 temp file, return its path."""
59
- # Ensure int16 for pydub
60
  if wave_np.dtype != np.int16:
61
  wave_np = (wave_np * 32767).astype(np.int16)
62
 
@@ -74,25 +84,25 @@ def tts_generate(model_name: str, text: str):
74
  """Generate speech for `text` using the selected model."""
75
  if not text:
76
  return None
77
- model = models[model_name]
78
  tokenizer = tokenizers[model_name]
79
- inputs = tokenizer(text, return_tensors="pt").to(device)
80
  with torch.no_grad():
81
  wave = model(**inputs).waveform[0].cpu().numpy()
82
  return _wav_to_mp3(wave, model.config.sampling_rate)
83
 
84
  # ---------- Gradio Interface ------------------------------------------------
85
  examples = [
86
- ["Yoruba", "Jídé ń ta irinṣẹ́ kápẹ́ntà àti ti ìkọ́lé ní ọjà Ìsìnkàn"],
87
- ["Ewe", "amewo le atsi tre woɖo fli kple woƒe tɔkpowo kple agbawo kple galɔn wo. ʋu si nɔ tsi dram la tɔ ɖe wo xa eye nyɔnu eve yi le drɔm me le kɔkɔm ɖe tɔkpo kple galɔn me bubu hā le agba ɖe ta."],
88
- ["Ewe", "ɖekakpui ene wonɔ dɔgɔe me henɔ tsi kum le teƒe aɖe to. ɖeka ɖɔ kuku se avɔ ɖe ali eye tɔkpo et̄ɔ ye nɔ wo si."],
89
  ["Swahili", "zao kusaidia kuondoa umaskini na kujenga kampeni za mwamko wa virusi vya ukimwi amezitembelea"],
90
  ["Swahili", "Kidole hiki ni tofauti na vidole vingine kwa sababu mwelekeo wake ni wa pekee."],
91
- ["Krio", "Wetin na yu nem?"],
92
- ["Krio", "aw yu de du"],
93
  ["Luganda", "Ndi musanyufu okukulaba leero"],
94
- ["Hausa", "yaya za ka ƙi hafsan mafi ƙanƙanci na hafsoshin maigidana ko da yake kana dogara ga masar don kekunan yaƙi da mahayan dawakai"],
95
- ["Hausa", "ina fata dukkanku za ku ji ni sosai. wannan ita ce ma'anar kawai."]
96
  ]
97
 
98
  demo = gr.Interface(
@@ -103,12 +113,10 @@ demo = gr.Interface(
103
  ],
104
  outputs=gr.Audio(type="filepath", label="Audio", autoplay=True),
105
  title="Multi‐Model Text-to-Speech",
106
- description=(
107
- "Select a TTS model from the dropdown and enter text to generate speech."
108
- ),
109
  examples=examples,
110
  cache_examples=True,
111
  )
112
 
113
  if __name__ == "__main__":
114
- demo.launch()
 
8
  from transformers import VitsModel, AutoTokenizer
9
 
10
  # ---------- Configuration --------------------------------------------------
 
11
  TTS_MODELS = {
12
  "Ewe": {
13
  "tokenizer": "FarmerlineML/Ewe-tts-2025_v3",
 
29
  "tokenizer": "FarmerlineML/luganda_TTS_v1",
30
  "checkpoint": "FarmerlineML/luganda_TTS_v1"
31
  },
 
32
  "Yoruba": {
33
  "tokenizer": "FarmerlineML/yoruba_tts-2025",
34
  "checkpoint": "FarmerlineML/yoruba_tts-2025"
35
  },
36
  }
37
 
38
+ # Per-language inference parameters
39
+ # noise_scale: prosody expressiveness (higher = more varied intonation)
40
+ # noise_scale_duration: rhythm/timing variation (higher = more natural pausing)
41
+ # speaking_rate: speed (lower = slower, clearer)
42
+ LANG_PARAMS = {
43
+ "Yoruba": {"noise_scale": 0.667, "noise_scale_duration": 0.8, "speaking_rate": 0.85}, # tonal
44
+ "Ewe": {"noise_scale": 0.667, "noise_scale_duration": 0.8, "speaking_rate": 0.85}, # tonal
45
+ "Hausa": {"noise_scale": 0.5, "noise_scale_duration": 0.6, "speaking_rate": 0.9},
46
+ "Swahili": {"noise_scale": 0.5, "noise_scale_duration": 0.6, "speaking_rate": 0.9},
47
+ "Luganda": {"noise_scale": 0.5, "noise_scale_duration": 0.6, "speaking_rate": 0.88},
48
+ "Krio": {"noise_scale": 0.4, "noise_scale_duration": 0.3, "speaking_rate": 0.95}, # creole, flatter prosody
49
+ }
50
+
51
  device = "cuda" if torch.cuda.is_available() else "cpu"
52
 
53
  # ---------- Load all models & tokenizers -----------------------------------
 
57
  print(f"Loading {name} model...")
58
  model = VitsModel.from_pretrained(paths["checkpoint"]).to(device)
59
  model.eval()
60
+ p = LANG_PARAMS[name]
61
+ model.noise_scale = p["noise_scale"]
62
+ model.noise_scale_duration = p["noise_scale_duration"]
63
+ model.speaking_rate = p["speaking_rate"]
64
+ models[name] = model
65
  tokenizers[name] = AutoTokenizer.from_pretrained(paths["tokenizer"])
66
 
67
  # ---------- Utility: WAV ➔ MP3 Conversion -----------------------------------
68
  def _wav_to_mp3(wave_np: np.ndarray, sr: int) -> str:
69
  """Convert int16 numpy waveform to an MP3 temp file, return its path."""
 
70
  if wave_np.dtype != np.int16:
71
  wave_np = (wave_np * 32767).astype(np.int16)
72
 
 
84
  """Generate speech for `text` using the selected model."""
85
  if not text:
86
  return None
87
+ model = models[model_name]
88
  tokenizer = tokenizers[model_name]
89
+ inputs = tokenizer(text, return_tensors="pt").to(device)
90
  with torch.no_grad():
91
  wave = model(**inputs).waveform[0].cpu().numpy()
92
  return _wav_to_mp3(wave, model.config.sampling_rate)
93
 
94
  # ---------- Gradio Interface ------------------------------------------------
95
  examples = [
96
+ ["Yoruba", "Jídé ń ta irinṣẹ́ kápẹ́ntà àti ti ìkọ́lé ní ọjà Ìsìnkàn"],
97
+ ["Ewe", "amewo le atsi tre woɖo fli kple woƒe tɔkpowo kple agbawo kple galɔn wo. ʋu si nɔ tsi dram la tɔ ɖe wo xa eye nyɔnu eve yi le drɔm me le kɔkɔm ɖe tɔkpo kple galɔn me bubu hā le agba ɖe ta."],
98
+ ["Ewe", "ɖekakpui ene wonɔ dɔgɔe me henɔ tsi kum le teƒe aɖe to. ɖeka ɖɔ kuku se avɔ ɖe ali eye tɔkpo et̄ɔ ye nɔ wo si."],
99
  ["Swahili", "zao kusaidia kuondoa umaskini na kujenga kampeni za mwamko wa virusi vya ukimwi amezitembelea"],
100
  ["Swahili", "Kidole hiki ni tofauti na vidole vingine kwa sababu mwelekeo wake ni wa pekee."],
101
+ ["Krio", "Wetin na yu nem?"],
102
+ ["Krio", "aw yu de du"],
103
  ["Luganda", "Ndi musanyufu okukulaba leero"],
104
+ ["Hausa", "yaya za ka ƙi hafsan mafi ƙanƙanci na hafsoshin maigidana ko da yake kana dogara ga masar don kekunan yaƙi da mahayan dawakai"],
105
+ ["Hausa", "ina fata dukkanku za ku ji ni sosai. wannan ita ce ma'anar kawai."]
106
  ]
107
 
108
  demo = gr.Interface(
 
113
  ],
114
  outputs=gr.Audio(type="filepath", label="Audio", autoplay=True),
115
  title="Multi‐Model Text-to-Speech",
116
+ description="Select a TTS model from the dropdown and enter text to generate speech.",
 
 
117
  examples=examples,
118
  cache_examples=True,
119
  )
120
 
121
  if __name__ == "__main__":
122
+ demo.launch()