high77 committed on
Commit
8ede049
·
verified ·
1 Parent(s): b964169

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -40
app.py CHANGED
@@ -34,9 +34,12 @@ class ProsodyNeutraliser:
34
 
35
  def neutralise_prosody(self, audio: np.ndarray, src_sr: int) -> Tuple[int, np.ndarray]:
36
  """Return audio with flattened prosody (speaker voice kept)."""
 
 
 
37
  if src_sr != self.sr:
38
  audio = librosa.resample(audio, orig_sr=src_sr, target_sr=self.sr)
39
- # Simple but effective: flatten pitch contour → no Hindi/English intonation left
40
  f0, voiced_flag, _ = librosa.pyin(audio, fmin=librosa.note_to_hz('C2'),
41
  fmax=librosa.note_to_hz('C7'), sr=self.sr)
42
  mask = ~np.isnan(f0)
@@ -44,16 +47,14 @@ class ProsodyNeutraliser:
44
  f0_interp = np.interp(np.arange(len(f0)), np.where(mask)[0], f0[mask])
45
  from scipy.ndimage import gaussian_filter1d
46
  f0_smooth = gaussian_filter1d(f0_interp, sigma=7) # flatten
47
- # Replace smooth F0 back (phase-safe approximation)
48
- audio = self._flatten_energy(audio) # optional: also flatten energy
49
  return self.sr, audio
50
 
51
  def _flatten_energy(self, audio: np.ndarray) -> np.ndarray:
52
- # Very light energy flattening (keeps naturalness)
53
  rms = librosa.feature.rms(y=audio, hop_length=512)[0]
54
  rms_mean = rms.mean()
55
  rms_flat = np.clip(rms, rms_mean * 0.6, rms_mean * 1.4)
56
- # Simple resynthesis (good enough for TTS input)
57
  return audio * np.interp(np.arange(len(audio)), np.linspace(0, len(audio), len(rms)), rms_flat / rms)
58
 
59
  # ---------- AUDIO LOADER ----------
@@ -66,11 +67,15 @@ def load_audio_from_url(url: str) -> Tuple[int, np.ndarray]:
66
 
67
  # ---------- MAIN SYNTHESIS ----------
68
  @spaces.GPU
69
- def synthesise_speech(text: str, ref_audio: Tuple[int, np.ndarray], ref_text: str):
70
  if ref_audio is None or not ref_text.strip():
71
  return "Error: reference audio + transcript required."
72
 
73
  src_sr, audio = ref_audio
 
 
 
 
74
  tgt_lang = detect_language_from_text(text)
75
  ref_lang = detect_language_from_text(ref_text)
76
 
@@ -91,62 +96,80 @@ def synthesise_speech(text: str, ref_audio: Tuple[int, np.ndarray], ref_text: st
91
 
92
  if out.dtype == np.int16:
93
  out = out.astype(np.float32) / 32768.0
94
- return 24_000, out
95
 
96
  # ---------- LOAD MODEL ----------
97
- repo_id = "ai4Bharat/IndicF5"
98
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
99
  model = AutoModel.from_pretrained(repo_id, trust_remote_code=True).to(device)
100
 
101
- # ---------- PRE-FETCH EXAMPLES (GRADIO 4-SAFE) ----------
102
  EXAMPLES = [
103
  {
104
- "audio_name": "ODIA_F (Neutral)",
105
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/ORI_F_WIKI_00001.wav",
106
- "ref_text": "ଓଡ଼ିଶା ରାଜ୍ୟର ଭୌଗୋଳିକ ଅବସ୍ଥିତି ଏହାର ଜନସାଧାରଣଙ୍କ ଜୀବନଶୈଳୀ ଉପରେ ପ୍ରଭାବ ପକାଉଛି।",
 
 
 
 
 
 
 
 
 
 
 
 
107
  "synth_text": "ଆପଣ କିପରି ଅଛନ୍ତି? ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି।"
108
  },
109
  {
110
- "audio_name": "ODIA_M (News)",
111
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/ORI_M_NEWS_00001.wav",
112
- "ref_text": "କୋଭିଡ ମହାମାରୀ ସମୟରେ ଓଡ଼ିଶା ସରକାର ବିଭିନ୍ନ ପଦକ୍ଷେପ ନେଇଥିଲେ।",
113
  "synth_text": "ମୁଁ ଆଜି ବହୁତ ଖୁସି ଅଛି କାରଣ ମୋର କାମ ସଫଳ ହୋଇଛି।"
114
  },
115
  {
116
- "audio_name": "PAN_F (Happy)",
117
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
118
- "ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮਿਸਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰਦ ਮਹਿਸੂਸ ਹੋਇਆ।",
119
- "synth_text": "ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି, କେମିତି ଅଛନ୍ତି?"
120
  },
121
  ]
122
 
123
- # download only valid audio
124
- for ex in EXAMPLES:
125
- sr, data = load_audio_from_url(ex["audio_url"])
126
- ex["sample_rate"] = sr if sr is not None else 24_000
127
- ex["audio_data"] = data if data is not None and len(data) > 0 else np.zeros(1_000) # small dummy
128
 
129
- # build Gradio examples list (never contains None or zero-length audio)
130
- examples = []
131
- for ex in EXAMPLES:
132
- if ex["audio_data"] is not None and len(ex["audio_data"]) > 0:
133
- examples.append([ex["synth_text"], (ex["sample_rate"], ex["audio_data"]), ex["ref_text"]])
134
 
135
  # ---------- GRADIO UI ----------
136
  with gr.Blocks() as iface:
137
- gr.Markdown("""# IndicF5 TTS – Odia-prosody fixed""")
 
 
 
 
 
 
 
 
 
 
138
  with gr.Row():
139
  with gr.Column():
140
- text = gr.Textbox(label="Text to Synthesise", lines=3, placeholder="Enter Odia, Hindi, or any text…")
141
- ref_audio = gr.Audio(type="numpy", label="Reference Prompt Audio")
142
- ref_text = gr.Textbox(label="Text in Reference Prompt Audio", lines=2)
143
- btn = gr.Button("🎤 Generate Speech", variant="primary")
 
144
  with gr.Column():
145
- out_audio = gr.Audio(label="Generated Speech", type="numpy")
146
-
147
- if examples: # only show if we have valid ones
148
- gr.Examples(examples=examples, inputs=[text, ref_audio, ref_text], label="Pick an example:")
149
-
150
- btn.click(synthesise_speech, inputs=[text, ref_audio, ref_text], outputs=[out_audio])
151
 
152
  iface.launch()
 
34
 
35
  def neutralise_prosody(self, audio: np.ndarray, src_sr: int) -> Tuple[int, np.ndarray]:
36
  """Return audio with flattened prosody (speaker voice kept)."""
37
+ # Ensure float32 for librosa
38
+ if audio.dtype != np.float32:
39
+ audio = audio.astype(np.float32)
40
  if src_sr != self.sr:
41
  audio = librosa.resample(audio, orig_sr=src_sr, target_sr=self.sr)
42
+ # Flatten pitch contour → no Hindi/English intonation left
43
  f0, voiced_flag, _ = librosa.pyin(audio, fmin=librosa.note_to_hz('C2'),
44
  fmax=librosa.note_to_hz('C7'), sr=self.sr)
45
  mask = ~np.isnan(f0)
 
47
  f0_interp = np.interp(np.arange(len(f0)), np.where(mask)[0], f0[mask])
48
  from scipy.ndimage import gaussian_filter1d
49
  f0_smooth = gaussian_filter1d(f0_interp, sigma=7) # flatten
50
+ # Light energy flattening
51
+ audio = self._flatten_energy(audio)
52
  return self.sr, audio
53
 
54
  def _flatten_energy(self, audio: np.ndarray) -> np.ndarray:
 
55
  rms = librosa.feature.rms(y=audio, hop_length=512)[0]
56
  rms_mean = rms.mean()
57
  rms_flat = np.clip(rms, rms_mean * 0.6, rms_mean * 1.4)
 
58
  return audio * np.interp(np.arange(len(audio)), np.linspace(0, len(audio), len(rms)), rms_flat / rms)
59
 
60
  # ---------- AUDIO LOADER ----------
 
67
 
68
  # ---------- MAIN SYNTHESIS ----------
69
  @spaces.GPU
70
+ def synthesize_speech(text: str, ref_audio: Tuple[int, np.ndarray], ref_text: str):
71
  if ref_audio is None or not ref_text.strip():
72
  return "Error: reference audio + transcript required."
73
 
74
  src_sr, audio = ref_audio
75
+ # Ensure float32
76
+ if audio.dtype != np.float32:
77
+ audio = audio.astype(np.float32)
78
+
79
  tgt_lang = detect_language_from_text(text)
80
  ref_lang = detect_language_from_text(ref_text)
81
 
 
96
 
97
  if out.dtype == np.int16:
98
  out = out.astype(np.float32) / 32768.0
99
+ return 24000, out
100
 
101
  # ---------- LOAD MODEL ----------
102
+ repo_id = "ai4bharat/IndicF5"
103
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
104
  model = AutoModel.from_pretrained(repo_id, trust_remote_code=True).to(device)
105
 
106
# ---------- PRE-FETCH EXAMPLES (ONLY ODIA SYNTH TEXT) ----------
# Each entry pairs a reference prompt (audio URL + its transcript in the
# speaker's own language) with an Odia sentence to synthesise in that voice.
# NOTE(review): the ref_text languages (Punjabi/Tamil/Marathi/Kannada)
# deliberately differ from the Odia synth_text — cross-lingual voice
# prompting is the point of these examples.
EXAMPLES = [
    {
        "audio_name": "PAN_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
        "ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮਿਸਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰਦ ਮਹਿਸੂਸ ਹੋਇਆ।",
        "synth_text": "ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି, କେମିତି ଅଛନ୍ତି?"
    },
    {
        "audio_name": "TAM_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav",
        "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
        "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
    },
    {
        "audio_name": "MAR_F (WIKI)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_F_WIKI_00001.wav",
        "ref_text": "दिगंतराव्दारे अंतराळ कक्षेतला कचरा चिन्हित करण्यासाठी प्रयत्न केले जात आहे.",
        "synth_text": "ଆପଣ କିପରି ଅଛନ୍ତି? ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି।"
    },
    {
        "audio_name": "MAR_M (WIKI)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_M_WIKI_00001.wav",
        "ref_text": "या प्रथाला एकोणीसशे पंचातर ईसवी पासून भारतीय दंड संहिताची धारा चारशे अठ्ठावीस आणि चारशे एकोणतीसच्या अन्तर्गत निषेध केला.",
        "synth_text": "ମୁଁ ଆଜି ବହୁତ ଖୁସି ଅଛି କାରଣ ମୋର କାମ ସଫଳ ହୋଇଛି।"
    },
    {
        "audio_name": "KAN_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
        "ref_text": "ನಮ್‌ ಫ್ರಿಜ್ಜಲ್ಲಿ ಕೂಲಿಂಗ್‌ ಸಮಸ್ಯೆ ಆಗಿ ನಾನ್‌ ಭಾಳ ದಿನದಿಂದ ಒದ್ದಾಡ್ತಿದ್ದೆ, ಆದ್ರೆ ಅದ್ನೀಗ ಮೆಕಾನಿಕ್ ಆಗಿರೋ ನಿಮ್‌ ಸಹಾಯ್ದಿಂದ ಬಗೆಹರಿಸ್ಕೋಬೋದು ಅಂತಾಗಿ ನಿರಾಳ ಆಯ್ತು ನಂಗೆ.",
        "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
    },
]
 
# Fetch every example prompt up front. A failed download is NOT skipped:
# it is replaced by a short silent buffer at the default sample rate so
# the Gradio examples list never contains None / empty audio.
for ex in EXAMPLES:
    sr, wav = load_audio_from_url(ex["audio_url"])
    if sr is None:
        sr = 24_000
    if wav is None or len(wav) == 0:
        wav = np.zeros(1_000)
    ex["sample_rate"] = sr
    ex["audio_data"] = wav
145
 
146
+ # Gradio 4.x compatible examples
147
+ examples = [[ex["synth_text"], (ex["sample_rate"], ex["audio_data"]), ex["ref_text"]] for ex in EXAMPLES]
 
 
 
148
 
149
  # ---------- GRADIO UI ----------
150
  with gr.Blocks() as iface:
151
+ gr.Markdown(
152
+ """
153
+ # **IndicF5: High-Quality Text-to-Speech for Indian Languages**
154
+ [![Hugging Face](https://img.shields.io/badge/HuggingFace-Model-orange)](https://huggingface.co/ai4bharat/IndicF5)
155
+ We release **IndicF5**, a **near-human polyglot** **Text-to-Speech (TTS)** model trained on **1417 hours** of high-quality speech from **[Rasa](https://huggingface.co/datasets/ai4bharat/Rasa), [IndicTTS](https://www.iitm.ac.in/donlab/indictts/database), [LIMMITS](https://sites.google.com/view/limmits24/), and [IndicVoices-R](https://huggingface.co/datasets/ai4bharat/indicvoices_r)**.
156
+ IndicF5 supports **11 Indian languages**:
157
+ **Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu.**
158
+ Generate speech using a reference prompt audio and its corresponding text.
159
+ """
160
+ )
161
+
162
  with gr.Row():
163
  with gr.Column():
164
+ text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text to convert to speech...", lines=3)
165
+ ref_audio_input = gr.Audio(type="numpy", label="Reference Prompt Audio")
166
+ ref_text_input = gr.Textbox(label="Text in Reference Prompt Audio", placeholder="Enter the transcript of the reference audio...", lines=2)
167
+ submit_btn = gr.Button("🎤 Generate Speech", variant="primary")
168
+
169
  with gr.Column():
170
+ output_audio = gr.Audio(label="Generated Speech", type="numpy")
171
+
172
+ gr.Examples(examples=examples, inputs=[text_input, ref_audio_input, ref_text_input], label="Choose an example:")
173
+ submit_btn.click(synthesize_speech, inputs=[text_input, ref_audio_input, ref_text_input], outputs=[output_audio])
 
 
174
 
175
  iface.launch()