high77 committed on
Commit
ee96f4d
·
verified ·
1 Parent(s): 7939a73

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -75
app.py CHANGED
@@ -8,11 +8,20 @@ import gradio as gr
8
  import soundfile as sf
9
  from transformers import AutoModel
10
  from typing import Tuple
 
 
11
 
12
- # ---------- LANGUAGE DETECTION (11 INDIAN LANGUAGES ONLY) ----------
13
  def detect_language_from_text(text: str) -> str:
14
- """Return one of: as, bn, gu, hi, kn, ml, mr, or, pa, ta, te."""
15
- # 11 Indian scripts Latin (English) is **not** included
 
 
 
 
 
 
 
16
  scripts = {
17
  'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
18
  'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
@@ -30,37 +39,26 @@ def detect_language_from_text(text: str) -> str:
30
  for lang, chars in scripts.items():
31
  if txt & chars:
32
  return lang
33
- # Default to Hindi (most data) if nothing matches
34
  return 'hi'
35
 
36
- # ---------- PROSODY NEUTRALISER (GPU MEMORY LIGHT) ----------
37
- class ProsodyNeutraliser:
38
- def __init__(self):
39
- self.sr = 24_000
40
-
41
- def neutralise_prosody(self, audio: np.ndarray, src_sr: int) -> Tuple[int, np.ndarray]:
42
- """Flatten prosody (speaker voice kept)."""
43
- if audio.dtype != np.float32:
44
- audio = audio.astype(np.float32)
45
- if src_sr != self.sr:
46
- import torchaudio
47
- audio = torchaudio.functional.resample(torch.from_numpy(audio), src_sr, self.sr).numpy()
48
- # very light pitch/energy flattening
49
- f0, voiced_flag, _ = librosa.pyin(audio, fmin=librosa.note_to_hz('C2'),
50
- fmax=librosa.note_to_hz('C7'), sr=self.sr)
51
- mask = ~np.isnan(f0)
52
- if mask.sum() > 2:
53
- f0_interp = np.interp(np.arange(len(f0)), np.where(mask)[0], f0[mask])
54
- from scipy.ndimage import gaussian_filter1d
55
- f0_smooth = gaussian_filter1d(f0_interp, sigma=7)
56
- audio = self._flatten_energy(audio)
57
- return self.sr, audio
58
-
59
- def _flatten_energy(self, audio: np.ndarray) -> np.ndarray:
60
- rms = librosa.feature.rms(y=audio, hop_length=512)[0]
61
- rms_mean = rms.mean()
62
- rms_flat = np.clip(rms, rms_mean * 0.6, rms_mean * 1.4)
63
- return audio * np.interp(np.arange(len(audio)), np.linspace(0, len(audio), len(rms)), rms_flat / rms)
64
 
65
  # Function to load reference audio from URL
66
  def load_audio_from_url(url):
@@ -72,27 +70,45 @@ def load_audio_from_url(url):
72
 
73
  @spaces.GPU
74
  def synthesize_speech(text, ref_audio, ref_text):
75
- if ref_audio is None or ref_text.strip() == "":
76
- return "Error: Please provide a reference audio and its corresponding text."
77
-
78
- # Ensure valid reference audio input
 
 
 
 
 
79
  if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
80
  sample_rate, audio_data = ref_audio
81
  else:
82
- return "Error: Invalid reference audio input."
83
 
84
- # Save reference audio directly without resampling
85
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
86
  sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
87
  temp_audio.flush()
88
 
89
- audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
 
 
 
 
 
90
 
91
- # Normalize output and save
92
  if audio.dtype == np.int16:
93
  audio = audio.astype(np.float32) / 32768.0
94
 
95
- return 24000, audio
 
 
 
 
 
 
 
 
96
 
97
 
98
  # Load TTS model
@@ -102,7 +118,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
102
  print("Device", device)
103
  model = model.to(device)
104
 
105
- # ---------- PRE-FETCH EXAMPLES (ONLY ODIA SYNTH TEXT) ----------
106
  EXAMPLES = [
107
  {
108
  "audio_name": "PAN_F (Happy)",
@@ -116,24 +132,6 @@ EXAMPLES = [
116
  "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
117
  "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
118
  },
119
- {
120
- "audio_name": "MAR_F (WIKI)",
121
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_F_WIKI_00001.wav",
122
- "ref_text": "दिगंतराव्दारे अंतराळ कक्षेतला कचरा चिन्हित करण्यासाठी प्रयत्न केले जात आहे.",
123
- "synth_text": "ଆପଣ କିପରି ଅଛନ୍ତି? ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି।"
124
- },
125
- {
126
- "audio_name": "MAR_M (WIKI)",
127
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_M_WIKI_00001.wav",
128
- "ref_text": "या प्रथाला एकोणीसशे पंचातर ईसवी पासून भारतीय दंड संहिताची धारा चारशे अठ्ठावीस आणि चारशे एकोणतीसच्या अन्तर्गत निषेध केला.",
129
- "synth_text": "ମୁଁ ଆଜି ବହୁତ ଖୁସି ଅଛି କାରଣ ମୋର କାମ ସଫଳ ହୋଇଛି।"
130
- },
131
- {
132
- "audio_name": "KAN_F (Happy)",
133
- "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
134
- "ref_text": "ನಮ್‌ ಫ್ರಿಜ್ಜಲ್ಲಿ ಕೂలಿಂಗ್‌ ಸమಸ്യೆ ಆಗಿ ನಾನ್‌ ಭಾಳ ದినದಿಂದ ಒದ್ದಾಡ್ತಿದ್ದೆ, ಆದ್ರೆ ಅದ್ನೀಗ ಮೆకానిక್ ಆಗಿರೋ ನిమ್‌ ಸಹಾಯ್ದಿಂದ ಬಗೆಹರಿಸ್ಕೋಬోదು ಅಂತಾಗಿ ನಿರಾಳ ಆಯ್ತು ನಂಗೆ.",
135
- "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
136
- },
137
  ]
138
 
139
  # Preload all example audios
@@ -143,29 +141,29 @@ for example in EXAMPLES:
143
  example["audio_data"] = audio_data
144
 
145
 
146
- # Define Gradio interface with layout adjustments
147
  with gr.Blocks() as iface:
148
  gr.Markdown(
149
  """
150
- # **IndicF5: High-Quality Text-to-Speech for Indian Languages**
151
- [![Hugging Face](https://img.shields.io/badge/HuggingFace-Model-orange)](https://huggingface.co/ai4bharat/IndicF5)
152
- We release **IndicF5**, a **near-human polyglot** **Text-to-Speech (TTS)** model trained on **1417 hours** of high-quality speech from **[Rasa](https://huggingface.co/datasets/ai4bharat/Rasa), [IndicTTS](https://www.iitm.ac.in/donlab/indictts/database), [LIMMITS](https://sites.google.com/view/limmits24/), and [IndicVoices-R](https://huggingface.co/datasets/ai4bharat/indicvoices_r)**.
153
- IndicF5 supports **11 Indian languages**:
154
- **Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu.**
155
-
156
- Generate speech using a reference prompt audio and its corresponding text.
157
  """
158
  )
159
 
160
  with gr.Row():
161
  with gr.Column():
162
- text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter the text to convert to speech...", lines=3)
163
- ref_audio_input = gr.Audio(type="numpy", label="Reference Prompt Audio")
164
- ref_text_input = gr.Textbox(label="Text in Reference Prompt Audio", placeholder="Enter the transcript of the reference audio...", lines=2)
165
  submit_btn = gr.Button("🎤 Generate Speech", variant="primary")
166
 
167
  with gr.Column():
168
- output_audio = gr.Audio(label="Generated Speech", type="numpy")
 
 
169
 
170
  # Add multiple examples
171
  examples = [
@@ -175,10 +173,14 @@ with gr.Blocks() as iface:
175
  gr.Examples(
176
  examples=examples,
177
  inputs=[text_input, ref_audio_input, ref_text_input],
178
- label="Choose an example:"
179
  )
180
 
181
- submit_btn.click(synthesize_speech, inputs=[text_input, ref_audio_input, ref_text_input], outputs=[output_audio])
182
-
 
 
 
 
183
 
184
  iface.launch(share=True)
 
8
  import soundfile as sf
9
  from transformers import AutoModel
10
  from typing import Tuple
11
+ import uuid
12
+ import os
13
 
14
+ # ---------- LANGUAGE DETECTION (UPDATED TO ALLOW ENGLISH) ----------
15
  def detect_language_from_text(text: str) -> str:
16
+ """Return one of: as, bn, gu, hi, kn, ml, mr, or, pa, ta, te, OR 'en'."""
17
+ # 1. Check for English (Latin Script) first
18
+ latin_chars = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
19
+ text_chars = set(text)
20
+ # If text has significant Latin characters, treat as English
21
+ if len(text_chars) > 0 and (len(text_chars & latin_chars) / len(text_chars)) > 0.3:
22
+ return "en"
23
+
24
+ # 2. Check Indian scripts
25
  scripts = {
26
  'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
27
  'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
 
39
  for lang, chars in scripts.items():
40
  if txt & chars:
41
  return lang
42
+ # Default to Hindi if nothing matches
43
  return 'hi'
44
 
45
+ # ---------- TEXT PACER (HELPS PREVENT SKIPPING) ----------
46
+ def slow_down_text(text):
47
+ """
48
+ Adds pauses to force the model to take its time processing complex scripts.
49
+ """
50
+ if not text:
51
+ return ""
52
+ # Add a comma (pause) after every 3 words to force a breather
53
+ words = text.split()
54
+ paced_text = ""
55
+ for i, word in enumerate(words):
56
+ paced_text += word + " "
57
+ if (i + 1) % 3 == 0:
58
+ paced_text += ", "
59
+
60
+ # Add padding at start/end
61
+ return f". . . {paced_text} . . ."
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  # Function to load reference audio from URL
64
  def load_audio_from_url(url):
 
70
 
71
  @spaces.GPU
72
  def synthesize_speech(text, ref_audio, ref_text):
73
+ # 1. Basic Validation
74
+ if ref_audio is None:
75
+ raise gr.Error("Please upload a Reference Audio file.")
76
+ if ref_text.strip() == "":
77
+ raise gr.Error("Please enter the text transcript for the Reference Audio.")
78
+ if text.strip() == "":
79
+ raise gr.Error("Please enter the text you want to generate.")
80
+
81
+ # 2. Reference Audio Processing
82
  if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
83
  sample_rate, audio_data = ref_audio
84
  else:
85
+ raise gr.Error("Invalid reference audio input.")
86
 
87
+ # Save reference audio to temp file
88
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
89
  sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
90
  temp_audio.flush()
91
 
92
+ # 3. Apply Text Pacing (The "Skipping" Fix)
93
+ safe_text = slow_down_text(text)
94
+
95
+ # 4. Generate Audio
96
+ # Note: We are using safe_text for generation
97
+ audio = model(safe_text, ref_audio_path=temp_audio.name, ref_text=ref_text)
98
 
99
+ # 5. Normalize Output
100
  if audio.dtype == np.int16:
101
  audio = audio.astype(np.float32) / 32768.0
102
 
103
+ # 6. Save Output to File (The "Download" Fix)
104
+ # We save the generated audio to a file so we can provide a download link
105
+ output_filename = f"generated_{uuid.uuid4().hex}.wav"
106
+ output_path = os.path.join(tempfile.gettempdir(), output_filename)
107
+
108
+ sf.write(output_path, audio, 24000)
109
+
110
+ # Return the file path twice: once for the player, once for the download button
111
+ return output_path, output_path
112
 
113
 
114
  # Load TTS model
 
118
  print("Device", device)
119
  model = model.to(device)
120
 
121
+ # ---------- PRE-FETCH EXAMPLES ----------
122
  EXAMPLES = [
123
  {
124
  "audio_name": "PAN_F (Happy)",
 
132
  "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
133
  "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
134
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  ]
136
 
137
  # Preload all example audios
 
141
  example["audio_data"] = audio_data
142
 
143
 
144
+ # Define Gradio interface
145
  with gr.Blocks() as iface:
146
  gr.Markdown(
147
  """
148
+ # **IndicF5 Dubbing Studio**
149
+ **Instructions for Best Results:**
150
+ 1. **Reference Audio:** Use a clear, 10-15 second clip. Slower speech works better.
151
+ 2. **Reference Text:** Must match the audio exactly.
152
+ 3. **Target Text:** Odia works best with punctuation. If it skips words, add commas.
 
 
153
  """
154
  )
155
 
156
  with gr.Row():
157
  with gr.Column():
158
+ text_input = gr.Textbox(label="Text to Synthesize (Odia/English)", placeholder="Enter text here...", lines=3)
159
+ ref_audio_input = gr.Audio(type="numpy", label="Reference Voice (10-15s ideal)")
160
+ ref_text_input = gr.Textbox(label="Transcript of Reference Audio", placeholder="What did the voice say?", lines=2)
161
  submit_btn = gr.Button("🎤 Generate Speech", variant="primary")
162
 
163
  with gr.Column():
164
+ output_audio = gr.Audio(label="Play Generated Speech", type="filepath")
165
+ # This is the dedicated download button
166
+ output_file = gr.File(label="Download Audio File", file_count="single")
167
 
168
  # Add multiple examples
169
  examples = [
 
173
  gr.Examples(
174
  examples=examples,
175
  inputs=[text_input, ref_audio_input, ref_text_input],
176
+ label="Quick Examples"
177
  )
178
 
179
+ # When clicked, return audio to Player AND File Downloader
180
+ submit_btn.click(
181
+ synthesize_speech,
182
+ inputs=[text_input, ref_audio_input, ref_text_input],
183
+ outputs=[output_audio, output_file]
184
+ )
185
 
186
  iface.launch(share=True)