Update app.py
Browse files
app.py
CHANGED
|
@@ -8,11 +8,20 @@ import gradio as gr
|
|
| 8 |
import soundfile as sf
|
| 9 |
from transformers import AutoModel
|
| 10 |
from typing import Tuple
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
# ---------- LANGUAGE DETECTION (
|
| 13 |
def detect_language_from_text(text: str) -> str:
|
| 14 |
-
"""Return one of: as, bn, gu, hi, kn, ml, mr, or, pa, ta, te."""
|
| 15 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
scripts = {
|
| 17 |
'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
|
| 18 |
'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
|
|
@@ -30,37 +39,26 @@ def detect_language_from_text(text: str) -> str:
|
|
| 30 |
for lang, chars in scripts.items():
|
| 31 |
if txt & chars:
|
| 32 |
return lang
|
| 33 |
-
# Default to Hindi
|
| 34 |
return 'hi'
|
| 35 |
|
| 36 |
-
# ----------
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
f0_interp = np.interp(np.arange(len(f0)), np.where(mask)[0], f0[mask])
|
| 54 |
-
from scipy.ndimage import gaussian_filter1d
|
| 55 |
-
f0_smooth = gaussian_filter1d(f0_interp, sigma=7)
|
| 56 |
-
audio = self._flatten_energy(audio)
|
| 57 |
-
return self.sr, audio
|
| 58 |
-
|
| 59 |
-
def _flatten_energy(self, audio: np.ndarray) -> np.ndarray:
    """Compress the loudness envelope of *audio* toward its mean RMS level.

    Frame-wise RMS is clipped to [0.6x, 1.4x] of the mean RMS, and the
    signal is rescaled sample-by-sample by the ratio (clipped / original)
    so loud passages are attenuated and quiet ones boosted.

    Returns the energy-flattened audio, same length as the input.
    """
    # Frame-level RMS envelope: one value per 512-sample hop.
    rms = librosa.feature.rms(y=audio, hop_length=512)[0]
    rms_mean = rms.mean()
    # Limit dynamics: no frame further than 40% from the mean loudness.
    rms_flat = np.clip(rms, rms_mean * 0.6, rms_mean * 1.4)
    # Per-sample gain = rms_flat / rms, stretched from frame rate back to
    # sample rate via linear interpolation over the whole signal length.
    # NOTE(review): divides by rms — an all-zero (silent) frame would yield
    # inf/NaN gains; confirm upstream audio never contains silent frames.
    return audio * np.interp(np.arange(len(audio)), np.linspace(0, len(audio), len(rms)), rms_flat / rms)
|
| 64 |
|
| 65 |
# Function to load reference audio from URL
|
| 66 |
def load_audio_from_url(url):
|
|
@@ -72,27 +70,45 @@ def load_audio_from_url(url):
|
|
| 72 |
|
| 73 |
@spaces.GPU
|
| 74 |
def synthesize_speech(text, ref_audio, ref_text):
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
|
| 80 |
sample_rate, audio_data = ref_audio
|
| 81 |
else:
|
| 82 |
-
|
| 83 |
|
| 84 |
-
# Save reference audio
|
| 85 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
|
| 86 |
sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
|
| 87 |
temp_audio.flush()
|
| 88 |
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
-
# Normalize
|
| 92 |
if audio.dtype == np.int16:
|
| 93 |
audio = audio.astype(np.float32) / 32768.0
|
| 94 |
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
|
| 98 |
# Load TTS model
|
|
@@ -102,7 +118,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
| 102 |
print("Device", device)
|
| 103 |
model = model.to(device)
|
| 104 |
|
| 105 |
-
# ---------- PRE-FETCH EXAMPLES
|
| 106 |
EXAMPLES = [
|
| 107 |
{
|
| 108 |
"audio_name": "PAN_F (Happy)",
|
|
@@ -116,24 +132,6 @@ EXAMPLES = [
|
|
| 116 |
"ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
|
| 117 |
"synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
|
| 118 |
},
|
| 119 |
-
{
|
| 120 |
-
"audio_name": "MAR_F (WIKI)",
|
| 121 |
-
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_F_WIKI_00001.wav",
|
| 122 |
-
"ref_text": "दिगंतराव्दारे अंतराळ कक्षेतला कचरा चिन्हित करण्यासाठी प्रयत्न केले जात आहे.",
|
| 123 |
-
"synth_text": "ଆପଣ କିପରି ଅଛନ୍ତି? ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି।"
|
| 124 |
-
},
|
| 125 |
-
{
|
| 126 |
-
"audio_name": "MAR_M (WIKI)",
|
| 127 |
-
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_M_WIKI_00001.wav",
|
| 128 |
-
"ref_text": "या प्रथाला एकोणीसशे पंचातर ईसवी पासून भारतीय दंड संहिताची धारा चारशे अठ्ठावीस आणि चारशे एकोणतीसच्या अन्तर्गत निषेध केला.",
|
| 129 |
-
"synth_text": "ମୁଁ ଆଜି ବହୁତ ଖୁସି ଅଛି କାରଣ ମୋର କାମ ସଫଳ ହୋଇଛି।"
|
| 130 |
-
},
|
| 131 |
-
{
|
| 132 |
-
"audio_name": "KAN_F (Happy)",
|
| 133 |
-
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
|
| 134 |
-
"ref_text": "ನಮ್ ಫ್ರಿಜ್ಜಲ್ಲಿ ಕೂలಿಂಗ್ ಸమಸ്യೆ ಆಗಿ ನಾನ್ ಭಾಳ ದినದಿಂದ ಒದ್ದಾಡ್ತಿದ್ದೆ, ಆದ್ರೆ ಅದ್ನೀಗ ಮೆకానిక್ ಆಗಿರೋ ನిమ್ ಸಹಾಯ್ದಿಂದ ಬಗೆಹರಿಸ್ಕೋಬోదು ಅಂತಾಗಿ ನಿರಾಳ ಆಯ್ತು ನಂಗೆ.",
|
| 135 |
-
"synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
|
| 136 |
-
},
|
| 137 |
]
|
| 138 |
|
| 139 |
# Preload all example audios
|
|
@@ -143,29 +141,29 @@ for example in EXAMPLES:
|
|
| 143 |
example["audio_data"] = audio_data
|
| 144 |
|
| 145 |
|
| 146 |
-
# Define Gradio interface
|
| 147 |
with gr.Blocks() as iface:
|
| 148 |
gr.Markdown(
|
| 149 |
"""
|
| 150 |
-
# **IndicF5
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
**
|
| 155 |
-
|
| 156 |
-
Generate speech using a reference prompt audio and its corresponding text.
|
| 157 |
"""
|
| 158 |
)
|
| 159 |
|
| 160 |
with gr.Row():
|
| 161 |
with gr.Column():
|
| 162 |
-
text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter
|
| 163 |
-
ref_audio_input = gr.Audio(type="numpy", label="Reference
|
| 164 |
-
ref_text_input = gr.Textbox(label="
|
| 165 |
submit_btn = gr.Button("🎤 Generate Speech", variant="primary")
|
| 166 |
|
| 167 |
with gr.Column():
|
| 168 |
-
output_audio = gr.Audio(label="Generated Speech", type="
|
|
|
|
|
|
|
| 169 |
|
| 170 |
# Add multiple examples
|
| 171 |
examples = [
|
|
@@ -175,10 +173,14 @@ with gr.Blocks() as iface:
|
|
| 175 |
gr.Examples(
|
| 176 |
examples=examples,
|
| 177 |
inputs=[text_input, ref_audio_input, ref_text_input],
|
| 178 |
-
label="
|
| 179 |
)
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
iface.launch(share=True)
|
|
|
|
| 8 |
import soundfile as sf
|
| 9 |
from transformers import AutoModel
|
| 10 |
from typing import Tuple
|
| 11 |
+
import uuid
|
| 12 |
+
import os
|
| 13 |
|
| 14 |
+
# ---------- LANGUAGE DETECTION (UPDATED TO ALLOW ENGLISH) ----------
|
| 15 |
def detect_language_from_text(text: str) -> str:
|
| 16 |
+
"""Return one of: as, bn, gu, hi, kn, ml, mr, or, pa, ta, te, OR 'en'."""
|
| 17 |
+
# 1. Check for English (Latin Script) first
|
| 18 |
+
latin_chars = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
| 19 |
+
text_chars = set(text)
|
| 20 |
+
# If text has significant Latin characters, treat as English
|
| 21 |
+
if len(text_chars) > 0 and (len(text_chars & latin_chars) / len(text_chars)) > 0.3:
|
| 22 |
+
return "en"
|
| 23 |
+
|
| 24 |
+
# 2. Check Indian scripts
|
| 25 |
scripts = {
|
| 26 |
'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
|
| 27 |
'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
|
|
|
|
| 39 |
for lang, chars in scripts.items():
|
| 40 |
if txt & chars:
|
| 41 |
return lang
|
| 42 |
+
# Default to Hindi if nothing matches
|
| 43 |
return 'hi'
|
| 44 |
|
| 45 |
+
# ---------- TEXT PACER (HELPS PREVENT SKIPPING) ----------
def slow_down_text(text):
    """Insert pacing pauses so the TTS model doesn't rush complex scripts.

    A comma (pause) is injected after every third word, and the whole
    string is padded with ". . ." at both ends to give the model a
    breather before and after the utterance. Empty input yields "".
    """
    if not text:
        return ""

    # Build the paced string from word-sized pieces; a ", " pause is
    # appended after each group of three words.
    pieces = []
    for position, token in enumerate(text.split(), start=1):
        pieces.append(token + " ")
        if position % 3 == 0:
            pieces.append(", ")

    # Pad start and end so the model eases in and out.
    return f". . . {''.join(pieces)} . . ."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
# Function to load reference audio from URL
|
| 64 |
def load_audio_from_url(url):
|
|
|
|
| 70 |
|
| 71 |
@spaces.GPU
def synthesize_speech(text, ref_audio, ref_text):
    """Clone the voice in *ref_audio* and speak *text* with it.

    Args:
        text: Target text to synthesize (paced via slow_down_text).
        ref_audio: Gradio numpy audio — a (sample_rate, np.ndarray) tuple.
        ref_text: Exact transcript of the reference audio.

    Returns:
        (output_path, output_path): the generated WAV path, twice — once
        for the audio player component and once for the download button.

    Raises:
        gr.Error: on missing/invalid inputs (surfaced in the Gradio UI).
    """
    # 1. Basic validation — fail fast with user-facing Gradio errors.
    if ref_audio is None:
        raise gr.Error("Please upload a Reference Audio file.")
    if ref_text.strip() == "":
        raise gr.Error("Please enter the text transcript for the Reference Audio.")
    if text.strip() == "":
        raise gr.Error("Please enter the text you want to generate.")

    # 2. Reference audio must be Gradio's (sample_rate, data) tuple.
    if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
        sample_rate, audio_data = ref_audio
    else:
        raise gr.Error("Invalid reference audio input.")

    # Save reference audio to a temp WAV the model can read by path.
    # delete=False keeps it alive past the context manager; we remove it
    # ourselves below (the original version leaked one file per call).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
        temp_audio.flush()

    try:
        # 3. Apply text pacing (the "skipping" fix) — commas and padding
        # force the model to take its time on complex scripts.
        safe_text = slow_down_text(text)

        # 4. Generate audio from the paced text, conditioned on the reference.
        audio = model(safe_text, ref_audio_path=temp_audio.name, ref_text=ref_text)
    finally:
        # Always clean up the temp reference file, even if generation fails.
        try:
            os.remove(temp_audio.name)
        except OSError:
            pass

    # 5. Normalize int16 model output to float32 in [-1, 1).
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0

    # 6. Save output to a uniquely named file (the "download" fix) at the
    # model's 24 kHz output rate.
    output_filename = f"generated_{uuid.uuid4().hex}.wav"
    output_path = os.path.join(tempfile.gettempdir(), output_filename)
    sf.write(output_path, audio, 24000)

    # Path returned twice: once for the player, once for the download button.
    return output_path, output_path
|
| 112 |
|
| 113 |
|
| 114 |
# Load TTS model
|
|
|
|
| 118 |
print("Device", device)
|
| 119 |
model = model.to(device)
|
| 120 |
|
| 121 |
+
# ---------- PRE-FETCH EXAMPLES ----------
|
| 122 |
EXAMPLES = [
|
| 123 |
{
|
| 124 |
"audio_name": "PAN_F (Happy)",
|
|
|
|
| 132 |
"ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
|
| 133 |
"synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
|
| 134 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
]
|
| 136 |
|
| 137 |
# Preload all example audios
|
|
|
|
| 141 |
example["audio_data"] = audio_data
|
| 142 |
|
| 143 |
|
| 144 |
+
# Define Gradio interface
|
| 145 |
with gr.Blocks() as iface:
|
| 146 |
gr.Markdown(
|
| 147 |
"""
|
| 148 |
+
# **IndicF5 Dubbing Studio**
|
| 149 |
+
**Instructions for Best Results:**
|
| 150 |
+
1. **Reference Audio:** Use a clear, 10-15 second clip. Slower speech works better.
|
| 151 |
+
2. **Reference Text:** Must match the audio exactly.
|
| 152 |
+
3. **Target Text:** Odia works best with punctuation. If it skips words, add commas.
|
|
|
|
|
|
|
| 153 |
"""
|
| 154 |
)
|
| 155 |
|
| 156 |
with gr.Row():
|
| 157 |
with gr.Column():
|
| 158 |
+
text_input = gr.Textbox(label="Text to Synthesize (Odia/English)", placeholder="Enter text here...", lines=3)
|
| 159 |
+
ref_audio_input = gr.Audio(type="numpy", label="Reference Voice (10-15s ideal)")
|
| 160 |
+
ref_text_input = gr.Textbox(label="Transcript of Reference Audio", placeholder="What did the voice say?", lines=2)
|
| 161 |
submit_btn = gr.Button("🎤 Generate Speech", variant="primary")
|
| 162 |
|
| 163 |
with gr.Column():
|
| 164 |
+
output_audio = gr.Audio(label="Play Generated Speech", type="filepath")
|
| 165 |
+
# This is the dedicated download button
|
| 166 |
+
output_file = gr.File(label="Download Audio File", file_count="single")
|
| 167 |
|
| 168 |
# Add multiple examples
|
| 169 |
examples = [
|
|
|
|
| 173 |
gr.Examples(
|
| 174 |
examples=examples,
|
| 175 |
inputs=[text_input, ref_audio_input, ref_text_input],
|
| 176 |
+
label="Quick Examples"
|
| 177 |
)
|
| 178 |
|
| 179 |
+
# When clicked, return audio to Player AND File Downloader
|
| 180 |
+
submit_btn.click(
|
| 181 |
+
synthesize_speech,
|
| 182 |
+
inputs=[text_input, ref_audio_input, ref_text_input],
|
| 183 |
+
outputs=[output_audio, output_file]
|
| 184 |
+
)
|
| 185 |
|
| 186 |
iface.launch(share=True)
|