File size: 8,699 Bytes
21af360 5b34230 7516f78 21af360 29a7548 ee96f4d 573805f ee96f4d 573805f ee96f4d d01c447 5ad586c d01c447 5ad586c d01c447 29a7548 d01c447 ee96f4d d01c447 29a7548 ee96f4d 29a7548 48feb86 21af360 5b34230 48feb86 ee96f4d 48feb86 ee96f4d 48feb86 ee96f4d 48feb86 ee96f4d 48feb86 ee96f4d 5ad586c ee96f4d 5ad586c 48feb86 7516f78 48feb86 21af360 ee96f4d 21af360 573805f 8ede049 5ad586c 8ede049 5ad586c 8ede049 21af360 81968d5 ee96f4d ed1cc99 8ede049 ee96f4d 8ede049 ed1cc99 ee96f4d 8ede049 ed1cc99 ee96f4d 8ede049 81968d5 ee96f4d 81968d5 d01c447 ee96f4d 81968d5 0e1a522 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
import io
import spaces
import torch
import requests
import tempfile
import numpy as np
import gradio as gr
import soundfile as sf
from transformers import AutoModel
from typing import Tuple
import uuid
import os
# ---------- LANGUAGE DETECTION (UPDATED TO ALLOW ENGLISH) ----------
# (language code, first code point, last code point) of each Unicode script
# block. The order is only used to break ties between equally frequent scripts.
_SCRIPT_BLOCKS = (
    ("bn", 0x0980, 0x09FF),  # Bengali-Assamese block, refined to "as" below
    ("gu", 0x0A80, 0x0AFF),
    ("hi", 0x0900, 0x097F),  # Devanagari (Marathi shares it, see docstring)
    ("kn", 0x0C80, 0x0CFF),
    ("ml", 0x0D00, 0x0D7F),
    ("or", 0x0B00, 0x0B7F),
    ("pa", 0x0A00, 0x0A7F),
    ("ta", 0x0B80, 0x0BFF),
    ("te", 0x0C00, 0x0C7F),
)
# Letters that exist in Assamese orthography but not in Bengali: U+09F0, U+09F1.
_ASSAMESE_ONLY = frozenset("\u09f0\u09f1")
# Danda / double danda live in the Devanagari block but are shared punctuation
# across Indic scripts, so they must not count as evidence for Hindi.
_SHARED_PUNCTUATION = frozenset("\u0964\u0965")
_LATIN_LETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")


def detect_language_from_text(text: str) -> str:
    """Guess the language code of *text* from the script it is written in.

    Returns one of: as, bn, gu, hi, kn, ml, or, pa, ta, te, or 'en'.

    The check runs in two steps:

    1. If more than 30% of the *distinct* characters are ASCII letters the
       text is treated as English.
    2. Otherwise every character is bucketed by Unicode script block and the
       most frequent script wins.

    Script alone cannot separate languages that share a writing system:
    Devanagari text is always reported as 'hi' (never 'mr'), and text in the
    Bengali-Assamese block is reported as 'as' only when it contains a letter
    that Bengali does not use; otherwise it is 'bn'.

    Empty input, or input with no recognised script, falls back to 'hi'.
    """
    if not text:
        return "hi"

    # 1. English: ratio over distinct characters (spaces and punctuation count
    #    toward the denominator).
    distinct = set(text)
    if len(distinct & _LATIN_LETTERS) / len(distinct) > 0.3:
        return "en"

    # 2. Count characters per Indic script block.
    counts = {}
    for ch in text:
        if ch in _SHARED_PUNCTUATION:
            continue
        code_point = ord(ch)
        for lang, first, last in _SCRIPT_BLOCKS:
            if first <= code_point <= last:
                counts[lang] = counts.get(lang, 0) + 1
                break

    if not counts:
        # Default to Hindi if nothing matches
        return "hi"

    # max() keeps the first maximal element, so ties resolve in table order.
    best = max((lang for lang, _, _ in _SCRIPT_BLOCKS),
               key=lambda lang: counts.get(lang, 0))
    if best == "bn" and distinct & _ASSAMESE_ONLY:
        return "as"
    return best
# ---------- TEXT PACER (HELPS PREVENT SKIPPING) ----------
def slow_down_text(text):
    """
    Insert artificial pauses into *text* before it is handed to the model.

    Every word is followed by a single space, a ", " is appended after each
    third word, and the result is wrapped in ". . . " / " . . ." padding.
    Falsy input (empty string or None) yields an empty string.
    """
    if not text:
        return ""

    pieces = []
    for position, token in enumerate(text.split(), start=1):
        pieces.append(token + " ")
        # A comma after every third word forces a short breather.
        if position % 3 == 0:
            pieces.append(", ")

    # Leading and trailing padding around the paced body.
    return ". . . " + "".join(pieces) + " . . ."
# Function to load reference audio from URL
def load_audio_from_url(url, timeout=30):
    """Download an audio file over HTTP and decode it in memory.

    Args:
        url: Address of the audio file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever, which would hang the
            module-level example preload.

    Returns:
        ``(sample_rate, audio_data)`` as decoded by ``soundfile.read``, or
        ``(None, None)`` when the server answers with a non-200 status.

    Raises:
        requests.RequestException: On connection errors or when *timeout*
            elapses.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None, None
    audio_data, sample_rate = sf.read(io.BytesIO(response.content))
    return sample_rate, audio_data
@spaces.GPU
def synthesize_speech(text, ref_audio, ref_text):
    """Generate speech for *text* in the voice of the reference clip.

    Args:
        text: Text to synthesize.
        ref_audio: ``(sample_rate, samples)`` tuple for the reference clip.
        ref_text: Transcript of the reference clip.

    Returns:
        The path of the generated WAV file, twice: once for the audio player
        and once for the download component.

    Raises:
        gr.Error: If any input is missing or the reference audio is not a
            two-element tuple.
    """
    # 1. Basic Validation (None-safe: a cleared textbox must not crash .strip()).
    if ref_audio is None:
        raise gr.Error("Please upload a Reference Audio file.")
    if not ref_text or ref_text.strip() == "":
        raise gr.Error("Please enter the text transcript for the Reference Audio.")
    if not text or text.strip() == "":
        raise gr.Error("Please enter the text you want to generate.")

    # 2. Reference Audio Processing
    if not (isinstance(ref_audio, tuple) and len(ref_audio) == 2):
        raise gr.Error("Invalid reference audio input.")
    sample_rate, audio_data = ref_audio

    # Reserve a temp path and close the handle before writing to it by name,
    # so the write also works on platforms that forbid a second open.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        ref_audio_path = temp_audio.name

    try:
        sf.write(ref_audio_path, audio_data, samplerate=sample_rate, format='WAV')

        # 3. Apply Text Pacing (The "Skipping" Fix)
        safe_text = slow_down_text(text)

        # 4. Generate Audio
        # Note: We are using safe_text for generation
        audio = model(safe_text, ref_audio_path=ref_audio_path, ref_text=ref_text)
    finally:
        # The reference copy is only needed during generation; remove it so
        # each request does not leak a file in the temp directory.
        try:
            os.remove(ref_audio_path)
        except OSError:
            pass

    # 5. Normalize Output: scale 16-bit PCM into the [-1.0, 1.0) float range.
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0

    # 6. Save Output to File (The "Download" Fix)
    # We save the generated audio to a file so we can provide a download link.
    # NOTE(review): 24000 is presumably the model's output sample rate - confirm.
    output_filename = f"generated_{uuid.uuid4().hex}.wav"
    output_path = os.path.join(tempfile.gettempdir(), output_filename)
    sf.write(output_path, audio, 24000)

    # Return the file path twice: once for the player, once for the download button
    return output_path, output_path
# Load TTS model
# The checkpoint ships its own modelling code, hence trust_remote_code=True.
repo_id = "ai4bharat/IndicF5"
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)

# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device", device)
model = model.to(device)
# ---------- PRE-FETCH EXAMPLES ----------
# Each entry pairs a reference clip (audio_url + its transcript ref_text) with
# a sample sentence to synthesize (synth_text).
EXAMPLES = [
    {
        "audio_name": "PAN_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
        # Transcript must be pure Gurmukhi: two Telugu code points had slipped
        # into the fourth word and have been replaced with their Gurmukhi forms.
        "ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮਿਸਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰਦ ਮਹਿਸੂਸ ਹੋਇਆ।",
        "synth_text": "ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି, କେମିତି ଅଛନ୍ତି?"
    },
    {
        "audio_name": "TAM_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav",
        "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
        "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
    },
]

# Preload all example audios so the UI can offer them without a download at
# click time. A failed download leaves both values as None.
for example in EXAMPLES:
    sample_rate, audio_data = load_audio_from_url(example["audio_url"])
    example["sample_rate"] = sample_rate
    example["audio_data"] = audio_data
# Define Gradio interface
with gr.Blocks() as iface:
    # The Markdown literal is kept flush-left so its content is exactly the
    # text shown in the UI, independent of the surrounding code indentation.
    gr.Markdown(
        """
# **IndicF5 Dubbing Studio**
**Instructions for Best Results:**
1. **Reference Audio:** Use a clear, 10-15 second clip. Slower speech works better.
2. **Reference Text:** Must match the audio exactly.
3. **Target Text:** Odia works best with punctuation. If it skips words, add commas.
"""
    )

    with gr.Row():
        # Left column: everything the user provides.
        with gr.Column():
            text_input = gr.Textbox(label="Text to Synthesize (Odia/English)", placeholder="Enter text here...", lines=3)
            ref_audio_input = gr.Audio(type="numpy", label="Reference Voice (10-15s ideal)")
            ref_text_input = gr.Textbox(label="Transcript of Reference Audio", placeholder="What did the voice say?", lines=2)
            submit_btn = gr.Button("🎤 Generate Speech", variant="primary")
        # Right column: playback and download of the result.
        with gr.Column():
            output_audio = gr.Audio(label="Play Generated Speech", type="filepath")
            # This is the dedicated download button
            output_file = gr.File(label="Download Audio File", file_count="single")

    # Add multiple examples. Entries whose reference clip failed to preload
    # (audio_data is None) are skipped: a (None, None) audio value cannot be
    # rendered as an example.
    examples = [
        [ex["synth_text"], (ex["sample_rate"], ex["audio_data"]), ex["ref_text"]]
        for ex in EXAMPLES
        if ex.get("audio_data") is not None
    ]
    # Only build the examples widget when at least one example is usable.
    if examples:
        gr.Examples(
            examples=examples,
            inputs=[text_input, ref_audio_input, ref_text_input],
            label="Quick Examples"
        )

    # When clicked, return audio to Player AND File Downloader
    submit_btn.click(
        synthesize_speech,
        inputs=[text_input, ref_audio_input, ref_text_input],
        outputs=[output_audio, output_file]
    )

iface.launch(share=True)