|
|
import io |
|
|
import spaces |
|
|
import torch |
|
|
import requests |
|
|
import tempfile |
|
|
import numpy as np |
|
|
import gradio as gr |
|
|
import soundfile as sf |
|
|
from transformers import AutoModel |
|
|
from typing import Tuple |
|
|
import uuid |
|
|
import os |
|
|
|
|
|
|
|
|
def detect_language_from_text(text: str) -> str:
    """Return one of: as, bn, gu, hi, kn, ml, mr, or, pa, ta, te, OR 'en'."""
    ascii_letters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
    unique_chars = set(text)

    # Treat the input as English when more than 30% of its *distinct*
    # characters (spaces/punctuation included) are Latin letters.
    if unique_chars and len(unique_chars & ascii_letters) / len(unique_chars) > 0.3:
        return "en"

    # Per-language alphabets. Lookup order matters: the first script with any
    # character overlap wins. NOTE(review): 'as' and 'bn' share an identical
    # set, so Bengali-script text always resolves to 'as'; likewise 'mr'
    # duplicates 'hi' and can never match — confirm whether that is intended.
    scripts = {
        'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
        'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
        'gu': set('અઆઇઈઉઊઋએઐઓઔકખગઘઙચછજઝઞટઠડઢણતથદધનપફબભમયરલળવશષસહક્ષજ્ઞ'),
        'hi': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
        'kn': set('ಅಆಇಈಉಊಋಏಐಓಔಕಖಗಘಙಚಛಜಝಞಟಠಡಢಣತಥದಧನಪಫಬಭಮಯರಲಳವಶಷಸಹಕ್ಷಜ್ಞ'),
        'ml': set('അആഇഈഉഊഋഏഐഓഔകഖഗഘങചഛജഝഞടഠഡഢണതഥദധനപഫബഭമയരലളവശഷസഹക്ഷജ്ഞ'),
        'mr': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
        'or': set('ଅଆଇଈଉଊଋୠଌଏଐଓଔକଖଗଘଙଚଛଜଝଞଟଠଡଢଣତଥଦଧନପଫବଭମୟରଳୱଶଷସହକ୍ଷୟଲଵଡ଼ଢ଼'),
        'pa': set('ਅਆਇਈਉਊਏਐਓਔਕਖਗਘਙਚਛਜਝਞਟਠਡਢਣਤਥਦਧਨਪਫਬਭਮਯਰਲਲ਼ਵਸ਼ਸਹਕਸ਼ਜ਼'),
        'ta': set('அஆஇஈஉஊஎஐஒஔகஙசஜஞடணதநபமயரலவழளஶஷஸஹக்ஷஜ்ஞ'),
        'te': set('అఆఇఈఉఊఋఎఐఒఔకఖగఘఙచఛజఝఞటఠడఢణతథదధనపఫబభమయరలళవశషసహక్షజ్ఞ'),
    }

    sample = set(text.replace(' ', ''))
    # Default to Hindi when no script matches (e.g. empty or symbol-only text).
    return next((code for code, alphabet in scripts.items() if sample & alphabet), 'hi')
|
|
|
|
|
|
|
|
def slow_down_text(text):
    """
    Adds pauses to force the model to take its time processing complex scripts.
    """
    if not text:
        return ""

    # Rebuild the text word by word, inserting a comma pause after every
    # third word, and wrap the whole thing in leading/trailing ". . ." pads.
    pieces = []
    for position, token in enumerate(text.split(), start=1):
        pieces.append(token + " ")
        if position % 3 == 0:
            pieces.append(", ")

    return ". . . " + "".join(pieces) + " . . ."
|
|
|
|
|
|
|
|
def load_audio_from_url(url):
    """Download a WAV file from `url`.

    Returns (sample_rate, audio_data) on success, or (None, None) when the
    download fails or the server returns a non-200 status.
    """
    try:
        # requests has NO default timeout — without one, a stalled server
        # would hang app startup forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Network failure (DNS, connection reset, timeout, ...): report the
        # same (None, None) failure value callers already handle.
        return None, None
    if response.status_code == 200:
        audio_data, sample_rate = sf.read(io.BytesIO(response.content))
        return sample_rate, audio_data
    return None, None
|
|
|
|
|
@spaces.GPU
def synthesize_speech(text, ref_audio, ref_text):
    """Clone the reference voice and speak `text` with it.

    Args:
        text: Target text to synthesize.
        ref_audio: (sample_rate, np.ndarray) tuple from the gr.Audio input.
        ref_text: Exact transcript of the reference audio.

    Returns:
        (output_path, output_path) — the same generated WAV path for both the
        playback and the download components.

    Raises:
        gr.Error: on missing/invalid inputs.
    """
    # Validate inputs up front. `not x` also catches None, which Gradio
    # passes when a textbox has never been touched.
    if ref_audio is None:
        raise gr.Error("Please upload a Reference Audio file.")
    if not ref_text or ref_text.strip() == "":
        raise gr.Error("Please enter the text transcript for the Reference Audio.")
    if not text or text.strip() == "":
        raise gr.Error("Please enter the text you want to generate.")

    if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
        sample_rate, audio_data = ref_audio
    else:
        raise gr.Error("Invalid reference audio input.")

    # Write the reference clip to a temp WAV; delete=False keeps it on disk
    # after the `with` block so the model can read it by path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
        temp_audio.flush()

    try:
        # Pacing hack: inject pauses so the model doesn't rush complex scripts.
        safe_text = slow_down_text(text)
        audio = model(safe_text, ref_audio_path=temp_audio.name, ref_text=ref_text)
    finally:
        # FIX: the original never deleted the temp file, leaking one WAV per
        # call. Best-effort cleanup; ignore races/permission issues.
        try:
            os.remove(temp_audio.name)
        except OSError:
            pass

    # Normalize int16 PCM to float32 in [-1, 1).
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0

    # Unique filename so concurrent requests never clobber each other.
    output_filename = f"generated_{uuid.uuid4().hex}.wav"
    output_path = os.path.join(tempfile.gettempdir(), output_filename)

    # 24000 Hz: assumed IndicF5 output sample rate — TODO confirm against the
    # model card; a mismatch here would pitch-shift the result.
    sf.write(output_path, audio, 24000)

    return output_path, output_path
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face Hub identifier for the TTS checkpoint.
repo_id = "ai4bharat/IndicF5"

# Pick the accelerator before loading so the model can be moved immediately.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device {device}")

# trust_remote_code: IndicF5 ships its own modeling code with the checkpoint.
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True).to(device)
|
|
|
|
|
|
|
|
# Built-in voice-cloning demos. Each entry holds the URL of a reference clip
# in the IndicF5 repo, the clip's transcript ("ref_text"), and a target
# sentence to synthesize ("synth_text"). "sample_rate"/"audio_data" are
# filled in at startup by downloading "audio_url".
# NOTE(review): both "synth_text" values look like Odia script even though
# the reference voices are Punjabi and Tamil — confirm that is intended.
# NOTE(review): the PAN_F transcript appears to contain a few mixed-script
# characters — verify it matches the audio exactly.
EXAMPLES = [
    {
        "audio_name": "PAN_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
        "ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮిసਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰਦ ਮਹਿਸੂਸ ਹੋਇਆ।",
        "synth_text": "ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି, କେମିତି ଅଛନ୍ତି?"
    },
    {
        "audio_name": "TAM_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav",
        "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
        "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
    },
]
|
|
|
|
|
|
|
|
# Fetch each example's reference clip once at startup. FIX: the original kept
# examples whose download failed, leaving sample_rate/audio_data as None and
# feeding (None, None) into gr.Examples; failed examples are now dropped.
_available_examples = []
for example in EXAMPLES:
    sample_rate, audio_data = load_audio_from_url(example["audio_url"])
    if audio_data is None:
        print(f"Warning: skipping example {example['audio_name']!r} — audio download failed.")
        continue
    example["sample_rate"] = sample_rate
    example["audio_data"] = audio_data
    _available_examples.append(example)
# Mutate in place so any existing references to EXAMPLES stay valid.
EXAMPLES[:] = _available_examples
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: two-column layout — inputs on the left, generated audio on the
# right — wired to synthesize_speech.
with gr.Blocks() as iface:
    # Header and usage instructions rendered at the top of the page.
    gr.Markdown(
        """
    # **IndicF5 Dubbing Studio**
    **Instructions for Best Results:**
    1. **Reference Audio:** Use a clear, 10-15 second clip. Slower speech works better.
    2. **Reference Text:** Must match the audio exactly.
    3. **Target Text:** Odia works best with punctuation. If it skips words, add commas.
    """
    )

    with gr.Row():
        # Left column: all user inputs plus the submit button.
        with gr.Column():
            text_input = gr.Textbox(label="Text to Synthesize (Odia/English)", placeholder="Enter text here...", lines=3)
            # type="numpy" makes Gradio deliver (sample_rate, ndarray) — the
            # tuple shape synthesize_speech expects for ref_audio.
            ref_audio_input = gr.Audio(type="numpy", label="Reference Voice (10-15s ideal)")
            ref_text_input = gr.Textbox(label="Transcript of Reference Audio", placeholder="What did the voice say?", lines=2)
            submit_btn = gr.Button("🎤 Generate Speech", variant="primary")

        # Right column: playback widget plus a downloadable copy of the file.
        with gr.Column():
            output_audio = gr.Audio(label="Play Generated Speech", type="filepath")
            output_file = gr.File(label="Download Audio File", file_count="single")

    # One row per example: [target text, (sample_rate, audio), transcript].
    # NOTE(review): load_audio_from_url can return (None, None) on failure —
    # verify the examples list never carries a failed download into gr.Examples.
    examples = [
        [ex["synth_text"], (ex["sample_rate"], ex["audio_data"]), ex["ref_text"]] for ex in EXAMPLES
    ]

    gr.Examples(
        examples=examples,
        inputs=[text_input, ref_audio_input, ref_text_input],
        label="Quick Examples"
    )

    # synthesize_speech returns the same path twice: once for the audio
    # player, once for the file-download component.
    submit_btn.click(
        synthesize_speech,
        inputs=[text_input, ref_audio_input, ref_text_input],
        outputs=[output_audio, output_file]
    )

# share=True also requests a public gradio.live tunnel in addition to the
# local server.
iface.launch(share=True)