File size: 8,699 Bytes
21af360
5b34230
7516f78
21af360
 
 
 
 
 
29a7548
ee96f4d
 
573805f
ee96f4d
573805f
ee96f4d
 
 
 
 
 
 
 
 
d01c447
 
 
 
 
5ad586c
 
 
d01c447
5ad586c
 
 
d01c447
29a7548
d01c447
 
 
ee96f4d
d01c447
29a7548
ee96f4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29a7548
48feb86
 
 
 
 
 
21af360
 
5b34230
48feb86
ee96f4d
 
 
 
 
 
 
 
 
48feb86
 
 
ee96f4d
48feb86
ee96f4d
48feb86
 
 
 
ee96f4d
 
 
 
 
 
48feb86
ee96f4d
5ad586c
 
 
ee96f4d
 
 
 
 
 
 
 
 
5ad586c
48feb86
 
 
 
7516f78
48feb86
 
21af360
ee96f4d
21af360
573805f
8ede049
 
5ad586c
 
8ede049
 
 
 
 
5ad586c
8ede049
21af360
 
81968d5
 
 
 
 
 
 
ee96f4d
ed1cc99
8ede049
 
ee96f4d
 
 
 
 
8ede049
 
 
ed1cc99
 
ee96f4d
 
 
8ede049
 
ed1cc99
ee96f4d
 
 
8ede049
81968d5
 
 
 
 
 
 
 
ee96f4d
81968d5
d01c447
ee96f4d
 
 
 
 
 
81968d5
0e1a522
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import io
import spaces
import torch
import requests
import tempfile
import numpy as np
import gradio as gr
import soundfile as sf
from transformers import AutoModel
from typing import Tuple
import uuid
import os

# ---------- LANGUAGE DETECTION (UPDATED TO ALLOW ENGLISH) ----------
def detect_language_from_text(text: str) -> str:
    """Return one of: as, bn, gu, hi, kn, ml, mr, or, pa, ta, te, OR 'en'."""
    # English first: when more than 30% of the distinct characters in the
    # input are Latin letters, classify the whole string as English.
    latin_letters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
    distinct_chars = set(text)
    if distinct_chars and len(distinct_chars & latin_letters) / len(distinct_chars) > 0.3:
        return "en"

    # Indic scripts, probed in a fixed order. NOTE: 'as'/'bn' and 'hi'/'mr'
    # share an alphabet, so the earlier code wins for those script pairs.
    script_alphabets = (
        ('as', 'অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
        ('bn', 'অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
        ('gu', 'અઆઇઈઉઊઋએઐઓઔકખગઘઙચછજઝઞટઠડઢણતથદધનપફબભમયરલળવશષસહક્ષજ્ઞ'),
        ('hi', 'अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
        ('kn', 'ಅಆಇಈಉಊಋಏಐಓಔಕಖಗಘಙಚಛಜಝಞಟಠಡಢಣತಥದಧನಪಫಬಭಮಯರಲಳವಶಷಸಹಕ್ಷಜ್ಞ'),
        ('ml', 'അആഇഈഉഊഋഏഐഓഔകഖഗഘങചഛജഝഞടഠഡഢണതഥദധനപഫബഭമയരലളവശഷസഹക്ഷജ്ഞ'),
        ('mr', 'अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
        ('or', 'ଅଆଇଈଉଊଋୠଌଏଐଓଔକଖଗଘଙଚଛଜଝଞଟଠଡଢଣତଥଦଧନପଫବଭମୟରଳୱଶଷସହକ୍ଷୟଲଵଡ଼ଢ଼'),
        ('pa', 'ਅਆਇਈਉਊਏਐਓਔਕਖਗਘਙਚਛਜਝਞਟਠਡਢਣਤਥਦਧਨਪਫਬਭਮਯਰਲਲ਼ਵਸ਼਷ਸਹਕਸ਼ਜ਼'),
        ('ta', 'அஆஇஈஉஊ஋எஐஒஔகஙசஜஞடணதநபமயரலவழளஶஷஸஹக்ஷஜ்ஞ'),
        ('te', 'అఆఇఈఉఊఋఎఐఒఔకఖగఘఙచఛజఝఞటఠడఢణతథదధనపఫబభమయరలళవశషసహక్షజ్ఞ'),
    )
    candidate_chars = set(text.replace(' ', ''))
    for lang_code, alphabet in script_alphabets:
        if candidate_chars.intersection(alphabet):
            return lang_code
    # Nothing recognized: fall back to Hindi.
    return 'hi'

# ---------- TEXT PACER (HELPS PREVENT SKIPPING) ----------
def slow_down_text(text):
    """
    Insert artificial pauses so the model takes its time on complex scripts.

    A ", " pause is appended after every third word and the whole string is
    padded with leading/trailing dots. Empty or None input yields "".
    """
    if not text:
        return ""
    # Collect word fragments (each followed by a space) plus pause markers,
    # then join once at the end.
    fragments = []
    for position, word in enumerate(text.split(), start=1):
        fragments.append(word + " ")
        if position % 3 == 0:
            fragments.append(", ")

    return ". . . {} . . .".format("".join(fragments))

# Function to load reference audio from URL
def load_audio_from_url(url):
    """Download *url* and decode it as audio.

    Returns:
        (sample_rate, audio_data) on success, or (None, None) when the
        download or decode fails. Callers depend on the (None, None)
        sentinel, so failures are reported but never raised.
    """
    try:
        # Always pass a timeout: without one a dead host would hang
        # app start-up indefinitely.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            audio_data, sample_rate = sf.read(io.BytesIO(response.content))
            return sample_rate, audio_data
    except (requests.RequestException, RuntimeError, ValueError) as exc:
        # RuntimeError covers soundfile's decode errors; ValueError covers
        # malformed payloads.
        print(f"Failed to load reference audio from {url}: {exc}")
    return None, None

@spaces.GPU
def synthesize_speech(text, ref_audio, ref_text):
    """Generate speech for *text* in the voice of the reference clip.

    Args:
        text: Target text to synthesize.
        ref_audio: (sample_rate, np.ndarray) tuple from gr.Audio(type="numpy").
        ref_text: Exact transcript of the reference audio.

    Returns:
        (path, path): generated WAV file path, duplicated so it can feed
        both the audio player and the download component.

    Raises:
        gr.Error: on missing or malformed inputs.
    """
    # 1. Basic validation (guard against None as well as empty strings,
    #    so a missing field raises a friendly error instead of crashing).
    if ref_audio is None:
        raise gr.Error("Please upload a Reference Audio file.")
    if not ref_text or ref_text.strip() == "":
        raise gr.Error("Please enter the text transcript for the Reference Audio.")
    if not text or text.strip() == "":
        raise gr.Error("Please enter the text you want to generate.")

    # 2. Reference audio must be the (rate, samples) tuple the numpy-typed
    #    Gradio Audio widget emits.
    if isinstance(ref_audio, tuple) and len(ref_audio) == 2:
        sample_rate, audio_data = ref_audio
    else:
        raise gr.Error("Invalid reference audio input.")

    # Persist the reference audio to disk because the model API takes a path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
        temp_audio.flush()

    try:
        # 3. Apply text pacing (the "skipping" fix): commas force pauses.
        safe_text = slow_down_text(text)

        # 4. Generate audio from the paced text.
        audio = model(safe_text, ref_audio_path=temp_audio.name, ref_text=ref_text)
    finally:
        # delete=False above means nobody else will clean this file up;
        # without this the app leaks one temp WAV per request.
        try:
            os.remove(temp_audio.name)
        except OSError:
            pass

    # 5. Normalize int16 output to float32 in [-1, 1).
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0

    # 6. Save output under a unique name (the "download" fix) so the UI can
    #    offer a download link without filename collisions between requests.
    #    NOTE(review): 24000 Hz is assumed to be the model's output rate —
    #    confirm against the IndicF5 model card.
    output_filename = f"generated_{uuid.uuid4().hex}.wav"
    output_path = os.path.join(tempfile.gettempdir(), output_filename)

    sf.write(output_path, audio, 24000)

    # Same path twice: once for the player, once for the download button.
    return output_path, output_path


# Load TTS model
# IndicF5 is fetched from the Hugging Face hub (custom modeling code, hence
# trust_remote_code) and then moved onto the best available device.
repo_id = "ai4bharat/IndicF5"
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Device", device)
model = model.to(device)

# ---------- PRE-FETCH EXAMPLES ----------
# Each entry pairs a reference voice clip (URL + its exact transcript) with a
# target text to synthesize in that voice. "sample_rate" and "audio_data"
# keys are added at start-up by the preload loop below.
# NOTE(review): both synth_text values appear to be Odia even though the
# reference voices are Punjabi/Tamil — looks intentional for an Odia dubbing
# demo, but confirm. The Punjabi ref_text also seems to contain stray
# non-Gurmukhi characters — verify against the actual audio.
EXAMPLES = [
    {
        "audio_name": "PAN_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
        "ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮిసਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰਦ ਮਹਿਸੂਸ ਹੋਇਆ।",
        "synth_text": "ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି, କେମିତି ଅଛନ୍ତି?"
    },
    {
        "audio_name": "TAM_F (Happy)",
        "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav",
        "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
        "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
    },
]

# Preload all example audios so gr.Examples can serve them instantly.
# Entries whose download/decode fails are dropped: keeping a (None, None)
# audio tuple would break the Gradio examples widget at render time.
_loaded_examples = []
for example in EXAMPLES:
    sample_rate, audio_data = load_audio_from_url(example["audio_url"])
    if audio_data is None:
        print(f"Skipping example {example['audio_name']}: audio could not be loaded")
        continue
    example["sample_rate"] = sample_rate
    example["audio_data"] = audio_data
    _loaded_examples.append(example)
EXAMPLES = _loaded_examples


# Define Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # **IndicF5 Dubbing Studio**
        **Instructions for Best Results:**
        1. **Reference Audio:** Use a clear, 10-15 second clip. Slower speech works better.
        2. **Reference Text:** Must match the audio exactly.
        3. **Target Text:** Odia works best with punctuation. If it skips words, add commas.
        """
    )

    with gr.Row():
        with gr.Column():
            tts_text = gr.Textbox(label="Text to Synthesize (Odia/English)", placeholder="Enter text here...", lines=3)
            voice_clip = gr.Audio(type="numpy", label="Reference Voice (10-15s ideal)")
            voice_transcript = gr.Textbox(label="Transcript of Reference Audio", placeholder="What did the voice say?", lines=2)
            generate_btn = gr.Button("🎤 Generate Speech", variant="primary")

        with gr.Column():
            player = gr.Audio(label="Play Generated Speech", type="filepath")
            # Dedicated download widget alongside the in-page player.
            downloader = gr.File(label="Download Audio File", file_count="single")

    # Quick-start rows assembled from the pre-fetched example clips.
    example_rows = []
    for ex in EXAMPLES:
        example_rows.append([ex["synth_text"], (ex["sample_rate"], ex["audio_data"]), ex["ref_text"]])

    gr.Examples(
        examples=example_rows,
        inputs=[tts_text, voice_clip, voice_transcript],
        label="Quick Examples"
    )

    # A single click feeds both the audio player and the file downloader.
    generate_btn.click(
        synthesize_speech,
        inputs=[tts_text, voice_clip, voice_transcript],
        outputs=[player, downloader]
    )

demo.launch(share=True)