ashishkblink committed on
Commit
dcc8cc6
·
verified ·
1 Parent(s): 285f6b8

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +259 -255
app.py CHANGED
@@ -1,145 +1,198 @@
1
  """
2
- Vakya TTS - Hugging Face Space Playground
3
- India's No. 1 TTS Model for Hindi and Other Indian Languages
4
  """
5
 
6
- import gradio as gr
7
- from TTS.api import TTS
8
  import os
 
9
  import tempfile
 
 
 
 
 
10
  from pathlib import Path
11
 
12
- # Initialize the TTS model
13
- MODEL_NAME = "ashishkblink/vakya"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- print("🚀 Loading Vakya TTS model...")
16
- print(f"📦 Model: {MODEL_NAME}")
17
- tts = None
18
 
19
- # TTS 0.22.0 expects model names in format: "model_type/language/dataset/model" (4 parts)
20
- # Custom HuggingFace models use format: "username/modelname" (2 parts)
21
- # This is a known limitation - we'll try multiple approaches
22
 
23
- try:
24
- # Method 1: Try direct loading
25
- print("📦 Attempting Method 1: Direct model loading...")
26
- tts = TTS(model_name=MODEL_NAME)
27
- print("✅ Model loaded successfully with Method 1!")
28
- except Exception as e1:
29
- error_msg1 = str(e1)
30
- print(f"⚠️ Method 1 failed: {error_msg1}")
31
-
32
- # Check if it's the unpacking error
33
- if "not enough values to unpack" in error_msg1 or "expected 4" in error_msg1:
34
- print("\n" + "="*70)
35
- print("⚠️ DETECTED: Model name format issue")
36
- print("="*70)
37
- print(f"The TTS library expects model names in format:")
38
- print(f" 'model_type/language/dataset/model' (4 parts)")
39
- print(f"But your model is: '{MODEL_NAME}' (2 parts)")
40
- print("\nThis suggests TTS 0.22.0 may not support custom HuggingFace")
41
- print("model repositories in this format.")
42
- print("="*70 + "\n")
43
 
44
- # Method 2: Try with explicit model type (won't work but shows we tried)
45
- try:
46
- print("📦 Attempting Method 2: With explicit model type...")
47
- tts = TTS(model_name=MODEL_NAME, model_type="tts_models/multilingual/multi-dataset/xtts_v2")
48
- print("✅ Model loaded successfully with Method 2!")
49
- except Exception as e2:
50
- print(f"⚠️ Method 2 failed: {e2}")
51
 
52
- # Method 3: Try base XTTS-v2 (to verify TTS works)
53
  try:
54
- print("\n📦 Attempting Method 3: Base XTTS-v2 model (for testing)...")
55
- tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
56
- print("✅ Base XTTS-v2 model loaded successfully!")
57
- print("⚠️ NOTE: Using base XTTS-v2 instead of custom Vakya model")
58
- except Exception as e3:
59
- print(f"❌ Method 3 also failed: {e3}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  import traceback
61
  traceback.print_exc()
62
- tts = None
63
-
64
- print("\n" + "="*70)
65
- print("❌ ERROR: Could not load any TTS model")
66
- print("="*70)
67
- print("\nPossible solutions:")
68
- print("1. Check if the model repository structure on HuggingFace")
69
- print(" matches what TTS library expects")
70
- print("2. The model may need to be in a different format")
71
- print("3. TTS 0.22.0 may not support custom HuggingFace models")
72
- print(" in 'username/modelname' format")
73
- print("\nThe app will continue but TTS functionality will be disabled.")
74
- print("="*70 + "\n")
75
-
76
- # Supported languages for Indian languages
77
- INDIAN_LANGUAGES = {
78
- "Hindi": "hi",
79
- "English": "en",
80
- "Marathi": "mr",
81
- "Telugu": "te",
82
- "Tamil": "ta",
83
- "Kannada": "kn",
84
- "Gujarati": "gu",
85
- "Punjabi": "pa",
86
- "Bengali": "bn",
87
- "Urdu": "ur",
88
- }
89
-
90
- # Example texts for each language
91
- EXAMPLE_TEXTS = {
92
- "hi": "नमस्ते, यह वाक्य TTS मॉडल है। यह भारत का नंबर एक टेक्स्ट-टू-स्पीच मॉडल है।",
93
- "en": "Hello, this is the Vakya TTS model. It is India's number one text-to-speech model.",
94
- "mr": "नमस्कार, हे वाक्य TTS मॉडेल आहे. हे भारतातील नंबर वन टेक्स्ट-टू-स्पीच मॉडेल आहे.",
95
- "te": "నమస్కారం, ఇది వాక్య TTS మోడల్. ఇది భారతదేశంలోని నంబర్ వన్ టెక్స్ట్-టు-స్పీచ్ మోడల్.",
96
- "ta": "வணக்கம், இது வாக்கிய TTS மாதிரி. இது இந்தியாவின் நம்பர் ஒன் டெக்ஸ்ட்-டு-ஸ்பீச் மாதிரி.",
97
- "kn": "ನಮಸ್ಕಾರ, ಇದು ವಾಕ್ಯ TTS ಮಾದರಿ. ಇದು ಭಾರತದ ನಂಬರ್ ವನ್ ಟೆಕ್ಸ್ಟ್-ಟು-ಸ್ಪೀಚ್ ಮಾದರಿ.",
98
- "gu": "નમસ્તે, આ વાક્ય TTS મોડલ છે. આ ભારતનું નંબર વન ટેક્સ્ટ-ટુ-સ્પીચ મોડલ છે.",
99
- "pa": "ਸਤ ਸ੍ਰੀ ਅਕਾਲ, ਇਹ ਵਾਕ TTS ਮਾਡਲ ਹੈ। ਇਹ ਭਾਰਤ ਦਾ ਨੰਬਰ ਵਨ ਟੈਕਸਟ-ਟੂ-ਸਪੀਚ ਮਾਡਲ ਹੈ।",
100
- "bn": "নমস্কার, এটি বাক্য TTS মডেল। এটি ভারতের নম্বর ওয়ান টেক্সট-টু-স্পিচ মডেল।",
101
- "ur": "السلام علیکم، یہ واکیہ TTS ماڈل ہے۔ یہ بھارت کا نمبر ایک ٹیکسٹ-ٹو-اسپیچ ماڈل ہے۔",
102
- }
103
 
104
- def synthesize_speech(text, language, speaker_audio):
105
- """
106
- Synthesize speech from text using Vakya TTS model
107
- """
108
- if tts is None:
109
- return None, "❌ Model not loaded. Please check the logs."
110
 
111
- if not text or not text.strip():
112
- return None, "⚠️ Please enter some text to synthesize."
113
 
114
- # Get language code
115
- lang_code = INDIAN_LANGUAGES.get(language, "hi")
116
 
117
- # Create temporary file for output
118
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
119
- output_path = tmp_file.name
120
 
121
  try:
122
- # XTTS requires a speaker_wav for voice cloning
123
- # If speaker audio is provided, use it
124
- if speaker_audio is not None:
125
- speaker_wav = speaker_audio
126
- else:
127
- # Try to use a default sample from the model
128
- # XTTS can work without explicit speaker_wav if using TTS.api
129
- # Let's use a simple approach - try with a minimal default
130
- speaker_wav = None
 
 
 
 
131
 
132
- # Synthesize speech using TTS API
133
- # The TTS.api handles the speaker_wav internally if not provided
134
- tts.tts_to_file(
135
- text=text,
136
- speaker_wav=speaker_wav if speaker_wav else None,
137
- language=lang_code,
138
- file_path=output_path
139
  )
140
 
141
- return output_path, "✅ Speech generated successfully! 🎉"
142
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  except Exception as e:
144
  error_msg = f"❌ Error generating speech: {str(e)}"
145
  print(error_msg)
@@ -147,165 +200,116 @@ def synthesize_speech(text, language, speaker_audio):
147
  traceback.print_exc()
148
  return None, error_msg
149
 
150
- # Custom CSS for better styling
151
- css = """
152
- .gradio-container {
153
- font-family: 'Inter', sans-serif;
154
- }
155
- .header {
156
- text-align: center;
157
- padding: 20px;
158
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
159
- color: white;
160
- border-radius: 10px;
161
- margin-bottom: 20px;
162
- }
163
- .header h1 {
164
- margin: 0;
165
- font-size: 2.5em;
166
- }
167
- .header p {
168
- margin: 10px 0 0 0;
169
- font-size: 1.2em;
170
- opacity: 0.9;
171
- }
172
- """
173
-
174
  # Create Gradio interface
175
- with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
176
- gr.HTML("""
177
- <div class="header">
178
- <h1>🎤 Vakya TTS</h1>
179
- <p>India's No. 1 TTS Model for Hindi and Other Indian Languages</p>
180
- </div>
181
- """)
182
-
183
  gr.Markdown("""
184
- ### Welcome to Vakya TTS Playground! 🚀
185
 
186
- **Test the power of India's premier Text-to-Speech model:**
 
187
 
188
- - 🎯 **High-quality Hindi TTS** - Optimized for Hindi pronunciation
189
- - 🌍 **Multi-Indian Language Support** - Supports 10+ Indian languages
190
- - 🎭 **Voice Cloning** - Clone voices from just 6 seconds of audio
191
- - **Real-time Synthesis** - Fast and efficient speech generation
 
 
 
192
 
193
- **How to use:**
194
- 1. Enter your text in the text box
195
- 2. Select the language (Hindi, English, Marathi, Telugu, Tamil, etc.)
196
- 3. (Optional) Upload a speaker reference audio file for voice cloning
197
- 4. Click "Generate Speech" and enjoy! 🎉
198
  """)
199
 
200
  with gr.Row():
201
- with gr.Column(scale=1):
202
- text_input = gr.Textbox(
203
- label="📝 Enter Text",
204
- placeholder="Type your text here... (e.g., नमस्ते, यह वाक्य TTS मॉडल है)",
205
- lines=5,
206
- value=EXAMPLE_TEXTS["hi"]
207
- )
208
-
209
- language_dropdown = gr.Dropdown(
210
- label="🌍 Select Language",
211
- choices=list(INDIAN_LANGUAGES.keys()),
212
- value="Hindi"
213
- )
214
-
215
- speaker_audio = gr.Audio(
216
- label="🎤 Speaker Reference Audio (Optional)",
217
- type="filepath"
218
  )
219
- gr.Markdown("*Upload a 6+ second audio file to clone the voice. Leave empty for default voice.*")
220
-
221
- generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
222
-
223
- status_text = gr.Textbox(
224
- label="Status",
225
- interactive=False,
226
- value="Ready to generate speech!"
227
  )
228
 
229
- with gr.Column(scale=1):
230
- output_audio = gr.Audio(
231
- label="🔊 Generated Speech",
232
- type="filepath"
 
 
233
  )
234
 
235
- gr.Markdown("""
236
- ### 💡 Tips:
237
- - For best results in Hindi, use Devanagari script (नमस्ते)
238
- - Speaker audio should be clear and at least 6 seconds long
239
- - You can download the generated audio by clicking the download button
240
- """)
 
 
 
 
 
 
 
 
241
 
242
- # Examples section
243
- gr.Markdown("### 📚 Example Texts (Click to use)")
244
-
245
- def make_example_loader(example_text, lang_name):
246
- """Create a function to load example text and language"""
247
- def load_example():
248
- return example_text, lang_name
249
- return load_example
250
 
251
  with gr.Row():
252
- for lang_name, lang_code in list(INDIAN_LANGUAGES.items())[:5]:
253
- example_text = EXAMPLE_TEXTS.get(lang_code, "")
254
- example_btn = gr.Button(
255
- f"{lang_name} Example",
256
- size="sm"
257
- )
258
- example_btn.click(
259
- fn=make_example_loader(example_text, lang_name),
260
- outputs=[text_input, language_dropdown],
261
- api_name=f"load_example_{lang_name.lower().replace(' ', '_')}"
262
- )
263
 
264
- with gr.Row():
265
- for lang_name, lang_code in list(INDIAN_LANGUAGES.items())[5:]:
266
- example_text = EXAMPLE_TEXTS.get(lang_code, "")
267
- example_btn = gr.Button(
268
- f"{lang_name} Example",
269
- size="sm"
270
- )
271
- example_btn.click(
272
- fn=make_example_loader(example_text, lang_name),
273
- outputs=[text_input, language_dropdown],
274
- api_name=f"load_example_{lang_name.lower().replace(' ', '_')}"
275
- )
276
 
277
- # Footer
278
  gr.Markdown("""
279
  ---
280
- ### 🔗 Links
281
- - **Model Repository**: [ashishkblink/vakya](https://huggingface.co/ashishkblink/vakya)
282
- - **Built with**: [Coqui TTS](https://github.com/coqui-ai/TTS)
 
 
 
283
 
284
- ### 📄 License
285
- Apache 2.0
286
-
287
- *Built with ❤️ for the Indian language community*
288
  """)
289
-
290
- # Connect the generate button
291
- generate_btn.click(
292
- fn=synthesize_speech,
293
- inputs=[text_input, language_dropdown, speaker_audio],
294
- outputs=[output_audio, status_text]
295
- )
296
-
297
- # Auto-load example when language changes
298
- language_dropdown.change(
299
- fn=lambda lang: EXAMPLE_TEXTS.get(INDIAN_LANGUAGES.get(lang, "hi"), ""),
300
- inputs=[language_dropdown],
301
- outputs=[text_input]
302
- )
303
 
304
- # Launch the app
305
  if __name__ == "__main__":
306
- demo.launch(
307
- server_name="0.0.0.0",
308
- server_port=7860,
309
- share=False
310
- )
311
 
 
1
"""
Vakya 2.0 - Text-to-Speech Playground
A Hugging Face Space for testing the Vakya TTS model
"""

import os
import sys
import tempfile
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from huggingface_hub import hf_hub_download, snapshot_download

# Try to import f5_tts - handle different possible locations.
# On a HF Space the package may be vendored next to app.py instead of
# being pip-installed, so fall back to scanning likely local directories.
try:
    from f5_tts.api import F5TTS
    from f5_tts.infer.utils_infer import preprocess_ref_audio_text
except ImportError:
    current_dir = os.path.dirname(__file__)
    possible_paths = [
        os.path.join(current_dir, "vakya_model"),
        os.path.join(current_dir, "f5_tts"),
        os.path.join(current_dir, "..", "vakya_model"),
    ]
    for path in possible_paths:
        if os.path.exists(path):
            sys.path.insert(0, path)
            try:
                from f5_tts.api import F5TTS
                from f5_tts.infer.utils_infer import preprocess_ref_audio_text
                break
            except ImportError:
                # Fix: undo the insertion so candidates that exist on disk
                # but do not contain the package don't pollute sys.path.
                sys.path.remove(path)
                continue
    else:
        # No candidate directory provided the package: fail loudly with an
        # actionable message instead of a NameError at first use.
        raise ImportError(
            "Could not import f5_tts. Please ensure the model code is available. "
            "You may need to include the f5_tts directory in your Space or install it as a package."
        )
42
 
43
# Model configuration
MODEL_REPO_ID = "ashishkblink/vakya2.0"  # HF repo the checkpoint/vocab are pulled from
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # prefer GPU when the Space has one

# Global model instance, populated lazily by load_model()
tts_model = None
# NOTE(review): vocoder is declared but never assigned or read in this file —
# presumably reserved for a future separate-vocoder path; confirm before removing.
vocoder = None
50
 
51
def load_model():
    """Load the Vakya model from Hugging Face (idempotent).

    Downloads the model repository snapshot, locates the checkpoint
    (*.safetensors / *.pt) and vocab.txt inside it, and initializes the
    global F5TTS instance. Safe to call repeatedly from the UI button.

    Returns:
        A human-readable status string for display in the Gradio UI.
    """
    global tts_model

    # Already initialized: nothing to do (string kept as in the original UI).
    if tts_model is not None:
        return " Model already loaded!"

    print("Loading Vakya model...")
    print(f"Device: {DEVICE}")

    try:
        # Download model files from Hugging Face
        print("Downloading model files from Hugging Face...")
        model_dir = snapshot_download(
            repo_id=MODEL_REPO_ID,
            cache_dir=None,
            local_files_only=False,
        )

        # Find checkpoint and vocab files anywhere inside the snapshot
        model_dir_path = Path(model_dir)
        ckpt_files = list(model_dir_path.rglob("*.safetensors")) + list(model_dir_path.rglob("*.pt"))
        vocab_files = list(model_dir_path.rglob("vocab.txt"))

        ckpt_file = str(ckpt_files[0]) if ckpt_files else ""
        vocab_file = str(vocab_files[0]) if vocab_files else ""

        print(f"Checkpoint: {ckpt_file}")
        print(f"Vocab: {vocab_file}")

        # If files weren't found in the snapshot, try direct downloads.
        # Fix: the original used bare `except:` here, which also swallows
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        if not ckpt_file:
            print("Trying to download checkpoint from HF...")
            for candidate in ("model.safetensors", "pytorch_model.bin"):
                try:
                    ckpt_file = hf_hub_download(
                        repo_id=MODEL_REPO_ID,
                        filename=candidate,
                        cache_dir=None,
                    )
                    break
                except Exception:
                    continue

        if not vocab_file:
            print("Trying to download vocab from HF...")
            try:
                vocab_file = hf_hub_download(
                    repo_id=MODEL_REPO_ID,
                    filename="vocab.txt",
                    cache_dir=None,
                )
            except Exception:
                # Best effort: F5TTS falls back to its bundled defaults below.
                pass

        # Initialize F5TTS; empty paths make F5TTS use its default weights/vocab.
        tts_model = F5TTS(
            model_type="F5-TTS",
            ckpt_file=ckpt_file if ckpt_file else "",
            vocab_file=vocab_file if vocab_file else "",
            device=DEVICE,
            vocoder_name="vocos",
        )

        print("✅ Model loaded successfully!")
        return "✅ Model loaded successfully!"

    except Exception as e:
        error_msg = f"❌ Error loading model: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return error_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
def generate_speech(ref_audio, ref_text, gen_text, speed, remove_silence):
    """Generate speech from text, cloning the voice of a reference clip.

    Args:
        ref_audio: Gradio audio input — either ``(sample_rate, np.ndarray)``
            (type="numpy") or a filesystem path string.
        ref_text: Transcript of the reference audio; "" lets
            preprocess_ref_audio_text auto-transcribe it.
        gen_text: Text to synthesize.
        speed: Speed multiplier forwarded to the model.
        remove_silence: Whether the model should strip silences.

    Returns:
        ``((sample_rate, waveform), status_message)`` on success, or
        ``(None, error_message)`` on any validation/inference failure.
    """
    global tts_model

    if tts_model is None:
        return None, "⚠️ Please load the model first by clicking 'Load Model' button."

    if ref_audio is None:
        return None, "⚠️ Please upload a reference audio file."

    if not gen_text or not gen_text.strip():
        return None, "⚠️ Please enter text to generate."

    tmp_ref_path = None  # temp wav we created ourselves; removed in finally
    try:
        # Normalize the Gradio audio input into a file path on disk.
        # Fix: the original always created a NamedTemporaryFile(delete=False)
        # — even for path inputs — and never deleted it (one leaked file per
        # request). Now the temp file is created only when needed and cleaned up.
        if isinstance(ref_audio, tuple):
            # Gradio audio format: (sample_rate, audio_data)
            ref_sr, audio_data = ref_audio
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_ref:
                tmp_ref_path = tmp_ref.name
            sf.write(tmp_ref_path, audio_data, ref_sr)
            ref_audio_path = tmp_ref_path
        elif isinstance(ref_audio, str):
            # Already a file path
            ref_audio_path = ref_audio
        else:
            return None, "⚠️ Invalid audio format."

        # Preprocess reference audio and text (auto-transcribes empty ref_text)
        ref_audio_processed, ref_text_processed = preprocess_ref_audio_text(
            ref_audio_path,
            ref_text if ref_text else "",
            device=DEVICE
        )

        # Generate speech
        print(f"Generating speech for: {gen_text[:50]}...")

        wav, sr, spect = tts_model.infer(
            ref_file=ref_audio_processed,
            ref_text=ref_text_processed,
            gen_text=gen_text,
            speed=speed,
            remove_silence=remove_silence,
            show_info=print,
            progress=None
        )

        # Normalize the output to a 1-D float array in [-1, 1] for Gradio.
        if isinstance(wav, torch.Tensor):
            wav = wav.cpu().numpy()
        if len(wav.shape) > 1:
            wav = wav.squeeze()
        if wav.dtype == np.int16:
            wav = wav.astype(np.float32) / 32768.0
        elif wav.max() > 1.0:
            wav = wav / np.abs(wav).max()

        # Return audio in Gradio format: (sample_rate, audio_data)
        return (sr, wav), f"✅ Generated {len(wav)/sr:.2f} seconds of audio"

    except Exception as e:
        error_msg = f"❌ Error generating speech: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return None, error_msg
    finally:
        # Best-effort cleanup of the temp reference file we created.
        if tmp_ref_path is not None:
            try:
                os.remove(tmp_ref_path)
            except OSError:
                pass
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
# Create Gradio interface
with gr.Blocks(title="Vakya 2.0 - Text-to-Speech", theme=gr.themes.Soft()) as app:
    # --- Intro / usage instructions ---
    gr.Markdown("""
    # 🎙️ Vakya 2.0 - Text-to-Speech Playground

    **Vakya** is a high-quality Text-to-Speech model supporting 11 Indian languages:
    Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu

    ### How to use:
    1. Click **"Load Model"** to load the Vakya model (first time may take a few minutes)
    2. Upload a **reference audio** file (WAV format recommended, <15 seconds for best results)
    3. Enter the **reference text** (what is spoken in the reference audio) - optional, will auto-transcribe if left blank
    4. Enter the **text to generate** (in any of the 11 supported languages)
    5. Adjust settings if needed
    6. Click **"Generate Speech"** to synthesize audio

    ### Tips:
    - Keep reference audio clips short (<15 seconds) for best results
    - Reference text helps the model understand the voice characteristics better
    - The model will automatically transcribe reference audio if text is not provided
    """)

    # --- Model loading controls ---
    with gr.Row():
        with gr.Column():
            load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
            model_status = gr.Textbox(
                label="Model Status",
                value="⏳ Model not loaded",
                interactive=False,
            )

            load_btn.click(fn=load_model, outputs=model_status)

    # --- Inputs: reference voice (left) and generation text (right) ---
    with gr.Row():
        with gr.Column():
            ref_audio_input = gr.Audio(
                label="Reference Audio",
                type="numpy",
                sources=["upload", "microphone"],
                format="wav",
            )
            ref_text_input = gr.Textbox(
                label="Reference Text (Optional)",
                placeholder="Enter the text spoken in the reference audio. Leave blank for auto-transcription.",
                lines=3,
                info="This helps the model understand voice characteristics. Auto-transcription available if left blank.",
            )

        with gr.Column():
            gen_text_input = gr.Textbox(
                label="Text to Generate",
                placeholder="Enter the text you want to synthesize in any supported Indian language...",
                lines=5,
                info="Supports: Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Odia, Punjabi, Tamil, Telugu",
            )

    # --- Advanced synthesis settings ---
    with gr.Accordion("⚙️ Advanced Settings", open=False):
        speed_slider = gr.Slider(
            label="Speed",
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
            info="Adjust the speed of generated speech",
        )
        remove_silence = gr.Checkbox(
            label="Remove Silences",
            value=False,
            info="Remove silences from generated audio (experimental)",
        )

    generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

    # --- Outputs ---
    with gr.Row():
        audio_output = gr.Audio(
            label="Generated Audio",
            type="numpy",
            autoplay=True,
        )
        status_output = gr.Textbox(
            label="Status",
            interactive=False,
        )

    generate_btn.click(
        fn=generate_speech,
        inputs=[
            ref_audio_input,
            ref_text_input,
            gen_text_input,
            speed_slider,
            remove_silence,
        ],
        outputs=[audio_output, status_output],
    )

    # --- Footer: model info and terms ---
    gr.Markdown("""
    ---
    ### 📚 Model Information
    - **Model**: Vakya 2.0
    - **Repository**: [ashishkblink/vakya2.0](https://huggingface.co/ashishkblink/vakya2.0)
    - **Based on**: [IndicF5](https://github.com/AI4Bharat/IndicF5) by AI4Bharat (IIT Madras)
    - **License**: MIT License
    - **Sample Rate**: 24000 Hz

    ### ⚠️ Terms of Use
    - You must have explicit permission to clone voices
    - Unauthorized voice cloning is strictly prohibited
    - Any misuse of this model is the responsibility of the user
    """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
 
 
313
if __name__ == "__main__":
    # queue() enables request queuing so concurrent Space users don't
    # collide on the single loaded model; no public share link.
    app.queue().launch(share=False)
 
 
 
 
315