hari7261 commited on
Commit
c1d57b7
·
verified ·
1 Parent(s): a3d1b01

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +523 -656
app.py CHANGED
@@ -1,656 +1,523 @@
1
- import gradio as gr
2
- import google.generativeai as genai
3
- from gtts import gTTS
4
- import pyttsx3
5
- from pathlib import Path
6
- import tempfile
7
- import os
8
- from uuid import uuid4
9
- import time
10
- import asyncio
11
- import edge_tts
12
- import numpy as np
13
- import soundfile as sf
14
- import re
15
-
16
- # Voice configurations for different speakers
17
- VOICE_CONFIGS = {
18
- "2_speakers": [
19
- {"name": "Alex", "voice": "en-US-AriaNeural", "gender": "female"},
20
- {"name": "Brian", "voice": "en-US-GuyNeural", "gender": "male"}
21
- ],
22
- "3_speakers": [
23
- {"name": "Sarah", "voice": "en-US-JennyNeural", "gender": "female"},
24
- {"name": "Mike", "voice": "en-US-BrandonNeural", "gender": "male"},
25
- {"name": "Emma", "voice": "en-US-AriaNeural", "gender": "female"}
26
- ],
27
- "4_speakers": [
28
- {"name": "Sarah", "voice": "en-US-JennyNeural", "gender": "female"},
29
- {"name": "Mike", "voice": "en-US-BrandonNeural", "gender": "male"},
30
- {"name": "Emma", "voice": "en-US-AriaNeural", "gender": "female"},
31
- {"name": "David", "voice": "en-US-GuyNeural", "gender": "male"}
32
- ]
33
- }
34
-
35
- # Initialize Gemini client
36
- client = None
37
-
38
def init_gemini(api_key):
    """Configure the module-level Gemini client from an API key.

    Args:
        api_key: Google Gemini API key; falsy values leave the client unset.

    Returns:
        A human-readable status string for display in the UI.
    """
    global client
    if api_key:
        try:
            genai.configure(api_key=api_key)
            client = genai.GenerativeModel('gemini-2.0-flash')
            return "✅ Gemini API connected successfully!"
        except Exception as e:
            # Bug fix: the error status previously lacked the ❌ marker that
            # every other failure message in this app carries.
            return f"❌ Gemini API error: {str(e)}"
    return "ℹ️ Add Gemini API key for better summaries"
49
-
50
def generate_with_gtts(text, filename):
    """Synthesize *text* to *filename* with Google's gTTS.

    Returns (filename, None) on success or (None, error_message) on failure.
    """
    try:
        speech = gTTS(text=text, lang='en', slow=False)
        speech.save(filename)
    except Exception as exc:
        return None, f"gTTS Error: {str(exc)}"
    return filename, None
58
-
59
async def generate_with_edge_tts(text, voice, filename):
    """Synthesize *text* to *filename* using the given Edge TTS voice.

    Returns (filename, None) on success or (None, error_message) on failure.
    """
    try:
        await edge_tts.Communicate(text, voice).save(filename)
    except Exception as exc:
        return None, f"Edge TTS Error: {str(exc)}"
    return filename, None
67
-
68
def combine_audio_files(audio_files, output_filename):
    """Concatenate several audio files with a short pause between them.

    Args:
        audio_files: paths to files readable by soundfile; missing paths are
            silently skipped. All clips are resampled to the first clip's rate.
        output_filename: destination path for the combined audio.

    Returns:
        (output_filename, None) on success, or (None, error_message) on failure.
    """
    try:
        from scipy.signal import resample
        segments = []
        sample_rate = None

        for audio_file in audio_files:
            if not os.path.exists(audio_file):
                continue
            data, sr = sf.read(audio_file)
            if sample_rate is None:
                sample_rate = sr
            elif sr != sample_rate:
                # Resample to the first clip's rate so concatenation is valid.
                data = resample(data, int(len(data) * sample_rate / sr))

            if segments:
                # 0.5 s of silence between speakers. Bug fix: the pause was
                # previously also appended after the final segment, leaving
                # trailing silence at the end of every podcast.
                segments.append(np.zeros(int(sample_rate * 0.5)))
            segments.append(data)

        if not segments:
            return None, "No audio files to combine"
        sf.write(output_filename, np.concatenate(segments), sample_rate)
        return output_filename, None
    except Exception as e:
        return None, f"Audio combination error: {str(e)}"
97
-
98
async def generate_multi_speaker_audio(script_parts, speaker_count, output_filename):
    """Synthesize each script part with its speaker's Edge TTS voice and merge.

    Args:
        script_parts: list of (text, speaker_index) tuples.
        speaker_count: number of speakers (2-4); selects the voice set.
        output_filename: path for the merged audio.

    Returns:
        (path, None) on success, or (None, error_message) on failure.
    """
    def _cleanup(paths):
        # Best-effort removal of per-speaker temp clips (narrowed from the
        # previous bare excepts; only filesystem errors are ignored).
        for path in paths:
            try:
                os.unlink(path)
            except OSError:
                pass

    try:
        voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
        audio_files = []

        for i, (speaker_text, speaker_idx) in enumerate(script_parts):
            voice = voice_config[speaker_idx]["voice"]
            temp_filename = f"temp_speaker_{i}_{uuid4().hex[:8]}.wav"

            result, error = await generate_with_edge_tts(speaker_text, voice, temp_filename)
            if not result:
                _cleanup(audio_files)
                return None, f"Error generating voice {i+1}: {error}"
            audio_files.append(temp_filename)

        # Merge clips, then remove the per-speaker temporaries either way.
        final_file, error = combine_audio_files(audio_files, output_filename)
        _cleanup(audio_files)
        return final_file, error
    except Exception as e:
        return None, f"Multi-speaker generation error: {str(e)}"
133
-
134
def generate_with_pyttsx3(text, filename):
    """Synthesize *text* to *filename* with the local system TTS engine.

    Returns (filename, None) on success or (None, error_message) on failure.
    """
    try:
        engine = pyttsx3.init()

        # Tune speech rate and volume for clearer narration.
        engine.setProperty('rate', 180)
        engine.setProperty('volume', 0.9)

        # Prefer a female-sounding system voice when one is installed.
        for candidate in engine.getProperty('voices'):
            lowered = candidate.name.lower()
            if 'female' in lowered or 'zira' in lowered:
                engine.setProperty('voice', candidate.id)
                break

        engine.save_to_file(text, filename)
        engine.runAndWait()
        return filename, None
    except Exception as exc:
        return None, f"pyttsx3 Error: {str(exc)}"
155
-
156
def generate_podcast_script(text, speaker_count, use_gemini):
    """Generate a podcast script with multiple speakers.

    Args:
        text: source article/blog text.
        speaker_count: 2-4 selects a named speaker set; used in the prompt.
        use_gemini: when True and the Gemini client is configured, ask the
            model for a conversational script.

    Returns:
        The generated conversation, or the (truncated) original text when
        Gemini is disabled, unconfigured, or fails.
    """
    if use_gemini and client:
        try:
            voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
            speaker_names = [config["name"] for config in voice_config]

            prompt = f"""Create an engaging podcast conversation between {speaker_count} hosts: {', '.join(speaker_names)}.

Transform this text into a natural conversation where each speaker contributes meaningfully.

Guidelines:
- Make it sound like a real podcast discussion
- Each speaker should have distinct perspectives and speaking styles
- Include natural transitions and interactions
- Keep it under 2500 characters total
- Use speaker names clearly (e.g., "Sarah: Hello everyone...")
- Make it conversational and engaging

Original text: {text[:3000]}

Format the output with clear speaker labels like:
Speaker1: [text]
Speaker2: [text]
etc."""

            response = client.generate_content(prompt)
            return response.text
        except Exception:
            # Bug fix: the error text itself used to be returned as the
            # "script" and was then read aloud by the TTS engine. Fall
            # through to the plain-text fallback instead.
            pass

    # Fallback: truncated original text (single narration).
    return text[:2000] + ("..." if len(text) > 2000 else "")
190
-
191
def parse_script_for_speakers(script, speaker_count):
    """Split a script into (text, speaker_index) parts.

    Lines beginning with "<Name>:" switch the active speaker; when no labels
    are found the text is divided evenly among the speakers by sentence.
    Any unexpected failure degrades to a single-speaker part.
    """
    try:
        names = [cfg["name"] for cfg in VOICE_CONFIGS[f"{speaker_count}_speakers"]]

        parts = []
        active = 0
        buffer = ""

        # Pass 1: honour explicit "Name:" labels at line starts.
        for raw_line in script.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                continue

            matched = None
            for idx, name in enumerate(names):
                if stripped.lower().startswith(f"{name.lower()}:"):
                    matched = (idx, stripped[len(name) + 1:].strip())
                    break

            if matched is None:
                buffer += " " + stripped
            else:
                if buffer.strip():
                    parts.append((buffer.strip(), active))
                active, buffer = matched

        if buffer.strip():
            parts.append((buffer.strip(), active))

        # Pass 2: no labels at all -> deal sentences out evenly.
        if not parts and script.strip():
            sentences = script.split('. ')
            per_speaker = max(1, len(sentences) // speaker_count)

            for idx in range(speaker_count):
                begin = idx * per_speaker
                # The last speaker takes whatever remains.
                chunk = sentences[begin:] if idx == speaker_count - 1 else sentences[begin:begin + per_speaker]
                if chunk:
                    chunk_text = '. '.join(chunk)
                    if not chunk_text.endswith('.'):
                        chunk_text += '.'
                    parts.append((chunk_text, idx))

        return parts
    except Exception:
        # Fallback: treat the whole script as one speaker.
        return [(script, 0)]
250
-
251
def create_podcast(text, use_gemini, tts_engine, speaker_count, progress=gr.Progress()):
    """Create podcast audio from raw text with the selected voice engine.

    Args:
        text: source article/blog text.
        use_gemini: enable AI script generation (needs a configured client).
        tts_engine: one of the UI engine labels (Edge TTS / gTTS / pyttsx3).
        speaker_count: 1-4 voices.
        progress: Gradio progress reporter.

    Returns:
        (audio_bytes, status_message, script) — audio_bytes is None on failure.
    """
    progress(0.1, "Starting processing...")

    if not text.strip():
        return None, "❌ Please enter some text first!", ""

    # Step 1: script generation (Gemini when enabled, raw text otherwise).
    progress(0.3, "Generating podcast script...")
    podcast_script = generate_podcast_script(text, speaker_count, use_gemini)

    progress(0.5, "Parsing script for speakers...")
    script_parts = parse_script_for_speakers(podcast_script, speaker_count)

    progress(0.6, "Generating audio with multiple voices...")

    # Step 2: audio synthesis with the selected engine.
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_filename = tmp_file.name

        if tts_engine == "Multi-Speaker (Edge TTS - Best Quality)" and speaker_count > 1:
            # Edge TTS is async; drive it on a private event loop.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                audio_file, error = loop.run_until_complete(
                    generate_multi_speaker_audio(script_parts, speaker_count, temp_filename)
                )
            finally:
                loop.close()
        elif tts_engine == "gTTS (Online - Single Voice)":
            full_text = " ".join(part[0] for part in script_parts)
            audio_file, error = generate_with_gtts(full_text, temp_filename)
        else:
            # Offline single-voice fallback.
            full_text = " ".join(part[0] for part in script_parts)
            audio_file, error = generate_with_pyttsx3(full_text, temp_filename)

        if error:
            # Bug fix: return the script too so the UI can still display it.
            return None, f"❌ {error}", podcast_script

        progress(0.9, "Finalizing...")

        with open(audio_file, 'rb') as f:
            audio_data = f.read()

        # Best-effort cleanup (narrowed from a bare except).
        try:
            os.unlink(audio_file)
        except OSError:
            pass

        progress(1.0, "Complete!")
        return audio_data, "✅ Podcast generated successfully!", podcast_script

    except Exception as e:
        return None, f"❌ Audio generation failed: {str(e)}", ""
315
-
316
# Custom CSS for better styling.
# Bug fix: the original sheet declared .status-message/.status-success/
# .status-error/.status-info twice each; the duplicate declaration sets are
# merged below in cascade order so the rendered result is unchanged.
css = """
.gradio-container {
    max-width: 900px !important;
    margin: 0 auto !important;
}
.container {
    padding: 20px;
}
.header {
    text-align: center;
    margin-bottom: 30px;
}
.header h1 {
    color: #2563eb;
    font-size: 2.5em;
    margin-bottom: 10px;
}
.header p {
    color: #6b7280;
    font-size: 1.1em;
}
.section {
    background: white;
    padding: 25px;
    border-radius: 12px;
    margin-bottom: 20px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
}
.section h2 {
    color: #374151;
    margin-bottom: 15px;
    font-size: 1.4em;
}
.input-text {
    min-height: 200px;
    resize: vertical;
}
.output-audio {
    text-align: center;
}
.output-script {
    background: #f8fafc;
    padding: 20px;
    border-radius: 8px;
    border-left: 4px solid #2563eb;
    max-height: 300px;
    overflow-y: auto;
}

.speaker-info {
    background: linear-gradient(135deg, #ffeaa7 0%, #fab1a0 100%);
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
    border: 1px solid #fdcb6e;
    font-weight: bold;
}

/* Status banners (merged from the two duplicate declaration sets). */
.status-message {
    padding: 15px;
    border-radius: 8px;
    font-weight: bold;
    margin: 10px 0;
}
.status-success {
    background: #dcfce7;
    color: #166534;
    border: 1px solid #c3e6cb;
    border-left: 4px solid #22c55e;
}
.status-error {
    background: #fee2e2;
    color: #991b1b;
    border: 1px solid #f5c6cb;
    border-left: 4px solid #ef4444;
}
.status-info {
    background: #dbeafe;
    color: #1e40af;
    border: 1px solid #99d3ff;
    border-left: 4px solid #3b82f6;
}
.instructions {
    background: #f0f9ff;
    padding: 20px;
    border-radius: 8px;
    border-left: 4px solid #0ea5e9;
}
.instructions h3 {
    color: #0369a1;
    margin-bottom: 10px;
}
.btn-generate {
    background: linear-gradient(135deg, #2563eb, #1d4ed8) !important;
    color: white !important;
    font-weight: bold !important;
    padding: 12px 24px !important;
    border-radius: 8px !important;
}
.btn-generate:hover {
    background: linear-gradient(135deg, #1d4ed8, #1e40af) !important;
}
"""
440
-
441
# Create the Gradio interface: layout, helper callbacks, and event wiring.
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    with gr.Column(elem_classes="container"):
        # Page header.
        with gr.Column(elem_classes="header"):
            gr.Markdown("# 🎙️ Blog to Podcast Converter")
            gr.Markdown("Transform your text into engaging podcast audio using AI")

        # Gemini API key entry with a live connection status.
        with gr.Column(elem_classes="section"):
            gr.Markdown("## 🔑 API Configuration")
            api_key = gr.Textbox(
                label="Gemini API Key (Optional)",
                type="password",
                placeholder="Enter your Google Gemini API key for better summaries...",
                info="Get a free key from https://aistudio.google.com/",
            )
            api_status = gr.Textbox(
                label="API Status",
                interactive=False,
                value="ℹ️ Add Gemini API key for AI-powered summaries",
            )
            api_key.change(init_gemini, inputs=api_key, outputs=api_status)

        # Source text input.
        with gr.Column(elem_classes="section"):
            gr.Markdown("## 📝 Input Text")
            input_text = gr.Textbox(
                label="Paste your blog post or article text",
                placeholder="Enter your text here... (2000+ characters works best)",
                lines=8,
                elem_classes="input-text",
            )

        # Generation options.
        with gr.Column(elem_classes="section"):
            gr.Markdown("## ⚙️ Podcast Configuration")

            with gr.Row():
                speaker_count = gr.Radio(
                    label="Number of Speakers",
                    choices=[1, 2, 3, 4],
                    value=2,
                    info="Choose how many voices/speakers for your podcast",
                )

                use_gemini = gr.Checkbox(
                    label="Use AI for better summaries",
                    value=True,
                    info="Requires valid Gemini API key above",
                )

            tts_engine = gr.Radio(
                label="Voice Engine",
                choices=[
                    "Multi-Speaker (Edge TTS - Best Quality)",
                    "gTTS (Online - Single Voice)",
                    "pyttsx3 (Offline - Single Voice)",
                ],
                value="Multi-Speaker (Edge TTS - Best Quality)",
                info="Edge TTS provides realistic multi-speaker conversations",
            )

        # Main action button.
        generate_btn = gr.Button(
            "🎙️ Generate Podcast",
            elem_classes="btn-generate",
            size="lg",
        )

        # Results: status banner, player, download, speaker list, script.
        with gr.Column(elem_classes="section"):
            gr.Markdown("## 🎧 Generated Podcast")

            status_msg = gr.HTML(
                value="<div class='status-message status-info'>Ready to generate podcast...</div>"
            )

            with gr.Row():
                audio_output = gr.Audio(
                    label="Generated Podcast",
                    type="filepath",
                    visible=False,
                )
                download_btn = gr.DownloadButton(
                    "⬇️ Download Podcast",
                    visible=False,
                    variant="secondary",
                )

            speaker_info = gr.HTML(value="", visible=False)

            script_output = gr.Textbox(
                label="Podcast Script",
                visible=False,
                lines=8,
                elem_classes="output-script",
            )

        # Usage help.
        with gr.Column(elem_classes="instructions"):
            gr.Markdown("### ℹ️ How to Use")
            gr.Markdown("""
            1. **Optional**: Enter your Gemini API key for AI-powered conversation generation
            2. **Paste your text** in the input box (articles, blogs, etc.)
            3. **Choose number of speakers** (1-4) for different conversation styles
            4. **Select voice engine**:
               - Multi-Speaker Edge TTS (best quality, realistic voices)
               - gTTS (single voice, good quality)
               - pyttsx3 (offline, system voice)
            5. **Click Generate Podcast** and wait for processing
            6. **Listen and download** your podcast!

            **Speaker Configurations**:
            - **1 Speaker**: Solo narration
            - **2 Speakers**: Host conversation (Alex & Brian)
            - **3 Speakers**: Panel discussion (Sarah, Mike & Emma)
            - **4 Speakers**: Full roundtable (Sarah, Mike, Emma & David)

            **Tips**:
            - For best results, use 500-3000 characters of text
            - Multi-speaker works best with Gemini AI enabled
            - Edge TTS provides the most realistic conversations
            """)

    def get_speaker_info(speaker_count):
        """Build the HTML panel listing the voices for this speaker count."""
        if speaker_count == 1:
            return "<div class='speaker-info'><b>Single Speaker Mode</b><br/>Solo narration with one voice</div>"

        voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
        speakers_html = "<div class='speaker-info'><b>Speakers in this podcast:</b><br/>"
        for i, config in enumerate(voice_config):
            speakers_html += f"🎤 <b>{config['name']}</b> ({config['gender']} voice)<br/>"
        speakers_html += "</div>"
        return speakers_html

    def update_status(message, success=True):
        """Wrap *message* in a styled status <div> for the banner."""
        status_class = "status-success" if success else "status-error"
        if "Ready" in message or "ℹ️" in message:
            status_class = "status-info"
        return f"<div class='status-message {status_class}'>{message}</div>"

    def generate_podcast_wrapper(text, use_gemini, tts_engine, speaker_count, progress=gr.Progress()):
        """Run create_podcast and fan its result out to the five UI outputs."""
        audio_data, message, script = create_podcast(text, use_gemini, tts_engine, speaker_count, progress)

        banner = update_status(message, success=audio_data is not None)
        results = [banner]

        if audio_data:
            # Persist the audio so both the player and the download button
            # can reference the same file path.
            target = os.path.join(
                tempfile.gettempdir(),
                f"podcast_{speaker_count}speakers_{uuid4().hex[:8]}.wav",
            )
            with open(target, 'wb') as f:
                f.write(audio_data)

            results.extend([target, target, get_speaker_info(speaker_count), script])
        else:
            results.extend([None, None, "", script])

        return results

    # Wire the generate button to the pipeline.
    generate_btn.click(
        fn=generate_podcast_wrapper,
        inputs=[input_text, use_gemini, tts_engine, speaker_count],
        outputs=[status_msg, audio_output, download_btn, speaker_info, script_output],
    )

    # Refresh the speaker list whenever the count changes.
    speaker_count.change(
        fn=get_speaker_info,
        inputs=speaker_count,
        outputs=speaker_info,
    )

    def toggle_visibility(audio_data):
        """Reveal the result widgets once audio exists, hide them otherwise."""
        has_audio = audio_data is not None
        return (
            gr.Audio(visible=has_audio),
            gr.DownloadButton(visible=has_audio),
            gr.HTML(visible=has_audio),
            gr.Textbox(visible=has_audio),
        )

    audio_output.change(
        fn=toggle_visibility,
        inputs=audio_output,
        outputs=[audio_output, download_btn, speaker_info, script_output],
    )
648
-
649
# Launch the application only when run as a script (not on import).
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all interfaces (container/Spaces friendly)
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    demo.launch(**launch_options)
 
1
+ import gradio as gr
2
+ import google.generativeai as genai
3
+ from gtts import gTTS
4
+ import pyttsx3
5
+ import tempfile
6
+ import os
7
+ from uuid import uuid4
8
+ import time
9
+ import asyncio
10
+ from pydub import AudioSegment
11
+ try:
12
+ import edge_tts
13
+ EDGE_TTS_AVAILABLE = True
14
+ except ImportError:
15
+ EDGE_TTS_AVAILABLE = False
16
+ print("Edge TTS not available, using fallback options")
17
+
18
+ # Voice configurations for different speakers
19
+ VOICE_CONFIGS = {
20
+ "2_speakers": [
21
+ {"name": "Alex", "voice": "en-US-AriaNeural", "gender": "female"},
22
+ {"name": "Brian", "voice": "en-US-GuyNeural", "gender": "male"}
23
+ ],
24
+ "3_speakers": [
25
+ {"name": "Sarah", "voice": "en-US-JennyNeural", "gender": "female"},
26
+ {"name": "Mike", "voice": "en-US-BrandonNeural", "gender": "male"},
27
+ {"name": "Emma", "voice": "en-US-AriaNeural", "gender": "female"}
28
+ ],
29
+ "4_speakers": [
30
+ {"name": "Sarah", "voice": "en-US-JennyNeural", "gender": "female"},
31
+ {"name": "Mike", "voice": "en-US-BrandonNeural", "gender": "male"},
32
+ {"name": "Emma", "voice": "en-US-AriaNeural", "gender": "female"},
33
+ {"name": "David", "voice": "en-US-GuyNeural", "gender": "male"}
34
+ ]
35
+ }
36
+
37
+ # Initialize Gemini client
38
+ client = None
39
+
40
def init_gemini(api_key):
    """Configure the module-level Gemini client from an API key.

    Args:
        api_key: Google Gemini API key; blank/whitespace keys are ignored.

    Returns:
        A human-readable status string for display in the UI.
    """
    global client
    if api_key and api_key.strip():
        try:
            genai.configure(api_key=api_key)
            client = genai.GenerativeModel('gemini-1.5-flash')
            # Bug fix: the success status was missing its ✅ marker.
            return "✅ Gemini API connected successfully!"
        except Exception as e:
            return f"❌ Gemini API error: {str(e)}"
    return "ℹ️ Add Gemini API key for AI-powered conversations"
51
+
52
def generate_with_gtts(text, filename):
    """Synthesize *text* to *filename* with Google's gTTS.

    Returns (filename, None) on success or (None, error_message) on failure.
    """
    try:
        gTTS(text=text, lang='en', slow=False).save(filename)
        return filename, None
    except Exception as err:
        return None, f"gTTS Error: {str(err)}"
60
+
61
def generate_with_pyttsx3(text, filename):
    """Synthesize *text* to *filename* with the local system TTS engine.

    Returns (filename, None) on success or (None, error_message) on failure.
    """
    try:
        engine = pyttsx3.init()
        engine.setProperty('rate', 180)
        engine.setProperty('volume', 0.9)

        # Pick a female-sounding system voice when one is installed.
        available = engine.getProperty('voices')
        if available:
            chosen = next(
                (v for v in available
                 if 'female' in v.name.lower() or 'zira' in v.name.lower()),
                None,
            )
            if chosen is not None:
                engine.setProperty('voice', chosen.id)

        engine.save_to_file(text, filename)
        engine.runAndWait()
        return filename, None
    except Exception as err:
        return None, f"pyttsx3 Error: {str(err)}"
80
+
81
async def generate_with_edge_tts(text, voice, filename):
    """Synthesize *text* with a specific Edge TTS voice.

    Edge TTS emits MP3, so a ``.wav`` *filename* is rewritten to ``.mp3``;
    the path actually written is returned.

    Returns:
        (mp3_path, None) on success, or (None, error_message) on failure.
    """
    if not EDGE_TTS_AVAILABLE:
        return None, "Edge TTS not available"

    try:
        communicate = edge_tts.Communicate(text, voice)
        # Bug fix: swap only the extension. The previous
        # filename.replace('.wav', '.mp3') rewrote every '.wav' occurrence
        # anywhere in the path (e.g. a directory named 'x.wav').
        root, ext = os.path.splitext(filename)
        mp3_filename = root + '.mp3' if ext == '.wav' else filename
        await communicate.save(mp3_filename)
        return mp3_filename, None
    except Exception as e:
        return None, f"Edge TTS Error: {str(e)}"
94
+
95
def generate_podcast_script(text, speaker_count, use_gemini):
    """Generate a podcast script with multiple speakers.

    Args:
        text: source article/blog text.
        speaker_count: 2-4 selects a named speaker set; used in the prompt.
        use_gemini: when True and the Gemini client is configured, ask the
            model for a conversational script.

    Returns:
        The generated conversation, or the (truncated) original text when
        Gemini is disabled, unconfigured, or fails.
    """
    if use_gemini and client:
        try:
            voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
            speaker_names = [config["name"] for config in voice_config]

            prompt = f"""Create an engaging podcast conversation between {speaker_count} hosts: {', '.join(speaker_names)}.

Transform this text into a natural conversation where each speaker contributes meaningfully.

Guidelines:
- Make it sound like a real podcast discussion
- Each speaker should have distinct perspectives
- Include natural transitions and interactions
- Keep it under 2000 characters total
- Use speaker names clearly (e.g., "Sarah: Hello everyone...")

Original text: {text[:2500]}

Format the output with clear speaker labels like:
{speaker_names[0]}: [text]
{speaker_names[1] if len(speaker_names) > 1 else speaker_names[0]}: [text]
etc."""

            response = client.generate_content(prompt)
            return response.text
        except Exception:
            # Bug fix: the failure message used to be returned as the
            # "script" and was then read aloud by the TTS engine. Fall
            # through to the plain-text fallback instead.
            pass

    # Fallback: simple truncated text with speaker distribution downstream.
    return text[:1500] + ("..." if len(text) > 1500 else "")
127
+
128
def parse_script_for_speakers(script, speaker_count):
    """Parse the script into (text, speaker_index) parts.

    Tries explicit "<Name>:" labels first; when fewer than two labelled parts
    are found, distributes the text across speakers sentence by sentence.
    Any unexpected failure degrades to a single-speaker part.
    """
    try:
        voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
        speaker_names = [config["name"] for config in voice_config]

        parts = []
        active_speaker = 0
        pending = ""

        # Pass 1: honour explicit "Name:" labels at line starts.
        for raw in script.split('\n'):
            stripped = raw.strip()
            if not stripped:
                continue

            label_idx = None
            for idx, name in enumerate(speaker_names):
                if stripped.lower().startswith(f"{name.lower()}:"):
                    label_idx = idx
                    break

            if label_idx is None:
                pending += " " + stripped
            else:
                if pending.strip():
                    parts.append((pending.strip(), active_speaker))
                active_speaker = label_idx
                pending = stripped[len(speaker_names[label_idx]) + 1:].strip()

        if pending.strip():
            parts.append((pending.strip(), active_speaker))

        # Pass 2: too few labelled parts -> split by sentences and deal out.
        if not parts or len(parts) < 2:
            print(f"No explicit speakers found, distributing text among {speaker_count} speakers")
            parts = []

            sentences = []
            for delimiter in ['. ', '! ', '? ']:
                if delimiter in script:
                    sentences = script.split(delimiter)
                    # Re-attach the punctuation that split() stripped
                    # (the final piece keeps its own).
                    for k in range(len(sentences) - 1):
                        sentences[k] += delimiter.strip()
                    break

            if not sentences:
                sentences = [script]
            sentences = [s.strip() for s in sentences if s.strip()]

            if len(sentences) >= speaker_count:
                base, extra = divmod(len(sentences), speaker_count)
                cursor = 0
                for idx in range(speaker_count):
                    # Earlier speakers absorb the remainder sentences.
                    take = base + (1 if idx < extra else 0)
                    if cursor < len(sentences):
                        stop = min(cursor + take, len(sentences))
                        chunk = sentences[cursor:stop]
                        if chunk:
                            parts.append((' '.join(chunk), idx))
                            print(f"Speaker {speaker_names[idx]}: {len(chunk)} sentences")
                        cursor = stop
            else:
                # Fewer sentences than speakers: alternate the first two.
                for idx, sentence in enumerate(sentences):
                    parts.append((sentence, idx % min(speaker_count, 2)))

        # Last resort: everything as a single-speaker part.
        if not parts:
            parts = [(script, 0)]

        # Debug trace of the final assignment.
        print(f"Generated {len(parts)} parts for {speaker_count} speakers:")
        for idx, (chunk_text, spk) in enumerate(parts):
            speaker_name = speaker_names[spk]
            print(f" Part {idx+1}: {speaker_name} - {chunk_text[:60]}...")

        return parts

    except Exception as e:
        print(f"Error parsing script: {e}")
        return [(script, 0)]
224
+
225
async def generate_multi_speaker_audio(script_parts, speaker_count):
    """Generate multi-speaker podcast audio.

    Synthesizes each (text, speaker_index) part with its Edge TTS voice,
    then concatenates the clips with 0.5 s pauses via pydub.

    Returns:
        (path_to_audio, None) on success, or (None, error_message) on failure.
    """
    if not EDGE_TTS_AVAILABLE:
        return None, "Edge TTS not available for multi-speaker"

    def _cleanup(paths, verbose=False):
        # Best-effort removal of per-speaker temp clips (narrowed from the
        # previous bare excepts; only filesystem errors are ignored).
        for path in paths:
            try:
                os.unlink(path)
                if verbose:
                    print(f"🗑️ Cleaned up {path}")
            except OSError:
                pass

    try:
        voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
        audio_files = []

        print(f"Generating audio for {len(script_parts)} parts with {speaker_count} speakers")

        for i, (speaker_text, speaker_idx) in enumerate(script_parts):
            voice = voice_config[speaker_idx]["voice"]
            speaker_name = voice_config[speaker_idx]["name"]
            temp_filename = f"temp_speaker_{i}_{speaker_name}_{uuid4().hex[:8]}.mp3"

            print(f"Part {i+1}: {speaker_name} ({voice}) says: {speaker_text[:50]}...")

            result, error = await generate_with_edge_tts(speaker_text, voice, temp_filename)
            if result:
                # Track the path actually written (may differ in extension).
                audio_files.append(result)
                print(f"✅ Generated audio for {speaker_name}")
            else:
                print(f"❌ Error generating voice for {speaker_name}: {error}")
                _cleanup(audio_files)
                return None, f"Error generating voice for {speaker_name}: {error}"

        # Combine all audio files.
        if len(audio_files) > 1:
            print(f"Combining {len(audio_files)} audio files...")
            combined_audio = AudioSegment.empty()

            for i, audio_file in enumerate(audio_files):
                try:
                    # Auto-detect format (clips are MP3 from Edge TTS).
                    segment = AudioSegment.from_file(audio_file)
                    combined_audio += segment

                    # 0.5 s pause between speakers, none after the last clip.
                    if i < len(audio_files) - 1:
                        combined_audio += AudioSegment.silent(duration=500)

                    print(f" Added segment {i+1}")
                except Exception as e:
                    # Best-effort: skip clips pydub cannot decode rather
                    # than aborting the whole podcast.
                    print(f"❌ Error processing audio file {audio_file}: {e}")

            output_filename = f"combined_podcast_{uuid4().hex[:8]}.wav"
            combined_audio.export(output_filename, format="wav")

            _cleanup(audio_files, verbose=True)

            print(f" Combined audio saved as {output_filename}")
            return output_filename, None

        elif len(audio_files) == 1:
            # Single clip: return it directly, no merge needed.
            return audio_files[0], None
        else:
            return None, "No audio files generated"

    except Exception as e:
        print(f"❌ Multi-speaker generation error: {str(e)}")
        return None, f"Multi-speaker generation error: {str(e)}"
303
+
304
def create_podcast(text, use_gemini, tts_engine, speaker_count, progress=gr.Progress()):
    """Main entry point: turn raw text into podcast audio.

    Args:
        text: source article/blog text.
        use_gemini: enable AI script generation (needs a configured client).
        tts_engine: one of the UI engine labels (Edge TTS / gTTS / pyttsx3).
        speaker_count: 1-4 voices.
        progress: Gradio progress reporter.

    Returns:
        (audio_bytes, status_message, script) — audio_bytes is None on failure.
    """
    try:
        progress(0.1, "Starting processing...")

        if not text.strip():
            return None, "❌ Please enter some text first!", ""

        progress(0.3, "Generating podcast script...")
        podcast_script = generate_podcast_script(text, speaker_count, use_gemini)

        progress(0.5, "Parsing script for speakers...")
        script_parts = parse_script_for_speakers(podcast_script, speaker_count)

        progress(0.7, "Generating audio...")

        # Scratch .wav path used by the single-voice engines.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_filename = tmp_file.name

        if tts_engine == "Multi-Speaker (Edge TTS)" and speaker_count > 1 and EDGE_TTS_AVAILABLE:
            # Edge TTS is async; drive it on a private event loop.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                audio_file, error = loop.run_until_complete(
                    generate_multi_speaker_audio(script_parts, speaker_count)
                )
            finally:
                loop.close()
        elif tts_engine == "gTTS (Online)":
            full_text = " ".join(part[0] for part in script_parts)
            audio_file, error = generate_with_gtts(full_text, temp_filename)
        else:  # pyttsx3 offline fallback
            full_text = " ".join(part[0] for part in script_parts)
            audio_file, error = generate_with_pyttsx3(full_text, temp_filename)

        if error:
            return None, f"❌ {error}", podcast_script

        progress(0.9, "Finalizing...")

        with open(audio_file, 'rb') as f:
            audio_data = f.read()

        # Clean up the generated audio plus the scratch file (bug fix: the
        # Edge TTS path previously leaked the unused scratch .wav; also
        # narrowed from bare except to OSError).
        for path in {audio_file, temp_filename}:
            try:
                os.unlink(path)
            except OSError:
                pass

        progress(1.0, "Complete!")
        return audio_data, "✅ Podcast generated successfully!", podcast_script

    except Exception as e:
        return None, f"❌ Audio generation failed: {str(e)}", ""
361
+
362
def get_speaker_info(speaker_count):
    """Return a Markdown summary of the voices for *speaker_count* speakers.

    Args:
        speaker_count: Number of speakers (1-4); 1 means solo narration.

    Returns:
        Markdown string listing each preset speaker's name and voice gender.
    """
    if speaker_count == 1:
        return "**Single Speaker Mode**: Solo narration with one voice"

    # Look up the preset voice roster for this speaker count.
    # (Fix: the original used enumerate() but never used the index.)
    voice_config = VOICE_CONFIGS[f"{speaker_count}_speakers"]
    lines = [f"**{speaker_count} Speaker Mode**:"]
    lines.extend(
        f"🎤 **{config['name']}** ({config['gender']} voice)"
        for config in voice_config
    )
    # Trailing newline matches the original string-concatenation output.
    return "\n".join(lines) + "\n"
374
+
375
+ # Create the Gradio interface
376
def create_interface():
    """Build and return the Gradio Blocks UI for the podcast generator.

    Wires together: Gemini API-key entry, text input, speaker-count and
    TTS-engine options, the generate button, and the output widgets
    (status banner, audio player, download button, generated script).
    """
    with gr.Blocks(title="🎙️ Multi-Speaker Podcast Generator", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🎙️ Multi-Speaker Podcast Generator")
        gr.Markdown("Transform your text into engaging podcast conversations with multiple realistic voices!")

        with gr.Row():
            with gr.Column(scale=2):
                # API Configuration
                gr.Markdown("## 🔑 API Configuration")
                api_key = gr.Textbox(
                    label="Gemini API Key (Optional)",
                    type="password",
                    placeholder="Enter your Google Gemini API key...",
                    info="Get a free key from https://aistudio.google.com/"
                )
                api_status = gr.Textbox(
                    label="API Status",
                    interactive=False,
                    value="ℹ️ Add Gemini API key for AI-powered conversations"
                )

                # Input Text
                gr.Markdown("## 📝 Input Text")
                input_text = gr.Textbox(
                    label="Your Content",
                    placeholder="Paste your article, blog post, or any text here...",
                    lines=6
                )

                # Configuration
                gr.Markdown("## ⚙️ Configuration")
                speaker_count = gr.Radio(
                    label="Number of Speakers",
                    choices=[1, 2, 3, 4],
                    value=2,
                    info="Choose how many voices for your podcast"
                )

                use_gemini = gr.Checkbox(
                    label="Use AI for conversation generation",
                    value=True,
                    info="Creates natural conversations (requires API key)"
                )

                tts_engine = gr.Radio(
                    label="Voice Engine",
                    choices=[
                        "Multi-Speaker (Edge TTS)",
                        "gTTS (Online)",
                        "pyttsx3 (Offline)"
                    ],
                    # Edge TTS is optional at import time; fall back to gTTS if absent.
                    value="Multi-Speaker (Edge TTS)" if EDGE_TTS_AVAILABLE else "gTTS (Online)",
                    info="Edge TTS provides the most realistic conversations"
                )

                # Generate Button
                generate_btn = gr.Button(
                    "🎙️ Generate Podcast",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                # Speaker Info (kept in sync with the speaker_count radio via the
                # change handler wired below)
                speaker_info = gr.Markdown(
                    get_speaker_info(2),
                    label="Speaker Information"
                )

                # Status and Results
                status_msg = gr.HTML(
                    value="<div style='padding: 10px; background: #e3f2fd; border-radius: 5px; color: #1976d2;'>Ready to generate your podcast!</div>"
                )

                with gr.Row():
                    audio_output = gr.Audio(
                        label="Generated Podcast",
                        visible=False
                    )
                    download_btn = gr.DownloadButton(
                        "⬇️ Download Podcast",
                        visible=False
                    )

                script_output = gr.Textbox(
                    label="Generated Script",
                    lines=8,
                    visible=False
                )

        # Event handlers
        def update_status(message, success=True):
            """Render *message* as a colored HTML status banner (blue=ok, red=error)."""
            color = "#1976d2" if success else "#d32f2f"
            bg_color = "#e3f2fd" if success else "#ffebee"
            return f"<div style='padding: 10px; background: {bg_color}; border-radius: 5px; color: {color};'>{message}</div>"

        def generate_podcast_wrapper(text, use_gemini, tts_engine, speaker_count, progress=gr.Progress()):
            """Run create_podcast and map its result onto the four output components."""
            audio_data, message, script = create_podcast(text, use_gemini, tts_engine, speaker_count, progress)

            status_html = update_status(message, success=audio_data is not None)

            if audio_data:
                # Save audio to temporary file so gr.Audio and the download
                # button can serve it from disk
                filename = f"podcast_{speaker_count}speakers_{uuid4().hex[:8]}.wav"
                filepath = os.path.join(tempfile.gettempdir(), filename)

                with open(filepath, 'wb') as f:
                    f.write(audio_data)

                return [
                    status_html,
                    gr.Audio(value=filepath, visible=True),
                    gr.DownloadButton(value=filepath, visible=True),
                    gr.Textbox(value=script, visible=True)
                ]
            else:
                # Generation failed: hide the result widgets, show only the banner.
                return [
                    status_html,
                    gr.Audio(visible=False),
                    gr.DownloadButton(visible=False),
                    gr.Textbox(visible=False)
                ]

        # Connect events
        api_key.change(init_gemini, inputs=api_key, outputs=api_status)

        speaker_count.change(
            get_speaker_info,
            inputs=speaker_count,
            outputs=speaker_info
        )

        generate_btn.click(
            generate_podcast_wrapper,
            inputs=[input_text, use_gemini, tts_engine, speaker_count],
            outputs=[status_msg, audio_output, download_btn, script_output]
        )

    return demo
515
+
516
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces at port 7860.
    app = create_interface()
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "show_error": True,
    }
    app.launch(**launch_options)