"""Gradio app that turns a topic into a two-voice, podcast-style audio file.

Pipeline: the topic text is sent to Gemini to obtain a dialogue script, every
``Speaker: text`` line of the script is voiced with edge-tts, and the clips
are concatenated with pydub into a single MP3 that Gradio plays back.
"""

import asyncio
import os
import tempfile
import uuid

import edge_tts
import fitz  # No longer used, you can uninstall this if not needed
import gradio as gr
from google import genai
from pydub import AudioSegment

# Configure Gemini API client (the key is read from the "aipi" env variable).
client = genai.Client(api_key=os.getenv("aipi"))

# edge-tts voices: lines whose speaker label contains "Alex" get the first
# voice, every other speaker gets the second one.
ALEX_VOICE = "en-US-GuyNeural"
DEFAULT_VOICE = "en-US-JennyNeural"


# Async TTS function
async def synthesize_speech(text: str, voice: str, output_path: str) -> None:
    """Synthesize *text* with the edge-tts *voice* and save it to *output_path*."""
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_path)


# Prompt Gemini to generate a podcast script
def generate_script(topic_text: str) -> str:
    """Ask Gemini for a two-speaker podcast script about *topic_text*.

    Returns:
        The script text, or an empty string when the response carries no text
        (``response.text`` is ``None`` in that case).
    """
    prompt = (
        f"Create a podcast-style script where a male speaker (Alex) and a female speaker (Maya) "
        f"discuss the topic below in a friendly, engaging way. The script should alternate between their lines.\n\n"
        f"Topic: {topic_text}"
    )
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[prompt],
    )
    # Normalise a missing text payload so callers can always treat it as str.
    return response.text or ""


def _parse_dialogue(script_text: str) -> list[tuple[str, str]]:
    """Extract ``(voice, text)`` pairs from the ``Speaker: text`` script lines."""
    dialogue = []
    for line in (script_text or "").strip().split("\n"):
        if ":" not in line:
            continue
        speaker, text = line.split(":", 1)
        # The script may use Markdown bold labels such as "**Alex:** Hi",
        # which leaves stray asterisks around the spoken text; drop them.
        text = text.strip().strip("*").strip()
        if not text:
            # Nothing to say on this line (e.g. a heading ending in ":").
            continue
        voice = ALEX_VOICE if "Alex" in speaker else DEFAULT_VOICE
        dialogue.append((voice, text))
    return dialogue


# Parse script and generate audio
def create_podcast_audio(script_text: str) -> str:
    """Voice every dialogue line of *script_text* and join them into one MP3.

    Args:
        script_text: Script made of ``Speaker: text`` lines.

    Returns:
        Path of the exported MP3 file. The file is not deleted here; the
        caller (Gradio) reads it afterwards.

    Raises:
        ValueError: If the script contains no line that can be voiced.
    """
    dialogue = _parse_dialogue(script_text)
    if not dialogue:
        # Without this guard ``sum([])`` would be the int 0 and ``.export``
        # would fail with an unhelpful AttributeError.
        raise ValueError(
            "The generated script contains no 'Speaker: text' lines to voice."
        )

    audio_segments = []
    # The per-line clips live in a private temp directory that is removed
    # even when synthesis or decoding fails part-way through.
    with tempfile.TemporaryDirectory() as work_dir:
        for voice, text in dialogue:
            segment_path = os.path.join(work_dir, f"{uuid.uuid4()}.mp3")
            asyncio.run(synthesize_speech(text, voice, segment_path))
            audio_segments.append(
                AudioSegment.from_file(segment_path, format="mp3")
            )

    final_audio = sum(audio_segments)

    # mkstemp creates the file atomically, unlike the deprecated, race-prone
    # tempfile.mktemp. Close the descriptor so export can reopen the path.
    fd, final_audio_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    final_audio.export(final_audio_path, format="mp3")
    return final_audio_path


# Main handler
def handle_input(text: str) -> str | None:
    """Gradio callback: turn the topic *text* into a podcast audio file path.

    Returns:
        Path of the generated MP3, or ``None`` when *text* is empty or only
        whitespace.

    Raises:
        gr.Error: If the generated script has no lines that can be voiced.
    """
    if not text or not text.strip():
        return None
    script = generate_script(text)
    try:
        return create_podcast_audio(script)
    except ValueError as exc:
        # Surface the problem in the UI instead of a raw traceback.
        raise gr.Error(str(exc)) from exc


# Gradio UI (Simplified)
demo = gr.Interface(
    fn=handle_input,
    inputs=gr.Textbox(label="Enter Topic Text"),
    outputs=gr.Audio(label="Generated Podcast Audio"),
    title="Learn Out Loud",
    description="Enter a topic to generate a podcast-style audio conversation.",
)
demo.launch(server_name="0.0.0.0", server_port=7860)