""" app.py — Gradio UI entry point for aMuseMe """ import sys from pathlib import Path import gradio as gr SRC_DIR = Path(__file__).parent / "src" if str(SRC_DIR) not in sys.path: sys.path.insert(0, str(SRC_DIR)) from amuseme.transcriber import transcribe from amuseme.renderer import render_frames from amuseme.animations import THEME_COLORS as THEMES, FONT_FAMILIES, DEFAULT_FONT_FAMILY from amuseme.video_assembler import assemble from amuseme.logger import get_logger logger = get_logger("app") # Try to import spaces for ZeroGPU; gracefully degrade locally try: import spaces HAS_SPACES = True except ImportError: HAS_SPACES = False if HAS_SPACES: from huggingface_hub import snapshot_download logger.info("HF Space detected. Pre-downloading heavy models to avoid ZeroGPU timeout...") try: snapshot_download(repo_id="Systran/faster-whisper-large-v3") snapshot_download(repo_id="openbmb/MiniCPM5-1B") snapshot_download(repo_id="stabilityai/sd-turbo") logger.info("Model pre-download complete!") except Exception as e: logger.warning(f"Pre-download failed (will retry during runtime): {e}") def _gpu_transcribe(audio_path: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool, theme: str, visual_prompt: str): return transcribe(audio_path, model_size=model_size, use_demucs=use_demucs, condition_on_previous_text=cond_prev, use_vad=use_vad, theme=theme, visual_prompt=visual_prompt) if HAS_SPACES: _gpu_transcribe = spaces.GPU(duration=120)(_gpu_transcribe) def generate_video(audio_path: str, theme: str, font_family: str, visual_prompt: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool) -> str: import time if audio_path is None: raise gr.Error("Please upload an audio file.") pipeline_t0 = time.time() logger.info( "===== PIPELINE START =====\n" f" audio={audio_path} theme={theme} font={font_family} visual_prompt={visual_prompt!r}\n" f" model_size={model_size} demucs={use_demucs} " f"cond_prev={cond_prev} vad={use_vad}" ) # Step 1: Transcribe + frame-metadata (Whisper + MiniCPM5-1B). Detailed # input/output for these models is logged inside transcribe(). logger.info("[Step 1/4] Transcribing audio + generating frame metadata...") t0 = time.time() frames = _gpu_transcribe(audio_path, model_size, use_demucs, cond_prev, use_vad, theme, visual_prompt) if not frames: raise gr.Error("Could not extract words from audio. Try a cleaner recording.") logger.info(f"[Step 1/4] Done in {time.time() - t0:.1f}s — {len(frames)} frames.") # Step 2: Generate AI storyboard backgrounds — one image per pair of lyric # lines, so the backdrop changes less often than the on-screen text # (renderer expands each image to cover two consecutive lyric frames). bg_images = None if len(frames) > 0: logger.info("[Step 2/4] Generating AI storyboard backgrounds...") t0 = time.time() prompts = [] for i in range(0, len(frames), 2): pair = frames[i:i + 2] line_text = " ".join( " ".join(w.text for w in fr.words) for fr in pair ).strip() # Combine the dynamic lyric text with the user's visual prompt prompt = f"{line_text}, {visual_prompt}" if line_text else visual_prompt prompts.append(prompt) logger.info(f"[Step 2/4] Background prompts ({len(prompts)}):\n " + "\n ".join(prompts)) try: from amuseme.bg_generator import generate_storyboard bg_images = generate_storyboard(prompts) or None logger.info(f"[Step 2/4] Done in {time.time() - t0:.1f}s — {len(bg_images or [])} image(s).") except Exception as e: logger.error(f"[Step 2/4] Error generating backgrounds: {e}") bg_images = None # Step 3: Get audio duration via ffprobe import subprocess, json probe = subprocess.run( ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", audio_path], capture_output=True, text=True ) duration = float(json.loads(probe.stdout)["format"]["duration"]) logger.info(f"[Step 3/4] Rendering frames — audio duration={duration:.1f}s, {len(frames)} lyric frames...") t0 = time.time() frames_gen = render_frames(frames, duration, theme_name=theme, bg_images=bg_images, font_family=font_family) logger.info("[Step 4/4] Assembling video via FFmpeg...") out_path = assemble(frames_gen, audio_path) logger.info( f"[Step 4/4] Done in {time.time() - t0:.1f}s — output={out_path}\n" f"===== PIPELINE COMPLETE in {time.time() - pipeline_t0:.1f}s =====" ) return out_path # ─── Gradio UI ───────────────────────────────────────────────────────────── CUSTOM_CSS = """ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap'); body, .gradio-container { font-family: 'Inter', sans-serif !important; background: #090910 !important; } .gradio-container { max-width: 900px !important; margin: 0 auto !important; } /* Header */ .app-header { text-align: center; padding: 2.5rem 1rem 1.5rem; background: linear-gradient(135deg, #0f0f1a 0%, #1a0a2e 100%); border-radius: 16px; margin-bottom: 1.5rem; border: 1px solid rgba(255,255,255,0.06); } .app-header h1 { font-size: 3rem; font-weight: 700; background: linear-gradient(135deg, #a78bfa, #60a5fa, #34d399); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0 0 0.4rem; letter-spacing: -1px; } .app-header p { color: rgba(255,255,255,0.55); font-size: 1rem; margin: 0; } /* Panel */ .panel { background: #0f0f1a !important; border: 1px solid rgba(255,255,255,0.08) !important; border-radius: 12px !important; } /* Labels */ label span { color: rgba(255,255,255,0.75) !important; font-weight: 500 !important; font-size: 0.85rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; } /* Inputs */ textarea, input[type="text"] { background: #1a1a2e !important; border: 1px solid rgba(255,255,255,0.1) !important; border-radius: 8px !important; color: #e0e0ff !important; } /* Generate button */ .generate-btn { background: linear-gradient(135deg, #7c3aed, #2563eb) !important; border: none !important; border-radius: 10px !important; color: white !important; font-weight: 600 !important; font-size: 1rem !important; padding: 0.75rem 2rem !important; width: 100% !important; transition: opacity 0.2s ease !important; cursor: pointer !important; } .generate-btn:hover { opacity: 0.9 !important; } /* Step badges */ .steps-row { display: flex; gap: 0.75rem; justify-content: center; padding: 1rem 0 0.5rem; } .step-badge { background: rgba(255,255,255,0.05); border: 1px solid rgba(255,255,255,0.1); border-radius: 20px; padding: 0.3rem 0.9rem; color: rgba(255,255,255,0.5); font-size: 0.78rem; font-weight: 500; } """ HEADER_HTML = """

🎵 aMuseMe

Drop a song. Watch your lyrics come alive with AI-powered kinetic typography and AI-generated backgrounds.

① Upload Audio → Whisper AI Syncs → AI Storyboard Backgrounds → Kinetic Typography Video
""" with gr.Blocks(title="aMuseMe — AI Lyric Video Generator") as demo: gr.HTML(HEADER_HTML) with gr.Row(): with gr.Column(scale=1, elem_classes=["panel"]): gr.Markdown( "**1. Upload a song** — Whisper transcribes the vocals and times each " "word to drive the lyric video below." ) audio_input = gr.Audio( label="Audio File (song with clear vocals, MP3/WAV)", type="filepath", sources=["upload"], ) gr.Examples( examples=[ "assets/samples/ride_like_the_ind_test_song.mp3", "assets/samples/hollow-song-test.mp3" ], inputs=audio_input, label="Try a sample song" ) generate_btn = gr.Button( "✨ Generate Lyric Video", elem_classes=["generate-btn"], variant="primary", ) gr.Markdown( "Runs the full pipeline: transcribe lyrics → generate AI storyboard " "backgrounds → render kinetic typography → assemble the video " "(~30–90s depending on song length)." ) with gr.Column(scale=1, elem_classes=["panel"]): gr.Markdown("**2. Choose how the lyrics look**") theme_input = gr.Dropdown( label="Visual Theme", choices=list(THEMES.keys()), value="Neon", info="Sets the on-screen lyric text color: Dark = white, Light = warm gold, Neon = cyan glow. AI backgrounds are always slightly darkened, so pick whichever color reads best against your Visual Prompt.", ) font_input = gr.Dropdown( label="Lyric Font", choices=list(FONT_FAMILIES.keys()), value="Serif (Bold)", info="Typeface used for the on-screen lyrics. Bold sans-serif suits most songs; try Serif or Monospace for a different look.", ) visual_prompt_input = gr.Textbox( label="Visual Prompt", placeholder="e.g. mystical forest, glowing particles, cinematic, digital art, 8k", value="neon-lit futuristic city at night, vibrant glowing colors, cyberpunk aesthetic, energetic atmosphere, beautiful starry sky, digital art, highly detailed", info="Describes the look of the AI-generated backgrounds (and gives the lyric-timing model a sense of the visual mood).", lines=2, ) with gr.Accordion("Advanced Settings", open=False): gr.Markdown( "**Recommendations:**\n" "- **Best Default:** Condition on Previous Text **ON**, VAD **ON**, Demucs **OFF**. (Best for most pop/vocal tracks).\n" "- **Heavily Instrumental Songs:** If vocals are very quiet or buried under loud instruments, turn Condition on Previous Text **OFF**, and turn Demucs **ON**.\n" "- ⚠️ **WARNING:** Not recommended to use **Demucs ON + Condition ON** together! It may cause infinite hallucination loops during instrumental breaks." ) cond_prev_input = gr.Checkbox( label="Condition on Previous Text", value=True, info="Helps Whisper understand context by feeding it previous lines. Improves word accuracy but can cause loops if not anchored." ) use_vad_input = gr.Checkbox( label="Use VAD (Voice Activity Detection) Filter", value=True, info="Mutes audio completely when no singing is detected. Very helpful to prevent hallucinations during long instrumental solos." ) use_demucs_input = gr.Checkbox( label="Use Demucs Vocal Separation", value=False, interactive=False, info="Disabled because Condition on Previous Text is ON (prevents infinite loops)." ) model_input = gr.Dropdown( label="Whisper Model", choices=["large-v3", "large-v3-turbo", "medium", "small", "base"], value="large-v3", info="Larger models are more accurate but take longer to process." ) def enforce_safe_params(cond_prev): if cond_prev: return gr.update(value=False, interactive=False, info="Disabled because Condition on Previous Text is ON (prevents infinite loops). ") else: return gr.update(interactive=True, info="Isolates vocals as a preprocessing step. Only enable this if vocals are not clearly audible and are buried under instruments.") cond_prev_input.change( fn=enforce_safe_params, inputs=[cond_prev_input], outputs=[use_demucs_input] ) with gr.Column(scale=1, elem_classes=["panel"]): video_output = gr.Video( label="Your Lyric Video (preview and download here)", interactive=False, height=360, ) gr.Markdown( """ **Tips:** - Best with clear vocals (ballads, pop, spoken word) - Describe the visuals you want in the Visual Prompt — it shapes both the AI backgrounds and the on-screen mood - Try different Visual Themes and Fonts to match your song's vibe - Processing takes ~30–90s depending on song length """, elem_classes=["panel"], ) generate_btn.click( fn=generate_video, inputs=[audio_input, theme_input, font_input, visual_prompt_input, model_input, use_demucs_input, cond_prev_input, use_vad_input], outputs=[video_output], api_visibility="public", ) if __name__ == "__main__": demo.launch(css=CUSTOM_CSS)