""" app.py — Gradio UI entry point for aMuseMe """ import sys from pathlib import Path import gradio as gr SRC_DIR = Path(__file__).parent / "src" if str(SRC_DIR) not in sys.path: sys.path.insert(0, str(SRC_DIR)) from amuseme.transcriber import transcribe from amuseme.renderer import render_frames from amuseme.animations import THEME_COLORS as THEMES, FONT_FAMILIES, DEFAULT_FONT_FAMILY from amuseme.video_assembler import assemble from amuseme.logger import get_logger logger = get_logger("app") # Try to import spaces for ZeroGPU; gracefully degrade locally try: import spaces HAS_SPACES = True except ImportError: HAS_SPACES = False if HAS_SPACES: from huggingface_hub import snapshot_download logger.info("HF Space detected. Pre-downloading heavy models to avoid ZeroGPU timeout...") try: snapshot_download(repo_id="Systran/faster-whisper-large-v3") snapshot_download(repo_id="openbmb/MiniCPM5-1B") snapshot_download(repo_id="stabilityai/sd-turbo") logger.info("Model pre-download complete!") except Exception as e: logger.warning(f"Pre-download failed (will retry during runtime): {e}") def _gpu_transcribe(audio_path: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool, theme: str, visual_prompt: str): return transcribe(audio_path, model_size=model_size, use_demucs=use_demucs, condition_on_previous_text=cond_prev, use_vad=use_vad, theme=theme, visual_prompt=visual_prompt) if HAS_SPACES: _gpu_transcribe = spaces.GPU(duration=120)(_gpu_transcribe) def generate_video(audio_path: str, theme: str, font_family: str, visual_prompt: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool) -> str: import time if audio_path is None: raise gr.Error("Please upload an audio file.") pipeline_t0 = time.time() logger.info( "===== PIPELINE START =====\n" f" audio={audio_path} theme={theme} font={font_family} visual_prompt={visual_prompt!r}\n" f" model_size={model_size} demucs={use_demucs} " f"cond_prev={cond_prev} vad={use_vad}" ) # Step 1: Transcribe + frame-metadata (Whisper + MiniCPM5-1B). Detailed # input/output for these models is logged inside transcribe(). logger.info("[Step 1/4] Transcribing audio + generating frame metadata...") t0 = time.time() frames = _gpu_transcribe(audio_path, model_size, use_demucs, cond_prev, use_vad, theme, visual_prompt) if not frames: raise gr.Error("Could not extract words from audio. Try a cleaner recording.") logger.info(f"[Step 1/4] Done in {time.time() - t0:.1f}s — {len(frames)} frames.") # Step 2: Generate AI storyboard backgrounds — one image per pair of lyric # lines, so the backdrop changes less often than the on-screen text # (renderer expands each image to cover two consecutive lyric frames). bg_images = None if len(frames) > 0: logger.info("[Step 2/4] Generating AI storyboard backgrounds...") t0 = time.time() prompts = [] for i in range(0, len(frames), 2): pair = frames[i:i + 2] line_text = " ".join( " ".join(w.text for w in fr.words) for fr in pair ).strip() # Combine the dynamic lyric text with the user's visual prompt prompt = f"{line_text}, {visual_prompt}" if line_text else visual_prompt prompts.append(prompt) logger.info(f"[Step 2/4] Background prompts ({len(prompts)}):\n " + "\n ".join(prompts)) try: from amuseme.bg_generator import generate_storyboard bg_images = generate_storyboard(prompts) or None logger.info(f"[Step 2/4] Done in {time.time() - t0:.1f}s — {len(bg_images or [])} image(s).") except Exception as e: logger.error(f"[Step 2/4] Error generating backgrounds: {e}") bg_images = None # Step 3: Get audio duration via ffprobe import subprocess, json probe = subprocess.run( ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", audio_path], capture_output=True, text=True ) duration = float(json.loads(probe.stdout)["format"]["duration"]) logger.info(f"[Step 3/4] Rendering frames — audio duration={duration:.1f}s, {len(frames)} lyric frames...") t0 = time.time() frames_gen = render_frames(frames, duration, theme_name=theme, bg_images=bg_images, font_family=font_family) logger.info("[Step 4/4] Assembling video via FFmpeg...") out_path = assemble(frames_gen, audio_path) logger.info( f"[Step 4/4] Done in {time.time() - t0:.1f}s — output={out_path}\n" f"===== PIPELINE COMPLETE in {time.time() - pipeline_t0:.1f}s =====" ) return out_path # ─── Gradio UI ───────────────────────────────────────────────────────────── CUSTOM_CSS = """ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap'); body, .gradio-container { font-family: 'Inter', sans-serif !important; background: #090910 !important; } .gradio-container { max-width: 900px !important; margin: 0 auto !important; } /* Header */ .app-header { text-align: center; padding: 2.5rem 1rem 1.5rem; background: linear-gradient(135deg, #0f0f1a 0%, #1a0a2e 100%); border-radius: 16px; margin-bottom: 1.5rem; border: 1px solid rgba(255,255,255,0.06); } .app-header h1 { font-size: 3rem; font-weight: 700; background: linear-gradient(135deg, #a78bfa, #60a5fa, #34d399); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0 0 0.4rem; letter-spacing: -1px; } .app-header p { color: rgba(255,255,255,0.55); font-size: 1rem; margin: 0; } /* Panel */ .panel { background: #0f0f1a !important; border: 1px solid rgba(255,255,255,0.08) !important; border-radius: 12px !important; } /* Labels */ label span { color: rgba(255,255,255,0.75) !important; font-weight: 500 !important; font-size: 0.85rem !important; text-transform: uppercase !important; letter-spacing: 0.05em !important; } /* Inputs */ textarea, input[type="text"] { background: #1a1a2e !important; border: 1px solid rgba(255,255,255,0.1) !important; border-radius: 8px !important; color: #e0e0ff !important; } /* Generate button */ .generate-btn { background: linear-gradient(135deg, #7c3aed, #2563eb) !important; border: none !important; border-radius: 10px !important; color: white !important; font-weight: 600 !important; font-size: 1rem !important; padding: 0.75rem 2rem !important; width: 100% !important; transition: opacity 0.2s ease !important; cursor: pointer !important; } .generate-btn:hover { opacity: 0.9 !important; } /* Step badges */ .steps-row { display: flex; gap: 0.75rem; justify-content: center; padding: 1rem 0 0.5rem; } .step-badge { background: rgba(255,255,255,0.05); border: 1px solid rgba(255,255,255,0.1); border-radius: 20px; padding: 0.3rem 0.9rem; color: rgba(255,255,255,0.5); font-size: 0.78rem; font-weight: 500; } """ HEADER_HTML = """
Drop a song. Watch your lyrics come alive with AI-powered kinetic typography and AI-generated backgrounds.