Spaces:
Running on Zero
Running on Zero
| """ | |
| app.py — Gradio UI entry point for aMuseMe | |
| """ | |
| import sys | |
| from pathlib import Path | |
| import gradio as gr | |
| SRC_DIR = Path(__file__).parent / "src" | |
| if str(SRC_DIR) not in sys.path: | |
| sys.path.insert(0, str(SRC_DIR)) | |
| from amuseme.transcriber import transcribe | |
| from amuseme.renderer import render_frames | |
| from amuseme.animations import THEME_COLORS as THEMES, FONT_FAMILIES, DEFAULT_FONT_FAMILY | |
| from amuseme.video_assembler import assemble | |
| from amuseme.logger import get_logger | |
| logger = get_logger("app") | |
# Try to import spaces for ZeroGPU; gracefully degrade locally.
try:
    import spaces
    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False

if HAS_SPACES:
    from huggingface_hub import snapshot_download

    # Repositories fetched at import time (the log line below states the
    # intent: avoid a ZeroGPU timeout caused by downloading during a request).
    _PRELOAD_REPOS = (
        "Systran/faster-whisper-large-v3",
        "openbmb/MiniCPM5-1B",
        "stabilityai/sd-turbo",
    )

    logger.info("HF Space detected. Pre-downloading heavy models to avoid ZeroGPU timeout...")
    _failed_repos = []
    for _repo_id in _PRELOAD_REPOS:
        # Each repo gets its own try: one failing download must not prevent
        # the remaining models from being cached.
        try:
            snapshot_download(repo_id=_repo_id)
        except Exception as e:  # deliberate best-effort; runtime will retry
            _failed_repos.append(_repo_id)
            logger.warning(f"Pre-download of {_repo_id} failed (will retry during runtime): {e}")
    if not _failed_repos:
        logger.info("Model pre-download complete!")
def _gpu_transcribe(audio_path: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool, theme: str, visual_prompt: str):
    """Forward the UI settings to ``transcribe`` and return its result.

    Kept as a separate module-level function so it can be wrapped with
    ``spaces.GPU`` (see below) when the ``spaces`` package is importable.
    ``cond_prev`` is passed on as ``condition_on_previous_text``; every other
    argument keeps its name.
    """
    options = {
        "model_size": model_size,
        "use_demucs": use_demucs,
        "condition_on_previous_text": cond_prev,
        "use_vad": use_vad,
        "theme": theme,
        "visual_prompt": visual_prompt,
    }
    return transcribe(audio_path, **options)


if HAS_SPACES:
    # Wrap the call with the ZeroGPU decorator, using a 120-second duration.
    _gpu_decorator = spaces.GPU(duration=120)
    _gpu_transcribe = _gpu_decorator(_gpu_transcribe)
def _build_background_prompts(frames, visual_prompt):
    """Return one image prompt per pair of consecutive lyric frames.

    Each prompt is the pair's lyric text followed by the user's visual prompt,
    joined with ", ". Empty parts are dropped so a blank visual prompt (or a
    pair with no words) never leaves a dangling separator.
    """
    style = (visual_prompt or "").strip()
    prompts = []
    for start in range(0, len(frames), 2):
        lyric_text = " ".join(
            word.text
            for frame in frames[start:start + 2]
            for word in frame.words
        ).strip()
        prompts.append(", ".join(part for part in (lyric_text, style) if part))
    return prompts


def _probe_audio_duration(audio_path):
    """Return the audio duration in seconds as reported by ``ffprobe``.

    Raises:
        gr.Error: if ffprobe cannot be run, exits with an error, or its JSON
            output carries no usable ``format.duration`` value.
    """
    import json
    import subprocess

    try:
        probe = subprocess.run(
            ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", audio_path],
            capture_output=True,
            text=True,
            check=True,
        )
        return float(json.loads(probe.stdout)["format"]["duration"])
    except (subprocess.CalledProcessError, OSError, KeyError, TypeError, ValueError) as exc:
        # ValueError also covers json.JSONDecodeError (empty/garbled output).
        logger.error(f"[Step 3/4] ffprobe could not read {audio_path}: {exc!r}")
        raise gr.Error(
            "Could not read the audio file's duration. Please upload a valid MP3/WAV file."
        ) from exc


def generate_video(audio_path: str, theme: str, font_family: str, visual_prompt: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool) -> str:
    """Run the full lyric-video pipeline for one uploaded audio file.

    Steps: transcribe the audio into lyric frames, generate storyboard
    backgrounds (best-effort), read the audio duration with ffprobe, render
    the frames and assemble the final video.

    Args:
        audio_path: Path of the uploaded audio file; ``None`` when nothing
            was uploaded.
        theme: Theme name forwarded to the transcriber and the renderer.
        font_family: Font family name forwarded to the renderer.
        visual_prompt: Free-text description used for the background prompts
            and forwarded to the transcriber.
        model_size: Whisper model identifier forwarded to the transcriber.
        use_demucs: Forwarded to the transcriber.
        cond_prev: Forwarded to the transcriber as
            ``condition_on_previous_text``.
        use_vad: Forwarded to the transcriber.

    Returns:
        Whatever ``assemble`` returns (logged as the output path).

    Raises:
        gr.Error: if no audio was supplied, transcription produced no frames,
            or the audio duration could not be determined.
    """
    import time

    if audio_path is None:
        raise gr.Error("Please upload an audio file.")

    pipeline_t0 = time.time()
    logger.info(
        "===== PIPELINE START =====\n"
        f" audio={audio_path} theme={theme} font={font_family} visual_prompt={visual_prompt!r}\n"
        f" model_size={model_size} demucs={use_demucs} "
        f"cond_prev={cond_prev} vad={use_vad}"
    )

    # Step 1: Transcribe + frame-metadata (Whisper + MiniCPM5-1B). Detailed
    # input/output for these models is logged inside transcribe().
    logger.info("[Step 1/4] Transcribing audio + generating frame metadata...")
    t0 = time.time()
    frames = _gpu_transcribe(audio_path, model_size, use_demucs, cond_prev, use_vad, theme, visual_prompt)
    if not frames:
        raise gr.Error("Could not extract words from audio. Try a cleaner recording.")
    logger.info(f"[Step 1/4] Done in {time.time() - t0:.1f}s — {len(frames)} frames.")

    # Step 2: Generate AI storyboard backgrounds — one image per pair of lyric
    # lines, so the backdrop changes less often than the on-screen text
    # (renderer expands each image to cover two consecutive lyric frames).
    # `frames` is known to be non-empty here, so no extra length check.
    logger.info("[Step 2/4] Generating AI storyboard backgrounds...")
    t0 = time.time()
    prompts = _build_background_prompts(frames, visual_prompt)
    logger.info(f"[Step 2/4] Background prompts ({len(prompts)}):\n " + "\n ".join(prompts))
    bg_images = None
    try:
        from amuseme.bg_generator import generate_storyboard
        bg_images = generate_storyboard(prompts) or None
        logger.info(f"[Step 2/4] Done in {time.time() - t0:.1f}s — {len(bg_images or [])} image(s).")
    except Exception as e:
        # Backgrounds are optional: log and continue without them.
        logger.error(f"[Step 2/4] Error generating backgrounds: {e}")
        bg_images = None

    # Step 3: Get audio duration via ffprobe, then set up frame rendering.
    duration = _probe_audio_duration(audio_path)
    logger.info(f"[Step 3/4] Rendering frames — audio duration={duration:.1f}s, {len(frames)} lyric frames...")
    t0 = time.time()
    frames_gen = render_frames(frames, duration, theme_name=theme, bg_images=bg_images, font_family=font_family)

    # Step 4: Hand the rendered frames and the original audio to the assembler.
    logger.info("[Step 4/4] Assembling video via FFmpeg...")
    out_path = assemble(frames_gen, audio_path)
    logger.info(
        f"[Step 4/4] Done in {time.time() - t0:.1f}s — output={out_path}\n"
        f"===== PIPELINE COMPLETE in {time.time() - pipeline_t0:.1f}s ====="
    )
    return out_path
| # ─── Gradio UI ───────────────────────────────────────────────────────────── | |
# Stylesheet for the app; passed to `demo.launch(css=...)` at the bottom of
# the file. The gradient heading declares the standard `background-clip`
# after the `-webkit-` prefixed form so engines that only honour the
# unprefixed property still clip the gradient to the text.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap');

body, .gradio-container {
    font-family: 'Inter', sans-serif !important;
    background: #090910 !important;
}
.gradio-container {
    max-width: 900px !important;
    margin: 0 auto !important;
}

/* Header */
.app-header {
    text-align: center;
    padding: 2.5rem 1rem 1.5rem;
    background: linear-gradient(135deg, #0f0f1a 0%, #1a0a2e 100%);
    border-radius: 16px;
    margin-bottom: 1.5rem;
    border: 1px solid rgba(255,255,255,0.06);
}
.app-header h1 {
    font-size: 3rem;
    font-weight: 700;
    background: linear-gradient(135deg, #a78bfa, #60a5fa, #34d399);
    -webkit-background-clip: text;
    background-clip: text;
    -webkit-text-fill-color: transparent;
    margin: 0 0 0.4rem;
    letter-spacing: -1px;
}
.app-header p {
    color: rgba(255,255,255,0.55);
    font-size: 1rem;
    margin: 0;
}

/* Panel */
.panel {
    background: #0f0f1a !important;
    border: 1px solid rgba(255,255,255,0.08) !important;
    border-radius: 12px !important;
}

/* Labels */
label span {
    color: rgba(255,255,255,0.75) !important;
    font-weight: 500 !important;
    font-size: 0.85rem !important;
    text-transform: uppercase !important;
    letter-spacing: 0.05em !important;
}

/* Inputs */
textarea, input[type="text"] {
    background: #1a1a2e !important;
    border: 1px solid rgba(255,255,255,0.1) !important;
    border-radius: 8px !important;
    color: #e0e0ff !important;
}

/* Generate button */
.generate-btn {
    background: linear-gradient(135deg, #7c3aed, #2563eb) !important;
    border: none !important;
    border-radius: 10px !important;
    color: white !important;
    font-weight: 600 !important;
    font-size: 1rem !important;
    padding: 0.75rem 2rem !important;
    width: 100% !important;
    transition: opacity 0.2s ease !important;
    cursor: pointer !important;
}
.generate-btn:hover {
    opacity: 0.9 !important;
}

/* Step badges */
.steps-row {
    display: flex;
    gap: 0.75rem;
    justify-content: center;
    padding: 1rem 0 0.5rem;
}
.step-badge {
    background: rgba(255,255,255,0.05);
    border: 1px solid rgba(255,255,255,0.1);
    border-radius: 20px;
    padding: 0.3rem 0.9rem;
    color: rgba(255,255,255,0.5);
    font-size: 0.78rem;
    font-weight: 500;
}
"""
# Static banner rendered at the top of the page via gr.HTML; the classes it
# uses (app-header, steps-row, step-badge) are styled in CUSTOM_CSS.
# NOTE(review): the heading emoji was recovered from its mis-encoded bytes,
# but the glyphs that prefixed each step badge were not recoverable — the
# circled step numbers below are a stand-in; confirm against the original.
HEADER_HTML = """
<div class="app-header">
    <h1>🎵 aMuseMe</h1>
    <p>Drop a song. Watch your lyrics come alive with AI-powered kinetic typography and AI-generated backgrounds.</p>
    <div class="steps-row">
        <span class="step-badge">① Upload Audio</span>
        <span class="step-badge">② Whisper AI Syncs</span>
        <span class="step-badge">③ AI Storyboard Backgrounds</span>
        <span class="step-badge">④ Kinetic Typography Video</span>
    </div>
</div>
"""
# Help text shown on the Demucs checkbox in each of its two states. Defined
# once so the initial component and the change handler cannot drift apart.
_DEMUCS_LOCKED_INFO = "Disabled because Condition on Previous Text is ON (prevents infinite loops)."
_DEMUCS_UNLOCKED_INFO = (
    "Isolates vocals as a preprocessing step. Only enable this if vocals are "
    "not clearly audible and are buried under instruments."
)


def enforce_safe_params(cond_prev):
    """Return the update for the Demucs checkbox given the conditioning flag.

    While "Condition on Previous Text" is on, Demucs is forced off and made
    read-only; turning conditioning off makes the checkbox editable again
    (its current value is left untouched).
    """
    if cond_prev:
        return gr.update(value=False, interactive=False, info=_DEMUCS_LOCKED_INFO)
    return gr.update(interactive=True, info=_DEMUCS_UNLOCKED_INFO)


# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been stripped. In particular, placing the output column
# inside the same row as the two input columns (rather than below it) is an
# assumption — confirm against the intended layout.
with gr.Blocks(title="aMuseMe — AI Lyric Video Generator") as demo:
    gr.HTML(HEADER_HTML)

    with gr.Row():
        # Column 1: audio upload, sample songs and the generate button.
        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown(
                "**1. Upload a song** — Whisper transcribes the vocals and times each "
                "word to drive the lyric video below."
            )
            audio_input = gr.Audio(
                label="Audio File (song with clear vocals, MP3/WAV)",
                type="filepath",
                sources=["upload"],
            )
            gr.Examples(
                examples=[
                    "assets/samples/ride_like_the_ind_test_song.mp3",
                    "assets/samples/hollow-song-test.mp3",
                ],
                inputs=audio_input,
                label="Try a sample song",
            )
            generate_btn = gr.Button(
                "✨ Generate Lyric Video",
                elem_classes=["generate-btn"],
                variant="primary",
            )
            gr.Markdown(
                "Runs the full pipeline: transcribe lyrics → generate AI storyboard "
                "backgrounds → render kinetic typography → assemble the video "
                "(~30–90s depending on song length)."
            )

        # Column 2: look-and-feel options plus advanced transcription settings.
        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown("**2. Choose how the lyrics look**")
            theme_input = gr.Dropdown(
                label="Visual Theme",
                choices=list(THEMES.keys()),
                value="Neon",
                info="Sets the on-screen lyric text color: Dark = white, Light = warm gold, Neon = cyan glow. AI backgrounds are always slightly darkened, so pick whichever color reads best against your Visual Prompt.",
            )
            font_input = gr.Dropdown(
                label="Lyric Font",
                choices=list(FONT_FAMILIES.keys()),
                value="Serif (Bold)",
                info="Typeface used for the on-screen lyrics. Bold sans-serif suits most songs; try Serif or Monospace for a different look.",
            )
            visual_prompt_input = gr.Textbox(
                label="Visual Prompt",
                placeholder="e.g. mystical forest, glowing particles, cinematic, digital art, 8k",
                value="neon-lit futuristic city at night, vibrant glowing colors, cyberpunk aesthetic, energetic atmosphere, beautiful starry sky, digital art, highly detailed",
                info="Describes the look of the AI-generated backgrounds (and gives the lyric-timing model a sense of the visual mood).",
                lines=2,
            )

            with gr.Accordion("Advanced Settings", open=False):
                gr.Markdown(
                    "**Recommendations:**\n"
                    "- **Best Default:** Condition on Previous Text **ON**, VAD **ON**, Demucs **OFF**. (Best for most pop/vocal tracks).\n"
                    "- **Heavily Instrumental Songs:** If vocals are very quiet or buried under loud instruments, turn Condition on Previous Text **OFF**, and turn Demucs **ON**.\n"
                    "- ⚠️ **WARNING:** Not recommended to use **Demucs ON + Condition ON** together! It may cause infinite hallucination loops during instrumental breaks."
                )
                cond_prev_input = gr.Checkbox(
                    label="Condition on Previous Text",
                    value=True,
                    info="Helps Whisper understand context by feeding it previous lines. Improves word accuracy but can cause loops if not anchored.",
                )
                use_vad_input = gr.Checkbox(
                    label="Use VAD (Voice Activity Detection) Filter",
                    value=True,
                    info="Mutes audio completely when no singing is detected. Very helpful to prevent hallucinations during long instrumental solos.",
                )
                # Starts locked because conditioning defaults to ON above.
                use_demucs_input = gr.Checkbox(
                    label="Use Demucs Vocal Separation",
                    value=False,
                    interactive=False,
                    info=_DEMUCS_LOCKED_INFO,
                )
                model_input = gr.Dropdown(
                    label="Whisper Model",
                    choices=["large-v3", "large-v3-turbo", "medium", "small", "base"],
                    value="large-v3",
                    info="Larger models are more accurate but take longer to process.",
                )

            # Lock/unlock the Demucs checkbox whenever conditioning is toggled.
            cond_prev_input.change(
                fn=enforce_safe_params,
                inputs=[cond_prev_input],
                outputs=[use_demucs_input],
            )

        # Column 3: the rendered video.
        with gr.Column(scale=1, elem_classes=["panel"]):
            video_output = gr.Video(
                label="Your Lyric Video (preview and download here)",
                interactive=False,
                height=360,
            )

    # Written as explicit lines rather than an indented triple-quoted string
    # so the rendered Markdown does not depend on source indentation.
    gr.Markdown(
        "**Tips:**\n"
        "- Best with clear vocals (ballads, pop, spoken word)\n"
        "- Describe the visuals you want in the Visual Prompt — it shapes both the AI backgrounds and the on-screen mood\n"
        "- Try different Visual Themes and Fonts to match your song's vibe\n"
        "- Processing takes ~30–90s depending on song length",
        elem_classes=["panel"],
    )

    # The input order must match generate_video's positional parameters.
    generate_btn.click(
        fn=generate_video,
        inputs=[audio_input, theme_input, font_input, visual_prompt_input, model_input, use_demucs_input, cond_prev_input, use_vad_input],
        outputs=[video_output],
        api_visibility="public",
    )


if __name__ == "__main__":
    demo.launch(css=CUSTOM_CSS)