Spaces:

build-small-hackathon
/

aMuseMe

Running on Zero

File size: 14,068 Bytes

"""
app.py — Gradio UI entry point for aMuseMe
"""
import sys
from pathlib import Path

import gradio as gr

# Directory named "src" located next to this file.
SRC_DIR = Path(__file__).parent / "src"
# Put it at the front of sys.path (only once) before the `amuseme` imports
# below run — presumably the `amuseme` package lives under src/, which is why
# these imports come after the path tweak instead of at the top of the file.
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from amuseme.transcriber import transcribe
from amuseme.renderer import render_frames
# NOTE(review): DEFAULT_FONT_FAMILY is imported but not referenced anywhere in
# the visible part of this file — confirm whether the font dropdown default
# was meant to use it.
from amuseme.animations import THEME_COLORS as THEMES, FONT_FAMILIES, DEFAULT_FONT_FAMILY
from amuseme.video_assembler import assemble
from amuseme.logger import get_logger

# Module-level logger shared by the pipeline and start-up code in this file.
logger = get_logger("app")

# Try to import spaces for ZeroGPU; gracefully degrade locally
try:
    import spaces
    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False

# Hub repositories fetched at start-up when running with the `spaces` package
# available, so the downloads do not eat into the GPU-allocated time window.
_PREDOWNLOAD_REPOS = (
    "Systran/faster-whisper-large-v3",
    "openbmb/MiniCPM5-1B",
    "stabilityai/sd-turbo",
)

if HAS_SPACES:
    from huggingface_hub import snapshot_download

    logger.info("HF Space detected. Pre-downloading heavy models to avoid ZeroGPU timeout...")
    # Each repository is handled on its own: a failure for one model must not
    # prevent the remaining models from being pre-downloaded. Failures stay
    # non-fatal (best effort) and are only logged.
    _failed_repos = []
    for _repo_id in _PREDOWNLOAD_REPOS:
        try:
            snapshot_download(repo_id=_repo_id)
        except Exception as e:
            _failed_repos.append(_repo_id)
            logger.warning(f"Pre-download of {_repo_id} failed (will retry during runtime): {e}")
    if not _failed_repos:
        logger.info("Model pre-download complete!")


def _gpu_transcribe(audio_path: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool, theme: str, visual_prompt: str):
    """Forward the UI settings to ``transcribe`` and return its result.

    Exists as a separate function so it can be wrapped with ``spaces.GPU``
    below when the ``spaces`` package is available. ``cond_prev`` is passed
    on as ``condition_on_previous_text``; every other argument keeps its name.
    """
    options = {
        "model_size": model_size,
        "use_demucs": use_demucs,
        "condition_on_previous_text": cond_prev,
        "use_vad": use_vad,
        "theme": theme,
        "visual_prompt": visual_prompt,
    }
    return transcribe(audio_path, **options)


# With `spaces` importable, run the transcription step under the ZeroGPU
# decorator (requested duration: 120); otherwise the plain function is used.
if HAS_SPACES:
    _zero_gpu = spaces.GPU(duration=120)
    _gpu_transcribe = _zero_gpu(_gpu_transcribe)


def _probe_audio_duration(audio_path: str) -> float:
    """Return the duration of ``audio_path`` in seconds as reported by ffprobe.

    Raises:
        gr.Error: if ffprobe cannot be run, exits with an error, prints
            something that is not the expected JSON, or reports a duration
            that is missing, non-numeric or not a positive finite number.
    """
    import json
    import subprocess

    try:
        probe = subprocess.run(
            ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", audio_path],
            capture_output=True, text=True, check=True,
        )
        duration = float(json.loads(probe.stdout)["format"]["duration"])
    except (OSError, subprocess.CalledProcessError, ValueError, KeyError, TypeError) as e:
        # OSError: ffprobe missing / not executable. CalledProcessError: ffprobe
        # rejected the file. ValueError covers invalid JSON and non-numeric
        # durations; KeyError/TypeError cover an unexpected JSON shape.
        logger.error(f"ffprobe could not determine the duration of {audio_path}: {e!r}")
        raise gr.Error(
            "Could not determine the audio duration (ffprobe failed). "
            "Please try a different MP3/WAV file."
        ) from e

    # Written as a chained comparison so NaN is rejected as well.
    if not (0 < duration < float("inf")):
        raise gr.Error("The audio file reports an invalid duration. Please try a different MP3/WAV file.")
    return duration


def _build_background_prompts(frames, visual_prompt: str) -> list:
    """Build one background prompt per pair of consecutive lyric frames.

    The words of (up to) two consecutive frames are joined into a single line
    of text, and the user's visual prompt is appended after a comma. Empty
    parts are left out, so a blank visual prompt no longer leaves a dangling
    ``", "`` and a ``None`` prompt is treated as blank.
    """
    style = (visual_prompt or "").strip()
    prompts = []
    for start in range(0, len(frames), 2):
        pair = frames[start:start + 2]
        line_text = " ".join(
            " ".join(w.text for w in fr.words) for fr in pair
        ).strip()
        prompts.append(", ".join(part for part in (line_text, style) if part))
    return prompts


def generate_video(audio_path: str, theme: str, font_family: str, visual_prompt: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool) -> str:
    """Run the full lyric-video pipeline for one uploaded audio file.

    Steps: probe the audio duration, transcribe the audio into lyric frames,
    generate storyboard backgrounds (best effort), render the frames and
    assemble the final video.

    Args:
        audio_path: Path of the uploaded audio file.
        theme: Theme name passed to the transcriber and the renderer.
        font_family: Font family name passed to the renderer.
        visual_prompt: Free-text description used for the background prompts
            and also passed to the transcriber.
        model_size: Whisper model identifier passed to the transcriber.
        use_demucs: Forwarded to the transcriber.
        cond_prev: Forwarded to the transcriber as
            ``condition_on_previous_text``.
        use_vad: Forwarded to the transcriber.

    Returns:
        The value returned by ``assemble`` (the output video path).

    Raises:
        gr.Error: if no audio was provided, the audio duration cannot be
            determined, or transcription yields no frames.
    """
    import time

    # Covers both "nothing uploaded" (None) and an empty path string.
    if not audio_path:
        raise gr.Error("Please upload an audio file.")

    pipeline_t0 = time.time()
    logger.info(
        "===== PIPELINE START =====\n"
        f"  audio={audio_path}  theme={theme}  font={font_family}  visual_prompt={visual_prompt!r}\n"
        f"  model_size={model_size}  demucs={use_demucs}  "
        f"cond_prev={cond_prev}  vad={use_vad}"
    )

    # Probe the duration up front: an unreadable file or a missing ffprobe
    # should fail here, before any transcription / image generation time is
    # spent, rather than after those steps have already run.
    duration = _probe_audio_duration(audio_path)

    # Step 1: Transcribe + frame-metadata (Whisper + MiniCPM5-1B). Detailed
    # input/output for these models is logged inside transcribe().
    logger.info("[Step 1/4] Transcribing audio + generating frame metadata...")
    t0 = time.time()
    frames = _gpu_transcribe(audio_path, model_size, use_demucs, cond_prev, use_vad, theme, visual_prompt)
    if not frames:
        raise gr.Error("Could not extract words from audio. Try a cleaner recording.")
    logger.info(f"[Step 1/4] Done in {time.time() - t0:.1f}s — {len(frames)} frames.")

    # Step 2: Generate AI storyboard backgrounds — one image per pair of lyric
    # lines, so the backdrop changes less often than the on-screen text
    # (renderer expands each image to cover two consecutive lyric frames).
    logger.info("[Step 2/4] Generating AI storyboard backgrounds...")
    t0 = time.time()
    prompts = _build_background_prompts(frames, visual_prompt)
    logger.info(f"[Step 2/4] Background prompts ({len(prompts)}):\n  " + "\n  ".join(prompts))

    # Backgrounds are optional: any failure here (including a failing import
    # of the generator module) is logged and rendering continues without them.
    bg_images = None
    try:
        from amuseme.bg_generator import generate_storyboard
        bg_images = generate_storyboard(prompts) or None
        logger.info(f"[Step 2/4] Done in {time.time() - t0:.1f}s — {len(bg_images or [])} image(s).")
    except Exception as e:
        logger.error(f"[Step 2/4] Error generating backgrounds: {e}")
        bg_images = None

    # Step 3: Render frames for the full audio duration.
    logger.info(f"[Step 3/4] Rendering frames — audio duration={duration:.1f}s, {len(frames)} lyric frames...")
    t0 = time.time()
    frames_gen = render_frames(frames, duration, theme_name=theme, bg_images=bg_images, font_family=font_family)

    # Step 4: Assemble the video. t0 is not reset here, so the reported time
    # spans rendering and assembly together.
    logger.info("[Step 4/4] Assembling video via FFmpeg...")
    out_path = assemble(frames_gen, audio_path)
    logger.info(
        f"[Step 4/4] Done in {time.time() - t0:.1f}s — output={out_path}\n"
        f"===== PIPELINE COMPLETE in {time.time() - pipeline_t0:.1f}s ====="
    )

    return out_path



# ─── Gradio UI ─────────────────────────────────────────────────────────────

# Stylesheet for the app, passed to demo.launch(css=...) at the bottom of this
# file. It styles the page body, the header block emitted by HEADER_HTML
# (.app-header, .steps-row, .step-badge), and the "panel" / "generate-btn"
# classes that the layout attaches to components via elem_classes.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap');

body, .gradio-container {
    font-family: 'Inter', sans-serif !important;
    background: #090910 !important;
}

.gradio-container {
    max-width: 900px !important;
    margin: 0 auto !important;
}

/* Header */
.app-header {
    text-align: center;
    padding: 2.5rem 1rem 1.5rem;
    background: linear-gradient(135deg, #0f0f1a 0%, #1a0a2e 100%);
    border-radius: 16px;
    margin-bottom: 1.5rem;
    border: 1px solid rgba(255,255,255,0.06);
}
.app-header h1 {
    font-size: 3rem;
    font-weight: 700;
    background: linear-gradient(135deg, #a78bfa, #60a5fa, #34d399);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    margin: 0 0 0.4rem;
    letter-spacing: -1px;
}
.app-header p {
    color: rgba(255,255,255,0.55);
    font-size: 1rem;
    margin: 0;
}

/* Panel */
.panel {
    background: #0f0f1a !important;
    border: 1px solid rgba(255,255,255,0.08) !important;
    border-radius: 12px !important;
}

/* Labels */
label span {
    color: rgba(255,255,255,0.75) !important;
    font-weight: 500 !important;
    font-size: 0.85rem !important;
    text-transform: uppercase !important;
    letter-spacing: 0.05em !important;
}

/* Inputs */
textarea, input[type="text"] {
    background: #1a1a2e !important;
    border: 1px solid rgba(255,255,255,0.1) !important;
    border-radius: 8px !important;
    color: #e0e0ff !important;
}

/* Generate button */
.generate-btn {
    background: linear-gradient(135deg, #7c3aed, #2563eb) !important;
    border: none !important;
    border-radius: 10px !important;
    color: white !important;
    font-weight: 600 !important;
    font-size: 1rem !important;
    padding: 0.75rem 2rem !important;
    width: 100% !important;
    transition: opacity 0.2s ease !important;
    cursor: pointer !important;
}
.generate-btn:hover {
    opacity: 0.9 !important;
}

/* Step badges */
.steps-row {
    display: flex;
    gap: 0.75rem;
    justify-content: center;
    padding: 1rem 0 0.5rem;
}
.step-badge {
    background: rgba(255,255,255,0.05);
    border: 1px solid rgba(255,255,255,0.1);
    border-radius: 20px;
    padding: 0.3rem 0.9rem;
    color: rgba(255,255,255,0.5);
    font-size: 0.78rem;
    font-weight: 500;
}
"""

# Static banner rendered at the top of the page through gr.HTML: app title,
# one-line tagline and a row of badges naming the pipeline stages. The class
# names used here (.app-header, .steps-row, .step-badge) are styled in
# CUSTOM_CSS.
HEADER_HTML = """
<div class="app-header">
    <h1>🎵 aMuseMe</h1>
    <p>Drop a song. Watch your lyrics come alive with AI-powered kinetic typography and AI-generated backgrounds.</p>
    <div class="steps-row">
        <span class="step-badge">① Upload Audio</span>
        <span class="step-badge">→ Whisper AI Syncs</span>
        <span class="step-badge">→ AI Storyboard Backgrounds</span>
        <span class="step-badge">→ Kinetic Typography Video</span>
    </div>
</div>
"""

# Page layout: header banner, then one row with three columns —
# (1) audio upload + generate button, (2) look-and-feel and transcription
# settings, (3) the resulting video and usage tips.
with gr.Blocks(title="aMuseMe — AI Lyric Video Generator") as demo:
    gr.HTML(HEADER_HTML)

    with gr.Row():
        # Column 1: audio input, sample songs and the generate button.
        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown(
                "**1. Upload a song** — Whisper transcribes the vocals and times each "
                "word to drive the lyric video below."
            )
            # type="filepath" hands generate_video a path string (or None when
            # nothing is uploaded); only file upload is offered as a source.
            audio_input = gr.Audio(
                label="Audio File (song with clear vocals, MP3/WAV)",
                type="filepath",
                sources=["upload"],
            )
            # Sample paths are relative — presumably resolved against the
            # working directory the app is started from; verify the assets
            # are shipped with the deployment.
            gr.Examples(
                examples=[
                    "assets/samples/ride_like_the_ind_test_song.mp3",
                    "assets/samples/hollow-song-test.mp3"
                ],
                inputs=audio_input,
                label="Try a sample song"
            )

            generate_btn = gr.Button(
                "✨ Generate Lyric Video",
                elem_classes=["generate-btn"],
                variant="primary",
            )
            gr.Markdown(
                "Runs the full pipeline: transcribe lyrics → generate AI storyboard "
                "backgrounds → render kinetic typography → assemble the video "
                "(~30–90s depending on song length)."
            )

        # Column 2: visual settings plus advanced transcription options.
        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown("**2. Choose how the lyrics look**")
            # Choices come from the THEMES mapping's keys; the hard-coded
            # default "Neon" must be one of them.
            theme_input = gr.Dropdown(
                label="Visual Theme",
                choices=list(THEMES.keys()),
                value="Neon",
                info="Sets the on-screen lyric text color: Dark = white, Light = warm gold, Neon = cyan glow. AI backgrounds are always slightly darkened, so pick whichever color reads best against your Visual Prompt.",
            )
            # NOTE(review): the default is hard-coded to "Serif (Bold)" while
            # DEFAULT_FONT_FAMILY is imported at the top of the file and never
            # used here — confirm which default is intended and that
            # "Serif (Bold)" is a key of FONT_FAMILIES.
            font_input = gr.Dropdown(
                label="Lyric Font",
                choices=list(FONT_FAMILIES.keys()),
                value="Serif (Bold)",
                info="Typeface used for the on-screen lyrics. Bold sans-serif suits most songs; try Serif or Monospace for a different look.",
            )
            visual_prompt_input = gr.Textbox(
                label="Visual Prompt",
                placeholder="e.g. mystical forest, glowing particles, cinematic, digital art, 8k",
                value="neon-lit futuristic city at night, vibrant glowing colors, cyberpunk aesthetic, energetic atmosphere, beautiful starry sky, digital art, highly detailed",
                info="Describes the look of the AI-generated backgrounds (and gives the lyric-timing model a sense of the visual mood).",
                lines=2,
            )

            with gr.Accordion("Advanced Settings", open=False):
                gr.Markdown(
                    "**Recommendations:**\n"
                    "- **Best Default:** Condition on Previous Text **ON**, VAD **ON**, Demucs **OFF**. (Best for most pop/vocal tracks).\n"
                    "- **Heavily Instrumental Songs:** If vocals are very quiet or buried under loud instruments, turn Condition on Previous Text **OFF**, and turn Demucs **ON**.\n"
                    "- ⚠️ **WARNING:** Not recommended to use **Demucs ON + Condition ON** together! It may cause infinite hallucination loops during instrumental breaks."
                )
                cond_prev_input = gr.Checkbox(
                    label="Condition on Previous Text",
                    value=True,
                    info="Helps Whisper understand context by feeding it previous lines. Improves word accuracy but can cause loops if not anchored."
                )
                use_vad_input = gr.Checkbox(
                    label="Use VAD (Voice Activity Detection) Filter",
                    value=True,
                    info="Mutes audio completely when no singing is detected. Very helpful to prevent hallucinations during long instrumental solos."
                )
                # Starts unchecked and locked because "Condition on Previous
                # Text" starts checked; enforce_safe_params below keeps the
                # two checkboxes in sync afterwards.
                use_demucs_input = gr.Checkbox(
                    label="Use Demucs Vocal Separation",
                    value=False,
                    interactive=False,
                    info="Disabled because Condition on Previous Text is ON (prevents infinite loops)."
                )
                model_input = gr.Dropdown(
                    label="Whisper Model",
                    choices=["large-v3", "large-v3-turbo", "medium", "small", "base"],
                    value="large-v3",
                    info="Larger models are more accurate but take longer to process."
                )

            def enforce_safe_params(cond_prev: bool):
                """Return the update for the Demucs checkbox given the "Condition on Previous Text" state.

                When conditioning is on, Demucs is forced off and locked; when
                it is off, the checkbox becomes editable again (its current
                value is left untouched) and its help text is swapped.
                """
                if cond_prev:
                    return gr.update(value=False, interactive=False, info="Disabled because Condition on Previous Text is ON (prevents infinite loops). ")
                else:
                    return gr.update(interactive=True, info="Isolates vocals as a preprocessing step. Only enable this if vocals are not clearly audible and are buried under instruments.")

            # Re-evaluate the Demucs checkbox whenever the conditioning
            # checkbox changes.
            cond_prev_input.change(
                fn=enforce_safe_params,
                inputs=[cond_prev_input],
                outputs=[use_demucs_input]
            )

        # Column 3: generated video and usage tips.
        with gr.Column(scale=1, elem_classes=["panel"]):
            video_output = gr.Video(
                label="Your Lyric Video (preview and download here)",
                interactive=False,
                height=360,
            )
            gr.Markdown(
                """
                **Tips:**
                - Best with clear vocals (ballads, pop, spoken word)
                - Describe the visuals you want in the Visual Prompt — it shapes both the AI backgrounds and the on-screen mood
                - Try different Visual Themes and Fonts to match your song's vibe
                - Processing takes ~30–90s depending on song length
                """,
                elem_classes=["panel"],
            )

    # The order of `inputs` must match generate_video's positional parameters:
    # audio_path, theme, font_family, visual_prompt, model_size, use_demucs,
    # cond_prev, use_vad.
    generate_btn.click(
        fn=generate_video,
        inputs=[audio_input, theme_input, font_input, visual_prompt_input, model_input, use_demucs_input, cond_prev_input, use_vad_input],
        outputs=[video_output],
        api_visibility="public",
    )


# Start the Gradio server only when this file is executed as a script; the
# custom stylesheet is supplied at launch time.
if __name__ == "__main__":
    demo.launch(css=CUSTOM_CSS)