File size: 14,068 Bytes
08ab8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ff66b59
 
 
 
 
 
 
 
 
 
 
08ab8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
"""
app.py — Gradio UI entry point for aMuseMe
"""
import sys
from pathlib import Path

import gradio as gr

# Put the sibling ``src`` directory at the front of the import path (once) so
# the ``amuseme`` imports below resolve when this file is run directly.
SRC_DIR = Path(__file__).parent / "src"
_src_dir_entry = str(SRC_DIR)
if _src_dir_entry not in sys.path:
    sys.path.insert(0, _src_dir_entry)

from amuseme.transcriber import transcribe
from amuseme.renderer import render_frames
from amuseme.animations import THEME_COLORS as THEMES, FONT_FAMILIES, DEFAULT_FONT_FAMILY
from amuseme.video_assembler import assemble
from amuseme.logger import get_logger

# Module-wide logger obtained from the project's logging helper under the name "app".
logger = get_logger("app")

# ZeroGPU support is optional: if the ``spaces`` package cannot be imported we
# fall back to running without it (e.g. on a local machine).
try:
    import spaces
except ImportError:
    HAS_SPACES = False
else:
    HAS_SPACES = True

if HAS_SPACES:
    from huggingface_hub import snapshot_download

    logger.info("HF Space detected. Pre-downloading heavy models to avoid ZeroGPU timeout...")
    try:
        # Fetch the repos in a fixed order; the first failure aborts the rest
        # and is only logged, so startup is never blocked by a download error.
        for _repo_id in (
            "Systran/faster-whisper-large-v3",
            "openbmb/MiniCPM5-1B",
            "stabilityai/sd-turbo",
        ):
            snapshot_download(repo_id=_repo_id)
        logger.info("Model pre-download complete!")
    except Exception as e:
        logger.warning(f"Pre-download failed (will retry during runtime): {e}")

def _gpu_transcribe(
    audio_path: str,
    model_size: str,
    use_demucs: bool,
    cond_prev: bool,
    use_vad: bool,
    theme: str,
    visual_prompt: str,
):
    """Forward the UI settings to ``transcribe`` and return its result.

    Kept as a separate module-level function so it can be wrapped with the
    ZeroGPU decorator below when the ``spaces`` package is available.
    ``cond_prev`` is passed on as ``condition_on_previous_text``; every other
    argument is forwarded under its own name.
    """
    options = {
        "model_size": model_size,
        "use_demucs": use_demucs,
        "condition_on_previous_text": cond_prev,
        "use_vad": use_vad,
        "theme": theme,
        "visual_prompt": visual_prompt,
    }
    return transcribe(audio_path, **options)


# On a Space, rebind the function to its GPU-decorated version (duration=120).
if HAS_SPACES:
    _gpu_transcribe = spaces.GPU(duration=120)(_gpu_transcribe)


def _build_background_prompts(frames, visual_prompt: str) -> list:
    """Return one background prompt per pair of consecutive lyric frames.

    Each prompt is the pair's lyric text followed by ``visual_prompt``; when a
    pair has no text, the prompt is ``visual_prompt`` alone.
    """
    prompts = []
    for start in range(0, len(frames), 2):
        line_text = " ".join(
            " ".join(w.text for w in fr.words) for fr in frames[start:start + 2]
        ).strip()
        # Combine the dynamic lyric text with the user's visual prompt
        prompts.append(f"{line_text}, {visual_prompt}" if line_text else visual_prompt)
    return prompts


def _probe_audio_duration(audio_path: str) -> float:
    """Return the duration reported by ``ffprobe`` for ``audio_path``.

    Raises:
        gr.Error: if ffprobe cannot be started or its output does not contain
            a parseable ``format.duration`` value.
    """
    import json
    import subprocess

    try:
        probe = subprocess.run(
            ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", audio_path],
            capture_output=True, text=True
        )
        return float(json.loads(probe.stdout)["format"]["duration"])
    except (OSError, ValueError, KeyError, TypeError) as e:
        # OSError: ffprobe missing / not executable. ValueError covers
        # json.JSONDecodeError (empty or invalid output) and a non-numeric
        # duration. KeyError/TypeError: JSON without format.duration.
        logger.error(f"[Step 3/4] Could not determine audio duration via ffprobe: {e!r}")
        raise gr.Error(
            "Could not read the audio duration. Please upload a valid audio file."
        ) from e


def generate_video(audio_path: str, theme: str, font_family: str, visual_prompt: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool) -> str:
    """Run the full lyric-video pipeline and return the assembled video path.

    Steps: transcribe the audio into lyric frames, generate storyboard
    backgrounds (best effort), probe the audio duration, then render frames
    and assemble the final video.

    Args:
        audio_path: Path of the uploaded audio file, or ``None`` if nothing
            was uploaded.
        theme: Theme name passed to the transcriber and the renderer.
        font_family: Font family name passed to the renderer.
        visual_prompt: Text appended to every background prompt and also
            passed to the transcriber.
        model_size: Whisper model size passed to the transcriber.
        use_demucs: Passed through to the transcriber.
        cond_prev: Passed to the transcriber as ``condition_on_previous_text``.
        use_vad: Passed through to the transcriber.

    Returns:
        Whatever ``assemble`` returns (logged as the output path).

    Raises:
        gr.Error: if no audio was provided, transcription produced no frames,
            or the audio duration could not be determined.
    """
    import time

    if audio_path is None:
        raise gr.Error("Please upload an audio file.")

    pipeline_t0 = time.time()
    logger.info(
        "===== PIPELINE START =====\n"
        f"  audio={audio_path}  theme={theme}  font={font_family}  visual_prompt={visual_prompt!r}\n"
        f"  model_size={model_size}  demucs={use_demucs}  "
        f"cond_prev={cond_prev}  vad={use_vad}"
    )

    # Step 1: Transcribe + frame-metadata (Whisper + MiniCPM5-1B). Detailed
    # input/output for these models is logged inside transcribe().
    logger.info("[Step 1/4] Transcribing audio + generating frame metadata...")
    t0 = time.time()
    frames = _gpu_transcribe(audio_path, model_size, use_demucs, cond_prev, use_vad, theme, visual_prompt)
    if not frames:
        raise gr.Error("Could not extract words from audio. Try a cleaner recording.")
    logger.info(f"[Step 1/4] Done in {time.time() - t0:.1f}s — {len(frames)} frames.")

    # Step 2: Generate AI storyboard backgrounds — one image per pair of lyric
    # lines, so the backdrop changes less often than the on-screen text
    # (renderer expands each image to cover two consecutive lyric frames).
    # ``frames`` is guaranteed non-empty here because of the guard above.
    logger.info("[Step 2/4] Generating AI storyboard backgrounds...")
    t0 = time.time()
    prompts = _build_background_prompts(frames, visual_prompt)
    logger.info(f"[Step 2/4] Background prompts ({len(prompts)}):\n  " + "\n  ".join(prompts))
    bg_images = None
    try:
        # Imported lazily so a missing/broken background generator only
        # disables backgrounds instead of breaking the whole app.
        from amuseme.bg_generator import generate_storyboard
        bg_images = generate_storyboard(prompts) or None
        logger.info(f"[Step 2/4] Done in {time.time() - t0:.1f}s — {len(bg_images or [])} image(s).")
    except Exception as e:
        # Backgrounds are optional: log the failure and continue without them.
        logger.error(f"[Step 2/4] Error generating backgrounds: {e}")
        bg_images = None

    # Step 3: Get audio duration via ffprobe
    duration = _probe_audio_duration(audio_path)
    logger.info(f"[Step 3/4] Rendering frames — audio duration={duration:.1f}s, {len(frames)} lyric frames...")
    # This timer spans both rendering and assembly; it is reported in the
    # Step 4 completion message below.
    t0 = time.time()
    frames_gen = render_frames(frames, duration, theme_name=theme, bg_images=bg_images, font_family=font_family)

    logger.info("[Step 4/4] Assembling video via FFmpeg...")
    out_path = assemble(frames_gen, audio_path)
    logger.info(
        f"[Step 4/4] Done in {time.time() - t0:.1f}s — output={out_path}\n"
        f"===== PIPELINE COMPLETE in {time.time() - pipeline_t0:.1f}s ====="
    )

    return out_path



# ─── Gradio UI ─────────────────────────────────────────────────────────────

# Stylesheet for the app, passed to ``demo.launch(css=...)`` at the bottom of
# this file. Besides global overrides (body, labels, text inputs) it defines
# the classes referenced elsewhere in this module: ``.app-header``,
# ``.steps-row`` and ``.step-badge`` (used in HEADER_HTML), ``.panel`` and
# ``.generate-btn`` (attached via ``elem_classes`` in the Blocks layout).
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap');

body, .gradio-container {
    font-family: 'Inter', sans-serif !important;
    background: #090910 !important;
}

.gradio-container {
    max-width: 900px !important;
    margin: 0 auto !important;
}

/* Header */
.app-header {
    text-align: center;
    padding: 2.5rem 1rem 1.5rem;
    background: linear-gradient(135deg, #0f0f1a 0%, #1a0a2e 100%);
    border-radius: 16px;
    margin-bottom: 1.5rem;
    border: 1px solid rgba(255,255,255,0.06);
}
.app-header h1 {
    font-size: 3rem;
    font-weight: 700;
    background: linear-gradient(135deg, #a78bfa, #60a5fa, #34d399);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    margin: 0 0 0.4rem;
    letter-spacing: -1px;
}
.app-header p {
    color: rgba(255,255,255,0.55);
    font-size: 1rem;
    margin: 0;
}

/* Panel */
.panel {
    background: #0f0f1a !important;
    border: 1px solid rgba(255,255,255,0.08) !important;
    border-radius: 12px !important;
}

/* Labels */
label span {
    color: rgba(255,255,255,0.75) !important;
    font-weight: 500 !important;
    font-size: 0.85rem !important;
    text-transform: uppercase !important;
    letter-spacing: 0.05em !important;
}

/* Inputs */
textarea, input[type="text"] {
    background: #1a1a2e !important;
    border: 1px solid rgba(255,255,255,0.1) !important;
    border-radius: 8px !important;
    color: #e0e0ff !important;
}

/* Generate button */
.generate-btn {
    background: linear-gradient(135deg, #7c3aed, #2563eb) !important;
    border: none !important;
    border-radius: 10px !important;
    color: white !important;
    font-weight: 600 !important;
    font-size: 1rem !important;
    padding: 0.75rem 2rem !important;
    width: 100% !important;
    transition: opacity 0.2s ease !important;
    cursor: pointer !important;
}
.generate-btn:hover {
    opacity: 0.9 !important;
}

/* Step badges */
.steps-row {
    display: flex;
    gap: 0.75rem;
    justify-content: center;
    padding: 1rem 0 0.5rem;
}
.step-badge {
    background: rgba(255,255,255,0.05);
    border: 1px solid rgba(255,255,255,0.1);
    border-radius: 20px;
    padding: 0.3rem 0.9rem;
    color: rgba(255,255,255,0.5);
    font-size: 0.78rem;
    font-weight: 500;
}
"""

# Banner rendered at the top of the page via ``gr.HTML``: the app title, a
# one-line pitch and four "step" badges. Styled by the ``.app-header``,
# ``.steps-row`` and ``.step-badge`` rules in CUSTOM_CSS.
# The emoji, circled digit and arrows were stored as mis-decoded UTF-8
# (mojibake) and have been restored to the intended characters.
# NOTE(review): the title emoji is decoded byte-for-byte as a ferris wheel;
# confirm that a music note was not intended instead.
HEADER_HTML = """
<div class="app-header">
    <h1>🎡 aMuseMe</h1>
    <p>Drop a song. Watch your lyrics come alive with AI-powered kinetic typography and AI-generated backgrounds.</p>
    <div class="steps-row">
        <span class="step-badge">① Upload Audio</span>
        <span class="step-badge">→ Whisper AI Syncs</span>
        <span class="step-badge">→ AI Storyboard Backgrounds</span>
        <span class="step-badge">→ Kinetic Typography Video</span>
    </div>
</div>
"""

# Page layout: header banner, then one row with three panels — upload and
# generate (left), look-and-feel plus advanced transcription settings
# (middle), and the resulting video with tips (right).
# User-facing strings in this block previously contained mis-decoded UTF-8
# (mojibake); the dashes, arrows and emoji have been restored.
with gr.Blocks(title="aMuseMe — AI Lyric Video Generator") as demo:
    gr.HTML(HEADER_HTML)

    with gr.Row():
        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown(
                "**1. Upload a song** — Whisper transcribes the vocals and times each "
                "word to drive the lyric video below."
            )
            audio_input = gr.Audio(
                label="Audio File (song with clear vocals, MP3/WAV)",
                type="filepath",
                sources=["upload"],
            )
            gr.Examples(
                examples=[
                    "assets/samples/ride_like_the_ind_test_song.mp3",
                    "assets/samples/hollow-song-test.mp3"
                ],
                inputs=audio_input,
                label="Try a sample song"
            )

            generate_btn = gr.Button(
                "✨ Generate Lyric Video",
                elem_classes=["generate-btn"],
                variant="primary",
            )
            gr.Markdown(
                "Runs the full pipeline: transcribe lyrics → generate AI storyboard "
                "backgrounds → render kinetic typography → assemble the video "
                "(~30–90s depending on song length)."
            )

        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown("**2. Choose how the lyrics look**")
            theme_input = gr.Dropdown(
                label="Visual Theme",
                choices=list(THEMES.keys()),
                value="Neon",
                info="Sets the on-screen lyric text color: Dark = white, Light = warm gold, Neon = cyan glow. AI backgrounds are always slightly darkened, so pick whichever color reads best against your Visual Prompt.",
            )
            # NOTE(review): DEFAULT_FONT_FAMILY is imported at the top of this
            # file but never used; confirm whether it should drive this default
            # instead of the hard-coded "Serif (Bold)".
            font_input = gr.Dropdown(
                label="Lyric Font",
                choices=list(FONT_FAMILIES.keys()),
                value="Serif (Bold)",
                info="Typeface used for the on-screen lyrics. Bold sans-serif suits most songs; try Serif or Monospace for a different look.",
            )
            visual_prompt_input = gr.Textbox(
                label="Visual Prompt",
                placeholder="e.g. mystical forest, glowing particles, cinematic, digital art, 8k",
                value="neon-lit futuristic city at night, vibrant glowing colors, cyberpunk aesthetic, energetic atmosphere, beautiful starry sky, digital art, highly detailed",
                info="Describes the look of the AI-generated backgrounds (and gives the lyric-timing model a sense of the visual mood).",
                lines=2,
            )

            with gr.Accordion("Advanced Settings", open=False):
                gr.Markdown(
                    "**Recommendations:**\n"
                    "- **Best Default:** Condition on Previous Text **ON**, VAD **ON**, Demucs **OFF**. (Best for most pop/vocal tracks).\n"
                    "- **Heavily Instrumental Songs:** If vocals are very quiet or buried under loud instruments, turn Condition on Previous Text **OFF**, and turn Demucs **ON**.\n"
                    "- ⚠️ **WARNING:** Not recommended to use **Demucs ON + Condition ON** together! It may cause infinite hallucination loops during instrumental breaks."
                )
                cond_prev_input = gr.Checkbox(
                    label="Condition on Previous Text",
                    value=True,
                    info="Helps Whisper understand context by feeding it previous lines. Improves word accuracy but can cause loops if not anchored."
                )
                use_vad_input = gr.Checkbox(
                    label="Use VAD (Voice Activity Detection) Filter",
                    value=True,
                    info="Mutes audio completely when no singing is detected. Very helpful to prevent hallucinations during long instrumental solos."
                )
                # Starts locked and unchecked because "Condition on Previous
                # Text" defaults to ON; enforce_safe_params keeps the two in sync.
                use_demucs_input = gr.Checkbox(
                    label="Use Demucs Vocal Separation",
                    value=False,
                    interactive=False,
                    info="Disabled because Condition on Previous Text is ON (prevents infinite loops)."
                )
                model_input = gr.Dropdown(
                    label="Whisper Model",
                    choices=["large-v3", "large-v3-turbo", "medium", "small", "base"],
                    value="large-v3",
                    info="Larger models are more accurate but take longer to process."
                )

            def enforce_safe_params(cond_prev):
                """Return the update for the Demucs checkbox given the conditioning toggle.

                When ``cond_prev`` is truthy the Demucs checkbox is unchecked
                and locked; otherwise it is made interactive again (its
                current value is left untouched) with an explanatory hint.
                """
                if cond_prev:
                    return gr.update(value=False, interactive=False, info="Disabled because Condition on Previous Text is ON (prevents infinite loops).")
                else:
                    return gr.update(interactive=True, info="Isolates vocals as a preprocessing step. Only enable this if vocals are not clearly audible and are buried under instruments.")

            cond_prev_input.change(
                fn=enforce_safe_params,
                inputs=[cond_prev_input],
                outputs=[use_demucs_input]
            )

        with gr.Column(scale=1, elem_classes=["panel"]):
            video_output = gr.Video(
                label="Your Lyric Video (preview and download here)",
                interactive=False,
                height=360,
            )
            gr.Markdown(
                """
                **Tips:**
                - Best with clear vocals (ballads, pop, spoken word)
                - Describe the visuals you want in the Visual Prompt — it shapes both the AI backgrounds and the on-screen mood
                - Try different Visual Themes and Fonts to match your song's vibe
                - Processing takes ~30–90s depending on song length
                """,
                elem_classes=["panel"],
            )

    # The input order must match generate_video's positional parameters.
    generate_btn.click(
        fn=generate_video,
        inputs=[audio_input, theme_input, font_input, visual_prompt_input, model_input, use_demucs_input, cond_prev_input, use_vad_input],
        outputs=[video_output],
        api_visibility="public",
    )


# Start the Gradio server only when this file is executed as a script.
# NOTE(review): passing ``css`` to ``launch()`` (rather than ``gr.Blocks``)
# presumably relies on a recent Gradio release — confirm against the pinned version.
if __name__ == "__main__":
    demo.launch(css=CUSTOM_CSS)