"""
app.py — Gradio UI entry point for aMuseMe
"""
import sys
from pathlib import Path
import gradio as gr

# Put the sibling ``src`` directory at the front of sys.path before the
# ``amuseme`` imports below run (presumably the package lives there — confirm),
# which is why those imports come after this statement instead of at the top.
SRC_DIR = Path(__file__).parent / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from amuseme.transcriber import transcribe
from amuseme.renderer import render_frames
from amuseme.animations import THEME_COLORS as THEMES, FONT_FAMILIES, DEFAULT_FONT_FAMILY
from amuseme.video_assembler import assemble
from amuseme.logger import get_logger

# Module-wide logger used by every pipeline step in this file.
logger = get_logger("app")
# Try to import spaces for ZeroGPU; gracefully degrade locally.
try:
    import spaces
    HAS_SPACES = True
except ImportError:
    HAS_SPACES = False

# Hugging Face repos fetched at import time when running on a Space, so the
# downloads do not eat into the time-limited GPU call.
_PREDOWNLOAD_REPOS = (
    "Systran/faster-whisper-large-v3",
    "openbmb/MiniCPM5-1B",
    "stabilityai/sd-turbo",
)

if HAS_SPACES:
    from huggingface_hub import snapshot_download

    logger.info("HF Space detected. Pre-downloading heavy models to avoid ZeroGPU timeout...")
    _failed_repos = []
    for _repo_id in _PREDOWNLOAD_REPOS:
        # Each repo gets its own try block: one failed download must not
        # prevent the remaining models from being fetched.
        try:
            snapshot_download(repo_id=_repo_id)
        except Exception as e:
            # Best effort only — the warning is the whole handling.
            _failed_repos.append(_repo_id)
            logger.warning(f"Pre-download of {_repo_id} failed (will retry during runtime): {e}")
    if not _failed_repos:
        logger.info("Model pre-download complete!")
def _gpu_transcribe(audio_path: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool, theme: str, visual_prompt: str):
    """Forward the UI settings to ``transcribe`` and return whatever it returns.

    ``cond_prev`` is handed over as ``condition_on_previous_text``; all other
    settings keep their names.
    """
    transcribe_options = {
        "model_size": model_size,
        "use_demucs": use_demucs,
        "condition_on_previous_text": cond_prev,
        "use_vad": use_vad,
        "theme": theme,
        "visual_prompt": visual_prompt,
    }
    return transcribe(audio_path, **transcribe_options)


# When the ``spaces`` package is available, rebind the function to its
# ``spaces.GPU(duration=120)``-decorated version.
if HAS_SPACES:
    _gpu_transcribe = spaces.GPU(duration=120)(_gpu_transcribe)
def _build_background_prompts(frames, visual_prompt: str) -> list:
    """Return one background prompt per pair of consecutive lyric frames."""
    prompts = []
    for start in range(0, len(frames), 2):
        pair = frames[start:start + 2]
        line_text = " ".join(
            " ".join(w.text for w in fr.words) for fr in pair
        ).strip()
        # Combine the dynamic lyric text with the user's visual prompt.
        prompts.append(f"{line_text}, {visual_prompt}" if line_text else visual_prompt)
    return prompts


def _probe_audio_duration(audio_path: str) -> float:
    """Return the duration that ffprobe reports for ``audio_path``.

    Raises:
        gr.Error: if ffprobe cannot be started or its JSON output does not
            contain a usable ``format.duration`` value.
    """
    import json
    import subprocess

    try:
        probe = subprocess.run(
            ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", audio_path],
            capture_output=True, text=True
        )
        return float(json.loads(probe.stdout)["format"]["duration"])
    except (OSError, ValueError, KeyError, TypeError) as err:
        # OSError: ffprobe missing; ValueError: empty/invalid JSON or a
        # non-numeric duration; KeyError/TypeError: unexpected JSON shape.
        logger.error(f"[Step 3/4] Could not read audio duration of {audio_path}: {err!r}")
        raise gr.Error("Could not read the audio duration. Please upload a valid audio file.") from err


def generate_video(audio_path: str, theme: str, font_family: str, visual_prompt: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool) -> str:
    """Run the full lyric-video pipeline for one uploaded song.

    Steps: transcribe the audio into lyric frames, generate storyboard
    backgrounds (best effort), probe the audio duration, render the frames
    and assemble the final video.

    Args:
        audio_path: Path of the uploaded audio file, or None if nothing was uploaded.
        theme: Theme name forwarded to the transcriber and the renderer.
        font_family: Font family name forwarded to the renderer.
        visual_prompt: Text appended to every background prompt and forwarded
            to the transcriber.
        model_size: Whisper model size forwarded to the transcriber.
        use_demucs: Forwarded to the transcriber.
        cond_prev: Forwarded to the transcriber as ``condition_on_previous_text``.
        use_vad: Forwarded to the transcriber.

    Returns:
        The output path returned by ``assemble``.

    Raises:
        gr.Error: if no audio was supplied, no lyric frames were produced, or
            the audio duration could not be determined.
    """
    import time

    if audio_path is None:
        raise gr.Error("Please upload an audio file.")

    pipeline_t0 = time.time()
    logger.info(
        "===== PIPELINE START =====\n"
        f" audio={audio_path} theme={theme} font={font_family} visual_prompt={visual_prompt!r}\n"
        f" model_size={model_size} demucs={use_demucs} "
        f"cond_prev={cond_prev} vad={use_vad}"
    )

    # Step 1: Transcribe + frame-metadata (Whisper + MiniCPM5-1B). Detailed
    # input/output for these models is logged inside transcribe().
    logger.info("[Step 1/4] Transcribing audio + generating frame metadata...")
    t0 = time.time()
    frames = _gpu_transcribe(audio_path, model_size, use_demucs, cond_prev, use_vad, theme, visual_prompt)
    if not frames:
        raise gr.Error("Could not extract words from audio. Try a cleaner recording.")
    logger.info(f"[Step 1/4] Done in {time.time() - t0:.1f}s — {len(frames)} frames.")

    # Step 2: Generate AI storyboard backgrounds — one image per pair of lyric
    # lines, so the backdrop changes less often than the on-screen text
    # (renderer expands each image to cover two consecutive lyric frames).
    # ``frames`` is known to be non-empty here, so no extra length check.
    logger.info("[Step 2/4] Generating AI storyboard backgrounds...")
    t0 = time.time()
    prompts = _build_background_prompts(frames, visual_prompt)
    logger.info(f"[Step 2/4] Background prompts ({len(prompts)}):\n " + "\n ".join(prompts))
    bg_images = None
    try:
        # Imported lazily inside the try so a failing import is handled like
        # any other background-generation failure.
        from amuseme.bg_generator import generate_storyboard
        bg_images = generate_storyboard(prompts) or None
        logger.info(f"[Step 2/4] Done in {time.time() - t0:.1f}s — {len(bg_images or [])} image(s).")
    except Exception as e:
        # Backgrounds are optional: log and continue without them.
        logger.error(f"[Step 2/4] Error generating backgrounds: {e}")
        bg_images = None

    # Step 3: Get audio duration via ffprobe, then set up frame rendering.
    duration = _probe_audio_duration(audio_path)
    logger.info(f"[Step 3/4] Rendering frames — audio duration={duration:.1f}s, {len(frames)} lyric frames...")
    t0 = time.time()
    frames_gen = render_frames(frames, duration, theme_name=theme, bg_images=bg_images, font_family=font_family)

    # Step 4: Assemble the video. The timer started in step 3 is reused, so
    # the reported time covers everything from render_frames() onwards.
    logger.info("[Step 4/4] Assembling video via FFmpeg...")
    out_path = assemble(frames_gen, audio_path)
    logger.info(
        f"[Step 4/4] Done in {time.time() - t0:.1f}s — output={out_path}\n"
        f"===== PIPELINE COMPLETE in {time.time() - pipeline_t0:.1f}s ====="
    )
    return out_path
# ─── Gradio UI ─────────────────────────────────────────────────────────────
# Stylesheet for the app, passed to demo.launch() in the __main__ guard.
# The selectors match the class names used elsewhere in this file: app-header,
# steps-row and step-badge in HEADER_HTML; panel and generate-btn via
# ``elem_classes`` in the UI layout.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap');
body, .gradio-container {
font-family: 'Inter', sans-serif !important;
background: #090910 !important;
}
.gradio-container {
max-width: 900px !important;
margin: 0 auto !important;
}
/* Header */
.app-header {
text-align: center;
padding: 2.5rem 1rem 1.5rem;
background: linear-gradient(135deg, #0f0f1a 0%, #1a0a2e 100%);
border-radius: 16px;
margin-bottom: 1.5rem;
border: 1px solid rgba(255,255,255,0.06);
}
.app-header h1 {
font-size: 3rem;
font-weight: 700;
background: linear-gradient(135deg, #a78bfa, #60a5fa, #34d399);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin: 0 0 0.4rem;
letter-spacing: -1px;
}
.app-header p {
color: rgba(255,255,255,0.55);
font-size: 1rem;
margin: 0;
}
/* Panel */
.panel {
background: #0f0f1a !important;
border: 1px solid rgba(255,255,255,0.08) !important;
border-radius: 12px !important;
}
/* Labels */
label span {
color: rgba(255,255,255,0.75) !important;
font-weight: 500 !important;
font-size: 0.85rem !important;
text-transform: uppercase !important;
letter-spacing: 0.05em !important;
}
/* Inputs */
textarea, input[type="text"] {
background: #1a1a2e !important;
border: 1px solid rgba(255,255,255,0.1) !important;
border-radius: 8px !important;
color: #e0e0ff !important;
}
/* Generate button */
.generate-btn {
background: linear-gradient(135deg, #7c3aed, #2563eb) !important;
border: none !important;
border-radius: 10px !important;
color: white !important;
font-weight: 600 !important;
font-size: 1rem !important;
padding: 0.75rem 2rem !important;
width: 100% !important;
transition: opacity 0.2s ease !important;
cursor: pointer !important;
}
.generate-btn:hover {
opacity: 0.9 !important;
}
/* Step badges */
.steps-row {
display: flex;
gap: 0.75rem;
justify-content: center;
padding: 1rem 0 0.5rem;
}
.step-badge {
background: rgba(255,255,255,0.05);
border: 1px solid rgba(255,255,255,0.1);
border-radius: 20px;
padding: 0.3rem 0.9rem;
color: rgba(255,255,255,0.5);
font-size: 0.78rem;
font-weight: 500;
}
"""
# Static banner rendered at the top of the page via gr.HTML(): title, tagline
# and a row of four step badges, styled by the .app-header / .steps-row /
# .step-badge rules in CUSTOM_CSS.
# NOTE(review): the glyph before the title and the one leading each badge look
# like mis-decoded multi-byte characters (probably an emoji and step markers);
# restore them from the original file — the intended badge glyphs cannot be
# determined from this text alone.
HEADER_HTML = """
<div class="app-header">
<h1>π΅ aMuseMe</h1>
<p>Drop a song. Watch your lyrics come alive with AI-powered kinetic typography and AI-generated backgrounds.</p>
<div class="steps-row">
<span class="step-badge">β Upload Audio</span>
<span class="step-badge">β Whisper AI Syncs</span>
<span class="step-badge">β AI Storyboard Backgrounds</span>
<span class="step-badge">β Kinetic Typography Video</span>
</div>
</div>
"""
# UI layout. Components are declared in on-screen order, so the statement
# order inside this block is part of the layout and should not be shuffled.
# User-facing strings had mis-encoded glyphs; they are restored here
# (em dash, sparkles, arrows, en dash, warning sign).
with gr.Blocks(title="aMuseMe — AI Lyric Video Generator") as demo:
    gr.HTML(HEADER_HTML)

    with gr.Row():
        # Column 1: song upload, sample songs and the generate button.
        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown(
                "**1. Upload a song** — Whisper transcribes the vocals and times each "
                "word to drive the lyric video below."
            )
            audio_input = gr.Audio(
                label="Audio File (song with clear vocals, MP3/WAV)",
                type="filepath",
                sources=["upload"],
            )
            gr.Examples(
                examples=[
                    "assets/samples/ride_like_the_ind_test_song.mp3",
                    "assets/samples/hollow-song-test.mp3"
                ],
                inputs=audio_input,
                label="Try a sample song"
            )
            generate_btn = gr.Button(
                "✨ Generate Lyric Video",
                elem_classes=["generate-btn"],
                variant="primary",
            )
            gr.Markdown(
                "Runs the full pipeline: transcribe lyrics → generate AI storyboard "
                "backgrounds → render kinetic typography → assemble the video "
                "(~30–90s depending on song length)."
            )

        # Column 2: look-and-feel options plus advanced transcription settings.
        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown("**2. Choose how the lyrics look**")
            theme_input = gr.Dropdown(
                label="Visual Theme",
                choices=list(THEMES.keys()),
                value="Neon",
                info="Sets the on-screen lyric text color: Dark = white, Light = warm gold, Neon = cyan glow. AI backgrounds are always slightly darkened, so pick whichever color reads best against your Visual Prompt.",
            )
            font_input = gr.Dropdown(
                label="Lyric Font",
                choices=list(FONT_FAMILIES.keys()),
                value="Serif (Bold)",
                info="Typeface used for the on-screen lyrics. Bold sans-serif suits most songs; try Serif or Monospace for a different look.",
            )
            visual_prompt_input = gr.Textbox(
                label="Visual Prompt",
                placeholder="e.g. mystical forest, glowing particles, cinematic, digital art, 8k",
                value="neon-lit futuristic city at night, vibrant glowing colors, cyberpunk aesthetic, energetic atmosphere, beautiful starry sky, digital art, highly detailed",
                info="Describes the look of the AI-generated backgrounds (and gives the lyric-timing model a sense of the visual mood).",
                lines=2,
            )

            with gr.Accordion("Advanced Settings", open=False):
                gr.Markdown(
                    "**Recommendations:**\n"
                    "- **Best Default:** Condition on Previous Text **ON**, VAD **ON**, Demucs **OFF**. (Best for most pop/vocal tracks).\n"
                    "- **Heavily Instrumental Songs:** If vocals are very quiet or buried under loud instruments, turn Condition on Previous Text **OFF**, and turn Demucs **ON**.\n"
                    "- ⚠️ **WARNING:** Not recommended to use **Demucs ON + Condition ON** together! It may cause infinite hallucination loops during instrumental breaks."
                )
                cond_prev_input = gr.Checkbox(
                    label="Condition on Previous Text",
                    value=True,
                    info="Helps Whisper understand context by feeding it previous lines. Improves word accuracy but can cause loops if not anchored."
                )
                use_vad_input = gr.Checkbox(
                    label="Use VAD (Voice Activity Detection) Filter",
                    value=True,
                    info="Mutes audio completely when no singing is detected. Very helpful to prevent hallucinations during long instrumental solos."
                )
                # Starts locked and unchecked because "Condition on Previous
                # Text" starts checked; enforce_safe_params keeps them in sync.
                use_demucs_input = gr.Checkbox(
                    label="Use Demucs Vocal Separation",
                    value=False,
                    interactive=False,
                    info="Disabled because Condition on Previous Text is ON (prevents infinite loops)."
                )
                model_input = gr.Dropdown(
                    label="Whisper Model",
                    choices=["large-v3", "large-v3-turbo", "medium", "small", "base"],
                    value="large-v3",
                    info="Larger models are more accurate but take longer to process."
                )

                def enforce_safe_params(cond_prev):
                    """Return the update for the Demucs checkbox.

                    When ``cond_prev`` is truthy the checkbox is cleared and
                    locked; otherwise it is unlocked (its value is left as is).
                    """
                    if cond_prev:
                        return gr.update(value=False, interactive=False, info="Disabled because Condition on Previous Text is ON (prevents infinite loops). ")
                    else:
                        return gr.update(interactive=True, info="Isolates vocals as a preprocessing step. Only enable this if vocals are not clearly audible and are buried under instruments.")

                cond_prev_input.change(
                    fn=enforce_safe_params,
                    inputs=[cond_prev_input],
                    outputs=[use_demucs_input]
                )

        # Column 3: the generated video.
        # NOTE(review): indentation was lost in the text this was rebuilt from;
        # this column is assumed to sit inside the Row (it carries ``scale``) —
        # confirm against the intended layout.
        with gr.Column(scale=1, elem_classes=["panel"]):
            video_output = gr.Video(
                label="Your Lyric Video (preview and download here)",
                interactive=False,
                height=360,
            )

    # The markdown text is kept flush-left so the string content is exactly
    # the lines written here, with no leading indentation.
    gr.Markdown(
        """
**Tips:**
- Best with clear vocals (ballads, pop, spoken word)
- Describe the visuals you want in the Visual Prompt — it shapes both the AI backgrounds and the on-screen mood
- Try different Visual Themes and Fonts to match your song's vibe
- Processing takes ~30–90s depending on song length
""",
        elem_classes=["panel"],
    )

    # Input order must match the positional parameters of generate_video().
    generate_btn.click(
        fn=generate_video,
        inputs=[audio_input, theme_input, font_input, visual_prompt_input, model_input, use_demucs_input, cond_prev_input, use_vad_input],
        outputs=[video_output],
        api_visibility="public",
    )
# Start the Gradio app only when this file is executed as a script.
# NOTE(review): the stylesheet is passed to launch() rather than gr.Blocks();
# confirm the pinned Gradio version accepts ``css`` there.
if __name__ == "__main__":
    demo.launch(css=CUSTOM_CSS)
|