aMuseMe / app.py
Blazestorm001's picture
chore: tidy Space repository structure
ff66b59 verified
Raw
History Blame Contribute Delete
14.1 kB
"""
app.py — Gradio UI entry point for aMuseMe
"""
import sys
from pathlib import Path
import gradio as gr
# Make the ``src`` directory that sits next to this file importable.
# (The ``amuseme`` imports that follow presumably resolve from there.)
SRC_DIR = Path(__file__).parent / "src"
if str(SRC_DIR) not in sys.path:
    # Prepend so this directory is searched before the existing entries;
    # the membership check keeps repeated imports from adding duplicates.
    sys.path.insert(0, str(SRC_DIR))
from amuseme.transcriber import transcribe
from amuseme.renderer import render_frames
from amuseme.animations import THEME_COLORS as THEMES, FONT_FAMILIES, DEFAULT_FONT_FAMILY
from amuseme.video_assembler import assemble
from amuseme.logger import get_logger
# Module-level logger for this entry point, created by the project's
# ``get_logger`` helper under the name "app".
logger = get_logger("app")
# Try to import ``spaces`` for ZeroGPU; gracefully degrade locally.
# HAS_SPACES records whether the import succeeded so later code can branch
# on it instead of re-attempting the import.
try:
    import spaces
except ImportError:
    HAS_SPACES = False
else:
    # Only reached when the import above succeeded.
    HAS_SPACES = True
if HAS_SPACES:
    # Imported here because it is only needed for the warm-up below.
    from huggingface_hub import snapshot_download

    logger.info("HF Space detected. Pre-downloading heavy models to avoid ZeroGPU timeout...")
    try:
        # Snapshots are fetched in order inside a single ``try``: the first
        # failure skips the remaining repositories and goes straight to the
        # warning below.
        for _repo_id in (
            "Systran/faster-whisper-large-v3",
            "openbmb/MiniCPM5-1B",
            "stabilityai/sd-turbo",
        ):
            snapshot_download(repo_id=_repo_id)
        logger.info("Model pre-download complete!")
    except Exception as exc:
        # Deliberately best-effort: a failed warm-up must not stop the app
        # from starting.
        logger.warning(f"Pre-download failed (will retry during runtime): {exc}")
def _gpu_transcribe(
    audio_path: str,
    model_size: str,
    use_demucs: bool,
    cond_prev: bool,
    use_vad: bool,
    theme: str,
    visual_prompt: str,
):
    """Forward the UI options to ``transcribe`` and return its result.

    This thin wrapper exists so the call can be rebound to a
    ``spaces.GPU``-decorated version below when ``spaces`` is available.
    ``cond_prev`` is passed on as ``condition_on_previous_text``; every other
    argument keeps its name.
    """
    options = {
        "model_size": model_size,
        "use_demucs": use_demucs,
        "condition_on_previous_text": cond_prev,
        "use_vad": use_vad,
        "theme": theme,
        "visual_prompt": visual_prompt,
    }
    return transcribe(audio_path, **options)


if HAS_SPACES:
    # Rebind the name to the ZeroGPU-decorated callable (duration=120).
    _gpu_transcribe = spaces.GPU(duration=120)(_gpu_transcribe)
def _build_background_prompts(frames, visual_prompt):
    """Build one storyboard prompt per pair of consecutive lyric frames.

    Each prompt is the words of up to two frames joined by spaces, followed
    by the user's visual prompt. Empty parts are dropped, so a missing visual
    prompt or a wordless pair never leaves a dangling ", " in the prompt.
    """
    style = (visual_prompt or "").strip()
    prompts = []
    for start in range(0, len(frames), 2):
        pair = frames[start:start + 2]
        line_text = " ".join(
            " ".join(word.text for word in frame.words) for frame in pair
        ).strip()
        prompts.append(", ".join(part for part in (line_text, style) if part))
    return prompts


def _probe_audio_duration(audio_path):
    """Return the audio duration reported by ffprobe, as a float (seconds).

    Raises:
        gr.Error: if ffprobe cannot be run, exits with an error, or its
            output does not contain a usable ``format.duration`` value.
    """
    import json
    import subprocess

    try:
        probe = subprocess.run(
            ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", audio_path],
            capture_output=True,
            text=True,
            check=True,
        )
        return float(json.loads(probe.stdout)["format"]["duration"])
    except (OSError, subprocess.CalledProcessError, ValueError, KeyError, TypeError) as err:
        # ValueError also covers json.JSONDecodeError (empty or garbled output).
        logger.error(f"[Step 3/4] ffprobe could not read duration of {audio_path}: {err}")
        raise gr.Error(
            "Could not read the audio file's duration. Please try a different audio file."
        ) from err


def generate_video(
    audio_path: str,
    theme: str,
    font_family: str,
    visual_prompt: str,
    model_size: str,
    use_demucs: bool,
    cond_prev: bool,
    use_vad: bool,
) -> str:
    """Run the full lyric-video pipeline for one uploaded audio file.

    Steps: transcribe the audio into lyric frames, generate storyboard
    backgrounds (best-effort), probe the audio duration, then render the
    frames and assemble the final video.

    Args:
        audio_path: Path of the uploaded audio file; ``None`` or empty when
            nothing was uploaded.
        theme: Theme name, forwarded to transcription and to the renderer.
        font_family: Font family name, forwarded to the renderer.
        visual_prompt: Free-text description, forwarded to transcription and
            appended to every background prompt.
        model_size: Whisper model size, forwarded to transcription.
        use_demucs: Forwarded to transcription.
        cond_prev: Forwarded to transcription as
            ``condition_on_previous_text``.
        use_vad: Forwarded to transcription.

    Returns:
        The output path returned by ``assemble``.

    Raises:
        gr.Error: if no audio was provided, if transcription yields no
            frames, or if the audio duration cannot be determined.
    """
    import time

    if not audio_path:
        raise gr.Error("Please upload an audio file.")

    pipeline_t0 = time.time()
    logger.info(
        "===== PIPELINE START =====\n"
        f" audio={audio_path} theme={theme} font={font_family} visual_prompt={visual_prompt!r}\n"
        f" model_size={model_size} demucs={use_demucs} "
        f"cond_prev={cond_prev} vad={use_vad}"
    )

    # Step 1: Transcribe + frame-metadata (Whisper + MiniCPM5-1B). Detailed
    # input/output for these models is logged inside transcribe().
    logger.info("[Step 1/4] Transcribing audio + generating frame metadata...")
    t0 = time.time()
    frames = _gpu_transcribe(audio_path, model_size, use_demucs, cond_prev, use_vad, theme, visual_prompt)
    if not frames:
        raise gr.Error("Could not extract words from audio. Try a cleaner recording.")
    logger.info(f"[Step 1/4] Done in {time.time() - t0:.1f}s — {len(frames)} frames.")

    # Step 2: Generate AI storyboard backgrounds — one image per pair of lyric
    # lines, so the backdrop changes less often than the on-screen text
    # (renderer expands each image to cover two consecutive lyric frames).
    # ``frames`` is guaranteed non-empty here by the guard above.
    logger.info("[Step 2/4] Generating AI storyboard backgrounds...")
    t0 = time.time()
    prompts = _build_background_prompts(frames, visual_prompt)
    logger.info(f"[Step 2/4] Background prompts ({len(prompts)}):\n " + "\n ".join(prompts))
    bg_images = None
    try:
        from amuseme.bg_generator import generate_storyboard
        bg_images = generate_storyboard(prompts) or None
    except Exception as e:
        # Backgrounds are optional: on any failure (including the import),
        # carry on and render without them.
        logger.error(f"[Step 2/4] Error generating backgrounds: {e}")
        bg_images = None
    else:
        logger.info(f"[Step 2/4] Done in {time.time() - t0:.1f}s — {len(bg_images or [])} image(s).")

    # Step 3: Get audio duration via ffprobe, then set up frame rendering.
    duration = _probe_audio_duration(audio_path)
    logger.info(f"[Step 3/4] Rendering frames — audio duration={duration:.1f}s, {len(frames)} lyric frames...")
    t0 = time.time()
    frames_gen = render_frames(frames, duration, theme_name=theme, bg_images=bg_images, font_family=font_family)

    # Step 4: ``t0`` is intentionally not reset, so the time reported below
    # covers rendering and assembly together.
    logger.info("[Step 4/4] Assembling video via FFmpeg...")
    out_path = assemble(frames_gen, audio_path)
    logger.info(
        f"[Step 4/4] Done in {time.time() - t0:.1f}s — output={out_path}\n"
        f"===== PIPELINE COMPLETE in {time.time() - pipeline_t0:.1f}s ====="
    )
    return out_path
# ─── Gradio UI ─────────────────────────────────────────────────────────────
# Stylesheet for the Gradio page, passed to ``demo.launch(css=...)`` at the
# bottom of this file. It loads the Inter font and sets a dark page
# background, then styles the classes used by the UI: ``.app-header`` (banner
# from HEADER_HTML), ``.panel`` (columns and the tips box), ``.generate-btn``
# (main action button) and ``.steps-row`` / ``.step-badge`` (banner badges).
# The string is sent to the browser as-is, so its text must stay valid CSS.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap');
body, .gradio-container {
font-family: 'Inter', sans-serif !important;
background: #090910 !important;
}
.gradio-container {
max-width: 900px !important;
margin: 0 auto !important;
}
/* Header */
.app-header {
text-align: center;
padding: 2.5rem 1rem 1.5rem;
background: linear-gradient(135deg, #0f0f1a 0%, #1a0a2e 100%);
border-radius: 16px;
margin-bottom: 1.5rem;
border: 1px solid rgba(255,255,255,0.06);
}
.app-header h1 {
font-size: 3rem;
font-weight: 700;
background: linear-gradient(135deg, #a78bfa, #60a5fa, #34d399);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin: 0 0 0.4rem;
letter-spacing: -1px;
}
.app-header p {
color: rgba(255,255,255,0.55);
font-size: 1rem;
margin: 0;
}
/* Panel */
.panel {
background: #0f0f1a !important;
border: 1px solid rgba(255,255,255,0.08) !important;
border-radius: 12px !important;
}
/* Labels */
label span {
color: rgba(255,255,255,0.75) !important;
font-weight: 500 !important;
font-size: 0.85rem !important;
text-transform: uppercase !important;
letter-spacing: 0.05em !important;
}
/* Inputs */
textarea, input[type="text"] {
background: #1a1a2e !important;
border: 1px solid rgba(255,255,255,0.1) !important;
border-radius: 8px !important;
color: #e0e0ff !important;
}
/* Generate button */
.generate-btn {
background: linear-gradient(135deg, #7c3aed, #2563eb) !important;
border: none !important;
border-radius: 10px !important;
color: white !important;
font-weight: 600 !important;
font-size: 1rem !important;
padding: 0.75rem 2rem !important;
width: 100% !important;
transition: opacity 0.2s ease !important;
cursor: pointer !important;
}
.generate-btn:hover {
opacity: 0.9 !important;
}
/* Step badges */
.steps-row {
display: flex;
gap: 0.75rem;
justify-content: center;
padding: 1rem 0 0.5rem;
}
.step-badge {
background: rgba(255,255,255,0.05);
border: 1px solid rgba(255,255,255,0.1);
border-radius: 20px;
padding: 0.3rem 0.9rem;
color: rgba(255,255,255,0.5);
font-size: 0.78rem;
font-weight: 500;
}
"""
# Static banner markup rendered at the top of the page with ``gr.HTML``.
# The class names (app-header, steps-row, step-badge) are the ones styled in
# CUSTOM_CSS. The emoji, circled digit and arrows are literal UTF-8
# characters; they were previously stored as mis-decoded byte sequences.
HEADER_HTML = """
<div class="app-header">
<h1>🎵 aMuseMe</h1>
<p>Drop a song. Watch your lyrics come alive with AI-powered kinetic typography and AI-generated backgrounds.</p>
<div class="steps-row">
<span class="step-badge">① Upload Audio</span>
<span class="step-badge">→ Whisper AI Syncs</span>
<span class="step-badge">→ AI Storyboard Backgrounds</span>
<span class="step-badge">→ Kinetic Typography Video</span>
</div>
</div>
"""
# Builds the whole Gradio page and binds it to ``demo``.
# NOTE(review): the reviewed copy of this file had lost its indentation, so
# the nesting below is reconstructed from the component order. In particular,
# placing the video column *below* the two-column row (rather than as a third
# column inside it) is an assumption — confirm against the intended layout.
with gr.Blocks(title="aMuseMe — AI Lyric Video Generator") as demo:
    gr.HTML(HEADER_HTML)

    with gr.Row():
        # Panel 1: audio upload, sample songs and the generate button.
        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown(
                "**1. Upload a song** — Whisper transcribes the vocals and times each "
                "word to drive the lyric video below."
            )
            audio_input = gr.Audio(
                label="Audio File (song with clear vocals, MP3/WAV)",
                type="filepath",
                sources=["upload"],
            )
            gr.Examples(
                examples=[
                    "assets/samples/ride_like_the_ind_test_song.mp3",
                    "assets/samples/hollow-song-test.mp3"
                ],
                inputs=audio_input,
                label="Try a sample song"
            )
            generate_btn = gr.Button(
                "✨ Generate Lyric Video",
                elem_classes=["generate-btn"],
                variant="primary",
            )
            gr.Markdown(
                "Runs the full pipeline: transcribe lyrics → generate AI storyboard "
                "backgrounds → render kinetic typography → assemble the video "
                "(~30–90s depending on song length)."
            )

        # Panel 2: look-and-feel options plus advanced transcription settings.
        with gr.Column(scale=1, elem_classes=["panel"]):
            gr.Markdown("**2. Choose how the lyrics look**")
            theme_input = gr.Dropdown(
                label="Visual Theme",
                choices=list(THEMES.keys()),
                value="Neon",
                info="Sets the on-screen lyric text color: Dark = white, Light = warm gold, Neon = cyan glow. AI backgrounds are always slightly darkened, so pick whichever color reads best against your Visual Prompt.",
            )
            font_input = gr.Dropdown(
                label="Lyric Font",
                choices=list(FONT_FAMILIES.keys()),
                value="Serif (Bold)",
                info="Typeface used for the on-screen lyrics. Bold sans-serif suits most songs; try Serif or Monospace for a different look.",
            )
            visual_prompt_input = gr.Textbox(
                label="Visual Prompt",
                placeholder="e.g. mystical forest, glowing particles, cinematic, digital art, 8k",
                value="neon-lit futuristic city at night, vibrant glowing colors, cyberpunk aesthetic, energetic atmosphere, beautiful starry sky, digital art, highly detailed",
                info="Describes the look of the AI-generated backgrounds (and gives the lyric-timing model a sense of the visual mood).",
                lines=2,
            )
            with gr.Accordion("Advanced Settings", open=False):
                gr.Markdown(
                    "**Recommendations:**\n"
                    "- **Best Default:** Condition on Previous Text **ON**, VAD **ON**, Demucs **OFF**. (Best for most pop/vocal tracks).\n"
                    "- **Heavily Instrumental Songs:** If vocals are very quiet or buried under loud instruments, turn Condition on Previous Text **OFF**, and turn Demucs **ON**.\n"
                    "- ⚠️ **WARNING:** Not recommended to use **Demucs ON + Condition ON** together! It may cause infinite hallucination loops during instrumental breaks."
                )
                cond_prev_input = gr.Checkbox(
                    label="Condition on Previous Text",
                    value=True,
                    info="Helps Whisper understand context by feeding it previous lines. Improves word accuracy but can cause loops if not anchored."
                )
                use_vad_input = gr.Checkbox(
                    label="Use VAD (Voice Activity Detection) Filter",
                    value=True,
                    info="Mutes audio completely when no singing is detected. Very helpful to prevent hallucinations during long instrumental solos."
                )
                # Starts locked off because "Condition on Previous Text"
                # defaults to ON; enforce_safe_params keeps the two in sync.
                use_demucs_input = gr.Checkbox(
                    label="Use Demucs Vocal Separation",
                    value=False,
                    interactive=False,
                    info="Disabled because Condition on Previous Text is ON (prevents infinite loops)."
                )
                model_input = gr.Dropdown(
                    label="Whisper Model",
                    choices=["large-v3", "large-v3-turbo", "medium", "small", "base"],
                    value="large-v3",
                    info="Larger models are more accurate but take longer to process."
                )

                def enforce_safe_params(cond_prev):
                    """Return the update for the Demucs checkbox.

                    When ``cond_prev`` is truthy the checkbox is unchecked
                    and made read-only; otherwise it becomes editable again
                    and its current value is left untouched.
                    """
                    if cond_prev:
                        return gr.update(value=False, interactive=False, info="Disabled because Condition on Previous Text is ON (prevents infinite loops). ")
                    else:
                        return gr.update(interactive=True, info="Isolates vocals as a preprocessing step. Only enable this if vocals are not clearly audible and are buried under instruments.")

                cond_prev_input.change(
                    fn=enforce_safe_params,
                    inputs=[cond_prev_input],
                    outputs=[use_demucs_input]
                )

    # Output panel: the finished video.
    with gr.Column(scale=1, elem_classes=["panel"]):
        video_output = gr.Video(
            label="Your Lyric Video (preview and download here)",
            interactive=False,
            height=360,
        )

    # The Markdown body is kept flush-left so the text handed to Gradio does
    # not depend on how the component treats leading whitespace.
    gr.Markdown(
        """
**Tips:**
- Best with clear vocals (ballads, pop, spoken word)
- Describe the visuals you want in the Visual Prompt — it shapes both the AI backgrounds and the on-screen mood
- Try different Visual Themes and Fonts to match your song's vibe
- Processing takes ~30–90s depending on song length
""",
        elem_classes=["panel"],
    )

    # The input order must match the positional parameters of generate_video.
    generate_btn.click(
        fn=generate_video,
        inputs=[audio_input, theme_input, font_input, visual_prompt_input, model_input, use_demucs_input, cond_prev_input, use_vad_input],
        outputs=[video_output],
        api_visibility="public",
    )
# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    # The custom stylesheet is supplied here, at launch time.
    demo.launch(css=CUSTOM_CSS)