Spaces:

build-small-hackathon
/

aMuseMe

Running on Zero

App Files Files Community

aMuseMe / app.py

Blazestorm001

chore: tidy Space repository structure

ff66b59 verified 15 days ago

Raw

History Blame Contribute Delete

14.1 kB

	"""
	app.py — Gradio UI entry point for aMuseMe
	"""
	import json
	import subprocess
	import sys
	import time
	from pathlib import Path

	import gradio as gr

	# Put the sibling ``src`` directory at the front of the import path
	# (presumably where the ``amuseme`` package imported below lives).
	SRC_DIR = Path(__file__).parent.joinpath("src")
	_src_dir_entry = str(SRC_DIR)
	if _src_dir_entry not in sys.path:
	    sys.path.insert(0, _src_dir_entry)

	from amuseme.transcriber import transcribe
	from amuseme.renderer import render_frames
	from amuseme.animations import THEME_COLORS as THEMES, FONT_FAMILIES, DEFAULT_FONT_FAMILY
	from amuseme.video_assembler import assemble
	from amuseme.logger import get_logger

	logger = get_logger("app")

	# Optional ZeroGPU integration: record whether the ``spaces`` package can be
	# imported so the app still runs (without it) on a local machine.
	try:
	    import spaces
	except ImportError:
	    HAS_SPACES = False
	else:
	    HAS_SPACES = True

	if HAS_SPACES:
	    from huggingface_hub import snapshot_download

	    logger.info("HF Space detected. Pre-downloading heavy models to avoid ZeroGPU timeout...")
	    # Best-effort warm-up of the local model cache. Each repository is fetched
	    # in its own try block so that one failed download does not prevent the
	    # remaining models from being cached.
	    _preload_failures = 0
	    for _repo_id in (
	        "Systran/faster-whisper-large-v3",
	        "openbmb/MiniCPM5-1B",
	        "stabilityai/sd-turbo",
	    ):
	        try:
	            snapshot_download(repo_id=_repo_id)
	        except Exception as e:
	            # Deliberately non-fatal: the app must still start when a
	            # download fails at import time.
	            _preload_failures += 1
	            logger.warning(f"Pre-download of {_repo_id} failed (will retry during runtime): {e}")
	    if not _preload_failures:
	        logger.info("Model pre-download complete!")


	def _gpu_transcribe(audio_path: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool, theme: str, visual_prompt: str):
	    """Forward the UI settings to ``transcribe`` and return its result unchanged.

	    ``cond_prev`` is passed on under the keyword
	    ``condition_on_previous_text``; every other argument keeps its name.
	    Kept as a plain module-level function so it can be re-bound to a
	    GPU-wrapped version when the ``spaces`` package is available.
	    """
	    transcribe_options = {
	        "model_size": model_size,
	        "use_demucs": use_demucs,
	        "condition_on_previous_text": cond_prev,
	        "use_vad": use_vad,
	        "theme": theme,
	        "visual_prompt": visual_prompt,
	    }
	    return transcribe(audio_path, **transcribe_options)


	# When the ``spaces`` package is available, re-bind the transcription step
	# to the version decorated by ``spaces.GPU`` (requested with duration=120).
	if HAS_SPACES:
	    _zero_gpu_decorator = spaces.GPU(duration=120)
	    _gpu_transcribe = _zero_gpu_decorator(_gpu_transcribe)


	def _build_background_prompts(frames, visual_prompt: str) -> list[str]:
	    """Build one storyboard prompt per pair of consecutive lyric frames."""
	    prompts = []
	    for start in range(0, len(frames), 2):
	        pair = frames[start:start + 2]
	        line_text = " ".join(
	            " ".join(word.text for word in frame.words) for frame in pair
	        ).strip()
	        # Join only the non-empty parts so an empty lyric line or an empty
	        # visual prompt never leaves a dangling ", " in the prompt.
	        prompts.append(", ".join(part for part in (line_text, visual_prompt) if part))
	    return prompts


	def _probe_audio_duration(audio_path: str) -> float:
	    """Return the duration in seconds that ffprobe reports for *audio_path*."""
	    try:
	        probe = subprocess.run(
	            ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", audio_path],
	            capture_output=True,
	            text=True,
	            check=True,
	        )
	        return float(json.loads(probe.stdout)["format"]["duration"])
	    except (OSError, subprocess.CalledProcessError, ValueError, KeyError, TypeError) as err:
	        # Covers: ffprobe missing or not executable, non-zero exit status,
	        # unparsable JSON / non-numeric duration, and a missing "format" or
	        # "duration" entry in the probe output.
	        logger.error(f"[Step 3/4] ffprobe could not read duration of {audio_path}: {err}")
	        raise gr.Error("Could not read the audio duration. Please upload a valid audio file.") from err


	def generate_video(audio_path: str, theme: str, font_family: str, visual_prompt: str, model_size: str, use_demucs: bool, cond_prev: bool, use_vad: bool) -> str:
	    """Run the full lyric-video pipeline for one audio file.

	    Steps: (1) transcribe the audio into lyric frames, (2) generate one
	    background image per pair of frames (best effort), (3) read the audio
	    duration with ffprobe and set up frame rendering, (4) assemble the video.

	    Args:
	        audio_path: Path of the audio file to process.
	        theme: Theme name, forwarded to ``transcribe`` and ``render_frames``.
	        font_family: Font family name, forwarded to ``render_frames``.
	        visual_prompt: Text appended to each lyric line to form the background
	            prompts; also forwarded to ``transcribe``.
	        model_size: Forwarded to ``transcribe``.
	        use_demucs: Forwarded to ``transcribe``.
	        cond_prev: Forwarded to ``transcribe`` as ``condition_on_previous_text``.
	        use_vad: Forwarded to ``transcribe``.

	    Returns:
	        The value returned by ``assemble`` (logged as the output path).

	    Raises:
	        gr.Error: If no audio was supplied, if transcription yields no frames,
	            or if the audio duration cannot be determined.
	    """
	    if not audio_path:
	        raise gr.Error("Please upload an audio file.")

	    pipeline_t0 = time.monotonic()
	    logger.info(
	        "===== PIPELINE START =====\n"
	        f" audio={audio_path} theme={theme} font={font_family} visual_prompt={visual_prompt!r}\n"
	        f" model_size={model_size} demucs={use_demucs} "
	        f"cond_prev={cond_prev} vad={use_vad}"
	    )

	    # Step 1: Transcribe + frame-metadata (Whisper + MiniCPM5-1B). Detailed
	    # input/output for these models is logged inside transcribe().
	    logger.info("[Step 1/4] Transcribing audio + generating frame metadata...")
	    t0 = time.monotonic()
	    frames = _gpu_transcribe(audio_path, model_size, use_demucs, cond_prev, use_vad, theme, visual_prompt)
	    if not frames:
	        raise gr.Error("Could not extract words from audio. Try a cleaner recording.")
	    logger.info(f"[Step 1/4] Done in {time.monotonic() - t0:.1f}s — {len(frames)} frames.")

	    # Step 2: Generate AI storyboard backgrounds — one image per pair of lyric
	    # lines, so the backdrop changes less often than the on-screen text
	    # (renderer expands each image to cover two consecutive lyric frames).
	    logger.info("[Step 2/4] Generating AI storyboard backgrounds...")
	    t0 = time.monotonic()
	    prompts = _build_background_prompts(frames, visual_prompt)
	    logger.info(f"[Step 2/4] Background prompts ({len(prompts)}):\n " + "\n ".join(prompts))
	    bg_images = None
	    try:
	        # Imported lazily so that an import failure is handled like any
	        # other background-generation failure.
	        from amuseme.bg_generator import generate_storyboard
	        bg_images = generate_storyboard(prompts) or None
	    except Exception as e:
	        # Deliberate best-effort: on failure the video is still rendered,
	        # with bg_images left as None.
	        logger.error(f"[Step 2/4] Error generating backgrounds: {e}")
	    else:
	        logger.info(f"[Step 2/4] Done in {time.monotonic() - t0:.1f}s — {len(bg_images or [])} image(s).")

	    # Step 3: Get audio duration via ffprobe
	    duration = _probe_audio_duration(audio_path)
	    logger.info(f"[Step 3/4] Rendering frames — audio duration={duration:.1f}s, {len(frames)} lyric frames...")
	    # t0 is not reset before Step 4, so the Step 4 "Done" time is measured
	    # from here and covers rendering plus assembly.
	    t0 = time.monotonic()
	    frames_gen = render_frames(frames, duration, theme_name=theme, bg_images=bg_images, font_family=font_family)

	    logger.info("[Step 4/4] Assembling video via FFmpeg...")
	    out_path = assemble(frames_gen, audio_path)
	    logger.info(
	        f"[Step 4/4] Done in {time.monotonic() - t0:.1f}s — output={out_path}\n"
	        f"===== PIPELINE COMPLETE in {time.monotonic() - pipeline_t0:.1f}s ====="
	    )

	    return out_path



	# ─── Gradio UI ─────────────────────────────────────────────────────────────

	# Stylesheet handed to demo.launch(css=...). It styles the page background
	# and font, the header banner (.app-header), the panel containers (.panel),
	# labels, text inputs, the generate button (.generate-btn) and the step
	# badges (.steps-row / .step-badge). The class names must stay in sync with
	# HEADER_HTML and with the elem_classes used in the Blocks layout.
	CUSTOM_CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&display=swap');

	body, .gradio-container {
	font-family: 'Inter', sans-serif !important;
	background: #090910 !important;
	}

	.gradio-container {
	max-width: 900px !important;
	margin: 0 auto !important;
	}

	/* Header */
	.app-header {
	text-align: center;
	padding: 2.5rem 1rem 1.5rem;
	background: linear-gradient(135deg, #0f0f1a 0%, #1a0a2e 100%);
	border-radius: 16px;
	margin-bottom: 1.5rem;
	border: 1px solid rgba(255,255,255,0.06);
	}
	.app-header h1 {
	font-size: 3rem;
	font-weight: 700;
	background: linear-gradient(135deg, #a78bfa, #60a5fa, #34d399);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	margin: 0 0 0.4rem;
	letter-spacing: -1px;
	}
	.app-header p {
	color: rgba(255,255,255,0.55);
	font-size: 1rem;
	margin: 0;
	}

	/* Panel */
	.panel {
	background: #0f0f1a !important;
	border: 1px solid rgba(255,255,255,0.08) !important;
	border-radius: 12px !important;
	}

	/* Labels */
	label span {
	color: rgba(255,255,255,0.75) !important;
	font-weight: 500 !important;
	font-size: 0.85rem !important;
	text-transform: uppercase !important;
	letter-spacing: 0.05em !important;
	}

	/* Inputs */
	textarea, input[type="text"] {
	background: #1a1a2e !important;
	border: 1px solid rgba(255,255,255,0.1) !important;
	border-radius: 8px !important;
	color: #e0e0ff !important;
	}

	/* Generate button */
	.generate-btn {
	background: linear-gradient(135deg, #7c3aed, #2563eb) !important;
	border: none !important;
	border-radius: 10px !important;
	color: white !important;
	font-weight: 600 !important;
	font-size: 1rem !important;
	padding: 0.75rem 2rem !important;
	width: 100% !important;
	transition: opacity 0.2s ease !important;
	cursor: pointer !important;
	}
	.generate-btn:hover {
	opacity: 0.9 !important;
	}

	/* Step badges */
	.steps-row {
	display: flex;
	gap: 0.75rem;
	justify-content: center;
	padding: 1rem 0 0.5rem;
	}
	.step-badge {
	background: rgba(255,255,255,0.05);
	border: 1px solid rgba(255,255,255,0.1);
	border-radius: 20px;
	padding: 0.3rem 0.9rem;
	color: rgba(255,255,255,0.5);
	font-size: 0.78rem;
	font-weight: 500;
	}
	"""

	# Static banner markup rendered through gr.HTML at the top of the UI: the
	# app title, a one-line tagline and four step badges. It relies on the
	# .app-header, .steps-row and .step-badge rules defined in CUSTOM_CSS.
	HEADER_HTML = """
	<div class="app-header">
	<h1>🎵 aMuseMe</h1>
	<p>Drop a song. Watch your lyrics come alive with AI-powered kinetic typography and AI-generated backgrounds.</p>
	<div class="steps-row">
	<span class="step-badge">① Upload Audio</span>
	<span class="step-badge">→ Whisper AI Syncs</span>
	<span class="step-badge">→ AI Storyboard Backgrounds</span>
	<span class="step-badge">→ Kinetic Typography Video</span>
	</div>
	</div>
	"""

	# Gradio UI definition: header banner, audio upload and trigger button,
	# appearance options, advanced transcription settings, the output player,
	# and the event wiring that connects the controls to generate_video.
	with gr.Blocks(title="aMuseMe — AI Lyric Video Generator") as demo:
	gr.HTML(HEADER_HTML)

	with gr.Row():
	with gr.Column(scale=1, elem_classes=["panel"]):
	# Step 1 — audio upload (delivered to the handler as a file path).
	gr.Markdown(
	"1. Upload a song — Whisper transcribes the vocals and times each "
	"word to drive the lyric video below."
	)
	audio_input = gr.Audio(
	label="Audio File (song with clear vocals, MP3/WAV)",
	type="filepath",
	sources=["upload"],
	)
	# Sample songs offered as ready-made values for audio_input.
	gr.Examples(
	examples=[
	"assets/samples/ride_like_the_ind_test_song.mp3",
	"assets/samples/hollow-song-test.mp3"
	],
	inputs=audio_input,
	label="Try a sample song"
	)

	# Button that starts the pipeline; its click handler is wired at the
	# end of this layout via generate_btn.click.
	generate_btn = gr.Button(
	"✨ Generate Lyric Video",
	elem_classes=["generate-btn"],
	variant="primary",
	)
	gr.Markdown(
	"Runs the full pipeline: transcribe lyrics → generate AI storyboard "
	"backgrounds → render kinetic typography → assemble the video "
	"(~30–90s depending on song length)."
	)

	with gr.Column(scale=1, elem_classes=["panel"]):
	# Step 2 — appearance options: theme, font and background prompt.
	gr.Markdown("2. Choose how the lyrics look")
	theme_input = gr.Dropdown(
	label="Visual Theme",
	choices=list(THEMES.keys()),
	value="Neon",
	info="Sets the on-screen lyric text color: Dark = white, Light = warm gold, Neon = cyan glow. AI backgrounds are always slightly darkened, so pick whichever color reads best against your Visual Prompt.",
	)
	# NOTE(review): DEFAULT_FONT_FAMILY is imported at the top of the file
	# but the default below is hard-coded to "Serif (Bold)", while the info
	# text recommends bold sans-serif — confirm which default is intended.
	font_input = gr.Dropdown(
	label="Lyric Font",
	choices=list(FONT_FAMILIES.keys()),
	value="Serif (Bold)",
	info="Typeface used for the on-screen lyrics. Bold sans-serif suits most songs; try Serif or Monospace for a different look.",
	)
	visual_prompt_input = gr.Textbox(
	label="Visual Prompt",
	placeholder="e.g. mystical forest, glowing particles, cinematic, digital art, 8k",
	value="neon-lit futuristic city at night, vibrant glowing colors, cyberpunk aesthetic, energetic atmosphere, beautiful starry sky, digital art, highly detailed",
	info="Describes the look of the AI-generated backgrounds (and gives the lyric-timing model a sense of the visual mood).",
	lines=2,
	)

	# Advanced transcription settings; their values are passed to
	# generate_video, which forwards them to the transcription step.
	with gr.Accordion("Advanced Settings", open=False):
	gr.Markdown(
	"Recommendations:\n"
	"- Best Default: Condition on Previous Text ON, VAD ON, Demucs OFF. (Best for most pop/vocal tracks).\n"
	"- Heavily Instrumental Songs: If vocals are very quiet or buried under loud instruments, turn Condition on Previous Text OFF, and turn Demucs ON.\n"
	"- ⚠️ WARNING: Not recommended to use Demucs ON + Condition ON together! It may cause infinite hallucination loops during instrumental breaks."
	)
	cond_prev_input = gr.Checkbox(
	label="Condition on Previous Text",
	value=True,
	info="Helps Whisper understand context by feeding it previous lines. Improves word accuracy but can cause loops if not anchored."
	)
	use_vad_input = gr.Checkbox(
	label="Use VAD (Voice Activity Detection) Filter",
	value=True,
	info="Mutes audio completely when no singing is detected. Very helpful to prevent hallucinations during long instrumental solos."
	)
	# Starts unchecked and locked because cond_prev_input defaults to True;
	# enforce_safe_params below keeps the two checkboxes consistent.
	use_demucs_input = gr.Checkbox(
	label="Use Demucs Vocal Separation",
	value=False,
	interactive=False,
	info="Disabled because Condition on Previous Text is ON (prevents infinite loops)."
	)
	model_input = gr.Dropdown(
	label="Whisper Model",
	choices=["large-v3", "large-v3-turbo", "medium", "small", "base"],
	value="large-v3",
	info="Larger models are more accurate but take longer to process."
	)

	# Returns the update for the Demucs checkbox given the state of
	# "Condition on Previous Text": when conditioning is on, Demucs is forced
	# to False and locked; when it is off, Demucs becomes editable again and
	# its current value is left untouched.
	def enforce_safe_params(cond_prev):
	if cond_prev:
	return gr.update(value=False, interactive=False, info="Disabled because Condition on Previous Text is ON (prevents infinite loops). ")
	else:
	return gr.update(interactive=True, info="Isolates vocals as a preprocessing step. Only enable this if vocals are not clearly audible and are buried under instruments.")

	# Re-evaluate the Demucs checkbox whenever the conditioning checkbox changes.
	cond_prev_input.change(
	fn=enforce_safe_params,
	inputs=[cond_prev_input],
	outputs=[use_demucs_input]
	)

	# Output: read-only player for the rendered video, followed by usage tips.
	with gr.Column(scale=1, elem_classes=["panel"]):
	video_output = gr.Video(
	label="Your Lyric Video (preview and download here)",
	interactive=False,
	height=360,
	)
	gr.Markdown(
	"""
	Tips:
	- Best with clear vocals (ballads, pop, spoken word)
	- Describe the visuals you want in the Visual Prompt — it shapes both the AI backgrounds and the on-screen mood
	- Try different Visual Themes and Fonts to match your song's vibe
	- Processing takes ~30–90s depending on song length
	""",
	elem_classes=["panel"],
	)

	# Run the full pipeline on click. The order of `inputs` must match the
	# positional parameters of generate_video (audio_path, theme, font_family,
	# visual_prompt, model_size, use_demucs, cond_prev, use_vad).
	generate_btn.click(
	fn=generate_video,
	inputs=[audio_input, theme_input, font_input, visual_prompt_input, model_input, use_demucs_input, cond_prev_input, use_vad_input],
	outputs=[video_output],
	api_visibility="public",
	)


	# Script entry point: start the Gradio app with the custom stylesheet when
	# this file is executed directly (not when it is imported).
	if __name__ == "__main__":
	    demo.launch(css=CUSTOM_CSS)