Spaces:

lulavc
/

AnimaStudio

Running on Zero

lulavc

fix: wav shape, float dtype check, extract_audio cleanup, NaN duration, HF token for InferenceClient

ae3213a 11 days ago

25 kB

	import spaces
	import gradio as gr
	import torch
	import torchaudio
	import os
	import gc
	import sys
	import shutil
	import tempfile
	import subprocess
	import threading
	import logging

	import dubbing
	from i18n import T, EXAMPLES, ALL_EXAMPLES_FLAT, TTS_LANGUAGES, MAX_TEXT_LEN, MAX_AUDIO_SEC
	from styles import THEME, CSS

	# Process-wide logging: INFO and above, timestamped, written to stderr.
	logging.basicConfig(
	    stream=sys.stderr,
	    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
	    level=logging.INFO,
	)
	# Module-level logger used by the helpers and endpoints in this file.
	log = logging.getLogger(__name__)

	# ── Config ────────────────────────────────────────────────────────────────────
	# Model identifiers; each can be overridden through an environment variable
	# of the same name.
	ECHOMIMIC_MODEL = os.getenv("ECHOMIMIC_MODEL", "BadToBest/EchoMimicV3")
	CHATTERBOX_MODEL = os.getenv("CHATTERBOX_MODEL", "ResembleAI/chatterbox")
	# ~60s of typical speech at 150 wpm ≈ 900 chars; 1500 is safe headroom
	MAX_DUB_TEXT_LEN = 1500

	# Dropdown label → (width, height) in pixels.
	ASPECT_PRESETS = {
	    "▮ 9:16 · 576×1024": (576, 1024),
	    "◻ 1:1 · 512×512": (512, 512),
	    "▬ 16:9 · 1024×576": (1024, 576),
	}

	# Defaults for the advanced sliders and for the frame rate of generated videos.
	DEFAULT_STEPS = 20
	DEFAULT_CFG = 3.5
	DEFAULT_FPS = 25

	# ── Runtime repo installs (avoid PyPI conflicts) ──────────────────────────────
	# Upstream sources are shallow-cloned at runtime by _clone_repo(), which also
	# inserts the checkout directory at the front of sys.path.
	# Git URL and local checkout directory for EchoMimic V3.
	_ECHOMIMIC_REPO = "https://github.com/antgroup/echomimic_v3.git"
	_ECHOMIMIC_DIR = "/tmp/echomimic_v3"
	# Git URL and local checkout directory for Chatterbox TTS.
	_CHATTERBOX_REPO = "https://github.com/resemble-ai/chatterbox.git"
	_CHATTERBOX_DIR = "/tmp/chatterbox"
	# Held by _clone_repo() for the whole check/clone/sys.path sequence.
	_clone_lock = threading.Lock()


	def _clone_repo(repo_url: str, dest: str, label: str):
	    """Shallow-clone ``repo_url`` into ``dest`` once and put ``dest`` on ``sys.path``.

	    The whole sequence runs under ``_clone_lock`` so concurrent callers cannot
	    race on the same directory. A checkout counts as present when
	    ``dest/.git`` exists; anything else found at ``dest`` is removed before
	    cloning.

	    Args:
	        repo_url: Git URL to clone.
	        dest: Target directory; also the entry inserted at the front of ``sys.path``.
	        label: Human-readable name used only in log messages.

	    Raises:
	        subprocess.CalledProcessError: ``git clone`` exited with a non-zero status.
	        subprocess.TimeoutExpired: the clone did not finish within 180 seconds.
	        OSError: ``git`` could not be started or ``dest`` could not be cleared.
	    """
	    with _clone_lock:
	        if not os.path.exists(os.path.join(dest, ".git")):
	            if os.path.exists(dest):
	                shutil.rmtree(dest)
	            log.info("Cloning %s…", label)
	            try:
	                subprocess.run(
	                    ["git", "clone", "--depth=1", repo_url, dest],
	                    check=True, timeout=180,
	                )
	            except (subprocess.SubprocessError, OSError):
	                # On timeout the child is killed and cannot clean up after
	                # itself, so a half-written checkout (including .git) may be
	                # left behind. Remove it so the next call does not mistake
	                # it for a complete clone.
	                shutil.rmtree(dest, ignore_errors=True)
	                raise
	            log.info("%s cloned", label)
	        if dest not in sys.path:
	            sys.path.insert(0, dest)


	def _ensure_echomimic_repo():
	    """Clone the EchoMimic V3 repository if needed and expose it on ``sys.path``."""
	    _clone_repo(repo_url=_ECHOMIMIC_REPO, dest=_ECHOMIMIC_DIR, label="EchoMimic V3")


	def _ensure_chatterbox_repo():
	    """Clone the Chatterbox TTS repository if needed and expose it on ``sys.path``."""
	    _clone_repo(repo_url=_CHATTERBOX_REPO, dest=_CHATTERBOX_DIR, label="Chatterbox TTS")


	# ── Model singletons ──────────────────────────────────────────────────────────
	# Lazily populated caches: _load_tts() fills _tts_model, _load_echomimic()
	# fills _echo_pipe and _echo_mode. None means "not loaded yet".
	_tts_model = None
	_echo_pipe = None
	_echo_mode = None


	def _load_tts():
	    """Return the cached Chatterbox TTS model, creating it on first use.

	    On the first call the Chatterbox repository is made importable, the model
	    is instantiated with ``device="cpu"`` and stored in ``_tts_model``; later
	    calls return that same object.
	    """
	    global _tts_model
	    if _tts_model is not None:
	        return _tts_model

	    _ensure_chatterbox_repo()
	    # Imported here because the package only becomes importable after the clone.
	    from chatterbox.tts import ChatterboxTTS

	    log.info("Loading Chatterbox TTS…")
	    _tts_model = ChatterboxTTS.from_pretrained(device="cpu")
	    log.info("Chatterbox TTS ready")
	    return _tts_model


	def _load_echomimic():
	    """Return ``(pipeline, mode)`` for EchoMimic V3, loading it on first use.

	    Two loaders are tried in order: the pipeline class from the cloned
	    EchoMimic repository, then a generic ``diffusers.DiffusionPipeline``.
	    The first one that succeeds is cached in ``_echo_pipe`` / ``_echo_mode``.

	    Returns:
	        Tuple of the cached pipeline object and the mode string.

	    Raises:
	        RuntimeError: both loaders failed; the error of the last loader tried
	            is attached as ``__cause__`` so the root cause is not lost.
	    """
	    global _echo_pipe, _echo_mode
	    if _echo_pipe is not None:
	        return _echo_pipe, _echo_mode

	    last_error = None

	    try:
	        _ensure_echomimic_repo()
	        from echomimic_v3.pipelines.pipeline_echomimic_v3 import EchoMimicV3Pipeline
	        log.info("Loading EchoMimic V3 (local)…")
	        _echo_pipe = EchoMimicV3Pipeline.from_pretrained(ECHOMIMIC_MODEL, torch_dtype=torch.float16)
	        _echo_mode = "local"
	        log.info("EchoMimic V3 ready (local)")
	        return _echo_pipe, _echo_mode
	    except Exception as e:
	        log.warning("EchoMimic V3 local import failed: %s", e)
	        last_error = e

	    try:
	        from diffusers import DiffusionPipeline
	        log.info("Loading EchoMimic V3 via diffusers…")
	        # SECURITY NOTE: trust_remote_code=True executes code shipped with the
	        # model repository named by ECHOMIMIC_MODEL (overridable through the
	        # environment). Only point it at repositories you trust.
	        _echo_pipe = DiffusionPipeline.from_pretrained(
	            ECHOMIMIC_MODEL, torch_dtype=torch.float16, trust_remote_code=True,
	        )
	        # NOTE(review): the mode is "local" for this fallback as well; kept
	        # as-is because callers may compare against that value.
	        _echo_mode = "local"
	        log.info("EchoMimic V3 ready (diffusers)")
	        return _echo_pipe, _echo_mode
	    except Exception as e:
	        log.warning("EchoMimic V3 diffusers load failed: %s", e)
	        last_error = e

	    raise RuntimeError(
	        "EchoMimic V3 could not be loaded. Check requirements and model availability."
	    ) from last_error


	# ── Video utilities ───────────────────────────────────────────────────────────
	def _coerce_frames(frames):
	"""Normalise pipeline output to a list of (H, W, 3) uint8 numpy arrays."""
	import numpy as np
	result = []
	for frame in frames:
	if hasattr(frame, "save"):
	arr = np.array(frame.convert("RGB"))
	elif hasattr(frame, "cpu"):
	arr = frame.cpu().float().numpy()
	if arr.ndim == 3 and arr.shape[0] in (1, 3, 4):
	arr = arr.transpose(1, 2, 0)
	if arr.dtype.kind == 'f' and arr.max() <= 1.0:
	arr = (arr * 255).clip(0, 255)
	arr = arr.astype(np.uint8)
	else:
	arr = np.array(frame)
	if arr.ndim == 2:
	import cv2
	arr = cv2.cvtColor(arr, cv2.COLOR_GRAY2RGB)
	elif arr.ndim == 3 and arr.shape[2] == 4:
	arr = arr[:, :, :3]
	result.append(arr)
	return result


	def _mux_video(frames, audio_path: str, fps: int = DEFAULT_FPS) -> str:
	    """Encode frames (PIL/tensor/ndarray) together with an audio file into an MP4.

	    The frames are normalised with ``_coerce_frames``, written as numbered PNG
	    files into a temporary directory and handed to ``ffmpeg`` (H.264 video,
	    AAC audio, ``-shortest``). If anything fails, the output file is removed
	    (best effort) and the exception is re-raised.

	    Args:
	        frames: Frames in any form accepted by ``_coerce_frames``.
	        audio_path: Path of the audio file to mux in.
	        fps: Frame rate passed to ffmpeg's ``-framerate``.

	    Returns:
	        Path of the newly created ``.mp4`` file; it is not deleted here.
	    """
	    import cv2

	    rgb_frames = _coerce_frames(frames)

	    # Reserve the output file name first; ffmpeg overwrites it because of "-y".
	    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as placeholder:
	        out_path = placeholder.name

	    try:
	        with tempfile.TemporaryDirectory() as frame_dir:
	            for index, rgb in enumerate(rgb_frames):
	                png_path = os.path.join(frame_dir, f"{index:06d}.png")
	                # cv2.imwrite expects BGR channel order.
	                cv2.imwrite(png_path, cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR))

	            ffmpeg_args = ["ffmpeg", "-y", "-loglevel", "error"]
	            ffmpeg_args += ["-framerate", str(fps)]
	            ffmpeg_args += ["-i", os.path.join(frame_dir, "%06d.png")]
	            ffmpeg_args += ["-i", audio_path]
	            ffmpeg_args += ["-c:v", "libx264", "-preset", "fast", "-crf", "22"]
	            ffmpeg_args += ["-c:a", "aac", "-b:a", "128k"]
	            ffmpeg_args += ["-shortest", "-pix_fmt", "yuv420p"]
	            ffmpeg_args.append(out_path)
	            subprocess.run(ffmpeg_args, check=True, timeout=120)
	    except Exception:
	        # Do not leave a partial or empty MP4 behind on failure.
	        if os.path.exists(out_path):
	            try:
	                os.unlink(out_path)
	            except OSError:
	                pass
	        raise

	    return out_path


	# ── TTS ───────────────────────────────────────────────────────────────────────
	def _run_tts(text: str, voice_ref: str | None, emotion: float, language: str = "English") -> str:
	    """Synthesise ``text`` with Chatterbox TTS and write it to a temporary WAV.

	    The model is moved to CUDA for generation and always moved back to CPU
	    afterwards (followed by ``torch.cuda.empty_cache()``), whether or not
	    generation succeeded.

	    Args:
	        text: Text to speak; surrounding whitespace is stripped.
	        voice_ref: Optional path of a reference recording passed to the model
	            as ``audio_prompt_path``; falsy values mean "no reference".
	        emotion: Passed to the model as ``exaggeration``.
	        language: Only logged here. NOTE(review): it is not forwarded to
	            ``model.generate`` — confirm whether the loaded model can take a
	            language argument.

	    Returns:
	        Path of the generated ``.wav`` file; the caller owns and deletes it.

	    Raises:
	        Exception: whatever generation or saving raises; a partially written
	            output file is removed first.
	    """
	    model = _load_tts()
	    log.info("TTS: language=%s text_len=%d emotion=%.2f", language, len(text), emotion)
	    model.to("cuda")
	    out_path = None
	    try:
	        wav = model.generate(
	            text=text.strip(),
	            audio_prompt_path=voice_ref if voice_ref else None,
	            exaggeration=float(emotion),
	        )
	        # torchaudio.save requires 2-D tensor [channels, samples]
	        if wav.ndim == 1:
	            wav = wav.unsqueeze(0)
	        # Saving needs a CPU tensor without autograd history; this is a
	        # no-op when the model already returned one.
	        wav = wav.detach().cpu()
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
	            out_path = f.name
	        torchaudio.save(out_path, wav, model.sr)
	        return out_path
	    except Exception:
	        # Remove a half-written file before propagating the error.
	        if out_path and os.path.exists(out_path):
	            try:
	                os.unlink(out_path)
	            except OSError:
	                pass
	        raise
	    finally:
	        model.to("cpu")
	        torch.cuda.empty_cache()


	# ── EchoMimic ─────────────────────────────────────────────────────────────────
	def _run_echomimic(portrait_img, audio_path: str, width: int, height: int,
	                   num_steps: int, guidance_scale: float) -> str:
	    """Run the EchoMimic pipeline on a portrait plus audio and return an MP4 path.

	    The pipeline is moved to CUDA for the call and back to CPU afterwards in
	    all cases. Three output shapes are understood: an object with ``frames``
	    (first entry is muxed), an object with ``videos`` (first entry is muxed,
	    split along axis 0 when it offers ``unbind``), or a plain string, which is
	    returned unchanged.

	    Raises:
	        ValueError: the pipeline returned something else.
	    """
	    pipe, _mode = _load_echomimic()
	    pipe.to("cuda")
	    try:
	        result = pipe(
	            ref_image=portrait_img,
	            audio_path=audio_path,
	            width=width,
	            height=height,
	            num_inference_steps=num_steps,
	            guidance_scale=guidance_scale,
	            fps=DEFAULT_FPS,
	        )

	        if hasattr(result, "frames"):
	            frame_seq = result.frames[0]
	        elif hasattr(result, "videos"):
	            clip = result.videos[0]
	            # A tensor-like clip is split into one item per frame.
	            frame_seq = list(clip.unbind(0)) if hasattr(clip, "unbind") else clip
	        elif isinstance(result, str):
	            # The pipeline already produced a file; pass its path through.
	            return result
	        else:
	            raise ValueError(f"Unexpected pipeline output type: {type(result)}")

	        return _mux_video(frame_seq, audio_path)
	    finally:
	        pipe.to("cpu")
	        torch.cuda.empty_cache()
	        gc.collect()


	# ── Phase 1: Generate video endpoint ─────────────────────────────────────────
	@spaces.GPU(duration=120)
	def generate(portrait_img, input_mode: str, text: str, tts_language: str,
	             voice_ref, audio_file, aspect_ratio: str, emotion: float,
	             num_steps: int, guidance_scale: float, lang: str,
	             progress=gr.Progress(track_tqdm=True)):
	    """Create a talking-head video from a portrait and either text or audio.

	    In ``"text"`` mode the text is validated and synthesised with
	    ``_run_tts``; in any other mode the uploaded audio file is used after its
	    duration is checked against ``MAX_AUDIO_SEC``. The audio then drives
	    ``_run_echomimic``.

	    Args:
	        portrait_img: Portrait image; required.
	        input_mode: ``"text"`` selects TTS, anything else the uploaded audio.
	        text: Text to speak (text mode), at most ``MAX_TEXT_LEN`` characters.
	        tts_language: Language name forwarded to ``_run_tts``.
	        voice_ref: Optional voice reference path; ignored if the file is missing.
	        audio_file: Path of the uploaded audio (audio mode).
	        aspect_ratio: Key of ``ASPECT_PRESETS``; unknown keys fall back to 512×512.
	        emotion: Emotion intensity forwarded to ``_run_tts``.
	        num_steps: Inference steps (cast to ``int``).
	        guidance_scale: Guidance scale (cast to ``float``).
	        lang: UI language key in ``T``; unknown keys fall back to English.
	        progress: Gradio progress tracker; not used directly in the body.

	    Returns:
	        Path of the generated MP4, as returned by ``_run_echomimic``.

	    Raises:
	        gr.Error: on invalid input, GPU out-of-memory or any other failure
	            (the latter with a generic message; details go to the log).
	    """
	    t = T.get(lang, T["🇺🇸 English"])
	    if portrait_img is None:
	        raise gr.Error(t["err_no_portrait"])

	    width, height = ASPECT_PRESETS.get(aspect_ratio, (512, 512))
	    # Path of the synthesised WAV, if any; removed in ``finally``.
	    _tts_tmp: str | None = None

	    try:
	        if input_mode == "text":
	            if not text or not text.strip():
	                raise gr.Error(t["err_no_text"])
	            if len(text) > MAX_TEXT_LEN:
	                raise gr.Error(t["err_text_long"])
	            # A reference whose file has disappeared is silently dropped.
	            if voice_ref and not os.path.exists(voice_ref):
	                voice_ref = None
	            _tts_tmp = _run_tts(text, voice_ref, emotion, language=tts_language)
	            audio_path = _tts_tmp
	        else:
	            if audio_file is None:
	                raise gr.Error(t["err_no_audio"])
	            info = torchaudio.info(audio_file)
	            if (info.num_frames / info.sample_rate) > MAX_AUDIO_SEC:
	                raise gr.Error(t["err_audio_long"])
	            audio_path = audio_file

	        return _run_echomimic(portrait_img, audio_path, width, height, int(num_steps), float(guidance_scale))

	    except torch.cuda.OutOfMemoryError as exc:
	        raise gr.Error(t["err_oom"]) from exc
	    except gr.Error as e:
	        # User-facing errors raised above pass through unchanged.
	        log.warning("Generation gr.Error: %s", e)
	        raise
	    except Exception as e:
	        log.error("Generation failed: %s", e, exc_info=True)
	        raise gr.Error("Generation failed. Please try different settings or try again.") from e
	    finally:
	        # Best-effort removal of the temporary TTS audio; only file-system
	        # errors are ignored, matching the cleanup in the other helpers.
	        if _tts_tmp and os.path.exists(_tts_tmp):
	            try:
	                os.unlink(_tts_tmp)
	            except OSError:
	                pass
	        torch.cuda.empty_cache()
	        gc.collect()


	# ── Phase 2: Dubbing endpoint ─────────────────────────────────────────────────
	@spaces.GPU(duration=120)
	def dub_video(video_input, target_lang: str, voice_ref, emotion: float, lang: str,
	              progress=gr.Progress(track_tqdm=True)):
	    """Dub a video into ``target_lang`` and return the result plus details.

	    Steps: check the duration, extract the audio, transcribe it, translate
	    the transcript when its detected language differs from ``target_lang``,
	    synthesise the translated text with ``_run_tts`` and mux the new audio
	    onto the input video.

	    Args:
	        video_input: Input video; required.
	        target_lang: Target language name, compared with the transcript's
	            ``language_display`` and forwarded to translation and TTS.
	        voice_ref: Optional voice reference path; ignored if the file is missing.
	        emotion: Emotion intensity forwarded to ``_run_tts``.
	        lang: UI language key in ``T``; unknown keys fall back to English.
	        progress: Gradio progress callback, updated before each step.

	    Returns:
	        Tuple ``(output_path, transcript_text, translated_text, status)``.

	    Raises:
	        gr.Error: on invalid input, translation failure, GPU out-of-memory or
	            any other failure (the latter with a generic message).
	    """
	    t = T.get(lang, T["🇺🇸 English"])
	    # Intermediate files created here; all are removed in ``finally``.
	    temp_files: list[str] = []

	    try:
	        if video_input is None:
	            raise gr.Error(t["err_no_video"])

	        duration = dubbing.get_video_duration(video_input)
	        if duration > dubbing.MAX_DUB_AUDIO_SEC:
	            raise gr.Error(t["err_video_long"])

	        progress(0.10, desc="Extracting audio…")
	        audio_path = dubbing.extract_audio(video_input)
	        temp_files.append(audio_path)

	        progress(0.25, desc="Transcribing…")
	        transcript = dubbing.transcribe(audio_path)
	        dubbing._unload_whisper()

	        source_display = transcript.language_display
	        if source_display != target_lang:
	            progress(0.45, desc="Translating…")
	            try:
	                translated_text = dubbing.translate(transcript.text, source_display, target_lang)
	            except Exception as exc:
	                log.error("Translation failed: %s", exc, exc_info=True)
	                raise gr.Error(t["err_translate"]) from exc
	        else:
	            # Same language: speak the transcript as-is.
	            translated_text = transcript.text

	        if len(translated_text) > MAX_DUB_TEXT_LEN:
	            raise gr.Error(t["err_dub_text_long"])

	        progress(0.60, desc="Synthesizing speech…")
	        if voice_ref and not os.path.exists(voice_ref):
	            voice_ref = None
	        dubbed_audio = _run_tts(translated_text, voice_ref, emotion, language=target_lang)
	        temp_files.append(dubbed_audio)

	        progress(0.85, desc="Combining video…")
	        output_path = dubbing.mux_dubbed_video(video_input, dubbed_audio)

	        # A plain "|" separator; the escaped form leaked a backslash into the UI.
	        status = f"✓ {source_display} → {target_lang} | {duration:.1f}s"
	        return output_path, transcript.text, translated_text, status

	    except torch.cuda.OutOfMemoryError as exc:
	        raise gr.Error(t["err_oom"]) from exc
	    except gr.Error as e:
	        # User-facing errors raised above pass through unchanged.
	        log.warning("Dubbing gr.Error: %s", e)
	        raise
	    except Exception as e:
	        log.error("Dubbing failed: %s", e, exc_info=True)
	        raise gr.Error("Dubbing failed. Please try a shorter video or different settings.") from e
	    finally:
	        # Best-effort removal of intermediates; only file-system errors are
	        # ignored, matching the cleanup in the other helpers.
	        for fp in temp_files:
	            if fp and os.path.exists(fp):
	                try:
	                    os.unlink(fp)
	                except OSError:
	                    pass
	        torch.cuda.empty_cache()
	        gc.collect()


	# ── Language switcher ─────────────────────────────────────────────────────────
	def switch_language(lang: str):
	    """Build the component updates that re-label the UI for ``lang``.

	    Unknown language keys fall back to the English table. The input mode is
	    reset to ``"text"`` and the text/audio groups are shown/hidden to match.

	    Returns:
	        Tuple of 26 ``gr.update`` objects: 16 for the Create tab followed by
	        10 for the Dub tab, in the order of the ``_lang_out`` list.
	    """
	    tr = T.get(lang, T["🇺🇸 English"])

	    def relabel(key, **extra):
	        # Update a component's label (plus any extra properties).
	        return gr.update(label=tr[key], **extra)

	    def revalue(key):
	        # Update a component's value (button captions, markdown text).
	        return gr.update(value=tr[key])

	    mode_choices = [(tr["mode_text"], "text"), (tr["mode_audio"], "audio")]

	    # Phase 1 (16)
	    create_tab = (
	        relabel("portrait_label"),
	        relabel("input_mode_label", choices=mode_choices, value="text"),
	        relabel("text_label", placeholder=tr["text_ph"]),
	        relabel("tts_lang_label"),
	        relabel("voice_ref_label"),
	        relabel("emotion_label", info=tr["emotion_info"]),
	        relabel("audio_label"),
	        relabel("aspect_label"),
	        relabel("advanced"),
	        relabel("steps_label", info=tr["steps_info"]),
	        relabel("guidance_label", info=tr["guidance_info"]),
	        revalue("generate"),
	        revalue("examples_header"),
	        gr.update(visible=True),   # text_group
	        gr.update(visible=False),  # audio_group
	        relabel("output_label"),
	    )

	    # Phase 2 (10)
	    dub_tab = (
	        relabel("dub_video_label"),
	        relabel("dub_target_label"),
	        relabel("dub_voice_label"),
	        relabel("dub_emotion_label"),
	        revalue("dub_btn"),
	        relabel("dub_output_label"),
	        relabel("dub_transcript"),
	        relabel("dub_translation"),
	        relabel("dub_status"),
	        relabel("dub_details"),
	    )

	    # 26 outputs — must match _lang_out list order
	    return create_tab + dub_tab


	def _toggle_input_mode(mode: str, _lang: str):
	    """Return visibility updates for (text group, audio group).

	    The text group is shown exactly when ``mode`` is ``"text"``; the audio
	    group gets the opposite visibility. ``_lang`` is accepted but unused.
	    """
	    show_text = mode == "text"
	    text_update = gr.update(visible=show_text)
	    audio_update = gr.update(visible=not show_text)
	    return text_update, audio_update


	# ── Interface ─────────────────────────────────────────────────────────────────
	# Root Blocks container, bound to `demo` (queued and launched under the
	# __main__ guard at the end of the file).
	# NOTE(review): this copy of the file has lost its indentation; restore the
	# nesting of the layout below from the original before running it.
	with gr.Blocks(title="AnimaStudio 🎬") as demo:

	# Static header banner: title, tagline and feature badges.
	gr.HTML("""
	<div class="as-header">
	<h1>🎬 AnimaStudio</h1>
	<p class="tagline">AI Talking Head Video Creator & Video Dubbing Studio</p>
	<div class="badges">
	<span class="badge badge-purple">🎭 Lip Sync</span>
	<span class="badge badge-pink">🗣️ 23 TTS Languages</span>
	<span class="badge badge-cyan">🎙️ Voice Cloning</span>
	<span class="badge badge-teal">🎙️ Video Dubbing</span>
	<span class="badge">⚡ EchoMimic V3</span>
	<span class="badge badge-gold">🌐 EN · PT-BR · ES · AR</span>
	<span class="badge">🤖 MCP Server</span>
	</div>
	</div>
	""")

	# UI-language picker: one choice per key of the translation table T.
	# Its value is passed to generate(), dub_video() and switch_language().
	lang_selector = gr.Radio(
	choices=list(T.keys()),
	value="🇺🇸 English",
	label=None,
	container=False,
	elem_id="lang-selector",
	)

	with gr.Tabs():

	# ══ Tab 1: Create Video ════════════════════════════════════════════════
	# Inputs for generate(): portrait, text-or-audio source, format and
	# advanced sampling settings; the result is shown in output_video.
	with gr.Tab("🎬 Create Video", id="tab-create"):
	with gr.Row(equal_height=False):
	with gr.Column(scale=1, min_width=360):
	# type="pil": the portrait is requested as a PIL image.
	portrait = gr.Image(
	label="Portrait Photo · front-facing face",
	type="pil",
	sources=["upload", "webcam"],
	)
	# Values are the literal strings "text" / "audio"; labels come from the English table.
	input_mode = gr.Radio(
	choices=[(T["🇺🇸 English"]["mode_text"], "text"),
	(T["🇺🇸 English"]["mode_audio"], "audio")],
	value="text",
	label="Audio Input",
	)
	# Text/TTS controls; visible by default and toggled by _toggle_input_mode().
	with gr.Group(visible=True) as text_group:
	text_input = gr.Textbox(
	label="Text",
	placeholder="Type what you want the avatar to say...",
	lines=4, max_lines=10,
	)
	tts_language = gr.Dropdown(choices=TTS_LANGUAGES, value="English", label="Speech Language")
	with gr.Row():
	# type="filepath": the reference is requested as a file path.
	voice_ref = gr.Audio(
	label="Voice Reference (optional — clone voice style)",
	type="filepath", sources=["upload", "microphone"],
	format="wav",
	)
	emotion = gr.Slider(0.0, 1.0, value=0.5, step=0.05,
	label="Emotion Intensity", info="0 = neutral · 1 = very expressive")
	# Audio-upload controls; hidden by default and toggled by _toggle_input_mode().
	with gr.Group(visible=False) as audio_group:
	audio_upload = gr.Audio(
	label="Audio File · WAV/MP3/FLAC · max 30 s",
	type="filepath", sources=["upload", "microphone"],
	format="wav",
	)
	# The default value must be one of the ASPECT_PRESETS keys.
	aspect_ratio = gr.Dropdown(choices=list(ASPECT_PRESETS.keys()),
	value="◻ 1:1 · 512×512", label="Format")
	with gr.Accordion("⚙️ Advanced Settings", open=False) as adv_acc:
	num_steps = gr.Slider(5, 50, value=DEFAULT_STEPS, step=1,
	label="Inference Steps", info="More steps = higher quality, slower")
	guidance_scale = gr.Slider(1.0, 10.0, value=DEFAULT_CFG, step=0.5,
	label="Guidance Scale", info="Higher = follows audio more strictly")
	gen_btn = gr.Button("🎬 Generate Video", variant="primary", elem_id="gen-btn", size="lg")
	examples_header = gr.Markdown("### 💡 Try These Examples")
	# Each example fills text_input, tts_language and emotion.
	gr.Examples(examples=ALL_EXAMPLES_FLAT, inputs=[text_input, tts_language, emotion], label=None)

	with gr.Column(scale=1, min_width=440):
	# Receives the value returned by generate().
	output_video = gr.Video(label="Generated Video", format="mp4", autoplay=True,
	height=640, elem_id="output-video", buttons=["download"])

	# ══ Tab 2: Dub Video ═══════════════════════════════════════════════════
	# Inputs for dub_video(): source video, target language, optional voice
	# reference and emotion; outputs are the dubbed video plus three detail boxes.
	with gr.Tab("🎙️ Dub Video", id="tab-dub"):
	with gr.Row(equal_height=False):
	with gr.Column(scale=1, min_width=360):
	dub_video_input = gr.Video(label="Input Video · max 60 seconds",
	sources=["upload"])
	dub_target_lang = gr.Dropdown(choices=TTS_LANGUAGES, value="English", label="Target Language")
	# type="filepath": the reference is requested as a file path.
	dub_voice_ref = gr.Audio(label="Voice Reference (optional — clone voice style)",
	type="filepath", sources=["upload", "microphone"],
	format="wav")
	dub_emotion = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Emotion Intensity")
	dub_btn = gr.Button("🎙️ Dub Video", variant="primary", elem_id="dub-btn", size="lg")
	# Static explainer shown under the button.
	gr.HTML("""
	<div style="color:#94a3b8;font-size:0.82rem;margin-top:0.5rem;padding:0.75rem;
	background:rgba(6,182,212,0.05);border-radius:0.5rem;
	border:1px solid rgba(6,182,212,0.15);">
	<strong>How it works:</strong> Whisper transcribes → NLLB-200 translates →
	Chatterbox TTS synthesizes → audio replaces original track.
	</div>
	""")

	with gr.Column(scale=1, min_width=440):
	dub_output_video = gr.Video(label="Dubbed Video", format="mp4", autoplay=True,
	height=480, elem_id="dub-output-video", buttons=["download"])
	# Read-only boxes filled from dub_video()'s transcript, translation and status.
	with gr.Accordion("Details", open=False) as dub_details_acc:
	dub_transcript_box = gr.Textbox(label="Detected Transcript", interactive=False, lines=4)
	dub_translation_box = gr.Textbox(label="Translation", interactive=False, lines=4)
	dub_status_box = gr.Textbox(label="Status", interactive=False, lines=2)

	# Static footer: links to the model pages with their licences, and the Space author.
	gr.HTML("""
	<div class="as-footer">
	<strong>Models:</strong>
	<a href="https://huggingface.co/BadToBest/EchoMimicV3" target="_blank">EchoMimic V3</a>
	(Apache 2.0)  ·
	<a href="https://huggingface.co/ResembleAI/chatterbox" target="_blank">Chatterbox TTS</a>
	(MIT)  ·
	<a href="https://huggingface.co/openai/whisper-large-v3-turbo" target="_blank">Whisper Turbo</a>
	(MIT)  ·
	<a href="https://huggingface.co/facebook/nllb-200-distilled-600M" target="_blank">NLLB-200</a>
	(CC-BY-NC)  ·
	<strong>Space by:</strong>
	<a href="https://huggingface.co/lulavc" target="_blank">lulavc</a>
	·  ZeroGPU  ·  A10G
	</div>
	""")

	# ── Events ────────────────────────────────────────────────────────────────
	# Generate button: inputs are listed in generate()'s parameter order
	# (its `progress` parameter has a default and is not wired here).
	gen_btn.click(
	generate,
	inputs=[portrait, input_mode, text_input, tts_language,
	voice_ref, audio_upload, aspect_ratio, emotion,
	num_steps, guidance_scale, lang_selector],
	outputs=output_video,
	)

	# Show the text group or the audio group depending on the selected mode.
	# lang_selector is passed too, but _toggle_input_mode() does not use it.
	input_mode.change(_toggle_input_mode, inputs=[input_mode, lang_selector],
	outputs=[text_group, audio_group])

	# Dub button: the four outputs match dub_video()'s return tuple
	# (video path, transcript, translation, status).
	dub_btn.click(
	dub_video,
	inputs=[dub_video_input, dub_target_lang, dub_voice_ref, dub_emotion, lang_selector],
	outputs=[dub_output_video, dub_transcript_box, dub_translation_box, dub_status_box],
	)

	# Language switcher — 26 outputs, must match switch_language() return tuple order
	_lang_out = [
	# Phase 1 (16)
	portrait, input_mode, text_input, tts_language,
	voice_ref, emotion, audio_upload, aspect_ratio,
	adv_acc, num_steps, guidance_scale, gen_btn, examples_header,
	text_group, audio_group, output_video,
	# Phase 2 (10)
	dub_video_input, dub_target_lang, dub_voice_ref,
	dub_emotion, dub_btn, dub_output_video,
	dub_transcript_box, dub_translation_box,
	dub_status_box, dub_details_acc,
	]
	# Re-label every component in _lang_out whenever the UI language changes.
	lang_selector.change(switch_language, inputs=lang_selector, outputs=_lang_out)


	if __name__ == "__main__":
	    # Queue requests (at most 10 waiting, one running per event by default),
	    # then start the app with the project theme and stylesheet.
	    demo.queue(default_concurrency_limit=1, max_size=10)
	    demo.launch(
	        theme=THEME,
	        css=CSS,
	        mcp_server=True,
	        ssr_mode=False,
	    )