Marcos

Add H.264 WebSocket streaming and React.js web interface

32bba92 7 months ago

27.9 kB

	"""
	MuseTalk Fast Engine - Optimized for real-time inference
	Implements: Pre-loaded models, Avatar caching, Parallel chunk processing

	MXFP4/Blackwell Optimizations (RTX 5090):
	- torch.compile with max-autotune mode
	- FP16 precision for all models
	- TF32 enabled for matmuls
	- Optional CUDA Graphs for fixed batch sizes
	"""

	import os
	import sys
	import torch
	import numpy as np
	import cv2
	import pickle
	import glob
	import copy
	import time
	import uuid
	import threading
	from pathlib import Path
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from queue import Queue
	from typing import List, Tuple, Optional, Dict
	import subprocess

	# Put the parent of this file's directory at the front of sys.path
	# (presumably so the `musetalk` package imported inside the methods below
	# resolves when this file is run from the server folder - confirm layout).
	sys.path.insert(0, str(Path(__file__).parent.parent))

	# Silence every Python warning process-wide (no category or module filter is given).
	import warnings
	warnings.filterwarnings('ignore')

	# Check for Blackwell GPU optimizations
	def _check_blackwell_gpu() -> bool:
	"""Check if running on Blackwell GPU (compute capability 12.x)"""
	if not torch.cuda.is_available():
	return False
	cc = torch.cuda.get_device_capability(0)
	return cc[0] >= 12

	IS_BLACKWELL = _check_blackwell_gpu()


	class FastMuseTalkEngine:
	"""
	Optimized MuseTalk engine with:
	1. Pre-loaded models in memory
	2. Pre-processed avatar cache
	3. Parallel chunk processing
	"""

	# The single shared engine instance; stays None until __new__ creates it.
	_instance = None
	# Lock held by __new__ while the shared instance is being created.
	_lock = threading.Lock()

	def __new__(cls):
	if cls._instance is None:
	with cls._lock:
	if cls._instance is None:
	cls._instance = super().__new__(cls)
	cls._instance._initialized = False
	return cls._instance

	def __init__(self):
	if self._initialized:
	return

	print("=" * 50)
	print("Initializing FastMuseTalkEngine...")
	print("=" * 50)

	self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	self.dtype = torch.float16 if torch.cuda.is_available() else torch.float32

	# Paths
	self.base_dir = Path(__file__).parent.parent
	self.server_dir = Path(__file__).parent

	# New video paths - separate videos for talking and idle
	self.avatar_video = self.server_dir / "avatar_videos" / "falando.mp4" # For lip-sync
	self.idle_video = self.server_dir / "avatar_videos" / "idle.mp4" # For idle animation

	# Cache directory inside server folder
	self.cache_dir = self.server_dir / "avatar_cache"
	self.results_dir = self.base_dir / "results" / "server"

	self.cache_dir.mkdir(parents=True, exist_ok=True)
	self.results_dir.mkdir(parents=True, exist_ok=True)

	# Models (loaded once)
	self.vae = None
	self.unet = None
	self.pe = None
	self.audio_processor = None
	self.timesteps = None

	# Avatar cache for lip-sync (falando.mp4)
	self.avatar_frame_paths = [] # paths to frame images
	self.avatar_frames = [] # actual frame numpy arrays
	self.avatar_latents = [] # pre-computed VAE latents
	self.avatar_coords = [] # bounding box tuples (x1,y1,x2,y2)
	self.avatar_fps = 30
	self.avatar_loaded = False

	# Idle animation cache (idle.mp4)
	self.idle_frames = [] # frames for idle animation
	self.idle_fps = 30
	self.idle_loaded = False

	# Thread pool for parallel processing
	self.executor = ThreadPoolExecutor(max_workers=4)

	self._initialized = True

	def load_models(self, use_blackwell_optimizations: bool = True):
	"""Load all models into GPU memory with optional Blackwell optimizations"""
	if self.vae is not None:
	print("Models already loaded")
	return

	print("\n[1/4] Loading VAE, UNet, PE models...")
	start = time.time()

	from musetalk.utils.utils import load_all_model
	# Blackwell optimizations are now applied automatically in load_all_model
	self.vae, self.unet, self.pe = load_all_model(
	use_blackwell_optimizations=use_blackwell_optimizations
	)

	# Track optimization state
	self._blackwell_optimized = use_blackwell_optimizations and IS_BLACKWELL

	print(f" Models loaded in {time.time()-start:.2f}s")

	print("\n[2/4] Loading Whisper audio processor...")
	start = time.time()

	from musetalk.whisper.audio2feature import Audio2Feature
	whisper_path = self.base_dir / "models" / "whisper" / "tiny.pt"
	self.audio_processor = Audio2Feature(model_path=str(whisper_path))

	print(f" Whisper loaded in {time.time()-start:.2f}s")

	self.timesteps = torch.tensor([0], device=self.device)

	print("\n✓ All models loaded and ready!")

	def preprocess_avatar(self, force=False):
	"""Pre-process avatar video (falando.mp4) and cache everything"""
	if self.avatar_loaded and not force:
	print("Avatar already preprocessed")
	return

	cache_file = self.cache_dir / "falando_cache.pkl"

	# Try to load from cache
	if cache_file.exists() and not force:
	print("\n[3/4] Loading avatar (falando) from cache...")
	start = time.time()

	with open(cache_file, 'rb') as f:
	cache = pickle.load(f)

	self.avatar_frame_paths = cache['frame_paths']
	self.avatar_frames = cache['frames'] # actual numpy arrays
	self.avatar_coords = cache['coords'] # list of (x1,y1,x2,y2) tuples
	self.avatar_fps = cache['fps']

	# Recompute latents (can't pickle CUDA tensors easily)
	self._compute_latents()

	self.avatar_loaded = True
	print(f" Avatar (falando) loaded from cache in {time.time()-start:.2f}s")
	return

	print("\n[3/4] Pre-processing avatar video - falando.mp4 (first time only)...")
	start = time.time()

	# Extract frames
	print(" Extracting frames from falando.mp4...")
	import imageio
	frames_dir = self.cache_dir / "falando_frames"
	frames_dir.mkdir(exist_ok=True)

	reader = imageio.get_reader(str(self.avatar_video))
	meta = reader.get_meta_data()
	self.avatar_fps = meta.get('fps', 30)

	frame_paths = []
	for i, frame in enumerate(reader):
	frame_path = frames_dir / f"{i:08d}.png"
	if not frame_path.exists():
	imageio.imwrite(str(frame_path), frame)
	frame_paths.append(str(frame_path))

	self.avatar_frame_paths = frame_paths
	print(f" Extracted {len(frame_paths)} frames")

	# Get landmarks and bounding boxes
	# Returns: coords_list (list of (x1,y1,x2,y2) tuples), frames (list of numpy arrays)
	print(" Computing face landmarks...")
	from musetalk.utils.preprocessing import get_landmark_and_bbox
	self.avatar_coords, self.avatar_frames = get_landmark_and_bbox(frame_paths, 0)

	# Save to cache
	cache = {
	'frame_paths': self.avatar_frame_paths,
	'frames': self.avatar_frames,
	'coords': self.avatar_coords,
	'fps': self.avatar_fps
	}
	with open(cache_file, 'wb') as f:
	pickle.dump(cache, f)

	# Compute latents
	self._compute_latents()

	self.avatar_loaded = True
	print(f" Avatar (falando) preprocessed in {time.time()-start:.2f}s")

	def load_idle_video(self, force=False):
	"""Load idle video (idle.mp4) frames for idle animation"""
	if self.idle_loaded and not force:
	print("Idle video already loaded")
	return

	cache_file = self.cache_dir / "idle_cache.pkl"

	# Try to load from cache
	if cache_file.exists() and not force:
	print("\n[5/5] Loading idle video from cache...")
	start = time.time()

	with open(cache_file, 'rb') as f:
	cache = pickle.load(f)

	self.idle_frames = cache['frames']
	self.idle_fps = cache['fps']
	self.idle_loaded = True
	print(f" Idle video loaded from cache: {len(self.idle_frames)} frames in {time.time()-start:.2f}s")
	return

	print("\n[5/5] Loading idle video - idle.mp4 (first time only)...")
	start = time.time()

	# Extract all frames from idle.mp4
	cap = cv2.VideoCapture(str(self.idle_video))
	self.idle_fps = cap.get(cv2.CAP_PROP_FPS)

	self.idle_frames = []
	while True:
	ret, frame = cap.read()
	if not ret:
	break
	# Convert BGR to RGB
	frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	self.idle_frames.append(frame_rgb)

	cap.release()

	# Save to cache
	cache = {
	'frames': self.idle_frames,
	'fps': self.idle_fps
	}
	with open(cache_file, 'wb') as f:
	pickle.dump(cache, f)

	self.idle_loaded = True
	print(f" Idle video loaded: {len(self.idle_frames)} frames @ {self.idle_fps} fps in {time.time()-start:.2f}s")

	def _compute_latents(self):
	"""Pre-compute VAE latents for all avatar frames"""
	print("\n[4/4] Pre-computing VAE latents...")
	start = time.time()

	# coord_placeholder is a tuple (0.0, 0.0, 0.0, 0.0) used when no face detected
	coord_placeholder = (0.0, 0.0, 0.0, 0.0)

	self.avatar_latents = []

	for i, (bbox, frame) in enumerate(zip(self.avatar_coords, self.avatar_frames)):
	# Skip invalid bboxes
	if bbox == coord_placeholder or bbox is None:
	if self.avatar_latents:
	self.avatar_latents.append(self.avatar_latents[-1])
	else:
	# Create a placeholder latent
	self.avatar_latents.append(None)
	continue

	x1, y1, x2, y2 = [int(c) for c in bbox]

	# Ensure valid crop coordinates
	h, w = frame.shape[:2]
	x1, y1 = max(0, x1), max(0, y1)
	x2, y2 = min(w, x2), min(h, y2)

	if x2 <= x1 or y2 <= y1:
	if self.avatar_latents:
	self.avatar_latents.append(self.avatar_latents[-1])
	else:
	self.avatar_latents.append(None)
	continue

	crop = frame[y1:y2, x1:x2]
	crop = cv2.resize(crop, (256, 256), interpolation=cv2.INTER_LANCZOS4)

	latent = self.vae.get_latents_for_unet(crop)
	self.avatar_latents.append(latent)

	# Replace any None latents with the first valid one
	first_valid = None
	for lat in self.avatar_latents:
	if lat is not None:
	first_valid = lat
	break

	if first_valid is not None:
	self.avatar_latents = [lat if lat is not None else first_valid for lat in self.avatar_latents]

	print(f" Computed {len(self.avatar_latents)} latents in {time.time()-start:.2f}s")

	def initialize(self):
	"""Full initialization - call once at startup"""
	total_start = time.time()
	print("\n" + "=" * 50)
	print("FAST ENGINE INITIALIZATION")
	print("=" * 50)

	self.load_models()
	self.preprocess_avatar() # Load falando.mp4 for lip-sync
	self.load_idle_video() # Load idle.mp4 for idle animation

	print("\n" + "=" * 50)
	print(f"INITIALIZATION COMPLETE in {time.time()-total_start:.2f}s")
	print("=" * 50 + "\n")

	def process_audio_chunk(self, audio_path: str, chunk_idx: int,
	start_frame: int, end_frame: int) -> List[np.ndarray]:
	"""Process a single audio chunk and generate frames"""

	# Get audio features for this chunk
	whisper_feature = self.audio_processor.audio2feat(audio_path)
	whisper_chunks = self.audio_processor.feature2chunks(
	feature_array=whisper_feature,
	fps=self.avatar_fps
	)

	num_frames = min(len(whisper_chunks), end_frame - start_frame)
	generated_frames = []

	# Process in batches
	batch_size = 8
	for i in range(0, num_frames, batch_size):
	batch_end = min(i + batch_size, num_frames)

	# Get whisper features for batch
	whisper_batch = whisper_chunks[i:batch_end]

	# Get corresponding avatar latents (cycle if needed)
	latent_batch = []
	for j in range(len(whisper_batch)):
	frame_idx = (start_frame + i + j) % len(self.avatar_latents)
	latent_batch.append(self.avatar_latents[frame_idx])

	# Convert to tensors
	audio_tensor = torch.stack([
	torch.FloatTensor(w).to(self.device)
	for w in whisper_batch
	])
	latent_tensor = torch.cat(latent_batch, dim=0).to(self.device)

	# Convert to FP16 if Blackwell optimizations enabled
	if self._blackwell_optimized:
	audio_tensor = audio_tensor.half()
	latent_tensor = latent_tensor.half()

	# Generate with UNet
	with torch.no_grad():
	pred = self.unet(
	latent_tensor,
	self.timesteps,
	encoder_hidden_states=audio_tensor
	).sample

	recon = self.vae.decode_latents(pred)

	for frame in recon:
	generated_frames.append(frame)

	return generated_frames

	def compose_video_chunk(self, generated_frames: List[np.ndarray],
	start_frame: int) -> List[np.ndarray]:
	"""Compose generated frames with original avatar"""
	from musetalk.utils.blending import get_image

	coord_placeholder = (0.0, 0.0, 0.0, 0.0)
	composed_frames = []

	for i, gen_frame in enumerate(generated_frames):
	frame_idx = (start_frame + i) % len(self.avatar_frames)
	bbox = self.avatar_coords[frame_idx]
	orig_frame = self.avatar_frames[frame_idx].copy()

	if bbox != coord_placeholder and bbox is not None:
	x1, y1, x2, y2 = [int(c) for c in bbox]
	gen_resized = cv2.resize(gen_frame, (x2-x1, y2-y1))
	# Simple paste instead of blending for speed
	orig_frame[y1:y2, x1:x2] = gen_resized
	composed_frames.append(orig_frame)
	else:
	composed_frames.append(orig_frame)

	return composed_frames

	def generate_video_fast(self, audio_path: str, output_path: str = None,
	resolution: int = 256, batch_size: int = 8,
	callback=None) -> str:
	"""
	Generate lip-sync video with all optimizations.
	resolution: output face size (128, 192, 256, 320) - affects quality/speed
	batch_size: frames per batch (4, 8, 16) - affects VRAM usage/speed
	callback(progress, message) is called with progress updates.
	"""
	if output_path is None:
	output_path = str(self.results_dir / f"fast_{uuid.uuid4().hex[:8]}.mp4")

	print(f"\n[VIDEO GENERATION] resolution={resolution}, batch_size={batch_size}")
	print(f"video in {self.avatar_fps} FPS, audio idx in 50FPS")

	total_start = time.time()

	if callback:
	callback(0, "Extraindo features de áudio...")

	# Get total audio length and features
	audio_start = time.time()
	whisper_feature = self.audio_processor.audio2feat(audio_path)
	whisper_chunks = self.audio_processor.feature2chunks(
	feature_array=whisper_feature,
	fps=self.avatar_fps
	)
	total_frames = len(whisper_chunks)

	if callback:
	callback(10, f"Processando {total_frames} frames...")

	print(f"Audio processing: {time.time()-audio_start:.2f}s, {total_frames} frames")

	# Process in parallel chunks
	chunk_size = 50 # frames per chunk
	num_chunks = (total_frames + chunk_size - 1) // chunk_size

	all_generated = [None] * total_frames

	gen_start = time.time()

	# Process chunks
	for chunk_idx in range(num_chunks):
	start_frame = chunk_idx * chunk_size
	end_frame = min(start_frame + chunk_size, total_frames)

	# Get whisper features for this chunk
	chunk_whisper = whisper_chunks[start_frame:end_frame]

	# Process in batches (using batch_size parameter)
	for i in range(0, len(chunk_whisper), batch_size):
	batch_end = min(i + batch_size, len(chunk_whisper))
	whisper_batch = chunk_whisper[i:batch_end]

	# Get latents
	latent_batch = []
	for j in range(len(whisper_batch)):
	frame_idx = (start_frame + i + j) % len(self.avatar_latents)
	latent_batch.append(self.avatar_latents[frame_idx])

	# Convert to tensors
	audio_tensor = torch.stack([
	torch.FloatTensor(w).to(self.device)
	for w in whisper_batch
	])
	latent_tensor = torch.cat(latent_batch, dim=0).to(self.device)

	# Convert to FP16 if Blackwell optimizations enabled
	if self._blackwell_optimized:
	audio_tensor = audio_tensor.half()
	latent_tensor = latent_tensor.half()

	# Generate
	with torch.no_grad():
	pred = self.unet.model(
	latent_tensor,
	self.timesteps,
	encoder_hidden_states=audio_tensor
	).sample
	recon = self.vae.decode_latents(pred)

	for j, frame in enumerate(recon):
	all_generated[start_frame + i + j] = frame

	progress = int(10 + (chunk_idx + 1) / num_chunks * 60)
	if callback:
	callback(progress, f"Gerado chunk {chunk_idx+1}/{num_chunks}")

	print(f"Generation: {time.time()-gen_start:.2f}s")

	if callback:
	callback(70, "Compondo frames...")

	# Compose all frames
	compose_start = time.time()
	coord_placeholder = (0.0, 0.0, 0.0, 0.0)

	# Scale factor based on resolution (256 is default/full quality)
	scale_factor = resolution / 256.0

	final_frames = []
	for i, gen_frame in enumerate(all_generated):
	if gen_frame is None:
	continue

	frame_idx = i % len(self.avatar_frames)
	bbox = self.avatar_coords[frame_idx]
	orig_frame = self.avatar_frames[frame_idx].copy()

	if bbox != coord_placeholder and bbox is not None:
	x1, y1, x2, y2 = [int(c) for c in bbox]
	# Ensure valid coords
	h, w = orig_frame.shape[:2]
	x1, y1 = max(0, x1), max(0, y1)
	x2, y2 = min(w, x2), min(h, y2)
	if x2 > x1 and y2 > y1:
	# Apply resolution scaling for quality/speed tradeoff
	target_w = max(32, int((x2-x1) * scale_factor))
	target_h = max(32, int((y2-y1) * scale_factor))
	gen_resized = cv2.resize(gen_frame, (target_w, target_h), interpolation=cv2.INTER_LINEAR)
	gen_resized = cv2.resize(gen_resized, (x2-x1, y2-y1), interpolation=cv2.INTER_LINEAR)
	orig_frame[y1:y2, x1:x2] = gen_resized
	final_frames.append(orig_frame)
	else:
	final_frames.append(orig_frame)

	print(f"Composition: {time.time()-compose_start:.2f}s")

	if callback:
	callback(85, "Escrevendo vídeo...")

	# Write video
	write_start = time.time()
	temp_video = output_path.replace('.mp4', '_temp.mp4')

	h, w = final_frames[0].shape[:2]
	fourcc = cv2.VideoWriter_fourcc(*'mp4v')
	out = cv2.VideoWriter(temp_video, fourcc, self.avatar_fps, (w, h))

	for frame in final_frames:
	out.write(frame)
	out.release()

	print(f"Video write: {time.time()-write_start:.2f}s")

	if callback:
	callback(95, "Adicionando áudio...")

	# Add audio
	cmd = [
	"ffmpeg", "-y", "-v", "quiet",
	"-i", temp_video,
	"-i", audio_path,
	"-c:v", "libx264", "-preset", "fast",
	"-c:a", "aac",
	"-shortest",
	output_path
	]
	subprocess.run(cmd, capture_output=True)

	try:
	os.remove(temp_video)
	except:
	pass

	total_time = time.time() - total_start
	print(f"\nTotal generation time: {total_time:.2f}s for {total_frames} frames")
	print(f"Speed: {total_frames/total_time:.1f} fps")

	if callback:
	callback(100, "Concluído!")

	return output_path

	def generate_frames_streaming(self, audio_path: str, resolution: int = 256, batch_size: int = 8):
	"""
	Generator that yields frames one by one as they're generated.
	Perfect for WebSocket streaming.

	Yields:
	{"type": "info", "total_frames": N, "fps": 30}
	{"type": "frame", "frame": numpy_array, "index": N}
	"""
	print(f"\n[STREAMING] Starting frame generation: resolution={resolution}, batch_size={batch_size}")

	# Get audio features
	whisper_feature = self.audio_processor.audio2feat(audio_path)
	whisper_chunks = self.audio_processor.feature2chunks(
	feature_array=whisper_feature,
	fps=self.avatar_fps
	)
	total_frames = len(whisper_chunks)

	print(f"[STREAMING] Total frames to generate: {total_frames}")

	# Send info first
	yield {"type": "info", "total_frames": total_frames, "fps": self.avatar_fps}

	coord_placeholder = (0.0, 0.0, 0.0, 0.0)
	scale_factor = resolution / 256.0
	frame_index = 0

	# Process in batches but yield frames one by one
	for batch_start in range(0, total_frames, batch_size):
	batch_end = min(batch_start + batch_size, total_frames)
	whisper_batch = whisper_chunks[batch_start:batch_end]

	# Get latents for batch
	latent_batch = []
	for j in range(len(whisper_batch)):
	idx = (batch_start + j) % len(self.avatar_latents)
	latent_batch.append(self.avatar_latents[idx])

	# Convert to tensors
	audio_tensor = torch.stack([
	torch.FloatTensor(w).to(self.device)
	for w in whisper_batch
	])
	latent_tensor = torch.cat(latent_batch, dim=0).to(self.device)

	# Convert to FP16 if Blackwell optimizations enabled
	if self._blackwell_optimized:
	audio_tensor = audio_tensor.half()
	latent_tensor = latent_tensor.half()

	# Generate batch
	with torch.no_grad():
	pred = self.unet.model(
	latent_tensor,
	self.timesteps,
	encoder_hidden_states=audio_tensor
	).sample
	recon = self.vae.decode_latents(pred)

	# Yield each frame from the batch
	for i, gen_frame in enumerate(recon):
	global_idx = batch_start + i
	if global_idx >= total_frames:
	break

	# Compose with original frame
	avatar_idx = global_idx % len(self.avatar_frames)
	bbox = self.avatar_coords[avatar_idx]
	orig_frame = self.avatar_frames[avatar_idx].copy()

	if bbox != coord_placeholder and bbox is not None:
	x1, y1, x2, y2 = [int(c) for c in bbox]
	h, w = orig_frame.shape[:2]
	x1, y1 = max(0, x1), max(0, y1)
	x2, y2 = min(w, x2), min(h, y2)

	if x2 > x1 and y2 > y1:
	target_w = max(32, int((x2-x1) * scale_factor))
	target_h = max(32, int((y2-y1) * scale_factor))
	gen_resized = cv2.resize(gen_frame, (target_w, target_h), interpolation=cv2.INTER_LINEAR)
	gen_resized = cv2.resize(gen_resized, (x2-x1, y2-y1), interpolation=cv2.INTER_LINEAR)
	orig_frame[y1:y2, x1:x2] = gen_resized

	yield {"type": "frame", "frame": orig_frame, "index": frame_index}
	frame_index += 1

	print(f"[STREAMING] Completed: {frame_index} frames")

	def get_idle_frames(self) -> Tuple[List[np.ndarray], float]:
	"""
	Get all frames from idle.mp4 for idle animation.
	Returns: (frames_list, fps)
	"""
	if not self.idle_loaded or not self.idle_frames:
	return [], 30.0

	return self.idle_frames, self.idle_fps

	def generate_video_streaming(self, audio_path: str,
	                             chunk_callback=None) -> List[str]:
	    """
	    Generate the video as a sequence of independent 3-second chunk files.

	    The audio is cut into chunks with ffmpeg (16 kHz mono WAV) and each
	    chunk is rendered with ``generate_video_fast``.

	    Args:
	        audio_path: source audio. Its duration is read with the ``wave``
	            module; if that fails (for example a non-WAV file) a duration
	            of 10 seconds is assumed.
	        chunk_callback: optional ``chunk_callback(chunk_idx, chunk_path)``
	            invoked as soon as each chunk video is ready.

	    Returns:
	        The chunk video paths in playback order.

	    Raises:
	        RuntimeError: ffmpeg failed to extract an audio chunk.
	    """
	    import wave

	    chunk_duration = 3  # seconds

	    # Get audio duration
	    try:
	        with wave.open(audio_path, 'rb') as wav:
	            duration = wav.getnframes() / wav.getframerate()
	    except (wave.Error, EOFError, OSError, ZeroDivisionError):
	        # Fallback for non-wav or unreadable input
	        duration = 10  # assume 10 seconds

	    num_chunks = max(1, int(np.ceil(duration / chunk_duration)))
	    chunk_paths = []

	    for i in range(num_chunks):
	        start_time = i * chunk_duration
	        # One random token per chunk keeps concurrent calls from
	        # overwriting each other's temporary audio file.
	        token = uuid.uuid4().hex[:6]

	        # Extract audio chunk
	        chunk_audio = str(self.results_dir / f"chunk_{i}_{token}_audio.wav")
	        cmd = [
	            "ffmpeg", "-y", "-v", "quiet",
	            "-i", audio_path,
	            "-ss", str(start_time),
	            "-t", str(chunk_duration),
	            "-ar", "16000", "-ac", "1",
	            chunk_audio
	        ]

	        try:
	            extract = subprocess.run(cmd, capture_output=True)
	            if extract.returncode != 0:
	                # Fail here with a clear message instead of deep inside
	                # the feature extraction on a missing file.
	                raise RuntimeError(
	                    f"ffmpeg failed with exit code {extract.returncode} "
	                    f"while extracting audio chunk {i}"
	                )

	            # Generate video for this chunk.
	            # NOTE(review): generate_video_fast has no start-frame argument,
	            # so every chunk starts again at avatar frame 0 (the original
	            # computed an unused ``start_frame`` here).
	            chunk_video = str(self.results_dir / f"chunk_{i}_{token}.mp4")
	            self.generate_video_fast(chunk_audio, chunk_video)
	        finally:
	            # Cleanup audio chunk (best effort, also on failure)
	            try:
	                os.remove(chunk_audio)
	            except OSError:
	                pass

	        chunk_paths.append(chunk_video)

	        if chunk_callback:
	            chunk_callback(i, chunk_video)

	    return chunk_paths


	# Module-level handle to the shared engine; None until get_engine() is first called
	_engine: Optional[FastMuseTalkEngine] = None


	def get_engine() -> FastMuseTalkEngine:
	    """Return the shared engine, constructing it on the first call."""
	    global _engine
	    if _engine is not None:
	        return _engine
	    _engine = FastMuseTalkEngine()
	    return _engine


	def initialize_engine():
	    """Fetch the shared engine, run its full initialization and return it.

	    Meant to be called at server startup.
	    """
	    shared = get_engine()
	    shared.initialize()
	    return shared


	if __name__ == "__main__":
	    # Manual smoke test: initialize everything, then render one sample clip
	    engine = initialize_engine()

	    # The sample is only rendered when the file exists on this machine
	    test_audio = "/workspace/MuseTalk1.5/data/audio/teacher_intro.wav"
	    sample_present = os.path.exists(test_audio)
	    if sample_present:
	        print("\nTesting video generation...")
	        output = engine.generate_video_fast(test_audio)
	        print(f"Output: {output}")