Spaces:

LTTEAM
/

LatentSync

Paused

App Files Files Community

LatentSync / app.py

LTTEAM

Update app.py

cafa72c verified 7 months ago

raw

history blame contribute delete

9.1 kB

	import os
	import uuid
	import shutil
	import tempfile
	import gradio as gr
	import torch
	from moviepy.editor import VideoFileClip
	from pydub import AudioSegment
	from huggingface_hub import snapshot_download
	from omegaconf import OmegaConf
	from diffusers import AutoencoderKL, DDIMScheduler
	from latentsync.models.unet import UNet3DConditionModel
	from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
	from latentsync.whisper.audio2feature import Audio2Feature
	from accelerate.utils import set_seed
	# Attribution strings interpolated into the UI header and footer.
	AUTHOR = "Lý Trần"
	COMMUNITY = "LTTEAM"
	COMMUNITY_LINK = "https://www.facebook.com/groups/622526090937760"

	# Hugging Face Hub repository downloaded into the local checkpoint folder.
	REPO_ID = "LTTEAM/Nhep_Mieng"

	# Runs when the module is loaded: make sure the folder exists, then pull
	# the repository snapshot into it.
	os.makedirs("checkpoints", exist_ok=True)
	snapshot_download(repo_id=REPO_ID, local_dir="./checkpoints")
	def process_video(input_video_path: str, temp_dir: str = "temp_video") -> str:
	    """Re-encode a video into ``temp_dir``, keeping at most its first 10 seconds.

	    The video is always re-encoded, even when it is already short enough.

	    Args:
	        input_video_path: Path of the source video file.
	        temp_dir: Directory that receives the output; created if missing.

	    Returns:
	        Path of the written file, named ``crop_<source file name>`` inside
	        ``temp_dir``.
	    """
	    os.makedirs(temp_dir, exist_ok=True)
	    output_path = os.path.join(
	        temp_dir, f"crop_{os.path.basename(input_video_path)}"
	    )

	    source_clip = VideoFileClip(input_video_path)
	    try:
	        clip = source_clip
	        if clip.duration > 10:
	            clip = clip.subclip(0, 10)
	        clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
	    finally:
	        # Close the clip opened above even when encoding fails, so its
	        # underlying reader resources are not left open.
	        source_clip.close()
	    return output_path
	def process_audio(input_audio_path: str, temp_dir: str) -> str:
	    """Write a WAV copy of an audio file, truncated to its first 8 seconds.

	    Args:
	        input_audio_path: Path of the source audio file.
	        temp_dir: Directory that receives the output; created if missing.

	    Returns:
	        Path of the written file, ``trim_audio.wav`` inside ``temp_dir``.
	    """
	    os.makedirs(temp_dir, exist_ok=True)
	    audio = AudioSegment.from_file(input_audio_path)

	    # pydub measures segment length and slice bounds in milliseconds.
	    max_ms = 8 * 1000
	    if len(audio) > max_ms:
	        audio = audio[:max_ms]

	    output_path = os.path.join(temp_dir, "trim_audio.wav")
	    # export() returns the output file object still open; close it so the
	    # handle is released immediately instead of whenever it gets collected.
	    audio.export(output_path, format="wav").close()
	    return output_path
	def _run_lipsync(video_path, audio_path, device):
	    """Load the models, build the lip-sync pipeline and render one video.

	    Args:
	        video_path: Path of the video handed to the pipeline.
	        audio_path: Path of the audio handed to the pipeline.
	        device: Torch device string, ``"cuda"`` or ``"cpu"``.

	    Returns:
	        Relative path of the generated ``output_<hex>.mp4`` file.

	    Raises:
	        NotImplementedError: If the configured ``cross_attention_dim`` is
	            neither 768 nor 384.
	    """
	    # float16 on GPU, float32 on CPU; shared by the VAE, UNet and pipeline.
	    weight_dtype = torch.float16 if device == "cuda" else torch.float32

	    # Load the configuration and locate the checkpoints.
	    config = OmegaConf.load("configs/unet/second_stage.yaml")
	    unet_ckpt = "checkpoints/latentsync_unet.pt"
	    scheduler = DDIMScheduler.from_pretrained("configs")

	    # Pick the Whisper checkpoint from cross_attention_dim.
	    dim = config.model.cross_attention_dim
	    if dim == 768:
	        whisper_ckpt = "checkpoints/whisper/small.pt"
	    elif dim == 384:
	        whisper_ckpt = "checkpoints/whisper/tiny.pt"
	    else:
	        raise NotImplementedError("cross_attention_dim phải là 768 hoặc 384")

	    # Audio encoder.
	    audio_encoder = Audio2Feature(
	        model_path=whisper_ckpt,
	        device=device,
	        num_frames=config.data.num_frames,
	    )

	    # VAE.
	    vae = AutoencoderKL.from_pretrained(
	        "stabilityai/sd-vae-ft-mse",
	        torch_dtype=weight_dtype,
	    )
	    vae.config.scaling_factor = 0.18215
	    vae.config.shift_factor = 0

	    # UNet, cast to the dtype chosen above.
	    unet, _ = UNet3DConditionModel.from_pretrained(
	        OmegaConf.to_container(config.model),
	        unet_ckpt,
	        device=device,
	    )
	    unet = unet.to(dtype=weight_dtype)

	    # Assemble the pipeline and move it to the target device.
	    pipeline = LipsyncPipeline(
	        vae=vae,
	        audio_encoder=audio_encoder,
	        unet=unet,
	        scheduler=scheduler,
	    ).to(device)

	    # Seeding: -1 means "do not fix the seed", so with the value below the
	    # torch.seed() branch is the one that runs. Set another value here to
	    # make runs reproducible through set_seed().
	    seed = -1
	    if seed != -1:
	        set_seed(seed)
	    else:
	        torch.seed()
	    print(f"[INFO] Seed khởi tạo: {torch.initial_seed()}")

	    # Run the pipeline; a uuid keeps output names from colliding.
	    output_id = uuid.uuid4().hex
	    result_path = f"output_{output_id}.mp4"
	    pipeline(
	        video_path=video_path,
	        audio_path=audio_path,
	        video_out_path=result_path,
	        video_mask_path=result_path.replace(".mp4", "_mask.mp4"),
	        num_frames=config.data.num_frames,
	        num_inference_steps=config.run.inference_steps,
	        guidance_scale=1.0,
	        weight_dtype=weight_dtype,
	        width=config.data.resolution,
	        height=config.data.resolution,
	    )
	    return result_path


	def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
	    """Gradio handler: lip-sync ``video_path`` to ``audio_path``.

	    Args:
	        video_path: Path of the uploaded video.
	        audio_path: Path of the uploaded or recorded audio.
	        progress: Gradio progress tracker declared with ``track_tqdm=True``;
	            not referenced directly in the body.

	    Returns:
	        Relative path of the generated ``output_<hex>.mp4`` file.

	    Raises:
	        gr.Error: If either input is missing.
	        NotImplementedError: If the configured ``cross_attention_dim`` is
	            neither 768 nor 384.
	    """
	    # Fail early with a readable message instead of an obscure error from
	    # deep inside the pipeline when the user has not provided both inputs.
	    if not video_path or not audio_path:
	        raise gr.Error("Vui lòng tải lên cả video và âm thanh trước khi chạy.")

	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    print(f"[INFO] Chạy trên device: {device}")

	    # NOTE(review): this matches only a Space whose SPACE_ID contains
	    # "fffiloni/LatentSync"; confirm that is still the intended Space id.
	    space_id = os.environ.get("SPACE_ID", "")
	    is_shared_ui = "fffiloni/LatentSync" in space_id

	    temp_dir = None
	    try:
	        # On the shared UI, work on shortened copies of the inputs.
	        if is_shared_ui:
	            temp_dir = tempfile.mkdtemp()
	            video_path = process_video(video_path, temp_dir)
	            audio_path = process_audio(audio_path, temp_dir)
	        return _run_lipsync(video_path, audio_path, device)
	    finally:
	        # Best-effort removal of the temporary copies, also on failure.
	        if temp_dir is not None:
	            shutil.rmtree(temp_dir, ignore_errors=True)
	# Stylesheet passed to gr.Blocks(css=...). It declares the colour variables
	# and styles the page body, the #main-container column, headings, buttons,
	# boxes, the footer and the example container.
	custom_css = """
	:root {
	--primary: #4CAF50;
	--secondary: #8BC34A;
	--accent: #FFC107;
	--dark: #1E1E1E;
	--light: #F5F5F5;
	}

	body {
	font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
	background-color: var(--light);
	}

	div#main-container {
	margin: 0 auto;
	max-width: 900px;
	background: white;
	padding: 2rem;
	border-radius: 12px;
	box-shadow: 0 4px 12px rgba(0,0,0,0.1);
	}

	h1 {
	color: var(--primary);
	border-bottom: 2px solid var(--secondary);
	padding-bottom: 0.5rem;
	}

	.gr-button {
	background: var(--primary) !important;
	color: white !important;
	border: none !important;
	padding: 0.75rem 1.5rem !important;
	border-radius: 8px !important;
	font-weight: 600 !important;
	transition: all 0.3s ease !important;
	}

	.gr-button:hover {
	background: var(--secondary) !important;
	transform: translateY(-2px) !important;
	box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important;
	}

	.gr-box {
	border-radius: 8px !important;
	border: 1px solid #e0e0e0 !important;
	}

	footer {
	text-align: center;
	margin-top: 2rem;
	color: #666;
	font-size: 0.9rem;
	}

	.example-container {
	background: #f9f9f9;
	padding: 1rem;
	border-radius: 8px;
	margin-top: 1rem;
	}
	"""

	# Build the Gradio interface: header, collapsible introduction, input and
	# output columns, sample examples, footer, and the button event wiring.
	with gr.Blocks(css=custom_css, title="LatentSync - Đồng bộ môi bằng AI") as demo:
	    with gr.Column(elem_id="main-container"):
	        # Header
	        gr.Markdown("# 🎤 LatentSync - Đồng bộ môi bằng AI")
	        # "\\|" keeps the backslash-pipe text in the Markdown source without
	        # relying on an invalid "\|" escape sequence in the Python literal.
	        gr.Markdown(f"Tác giả: {AUTHOR} \\| Cộng đồng: [{COMMUNITY}]({COMMUNITY_LINK})")

	        # Introduction
	        with gr.Accordion("ℹ️ Giới thiệu ứng dụng", open=False):
	            gr.Markdown("""
	Ứng dụng sử dụng mô hình AI tiên tiến để đồng bộ chuyển động môi trong video với âm thanh đầu vào.

	Cách sử dụng:
	1. Tải lên video chứa khuôn mặt cần đồng bộ môi
	2. Tải lên file âm thanh hoặc ghi âm trực tiếp
	3. Nhấn nút "Chạy đồng bộ" và chờ kết quả

	Lưu ý:
	- Video nên có khuôn mặt rõ ràng, ánh sáng tốt
	- Âm thanh cần rõ ràng, không nhiễu
	- Thời gian xử lý phụ thuộc vào độ dài video và cấu hình máy
	""")

	        # Input/Output
	        with gr.Row():
	            with gr.Column():
	                gr.Markdown("### 🎥 Đầu vào")
	                video_in = gr.Video(label="Video đầu vào (MP4)", format="mp4", interactive=True)
	                audio_in = gr.Audio(label="Âm thanh đầu vào", type="filepath", interactive=True)
	                with gr.Row():
	                    btn = gr.Button("🚀 Chạy đồng bộ", variant="primary")
	                    clear_btn = gr.Button("🔄 Xóa hết")

	            with gr.Column():
	                gr.Markdown("### 📼 Kết quả")
	                video_out = gr.Video(label="Video kết quả", interactive=False)
	                with gr.Row():
	                    download_btn = gr.Button("💾 Tải xuống")

	        # Sample examples
	        with gr.Accordion("📂 Ví dụ mẫu", open=True):
	            gr.Examples(
	                examples=[
	                    ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
	                    ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
	                    ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
	                ],
	                inputs=[video_in, audio_in],
	                outputs=[video_out],
	                fn=main,  # main processing function for the examples
	                label="Nhấn vào ví dụ để thử ngay",
	                # cache_examples is deliberately not passed: enabling it
	                # needs additional configuration.
	            )

	        # Footer
	        gr.Markdown(f"""
	---
	Ứng dụng được phát triển bởi {AUTHOR} và cộng đồng {COMMUNITY}
	Phiên bản 1.0 \\| [Tham gia nhóm]({COMMUNITY_LINK}) để cập nhật và hỗ trợ
	""")

	    # Event handlers
	    btn.click(fn=main, inputs=[video_in, audio_in], outputs=[video_out])
	    # Reset both inputs and the result.
	    clear_btn.click(lambda: [None, None, None], outputs=[video_in, audio_in, video_out])
	    # NOTE(review): this handler only echoes the current result back into
	    # the same component; confirm whether a real download action is wanted.
	    download_btn.click(lambda x: x, inputs=[video_out], outputs=[video_out])

	# Start the server on all interfaces, on the port named by $PORT
	# (7860 when unset), with a share link requested and errors shown.
	_launch_options = dict(
	    share=True,
	    show_error=True,
	    server_name="0.0.0.0",
	    server_port=int(os.environ.get("PORT", 7860)),
	)
	demo.launch(**_launch_options)