Spaces:
Runtime error
Runtime error
| import argparse | |
| import copy | |
| import os | |
| from pathlib import Path | |
| import sys | |
| import logging | |
| from collections import OrderedDict | |
| from pprint import pprint | |
| import random | |
| import gradio as gr | |
| from argparse import Namespace | |
# Add the MuseV project directory (sibling "../MuseV") to the module search path.
sys.path.append(os.path.join(os.path.dirname(__file__), '../MuseV'))
# Also add "../MuseV/diffusers" so that the `diffusers` package can be imported from there.
sys.path.append(os.path.join(os.path.dirname(__file__), '../MuseV/diffusers'))
# Try to import the full dependency set. If ANY of these imports raises
# ImportError, the `except ImportError` branch below installs lightweight
# stand-ins so that the Gradio UI can still start.
try:
    import numpy as np
    from omegaconf import OmegaConf, SCMode
    import torch
    from einops import rearrange, repeat
    import cv2
    from PIL import Image
    try:
        from diffusers import AutoencoderKL
    except ImportError:
        # If the top-level import fails, fall back to the module path.
        from diffusers.models.autoencoder_kl import AutoencoderKL
    # Modules required from MuseV / mmcm.
    from mmcm.utils.load_util import load_pyhon_obj
    from mmcm.utils.seed_util import set_all_seed
    from mmcm.utils.signature import get_signature_of_string
    from mmcm.vision.utils.data_type_util import is_video, is_image, read_image_as_5d
    from mmcm.utils.str_util import clean_str_for_save
    from musev.models.referencenet_loader import load_referencenet_by_name
    from musev.models.ip_adapter_loader import (
        load_vision_clip_encoder_by_name,
        load_ip_adapter_image_proj_by_name,
    )
    from musev.models.ip_adapter_face_loader import (
        load_ip_adapter_face_extractor_and_proj_by_name,
    )
    from musev.pipelines.pipeline_controlnet_predictor import (
        DiffusersPipelinePredictor,
    )
    from musev.models.unet_loader import load_unet_by_name
    from musev.utils.util import save_videos_grid_with_opencv
    from musev import logger
    # Make sure something named `cuid` with a `cuid()` callable is available.
    try:
        import cuid
    except ImportError:
        print("cuid module not found, using a simple implementation")
        import uuid
        class cuid:
            # Deliberately takes no `self`: it is called on the class itself
            # (`cuid.cuid()`), mirroring the module-level function of the real package.
            def cuid():
                return str(uuid.uuid4())[:8]
    # Basic logging configuration.
    logger.setLevel(logging.INFO)
except ImportError as e:
    print(f"Import error: {e}")
    print("请确保MuseV项目正确安装了所有依赖")
    # Use mock implementations so the interface can still run.
    # NOTE: these imports are repeated without a guard; if one of them was the
    # import that failed above, it raises again here.
    import numpy as np
    import cv2
    from PIL import Image
    import torch
    from argparse import Namespace
    class MockLogger:
        # Minimal logger stand-in: prints level-prefixed messages to stdout and
        # only records the level passed to setLevel (it does not filter on it).
        def __init__(self):
            self.level = logging.INFO
        def info(self, msg):
            print(f"INFO: {msg}")
        def error(self, msg):
            print(f"ERROR: {msg}")
        def warning(self, msg):
            print(f"WARNING: {msg}")
        def setLevel(self, level):
            self.level = level
    logger = MockLogger()
    class cuid:
        # Called on the class itself (`cuid.cuid()`), hence no `self`.
        def cuid():
            import uuid
            return str(uuid.uuid4())[:8]
    def set_all_seed(seed):
        # Stand-in: performs no seeding and returns (None, None).
        return None, None
    def save_videos_grid_with_opencv(videos, output_path, texts=None, fps=4, tensor_order="b c t h w", n_cols=1, write_info=False, save_filetype="mp4", save_images=False):
        # Stand-in video writer: saves only videos[0] as an mp4v file.
        # texts, n_cols, write_info, save_filetype and save_images are accepted
        # but not used. Returns output_path on success, None on any exception.
        try:
            # Bring the frames to (b, t, h, w, c) so each frame is an H x W x C image.
            if tensor_order == "b c t h w":
                videos = videos.transpose(0, 2, 3, 4, 1)
            elif tensor_order == "b t c h w":
                videos = videos.transpose(0, 1, 3, 4, 2)
            video = videos[0]
            height, width, channels = video[0].shape
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
            for frame in video:
                # Frames are treated as RGB and converted to OpenCV's BGR order.
                frame_bgr = cv2.cvtColor(frame.astype(np.uint8), cv2.COLOR_RGB2BGR)
                out.write(frame_bgr)
            out.release()
            logger.info(f"Video saved to {output_path}")
            return output_path
        except Exception as e:
            logger.error(f"Failed to save video: {e}")
            return None
# Make sure `cuid` is available (repeats the check done inside the try block above;
# if the real module imports here it replaces any fallback class defined earlier).
try:
    import cuid
except ImportError:
    print("cuid module not found, using a simple implementation")
    import uuid
    class cuid:
        # Called on the class itself (`cuid.cuid()`), hence no `self`.
        def cuid():
            return str(uuid.uuid4())[:8]
# Basic logging configuration (applies to whichever logger was bound above).
logger.setLevel(logging.INFO)
# Project paths: everything is anchored at the directory containing this file.
PROJECT_DIR = os.path.dirname(__file__)
file_dir = PROJECT_DIR
DATA_DIR, CACHE_PATH, OUTPUT_DIR = (
    os.path.join(PROJECT_DIR, _sub_dir)
    for _sub_dir in ("data", "t2v_input_image", "results")
)
# Create the directories the app writes into (uploaded images and rendered videos).
for _required_dir in (CACHE_PATH, OUTPUT_DIR):
    os.makedirs(_required_dir, exist_ok=True)
# Default inference parameters
def get_default_args():
    """Return a fresh ``Namespace`` holding the default inference settings.

    Path-valued settings are resolved relative to ``PROJECT_DIR``; the output
    directory is ``OUTPUT_DIR``. A new object is built on every call, so callers
    may mutate the result freely.
    """

    def in_project(relative_path):
        # Resolve a project-relative path against PROJECT_DIR.
        return os.path.join(PROJECT_DIR, relative_path)

    return Namespace(
        add_static_video_prompt=False,
        context_batch_size=1,
        context_frames=12,
        context_overlap=4,
        context_schedule="uniform_v2",
        context_stride=1,
        cross_attention_dim=768,
        face_image_path=None,
        facein_model_cfg_path=in_project("configs/model/facein.py"),
        facein_model_name=None,
        facein_scale=1.0,
        fix_condition_images=False,
        fixed_ip_adapter_image=True,
        fixed_refer_face_image=True,
        fixed_refer_image=True,
        fps=4,
        guidance_scale=7.5,
        height=None,
        img_length_ratio=1.0,
        img_weight=0.001,
        interpolation_factor=1,
        ip_adapter_face_model_cfg_path=in_project("configs/model/ip_adapter.py"),
        ip_adapter_face_model_name=None,
        ip_adapter_face_scale=1.0,
        ip_adapter_model_cfg_path=in_project("configs/model/ip_adapter.py"),
        ip_adapter_model_name="musev_referencenet",
        ip_adapter_scale=1.0,
        ipadapter_image_path=None,
        lcm_model_cfg_path=in_project("configs/model/lcm_model.py"),
        lcm_model_name=None,
        log_level="INFO",
        motion_speed=8.0,
        n_batch=1,
        n_cols=3,
        n_repeat=1,
        n_vision_condition=1,
        need_hist_match=False,
        need_img_based_video_noise=True,
        need_redraw=False,
        negative_prompt="V2",
        negprompt_cfg_path=in_project("configs/model/negative_prompt.py"),
        noise_type="video_fusion",
        num_inference_steps=30,
        output_dir=OUTPUT_DIR,
        overwrite=False,
        prompt_only_use_image_prompt=False,
        record_mid_video_latents=False,
        record_mid_video_noises=False,
        redraw_condition_image=False,
        redraw_condition_image_with_facein=True,
        redraw_condition_image_with_ip_adapter_face=True,
        redraw_condition_image_with_ipdapter=True,
        redraw_condition_image_with_referencenet=True,
        referencenet_image_path=None,
        referencenet_model_cfg_path=in_project("configs/model/referencenet.py"),
        referencenet_model_name="musev_referencenet",
        save_filetype="mp4",
        save_images=False,
        sd_model_cfg_path=in_project("configs/model/T2I_all_model.py"),
        sd_model_name="majicmixRealv6Fp16",
        seed=None,
        strength=0.8,
        target_datas="boy_dance2",
        test_data_path=in_project("configs/infer/testcase_video_famous.yaml"),
        time_size=24,
        unet_model_cfg_path=in_project("configs/model/motion_model.py"),
        unet_model_name="musev_referencenet",
        use_condition_image=True,
        use_video_redraw=True,
        vae_model_path=in_project("checkpoints/vae/sd-vae-ft-mse"),
        video_guidance_scale=3.5,
        video_guidance_scale_end=None,
        video_guidance_scale_method="linear",
        video_negative_prompt="V2",
        video_num_inference_steps=10,
        video_overlap=1,
        vision_clip_extractor_class_name="ImageClipVisionFeatureExtractor",
        vision_clip_model_path=in_project("checkpoints/IP-Adapter/models/image_encoder"),
        w_ind_noise=0.5,
        width=None,
        write_info=False,
    )
# Utility functions
def generate_cuid():
    """Return a new identifier string produced by ``cuid.cuid()``."""
    new_id = cuid.cuid()
    return new_id
def read_image_and_name(path):
    """Read one or more images and stack them along a frame axis.

    Args:
        path: A single image path (``str`` or ``os.PathLike``) or an iterable
            of such paths.

    Returns:
        A ``(images, name)`` tuple. ``images`` is the result of concatenating
        every readable image, each reshaped to ``(1, C, 1, H, W)``, along
        axis 2; ``name`` joins the part of each file's base name before its
        first ``"."`` with ``"_"``. When no image could be read the result is
        ``(None, "no")``.

    Raises:
        ValueError: propagated from ``np.concatenate`` when the readable
            images do not all share the same height and width.
    """
    # A lone path must be wrapped in a list: iterating a str would walk its
    # characters, and a pathlib.Path is not iterable at all.
    if isinstance(path, (str, os.PathLike)):
        path = [path]
    images = []
    names = []
    for p in path:
        try:
            img_np = np.array(Image.open(p).convert("RGB"))
            # (H, W, C) -> (C, H, W) -> (1, C, 1, H, W)
            img_5d = img_np.transpose(2, 0, 1)[np.newaxis, :, np.newaxis]
            name = os.path.basename(p).split(".")[0]
        except Exception as e:
            # Best effort: an unreadable entry is logged and skipped.
            logger.error(f"Failed to read image {p}: {e}")
            continue
        # Append only after every step succeeded so both lists stay aligned.
        images.append(img_5d)
        names.append(name)
    if not images:
        return None, "no"
    return np.concatenate(images, axis=2), "_".join(names)
def get_signature_of_string(s, length=5):
    """Return the first ``length`` hex characters of the MD5 digest of ``s``.

    ``s`` is encoded with ``str.encode()`` defaults before hashing.
    """
    import hashlib

    hasher = hashlib.md5()
    hasher.update(s.encode())
    full_hex = hasher.hexdigest()
    return full_hex[:length]
def clean_str_for_save(s):
    """Return ``s`` with each of the characters ``\\ / : * ? " < > |`` replaced by ``_``."""
    import re

    unsafe_chars = re.compile(r'[\\/:*?"<>|]')
    return unsafe_chars.sub('_', s)
def save_videos_grid_with_opencv(videos, output_path, texts=None, fps=4, tensor_order="b c t h w", n_cols=1, write_info=False, save_filetype="mp4", save_images=False):
    """Write the first clip of a video batch to ``output_path`` using OpenCV.

    Args:
        videos: 5-D array exposing a NumPy-style ``transpose(*axes)``; its axis
            order is given by ``tensor_order``. Frames are treated as RGB and
            cast to ``uint8`` without rescaling.
        output_path: Destination file; always encoded with the ``mp4v`` fourcc.
        fps: Frame rate handed to ``cv2.VideoWriter``.
        tensor_order: ``"b c t h w"`` or ``"b t c h w"``; any other value
            leaves ``videos`` untouched (i.e. it is taken to be ``b t h w c``).
        texts, n_cols, write_info, save_filetype, save_images: accepted for
            signature compatibility; not used by this implementation.

    Returns:
        ``output_path`` on success, ``None`` if anything went wrong (the error
        is logged, never raised).
    """
    try:
        # Bring the data to (b, t, h, w, c) so each frame is an H x W x C image.
        if tensor_order == "b c t h w":
            videos = videos.transpose(0, 2, 3, 4, 1)
        elif tensor_order == "b t c h w":
            videos = videos.transpose(0, 1, 3, 4, 2)
        # Only the first clip of the batch is written.
        video = videos[0]
        height, width, _ = video[0].shape
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        try:
            # A writer that failed to open drops every frame silently, so a
            # missing codec or bad path must be detected explicitly.
            if not out.isOpened():
                raise IOError(f"cv2.VideoWriter could not open {output_path}")
            for frame in video:
                # OpenCV expects BGR channel order.
                frame_bgr = cv2.cvtColor(frame.astype(np.uint8), cv2.COLOR_RGB2BGR)
                out.write(frame_bgr)
        finally:
            # Release even when a frame fails so the file handle is not leaked.
            out.release()
        logger.info(f"Video saved to {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Failed to save video: {e}")
        return None
# Model initialisation
def _blank_clip_from_kwargs(kwargs):
    """Return ``(video, width, height, condition_images)`` for a placeholder clip.

    ``video`` is an all-zero uint8 array shaped (1, 3, video_length, height, width),
    with defaults video_length=24, height=512, width=512.
    """
    video_length = int(kwargs.get('video_length', 24))
    height = int(kwargs.get('height', 512))
    width = int(kwargs.get('width', 512))
    video = np.zeros((1, 3, video_length, height, width), dtype=np.uint8)
    return video, width, height, kwargs.get('condition_images', None)


def _load_condition_frame(condition_images, width, height):
    """Open the image file at ``condition_images`` as a (height, width, 3) RGB array."""
    from PIL import Image

    img = Image.open(condition_images).resize((width, height)).convert("RGB")
    return np.array(img)


def _center_fit_frame(image, height, width):
    """Centre ``image`` on a black (height, width, 3) frame, cropping any overhang."""
    frame = np.zeros((height, width, 3), dtype=np.uint8)
    src_h, src_w = image.shape[:2]
    copy_h, copy_w = min(height, src_h), min(width, src_w)
    src_top, src_left = (src_h - copy_h) // 2, (src_w - copy_w) // 2
    dst_top, dst_left = (height - copy_h) // 2, (width - copy_w) // 2
    frame[dst_top:dst_top + copy_h, dst_left:dst_left + copy_w] = (
        image[src_top:src_top + copy_h, src_left:src_left + copy_w]
    )
    return frame


def _paint_solid_frames(video, color_of_frame):
    """Fill frame ``t`` of a (1, 3, T, H, W) array with the RGB triple ``color_of_frame(t)``."""
    for t in range(video.shape[2]):
        for channel, value in enumerate(color_of_frame(t)):
            video[0, channel, t, :, :] = value


class _SimplifiedMuseVPredictor:
    """Stand-in used when the pipeline object exists but loading its components failed."""

    def __init__(self, device):
        self.device = device

    def run_pipe_text2video(self, **kwargs):
        """Return a zoom animation of the condition image (black clip if there is none)."""
        logger.info("使用简化版MuseV预测器")
        video, width, height, condition_images = _blank_clip_from_kwargs(kwargs)
        if condition_images is None:
            return video
        video_length = video.shape[2]
        try:
            img_np = _load_condition_frame(condition_images, width, height)
            for t in range(video_length):
                # Gentle zoom in and out; the scale oscillates between 0.9 and 1.1.
                scale = 1.0 + 0.1 * np.sin(t * 0.2)
                new_size = (int(width * scale), int(height * scale))
                resized_img = cv2.resize(img_np, new_size)
                # Centre-fit handles both the enlarged (crop) and the shrunken
                # (black border) case; a plain crop breaks when scale < 1.
                frame = _center_fit_frame(resized_img, height, width)
                video[0, :, t, :, :] = frame.transpose(2, 0, 1)
        except Exception as e:
            logger.error(f"处理条件图像时出错: {e}")
            # Fall back to a red-to-blue colour ramp.
            _paint_solid_frames(
                video,
                lambda t: (
                    int(255 * (t / video_length)),
                    int(255 * 0.5),
                    int(255 * ((video_length - t) / video_length)),
                ),
            )
        return video


class _MockPredictor:
    """Stand-in used when the MuseV packages cannot be imported."""

    _COLOR_CYCLE = ((255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0))

    def run_pipe_text2video(self, **kwargs):
        """Return the condition image repeated on every frame, or a colour pattern."""
        video, width, height, condition_images = _blank_clip_from_kwargs(kwargs)
        video_length = video.shape[2]
        if condition_images is None:
            # No image: red-to-green colour ramp.
            _paint_solid_frames(
                video,
                lambda t: (
                    int(255 * (t / video_length)),
                    int(255 * ((video_length - t) / video_length)),
                    int(255 * 0.5),
                ),
            )
            return video
        try:
            img_np = _load_condition_frame(condition_images, width, height)
            # (3, 1, H, W) broadcasts over the frame axis of video[0].
            video[0] = img_np.transpose(2, 0, 1)[:, np.newaxis]
        except Exception as e:
            logger.warning(f"无法读取条件图像,改用彩色块: {e}")
            cycle = self._COLOR_CYCLE
            _paint_solid_frames(video, lambda t: cycle[t % len(cycle)])
        return video


class _FallbackMockPredictor:
    """Last-resort stand-in: a solid red clip signalling that initialisation failed."""

    def run_pipe_text2video(self, **kwargs):
        video, _, _, _ = _blank_clip_from_kwargs(kwargs)
        _paint_solid_frames(video, lambda t: (255, 0, 0))
        return video


def init_model(args):
    """Initialise the MuseV predictor, degrading to placeholder predictors on failure.

    Args:
        args: Settings object; ``vae_model_path``, ``unet_model_name``,
            ``referencenet_model_name``, ``ip_adapter_model_name`` and
            ``enable_facein`` are read when present.

    Returns:
        ``(predictor, device)``. ``predictor`` always offers
        ``run_pipe_text2video(**kwargs)``; ``device`` is ``"cuda"`` or ``"cpu"``.
        The fallbacks are:
          * component loading raised            -> zoom-animation stand-in
          * an ImportError in the MuseV section -> static-image stand-in
          * any other failure                   -> solid red clip stand-in
    """
    # Bound before the try so the outermost handler can always return it.
    device = "cpu"
    try:
        logger.info("正在初始化MuseV模型...")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if device == "cuda" else torch.float32
        logger.info(f"使用设备: {device}")
        try:
            from musev.pipelines.pipeline_controlnet_predictor import DiffusersPipelinePredictor
            from mmcm.utils.load_util import load_pyhon_obj
            from musev.models.unet_loader import load_unet_by_name
            from musev.models.referencenet_loader import load_referencenet_by_name
            from musev.models.ip_adapter_loader import load_vision_clip_encoder_by_name, load_ip_adapter_image_proj_by_name
            from musev.models.ip_adapter_face_loader import load_ip_adapter_face_extractor_and_proj_by_name
            config = {
                "device": device,
                "dtype": torch_dtype,
                "enable_xformers_memory_efficient_attention": device == "cuda",
                "vae_model_path": getattr(args, 'vae_model_path', None),
            }
            # NOTE(review): the constructor and the load_*_by_name helpers are
            # called with (name, config) exactly as before; their real
            # signatures are not visible from this file - confirm against MuseV.
            predictor = DiffusersPipelinePredictor(config)
            try:
                # UNet (motion model)
                unet_name = getattr(args, 'unet_model_name', None)
                if unet_name:
                    predictor.unet = load_unet_by_name(unet_name, config)
                    logger.info(f"加载Unet模型: {unet_name}")
                # Reference network
                referencenet_name = getattr(args, 'referencenet_model_name', None)
                if referencenet_name:
                    predictor.referencenet = load_referencenet_by_name(referencenet_name, config)
                    logger.info(f"加载参考网络: {referencenet_name}")
                # IP-Adapter (vision encoder + image projection)
                ip_adapter_name = getattr(args, 'ip_adapter_model_name', None)
                if ip_adapter_name:
                    vision_encoder = load_vision_clip_encoder_by_name(ip_adapter_name, config)
                    image_proj = load_ip_adapter_image_proj_by_name(ip_adapter_name, config)
                    predictor.vision_encoder = vision_encoder
                    predictor.image_proj = image_proj
                    logger.info(f"加载IP适配器: {ip_adapter_name}")
                # Face feature extractor (only when explicitly enabled)
                if getattr(args, 'enable_facein', False):
                    face_extractor, face_proj = load_ip_adapter_face_extractor_and_proj_by_name("face_in", config)
                    predictor.face_extractor = face_extractor
                    predictor.face_proj = face_proj
                    logger.info("加载人脸特征提取器")
                logger.info("MuseV模型初始化成功")
                return predictor, device
            except Exception as model_load_error:
                logger.warning(f"加载模型组件时出错,将使用简化版本: {model_load_error}")
                return _SimplifiedMuseVPredictor(device), device
        except ImportError as import_error:
            logger.warning(f"无法导入MuseV组件,使用模拟预测器: {import_error}")
            return _MockPredictor(), device
    except Exception as e:
        logger.error(f"模型初始化失败: {e}")
        return _FallbackMockPredictor(), device
# Video generation
def _store_condition_image(image, image_path):
    """Save the uploaded array as an RGB JPEG; return ``image_path`` or ``None`` on failure."""
    try:
        # convert("RGB") covers grayscale and RGBA uploads alike; JPEG cannot
        # store an alpha channel, so RGBA input would otherwise fail to save.
        Image.fromarray(image).convert("RGB").save(image_path)
    except Exception as e:
        logger.error(f"保存图像失败: {e}")
        return None
    logger.info(f"已保存上传的图像: {image_path}")
    return image_path


def _fill_frames_with_color(video, color_of_frame):
    """Fill frame ``t`` of a (1, 3, T, H, W) array with the RGB triple ``color_of_frame(t)``."""
    for t in range(video.shape[2]):
        for channel, value in enumerate(color_of_frame(t)):
            video[0, channel, t, :, :] = value


def _paste_centered(image, height, width):
    """Centre ``image`` on a black (height, width, 3) frame, cropping any overhang."""
    frame = np.zeros((height, width, 3), dtype=np.uint8)
    src_h, src_w = image.shape[:2]
    copy_h, copy_w = min(height, src_h), min(width, src_w)
    src_top, src_left = (src_h - copy_h) // 2, (src_w - copy_w) // 2
    dst_top, dst_left = (height - copy_h) // 2, (width - copy_w) // 2
    frame[dst_top:dst_top + copy_h, dst_left:dst_left + copy_w] = (
        image[src_top:src_top + copy_h, src_left:src_left + copy_w]
    )
    return frame


def _build_backup_video(condition_images, video_length, height, width):
    """Build the placeholder clip used when the predictor itself raised.

    Returns a uint8 array shaped (1, 3, video_length, height, width): a zoom
    animation of the condition image when one is available, otherwise (or if
    the image cannot be processed) a simple colour pattern.
    """
    video = np.zeros((1, 3, video_length, height, width), dtype=np.uint8)
    if condition_images is None:
        _fill_frames_with_color(
            video,
            lambda t: (
                int(255 * (t / video_length)),
                int(255 * 0.5),
                int(255 * ((video_length - t) / video_length)),
            ),
        )
        return video
    try:
        img = Image.open(condition_images).resize((width, height)).convert("RGB")
        img_np = np.array(img)
        for t in range(video_length):
            # The scale oscillates between 0.9 and 1.1, so the resized image
            # may be smaller OR larger than the frame; _paste_centered handles
            # both (a plain paste raises as soon as the scale exceeds 1).
            scale = 1.0 - 0.1 * np.cos(t * 0.3)
            new_size = (int(width * scale), int(height * scale))
            resized_img = cv2.resize(img_np, new_size)
            frame = _paste_centered(resized_img, height, width)
            video[0, :, t, :, :] = frame.transpose(2, 0, 1)
    except Exception as inner_e:
        logger.error(f"创建基于图像的备份视频失败: {inner_e}")
        colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]
        _fill_frames_with_color(video, lambda t: colors[t % len(colors)])
    return video


def _write_bgr_frames(output_path, fps, frame_size, frames):
    """Encode BGR ``frames`` to ``output_path`` (mp4v); raise ``IOError`` if no file results.

    ``frame_size`` is ``(width, height)`` as expected by ``cv2.VideoWriter``.
    """
    writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, frame_size)
    try:
        # An unopened writer silently drops frames, so check explicitly.
        if not writer.isOpened():
            raise IOError(f"cv2.VideoWriter could not open {output_path}")
        for frame in frames:
            writer.write(frame)
    finally:
        writer.release()
    if not os.path.isfile(output_path) or os.path.getsize(output_path) == 0:
        raise IOError(f"no video data was written to {output_path}")


def _write_rgb_clip(video, output_path, fps):
    """Write clip 0 of a (b, c, t, h, w) RGB array to ``output_path``; raise on failure."""
    frames = np.asarray(video)[0].transpose(1, 2, 3, 0)  # (t, h, w, c)
    height, width = frames.shape[1:3]
    bgr_frames = (
        cv2.cvtColor(
            np.ascontiguousarray(np.clip(frame, 0, 255).astype(np.uint8)),
            cv2.COLOR_RGB2BGR,
        )
        for frame in frames
    )
    _write_bgr_frames(output_path, fps, (width, height), bgr_frames)


def generate_video(
    prompt,
    image,
    seed=42,
    fps=8,
    width=512,
    height=512,
    video_length=16,
    img_edge_ratio=1.0,
    progress=gr.Progress(track_tqdm=True)
):
    """Generate a talking video from a prompt and an optional uploaded photo.

    Args:
        prompt: Text prompt; a default "person talking" prompt is used when empty.
        image: Uploaded photo as a NumPy array, or ``None`` for text-only generation.
        seed: Seed passed (as ``int``) to ``set_all_seed`` when that function exists.
        fps: Frame rate of the written video.
        width, height: Frame size in pixels (coerced to ``int``).
        video_length: Number of frames (coerced to ``int``).
        img_edge_ratio: Accepted because the UI supplies it; currently unused.
        progress: Gradio progress tracker.

    Returns:
        Path of the written video file. If the predictor fails a placeholder
        animation is saved instead; if saving fails a plain gradient video is
        written; if the whole run fails a short red "error" video is returned.

    Raises:
        gr.Error: when not even the error video can be written. (Previously an
            error string was returned, which the ``gr.Video`` output component
            would have treated as a file path.)
    """
    try:
        progress(0, desc="开始视频生成...")
        # Slider values may arrive as floats; array shapes need integers.
        width, height, video_length = int(width), int(height), int(video_length)

        # Base settings, tuned for talking-head generation.
        args = get_default_args()
        args.enable_facein = True  # face feature extraction
        args.enable_ip_adapter = True  # IP-Adapter
        args.enable_referencenet = True  # reference network
        args.use_condition_image = True  # use the uploaded image as condition
        args.fix_condition_images = True  # keep the condition image fixed
        args.guidance_scale = 3.5  # text guidance scale
        args.video_guidance_scale = 1.5  # video guidance scale
        args.strength = 0.6  # redraw strength
        args.img_weight = 0.5  # image weight
        args.motion_speed = 8.0  # motion speed
        args.need_img_based_video_noise = True  # image-based video noise

        progress(0.1, desc="初始化MuseV模型...")
        sd_predictor, _ = init_model(args)

        # Persist the upload so the predictor can be given a file path.
        image_cuid = generate_cuid()
        image_path = os.path.join(CACHE_PATH, f"{image_cuid}.jpg")
        condition_images = None
        if image is not None:
            condition_images = _store_condition_image(image, image_path)
        if condition_images is None:
            logger.warning("未上传图像,将使用纯文本生成视频")

        progress(0.3, desc="处理输入数据...")
        # Seeding is best effort: failures only get logged.
        cpu_generator, gpu_generator = None, None
        try:
            seed_fn = globals().get('set_all_seed')
            if seed_fn is not None:
                cpu_generator, gpu_generator = seed_fn(int(seed))
                logger.info(f"使用种子: {seed}")
            else:
                logger.warning("set_all_seed函数不可用,使用随机种子")
        except Exception as e:
            cpu_generator, gpu_generator = None, None
            logger.error(f"设置种子失败: {e}")

        if not prompt:
            prompt = "一个人在说话"  # default prompt suited to talking videos
        negative_prompt = "模糊, 低质量, 变形, 扭曲, 像素化, 噪点, 不良照明, 不自然表情"

        progress(0.5, desc="正在生成视频...")
        try:
            out_videos = sd_predictor.run_pipe_text2video(
                video_length=video_length,
                prompt=prompt,
                width=width,
                height=height,
                generator=gpu_generator if gpu_generator else None,
                noise_type=args.noise_type,
                negative_prompt=negative_prompt,
                video_negative_prompt=negative_prompt,
                max_batch_num=args.n_batch,
                strength=args.strength,
                need_img_based_video_noise=args.need_img_based_video_noise,
                video_num_inference_steps=args.video_num_inference_steps,
                condition_images=condition_images,
                fix_condition_images=args.fix_condition_images,
                video_guidance_scale=args.video_guidance_scale,
                guidance_scale=args.guidance_scale,
                num_inference_steps=args.num_inference_steps,
                redraw_condition_image=args.redraw_condition_image,
                img_weight=args.img_weight,
                w_ind_noise=args.w_ind_noise,
                n_vision_condition=args.n_vision_condition,
                motion_speed=args.motion_speed,
                need_hist_match=args.need_hist_match,
                context_frames=args.context_frames,
                context_stride=args.context_stride,
                context_overlap=args.context_overlap,
            )
        except Exception as e:
            logger.error(f"视频生成错误: {e}")
            progress(0.7, desc="使用备份生成器...")
            out_videos = _build_backup_video(condition_images, video_length, height, width)

        progress(0.8, desc="正在保存视频...")
        save_file_name = f"video_{image_cuid}_{generate_cuid()}"
        name_cleaner = globals().get('clean_str_for_save')
        if name_cleaner is not None:
            try:
                save_file_name = name_cleaner(save_file_name)
            except Exception as e:
                # Keep the raw name; it only contains identifier characters.
                logger.warning(f"clean_str_for_save failed, using raw file name: {e}")
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        output_path = os.path.join(OUTPUT_DIR, f"{save_file_name}.{args.save_filetype}")

        try:
            saver = globals().get('save_videos_grid_with_opencv')
            if saver is not None:
                saver(
                    out_videos,
                    output_path,
                    fps=fps,
                    tensor_order="b c t h w",
                    save_filetype=args.save_filetype,
                )
                # The saver may swallow its own errors, so verify the result
                # instead of trusting that the call returned.
                if not os.path.isfile(output_path) or os.path.getsize(output_path) == 0:
                    raise IOError(f"no video data was written to {output_path}")
            else:
                _write_rgb_clip(out_videos, output_path, fps)
            logger.info(f"视频已保存到: {output_path}")
        except Exception as e:
            logger.error(f"保存视频失败: {e}")
            # Last resort: a plain gradient clip so the UI has something to show.
            output_path = os.path.join(OUTPUT_DIR, f"fallback_video_{generate_cuid()}.mp4")

            def gradient_frames():
                for t in range(video_length):
                    frame = np.zeros((height, width, 3), dtype=np.uint8)
                    frame[:, :, 0] = (t * 255 // video_length)  # B
                    frame[:, :, 1] = 100  # G
                    frame[:, :, 2] = 100  # R
                    yield frame

            try:
                _write_bgr_frames(output_path, fps, (width, height), gradient_frames())
                logger.info(f"已创建备用视频: {output_path}")
            except Exception as inner_e:
                logger.error(f"创建备用视频失败: {inner_e}")
                # Handled by the outer handler below.
                raise RuntimeError(f"错误: 无法保存视频 ({str(e)})") from inner_e

        progress(1.0, desc="视频生成完成!")
        return output_path
    except Exception as e:
        logger.error(f"视频生成失败: {e}")
        # Final fallback: five solid red frames signalling the failure.
        try:
            error_video_path = os.path.join(OUTPUT_DIR, f"error_video_{generate_cuid()}.mp4")
            error_frame = np.zeros((256, 256, 3), dtype=np.uint8)
            error_frame[:, :, 2] = 255  # R channel in BGR order
            _write_bgr_frames(error_video_path, 1, (256, 256), [error_frame] * 5)
            return error_video_path
        except Exception:
            # Nothing could be written: surface the message in the UI.
            raise gr.Error(f"错误: {str(e)}") from e
# Build the Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for photo-to-talking-video generation.

    Layout: an introductory Markdown panel, then a row with two columns. The
    left column holds the prompt box, the photo upload, a collapsed accordion
    of advanced sliders and the generate button; the right column holds the
    output video. Clicking the button calls ``generate_video`` with the eight
    input components, in the same order as that function's leading parameters.

    Returns:
        The ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(title="MuseV照片说话视频生成工具", theme=gr.themes.Soft()) as interface:
        # Introduction and usage instructions shown at the top of the page.
        gr.Markdown("""
        # MuseV照片说话视频生成工具
        上传照片,让照片中的人物开口说话!
        ## 使用方法
        1. 输入描述你想在视频中看到的内容的提示词(特别是关于说话或表情的描述)
        2. **上传人物照片**(建议使用清晰的正面人像照片)
        3. 根据需要调整高级参数
        4. 点击"生成说话视频"按钮
        5. 等待视频生成完成后即可播放和下载
        ## 提示
        - 使用清晰的正面人物照片可获得最佳效果
        - 提示词中可以包含如"说话"、"微笑"、"表情自然"等描述
        - 视频生成时间取决于您的电脑性能,通常需要几十秒到几分钟
        """)
        with gr.Row():
            # Left column: inputs.
            with gr.Column(scale=1):
                prompt = gr.Textbox(
                    label="提示词",
                    placeholder="描述照片中的人物在做什么,例如:'一个人在说话','微笑着打招呼'...",
                    lines=3,
                    value="一个人在说话,表情自然"
                )
                # type="numpy": generate_video receives the upload as an array.
                image = gr.Image(label="人物照片(推荐上传)", type="numpy", height=240)
                # Advanced parameters, collapsed by default.
                with gr.Accordion("高级参数", open=False):
                    seed = gr.Slider(label="随机种子", minimum=0, maximum=1000000, value=42, step=1)
                    fps = gr.Slider(label="帧率", minimum=1, maximum=30, value=8, step=1)
                    width = gr.Slider(label="视频宽度", minimum=256, maximum=1024, value=512, step=64)
                    height = gr.Slider(label="视频高度", minimum=256, maximum=1024, value=512, step=64)
                    video_length = gr.Slider(label="视频长度(帧数)", minimum=8, maximum=64, value=16, step=4)
                    img_edge_ratio = gr.Slider(label="图像边缘比例", minimum=0.5, maximum=2.0, value=1.0, step=0.1)
                generate_btn = gr.Button("生成说话视频", variant="primary")
            # Right column: result.
            with gr.Column(scale=1):
                output_video = gr.Video(label="生成的说话视频", height=240)
        # Wire the button: the inputs list order must match generate_video's
        # positional parameters (prompt, image, seed, fps, width, height,
        # video_length, img_edge_ratio).
        generate_btn.click(
            fn=generate_video,
            inputs=[prompt, image, seed, fps, width, height, video_length, img_edge_ratio],
            outputs=output_video,
            show_progress=True
        )
        # Example prompts and tips shown below the controls.
        gr.Markdown("""
        ## 推荐提示词示例
        - "一个人在说话,表情自然,嘴巴动起来"
        - "微笑着说话,眼神温和"
        - "高兴地打招呼,表情生动"
        - "平静地讲述,面部表情自然"
        ## 高级技巧
        - 可以指定人物特征:"一个戴着眼镜的女人在说话"
        - 可以添加场景描述:"在公园里,一个孩子开心地说话"
        - 可以描述表情:"惊讶地说话,眉毛微扬"
        """)
    return interface
# Script entry point
if __name__ == "__main__":
    # Build the Gradio interface.
    interface = create_interface()
    # Start serving it. share=False requests no public share link; the original
    # author's note says this is the setting to use inside a Hugging Face Space.
    interface.launch(share=False)