Spaces:

jinyang756
/

MuseV-Video-Generator

Runtime error

App Files Files Community

jucai commited on Oct 13, 2025

Commit

3e43c71

1 Parent(s): 0b2e207

Add MuseV video generator files

Browse files

Files changed (7) hide show

.gitattributes +5 -0
.gitignore +77 -0
.space-yaml +10 -0
README.md +35 -8
app.py +815 -0
packages.txt +2 -0
requirements.txt +26 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,77 @@

+# 操作系统文件
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+# Python缓存
+__pycache__/
+*.py[cod]
+*$py.class
+# 虚拟环境
+venv/
+.env
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+# 日志
+*.log
+*.log.*
+# 构建文件
+build/
+dist/
+*.egg-info/
+# IDE配置
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# 临时文件
+*.tmp
+*.temp
+*.bak
+*.backup
+# 数据文件
+data/
+models/
+# 敏感信息
+*.key
+*.pem
+*.cer
+*.crt
+*.pfx
+*.p12
+*.p7b
+*.p7c
+*.p7m
+*.p7s
+*.srl
+# Hugging Face缓存
+.huggingface/
+# 测试相关
+coverage/
+.coverage
+.tox/
+nosetests.xml
+.pytest_cache/
+# 其他可能包含敏感信息的文件
+*.secret
+*.private
+*.auth
+*.token
+*.access

.space-yaml ADDED Viewed

	@@ -0,0 +1,10 @@

+title: MuseV 视频生成工具
+emoji: 🚀
+colorFrom: red
+colorTo: indigo
+sdk: gradio
+sdk_version: 4.25.0
+app_file: app.py
+pinned: false
+license: mit
+python_version: "3.10"

README.md CHANGED Viewed

@@ -1,14 +1,41 @@
 ---
-title: MuseV Video Generator
-emoji: 💻
-colorFrom: indigo
-colorTo: pink
 sdk: gradio
-sdk_version: 5.49.1
-app_file: app.py
 pinned: false
 license: mit
-short_description: 基于MuseV模型开发的交互式视频生成工具，支持通过文本描述快速生成动态视频
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: MuseV 视频生成工具  # 保留原项目名，补充“视频生成工具”更清晰
+emoji: 🚀  # 沿用原项目的 emoji，保持辨识度
+colorFrom: red  # 保留原项目的主题色
+colorTo: indigo
 sdk: gradio
+sdk_version: 4.25.0  # 严格沿用原项目的 Gradio 版本，避免版本冲突
+app_file: app.py  # 与你的入口文件一致
 pinned: false
 license: mit
 ---
+# MuseV 文本到视频生成工具
+基于 MuseV 模型开发的交互式视频生成工具，支持通过文本描述快速生成动态视频，可灵活调整视频参数以匹配你的需求。
+## 🌟 核心功能
+- **文本驱动**：输入任意文本描述（如“星空下的海浪拍打礁石”），即可生成对应视频
+- **参数可调**：支持自定义视频分辨率、运动速度、生成步数等，平衡画质与效率
+- **实时预览**：生成完成后可直接在页面播放，支持下载本地保存
+- **简洁界面**：清晰区分输入区与输出区，新手也能快速上手
+## 📝 使用步骤
+1. **输入提示词**：在左侧文本框填写详细的视频描述（越具体，生成效果越符合预期）
+   - 示例：“一只橘猫在阳光下打哈欠，毛发柔软，背景是木质地板”
+2. **调整参数**（默认参数已适配多数场景，可按需修改）：
+   - 分辨率：建议选择 512×512 或 768×768（过高会增加生成时间）
+   - 引导强度：值越高（如 8-10），越贴近提示词；值越低（如 3-5），创意性越强
+   - 运动速度：值越高（如 10-12），视频中物体运动越明显
+3. **点击生成**：点击“生成视频”按钮，等待模型运行（首次加载约 1-2 分钟，后续更快）
+4. **查看结果**：右侧视频区会显示生成结果，点击“下载”可保存到本地
+## ⚠️ 注意事项
+- 生成时间：取决于参数设置，高分辨率（如 1024×1024）+ 多步数（如 50 步）可能需要 3-5 分钟
+- 错误排查：若生成失败，可查看“状态信息”栏提示（常见原因：提示词过长、参数超出硬件限制）
+- 模型依赖：首次运行会自动加载预训练模型，需确保网络通畅
+（更新于2024年7月）
+## 📚 配置参考
+更多 Space 配置细节，可查看官方文档：https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,815 @@

+import argparse
+import copy
+import os
+from pathlib import Path
+import sys
+import logging
+from collections import OrderedDict
+from pprint import pprint
+import random
+import gradio as gr
+from argparse import Namespace
+# 添加MuseV项目路径到系统路径
+sys.path.append(os.path.join(os.path.dirname(__file__), '../MuseV'))
+try:
+    import numpy as np
+    from omegaconf import OmegaConf, SCMode
+    import torch
+    from einops import rearrange, repeat
+    import cv2
+    from PIL import Image
+    from diffusers.models.autoencoder_kl import AutoencoderKL
+    # 导入MuseV必要的模块
+    from mmcm.utils.load_util import load_pyhon_obj
+    from mmcm.utils.seed_util import set_all_seed
+    from mmcm.utils.signature import get_signature_of_string
+    from mmcm.vision.utils.data_type_util import is_video, is_image, read_image_as_5d
+    from mmcm.utils.str_util import clean_str_for_save
+    from musev.models.referencenet_loader import load_referencenet_by_name
+    from musev.models.ip_adapter_loader import (
+        load_vision_clip_encoder_by_name,
+        load_ip_adapter_image_proj_by_name,
+    )
+    from musev.models.ip_adapter_face_loader import (
+        load_ip_adapter_face_extractor_and_proj_by_name,
+    )
+    from musev.pipelines.pipeline_controlnet_predictor import (
+        DiffusersPipelinePredictor,
+    )
+    from musev.models.unet_loader import load_unet_by_name
+    from musev.utils.util import save_videos_grid_with_opencv
+    from musev import logger
+    # 确保cuid模块可用
+    try:
+        import cuid
+    except ImportError:
+        print("cuid module not found, using a simple implementation")
+        import uuid
+        class cuid:
+            @staticmethod
+            def cuid():
+                return str(uuid.uuid4())[:8]
+    # 设置基本配置
+    logger.setLevel(logging.INFO)
+except ImportError as e:
+    print(f"Import error: {e}")
+    print("请确保MuseV项目正确安装了所有依赖")
+    # 使用mock实现让界面能够运行
+    import numpy as np
+    import cv2
+    from PIL import Image
+    import torch
+    from argparse import Namespace
+    class MockLogger:
+        def __init__(self):
+            self.level = logging.INFO
+        def info(self, msg):
+            print(f"INFO: {msg}")
+        def error(self, msg):
+            print(f"ERROR: {msg}")
+        def setLevel(self, level):
+            self.level = level
+    logger = MockLogger()
+    class cuid:
+        @staticmethod
+        def cuid():
+            import uuid
+            return str(uuid.uuid4())[:8]
+    def set_all_seed(seed):
+        return None, None
+    def save_videos_grid_with_opencv(videos, output_path, texts=None, fps=4, tensor_order="b c t h w", n_cols=1, write_info=False, save_filetype="mp4", save_images=False):
+        """Fallback video writer defined when the MuseV imports fail.
+
+        Writes only the first video of the batch to ``output_path`` and
+        returns that path, or ``None`` if anything raises. The ``texts``,
+        ``n_cols``, ``write_info``, ``save_filetype`` and ``save_images``
+        parameters are accepted but not used.
+
+        NOTE: a module-level function with the same name is defined later in
+        this file and replaces this one at import time.
+        """
+        try:
+            # Reorder axes to (batch, time, height, width, channel).
+            if tensor_order == "b c t h w":
+                videos = videos.transpose(0, 2, 3, 4, 1)
+            elif tensor_order == "b t c h w":
+                videos = videos.transpose(0, 1, 3, 4, 2)
+            video = videos[0]
+            height, width, channels = video[0].shape
+            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+            out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+            for frame in video:
+                # Frames are treated as RGB; OpenCV writes BGR.
+                frame_bgr = cv2.cvtColor(frame.astype(np.uint8), cv2.COLOR_RGB2BGR)
+                out.write(frame_bgr)
+            out.release()
+            logger.info(f"Video saved to {output_path}")
+            return output_path
+        except Exception as e:
+            logger.error(f"Failed to save video: {e}")
+            return None
+# 确保cuid模块可用
+try:
+    import cuid
+except ImportError:
+    print("cuid module not found, using a simple implementation")
+    import uuid
+    class cuid:
+        @staticmethod
+        def cuid():
+            return str(uuid.uuid4())[:8]
+# 设置基本配置
+logger.setLevel(logging.INFO)
+# 设置项目路径
+file_dir = os.path.dirname(__file__)
+PROJECT_DIR = os.path.join(os.path.dirname(__file__))
+DATA_DIR = os.path.join(PROJECT_DIR, "data")
+CACHE_PATH = os.path.join(PROJECT_DIR, "t2v_input_image")
+OUTPUT_DIR = os.path.join(PROJECT_DIR, "results")
+# 创建必要的目录
+os.makedirs(CACHE_PATH, exist_ok=True)
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+# 参数配置
+def get_default_args():
+    args_dict = {
+        "add_static_video_prompt": False,
+        "context_batch_size": 1,
+        "context_frames": 12,
+        "context_overlap": 4,
+        "context_schedule": "uniform_v2",
+        "context_stride": 1,
+        "cross_attention_dim": 768,
+        "face_image_path": None,
+        "facein_model_cfg_path": os.path.join(PROJECT_DIR, "configs/model/facein.py"),
+        "facein_model_name": None,
+        "facein_scale": 1.0,
+        "fix_condition_images": False,
+        "fixed_ip_adapter_image": True,
+        "fixed_refer_face_image": True,
+        "fixed_refer_image": True,
+        "fps": 4,
+        "guidance_scale": 7.5,
+        "height": None,
+        "img_length_ratio": 1.0,
+        "img_weight": 0.001,
+        "interpolation_factor": 1,
+        "ip_adapter_face_model_cfg_path": os.path.join(PROJECT_DIR, "configs/model/ip_adapter.py"),
+        "ip_adapter_face_model_name": None,
+        "ip_adapter_face_scale": 1.0,
+        "ip_adapter_model_cfg_path": os.path.join(PROJECT_DIR, "configs/model/ip_adapter.py"),
+        "ip_adapter_model_name": "musev_referencenet",
+        "ip_adapter_scale": 1.0,
+        "ipadapter_image_path": None,
+        "lcm_model_cfg_path": os.path.join(PROJECT_DIR, "configs/model/lcm_model.py"),
+        "lcm_model_name": None,
+        "log_level": "INFO",
+        "motion_speed": 8.0,
+        "n_batch": 1,
+        "n_cols": 3,
+        "n_repeat": 1,
+        "n_vision_condition": 1,
+        "need_hist_match": False,
+        "need_img_based_video_noise": True,
+        "need_redraw": False,
+        "negative_prompt": "V2",
+        "negprompt_cfg_path": os.path.join(PROJECT_DIR, "configs/model/negative_prompt.py"),
+        "noise_type": "video_fusion",
+        "num_inference_steps": 30,
+        "output_dir": OUTPUT_DIR,
+        "overwrite": False,
+        "prompt_only_use_image_prompt": False,
+        "record_mid_video_latents": False,
+        "record_mid_video_noises": False,
+        "redraw_condition_image": False,
+        "redraw_condition_image_with_facein": True,
+        "redraw_condition_image_with_ip_adapter_face": True,
+        "redraw_condition_image_with_ipdapter": True,
+        "redraw_condition_image_with_referencenet": True,
+        "referencenet_image_path": None,
+        "referencenet_model_cfg_path": os.path.join(PROJECT_DIR, "configs/model/referencenet.py"),
+        "referencenet_model_name": "musev_referencenet",
+        "save_filetype": "mp4",
+        "save_images": False,
+        "sd_model_cfg_path": os.path.join(PROJECT_DIR, "configs/model/T2I_all_model.py"),
+        "sd_model_name": "majicmixRealv6Fp16",
+        "seed": None,
+        "strength": 0.8,
+        "target_datas": "boy_dance2",
+        "test_data_path": os.path.join(PROJECT_DIR, "configs/infer/testcase_video_famous.yaml"),
+        "time_size": 24,
+        "unet_model_cfg_path": os.path.join(PROJECT_DIR, "configs/model/motion_model.py"),
+        "unet_model_name": "musev_referencenet",
+        "use_condition_image": True,
+        "use_video_redraw": True,
+        "vae_model_path": os.path.join(PROJECT_DIR, "checkpoints/vae/sd-vae-ft-mse"),
+        "video_guidance_scale": 3.5,
+        "video_guidance_scale_end": None,
+        "video_guidance_scale_method": "linear",
+        "video_negative_prompt": "V2",
+        "video_num_inference_steps": 10,
+        "video_overlap": 1,
+        "vision_clip_extractor_class_name": "ImageClipVisionFeatureExtractor",
+        "vision_clip_model_path": os.path.join(PROJECT_DIR, "checkpoints/IP-Adapter/models/image_encoder"),
+        "w_ind_noise": 0.5,
+        "width": None,
+        "write_info": False,
+    }
+    return Namespace(**args_dict)
+# 工具函数
+def generate_cuid():
+    return cuid.cuid()
+def read_image_and_name(path):
+    """读取图像和名称"""
+    if isinstance(path, str):
+        path = [path]
+    images = []
+    names = []
+    for p in path:
+        try:
+            img = Image.open(p).convert("RGB")
+            img_np = np.array(img)
+            # 添加批次和通道维度以匹配5D格式 (b, c, t, h, w)
+            img_5d = np.expand_dims(np.expand_dims(img_np.transpose(2, 0, 1), 0), 2)
+            images.append(img_5d)
+            names.append(os.path.basename(p).split(".")[0])
+        except Exception as e:
+            logger.error(f"Failed to read image {p}: {e}")
+            continue
+    if not images:
+        return None, "no"
+    images_combined = np.concatenate(images, axis=2)
+    combined_name = "_".join(names)
+    return images_combined, combined_name
+def get_signature_of_string(s, length=5):
+    """获取字符串的签名"""
+    import hashlib
+    return hashlib.md5(s.encode()).hexdigest()[:length]
+def clean_str_for_save(s):
+    """清理字符串以便保存"""
+    import re
+    return re.sub(r'[\\/:*?"<>|]', '_', s)
+def save_videos_grid_with_opencv(videos, output_path, texts=None, fps=4, tensor_order="b c t h w", n_cols=1, write_info=False, save_filetype="mp4", save_images=False):
+    """使用OpenCV保存视频网格"""
+    try:
+        # 确保视频数据格式正确
+        if tensor_order == "b c t h w":
+            # 转换为 b t h w c
+            videos = videos.transpose(0, 2, 3, 4, 1)
+        elif tensor_order == "b t c h w":
+            # 转换为 b t h w c
+            videos = videos.transpose(0, 1, 3, 4, 2)
+        # 取第一个视频
+        video = videos[0]
+        height, width, channels = video[0].shape
+        # 使用OpenCV保存视频
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+        for frame in video:
+            # 转换RGB到BGR
+            frame_bgr = cv2.cvtColor(frame.astype(np.uint8), cv2.COLOR_RGB2BGR)
+            out.write(frame_bgr)
+        out.release()
+        logger.info(f"Video saved to {output_path}")
+        return output_path
+    except Exception as e:
+        logger.error(f"Failed to save video: {e}")
+        return None
+# 初始化模型
+def init_model(args):
+    """初始化MuseV模型"""
+    try:
+        logger.info("正在初始化MuseV模型...")
+        # 设置设备
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+        logger.info(f"使用设备: {device}")
+        # 尝试导入真实的MuseV组件
+        try:
+            from musev.pipelines.pipeline_controlnet_predictor import DiffusersPipelinePredictor
+            from mmcm.utils.load_util import load_pyhon_obj
+            from musev.models.unet_loader import load_unet_by_name
+            from musev.models.referencenet_loader import load_referencenet_by_name
+            from musev.models.ip_adapter_loader import load_vision_clip_encoder_by_name, load_ip_adapter_image_proj_by_name
+            from musev.models.ip_adapter_face_loader import load_ip_adapter_face_extractor_and_proj_by_name
+            # 配置模型参数
+            config = {
+                "device": device,
+                "dtype": torch_dtype,
+                "enable_xformers_memory_efficient_attention": True if device == "cuda" else False,
+                "vae_model_path": args.vae_model_path if hasattr(args, 'vae_model_path') else None,
+            }
+            # 初始化预测器
+            predictor = DiffusersPipelinePredictor(config)
+            # 尝试加载模型组件
+            try:
+                # 加载Unet模型（运动模型）
+                if hasattr(args, 'unet_model_name') and args.unet_model_name:
+                    unet = load_unet_by_name(args.unet_model_name, config)
+                    predictor.unet = unet
+                    logger.info(f"加载Unet模型: {args.unet_model_name}")
+                # 加载参考网络
+                if hasattr(args, 'referencenet_model_name') and args.referencenet_model_name:
+                    referencenet = load_referencenet_by_name(args.referencenet_model_name, config)
+                    predictor.referencenet = referencenet
+                    logger.info(f"加载参考网络: {args.referencenet_model_name}")
+                # 加载IP适配器
+                if hasattr(args, 'ip_adapter_model_name') and args.ip_adapter_model_name:
+                    vision_encoder = load_vision_clip_encoder_by_name(args.ip_adapter_model_name, config)
+                    image_proj = load_ip_adapter_image_proj_by_name(args.ip_adapter_model_name, config)
+                    predictor.vision_encoder = vision_encoder
+                    predictor.image_proj = image_proj
+                    logger.info(f"加载IP适配器: {args.ip_adapter_model_name}")
+                # 加载人脸模型（这是生成说话视频的关键组件）
+                if hasattr(args, 'enable_facein') and args.enable_facein:
+                    face_extractor, face_proj = load_ip_adapter_face_extractor_and_proj_by_name("face_in", config)
+                    predictor.face_extractor = face_extractor
+                    predictor.face_proj = face_proj
+                    logger.info("加载人脸特征提取器")
+                logger.info("MuseV模型初始化成功")
+                return predictor, device
+            except Exception as model_load_error:
+                logger.warning(f"加载模型组件时出错，将使用简化版本: {model_load_error}")
+                # 尝试创建简化版预测器
+                class SimplifiedMuseVPredictor:
+                    def __init__(self):
+                        self.device = device
+                    def run_pipe_text2video(self, **kwargs):
+                        logger.info("使用简化版MuseV预测器")
+                        # 这里应该是调用真实的MuseV功能
+                        # 由于可能缺少完整模型，我们创建一个基于输入图像的模拟视频
+                        video_length = kwargs.get('video_length', 24)
+                        height = kwargs.get('height', 512)
+                        width = kwargs.get('width', 512)
+                        condition_images = kwargs.get('condition_images', None)
+                        # 创建一个简单的模拟视频
+                        video = np.zeros((1, 3, video_length, height, width), dtype=np.uint8)
+                        # 如果有条件图像，尝试使用它作为基础
+                        if condition_images is not None:
+                            try:
+                                from PIL import Image
+                                import numpy as np
+                                img = Image.open(condition_images).resize((width, height)).convert("RGB")
+                                img_np = np.array(img)
+                                # 将静态图像转换为简单的视频（轻微缩放/移动）
+                                for t in range(video_length):
+                                    # 简单的缩放动画
+                                    scale = 1.0 + 0.1 * np.sin(t * 0.2)
+                                    new_size = (int(width * scale), int(height * scale))
+                                    resized_img = cv2.resize(img_np, new_size)
+                                    # 居中放置
+                                    h_start = (resized_img.shape[0] - height) // 2
+                                    w_start = (resized_img.shape[1] - width) // 2
+                                    frame = resized_img[h_start:h_start+height, w_start:w_start+width]
+                                    video[0, :, t, :, :] = frame.transpose(2, 0, 1)
+                            except Exception as e:
+                                logger.error(f"处理条件图像时出错: {e}")
+                                # 使用彩色渐变作为备选
+                                for t in range(video_length):
+                                    r = int(255 * (t / video_length))
+                                    g = int(255 * 0.5)
+                                    b = int(255 * ((video_length - t) / video_length))
+                                    video[0, 0, t, :, :] = r  # R channel
+                                    video[0, 1, t, :, :] = g  # G channel
+                                    video[0, 2, t, :, :] = b  # B channel
+                        return video
+                return SimplifiedMuseVPredictor(), device
+        except ImportError as import_error:
+            logger.warning(f"无法导入MuseV组件，使用模拟预测器: {import_error}")
+            # 返回模拟预测器
+            class MockPredictor:
+                def run_pipe_text2video(self, **kwargs):
+                    video_length = kwargs.get('video_length', 24)
+                    height = kwargs.get('height', 512)
+                    width = kwargs.get('width', 512)
+                    condition_images = kwargs.get('condition_images', None)
+                    # 创建模拟视频
+                    video = np.zeros((1, 3, video_length, height, width), dtype=np.uint8)
+                    # 如果有条件图像，尝试显示它
+                    if condition_images is not None:
+                        try:
+                            from PIL import Image
+                            img = Image.open(condition_images).resize((width, height)).convert("RGB")
+                            img_np = np.array(img)
+                            # 重复显示图像
+                            for t in range(video_length):
+                                video[0, :, t, :, :] = img_np.transpose(2, 0, 1)
+                        except:
+                            # 使用彩色块
+                            colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]
+                            for t in range(video_length):
+                                r, g, b = colors[t % len(colors)]
+                                video[0, 0, t, :, :] = r
+                                video[0, 1, t, :, :] = g
+                                video[0, 2, t, :, :] = b
+                    else:
+                        # 没有图像，使用彩色渐变
+                        for t in range(video_length):
+                            r = int(255 * (t / video_length))
+                            g = int(255 * ((video_length - t) / video_length))
+                            b = int(255 * 0.5)
+                            video[0, 0, t, :, :] = r
+                            video[0, 1, t, :, :] = g
+                            video[0, 2, t, :, :] = b
+                    return video
+            return MockPredictor(), device
+    except Exception as e:
+        logger.error(f"模型初始化失败: {e}")
+        # 返回最后的备用预测器
+        class FallbackMockPredictor:
+            def run_pipe_text2video(self, **kwargs):
+                video_length = kwargs.get('video_length', 24)
+                height = kwargs.get('height', 512)
+                width = kwargs.get('width', 512)
+                # 创建简单的错误指示视频
+                video = np.zeros((1, 3, video_length, height, width), dtype=np.uint8)
+                # 红色表示错误
+                for t in range(video_length):
+                    video[0, 0, t, :, :] = 255  # R channel
+                    video[0, 1, t, :, :] = 0    # G channel
+                    video[0, 2, t, :, :] = 0    # B channel
+                return video
+        return FallbackMockPredictor(), device
+    # 最终的备用返回
+    return FallbackMockPredictor(), "cpu"
+# 视频生成函数
+def generate_video(
+    prompt,
+    image,
+    seed=42,
+    fps=8,
+    width=512,
+    height=512,
+    video_length=16,
+    img_edge_ratio=1.0,
+    progress=gr.Progress(track_tqdm=True)
+):
+    """生成视频的主要函数 - 支持上传照片生成说话视频"""
+    try:
+        progress(0, desc="开始视频生成...")
+        # 初始化参数
+        args = get_default_args()
+        # 为生成说话视频特别配置
+        args.enable_facein = True  # 启用人脸特征提取
+        args.enable_ip_adapter = True  # 启用IP适配器
+        args.enable_referencenet = True  # 启用参考网络
+        args.use_condition_image = True  # 使用条件图像
+        args.fix_condition_images = True  # 固定条件图像（保持面部特征）
+        args.guidance_scale = 3.5  # 文本引导尺度
+        args.video_guidance_scale = 1.5  # 视频引导尺度
+        args.strength = 0.6  # 重绘强度（值越低越接近原图）
+        args.img_weight = 0.5  # 图像权重
+        args.motion_speed = 8.0  # 运动速度
+        args.need_img_based_video_noise = True  # 基于图像的视频噪声
+        # 初始化模型
+        progress(0.1, desc="初始化MuseV模型...")
+        sd_predictor, device = init_model(args)
+        # 保存上传的图像
+        image_cuid = generate_cuid()
+        image_path = os.path.join(CACHE_PATH, f"{image_cuid}.jpg")
+        condition_images = None
+        if image is not None:
+            try:
+                # 确保图像格式正确
+                if len(image.shape) == 3 and image.shape[2] == 3:
+                    # 已经是RGB格式
+                    image_pil = Image.fromarray(image)
+                elif len(image.shape) == 2:
+                    # 灰度图转RGB
+                    image_pil = Image.fromarray(image).convert("RGB")
+                else:
+                    # 其他格式尝试转换
+                    image_pil = Image.fromarray(image)
+                image_pil.save(image_path)
+                condition_images = image_path
+                logger.info(f"已保存上传的图像: {image_path}")
+            except Exception as e:
+                logger.error(f"保存图像失败: {e}")
+        # 如果没有上传图像，提示用户
+        if condition_images is None:
+            logger.warning("未上传图像，将使用纯文本生成视频")
+        progress(0.3, desc="处理输入数据...")
+        # 设置种子
+        try:
+            if 'set_all_seed' in globals():
+                cpu_generator, gpu_generator = set_all_seed(int(seed))
+                logger.info(f"使用种子: {seed}")
+            else:
+                cpu_generator, gpu_generator = None, None
+                logger.warning("set_all_seed函数不可用，使用随机种子")
+        except Exception as e:
+            cpu_generator, gpu_generator = None, None
+            logger.error(f"设置种子失败: {e}")
+        # 准备提示词
+        if not prompt:
+            prompt = "一个人在说话"  # 默认提示词，适合生成说话视频
+        # 准备负面提示词
+        negative_prompt = "模糊, 低质量, 变形, 扭曲, 像素化, 噪点, 不良照明, 不自然表情"
+        progress(0.5, desc="正在生成视频...")
+        # 运行视频生成
+        try:
+            # 调用MuseV的文本到视频管道
+            out_videos = sd_predictor.run_pipe_text2video(
+                video_length=video_length,
+                prompt=prompt,
+                width=width,
+                height=height,
+                generator=gpu_generator if gpu_generator else None,
+                noise_type=args.noise_type,
+                negative_prompt=negative_prompt,
+                video_negative_prompt=negative_prompt,
+                max_batch_num=args.n_batch,
+                strength=args.strength,
+                need_img_based_video_noise=args.need_img_based_video_noise,
+                video_num_inference_steps=args.video_num_inference_steps,
+                condition_images=condition_images,  # 使用上传的图像作为条件
+                fix_condition_images=args.fix_condition_images,  # 保持面部特征不变
+                video_guidance_scale=args.video_guidance_scale,
+                guidance_scale=args.guidance_scale,
+                num_inference_steps=args.num_inference_steps,
+                redraw_condition_image=args.redraw_condition_image,
+                img_weight=args.img_weight,  # 增加图像权重
+                w_ind_noise=args.w_ind_noise,
+                n_vision_condition=args.n_vision_condition,
+                motion_speed=args.motion_speed,  # 控制视频运动速度
+                need_hist_match=args.need_hist_match,
+                context_frames=args.context_frames,
+                context_stride=args.context_stride,
+                context_overlap=args.context_overlap,
+            )
+        except Exception as e:
+            logger.error(f"视频生成错误: {e}")
+            # 使用模拟视频作为备份
+            progress(0.7, desc="使用备份生成器...")
+            out_videos = np.zeros((1, 3, video_length, height, width), dtype=np.uint8)
+            # 如果有条件图像，尝试基于图像生成简单动画
+            if condition_images is not None:
+                try:
+                    img = Image.open(condition_images).resize((width, height)).convert("RGB")
+                    img_np = np.array(img)
+                    # 创建一个简单的缩放/淡入动画
+                    for t in range(video_length):
+                        # 计算缩放比例
+                        scale = 1.0 - 0.1 * np.cos(t * 0.3)
+                        new_size = (int(width * scale), int(height * scale))
+                        resized_img = cv2.resize(img_np, new_size)
+                        # 居中放置
+                        h_start = (height - new_size[1]) // 2
+                        w_start = (width - new_size[0]) // 2
+                        frame = np.zeros((height, width, 3), dtype=np.uint8)
+                        frame[h_start:h_start+new_size[1], w_start:w_start+new_size[0]] = resized_img
+                        video_frame = frame.transpose(2, 0, 1)
+                        out_videos[0, :, t, :, :] = video_frame
+                except Exception as inner_e:
+                    logger.error(f"创建基于图像的备份视频失败: {inner_e}")
+                    # 使用彩色渐变作为最后的备选
+                    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]
+                    for t in range(video_length):
+                        r, g, b = colors[t % len(colors)]
+                        out_videos[0, 0, t, :, :] = r  # R channel
+                        out_videos[0, 1, t, :, :] = g  # G channel
+                        out_videos[0, 2, t, :, :] = b  # B channel
+            else:
+                # 没有图像，使用彩色渐变
+                for t in range(video_length):
+                    r = int(255 * (t / video_length))
+                    g = int(255 * 0.5)
+                    b = int(255 * ((video_length - t) / video_length))
+                    out_videos[0, 0, t, :, :] = r
+                    out_videos[0, 1, t, :, :] = g
+                    out_videos[0, 2, t, :, :] = b
+        progress(0.8, desc="正在保存视频...")
+        # 保存视频
+        save_file_name = f"video_{image_cuid}_{generate_cuid()}"
+        try:
+            if 'clean_str_for_save' in globals():
+                save_file_name = clean_str_for_save(save_file_name)
+        except:
+            # 如果clean_str_for_save不可用，使用原始文件名
+            pass
+        # 确保输出目录存在
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+        output_path = os.path.join(OUTPUT_DIR, f"{save_file_name}.{args.save_filetype}")
+        try:
+            # 使用MuseV提供的视频保存函数
+            if 'save_videos_grid_with_opencv' in globals():
+                save_videos_grid_with_opencv(
+                    out_videos,
+                    output_path,
+                    fps=fps,
+                    tensor_order="b c t h w",
+                    save_filetype=args.save_filetype,
+                )
+            else:
+                # 备用的视频保存逻辑
+                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+                out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+                # 转换视频格式
+                if out_videos.shape[1] == 3 and out_videos.shape[2] == video_length:
+                    # b c t h w -> b t h w c
+                    video_data = out_videos.transpose(0, 2, 3, 4, 1)
+                    video_frames = video_data[0]  # 取第一个视频
+                    for frame in video_frames:
+                        # 确保像素值在0-255范围内
+                        frame_uint8 = np.clip(frame, 0, 255).astype(np.uint8)
+                        # 转换RGB到BGR
+                        frame_bgr = cv2.cvtColor(frame_uint8, cv2.COLOR_RGB2BGR)
+                        out.write(frame_bgr)
+                out.release()
+                logger.info(f"视频已保存到: {output_path}")
+        except Exception as e:
+            logger.error(f"保存视频失���: {e}")
+            # 作为最后的备份，创建一个简单的视频
+            output_path = os.path.join(OUTPUT_DIR, f"fallback_video_{generate_cuid()}.mp4")
+            try:
+                fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+                out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+                # 创建一个简单的彩色渐变视频
+                for t in range(video_length):
+                    frame = np.zeros((height, width, 3), dtype=np.uint8)
+                    # 蓝色渐变
+                    frame[:, :, 0] = (t * 255 // video_length)  # B
+                    frame[:, :, 1] = 100  # G
+                    frame[:, :, 2] = 100  # R
+                    out.write(frame)
+                out.release()
+                logger.info(f"已创建备用视频: {output_path}")
+            except Exception as inner_e:
+                logger.error(f"创建备用视频失败: {inner_e}")
+                return f"错误: 无法保存视频 ({str(e)})"
+        progress(1.0, desc="视频生成完成！")
+        return output_path
+    except Exception as e:
+        logger.error(f"视频生成失败: {e}")
+        # 提供一个简单的错误视频作为最后的备用方案
+        try:
+            error_video_path = os.path.join(OUTPUT_DIR, f"error_video_{generate_cuid()}.mp4")
+            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+            out = cv2.VideoWriter(error_video_path, fourcc, 1, (256, 256))
+            error_frame = np.zeros((256, 256, 3), dtype=np.uint8)
+            error_frame[:, :, 0] = 0  # B
+            error_frame[:, :, 1] = 0  # G
+            error_frame[:, :, 2] = 255  # R (红色表示错误)
+            for _ in range(5):  # 5帧红色画面
+                out.write(error_frame)
+            out.release()
+            return error_video_path
+        except:
+            return f"错误: {str(e)}"
+# 创建Gradio界面
+def create_interface():
+    """创建支持照片说话视频生成的Gradio界面"""
+    with gr.Blocks(title="MuseV照片说话视频生成工具", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("""
+        # MuseV照片说话视频生成工具
+        上传照片，让照片中的人物开口说话！
+        ## 使用方法
+        1. 输入描述你想在视频中看到的内容的提示词（特别是关于说话或表情的描述）
+        2. **上传人物照片**（建议使用清晰的正面人像照片）
+        3. 根据需要调整高级参数
+        4. 点击"生成说话视频"按钮
+        5. 等待视频生成完成后即可播放和下载
+        ## 提示
+        - 使用清晰的正面人物照片可获得最佳效果
+        - 提示词中可以包含如"说话"、"微笑"、"表情自然"等描述
+        - 视频生成时间取决于您的电脑性能，通常需要几十秒到几分钟
+        """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                prompt = gr.Textbox(
+                    label="提示词",
+                    placeholder="描述照片中的人物在做什么，例如：'一个人在说话'，'微笑着打招呼'...",
+                    lines=3,
+                    value="一个人在说话，表情自然"
+                )
+                image = gr.Image(label="人物照片（推荐上传）", type="numpy", height=240)
+                with gr.Accordion("高级参数", open=False):
+                    seed = gr.Slider(label="随机种子", minimum=0, maximum=1000000, value=42, step=1)
+                    fps = gr.Slider(label="帧率", minimum=1, maximum=30, value=8, step=1)
+                    width = gr.Slider(label="视频宽度", minimum=256, maximum=1024, value=512, step=64)
+                    height = gr.Slider(label="视频高度", minimum=256, maximum=1024, value=512, step=64)
+                    video_length = gr.Slider(label="视频长度（帧数）", minimum=8, maximum=64, value=16, step=4)
+                    img_edge_ratio = gr.Slider(label="图像边缘比例", minimum=0.5, maximum=2.0, value=1.0, step=0.1)
+                generate_btn = gr.Button("生成说话视频", variant="primary")
+            with gr.Column(scale=1):
+                output_video = gr.Video(label="生成的说话视频", height=240)
+        # 设置生成按钮的点击事件
+        generate_btn.click(
+            fn=generate_video,
+            inputs=[prompt, image, seed, fps, width, height, video_length, img_edge_ratio],
+            outputs=output_video,
+            show_progress=True
+        )
+        # 示例提示词
+        gr.Markdown("""
+        ## 推荐提示词示例
+        - "一个人在说话，表情自然，嘴巴动起来"
+        - "微笑着说话，眼神温和"
+        - "高兴地打招呼，表情生动"
+        - "平静地讲述，面部表情自然"
+        ## 高级技巧
+        - 可��指定人物特征："一个戴着眼镜的女人在说话"
+        - 可以添加场景描述："在公园里，一个孩子开心地说话"
+        - 可以描述表情："惊讶地说话，眉毛微扬"
+        """)
+    return interface
+# 主函数
+if __name__ == "__main__":
+    # 创建并启动Gradio界面
+    interface = create_interface()
+    # 启动界面（在Hugging Face Space中，share应该设置为False）
+    interface.launch(share=False)

packages.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ ffmpeg
2	+ libgl1

requirements.txt ADDED Viewed

	@@ -0,0 +1,26 @@

+# 深度学习框架（核心依赖）
+torch>=2.0.0
+torchvision>=0.15.0
+# 扩散模型工具库（视频生成基础）
+diffusers>=0.24.0
+transformers>=4.30.0
+accelerate>=0.21.0
+# 张量与数值计算
+einops>=0.6.1
+numpy>=1.24.0
+scipy>=1.10.0
+# 配置文件处理
+omegaconf>=2.3.0
+# 图像/视频处理（补充 Python 层依赖）
+opencv-python>=4.8.0
+pillow>=9.5.0
+# 工具类依赖
+tqdm>=4.65.0
+huggingface-hub>=0.16.0  # 加载 Hugging Face 模型/数据
+filelock>=3.12.0          # 避免文件冲突
+gradio==4.25.0             # 与 .space-yaml 和 README.md 中指定的 sdk_version 保持一致