Spaces: Running on Zero

Commit cf2f35c · Parent: bc4a00b
Add application file

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full listing.
- .gitattributes +4 -0
- .gradio/certificate.pem +31 -0
- LICENSE.txt +21 -0
- accelerate_config/accelerate_config_machine_14B_multiple.yaml +19 -0
- accelerate_config/accelerate_config_machine_1B_multiple.yaml +15 -0
- app.py +718 -0
- audio_extractor.py +14 -0
- deepspeed_config/wan2.1/wan_civitai.yaml +39 -0
- deepspeed_config/zero2_offload_cpu.json +35 -0
- deepspeed_config/zero_stage2_config.json +35 -0
- deepspeed_config/zero_stage3_config.json +46 -0
- example_case/case-1/audio.wav +3 -0
- example_case/case-1/prompt.txt +1 -0
- example_case/case-1/reference.png +3 -0
- example_case/case-2/audio.wav +3 -0
- example_case/case-2/prompt.txt +1 -0
- example_case/case-2/reference.png +3 -0
- example_case/case-3/audio.wav +3 -0
- example_case/case-3/prompt.txt +1 -0
- example_case/case-3/reference.jpg +3 -0
- example_case/case-45/audio.wav +3 -0
- example_case/case-45/prompt.txt +1 -0
- example_case/case-45/reference.png +3 -0
- example_case/case-6/audio.wav +3 -0
- example_case/case-6/prompt.txt +1 -0
- example_case/case-6/reference.png +3 -0
- extract_audio_segment.py +146 -0
- lip_mask_extractor.py +70 -0
- requirements.txt +170 -0
- vocal_seperator.py +31 -0
- wan/__init__.py +3 -0
- wan/__pycache__/__init__.cpython-311.pyc +0 -0
- wan/configs/__init__.py +42 -0
- wan/configs/shared_config.py +19 -0
- wan/configs/wan_i2v_14B.py +35 -0
- wan/configs/wan_t2v_14B.py +29 -0
- wan/configs/wan_t2v_1_3B.py +29 -0
- wan/dataset/talking_video_dataset_fantasy.py +328 -0
- wan/dist/__init__.py +40 -0
- wan/dist/__pycache__/__init__.cpython-311.pyc +0 -0
- wan/dist/__pycache__/wan_xfuser.cpython-311.pyc +0 -0
- wan/dist/wan_xfuser.py +115 -0
- wan/distributed/__init__.py +0 -0
- wan/distributed/__pycache__/__init__.cpython-311.pyc +0 -0
- wan/distributed/__pycache__/fsdp.cpython-311.pyc +0 -0
- wan/distributed/fsdp.py +41 -0
- wan/distributed/xdit_context_parallel.py +192 -0
- wan/image2video.py +334 -0
- wan/models/__init__.py +0 -0
- wan/models/__pycache__/__init__.cpython-311.pyc +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) Shuyuan Tu.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE
accelerate_config/accelerate_config_machine_14B_multiple.yaml
ADDED
@@ -0,0 +1,19 @@
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_config_file: path/StableAvatar/deepspeed_config/zero_stage2_config.json
  deepspeed_multinode_launcher: standard
  zero3_init_flag: False
distributed_type: DEEPSPEED
downcast_bf16: 'no'
enable_cpu_affinity: false
main_training_function: main
dynamo_backend: 'no'
num_machines: 8
num_processes: 64
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
accelerate_config/accelerate_config_machine_1B_multiple.yaml
ADDED
@@ -0,0 +1,15 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
enable_cpu_affinity: false
main_training_function: main
dynamo_backend: 'no'
num_machines: 8
num_processes: 64
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
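Editor's note: both accelerate configs describe an 8-machine, 64-process run. A minimal sanity-check sketch follows (not part of the commit); the 8-GPUs-per-node figure and the train.py entry point are assumptions, only implied by num_machines and num_processes.

# sanity-check sketch for the accelerate configs above (assumes 8 GPUs per node)
import yaml  # PyYAML

def check_accelerate_config(path, gpus_per_node=8):
    with open(path) as f:
        cfg = yaml.safe_load(f)
    expected = cfg["num_machines"] * gpus_per_node
    assert cfg["num_processes"] == expected, (
        f"num_processes={cfg['num_processes']} but "
        f"{cfg['num_machines']} machines x {gpus_per_node} GPUs = {expected}"
    )
    return cfg

# Typical launch (shown as a comment; train.py is a hypothetical entry point):
#   accelerate launch --config_file accelerate_config/accelerate_config_machine_14B_multiple.yaml train.py
cfg = check_accelerate_config("accelerate_config/accelerate_config_machine_14B_multiple.yaml")
print(cfg["distributed_type"])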
app.py
ADDED
@@ -0,0 +1,718 @@
import torch
import psutil
import argparse
import gradio as gr
import os
from diffusers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import load_image
from transformers import AutoTokenizer, Wav2Vec2Model, Wav2Vec2Processor
from omegaconf import OmegaConf
from wan.models.cache_utils import get_teacache_coefficients
from wan.models.wan_fantasy_transformer3d_1B import WanTransformer3DFantasyModel
from wan.models.wan_text_encoder import WanT5EncoderModel
from wan.models.wan_vae import AutoencoderKLWan
from wan.models.wan_image_encoder import CLIPModel
from wan.pipeline.wan_inference_long_pipeline import WanI2VTalkingInferenceLongPipeline
from wan.utils.fp8_optimization import replace_parameters_by_name, convert_weight_dtype_wrapper, convert_model_weight_to_float8
from wan.utils.utils import get_image_to_video_latent, save_videos_grid
import numpy as np
import librosa
import datetime
import random
import math
import subprocess
from moviepy.editor import VideoFileClip
from huggingface_hub import snapshot_download
import shutil
try:
    from audio_separator.separator import Separator
except:
    print("Unable to use vocal separation feature. Please install audio-separator[gpu].")


if torch.cuda.is_available():
    device = "cuda"
    if torch.cuda.get_device_capability()[0] >= 8:
        dtype = torch.bfloat16
    else:
        dtype = torch.float16
else:
    device = "cpu"
    dtype = torch.float32


def filter_kwargs(cls, kwargs):
    import inspect
    sig = inspect.signature(cls.__init__)
    valid_params = set(sig.parameters.keys()) - {'self', 'cls'}
    filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
    return filtered_kwargs

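# [Editor's sketch, not part of app.py] A small usage example of filter_kwargs: it drops any
# config keys the target class's __init__ does not accept, which is how the script later feeds
# scheduler_kwargs from wan_civitai.yaml (including the extra scheduler_subpath field) into
# FlowMatchEulerDiscreteScheduler. DummyScheduler below is a hypothetical stand-in class.
class DummyScheduler:
    def __init__(self, num_train_timesteps=1000, shift=5.0):
        self.num_train_timesteps = num_train_timesteps
        self.shift = shift

raw_kwargs = {"num_train_timesteps": 1000, "shift": 5.0, "scheduler_subpath": None}
clean = filter_kwargs(DummyScheduler, raw_kwargs)  # scheduler_subpath is filtered out
scheduler_demo = DummyScheduler(**clean)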
def load_transformer_model(model_version):
    """
    Load the transformer weights for the selected model version.

    Args:
        model_version (str): model version, "square" or "rec_vec"

    Returns:
        WanTransformer3DFantasyModel: the loaded transformer model
    """
    global transformer3d

    if model_version == "square":
        transformer_path = os.path.join(repo_root, "StableAvatar-1.3B", "transformer3d-square.pt")
    elif model_version == "rec_vec":
        transformer_path = os.path.join(repo_root, "StableAvatar-1.3B", "transformer3d-rec-vec.pt")
    else:
        # Fall back to the "square" version by default
        transformer_path = os.path.join(repo_root, "StableAvatar-1.3B", "transformer3d-square.pt")

    print(f"正在加载模型: {transformer_path}")

    if os.path.exists(transformer_path):
        state_dict = torch.load(transformer_path, map_location="cpu")
        state_dict = state_dict["state_dict"] if "state_dict" in state_dict else state_dict
        m, u = transformer3d.load_state_dict(state_dict, strict=False)
        print(f"模型加载成功: {transformer_path}")
        print(f"Missing keys: {len(m)}; Unexpected keys: {len(u)}")
        return transformer3d
    else:
        print(f"错误:模型文件不存在: {transformer_path}")
        return None


REPO_ID = "FrancisRing/StableAvatar"
repo_root = snapshot_download(
    repo_id=REPO_ID,
    allow_patterns=[
        "StableAvatar-1.3B/*",
        "Wan2.1-Fun-V1.1-1.3B-InP/*",
        "wav2vec2-base-960h/*",
        "assets/**",
        "Kim_Vocal_2.onnx",
    ],
)
pretrained_model_name_or_path = os.path.join(repo_root, "Wan2.1-Fun-V1.1-1.3B-InP")
pretrained_wav2vec_path = os.path.join(repo_root, "wav2vec2-base-960h")


# Vocal-separation ONNX model
audio_separator_model_file = os.path.join(repo_root, "Kim_Vocal_2.onnx")

# model_path = "/datadrive/stableavatar/checkpoints"
# pretrained_model_name_or_path = f"{model_path}/Wan2.1-Fun-V1.1-1.3B-InP"
# pretrained_wav2vec_path = f"{model_path}/wav2vec2-base-960h"
# transformer_path = f"{model_path}/StableAvatar-1.3B/transformer3d-square.pt"
config = OmegaConf.load("deepspeed_config/wan2.1/wan_civitai.yaml")
sampler_name = "Flow"
clip_sample_n_frames = 81
tokenizer = AutoTokenizer.from_pretrained(os.path.join(pretrained_model_name_or_path, config['text_encoder_kwargs'].get('tokenizer_subpath', 'tokenizer')), )
text_encoder = WanT5EncoderModel.from_pretrained(
    os.path.join(pretrained_model_name_or_path, config['text_encoder_kwargs'].get('text_encoder_subpath', 'text_encoder')),
    additional_kwargs=OmegaConf.to_container(config['text_encoder_kwargs']),
    low_cpu_mem_usage=True,
    torch_dtype=dtype,
)
text_encoder = text_encoder.eval()
vae = AutoencoderKLWan.from_pretrained(
    os.path.join(pretrained_model_name_or_path, config['vae_kwargs'].get('vae_subpath', 'vae')),
    additional_kwargs=OmegaConf.to_container(config['vae_kwargs']),
)
wav2vec_processor = Wav2Vec2Processor.from_pretrained(pretrained_wav2vec_path)
wav2vec = Wav2Vec2Model.from_pretrained(pretrained_wav2vec_path).to("cpu")
clip_image_encoder = CLIPModel.from_pretrained(os.path.join(pretrained_model_name_or_path, config['image_encoder_kwargs'].get('image_encoder_subpath', 'image_encoder')), )
clip_image_encoder = clip_image_encoder.eval()
transformer3d = WanTransformer3DFantasyModel.from_pretrained(
    os.path.join(pretrained_model_name_or_path, config['transformer_additional_kwargs'].get('transformer_subpath', 'transformer')),
    transformer_additional_kwargs=OmegaConf.to_container(config['transformer_additional_kwargs']),
    low_cpu_mem_usage=False,
    torch_dtype=dtype,
)

# Load the "square" model version by default
load_transformer_model("square")
Choosen_Scheduler = scheduler_dict = {
    "Flow": FlowMatchEulerDiscreteScheduler,
}[sampler_name]
scheduler = Choosen_Scheduler(
    **filter_kwargs(Choosen_Scheduler, OmegaConf.to_container(config['scheduler_kwargs']))
)
pipeline = WanI2VTalkingInferenceLongPipeline(
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    vae=vae,
    transformer=transformer3d,
    clip_image_encoder=clip_image_encoder,
    scheduler=scheduler,
    wav2vec_processor=wav2vec_processor,
    wav2vec=wav2vec,
)


def generate(
    GPU_memory_mode,
    teacache_threshold,
    num_skip_start_steps,
    image_path,
    audio_path,
    prompt,
    negative_prompt,
    width,
    height,
    guidance_scale,
    num_inference_steps,
    text_guide_scale,
    audio_guide_scale,
    motion_frame,
    fps,
    overlap_window_length,
    seed_param,
    overlapping_weight_scheme,
    progress=gr.Progress(track_tqdm=True),
):
    global pipeline, transformer3d
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    if seed_param < 0:
        seed = random.randint(0, np.iinfo(np.int32).max)
    else:
        seed = seed_param

    if GPU_memory_mode == "sequential_cpu_offload":
        replace_parameters_by_name(transformer3d, ["modulation", ], device=device)
        transformer3d.freqs = transformer3d.freqs.to(device=device)
        pipeline.enable_sequential_cpu_offload(device=device)
    elif GPU_memory_mode == "model_cpu_offload_and_qfloat8":
        convert_model_weight_to_float8(transformer3d, exclude_module_name=["modulation", ])
        convert_weight_dtype_wrapper(transformer3d, dtype)
        pipeline.enable_model_cpu_offload(device=device)
    elif GPU_memory_mode == "model_cpu_offload":
        pipeline.enable_model_cpu_offload(device=device)
    else:
        pipeline.to(device=device)

    if teacache_threshold > 0:
        coefficients = get_teacache_coefficients(pretrained_model_name_or_path)
        pipeline.transformer.enable_teacache(
            coefficients,
            num_inference_steps,
            teacache_threshold,
            num_skip_start_steps=num_skip_start_steps,
        )

    with torch.no_grad():
        video_length = int((clip_sample_n_frames - 1) // vae.config.temporal_compression_ratio * vae.config.temporal_compression_ratio) + 1 if clip_sample_n_frames != 1 else 1
        input_video, input_video_mask, clip_image = get_image_to_video_latent(image_path, None, video_length=video_length, sample_size=[height, width])
        sr = 16000
        vocal_input, sample_rate = librosa.load(audio_path, sr=sr)
        sample = pipeline(
            prompt,
            num_frames=video_length,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
            guidance_scale=guidance_scale,
            generator=torch.Generator().manual_seed(seed),
            num_inference_steps=num_inference_steps,
            video=input_video,
            mask_video=input_video_mask,
            clip_image=clip_image,
            text_guide_scale=text_guide_scale,
            audio_guide_scale=audio_guide_scale,
            vocal_input_values=vocal_input,
            motion_frame=motion_frame,
            fps=fps,
            sr=sr,
            cond_file_path=image_path,
            overlap_window_length=overlap_window_length,
            seed=seed,
            overlapping_weight_scheme=overlapping_weight_scheme,
        ).videos
    os.makedirs("outputs", exist_ok=True)
    video_path = os.path.join("outputs", f"{timestamp}.mp4")
    save_videos_grid(sample, video_path, fps=fps)
    output_video_with_audio = os.path.join("outputs", f"{timestamp}_audio.mp4")
    subprocess.run([
        "ffmpeg", "-y", "-loglevel", "quiet", "-i", video_path, "-i", audio_path,
        "-c:v", "copy", "-c:a", "aac", "-strict", "experimental",
        output_video_with_audio
    ], check=True)

    return output_video_with_audio, seed, f"Generated outputs/{timestamp}.mp4 / 已生成outputs/{timestamp}.mp4"

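# [Editor's sketch, not part of app.py] Worked example of the frame-count rounding in
# generate(): the clip length is snapped to a multiple of the VAE's temporal compression
# ratio plus one. The ratio 4 comes from vae_kwargs in wan_civitai.yaml.
clip_sample_n_frames_example = 81
temporal_compression_ratio = 4
video_length_example = (clip_sample_n_frames_example - 1) // temporal_compression_ratio * temporal_compression_ratio + 1
print(video_length_example)  # 81 -> (80 // 4) * 4 + 1 = 81; a value like 79 would snap down to 77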
def exchange_width_height(width, height):
    return height, width, "✅ Width and Height Swapped / 宽高交换完毕"


def adjust_width_height(image):
    image = load_image(image)
    width, height = image.size
    original_area = width * height
    default_area = 512*512
    ratio = math.sqrt(original_area / default_area)
    width = width / ratio // 16 * 16
    height = height / ratio // 16 * 16
    return int(width), int(height), "✅ Adjusted Size Based on Image / 根据图片调整宽高"

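# [Editor's sketch, not part of app.py] Worked example of adjust_width_height's area
# normalisation: the image is rescaled so its area is roughly 512*512, then each side is
# floored to a multiple of 16. The 1920x1080 input size is an arbitrary example.
import math
w, h = 1920, 1080
ratio = math.sqrt((w * h) / (512 * 512))   # 2.8125
new_w = int(w / ratio // 16 * 16)          # 672
new_h = int(h / ratio // 16 * 16)          # 384
print(new_w, new_h)                        # 672 x 384 ≈ 258k pixels, close to 512*512 = 262k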
def audio_extractor(video_path):
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs("outputs", exist_ok=True)  # make sure the output directory exists
    out_wav = os.path.abspath(os.path.join("outputs", f"{timestamp}.wav"))
    video = VideoFileClip(video_path)
    audio = video.audio
    audio.write_audiofile(out_wav, codec="pcm_s16le")
    return out_wav, f"Generated {out_wav} / 已生成 {out_wav}", out_wav  # third value feeds gr.File

def vocal_separation(audio_path):
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs("outputs", exist_ok=True)
    # audio_separator_model_file = "checkpoints/Kim_Vocal_2.onnx"
    audio_separator = Separator(
        output_dir=os.path.abspath(os.path.join("outputs", timestamp)),
        output_single_stem="vocals",
        model_file_dir=os.path.dirname(audio_separator_model_file),
    )
    audio_separator.load_model(os.path.basename(audio_separator_model_file))
    assert audio_separator.model_instance is not None, "Fail to load audio separate model."
    outputs = audio_separator.separate(audio_path)
    vocal_audio_file = os.path.join(audio_separator.output_dir, outputs[0])
    destination_file = os.path.abspath(os.path.join("outputs", f"{timestamp}.wav"))
    shutil.copy(vocal_audio_file, destination_file)
    os.remove(vocal_audio_file)
    return destination_file, f"Generated {destination_file} / 已生成 {destination_file}", destination_file

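# [Editor's sketch, not part of app.py] Standalone use of the same audio-separator calls as
# vocal_separation() above (assumes audio-separator[gpu] is installed and Kim_Vocal_2.onnx is
# in the current directory); "input.wav" is a placeholder file name.
from audio_separator.separator import Separator

separator = Separator(output_dir="outputs/demo", output_single_stem="vocals",
                      model_file_dir=".")     # directory containing Kim_Vocal_2.onnx
separator.load_model("Kim_Vocal_2.onnx")
stems = separator.separate("input.wav")       # returns the written stem file names
print(stems[0])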
def update_language(language):
    if language == "English":
        return {
            GPU_memory_mode: gr.Dropdown(label="GPU Memory Mode", info="Normal uses 25G VRAM, model_cpu_offload uses 13G VRAM"),
            teacache_threshold: gr.Slider(label="TeaCache Threshold", info="Recommended 0.1, 0 disables TeaCache acceleration"),
            num_skip_start_steps: gr.Slider(label="Skip Start Steps", info="Recommended 5"),
            model_version: gr.Dropdown(label="Model Version", choices=["square", "rec_vec"], value="square"),
            image_path: gr.Image(label="Upload Image"),
            audio_path: gr.Audio(label="Upload Audio"),
            prompt: gr.Textbox(label="Prompt"),
            negative_prompt: gr.Textbox(label="Negative Prompt"),
            generate_button: gr.Button("🎬 Start Generation"),
            width: gr.Slider(label="Width"),
            height: gr.Slider(label="Height"),
            exchange_button: gr.Button("🔄 Swap Width/Height"),
            adjust_button: gr.Button("Adjust Size Based on Image"),
            guidance_scale: gr.Slider(label="Guidance Scale"),
            num_inference_steps: gr.Slider(label="Sampling Steps (Recommended 50)"),
            text_guide_scale: gr.Slider(label="Text Guidance Scale"),
            audio_guide_scale: gr.Slider(label="Audio Guidance Scale"),
            motion_frame: gr.Slider(label="Motion Frame"),
            fps: gr.Slider(label="FPS"),
            overlap_window_length: gr.Slider(label="Overlap Window Length"),
            seed_param: gr.Number(label="Seed (positive integer, -1 for random)"),
            overlapping_weight_scheme: gr.Dropdown(label="Overlapping Weight Scheme", choices=["uniform", "log"], value="uniform"),
            info: gr.Textbox(label="Status"),
            video_output: gr.Video(label="Generated Result"),
            seed_output: gr.Textbox(label="Seed"),
            video_path: gr.Video(label="Upload Video"),
            extractor_button: gr.Button("🎬 Start Extraction"),
            info2: gr.Textbox(label="Status"),
            audio_output: gr.Audio(label="Generated Result"),
            audio_path3: gr.Audio(label="Upload Audio"),
            separation_button: gr.Button("🎬 Start Separation"),
            info3: gr.Textbox(label="Status"),
            audio_output3: gr.Audio(label="Generated Result"),
            example_title: gr.Markdown(value="### Select the following example cases for testing:"),
            example1_label: gr.Markdown(value="**Example 1**"),
            example2_label: gr.Markdown(value="**Example 2**"),
            example3_label: gr.Markdown(value="**Example 3**"),
            example4_label: gr.Markdown(value="**Example 4**"),
            example5_label: gr.Markdown(value="**Example 5**"),
            example1_btn: gr.Button("🚀 Use Example 1", variant="secondary"),
            example2_btn: gr.Button("🚀 Use Example 2", variant="secondary"),
            example3_btn: gr.Button("🚀 Use Example 3", variant="secondary"),
            example4_btn: gr.Button("🚀 Use Example 4", variant="secondary"),
            example5_btn: gr.Button("🚀 Use Example 5", variant="secondary"),
            parameter_settings_title: gr.Accordion(label="Parameter Settings", open=True),
            example_cases_title: gr.Accordion(label="Example Cases", open=True),
            stableavatar_title: gr.TabItem(label="StableAvatar"),
            audio_extraction_title: gr.TabItem(label="Audio Extraction"),
            vocal_separation_title: gr.TabItem(label="Vocal Separation")
        }
    else:
        return {
            GPU_memory_mode: gr.Dropdown(label="显存模式", info="Normal占用25G显存,model_cpu_offload占用13G显存"),
            teacache_threshold: gr.Slider(label="teacache threshold", info="推荐参数0.1,0为禁用teacache加速"),
            num_skip_start_steps: gr.Slider(label="跳过开始步数", info="推荐参数5"),
            model_version: gr.Dropdown(label="模型版本", choices=["square", "rec_vec"], value="square"),
            image_path: gr.Image(label="上传图片"),
            audio_path: gr.Audio(label="上传音频"),
            prompt: gr.Textbox(label="提示词"),
            negative_prompt: gr.Textbox(label="负面提示词"),
            generate_button: gr.Button("🎬 开始生成"),
            width: gr.Slider(label="宽度"),
            height: gr.Slider(label="高度"),
            exchange_button: gr.Button("🔄 交换宽高"),
            adjust_button: gr.Button("根据图片调整宽高"),
            guidance_scale: gr.Slider(label="guidance scale"),
            num_inference_steps: gr.Slider(label="采样步数(推荐50步)", minimum=1, maximum=100, step=1, value=50),
            text_guide_scale: gr.Slider(label="text guidance scale"),
            audio_guide_scale: gr.Slider(label="audio guidance scale"),
            motion_frame: gr.Slider(label="motion frame"),
            fps: gr.Slider(label="帧率"),
            overlap_window_length: gr.Slider(label="overlap window length"),
            seed_param: gr.Number(label="种子,请输入正整数,-1为随机"),
            overlapping_weight_scheme: gr.Dropdown(label="Overlapping Weight Scheme", choices=["uniform", "log"], value="uniform"),
            info: gr.Textbox(label="提示信息"),
            video_output: gr.Video(label="生成结果"),
            seed_output: gr.Textbox(label="种子"),
            video_path: gr.Video(label="上传视频"),
            extractor_button: gr.Button("🎬 开始提取"),
            info2: gr.Textbox(label="提示信息"),
            audio_output: gr.Audio(label="生成结果"),
            audio_path3: gr.Audio(label="上传音频"),
            separation_button: gr.Button("🎬 开始分离"),
            info3: gr.Textbox(label="提示信息"),
            audio_output3: gr.Audio(label="生成结果"),
            example_title: gr.Markdown(value="### 选择以下示例案例进行测试:"),
            example1_label: gr.Markdown(value="**示例 1**"),
            example2_label: gr.Markdown(value="**示例 2**"),
            example3_label: gr.Markdown(value="**示例 3**"),
            example4_label: gr.Markdown(value="**示例 4**"),
            example5_label: gr.Markdown(value="**示例 5**"),
            example1_btn: gr.Button("🚀 使用示例 1", variant="secondary"),
            example2_btn: gr.Button("🚀 使用示例 2", variant="secondary"),
            example3_btn: gr.Button("🚀 使用示例 3", variant="secondary"),
            example4_btn: gr.Button("🚀 使用示例 4", variant="secondary"),
            example5_btn: gr.Button("🚀 使用示例 5", variant="secondary"),
            parameter_settings_title: gr.Accordion(label="参数设置", open=True),
            example_cases_title: gr.Accordion(label="示例案例", open=True),
            stableavatar_title: gr.TabItem(label="StableAvatar"),
            audio_extraction_title: gr.TabItem(label="音频提取"),
            vocal_separation_title: gr.TabItem(label="人声分离")
        }

BANNER_HTML = """
<div class="hero">
  <div class="brand">
    <!-- If the project has a logo, add it to the repo and point this at it; otherwise delete this line -->
    <!-- <img src="https://raw.githubusercontent.com/Francis-Rings/StableAvatar/main/assets/logo.png" alt="StableAvatar Logo"> -->
    <span class="brand-text">STABLEAVATAR</span>
  </div>
  <div class="titles">
    <h1>StableAvatar</h1>
    <div class="badges">
      <a class="badge" href="https://arxiv.org/abs/2508.08248" target="_blank" rel="noopener">
        <img src="https://img.shields.io/badge/arXiv-2508.08248-b31b1b">
      </a>
      <a class="badge" href="https://francis-rings.github.io/StableAvatar/" target="_blank" rel="noopener">
        <img src="https://img.shields.io/badge/Webpage-Visit-2266ee">
      </a>
      <a class="badge" href="https://github.com/Francis-Rings/StableAvatar" target="_blank" rel="noopener">
        <img src="https://img.shields.io/badge/GitHub-Repo-181717?logo=github&logoColor=white">
      </a>
      <a class="badge" href="https://www.youtube.com/watch?v=6lhvmbzvv3Y" target="_blank" rel="noopener">
        <img src="https://img.shields.io/badge/YouTube-Demo-ff0000?logo=youtube&logoColor=white">
      </a>
    </div>
  </div>
</div>
<hr class="divider">
"""

BANNER_CSS = """
.hero{display:flex;align-items:center;gap:18px;padding:18px;border-radius:14px;
background:#111;color:#fff;margin-bottom:12px}
.brand-text{font-weight:800;letter-spacing:2px}
.brand img{height:46px}
.titles h1{font-size:28px;margin:0 0 6px 0}
.badges{display:flex;gap:10px;flex-wrap:wrap}
.badge img{height:22px}
.divider{border:0;border-top:1px solid rgba(255,255,255,0.18);margin:6px 0 18px}
"""


# with gr.Blocks(theme=gr.themes.Base()) as demo:
#     gr.Markdown("""
#     <div>
#         <h2 style="font-size: 30px;text-align: center;">StableAvatar</h2>
#     </div>
#     """)
with gr.Blocks(theme=gr.themes.Base(), css=BANNER_CSS) as demo:
    gr.HTML(BANNER_HTML)

    language_radio = gr.Radio(
        choices=["English", "中文"],
        value="English",
        label="Language / 语言"
    )

    with gr.Accordion("Model Settings / 模型设置", open=False):
        with gr.Row():
            GPU_memory_mode = gr.Dropdown(
                label = "显存模式",
                info = "Normal占用25G显存,model_cpu_offload占用13G显存",
                choices = ["Normal", "model_cpu_offload", "model_cpu_offload_and_qfloat8", "sequential_cpu_offload"],
                value = "model_cpu_offload"
            )
            teacache_threshold = gr.Slider(label="teacache threshold", info = "推荐参数0.1,0为禁用teacache加速", minimum=0, maximum=1, step=0.01, value=0)
            num_skip_start_steps = gr.Slider(label="跳过开始步数", info = "推荐参数5", minimum=0, maximum=100, step=1, value=5)
        with gr.Row():
            model_version = gr.Dropdown(
                label = "模型版本",
                choices = ["square","rec_vec"],
                value = "square"
            )

    stableavatar_title = gr.TabItem(label="StableAvatar")
    with stableavatar_title:
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    image_path = gr.Image(label="上传图片", type="filepath", height=280)
                    audio_path = gr.Audio(label="上传音频", type="filepath")
                prompt = gr.Textbox(label="提示词", value="")
                negative_prompt = gr.Textbox(label="负面提示词", value="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走")
                generate_button = gr.Button("🎬 开始生成", variant='primary')
                parameter_settings_title = gr.Accordion(label="参数设置", open=True)
                with parameter_settings_title:
                    with gr.Row():
                        width = gr.Slider(label="宽度", minimum=256, maximum=2048, step=16, value=512)
                        height = gr.Slider(label="高度", minimum=256, maximum=2048, step=16, value=512)
                    with gr.Row():
                        exchange_button = gr.Button("🔄 交换宽高")
                        adjust_button = gr.Button("根据图片调整宽高")
                    with gr.Row():
                        guidance_scale = gr.Slider(label="guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=6.0)
                        num_inference_steps = gr.Slider(label="采样步数(推荐50步)", minimum=1, maximum=100, step=1, value=50)
                    with gr.Row():
                        text_guide_scale = gr.Slider(label="text guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=3.0)
                        audio_guide_scale = gr.Slider(label="audio guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=5.0)
                    with gr.Row():
                        motion_frame = gr.Slider(label="motion frame", minimum=1, maximum=50, step=1, value=25)
                        fps = gr.Slider(label="帧率", minimum=1, maximum=60, step=1, value=25)
                    with gr.Row():
                        overlap_window_length = gr.Slider(label="overlap window length", minimum=1, maximum=20, step=1, value=10)
                        seed_param = gr.Number(label="种子,请输入正整数,-1为随机", value=42)
                    with gr.Row():
                        overlapping_weight_scheme = gr.Dropdown(label="Overlapping Weight Scheme", choices=["uniform", "log"], value="uniform")
            with gr.Column():
                info = gr.Textbox(label="提示信息", interactive=False)
                video_output = gr.Video(label="生成结果", interactive=False)
                seed_output = gr.Textbox(label="种子")

        # The example-case section lives inside the StableAvatar tab
        example_cases_title = gr.Accordion(label="示例案例", open=True)
        with example_cases_title:
            example_title = gr.Markdown(value="### 选择以下示例案例进行测试:")
            with gr.Row():
                with gr.Column():
                    example1_label = gr.Markdown(value="**示例 1**")
                    example1_image = gr.Image(value="example_case/case-1/reference.png", label="", interactive=False, height=120, show_label=False)
                    example1_audio = gr.Audio(value="example_case/case-1/audio.wav", label="", interactive=False, show_label=False)
                    example1_btn = gr.Button("🚀 使用示例 1", variant="secondary", size="sm")

                with gr.Column():
                    example2_label = gr.Markdown(value="**示例 2**")
                    example2_image = gr.Image(value="example_case/case-2/reference.png", label="", interactive=False, height=120, show_label=False)
                    example2_audio = gr.Audio(value="example_case/case-2/audio.wav", label="", interactive=False, show_label=False)
                    example2_btn = gr.Button("🚀 使用示例 2", variant="secondary", size="sm")

                with gr.Column():
                    example3_label = gr.Markdown(value="**示例 3**")
                    example3_image = gr.Image(value="example_case/case-6/reference.png", label="", interactive=False, height=120, show_label=False)
                    example3_audio = gr.Audio(value="example_case/case-6/audio.wav", label="", interactive=False, show_label=False)
                    example3_btn = gr.Button("🚀 使用示例 3", variant="secondary", size="sm")

                with gr.Column():
                    example4_label = gr.Markdown(value="**示例 4**")
                    example4_image = gr.Image(value="example_case/case-45/reference.png", label="", interactive=False, height=120, show_label=False)
                    example4_audio = gr.Audio(value="example_case/case-45/audio.wav", label="", interactive=False, show_label=False)
                    example4_btn = gr.Button("🚀 使用示例 4", variant="secondary", size="sm")

                with gr.Column():
                    example5_label = gr.Markdown(value="**示例 5**")
                    example5_image = gr.Image(value="example_case/case-3/reference.jpg", label="", interactive=False, height=120, show_label=False)
                    example5_audio = gr.Audio(value="example_case/case-3/audio.wav", label="", interactive=False, show_label=False)
                    example5_btn = gr.Button("🚀 使用示例 5", variant="secondary", size="sm")

    audio_extraction_title = gr.TabItem(label="音频提取")
    with audio_extraction_title:
        with gr.Row():
            with gr.Column():
                video_path = gr.Video(label="上传视频", height=500)
                extractor_button = gr.Button("🎬 开始提取", variant='primary')
            with gr.Column():
                info2 = gr.Textbox(label="提示信息", interactive=False)
                audio_output = gr.Audio(label="生成结果", interactive=False)
                audio_file = gr.File(label="download audio file")

    vocal_separation_title = gr.TabItem(label="人声分离")
    with vocal_separation_title:
        with gr.Row():
            with gr.Column():
                audio_path3 = gr.Audio(label="上传音频", type="filepath")
                separation_button = gr.Button("🎬 开始分离", variant='primary')
            with gr.Column():
                info3 = gr.Textbox(label="提示信息", interactive=False)
                audio_output3 = gr.Audio(label="生成结果", interactive=False)
                audio_file3 = gr.File(label="download audio file")

    # (older layout) example-case section at the end of the page
    # example_cases_title = gr.Accordion(label="示例案例", open=True)
    # with example_cases_title:
    #     example_title = gr.Markdown(value="### 选择以下示例案例进行测试:")
    #     with gr.Row():
    #         with gr.Column():
    #             example1_label = gr.Markdown(value="**示例 1**")
    #             example1_image = gr.Image(value="example_case/case-1/reference.png", label="", interactive=False, height=120, show_label=False)
    #             example1_audio = gr.Audio(value="example_case/case-1/audio.wav", label="", interactive=False, show_label=False)
    #             example1_btn = gr.Button("🚀 使用示例 1", variant="secondary", size="sm")

    #         with gr.Column():
    #             example2_label = gr.Markdown(value="**示例 2**")
    #             example2_image = gr.Image(value="example_case/case-2/reference.png", label="", interactive=False, height=120, show_label=False)
    #             example2_audio = gr.Audio(value="example_case/case-2/audio.wav", label="", interactive=False, show_label=False)
    #             example2_btn = gr.Button("🚀 使用示例 2", variant="secondary", size="sm")

    #         with gr.Column():
    #             example3_label = gr.Markdown(value="**示例 3**")
    #             example3_image = gr.Image(value="example_case/case-6/reference.png", label="", interactive=False, height=120, show_label=False)
    #             example3_audio = gr.Audio(value="example_case/case-6/audio.wav", label="", interactive=False, show_label=False)
    #             example3_btn = gr.Button("🚀 使用示例 3", variant="secondary", size="sm")

    #         with gr.Column():
    #             example4_label = gr.Markdown(value="**示例 4**")
    #             example4_image = gr.Image(value="example_case/case-45/reference.png", label="", interactive=False, height=120, show_label=False)
    #             example4_audio = gr.Audio(value="example_case/case-45/audio.wav", label="", interactive=False, show_label=False)
    #             example4_btn = gr.Button("🚀 使用示例 4", variant="secondary", size="sm")

    #         with gr.Column():
    #             example5_label = gr.Markdown(value="**示例 5**")
    #             example5_image = gr.Image(value="example_case/case-3/reference.jpg", label="", interactive=False, height=120, show_label=False)
    #             example5_audio = gr.Audio(value="example_case/case-3/audio.wav", label="", interactive=False, show_label=False)
    #             example5_btn = gr.Button("🚀 使用示例 5", variant="secondary", size="sm")

    all_components = [GPU_memory_mode, teacache_threshold, num_skip_start_steps, model_version, image_path, audio_path, prompt, negative_prompt, generate_button, width, height, exchange_button, adjust_button, guidance_scale, num_inference_steps, text_guide_scale, audio_guide_scale, motion_frame, fps, overlap_window_length, seed_param, overlapping_weight_scheme, info, video_output, seed_output, video_path, extractor_button, info2, audio_output, audio_path3, separation_button, info3, audio_output3, example_title, example1_label, example2_label, example3_label, example4_label, example1_btn, example2_btn, example3_btn, example4_btn, example5_label, example5_btn, parameter_settings_title, example_cases_title, stableavatar_title, audio_extraction_title, vocal_separation_title]

    language_radio.change(
        fn=update_language,
        inputs=[language_radio],
        outputs=all_components
    )

    # Event handling for the model-version selector
    def on_model_version_change(model_version):
        """Reload the matching transformer weights when the model version changes."""
        result = load_transformer_model(model_version)
        if result is not None:
            return f"✅ 模型已切换到 {model_version} 版本"
        else:
            return f"❌ 模型切换失败,请检查文件是否存在"

    model_version.change(
        fn=on_model_version_change,
        inputs=[model_version],
        outputs=[info]
    )

    demo.load(fn=update_language, inputs=[language_radio], outputs=all_components)
    # Event handling for the example-case buttons
    def load_example1():
        try:
            with open("example_case/case-1/prompt.txt", "r", encoding="utf-8") as f:
                prompt_text = f.read().strip()
        except:
            prompt_text = ""
        return "example_case/case-1/reference.png", "example_case/case-1/audio.wav", prompt_text

    def load_example2():
        try:
            with open("example_case/case-2/prompt.txt", "r", encoding="utf-8") as f:
                prompt_text = f.read().strip()
        except:
            prompt_text = ""
        return "example_case/case-2/reference.png", "example_case/case-2/audio.wav", prompt_text

    def load_example3():
        try:
            with open("example_case/case-6/prompt.txt", "r", encoding="utf-8") as f:
                prompt_text = f.read().strip()
        except:
            prompt_text = ""
        return "example_case/case-6/reference.png", "example_case/case-6/audio.wav", prompt_text

    def load_example4():
        try:
            with open("example_case/case-45/prompt.txt", "r", encoding="utf-8") as f:
                prompt_text = f.read().strip()
        except:
            prompt_text = ""
        return "example_case/case-45/reference.png", "example_case/case-45/audio.wav", prompt_text

    def load_example5():
        try:
            with open("example_case/case-3/prompt.txt", "r", encoding="utf-8") as f:
                prompt_text = f.read().strip()
        except:
            prompt_text = ""
        return "example_case/case-3/reference.jpg", "example_case/case-3/audio.wav", prompt_text

    example1_btn.click(fn=load_example1, outputs=[image_path, audio_path, prompt])
    example2_btn.click(fn=load_example2, outputs=[image_path, audio_path, prompt])
    example3_btn.click(fn=load_example3, outputs=[image_path, audio_path, prompt])
    example4_btn.click(fn=load_example4, outputs=[image_path, audio_path, prompt])
    example5_btn.click(fn=load_example5, outputs=[image_path, audio_path, prompt])
    gr.on(
        triggers=[generate_button.click, prompt.submit, negative_prompt.submit],
        fn = generate,
        inputs = [
            GPU_memory_mode,
            teacache_threshold,
            num_skip_start_steps,
            image_path,
            audio_path,
            prompt,
            negative_prompt,
            width,
            height,
            guidance_scale,
            num_inference_steps,
            text_guide_scale,
            audio_guide_scale,
            motion_frame,
            fps,
            overlap_window_length,
            seed_param,
            overlapping_weight_scheme,
        ],
        outputs = [video_output, seed_output, info]
    )
    exchange_button.click(
        fn=exchange_width_height,
        inputs=[width, height],
        outputs=[width, height, info]
    )
    adjust_button.click(
        fn=adjust_width_height,
        inputs=[image_path],
        outputs=[width, height, info]
    )
    extractor_button.click(
        fn=audio_extractor,
        inputs=[video_path],
        outputs=[audio_output, info2, audio_file]
    )
    separation_button.click(
        fn=vocal_separation,
        inputs=[audio_path3],
        outputs=[audio_output3, info3, audio_file3]
    )


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", 7860)),
        share=False,
        inbrowser=False,
    )
audio_extractor.py
ADDED
@@ -0,0 +1,14 @@
import os
from moviepy.editor import VideoFileClip
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--video_path", type=str)
    parser.add_argument("--saved_audio_path", type=str)
    args = parser.parse_args()
    video_path = args.video_path
    saved_audio_path = args.saved_audio_path
    video = VideoFileClip(video_path)
    audio = video.audio
    audio.write_audiofile(saved_audio_path, codec='pcm_s16le')
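Editor's note: a possible invocation of the script above, shown as a sketch; the file paths are placeholders. It dumps a video's audio track to 16-bit PCM WAV, mirroring what the "Audio Extraction" tab in app.py does.

import subprocess
subprocess.run(
    ["python", "audio_extractor.py",
     "--video_path", "path/to/talking_video.mp4",
     "--saved_audio_path", "outputs/talking_video.wav"],
    check=True,
)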
deepspeed_config/wan2.1/wan_civitai.yaml
ADDED
@@ -0,0 +1,39 @@
format: civitai
pipeline: Wan
transformer_additional_kwargs:
  transformer_subpath: ./
  dict_mapping:
    in_dim: in_channels
    dim: hidden_size

vae_kwargs:
  vae_subpath: Wan2.1_VAE.pth
  temporal_compression_ratio: 4
  spatial_compression_ratio: 8

text_encoder_kwargs:
  text_encoder_subpath: models_t5_umt5-xxl-enc-bf16.pth
  tokenizer_subpath: google/umt5-xxl
  text_length: 512
  vocab: 256384
  dim: 4096
  dim_attn: 4096
  dim_ffn: 10240
  num_heads: 64
  num_layers: 24
  num_buckets: 32
  shared_pos: False
  dropout: 0.0

scheduler_kwargs:
  scheduler_subpath: null
  num_train_timesteps: 1000
  shift: 5.0
  use_dynamic_shifting: false
  base_shift: 0.5
  max_shift: 1.15
  base_image_seq_len: 256
  max_image_seq_len: 4096

image_encoder_kwargs:
  image_encoder_subpath: models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bf16": {
|
| 3 |
+
"enabled": "auto"
|
| 4 |
+
},
|
| 5 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 6 |
+
"train_batch_size": "auto",
|
| 7 |
+
"gradient_accumulation_steps": "auto",
|
| 8 |
+
"zero_optimization": {
|
| 9 |
+
"stage": 2,
|
| 10 |
+
"overlap_comm": true,
|
| 11 |
+
"contiguous_gradients": true,
|
| 12 |
+
"sub_group_size": 1e9,
|
| 13 |
+
"offload_optimizer": {
|
| 14 |
+
"device": "cpu",
|
| 15 |
+
"pin_memory": true
|
| 16 |
+
}
|
| 17 |
+
},
|
| 18 |
+
|
| 19 |
+
"optimizer": {
|
| 20 |
+
"type": "AdamW",
|
| 21 |
+
"params": {
|
| 22 |
+
"lr": 5e-5,
|
| 23 |
+
"betas": [0.9, 0.95],
|
| 24 |
+
"weight_decay": 0.01
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"scheduler": {
|
| 28 |
+
"type": "WarmupDecayLR",
|
| 29 |
+
"params": {
|
| 30 |
+
"warmup_min_lr": 1e-6,
|
| 31 |
+
"warmup_max_lr": 5e-5,
|
| 32 |
+
"total_num_steps": 10000
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
}
|
deepspeed_config/zero_stage2_config.json
ADDED
@@ -0,0 +1,35 @@
{
  "bf16": {
    "enabled": true
  },
  "train_micro_batch_size_per_gpu": 1,
  "train_batch_size": 64,
  "gradient_clipping": 1.0,
  "gradient_accumulation_steps": 1,
  "dump_state": true,
  "zero_optimization": {
    "stage": 2,
    "allgather_partitions": true,
    "allgather_bucket_size": 2e8,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 1e8,
    "contiguous_gradients": true
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": 1e-4,
      "betas": [0.9, 0.999],
      "weight_decay": 3e-2
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 1e-7,
      "warmup_max_lr": 1e-4,
      "warmup_num_steps": 100
    }
  }
}
deepspeed_config/zero_stage3_config.json
ADDED
@@ -0,0 +1,46 @@
{
  "bf16": {
    "enabled": true
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": 2e-5,
      "betas": [0.9, 0.999],
      "weight_decay": 3e-2
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 1e-7,
      "warmup_max_lr": 2e-5,
      "warmup_num_steps": 6400
    }
  },
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 1,
  "train_batch_size": 64,
  "gradient_clipping": 1.0,
  "steps_per_print": 2000,
  "wall_clock_breakdown": false,
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "reduce_bucket_size": 5e8,
    "sub_group_size": 1e9,
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_gather_16bit_weights_on_model_save": "auto",
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    }
  }
}
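Editor's note: a quick consistency check for the fixed-batch DeepSpeed configs above, shown as a sketch rather than part of the commit. DeepSpeed requires train_batch_size = micro_batch x gradient_accumulation x world_size; the world size of 64 is taken from the accelerate configs earlier in this commit.

import json

with open("deepspeed_config/zero_stage3_config.json") as f:
    ds = json.load(f)

world_size = 64  # num_processes in accelerate_config_machine_14B_multiple.yaml
assert ds["train_batch_size"] == ds["train_micro_batch_size_per_gpu"] * ds["gradient_accumulation_steps"] * world_size
print(ds["zero_optimization"]["stage"], ds["scheduler"]["params"]["warmup_num_steps"])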
example_case/case-1/audio.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d12a8745971f1472c1ac5b3e3e5349163be7555b187ef3ad3cc4718393174458
size 17645370

example_case/case-1/prompt.txt
ADDED
@@ -0,0 +1 @@
Front-facing head-and-shoulders close-up of a middle-aged woman with short light brown hair, pearl earrings, and a blue blazer under soft studio lighting – She delivers a clear, confident speech with precise lip movements, steady gaze toward the camera, subtle eyebrow emphasis, slight nods, and occasional blinks while maintaining composed posture – Blurred civic architecture in the background resembling a government building, shallow depth of field, static camera.

example_case/case-1/reference.png
ADDED (binary image tracked by Git LFS)

example_case/case-2/audio.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eb004cdfc7ba33e44c4128c555f43bbfdd88049a8937b1c5585db56efd59da15
size 2568018

example_case/case-2/prompt.txt
ADDED
@@ -0,0 +1 @@
Front-facing head-and-shoulders close-up of a middle-aged man with a shaved head, thin-rim glasses, and a striped shirt under soft warm lighting – He speaks clearly and thoughtfully with precise lip-sync, subtle eyebrow movement, slight nods, and occasional blinks while maintaining a steady posture – Indoor studio with blurred shutters and two warm pendant lights, shallow depth of field, and a static camera.

example_case/case-2/reference.png
ADDED (binary image tracked by Git LFS)

example_case/case-3/audio.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d2162d8ca2e9ff692c132683c9f197cc73c84a6bb6cd8a3ed5aeefbc4711ad87
size 168014

example_case/case-3/prompt.txt
ADDED
@@ -0,0 +1 @@
Front-facing head-and-shoulders close-up of an adult woman with wavy dark brown hair and silver hoop earrings under soft warm lighting – She sings “there once was a ship that put to sea, the name of the ship was the Billy” with precise lip-sync, steady tempo, subtle head sway, gentle eyebrow lifts, and occasional blinks while maintaining a composed posture – Indoor studio with a softly blurred background and warm bokeh, shallow depth of field, static camera.

example_case/case-3/reference.jpg
ADDED (binary image tracked by Git LFS)

example_case/case-45/audio.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cd5db4a3d4a970a51729aff7001ac34ffb13c486c62c7e44759023403121db66
size 3076494

example_case/case-45/prompt.txt
ADDED
@@ -0,0 +1 @@
Front-facing medium close-up of a young woman with long silver hair, elf-like ears, a cozy oversized light blue scarf, and a white outfit under soft daylight – She sings a sweet, lighthearted melody with precise lip-sync, a gentle smile, relaxed breathing, subtle head sway, and natural blinks while maintaining a warm and calm demeanor – Cozy indoor room with soft light, bed and curtain in the background, shallow depth of field, static camera.

example_case/case-45/reference.png
ADDED (binary image tracked by Git LFS)

example_case/case-6/audio.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c4a1460a58d3a7662cb17494e99edc590bbab2254c80fbbd8d5c0ce327645c39
size 5821278

example_case/case-6/prompt.txt
ADDED
@@ -0,0 +1 @@
Front-facing medium close-up of a young woman with shoulder-length dark hair, wearing a white top and small hoop earrings, a studio microphone visible in the lower left under soft daylight – She sings smoothly with precise lip-sync, relaxed breathing, gentle head sway, subtle eyebrow emphasis, and natural blinks while maintaining a calm posture – Minimal indoor setting with a light gray wall and decorative molding, diagonal light and soft shadows, shallow depth of field, static camera.

example_case/case-6/reference.png
ADDED (binary image tracked by Git LFS)
extract_audio_segment.py
ADDED
@@ -0,0 +1,146 @@
#!/usr/bin/env python3
"""
Audio conversion and segment extraction tool.
Converts an MP3 file to WAV format and extracts the audio segment for a given time range.
"""

import os
import subprocess
from pathlib import Path

def convert_mp3_to_wav_and_extract(input_file, start_time, end_time, output_dir=None):
    """
    Convert an MP3 file to WAV format and extract the audio segment for a given time range.

    Args:
        input_file (str): path to the input MP3 file
        start_time (float): start time in seconds
        end_time (float): end time in seconds
        output_dir (str): output directory; if None, the input file's directory is used

    Returns:
        bool: whether the operation succeeded
    """
    try:
        # Check that the input file exists
        if not os.path.exists(input_file):
            print(f"❌ Error: input file does not exist: {input_file}")
            return False

        # Set the output directory
        if output_dir is None:
            output_dir = os.path.dirname(input_file)

        # Make sure the output directory exists
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        # Build the output file names
        input_name = Path(input_file).stem
        output_wav = os.path.join(output_dir, f"{input_name}.wav")
        output_segment = os.path.join(output_dir, f"{input_name}_segment_{start_time}s_to_{end_time}s.wav")

        print(f"🎵 Processing audio file: {input_file}")
        print(f"📁 Output directory: {output_dir}")

        # Step 1: convert MP3 to WAV
        print(f"\n🔄 Step 1: convert MP3 to WAV")
        convert_cmd = [
            'ffmpeg',
            '-y',                   # overwrite output files
            '-i', input_file,       # input file
            '-ar', '16000',         # 16 kHz sample rate
            '-ac', '1',             # mono
            '-c:a', 'pcm_s16le',    # 16-bit PCM encoding
            output_wav              # output file
        ]

        print(f"Running command: {' '.join(convert_cmd)}")
        result = subprocess.run(convert_cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print(f"❌ MP3-to-WAV conversion failed: {result.stderr}")
            return False

        print(f"✅ MP3-to-WAV conversion succeeded: {output_wav}")

        # Step 2: extract the audio segment
        print(f"\n🔄 Step 2: extract audio segment ({start_time}s - {end_time}s)")
        duration = end_time - start_time

        extract_cmd = [
            'ffmpeg',
            '-y',                    # overwrite output files
            '-i', output_wav,        # input WAV file
            '-ss', str(start_time),  # start time
            '-t', str(duration),     # duration
            '-c', 'copy',            # copy the stream without re-encoding
            output_segment           # output segment file
        ]

        print(f"Running command: {' '.join(extract_cmd)}")
        result = subprocess.run(extract_cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print(f"❌ Audio segment extraction failed: {result.stderr}")
            return False

        print(f"✅ Audio segment extraction succeeded: {output_segment}")

        # Show file information
        print(f"\n📊 File information:")
        print(f"Original MP3 file: {input_file}")
        print(f"Converted WAV file: {output_wav}")
        print(f"Extracted segment: {output_segment}")
        print(f"Segment duration: {duration:.1f}s")

        # Check output file sizes
        if os.path.exists(output_wav):
            wav_size = os.path.getsize(output_wav) / 1024  # KB
            print(f"WAV file size: {wav_size:.1f} KB")

        if os.path.exists(output_segment):
            segment_size = os.path.getsize(output_segment) / 1024  # KB
            print(f"Segment file size: {segment_size:.1f} KB")

        return True

    except Exception as e:
        print(f"❌ An error occurred during processing: {str(e)}")
        return False

def main():
    """Entry point."""
    print("🎵 Audio conversion and segment extraction tool")
    print("=" * 50)

    # Check that ffmpeg is installed
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
        print("✅ ffmpeg detected")
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("❌ Error: ffmpeg not found, please install ffmpeg first")
        print("Ubuntu/Debian: sudo apt install ffmpeg")
        print("CentOS/RHEL: sudo yum install ffmpeg")
        print("macOS: brew install ffmpeg")
        return

    # Set the file path and time parameters
    input_file = "/home/t2vg-a100-G4-42/v-shuyuantu/StableAvatar/example_case/case-3/ssvid.net--Wellerman-Female-Cover-LYRICS-Sea-Shanty.mp3"
    start_time = 1.9  # start time in seconds
    end_time = 7.1    # end time in seconds

    print(f"📁 Input file: {input_file}")
    print(f"⏰ Time range to extract: {start_time}s - {end_time}s")
    print(f"⏱️ Segment duration: {end_time - start_time:.1f}s")

    # Run the conversion and extraction
    success = convert_mp3_to_wav_and_extract(input_file, start_time, end_time)

    if success:
        print(f"\n🎉 All operations completed!")
        print(f"Output files saved in: {os.path.dirname(input_file)}")
    else:
        print(f"\n❌ Operation failed, please check the error messages")

if __name__ == "__main__":
    main()
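Note: a hypothetical call of the helper above; the MP3 path and the time window are placeholders.

# Hypothetical usage sketch: convert a local MP3 and cut out a 1.9s-7.1s window.
from extract_audio_segment import convert_mp3_to_wav_and_extract

ok = convert_mp3_to_wav_and_extract("some_song.mp3", start_time=1.9, end_time=7.1)
print("done" if ok else "failed")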
lip_mask_extractor.py
ADDED
@@ -0,0 +1,70 @@
import argparse
import os

import cv2
import mediapipe as mp
import numpy as np

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--folder_root", type=str)
    parser.add_argument("--start", type=int, help="Specify the value of start")
    parser.add_argument("--end", type=int, help="Specify the value of end")
    args = parser.parse_args()

    folder_root = args.folder_root
    start = args.start
    end = args.end

    mp_face_mesh = mp.solutions.face_mesh
    face_mesh = mp_face_mesh.FaceMesh(static_image_mode=True, max_num_faces=10)

    upper_lip_idx = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291]
    lower_lip_idx = [61, 146, 91, 181, 84, 17, 314, 405, 321, 375, 291]

    for idx in range(start, end):
        subfolder = str(idx).zfill(5)
        subfolder_path = os.path.join(folder_root, subfolder)
        images_folder = os.path.join(subfolder_path, "images")
        if os.path.exists(images_folder):
            face_masks_folder = os.path.join(subfolder_path, "lip_masks")
            os.makedirs(face_masks_folder, exist_ok=True)
            for root, dirs, files in os.walk(images_folder):
                for file in files:
                    if file.endswith('.png'):
                        file_name = os.path.splitext(file)[0]
                        image_name = file_name + '.png'
                        image_legal_path = os.path.join(images_folder, image_name)
                        if os.path.exists(os.path.join(face_masks_folder, file_name + '.png')):
                            existed_path = os.path.join(face_masks_folder, file_name + '.png')
                            print(f"{existed_path} already exists!")
                            continue

                        face_save_path = os.path.join(face_masks_folder, file_name + '.png')

                        image = cv2.imread(image_legal_path)
                        h, w, _ = image.shape
                        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                        results = face_mesh.process(rgb_image)
                        mask = np.zeros((h, w), dtype=np.uint8)

                        if results.multi_face_landmarks:
                            for face_landmarks in results.multi_face_landmarks:
                                upper_points = np.array([
                                    [int(face_landmarks.landmark[i].x * w), int(face_landmarks.landmark[i].y * h)]
                                    for i in upper_lip_idx
                                ], dtype=np.int32)
                                lower_points = np.array([
                                    [int(face_landmarks.landmark[i].x * w), int(face_landmarks.landmark[i].y * h)]
                                    for i in lower_lip_idx
                                ], dtype=np.int32)
                                cv2.fillPoly(mask, [upper_points], 255)
                                cv2.fillPoly(mask, [lower_points], 255)
                        else:
                            print(f"No face detected in {image_legal_path}. Saving empty mask.")
                        cv2.imwrite(face_save_path, mask)
                        print(f"Lip mask saved to {face_save_path}")
        else:
            print(f"{images_folder} does not exist")
            continue
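Note: the script above expects a dataset root whose subfolders are zero-padded clip indices (00000, 00001, ...) each containing an images/ directory, and it writes the masks to lip_masks/. A hypothetical invocation for the first 100 clips (the dataset path is a placeholder):

# Hypothetical invocation: build lip masks for clips 0-99.
import subprocess

subprocess.run([
    "python", "lip_mask_extractor.py",
    "--folder_root", "/path/to/dataset_root",
    "--start", "0",
    "--end", "100",
], check=True)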
requirements.txt
ADDED
@@ -0,0 +1,170 @@
absl-py==2.3.1
accelerate==1.10.0
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.12.15
aiosignal==1.4.0
albucore==0.0.24
albumentations==2.0.8
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.10.0
attrs==25.3.0
audio-separator==0.36.1
audioread==3.0.1
av==15.0.0
beartype==0.18.5
beautifulsoup4==4.13.4
Brotli==1.1.0
certifi==2025.8.3
cffi==1.17.1
charset-normalizer==3.4.3
click==8.2.1
coloredlogs==15.0.1
cryptography==45.0.6
Cython==3.1.3
dashscope==1.24.1
datasets==4.0.0
decorator==4.4.2
decord==0.6.0
diffq==0.2.4
diffusers==0.30.1
dill==0.3.8
easydict==1.13
einops==0.8.1
fastapi==0.116.1
ffmpy==0.6.1
filelock==3.13.1
flatbuffers==25.2.10
frozenlist==1.7.0
fsspec==2024.6.1
ftfy==6.3.1
gradio==5.42.0
gradio_client==1.11.1
groovy==0.1.2
grpcio==1.74.0
h11==0.16.0
hf-xet==1.1.7
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.34.4
humanfriendly==10.0
idna==3.10
imageio==2.37.0
imageio-ffmpeg==0.6.0
importlib_metadata==8.7.0
Jinja2==3.1.4
joblib==1.5.1
julius==0.2.7
lazy_loader==0.4
librosa==0.11.0
llvmlite==0.44.0
Markdown==3.8.2
markdown-it-py==4.0.0
MarkupSafe==2.1.5
mdurl==0.1.2
ml_collections==1.1.0
ml_dtypes==0.5.3
moviepy==1.0.3
mpmath==1.3.0
msgpack==1.1.1
multidict==6.6.4
multiprocess==0.70.16
networkx==3.3
ninja==1.13.0
numba==0.61.2
numpy==2.2.6
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
omegaconf==2.3.0
onnx-weekly==1.19.0.dev20250726
onnx2torch-py313==1.6.0
onnxruntime-gpu==1.22.0
opencv-python==4.11.0.86
opencv-python-headless==4.11.0.86
orjson==3.11.2
packaging==25.0
pandas==2.3.1
pillow==11.0.0
platformdirs==4.3.8
pooch==1.8.2
proglog==0.1.12
propcache==0.3.2
protobuf==6.31.1
psutil==7.0.0
pyarrow==21.0.0
pycparser==2.22
pydantic==2.11.7
pydantic_core==2.33.2
pydub==0.25.1
Pygments==2.19.2
python-dateutil==2.9.0.post0
python-dotenv==1.1.1
python-multipart==0.0.20
pytz==2025.2
PyYAML==6.0.2
regex==2025.7.34
requests==2.32.4
resampy==0.4.3
rich==14.1.0
rotary-embedding-torch==0.6.5
ruff==0.12.8
safehttpx==0.1.6
safetensors==0.6.2
samplerate==0.1.0
scikit-image==0.25.2
scikit-learn==1.7.1
scipy==1.16.1
semantic-version==2.10.0
sentencepiece==0.2.1
shellingham==1.5.4
simsimd==6.5.0
six==1.17.0
sniffio==1.3.1
soundfile==0.13.1
soupsieve==2.7
soxr==0.5.0.post1
starlette==0.47.2
stringzilla==3.12.6
sympy==1.13.1
tensorboard==2.20.0
tensorboard-data-server==0.7.2
threadpoolctl==3.6.0
tifffile==2025.6.11
timm==1.0.19
tokenizers==0.21.4
tomesd==0.1.3
tomlkit==0.13.3
torch==2.6.0+cu124
torchaudio==2.6.0+cu124
torchdiffeq==0.2.5
torchsde==0.2.6
torchvision==0.21.0+cu124
tqdm==4.67.1
trampoline==0.1.2
transformers==4.51.3
triton==3.2.0
typer==0.16.0
typing-inspection==0.4.1
typing_extensions==4.12.2
tzdata==2025.2
urllib3==2.5.0
uvicorn==0.35.0
wcwidth==0.2.13
websocket-client==1.8.0
websockets==15.0.1
Werkzeug==3.1.3
xxhash==3.5.0
yarl==1.20.1
zipp==3.23.0
vocal_seperator.py
ADDED
@@ -0,0 +1,31 @@
import argparse
import os
import shutil
import subprocess
from audio_separator.separator import Separator


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--audio_file_path", type=str)
    parser.add_argument("--saved_vocal_path", type=str)
    parser.add_argument("--audio_separator_model_file", type=str)
    args = parser.parse_args()
    audio_file_path = args.audio_file_path
    audio_separator_model_file = args.audio_separator_model_file
    saved_vocal_path = args.saved_vocal_path
    cache_dir = os.path.join(os.path.dirname(audio_file_path), "vocals")
    os.makedirs(cache_dir, exist_ok=True)
    audio_separator = Separator(
        output_dir=cache_dir,
        output_single_stem="vocals",
        model_file_dir=os.path.dirname(audio_separator_model_file),
    )
    audio_separator.load_model(os.path.basename(audio_separator_model_file))
    assert audio_separator.model_instance is not None, "Failed to load the audio separator model."
    outputs = audio_separator.separate(audio_file_path)
    subfolder_path = os.path.dirname(audio_file_path)
    vocal_audio_file = os.path.join(audio_separator.output_dir, outputs[0])
    destination_file = os.path.join(subfolder_path, "vocal.wav")
    shutil.copy(vocal_audio_file, destination_file)
    os.remove(vocal_audio_file)
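Note: a hypothetical invocation of the separator script above; all three paths are placeholders. The extracted stem is copied next to the input as vocal.wav.

# Hypothetical invocation (placeholder paths and model file).
import subprocess

subprocess.run([
    "python", "vocal_seperator.py",
    "--audio_file_path", "/path/to/clip/audio.wav",
    "--saved_vocal_path", "/path/to/clip/vocal.wav",
    "--audio_separator_model_file", "/path/to/separator_model.ckpt",
], check=True)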
wan/__init__.py
ADDED
@@ -0,0 +1,3 @@
# from . import configs, distributed, modules
# from .image2video import WanI2V
# from .text2video import WanT2V
wan/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (175 Bytes).
wan/configs/__init__.py
ADDED
@@ -0,0 +1,42 @@
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
import copy
import os

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

from .wan_i2v_14B import i2v_14B
from .wan_t2v_1_3B import t2v_1_3B
from .wan_t2v_14B import t2v_14B

# the config of t2i_14B is the same as t2v_14B
t2i_14B = copy.deepcopy(t2v_14B)
t2i_14B.__name__ = 'Config: Wan T2I 14B'

WAN_CONFIGS = {
    't2v-14B': t2v_14B,
    't2v-1.3B': t2v_1_3B,
    'i2v-14B': i2v_14B,
    't2i-14B': t2i_14B,
}

SIZE_CONFIGS = {
    '720*1280': (720, 1280),
    '1280*720': (1280, 720),
    '480*832': (480, 832),
    '832*480': (832, 480),
    '1024*1024': (1024, 1024),
}

MAX_AREA_CONFIGS = {
    '720*1280': 720 * 1280,
    '1280*720': 1280 * 720,
    '480*832': 480 * 832,
    '832*480': 832 * 480,
}

SUPPORTED_SIZES = {
    't2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
    't2v-1.3B': ('480*832', '832*480'),
    'i2v-14B': ('720*1280', '1280*720', '480*832', '832*480'),
    't2i-14B': tuple(SIZE_CONFIGS.keys()),
}
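Note: these tables are plain dictionary lookups keyed by task name and by a size string such as '832*480'. A small sketch of how they are typically read:

# Sketch: look up a task config and a resolution entry from the tables above.
from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, SUPPORTED_SIZES, MAX_AREA_CONFIGS

task, size = "i2v-14B", "832*480"
assert size in SUPPORTED_SIZES[task]
cfg = WAN_CONFIGS[task]
print(cfg.dim, cfg.num_layers, SIZE_CONFIGS[size], MAX_AREA_CONFIGS[size])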
wan/configs/shared_config.py
ADDED
@@ -0,0 +1,19 @@
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
import torch
from easydict import EasyDict

#------------------------ Wan shared config ------------------------#
wan_shared_cfg = EasyDict()

# t5
wan_shared_cfg.t5_model = 'umt5_xxl'
wan_shared_cfg.t5_dtype = torch.bfloat16
wan_shared_cfg.text_len = 512

# transformer
wan_shared_cfg.param_dtype = torch.bfloat16

# inference
wan_shared_cfg.num_train_timesteps = 1000
wan_shared_cfg.sample_fps = 16
# Default negative prompt (Chinese); roughly: "vivid colors, overexposed, static, blurry details,
# subtitles, style, artwork, painting, still frame, overall gray, worst quality, low quality, JPEG
# artifacts, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed,
# disfigured, malformed limbs, fused fingers, motionless frame, cluttered background, three legs,
# crowded background, walking backwards".
wan_shared_cfg.sample_neg_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
wan/configs/wan_i2v_14B.py
ADDED
@@ -0,0 +1,35 @@
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
import torch
from easydict import EasyDict

from .shared_config import wan_shared_cfg

#------------------------ Wan I2V 14B ------------------------#

i2v_14B = EasyDict(__name__='Config: Wan I2V 14B')
i2v_14B.update(wan_shared_cfg)

i2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
i2v_14B.t5_tokenizer = 'google/umt5-xxl'

# clip
i2v_14B.clip_model = 'clip_xlm_roberta_vit_h_14'
i2v_14B.clip_dtype = torch.float16
i2v_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'
i2v_14B.clip_tokenizer = 'xlm-roberta-large'

# vae
i2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
i2v_14B.vae_stride = (4, 8, 8)

# transformer
i2v_14B.patch_size = (1, 2, 2)
i2v_14B.dim = 5120
i2v_14B.ffn_dim = 13824
i2v_14B.freq_dim = 256
i2v_14B.num_heads = 40
i2v_14B.num_layers = 40
i2v_14B.window_size = (-1, -1)
i2v_14B.qk_norm = True
i2v_14B.cross_attn_norm = True
i2v_14B.eps = 1e-6
wan/configs/wan_t2v_14B.py
ADDED
@@ -0,0 +1,29 @@
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
from easydict import EasyDict

from .shared_config import wan_shared_cfg

#------------------------ Wan T2V 14B ------------------------#

t2v_14B = EasyDict(__name__='Config: Wan T2V 14B')
t2v_14B.update(wan_shared_cfg)

# t5
t2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
t2v_14B.t5_tokenizer = 'google/umt5-xxl'

# vae
t2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
t2v_14B.vae_stride = (4, 8, 8)

# transformer
t2v_14B.patch_size = (1, 2, 2)
t2v_14B.dim = 5120
t2v_14B.ffn_dim = 13824
t2v_14B.freq_dim = 256
t2v_14B.num_heads = 40
t2v_14B.num_layers = 40
t2v_14B.window_size = (-1, -1)
t2v_14B.qk_norm = True
t2v_14B.cross_attn_norm = True
t2v_14B.eps = 1e-6
wan/configs/wan_t2v_1_3B.py
ADDED
@@ -0,0 +1,29 @@
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
from easydict import EasyDict

from .shared_config import wan_shared_cfg

#------------------------ Wan T2V 1.3B ------------------------#

t2v_1_3B = EasyDict(__name__='Config: Wan T2V 1.3B')
t2v_1_3B.update(wan_shared_cfg)

# t5
t2v_1_3B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
t2v_1_3B.t5_tokenizer = 'google/umt5-xxl'

# vae
t2v_1_3B.vae_checkpoint = 'Wan2.1_VAE.pth'
t2v_1_3B.vae_stride = (4, 8, 8)

# transformer
t2v_1_3B.patch_size = (1, 2, 2)
t2v_1_3B.dim = 1536
t2v_1_3B.ffn_dim = 8960
t2v_1_3B.freq_dim = 256
t2v_1_3B.num_heads = 12
t2v_1_3B.num_layers = 30
t2v_1_3B.window_size = (-1, -1)
t2v_1_3B.qk_norm = True
t2v_1_3B.cross_attn_norm = True
t2v_1_3B.eps = 1e-6
wan/dataset/talking_video_dataset_fantasy.py
ADDED
@@ -0,0 +1,328 @@
import math
import os
import random
import warnings
import librosa
import numpy as np
import torch
from PIL import Image
import cv2
from einops import rearrange
import torchvision.transforms.functional as TF
from torch.utils.data.dataset import Dataset
import torch.nn.functional as F


def get_random_mask(shape, image_start_only=False):
    f, c, h, w = shape
    mask = torch.zeros((f, 1, h, w), dtype=torch.uint8)

    if not image_start_only:
        if f != 1:
            mask_index = np.random.choice([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], p=[0.05, 0.2, 0.2, 0.2, 0.05, 0.05, 0.05, 0.1, 0.05, 0.05])
        else:
            mask_index = np.random.choice([0, 1], p=[0.2, 0.8])
        if mask_index == 0:
            center_x = torch.randint(0, w, (1,)).item()
            center_y = torch.randint(0, h, (1,)).item()
            block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item()  # width range of the masked block
            block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item()  # height range of the masked block

            start_x = max(center_x - block_size_x // 2, 0)
            end_x = min(center_x + block_size_x // 2, w)
            start_y = max(center_y - block_size_y // 2, 0)
            end_y = min(center_y + block_size_y // 2, h)
            mask[:, :, start_y:end_y, start_x:end_x] = 1
        elif mask_index == 1:
            mask[:, :, :, :] = 1
        elif mask_index == 2:
            mask_frame_index = np.random.randint(1, 5)
            mask[mask_frame_index:, :, :, :] = 1
        elif mask_index == 3:
            mask_frame_index = np.random.randint(1, 5)
            mask[mask_frame_index:-mask_frame_index, :, :, :] = 1
        elif mask_index == 4:
            center_x = torch.randint(0, w, (1,)).item()
            center_y = torch.randint(0, h, (1,)).item()
            block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item()  # width range of the masked block
            block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item()  # height range of the masked block

            start_x = max(center_x - block_size_x // 2, 0)
            end_x = min(center_x + block_size_x // 2, w)
            start_y = max(center_y - block_size_y // 2, 0)
            end_y = min(center_y + block_size_y // 2, h)

            mask_frame_before = np.random.randint(0, f // 2)
            mask_frame_after = np.random.randint(f // 2, f)
            mask[mask_frame_before:mask_frame_after, :, start_y:end_y, start_x:end_x] = 1
        elif mask_index == 5:
            mask = torch.randint(0, 2, (f, 1, h, w), dtype=torch.uint8)
        elif mask_index == 6:
            num_frames_to_mask = random.randint(1, max(f // 2, 1))
            frames_to_mask = random.sample(range(f), num_frames_to_mask)

            for i in frames_to_mask:
                block_height = random.randint(1, h // 4)
                block_width = random.randint(1, w // 4)
                top_left_y = random.randint(0, h - block_height)
                top_left_x = random.randint(0, w - block_width)
                mask[i, 0, top_left_y:top_left_y + block_height, top_left_x:top_left_x + block_width] = 1
        elif mask_index == 7:
            center_x = torch.randint(0, w, (1,)).item()
            center_y = torch.randint(0, h, (1,)).item()
            a = torch.randint(min(w, h) // 8, min(w, h) // 4, (1,)).item()  # semi-major axis
            b = torch.randint(min(h, w) // 8, min(h, w) // 4, (1,)).item()  # semi-minor axis

            for i in range(h):
                for j in range(w):
                    if ((i - center_y) ** 2) / (b ** 2) + ((j - center_x) ** 2) / (a ** 2) < 1:
                        mask[:, :, i, j] = 1
        elif mask_index == 8:
            center_x = torch.randint(0, w, (1,)).item()
            center_y = torch.randint(0, h, (1,)).item()
            radius = torch.randint(min(h, w) // 8, min(h, w) // 4, (1,)).item()
            for i in range(h):
                for j in range(w):
                    if (i - center_y) ** 2 + (j - center_x) ** 2 < radius ** 2:
                        mask[:, :, i, j] = 1
        elif mask_index == 9:
            for idx in range(f):
                if np.random.rand() > 0.5:
                    mask[idx, :, :, :] = 1
        else:
            raise ValueError(f"The mask_index {mask_index} is not defined")
    else:
        if f != 1:
            mask[1:, :, :, :] = 1
        else:
            mask[:, :, :, :] = 1
    return mask


class LargeScaleTalkingFantasyVideos(Dataset):
    def __init__(self, txt_path, width, height, n_sample_frames, sample_frame_rate, only_last_features=False, vocal_encoder=None, audio_encoder=None, vocal_sample_rate=16000, audio_sample_rate=24000, enable_inpaint=True, audio_margin=2, vae_stride=None, patch_size=None, wav2vec_processor=None, wav2vec=None):
        self.txt_path = txt_path
        self.width = width
        self.height = height
        self.n_sample_frames = n_sample_frames
        self.sample_frame_rate = sample_frame_rate
        self.only_last_features = only_last_features
        self.vocal_encoder = vocal_encoder
        self.audio_encoder = audio_encoder
        self.vocal_sample_rate = vocal_sample_rate
        self.audio_sample_rate = audio_sample_rate
        self.enable_inpaint = enable_inpaint
        self.wav2vec_processor = wav2vec_processor
        self.audio_margin = audio_margin
        self.vae_stride = vae_stride
        self.patch_size = patch_size
        self.max_area = height * width
        self.aspect_ratio = height / width
        self.video_files = self._read_txt_file_images()

        self.lat_h = round(
            np.sqrt(self.max_area * self.aspect_ratio) // self.vae_stride[1] //
            self.patch_size[1] * self.patch_size[1])
        self.lat_w = round(
            np.sqrt(self.max_area / self.aspect_ratio) // self.vae_stride[2] //
            self.patch_size[2] * self.patch_size[2])

    def _read_txt_file_images(self):
        with open(self.txt_path, 'r') as file:
            lines = file.readlines()
            video_files = []
            for line in lines:
                video_file = line.strip()
                video_files.append(video_file)
        return video_files

    def __len__(self):
        return len(self.video_files)

    def frame_count(self, frames_path):
        files = os.listdir(frames_path)
        png_files = [file for file in files if file.endswith('.png') or file.endswith('.jpg')]
        png_files_count = len(png_files)
        return png_files_count

    def find_frames_list(self, frames_path):
        files = os.listdir(frames_path)
        image_files = [file for file in files if file.endswith('.png') or file.endswith('.jpg')]
        if image_files[0].startswith('frame_'):
            image_files.sort(key=lambda x: int(x.split('_')[1].split('.')[0]))
        else:
            image_files.sort(key=lambda x: int(x.split('.')[0]))
        return image_files

    def __getitem__(self, idx):
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        warnings.filterwarnings('ignore', category=FutureWarning)

        video_path = os.path.join(self.video_files[idx], "sub_clip.mp4")
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        try:
            is_0_fps = 2 / fps
        except Exception as e:
            print(f"The fps of {video_path} is 0 !!!")
        vocal_audio_path = os.path.join(self.video_files[idx], "audio.wav")
        vocal_duration = librosa.get_duration(filename=vocal_audio_path)
        frames_path = os.path.join(self.video_files[idx], "images")
        total_frame_number = self.frame_count(frames_path)
        fps = total_frame_number / vocal_duration
        print(f"The calculated fps of {video_path} is {fps} !!!")
        # idx = random.randint(0, len(self.video_files) - 1)
        # video_path = os.path.join(self.video_files[idx], "sub_clip.mp4")
        # cap = cv2.VideoCapture(video_path)
        # fps = cap.get(cv2.CAP_PROP_FPS)

        frames_path = os.path.join(self.video_files[idx], "images")

        face_masks_path = os.path.join(self.video_files[idx], "face_masks")
        lip_masks_path = os.path.join(self.video_files[idx], "lip_masks")
        raw_audio_path = os.path.join(self.video_files[idx], "audio.wav")
        # vocal_audio_path = os.path.join(self.video_files[idx], "vocal.wav")
        vocal_audio_path = os.path.join(self.video_files[idx], "audio.wav")
        video_length = self.frame_count(frames_path)
        frames_list = self.find_frames_list(frames_path)

        clip_length = min(video_length, (self.n_sample_frames - 1) * self.sample_frame_rate + 1)

        start_idx = random.randint(0, video_length - clip_length)
        batch_index = np.linspace(
            start_idx, start_idx + clip_length - 1, self.n_sample_frames, dtype=int
        ).tolist()
        all_indices = list(range(0, video_length))
        reference_frame_idx = random.choice(all_indices)

        tgt_pil_image_list = []
        tgt_face_masks_list = []
        tgt_lip_masks_list = []

        # reference_frame_path = os.path.join(frames_path, frames_list[reference_frame_idx])
        reference_frame_path = os.path.join(frames_path, frames_list[start_idx])
        reference_pil_image = Image.open(reference_frame_path).convert('RGB')
        reference_pil_image = reference_pil_image.resize((self.width, self.height))
        reference_pil_image = torch.from_numpy(np.array(reference_pil_image)).float()
        reference_pil_image = reference_pil_image / 127.5 - 1

        for index in batch_index:
            tgt_img_path = os.path.join(frames_path, frames_list[index])
            # file_name = os.path.splitext(os.path.basename(tgt_img_path))[0]
            file_name = os.path.basename(tgt_img_path)
            face_mask_path = os.path.join(face_masks_path, file_name)
            lip_mask_path = os.path.join(lip_masks_path, file_name)
            try:
                tgt_img_pil = Image.open(tgt_img_path).convert('RGB')
            except Exception as e:
                print(f"Failed to load the image: {tgt_img_path}")

            try:
                tgt_lip_mask = Image.open(lip_mask_path)
                # tgt_lip_mask = Image.open(lip_mask_path).convert('RGB')
                tgt_lip_mask = tgt_lip_mask.resize((self.width, self.height))
                tgt_lip_mask = torch.from_numpy(np.array(tgt_lip_mask)).float()
                # tgt_lip_mask = tgt_lip_mask / 127.5 - 1
                tgt_lip_mask = tgt_lip_mask / 255
            except Exception as e:
                print(f"Failed to load the lip masks: {lip_mask_path}")
                tgt_lip_mask = torch.ones(self.height, self.width)
                # tgt_lip_mask = torch.ones(self.height, self.width, 3)
            tgt_lip_masks_list.append(tgt_lip_mask)

            try:
                tgt_face_mask = Image.open(face_mask_path)
                # tgt_face_mask = Image.open(face_mask_path).convert('RGB')
                tgt_face_mask = tgt_face_mask.resize((self.width, self.height))
                tgt_face_mask = torch.from_numpy(np.array(tgt_face_mask)).float()
                tgt_face_mask = tgt_face_mask / 255
                # tgt_face_mask = tgt_face_mask / 127.5 - 1
            except Exception as e:
                print(f"Failed to load the face masks: {face_mask_path}")
                tgt_face_mask = torch.ones(self.height, self.width)
                # tgt_face_mask = torch.ones(self.height, self.width, 3)
            tgt_face_masks_list.append(tgt_face_mask)

            tgt_img_pil = tgt_img_pil.resize((self.width, self.height))
            tgt_img_tensor = torch.from_numpy(np.array(tgt_img_pil)).float()
            tgt_img_normalized = tgt_img_tensor / 127.5 - 1
            tgt_pil_image_list.append(tgt_img_normalized)

        sr = 16000
        vocal_input, sample_rate = librosa.load(vocal_audio_path, sr=sr)
        vocal_duration = librosa.get_duration(filename=vocal_audio_path)
        start_time = batch_index[0] / fps
        end_time = (clip_length / fps) + start_time
        start_sample = int(start_time * sr)
        end_sample = int(end_time * sr)
        try:
            vocal_segment = vocal_input[start_sample:end_sample]
        except:
            print(f"The current vocal segment is too short: {vocal_audio_path}, [{batch_index[0]}, {batch_index[-1]}], fps={fps}, clip_length={clip_length}, vocal_duration={vocal_duration}], [{start_time}, {end_time}]")
            vocal_segment = vocal_input[start_sample:]
        vocal_input_values = self.wav2vec_processor(
            vocal_segment, sampling_rate=sample_rate, return_tensors="pt"
        ).input_values

        tgt_pil_image_list = torch.stack(tgt_pil_image_list, dim=0)
        tgt_pil_image_list = rearrange(tgt_pil_image_list, "f h w c -> f c h w")
        reference_pil_image = rearrange(reference_pil_image, "h w c -> c h w")

        tgt_face_masks_list = torch.stack(tgt_face_masks_list, dim=0)
        tgt_face_masks_list = torch.unsqueeze(tgt_face_masks_list, dim=-1)
        tgt_face_masks_list = rearrange(tgt_face_masks_list, "f h w c -> c f h w")
        tgt_lip_masks_list = torch.stack(tgt_lip_masks_list, dim=0)
        tgt_lip_masks_list = torch.unsqueeze(tgt_lip_masks_list, dim=-1)
        tgt_lip_masks_list = rearrange(tgt_lip_masks_list, "f h w c -> c f h w")

        clip_pixel_values = reference_pil_image.permute(1, 2, 0).contiguous()
        clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255

        cos_similarities = []
        stride = 8
        for i in range(0, tgt_pil_image_list.size()[0] - stride, stride):
            frame1 = tgt_pil_image_list[i]
            frame2 = tgt_pil_image_list[i + stride]
            frame1_flat = frame1.contiguous().view(-1)
            frame2_flat = frame2.contiguous().view(-1)
            cos_sim = F.cosine_similarity(frame1_flat, frame2_flat, dim=0)
            cos_sim = (cos_sim + 1) / 2
            cos_similarities.append(cos_sim.item())
        overall_cos_sim = F.cosine_similarity(tgt_pil_image_list[0].contiguous().view(-1), tgt_pil_image_list[-1].contiguous().view(-1), dim=0)
        overall_cos_sim = (overall_cos_sim + 1) / 2
        cos_similarities.append(overall_cos_sim.item())
        motion_id = (1.0 - sum(cos_similarities) / len(cos_similarities)) * 100

        if "singing" in self.video_files[idx]:
            text_prompt = "The protagonist is singing"
        elif "speech" in self.video_files[idx]:
            text_prompt = "The protagonist is talking"
        elif "dancing" in self.video_files[idx]:
            text_prompt = "The protagonist is simultaneously dancing and singing"
        else:
            text_prompt = ""
            print(1 / 0)  # deliberately raises on clips that match no known category

        sample = dict(
            pixel_values=tgt_pil_image_list,
            reference_image=reference_pil_image,
            clip_pixel_values=clip_pixel_values,
            tgt_face_masks=tgt_face_masks_list,
            vocal_input_values=vocal_input_values,
            text_prompt=text_prompt,
            motion_id=motion_id,
            tgt_lip_masks=tgt_lip_masks_list,
            audio_path=raw_audio_path,
        )

        if self.enable_inpaint:
            pixel_value_masks = get_random_mask(tgt_pil_image_list.size(), image_start_only=True)
            masked_pixel_values = tgt_pil_image_list * (1 - pixel_value_masks)
            sample["masked_pixel_values"] = masked_pixel_values
            sample["pixel_value_masks"] = pixel_value_masks

        return sample
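Note: a hedged sketch of how this dataset might be instantiated; the clip-list file, the resolution, the frame count, and the Wav2Vec2 checkpoint are placeholders, and wiring up the processor is an assumption of the training script, not something defined in this file.

# Hypothetical setup (placeholders throughout): the txt file lists one clip folder per line.
from torch.utils.data import DataLoader
from transformers import Wav2Vec2Processor
from wan.dataset.talking_video_dataset_fantasy import LargeScaleTalkingFantasyVideos

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
dataset = LargeScaleTalkingFantasyVideos(
    txt_path="train_clips.txt", width=480, height=832,
    n_sample_frames=81, sample_frame_rate=1,
    vae_stride=(4, 8, 8), patch_size=(1, 2, 2),
    wav2vec_processor=processor,
)
loader = DataLoader(dataset, batch_size=1, shuffle=True)
sample = next(iter(loader))
print(sample["pixel_values"].shape, sample["text_prompt"])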
wan/dist/__init__.py
ADDED
@@ -0,0 +1,40 @@
import torch
import torch.distributed as dist

try:
    import xfuser
    from xfuser.core.distributed import (get_sequence_parallel_rank,
                                         get_sequence_parallel_world_size,
                                         get_sp_group, get_world_group,
                                         init_distributed_environment,
                                         initialize_model_parallel)
    from xfuser.core.long_ctx_attention import xFuserLongContextAttention
except Exception as ex:
    get_sequence_parallel_world_size = None
    get_sequence_parallel_rank = None
    xFuserLongContextAttention = None
    get_sp_group = None
    get_world_group = None
    init_distributed_environment = None
    initialize_model_parallel = None

def set_multi_gpus_devices(ulysses_degree, ring_degree):
    if ulysses_degree > 1 or ring_degree > 1:
        if get_sp_group is None:
            raise RuntimeError("xfuser is not installed.")
        dist.init_process_group("nccl")
        print('parallel inference enabled: ulysses_degree=%d ring_degree=%d rank=%d world_size=%d' % (
            ulysses_degree, ring_degree, dist.get_rank(),
            dist.get_world_size()))
        assert dist.get_world_size() == ring_degree * ulysses_degree, \
            "number of GPUs(%d) should be equal to ring_degree * ulysses_degree." % dist.get_world_size()
        init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size())
        initialize_model_parallel(sequence_parallel_degree=dist.get_world_size(),
                                  ring_degree=ring_degree,
                                  ulysses_degree=ulysses_degree)
        # device = torch.device("cuda:%d" % dist.get_rank())
        device = torch.device(f"cuda:{get_world_group().local_rank}")
        print('rank=%d device=%s' % (get_world_group().rank, str(device)))
    else:
        device = "cuda"
    return device
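Note: with both degrees left at 1 the helper above simply returns "cuda"; larger degrees assume an xfuser install and a torchrun-style multi-process launch.

# Sketch: pick the device for (optionally sequence-parallel) inference.
from wan.dist import set_multi_gpus_devices

device = set_multi_gpus_devices(ulysses_degree=1, ring_degree=1)
print(device)  # "cuda" in the single-GPU case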
wan/dist/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (2.52 kB).
wan/dist/__pycache__/wan_xfuser.cpython-311.pyc
ADDED
Binary file (6 kB).
wan/dist/wan_xfuser.py
ADDED
@@ -0,0 +1,115 @@
import torch
import torch.amp as amp

try:
    import xfuser
    from xfuser.core.distributed import (get_sequence_parallel_rank,
                                         get_sequence_parallel_world_size,
                                         get_sp_group,
                                         init_distributed_environment,
                                         initialize_model_parallel)
    from xfuser.core.long_ctx_attention import xFuserLongContextAttention
except Exception as ex:
    get_sequence_parallel_world_size = None
    get_sequence_parallel_rank = None
    xFuserLongContextAttention = None
    get_sp_group = None
    init_distributed_environment = None
    initialize_model_parallel = None

def pad_freqs(original_tensor, target_len):
    seq_len, s1, s2 = original_tensor.shape
    pad_size = target_len - seq_len
    padding_tensor = torch.ones(
        pad_size,
        s1,
        s2,
        dtype=original_tensor.dtype,
        device=original_tensor.device)
    padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
    return padded_tensor

@amp.autocast('cuda', enabled=False)
def rope_apply(x, grid_sizes, freqs):
    """
    x: [B, L, N, C].
    grid_sizes: [B, 3].
    freqs: [M, C // 2].
    """
    s, n, c = x.size(1), x.size(2), x.size(3) // 2
    # split freqs
    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)

    # loop over samples
    output = []
    for i, (f, h, w) in enumerate(grid_sizes.tolist()):
        seq_len = f * h * w

        # precompute multipliers
        x_i = torch.view_as_complex(x[i, :s].to(torch.float32).reshape(
            s, n, -1, 2))
        freqs_i = torch.cat([
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
        ],
                            dim=-1).reshape(seq_len, 1, -1)

        # apply rotary embedding
        sp_size = get_sequence_parallel_world_size()
        sp_rank = get_sequence_parallel_rank()
        freqs_i = pad_freqs(freqs_i, s * sp_size)
        s_per_rank = s
        freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) *
                                                       s_per_rank), :, :]
        x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
        x_i = torch.cat([x_i, x[i, s:]])

        # append to collection
        output.append(x_i)
    return torch.stack(output)

def usp_attn_forward(self,
                     x,
                     seq_lens,
                     grid_sizes,
                     freqs,
                     dtype=torch.bfloat16):
    b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
    half_dtypes = (torch.float16, torch.bfloat16)

    def half(x):
        return x if x.dtype in half_dtypes else x.to(dtype)

    # query, key, value function
    def qkv_fn(x):
        q = self.norm_q(self.q(x)).view(b, s, n, d)
        k = self.norm_k(self.k(x)).view(b, s, n, d)
        v = self.v(x).view(b, s, n, d)
        return q, k, v

    q, k, v = qkv_fn(x)
    q = rope_apply(q, grid_sizes, freqs)
    k = rope_apply(k, grid_sizes, freqs)

    # TODO: We should use unpaded q,k,v for attention.
    # k_lens = seq_lens // get_sequence_parallel_world_size()
    # if k_lens is not None:
    #     q = torch.cat([u[:l] for u, l in zip(q, k_lens)]).unsqueeze(0)
    #     k = torch.cat([u[:l] for u, l in zip(k, k_lens)]).unsqueeze(0)
    #     v = torch.cat([u[:l] for u, l in zip(v, k_lens)]).unsqueeze(0)

    x = xFuserLongContextAttention()(
        None,
        query=half(q),
        key=half(k),
        value=half(v),
        window_size=self.window_size)

    # TODO: padding after attention.
    # x = torch.cat([x, x.new_zeros(b, s - x.size(1), n, d)], dim=1)

    # output
    x = x.flatten(2)
    x = self.o(x)
    return x
wan/distributed/__init__.py
ADDED
File without changes
wan/distributed/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (183 Bytes).
wan/distributed/__pycache__/fsdp.cpython-311.pyc
ADDED
Binary file (2.05 kB).
wan/distributed/fsdp.py
ADDED
@@ -0,0 +1,41 @@
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
import gc
from functools import partial

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
from torch.distributed.utils import _free_storage

def shard_model(
    model,
    device_id,
    param_dtype=torch.bfloat16,
    reduce_dtype=torch.float32,
    buffer_dtype=torch.float32,
    process_group=None,
    sharding_strategy=ShardingStrategy.FULL_SHARD,
    sync_module_states=True,
):
    model = FSDP(
        module=model,
        process_group=process_group,
        sharding_strategy=sharding_strategy,
        auto_wrap_policy=partial(
            lambda_auto_wrap_policy, lambda_fn=lambda m: m in model.blocks),
        mixed_precision=MixedPrecision(
            param_dtype=param_dtype,
            reduce_dtype=reduce_dtype,
            buffer_dtype=buffer_dtype),
        device_id=device_id,
        sync_module_states=sync_module_states)
    return model

def free_model(model):
    for m in model.modules():
        if isinstance(m, FSDP):
            _free_storage(m._handle.flat_param.data)
    del model
    gc.collect()
    torch.cuda.empty_cache()
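Note: shard_model keys its auto-wrap policy on membership in model.blocks, so it expects a module that exposes a blocks list. The toy module below is only an illustration, and the actual call is left commented because it needs an initialized NCCL process group.

# Hedged sketch: a stand-in module with a `.blocks` ModuleList, which is what
# the lambda auto-wrap policy above keys on.
import torch.nn as nn

class TinyBlocksModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.blocks = nn.ModuleList(nn.Linear(8, 8) for _ in range(2))

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        return x

# Under an initialized process group (assumed), each block gets wrapped by FSDP:
# from wan.distributed.fsdp import shard_model
# model = shard_model(TinyBlocksModel().cuda(), device_id=torch.cuda.current_device())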
wan/distributed/xdit_context_parallel.py
ADDED
|
@@ -0,0 +1,192 @@
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import torch
+import torch.cuda.amp as amp
+from xfuser.core.distributed import (get_sequence_parallel_rank,
+                                     get_sequence_parallel_world_size,
+                                     get_sp_group)
+from xfuser.core.long_ctx_attention import xFuserLongContextAttention
+
+from ..modules.model import sinusoidal_embedding_1d
+
+
+def pad_freqs(original_tensor, target_len):
+    seq_len, s1, s2 = original_tensor.shape
+    pad_size = target_len - seq_len
+    padding_tensor = torch.ones(
+        pad_size,
+        s1,
+        s2,
+        dtype=original_tensor.dtype,
+        device=original_tensor.device)
+    padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
+    return padded_tensor
+
+
+@amp.autocast(enabled=False)
+def rope_apply(x, grid_sizes, freqs):
+    """
+    x:          [B, L, N, C].
+    grid_sizes: [B, 3].
+    freqs:      [M, C // 2].
+    """
+    s, n, c = x.size(1), x.size(2), x.size(3) // 2
+    # split freqs
+    freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
+
+    # loop over samples
+    output = []
+    for i, (f, h, w) in enumerate(grid_sizes.tolist()):
+        seq_len = f * h * w
+
+        # precompute multipliers
+        x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(
+            s, n, -1, 2))
+        freqs_i = torch.cat([
+            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
+            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
+            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
+        ],
+                            dim=-1).reshape(seq_len, 1, -1)
+
+        # apply rotary embedding
+        sp_size = get_sequence_parallel_world_size()
+        sp_rank = get_sequence_parallel_rank()
+        freqs_i = pad_freqs(freqs_i, s * sp_size)
+        s_per_rank = s
+        freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) *
+                                                       s_per_rank), :, :]
+        x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
+        x_i = torch.cat([x_i, x[i, s:]])
+
+        # append to collection
+        output.append(x_i)
+    return torch.stack(output).float()
+
+
+def usp_dit_forward(
+    self,
+    x,
+    t,
+    context,
+    seq_len,
+    clip_fea=None,
+    y=None,
+):
+    """
+    x:              A list of videos each with shape [C, T, H, W].
+    t:              [B].
+    context:        A list of text embeddings each with shape [L, C].
+    """
+    if self.model_type == 'i2v':
+        assert clip_fea is not None and y is not None
+    # params
+    device = self.patch_embedding.weight.device
+    if self.freqs.device != device:
+        self.freqs = self.freqs.to(device)
+
+    if y is not None:
+        x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
+
+    # embeddings
+    x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
+    grid_sizes = torch.stack(
+        [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
+    x = [u.flatten(2).transpose(1, 2) for u in x]
+    seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
+    assert seq_lens.max() <= seq_len
+    x = torch.cat([
+        torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
+        for u in x
+    ])
+
+    # time embeddings
+    with amp.autocast(dtype=torch.float32):
+        e = self.time_embedding(
+            sinusoidal_embedding_1d(self.freq_dim, t).float())
+        e0 = self.time_projection(e).unflatten(1, (6, self.dim))
+        assert e.dtype == torch.float32 and e0.dtype == torch.float32
+
+    # context
+    context_lens = None
+    context = self.text_embedding(
+        torch.stack([
+            torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
+            for u in context
+        ]))
+
+    if clip_fea is not None:
+        context_clip = self.img_emb(clip_fea)  # bs x 257 x dim
+        context = torch.concat([context_clip, context], dim=1)
+
+    # arguments
+    kwargs = dict(
+        e=e0,
+        seq_lens=seq_lens,
+        grid_sizes=grid_sizes,
+        freqs=self.freqs,
+        context=context,
+        context_lens=context_lens)
+
+    # Context Parallel
+    x = torch.chunk(
+        x, get_sequence_parallel_world_size(),
+        dim=1)[get_sequence_parallel_rank()]
+
+    for block in self.blocks:
+        x = block(x, **kwargs)
+
+    # head
+    x = self.head(x, e)
+
+    # Context Parallel
+    x = get_sp_group().all_gather(x, dim=1)
+
+    # unpatchify
+    x = self.unpatchify(x, grid_sizes)
+    return [u.float() for u in x]
+
+
+def usp_attn_forward(self,
+                     x,
+                     seq_lens,
+                     grid_sizes,
+                     freqs,
+                     dtype=torch.bfloat16):
+    b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
+    half_dtypes = (torch.float16, torch.bfloat16)
+
+    def half(x):
+        return x if x.dtype in half_dtypes else x.to(dtype)
+
+    # query, key, value function
+    def qkv_fn(x):
+        q = self.norm_q(self.q(x)).view(b, s, n, d)
+        k = self.norm_k(self.k(x)).view(b, s, n, d)
+        v = self.v(x).view(b, s, n, d)
+        return q, k, v
+
+    q, k, v = qkv_fn(x)
+    q = rope_apply(q, grid_sizes, freqs)
+    k = rope_apply(k, grid_sizes, freqs)
+
+    # TODO: We should use unpadded q, k, v for attention.
+    # k_lens = seq_lens // get_sequence_parallel_world_size()
+    # if k_lens is not None:
+    #     q = torch.cat([u[:l] for u, l in zip(q, k_lens)]).unsqueeze(0)
+    #     k = torch.cat([u[:l] for u, l in zip(k, k_lens)]).unsqueeze(0)
+    #     v = torch.cat([u[:l] for u, l in zip(v, k_lens)]).unsqueeze(0)
+
+    x = xFuserLongContextAttention()(
+        None,
+        query=half(q),
+        key=half(k),
+        value=half(v),
+        window_size=self.window_size)
+
+    # TODO: padding after attention.
+    # x = torch.cat([x, x.new_zeros(b, s - x.size(1), n, d)], dim=1)
+
+    # output
+    x = x.flatten(2)
+    x = self.o(x)
+    return x
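A note on the forward above: the context-parallel split is a plain, even `torch.chunk` along the token dimension, which is why callers round the sequence length up to a multiple of the sequence-parallel world size before dispatch. A toy, single-process sketch of that arithmetic (all numbers below are made up for illustration, not taken from the commit):

import math
import torch

world_size = 4          # hypothetical sequence-parallel group size
seq_len = 10            # toy number of patch tokens before padding
dim = 8                 # toy channel dimension

padded_len = math.ceil(seq_len / world_size) * world_size   # 12
x = torch.zeros(1, padded_len, dim)                         # [B, L, C] after padding

shards = torch.chunk(x, world_size, dim=1)
assert all(s.shape[1] == padded_len // world_size for s in shards)
# Each rank runs the transformer blocks on one such shard;
# get_sp_group().all_gather(..., dim=1) then reassembles the full sequence
# before unpatchify.
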
wan/image2video.py
ADDED
@@ -0,0 +1,334 @@
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import gc
+import logging
+import math
+import os
+import random
+import sys
+import types
+from contextlib import contextmanager
+from functools import partial
+
+import numpy as np
+import torch
+import torch.cuda.amp as amp
+import torch.distributed as dist
+import torchvision.transforms.functional as TF
+from tqdm import tqdm
+
+from .distributed.fsdp import shard_model
+from .modules.clip import CLIPModel
+from .modules.model import WanModel
+from .modules.t5 import T5EncoderModel
+from .modules.vae import WanVAE
+from .utils.fm_solvers import (FlowDPMSolverMultistepScheduler,
+                               get_sampling_sigmas, retrieve_timesteps)
+from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
+
+
+class WanI2V:
+
+    def __init__(
+        self,
+        config,
+        checkpoint_dir,
+        device_id=0,
+        rank=0,
+        t5_fsdp=False,
+        dit_fsdp=False,
+        use_usp=False,
+        t5_cpu=False,
+        init_on_cpu=True,
+    ):
+        r"""
+        Initializes the image-to-video generation model components.
+
+        Args:
+            config (EasyDict):
+                Object containing model parameters initialized from config.py
+            checkpoint_dir (`str`):
+                Path to directory containing model checkpoints
+            device_id (`int`, *optional*, defaults to 0):
+                Id of target GPU device
+            rank (`int`, *optional*, defaults to 0):
+                Process rank for distributed training
+            t5_fsdp (`bool`, *optional*, defaults to False):
+                Enable FSDP sharding for T5 model
+            dit_fsdp (`bool`, *optional*, defaults to False):
+                Enable FSDP sharding for DiT model
+            use_usp (`bool`, *optional*, defaults to False):
+                Enable distribution strategy of USP.
+            t5_cpu (`bool`, *optional*, defaults to False):
+                Whether to place T5 model on CPU. Only works without t5_fsdp.
+            init_on_cpu (`bool`, *optional*, defaults to True):
+                Enable initializing Transformer Model on CPU. Only works without FSDP or USP.
+        """
+        self.device = torch.device(f"cuda:{device_id}")
+        self.config = config
+        self.rank = rank
+        self.use_usp = use_usp
+        self.t5_cpu = t5_cpu
+
+        self.num_train_timesteps = config.num_train_timesteps
+        self.param_dtype = config.param_dtype
+
+        shard_fn = partial(shard_model, device_id=device_id)
+        self.text_encoder = T5EncoderModel(
+            text_len=config.text_len,
+            dtype=config.t5_dtype,
+            device=torch.device('cpu'),
+            checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
+            tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
+            shard_fn=shard_fn if t5_fsdp else None,
+        )
+
+        self.vae_stride = config.vae_stride
+        self.patch_size = config.patch_size
+        self.vae = WanVAE(
+            vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
+            device=self.device)
+
+        self.clip = CLIPModel(
+            dtype=config.clip_dtype,
+            device=self.device,
+            checkpoint_path=os.path.join(checkpoint_dir,
+                                         config.clip_checkpoint),
+            tokenizer_path=os.path.join(checkpoint_dir, config.clip_tokenizer))
+
+        logging.info(f"Creating WanModel from {checkpoint_dir}")
+        self.model = WanModel.from_pretrained(checkpoint_dir)
+        self.model.eval().requires_grad_(False)
+
+        if t5_fsdp or dit_fsdp or use_usp:
+            init_on_cpu = False
+
+        if use_usp:
+            from xfuser.core.distributed import \
+                get_sequence_parallel_world_size
+
+            from .distributed.xdit_context_parallel import (usp_attn_forward,
+                                                            usp_dit_forward)
+            for block in self.model.blocks:
+                block.self_attn.forward = types.MethodType(usp_attn_forward, block.self_attn)
+            self.model.forward = types.MethodType(usp_dit_forward, self.model)
+            self.sp_size = get_sequence_parallel_world_size()
+        else:
+            self.sp_size = 1
+
+        if dist.is_initialized():
+            dist.barrier()
+        if dit_fsdp:
+            self.model = shard_fn(self.model)
+        else:
+            if not init_on_cpu:
+                self.model.to(self.device)
+
+        self.sample_neg_prompt = config.sample_neg_prompt
+
+    def generate(self,
+                 input_prompt,
+                 img,
+                 max_area=720 * 1280,
+                 frame_num=81,
+                 shift=5.0,
+                 sample_solver='unipc',
+                 sampling_steps=40,
+                 guide_scale=5.0,
+                 n_prompt="",
+                 seed=-1,
+                 offload_model=True):
+        r"""
+        Generates video frames from input image and text prompt using diffusion process.
+
+        Args:
+            input_prompt (`str`):
+                Text prompt for content generation.
+            img (PIL.Image.Image):
+                Input reference image. Shape after conversion: [3, H, W]
+            max_area (`int`, *optional*, defaults to 720*1280):
+                Maximum pixel area for latent space calculation. Controls video resolution scaling
+            frame_num (`int`, *optional*, defaults to 81):
+                How many frames to sample from a video. The number should be 4n+1
+            shift (`float`, *optional*, defaults to 5.0):
+                Noise schedule shift parameter. Affects temporal dynamics
+                [NOTE]: If you want to generate a 480p video, it is recommended to set the shift value to 3.0.
+            sample_solver (`str`, *optional*, defaults to 'unipc'):
+                Solver used to sample the video.
+            sampling_steps (`int`, *optional*, defaults to 40):
+                Number of diffusion sampling steps. Higher values improve quality but slow generation
+            guide_scale (`float`, *optional*, defaults to 5.0):
+                Classifier-free guidance scale. Controls prompt adherence vs. creativity
+            n_prompt (`str`, *optional*, defaults to ""):
+                Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
+            seed (`int`, *optional*, defaults to -1):
+                Random seed for noise generation. If -1, use random seed
+            offload_model (`bool`, *optional*, defaults to True):
+                If True, offloads models to CPU during generation to save VRAM
+
+        Returns:
+            torch.Tensor:
+                Generated video frames tensor. Dimensions: (C, N, H, W) where:
+                - C: Color channels (3 for RGB)
+                - N: Number of frames (81)
+                - H: Frame height (from max_area)
+                - W: Frame width (from max_area)
+        """
+        img = TF.to_tensor(img).sub_(0.5).div_(0.5).to(self.device)
+
+        F = frame_num
+        h, w = img.shape[1:]
+        aspect_ratio = h / w
+        lat_h = round(
+            np.sqrt(max_area * aspect_ratio) // self.vae_stride[1] //
+            self.patch_size[1] * self.patch_size[1])
+        lat_w = round(
+            np.sqrt(max_area / aspect_ratio) // self.vae_stride[2] //
+            self.patch_size[2] * self.patch_size[2])
+        h = lat_h * self.vae_stride[1]
+        w = lat_w * self.vae_stride[2]
+
+        max_seq_len = ((F - 1) // self.vae_stride[0] + 1) * lat_h * lat_w // (self.patch_size[1] * self.patch_size[2])
+        max_seq_len = int(math.ceil(max_seq_len / self.sp_size)) * self.sp_size
+
+        seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
+        seed_g = torch.Generator(device=self.device)
+        seed_g.manual_seed(seed)
+        noise = torch.randn(
+            16,
+            21,
+            lat_h,
+            lat_w,
+            dtype=torch.float32,
+            generator=seed_g,
+            device=self.device)
+
+        msk = torch.ones(1, 81, lat_h, lat_w, device=self.device)
+        msk[:, 1:] = 0
+        msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
+        msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
+        msk = msk.transpose(1, 2)[0]
+
+        if n_prompt == "":
+            n_prompt = self.sample_neg_prompt
+
+        # preprocess
+        if not self.t5_cpu:
+            self.text_encoder.model.to(self.device)
+            context = self.text_encoder([input_prompt], self.device)
+            context_null = self.text_encoder([n_prompt], self.device)
+            if offload_model:
+                self.text_encoder.model.cpu()
+        else:
+            context = self.text_encoder([input_prompt], torch.device('cpu'))
+            context_null = self.text_encoder([n_prompt], torch.device('cpu'))
+            context = [t.to(self.device) for t in context]
+            context_null = [t.to(self.device) for t in context_null]
+
+        self.clip.model.to(self.device)
+        clip_context = self.clip.visual([img[:, None, :, :]])
+        if offload_model:
+            self.clip.model.cpu()
+
+        y = self.vae.encode([torch.concat([torch.nn.functional.interpolate(img[None].cpu(), size=(h, w), mode='bicubic').transpose(0, 1), torch.zeros(3, 80, h, w)], dim=1).to(self.device)])[0]
+        y = torch.concat([msk, y])
+
+        @contextmanager
+        def noop_no_sync():
+            yield
+
+        no_sync = getattr(self.model, 'no_sync', noop_no_sync)
+
+        # evaluation mode
+        with amp.autocast(dtype=self.param_dtype), torch.no_grad(), no_sync():
+
+            if sample_solver == 'unipc':
+                sample_scheduler = FlowUniPCMultistepScheduler(
+                    num_train_timesteps=self.num_train_timesteps,
+                    shift=1,
+                    use_dynamic_shifting=False)
+                sample_scheduler.set_timesteps(
+                    sampling_steps, device=self.device, shift=shift)
+                timesteps = sample_scheduler.timesteps
+            elif sample_solver == 'dpm++':
+                sample_scheduler = FlowDPMSolverMultistepScheduler(
+                    num_train_timesteps=self.num_train_timesteps,
+                    shift=1,
+                    use_dynamic_shifting=False)
+                sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
+                timesteps, _ = retrieve_timesteps(
+                    sample_scheduler,
+                    device=self.device,
+                    sigmas=sampling_sigmas)
+            else:
+                raise NotImplementedError("Unsupported solver.")
+
+            # sample videos
+            latent = noise
+
+            arg_c = {
+                'context': [context[0]],
+                'clip_fea': clip_context,
+                'seq_len': max_seq_len,
+                'y': [y],
+            }
+
+            arg_null = {
+                'context': context_null,
+                'clip_fea': clip_context,
+                'seq_len': max_seq_len,
+                'y': [y],
+            }
+
+            if offload_model:
+                torch.cuda.empty_cache()
+
+            self.model.to(self.device)
+            for _, t in enumerate(tqdm(timesteps)):
+                latent_model_input = [latent.to(self.device)]
+                timestep = [t]
+
+                timestep = torch.stack(timestep).to(self.device)
+
+                noise_pred_cond = self.model(
+                    latent_model_input, t=timestep, **arg_c)[0].to(
+                        torch.device('cpu') if offload_model else self.device)
+                if offload_model:
+                    torch.cuda.empty_cache()
+                noise_pred_uncond = self.model(
+                    latent_model_input, t=timestep, **arg_null)[0].to(
+                        torch.device('cpu') if offload_model else self.device)
+                if offload_model:
+                    torch.cuda.empty_cache()
+                noise_pred = noise_pred_uncond + guide_scale * (
+                    noise_pred_cond - noise_pred_uncond)
+
+                latent = latent.to(
+                    torch.device('cpu') if offload_model else self.device)
+
+                temp_x0 = sample_scheduler.step(
+                    noise_pred.unsqueeze(0),
+                    t,
+                    latent.unsqueeze(0),
+                    return_dict=False,
+                    generator=seed_g)[0]
+                latent = temp_x0.squeeze(0)
+
+                x0 = [latent.to(self.device)]
+                del latent_model_input, timestep
+
+            if offload_model:
+                self.model.cpu()
+                torch.cuda.empty_cache()
+
+            if self.rank == 0:
+                videos = self.vae.decode(x0)
+
+        del noise, latent
+        del sample_scheduler
+        if offload_model:
+            gc.collect()
+            torch.cuda.synchronize()
+        if dist.is_initialized():
+            dist.barrier()
+
+        return videos[0] if self.rank == 0 else None
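Putting the docstrings above together, a single-GPU call might look roughly like the sketch below. The `WAN_CONFIGS` registry key and the checkpoint directory are assumptions (they are not shipped with this commit); only the example reference image comes from the repository's example_case folder.

from PIL import Image

from wan.configs import WAN_CONFIGS          # assumed config registry
from wan.image2video import WanI2V

pipeline = WanI2V(
    config=WAN_CONFIGS['i2v-14B'],           # assumed config key
    checkpoint_dir='./Wan2.1-I2V-14B-720P',  # placeholder checkpoint path
    device_id=0)

video = pipeline.generate(
    input_prompt='A woman speaks to the camera in a sunlit room.',
    img=Image.open('example_case/case-1/reference.png'),
    max_area=720 * 1280,
    frame_num=81,        # must be 4n + 1
    sampling_steps=40,
    guide_scale=5.0,
    seed=42)
# On rank 0, `video` is a (C, N, H, W) float tensor; other ranks receive None.
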
wan/models/__init__.py
ADDED
File without changes

wan/models/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (182 Bytes)