File size: 7,396 Bytes
26557da
 
 
 
 
 
 
 
 
 
 
 
927c90e
 
26557da
927c90e
 
 
 
 
 
 
 
 
26557da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4d0858
26557da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4d0858
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26557da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4d0858
26557da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4d0858
26557da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a07e0e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import gradio as gr
import torch
import time
from PIL import Image
import tempfile
import os

from data.video import save_video
from wan_loader import load_wan_pipe
from models.set_condition_branch import set_stand_in
from preprocessor import FaceProcessor

from huggingface_hub import snapshot_download

print("Loading model, please wait...")

# Checkpoint locations on local disk. They are shared by the download step and
# the existence checks below so the two cannot drift apart.
ANTELOPEV2_PATH = "checkpoints/antelopev2"
BASE_MODEL_PATH = "checkpoints/base_model/"
LORA_MODEL_PATH = "checkpoints/Stand-In/Stand-In_wan2.1_T2V_14B_ver1.0.ckpt"

try:
    # Downloads are inside the ``try`` so that a failed download is reported
    # through the same error page as any other model-loading failure instead
    # of crashing the script with a bare traceback.
    snapshot_download("Wan-AI/Wan2.1-T2V-14B", local_dir=BASE_MODEL_PATH)
    snapshot_download(
        "DIAMONIK7777/antelopev2",
        local_dir="checkpoints/antelopev2/models/antelopev2",
    )
    snapshot_download("BowenXue/Stand-In", local_dir="checkpoints/Stand-In/")

    # Fail early with a readable message if an expected path is still missing.
    if not os.path.exists(ANTELOPEV2_PATH):
        raise FileNotFoundError(
            f"AntelopeV2 checkpoint not found at: {ANTELOPEV2_PATH}"
        )
    if not os.path.exists(BASE_MODEL_PATH):
        raise FileNotFoundError(f"Base model not found at: {BASE_MODEL_PATH}")
    if not os.path.exists(LORA_MODEL_PATH):
        raise FileNotFoundError(f"LoRA model not found at: {LORA_MODEL_PATH}")

    # Module-level objects used later by generate_video().
    face_processor = FaceProcessor(antelopv2_path=ANTELOPEV2_PATH)
    pipe = load_wan_pipe(base_path=BASE_MODEL_PATH, torch_dtype=torch.bfloat16)
    set_stand_in(pipe, model_path=LORA_MODEL_PATH)
    print("Model loaded successfully!")
except Exception as e:
    # Top-level boundary: show the failure in a minimal Gradio page rather
    # than leaving the user with nothing but console output.
    print(f"Model loading failed: {e}")
    with gr.Blocks() as demo:
        gr.Markdown("# Error: Model Loading Failed")
        gr.Markdown(f"""
        Please check the following:
        1.  Make sure the checkpoint files are placed in the correct directory.
        2.  Ensure all dependencies are properly installed.
        3.  Check the console output for detailed error information.
        
        **Error details**: {e}
        """)
    demo.launch()
    # Same effect as the interactive ``exit()`` helper, without relying on the
    # ``site`` module having injected it into builtins.
    raise SystemExit


def generate_video(
    pil_image: Image.Image,
    prompt: str,
    seed: int,
    negative_prompt: str,
    num_steps: int,
    fps: int,
    quality: int,
    progress=gr.Progress(track_tqdm=True)
):
    """Generate a video from a face image and a prompt and return its path.

    Args:
        pil_image: Uploaded face image, or ``None`` if nothing was uploaded.
        prompt: Text prompt forwarded to the pipeline.
        seed: Seed forwarded to the pipeline (coerced to ``int``).
        negative_prompt: Negative prompt forwarded to the pipeline.
        num_steps: Number of inference steps (coerced to ``int``).
        fps: Frame rate passed to ``save_video`` (coerced to ``int``).
        quality: Quality value passed to ``save_video`` unchanged.
        progress: Gradio progress object. It is not referenced in the body;
            presumably declared so Gradio mirrors tqdm progress in the UI.

    Returns:
        Path of the ``.mp4`` file the video was written to. The file is
        created with ``delete=False`` and is not removed by this function
        on success.

    Raises:
        gr.Error: If ``pil_image`` is ``None``.
    """
    if pil_image is None:
        raise gr.Error("Please upload a face image first!")

    print("Processing face...")
    ip_image = face_processor.process(pil_image)
    print("Face processing completed.")

    print("Generating video...")
    start_time = time.time()
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        seed=int(seed),
        ip_image=ip_image,
        num_inference_steps=int(num_steps),
        tiled=False,
    )
    end_time = time.time()
    print(f"Video generated in {end_time - start_time:.2f} seconds.")

    # Reserve a unique .mp4 path, then close the handle *before* writing:
    # whether a NamedTemporaryFile can be reopened by name while it is still
    # open is platform-dependent (it fails on Windows).
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
        video_path = temp_file.name

    try:
        save_video(video, video_path, fps=int(fps), quality=quality)
    except Exception:
        # delete=False means nothing else cleans this file up, so remove the
        # empty/partial file before propagating the failure.
        try:
            os.remove(video_path)
        except OSError:
            pass
        raise

    print(f"Video saved to: {video_path}")
    return video_path


# Main UI: inputs in the left column, generated video in the right column.
with gr.Blocks(theme=gr.themes.Soft(), css="footer {display: none !important}") as demo:
    gr.Markdown(
        """
        # Stand-In IP2V
        """
    )
    gr.Markdown("A Lightweight and Plug-and-Play Identity Control for Video Generation")
    # Badge links. This call must sit at the same indentation level as its
    # sibling statements; any deeper indent is an IndentationError that stops
    # the whole file from parsing.
    gr.HTML("""
        <div style="display:flex;column-gap:4px;">
            <a href="https://github.com/WeChatCV/Stand-In">
                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
            </a> 
            <a href="https://stand-in-video.github.io/">
                <img src='https://img.shields.io/badge/Project-Page-green'>
            </a>
            <a href="https://arxiv.org/abs/2508.07901">
                <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
            </a>
            <a href="https://huggingface.co/spaces/fffiloni/Stand-In?duplicate=true">
                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
            </a>
            <a href="https://huggingface.co/fffiloni">
                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg" alt="Follow me on HF">
            </a>
        </div>
        """)

    with gr.Row():
        # Left column: face image, prompt, seed and advanced options.
        with gr.Column(scale=1):
            gr.Markdown("### 1. Upload a Face Image")
            input_image = gr.Image(
                label="Upload Image",
                type="pil",
                image_mode="RGB",
                height=300,
            )

            gr.Markdown("### 2. Enter Core Parameters")
            input_prompt = gr.Textbox(
                label="Prompt",
                lines=4,
                value="A man sits comfortably at his desk, facing the camera, as if conversing with a friend or family member in front of a screen. His eyes are focused yet gentle, and a natural smile plays on his lips. The background is his meticulously decorated personal space, with photos and a world map on the wall, conveying a sense of intimacy and modern communication.",
                placeholder="Please enter a detailed description of the scene, character actions, expressions, etc...",
            )

            input_seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=100000,
                step=1,
                value=0,
                info="The same seed and parameters will generate the same result.",
            )

            with gr.Accordion("Advanced Options", open=False):
                input_negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    lines=3,
                    value="Vibrant colors, overexposure, static, blurred details, subtitles, style, artwork, painting, still image, overall grayness, worst quality, low quality, JPEG compression residue, ugly, mutilated, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, malformed limbs, fused fingers, still image, cluttered background, three legs, crowded background, walking backwards",
                )
                input_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=10,
                    maximum=50,
                    step=1,
                    value=20,
                    info="More steps may improve details but will take longer to generate.",
                )
                output_fps = gr.Slider(
                    label="Video FPS", minimum=10, maximum=30, step=1, value=25
                )
                output_quality = gr.Slider(
                    label="Video Quality", minimum=1, maximum=10, step=1, value=9
                )

            generate_btn = gr.Button("Generate Video", variant="primary")

        # Right column: the generated result.
        with gr.Column(scale=1):
            gr.Markdown("### 3. View Generated Result")
            output_video = gr.Video(
                label="Generated Video",
                height=480,
            )
    # The input order must match generate_video's positional parameters.
    generate_btn.click(
        fn=generate_video,
        inputs=[
            input_image,
            input_prompt,
            input_seed,
            input_negative_prompt,
            input_steps,
            output_fps,
            output_quality,
        ],
        outputs=output_video,
        api_name="generate_video",
    )

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(
        ssr_mode=False,
        show_error=True,
        show_api=False,
    )