tomiconic committed on
Commit
5765f32
Β·
verified Β·
1 Parent(s): fea8014

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +391 -0
app.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import random
import tempfile

import numpy as np
import torch
from PIL import Image

import gradio as gr
import spaces
from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
from diffusers.utils import export_to_video
from huggingface_hub import InferenceClient
from transformers import CLIPVisionModel

13
+ # ── Config ────────────────────────────────────────────────────────────────────
14
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
15
+ MODEL_REPO = "Wan-AI/Wan2.1-I2V-14B-480P"
16
+
17
+ # ── Prompt expansion LLM ──────────────────────────────────────────────────────
18
+ llm_client = InferenceClient(
19
+ model="mistralai/Mistral-7B-Instruct-v0.3",
20
+ token=HF_TOKEN,
21
+ )
22
+
23
+ VIDEO_SYSTEM = """You are an expert at writing motion prompts for AI video generation using Wan I2V.
24
+
25
+ Your job: take a short description of desired motion/animation and expand it into a detailed video motion prompt.
26
+
27
+ Rules:
28
+ - Focus on MOTION β€” what moves, how it moves, camera movement
29
+ - Be specific: "hair gently blowing in breeze", "camera slowly pulls back", "eyes blink naturally"
30
+ - Keep subjects consistent with what's already in the image
31
+ - Describe lighting changes if relevant (e.g. "light flickers softly")
32
+ - Do NOT describe the static image content β€” only the motion
33
+ - Return ONLY the prompt, no explanation, no preamble
34
+ - Keep under 80 words"""
35
+
def expand_video_prompt(raw_prompt):
    """Expand a terse motion description into a detailed Wan I2V motion prompt.

    Best-effort by design: an empty (or None) input yields a generic motion
    prompt, and any LLM/network failure falls back to the user's own text, so
    prompt expansion can never block generation.

    Args:
        raw_prompt: Short free-form motion description; may be None or empty
            (a cleared Gradio textbox can produce either).

    Returns:
        A motion-prompt string to feed to the video pipeline.
    """
    # Guard against None as well as ""/whitespace — the original crashed with
    # AttributeError on None input.
    if not raw_prompt or not raw_prompt.strip():
        return "subtle natural movement, gentle camera drift, cinematic atmosphere"

    raw_prompt = raw_prompt.strip()
    try:
        response = llm_client.chat_completion(
            messages=[
                {"role": "system", "content": VIDEO_SYSTEM},
                {"role": "user", "content": f"Expand this motion description:\n{raw_prompt}"},
            ],
            max_tokens=150,
            temperature=0.6,
        )
        # Some chat models wrap their answer in quotes; strip them off.
        return response.choices[0].message.content.strip().strip('"').strip("'")
    except Exception as e:
        # Expansion is optional polish — log and fall back to the raw prompt.
        print(f"LLM failed: {e}")
        return raw_prompt
53
+ # ── Load pipeline ─────────────────────────────────────────────────────────────
54
+ print("Loading Wan2.1 I2V pipeline...")
55
+
56
+ vae = AutoencoderKLWan.from_pretrained(
57
+ MODEL_REPO,
58
+ subfolder="vae",
59
+ torch_dtype=torch.float32,
60
+ )
61
+
62
+ pipe = WanImageToVideoPipeline.from_pretrained(
63
+ MODEL_REPO,
64
+ vae=vae,
65
+ torch_dtype=torch.bfloat16,
66
+ )
67
+
68
+ # CPU offload keeps VRAM usage manageable on ZeroGPU
69
+ pipe.enable_model_cpu_offload()
70
+ print("Pipeline ready.")
71
+
72
+ # ── Negative prompt for video ─────────────────────────────────────────────────
73
+ VIDEO_NEG = (
74
+ "static, no movement, blurry, low quality, worst quality, "
75
+ "inconsistent motion, flickering, jitter, artifacts, "
76
+ "watermark, text, deformed"
77
+ )
78
+
# ── Generation ────────────────────────────────────────────────────────────────
@spaces.GPU(duration=300)
def generate_video(input_image, motion_prompt, num_frames, guidance, seed, randomize):
    """Generate a short ~480P video from a still image plus a motion description.

    Args:
        input_image: Input image as a numpy array (gr.Image with type="numpy").
        motion_prompt: Short user description of the desired motion; expanded
            via the LLM before being sent to the pipeline.
        num_frames: Number of frames to generate.
        guidance: Classifier-free guidance scale.
        seed: RNG seed; ignored when `randomize` is True.
        randomize: When True, draw a fresh random seed instead of `seed`.

    Returns:
        Tuple of (mp4 path, seed actually used, markdown summary of the
        expanded prompt) matching the three UI outputs.

    Raises:
        gr.Error: If no image was uploaded.
    """
    if input_image is None:
        raise gr.Error("Please upload an image first.")

    if randomize:
        seed = random.randint(0, 2**32 - 1)
    seed = int(seed)

    # Expand motion prompt via the LLM (best-effort; see expand_video_prompt).
    expanded_motion = expand_video_prompt(motion_prompt)
    print(f"Motion prompt: {expanded_motion}")

    # Resize to an 832x480-area canvas, choosing landscape vs portrait from the
    # input's aspect ratio; convert("RGB") also normalizes RGBA/grayscale input.
    img = Image.fromarray(input_image).convert("RGB")
    orig_w, orig_h = img.size
    aspect = orig_w / orig_h
    if aspect >= 1:
        new_w, new_h = 832, 480
    else:
        new_w, new_h = 480, 832
    img = img.resize((new_w, new_h), Image.LANCZOS)

    # CPU generator keeps seeding deterministic regardless of where model
    # layers currently live under CPU offload.
    generator = torch.Generator(device="cpu").manual_seed(seed)

    output = pipe(
        image=img,
        prompt=expanded_motion,
        negative_prompt=VIDEO_NEG,
        height=new_h,
        width=new_w,
        num_frames=int(num_frames),
        guidance_scale=float(guidance),
        num_inference_steps=30,
        generator=generator,
    )

    frames = output.frames[0]

    # Export to mp4. delete=False keeps the file for Gradio to serve; close the
    # handle immediately since export_to_video writes by *path* — the original
    # leaked the open fd (and an open handle would also break on Windows).
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()
    export_to_video(frames, tmp.name, fps=16)

    return tmp.name, seed, f"**Motion prompt sent to model:**\n\n{expanded_motion}"
# ── CSS ───────────────────────────────────────────────────────────────────────
# NOTE: `import random` previously sat here, *after* generate_video() already
# referenced the name; it only worked because the module finishes importing
# before any callback runs. The import now lives at the top of the file.
129
+ css = """
130
+ * { box-sizing: border-box; margin: 0; padding: 0; }
131
+
132
+ body, .gradio-container {
133
+ background: #07070e !important;
134
+ font-family: 'Inter', system-ui, sans-serif !important;
135
+ max-width: 500px !important;
136
+ margin: 0 auto !important;
137
+ padding: 8px !important;
138
+ }
139
+
140
+ .topbar {
141
+ display: flex;
142
+ align-items: center;
143
+ justify-content: space-between;
144
+ padding: 10px 2px 14px;
145
+ }
146
+ .topbar-title {
147
+ color: #e8e0ff;
148
+ font-size: 0.95em;
149
+ font-weight: 800;
150
+ }
151
+ .gpu-pill {
152
+ background: #1aff7a18;
153
+ border: 1px solid #1aff7a44;
154
+ color: #1aff7a;
155
+ font-size: 0.6em;
156
+ font-weight: 800;
157
+ padding: 4px 12px;
158
+ border-radius: 20px;
159
+ letter-spacing: 1.5px;
160
+ text-transform: uppercase;
161
+ }
162
+
163
+ .upload-area {
164
+ background: #0d0d1a;
165
+ border: 2px dashed #1e1e35;
166
+ border-radius: 18px;
167
+ overflow: hidden;
168
+ margin-bottom: 8px;
169
+ min-height: 260px;
170
+ display: flex;
171
+ align-items: center;
172
+ justify-content: center;
173
+ }
174
+ .upload-area img { width: 100% !important; border-radius: 16px; }
175
+
176
+ .video-out {
177
+ background: #0d0d1a;
178
+ border: 1px solid #16162a;
179
+ border-radius: 18px;
180
+ overflow: hidden;
181
+ margin-bottom: 8px;
182
+ min-height: 260px;
183
+ }
184
+
185
+ .card {
186
+ background: #0d0d1a;
187
+ border: 1px solid #16162a;
188
+ border-radius: 14px;
189
+ padding: 14px;
190
+ margin-bottom: 8px;
191
+ }
192
+ .card-label {
193
+ color: #3d3060;
194
+ font-size: 0.62em;
195
+ font-weight: 800;
196
+ text-transform: uppercase;
197
+ letter-spacing: 2px;
198
+ margin-bottom: 8px;
199
+ }
200
+
201
+ textarea {
202
+ background: transparent !important;
203
+ border: none !important;
204
+ color: #c8b8f0 !important;
205
+ font-size: 15px !important;
206
+ line-height: 1.6 !important;
207
+ padding: 0 !important;
208
+ resize: none !important;
209
+ box-shadow: none !important;
210
+ width: 100% !important;
211
+ outline: none !important;
212
+ }
213
+ textarea::placeholder { color: #252038 !important; }
214
+ textarea:focus { outline: none !important; box-shadow: none !important; }
215
+
216
+ .gradio-accordion {
217
+ background: #0d0d1a !important;
218
+ border: 1px solid #16162a !important;
219
+ border-radius: 14px !important;
220
+ margin-bottom: 8px !important;
221
+ overflow: hidden !important;
222
+ }
223
+ .gradio-accordion .label-wrap button {
224
+ color: #4a3a6a !important;
225
+ font-size: 0.72em !important;
226
+ font-weight: 700 !important;
227
+ text-transform: uppercase !important;
228
+ letter-spacing: 1.5px !important;
229
+ padding: 12px 16px !important;
230
+ }
231
+
232
+ .gradio-slider {
233
+ background: transparent !important;
234
+ border: none !important;
235
+ padding: 4px 0 10px !important;
236
+ }
237
+ input[type=range] { accent-color: #6633bb !important; width: 100% !important; }
238
+
239
+ input[type=number] {
240
+ background: #0a0a14 !important;
241
+ border: 1px solid #18182a !important;
242
+ border-radius: 10px !important;
243
+ color: #9977cc !important;
244
+ font-size: 13px !important;
245
+ padding: 8px 10px !important;
246
+ }
247
+
248
+ input[type=checkbox] { accent-color: #6633bb !important; }
249
+ .gradio-checkbox label span {
250
+ color: #4a3a6a !important;
251
+ font-size: 0.75em !important;
252
+ font-weight: 600 !important;
253
+ }
254
+
255
+ label > span:first-child {
256
+ color: #3a2d55 !important;
257
+ font-size: 0.7em !important;
258
+ font-weight: 700 !important;
259
+ text-transform: uppercase !important;
260
+ letter-spacing: 1px !important;
261
+ }
262
+
263
+ .seed-out input[type=number] {
264
+ background: transparent !important;
265
+ border: none !important;
266
+ color: #2e2848 !important;
267
+ font-size: 0.7em !important;
268
+ text-align: center !important;
269
+ }
270
+
271
+ .hint-box {
272
+ background: #0a0a14;
273
+ border: 1px solid #111122;
274
+ border-radius: 10px;
275
+ padding: 10px 14px;
276
+ color: #443366;
277
+ font-size: 0.72em;
278
+ line-height: 1.7;
279
+ margin-bottom: 8px;
280
+ }
281
+
282
+ .gen-btn button {
283
+ background: linear-gradient(135deg, #1a4aaa 0%, #0e2d77 100%) !important;
284
+ border: 1px solid #3366cc !important;
285
+ border-radius: 14px !important;
286
+ color: #fff !important;
287
+ font-size: 0.88em !important;
288
+ font-weight: 900 !important;
289
+ padding: 17px !important;
290
+ width: 100% !important;
291
+ letter-spacing: 2px !important;
292
+ text-transform: uppercase !important;
293
+ box-shadow: 0 4px 24px #1a4aaa55 !important;
294
+ transition: all 0.15s ease !important;
295
+ margin-top: 6px !important;
296
+ }
297
+ .gen-btn button:hover {
298
+ box-shadow: 0 6px 32px #1a4aaa99 !important;
299
+ transform: translateY(-1px) !important;
300
+ }
301
+ .gen-btn button:active { transform: scale(0.98) !important; }
302
+
303
+ footer, .built-with { display: none !important; }
304
+ """
305
+
306
+ # ── UI ────────────────────────────────────────────────────────────────────────
307
+ with gr.Blocks(css=css, title="VideoGen") as demo:
308
+
309
+ gr.HTML("""
310
+ <div class="topbar">
311
+ <span class="topbar-title">🎬 Wan I2V β€” Image to Video</span>
312
+ <span class="gpu-pill">⚑ ZeroGPU</span>
313
+ </div>
314
+ """)
315
+
316
+ gr.HTML("""
317
+ <div class="hint-box">
318
+ Upload any image β†’ describe the motion you want β†’ generate a ~5 second 480P video.<br>
319
+ <strong>Tips:</strong> describe motion, not the image itself. "hair blowing in wind", "camera slowly zooms out", "candle flame flickers".
320
+ </div>
321
+ """)
322
+
323
+ # Input image
324
+ input_image = gr.Image(
325
+ label="Input Image",
326
+ type="numpy",
327
+ height=300,
328
+ elem_classes="upload-area",
329
+ )
330
+
331
+ # Motion prompt
332
+ gr.HTML('<div class="card"><div class="card-label">✦ Motion β€” describe what should move</div>')
333
+ motion_prompt = gr.Textbox(
334
+ show_label=False,
335
+ placeholder="hair gently blowing, eyes blinking slowly, soft light shimmer...",
336
+ lines=2,
337
+ )
338
+ gr.HTML('</div>')
339
+
340
+ # Generate button
341
+ generate_btn = gr.Button(
342
+ "Generate Video ✦", variant="primary",
343
+ size="lg", elem_classes="gen-btn",
344
+ )
345
+
346
+ # Output video
347
+ output_video = gr.Video(
348
+ label="Generated Video",
349
+ elem_classes="video-out",
350
+ height=300,
351
+ )
352
+
353
+ used_seed = gr.Number(
354
+ label="seed", interactive=False,
355
+ elem_classes="seed-out",
356
+ )
357
+
358
+ expanded_out = gr.Markdown(elem_classes="hint-box")
359
+
360
+ # Advanced settings
361
+ with gr.Accordion("βš™οΈ Settings", open=False):
362
+ gr.HTML('<div style="height:6px"></div>')
363
+
364
+ num_frames = gr.Slider(
365
+ minimum=17,
366
+ maximum=81,
367
+ value=49,
368
+ step=16,
369
+ label="Frames (17=~1s, 49=~3s, 81=~5s at 16fps)",
370
+ )
371
+ guidance = gr.Slider(
372
+ minimum=1.0,
373
+ maximum=10.0,
374
+ value=5.0,
375
+ step=0.5,
376
+ label="Guidance Scale",
377
+ )
378
+ with gr.Row():
379
+ seed = gr.Number(
380
+ label="Seed", value=42, precision=0,
381
+ minimum=0, maximum=2**32-1, scale=3,
382
+ )
383
+ randomize = gr.Checkbox(label="Random seed", value=True, scale=1)
384
+
385
+ generate_btn.click(
386
+ fn=generate_video,
387
+ inputs=[input_image, motion_prompt, num_frames, guidance, seed, randomize],
388
+ outputs=[output_video, used_seed, expanded_out],
389
+ )
390
+
391
+ demo.launch()