bbqhan commited on
Commit
f14231d
·
verified ·
1 Parent(s): 1f79de4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +219 -471
app.py CHANGED
@@ -2,189 +2,139 @@ import spaces
2
  import subprocess
3
  import os
4
  import torch
5
- import mediapy
6
- from einops import rearrange
7
- from omegaconf import OmegaConf
8
- import datetime
9
- from tqdm import tqdm
10
  import gc
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
 
12
  from data.image.transforms.divisible_crop import DivisibleCrop
13
  from data.image.transforms.na_resize import NaResize
 
 
 
14
  if os.path.exists("./projects/video_diffusion_sr/color_fix.py"):
15
  from projects.video_diffusion_sr.color_fix import wavelet_reconstruction
16
- use_colorfix=True
17
  else:
18
  use_colorfix = False
19
- print('Note!!!!!! Color fix is not avaliable!')
20
- from torchvision.transforms import Compose, Lambda, Normalize
21
- import argparse
22
- from PIL import Image
23
-
24
- from common.distributed import (
25
- get_device,
26
- init_torch,
27
- )
28
-
29
- from common.distributed.advanced import (
30
- get_data_parallel_rank,
31
- get_data_parallel_world_size,
32
- get_sequence_parallel_rank,
33
- get_sequence_parallel_world_size,
34
- init_sequence_parallel,
35
- )
36
 
 
37
  from projects.video_diffusion_sr.infer import VideoDiffusionInfer
38
  from common.config import load_config
39
  from common.distributed.ops import sync_data
40
  from common.seed import set_seed
41
- from common.partition import partition_by_groups, partition_by_size
42
-
43
- import gradio as gr
44
- from pathlib import Path
45
- from urllib.parse import urlparse
46
- from torch.hub import download_url_to_file, get_dir
47
- import shlex
48
- import uuid
49
- import mimetypes
50
- import torchvision.transforms as T
51
 
 
52
  os.environ["MASTER_ADDR"] = "127.0.0.1"
53
  os.environ["MASTER_PORT"] = "12355"
54
  os.environ["RANK"] = str(0)
55
  os.environ["WORLD_SIZE"] = str(1)
56
 
 
57
  subprocess.run(
58
  "pip install flash-attn --no-build-isolation",
59
  env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
60
  shell=True,
61
  )
62
 
 
63
  def load_file_from_url(url, model_dir=None, progress=True, file_name=None):
64
- """Load file from http url, will download models if necessary.
65
-
66
- Reference: https://github.com/1adrianb/face-alignment/blob/master/face_alignment/utils.py
67
-
68
- Args:
69
- url (str): URL to be downloaded.
70
- model_dir (str): The path to save the downloaded model. Should be a full path. If None, use pytorch hub_dir.
71
- Default: None.
72
- progress (bool): Whether to show the download progress. Default: True.
73
- file_name (str): The downloaded file name. If None, use the file name in the url. Default: None.
74
-
75
- Returns:
76
- str: The path to the downloaded file.
77
- """
78
- if model_dir is None: # use the pytorch hub_dir
79
  hub_dir = get_dir()
80
  model_dir = os.path.join(hub_dir, 'checkpoints')
81
 
82
  os.makedirs(model_dir, exist_ok=True)
83
-
84
  parts = urlparse(url)
85
- filename = os.path.basename(parts.path)
86
- if file_name is not None:
87
- filename = file_name
88
  cached_file = os.path.abspath(os.path.join(model_dir, filename))
 
89
  if not os.path.exists(cached_file):
90
- print(f'Downloading: "{url}" to {cached_file}
91
- ')
92
  download_url_to_file(url, cached_file, hash_prefix=None, progress=progress)
93
  return cached_file
94
 
95
-
96
- # os.system("pip freeze")
97
  ckpt_dir = Path('./ckpts')
98
- if not ckpt_dir.exists():
99
- ckpt_dir.mkdir()
100
 
101
  pretrain_model_url = {
102
- 'vae': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/ema_vae.pth',
103
- 'dit': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/seedvr2_ema_3b.pth',
104
  'pos_emb': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/pos_emb.pt',
105
  'neg_emb': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/neg_emb.pt',
106
  'apex': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/apex-0.1-cp310-cp310-linux_x86_64.whl'
107
  }
108
- # download weights
 
109
  if not os.path.exists('./ckpts/seedvr2_ema_3b.pth'):
110
- load_file_from_url(url=pretrain_model_url['dit'], model_dir='./ckpts/', progress=True, file_name=None)
111
  if not os.path.exists('./ckpts/ema_vae.pth'):
112
- load_file_from_url(url=pretrain_model_url['vae'], model_dir='./ckpts/', progress=True, file_name=None)
113
  if not os.path.exists('./pos_emb.pt'):
114
- load_file_from_url(url=pretrain_model_url['pos_emb'], model_dir='./', progress=True, file_name=None)
115
  if not os.path.exists('./neg_emb.pt'):
116
- load_file_from_url(url=pretrain_model_url['neg_emb'], model_dir='./', progress=True, file_name=None)
117
  if not os.path.exists('./apex-0.1-cp310-cp310-linux_x86_64.whl'):
118
- load_file_from_url(url=pretrain_model_url['apex'], model_dir='./', progress=True, file_name=None)
119
-
120
- subprocess.run(shlex.split("pip install apex-0.1-cp310-cp310-linux_x86_64.whl"))
121
- print(f"✅ setup completed Apex")
122
-
123
- # download images
124
- torch.hub.download_url_to_file(
125
- 'https://huggingface.co/datasets/Iceclear/SeedVR_VideoDemos/resolve/main/seedvr_videos_crf23/aigc1k/23_1_lq.mp4',
126
- '01.mp4')
127
- torch.hub.download_url_to_file(
128
- 'https://huggingface.co/datasets/Iceclear/SeedVR_VideoDemos/resolve/main/seedvr_videos_crf23/aigc1k/28_1_lq.mp4',
129
- '02.mp4')
130
- torch.hub.download_url_to_file(
131
- 'https://huggingface.co/datasets/Iceclear/SeedVR_VideoDemos/resolve/main/seedvr_videos_crf23/aigc1k/2_1_lq.mp4',
132
- '03.mp4')
133
-
134
- def configure_sequence_parallel(sp_size):
135
- if sp_size > 1:
136
- init_sequence_parallel(sp_size)
137
 
138
  @spaces.GPU(duration=100)
139
- def configure_runner(sp_size):
 
140
  config_path = os.path.join('./configs_3b', 'main.yaml')
141
  config = load_config(config_path)
142
  runner = VideoDiffusionInfer(config)
143
  OmegaConf.set_readonly(runner.config, False)
144
 
145
- init_torch(cudnn_benchmark=False, timeout=datetime.timedelta(seconds=3600))
146
- configure_sequence_parallel(sp_size)
 
147
  runner.configure_dit_model(device="cuda", checkpoint='./ckpts/seedvr2_ema_3b.pth')
148
  runner.configure_vae_model()
149
- # Set memory limit.
150
  if hasattr(runner.vae, "set_memory_limit"):
151
  runner.vae.set_memory_limit(**runner.config.vae.memory_limit)
152
  return runner
153
 
154
  @spaces.GPU(duration=100)
155
  def generation_step(runner, text_embeds_dict, cond_latents):
 
156
  def _move_to_cuda(x):
157
  return [i.to(torch.device("cuda")) for i in x]
158
 
 
159
  noises = [torch.randn_like(latent) for latent in cond_latents]
160
  aug_noises = [torch.randn_like(latent) for latent in cond_latents]
161
- print(f"Generating with noise shape: {noises[0].size()}.")
 
162
  noises, aug_noises, cond_latents = sync_data((noises, aug_noises, cond_latents), 0)
163
- noises, aug_noises, cond_latents = list(
164
- map(lambda x: _move_to_cuda(x), (noises, aug_noises, cond_latents))
165
- )
166
  cond_noise_scale = 0.1
167
 
168
  def _add_noise(x, aug_noise):
169
- t = (
170
- torch.tensor([1000.0], device=torch.device("cuda"))
171
- * cond_noise_scale
172
- )
173
  shape = torch.tensor(x.shape[1:], device=torch.device("cuda"))[None]
174
  t = runner.timestep_transform(t, shape)
175
- print(
176
- f"Timestep shifting from"
177
- f" {1000.0 * cond_noise_scale} to {t}."
178
- )
179
  x = runner.schedule.forward(x, aug_noise, t)
180
  return x
181
 
182
  conditions = [
183
- runner.get_condition(
184
- noise,
185
- task="sr",
186
- latent_blur=_add_noise(latent_blur, aug_noise),
187
- )
188
  for noise, aug_noise, latent_blur in zip(noises, aug_noises, cond_latents)
189
  ]
190
 
@@ -196,383 +146,181 @@ def generation_step(runner, text_embeds_dict, cond_latents):
196
  **text_embeds_dict,
197
  )
198
 
 
199
  samples = [
200
- (
201
- rearrange(video[:, None], "c t h w -> t c h w")
202
- if video.ndim == 3
203
- else rearrange(video, "c t h w -> t c h w")
204
- )
205
  for video in video_tensors
206
  ]
207
- del video_tensors
208
-
209
  return samples
210
 
 
 
 
 
 
 
211
  @spaces.GPU(duration=100)
212
- def generation_loop(image_path='./test_images', seed=666, res_h=1280, res_w=720, sp_size=1):
213
- runner = configure_runner(1)
214
-
215
- def _extract_text_embeds():
216
- # Text encoder forward.
217
- positive_prompts_embeds = []
218
- for texts_pos in tqdm(original_images_local):
219
- text_pos_embeds = torch.load('pos_emb.pt')
220
- text_neg_embeds = torch.load('neg_emb.pt')
221
-
222
- positive_prompts_embeds.append(
223
- {"texts_pos": [text_pos_embeds], "texts_neg": [text_neg_embeds]}
224
- )
225
- gc.collect()
226
- torch.cuda.empty_cache()
227
- return positive_prompts_embeds
228
-
229
- def process_image(image_path):
230
- media_type, _ = mimetypes.guess_type(image_path)
231
- is_image = media_type and media_type.startswith("image")
232
- if not is_image:
233
- return None
234
-
235
- img = Image.open(image_path).convert("RGB")
236
- img_tensor = T.ToTensor()(img).unsqueeze(0) # (1, C, H, W)
237
- video = img_tensor.permute(0, 1, 2, 3) # (T=1, C, H, W)
238
- print(f"Read Image size: {video.size()}")
239
- output_dir = 'output/' + str(uuid.uuid4()) + '.png'
240
-
241
- return video, output_dir
242
-
243
- # get test prompts
244
- original_images = [image_path.split('/')[-1]]
245
-
246
- # divide the prompts into different groups
247
- original_images_group = original_images
248
- original_images_local = original_images_group
249
- original_images_local = partition_by_size(original_images_local, 1)
250
-
251
- # pre-extract the text embeddings
252
- positive_prompts_embeds = _extract_text_embeds()
253
-
254
- video_transform = Compose(
255
- [
256
- NaResize(
257
- resolution=(
258
- res_h * res_w
259
- )
260
- ** 0.5,
261
- mode="area",
262
- # Upsample image, model only trained for high res.
263
- downsample_only=False,
264
- ),
265
- Lambda(lambda x: torch.clamp(x, 0.0, 1.0)),
266
- DivisibleCrop((16, 16)),
267
- Normalize(0.5, 0.5),
268
- rearrange("t c h w -> c t h w"),
269
- ]
270
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
- # generation loop
273
- for images, text_embeds in tqdm(zip(original_images_local, positive_prompts_embeds)):
274
- # read condition latents
275
- cond_latents = []
276
- for image in images:
277
- video, output_dir = process_image(image)
278
- if video is None:
279
- continue
280
-
281
- cond_latents.append(video_transform(video.to(torch.device("cuda"))))
282
-
283
- ori_lengths = [video.size(1) for video in cond_latents]
284
- input_videos = cond_latents
285
-
286
- # runner.dit.to("cpu")
287
- print(f"Encoding images: {list(map(lambda x: x.size(), cond_latents))}")
288
- # runner.vae.to(torch.device("cuda"))
289
- cond_latents = runner.vae_encode(cond_latents)
290
- # runner.vae.to("cpu")
291
- # runner.dit.to(torch.device("cuda"))
292
-
293
- for i, emb in enumerate(text_embeds["texts_pos"]):
294
- text_embeds["texts_pos"][i] = emb.to(torch.device("cuda"))
295
- for i, emb in enumerate(text_embeds["texts_neg"]):
296
- text_embeds["texts_neg"][i] = emb.to(torch.device("cuda"))
297
-
298
- samples = generation_step(runner, text_embeds, cond_latents=cond_latents)
299
- # runner.dit.to("cpu")
300
- del cond_latents
301
-
302
- # dump samples to the output directory
303
- for path, input, sample, ori_length in zip(
304
- images, input_videos, samples, ori_lengths
305
- ):
306
- if ori_length < sample.shape[0]:
307
- sample = sample[:ori_length]
308
- # color fix
309
- input = (
310
- rearrange(input[:, None], "c t h w -> t c h w")
311
- if input.ndim == 3
312
- else rearrange(input, "c t h w -> t c h w")
313
- )
314
- if use_colorfix:
315
- sample = wavelet_reconstruction(
316
- sample.to("cpu"), input[: sample.size(0)].to("cpu")
317
- )
318
- else:
319
- sample = sample.to("cpu")
320
- sample = (
321
- rearrange(sample[:, None], "t c h w -> t h w c")
322
- if sample.ndim == 3
323
- else rearrange(sample, "t c h w -> t h w c")
324
- )
325
- sample = sample.clip(-1, 1).mul_(0.5).add_(0.5).mul_(255).round()
326
- sample = sample.to(torch.uint8).numpy()
327
-
328
- if is_image:
329
- mediapy.write_image(output_dir, sample[0])
330
- else:
331
- mediapy.write_video(
332
- output_dir, sample, fps=24
333
  )
 
 
 
 
 
334
 
335
- # print(f"Generated image size: {sample.shape}")
336
- gc.collect()
337
- torch.cuda.empty_cache()
338
- return output_dir, output_dir, output_dir
 
 
 
 
 
339
 
340
- with gr.Blocks(title="SeedVR2: Professional Image Upscaler") as demo:
341
- # Top logo and title with orange theme
342
- gr.HTML("""
343
- <div style='text-align:center; margin-bottom: 20px;'>
344
- <img src='https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/assets/seedvr_logo.png' style='height:50px;' alt='SeedVR logo'/>
345
  </div>
346
- <p><b>SeedVR2 Image Upscaler</b> - Professional AI-powered image enhancement and upscaling</p>
347
- <p style="color: #ff6600; font-weight: bold;">🔥 Experience state-of-the-art image restoration with advanced diffusion technology</p>
348
- """)
349
-
350
- # Interface with orange theme
351
- with gr.Row():
352
- input_image = gr.Image(label="Upload Image", type="filepath", height=400)
353
- seed = gr.Number(label="Random Seed", value=666, info="Control the randomness of the upscaling process")
354
- res_h = gr.Slider(label="Output Height", minimum=512, maximum=4096, step=64, value=1280, info="Higher resolution for better quality")
355
- res_w = gr.Slider(label="Output Width", minimum=512, maximum=4096, step=64, value=720, info="Higher resolution for better quality")
356
-
357
- with gr.Row():
358
- output_image = gr.Image(label="Upscaled Image", height=400)
359
- download_link = gr.File(label="Download Output")
360
-
361
- run_button = gr.Button("Upscale Image", variant="primary", size="lg")
362
- run_button.click(fn=generation_loop, inputs=[input_image, seed, res_h, res_w], outputs=[output_image, download_link, download_link])
363
-
364
- # Examples
365
- gr.Examples(
366
- examples=[
367
- ["./01.mp4", 4, 1280, 720],
368
- ["./02.mp4", 4, 1280, 720],
369
- ["./03.mp4", 4, 1280, 720],
370
- ],
371
- inputs=[input_image, seed, res_h, res_w]
372
  )
373
 
374
- # Article/Footer with orange accents
375
- gr.HTML("""
376
- <hr style="border-color: #ff6600;">
377
- <p>If you find SeedVR helpful, please ⭐ the
378
- <a href='https://github.com/ByteDance-Seed/SeedVR' target='_blank'>
379
- <b>GitHub repository</b></a>:</p>
380
-
381
- <a href="https://github.com/ByteDance-Seed/SeedVR" target="_blank">
382
- <img src="https://img.shields.io/github/stars/ByteDance-Seed/SeedVR?style=social" alt="GitHub Stars">
383
- </a>
384
-
385
- <h4 style="color: #ff6600;">Features</h4>
386
- <p>🎨 <b>High-Resolution Upscaling</b> - Transform low-quality images to stunning high-resolution masterpieces<br>
387
- 🚀 <b>AI-Powered Enhancement</b> - Advanced diffusion technology for superior image restoration<br>
388
- 🎯 <b>Customizable Output</b> - Control resolution and seed for personalized results</p>
389
-
390
- <h4 style="color: #ff6600;">Limitations</h4>
391
- <p>For best results, use images with moderate degradation. Extreme cases may require additional processing.</p>
392
-
393
- <h4 style="color: #ff6600;">Citation</h4>
394
- <pre style="font-size: 12px; background-color: #fff5f5; padding: 10px; border-radius: 5px; border: 1px solid #ff6600;">
395
- @article{wang2025seedvr2,
396
- title={SeedVR2: One-Step Video Restoration via Diffusion Adversarial Post-Training},
397
- author={Wang, Jianyi and Lin, Shanchuan and Lin, Zhijie and Ren, Yuxi and Wei, Meng and Yue, Zongsheng and Zhou, Shangchen and Chen, Hao and Zhao, Yang and Yang, Ceyuan and Xiao, Xuefeng and Loy, Chen Change and Jiang, Lu},
398
- booktitle={arXiv preprint arXiv:2506.05301},
399
- year={2025}
400
- }
401
- </pre>
402
-
403
- <h4 style="color: #ff6600;">License</h4>
404
- <p>Licensed under the
405
- <a href="http://www.apache.org/licenses/LICENSE-2.0" target="_blank">Apache 2.0 License</a>.</p>
406
-
407
- <h4 style="color: #ff6600;">Contact</h4>
408
- <p>Email: <b>iceclearwjy@gmail.com</b></p>
409
-
410
- <p style="text-align:center;">
411
- <img src="https://visitor-badge.laobi.icu/badge?page_id=ByteDance-Seed/SeedVR" alt="visitors">
412
- </p>
413
- """)
414
-
415
- # Launch with orange theme and modern Gradio 6 configuration
416
- demo.launch(
417
- theme=gr.themes.Soft(
418
- primary_hue="orange",
419
- secondary_hue="orange",
420
- neutral_hue="slate",
421
- font=gr.themes.GoogleFont("Inter"),
422
- text_size="lg",
423
- spacing_size="lg",
424
- radius_size="md"
425
- ).set(
426
- button_primary_background_fill="*primary_600",
427
- button_primary_background_fill_hover="*primary_700",
428
- block_title_text_weight="600",
429
- block_label_text_weight="500",
430
- input_background_fill="*surface_100",
431
- output_background_fill="*surface_100",
432
- input_border_color="*border_color",
433
- output_border_color="*border_color",
434
- input_text_color="*text_color",
435
- output_text_color="*text_color",
436
- block_background_fill="*surface_50",
437
- block_border_color="*border_color",
438
- block_border_width="1px",
439
- block_border_radius="8px",
440
- block_shadow="*shadow_medium",
441
- block_padding="16px",
442
- block_title_text_size="lg",
443
- block_label_text_size="md",
444
- input_padding="12px",
445
- output_padding="12px",
446
- button_text_size="md",
447
- button_padding="12px 24px",
448
- button_border_radius="6px",
449
- button_border_width="1px",
450
- button_border_color="*primary_600",
451
- button_text_weight="500",
452
- slider_track_color="*primary_600",
453
- slider_handle_color="*primary_600",
454
- slider_handle_size="20px",
455
- slider_track_height="6px",
456
- slider_track_radius="3px",
457
- slider_handle_border_width="2px",
458
- slider_handle_border_color="*primary_600",
459
- slider_handle_border_radius="50%",
460
- slider_handle_shadow="*shadow_small",
461
- slider_handle_transition="all 0.2s ease",
462
- slider_handle_hover_background_fill="*primary_700",
463
- slider_handle_hover_border_color="*primary_700",
464
- slider_handle_active_background_fill="*primary_700",
465
- slider_handle_active_border_color="*primary_700",
466
- slider_handle_active_shadow="*shadow_small",
467
- slider_handle_active_transform="scale(1.1)",
468
- slider_handle_active_transition="all 0.2s ease",
469
- slider_handle_active_border_radius="50%",
470
- slider_handle_active_box_shadow="0 0 0 2px *primary_600",
471
- slider_handle_active_box_shadow_transition="all 0.2s ease",
472
- slider_handle_active_box_shadow_hover="0 0 0 2px *primary_700",
473
- slider_handle_active_box_shadow_active="0 0 0 2px *primary_700",
474
- slider_handle_active_box_shadow_active_transition="all 0.2s ease",
475
- slider_handle_active_box_shadow_active_hover="0 0 0 2px *primary_700",
476
- slider_handle_active_box_shadow_active_active="0 0 0 2px *primary_700",
477
- slider_handle_active_box_shadow_active_active_transition="all 0.2s ease",
478
- slider_handle_active_box_shadow_active_active_hover="0 0 0 2px *primary_700",
479
- slider_handle_active_box_shadow_active_active_active="0 0 0 2px *primary_700",
480
- slider_handle_active_box_shadow_active_active_active_transition="all 0.2s ease",
481
- slider_handle_active_box_shadow_active_active_active_hover="0 0 0 2px *primary_700",
482
- slider_handle_active_box_shadow_active_active_active_active="0 0 0 2px *primary_700",
483
- slider_handle_active_box_shadow_active_active_active_active_transition="all 0.2s ease",
484
- slider_handle_active_box_shadow_active_active_active_active_hover="0 0 0 2px *primary_700",
485
- slider_handle_active_box_shadow_active_active_active_active_active="0 0 0 2px *primary_700",
486
- slider_handle_active_box_shadow_active_active_active_active_active_transition="all 0.2s ease",
487
- slider_handle_active_box_shadow_active_active_active_active_active_hover="0 0 0 2px *primary_700",
488
- slider_handle_active_box_shadow_active_active_active_active_active_active="0 0 0 2px *primary_700",
489
- slider_handle_active_box_shadow_active_active_active_active_active_active_transition="all 0.2s ease",
490
- slider_handle_active_box_shadow_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
491
- slider_handle_active_box_shadow_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
492
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_transition="all 0.2s ease",
493
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
494
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
495
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
496
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
497
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
498
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
499
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
500
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
501
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
502
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
503
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
504
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
505
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
506
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
507
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
508
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
509
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
510
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
511
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
512
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
513
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
514
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
515
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
516
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
517
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
518
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
519
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
520
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
521
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
522
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
523
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
524
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
525
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
526
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
527
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
528
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
529
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
530
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
531
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
532
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
533
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
534
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
535
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
536
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
537
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
538
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
539
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
540
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
541
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
542
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
543
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
544
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
545
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
546
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
547
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
548
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
549
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
550
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
551
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
552
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
553
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
554
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
555
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
556
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
557
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
558
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
559
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
560
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
561
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
562
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
563
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
564
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
565
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
566
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
567
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
568
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
569
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
570
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
571
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
572
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
573
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
574
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
575
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_active="0 0 0 2px *primary_700",
576
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_transition="all 0.2s ease",
577
- slider_handle_active_box_shadow_active_active_active_active_active_active_active_active_active_active_active_hover="0 0 0 2px *primary_700",
578
- slider_handle_active_box_shadow_active_active_active_active_active_active_activity_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_acti
ve_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_ac
tive_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_
active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_activ
e_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_act
ive_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_a
ctive_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active_active
 
2
  import subprocess
3
  import os
4
  import torch
5
+ import uuid
 
 
 
 
6
  import gc
7
+ import shutil
8
+ import argparse
9
+ from pathlib import Path
10
+ from urllib.parse import urlparse
11
+ from torch.hub import download_url_to_file, get_dir
12
+ import shlex
13
+ import gradio as gr
14
+ from PIL import Image
15
+ import numpy as np
16
+ from omegaconf import OmegaConf
17
+ from einops import rearrange
18
+ from torchvision.transforms import Compose, Lambda, Normalize
19
+ import torchvision.transforms as T
20
 
21
+ # --- Project Specific Imports (Assumed to be present in repo) ---
22
  from data.image.transforms.divisible_crop import DivisibleCrop
23
  from data.image.transforms.na_resize import NaResize
24
+ # Note: Keeping Rearrange in case it's a specific wrapper, though typically einops suffices
25
+ from data.video.transforms.rearrange import Rearrange
26
+
27
# Optional wavelet-based color correction: `wavelet_reconstruction` is only
# importable when the color_fix module ships with this repo checkout, so
# gate on the file's presence and record availability in `use_colorfix`.
if os.path.exists("./projects/video_diffusion_sr/color_fix.py"):
    from projects.video_diffusion_sr.color_fix import wavelet_reconstruction
    use_colorfix = True
else:
    use_colorfix = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
+ from common.distributed import init_torch
34
  from projects.video_diffusion_sr.infer import VideoDiffusionInfer
35
  from common.config import load_config
36
  from common.distributed.ops import sync_data
37
  from common.seed import set_seed
38
+ from common.partition import partition_by_size
 
 
 
 
 
 
 
 
 
39
 
40
# --- Environment Setup ---
# Single-process "distributed" defaults so torch.distributed-style init
# (rank 0 of world size 1 on localhost) succeeds without a launcher.
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "12355"
os.environ["RANK"] = str(0)
os.environ["WORLD_SIZE"] = str(1)

# Install Flash Attention if missing.
# Fix: merge with the current environment instead of replacing it wholesale.
# Passing a one-key env dict would drop PATH/HOME/CUDA_* and can make the
# shell unable to locate pip or the CUDA toolchain.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
52
 
53
+ # --- Model & Resource Downloading ---
54
def load_file_from_url(url, model_dir=None, progress=True, file_name=None):
    """Return the local path of the file at `url`, downloading it on first use.

    Args:
        url: Remote location of the artifact.
        model_dir: Target directory; defaults to the torch hub checkpoint dir.
        progress: Forwarded to `download_url_to_file` to show a progress bar.
        file_name: Optional override for the saved filename (else taken from
            the URL path).
    """
    if model_dir is None:
        model_dir = os.path.join(get_dir(), 'checkpoints')
    os.makedirs(model_dir, exist_ok=True)

    # Prefer an explicit name; otherwise derive it from the URL path.
    if file_name:
        target_name = file_name
    else:
        target_name = os.path.basename(urlparse(url).path)
    destination = os.path.abspath(os.path.join(model_dir, target_name))

    # Only hit the network when the file is not already cached.
    if not os.path.exists(destination):
        print(f'Downloading: "{url}" to {destination}\n')
        download_url_to_file(url, destination, hash_prefix=None, progress=progress)
    return destination
68
 
 
 
69
# Local checkpoint directory for the SeedVR2-3B weights.
ckpt_dir = Path('./ckpts')
ckpt_dir.mkdir(exist_ok=True)

# Remote locations of every artifact the app needs (weights, text
# embeddings, and a prebuilt apex wheel for this Python/CUDA combo).
pretrain_model_url = {
    'vae': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/ema_vae.pth',
    'dit': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/seedvr2_ema_3b.pth',
    'pos_emb': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/pos_emb.pt',
    'neg_emb': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/neg_emb.pt',
    'apex': 'https://huggingface.co/ByteDance-Seed/SeedVR2-3B/resolve/main/apex-0.1-cp310-cp310-linux_x86_64.whl'
}

# Download each artifact only when it is not already cached on disk.
if not os.path.exists('./ckpts/seedvr2_ema_3b.pth'):
    load_file_from_url(url=pretrain_model_url['dit'], model_dir='./ckpts/')
if not os.path.exists('./ckpts/ema_vae.pth'):
    load_file_from_url(url=pretrain_model_url['vae'], model_dir='./ckpts/')
if not os.path.exists('./pos_emb.pt'):
    load_file_from_url(url=pretrain_model_url['pos_emb'], model_dir='./')
if not os.path.exists('./neg_emb.pt'):
    load_file_from_url(url=pretrain_model_url['neg_emb'], model_dir='./')
if not os.path.exists('./apex-0.1-cp310-cp310-linux_x86_64.whl'):
    load_file_from_url(url=pretrain_model_url['apex'], model_dir='./')
# NOTE(review): apex must be installed on every fresh container start even
# when the wheel file is cached, so this runs unconditionally — confirm the
# original indentation placed it outside the `if` above.
subprocess.run(shlex.split("pip install apex-0.1-cp310-cp310-linux_x86_64.whl"))
92
+
93
+ # --- Core Inference Logic ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
@spaces.GPU(duration=100)
def configure_runner():
    """Build and return a CUDA-ready VideoDiffusionInfer runner from the 3B config."""
    cfg = load_config(os.path.join('./configs_3b', 'main.yaml'))
    runner = VideoDiffusionInfer(cfg)
    # Config must be writable: upscale_image tweaks diffusion settings later.
    OmegaConf.set_readonly(runner.config, False)

    # Standard init for single GPU.
    init_torch(cudnn_benchmark=False)

    # Load the DiT weights onto the GPU, then set up the VAE.
    runner.configure_dit_model(device="cuda", checkpoint='./ckpts/seedvr2_ema_3b.pth')
    runner.configure_vae_model()

    # Cap VAE memory usage when the VAE implementation supports it.
    if hasattr(runner.vae, "set_memory_limit"):
        runner.vae.set_memory_limit(**runner.config.vae.memory_limit)
    return runner
112
 
113
  @spaces.GPU(duration=100)
114
  def generation_step(runner, text_embeds_dict, cond_latents):
115
+ """Executes the diffusion generation step."""
116
  def _move_to_cuda(x):
117
  return [i.to(torch.device("cuda")) for i in x]
118
 
119
+ # Generate noise
120
  noises = [torch.randn_like(latent) for latent in cond_latents]
121
  aug_noises = [torch.randn_like(latent) for latent in cond_latents]
122
+
123
+ # Sync and move
124
  noises, aug_noises, cond_latents = sync_data((noises, aug_noises, cond_latents), 0)
125
+ noises, aug_noises, cond_latents = list(map(_move_to_cuda, (noises, aug_noises, cond_latents)))
126
+
 
127
  cond_noise_scale = 0.1
128
 
129
  def _add_noise(x, aug_noise):
130
+ t = torch.tensor([1000.0], device=torch.device("cuda")) * cond_noise_scale
 
 
 
131
  shape = torch.tensor(x.shape[1:], device=torch.device("cuda"))[None]
132
  t = runner.timestep_transform(t, shape)
 
 
 
 
133
  x = runner.schedule.forward(x, aug_noise, t)
134
  return x
135
 
136
  conditions = [
137
+ runner.get_condition(noise, task="sr", latent_blur=_add_noise(latent_blur, aug_noise))
 
 
 
 
138
  for noise, aug_noise, latent_blur in zip(noises, aug_noises, cond_latents)
139
  ]
140
 
 
146
  **text_embeds_dict,
147
  )
148
 
149
+ # Output formatting
150
  samples = [
151
+ (rearrange(video[:, None], "c t h w -> t c h w") if video.ndim == 3
152
+ else rearrange(video, "c t h w -> t c h w"))
 
 
 
153
  for video in video_tensors
154
  ]
 
 
155
  return samples
156
 
157
def get_text_embeds():
    """Load the precomputed positive/negative text embeddings from disk.

    Returns:
        dict with keys "texts_pos" and "texts_neg", each a one-element list
        holding the corresponding embedding tensor (on CPU).
    """
    # map_location="cpu" keeps loading robust on CPU-only hosts even if the
    # embeddings were serialized from a CUDA device; the caller moves them
    # to the GPU before inference anyway.
    text_pos = torch.load('pos_emb.pt', map_location="cpu")
    text_neg = torch.load('neg_emb.pt', map_location="cpu")
    return {"texts_pos": [text_pos], "texts_neg": [text_neg]}
162
+
163
@spaces.GPU(duration=100)
def upscale_image(image_path, seed=666, cfg_scale=1.0):
    """One-step SeedVR2 restoration/upscaling of a single image.

    Args:
        image_path: Path to the input image (any mode; converted to RGB).
        seed: RNG seed; reduced modulo 2**32.
        cfg_scale: Classifier-free guidance scale.

    Returns:
        (PIL.Image, output_path) on success, (None, None) when no input.
    """
    if not image_path:
        return None, None

    # Build a fresh runner per request (model weights + VAE on GPU).
    runner = configure_runner()

    # Configure diffusion for one-step generation at the requested CFG scale.
    runner.config.diffusion.cfg.scale = cfg_scale
    runner.config.diffusion.cfg.rescale = 0.0
    runner.config.diffusion.timesteps.sampling.steps = 1  # one-step generation
    runner.configure_diffusion()

    # Seed (kept in 32-bit range for downstream RNGs).
    seed = int(seed) % (2**32)
    set_seed(seed, same_across_ranks=True)

    os.makedirs('output/', exist_ok=True)
    output_filename = f'output/{uuid.uuid4()}.png'

    # Preprocessing pipeline; NaResize targets a 2560x1440-equivalent area,
    # DivisibleCrop keeps H/W multiples of 16, Normalize maps to [-1, 1],
    # and Rearrange converts frame-major (t c h w) into channel-major.
    video_transform = Compose([
        NaResize(resolution=(2560 * 1440) ** 0.5, mode="area", downsample_only=False),
        Lambda(lambda x: torch.clamp(x, 0.0, 1.0)),
        DivisibleCrop((16, 16)),
        Normalize(0.5, 0.5),
        Rearrange("t c h w -> c t h w"),
    ])

    # Load image as a one-frame video: (1, C, H, W) is read as (T=1, C, H, W).
    img = Image.open(image_path).convert("RGB")
    img_tensor = T.ToTensor()(img).unsqueeze(0)
    # Fix: the original identity permute(0, 1, 2, 3) was a no-op and is removed.
    video_input = img_tensor

    cond_latents = [video_transform(video_input.to(torch.device("cuda")))]
    input_tensor = cond_latents[0]  # keep pre-encode tensor as colorfix reference

    # Encode the conditioning frames into VAE latent space.
    cond_latents = runner.vae_encode(cond_latents)

    # Load static text embeddings and move them to the GPU.
    text_embeds = get_text_embeds()
    for k in ["texts_pos", "texts_neg"]:
        text_embeds[k] = [emb.to(torch.device("cuda")) for emb in text_embeds[k]]

    # Run the one-step diffusion sampler.
    samples = generation_step(runner, text_embeds, cond_latents=cond_latents)

    sample = samples[0]

    # Bring the reference into (t, c, h, w) to match the sample layout.
    input_ref = (
        rearrange(input_tensor[:, None], "c t h w -> t c h w")
        if input_tensor.ndim == 3
        else rearrange(input_tensor, "c t h w -> t c h w")
    )

    # Wavelet color correction against the input, when available.
    if use_colorfix:
        sample = wavelet_reconstruction(sample.to("cpu"), input_ref[:sample.size(0)].to("cpu"))
    else:
        sample = sample.to("cpu")

    # Map from [-1, 1] to uint8 HWC for saving.
    sample = (
        rearrange(sample[:, None], "t c h w -> t h w c")
        if sample.ndim == 3
        else rearrange(sample, "t c h w -> t h w c")
    )
    sample = sample.clip(-1, 1).mul_(0.5).add_(0.5).mul_(255).round()
    sample = sample.to(torch.uint8).numpy()

    result_image = Image.fromarray(sample[0])
    result_image.save(output_filename)

    # Free GPU memory before returning to the UI.
    del runner, cond_latents, samples
    gc.collect()
    torch.cuda.empty_cache()

    return result_image, output_filename
248
+
249
# --- Gradio UI ---

# Custom CSS for the "Top Tier" look
custom_css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
h1 {
    text-align: center;
    color: #FF7043;
    font-weight: 800;
}
.contain {
    background-color: #FAFAFA;
}
button.primary {
    background: linear-gradient(45deg, #FF7043, #FFAB91);
    border: none;
    box-shadow: 0 4px 15px rgba(255, 112, 67, 0.3);
}
"""

theme = gr.themes.Soft(
    primary_hue="orange",
    secondary_hue="slate",
    neutral_hue="stone",
    # Fix: gradio exposes size constants as lowercase instances
    # (gr.themes.sizes.radius_lg); RADIUS_LG raises AttributeError at startup.
    radius_size=gr.themes.sizes.radius_lg,
).set(
    button_primary_background_fill="#FF7043",
    button_primary_background_fill_hover="#F4511E",
    button_primary_text_color="white",
)

with gr.Blocks(theme=theme, css=custom_css, title="SeedVR2 Image Upscaler") as demo:

    with gr.Column(variant="panel"):
        gr.Markdown(
            """
            # 🍊 SeedVR2 Image Upscaler
            ### Professional One-Step Restoration & Upscaling
            """
        )

    with gr.Row(equal_height=True):
        # Left column: input image plus advanced knobs.
        with gr.Column(scale=1):
            with gr.Group():
                input_image = gr.Image(
                    label="Input Image",
                    type="filepath",
                    height=400,
                    sources=["upload", "clipboard"]
                )
            with gr.Accordion("Advanced Settings", open=False):
                seed_input = gr.Number(label="Seed", value=666, precision=0)
                cfg_input = gr.Slider(label="CFG Scale", minimum=0.0, maximum=10.0, value=1.0, step=0.1)

            run_btn = gr.Button("✨ Upscale Image", variant="primary", size="lg")

        # Right column: restored result plus a download handle.
        with gr.Column(scale=1):
            output_image = gr.Image(label="Restored Result", interactive=False, height=400)
            download_file = gr.File(label="Download High-Res")

    run_btn.click(
        fn=upscale_image,
        inputs=[input_image, seed_input, cfg_input],
        outputs=[output_image, download_file]
    )

    gr.Markdown(
        """
        <div style="text-align: center; opacity: 0.6; margin-top: 20px; font-size: 0.8em;">
        Powered by SeedVR2 | One-Step Diffusion Model
        </div>
        """
    )

demo.queue()
demo.launch()