Spaces: IceClear (Running)
Commit 67c0e7b · 1 Parent(s): 25e6160
Commit message: update

Files changed:
- app.py (+8, -7)
- projects/video_diffusion_sr/infer.py (+2, -8)
app.py CHANGED

@@ -61,6 +61,8 @@ from pathlib import Path
 from urllib.parse import urlparse
 from torch.hub import download_url_to_file, get_dir
 import shlex
+import uuid
+
 
 os.environ["MASTER_ADDR"] = "127.0.0.1"
 os.environ["MASTER_PORT"] = "12355"
@@ -223,7 +225,7 @@ def generation_step(runner, text_embeds_dict, cond_latents):
 @spaces.GPU(duration=120)
 def generation_loop(video_path='./test_videos', seed=666, fps_out=12, batch_size=1, cfg_scale=1.0, cfg_rescale=0.0, sample_steps=1, res_h=1280, res_w=720, sp_size=1):
     runner = configure_runner(1)
-    output_dir = 'output/
+    output_dir = 'output/' + uuid.uuid4() + '.mp4'
     def _build_pos_and_neg_prompt():
         # read positive prompt
         positive_text = "Cinematic, High Contrast, highly detailed, taken using a Canon EOS R camera, \
@@ -342,12 +344,12 @@ def generation_loop(video_path='./test_videos', seed=666, fps_out=12, batch_size
         input_videos = cond_latents
         cond_latents = [cut_videos(video, sp_size) for video in cond_latents]
 
-        runner.dit.to("cpu")
+        # runner.dit.to("cpu")
         print(f"Encoding videos: {list(map(lambda x: x.size(), cond_latents))}")
-        runner.vae.to(torch.device("cuda"))
+        # runner.vae.to(torch.device("cuda"))
         cond_latents = runner.vae_encode(cond_latents)
-        runner.vae.to("cpu")
-        runner.dit.to(torch.device("cuda"))
+        # runner.vae.to("cpu")
+        # runner.dit.to(torch.device("cuda"))
 
         for i, emb in enumerate(text_embeds["texts_pos"]):
             text_embeds["texts_pos"][i] = emb.to(torch.device("cuda"))
@@ -355,7 +357,7 @@ def generation_loop(video_path='./test_videos', seed=666, fps_out=12, batch_size
             text_embeds["texts_neg"][i] = emb.to(torch.device("cuda"))
 
         samples = generation_step(runner, text_embeds, cond_latents=cond_latents)
-        runner.dit.to("cpu")
+        # runner.dit.to("cpu")
         del cond_latents
 
         # dump samples to the output directory
@@ -364,7 +366,6 @@ def generation_loop(video_path='./test_videos', seed=666, fps_out=12, batch_size
         ):
             if ori_length < sample.shape[0]:
                 sample = sample[:ori_length]
-            filename = os.path.join(tgt_path, os.path.basename(path))
            # color fix
            input = (
                rearrange(input[:, None], "c t h w -> t c h w")
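A note on the new output path in generation_loop: uuid.uuid4() returns a uuid.UUID object, so concatenating it directly with a string as in the diff raises a TypeError at runtime. Below is a minimal sketch of the presumably intended behaviour with an explicit str() conversion; the 'output/' prefix and '.mp4' suffix come from the diff, while the os.makedirs call is an added assumption.

import os
import uuid

# Unique per-request output file, e.g. "output/3f2b6c1e-....mp4".
# str() is needed because uuid.UUID does not support "+" with str.
output_dir = os.path.join("output", str(uuid.uuid4()) + ".mp4")

# Assumption: ensure the target directory exists before writing the video.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)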
projects/video_diffusion_sr/infer.py CHANGED

@@ -41,6 +41,7 @@ from models.dit_v2 import na
 class VideoDiffusionInfer():
     def __init__(self, config: DictConfig):
         self.config = config
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
     def get_condition(self, latent: Tensor, latent_blur: Tensor, task: str) -> Tensor:
         t, h, w, c = latent.shape
@@ -75,13 +76,9 @@ class VideoDiffusionInfer():
         # For fast init & resume,
         # when training from scratch, rank0 init DiT on cpu, then sync to other ranks with FSDP.
         # otherwise, all ranks init DiT on meta device, then load_state_dict with assign=True.
-        if self.config.dit.get("init_with_meta_device", False):
-            init_device = "cpu" if get_global_rank() == 0 and checkpoint is None else "meta"
-        else:
-            init_device = "cpu"
 
         # Create dit model.
-        with torch.device(
+        with torch.device(self.device):
             self.dit = create_object(self.config.dit.model)
             self.dit.set_gradient_checkpointing(self.config.dit.gradient_checkpoint)
 
@@ -92,9 +89,6 @@ class VideoDiffusionInfer():
         print(f"Loading info: {loading_info}")
         self.dit = meta_non_persistent_buffer_init_fn(self.dit)
 
-        if device in [get_device(), "cuda"]:
-            self.dit.to(get_device())
-
         # Print model size.
         num_params = sum(p.numel() for p in self.dit.parameters() if p.requires_grad)
         print(f"DiT trainable parameters: {num_params:,}")