SeedVR2-3B

Sleeping

App Files Files Community

baka999 committed 24 days ago

Commit

15726ee

verified ·

1 Parent(s): d0005e8

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -42

app.py CHANGED Viewed

@@ -266,7 +266,35 @@ PRESET_RESOLUTIONS = {
     "1440p (2560×1440)": (2560, 1440),
     "4K    (3840×2160)": (3840, 2160),
 }
-CHUNK_FRAMES = 121   # model hard limit per forward pass
 # ── Chunked video SR ────────────────────────────────────────────────────────────
 @spaces.GPU(duration=100)
@@ -279,11 +307,12 @@ def generation_loop(video_path, seed=666, fps_out=24, model_size="3b",
     def _extract_text_embeds(n_chunks):
         embeds = []
         for _ in range(n_chunks):
-            text_pos_embeds = torch.load('pos_emb.pt')
-            text_neg_embeds = torch.load('neg_emb.pt')
             embeds.append({"texts_pos": [text_pos_embeds], "texts_neg": [text_neg_embeds]})
         gc.collect()
-        torch.cuda.empty_cache()
         return embeds
     def cut_video_to_model(video, sp_size):
@@ -338,6 +367,12 @@ def generation_loop(video_path, seed=666, fps_out=24, model_size="3b",
         res_w = int(in_W * scale)
     print(f"Target resolution: {res_w}×{res_h}  (mode={res_mode})")
     target_resolution = (res_h * res_w) ** 0.5
     def make_transform(target_res):
@@ -379,14 +414,20 @@ def generation_loop(video_path, seed=666, fps_out=24, model_size="3b",
         return output_dir, None, output_dir
     # ── Chunked video processing ────────────────────────────────────────────────
-    # Split full_video (T, C, H, W) into chunks of CHUNK_FRAMES
     frame_chunks = []
-    for start in range(0, T_total, CHUNK_FRAMES):
-        end = min(start + CHUNK_FRAMES, T_total)
         frame_chunks.append(full_video[start:end])   # each: (t_chunk, C, H, W)
     n_chunks = len(frame_chunks)
-    print(f"Processing {n_chunks} chunk(s) of up to {CHUNK_FRAMES} frames each …")
     text_embeds_list = _extract_text_embeds(n_chunks)
     all_output_frames = []   # will collect numpy uint8 frames
@@ -394,41 +435,64 @@ def generation_loop(video_path, seed=666, fps_out=24, model_size="3b",
     for chunk_idx, (chunk_frames, text_embeds) in enumerate(zip(frame_chunks, text_embeds_list)):
         print(f"  Chunk {chunk_idx+1}/{n_chunks}: {chunk_frames.shape[0]} frames")
-        # Transform to model input space
-        cond = video_transform(chunk_frames.to(torch.device("cuda")))  # (C, t, H_out, W_out)
-        ori_length = cond.size(1)
-        # Pad to model alignment
-        cond_padded = cut_video_to_model(cond, sp_size)
-        # Move text embeds to GPU
-        for i, emb in enumerate(text_embeds["texts_pos"]):
-            text_embeds["texts_pos"][i] = emb.to("cuda")
-        for i, emb in enumerate(text_embeds["texts_neg"]):
-            text_embeds["texts_neg"][i] = emb.to("cuda")
-        # Encode → diffuse → decode
-        latent = runner.vae_encode([cond_padded])
-        sample = generation_step(runner, text_embeds, cond_latents=latent)[0]
-        # Trim padding
-        if ori_length < sample.shape[0]:
-            sample = sample[:ori_length]
-        # Color fix
-        input_pixel = rearrange(cond, "c t h w -> t c h w")
-        if use_colorfix:
-            sample = wavelet_reconstruction(sample.to("cpu"), input_pixel[:sample.size(0)].to("cpu"))
-        else:
-            sample = sample.to("cpu")
-        # Convert to uint8 numpy (T, H, W, C)
-        sample = rearrange(sample, "t c h w -> t h w c")
-        sample = sample.clip(-1,1).mul_(0.5).add_(0.5).mul_(255).round().to(torch.uint8).numpy()
-        all_output_frames.append(sample)
-        del latent, cond, cond_padded
-        gc.collect()
-        torch.cuda.empty_cache()
     # ── Concatenate chunks and write ────────────────────────────────────────────
     import numpy as np

     "1440p (2560×1440)": (2560, 1440),
     "4K    (3840×2160)": (3840, 2160),
 }
+CHUNK_FRAMES = 121   # absolute model hard limit per forward pass
+def _choose_safe_chunk_frames(h: int, w: int, requested: int = CHUNK_FRAMES) -> int:
+    """
+    Pick a temporal chunk size (frames per chunk) capped by the output resolution.
+
+    The cap is selected from the total pixel count ``h * w`` (not from the
+    individual dimensions), and the result is ``min(requested, cap)``, so it
+    never exceeds ``requested``:
+
+        >= 3840 * 2160 pixels  -> 8
+        >= 2560 * 1440 pixels  -> 12
+        >= 1920 * 1080 pixels  -> 16
+        >= 1280 * 720 pixels   -> 32
+        anything smaller       -> 64
+
+    With the default ``requested`` of CHUNK_FRAMES (121), every tier -- including
+    resolutions below 720p -- therefore yields fewer than 121 frames.
+
+    The stated intent is to avoid allocator/NVML crashes on high-resolution videos.
+
+    NOTE(review): ``requested`` is not validated; a value <= 0 is returned
+    unchanged by ``min()``.
+    """
+    # int() coercion lets callers pass non-int numeric dimensions.
+    pixels = int(h) * int(w)
+    # Tiers are checked from largest to smallest so the first match wins.
+    if pixels >= 3840 * 2160:   # 4K+
+        return min(requested, 8)
+    if pixels >= 2560 * 1440:   # 1440p
+        return min(requested, 12)
+    if pixels >= 1920 * 1080:   # 1080p
+        return min(requested, 16)
+    if pixels >= 1280 * 720:    # 720p
+        return min(requested, 32)
+    # Below 720p: still capped at 64 rather than the full `requested`.
+    return min(requested, 64)
+def _is_cuda_memory_error(exc: BaseException) -> bool:
+    """
+    Return True when the message of ``exc`` looks like a GPU memory/allocator failure.
+
+    The check is a case-insensitive substring match of ``str(exc)`` against the
+    entries in ``keys``; the exception type itself is not inspected.
+    """
+    msg = str(exc)
+    # Keys are written in lowercase because they are compared against the
+    # lowercased message below.
+    # "out of memory" and "allocator" already match anything that
+    # "cuda out of memory" and "cudacachingallocator" would match, so the
+    # longer entries are redundant.
+    # NOTE(review): "allocator" is a broad substring and may also match
+    # errors that are not memory-related -- confirm this is intended.
+    keys = (
+        "out of memory",
+        "cuda out of memory",
+        "cudacachingallocator",
+        "nvml_success == r internal assert failed",
+        "allocator",
+    )
+    msg_low = msg.lower()
+    return any(k in msg_low for k in keys)
 # ── Chunked video SR ────────────────────────────────────────────────────────────
 @spaces.GPU(duration=100)
     def _extract_text_embeds(n_chunks):
         embeds = []
         for _ in range(n_chunks):
+            text_pos_embeds = torch.load('pos_emb.pt', map_location='cpu', weights_only=True)
+            text_neg_embeds = torch.load('neg_emb.pt', map_location='cpu', weights_only=True)
             embeds.append({"texts_pos": [text_pos_embeds], "texts_neg": [text_neg_embeds]})
         gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
         return embeds
     def cut_video_to_model(video, sp_size):
         res_w = int(in_W * scale)
     print(f"Target resolution: {res_w}×{res_h}  (mode={res_mode})")
+    if is_video and (res_h * res_w) > (1920 * 1080):
+        print(
+            "⚠️ High-memory mode detected. 2K/4K video restoration is very likely to fail on limited GPU "
+            "memory; the code will use smaller temporal chunks automatically."
+        )
     target_resolution = (res_h * res_w) ** 0.5
     def make_transform(target_res):
         return output_dir, None, output_dir
     # ── Chunked video processing ────────────────────────────────────────────────
+    safe_chunk_frames = _choose_safe_chunk_frames(res_h, res_w, CHUNK_FRAMES)
+    if safe_chunk_frames != CHUNK_FRAMES:
+        print(
+            f"Reducing chunk size from {CHUNK_FRAMES} to {safe_chunk_frames} "
+            f"for safer memory usage at {res_w}×{res_h}."
+        )
     frame_chunks = []
+    for start in range(0, T_total, safe_chunk_frames):
+        end = min(start + safe_chunk_frames, T_total)
         frame_chunks.append(full_video[start:end])   # each: (t_chunk, C, H, W)
     n_chunks = len(frame_chunks)
+    print(f"Processing {n_chunks} chunk(s) of up to {safe_chunk_frames} frames each …")
     text_embeds_list = _extract_text_embeds(n_chunks)
     all_output_frames = []   # will collect numpy uint8 frames
     for chunk_idx, (chunk_frames, text_embeds) in enumerate(zip(frame_chunks, text_embeds_list)):
         print(f"  Chunk {chunk_idx+1}/{n_chunks}: {chunk_frames.shape[0]} frames")
+        cond = None
+        cond_padded = None
+        latent = None
+        sample = None
+        try:
+            # Transform to model input space
+            cond = video_transform(chunk_frames.to(torch.device("cuda"), non_blocking=True))
+            ori_length = cond.size(1)
+            # Pad to model alignment
+            cond_padded = cut_video_to_model(cond, sp_size)
+            # Move text embeds to GPU lazily right before use
+            for i, emb in enumerate(text_embeds["texts_pos"]):
+                text_embeds["texts_pos"][i] = emb.to("cuda", non_blocking=True)
+            for i, emb in enumerate(text_embeds["texts_neg"]):
+                text_embeds["texts_neg"][i] = emb.to("cuda", non_blocking=True)
+            # Encode → diffuse → decode
+            latent = runner.vae_encode([cond_padded])
+            sample = generation_step(runner, text_embeds, cond_latents=latent)[0]
+            # Trim padding
+            if ori_length < sample.shape[0]:
+                sample = sample[:ori_length]
+            # Color fix
+            input_pixel = rearrange(cond, "c t h w -> t c h w")
+            if use_colorfix:
+                sample = wavelet_reconstruction(sample.to("cpu"), input_pixel[:sample.size(0)].to("cpu"))
+            else:
+                sample = sample.to("cpu")
+            # Convert to uint8 numpy (T, H, W, C)
+            sample = rearrange(sample, "t c h w -> t h w c")
+            sample = sample.clip(-1,1).mul_(0.5).add_(0.5).mul_(255).round().to(torch.uint8).numpy()
+            all_output_frames.append(sample)
+        except RuntimeError as e:
+            if _is_cuda_memory_error(e):
+                raise RuntimeError(
+                    f"GPU memory不足：当前分辨率 {res_w}×{res_h}、分块 {chunk_frames.shape[0]} 帧仍然超出显存。"
+                    f"请改为更低输出分辨率（建议 720p/1080p）、更小 upscale_factor，或继续降低 safe_chunk_frames。"
+                    f"原始错误: {e}"
+                ) from e
+            raise
+        finally:
+            del latent, cond, cond_padded, sample
+            for k in ("texts_pos", "texts_neg"):
+                for i, emb in enumerate(text_embeds[k]):
+                    if isinstance(emb, torch.Tensor):
+                        text_embeds[k][i] = emb.to("cpu")
+            gc.collect()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
     # ── Concatenate chunks and write ────────────────────────────────────────────
     import numpy as np