Spaces:

VIDraft
/

Portrait-Animation

Runtime error

App Files Files Community

openfree committed on May 11, 2025

Commit

914dc02

verified ·

1 Parent(s): b1cb088

Update sonic.py

Browse files

Files changed (1) hide show

sonic.py +33 -35

sonic.py CHANGED Viewed

@@ -22,6 +22,11 @@ from src.dataset.face_align.align import AlignImage
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 # ------------------------------------------------------------------
 #            single image + speech  →  video-tensor generator
 # ------------------------------------------------------------------
@@ -29,32 +34,29 @@ def test(
     pipe, config, wav_enc, audio_pe, audio2bucket, image_encoder,
     width, height, batch,
 ):
-    # ---------------- batch 차원 맞추기 -----------------------------
     for k, v in batch.items():
         if isinstance(v, torch.Tensor):
             batch[k] = v.unsqueeze(0).to(pipe.device).float()
-    ref_img   = batch["ref_img"]        # (1,C,H,W)
     clip_img  = batch["clip_images"]
     face_mask = batch["face_mask"]
-    image_embeds = image_encoder(clip_img).image_embeds            # (1,1024)
-    audio_feature = batch["audio_feature"]                         # (1,80,T)
-    audio_len     = int(batch["audio_len"])                        # python int
     step          = int(config.step)
-    # ---------- window 단위 Whisper 인코딩 --------------------------
-    window = 16_000                                                 # 1 초
     audio_prompts, last_prompts = [], []
     for i in range(0, audio_feature.shape[-1], window):
         chunk = audio_feature[:, :, i : i + window]
         layers = wav_enc.encoder(chunk, output_hidden_states=True).hidden_states
         last   = wav_enc.encoder(chunk).last_hidden_state.unsqueeze(-2)
-        audio_prompts.append(torch.stack(layers, dim=2))            # (1,?,L,384)
-        last_prompts.append(last)                                   # (1,?,1,384)
     if not audio_prompts:
         raise ValueError("[ERROR] No speech recognised in the provided audio.")
@@ -62,17 +64,13 @@ def test(
     audio_prompts = torch.cat(audio_prompts, dim=1)
     last_prompts  = torch.cat(last_prompts,  dim=1)
-    # ---------- 모델 입력 규칙에 맞춰 padding -----------------------
     audio_prompts = torch.cat(
-        [torch.zeros_like(audio_prompts[:, :4]),   # head pad
-         audio_prompts,
          torch.zeros_like(audio_prompts[:, :6])], dim=1)
     last_prompts = torch.cat(
-        [torch.zeros_like(last_prompts[:, :24]),
-         last_prompts,
          torch.zeros_like(last_prompts[:, :26])], dim=1)
-    # ---------- 음성 길이에 따라 chunk 횟수 산정 ---------------------
     total_tokens = audio_prompts.shape[1]
     num_chunks   = max(1, math.ceil(total_tokens / (2 * step)))
@@ -81,38 +79,35 @@ def test(
     for i in tqdm(range(num_chunks)):
         start = i * 2 * step
-        # ---------------- cond_clip (w=10,L=5) --------------------
-        clip_raw = audio_prompts[:, start : start + 10]              # (1,≤10,L,384)
-        # w-pad
-        if clip_raw.shape[1] < 10:
             pad_w = torch.zeros_like(clip_raw[:, :10 - clip_raw.shape[1]])
             clip_raw = torch.cat([clip_raw, pad_w], dim=1)
-        # ★ L-pad (Whisper-tiny → L=2 → 5로 확장)
-        if clip_raw.shape[2] < 5:
-            pad_L = clip_raw[:, :, -1:].repeat(1, 1, 5 - clip_raw.shape[2], 1)
-            clip_raw = torch.cat([clip_raw, pad_L], dim=2)
-        clip_raw = clip_raw[:, :, :5]                                # (1,10,5,384)
-        cond_clip = clip_raw.unsqueeze(1)                            # (1,1,10,5,384)
-        # ---------------- bucket_clip (w=50,L=1) ------------------
-        bucket_raw = last_prompts[:, start : start + 50]             # (1,≤50,1,384)
         if bucket_raw.shape[1] < 50:
             pad_w = torch.zeros_like(bucket_raw[:, :50 - bucket_raw.shape[1]])
             bucket_raw = torch.cat([bucket_raw, pad_w], dim=1)
-        bucket_clip = bucket_raw.unsqueeze(1)                        # (1,1,50,1,384)
         motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16
         ref_list.append(ref_img[0])
-        audio_list.append(audio_pe(cond_clip).squeeze(0)[0])         # (10,1024)
-        uncond_list.append(audio_pe(torch.zeros_like(cond_clip)).squeeze(0)[0])
         motion_buckets.append(motion[0])
-    # ---------- Stable-Video-Diffusion 호출 -------------------------
     video = pipe(
         ref_img, clip_img, face_mask,
         audio_list, uncond_list, motion_buckets,
@@ -137,6 +132,9 @@ def test(
     return video.to(pipe.device).unsqueeze(0).cpu()
 # ------------------------------------------------------------------
 #                             Sonic class
 # ------------------------------------------------------------------

 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# ------------------------------------------------------------------
+#            single image + speech  →  video-tensor generator
+# ------------------------------------------------------------------
+# …(상단 import 및 기타 정의 동일)…
 # ------------------------------------------------------------------
 #            single image + speech  →  video-tensor generator
 # ------------------------------------------------------------------
     pipe, config, wav_enc, audio_pe, audio2bucket, image_encoder,
     width, height, batch,
 ):
+    # ---- 배치 차원 맞추기 -----------------------------------------
     for k, v in batch.items():
         if isinstance(v, torch.Tensor):
             batch[k] = v.unsqueeze(0).to(pipe.device).float()
+    ref_img   = batch["ref_img"]
     clip_img  = batch["clip_images"]
     face_mask = batch["face_mask"]
+    image_embeds = image_encoder(clip_img).image_embeds
+    audio_feature = batch["audio_feature"]
+    audio_len     = int(batch["audio_len"])
     step          = int(config.step)
+    window = 16_000                                                  # 1 초
     audio_prompts, last_prompts = [], []
     for i in range(0, audio_feature.shape[-1], window):
         chunk = audio_feature[:, :, i : i + window]
         layers = wav_enc.encoder(chunk, output_hidden_states=True).hidden_states
         last   = wav_enc.encoder(chunk).last_hidden_state.unsqueeze(-2)
+        audio_prompts.append(torch.stack(layers, dim=2))             # (1,w,L,384)
+        last_prompts.append(last)
     if not audio_prompts:
         raise ValueError("[ERROR] No speech recognised in the provided audio.")
     audio_prompts = torch.cat(audio_prompts, dim=1)
     last_prompts  = torch.cat(last_prompts,  dim=1)
     audio_prompts = torch.cat(
+        [torch.zeros_like(audio_prompts[:, :4]), audio_prompts,
          torch.zeros_like(audio_prompts[:, :6])], dim=1)
     last_prompts = torch.cat(
+        [torch.zeros_like(last_prompts[:, :24]), last_prompts,
          torch.zeros_like(last_prompts[:, :26])], dim=1)
     total_tokens = audio_prompts.shape[1]
     num_chunks   = max(1, math.ceil(total_tokens / (2 * step)))
     for i in tqdm(range(num_chunks)):
         start = i * 2 * step
+        # ------------ cond_clip : (1,1,10,5,384) ------------------
+        clip_raw = audio_prompts[:, start : start + 10]               # (1,≤10,L,384)
+        if clip_raw.shape[1] < 10:                                    # w-pad
             pad_w = torch.zeros_like(clip_raw[:, :10 - clip_raw.shape[1]])
             clip_raw = torch.cat([clip_raw, pad_w], dim=1)
+        # ★ L-pad → 정확히 5 레이어 만들기
+        while clip_raw.shape[2] < 5:
+            clip_raw = torch.cat([clip_raw, clip_raw[:, :, -1:]], dim=2)
+        clip_raw = clip_raw[:, :, :5]                                 # (1,10,5,384)
+        cond_clip = clip_raw.unsqueeze(1)                             # (1,1,10,5,384)
+        # ------------ bucket_clip : (1,1,50,1,384) -----------------
+        bucket_raw = last_prompts[:, start : start + 50]
         if bucket_raw.shape[1] < 50:
             pad_w = torch.zeros_like(bucket_raw[:, :50 - bucket_raw.shape[1]])
             bucket_raw = torch.cat([bucket_raw, pad_w], dim=1)
+        bucket_clip = bucket_raw.unsqueeze(1)                         # (1,1,50,1,384)
         motion = audio2bucket(bucket_clip, image_embeds) * 16 + 16
         ref_list.append(ref_img[0])
+        # ★ 여기: squeeze(0)만 (bz 제거). [0] 인덱싱 제거
+        audio_list.append(audio_pe(cond_clip).squeeze(0))             # (50,1024)
+        uncond_list.append(audio_pe(torch.zeros_like(cond_clip)).squeeze(0))
         motion_buckets.append(motion[0])
+    # ---- Stable Video Diffusion 호출 ------------------------------
     video = pipe(
         ref_img, clip_img, face_mask,
         audio_list, uncond_list, motion_buckets,
     return video.to(pipe.device).unsqueeze(0).cpu()
 # ------------------------------------------------------------------
 #                             Sonic class
 # ------------------------------------------------------------------