Update sonic.py

sonic.py CHANGED
@@ -33,9 +33,11 @@ def test(
     height,
     batch
 ):
+    # Reshape batch tensors to (1, B, C, H, W)
     for k, v in batch.items():
         if isinstance(v, torch.Tensor):
             batch[k] = v.unsqueeze(0).to(pipe.device).float()
+
     ref_img = batch['ref_img']
     clip_img = batch['clip_images']
     face_mask = batch['face_mask']
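The added comment documents the batch lifting done at the top of test(). A minimal standalone sketch of the same pattern; the dict contents and device string are illustrative stand-ins for the real batch and pipe.device:

import torch

# Each tensor gains a leading batch axis and moves to the device as float32.
batch = {"ref_img": torch.rand(3, 512, 512), "audio_len": 40}
device = "cpu"  # stand-in for pipe.device
for k, v in batch.items():
    if isinstance(v, torch.Tensor):
        batch[k] = v.unsqueeze(0).to(device).float()

print(batch["ref_img"].shape)  # torch.Size([1, 3, 512, 512])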
@@ -45,11 +47,11 @@ def test(
     audio_len = batch['audio_len']
     step = int(config.step)

-    #
-
-    window = 16000  # (process chunks in 1-second units)
+    # window changed 3000 -> 16000 (1-second intervals)
+    window = 16000
     audio_prompts = []
     last_audio_prompts = []
+
     for i in range(0, audio_feature.shape[-1], window):
         audio_clip_chunk = audio_feature[:, :, i:i+window]
         # Whisper encoder
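Whisper consumes 16 kHz audio, so a window of 16000 samples corresponds to one second. A minimal sketch of the chunking loop above; the feature tensor shape here is an assumption for illustration, not the repo's real layout:

import torch

SAMPLE_RATE = 16000
window = SAMPLE_RATE  # 1-second chunks, as in the diff

audio_feature = torch.randn(1, 80, 3 * SAMPLE_RATE + 500)  # ~3.03 s worth
chunks = [audio_feature[:, :, i:i + window]
          for i in range(0, audio_feature.shape[-1], window)]

# The last chunk may be shorter than `window`.
print([c.shape[-1] for c in chunks])  # [16000, 16000, 16000, 500]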
@@ -61,30 +63,38 @@ def test(
         audio_prompts.append(audio_prompt)
         last_audio_prompts.append(last_audio_prompt)

-    #
+    # ★ raise here if nothing was recognized
     if len(audio_prompts) == 0:
         raise ValueError(
             "[ERROR] No speech recognized from the audio. "
             "Please provide a valid speech audio (with clear voice)."
         )
-    # -------------------------------------------------------------

     audio_prompts = torch.cat(audio_prompts, dim=1)
-    # the audio_len*2 slice applies padding required by the model's internal logic
     audio_prompts = audio_prompts[:, :audio_len*2]
-    audio_prompts = torch.cat([torch.zeros_like(audio_prompts[:, :4]), audio_prompts, torch.zeros_like(audio_prompts[:, :6])], dim=1)
+    audio_prompts = torch.cat([
+        torch.zeros_like(audio_prompts[:, :4]),
+        audio_prompts,
+        torch.zeros_like(audio_prompts[:, :6])
+    ], dim=1)

     last_audio_prompts = torch.cat(last_audio_prompts, dim=1)
     last_audio_prompts = last_audio_prompts[:, :audio_len*2]
-    last_audio_prompts = torch.cat([torch.zeros_like(last_audio_prompts[:, :24]), last_audio_prompts, torch.zeros_like(last_audio_prompts[:, :26])], dim=1)
+    last_audio_prompts = torch.cat([
+        torch.zeros_like(last_audio_prompts[:, :24]),
+        last_audio_prompts,
+        torch.zeros_like(last_audio_prompts[:, :26])
+    ], dim=1)

     ref_tensor_list = []
     audio_tensor_list = []
     uncond_audio_tensor_list = []
     motion_buckets = []
+
     for i in tqdm(range(audio_len // step)):
         audio_clip = audio_prompts[:, i*2*step : i*2*step + 10].unsqueeze(0)
         audio_clip_for_bucket = last_audio_prompts[:, i*2*step : i*2*step + 50].unsqueeze(0)
+
         motion_bucket = audio2bucket(audio_clip_for_bucket, image_embeds)
         motion_bucket = motion_bucket * 16 + 16
         motion_buckets.append(motion_bucket[0])
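The zero-padding sizes are chosen so the 10-wide and 50-wide windows sliced in the loop never run off the end of the prompt tensors. A sketch of the indexing with illustrative numbers (the feature width 384 is an assumption):

import torch

audio_len, step = 40, 2
prompts = torch.randn(1, audio_len * 2, 384)  # 2 feature frames per video frame

# Pad 4 frames in front and 6 behind, mirroring the diff.
prompts = torch.cat([
    torch.zeros_like(prompts[:, :4]),
    prompts,
    torch.zeros_like(prompts[:, :6]),
], dim=1)

for i in range(audio_len // step):
    clip = prompts[:, i*2*step : i*2*step + 10]
    assert clip.shape[1] == 10  # full window even at the last step

The 24/26 padding on last_audio_prompts plays the same role for the 50-wide bucket windows: with these numbers, 24 + 80 + 26 = 130, which covers the last slice ending at 76 + 50 = 126.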
@@ -138,29 +148,33 @@ class Sonic():
         config = self.config
         config.use_interframe = enable_interpolate_frame

-        device = 'cuda:{}'
-
+        device = f'cuda:{device_id}' if device_id > -1 else 'cpu'
         config.pretrained_model_name_or_path = os.path.join(BASE_DIR, config.pretrained_model_name_or_path)

+        # VAE
         vae = AutoencoderKLTemporalDecoder.from_pretrained(
             config.pretrained_model_name_or_path,
             subfolder="vae",
             variant="fp16")

+        # Scheduler
         val_noise_scheduler = EulerDiscreteScheduler.from_pretrained(
             config.pretrained_model_name_or_path,
             subfolder="scheduler")

+        # CLIP Vision
         image_encoder = CLIPVisionModelWithProjection.from_pretrained(
             config.pretrained_model_name_or_path,
             subfolder="image_encoder",
             variant="fp16")

+        # UNet
         unet = UNetSpatioTemporalConditionModel.from_pretrained(
             config.pretrained_model_name_or_path,
             subfolder="unet",
             variant="fp16")

+        # Adapter
         add_ip_adapters(unet, [32], [config.ip_audio_scale])

         audio2token = AudioProjModel(
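The old line left the format placeholder unfilled ('cuda:{}'), which is not a valid torch device string; the fix formats device_id and falls back to CPU. A quick check of the fixed expression, with device_id chosen here purely for illustration:

import torch

device_id = 0 if torch.cuda.is_available() else -1
device = f'cuda:{device_id}' if device_id > -1 else 'cpu'
print(torch.device(device))  # cuda:0 on a GPU machine, cpu otherwise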
@@ -174,6 +188,7 @@ class Sonic():
             context_tokens=2
         ).to(device)

+        # Load local checkpoints
         unet_checkpoint_path = os.path.join(BASE_DIR, config.unet_checkpoint_path)
         audio2token_checkpoint_path = os.path.join(BASE_DIR, config.audio2token_checkpoint_path)
         audio2bucket_checkpoint_path = os.path.join(BASE_DIR, config.audio2bucket_checkpoint_path)
@@ -193,6 +208,7 @@ class Sonic():
             strict=True,
         )

+        # Set weight_dtype
         if config.weight_dtype == "fp16":
             weight_dtype = torch.float16
         elif config.weight_dtype == "fp32":
@@ -200,26 +216,34 @@ class Sonic():
         elif config.weight_dtype == "bf16":
             weight_dtype = torch.bfloat16
         else:
-            raise ValueError(
-                f"Do not support weight dtype: {config.weight_dtype}"
-            )
+            raise ValueError(f"Do not support weight dtype: {config.weight_dtype}")

-        whisper = WhisperModel.from_pretrained(os.path.join(BASE_DIR, 'checkpoints/whisper-tiny/')).to(device).eval()
+        # Whisper
+        whisper = WhisperModel.from_pretrained(
+            os.path.join(BASE_DIR, 'checkpoints/whisper-tiny/')
+        ).to(device).eval()
         whisper.requires_grad_(False)

-        self.feature_extractor = AutoFeatureExtractor.from_pretrained(os.path.join(BASE_DIR, 'checkpoints/whisper-tiny/'))
+        self.feature_extractor = AutoFeatureExtractor.from_pretrained(
+            os.path.join(BASE_DIR, 'checkpoints/whisper-tiny/')
+        )

+        # Face detect
         det_path = os.path.join(BASE_DIR, 'checkpoints/yoloface_v5m.pt')
         self.face_det = AlignImage(device, det_path=det_path)
+
+        # RIFE intermediate-frame interpolation
         if config.use_interframe:
             rife = RIFEModel(device=device)
             rife.load_model(os.path.join(BASE_DIR, 'checkpoints', 'RIFE/'))
             self.rife = rife

+        # Cast to weight dtype
         image_encoder.to(weight_dtype)
         vae.to(weight_dtype)
         unet.to(weight_dtype)

+        # Initialize SonicPipeline
         pipe = SonicPipeline(
             unet=unet,
             image_encoder=image_encoder,
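The dtype ladder now ends in a one-line ValueError. An equivalent table-driven sketch that keeps the same error message; this is an alternative formulation, not the repo's code:

import torch

DTYPE_MAP = {
    "fp16": torch.float16,
    "fp32": torch.float32,
    "bf16": torch.bfloat16,
}

def resolve_weight_dtype(name: str) -> torch.dtype:
    try:
        return DTYPE_MAP[name]
    except KeyError:
        raise ValueError(f"Do not support weight dtype: {name}") from None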
@@ -237,13 +261,13 @@ class Sonic():

         print('Sonic init done')

-
     def preprocess(self, image_path, expand_ratio=1.0):
         face_image = cv2.imread(image_path)
         h, w = face_image.shape[:2]
         _, _, bboxes = self.face_det(face_image, maxface=True)
         face_num = len(bboxes)
         bbox_s = None
+
         if face_num > 0:
             x1, y1, ww, hh = bboxes[0]
             x2, y2 = x1 + ww, y1 + hh
@@ -270,7 +294,7 @@ class Sonic():
             dynamic_scale=1.0,
             keep_resolution=False,
             seed=None):
-
+
         config = self.config
         device = self.device
         pipe = self.pipe
@@ -279,6 +303,7 @@ class Sonic():
         audio2bucket = self.audio2bucket
         image_encoder = self.image_encoder

+        # Seed setup
         if seed:
             config.seed = seed
         config.num_inference_steps = inference_steps
@@ -288,17 +313,16 @@ class Sonic():
         video_path = output_path.replace('.mp4', '_noaudio.mp4')
         audio_video_path = output_path

-        #
+        # Audio + image -> tensors
         test_data = image_audio_to_tensor(
             self.face_det,
             self.feature_extractor,
             image_path,
             audio_path,
-            limit=-1, #
+            limit=-1,  # use the full audio
             image_size=min_resolution,
             area=config.area
         )
-
         if test_data is None:
             return -1

@@ -310,6 +334,7 @@ class Sonic():
         else:
             resolution = f'{width}x{height}'

+        # Call test(...) here
         video = test(
             pipe,
             config,
@@ -322,7 +347,7 @@ class Sonic():
             batch=test_data,
         )

-        # intermediate-frame interpolation
+        # intermediate-frame interpolation
         if config.use_interframe:
             rife = self.rife
             out = video.to(device)
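With interpolation enabled, one synthesized frame is inserted between every adjacent pair, which is why the save call below writes at config.fps * 2. A sketch of the interleaving pattern; interpolate is a stand-in for RIFEModel's middle-frame inference, not its real API:

import torch

def interleave(frames: torch.Tensor, interpolate) -> torch.Tensor:
    # frames: (B, C, T, H, W) -> (B, C, 2T-1, H, W)
    out = []
    T = frames.shape[2]
    for t in range(T - 1):
        out.append(frames[:, :, t])
        out.append(interpolate(frames[:, :, t], frames[:, :, t + 1]))
    out.append(frames[:, :, T - 1])  # keep the final original frame
    return torch.stack(out, dim=2)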
@@ -337,6 +362,12 @@ class Sonic():
             results.append(out[:, :, video_len - 1])
         video = torch.stack(results, 2).cpu()

+        # Save the video
         save_videos_grid(video, video_path, n_rows=video.shape[0], fps=config.fps * (2 if config.use_interframe else 1))
-        os.system(f"ffmpeg -i '{video_path}' -i '{audio_path}' -s {resolution} -vcodec libx264 -acodec aac -crf 18 -shortest '{audio_video_path}' -y; rm '{video_path}'")
+
+        # Mux the audio back in for the final mp4
+        os.system(
+            f"ffmpeg -i '{video_path}' -i '{audio_path}' -s {resolution} "
+            f"-vcodec libx264 -acodec aac -crf 18 -shortest '{audio_video_path}' -y; rm '{video_path}'"
+        )
         return 0
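The ffmpeg call muxes the silent render with the source audio: -s sets the output frame size, -crf 18 is a near-lossless H.264 quality setting, and -shortest trims to the shorter of the two streams. A subprocess-based sketch of the same step that sidesteps shell quoting; this is an alternative to the commit's os.system call, not what it uses:

import os
import subprocess

def mux_audio(video_path: str, audio_path: str, out_path: str, resolution: str) -> None:
    # -y overwrites the output if it exists; check=True raises on ffmpeg failure.
    subprocess.run([
        "ffmpeg", "-y", "-i", video_path, "-i", audio_path,
        "-s", resolution,
        "-vcodec", "libx264", "-acodec", "aac",
        "-crf", "18", "-shortest", out_path,
    ], check=True)
    os.remove(video_path)  # mirrors the `rm` in the shell command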