Spaces:

Skywork
/

skyreels-a1-talking-head

Running on L40S

App Files Community

diqiu7 committed on Mar 16

Commit

8f86798

verified ·

1 Parent(s): 6e29211

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -24

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 os.system("pip install pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt221/download.html")
 import shutil
 from huggingface_hub import snapshot_download
@@ -103,6 +104,22 @@ def save_video_with_audio(video_path, audio_path, save_path):
     audio_clip.close()
     return save_path
 # Global parameters
 model_name = "pretrained_models/SkyReels-A1-5B/"
 siglip_name = "pretrained_models/SkyReels-A1-5B/siglip-so400m-patch14-384"
@@ -204,9 +221,13 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
     out_frames = processor.preprocess_lmk3d_from_coef(
         source_outputs, source_tform, image_original.shape, driving_outputs
     )
-    out_frames = parse_video(out_frames, max_frame_num)
-    rescale_motions = np.zeros_like(image)[np.newaxis, :].repeat(48, axis=0)
     for ii in range(rescale_motions.shape[0]):
         rescale_motions[ii][y1:y1+face_h, x1:x1+face_w] = out_frames[ii]
@@ -222,8 +243,8 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
     first_motion[y1:y1+face_h, x1:x1+face_w] = ref_img
     first_motion = first_motion[np.newaxis, :]
-    motions = np.concatenate([first_motion, rescale_motions])
-    input_video = motions[:max_frame_num]
     # Face alignment
     face_helper.clean_all()
@@ -234,29 +255,44 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
     image_face = align_face[:, :, ::-1]
     # Prepare input video
-    input_video = torch.from_numpy(np.array(input_video)).permute([3, 0, 1, 2]).unsqueeze(0)
-    input_video = input_video / 255
     progress(0.6, desc="Generating animation (this may take a while)...")
     # Generate video
-    with torch.no_grad():
-        sample = pipe(
-            image=image,
-            image_face=image_face,
-            control_video=input_video,
-            prompt="",
-            negative_prompt="",
-            height=480,
-            width=720,
-            num_frames=49,
-            # generator=generator,
-            guidance_scale=guidance_scale,
-            num_inference_steps=steps,
-        )
-    out_samples = sample.frames[0]
-    out_samples = out_samples[2:]  # Skip first two frames
     progress(0.8, desc="Creating output video...")
     # Export video
     export_to_video(out_samples, temp_video_path, fps=12)

 import os
 os.system("pip install pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt221/download.html")
 import shutil
+import math
 from huggingface_hub import snapshot_download
     audio_clip.close()
     return save_path
+def pad_video(driving_frames, fps=25, target_fps=12, chunk_size=48):
+    """Resample driving frames to ``target_fps`` and pad to whole chunks.
+
+    Frames are picked at ``1 / target_fps`` second intervals from a sequence
+    treated as running at ``fps`` frames per second. The last picked frame is
+    then repeated until the frame count is a multiple of ``chunk_size``.
+
+    Args:
+        driving_frames: Indexable sequence of frames (must support ``len``).
+        fps: Frame rate of ``driving_frames``.
+        target_fps: Frame rate to resample to.
+        chunk_size: The returned list length is a multiple of this value.
+
+    Returns:
+        A ``(frames, pad_length)`` tuple: the resampled, padded list and the
+        number of trailing entries that are padding. Empty input yields
+        ``([], 0)``.
+    """
+    video_length = len(driving_frames)
+    if video_length == 0:
+        # No frame exists to repeat as padding; indexing [-1] below would
+        # raise IndexError, so return the (already chunk-aligned) empty result.
+        return [], 0
+    duration = video_length / fps
+    # Timestamps (seconds) of the frames wanted at the target rate.
+    target_times = np.arange(0, duration, 1 / target_fps)
+    # Map each timestamp to the source frame being shown at that moment.
+    frame_indices = (target_times * fps).astype(np.int32)
+    # Guard against an index landing one past the end of the sequence.
+    frame_indices = frame_indices[frame_indices < video_length]
+    new_driving_frames = [driving_frames[idx] for idx in frame_indices]
+    frame_count = len(new_driving_frames)
+    pad_length = math.ceil(frame_count / chunk_size) * chunk_size - frame_count
+    # Repeat the final frame so the sequence splits into full chunks.
+    new_driving_frames.extend([new_driving_frames[-1]] * pad_length)
+    return new_driving_frames, pad_length
 # Global parameters
 model_name = "pretrained_models/SkyReels-A1-5B/"
 siglip_name = "pretrained_models/SkyReels-A1-5B/siglip-so400m-patch14-384"
     out_frames = processor.preprocess_lmk3d_from_coef(
         source_outputs, source_tform, image_original.shape, driving_outputs
     )
+    out_frames, pad_length = pad_video(out_frames)
+    print(len(out_frames), pad_length)
+    # out_frames = parse_video(out_frames, max_frame_num)
+    rescale_motions = np.zeros_like(image)[np.newaxis, :].repeat(len(out_frames), axis=0)
     for ii in range(rescale_motions.shape[0]):
         rescale_motions[ii][y1:y1+face_h, x1:x1+face_w] = out_frames[ii]
     first_motion[y1:y1+face_h, x1:x1+face_w] = ref_img
     first_motion = first_motion[np.newaxis, :]
+    # motions = np.concatenate([first_motion, rescale_motions])
+    # input_video = motions[:max_frame_num]
     # Face alignment
     face_helper.clean_all()
     image_face = align_face[:, :, ::-1]
     # Prepare input video
+    # input_video = torch.from_numpy(np.array(input_video)).permute([3, 0, 1, 2]).unsqueeze(0)
+    # input_video = input_video / 255
     progress(0.6, desc="Generating animation (this may take a while)...")
     # Generate video
+    out_samples = []
+    for i in range(0, len(rescale_motions), 48):
+        motions = np.concatenate([first_motion, rescale_motions[i:i+48]])
+        input_video = motions
+        input_video = torch.from_numpy(np.array(input_video)).permute([3, 0, 1, 2]).unsqueeze(0)
+        input_video = input_video / 255
+        with torch.no_grad():
+            sample = pipe(
+                image=image,
+                image_face=image_face,
+                control_video=input_video,
+                prompt="",
+                negative_prompt="",
+                height=480,
+                width=720,
+                num_frames=49,
+                # generator=generator,
+                guidance_scale=guidance_scale,
+                num_inference_steps=steps,
+            )
+            if i == 0:
+                out_samples.extend(sample.frames[0])
+            else:
+                out_samples.extend(sample.frames[0][1:])
+    # out_samples = sample.frames[0]
+    # out_samples = out_samples[2:]  # Skip first two frames
+    if pad_length == 0:
+        out_samples = out_samples[1:]
+    else:
+        out_samples = out_samples[1:-pad_length]
     progress(0.8, desc="Creating output video...")
     # Export video
     export_to_video(out_samples, temp_video_path, fps=12)