Spaces:

fffiloni
/

ControlVideo

Paused

App Files Files Community

fffiloni committed on Jun 20, 2023

Commit

f6be7c6

1 Parent(s): 454eedf

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -20

app.py CHANGED Viewed

@@ -60,21 +60,21 @@ def get_args():
     args = parser.parse_args()
     return args
-if __name__ == "__main__":
-    args = get_args()
-    os.makedirs(args.output_path, exist_ok=True)
     # Height and width should be a multiple of 32
-    args.height = (args.height // 32) * 32
-    args.width = (args.width // 32) * 32
-    if args.condition == "pose":
         pretrained_model_or_path = "lllyasviel/ControlNet"
         body_model_path = hf_hub_download(pretrained_model_or_path, "annotator/ckpts/body_pose_model.pth", cache_dir="checkpoints")
         body_estimation = Body(body_model_path)
-        annotator = controlnet_parser_dict[args.condition](body_estimation)
     else:
-        annotator = controlnet_parser_dict[args.condition]()
     tokenizer = CLIPTokenizer.from_pretrained(sd_path, subfolder="tokenizer")
     text_encoder = CLIPTextModel.from_pretrained(sd_path, subfolder="text_encoder").to(dtype=torch.float16)
@@ -93,41 +93,43 @@ if __name__ == "__main__":
     pipe.to(device)
     generator = torch.Generator(device="cuda")
-    generator.manual_seed(args.seed)
     # Step 1. Read a video
-    video = read_video(video_path=args.video_path, video_length=args.video_length, width=args.width, height=args.height)
     # Save source video
     original_pixels = rearrange(video, "(b f) c h w -> b c f h w", b=1)
-    save_videos_grid(original_pixels, os.path.join(args.output_path, "source_video.mp4"), rescale=True)
     # Step 2. Parse a video to conditional frames
     pil_annotation = get_annotation(video, annotator)
-    if args.condition == "depth" and controlnet_aux.__version__ == '0.0.1':
         pil_annotation = [pil_annot[0] for pil_annot in pil_annotation]
     # Save condition video
     video_cond = [np.array(p).astype(np.uint8) for p in pil_annotation]
-    imageio.mimsave(os.path.join(args.output_path, f"{args.condition}_condition.mp4"), video_cond, fps=8)
     # Reduce memory (optional)
     del annotator; torch.cuda.empty_cache()
     # Step 3. inference
-    if args.is_long_video:
-        window_size = int(np.sqrt(args.video_length))
-        sample = pipe.generate_long_video(args.prompt + POS_PROMPT, video_length=args.video_length, frames=pil_annotation,
                     num_inference_steps=50, smooth_steps=args.smoother_steps, window_size=window_size,
                     generator=generator, guidance_scale=12.5, negative_prompt=NEG_PROMPT,
-                    width=args.width, height=args.height
                 ).videos
     else:
-        sample = pipe(args.prompt + POS_PROMPT, video_length=args.video_length, frames=pil_annotation,
                     num_inference_steps=50, smooth_steps=args.smoother_steps,
                     generator=generator, guidance_scale=12.5, negative_prompt=NEG_PROMPT,
-                    width=args.width, height=args.height
                 ).videos
-    save_videos_grid(sample, f"{args.output_path}/{args.prompt}.mp4")

     args = parser.parse_args()
     return args
def infer(prompt, video_path, output_path, condition, video_length, height, width, smoother_steps, is_long_video, seed):
    """Generate a condition-guided video from a source video and return its path.

    The source video is read, converted to per-frame condition images with
    the annotator selected by ``condition``, and passed to the pipeline
    together with ``prompt``. Three files are written under ``output_path``:
    ``source_video.mp4``, ``{condition}_condition.mp4`` and ``{prompt}.mp4``.

    Args:
        prompt: Text prompt; ``POS_PROMPT`` is appended before inference.
        video_path: Path of the source video handed to ``read_video``.
        output_path: Directory that receives the saved videos.
        condition: Key into ``controlnet_parser_dict``; ``"pose"``
            additionally downloads and loads the body pose model.
        video_length: Frame count forwarded to ``read_video`` and the pipeline.
        height: Requested height; rounded down to a multiple of 32.
        width: Requested width; rounded down to a multiple of 32.
        smoother_steps: Forwarded to the pipeline as ``smooth_steps``.
        is_long_video: When truthy, use ``pipe.generate_long_video`` with a
            window size of ``int(sqrt(video_length))``.
        seed: Seed for the CUDA ``torch.Generator``.

    Returns:
        The path ``f"{output_path}/{prompt}.mp4"`` of the generated video.
    """
    # The save calls below write into output_path, so make sure it exists.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Height and width should be a multiple of 32
    height = (height // 32) * 32
    width = (width // 32) * 32

    if condition == "pose":
        pretrained_model_or_path = "lllyasviel/ControlNet"
        body_model_path = hf_hub_download(pretrained_model_or_path, "annotator/ckpts/body_pose_model.pth", cache_dir="checkpoints")
        body_estimation = Body(body_model_path)
        annotator = controlnet_parser_dict[condition](body_estimation)
    else:
        annotator = controlnet_parser_dict[condition]()

    tokenizer = CLIPTokenizer.from_pretrained(sd_path, subfolder="tokenizer")
    text_encoder = CLIPTextModel.from_pretrained(sd_path, subfolder="text_encoder").to(dtype=torch.float16)
    # NOTE(review): the diff view elides the lines between its two hunks here;
    # `pipe` (and `device`, `sd_path`) are defined outside the visible lines.
    # Restore the elided statements from app.py before relying on this block.
    pipe.to(device)

    generator = torch.Generator(device="cuda")
    generator.manual_seed(seed)

    # Step 1. Read a video
    video = read_video(video_path=video_path, video_length=video_length, width=width, height=height)

    # Save source video
    original_pixels = rearrange(video, "(b f) c h w -> b c f h w", b=1)
    save_videos_grid(original_pixels, os.path.join(output_path, "source_video.mp4"), rescale=True)

    # Step 2. Parse a video to conditional frames
    pil_annotation = get_annotation(video, annotator)
    if condition == "depth" and controlnet_aux.__version__ == '0.0.1':
        # With this controlnet_aux version each depth result is indexable and
        # only its first element is kept as the condition frame.
        pil_annotation = [pil_annot[0] for pil_annot in pil_annotation]

    # Save condition video
    video_cond = [np.array(p).astype(np.uint8) for p in pil_annotation]
    imageio.mimsave(os.path.join(output_path, f"{condition}_condition.mp4"), video_cond, fps=8)

    # Reduce memory (optional)
    del annotator
    torch.cuda.empty_cache()

    # Step 3. inference
    # `smoother_steps` is the function parameter; there is no `args` namespace
    # in this scope, so it must not be read from `args`.
    if is_long_video:
        window_size = int(np.sqrt(video_length))
        sample = pipe.generate_long_video(prompt + POS_PROMPT, video_length=video_length, frames=pil_annotation,
                    num_inference_steps=50, smooth_steps=smoother_steps, window_size=window_size,
                    generator=generator, guidance_scale=12.5, negative_prompt=NEG_PROMPT,
                    width=width, height=height
                ).videos
    else:
        sample = pipe(prompt + POS_PROMPT, video_length=video_length, frames=pil_annotation,
                    num_inference_steps=50, smooth_steps=smoother_steps,
                    generator=generator, guidance_scale=12.5, negative_prompt=NEG_PROMPT,
                    width=width, height=height
                ).videos

    result_path = f"{output_path}/{prompt}.mp4"
    save_videos_grid(sample, result_path)
    return result_path