linoyts HF Staff commited on
Commit
b69b380
·
verified ·
1 Parent(s): 86cef43

Add README - relevant links and inference example

Browse files
Files changed (1) hide show
  1. README.md +58 -1
README.md CHANGED
@@ -1,5 +1,62 @@
1
  ---
2
  library_name: diffusers
 
3
  ---
 
 
4
 
5
- This is the `f1k1_x_g9_f1k1f2k2f16k4_td` FramePack for HY
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  library_name: diffusers
3
+ pipeline_tag: image-to-video
4
  ---
5
+ # FramePack - Video diffusion, but feels like image diffusion
6
+ [*Packing Input Frame Context in Next-Frame Prediction Models for Video Generation*](https://lllyasviel.github.io/frame_pack_gitpage/)
7
 
8
+ [**arxiv**](https://arxiv.org/abs/2504.12626), [**code**](https://github.com/lllyasviel/FramePack)
9
+
10
+ ---
11
+
12
+ This repo contains pre-trained `f1k1_x_g9_f1k1f2k2f16k4_td` weights and 🧨 `diffusers` inference code for FramePack with Hunyuan Video.
13
+
14
+ ## Inference with 🧨 Diffusers
15
+
16
+ ```python
17
+ import torch
18
+ from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
19
+ from diffusers.hooks import apply_group_offloading
20
+ from diffusers.utils import export_to_video, load_image
21
+ from transformers import SiglipImageProcessor, SiglipVisionModel
22
+
23
+ transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
24
+ "lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16
25
+ )
26
+ feature_extractor = SiglipImageProcessor.from_pretrained(
27
+ "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
28
+ )
29
+ image_encoder = SiglipVisionModel.from_pretrained(
30
+ "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
31
+ )
32
+ pipe = HunyuanVideoFramepackPipeline.from_pretrained(
33
+ "hunyuanvideo-community/HunyuanVideo",
34
+ transformer=transformer,
35
+ feature_extractor=feature_extractor,
36
+ image_encoder=image_encoder,
37
+ torch_dtype=torch.float16,
38
+ )
39
+ onload_device = torch.device("cuda")
40
+ offload_device = torch.device("cpu")
41
+ list(map(
42
+ lambda x: apply_group_offloading(x, onload_device, offload_device, offload_type="leaf_level", use_stream=True, low_cpu_mem_usage=True),
43
+ [pipe.text_encoder, pipe.text_encoder_2, pipe.transformer]
44
+ ))
45
+ pipe.image_encoder.to(onload_device)
46
+ pipe.vae.to(onload_device)
47
+ pipe.vae.enable_tiling()
48
+
49
+ image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png")
50
+ output = pipe(
51
+ image=image,
52
+ prompt="A penguin dancing in the snow",
53
+ height=832,
54
+ width=480,
55
+ num_frames=91,
56
+ num_inference_steps=30,
57
+ guidance_scale=9.0,
58
+ generator=torch.Generator().manual_seed(0),
59
+ ).frames[0]
60
+ print(f"Max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")
61
+ export_to_video(output, "output.mp4", fps=30)
62
+ ```