Commit
·
cdbde72
1
Parent(s):
8e6e7dd
rename
Browse files
app.py
CHANGED
|
@@ -1,19 +1,11 @@
|
|
| 1 |
-
import time
|
| 2 |
import os
|
| 3 |
-
|
| 4 |
|
| 5 |
import gradio as gr
|
| 6 |
-
|
| 7 |
-
# Authenticate with Hugging Face (token is auto-available in HF Spaces)
|
| 8 |
-
try:
|
| 9 |
-
login(token=os.environ.get("HF_TOKEN"))
|
| 10 |
-
print("Successfully authenticated with Hugging Face")
|
| 11 |
-
except Exception as e:
|
| 12 |
-
print(f"Warning: Could not authenticate with HF: {e}")
|
| 13 |
-
import spaces
|
| 14 |
import torch
|
| 15 |
from diffusers import AutoencoderKLCogVideoX, CogVideoXDDIMScheduler
|
| 16 |
from diffusers.utils import export_to_video
|
|
|
|
| 17 |
from PIL import Image
|
| 18 |
from transformers import T5EncoderModel, T5Tokenizer
|
| 19 |
|
|
@@ -21,22 +13,23 @@ from cogvideo_transformer import CustomCogVideoXTransformer3DModel
|
|
| 21 |
from EF_Net import EF_Net
|
| 22 |
from Sci_Fi_inbetweening_pipeline import CogVideoXEFNetInbetweeningPipeline
|
| 23 |
|
| 24 |
-
#
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 27 |
|
| 28 |
|
| 29 |
-
def
|
| 30 |
pretrained_model_path="LiuhanChen/Sci-Fi",
|
| 31 |
ef_net_path="weights/EF_Net.pth",
|
| 32 |
dtype_str="bfloat16",
|
| 33 |
):
|
| 34 |
-
"""
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
# Return early if pipeline is already loaded
|
| 38 |
-
if pipe is not None:
|
| 39 |
-
return "Pipeline already loaded!"
|
| 40 |
|
| 41 |
dtype = torch.float16 if dtype_str == "float16" else torch.bfloat16
|
| 42 |
|
|
@@ -48,29 +41,37 @@ def _load_pipeline_internal(
|
|
| 48 |
repo_id="LiuhanChen/Sci-Fi",
|
| 49 |
subfolder="EF_Net",
|
| 50 |
filename="EF_Net.pth",
|
| 51 |
-
local_dir="weights"
|
| 52 |
)
|
| 53 |
ef_net_path = "weights/EF_Net/EF_Net.pth"
|
| 54 |
print(f"EF-Net weights downloaded to {ef_net_path}")
|
| 55 |
|
| 56 |
# Load models from Hugging Face
|
|
|
|
| 57 |
tokenizer = T5Tokenizer.from_pretrained(
|
| 58 |
pretrained_model_path, subfolder="CogVideoX-5b-I2V/tokenizer"
|
| 59 |
)
|
| 60 |
text_encoder = T5EncoderModel.from_pretrained(
|
| 61 |
pretrained_model_path, subfolder="CogVideoX-5b-I2V/text_encoder"
|
| 62 |
)
|
|
|
|
|
|
|
| 63 |
transformer = CustomCogVideoXTransformer3DModel.from_pretrained(
|
| 64 |
pretrained_model_path, subfolder="CogVideoX-5b-I2V/transformer"
|
| 65 |
)
|
|
|
|
|
|
|
| 66 |
vae = AutoencoderKLCogVideoX.from_pretrained(
|
| 67 |
pretrained_model_path, subfolder="CogVideoX-5b-I2V/vae"
|
| 68 |
)
|
|
|
|
|
|
|
| 69 |
scheduler = CogVideoXDDIMScheduler.from_pretrained(
|
| 70 |
pretrained_model_path, subfolder="CogVideoX-5b-I2V/scheduler"
|
| 71 |
)
|
| 72 |
|
| 73 |
# Load EF-Net
|
|
|
|
| 74 |
EF_Net_model = (
|
| 75 |
EF_Net(num_layers=4, downscale_coef=8, in_channels=2, num_attention_heads=48)
|
| 76 |
.requires_grad_(False)
|
|
@@ -83,7 +84,8 @@ def _load_pipeline_internal(
|
|
| 83 |
print(f"[EF-Net loaded] Missing: {len(m)} | Unexpected: {len(u)}")
|
| 84 |
|
| 85 |
# Create pipeline
|
| 86 |
-
|
|
|
|
| 87 |
tokenizer=tokenizer,
|
| 88 |
text_encoder=text_encoder,
|
| 89 |
transformer=transformer,
|
|
@@ -91,20 +93,26 @@ def _load_pipeline_internal(
|
|
| 91 |
EF_Net_model=EF_Net_model,
|
| 92 |
scheduler=scheduler,
|
| 93 |
)
|
| 94 |
-
|
| 95 |
-
|
| 96 |
)
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
|
| 107 |
-
@spaces.GPU(duration=500)
|
| 108 |
def generate_inbetweening(
|
| 109 |
first_image: Image.Image,
|
| 110 |
last_image: Image.Image,
|
|
@@ -118,16 +126,6 @@ def generate_inbetweening(
|
|
| 118 |
progress=gr.Progress(),
|
| 119 |
):
|
| 120 |
"""Generate frame inbetweening video"""
|
| 121 |
-
global pipe
|
| 122 |
-
|
| 123 |
-
# Load pipeline on first use (lazy loading with GPU access)
|
| 124 |
-
if pipe is None:
|
| 125 |
-
progress(0, desc="Loading pipeline (first run)...")
|
| 126 |
-
try:
|
| 127 |
-
_load_pipeline_internal()
|
| 128 |
-
except Exception as e:
|
| 129 |
-
return None, f"ERROR: Failed to load pipeline: {str(e)}"
|
| 130 |
-
|
| 131 |
if first_image is None or last_image is None:
|
| 132 |
return None, "Please upload both start and end frames!"
|
| 133 |
|
|
@@ -177,7 +175,7 @@ with gr.Blocks(title="Sci-Fi: Frame Inbetweening") as demo:
|
|
| 177 |
|
| 178 |
Upload start and end frames to generate smooth inbetweening video.
|
| 179 |
|
| 180 |
-
**
|
| 181 |
"""
|
| 182 |
)
|
| 183 |
|
|
@@ -280,5 +278,5 @@ with gr.Blocks(title="Sci-Fi: Frame Inbetweening") as demo:
|
|
| 280 |
)
|
| 281 |
|
| 282 |
if __name__ == "__main__":
|
| 283 |
-
print("App
|
| 284 |
demo.launch()
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
import time
|
| 3 |
|
| 4 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
import torch
|
| 6 |
from diffusers import AutoencoderKLCogVideoX, CogVideoXDDIMScheduler
|
| 7 |
from diffusers.utils import export_to_video
|
| 8 |
+
from huggingface_hub import hf_hub_download, login
|
| 9 |
from PIL import Image
|
| 10 |
from transformers import T5EncoderModel, T5Tokenizer
|
| 11 |
|
|
|
|
| 13 |
from EF_Net import EF_Net
|
| 14 |
from Sci_Fi_inbetweening_pipeline import CogVideoXEFNetInbetweeningPipeline
|
| 15 |
|
| 16 |
+
# Authenticate with Hugging Face
|
| 17 |
+
try:
|
| 18 |
+
login(token=os.environ.get("HF_TOKEN"))
|
| 19 |
+
print("Successfully authenticated with Hugging Face")
|
| 20 |
+
except Exception as e:
|
| 21 |
+
print(f"Warning: Could not authenticate with HF: {e}")
|
| 22 |
+
|
| 23 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 24 |
|
| 25 |
|
| 26 |
+
def load_pipeline(
|
| 27 |
pretrained_model_path="LiuhanChen/Sci-Fi",
|
| 28 |
ef_net_path="weights/EF_Net.pth",
|
| 29 |
dtype_str="bfloat16",
|
| 30 |
):
|
| 31 |
+
"""Load the Sci-Fi pipeline at startup"""
|
| 32 |
+
print("Loading Sci-Fi pipeline...")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
dtype = torch.float16 if dtype_str == "float16" else torch.bfloat16
|
| 35 |
|
|
|
|
| 41 |
repo_id="LiuhanChen/Sci-Fi",
|
| 42 |
subfolder="EF_Net",
|
| 43 |
filename="EF_Net.pth",
|
| 44 |
+
local_dir="weights",
|
| 45 |
)
|
| 46 |
ef_net_path = "weights/EF_Net/EF_Net.pth"
|
| 47 |
print(f"EF-Net weights downloaded to {ef_net_path}")
|
| 48 |
|
| 49 |
# Load models from Hugging Face
|
| 50 |
+
print("Loading tokenizer and text encoder...")
|
| 51 |
tokenizer = T5Tokenizer.from_pretrained(
|
| 52 |
pretrained_model_path, subfolder="CogVideoX-5b-I2V/tokenizer"
|
| 53 |
)
|
| 54 |
text_encoder = T5EncoderModel.from_pretrained(
|
| 55 |
pretrained_model_path, subfolder="CogVideoX-5b-I2V/text_encoder"
|
| 56 |
)
|
| 57 |
+
|
| 58 |
+
print("Loading transformer...")
|
| 59 |
transformer = CustomCogVideoXTransformer3DModel.from_pretrained(
|
| 60 |
pretrained_model_path, subfolder="CogVideoX-5b-I2V/transformer"
|
| 61 |
)
|
| 62 |
+
|
| 63 |
+
print("Loading VAE...")
|
| 64 |
vae = AutoencoderKLCogVideoX.from_pretrained(
|
| 65 |
pretrained_model_path, subfolder="CogVideoX-5b-I2V/vae"
|
| 66 |
)
|
| 67 |
+
|
| 68 |
+
print("Loading scheduler...")
|
| 69 |
scheduler = CogVideoXDDIMScheduler.from_pretrained(
|
| 70 |
pretrained_model_path, subfolder="CogVideoX-5b-I2V/scheduler"
|
| 71 |
)
|
| 72 |
|
| 73 |
# Load EF-Net
|
| 74 |
+
print("Loading EF-Net...")
|
| 75 |
EF_Net_model = (
|
| 76 |
EF_Net(num_layers=4, downscale_coef=8, in_channels=2, num_attention_heads=48)
|
| 77 |
.requires_grad_(False)
|
|
|
|
| 84 |
print(f"[EF-Net loaded] Missing: {len(m)} | Unexpected: {len(u)}")
|
| 85 |
|
| 86 |
# Create pipeline
|
| 87 |
+
print("Creating pipeline...")
|
| 88 |
+
pipeline = CogVideoXEFNetInbetweeningPipeline(
|
| 89 |
tokenizer=tokenizer,
|
| 90 |
text_encoder=text_encoder,
|
| 91 |
transformer=transformer,
|
|
|
|
| 93 |
EF_Net_model=EF_Net_model,
|
| 94 |
scheduler=scheduler,
|
| 95 |
)
|
| 96 |
+
pipeline.scheduler = CogVideoXDDIMScheduler.from_config(
|
| 97 |
+
pipeline.scheduler.config, timestep_spacing="trailing"
|
| 98 |
)
|
| 99 |
|
| 100 |
+
print(f"Moving pipeline to {device}...")
|
| 101 |
+
pipeline.to(device)
|
| 102 |
+
pipeline = pipeline.to(dtype=dtype)
|
| 103 |
+
|
| 104 |
+
pipeline.vae.enable_slicing()
|
| 105 |
+
pipeline.vae.enable_tiling()
|
| 106 |
|
| 107 |
+
print("Pipeline loaded successfully!")
|
| 108 |
+
return pipeline
|
| 109 |
|
| 110 |
+
|
| 111 |
+
# Load pipeline at startup
|
| 112 |
+
print("Initializing Sci-Fi pipeline at startup...")
|
| 113 |
+
pipe = load_pipeline()
|
| 114 |
|
| 115 |
|
|
|
|
| 116 |
def generate_inbetweening(
|
| 117 |
first_image: Image.Image,
|
| 118 |
last_image: Image.Image,
|
|
|
|
| 126 |
progress=gr.Progress(),
|
| 127 |
):
|
| 128 |
"""Generate frame inbetweening video"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
if first_image is None or last_image is None:
|
| 130 |
return None, "Please upload both start and end frames!"
|
| 131 |
|
|
|
|
| 175 |
|
| 176 |
Upload start and end frames to generate smooth inbetweening video.
|
| 177 |
|
| 178 |
+
**Model is pre-loaded and ready to use!**
|
| 179 |
"""
|
| 180 |
)
|
| 181 |
|
|
|
|
| 278 |
)
|
| 279 |
|
| 280 |
if __name__ == "__main__":
|
| 281 |
+
print("App ready - pipeline is loaded and ready for inference!")
|
| 282 |
demo.launch()
|