# File size: 11,012 Bytes
# 29cc382
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
import os
import pandas as pd
import torch
import numpy as np
import random

from diffusers import StableDiffusionPipeline
from diffusers.utils import export_to_video

# Select which GPU this script uses by setting CUDA_VISIBLE_DEVICES
# (device index 0 here; adjust as needed)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def set_seed(seed: int = 42):
    """
    Seed every random number generator this script relies on

    Parameters:
      seed: Value handed to Python's random module, NumPy and torch
        (CPU, current CUDA device and all CUDA devices)

    Also switches cuDNN to deterministic mode and turns off its benchmark
    mode.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # Covers the multi-GPU case
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed the random number generators with 42 as soon as this module is executed
# (this runs on import as well as when the file is run as a script)
set_seed(42)

def generate_image(pipeline, prompt: str, output_path: str):
    """
    Render one image for a prompt and write it to disk

    Parameters:
      pipeline: Callable image pipeline; it is called with the prompt and its
        result must expose an ``images`` sequence
      prompt: Text description passed to the pipeline
      output_path: File path handed to the image's ``save`` method
    """
    # Inference runs inside a CUDA autocast context; saving happens outside it
    autocast_context = torch.autocast("cuda")
    with autocast_context:
        result = pipeline(prompt)
        first_image = result.images[0]
    first_image.save(output_path)

import torch
from diffusers.utils import export_to_video  # Ensure these methods are correctly imported

def generate_video(pipeline, pipeline_type: str, prompt: str, output_path: str, **kwargs):
    """
    Generate a video with one of the supported pipelines and write it to disk

    Parameters:
      pipeline: Loaded video generation pipeline; it is called with keyword
        arguments and its result must expose ``frames`` (``frames[0]`` is exported)
      pipeline_type: Type of video model, options are "cogvideo", "ltx", "hunyuan", "animatediff"
      prompt: Text description
      output_path: Output file path. A path ending in ".gif" is written with
        export_to_gif; any other path is written with export_to_video
      kwargs: Optional hyperparameter overrides. Only the keys listed here are
        read for each pipeline type; other keys are ignored:
          cogvideo:    num_videos_per_prompt, num_inference_steps, num_frames,
                       guidance_scale, generator, fps
          ltx:         negative_prompt, width, height, num_frames,
                       num_inference_steps, fps
          hunyuan:     width, height, num_frames, num_inference_steps, fps
          animatediff: guidance_scale, num_inference_steps, fps

    Raises:
      ValueError: If pipeline_type is not one of the supported options
    """
    if pipeline_type == "cogvideo":
        # Build the default CUDA generator only when the caller did not pass
        # one, so a caller-supplied generator never triggers a CUDA allocation.
        if "generator" in kwargs:
            generator = kwargs["generator"]
        else:
            generator = torch.Generator(device="cuda").manual_seed(42)
        call_args = {
            "num_videos_per_prompt": kwargs.get("num_videos_per_prompt", 1),
            "num_inference_steps": kwargs.get("num_inference_steps", 50),
            "num_frames": kwargs.get("num_frames", 49),
            "guidance_scale": kwargs.get("guidance_scale", 6),
            "generator": generator,
        }
        fps = kwargs.get("fps", 8)
    elif pipeline_type == "ltx":
        call_args = {
            "negative_prompt": kwargs.get(
                "negative_prompt", "worst quality, inconsistent motion, blurry, jittery, distorted"
            ),
            "width": kwargs.get("width", 704),
            "height": kwargs.get("height", 480),
            "num_frames": kwargs.get("num_frames", 161),
            "num_inference_steps": kwargs.get("num_inference_steps", 50),
        }
        fps = kwargs.get("fps", 15)
    elif pipeline_type == "hunyuan":
        call_args = {
            "width": kwargs.get("width", 512),
            "height": kwargs.get("height", 320),
            "num_frames": kwargs.get("num_frames", 61),
            "num_inference_steps": kwargs.get("num_inference_steps", 30),
        }
        fps = kwargs.get("fps", 15)
    elif pipeline_type == "animatediff":
        call_args = {
            "guidance_scale": kwargs.get("guidance_scale", 1.0),
            # Default step is 4, options are 1,2,4,8
            "num_inference_steps": kwargs.get("num_inference_steps", 4),
        }
        # No script-level default here: None means "use the exporter's own default fps"
        fps = kwargs.get("fps")
    else:
        raise ValueError(f"Unknown pipeline type: {pipeline_type}")

    frames = pipeline(prompt=prompt, **call_args).frames[0]

    # Only forward fps when one was chosen, so the exporter default still applies otherwise
    export_args = {} if fps is None else {"fps": fps}
    if output_path.lower().endswith(".gif"):
        # Imported here because only GIF output needs it
        from diffusers.utils import export_to_gif
        export_to_gif(frames, output_path, **export_args)
    else:
        export_to_video(frames, output_path, **export_args)

def load_video_pipeline(pipeline_type: str):
    """
    Build and return the video generation pipeline named by pipeline_type

    Parameters:
      pipeline_type: One of "cogvideo", "ltx", "hunyuan", "animatediff"
    Returns:
      The constructed pipeline for the requested model
    Raises:
      ValueError: If pipeline_type is not one of the names above
    """
    # Every branch imports its own classes, so only the selected model's
    # modules are imported.
    if pipeline_type == "cogvideo":
        from diffusers import CogVideoXPipeline

        print("Loading video generation model (CogVideoX-5b)...")
        cog_pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
        cog_pipe.vae.enable_slicing()
        cog_pipe.vae.enable_tiling()
        cog_pipe.to("cuda")
        return cog_pipe

    if pipeline_type == "ltx":
        from diffusers import LTXPipeline

        print("Loading video generation model (LTX-Video)...")
        ltx_pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
        ltx_pipe.to("cuda")
        return ltx_pipe

    if pipeline_type == "hunyuan":
        from diffusers import BitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
        from diffusers.hooks import apply_layerwise_casting
        from transformers import LlamaModel

        print("Loading video generation model (HunyuanVideo)...")
        hub_id = "hunyuanvideo-community/HunyuanVideo"

        # 4-bit quantization settings for the transformer, computing in bfloat16
        four_bit_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

        # Text encoder: loaded in float16, then cast layer-wise to float8 storage
        llama_encoder = LlamaModel.from_pretrained(
            hub_id,
            subfolder="text_encoder",
            torch_dtype=torch.float16,
        )
        apply_layerwise_casting(
            llama_encoder,
            storage_dtype=torch.float8_e4m3fn,
            compute_dtype=torch.float16,
        )

        video_transformer = HunyuanVideoTransformer3DModel.from_pretrained(
            hub_id,
            subfolder="transformer",
            quantization_config=four_bit_config,
            torch_dtype=torch.bfloat16,
        )
        hunyuan_pipe = HunyuanVideoPipeline.from_pretrained(
            hub_id,
            transformer=video_transformer,
            text_encoder=llama_encoder,
            torch_dtype=torch.float16,
        )
        hunyuan_pipe.vae.enable_tiling()
        # This branch uses model CPU offload instead of an explicit .to("cuda")
        hunyuan_pipe.enable_model_cpu_offload()
        return hunyuan_pipe

    if pipeline_type == "animatediff":
        from diffusers import AnimateDiffPipeline, MotionAdapter, EulerDiscreteScheduler
        from huggingface_hub import hf_hub_download
        from safetensors.torch import load_file

        print("Loading video generation model (AnimateDiff-Lightning)...")
        target_device = "cuda"
        weights_dtype = torch.float16
        step = 4  # Options: [1,2,4,8], default is 4
        lightning_repo = "ByteDance/AnimateDiff-Lightning"
        checkpoint_name = f"animatediff_lightning_{step}step_diffusers.safetensors"
        base_model = "emilianJR/epiCRealism"  # Choose base model as preferred

        motion_adapter = MotionAdapter().to(target_device, weights_dtype)
        # Download the checkpoint, then load its weights into the adapter
        checkpoint_path = hf_hub_download(lightning_repo, checkpoint_name)
        checkpoint_state = load_file(checkpoint_path, device=target_device)
        motion_adapter.load_state_dict(checkpoint_state)

        animate_pipe = AnimateDiffPipeline.from_pretrained(
            base_model, motion_adapter=motion_adapter, torch_dtype=weights_dtype
        ).to(target_device)
        # Swap in an Euler scheduler derived from the pipeline's existing scheduler config
        animate_pipe.scheduler = EulerDiscreteScheduler.from_config(
            animate_pipe.scheduler.config,
            timestep_spacing="trailing",
            beta_schedule="linear",
        )
        return animate_pipe

    raise ValueError(f"Unknown pipeline type: {pipeline_type}")

def _safe_filename_stem(text: str) -> str:
    """
    Return text with every path separator replaced by "_"

    os.altsep is None on platforms that have a single separator, so it is
    skipped there. Text without separators is returned unchanged.
    """
    for separator in (os.sep, os.altsep):
        if separator:
            text = text.replace(separator, "_")
    return text


def main():
    """
    Generate a cover image and a video for every row of each task's CSV file

    Loads the Stable Diffusion image pipeline and the selected video pipeline,
    then walks each task's CSV and writes "<user prompt>.png" and
    "<user prompt>.mp4" into the task's image and video directories. A CSV
    that is missing, or that lacks a required column, is reported and skipped.
    Rows whose image and video both already exist are skipped. A failure while
    generating one artifact is printed and the run continues.
    """
    # Columns read from every CSV row below
    required_columns = ("user prompt", "title", "cover prompt", "video prompt")

    # ============ 1. Load/Initialize Models ============
    # (1) Image generation model: Stable Diffusion
    print("Loading image generation model (Stable Diffusion)...")
    pipe_image = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    )
    pipe_image.to("cuda")
    # Optional: call pipe_image.enable_xformers_memory_efficient_attention()
    # here to enable xformers acceleration.

    # (2) Video generation model: one of "cogvideo", "ltx", "hunyuan", "animatediff"
    video_pipeline_type = "ltx"

    # ============ 2. Define Task List ============
    # NOTE: the video_dir names end in "_ltx"; rename them when
    # video_pipeline_type is changed so outputs of different models stay apart.
    tasks = [
        {
            "csv_file": "output_prompt_rag_more/prompt_ai_concrete_rag_10_testset.csv",
            "image_dir": "output_ai_covers_concrete_rag_10_testset",
            "video_dir": "output_ai_videos_concrete_rag_10_testset_ltx"
        },
        {
            "csv_file": "output_prompt_rag_more/prompt_ai_abstract_rag_10_testset.csv",
            "image_dir": "output_ai_covers_abstract_rag_10_testset",
            "video_dir": "output_ai_videos_abstract_rag_10_testset_ltx"
        },
    ]

    pipe_video = load_video_pipeline(video_pipeline_type)

    # ============ 3. Iterate over CSV files to generate images and videos ============
    for task in tasks:
        csv_file = task["csv_file"]
        image_dir = task["image_dir"]
        video_dir = task["video_dir"]
        os.makedirs(image_dir, exist_ok=True)
        print(f"Ensuring directory exists: {image_dir}")
        os.makedirs(video_dir, exist_ok=True)
        print(f"Ensuring directory exists: {video_dir}")

        if not os.path.exists(csv_file):
            print(f"Error: CSV file {csv_file} not found, please check the path.")
            continue

        df = pd.read_csv(csv_file)

        # Report a malformed CSV the same way as a missing one instead of
        # aborting the whole run with a KeyError on the first row.
        missing_columns = [name for name in required_columns if name not in df.columns]
        if missing_columns:
            print(f"Error: CSV file {csv_file} is missing columns {missing_columns}, skipping this file.")
            continue

        for idx, row in df.iterrows():
            user_prompt = str(row["user prompt"])
            title = str(row["title"])
            cover_prompt = str(row["cover prompt"])
            video_prompt = str(row["video prompt"])

            # Generate filenames. Path separators in the prompt are replaced so
            # the outputs always land directly inside image_dir / video_dir.
            file_stem = _safe_filename_stem(user_prompt)
            image_filename = os.path.join(image_dir, f"{file_stem}.png")
            video_filename = os.path.join(video_dir, f"{file_stem}.mp4")

            print("-" * 50)
            print(f"[CSV: {csv_file}] - [{idx}] Starting generation: {user_prompt}")
            print(f"Title: {title}")
            print(f"Cover Prompt: {cover_prompt}")
            print(f"Video Prompt: {video_prompt}")

            if os.path.exists(image_filename) and os.path.exists(video_filename):
                print(f"File already exists, skipping generation: {video_filename}")
                continue

            # 4. Generate image (best effort: a failure is reported, then the video is still attempted)
            try:
                generate_image(pipe_image, cover_prompt, image_filename)
                print(f"Image saved to {image_filename}")
            except Exception as e:
                print(f"Image generation failed: {e}")

            # 5. Generate video (customize hyperparameters by passing additional arguments)
            try:
                generate_video(
                    pipe_video,
                    pipeline_type=video_pipeline_type,
                    prompt=video_prompt,
                    output_path=video_filename
                    # To modify hyperparameters, pass them here, e.g.:
                    # num_inference_steps=60, num_frames=50, fps=10, ...
                )
                print(f"Video saved to {video_filename}")
            except Exception as e:
                print(f"Video generation failed: {e}")

    print("All generation tasks completed!")

# Run the generation workflow only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()