LTTEAM committed on
Commit
c2add15
·
verified ·
1 Parent(s): f529bd2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +273 -403
app.py CHANGED
@@ -1,403 +1,273 @@
1
- import gradio as gr
2
- import torch
3
- import spaces
4
- import numpy as np
5
- import random
6
- import os
7
- import yaml
8
- from pathlib import Path
9
- import imageio
10
- import tempfile
11
- from PIL import Image
12
- from huggingface_hub import hf_hub_download
13
- import shutil
14
-
15
- from inference import (
16
- create_ltx_video_pipeline,
17
- create_latent_upsampler,
18
- load_image_to_tensor_with_resize_and_crop,
19
- seed_everething,
20
- get_device,
21
- calculate_padding,
22
- load_media_file
23
- )
24
- from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline, LTXVideoPipeline
25
- from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
26
-
27
- # Đường dẫn tới file cấu hình YAML
28
- config_file_path = "configs/ltxv-13b-0.9.7-distilled.yaml"
29
- with open(config_file_path, "r") as file:
30
- PIPELINE_CONFIG_YAML = yaml.safe_load(file)
31
-
32
- # Repo HuggingFace của bạn chứa hai file mô hình
33
- LTX_REPO = "LTTEAM/VideoAI"
34
- MAX_IMAGE_SIZE = PIPELINE_CONFIG_YAML.get("max_resolution", 1280)
35
- MAX_NUM_FRAMES = 257
36
-
37
- FPS = 30.0 # Khung hình trên giây
38
-
39
- # --- Biến toàn cục cho pipeline và upsampler đã tải ---
40
- pipeline_instance = None
41
- latent_upsampler_instance = None
42
- models_dir = "downloaded_models_gradio_cpu_init"
43
- Path(models_dir).mkdir(parents=True, exist_ok=True)
44
-
45
- print("Đang tải mô hình (nếu chưa có)…")
46
- # Tải mô hình distilled
47
- distilled_model_actual_path = hf_hub_download(
48
- repo_id=LTX_REPO,
49
- filename=PIPELINE_CONFIG_YAML["checkpoint_path"],
50
- local_dir=models_dir,
51
- local_dir_use_symlinks=False
52
- )
53
- PIPELINE_CONFIG_YAML["checkpoint_path"] = distilled_model_actual_path
54
- print(f"Đường dẫn mô hình distilled: {distilled_model_actual_path}")
55
-
56
- # Tải mô hình upscaler không gian
57
- SPATIAL_UPSCALER_FILENAME = PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"]
58
- spatial_upscaler_actual_path = hf_hub_download(
59
- repo_id=LTX_REPO,
60
- filename=SPATIAL_UPSCALER_FILENAME,
61
- local_dir=models_dir,
62
- local_dir_use_symlinks=False
63
- )
64
- PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"] = spatial_upscaler_actual_path
65
- print(f"Đường dẫn hình upscaler: {spatial_upscaler_actual_path}")
66
-
67
- print("Khởi tạo pipeline LTX Video trên CPU…")
68
- pipeline_instance = create_ltx_video_pipeline(
69
- ckpt_path=PIPELINE_CONFIG_YAML["checkpoint_path"],
70
- precision=PIPELINE_CONFIG_YAML["precision"],
71
- text_encoder_model_name_or_path=PIPELINE_CONFIG_YAML["text_encoder_model_name_or_path"],
72
- sampler=PIPELINE_CONFIG_YAML["sampler"],
73
- device="cpu",
74
- enhance_prompt=False,
75
- prompt_enhancer_image_caption_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_image_caption_model_name_or_path"],
76
- prompt_enhancer_llm_model_name_or_path=PIPELINE_CONFIG_YAML["prompt_enhancer_llm_model_name_or_path"],
77
- )
78
- print("Pipeline LTX Video đã sẵn sàng trên CPU.")
79
-
80
- if PIPELINE_CONFIG_YAML.get("spatial_upscaler_model_path"):
81
- print("Khởi tạo latent upsampler trên CPU…")
82
- latent_upsampler_instance = create_latent_upsampler(
83
- PIPELINE_CONFIG_YAML["spatial_upscaler_model_path"],
84
- device="cpu"
85
- )
86
- print("Latent upsampler đã sẵn sàng trên CPU.")
87
-
88
- # Chuyển sang GPU để inference
89
- target_inference_device = "cuda"
90
- print(f"Sử dụng thiết bị inference: {target_inference_device}")
91
- pipeline_instance.to(target_inference_device)
92
- if latent_upsampler_instance:
93
- latent_upsampler_instance.to(target_inference_device)
94
-
95
-
96
- # --- Hàm trợ giúp tính kích thước mới sao cho vừa khung 32 pixel ---
97
- MIN_DIM_SLIDER = 256 # Giá trị tối thiểu của slider
98
- TARGET_FIXED_SIDE = 768 # Kích thước cố định một cạnh
99
-
100
def calculate_new_dimensions(orig_w, orig_h):
    """Compute new (height, width) slider values from the original media size.

    The shorter side is pinned to TARGET_FIXED_SIDE and the longer side is
    scaled by the aspect ratio, snapped to a multiple of 32. Both values are
    then clamped to [MIN_DIM_SLIDER, MAX_IMAGE_SIZE].

    Returns:
        Tuple ``(new_h, new_w)`` of ints. If either input dimension is 0,
        ``(TARGET_FIXED_SIDE, TARGET_FIXED_SIDE)`` is returned as-is.
    """
    if 0 in (orig_w, orig_h):
        return TARGET_FIXED_SIDE, TARGET_FIXED_SIDE

    def _clamp(value):
        # Keep the value inside the range the sliders accept.
        return max(MIN_DIM_SLIDER, min(value, MAX_IMAGE_SIZE))

    def _scaled_side(long_side, short_side):
        # Scale the fixed side by the aspect ratio, then snap to 32 px.
        ideal = TARGET_FIXED_SIDE * (long_side / short_side)
        return round(ideal / 32) * 32

    landscape_or_square = orig_w >= orig_h
    if landscape_or_square:
        new_h = TARGET_FIXED_SIDE
        new_w = _scaled_side(orig_w, orig_h)
    else:
        new_w = TARGET_FIXED_SIDE
        new_h = _scaled_side(orig_h, orig_w)

    return int(_clamp(new_h)), int(_clamp(new_w))
125
-
126
def get_duration(prompt, negative_prompt, input_image_filepath, input_video_filepath,
                 height_ui, width_ui, mode,
                 duration_ui, ui_frames_to_use,
                 seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
                 progress):
    """Return the GPU time budget, in seconds, for one ``generate`` call.

    The signature mirrors ``generate`` because this function is passed as
    ``duration=`` to ``spaces.GPU``; only ``duration_ui`` is actually used.

    Returns:
        75 when the requested video duration exceeds 7 seconds, otherwise 60.
    """
    if duration_ui > 7:
        return 75
    return 60
133
-
134
@spaces.GPU(duration=get_duration)
def generate(prompt, negative_prompt, input_image_filepath, input_video_filepath,
             height_ui, width_ui, mode,
             duration_ui, ui_frames_to_use,
             seed_ui, randomize_seed, ui_guidance_scale, improve_texture_flag,
             progress=gr.Progress(track_tqdm=True)):
    """Run the LTX Video pipeline and write the result to a temporary MP4.

    Depending on ``mode`` the run is conditioned on an input image
    ("image-to-video"), on an input video ("video-to-video"), or on the text
    prompt alone. When ``improve_texture_flag`` is set and a latent upsampler
    is loaded, the multi-scale (two-pass) pipeline is used; otherwise a single
    pass configured from the YAML ``first_pass`` section is run.

    Returns:
        Tuple ``(out_path, seed_ui)``: path of the written MP4 and the seed
        that was actually used.

    Raises:
        ValueError: if ``stg_mode`` in the YAML config is not recognised.
        gr.Error: if the conditioning image/video cannot be loaded or the
            pipeline returns ``None``.
    """
    # --- Seed ---------------------------------------------------------------
    if randomize_seed:
        seed_ui = random.randint(0, 2**32 - 1)
    seed_everething(int(seed_ui))

    # --- Frame count: snap the requested duration to 8*N + 1 and clamp ------
    requested_frames = max(1, round(duration_ui * FPS))
    snapped_frames = int(round((requested_frames - 1) / 8) * 8 + 1)
    num_frames = max(9, min(snapped_frames, MAX_NUM_FRAMES))

    # --- Spatial / temporal padding ------------------------------------------
    h, w = int(height_ui), int(width_ui)
    h_padded, w_padded = (((side - 1) // 32 + 1) * 32 for side in (h, w))
    frames_padded = ((num_frames - 2) // 8 + 1) * 8 + 1
    if frames_padded != num_frames:
        print(f"Cảnh báo: num_frames ({num_frames}) ≠ frames_padded ({frames_padded}), dùng frames_padded.")

    padding_values = calculate_padding(h, w, h_padded, w_padded)

    # --- Arguments shared by both pipeline variants --------------------------
    call_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "height": h_padded,
        "width": w_padded,
        "num_frames": frames_padded,
        "frame_rate": int(FPS),
        "generator": torch.Generator(device=target_inference_device).manual_seed(int(seed_ui)),
        "output_type": "pt",
        "conditioning_items": None,
        "media_items": None,
        "decode_timestep": PIPELINE_CONFIG_YAML["decode_timestep"],
        "decode_noise_scale": PIPELINE_CONFIG_YAML["decode_noise_scale"],
        "stochastic_sampling": PIPELINE_CONFIG_YAML["stochastic_sampling"],
        "image_cond_noise_scale": 0.15,
        "is_video": True,
        "vae_per_channel_normalize": True,
        "mixed_precision": (PIPELINE_CONFIG_YAML["precision"] == "mixed_precision"),
        "offload_to_cpu": False,
        "enhance_prompt": False,
    }

    # --- Skip-layer strategy: each strategy is reachable via two aliases -----
    strategies_by_alias = {
        ("stg_av", "attention_values"): SkipLayerStrategy.AttentionValues,
        ("stg_as", "attention_skip"): SkipLayerStrategy.AttentionSkip,
        ("stg_r", "residual"): SkipLayerStrategy.Residual,
        ("stg_t", "transformer_block"): SkipLayerStrategy.TransformerBlock,
    }
    stg = PIPELINE_CONFIG_YAML.get("stg_mode", "attention_values").lower()
    for aliases, strategy in strategies_by_alias.items():
        if stg in aliases:
            call_kwargs["skip_layer_strategy"] = strategy
            break
    else:
        raise ValueError(f"stg_mode không hợp lệ: {stg}")

    # --- Conditioning on an input image or video -----------------------------
    wants_image = mode == "image-to-video" and input_image_filepath
    wants_video = mode == "video-to-video" and input_video_filepath
    if wants_image:
        try:
            media_tensor = load_image_to_tensor_with_resize_and_crop(input_image_filepath, h, w)
            media_tensor = torch.nn.functional.pad(media_tensor, padding_values)
            conditioning = ConditioningItem(media_tensor.to(target_inference_device), 0, 1.0)
            call_kwargs["conditioning_items"] = [conditioning]
        except Exception as e:
            raise gr.Error(f"Không tải được ảnh: {e}")
    elif wants_video:
        try:
            media = load_media_file(
                media_path=input_video_filepath,
                height=h,
                width=w,
                max_frames=int(ui_frames_to_use),
                padding=padding_values,
            )
            call_kwargs["media_items"] = media.to(target_inference_device)
        except Exception as e:
            raise gr.Error(f"Không tải được video: {e}")

    print(f"Chuyển mô hình sang {target_inference_device} nếu cần…")
    active_upsampler = latent_upsampler_instance if improve_texture_flag else None

    # --- Run the pipeline (multi-scale two-pass, or a single pass) -----------
    if improve_texture_flag and active_upsampler:
        def _pass_config(section):
            # Copy the YAML section, override guidance, drop the step count.
            cfg = PIPELINE_CONFIG_YAML.get(section, {}).copy()
            cfg["guidance_scale"] = float(ui_guidance_scale)
            cfg.pop("num_inference_steps", None)
            return cfg

        pipeline_ms = LTXMultiScalePipeline(pipeline_instance, active_upsampler)
        kwargs_ms = dict(call_kwargs)
        kwargs_ms["downscale_factor"] = PIPELINE_CONFIG_YAML["downscale_factor"]
        kwargs_ms["first_pass"] = _pass_config("first_pass")
        kwargs_ms["second_pass"] = _pass_config("second_pass")
        print(f"Chạy multi-scale pipeline: {h}x{w}, frames {num_frames}→{frames_padded}")
        result = pipeline_ms(**kwargs_ms).images
    else:
        single_cfg = PIPELINE_CONFIG_YAML.get("first_pass", {})
        kwargs_sp = dict(call_kwargs)
        kwargs_sp["timesteps"] = single_cfg.get("timesteps")
        kwargs_sp["guidance_scale"] = float(ui_guidance_scale)
        kwargs_sp["stg_scale"] = single_cfg.get("stg_scale")
        kwargs_sp["rescaling_scale"] = single_cfg.get("rescaling_scale")
        kwargs_sp["skip_block_list"] = single_cfg.get("skip_block_list")
        # Strip keys that only make sense for the multi-scale variant.
        for unused_key in ("num_inference_steps", "first_pass", "second_pass", "downscale_factor"):
            kwargs_sp.pop(unused_key, None)
        print(f"Chạy single-pass pipeline: {h_padded}x{w_padded}, frames {num_frames}→{frames_padded}")
        result = pipeline_instance(**kwargs_sp).images

    if result is None:
        raise gr.Error("Quá trình tạo thất bại.")

    # --- Remove padding -------------------------------------------------------
    pad_l, pad_r, pad_t, pad_b = padding_values
    bottom = -pad_b if pad_b != 0 else None
    right = -pad_r if pad_r != 0 else None
    result = result[:, :, :num_frames, pad_t:bottom, pad_l:right]

    # --- Convert to uint8 frames and encode -----------------------------------
    video_np = result[0].permute(1, 2, 3, 0).cpu().numpy()
    video_np = (np.clip(video_np, 0, 1) * 255).astype(np.uint8)

    temp_dir = tempfile.mkdtemp()
    out_path = os.path.join(temp_dir, f"output_{random.randint(10000,99999)}.mp4")

    def _write_frames(desc, **writer_options):
        # Append every frame to a writer opened with the given options.
        total = video_np.shape[0]
        with imageio.get_writer(out_path, fps=int(FPS), **writer_options) as writer:
            for idx, frame in enumerate(video_np):
                progress(idx / total, desc=desc)
                writer.append_data(frame)

    try:
        _write_frames("Lưu video", macro_block_size=1)
    except Exception:
        # Fall back to an explicit ffmpeg/libx264 writer.
        _write_frames("Lưu video fallback", format='FFMPEG', codec='libx264', quality=8)

    return out_path, seed_ui
280
-
281
- # Hàm cập nhật tab
282
def update_task_image():
    """Return the mode identifier for the image-to-video tab."""
    task = "image-to-video"
    return task
283
def update_task_text():
    """Return the mode identifier for the text-to-video tab."""
    task = "text-to-video"
    return task
284
def update_task_video():
    """Return the mode identifier for the video-to-video tab."""
    task = "video-to-video"
    return task
285
-
286
- # --- Định nghĩa giao diện Gradio ---
287
- css = """
288
- #col-container {
289
- margin: 0 auto;
290
- max-width: 900px;
291
- }
292
- """
293
-
294
- with gr.Blocks(css=css) as demo:
295
- gr.Markdown("# Ứng dụng LTX Video 0.9.7 Distilled")
296
- gr.Markdown(
297
- "Tạo video chất lượng cao nhanh chóng. "
298
- "[Mô hình](https://huggingface.co/LTTEAM/VideoAI/blob/main/ltxv-13b-0.9.7-distilled.safetensors) · "
299
- "[GitHub](https://github.com/Lightricks/LTX-Video)"
300
- )
301
-
302
- with gr.Row():
303
- with gr.Column():
304
- # Tab image-to-video
305
- with gr.Tab("Ảnh→Video") as tab_img:
306
- video_i_hidden = gr.Textbox(visible=False)
307
- image_input = gr.Image(label="Chọn ảnh", type="filepath", sources=["upload","webcam","clipboard"])
308
- prompt_img = gr.Textbox(label="Nhập mô tả", value="Con sinh vật trong ảnh bắt đầu chuyển động", lines=3)
309
- btn_img = gr.Button("Tạo video từ ảnh", variant="primary")
310
-
311
- # Tab text-to-video
312
- with gr.Tab("Văn bản→Video") as tab_txt:
313
- image_n_hidden = gr.Textbox(visible=False)
314
- video_n_hidden = gr.Textbox(visible=False)
315
- prompt_txt = gr.Textbox(label="Nhập mô tả", value="Rồng hùng vĩ bay trên lâu đài thời trung cổ", lines=3)
316
- btn_txt = gr.Button("Tạo video từ văn bản", variant="primary")
317
-
318
- # Tab video-to-video (ẩn theo mặc định)
319
- with gr.Tab("Video→Video", visible=False) as tab_vid:
320
- image_v_hidden = gr.Textbox(visible=False)
321
- video_input = gr.Video(label="Chọn video", sources=["upload","webcam"])
322
- frames_slider = gr.Slider(label="Số frame dùng", minimum=9, maximum=MAX_NUM_FRAMES, value=9, step=8,
323
- info="Phải là N*8+1")
324
- prompt_vid = gr.Textbox(label="Nhập mô tả", value="Chuyển phong cách sang anime điện ảnh", lines=3)
325
- btn_vid = gr.Button("Tạo video từ video", variant="primary")
326
-
327
- duration_input = gr.Slider(label="Thời lượng video (s)", minimum=0.3, maximum=8.5, value=2, step=0.1)
328
- improve_texture = gr.Checkbox(label="Cải thiện chi tiết (multi-scale)", value=True)
329
-
330
- with gr.Column():
331
- output_video = gr.Video(label="Video kết quả", interactive=False)
332
-
333
- # Cài đặt nâng cao
334
- with gr.Accordion("Cài đặt nâng cao", open=False):
335
- mode = gr.Dropdown(["text-to-video","image-to-video","video-to-video"], visible=False)
336
- negative_prompt = gr.Textbox(label="Negative Prompt", value="worst quality, inconsistent motion, blurry, jittery, distorted", lines=2)
337
- with gr.Row():
338
- seed_input = gr.Number(label="Seed", value=42, precision=0, minimum=0, maximum=2**32-1)
339
- rand_seed = gr.Checkbox(label="Randomize Seed", value=True)
340
- with gr.Row():
341
- cfg_scale = gr.Slider(label="Guidance Scale (CFG)", minimum=1.0, maximum=10.0,
342
- value=PIPELINE_CONFIG_YAML.get("first_pass",{}).get("guidance_scale",1.0), step=0.1)
343
- with gr.Row():
344
- height_slider = gr.Slider(label="Chiều cao", value=512, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE)
345
- width_slider = gr.Slider(label="Chiều rộng", value=704, step=32, minimum=MIN_DIM_SLIDER, maximum=MAX_IMAGE_SIZE)
346
-
347
- # Cập nhật kích thước khi tải ảnh/video
348
def on_image_upload(fp, h, w):
    """Update the height/width sliders to match a newly uploaded image.

    Args:
        fp: Path of the uploaded image, or a falsy value when cleared.
        h: Current height slider value, kept when the image cannot be read.
        w: Current width slider value, kept when the image cannot be read.

    Returns:
        Two ``gr.update`` objects, for the height and width sliders.
    """
    if not fp:
        return gr.update(value=h), gr.update(value=w)
    try:
        # Only the size is needed; the context manager releases the file
        # handle instead of leaving it to the garbage collector.
        with Image.open(fp) as img:
            orig_w, orig_h = img.size
        new_h, new_w = calculate_new_dimensions(orig_w, orig_h)
        return gr.update(value=new_h), gr.update(value=new_w)
    except Exception:
        # Best effort: an unreadable image leaves the sliders untouched.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return gr.update(value=h), gr.update(value=w)
356
-
357
def on_video_upload(fp, h, w):
    """Update the height/width sliders to match a newly uploaded video.

    The frame size is taken from the reader metadata when it has a "size"
    entry, otherwise from the shape of the first decoded frame.

    Args:
        fp: Path of the uploaded video, or a falsy value when cleared.
        h: Current height slider value, kept when the video cannot be read.
        w: Current width slider value, kept when the video cannot be read.

    Returns:
        Two ``gr.update`` objects, for the height and width sliders.
    """
    if not fp:
        return gr.update(value=h), gr.update(value=w)
    try:
        fp = str(fp)
        if not os.path.exists(fp):
            return gr.update(value=h), gr.update(value=w)
        reader = imageio.get_reader(fp)
        try:
            meta = reader.get_meta_data()
            if "size" in meta:
                orig_w, orig_h = meta["size"]
            else:
                f0 = reader.get_data(0)
                orig_h, orig_w = f0.shape[0], f0.shape[1]
        finally:
            # Always release the reader; previously it was never closed.
            reader.close()
        new_h, new_w = calculate_new_dimensions(orig_w, orig_h)
        return gr.update(value=new_h), gr.update(value=new_w)
    except Exception:
        # Best effort: an unreadable video leaves the sliders untouched.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return gr.update(value=h), gr.update(value=w)
373
-
374
- image_input.upload(on_image_upload, [image_input, height_slider, width_slider], [height_slider, width_slider])
375
- video_input.upload(on_video_upload, [video_input, height_slider, width_slider], [height_slider, width_slider])
376
-
377
- # Kết nối tab với mode
378
- tab_img.select(lambda: "image-to-video", outputs=[mode])
379
- tab_txt.select(lambda: "text-to-video", outputs=[mode])
380
- tab_vid.select(lambda: "video-to-video", outputs=[mode])
381
-
382
- # Đầu vào cho mỗi nút
383
- inputs_txt = [prompt_txt, negative_prompt, image_n_hidden, video_n_hidden,
384
- height_slider, width_slider, mode,
385
- duration_input, frames_slider,
386
- seed_input, rand_seed, cfg_scale, improve_texture]
387
- inputs_img = [prompt_img, negative_prompt, image_input, video_i_hidden,
388
- height_slider, width_slider, mode,
389
- duration_input, frames_slider,
390
- seed_input, rand_seed, cfg_scale, improve_texture]
391
- inputs_vid = [prompt_vid, negative_prompt, image_v_hidden, video_input,
392
- height_slider, width_slider, mode,
393
- duration_input, frames_slider,
394
- seed_input, rand_seed, cfg_scale, improve_texture]
395
-
396
- btn_txt.click(fn=generate, inputs=inputs_txt, outputs=[output_video, seed_input], api_name="text_to_video")
397
- btn_img.click(fn=generate, inputs=inputs_img, outputs=[output_video, seed_input], api_name="image_to_video")
398
- btn_vid.click(fn=generate, inputs=inputs_vid, outputs=[output_video, seed_input], api_name="video_to_video")
399
-
400
- if __name__ == "__main__":
401
- if os.path.isdir(models_dir):
402
- print(f"Thư mục mô hình: {Path(models_dir).resolve()}")
403
- demo.queue().launch(debug=True, share=False, mcp_server=True)
 
1
+ import gradio as gr
2
+ import torch
3
+ import spaces
4
+ import numpy as np
5
+ import random
6
+ import os
7
+ import yaml
8
+ from pathlib import Path
9
+ import imageio
10
+ import tempfile
11
+ from PIL import Image
12
+ from huggingface_hub import hf_hub_download
13
+
14
+ from inference import (
15
+ create_ltx_video_pipeline,
16
+ create_latent_upsampler,
17
+ load_image_to_tensor_with_resize_and_crop,
18
+ seed_everething,
19
+ calculate_padding,
20
+ load_media_file
21
+ )
22
+ from ltx_video.pipelines.pipeline_ltx_video import ConditioningItem, LTXMultiScalePipeline
23
+ from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
24
+
25
+ # --- Cấu hình và tải mô hình từ repo của bạn ---
26
+ CONFIG_PATH = "configs/ltxv-13b-0.9.7-distilled.yaml"
27
+ with open(CONFIG_PATH, "r") as f:
28
+ CFG = yaml.safe_load(f)
29
+
30
+ HF_REPO = "LTTEAM/VideoAI"
31
+ MODELS_DIR = "downloaded_models"
32
+ Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)
33
+
34
+ print("Đang tải mô hình (nếu chưa có)…")
35
+ ckpt_path = hf_hub_download(
36
+ repo_id=HF_REPO,
37
+ filename=CFG["checkpoint_path"],
38
+ local_dir=MODELS_DIR
39
+ )
40
+ CFG["checkpoint_path"] = ckpt_path
41
+ upscaler_path = hf_hub_download(
42
+ repo_id=HF_REPO,
43
+ filename=CFG["spatial_upscaler_model_path"],
44
+ local_dir=MODELS_DIR
45
+ )
46
+ CFG["spatial_upscaler_model_path"] = upscaler_path
47
+
48
+ # --- Khởi tạo pipeline trên CPU ban đầu ---
49
+ print("Khởi tạo LTX Video pipeline trên CPU…")
50
+ pipeline = create_ltx_video_pipeline(
51
+ ckpt_path=CFG["checkpoint_path"],
52
+ precision=CFG["precision"],
53
+ text_encoder_model_name_or_path=CFG["text_encoder_model_name_or_path"],
54
+ sampler=CFG["sampler"],
55
+ device="cpu",
56
+ enhance_prompt=False,
57
+ prompt_enhancer_image_caption_model_name_or_path=CFG["prompt_enhancer_image_caption_model_name_or_path"],
58
+ prompt_enhancer_llm_model_name_or_path=CFG["prompt_enhancer_llm_model_name_or_path"],
59
+ )
60
+ print("Pipeline sẵn sàng.")
61
+ print("Khởi tạo latent upsampler trên CPU…")
62
+ upsampler = create_latent_upsampler(CFG["spatial_upscaler_model_path"], device="cpu")
63
+ print("Latent upsampler sẵn sàng.")
64
+
65
+ # --- Các thông số cố định ---
66
+ FPS = 30.0
67
+ MAX_NUM_FRAMES = 257
68
+ MIN_DIM = 256
69
+ TARGET_SIDE = 768
70
+ MAX_RES = CFG.get("max_resolution", 1280)
71
+
72
def calculate_new_dimensions(w, h):
    """Compute a (height, width) pair for the generator from a source size.

    The shorter side is pinned to TARGET_SIDE; the longer side follows the
    aspect ratio, snapped to a multiple of 32. Both values are clamped to
    [MIN_DIM, MAX_RES].

    Returns:
        Tuple ``(new_height, new_width)`` of ints. If either input dimension
        is 0, ``(TARGET_SIDE, TARGET_SIDE)`` is returned as-is.
    """
    if 0 in (w, h):
        return TARGET_SIDE, TARGET_SIDE

    def _clamp(value):
        # Restrict a side length to the supported resolution range.
        return int(max(MIN_DIM, min(value, MAX_RES)))

    new_height = new_width = TARGET_SIDE
    if w >= h:
        # Landscape or square: height is fixed, width is scaled.
        new_width = round((TARGET_SIDE * w / h) / 32) * 32
    else:
        # Portrait: width is fixed, height is scaled.
        new_height = round((TARGET_SIDE * h / w) / 32) * 32

    return _clamp(new_height), _clamp(new_width)
85
+
86
def get_duration(*args, **kwargs):
    """Return the GPU time budget, in seconds, for one ``generate`` call.

    This function is passed as ``duration=`` to ``spaces.GPU`` on
    ``generate``. ``duration_ui`` is the eighth parameter of ``generate``
    (index 7), so it is read from that position when it was not supplied as
    a keyword. Looking only at ``kwargs`` would miss it whenever the inputs
    arrive positionally.

    Returns:
        75 when the requested video duration exceeds 7 seconds, otherwise 60
        (also when no duration could be found).
    """
    duration_arg_index = 7  # position of `duration_ui` in generate()'s signature
    duration_ui = kwargs.get("duration_ui")
    if duration_ui is None and len(args) > duration_arg_index:
        duration_ui = args[duration_arg_index]
    if duration_ui is None:
        duration_ui = 0
    return 75 if duration_ui > 7 else 60
89
+
90
@spaces.GPU(duration=get_duration)
def generate(prompt, neg_prompt,
             img_path, vid_path,
             height, width,
             mode_task, duration_ui, frames_to_use,
             seed, rand_seed, cfg_scale,
             improve_tex, device_choice,
             progress=gr.Progress(track_tqdm=True)):
    """Run the LTX Video pipeline on the chosen device and save an MP4.

    Args:
        prompt, neg_prompt: Positive and negative text prompts.
        img_path: Conditioning image path, used when ``mode_task`` is
            "image-to-video".
        vid_path: Conditioning video path, used when ``mode_task`` is
            "video-to-video".
        height, width: Requested output size; padded up to multiples of 32
            for the pipeline and cropped back afterwards.
        mode_task: "text-to-video", "image-to-video" or "video-to-video".
        duration_ui: Requested duration in seconds, converted to a frame
            count of the form 8*N + 1 within [9, MAX_NUM_FRAMES].
        frames_to_use: Maximum number of frames read from ``vid_path``.
        seed, rand_seed: Seed to use; replaced by a random one when
            ``rand_seed`` is truthy.
        cfg_scale: Guidance scale applied to every pass.
        improve_tex: Use the multi-scale (two-pass) pipeline when truthy.
        device_choice: "GPU" selects CUDA when available; anything else
            selects the CPU.
        progress: Gradio progress tracker.

    Returns:
        Tuple ``(dst, seed)``: path of the written MP4 and the seed used.

    Raises:
        gr.Error: if the conditioning image or video cannot be loaded.
    """
    # 1) Move the pipeline and upsampler to the selected device.
    dev = "cuda" if device_choice == "GPU" and torch.cuda.is_available() else "cpu"
    print(f"Chạy trên thiết bị: {dev}")
    pipeline.to(dev)
    upsampler.to(dev)

    # 2) Seed handling.
    if rand_seed:
        seed = random.randint(0, 2**32 - 1)
    seed_everething(int(seed))

    # 3) Frame count: snap to 8*N + 1 and clamp.
    tf = max(1, round(duration_ui * FPS))
    n8 = round((tf - 1) / 8)
    n_frames = max(9, min(n8 * 8 + 1, MAX_NUM_FRAMES))

    # 4) Pad the spatial size up to multiples of 32.
    h, w = int(height), int(width)
    h32 = ((h - 1) // 32 + 1) * 32
    w32 = ((w - 1) // 32 + 1) * 32
    pad = calculate_padding(h, w, h32, w32)

    # 5) Arguments shared by both pipeline variants.
    kwargs = {
        "prompt": prompt,
        "negative_prompt": neg_prompt,
        "height": h32,
        "width": w32,
        "num_frames": n_frames,
        "frame_rate": int(FPS),
        "generator": torch.Generator(device=dev).manual_seed(int(seed)),
        "output_type": "pt",
        "decode_timestep": CFG["decode_timestep"],
        "decode_noise_scale": CFG["decode_noise_scale"],
        "stochastic_sampling": CFG["stochastic_sampling"],
        "is_video": True,
        "vae_per_channel_normalize": True,
        "mixed_precision": CFG["precision"] == "mixed_precision",
        "offload_to_cpu": False,
        "enhance_prompt": False,
    }

    # Skip-layer strategy; each strategy is reachable via two aliases.
    stg = CFG.get("stg_mode", "attention_values").lower()
    mapping = {
        "stg_av": SkipLayerStrategy.AttentionValues,
        "attention_values": SkipLayerStrategy.AttentionValues,
        "stg_as": SkipLayerStrategy.AttentionSkip,
        "attention_skip": SkipLayerStrategy.AttentionSkip,
        "stg_r": SkipLayerStrategy.Residual,
        "residual": SkipLayerStrategy.Residual,
        "stg_t": SkipLayerStrategy.TransformerBlock,
        "transformer_block": SkipLayerStrategy.TransformerBlock,
    }
    if stg not in mapping:
        # Keep the lenient fallback, but make a config typo visible in the logs.
        print(f"Cảnh báo: stg_mode không hợp lệ: {stg}, dùng attention_values.")
    kwargs["skip_layer_strategy"] = mapping.get(stg, SkipLayerStrategy.AttentionValues)

    # 6) Conditioning. Loading failures are surfaced as gr.Error so the user
    #    sees a readable message instead of a raw traceback.
    if mode_task == "image-to-video" and img_path:
        try:
            tensor = load_image_to_tensor_with_resize_and_crop(img_path, h, w)
            tensor = torch.nn.functional.pad(tensor, pad)
            kwargs["conditioning_items"] = [ConditioningItem(tensor.to(dev), 0, 1.0)]
        except Exception as e:
            raise gr.Error(f"Không tải được ảnh: {e}") from e
    elif mode_task == "video-to-video" and vid_path:
        try:
            mi = load_media_file(vid_path, h, w, int(frames_to_use), pad).to(dev)
            kwargs["media_items"] = mi
        except Exception as e:
            raise gr.Error(f"Không tải được video: {e}") from e

    # 7) Multi-scale (two-pass) or single-pass run.
    if improve_tex:
        pipe_ms = LTXMultiScalePipeline(pipeline, upsampler)
        fp = CFG.get("first_pass", {}).copy()
        fp["guidance_scale"] = float(cfg_scale)
        fp.pop("num_inference_steps", None)
        sp = CFG.get("second_pass", {}).copy()
        sp["guidance_scale"] = float(cfg_scale)
        sp.pop("num_inference_steps", None)
        kwargs.update({
            "downscale_factor": CFG["downscale_factor"],
            "first_pass": fp,
            "second_pass": sp,
        })
        out = pipe_ms(**kwargs).images
    else:
        fp0 = CFG.get("first_pass", {})
        kwargs.update({
            "timesteps": fp0.get("timesteps"),
            "guidance_scale": float(cfg_scale),
            "stg_scale": fp0.get("stg_scale"),
            "rescaling_scale": fp0.get("rescaling_scale"),
            "skip_block_list": fp0.get("skip_block_list"),
        })
        for k in ["first_pass", "second_pass", "downscale_factor", "num_inference_steps"]:
            kwargs.pop(k, None)
        out = pipeline(**kwargs).images

    # 8) Crop away the padding, convert to uint8 frames, and encode.
    pad_l, pad_r, pad_t, pad_b = pad
    sh = None if pad_b == 0 else -pad_b
    sw = None if pad_r == 0 else -pad_r
    # The five-index crop must be applied to the full batch tensor and only
    # then reduced to the first sample; indexing `out[0]` first drops one
    # axis and makes the five-index slice fail.
    # NOTE(review): presumably `out` is (batch, channels, frames, height,
    # width) — confirm against the pipeline's "pt" output.
    vid_tensor = out[:, :, :n_frames, pad_t:sh, pad_l:sw][0]
    arr = vid_tensor.permute(1, 2, 3, 0).cpu().numpy()
    arr = (np.clip(arr, 0, 1) * 255).astype(np.uint8)

    tmp = tempfile.mkdtemp()
    dst = os.path.join(tmp, f"out_{random.randint(0,99999)}.mp4")
    total = arr.shape[0]
    # `writer` (not `w`) so the width variable above is not shadowed.
    with imageio.get_writer(dst, fps=int(FPS), macro_block_size=1) as writer:
        for i, frame in enumerate(arr):
            progress(i / total, desc="Lưu video")
            writer.append_data(frame)
    return dst, seed
205
+
206
+ # --- Giao diện Gradio ---
207
+ css = """
208
+ #col-container {margin:0 auto; max-width:900px;}
209
+ """
210
with gr.Blocks(css=css) as demo:
    gr.Markdown("## Ứng dụng LTX Video 0.9.7 Distilled")
    gr.Markdown(
        "[Mô hình trên HF](https://huggingface.co/LTTEAM/VideoAI) · "
        "[GitHub](https://github.com/Lightricks/LTX-Video)"
    )

    with gr.Row():
        with gr.Column():
            # Device selection
            device = gr.Radio(["CPU", "GPU"], label="Chạy trên thiết bị", value="CPU")
            # Tabs, one per generation mode.
            # `sources=[...]` replaces the `source=` keyword, which Gradio 4+
            # no longer accepts on Image/Video components.
            with gr.Tab("Ảnh→Video"):
                img_in = gr.Image(label="Ảnh đầu vào", type="filepath", sources=["upload"])
                prompt1 = gr.Textbox(label="Mô tả", lines=2, value="Con sinh vật di chuyển")
                btn1 = gr.Button("Tạo từ ảnh")
            with gr.Tab("Văn bản→Video"):
                prompt2 = gr.Textbox(label="Mô tả", lines=2, value="Rồng bay trên lâu đài")
                btn2 = gr.Button("Tạo từ văn bản")
            with gr.Tab("Video→Video"):
                vid_in = gr.Video(label="Video đầu vào", sources=["upload"])
                frames = gr.Slider(label="Số frame dùng", minimum=9, maximum=MAX_NUM_FRAMES, step=8, value=9)
                prompt3 = gr.Textbox(label="Mô tả", lines=2, value="Chuyển phong cách anime")
                btn3 = gr.Button("Tạo từ video")

            duration = gr.Slider(label="Thời lượng (giây)", minimum=0.3, maximum=8.5, step=0.1, value=2)
            improve = gr.Checkbox(label="Cải thiện chi tiết", value=True)

        with gr.Column():
            out_video = gr.Video(label="Kết quả", interactive=False)

    # Hidden values shared by all three buttons. They are created once, so
    # no name is rebound inside the input lists below.
    mode = gr.State("image-to-video")
    height = gr.State(512)
    width = gr.State(704)
    seed = gr.State(42)
    cfg_scale = gr.State(CFG["first_pass"]["guidance_scale"])

    def _generate_inputs(prompt_box, task, image=None, video=None):
        # Build the positional input list in generate()'s parameter order.
        # Missing media inputs and the negative prompt are empty-string states.
        return [
            prompt_box, gr.State(""),
            image if image is not None else gr.State(""),
            video if video is not None else gr.State(""),
            height, width,
            gr.State(task),
            duration, frames,
            seed, gr.State(True),
            cfg_scale,
            improve, device,
        ]

    # Button wiring: the seed actually used is written back to the seed state.
    btn1.click(fn=generate,
               inputs=_generate_inputs(prompt1, "image-to-video", image=img_in),
               outputs=[out_video, seed])
    btn2.click(fn=generate,
               inputs=_generate_inputs(prompt2, "text-to-video"),
               outputs=[out_video, seed])
    btn3.click(fn=generate,
               inputs=_generate_inputs(prompt3, "video-to-video", video=vid_in),
               outputs=[out_video, seed])
271
+
272
+ if __name__ == "__main__":
273
+ demo.queue().launch(debug=True, share=False)