Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import sys
|
| 2 |
from pathlib import Path
|
| 3 |
-
import uuid
|
| 4 |
|
| 5 |
# Add packages to Python path
|
| 6 |
current_dir = Path(__file__).parent
|
|
@@ -97,82 +96,41 @@ def encode_text_simple(text_encoder, prompt: str):
|
|
| 97 |
def encode_prompt(
|
| 98 |
prompt: str,
|
| 99 |
enhance_prompt: bool = True,
|
| 100 |
-
input_image
|
| 101 |
seed: int = 42,
|
| 102 |
negative_prompt: str = ""
|
| 103 |
):
|
| 104 |
-
"""
|
| 105 |
-
Encode a text prompt using Gemma text encoder.
|
| 106 |
-
Args:
|
| 107 |
-
prompt: Text prompt to encode
|
| 108 |
-
enhance_prompt: Whether to use AI to enhance the prompt
|
| 109 |
-
input_image: Optional image for image-to-video enhancement
|
| 110 |
-
seed: Random seed for prompt enhancement
|
| 111 |
-
negative_prompt: Optional negative prompt for CFG (two-stage pipeline)
|
| 112 |
-
Returns:
|
| 113 |
-
tuple: (file_path, enhanced_prompt_text, status_message)
|
| 114 |
-
"""
|
| 115 |
start_time = time.time()
|
| 116 |
-
|
| 117 |
try:
|
| 118 |
-
# Enhance prompt if requested
|
| 119 |
final_prompt = prompt
|
| 120 |
if enhance_prompt:
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
input_image.save(temp_image_path)
|
| 128 |
-
else:
|
| 129 |
-
temp_image_path = input_image
|
| 130 |
-
|
| 131 |
-
final_prompt = generate_enhanced_prompt(
|
| 132 |
-
text_encoder=text_encoder,
|
| 133 |
-
prompt=prompt,
|
| 134 |
-
image_path=str(temp_image_path),
|
| 135 |
-
seed=seed
|
| 136 |
-
)
|
| 137 |
-
else:
|
| 138 |
-
final_prompt = generate_enhanced_prompt(
|
| 139 |
-
text_encoder=text_encoder,
|
| 140 |
-
prompt=prompt,
|
| 141 |
-
image_path=None,
|
| 142 |
-
seed=seed
|
| 143 |
-
)
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
| 147 |
|
| 148 |
-
# Encode negative prompt if provided
|
| 149 |
video_context_negative = None
|
| 150 |
audio_context_negative = None
|
| 151 |
if negative_prompt:
|
| 152 |
video_context_negative, audio_context_negative = encode_text_simple(text_encoder, negative_prompt)
|
| 153 |
|
| 154 |
-
|
| 155 |
-
output_dir = Path("embeddings")
|
| 156 |
-
output_dir.mkdir(exist_ok=True)
|
| 157 |
-
output_path = output_dir / f"embedding_{run_id}.pt"
|
| 158 |
-
|
| 159 |
-
# Save embeddings (with negative contexts if provided)
|
| 160 |
embedding_data = {
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
}
|
| 166 |
-
|
| 167 |
-
# Add negative contexts if they were encoded
|
| 168 |
if video_context_negative is not None:
|
| 169 |
-
embedding_data[
|
| 170 |
-
embedding_data[
|
| 171 |
-
embedding_data[
|
| 172 |
-
|
| 173 |
-
torch.save(embedding_data, output_path)
|
| 174 |
|
| 175 |
-
# Get memory stats
|
| 176 |
elapsed_time = time.time() - start_time
|
| 177 |
if torch.cuda.is_available():
|
| 178 |
allocated = torch.cuda.memory_allocated() / 1024**3
|
|
@@ -181,7 +139,7 @@ def encode_prompt(
|
|
| 181 |
else:
|
| 182 |
status = f"✓ Encoded in {elapsed_time:.2f}s (CPU mode)"
|
| 183 |
|
| 184 |
-
return
|
| 185 |
|
| 186 |
except Exception as e:
|
| 187 |
import traceback
|
|
@@ -189,6 +147,7 @@ def encode_prompt(
|
|
| 189 |
print(error_msg)
|
| 190 |
return None, prompt, error_msg
|
| 191 |
|
|
|
|
| 192 |
# Default prompt from docstring example
|
| 193 |
DEFAULT_PROMPT = "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot."
|
| 194 |
|
|
@@ -406,49 +365,39 @@ def generate_video(
|
|
| 406 |
frame_rate = 24.0
|
| 407 |
num_frames = int(duration * frame_rate) + 1 # +1 to ensure we meet the duration
|
| 408 |
|
| 409 |
-
|
| 410 |
-
|
| 411 |
output_dir = Path("outputs")
|
| 412 |
output_dir.mkdir(exist_ok=True)
|
| 413 |
-
|
| 414 |
-
output_path = output_dir / f"video_{run_id}.mp4"
|
| 415 |
-
temp_image_path = output_dir / f"temp_input_{run_id}.jpg"
|
| 416 |
-
|
| 417 |
|
| 418 |
# Handle image input
|
| 419 |
images = []
|
| 420 |
-
|
|
|
|
|
|
|
| 421 |
if input_image is not None:
|
| 422 |
-
|
| 423 |
-
if hasattr(input_image, 'save'):
|
| 424 |
-
input_image.save(temp_image_path)
|
| 425 |
-
else:
|
| 426 |
-
# If it's a file path already
|
| 427 |
-
temp_image_path = Path(input_image)
|
| 428 |
-
# Format: (image_path, frame_idx, strength)
|
| 429 |
-
images = [(str(temp_image_path), 0, 1.0)]
|
| 430 |
|
| 431 |
-
|
| 432 |
# Prepare image for upload if it exists
|
| 433 |
image_input = None
|
| 434 |
|
| 435 |
|
| 436 |
-
|
| 437 |
prompt=prompt,
|
| 438 |
enhance_prompt=enhance_prompt,
|
| 439 |
input_image=input_image,
|
| 440 |
seed=current_seed,
|
| 441 |
negative_prompt="",
|
| 442 |
)
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
# Load embeddings
|
| 447 |
-
embeddings = torch.load(embedding_path)
|
| 448 |
-
video_context = embeddings['video_context']
|
| 449 |
-
audio_context = embeddings['audio_context']
|
| 450 |
print("✓ Embeddings loaded successfully")
|
| 451 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
# Run inference - progress automatically tracks tqdm from pipeline
|
| 453 |
pipeline(
|
| 454 |
prompt=prompt,
|
|
@@ -463,6 +412,7 @@ def generate_video(
|
|
| 463 |
video_context=video_context,
|
| 464 |
audio_context=audio_context,
|
| 465 |
)
|
|
|
|
| 466 |
torch.cuda.empty_cache()
|
| 467 |
print("successful generation")
|
| 468 |
|
|
@@ -472,7 +422,7 @@ def generate_video(
|
|
| 472 |
import traceback
|
| 473 |
error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
|
| 474 |
print(error_msg)
|
| 475 |
-
return None
|
| 476 |
|
| 477 |
|
| 478 |
def apply_resolution(resolution: str):
|
|
@@ -649,9 +599,10 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
|
|
| 649 |
|
| 650 |
input_image = gr.Image(
|
| 651 |
label="Input Image (Optional)",
|
| 652 |
-
type="pil"
|
| 653 |
-
height=512
|
| 654 |
-
|
|
|
|
| 655 |
prompt = gr.Textbox(
|
| 656 |
label="Prompt",
|
| 657 |
value="Make this image come alive with cinematic motion, smooth animation",
|
|
|
|
| 1 |
import sys
|
| 2 |
from pathlib import Path
|
|
|
|
| 3 |
|
| 4 |
# Add packages to Python path
|
| 5 |
current_dir = Path(__file__).parent
|
|
|
|
| 96 |
def encode_prompt(
|
| 97 |
prompt: str,
|
| 98 |
enhance_prompt: bool = True,
|
| 99 |
+
input_image=None, # this is now filepath (string) or None
|
| 100 |
seed: int = 42,
|
| 101 |
negative_prompt: str = ""
|
| 102 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
start_time = time.time()
|
|
|
|
| 104 |
try:
|
|
|
|
| 105 |
final_prompt = prompt
|
| 106 |
if enhance_prompt:
|
| 107 |
+
final_prompt = generate_enhanced_prompt(
|
| 108 |
+
text_encoder=text_encoder,
|
| 109 |
+
prompt=prompt,
|
| 110 |
+
image_path=input_image if input_image is not None else None,
|
| 111 |
+
seed=seed,
|
| 112 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
+
with torch.inference_mode():
|
| 115 |
+
video_context, audio_context = encode_text_simple(text_encoder, final_prompt)
|
| 116 |
|
|
|
|
| 117 |
video_context_negative = None
|
| 118 |
audio_context_negative = None
|
| 119 |
if negative_prompt:
|
| 120 |
video_context_negative, audio_context_negative = encode_text_simple(text_encoder, negative_prompt)
|
| 121 |
|
| 122 |
+
# IMPORTANT: return tensors directly (no torch.save)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
embedding_data = {
|
| 124 |
+
"video_context": video_context.detach().cpu(),
|
| 125 |
+
"audio_context": audio_context.detach().cpu(),
|
| 126 |
+
"prompt": final_prompt,
|
| 127 |
+
"original_prompt": prompt,
|
| 128 |
}
|
|
|
|
|
|
|
| 129 |
if video_context_negative is not None:
|
| 130 |
+
embedding_data["video_context_negative"] = video_context_negative
|
| 131 |
+
embedding_data["audio_context_negative"] = audio_context_negative
|
| 132 |
+
embedding_data["negative_prompt"] = negative_prompt
|
|
|
|
|
|
|
| 133 |
|
|
|
|
| 134 |
elapsed_time = time.time() - start_time
|
| 135 |
if torch.cuda.is_available():
|
| 136 |
allocated = torch.cuda.memory_allocated() / 1024**3
|
|
|
|
| 139 |
else:
|
| 140 |
status = f"✓ Encoded in {elapsed_time:.2f}s (CPU mode)"
|
| 141 |
|
| 142 |
+
return embedding_data, final_prompt, status
|
| 143 |
|
| 144 |
except Exception as e:
|
| 145 |
import traceback
|
|
|
|
| 147 |
print(error_msg)
|
| 148 |
return None, prompt, error_msg
|
| 149 |
|
| 150 |
+
|
| 151 |
# Default prompt from docstring example
|
| 152 |
DEFAULT_PROMPT = "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot."
|
| 153 |
|
|
|
|
| 365 |
frame_rate = 24.0
|
| 366 |
num_frames = int(duration * frame_rate) + 1 # +1 to ensure we meet the duration
|
| 367 |
|
| 368 |
+
# Create output directory if it doesn't exist
|
|
|
|
| 369 |
output_dir = Path("outputs")
|
| 370 |
output_dir.mkdir(exist_ok=True)
|
| 371 |
+
output_path = output_dir / f"video_{current_seed}.mp4"
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
# Handle image input
|
| 374 |
images = []
|
| 375 |
+
temp_image_path = None # Initialize to None
|
| 376 |
+
|
| 377 |
+
images = []
|
| 378 |
if input_image is not None:
|
| 379 |
+
images = [(input_image, 0, 1.0)] # input_image is already a path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
|
|
|
|
| 381 |
# Prepare image for upload if it exists
|
| 382 |
image_input = None
|
| 383 |
|
| 384 |
|
| 385 |
+
embeddings, final_prompt, status = encode_prompt(
|
| 386 |
prompt=prompt,
|
| 387 |
enhance_prompt=enhance_prompt,
|
| 388 |
input_image=input_image,
|
| 389 |
seed=current_seed,
|
| 390 |
negative_prompt="",
|
| 391 |
)
|
| 392 |
+
|
| 393 |
+
video_context = embeddings["video_context"].to("cuda", non_blocking=True)
|
| 394 |
+
audio_context = embeddings["audio_context"].to("cuda", non_blocking=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
print("✓ Embeddings loaded successfully")
|
| 396 |
|
| 397 |
+
# free prompt enhancer / encoder temps ASAP
|
| 398 |
+
del embeddings, final_prompt, status
|
| 399 |
+
torch.cuda.empty_cache()
|
| 400 |
+
|
| 401 |
# Run inference - progress automatically tracks tqdm from pipeline
|
| 402 |
pipeline(
|
| 403 |
prompt=prompt,
|
|
|
|
| 412 |
video_context=video_context,
|
| 413 |
audio_context=audio_context,
|
| 414 |
)
|
| 415 |
+
del video_context, audio_context
|
| 416 |
torch.cuda.empty_cache()
|
| 417 |
print("successful generation")
|
| 418 |
|
|
|
|
| 422 |
import traceback
|
| 423 |
error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
|
| 424 |
print(error_msg)
|
| 425 |
+
return None, current_seed
|
| 426 |
|
| 427 |
|
| 428 |
def apply_resolution(resolution: str):
|
|
|
|
| 599 |
|
| 600 |
input_image = gr.Image(
|
| 601 |
label="Input Image (Optional)",
|
| 602 |
+
type="filepath", # <-- was "pil"
|
| 603 |
+
height=512
|
| 604 |
+
)
|
| 605 |
+
|
| 606 |
prompt = gr.Textbox(
|
| 607 |
label="Prompt",
|
| 608 |
value="Make this image come alive with cinematic motion, smooth animation",
|