Test

Paused

App Files Community

alexnasa committed on Jan 9

Commit

9087721

verified ·

1 Parent(s): 6ed7319

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -88

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import sys
 from pathlib import Path
-import uuid
 # Add packages to Python path
 current_dir = Path(__file__).parent
@@ -97,82 +96,41 @@ def encode_text_simple(text_encoder, prompt: str):
 def encode_prompt(
     prompt: str,
     enhance_prompt: bool = True,
-    input_image = None,
     seed: int = 42,
     negative_prompt: str = ""
 ):
-    """
-    Encode a text prompt using Gemma text encoder.
-    Args:
-        prompt: Text prompt to encode
-        enhance_prompt: Whether to use AI to enhance the prompt
-        input_image: Optional image for image-to-video enhancement
-        seed: Random seed for prompt enhancement
-        negative_prompt: Optional negative prompt for CFG (two-stage pipeline)
-    Returns:
-        tuple: (file_path, enhanced_prompt_text, status_message)
-    """
     start_time = time.time()
     try:
-        # Enhance prompt if requested
         final_prompt = prompt
         if enhance_prompt:
-            if input_image is not None:
-                # Save image temporarily
-                temp_dir = Path("temp_images")
-                temp_dir.mkdir(exist_ok=True)
-                temp_image_path = temp_dir / f"temp_{int(time.time())}.jpg"
-                if hasattr(input_image, 'save'):
-                    input_image.save(temp_image_path)
-                else:
-                    temp_image_path = input_image
-                final_prompt = generate_enhanced_prompt(
-                    text_encoder=text_encoder,
-                    prompt=prompt,
-                    image_path=str(temp_image_path),
-                    seed=seed
-                )
-            else:
-                final_prompt = generate_enhanced_prompt(
-                    text_encoder=text_encoder,
-                    prompt=prompt,
-                    image_path=None,
-                    seed=seed
-                )
-        # Encode the positive prompt using the pre-loaded text encoder
-        video_context, audio_context = encode_text_simple(text_encoder, final_prompt)
-        # Encode negative prompt if provided
         video_context_negative = None
         audio_context_negative = None
         if negative_prompt:
             video_context_negative, audio_context_negative = encode_text_simple(text_encoder, negative_prompt)
-        run_id = uuid.uuid4().hex
-        output_dir = Path("embeddings")
-        output_dir.mkdir(exist_ok=True)
-        output_path = output_dir / f"embedding_{run_id}.pt"
-        # Save embeddings (with negative contexts if provided)
         embedding_data = {
-            'video_context': video_context.cpu(),
-            'audio_context': audio_context.cpu(),
-            'prompt': final_prompt,
-            'original_prompt': prompt if enhance_prompt else final_prompt,
         }
-        # Add negative contexts if they were encoded
         if video_context_negative is not None:
-            embedding_data['video_context_negative'] = video_context_negative.cpu()
-            embedding_data['audio_context_negative'] = audio_context_negative.cpu()
-            embedding_data['negative_prompt'] = negative_prompt
-        torch.save(embedding_data, output_path)
-        # Get memory stats
         elapsed_time = time.time() - start_time
         if torch.cuda.is_available():
             allocated = torch.cuda.memory_allocated() / 1024**3
@@ -181,7 +139,7 @@ def encode_prompt(
         else:
             status = f"✓ Encoded in {elapsed_time:.2f}s (CPU mode)"
-        return str(output_path), final_prompt, status
     except Exception as e:
         import traceback
@@ -189,6 +147,7 @@ def encode_prompt(
         print(error_msg)
         return None, prompt, error_msg
 # Default prompt from docstring example
 DEFAULT_PROMPT = "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot."
@@ -406,49 +365,39 @@ def generate_video(
         frame_rate = 24.0
         num_frames = int(duration * frame_rate) + 1  # +1 to ensure we meet the duration
-        run_id = uuid.uuid4().hex
         output_dir = Path("outputs")
         output_dir.mkdir(exist_ok=True)
-        output_path = output_dir / f"video_{run_id}.mp4"
-        temp_image_path = output_dir / f"temp_input_{run_id}.jpg"
         # Handle image input
         images = []
         if input_image is not None:
-            if hasattr(input_image, 'save'):
-                input_image.save(temp_image_path)
-            else:
-                # If it's a file path already
-                temp_image_path = Path(input_image)
-            # Format: (image_path, frame_idx, strength)
-            images = [(str(temp_image_path), 0, 1.0)]
         # Prepare image for upload if it exists
         image_input = None
-        result = encode_prompt(
             prompt=prompt,
             enhance_prompt=enhance_prompt,
             input_image=input_image,
             seed=current_seed,
             negative_prompt="",
         )
-        embedding_path = result[0]  # Path to .pt file
-        print(f"Embeddings received from: {embedding_path}")
-        # Load embeddings
-        embeddings = torch.load(embedding_path)
-        video_context = embeddings['video_context']
-        audio_context = embeddings['audio_context']
         print("✓ Embeddings loaded successfully")
         # Run inference - progress automatically tracks tqdm from pipeline
         pipeline(
             prompt=prompt,
@@ -463,6 +412,7 @@ def generate_video(
             video_context=video_context,
             audio_context=audio_context,
         )
         torch.cuda.empty_cache()
         print("successful generation")
@@ -472,7 +422,7 @@ def generate_video(
         import traceback
         error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
         print(error_msg)
-        return None
 def apply_resolution(resolution: str):
@@ -649,9 +599,10 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
                 input_image = gr.Image(
                     label="Input Image (Optional)",
-                    type="pil",
-                    height=512)
                 prompt = gr.Textbox(
                     label="Prompt",
                     value="Make this image come alive with cinematic motion, smooth animation",

 import sys
 from pathlib import Path
 # Add packages to Python path
 current_dir = Path(__file__).parent
 def encode_prompt(
     prompt: str,
     enhance_prompt: bool = True,
+    input_image=None,   # this is now filepath (string) or None
     seed: int = 42,
     negative_prompt: str = ""
 ):
     start_time = time.time()
     try:
         final_prompt = prompt
         if enhance_prompt:
+            final_prompt = generate_enhanced_prompt(
+                text_encoder=text_encoder,
+                prompt=prompt,
+                image_path=input_image if input_image is not None else None,
+                seed=seed,
+            )
+        with torch.inference_mode():
+            video_context, audio_context = encode_text_simple(text_encoder, final_prompt)
         video_context_negative = None
         audio_context_negative = None
         if negative_prompt:
             video_context_negative, audio_context_negative = encode_text_simple(text_encoder, negative_prompt)
+        # IMPORTANT: return tensors directly (no torch.save)
         embedding_data = {
+            "video_context": video_context.detach().cpu(),
+            "audio_context": audio_context.detach().cpu(),
+            "prompt": final_prompt,
+            "original_prompt": prompt,
         }
         if video_context_negative is not None:
+            embedding_data["video_context_negative"] = video_context_negative
+            embedding_data["audio_context_negative"] = audio_context_negative
+            embedding_data["negative_prompt"] = negative_prompt
         elapsed_time = time.time() - start_time
         if torch.cuda.is_available():
             allocated = torch.cuda.memory_allocated() / 1024**3
         else:
             status = f"✓ Encoded in {elapsed_time:.2f}s (CPU mode)"
+        return embedding_data, final_prompt, status
     except Exception as e:
         import traceback
         print(error_msg)
         return None, prompt, error_msg
 # Default prompt from docstring example
 DEFAULT_PROMPT = "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot."
         frame_rate = 24.0
         num_frames = int(duration * frame_rate) + 1  # +1 to ensure we meet the duration
+        # Create output directory if it doesn't exist
         output_dir = Path("outputs")
         output_dir.mkdir(exist_ok=True)
+        output_path = output_dir / f"video_{current_seed}.mp4"
         # Handle image input
         images = []
+        temp_image_path = None  # Initialize to None
+        images = []
         if input_image is not None:
+            images = [(input_image, 0, 1.0)]  # input_image is already a path
         # Prepare image for upload if it exists
         image_input = None
+        embeddings, final_prompt, status = encode_prompt(
             prompt=prompt,
             enhance_prompt=enhance_prompt,
             input_image=input_image,
             seed=current_seed,
             negative_prompt="",
         )
+        video_context = embeddings["video_context"].to("cuda", non_blocking=True)
+        audio_context = embeddings["audio_context"].to("cuda", non_blocking=True)
         print("✓ Embeddings loaded successfully")
+        # free prompt enhancer / encoder temps ASAP
+        del embeddings, final_prompt, status
+        torch.cuda.empty_cache()
         # Run inference - progress automatically tracks tqdm from pipeline
         pipeline(
             prompt=prompt,
             video_context=video_context,
             audio_context=audio_context,
         )
+        del video_context, audio_context
         torch.cuda.empty_cache()
         print("successful generation")
         import traceback
         error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
         print(error_msg)
+        return None, current_seed
 def apply_resolution(resolution: str):
                 input_image = gr.Image(
                     label="Input Image (Optional)",
+                    type="filepath",   # <-- was "pil"
+                    height=512
+                )
                 prompt = gr.Textbox(
                     label="Prompt",
                     value="Make this image come alive with cinematic motion, smooth animation",