Test

Paused

App Files Community

alexnasa committed on Jan 8

Commit

1fec881

verified ·

1 Parent(s): 5b43fde

Update app.py

Browse files

Files changed (1) hide show

app.py +188 -47

app.py CHANGED Viewed

@@ -7,15 +7,15 @@ sys.path.insert(0, str(current_dir / "packages" / "ltx-pipelines" / "src"))
 sys.path.insert(0, str(current_dir / "packages" / "ltx-core" / "src"))
 import spaces
 import gradio as gr
-from gradio_client import Client, handle_file
 import numpy as np
 import random
 import torch
 from typing import Optional
 from pathlib import Path
-from huggingface_hub import hf_hub_download
-from gradio_client import Client
 from ltx_pipelines.distilled import DistilledPipeline
 from ltx_core.model.video_vae import TilingConfig
 from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
@@ -29,7 +29,165 @@ from ltx_pipelines.utils.constants import (
     DEFAULT_LORA_STRENGTH,
 )
 MAX_SEED = np.iinfo(np.int32).max
 # Default prompt from docstring example
 DEFAULT_PROMPT = "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot."
@@ -86,6 +244,7 @@ loras = [
 # Initialize pipeline WITHOUT text encoder (gemma_root=None)
 # Text encoding will be done by external space
 pipeline = DistilledPipeline(
     checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=None,  # No text encoder in this space
@@ -93,23 +252,18 @@ pipeline = DistilledPipeline(
     fp8transformer=False,
     local_files_only=False,
 )
 pipeline._video_encoder = pipeline.model_ledger.video_encoder()
 pipeline._transformer = pipeline.model_ledger.transformer()
-# Initialize text encoder client
-print(f"Connecting to text encoder space: {TEXT_ENCODER_SPACE}")
-try:
-    text_encoder_client = Client(TEXT_ENCODER_SPACE)
-    print("✓ Text encoder client connected!")
-except Exception as e:
-    print(f"⚠ Warning: Could not connect to text encoder space: {e}")
-    text_encoder_client = None
 print("=" * 80)
 print("Pipeline fully loaded and ready!")
 print("=" * 80)
-@spaces.GPU(duration=300)
 def generate_video(
     input_image,
     prompt: str,
@@ -118,9 +272,10 @@ def generate_video(
     seed: int = 42,
     randomize_seed: bool = True,
     height: int = DEFAULT_1_STAGE_HEIGHT,
-    width: int = DEFAULT_1_STAGE_WIDTH,
     progress=gr.Progress(track_tqdm=True)
 ):
     """Generate a video based on the given parameters."""
     try:
         # Randomize seed if checkbox is enabled
@@ -153,39 +308,25 @@ def generate_video(
         # Get embeddings from text encoder space
         print(f"Encoding prompt: {prompt}")
-        if text_encoder_client is None:
-            raise RuntimeError(
-                f"Text encoder client not connected. Please ensure the text encoder space "
-                f"({TEXT_ENCODER_SPACE}) is running and accessible."
-            )
-        try:
-            # Prepare image for upload if it exists
-            image_input = None
-            if temp_image_path is not None:
-                image_input = handle_file(str(temp_image_path))
-            result = text_encoder_client.predict(
-                prompt=prompt,
-                enhance_prompt=enhance_prompt,
-                input_image=image_input,
-                seed=current_seed,
-                negative_prompt="",
-                api_name="/encode_prompt"
-            )
-            embedding_path = result[0]  # Path to .pt file
-            print(f"Embeddings received from: {embedding_path}")
-            # Load embeddings
-            embeddings = torch.load(embedding_path)
-            video_context = embeddings['video_context']
-            audio_context = embeddings['audio_context']
-            print("✓ Embeddings loaded successfully")
-        except Exception as e:
-            raise RuntimeError(
-                f"Failed to get embeddings from text encoder space: {e}\n"
-                f"Please ensure {TEXT_ENCODER_SPACE} is running properly."
-            )
         # Run inference - progress automatically tracks tqdm from pipeline
         pipeline(
@@ -321,4 +462,4 @@ css = '''
 .gradio-container .contain{max-width: 1200px !important; margin: 0 auto !important}
 '''
 if __name__ == "__main__":
-    demo.launch(theme=gr.themes.Citrus(), css=css)

 sys.path.insert(0, str(current_dir / "packages" / "ltx-core" / "src"))
 import spaces
+import flash_attn_interface
+import time
 import gradio as gr
 import numpy as np
 import random
 import torch
 from typing import Optional
 from pathlib import Path
+from huggingface_hub import hf_hub_download, snapshot_download
 from ltx_pipelines.distilled import DistilledPipeline
 from ltx_core.model.video_vae import TilingConfig
 from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
     DEFAULT_LORA_STRENGTH,
 )
 MAX_SEED = np.iinfo(np.int32).max
+# Import from public LTX-2 package
+# Install with: pip install git+https://github.com/Lightricks/LTX-2.git
+from ltx_pipelines.utils import ModelLedger
+from ltx_pipelines.utils.helpers import generate_enhanced_prompt
+# HuggingFace Hub defaults
+DEFAULT_REPO_ID = "Lightricks/LTX-2"
+DEFAULT_GEMMA_REPO_ID = "unsloth/gemma-3-12b-it-qat-bnb-4bit"
+DEFAULT_CHECKPOINT_FILENAME = "ltx-2-19b-dev.safetensors"
+def get_hub_or_local_checkpoint(repo_id: str, filename: str):
+    """Download from HuggingFace Hub."""
+    print(f"Downloading {filename} from {repo_id}...")
+    ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename)
+    print(f"Downloaded to {ckpt_path}")
+    return ckpt_path
+def download_gemma_model(repo_id: str):
+    """Download the full Gemma model directory."""
+    print(f"Downloading Gemma model from {repo_id}...")
+    local_dir = snapshot_download(repo_id=repo_id)
+    print(f"Gemma model downloaded to {local_dir}")
+    return local_dir
+# Initialize model ledger and text encoder at startup (load once, keep in memory)
+print("=" * 80)
+print("Loading Gemma Text Encoder...")
+print("=" * 80)
+checkpoint_path = get_hub_or_local_checkpoint(DEFAULT_REPO_ID, DEFAULT_CHECKPOINT_FILENAME)
+gemma_local_path = download_gemma_model(DEFAULT_GEMMA_REPO_ID)
+device = "cuda"
+print(f"Initializing text encoder with:")
+print(f"  checkpoint_path={checkpoint_path}")
+print(f"  gemma_root={gemma_local_path}")
+print(f"  device={device}")
+model_ledger = ModelLedger(
+    dtype=torch.bfloat16,
+    device=device,
+    checkpoint_path=checkpoint_path,
+    gemma_root_path=DEFAULT_GEMMA_REPO_ID,
+    local_files_only=False
+)
+# Load text encoder once and keep it in memory
+text_encoder = model_ledger.text_encoder()
+print("=" * 80)
+print("Text encoder loaded and ready!")
+print("=" * 80)
+def encode_text_simple(text_encoder, prompt: str):
+    """Simple text encoding without using pipeline_utils."""
+    v_context, a_context, _ = text_encoder(prompt)
+    return v_context, a_context
+@spaces.GPU()
+def encode_prompt(
+    prompt: str,
+    enhance_prompt: bool = True,
+    input_image = None,
+    seed: int = 42,
+    negative_prompt: str = ""
+):
+    """
+    Encode a text prompt using Gemma text encoder.
+    Args:
+        prompt: Text prompt to encode
+        enhance_prompt: Whether to use AI to enhance the prompt
+        input_image: Optional image for image-to-video enhancement
+        seed: Random seed for prompt enhancement
+        negative_prompt: Optional negative prompt for CFG (two-stage pipeline)
+    Returns:
+        tuple: (file_path, enhanced_prompt_text, status_message)
+    """
+    start_time = time.time()
+    try:
+        # Enhance prompt if requested
+        final_prompt = prompt
+        if enhance_prompt:
+            if input_image is not None:
+                # Save image temporarily
+                temp_dir = Path("temp_images")
+                temp_dir.mkdir(exist_ok=True)
+                temp_image_path = temp_dir / f"temp_{int(time.time())}.jpg"
+                if hasattr(input_image, 'save'):
+                    input_image.save(temp_image_path)
+                else:
+                    temp_image_path = input_image
+                final_prompt = generate_enhanced_prompt(
+                    text_encoder=text_encoder,
+                    prompt=prompt,
+                    image_path=str(temp_image_path),
+                    seed=seed
+                )
+            else:
+                final_prompt = generate_enhanced_prompt(
+                    text_encoder=text_encoder,
+                    prompt=prompt,
+                    image_path=None,
+                    seed=seed
+                )
+        # Encode the positive prompt using the pre-loaded text encoder
+        video_context, audio_context = encode_text_simple(text_encoder, final_prompt)
+        # Encode negative prompt if provided
+        video_context_negative = None
+        audio_context_negative = None
+        if negative_prompt:
+            video_context_negative, audio_context_negative = encode_text_simple(text_encoder, negative_prompt)
+        # Save embeddings to file
+        output_dir = Path("embeddings")
+        output_dir.mkdir(exist_ok=True)
+        output_path = output_dir / f"embedding_{int(time.time())}.pt"
+        # Save embeddings (with negative contexts if provided)
+        embedding_data = {
+            'video_context': video_context.cpu(),
+            'audio_context': audio_context.cpu(),
+            'prompt': final_prompt,
+            'original_prompt': prompt if enhance_prompt else final_prompt,
+        }
+        # Add negative contexts if they were encoded
+        if video_context_negative is not None:
+            embedding_data['video_context_negative'] = video_context_negative.cpu()
+            embedding_data['audio_context_negative'] = audio_context_negative.cpu()
+            embedding_data['negative_prompt'] = negative_prompt
+        torch.save(embedding_data, output_path)
+        # Get memory stats
+        elapsed_time = time.time() - start_time
+        if torch.cuda.is_available():
+            allocated = torch.cuda.memory_allocated() / 1024**3
+            peak = torch.cuda.max_memory_allocated() / 1024**3
+            status = f"✓ Encoded in {elapsed_time:.2f}s | VRAM: {allocated:.2f}GB allocated, {peak:.2f}GB peak"
+        else:
+            status = f"✓ Encoded in {elapsed_time:.2f}s (CPU mode)"
+        return str(output_path), final_prompt, status
+    except Exception as e:
+        import traceback
+        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
+        return None, prompt, error_msg
 # Default prompt from docstring example
 DEFAULT_PROMPT = "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot."
 # Initialize pipeline WITHOUT text encoder (gemma_root=None)
 # Text encoding will be done by external space
 pipeline = DistilledPipeline(
+    device=torch.device("cuda"),
     checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=None,  # No text encoder in this space
     fp8transformer=False,
     local_files_only=False,
 )
 pipeline._video_encoder = pipeline.model_ledger.video_encoder()
 pipeline._transformer = pipeline.model_ledger.transformer()
+# pipeline.device = torch.device("cuda")
+# pipeline.model_ledger.device = torch.device("cuda")
 print("=" * 80)
 print("Pipeline fully loaded and ready!")
 print("=" * 80)
+@spaces.GPU(duration=80)
 def generate_video(
     input_image,
     prompt: str,
     seed: int = 42,
     randomize_seed: bool = True,
     height: int = DEFAULT_1_STAGE_HEIGHT,
+    width: int = DEFAULT_1_STAGE_WIDTH ,
     progress=gr.Progress(track_tqdm=True)
 ):
     """Generate a video based on the given parameters."""
     try:
         # Randomize seed if checkbox is enabled
         # Get embeddings from text encoder space
         print(f"Encoding prompt: {prompt}")
+        # Prepare image for upload if it exists
+        image_input = None
+        result = encode_prompt(
+            prompt=prompt,
+            enhance_prompt=enhance_prompt,
+            input_image=input_image,
+            seed=current_seed,
+            negative_prompt="",
+        )
+        embedding_path = result[0]  # Path to .pt file
+        print(f"Embeddings received from: {embedding_path}")
+        # Load embeddings
+        embeddings = torch.load(embedding_path)
+        video_context = embeddings['video_context']
+        audio_context = embeddings['audio_context']
+        print("✓ Embeddings loaded successfully")
         # Run inference - progress automatically tracks tqdm from pipeline
         pipeline(
 .gradio-container .contain{max-width: 1200px !important; margin: 0 auto !important}
 '''
 if __name__ == "__main__":
+    demo.launch(theme=gr.themes.Citrus(), css=css)