gemma-text-encoder

Sleeping

App Files Files Community

linoyts HF Staff committed on Jan 6

Commit

5625a61

verified ·

1 Parent(s): 2576965

Create app.py

Browse files

Files changed (1) hide show

app.py +207 -0

app.py ADDED Viewed

	@@ -0,0 +1,207 @@

+"""
+LTX-2 Gemma Text Encoder Space
+Encodes text prompts using Gemma-3-12B for LTX-2 video generation.
+Supports prompt enhancement for better results.
+"""
+import time
+from pathlib import Path
+import spaces
+import gradio as gr
+import torch
+from huggingface_hub import hf_hub_download
+# Import from public LTX-2 package
+# Install with: pip install git+https://github.com/Lightricks/LTX-2.git
+from ltx_pipelines.utils import ModelLedger, get_device
+from ltx_pipelines.utils.helpers import generate_enhanced_prompt
+# HuggingFace Hub defaults
+# Repository the checkpoint file is downloaded from at startup.
+DEFAULT_REPO_ID = "Lightricks/LTX-2"
+# Passed to ModelLedger as `gemma_root_path` (a Hub repo id, not a local directory).
+DEFAULT_GEMMA_REPO_ID = "google/gemma-3-12b-it-qat-q4_0-unquantized"
+# File name inside DEFAULT_REPO_ID that is fetched as the checkpoint.
+DEFAULT_CHECKPOINT_FILENAME = "ltx-2-19b-dev-fp8.safetensors"
+def get_hub_or_local_checkpoint(repo_id: str, filename: str):
+    """Download from HuggingFace Hub."""
+    print(f"Downloading {filename} from {repo_id}...")
+    ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename)
+    print(f"Downloaded to {ckpt_path}")
+    return ckpt_path
+# Initialize model ledger and text encoder at startup (load once, keep in memory)
+print("=" * 80)
+print("Loading Gemma Text Encoder...")
+print("=" * 80)
+checkpoint_path = get_hub_or_local_checkpoint(DEFAULT_REPO_ID, DEFAULT_CHECKPOINT_FILENAME)
+device = get_device()
+print(f"Initializing text encoder with:")
+print(f"  checkpoint_path={checkpoint_path}")
+print(f"  gemma_root={DEFAULT_GEMMA_REPO_ID}")
+print(f"  device={device}")
+model_ledger = ModelLedger(
+    dtype=torch.bfloat16,
+    device=device,
+    checkpoint_path=checkpoint_path,
+    gemma_root_path=DEFAULT_GEMMA_REPO_ID,
+    local_files_only=False,
+)
+# Load text encoder once and keep it in memory
+text_encoder = model_ledger.text_encoder()
+print("=" * 80)
+print("Text encoder loaded and ready!")
+print("=" * 80)
+def encode_text_simple(text_encoder, prompt: str):
+    """Simple text encoding without using pipeline_utils."""
+    v_context, a_context, _ = text_encoder(prompt)
+    return v_context, a_context
+@spaces.GPU()
+def encode_prompt(
+    prompt: str,
+    enhance_prompt: bool = False,
+    input_image = None,
+    seed: int = 42
+):
+    """
+    Encode a text prompt using Gemma text encoder.
+    Args:
+        prompt: Text prompt to encode
+        enhance_prompt: Whether to use AI to enhance the prompt
+        input_image: Optional image for image-to-video enhancement
+        seed: Random seed for prompt enhancement
+    Returns:
+        tuple: (file_path, enhanced_prompt_text, status_message)
+    """
+    start_time = time.time()
+    try:
+        # Enhance prompt if requested
+        final_prompt = prompt
+        if enhance_prompt:
+            if input_image is not None:
+                # Save image temporarily
+                temp_dir = Path("temp_images")
+                temp_dir.mkdir(exist_ok=True)
+                temp_image_path = temp_dir / f"temp_{int(time.time())}.jpg"
+                if hasattr(input_image, 'save'):
+                    input_image.save(temp_image_path)
+                else:
+                    temp_image_path = input_image
+                final_prompt = generate_enhanced_prompt(
+                    text_encoder=text_encoder,
+                    prompt=prompt,
+                    image_path=str(temp_image_path),
+                    seed=seed
+                )
+            else:
+                final_prompt = generate_enhanced_prompt(
+                    text_encoder=text_encoder,
+                    prompt=prompt,
+                    image_path=None,
+                    seed=seed
+                )
+        # Encode the prompt using the pre-loaded text encoder
+        video_context, audio_context = encode_text_simple(text_encoder, final_prompt)
+        # Save embeddings to file
+        output_dir = Path("embeddings")
+        output_dir.mkdir(exist_ok=True)
+        output_path = output_dir / f"embedding_{int(time.time())}.pt"
+        # Save both video and audio contexts
+        torch.save({
+            'video_context': video_context.cpu(),
+            'audio_context': audio_context.cpu(),
+            'prompt': final_prompt,
+            'original_prompt': prompt if enhance_prompt else final_prompt,
+        }, output_path)
+        # Get memory stats
+        elapsed_time = time.time() - start_time
+        if torch.cuda.is_available():
+            allocated = torch.cuda.memory_allocated() / 1024**3
+            peak = torch.cuda.max_memory_allocated() / 1024**3
+            status = f"✓ Encoded in {elapsed_time:.2f}s | VRAM: {allocated:.2f}GB allocated, {peak:.2f}GB peak"
+        else:
+            status = f"✓ Encoded in {elapsed_time:.2f}s (CPU mode)"
+        return str(output_path), final_prompt, status
+    except Exception as e:
+        import traceback
+        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
+        return None, prompt, error_msg
+# Create Gradio interface
+with gr.Blocks(title="LTX-2 Gemma Text Encoder") as demo:
+    gr.Markdown("# LTX-2 Gemma Text Encoder 🎯")
+    gr.Markdown("""
+    Encode text prompts using Gemma-3-12B for LTX-2 video generation.
+    This space generates embeddings that can be used by the main LTX-2 generation space.
+    """)
+    with gr.Row():
+        with gr.Column():
+            prompt_input = gr.Textbox(
+                label="Prompt",
+                placeholder="Enter your prompt here...",
+                lines=5,
+                value="An astronaut hatches from a fragile egg on the surface of the Moon"
+            )
+            enhance_checkbox = gr.Checkbox(
+                label="Enhance Prompt with AI",
+                value=False,
+                info="Use Gemma to automatically enhance your prompt for better results"
+            )
+            with gr.Accordion("Prompt Enhancement Settings", open=False):
+                input_image = gr.Image(
+                    label="Reference Image (Optional)",
+                    type="pil",
+                    info="Provide an image for image-to-video prompt enhancement"
+                )
+                enhancement_seed = gr.Slider(
+                    label="Enhancement Seed",
+                    minimum=0,
+                    maximum=10000,
+                    value=42,
+                    step=1,
+                    info="Random seed for prompt enhancement"
+                )
+            encode_btn = gr.Button("Encode Prompt", variant="primary", size="lg")
+        with gr.Column():
+            embedding_file = gr.File(label="Embedding File (.pt)")
+            enhanced_prompt_output = gr.Textbox(
+                label="Final Prompt Used",
+                lines=5,
+                info="This is the prompt that was encoded (enhanced if enabled)"
+            )
+            status_output = gr.Textbox(label="Status", lines=2)
+    encode_btn.click(
+        fn=encode_prompt,
+        inputs=[prompt_input, enhance_checkbox, input_image, enhancement_seed],
+        outputs=[embedding_file, enhanced_prompt_output, status_output]
+    )
+css = '''
+.gradio-container .contain{max-width: 1200px !important; margin: 0 auto !important}
+'''
+if __name__ == "__main__":
+    demo.launch(css=css)