Instructions to use vidfom/Ltx-3 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- llama-cpp-python
How to use vidfom/Ltx-3 with llama-cpp-python:
# !pip install llama-cpp-python
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="vidfom/Ltx-3",
    filename="ComfyUI/models/text_encoders/gemma-3-12b-it-qat-UD-Q4_K_XL.gguf",
)
llm.create_chat_completion(
    messages = "No input example has been defined for this model task."
)
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- llama.cpp
How to use vidfom/Ltx-3 with llama.cpp:
Install from brew
brew install llama.cpp
# Start a local OpenAI-compatible server with a web UI:
llama-server -hf vidfom/Ltx-3:UD-Q4_K_XL
# Run inference directly in the terminal:
llama-cli -hf vidfom/Ltx-3:UD-Q4_K_XL
Install from WinGet (Windows)
winget install llama.cpp
# Start a local OpenAI-compatible server with a web UI:
llama-server -hf vidfom/Ltx-3:UD-Q4_K_XL
# Run inference directly in the terminal:
llama-cli -hf vidfom/Ltx-3:UD-Q4_K_XL
Use pre-built binary
# Download pre-built binary from:
# https://github.com/ggerganov/llama.cpp/releases
# Start a local OpenAI-compatible server with a web UI:
./llama-server -hf vidfom/Ltx-3:UD-Q4_K_XL
# Run inference directly in the terminal:
./llama-cli -hf vidfom/Ltx-3:UD-Q4_K_XL
Build from source code
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
cmake -B build
cmake --build build -j --target llama-server llama-cli
# Start a local OpenAI-compatible server with a web UI:
./build/bin/llama-server -hf vidfom/Ltx-3:UD-Q4_K_XL
# Run inference directly in the terminal:
./build/bin/llama-cli -hf vidfom/Ltx-3:UD-Q4_K_XL
Use Docker
docker model run hf.co/vidfom/Ltx-3:UD-Q4_K_XL
- LM Studio
- Jan
- Ollama
How to use vidfom/Ltx-3 with Ollama:
ollama run hf.co/vidfom/Ltx-3:UD-Q4_K_XL
- Unsloth Studio
How to use vidfom/Ltx-3 with Unsloth Studio:
Install Unsloth Studio (macOS, Linux, WSL)
curl -fsSL https://unsloth.ai/install.sh | sh
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for vidfom/Ltx-3 to start chatting
Install Unsloth Studio (Windows)
irm https://unsloth.ai/install.ps1 | iex
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for vidfom/Ltx-3 to start chatting
Using HuggingFace Spaces for Unsloth
# No setup required
# Open https://huggingface.co/spaces/unsloth/studio in your browser
# Search for vidfom/Ltx-3 to start chatting
- Docker Model Runner
How to use vidfom/Ltx-3 with Docker Model Runner:
docker model run hf.co/vidfom/Ltx-3:UD-Q4_K_XL
- Lemonade
How to use vidfom/Ltx-3 with Lemonade:
Pull the model
# Download Lemonade from https://lemonade-server.ai/
lemonade pull vidfom/Ltx-3:UD-Q4_K_XL
Run and chat with the model
lemonade run user.Ltx-3-UD-Q4_K_XL
List all available models
lemonade list
| from abc import ABC, abstractmethod | |
| from typing import Tuple | |
| import torch | |
| from einops import rearrange | |
| from torch import Tensor | |
def latent_to_pixel_coords(
    latent_coords: Tensor, scale_factors: Tuple[int, int, int], causal_fix: bool = False
) -> Tensor:
    """
    Map latent-space coordinates to pixel-space coordinates.

    Each coordinate channel (axis 1 of `latent_coords`) is multiplied by the
    matching entry of `scale_factors`. The input tensor is not modified; a new
    tensor is returned.

    Args:
        latent_coords (Tensor): Latent corner coordinates of each token, with the
            coordinate channels on axis 1, e.g. [batch_size, 3, num_latents].
        scale_factors (Tuple[int, int, int]): The scale factors of the VAE's latent
            space, one per coordinate channel.
        causal_fix (bool): When True, channel 0 (the temporal one) is shifted by
            `1 - scale_factors[0]` and clamped at 0, to account for the different
            temporal scale of the first frame. Defaults to False for backwards
            compatibility.

    Returns:
        Tensor: Pixel coordinates with the same shape as `latent_coords`.
    """
    # Shape that broadcasts the per-channel factors along axis 1 only.
    view_shape = [1] * latent_coords.ndim
    view_shape[1] = -1

    factors = torch.tensor(scale_factors, device=latent_coords.device)
    pixel_coords = latent_coords * factors.view(*view_shape)

    if not causal_fix:
        return pixel_coords

    # Causality: the first frame has a temporal scale of 1, so shift the whole
    # temporal channel back and keep it from going negative.
    temporal = pixel_coords[:, 0, ...]
    pixel_coords[:, 0, ...] = (temporal + 1 - scale_factors[0]).clamp(min=0)
    return pixel_coords
class Patchifier(ABC):
    """
    Common base for patchifiers.

    Holds the patch size as a `(1, patch_size, patch_size)` tuple and provides
    `get_latent_coords`, which lists the latent-space corner coordinates of every
    patch. `patchify` and `unpatchify` are placeholders here (they return None);
    concrete subclasses supply the real implementations.
    """

    def __init__(self, patch_size: int, start_end: bool=False):
        """
        Args:
            patch_size (int): Patch edge length used for the height and width axes;
                the frame axis always uses a patch size of 1.
            start_end (bool): When True, `get_latent_coords` returns both the start
                and the end corner of each patch instead of only the start corner.
        """
        super().__init__()
        self.start_end = start_end
        self._patch_size = (1, patch_size, patch_size)

    def patchify(
        self, latents: Tensor, frame_rates: Tensor, scale_grid: bool
    ) -> Tuple[Tensor, Tensor]:
        """Placeholder; does nothing and returns None in this base class."""
        return None

    def unpatchify(
        self,
        latents: Tensor,
        output_height: int,
        output_width: int,
        output_num_frames: int,
        out_channels: int,
    ) -> Tuple[Tensor, Tensor]:
        """Placeholder; does nothing and returns None in this base class."""
        return None

    def patch_size(self):
        """Return the `(1, patch_size, patch_size)` patch-size tuple."""
        return self._patch_size

    def get_latent_coords(
        self, latent_num_frames, latent_height, latent_width, batch_size, device
    ):
        """
        Return the latent coordinates of the top-left corner of each latent patch.

        The result has shape [batch_size, 3, num_patches], with patches enumerated
        in (frame, height, width) order and the same grid repeated for every batch
        element. When `self.start_end` is True an extra trailing axis of size 2 is
        added, holding (start corner, start corner + patch size).
        """
        step_f, step_h, step_w = self._patch_size
        axes = (
            torch.arange(0, latent_num_frames, step_f, device=device),
            torch.arange(0, latent_height, step_h, device=device),
            torch.arange(0, latent_width, step_w, device=device),
        )
        # [3, f, h, w]: one channel per coordinate axis.
        starts = torch.stack(torch.meshgrid(*axes, indexing="ij"), dim=0)
        patch_extent = torch.tensor(
            self._patch_size, device=starts.device, dtype=starts.dtype
        )
        ends = starts + patch_extent[:, None, None, None]

        def per_batch(grid):
            # [3, f, h, w] -> [batch_size, 3, f * h * w]
            tiled = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
            return tiled.flatten(start_dim=2)

        starts = per_batch(starts)
        if not self.start_end:
            return starts
        return torch.stack((starts, per_batch(ends)), dim=-1)
class SymmetricPatchifier(Patchifier):
    """
    Patchifier that folds each `_patch_size` block of a 5-D latent tensor into the
    feature axis, and unfolds it again with the inverse rearrangement.
    """

    def patchify(
        self,
        latents: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Turn a latent volume into a sequence of patch tokens.

        Args:
            latents (Tensor): Tensor of shape [b, c, f, h, w]; f, h and w must be
                divisible by the corresponding entries of `self._patch_size`.

        Returns:
            Tuple[Tensor, Tensor]: The tokens, of shape
            [b, num_patches, c * p1 * p2 * p3], and the latent coordinates of each
            patch as produced by `get_latent_coords`.
        """
        b, _, f, h, w = latents.shape
        # Coordinates are computed on the un-patched grid, stepping by patch size.
        latent_coords = self.get_latent_coords(f, h, w, b, latents.device)
        latents = rearrange(
            latents,
            "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
            p1=self._patch_size[0],
            p2=self._patch_size[1],
            p3=self._patch_size[2],
        )
        return latents, latent_coords

    def unpatchify(
        self,
        latents: Tensor,
        output_height: int,
        output_width: int,
        output_num_frames: int,
        out_channels: int,
    ) -> Tensor:
        """
        Invert `patchify`: rebuild the [b, c, f, h, w] latent volume from tokens.

        Args:
            latents (Tensor): Tokens of shape [b, num_patches, c * p1 * p2 * p3].
            output_height (int): Height of the rebuilt volume (before patching).
            output_width (int): Width of the rebuilt volume (before patching).
            output_num_frames (int): Number of frames of the rebuilt volume.
            out_channels (int): Accepted for interface compatibility; the channel
                count is inferred from the token feature size and this value is
                not used.

        Returns:
            Tensor: The rebuilt latent volume. (A single tensor, not a tuple.)
        """
        p1, p2, p3 = self._patch_size
        # Mirror patchify exactly, including the frame patch size. With the
        # base-class patch size of (1, p, p) this is identical to the previous
        # "(c p q)" rearrangement.
        latents = rearrange(
            latents,
            "b (f h w) (c p1 p2 p3) -> b c (f p1) (h p2) (w p3)",
            f=output_num_frames // p1,
            h=output_height // p2,
            w=output_width // p3,
            p1=p1,
            p2=p2,
            p3=p3,
        )
        return latents
class AudioPatchifier(Patchifier):
    """
    Patchifier for audio latents of shape (batch, channels, time, freq).

    Each latent time step becomes one token (channels and freq are merged), and
    every token is tagged with a timing derived from its latent index, the latent
    downsample factor, the hop length and the sample rate.
    """

    def __init__(self, patch_size: int,
                 sample_rate=16000,
                 hop_length=160,
                 audio_latent_downsample_factor=4,
                 is_causal=True,
                 start_end=False,
                 shift=0,
                 ):
        """
        Args:
            patch_size (int): Forwarded to `Patchifier.__init__`.
            sample_rate: Divisor used when converting mel frames to time.
            hop_length: Multiplier used when converting mel frames to time.
            audio_latent_downsample_factor: Number of mel frames per latent frame.
            is_causal: When True, mel-frame indices are shifted by
                `1 - audio_latent_downsample_factor` and clipped at 0.
            start_end: When True, `patchify` returns (start, end) timing pairs
                instead of start timings only.
            shift: Offset added to the latent indices before computing timings.
        """
        super().__init__(patch_size, start_end=start_end)
        self.hop_length = hop_length
        self.sample_rate = sample_rate
        self.audio_latent_downsample_factor = audio_latent_downsample_factor
        self.is_causal = is_causal
        self.shift = shift

    def copy_with_shift(self, shift):
        """Return a new AudioPatchifier with the same settings but a different `shift`."""
        return AudioPatchifier(
            # The constructor expects the int patch size, not the stored
            # (1, p, p) tuple (nor the `patch_size` accessor itself).
            patch_size=self._patch_size[1],
            sample_rate=self.sample_rate,
            hop_length=self.hop_length,
            audio_latent_downsample_factor=self.audio_latent_downsample_factor,
            is_causal=self.is_causal,
            start_end=self.start_end,
            shift=shift,
        )

    def _get_audio_latent_time_in_sec(self, start_latent, end_latent: int, dtype: torch.dtype, device=None):
        """
        Convert the latent indices in [start_latent, end_latent) to times.

        Args:
            start_latent: First latent index (inclusive).
            end_latent (int): Last latent index (exclusive).
            dtype (torch.dtype): dtype of the index tensor and of the result.
            device: Device for the result; None uses torch's default device.

        Returns:
            Tensor: 1-D tensor with `mel_frame * hop_length / sample_rate` per
            latent index (seconds, when `sample_rate` is in samples per second).
        """
        audio_latent_frame = torch.arange(start_latent, end_latent, dtype=dtype, device=device)
        audio_mel_frame = audio_latent_frame * self.audio_latent_downsample_factor
        if self.is_causal:
            # Causal case: shift back by (factor - 1) mel frames and never go
            # below frame 0.
            audio_mel_frame = (audio_mel_frame + 1 - self.audio_latent_downsample_factor).clip(min=0)
        return audio_mel_frame * self.hop_length / self.sample_rate

    def patchify(self, audio_latents: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Turn audio latents into tokens plus per-token timings.

        Args:
            audio_latents (torch.Tensor): Tensor of shape (batch, channels, time, freq).

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Tokens of shape
            (batch, time, channels * freq), and timings of shape (batch, 1, time),
            or (batch, 1, time, 2) holding (start, end) when `self.start_end` is True.
        """
        b, _, t, _ = audio_latents.shape
        audio_latents = rearrange(
            audio_latents,
            "b c t f -> b t (c f)",
        )

        first = self.shift
        device = audio_latents.device
        start_timings = self._get_audio_latent_time_in_sec(first, first + t, torch.float32, device)
        start_timings = start_timings.unsqueeze(0).expand(b, -1).unsqueeze(1)
        if not self.start_end:
            return audio_latents, start_timings

        # The end of a latent frame is the start of the next one.
        end_timings = self._get_audio_latent_time_in_sec(first + 1, first + t + 1, torch.float32, device)
        end_timings = end_timings.unsqueeze(0).expand(b, -1).unsqueeze(1)
        return audio_latents, torch.stack([start_timings, end_timings], dim=-1)

    def unpatchify(self, audio_latents: torch.Tensor, channels: int, freq: int) -> torch.Tensor:
        """
        Invert `patchify` for the tokens.

        Args:
            audio_latents (torch.Tensor): Tokens of shape (batch, time, channels * freq).
            channels (int): Number of channels to split out of the feature axis.
            freq (int): Number of frequency bins to split out of the feature axis.

        Returns:
            torch.Tensor: Tensor of shape (batch, channels, time, freq).
        """
        audio_latents = rearrange(
            audio_latents, "b t (c f) -> b c t f", c=channels, f=freq
        )
        return audio_latents