Instructions to use vidfom/Ltx-3 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use vidfom/Ltx-3 with llama-cpp-python:

# !pip install llama-cpp-python

from llama_cpp import Llama

llm = Llama.from_pretrained(
	repo_id="vidfom/Ltx-3",
	filename="ComfyUI/models/text_encoders/gemma-3-12b-it-qat-UD-Q4_K_XL.gguf",
)

llm.create_chat_completion(
	messages = "No input example has been defined for this model task."
)

Notebooks
Google Colab
Kaggle
Local Apps Settings

llama.cpp

How to use vidfom/Ltx-3 with llama.cpp:

Install from brew

brew install llama.cpp
# Start a local OpenAI-compatible server with a web UI:
llama-server -hf vidfom/Ltx-3:UD-Q4_K_XL
# Run inference directly in the terminal:
llama-cli -hf vidfom/Ltx-3:UD-Q4_K_XL

Install from WinGet (Windows)

winget install llama.cpp
# Start a local OpenAI-compatible server with a web UI:
llama-server -hf vidfom/Ltx-3:UD-Q4_K_XL
# Run inference directly in the terminal:
llama-cli -hf vidfom/Ltx-3:UD-Q4_K_XL

Use pre-built binary

# Download pre-built binary from:
# https://github.com/ggerganov/llama.cpp/releases
# Start a local OpenAI-compatible server with a web UI:
./llama-server -hf vidfom/Ltx-3:UD-Q4_K_XL
# Run inference directly in the terminal:
./llama-cli -hf vidfom/Ltx-3:UD-Q4_K_XL

Build from source code

git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
cmake -B build
cmake --build build -j --target llama-server llama-cli
# Start a local OpenAI-compatible server with a web UI:
./build/bin/llama-server -hf vidfom/Ltx-3:UD-Q4_K_XL
# Run inference directly in the terminal:
./build/bin/llama-cli -hf vidfom/Ltx-3:UD-Q4_K_XL

Use Docker

docker model run hf.co/vidfom/Ltx-3:UD-Q4_K_XL

LM Studio
Jan
Ollama
How to use vidfom/Ltx-3 with Ollama:
```
ollama run hf.co/vidfom/Ltx-3:UD-Q4_K_XL
```

Unsloth Studio

How to use vidfom/Ltx-3 with Unsloth Studio:

Install Unsloth Studio (macOS, Linux, WSL)

curl -fsSL https://unsloth.ai/install.sh | sh
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for vidfom/Ltx-3 to start chatting

Install Unsloth Studio (Windows)

irm https://unsloth.ai/install.ps1 | iex
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for vidfom/Ltx-3 to start chatting

Using HuggingFace Spaces for Unsloth

# No setup required
# Open https://huggingface.co/spaces/unsloth/studio in your browser
# Search for vidfom/Ltx-3 to start chatting

Docker Model Runner
How to use vidfom/Ltx-3 with Docker Model Runner:
```
docker model run hf.co/vidfom/Ltx-3:UD-Q4_K_XL
```

Lemonade

How to use vidfom/Ltx-3 with Lemonade:

Pull the model

# Download Lemonade from https://lemonade-server.ai/
lemonade pull vidfom/Ltx-3:UD-Q4_K_XL

Run and chat with the model

lemonade run user.Ltx-3-UD-Q4_K_XL

List all available models

lemonade list

Ltx-3 / ComfyUI /custom_nodes /ComfyUI-LTXVideo /guide.py

vidfom

Upload folder using huggingface_hub

e00eceb verified 2 months ago

raw

history blame contribute delete

15.7 kB

	import comfy
	import comfy_extras.nodes_lt as nodes_lt
	import comfy_extras.nodes_post_processing as post_processing
	import nodes

	from .nodes_registry import comfy_node


	def blur_internal(image, blur_radius):
	if blur_radius > 0:
	# https://docs.opencv.org/2.4/modules/imgproc/doc/filtering.html#getgaussiankernel
	# sigma = 0.3 * blur_radius + 0.5 is what is recommended in the OpenCV doc for the
	# relationship between sigma and kernel size 2*blur_radius + 1, however we want somewhat weaker
	# blurring, so we use 0.3 * blur_radius instead, reducing the sigma value by 0.5
	sigma = 0.3 * blur_radius
	image = post_processing.Blur.execute(image, blur_radius, sigma)[0]
	return image


	@comfy_node(name="LTXVAddGuideAdvanced")
	class LTXVAddGuideAdvanced:
	@classmethod
	def INPUT_TYPES(s):
	return {
	"required": {
	"positive": ("CONDITIONING",),
	"negative": ("CONDITIONING",),
	"vae": ("VAE",),
	"latent": ("LATENT",),
	"image": ("IMAGE",),
	"frame_idx": (
	"INT",
	{
	"default": 0,
	"min": -9999,
	"max": 9999,
	"tooltip": "Frame index to start the conditioning at. For single-frame images or "
	"videos with 1-8 frames, any frame_idx value is acceptable. For videos with 9+ "
	"frames, frame_idx must be divisible by 8, otherwise it will be rounded down to "
	"the nearest multiple of 8. Negative values are counted from the end of the video.",
	},
	),
	"strength": (
	"FLOAT",
	{
	"default": 1.0,
	"min": 0.0,
	"max": 1.0,
	"tooltip": "Strength of the conditioning. Higher values will make the conditioning more exact.",
	},
	),
	"crf": (
	"INT",
	{
	"default": 29,
	"min": 0,
	"max": 51,
	"step": 1,
	"tooltip": "CRF value for the video. Higher values mean more motion, lower values mean higher quality.",
	},
	),
	"blur_radius": (
	"INT",
	{
	"default": 0,
	"min": 0,
	"max": 7,
	"step": 1,
	"tooltip": "Blur kernel radius size. Higher values mean more motion, lower values mean higher quality.",
	},
	),
	"interpolation": (
	[
	"lanczos",
	"bislerp",
	"nearest",
	"bilinear",
	"bicubic",
	"area",
	"nearest-exact",
	],
	{"default": "lanczos"},
	),
	"crop": (["center", "disabled"], {"default": "disabled"}),
	}
	}

	RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
	RETURN_NAMES = ("positive", "negative", "latent")

	CATEGORY = "conditioning/video_models"
	FUNCTION = "generate"

	DESCRIPTION = (
	"Adds a conditioning frame or a video at a specific frame index. "
	"This node is used to add a keyframe or a video segment which should appear in the "
	"generated video at a specified index. It resizes the image to the correct size and "
	"applies preprocessing to it."
	)

	def generate(
	self,
	positive,
	negative,
	vae,
	latent,
	image,
	frame_idx,
	strength,
	crf,
	blur_radius,
	interpolation,
	crop,
	):
	_, width_scale_factor, height_scale_factor = vae.downscale_index_formula
	width, height = (
	latent["samples"].shape[4] * width_scale_factor,
	latent["samples"].shape[3] * height_scale_factor,
	)
	image = (
	comfy.utils.common_upscale(
	image.movedim(-1, 1), width, height, interpolation, crop=crop
	)
	.movedim(1, -1)
	.clamp(0, 1)
	)
	image = nodes_lt.LTXVPreprocess().execute(image, crf)[0]
	image = blur_internal(image, blur_radius)
	return nodes_lt.LTXVAddGuide().execute(
	positive=positive,
	negative=negative,
	vae=vae,
	latent=latent,
	image=image,
	frame_idx=frame_idx,
	strength=strength,
	)


	@comfy_node(name="LTXVImgToVideoAdvanced")
	class LTXVImgToVideoAdvanced:
	@classmethod
	def INPUT_TYPES(s):
	return {
	"required": {
	"positive": ("CONDITIONING",),
	"negative": ("CONDITIONING",),
	"vae": ("VAE",),
	"image": ("IMAGE",),
	"width": (
	"INT",
	{
	"default": 768,
	"min": 64,
	"max": nodes.MAX_RESOLUTION,
	"step": 32,
	},
	),
	"height": (
	"INT",
	{
	"default": 512,
	"min": 64,
	"max": nodes.MAX_RESOLUTION,
	"step": 32,
	},
	),
	"length": (
	"INT",
	{"default": 97, "min": 9, "max": nodes.MAX_RESOLUTION, "step": 8},
	),
	"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
	"crf": (
	"INT",
	{
	"default": 29,
	"min": 0,
	"max": 51,
	"step": 1,
	"tooltip": "CRF value for the video. Higher values mean more motion, lower values mean higher quality.",
	},
	),
	"blur_radius": (
	"INT",
	{
	"default": 0,
	"min": 0,
	"max": 7,
	"step": 1,
	"tooltip": "Blur kernel radius size. Higher values mean more motion, lower values mean higher quality.",
	},
	),
	"interpolation": (
	[
	"lanczos",
	"bislerp",
	"nearest",
	"bilinear",
	"bicubic",
	"area",
	"nearest-exact",
	],
	{"default": "lanczos"},
	),
	"crop": (["center", "disabled"], {"default": "disabled"}),
	"strength": ("FLOAT", {"default": 0.9, "min": 0, "max": 1}),
	}
	}

	RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
	RETURN_NAMES = ("positive", "negative", "latent")

	CATEGORY = "conditioning/video_models"
	FUNCTION = "generate"

	DESCRIPTION = (
	"Adds a conditioning frame or a video at index 0. "
	"This node is used to add a keyframe or a video segment which should appear in the "
	"generated video at index 0. It resizes the image to the correct size "
	"and applies preprocessing to it."
	)

	def generate(
	self,
	positive,
	negative,
	vae,
	image,
	width,
	height,
	length,
	batch_size,
	crf,
	blur_radius,
	interpolation,
	crop,
	strength,
	):
	image = comfy.utils.common_upscale(
	image.movedim(-1, 1), width, height, interpolation, crop=crop
	).movedim(1, -1)
	image = nodes_lt.LTXVPreprocess().execute(image, crf)[0]
	image = blur_internal(image, blur_radius)
	return nodes_lt.LTXVImgToVideo().execute(
	positive=positive,
	negative=negative,
	vae=vae,
	image=image,
	width=width,
	height=height,
	length=length,
	batch_size=batch_size,
	strength=strength,
	)


	@comfy_node(name="LTXVAddGuideAdvancedAttention")
	class LTXVAddGuideAdvancedAttention:
	"""Extended keyframe guide node with per-guide attention strength control.

	Same preprocessing as LTXVAddGuideAdvanced (CRF, blur, interpolation, crop),
	plus attention_strength and attention_mask inputs to control how strongly
	this guide's conditioning influences generation via self-attention.
	"""

	@classmethod
	def INPUT_TYPES(s):
	return {
	"required": {
	"positive": ("CONDITIONING",),
	"negative": ("CONDITIONING",),
	"vae": ("VAE",),
	"latent": ("LATENT",),
	"image": ("IMAGE",),
	"frame_idx": (
	"INT",
	{
	"default": 0,
	"min": -9999,
	"max": 9999,
	"tooltip": (
	"Frame index to start the conditioning at. "
	"Negative values are counted from the end of the video."
	),
	},
	),
	"strength": (
	"FLOAT",
	{
	"default": 1.0,
	"min": 0.0,
	"max": 1.0,
	"tooltip": "Strength of the conditioning. Higher values make it more exact.",
	},
	),
	"crf": (
	"INT",
	{
	"default": 29,
	"min": 0,
	"max": 51,
	"step": 1,
	"tooltip": "CRF value. Higher = more motion, lower = higher quality.",
	},
	),
	"blur_radius": (
	"INT",
	{
	"default": 0,
	"min": 0,
	"max": 7,
	"step": 1,
	"tooltip": "Blur kernel radius. Higher = more motion.",
	},
	),
	"interpolation": (
	[
	"lanczos",
	"bislerp",
	"nearest",
	"bilinear",
	"bicubic",
	"area",
	"nearest-exact",
	],
	{"default": "lanczos"},
	),
	"crop": (["center", "disabled"], {"default": "disabled"}),
	"attention_strength": (
	"FLOAT",
	{
	"default": 1.0,
	"min": 0.0,
	"max": 1.0,
	"step": 0.01,
	"tooltip": (
	"Controls how strongly this guide influences generation via "
	"self-attention. 1.0 = full conditioning, 0.0 = ignore."
	),
	},
	),
	},
	"optional": {
	"attention_mask": (
	"MASK",
	{
	"tooltip": (
	"Optional pixel-space spatial mask. Shape (F, H, W) or (H, W). "
	"Values in [0, 1]. Controls per-region conditioning influence. "
	"Multiplied by attention_strength."
	),
	},
	),
	},
	}

	RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
	RETURN_NAMES = ("positive", "negative", "latent")

	CATEGORY = "conditioning/video_models"
	FUNCTION = "generate"

	DESCRIPTION = (
	"Adds a conditioning frame/video at a specific frame index with per-guide "
	"attention strength control. Same preprocessing as LTXVAddGuideAdvanced, "
	"plus attention_strength and optional spatial attention_mask."
	)

	def generate(
	self,
	positive,
	negative,
	vae,
	latent,
	image,
	frame_idx,
	strength,
	crf,
	blur_radius,
	interpolation,
	crop,
	attention_strength=1.0,
	attention_mask=None,
	):
	from .iclora_attention import append_guide_attention_entry, normalize_mask

	# Preprocessing: resize, CRF, blur (same as LTXVAddGuideAdvanced)
	scale_factors = vae.downscale_index_formula
	_, width_scale_factor, height_scale_factor = scale_factors
	latent_image = latent["samples"]
	noise_mask = nodes_lt.get_noise_mask(latent)
	_, _, latent_length, latent_height, latent_width = latent_image.shape

	width = latent_width * width_scale_factor
	height = latent_height * height_scale_factor
	image = (
	comfy.utils.common_upscale(
	image.movedim(-1, 1), width, height, interpolation, crop=crop
	)
	.movedim(1, -1)
	.clamp(0, 1)
	)
	image = nodes_lt.LTXVPreprocess().execute(image, crf)[0]
	image = blur_internal(image, blur_radius)

	# Encode
	_, t = nodes_lt.LTXVAddGuide.encode(
	vae, latent_width, latent_height, image, scale_factors
	)

	# Compute latent index
	frame_idx, latent_idx = nodes_lt.LTXVAddGuide.get_latent_index(
	positive, latent_length, len(image), frame_idx, scale_factors
	)
	assert (
	latent_idx + t.shape[2] <= latent_length
	), "Conditioning frames exceed the length of the latent sequence."

	# Append keyframe
	positive, negative, latent_image, noise_mask = (
	nodes_lt.LTXVAddGuide.append_keyframe(
	positive,
	negative,
	frame_idx,
	latent_image,
	noise_mask,
	t,
	strength,
	scale_factors,
	)
	)

	# Track with custom attention strength/mask
	pre_filter_count = t.shape[2] * t.shape[3] * t.shape[4]
	guide_latent_shape = list(t.shape[2:])
	norm_mask = normalize_mask(attention_mask)
	positive = append_guide_attention_entry(
	positive,
	pre_filter_count,
	guide_latent_shape,
	attention_strength=attention_strength,
	attention_mask=norm_mask,
	)
	negative = append_guide_attention_entry(
	negative,
	pre_filter_count,
	guide_latent_shape,
	attention_strength=attention_strength,
	attention_mask=norm_mask,
	)

	return (
	positive,
	negative,
	{"samples": latent_image, "noise_mask": noise_mask},
	)