|
|
""" |
|
|
Audio-visual media captioning using multimodal models. |
|
|
|
|
|
This module provides captioning for videos (optionally with audio) and images using:
|
|
- Qwen2.5-Omni: Local model supporting text, audio, image, and video inputs (default) |
|
|
- Gemini Flash: Cloud-based API for audio-visual captioning |
|
|
|
|
|
Requirements: |
|
|
- Qwen2.5-Omni: transformers>=4.52, torch
|
|
- Gemini Flash: google-generativeai (pip install google-generativeai) |
|
|
Set GEMINI_API_KEY or GOOGLE_API_KEY environment variable |
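
Example (minimal sketch; assumes a local video file):
    captioner = create_captioner(CaptionerType.QWEN_OMNI)
    print(captioner.caption("clip.mp4"))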
|
|
""" |
|
|
|
|
|
import itertools |
|
|
import re |
|
|
from abc import ABC, abstractmethod |
|
|
from enum import Enum |
|
|
from pathlib import Path |
|
|
|
|
|
import torch |
|
|
|
|
|
|
|
|
DEFAULT_CAPTION_INSTRUCTION = """\ |
|
|
Analyze this media and provide a detailed caption in the following EXACT format. Fill in ALL sections: |
|
|
|
|
|
[VISUAL]: <Detailed description of people, objects, actions, settings, colors, and movements> |
|
|
[SPEECH]: <Word-for-word transcription of everything spoken. Listen carefully and transcribe the exact words. If no speech, write "None">
|
|
[SOUNDS]: <Description of music, ambient sounds, sound effects. If none, write "None"> |
|
|
[TEXT]: <Any on-screen text visible. If none, write "None"> |
|
|
|
|
|
You MUST fill in all four sections. For [SPEECH], transcribe the actual words spoken, not a summary.""" |
|
|
|
|
|
|
|
|
VIDEO_ONLY_CAPTION_INSTRUCTION = """\ |
|
|
Analyze this media and provide a detailed caption in the following EXACT format. Fill in ALL sections: |
|
|
|
|
|
[VISUAL]: <Detailed description of people, objects, actions, settings, colors, and movements> |
|
|
[TEXT]: <Any on-screen text visible. If none, write "None"> |
|
|
|
|
|
You MUST fill in both sections.""" |
|
|
|
|
|
|
|
|
class CaptionerType(str, Enum): |
|
|
"""Enum for different types of media captioners.""" |
|
|
|
|
|
QWEN_OMNI = "qwen_omni" |
|
|
GEMINI_FLASH = "gemini_flash" |
|
|
|
|
|
|
|
|
def create_captioner(captioner_type: CaptionerType, **kwargs) -> "MediaCaptioningModel": |
|
|
"""Factory function to create a media captioner. |
|
|
|
|
|
Args: |
|
|
captioner_type: The type of captioner to create |
|
|
**kwargs: Additional arguments to pass to the captioner constructor |
|
|
|
|
|
Returns: |
|
|
An instance of a MediaCaptioningModel |
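
    Example (forwarding a keyword argument to the Qwen captioner):
        captioner = create_captioner(CaptionerType.QWEN_OMNI, use_8bit=True)
        caption = captioner.caption("clip.mp4")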
|
|
""" |
|
|
match captioner_type: |
|
|
case CaptionerType.QWEN_OMNI: |
|
|
return QwenOmniCaptioner(**kwargs) |
|
|
case CaptionerType.GEMINI_FLASH: |
|
|
return GeminiFlashCaptioner(**kwargs) |
|
|
case _: |
|
|
raise ValueError(f"Unsupported captioner type: {captioner_type}") |
|
|
|
|
|
|
|
|
class MediaCaptioningModel(ABC): |
|
|
"""Abstract base class for audio-visual media captioning models.""" |
|
|
|
|
|
@abstractmethod |
|
|
def caption(self, path: str | Path, **kwargs) -> str: |
|
|
"""Generate a caption for the given video or image. |
|
|
|
|
|
Args: |
|
|
            path: Path to the video/image file to caption
            **kwargs: Captioner-specific options (e.g., fps, include_audio)
|
|
|
|
|
Returns: |
|
|
A string containing the generated caption |
|
|
""" |
|
|
|
|
|
@property |
|
|
@abstractmethod |
|
|
def supports_audio(self) -> bool: |
|
|
"""Whether this captioner supports audio input.""" |
|
|
|
|
|
@staticmethod |
|
|
def _is_image_file(path: str | Path) -> bool: |
|
|
"""Check if the file is an image based on extension.""" |
|
|
return str(path).lower().endswith((".png", ".jpg", ".jpeg", ".heic", ".heif", ".webp")) |
|
|
|
|
|
@staticmethod |
|
|
def _is_video_file(path: str | Path) -> bool: |
|
|
"""Check if the file is a video based on extension.""" |
|
|
return str(path).lower().endswith((".mp4", ".avi", ".mov", ".mkv", ".webm")) |
|
|
|
|
|
@staticmethod |
|
|
def _clean_raw_caption(caption: str) -> str: |
|
|
"""Clean up the raw caption by removing common VLM patterns.""" |
|
|
        starters = ["The", "This"]
        subjects = ["video", "image", "scene", "animated sequence", "clip", "footage"]
        verbs = ["displays", "shows", "features", "depicts", "presents", "showcases", "captures", "contains"]

        for starter, subject, verb in itertools.product(starters, subjects, verbs):
            caption = caption.replace(f"{starter} {subject} {verb} ", "", 1)
|
|
|
|
|
return caption |
|
|
|
|
|
|
|
|
class QwenOmniCaptioner(MediaCaptioningModel): |
|
|
"""Audio-visual captioning using Alibaba's Qwen2.5-Omni model. |
|
|
|
|
|
Qwen2.5-Omni is an end-to-end multimodal model that can perceive text, images, audio, and video. |
|
|
It uses a Thinker-Talker architecture where the Thinker generates text and the Talker can |
|
|
generate speech. For captioning, we use only the Thinker component for text generation. |
|
|
|
|
|
Key features: |
|
|
- Block-wise processing for streaming multimodal inputs |
|
|
- TMRoPE (Time-aligned Multimodal RoPE) for synchronizing video and audio timestamps |
|
|
- Can extract and process audio directly from video files |
|
|
|
|
|
See: https://huggingface.co/docs/transformers/en/model_doc/qwen2_5_omni |
|
|
|
|
|
Model: Qwen/Qwen2.5-Omni-7B (7B parameters) |
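
    Example (minimal sketch; the first call downloads the checkpoint):
        captioner = QwenOmniCaptioner(use_8bit=True)
        caption = captioner.caption("clip.mp4", fps=2)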
|
|
""" |
|
|
|
|
|
MODEL_ID = "Qwen/Qwen2.5-Omni-7B" |
|
|
|
|
|
|
|
|
DEFAULT_SYSTEM_PROMPT = ( |
|
|
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, " |
|
|
"capable of perceiving auditory and visual inputs, as well as generating text and speech." |
|
|
) |
|
|
|
|
|
def __init__( |
|
|
self, |
|
|
device: str | torch.device | None = None, |
|
|
use_8bit: bool = False, |
|
|
instruction: str | None = None, |
|
|
): |
|
|
""" |
|
|
Initialize the Qwen2.5-Omni captioner. |
|
|
|
|
|
Args: |
|
|
device: Device to use for inference (e.g., 'cuda', 'cuda:0', 'cpu') |
|
|
use_8bit: Whether to use 8-bit quantization for reduced memory usage |
|
|
instruction: Custom instruction prompt. If None, uses the default instruction |
|
|
""" |
|
|
self.device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu")) |
|
|
self.instruction = instruction |
|
|
self._load_model(use_8bit=use_8bit) |
|
|
|
|
|
@property |
|
|
def supports_audio(self) -> bool: |
|
|
return True |
|
|
|
|
|
def caption( |
|
|
self, |
|
|
path: str | Path, |
|
|
fps: int = 1, |
|
|
include_audio: bool = True, |
|
|
clean_caption: bool = True, |
|
|
) -> str: |
|
|
"""Generate a caption for the given video or image. |
|
|
|
|
|
Args: |
|
|
path: Path to the video/image file to caption |
|
|
fps: Frames per second to sample from videos |
|
|
include_audio: Whether to include audio in the captioning (for videos) |
|
|
clean_caption: Whether to clean up the raw caption by removing common VLM patterns |
|
|
|
|
|
Returns: |
|
|
A string containing the generated caption |
|
|
""" |
|
|
path = Path(path) |
|
|
is_image = self._is_image_file(path) |
|
|
is_video = self._is_video_file(path) |
|
|
|
|
|
|
|
|
use_audio = include_audio and is_video |
|
|
|
|
|
|
|
|
if self.instruction is not None: |
|
|
instruction = self.instruction |
|
|
else: |
|
|
instruction = DEFAULT_CAPTION_INSTRUCTION if use_audio else VIDEO_ONLY_CAPTION_INSTRUCTION |
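
        # Build the user turn: the media file first, then the captioning instruction.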
|
|
|
|
|
|
|
|
|
|
|
user_content = [] |
|
|
|
|
|
if is_image: |
|
|
user_content.append({"type": "image", "image": str(path)}) |
|
|
elif is_video: |
|
|
user_content.append({"type": "video", "video": str(path)}) |
|
|
|
|
|
|
|
|
user_content.append({"type": "text", "text": instruction}) |
|
|
|
|
|
|
|
|
|
|
|
messages = [ |
|
|
{ |
|
|
"role": "system", |
|
|
"content": [{"type": "text", "text": self.DEFAULT_SYSTEM_PROMPT}], |
|
|
}, |
|
|
{"role": "user", "content": user_content}, |
|
|
] |
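
        # Tokenize the chat; use_audio_in_video here must match the value
        # passed to generate() below so audio and video frames stay aligned.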
|
|
|
|
|
|
|
|
|
|
|
inputs = self.processor.apply_chat_template( |
|
|
messages, |
|
|
load_audio_from_video=use_audio, |
|
|
add_generation_prompt=True, |
|
|
tokenize=True, |
|
|
return_dict=True, |
|
|
return_tensors="pt", |
|
|
fps=fps, |
|
|
padding=True, |
|
|
use_audio_in_video=use_audio, |
|
|
).to(self.model.device) |
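
        # Greedy decoding (do_sample=False) keeps captions deterministic.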
|
|
|
|
|
|
|
|
|
|
|
|
|
|
input_len = inputs["input_ids"].shape[1] |
|
|
|
|
|
output_tokens = self.model.generate( |
|
|
**inputs, |
|
|
use_audio_in_video=use_audio, |
|
|
do_sample=False, |
|
|
max_new_tokens=1024, |
|
|
) |
|
|
|
|
|
|
|
|
generated_tokens = output_tokens[:, input_len:] |
|
|
|
|
|
|
|
|
caption_raw = self.processor.batch_decode( |
|
|
generated_tokens, |
|
|
skip_special_tokens=True, |
|
|
clean_up_tokenization_spaces=False, |
|
|
)[0] |
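
        # The model occasionally continues with a hallucinated "Human:" turn;
        # keep only the text before the first one.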
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
caption_raw = re.split(r"\nHuman(?::|(?:\s*\n)|$)", caption_raw, maxsplit=1)[0] |
|
|
caption_raw = caption_raw.strip() |
|
|
|
|
|
|
|
|
return self._clean_raw_caption(caption_raw) if clean_caption else caption_raw |
|
|
|
|
|
def _load_model(self, use_8bit: bool) -> None: |
|
|
"""Load the Qwen2.5-Omni model and processor. |
|
|
|
|
|
Uses the Thinker-only model (Qwen2_5OmniThinkerForConditionalGeneration) for text generation |
|
|
to save compute by not loading the audio generation components. |
|
|
""" |
|
|
from transformers import ( |
|
|
BitsAndBytesConfig, |
|
|
Qwen2_5OmniProcessor, |
|
|
Qwen2_5OmniThinkerForConditionalGeneration, |
|
|
) |
|
|
|
|
|
quantization_config = BitsAndBytesConfig(load_in_8bit=True) if use_8bit else None |
|
|
|
|
|
|
|
|
        self.model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
            self.MODEL_ID,
            dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            quantization_config=quantization_config,
            # bitsandbytes placement requires "auto"; otherwise honor the
            # device requested in __init__.
            device_map="auto" if use_8bit else str(self.device),
        )
|
|
|
|
|
self.processor = Qwen2_5OmniProcessor.from_pretrained(self.MODEL_ID) |
|
|
|
|
|
|
|
|
class GeminiFlashCaptioner(MediaCaptioningModel): |
|
|
"""Audio-visual captioning using Google's Gemini Flash API. |
|
|
|
|
|
Gemini Flash is a cloud-based multimodal model that natively supports |
|
|
audio and video understanding. Requires a Google API key. |
|
|
|
|
|
Note: This captioner requires the `google-generativeai` package and a valid API key. |
|
|
Set the GEMINI_API_KEY or GOOGLE_API_KEY environment variable, or pass the key directly. |
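
    Example (minimal sketch; assumes an API key in the environment):
        captioner = GeminiFlashCaptioner()
        caption = captioner.caption("clip.mp4")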
|
|
""" |
|
|
|
|
|
MODEL_ID = "gemini-flash-lite-latest" |
|
|
|
|
|
def __init__( |
|
|
self, |
|
|
api_key: str | None = None, |
|
|
instruction: str | None = None, |
|
|
): |
|
|
"""Initialize the Gemini Flash captioner. |
|
|
|
|
|
Args: |
|
|
api_key: Google API key. If not provided, will look for |
|
|
GEMINI_API_KEY or GOOGLE_API_KEY environment variable. |
|
|
instruction: Custom instruction prompt. If None, uses the default instruction |
|
|
""" |
|
|
self.instruction = instruction |
|
|
self._init_client(api_key) |
|
|
|
|
|
@property |
|
|
def supports_audio(self) -> bool: |
|
|
return True |
|
|
|
|
|
def caption( |
|
|
self, |
|
|
path: str | Path, |
|
|
fps: int = 3, |
|
|
include_audio: bool = True, |
|
|
clean_caption: bool = True, |
|
|
) -> str: |
|
|
"""Generate a caption for the given video or image. |
|
|
|
|
|
Args: |
|
|
path: Path to the video/image file to caption |
|
|
fps: Frames per second (not used for Gemini, kept for API compatibility) |
|
|
include_audio: Whether to include audio content in the caption |
|
|
clean_caption: Whether to clean up the raw caption |
|
|
|
|
|
Returns: |
|
|
A string containing the generated caption |
|
|
""" |
|
|
import time |
|
|
|
|
|
path = Path(path) |
|
|
is_video = self._is_video_file(path) |
|
|
use_audio = include_audio and is_video |
|
|
|
|
|
|
|
|
if self.instruction is not None: |
|
|
instruction = self.instruction |
|
|
else: |
|
|
instruction = DEFAULT_CAPTION_INSTRUCTION if use_audio else VIDEO_ONLY_CAPTION_INSTRUCTION |
|
|
|
|
|
|
|
|
uploaded_file = self._genai.upload_file(path) |
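
        # The Files API processes uploads asynchronously; poll until the file
        # leaves the PROCESSING state before generating content.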
|
|
|
|
|
|
|
|
while uploaded_file.state.name == "PROCESSING": |
|
|
time.sleep(1) |
|
|
uploaded_file = self._genai.get_file(uploaded_file.name) |
|
|
|
|
|
        if uploaded_file.state.name == "FAILED":
            raise RuntimeError(f"Gemini file processing failed for {path}")
|
|
|
|
|
|
|
|
        try:
            response = self._model.generate_content([uploaded_file, instruction])
            caption_raw = response.text
        finally:
            # Delete the uploaded file even if generation fails.
            self._genai.delete_file(uploaded_file.name)
|
|
|
|
|
|
|
|
return self._clean_raw_caption(caption_raw) if clean_caption else caption_raw |
|
|
|
|
|
def _init_client(self, api_key: str | None) -> None: |
|
|
"""Initialize the Gemini API client.""" |
|
|
import os |
|
|
|
|
|
try: |
|
|
import google.generativeai as genai |
|
|
except ImportError as e: |
|
|
raise ImportError( |
|
|
"The `google-generativeai` package is required for Gemini Flash captioning. " |
|
|
"Install it with: `uv pip install google-generativeai`" |
|
|
) from e |
|
|
|
|
|
|
|
|
|
|
|
resolved_api_key = api_key or os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY") |
|
|
|
|
|
if not resolved_api_key: |
|
|
raise ValueError( |
|
|
"Gemini API key is required. Provide it via the `api_key` argument " |
|
|
"or set the GEMINI_API_KEY or GOOGLE_API_KEY environment variable." |
|
|
) |
|
|
|
|
|
|
|
|
genai.configure(api_key=resolved_api_key) |
|
|
|
|
|
|
|
|
self._genai = genai |
|
|
|
|
|
|
|
|
self._model = genai.GenerativeModel(self.MODEL_ID) |
|
|
|
|
|
|
|
|
def example() -> None: |
|
|
"""Example usage of the captioning module.""" |
|
|
import sys |
|
|
|
|
|
if len(sys.argv) < 2: |
|
|
print(f"Usage: python {sys.argv[0]} <video_path> [captioner_type]") |
|
|
print(" captioner_type: qwen_omni (default) or gemini_flash") |
|
|
sys.exit(1) |
|
|
|
|
|
video_path = sys.argv[1] |
|
|
captioner_type = CaptionerType(sys.argv[2]) if len(sys.argv) > 2 else CaptionerType.QWEN_OMNI |
|
|
|
|
|
print(f"Using {captioner_type.value} captioner:") |
|
|
captioner = create_captioner(captioner_type) |
|
|
caption = captioner.caption(video_path) |
|
|
print(f"CAPTION: {caption}") |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
example() |
|
|
|