Iris1.5 / model.py

Add model.py source

14e3915 verified 24 days ago

18.6 kB

	"""IRIS-2 VLM.

	Architecture (LLaVA-style):
	- Vision: nvidia/RADIO [FROZEN]
	- Language: ByteDance/Ouro-1.4B (LoopLM) [FROZEN, or LoRA adapters when ``use_lora``]
	- Connector: configurable-depth MLP projector [TRAINED — always full weights]

	Trainable parameters: the MLP projector; optionally PEFT LoRA on selected Ouro linears.
	Default LoRA targets are attention + MLP projections; set ``lora_target_modules`` to
	``["early_exit_gate"]`` to train only the universal-transformer / ACT exit head (and the
	projector), leaving the rest of the LM frozen. Set ``lora_edge_layers: N`` to attach LoRA only
	to the first N and last N Ouro decoder blocks (N is a block count, not the LoRA rank), via
	PEFT ``layers_to_transform``.
	"""

	from __future__ import annotations

	from collections.abc import Sequence
	from dataclasses import dataclass
	from typing import Any, Optional

	import torch
	import torch.nn as nn
	import torchvision.transforms as T
	from PIL import Image
	from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


	# RADIO vision encoder: repository and version tag handed to ``torch.hub.load``.
	RADIO_HUB_REPO = "NVlabs/RADIO"
	RADIO_VERSION = "radio_v2.5-h"
	# Ouro language model / tokenizer id handed to the ``transformers`` ``from_pretrained`` loaders.
	OURO_NAME = "ByteDance/Ouro-1.4B"


	@dataclass
	class IrisConfig:
	    """Build-time settings for ``IrisVLM``.

	    Groups the backbone identifiers, the projector shape, dtype / compile options and
	    the optional Ouro LoRA settings. All fields have defaults, so ``IrisConfig()`` is a
	    valid configuration.
	    """

	    # --- backbones ---
	    radio_version: str = RADIO_VERSION  # torch.hub version tag
	    radio_repo: str = RADIO_HUB_REPO
	    ouro_name: str = OURO_NAME
	    image_size: int = 432  # must be divisible by RADIO patch size (16)

	    # --- projector ---
	    projector_hidden_mult: int = 1  # H = mult * llm_hidden
	    projector_num_intermediate: int = 1  # H-wide layers before last Linear → llm_hidden

	    # --- dtype / compilation ---
	    torch_dtype: torch.dtype = torch.bfloat16
	    # NOTE(review): ``compile_mode`` / ``compile_dynamic`` are not read by ``IrisVLM`` itself;
	    # presumably the caller forwards them to ``IrisVLM.compile_components`` — confirm.
	    compile_mode: Optional[str] = None  # e.g. "default", "reduce-overhead", "max-autotune"
	    compile_dynamic: bool = True  # allow dynamic seq lengths
	    # ``eager`` materializes attention weights (needed for viz); default lets HF pick (often SDPA).
	    llm_attn_implementation: Optional[str] = None

	    # --- Ouro LoRA (PEFT); base LM weights stay frozen, only adapter matrices train ---
	    use_lora: bool = False
	    lora_r: int = 16
	    lora_alpha: int = 32
	    lora_dropout: float = 0.05
	    # Default (None): LoRA on attention + MLP linears. Use ``["early_exit_gate"]`` for ACT gate only.
	    lora_target_modules: Optional[list[str]] = None
	    # If set (e.g. 4), apply LoRA only on the first/last N decoder blocks (Ouro `layers.0` …
	    # `layers.N-1`), via PEFT ``layers_to_transform``. Ignored for `early_exit_gate`-only LoRA
	    # unless ``lora_dual_edge_and_gate`` is true. Cannot mix gate + layer in one adapter;
	    # use ``lora_dual_edge_and_gate: true`` instead (two PEFT adapters).
	    lora_edge_layers: Optional[int] = None
	    # If true: two LoRA adapters — (1) edge blocks on ``lora_target_modules`` or defaults,
	    # (2) ``early_exit_gate``. Requires ``lora_edge_layers`` and ``use_lora``.
	    lora_dual_edge_and_gate: bool = False


	class RadioImageTransform:
	    """Turn PIL images into the ``pixel_values`` batch that RADIO consumes.

	    Only a bicubic resize to a square and a ``ToTensor`` conversion to ``[0, 1]`` are
	    applied; per-channel normalization is left to RADIO's own ``input_conditioner``.
	    """

	    def __init__(self, image_size: int):
	        """Store the target side length and build the resize + tensorize pipeline."""
	        self.image_size = image_size
	        resize = T.Resize(
	            (image_size, image_size), interpolation=T.InterpolationMode.BICUBIC
	        )
	        to_tensor = T.ToTensor()  # HWC uint8 -> CHW float in [0, 1]
	        self._tx = T.Compose([resize, to_tensor])

	    def __call__(self, images, return_tensors: str = "pt"):
	        """Preprocess one PIL image or an iterable of PIL images.

	        Returns ``{"pixel_values": batch}`` where ``batch`` stacks the per-image tensors
	        along a new leading dimension; it is a torch tensor when ``return_tensors`` is
	        ``"pt"`` and a NumPy array for any other value.
	        """
	        batch = [images] if isinstance(images, Image.Image) else images
	        # Every image is forced to RGB before the transform is applied.
	        pixel_values = torch.stack(
	            [self._tx(picture.convert("RGB")) for picture in batch], dim=0
	        )
	        if return_tensors != "pt":
	            pixel_values = pixel_values.numpy()
	        return {"pixel_values": pixel_values}


	def _mlp_sequential(
	    dims: Sequence[int], act: type[nn.Module] = nn.GELU
	) -> nn.Sequential:
	    """Build a ``Linear`` chain ``dims[0] → … → dims[-1]``.

	    A fresh ``act()`` follows every ``Linear`` except the final one.

	    Raises:
	        ValueError: if ``dims`` has fewer than two entries.
	    """
	    if len(dims) < 2:
	        raise ValueError("mlp must have at least in_dim and out_dim")
	    final_idx = len(dims) - 2  # index of the last Linear, which gets no activation
	    modules: list[nn.Module] = []
	    for idx, (fan_in, fan_out) in enumerate(zip(dims[:-1], dims[1:])):
	        modules.append(nn.Linear(fan_in, fan_out))
	        if idx != final_idx:
	            modules.append(act())
	    return nn.Sequential(*modules)


	class MLPProjector(nn.Module):
	    """Map visual tokens into the LLM embedding width with a GELU MLP.

	    The stack has ``num_intermediate + 1`` ``Linear`` layers: ``num_intermediate``
	    layers of width ``llm_hidden * hidden_mult`` followed by a final ``Linear`` down to
	    ``llm_hidden``. The layers live under ``self.net``.
	    """

	    def __init__(
	        self,
	        vision_dim: int,
	        llm_hidden: int,
	        hidden_mult: int = 1,
	        num_intermediate: int = 1,
	    ):
	        """Create the projector.

	        Raises:
	            ValueError: if ``num_intermediate`` is smaller than 1.
	        """
	        super().__init__()
	        if num_intermediate < 1:
	            raise ValueError("num_intermediate must be >= 1")
	        width = llm_hidden * hidden_mult
	        layer_dims: list[int] = [vision_dim, *([width] * num_intermediate), llm_hidden]
	        self.net = _mlp_sequential(layer_dims)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Apply the MLP to ``x`` (last dimension ``vision_dim`` → ``llm_hidden``)."""
	        return self.net(x)


	# LoRA targets used when ``lora_target_modules`` is unset: the attention and MLP
	# projection linears.
	OURO_DEFAULT_LORA_TARGETS: tuple[str, ...] = (
	    "q_proj",
	    "k_proj",
	    "v_proj",
	    "o_proj",
	    "gate_proj",
	    "up_proj",
	    "down_proj",
	)

	# Ouro ``OuroModel.early_exit_gate`` (Linear → sigmoid): per–UT-step exit / ACT-style head.
	OURO_ACT_GATE_LORA_TARGETS: tuple[str, ...] = ("early_exit_gate",)

	# Submodule names that live under ``model.layers.{i}`` (PEFT `layers_to_transform` applies).
	# Same seven names as the default targets, kept as a set for membership/intersection tests.
	OURO_LAYER_LORA_NAME_PREFIXES: frozenset[str] = frozenset(OURO_DEFAULT_LORA_TARGETS)


	def lora_layer_indices_for_edges(num_hidden_layers: int, edge: int) -> list[int]:
	    """Return the indices of the first ``edge`` and last ``edge`` decoder layers.

	    For ``2 * edge <= num_hidden_layers`` this is ``0..edge-1`` followed by
	    ``num_hidden_layers-edge .. num_hidden_layers-1`` (inclusive). When the two edge
	    windows overlap, or ``edge`` exceeds the layer count, every layer index is returned
	    exactly once. The result is always ascending, free of duplicates and within
	    ``range(num_hidden_layers)``.

	    Raises:
	        ValueError: if ``edge`` or ``num_hidden_layers`` is smaller than 1.
	    """
	    e = int(edge)
	    if e < 1:
	        raise ValueError("edge must be >= 1")
	    n = int(num_hidden_layers)
	    if n < 1:
	        raise ValueError("num_hidden_layers must be >= 1")
	    e = min(e, n)  # never index past the last layer
	    # Start the tail no earlier than where the head stops so overlapping windows
	    # do not yield the same index twice.
	    tail_start = max(e, n - e)
	    return [*range(e), *range(tail_start, n)]


	class IrisVLM(nn.Module):
	    """Frozen RADIO + trainable MLP projector + Ouro-1.4B (frozen or LoRA).

	    Construction loads RADIO through ``torch.hub`` and the Ouro tokenizer, config and
	    weights through ``transformers``, freezes both backbones, optionally wraps the LM
	    with PEFT LoRA adapters, and finally sizes the projector from a dummy RADIO
	    forward pass.
	    """

	    def __init__(self, cfg: Optional["IrisConfig"] = None):
	        """Build the model from ``cfg`` (a default ``IrisConfig()`` when omitted).

	        Raises:
	            ValueError: for inconsistent LoRA settings (see ``_attach_lora``).
	        """
	        super().__init__()
	        self.cfg = cfg or IrisConfig()

	        # nvidia/RADIO via torch.hub (HF's remote code is incompatible with
	        # transformers >= 4.55's per-parameter weight loading). Kept in fp32
	        # because RADIO's input_conditioner has fp32-pinned buffers; we run
	        # the forward under autocast instead of casting weights.
	        self.vision = torch.hub.load(
	            self.cfg.radio_repo,
	            "radio_model",
	            version=self.cfg.radio_version,
	            progress=True,
	            skip_validation=True,
	        )
	        self.image_processor = RadioImageTransform(self.cfg.image_size)

	        self.tokenizer = AutoTokenizer.from_pretrained(
	            self.cfg.ouro_name, trust_remote_code=True
	        )
	        if self.tokenizer.pad_token_id is None:
	            self.tokenizer.pad_token = self.tokenizer.eos_token

	        # OuroConfig omits pad_token_id; inject from the tokenizer so the
	        # model's __init__ can read config.pad_token_id.
	        llm_config = AutoConfig.from_pretrained(self.cfg.ouro_name, trust_remote_code=True)
	        if getattr(llm_config, "pad_token_id", None) is None:
	            llm_config.pad_token_id = self.tokenizer.pad_token_id
	        llm_kwargs: dict = dict(
	            pretrained_model_name_or_path=self.cfg.ouro_name,
	            config=llm_config,
	            trust_remote_code=True,
	            torch_dtype=self.cfg.torch_dtype,
	        )
	        # Only override the attention backend when explicitly configured.
	        if self.cfg.llm_attn_implementation is not None:
	            llm_kwargs["attn_implementation"] = self.cfg.llm_attn_implementation
	        self.llm = AutoModelForCausalLM.from_pretrained(**llm_kwargs)

	        # Freeze RADIO and the Ouro base weights (PEFT adds trainable LoRA side matrices).
	        for p in self.vision.parameters():
	            p.requires_grad_(False)
	        for p in self.llm.parameters():
	            p.requires_grad_(False)
	        self.vision.eval()

	        self._use_lora = bool(self.cfg.use_lora)
	        self._lora_dual_edge_gate = False
	        if self._use_lora:
	            self._attach_lora()
	        else:
	            self.llm.eval()

	        vision_dim = self._probe_vision_dim()
	        llm_hidden = self.llm.config.hidden_size

	        self.projector = MLPProjector(
	            vision_dim=vision_dim,
	            llm_hidden=llm_hidden,
	            hidden_mult=self.cfg.projector_hidden_mult,
	            num_intermediate=self.cfg.projector_num_intermediate,
	        ).to(self.cfg.torch_dtype)

	    def _attach_lora(self) -> None:
	        """Wrap ``self.llm`` with PEFT LoRA adapter(s) according to ``self.cfg``.

	        Two layouts are supported:

	        * ``lora_dual_edge_and_gate``: adapter ``"edge_lora"`` on the edge decoder
	          blocks plus adapter ``"gate_lora"`` on ``early_exit_gate``; both are activated.
	        * otherwise: a single adapter on ``lora_target_modules`` (or the default
	          targets), restricted to the edge blocks when ``lora_edge_layers`` is set and
	          at least one target is a per-layer module.

	        Only called when ``use_lora`` is true, so ``lora_dual_edge_and_gate`` has no
	        effect without ``use_lora``.

	        Raises:
	            ValueError: if dual mode lacks ``lora_edge_layers`` or lists
	                ``early_exit_gate`` among its edge targets, or if single-adapter mode
	                mixes ``early_exit_gate`` with per-layer modules.
	        """
	        # Imported lazily so PEFT is only required when LoRA is actually used.
	        from peft import LoraConfig, TaskType, get_peft_model

	        cfg = self.cfg
	        shared_kwargs: dict = dict(
	            r=cfg.lora_r,
	            lora_alpha=cfg.lora_alpha,
	            lora_dropout=cfg.lora_dropout,
	            bias="none",
	            task_type=TaskType.CAUSAL_LM,
	        )

	        if cfg.lora_dual_edge_and_gate:
	            if cfg.lora_edge_layers is None:
	                raise ValueError("lora_dual_edge_and_gate requires lora_edge_layers (e.g. 4).")
	            edge_targets = cfg.lora_target_modules or list(OURO_DEFAULT_LORA_TARGETS)
	            if "early_exit_gate" in edge_targets:
	                raise ValueError(
	                    "lora_target_modules must not include early_exit_gate when using "
	                    "lora_dual_edge_and_gate (the gate uses a separate adapter)."
	                )
	            num_layers = int(self.llm.config.num_hidden_layers)
	            edge_layers = lora_layer_indices_for_edges(num_layers, int(cfg.lora_edge_layers))
	            edge_cfg = LoraConfig(
	                target_modules=edge_targets,
	                layers_to_transform=edge_layers,
	                **shared_kwargs,
	            )
	            gate_cfg = LoraConfig(
	                target_modules=list(OURO_ACT_GATE_LORA_TARGETS),
	                **shared_kwargs,
	            )
	            self.llm = get_peft_model(self.llm, edge_cfg, adapter_name="edge_lora")
	            self.llm.add_adapter("gate_lora", gate_cfg)
	            # Activate both adapters together.
	            self.llm.base_model.set_adapter(["edge_lora", "gate_lora"])
	            self._lora_dual_edge_gate = True
	            return

	        targets = cfg.lora_target_modules or list(OURO_DEFAULT_LORA_TARGETS)
	        target_set = frozenset(targets)
	        targets_layer_modules = bool(target_set & OURO_LAYER_LORA_NAME_PREFIXES)
	        if "early_exit_gate" in target_set and targets_layer_modules:
	            raise ValueError(
	                "lora_target_modules cannot list both `early_exit_gate` and layer modules "
	                "(q_proj, mlp, …) in one PEFT adapter; set lora_dual_edge_and_gate: true "
	                "with lora_edge_layers, or use one target family only."
	            )
	        peft_kwargs: dict = dict(shared_kwargs, target_modules=targets)
	        # ``lora_edge_layers`` only applies to per-layer targets; it is ignored for a
	        # gate-only adapter.
	        if cfg.lora_edge_layers is not None and targets_layer_modules:
	            num_layers = int(self.llm.config.num_hidden_layers)
	            peft_kwargs["layers_to_transform"] = lora_layer_indices_for_edges(
	                num_layers, int(cfg.lora_edge_layers)
	            )
	        self.llm = get_peft_model(self.llm, LoraConfig(**peft_kwargs))

	    def compile_components(self, mode: str = "default", dynamic: bool = True) -> "IrisVLM":
	        """Wrap heavy submodules with torch.compile and return ``self``.

	        Only the frozen vision + LM forwards matter for throughput; the
	        projector is tiny so we skip it (also keeps its ``state_dict`` keys
	        prefix-free for checkpoints). ``dynamic=True`` avoids recompiles when
	        prompt / response lengths vary across batches; we also raise the
	        dynamo cache limit to absorb the first few shape specializations.

	        PEFT-wrapped LMs are not ``torch.compile``'d (unsupported / fragile).
	        """
	        import torch._dynamo as _dynamo

	        # Never lower a limit someone already raised above 64.
	        _dynamo.config.cache_size_limit = max(_dynamo.config.cache_size_limit, 64)

	        self.vision = torch.compile(self.vision, mode=mode, dynamic=dynamic, fullgraph=False)
	        if not self._use_lora:
	            self.llm = torch.compile(self.llm, mode=mode, dynamic=dynamic, fullgraph=False)
	        return self

	    @staticmethod
	    def _spatial_features(out) -> torch.Tensor:
	        """Extract dense (B,T,D) features from a RADIO forward output.

	        Looks for a non-``None`` ``features`` or ``spatial_features`` attribute first and
	        falls back to element 1 of a tuple with at least two entries.

	        Raises:
	            TypeError: if neither lookup succeeds.
	        """
	        for name in ("features", "spatial_features"):
	            value = getattr(out, name, None)
	            if value is not None:
	                return value
	        if isinstance(out, tuple) and len(out) >= 2:
	            return out[1]
	        raise TypeError(f"Cannot find spatial features on RADIO output: {type(out)}")

	    def _vision_autocast(self):
	        """Autocast context matching cfg.torch_dtype (bf16/fp16), or no-op for fp32."""
	        import contextlib

	        dtype = self.cfg.torch_dtype
	        if dtype in (torch.bfloat16, torch.float16):
	            # Autocast on whichever device type RADIO's parameters currently live on.
	            device = next(self.vision.parameters()).device
	            return torch.autocast(device_type=device.type, dtype=dtype)
	        return contextlib.nullcontext()

	    @torch.no_grad()
	    def _probe_vision_dim(self) -> int:
	        """Run RADIO once on a dummy image to discover spatial_features dim."""
	        device = next(self.vision.parameters()).device
	        dummy = torch.zeros(
	            1, 3, self.cfg.image_size, self.cfg.image_size,
	            device=device, dtype=torch.float32,
	        )
	        with self._vision_autocast():
	            spatial = self._spatial_features(self.vision(dummy))
	        assert spatial.dim() == 3, f"expected (B,T,D), got {tuple(spatial.shape)}"
	        return spatial.shape[-1]

	    def encode_images(self, pixel_values: torch.Tensor) -> torch.Tensor:
	        """(B,3,H,W) -> (B,T,llm_hidden) visual tokens in LLM embedding space.

	        RADIO runs in fp32 weights under autocast so its mixed fp32 buffers
	        (input_conditioner) stay consistent; output is cast to the projector
	        dtype before the trainable MLP. The vision forward runs under ``no_grad``;
	        the projector call does not, so gradients reach the projector only.
	        """
	        with torch.no_grad(), self._vision_autocast():
	            spatial = self._spatial_features(self.vision(pixel_values.float()))
	        projector_dtype = self.projector.net[0].weight.dtype
	        return self.projector(spatial.to(projector_dtype))

	    def _embed_tokens(self, input_ids: torch.Tensor) -> torch.Tensor:
	        """Look up token embeddings with the LM's input embedding layer."""
	        return self.llm.get_input_embeddings()(input_ids)

	    def forward(
	        self,
	        pixel_values: torch.Tensor,  # (B, 3, H, W)
	        prompt_ids: Optional[torch.Tensor] = None,
	        prompt_mask: Optional[torch.Tensor] = None,
	        response_ids: Optional[torch.Tensor] = None,
	        response_mask: Optional[torch.Tensor] = None,
	        packed_text_ids: Optional[torch.Tensor] = None,
	        packed_text_mask: Optional[torch.Tensor] = None,
	        packed_text_labels: Optional[torch.Tensor] = None,
	        **kwargs: Any,
	    ) -> Any:
	        """Vision-first forward.

	        Single-turn (default): ``[visual, prompt, response]`` — CE only on
	        ``response`` tokens (see ``prompt_ids`` … ``response_mask``).

	        Multiturn: pass ``packed_text_ids``, ``packed_text_mask``, and
	        ``packed_text_labels`` instead of prompt/response tensors. Layout is
	        ``[visual, packed_text]`` with ``packed_text_labels`` already ``-100`` on
	        user spans and padding (loss on every assistant span in one sequence).
	        When ``packed_text_ids`` is given, the prompt/response tensors are ignored.
	        Extra keyword args are ignored so callers can pass through batch dicts.

	        Returns:
	            Whatever ``self.llm`` returns when called with ``inputs_embeds``,
	            ``attention_mask``, ``labels`` and ``use_cache=False`` (presumably a
	            Hugging Face causal-LM output carrying the loss — confirm against Ouro's
	            remote code).

	        Raises:
	            ValueError: if neither a complete packed set nor a complete
	                prompt/response set is supplied. Raised before the vision encoder runs.
	        """
	        del kwargs  # accepted only so whole batch dicts can be splatted in

	        # Validate up front so a malformed batch fails before any vision compute.
	        use_packed = packed_text_ids is not None
	        if use_packed:
	            if packed_text_mask is None or packed_text_labels is None:
	                raise ValueError(
	                    "packed_text_ids requires packed_text_mask and packed_text_labels"
	                )
	        elif (
	            prompt_ids is None
	            or prompt_mask is None
	            or response_ids is None
	            or response_mask is None
	        ):
	            raise ValueError(
	                "Either packed_text_* (multiturn) or prompt_ids/response_ids (single-turn) is required"
	            )

	        batch_size = pixel_values.size(0)
	        device = pixel_values.device

	        visual = self.encode_images(pixel_values)  # (B, T, H)
	        num_visual = visual.size(1)

	        if use_packed:
	            text_embeds = [self._embed_tokens(packed_text_ids)]
	            text_masks = [packed_text_mask]
	            text_labels = [packed_text_labels]
	        else:
	            text_embeds = [
	                self._embed_tokens(prompt_ids),  # (B, Lp, H)
	                self._embed_tokens(response_ids),  # (B, Lr, H)
	            ]
	            text_masks = [prompt_mask, response_mask]
	            # No loss on prompt tokens or on padded response positions.
	            prompt_ignore = torch.full(
	                (batch_size, prompt_ids.size(1)), -100, dtype=torch.long, device=device
	            )
	            text_labels = [
	                prompt_ignore,
	                response_ids.masked_fill(response_mask == 0, -100),
	            ]

	        # Visual tokens are always attended to and never contribute to the loss.
	        visual_mask = torch.ones(
	            batch_size, num_visual, dtype=text_masks[0].dtype, device=device
	        )
	        visual_ignore = torch.full(
	            (batch_size, num_visual), -100, dtype=torch.long, device=device
	        )

	        inputs_embeds = torch.cat([visual, *text_embeds], dim=1)  # (B, L, H)
	        attention_mask = torch.cat([visual_mask, *text_masks], dim=1)
	        labels = torch.cat([visual_ignore, *text_labels], dim=1)

	        return self.llm(
	            inputs_embeds=inputs_embeds,
	            attention_mask=attention_mask,
	            labels=labels,
	            use_cache=False,
	        )

	    def trainable_parameters(self) -> list[torch.nn.Parameter]:
	        """Return projector parameters that require grad, plus the LM's when LoRA is on."""
	        params = [p for p in self.projector.parameters() if p.requires_grad]
	        if self._use_lora:
	            params.extend(p for p in self.llm.parameters() if p.requires_grad)
	        return params