Initial release: Ker-VLJEPA-3B inference package

a974113 verified 3 months ago

21.7 kB

	"""
	Ker-VLJEPA-3B — Inference-only model.

	Loads the Llama 3.2 3B backbone, applies LoRA adapters, visual encoder,
	and cross-attention bridge components for CT report generation.

	Requires:
	- A local copy of meta-llama/Llama-3.2-3B (user-provided)
	- Weight files in weights/ (shipped with this package)
	"""
	import math
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from pathlib import Path
	from typing import Optional, List, Tuple, Dict
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from peft import PeftModel
	from safetensors.torch import load_file


	# ---------------------------------------------------------------------------
	# Visual encoder components (inference-only, no training heads)
	# ---------------------------------------------------------------------------

	def sinusoidal_positional_encoding(n: int, d: int) -> torch.Tensor:
	pe = torch.zeros(n, d)
	pos = torch.arange(n, dtype=torch.float).unsqueeze(1)
	div = torch.exp(torch.arange(0, d, 2).float() * (-math.log(10000.0) / d))
	pe[:, 0::2] = torch.sin(pos * div)
	pe[:, 1::2] = torch.cos(pos * div)
	return pe.unsqueeze(0)


	class ZonedCrossAttention(nn.Module):
	"""Z-Zoned cross-attention: each zone's queries attend only to their spatial slice range."""

	def __init__(self, slice_dim=1024, hidden_dim=1024, num_zones=32,
	tokens_per_zone=1, num_heads=16, dropout=0.1):
	super().__init__()
	self.num_zones = num_zones
	self.tokens_per_zone = tokens_per_zone
	self.num_regions = num_zones * tokens_per_zone
	self.num_heads = num_heads
	self.hidden_dim = hidden_dim

	self.zone_queries = nn.Parameter(torch.zeros(num_zones, tokens_per_zone, hidden_dim))
	self.zone_embed = nn.Embedding(num_zones, hidden_dim)
	self.slice_proj = nn.Linear(slice_dim, hidden_dim)
	self.attention = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True, dropout=dropout)
	self.output_proj = nn.Linear(hidden_dim, slice_dim)
	self.fallback_pos_embed = nn.Parameter(torch.randn(1, 1, hidden_dim) * 0.02)

	# Physical Z-positional encoding buffers
	div_term = torch.exp(torch.arange(0, hidden_dim, 2).float() * (-math.log(10000.0) / hidden_dim))
	self.register_buffer("div_term", div_term)

	def _add_z_pos(self, x_proj, mask):
	"""Sinusoidal positional encoding based on slice index."""
	B, S, D = x_proj.shape
	device, dtype = x_proj.device, x_proj.dtype
	z_positions = torch.zeros(B, S, device=device)
	for b in range(B):
	n = int(mask[b].sum().item()) if mask is not None else S
	pos = torch.arange(n, device=device, dtype=torch.float) * 2.5 / 600.0 * 100.0
	z_positions[b, :n] = pos
	pos = z_positions.unsqueeze(-1)
	pe = torch.zeros(B, S, D, device=device, dtype=dtype)
	pe[:, :, 0::2] = torch.sin(pos * self.div_term)
	pe[:, :, 1::2] = torch.cos(pos * self.div_term)
	if mask is not None:
	pe = pe * mask.unsqueeze(-1)
	return x_proj + pe

	def forward(self, x, mask=None, metadata=None):
	B, S, _ = x.shape
	x_proj = self.slice_proj(x)
	if mask is not None:
	x_proj = self._add_z_pos(x_proj, mask)
	else:
	x_proj = x_proj + self.fallback_pos_embed

	queries = self.zone_queries.reshape(self.num_regions, self.hidden_dim)
	zone_ids = torch.arange(self.num_zones, device=x.device)
	zone_emb = self.zone_embed(zone_ids).unsqueeze(1).expand(-1, self.tokens_per_zone, -1)
	queries = (queries + zone_emb.reshape(self.num_regions, self.hidden_dim)).unsqueeze(0).expand(B, -1, -1)

	# Zone mask
	zone_mask = torch.ones(B, self.num_regions, S, device=x.device, dtype=torch.bool)
	for b in range(B):
	n = int(mask[b].sum().item()) if mask is not None else S
	zone_size = n / self.num_zones
	for z in range(self.num_zones):
	s, e = int(round(z * zone_size)), min(max(int(round((z + 1) * zone_size)), int(round(z * zone_size)) + 1), n)
	q_s, q_e = z * self.tokens_per_zone, (z + 1) * self.tokens_per_zone
	zone_mask[b, q_s:q_e, s:e] = False
	zone_mask = zone_mask.unsqueeze(1).expand(-1, self.num_heads, -1, -1).reshape(B * self.num_heads, self.num_regions, S)

	kpm = (mask == 0) if mask is not None else None
	region_hidden, attn_w = self.attention(queries, x_proj, x_proj, attn_mask=zone_mask, key_padding_mask=kpm, need_weights=True)
	return self.output_proj(region_hidden), attn_w


	class VisualEncoder(nn.Module):
	"""Compresses variable-length slice embeddings into fixed visual tokens in LLM space."""

	def __init__(self, slice_dim=1024, hidden_dim=1024, llm_dim=3072,
	num_regions=32, num_zones=32, tokens_per_zone=1,
	num_heads=16, dropout=0.1):
	super().__init__()
	self.region_query = ZonedCrossAttention(slice_dim, hidden_dim, num_zones, tokens_per_zone, num_heads, dropout)
	self.global_self_attn = nn.TransformerEncoderLayer(
	d_model=slice_dim, nhead=num_heads, batch_first=True,
	dropout=dropout, dim_feedforward=slice_dim * 4, activation="gelu",
	)
	self.jepa_predictor = nn.Sequential(
	nn.Dropout(0.0), # disabled at inference
	nn.Linear(slice_dim, llm_dim),
	nn.LayerNorm(llm_dim),
	nn.Dropout(0.0),
	)
	self.norm_calibrator_scale = 1.0 # set from checkpoint buffer
	self.register_buffer("_norm_scale", torch.tensor(1.0))

	def forward(self, slices, mask=None):
	"""
	Args:
	slices: (B, num_slices, 1024) pre-computed LeJEPA embeddings
	mask: (B, num_slices) binary mask, 1=valid 0=pad
	Returns:
	(B, 32, 3072) visual tokens in LLM hidden space
	"""
	regions, _ = self.region_query(slices, mask)
	regions = self.global_self_attn(regions)
	predicted = self.jepa_predictor(regions)
	return predicted * self._norm_scale


	class GatedCrossAttentionLayer(nn.Module):
	"""Flamingo-style cross-attention: text hidden states attend to visual tokens."""

	def __init__(self, hidden_dim=3072, num_heads=16):
	super().__init__()
	self.hidden_dim = hidden_dim
	self.num_heads = num_heads
	self.head_dim = hidden_dim // num_heads
	self.q_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
	self.k_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
	self.v_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
	self.o_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
	self.q_norm = nn.LayerNorm(hidden_dim)
	self.gate = nn.Parameter(torch.tensor(0.0), requires_grad=False)

	def forward(self, text_hidden, visual_tokens):
	B, S, _ = text_hidden.shape
	dt = self.q_proj.weight.dtype
	text_hidden = text_hidden.to(dt)
	visual_tokens = visual_tokens.to(dt)
	q = self.q_proj(self.q_norm(text_hidden)).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
	k = self.k_proj(visual_tokens).view(B, -1, self.num_heads, self.head_dim).transpose(1, 2)
	v = self.v_proj(visual_tokens).view(B, -1, self.num_heads, self.head_dim).transpose(1, 2)
	out = F.scaled_dot_product_attention(q, k, v, is_causal=False)
	return self.o_proj(out.transpose(1, 2).contiguous().view(B, S, self.hidden_dim))


	# ---------------------------------------------------------------------------
	# Main inference model
	# ---------------------------------------------------------------------------

	class KerVLJEPA(nn.Module):
	"""
	Ker-VLJEPA-3B inference model.

	Takes pre-computed LeJEPA slice embeddings (1024-d per slice) and generates
	free-text radiology reports using Llama 3.2 3B + LoRA + cross-attention bridge.
	"""

	INJECTION_LAYERS = [7, 14, 21]
	NUM_REGIONS = 32
	VISUAL_TOKEN = "<\|visual_region\|>"

	def __init__(self, llm_path: str, weights_dir: str = "weights", device: str = "cuda"):
	super().__init__()
	weights_dir = Path(weights_dir)
	self.device = torch.device(device)

	# --- 1. Load tokenizer ---
	tok_dir = weights_dir / "tokenizer"
	self.tokenizer = AutoTokenizer.from_pretrained(
	llm_path, trust_remote_code=True, padding_side="left",
	)
	if self.tokenizer.pad_token is None:
	self.tokenizer.pad_token = self.tokenizer.eos_token
	self.tokenizer.add_tokens([self.VISUAL_TOKEN], special_tokens=True)
	self.vis_token_id = self.tokenizer.convert_tokens_to_ids(self.VISUAL_TOKEN)

	# Load custom chat template
	tmpl_path = tok_dir / "chat_template.jinja"
	if tmpl_path.exists():
	self.tokenizer.chat_template = tmpl_path.read_text()

	# --- 2. Load Llama 3.2 3B + LoRA ---
	self.llm = AutoModelForCausalLM.from_pretrained(
	llm_path, torch_dtype=torch.bfloat16,
	device_map=device, trust_remote_code=True,
	attn_implementation="flash_attention_2",
	)
	self.llm.resize_token_embeddings(len(self.tokenizer))

	lora_dir = weights_dir / "lora_adapters"
	self.llm = PeftModel.from_pretrained(self.llm, str(lora_dir), is_trainable=False)
	self.llm.eval()

	llm_dim = self.llm.config.hidden_size # 3072

	# --- 3. Visual encoder ---
	self.visual_encoder = VisualEncoder(
	slice_dim=1024, hidden_dim=1024, llm_dim=llm_dim,
	num_regions=self.NUM_REGIONS, num_zones=32, tokens_per_zone=1,
	num_heads=16, dropout=0.0,
	)
	ve_state = load_file(str(weights_dir / "visual_encoder.safetensors"))
	# Map checkpoint keys (which include output_ln, norm_calibrator, classifiers)
	# into our simplified VisualEncoder
	mapped = {}
	for k, v in ve_state.items():
	if k == "norm_calibrator.scale":
	self.visual_encoder._norm_scale = v.clone()
	continue
	if k.startswith("norm_calibrator.") or k.startswith("region_classifier.") or \
	k.startswith("slice_organ_classifier.") or k == "_last_attention_weights" or \
	k.startswith("output_ln."):
	continue
	mapped[k] = v
	self.visual_encoder.load_state_dict(mapped, strict=False)
	self.visual_encoder = self.visual_encoder.to(self.device).to(torch.bfloat16)
	self.visual_encoder.eval()

	# --- 4. Bridge components ---
	bridge = load_file(str(weights_dir / "bridge_components.safetensors"))

	# Text embedding norm (for grafting normalization)
	self.register_buffer("text_embed_norm", bridge["text_embed_norm"].to(self.device))

	# LayerNorm for grafting
	self.layernorm = nn.LayerNorm(llm_dim).to(self.device).to(torch.bfloat16)
	self.layernorm.weight.data.copy_(bridge["layernorm.weight"])
	self.layernorm.bias.data.copy_(bridge["layernorm.bias"])

	# Cross-attention adapters + layer projectors
	self.cross_attn_adapters = nn.ModuleDict()
	self.layer_projectors = nn.ModuleDict()
	for layer_idx in self.INJECTION_LAYERS:
	li = str(layer_idx)
	adapter = GatedCrossAttentionLayer(llm_dim, 16)
	adapter.q_proj.weight.data.copy_(bridge[f"cross_attn_adapters.{li}.q_proj.weight"])
	adapter.k_proj.weight.data.copy_(bridge[f"cross_attn_adapters.{li}.k_proj.weight"])
	adapter.v_proj.weight.data.copy_(bridge[f"cross_attn_adapters.{li}.v_proj.weight"])
	adapter.o_proj.weight.data.copy_(bridge[f"cross_attn_adapters.{li}.o_proj.weight"])
	adapter.q_norm.weight.data.copy_(bridge[f"cross_attn_adapters.{li}.q_norm.weight"])
	adapter.q_norm.bias.data.copy_(bridge[f"cross_attn_adapters.{li}.q_norm.bias"])
	self.cross_attn_adapters[li] = adapter.to(self.device).to(torch.bfloat16)

	proj = nn.Linear(llm_dim, llm_dim, bias=False)
	proj.weight.data.copy_(bridge[f"layer_projectors.{li}.weight"])
	self.layer_projectors[li] = proj.to(self.device).to(torch.bfloat16)

	# ------------------------------------------------------------------
	# Helpers
	# ------------------------------------------------------------------

	def _get_llm_layers(self):
	model = self.llm
	for _ in range(10):
	if hasattr(model, "base_model"):
	model = model.base_model
	elif hasattr(model, "model"):
	model = model.model
	else:
	break
	return model.layers

	def _get_embed_layer(self):
	model = self.llm
	for _ in range(10):
	if hasattr(model, "get_input_embeddings"):
	emb = model.get_input_embeddings()
	if emb is not None:
	return emb
	if hasattr(model, "base_model"):
	model = model.base_model
	elif hasattr(model, "model"):
	model = model.model
	else:
	break
	raise RuntimeError("Could not find embedding layer")

	def _normalize_visual(self, visual_embeds):
	norms = visual_embeds.norm(dim=-1, keepdim=True).clamp(min=1e-8)
	return visual_embeds / norms * self.text_embed_norm

	@staticmethod
	def _clean_text(text: str) -> str:
	text = text.strip()
	if not text:
	return text
	low = text.lower()
	first = low.find("findings:")
	if first >= 0:
	second = low.find("findings:", first + 10)
	if second > 0:
	text = text[:second].strip()
	for marker in ["\n\n\n", "User", "user", "assistant", "system"]:
	idx = text.find(marker)
	if idx > 20:
	text = text[:idx].strip()
	return text

	# ------------------------------------------------------------------
	# Generation
	# ------------------------------------------------------------------

	@torch.no_grad()
	def generate(
	self,
	slice_embeddings: torch.Tensor,
	mask: Optional[torch.Tensor] = None,
	max_new_tokens: int = 384,
	temperature: float = 0.6,
	top_p: float = 0.9,
	repetition_penalty: float = 1.1,
	no_repeat_ngram_size: int = 4,
	) -> str:
	"""
	Generate a radiology report from pre-computed LeJEPA slice embeddings.

	Args:
	slice_embeddings: (1, num_slices, 1024) — stacked 1024-d embeddings,
	one per CT slice. Padding is allowed.
	mask: (1, num_slices) — binary mask where 1=real slice, 0=padding.
	If None, all slices are treated as valid.
	max_new_tokens: maximum tokens to generate (default 384).
	temperature: sampling temperature (default 0.6).
	top_p: nucleus sampling threshold (default 0.9).
	repetition_penalty: penalize repeated tokens (default 1.1).
	no_repeat_ngram_size: prevent repeating n-grams (default 4).

	Returns:
	Generated report text (str).
	"""
	assert slice_embeddings.ndim == 3 and slice_embeddings.shape[0] == 1

	# 1. Visual forward
	slices = slice_embeddings.to(self.device, dtype=torch.bfloat16)
	if mask is not None:
	mask = mask.to(self.device, dtype=torch.bfloat16)
	visual_tokens = self.visual_encoder(slices, mask)
	visual_tokens = self._normalize_visual(visual_tokens.to(torch.bfloat16))

	# 2. Build prompt
	placeholders = self.VISUAL_TOKEN * self.NUM_REGIONS
	messages = [
	{"role": "system", "content": "You are a radiology reporting assistant. Describe thoracic findings based on the provided CT scan visual features. Report only what you observe."},
	{"role": "user", "content": f"Based on the visual features from this CT scan, describe the thoracic findings. {placeholders}"},
	]
	tokenized = self.tokenizer.apply_chat_template(
	messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
	)
	if hasattr(tokenized, "input_ids"):
	input_ids = tokenized["input_ids"].to(self.device)
	attention_mask = tokenized["attention_mask"].to(self.device)
	else:
	input_ids = tokenized.to(self.device)
	attention_mask = torch.ones_like(input_ids)

	# 3. Graft visual tokens into embedding sequence
	embed_layer = self._get_embed_layer()
	inputs_embeds = embed_layer(input_ids).clone()
	vis_mask = (input_ids == self.vis_token_id)
	vis_positions = vis_mask[0].nonzero(as_tuple=True)[0]
	assert len(vis_positions) == self.NUM_REGIONS, \
	f"Expected {self.NUM_REGIONS} visual tokens, found {len(vis_positions)}"
	for idx, pos in enumerate(vis_positions):
	inputs_embeds[0, pos] = visual_tokens[0, idx].to(inputs_embeds.dtype)

	# 4. Register cross-attention hooks
	hooks = []
	llm_layers = self._get_llm_layers()
	for layer_idx in self.INJECTION_LAYERS:
	li = str(layer_idx)
	proj = self.layer_projectors[li]
	adapter = self.cross_attn_adapters[li]

	def make_hook(p, a, v_tokens, v_mask, seq_len):
	def hook_fn(module, args, output):
	hidden = output[0] if isinstance(output, tuple) else output
	# Additive injection at visual positions (prefill only)
	if hidden.shape[1] == seq_len:
	projected = p(v_tokens.to(hidden.dtype))
	vis_pos = v_mask[0].nonzero(as_tuple=True)[0]
	hidden[0, vis_pos] = hidden[0, vis_pos] + projected[0, :len(vis_pos)]
	# Cross-attention (every token)
	xattn_out = a(hidden, v_tokens.to(hidden.dtype))
	modified = hidden + xattn_out.to(hidden.dtype)
	return (modified,) + output[1:] if isinstance(output, tuple) else modified
	return hook_fn

	h = llm_layers[layer_idx].register_forward_hook(
	make_hook(proj, adapter, visual_tokens, vis_mask, input_ids.shape[1])
	)
	hooks.append(h)

	# 5. Generate
	eot_id = self.tokenizer.convert_tokens_to_ids("<\|eot_id\|>")
	start_header = self.tokenizer.convert_tokens_to_ids("<\|start_header_id\|>")
	stop_ids = [self.tokenizer.eos_token_id]
	if eot_id is not None and eot_id != self.tokenizer.eos_token_id:
	stop_ids.append(eot_id)
	if start_header is not None and start_header not in stop_ids:
	stop_ids.append(start_header)

	try:
	generated_ids = self.llm.generate(
	inputs_embeds=inputs_embeds,
	attention_mask=attention_mask,
	max_new_tokens=max_new_tokens,
	temperature=temperature,
	top_p=top_p,
	repetition_penalty=repetition_penalty,
	no_repeat_ngram_size=no_repeat_ngram_size,
	do_sample=True,
	pad_token_id=self.tokenizer.pad_token_id,
	eos_token_id=stop_ids,
	)
	finally:
	for h in hooks:
	h.remove()

	text = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
	return self._clean_text(text)

	@torch.no_grad()
	def classify(
	self,
	slice_embeddings: torch.Tensor,
	mask: Optional[torch.Tensor] = None,
	) -> Dict[str, float]:
	"""
	Run the auxiliary 18-class abnormality classifier on slice embeddings.

	Returns a dict mapping each CT-RATE condition name to its sigmoid probability.
	"""
	CLASS_NAMES = [
	"Medical material", "Arterial wall calcification",
	"Cardiomegaly", "Pericardial effusion",
	"Coronary artery wall calcification", "Hiatal hernia",
	"Lymphadenopathy", "Emphysema",
	"Atelectasis", "Lung nodule",
	"Lung opacity", "Pulmonary fibrotic sequela",
	"Pleural effusion", "Mosaic attenuation pattern",
	"Peribronchial thickening", "Consolidation",
	"Bronchiectasis", "Interlobular septal thickening",
	]
	slices = slice_embeddings.to(self.device, dtype=torch.bfloat16)
	if mask is not None:
	mask = mask.to(self.device, dtype=torch.bfloat16)
	visual_tokens = self.visual_encoder(slices, mask) # (1, 32, 3072)
	pooled = visual_tokens.mean(dim=1) # (1, 3072)
	# case_classifier is loaded from bridge but we need to add it
	logits = self._case_classifier(pooled) # (1, 18)
	probs = torch.sigmoid(logits).squeeze(0).cpu().tolist()
	return {name: round(p, 4) for name, p in zip(CLASS_NAMES, probs)}


	def load_model(llm_path: str, weights_dir: str = "weights", device: str = "cuda") -> KerVLJEPA:
	"""
	Load the Ker-VLJEPA-3B model for inference.

	Args:
	llm_path: Path to local Llama 3.2 3B model directory.
	weights_dir: Path to the weights/ directory from this package.
	device: CUDA device string (default "cuda").

	Returns:
	Ready-to-use KerVLJEPA model instance.
	"""
	model = KerVLJEPA(llm_path=llm_path, weights_dir=weights_dir, device=device)
	model.eval()
	return model