μ²Tokenizer
Collection
Official models and datasets for the paper μ²Tokenizer (https://arxiv.org/abs/2507.00316).
μ²Qwen3-4B-Instruct is a multi-scale multi-modal model designed for Radiology Report Generation (RRG). It leverages the μ²Tokenizer, a differentiable intermediate layer that efficiently fuses visual features from 3D CT scans with textual information. The model is fine-tuned using Direct Preference Optimization (DPO) guided by the GREEN score to ensure clinical accuracy and alignment with expert standards.
This model is part of the work described in the paper: μ²Tokenizer: Differentiable Multi-Scale Multi-Modal Tokenizer for Radiology Report Generation.
pip install torch transformers monai==1.3.2 nibabel==5.3.3
Below is a sample code snippet to generate a report from a CT scan using this model.
import torch
import nibabel as nib
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from monai.transforms import (
Compose, CropForeground, ToTensor, ScaleIntensityRangePercentiles
)
from monai.transforms.spatial.functional import resize
import torch.nn.functional as F
import types
import inspect
class u2Transform:
    """Preprocess a NIfTI CT volume into fixed-size stacks of 2D slices.

    Pipeline: percentile intensity normalisation -> foreground crop ->
    isotropic in-plane resize to fit ``target_image_size`` -> zero-padding to
    a fixed (target_image_size, target_image_size, padding_size) volume ->
    reshape into chunks of 32 slices.
    """

    def __init__(self, mode='bilinear', device="cpu"):
        # Normalise intensities to [0, 1] by percentile, crop away empty
        # background, and convert the array to a tensor.
        self.adaptive_transforms = Compose([
            ScaleIntensityRangePercentiles(lower=0.5, upper=99.5, b_max=1.0, b_min=0.0, clip=True),
            CropForeground(source_key="image"),
            ToTensor(),
        ])
        self.mode = mode
        self.device = device

    def adaptive_resize(self, input_path, target_image_size=256, padding_size=32*8):
        """Load ``input_path`` and return a (N, 32, H, W) tensor of slice chunks.

        ``padding_size`` is the fixed depth (number of slices) of the padded
        volume and must be a multiple of 32 for the final reshape to succeed.
        """
        volume = nib.load(input_path).get_fdata().transpose(2, 0, 1)[np.newaxis, ...]
        volume = torch.tensor(volume, device=self.device)
        volume = self.adaptive_transforms(volume)[0]
        # After the transpose above, dims 0/1 are in-plane and dim 2 is the
        # slice axis once permuted.
        volume = torch.permute(volume, (1, 2, 0))

        src_shape = volume.shape
        # Scale both in-plane axes by one common factor so the larger axis
        # fits exactly into target_image_size (aspect ratio preserved).
        scale = min(target_image_size / src_shape[0], target_image_size / src_shape[1])
        out_shape = [int(src_shape[0] * scale), int(src_shape[1] * scale)]

        if padding_size >= src_shape[2]:
            # Depth already fits: keep it, then zero-pad up to padding_size.
            out_shape.append(src_shape[2])
            depth_pad = padding_size - out_shape[2]
        else:
            # Depth too large: resample it down to padding_size, no padding.
            out_shape.append(padding_size)
            depth_pad = 0

        volume = resize(
            img=volume.unsqueeze(0),
            out_size=out_shape,
            mode=self.mode,
            align_corners=True
        )
        # F.pad pads trailing dims first: (depth, width, height).
        pad_tuple = (
            0, depth_pad,
            0, target_image_size - out_shape[1],
            0, target_image_size - out_shape[0],
        )
        volume = F.pad(volume, pad_tuple, mode='constant', value=0)

        volume = torch.permute(volume, (0, 3, 1, 2))
        # Split the fixed depth into chunks of 32 slices each.
        return volume.view(-1, 32, target_image_size, target_image_size)

    def __call__(self, *args, **kwds):
        return self.adaptive_resize(*args, **kwds)
# ---- Load model and tokenizer ----
model_path = "AlpachinoNLP/u2Qwen3-4B-Instruct"  # Replace with actual path
use_cuda = torch.cuda.is_available()
dtype = torch.bfloat16 if use_cuda else torch.float32
device = "cuda" if use_cuda else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=dtype,
    device_map="auto"
)
model.eval()

# ---- Prepare input ----
image_path = "example.nii.gz"  # Replace with your NIfTI file path
question = "Can you provide a diagnosis based on the findings in this image?"

image_transforms = u2Transform(mode="bilinear", device=device)
image = image_transforms(image_path).unsqueeze(0).to(dtype).to(device)

# The number of <im_patch> placeholder tokens must match the multimodal
# projector's output length; fall back to 256 if the attribute is absent.
projector = getattr(model.get_model(), "mm_projector", None)
proj_out_num = getattr(projector, "proj_out_num", 256)
prompt = "<im_patch>" * int(proj_out_num) + question

input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
question_ids = tokenizer(question, add_special_tokens=False, return_tensors="pt")["input_ids"].to(device)

# Newer transformers' generate() passes `cache_position`; if this model's
# custom forward does not accept it, strip it before delegating.
if "cache_position" not in inspect.signature(model.forward).parameters:
    original_forward = model.forward

    def _forward_compat(self, *f_args, **f_kwargs):
        f_kwargs.pop("cache_position", None)
        return original_forward(*f_args, **f_kwargs)

    model.forward = types.MethodType(_forward_compat, model)

# ---- Generate the report ----
with torch.no_grad():
    output_ids = model.generate(
        images=image,
        inputs=input_ids,
        question_ids=question_ids,
        max_new_tokens=512,
    )
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0])
If you find this model useful, please consider citing our paper:
@misc{li2025mu2tokenizerdifferentiablemultiscalemultimodal,
title={${\mu}^2$Tokenizer: Differentiable Multi-Scale Multi-Modal Tokenizer for Radiology Report Generation},
author={Siyou Li and Pengyao Qin and Huanan Wu and Dong Nie and Arun J. Thirunavukarasu and Juntao Yu and Le Zhang},
year={2025},
eprint={2507.00316},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={https://arxiv.org/abs/2507.00316},
}