arXiv Website Github

Model Description

Model Summary

μ²Qwen3-4B-Instruct is a multi-scale multi-modal model designed for Radiology Report Generation (RRG). It leverages the μ²Tokenizer, a differentiable intermediate layer that efficiently fuses visual features from 3D CT scans with textual information. The model is fine-tuned using Direct Preference Optimization (DPO) guided by the GREEN score to ensure clinical accuracy and alignment with expert standards.

This model is part of the work described in the paper: μ²Tokenizer: Differentiable Multi-Scale Multi-Modal Tokenizer for Radiology Report Generation.

Model Details

  • Model Architecture:
    • Image Encoder: 3D Vision Transformer (ViT3D) initialized from M3D-CLIP.
    • Tokenizer: μ²Tokenizer (Multi-scale, Multi-modal).
    • LLM Backbone: Qwen3-4B-Instruct.
  • Input: 3D CT Scans (NIfTI format) and text prompts.
  • Output: Radiology reports or answers to clinical questions.
  • Training Data: Trained on large-scale synthetic CT image–report datasets based on CT-RATE.

How to Get Started with the Model

Requirements

pip install torch transformers monai==1.3.2 nibabel==5.3.3

Inference Code

Below is a sample code snippet to generate a report from a CT scan using this model.

import argparse
import gzip
import inspect
import json
import os
import struct
import sys
import types

from sympy import content
import torch.nn.functional as F
import torch
import nibabel as nib
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from monai.data.image_reader import NibabelReader
from monai.transforms import (
        LoadImage,
        Compose,
        CropForeground,
        ToTensor,
        SaveImage,
        ScaleIntensityRangePercentiles,
        RandRotate90,
        RandFlip,
        NormalizeIntensity,
        RandScaleIntensity,
        RandShiftIntensity
    )
from monai.transforms.spatial.functional import resize


# Maps integer datatype codes to numpy dtypes. The codes (2=uint8, 4=int16,
# 8=int32, 16=float32, ...) match the NIfTI-1 header `datatype` field values.
# NOTE(review): unused in the visible snippet — presumably kept for manual
# NIfTI parsing elsewhere; confirm before removing.
_DTYPE_MAP = {
    2: np.uint8,
    4: np.int16,
    8: np.int32,
    16: np.float32,
    64: np.float64,
    256: np.int8,
    512: np.uint16,
    768: np.uint32,
    1024: np.int64,
}
class u2Transform:
    """Preprocess a NIfTI CT volume into fixed-size slice batches for the model.

    Pipeline: percentile intensity rescaling, foreground cropping, adaptive
    in-plane resizing, depth padding/resizing, and a final reshape into
    batches of 32 axial slices.
    """

    def __init__(self, mode='bilinear', device="cpu"):
        """
        Args:
            mode: interpolation mode forwarded to the MONAI ``resize`` call.
            device: torch device the volume tensor is created on.
        """
        # NOTE(review): `source_key` is an argument of the dict transform
        # CropForegroundd; on the array transform CropForeground it falls
        # through to the padder kwargs — confirm this is intended.
        self.adaptive_transforms = Compose(
            [
                ScaleIntensityRangePercentiles(lower=0.5, upper=99.5, b_max=1.0, b_min=0.0, clip=True),
                CropForeground(source_key="image"),
                ToTensor(),
            ]
        )
        self.mode = mode
        # Kept for optional debugging dumps of intermediate volumes.
        self.save = SaveImage(separate_folder=False, output_postfix='')
        self.device = device

    def _resize_volume(self, volume, out_size):
        """Anti-aliased resize of an (H, W, D) volume; returns a (1, H', W', D') tensor."""
        return resize(
            img=volume.unsqueeze(0),
            out_size=out_size,
            mode=self.mode,
            align_corners=True,
            dtype=None,
            input_ndim=3,
            anti_aliasing=True,
            anti_aliasing_sigma=None,
            lazy=False,
            transform_info=None,
        )

    def adaptive_resize(self, input_path, target_image_size=256, padding_size=32*8):
        """Load a NIfTI file and adaptively resize it to a fixed shape.

        The larger in-plane dimension is scaled to ``target_image_size`` (the
        other proportionally), and the depth is zero-padded up — or resized
        down — to ``padding_size``. Returns a tensor of shape
        ``(padding_size // 32, 32, target_image_size, target_image_size)``.

        Note: ``padding_size`` must be a multiple of 32 for the final reshape.
        """
        # On-disk (H, W, D) -> (1, D, H, W) for the channel-first transforms.
        volume = nib.load(input_path).get_fdata().transpose(2, 0, 1)[np.newaxis, ...]
        volume = torch.tensor(volume, device=self.device)
        volume = self.adaptive_transforms(volume)[0]
        volume = torch.permute(volume, (1, 2, 0))  # back to (H, W, D)

        input_shape = volume.shape
        # Scale factor that maps the larger in-plane dimension onto the target size.
        ratio = min(target_image_size / input_shape[i] for i in range(2))
        scaling_shape = [int(input_shape[i] * ratio) for i in range(2)]

        if padding_size >= input_shape[2]:
            # Depth already fits: keep it, then zero-pad up to `padding_size`.
            scaling_shape.append(input_shape[2])
            volume = self._resize_volume(volume, scaling_shape)
            pad_tuple = (
                0, padding_size - scaling_shape[2],
                0, target_image_size - scaling_shape[1],
                0, target_image_size - scaling_shape[0],
            )
        else:
            # Depth too large: resize it down to exactly `padding_size`.
            scaling_shape.append(padding_size)
            volume = self._resize_volume(volume, scaling_shape)
            pad_tuple = (
                0, 0,
                0, target_image_size - scaling_shape[1],
                0, target_image_size - scaling_shape[0],
            )
        volume = F.pad(volume, pad_tuple, mode='constant', value=0)

        # (1, H, W, D) -> (1, D, H, W), then split the depth into 32-slice batches.
        volume = torch.permute(volume, (0, 3, 1, 2))
        return volume.view(-1, 32, target_image_size, target_image_size)

    def __call__(self, *args, **kwds):
        return self.adaptive_resize(*args, **kwds)

def main() -> None:
    """Generate a radiology report for a single CT scan from the command line."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="AlpachinoNLP/u2Qwen3-4B-Thinking")
    parser.add_argument("--image-path", default="example.nii.gz", help="NIfTI file (.nii or .nii.gz).")
    parser.add_argument("--question", default="Please provide a medical analysis of this image.", help="The question about the image.")
    parser.add_argument("--max-new-tokens", type=int, default=8192)
    args = parser.parse_args()

    # bfloat16 on GPU; CPU bfloat16 kernel support is spotty, so fall back to float32.
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    print(dtype)

    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False, trust_remote_code=True)
    # Newer transformers renamed `torch_dtype` to `dtype`; try the new name first.
    try:
        model = AutoModelForCausalLM.from_pretrained(
            args.model,
            trust_remote_code=True,
            dtype=dtype,
            device_map="auto" if torch.cuda.is_available() else None,
        )
    except TypeError:
        model = AutoModelForCausalLM.from_pretrained(
            args.model,
            trust_remote_code=True,
            torch_dtype=dtype,
            device_map="auto" if torch.cuda.is_available() else None,
        )
    device = next(model.parameters()).device
    model.eval()

    image_transforms = u2Transform(mode="bilinear", device=device)
    image = image_transforms(args.image_path).unsqueeze(0).to(dtype)

    # One <im_patch> placeholder per visual token emitted by the projector.
    proj_out_num = getattr(getattr(model.get_model(), "mm_projector", None), "proj_out_num", 256)
    image_tokens = "<im_patch>" * int(proj_out_num)
    prompt = image_tokens + args.question

    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded.get("attention_mask")
    attention_mask = attention_mask.to(device) if attention_mask is not None else None
    question_ids = tokenizer(args.question, add_special_tokens=False, return_tensors="pt")["input_ids"].to(device)

    # Transformers >=4.57 passes `cache_position` to forward() during generation; older custom model
    # implementations might not accept it yet.
    if "cache_position" not in inspect.signature(model.forward).parameters:
        original_forward = model.forward

        def _forward_compat(self, *f_args, **f_kwargs):
            f_kwargs.pop("cache_position", None)
            return original_forward(*f_args, **f_kwargs)

        model.forward = types.MethodType(_forward_compat, model)

    with torch.no_grad():
        output_ids = model.generate(
            images=image,
            inputs=input_ids,
            question_ids=question_ids,
            attention_mask=attention_mask,
            max_new_tokens=args.max_new_tokens,
            # Explicitly pin the stop / pad tokens of the Qwen3 chat format.
            eos_token_id=tokenizer.convert_tokens_to_ids("<|im_end|>"),
            pad_token_id=tokenizer.convert_tokens_to_ids("<|endoftext|>"),
        )

    output_ids = output_ids[0].tolist()

    # Split the "thinking" segment from the final answer at the last </think>
    # token (id 151668 in the Qwen3 vocabulary).
    try:
        index = len(output_ids) - output_ids[::-1].index(151668)
    except ValueError:
        index = 0  # no </think> emitted: treat everything as the answer

    thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

    print("[*]thinking content:", thinking_content)
    print("[*]content:", content)


# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()

Citation

If you find this model useful, please consider citing our paper:

@misc{li2025mu2tokenizerdifferentiablemultiscalemultimodal,
      title={${\mu}^2$Tokenizer: Differentiable Multi-Scale Multi-Modal Tokenizer for Radiology Report Generation}, 
      author={Siyou Li and Pengyao Qin and Huanan Wu and Dong Nie and Arun J. Thirunavukarasu and Juntao Yu and Le Zhang},
      year={2025},
      eprint={2507.00316},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2507.00316}, 
}
Downloads last month
45
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Collection including AlpachinoNLP/u2Qwen3-4B-Thinking

Paper for AlpachinoNLP/u2Qwen3-4B-Thinking