Compositional Text-to-Image Generation Via Region-aware Bimodal Direct Preference Optimization
Paper • 2605.28615 • Published
from diffusers import DiffusionPipeline, AutoencoderKL
import torch
# Inference example: build an SDXL pipeline with a replacement VAE, apply the
# BiDPO LoRA weights on top of it, and generate one image from a text prompt.
sdxl_base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
sdxl_vae_path = "madebyollin/sdxl-vae-fp16-fix"
model_path = "anzeameol/sdxl-BiDPO"

# All weights are loaded in float16, so run on the GPU whenever one is
# available; without CUDA the pipeline stays on the CPU as before.
device = "cuda" if torch.cuda.is_available() else "cpu"

vae = AutoencoderKL.from_pretrained(
    sdxl_vae_path, subfolder=None, torch_dtype=torch.float16
)

# Hand the replacement VAE to from_pretrained so the pipeline is assembled
# with it directly instead of swapping it in after construction.
pipe = DiffusionPipeline.from_pretrained(
    sdxl_base_model_path,
    vae=vae,
    torch_dtype=torch.float16,
    safety_checker=None,
)
pipe.load_lora_weights(
    model_path, weight_name="pytorch_lora_weights.safetensors"
)
pipe = pipe.to(device)

prompt = "a red apple"
# `.images` is a list with one entry per generated image; take the only one.
image = pipe(prompt, num_images_per_prompt=1).images[0]
```bibtex
@InProceedings{Liu_2026_CVPR,
    author    = {Liu, Zhuohan and Peng, Wujian and Chen, Yitong and Wu, Zuxuan},
    title     = {Compositional Text-to-Image Generation Via Region-aware Bimodal Direct Preference Optimization},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2026},
    pages     = {36604--36614}
}
```