QwenImage-TextPecker-SQPA

This model is a LoRA adapter for Qwen/Qwen-Image, trained with TextPecker, a structural-anomaly-perceptive reinforcement learning (RL) strategy. TextPecker is designed to enhance Visual Text Rendering (VTR) by quantifying structural anomalies such as distortion and misalignment and using them as reward signals.

Model Description

Visual Text Rendering (VTR) remains a critical challenge in text-to-image generation. Even advanced models frequently produce text with structural anomalies. TextPecker addresses this using a structural anomaly perceptive RL strategy that works with any text-to-image generator. When applied to Qwen-Image, it yields significant gains in structural fidelity and semantic alignment for text rendering.

Usage

This repository provides only the LoRA weights (SQPA). To use this adapter, first download the Qwen-Image base model and the LoRA checkpoint file.

import os
import torch
from diffusers import DiffusionPipeline
from safetensors.torch import load_file
from peft import LoraConfig, get_peft_model

os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
os.environ["DIFFUSERS_DISABLE_NATIVE_ATTENTION"] = "1"

def load_model(model_path, ckpt_path=None, use_lora=True):
    torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    pipe = DiffusionPipeline.from_pretrained(
        model_path,
        torch_dtype=torch_dtype,
    ).to(device)
    pipe.safety_checker = None

    if ckpt_path is not None and use_lora:
        target_modules = [
            "attn.to_k", "attn.to_q", "attn.to_v", "attn.to_out.0",
            "attn.add_k_proj", "attn.add_q_proj", "attn.add_v_proj", "attn.to_add_out",
            "img_mlp.net.0.proj", "img_mlp.net.2",
            "txt_mlp.net.0.proj", "txt_mlp.net.2",
        ]
        transformer_lora_config = LoraConfig(
            r=64,
            lora_alpha=128,
            init_lora_weights="gaussian",
            target_modules=target_modules,
        )
        
        pipe.transformer = get_peft_model(pipe.transformer, transformer_lora_config)
        
        # load_file expects a local .safetensors path; copy the LoRA weights
        # into the PEFT-wrapped transformer (strict=False skips base weights).
        model_state_dict = load_file(ckpt_path, device="cpu")
        pipe.transformer.load_state_dict(model_state_dict, strict=False)
        print(f"Successfully loaded LoRA weights from: {ckpt_path}")
    
    return pipe

model_id = "Qwen/Qwen-Image"
# Local path to the LoRA .safetensors file downloaded from the
# CIawevy/QwenImage-TextPecker-SQPA repository (load_file needs a file
# path, not a Hub repo id).
lora_ckpt_path = "path/to/QwenImage-TextPecker-SQPA.safetensors"
device = "cuda" if torch.cuda.is_available() else "cpu"

negative_prompt = " "
aspect_ratios = {
    "1:1": (1328, 1328),
    "16:9": (1664, 928),
    "9:16": (928, 1664),
}
width, height = aspect_ratios["1:1"]
num_inference_steps = 50
true_cfg_scale = 4.0

pipe = load_model(model_id, lora_ckpt_path)

prompt = 'a weathered cave explorer\'s journal page, with the phrase "TextPecker" prominently written in faded ink, surrounded by sketches of ancient ruins and cryptic symbols, under a dim, mystical light.'
image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=width,
    height=height,
    num_inference_steps=num_inference_steps,
    true_cfg_scale=true_cfg_scale,
    generator=torch.Generator(device=device).manual_seed(42)
).images[0]

image.save("TextPecker_qwen_demo.png")
print("Image saved to TextPecker_qwen_demo.png")
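The LoraConfig above uses r=64 and lora_alpha=128. As a rough sketch with toy sizes (not the shipped training code), a LoRA layer adds a low-rank update scaled by lora_alpha / r to the frozen base weight; with this configuration the scale is 128 / 64 = 2.0, and because PEFT zero-initializes the up-projection B, a freshly attached adapter leaves the base model's outputs unchanged:

```python
import torch

# Toy illustration of the LoRA update implied by the config above:
# the adapted layer computes W @ x + (lora_alpha / r) * B @ (A @ x),
# where A is (r, in_features) and B is (out_features, r).
torch.manual_seed(0)
in_features, out_features, r, lora_alpha = 8, 4, 2, 4  # toy sizes

W = torch.randn(out_features, in_features)  # frozen base weight
A = torch.randn(r, in_features)             # LoRA down-projection
B = torch.zeros(out_features, r)            # up-projection, zero-initialized
x = torch.randn(in_features)

scaling = lora_alpha / r  # 2.0 here, matching 128 / 64 in the config above
y = W @ x + scaling * (B @ (A @ x))

# With B at zero, the adapter is a no-op until training updates it.
assert torch.allclose(y, W @ x)
print(scaling)
```

Once training moves B away from zero, the low-rank product steers only the attention and MLP projections listed in target_modules, which is why the adapter stays small relative to the base model.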

Citation

@article{zhu2026TextPecker,
  title   = {TextPecker: Rewarding Structural Anomaly Quantification for Enhancing Visual Text Rendering},
  author  = {Zhu, Hanshen and Liu, Yuliang and Wu, Xuecheng and Wang, An-Lan and Feng, Hao and Yang, Dingkang and Feng, Chao and Huang, Can and Tang, Jingqun and Bai, Xiang},
  journal = {arXiv preprint arXiv:2602.20903},
  year    = {2026}
}