| import torch |
| from nunchaku.utils import get_gpu_memory, get_precision |
| from nunchaku.models.transformers.transformer_qwenimage import NunchakuQwenImageTransformer2DModel |
|
|
class QwenBackend:
    """Build a text-to-image and an image-edit Qwen-Image pipeline.

    Both pipelines use ``NunchakuQwenImageTransformer2DModel`` transformers
    and share the scheduler, VAE, text encoder and tokenizer of the edit
    pipeline.

    Args:
        model_id: Identifier or path handed to the various
            ``from_pretrained`` calls for the base pipeline, text encoder
            and tokenizer.
        optimized_model_path: Location of the Nunchaku-optimized
            text-to-image transformer.
        optimized_edit_model_path: Location of the Nunchaku-optimized edit
            transformer. When falsy, the text-to-image transformer is
            reused for the edit pipeline.
        uma: When true, the text encoder is loaded in 8-bit and the
            transformers and VAE are moved to CUDA. When false, offloading
            is enabled instead.
    """

    def __init__(self, model_id, optimized_model_path=None, optimized_edit_model_path=None, uma=False):
        self.model_id = model_id
        self.optimized_model_path = optimized_model_path
        self.optimized_edit_model_path = optimized_edit_model_path
        self.uma = uma
        # Both pipelines stay None until load() has been called.
        self.pipeline = None
        self.edit_pipeline = None
        self.rank = 32

    def load(self):
        """Load all components and assemble both pipelines.

        Returns:
            A ``(pipeline, edit_pipeline)`` tuple: the ``QwenImagePipeline``
            (text-to-image) and the ``QwenImageEditPlusPipeline``. The same
            objects are stored on ``self.pipeline`` and
            ``self.edit_pipeline``.
        """
        print(f"Loading Qwen backend from {self.model_id}...")

        if not self.optimized_model_path:
            # Only a warning: the transformer load below is still attempted
            # with whatever value was given.
            print(
                "Warning: No optimized model path provided for QwenBackend. "
                "This requires the Nunchaku optimized model."
            )

        scheduler = self._build_scheduler()
        transformer_t2i, transformer_edit = self._load_transformers()

        print(f"Loading QwenImagePipeline from {self.model_id}...")

        # Imported lazily, as in the rest of this method's dependencies.
        from diffusers import QwenImagePipeline, QwenImageEditPlusPipeline

        text_encoder = None
        if self.uma:
            print("UMA mode: Loading text_encoder in 8-bit using BitsAndBytes...")
            text_encoder = self._load_text_encoder(eight_bit=True)

        print(f"Loading QwenImageEditPlusPipeline from {self.model_id}...")

        pipeline_kwargs = {
            "transformer": transformer_edit,
            "scheduler": scheduler,
            "torch_dtype": torch.bfloat16,
        }
        if text_encoder is not None:
            pipeline_kwargs["text_encoder"] = text_encoder

        edit_pipeline = QwenImageEditPlusPipeline.from_pretrained(
            self.model_id,
            **pipeline_kwargs,
        )

        print("Creating QwenImagePipeline (T2I) with shared components...")

        if edit_pipeline.text_encoder is None:
            print("Text encoder not found in edit_pipeline, loading manually...")
            if text_encoder is None:
                text_encoder = self._load_text_encoder(eight_bit=False)
            edit_pipeline.register_modules(text_encoder=text_encoder)
        else:
            text_encoder = edit_pipeline.text_encoder

        tokenizer = edit_pipeline.tokenizer
        if tokenizer is None:
            print("Tokenizer not found in edit_pipeline, loading manually...")
            from transformers import AutoTokenizer

            # NOTE(review): trust_remote_code=True allows code shipped with
            # the checkpoint to run; confirm model_id is a trusted source.
            tokenizer = AutoTokenizer.from_pretrained(
                self.model_id, subfolder="tokenizer", trust_remote_code=True
            )
            edit_pipeline.register_modules(tokenizer=tokenizer)

        # The T2I pipeline reuses every component of the edit pipeline
        # except the transformer.
        pipeline = QwenImagePipeline(
            transformer=transformer_t2i,
            scheduler=edit_pipeline.scheduler,
            vae=edit_pipeline.vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
        )

        self._place_on_devices(pipeline, edit_pipeline, transformer_t2i, transformer_edit)

        self.pipeline = pipeline
        self.edit_pipeline = edit_pipeline
        return self.pipeline, self.edit_pipeline

    @staticmethod
    def _build_scheduler():
        """Create the FlowMatchEulerDiscreteScheduler from a fixed config."""
        import math

        from diffusers import FlowMatchEulerDiscreteScheduler

        scheduler_config = {
            "base_image_seq_len": 256,
            "base_shift": math.log(3),
            "invert_sigmas": False,
            "max_image_seq_len": 8192,
            "max_shift": math.log(3),
            "num_train_timesteps": 1000,
            "shift": 1.0,
            "shift_terminal": None,
            "stochastic_sampling": False,
            "time_shift_type": "exponential",
            "use_beta_sigmas": False,
            "use_dynamic_shifting": True,
            "use_exponential_sigmas": False,
            "use_karras_sigmas": False,
        }
        return FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)

    def _load_transformers(self):
        """Load the T2I transformer and the edit transformer.

        Returns:
            ``(transformer_t2i, transformer_edit)``. Both names refer to the
            same object when no ``optimized_edit_model_path`` was given.
        """
        print(f"Loading T2I NunchakuQwenImageTransformer2DModel from {self.optimized_model_path} with FA2...")
        transformer_t2i = NunchakuQwenImageTransformer2DModel.from_pretrained(
            self.optimized_model_path,
            attn_implementation="flash_attention_2",
        )

        if self.optimized_edit_model_path:
            print(f"Loading Edit NunchakuQwenImageTransformer2DModel from {self.optimized_edit_model_path} with FA2...")
            transformer_edit = NunchakuQwenImageTransformer2DModel.from_pretrained(
                self.optimized_edit_model_path,
                attn_implementation="flash_attention_2",
            )
        else:
            print("Using shared transformer for Edit pipeline...")
            transformer_edit = transformer_t2i

        return transformer_t2i, transformer_edit

    def _load_text_encoder(self, *, eight_bit):
        """Load the ``text_encoder`` subfolder of ``model_id`` via AutoModel.

        Args:
            eight_bit: When true, pass a ``BitsAndBytesConfig`` with
                ``load_in_8bit=True`` as the quantization config.
        """
        from transformers import AutoModel

        # NOTE(review): trust_remote_code=True allows code shipped with the
        # checkpoint to run; confirm model_id is a trusted source.
        load_kwargs = {
            "subfolder": "text_encoder",
            "torch_dtype": torch.bfloat16,
            "trust_remote_code": True,
        }
        if eight_bit:
            from transformers import BitsAndBytesConfig

            load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
        return AutoModel.from_pretrained(self.model_id, **load_kwargs)

    @staticmethod
    def _exclude_transformer_from_offload(pipe):
        """Add "transformer" to ``pipe``'s CPU-offload exclusion list once.

        NOTE(review): in diffusers this list appears to be a class-level
        attribute shared between pipeline objects; the membership check
        keeps repeated ``load()`` calls from piling up duplicates either way.
        """
        excluded = pipe._exclude_from_cpu_offload
        if "transformer" not in excluded:
            excluded.append("transformer")

    def _place_on_devices(self, pipeline, edit_pipeline, transformer_t2i, transformer_edit):
        """Move components to CUDA (UMA) or enable offloading (non-UMA)."""
        # Identity, not equality: the question is whether a second, distinct
        # transformer object was loaded for the edit pipeline.
        has_separate_edit_transformer = transformer_edit is not transformer_t2i

        if self.uma:
            print("UMA mode enabled: Text encoder loaded in 8-bit. Moving other components to GPU.")

            print("Moving T2I Transformer to CUDA...")
            transformer_t2i.to("cuda")

            if has_separate_edit_transformer:
                print("Moving Edit Transformer to CUDA...")
                transformer_edit.to("cuda")

            # The VAE object is shared by both pipelines, so one move suffices.
            vae = getattr(edit_pipeline, "vae", None)
            if vae is not None:
                print("Moving VAE to CUDA...")
                vae.to("cuda")
            return

        print("Non-UMA mode: Using aggressive per-layer offloading.")
        transformer_t2i.set_offload(True, use_pin_memory=True, num_blocks_on_gpu=8)
        if has_separate_edit_transformer:
            transformer_edit.set_offload(True, use_pin_memory=True, num_blocks_on_gpu=8)

        # The transformers are offloaded through set_offload() above, so keep
        # them out of each pipeline's sequential CPU offload. Done explicitly
        # for BOTH pipelines rather than relying on the list being shared.
        self._exclude_transformer_from_offload(edit_pipeline)
        self._exclude_transformer_from_offload(pipeline)

        edit_pipeline.enable_sequential_cpu_offload()
        pipeline.enable_sequential_cpu_offload()
|
|