catplusplus committed 9 days ago

Commit

1e103b7

verified ·

1 Parent(s): 8c09cde

Upload folder using huggingface_hub

Browse files

Files changed (24) hide show

.gitattributes +1 -0
chat_template.jinja +117 -0
config.json +121 -0
extras/Flux2Backend.py +70 -0
extras/GlmBackend.py +45 -0
extras/ImageEditServer.py +496 -0
extras/ImageGenClient.py +150 -0
extras/ImageGenServer.py +123 -0
extras/ImageGenServer_cpu.py +307 -0
extras/ImageGenServer_new.py +176 -0
extras/KontextBackend.py +93 -0
extras/NVFP4TextEncoder.py +324 -0
extras/OmniImageEditServer.py +261 -0
extras/QwenBackend.py +174 -0
extras/QwenImageBackend.py +60 -0
extras/ZImageTurboBackend.py +131 -0
extras/compress_mllm.py +83 -0
extras/imagegen_zimage_turbo.sh +18 -0
extras/imagegen_zimage_turbo_int4.sh +11 -0
generation_config.json +13 -0
model.safetensors +3 -0
recipe.yaml +8 -0
tokenizer.json +3 -0
tokenizer_config.json +29 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,117 @@

+{% macro render_extra_keys(json_dict, handled_keys) %}
+    {%- if json_dict is mapping %}
+        {%- for json_key in json_dict if json_key not in handled_keys %}
+            {%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}
+                {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}
+            {%- else %}
+                {{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}
+            {%- endif %}
+        {%- endfor %}
+    {%- endif %}
+{% endmacro %}
+{%- if messages[0]["role"] == "system" %}
+    {%- set system_message = messages[0]["content"] %}
+    {%- set loop_messages = messages[1:] %}
+{%- else %}
+    {%- set loop_messages = messages %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = [] %}
+{%- endif %}
+{%- if system_message is defined %}
+    {{- "<|im_start|>system\n" + system_message }}
+{%- else %}
+    {%- if tools is iterable and tools | length > 0 %}
+        {{- "<|im_start|>system\nYou are Qwen, a helpful AI assistant that can interact with a computer to solve tasks." }}
+    {%- endif %}
+{%- endif %}
+{%- if tools is iterable and tools | length > 0 %}
+    {{- "\n\n# Tools\n\nYou have access to the following functions:\n\n" }}
+    {{- "<tools>" }}
+    {%- for tool in tools %}
+        {%- if tool.function is defined %}
+            {%- set tool = tool.function %}
+        {%- endif %}
+        {{- "\n<function>\n<name>" ~ tool.name ~ "</name>" }}
+        {%- if tool.description is defined %}
+            {{- '\n<description>' ~ (tool.description | trim) ~ '</description>' }}
+        {%- endif %}
+        {{- '\n<parameters>' }}
+        {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}
+            {%- for param_name, param_fields in tool.parameters.properties|items %}
+                {{- '\n<parameter>' }}
+                {{- '\n<name>' ~ param_name ~ '</name>' }}
+                {%- if param_fields.type is defined %}
+                    {{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }}
+                {%- endif %}
+                {%- if param_fields.description is defined %}
+                    {{- '\n<description>' ~ (param_fields.description | trim) ~ '</description>' }}
+                {%- endif %}
+                {%- set handled_keys = ['name', 'type', 'description'] %}
+                {{- render_extra_keys(param_fields, handled_keys) }}
+                {{- '\n</parameter>' }}
+            {%- endfor %}
+        {%- endif %}
+        {% set handled_keys = ['type', 'properties'] %}
+        {{- render_extra_keys(tool.parameters, handled_keys) }}
+        {{- '\n</parameters>' }}
+        {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}
+        {{- render_extra_keys(tool, handled_keys) }}
+        {{- '\n</function>' }}
+    {%- endfor %}
+    {{- "\n</tools>" }}
+    {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+{%- endif %}
+{%- if system_message is defined %}
+    {{- '<|im_end|>\n' }}
+{%- else %}
+    {%- if tools is iterable and tools | length > 0 %}
+        {{- '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in loop_messages %}
+    {%- if message.role == "assistant" and message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content is defined and message.content is string and message.content | trim | length > 0 %}
+            {{- '\n' + message.content | trim + '\n' }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+            {%- if tool_call.arguments is defined %}
+                {%- for args_name, args_value in tool_call.arguments|items %}
+                    {{- '<parameter=' + args_name + '>\n' }}
+                    {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+                    {{- args_value }}
+                    {{- '\n</parameter>\n' }}
+                {%- endfor %}
+            {%- endif %}
+            {{- '</function>\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "user" or message.role == "system" or message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.previtem and loop.previtem.role != "tool" %}
+            {{- '<|im_start|>user\n' }}
+        {%- endif %}
+        {{- '<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>\n' }}
+        {%- if not loop.last and loop.nextitem.role != "tool" %}
+            {{- '<|im_end|>\n' }}
+        {%- elif loop.last %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- else %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,121 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "quantization_config": {
+    "config_groups": {
+      "group_0": {
+        "format": "nvfp4-pack-quantized",
+        "input_activations": {
+          "actorder": null,
+          "block_structure": null,
+          "dynamic": "local",
+          "group_size": 16,
+          "num_bits": 4,
+          "observer": "static_minmax",
+          "observer_kwargs": {},
+          "scale_dtype": "torch.float8_e4m3fn",
+          "strategy": "tensor_group",
+          "symmetric": true,
+          "type": "float",
+          "zp_dtype": null
+        },
+        "output_activations": null,
+        "targets": [
+          "Linear"
+        ],
+        "weights": {
+          "actorder": null,
+          "block_structure": null,
+          "dynamic": false,
+          "group_size": 16,
+          "num_bits": 4,
+          "observer": "memoryless_minmax",
+          "observer_kwargs": {},
+          "scale_dtype": "torch.float8_e4m3fn",
+          "strategy": "tensor_group",
+          "symmetric": true,
+          "type": "float",
+          "zp_dtype": null
+        }
+      }
+    },
+    "format": "nvfp4-pack-quantized",
+    "global_compression_ratio": null,
+    "ignore": [
+      "lm_head"
+    ],
+    "kv_cache_scheme": null,
+    "quant_method": "compressed-tensors",
+    "quantization_status": "compressed",
+    "sparsity_config": {},
+    "transform_config": {},
+    "version": "0.15.1.dev14+g01a1c9a"
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 1000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.2.0",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

extras/Flux2Backend.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import torch
+from transformers import Mistral3ForConditionalGeneration, PixtralProcessor, BitsAndBytesConfig
+from diffusers import Flux2Pipeline, AutoencoderKLFlux2, Flux2Transformer2DModel
+from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+class Flux2Backend:
+    def __init__(self, model_id):
+        self.model_id = model_id
+        self.pipeline = None
+    def load(self):
+        print(f"Loading Flux2 backend from {self.model_id}...")
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+        )
+        # Scheduler
+        scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+            self.model_id,
+            subfolder="scheduler",
+            torch_dtype=torch.bfloat16
+        )
+        # VAE - loaded manually with full precision
+        vae = AutoencoderKLFlux2.from_pretrained(
+            self.model_id,
+            subfolder="vae",
+            torch_dtype=torch.float16
+        )
+        tokenizer = PixtralProcessor.from_pretrained(
+            self.model_id,
+            subfolder="tokenizer",
+            torch_dtype=torch.float16
+        )
+        text_encoder = Mistral3ForConditionalGeneration.from_pretrained(
+            self.model_id,
+            subfolder="text_encoder",
+            torch_dtype=torch.float16,
+            quantization_config=quantization_config
+        )
+        dit = Flux2Transformer2DModel.from_pretrained(
+            self.model_id,
+            subfolder="transformer",
+            torch_dtype=torch.float16,
+            quantization_config=quantization_config
+        )
+        # Standard loading without Nunchaku optimization
+        # Constructing pipeline manually rather than from_pretrained
+        pipeline = Flux2Pipeline(
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=dit,
+        )
+        self.pipeline = pipeline
+        self.pipeline.to("cuda")
+        self.pipeline.transformer.set_attention_backend("flash")
+        return self.pipeline, self.pipeline

extras/GlmBackend.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import torch
+import diffusers
+try:
+    from sdnq import SDNQConfig
+    from sdnq.common import use_torch_compile as triton_is_available
+    from sdnq.loader import apply_sdnq_options_to_model
+    SDNQ_AVAILABLE = True
+except ImportError:
+    print("SDNQ not found, optimized GLM loading will be skipped.")
+    SDNQ_AVAILABLE = False
+class GlmBackend:
+    def __init__(self, model_id="Disty0/GLM-Image-SDNQ-4bit-dynamic"):
+        self.model_id = model_id
+        self.pipeline = None
+    def load(self):
+        print(f"Loading GLM backend from {self.model_id}...")
+        # Load the pipeline
+        # Using bfloat16 as per request snippet
+        pipeline = diffusers.GlmImagePipeline.from_pretrained(
+            self.model_id,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+        )
+        if SDNQ_AVAILABLE:
+            # Enable INT8 MatMul for GPUs if Triton is available
+            if triton_is_available and (torch.cuda.is_available() or torch.xpu.is_available()):
+                print("Applying SDNQ optimizations (INT8 MatMul)...")
+                pipeline.transformer = apply_sdnq_options_to_model(pipeline.transformer, use_quantized_matmul=True)
+                # pipeline.transformer = torch.compile(pipeline.transformer) # Optional, commented out as in snippet
+            else:
+                print("Triton or CUDA/XPU not available, skipping SDNQ optimization.")
+        print("Enabling CPU offload for GLM pipeline...")
+        pipeline.enable_model_cpu_offload()
+        self.pipeline = pipeline
+        # The user stated: "this one uses same pipe line for image generation and editing"
+        # So we return the same pipeline for both.
+        return self.pipeline, self.pipeline

extras/ImageEditServer.py ADDED Viewed

	@@ -0,0 +1,496 @@

+import argparse
+import base64
+import io
+import time
+import torch
+import uvicorn
+import gc
+import asyncio
+import traceback
+from typing import List, Optional, Union
+from contextlib import asynccontextmanager
+from fastapi import FastAPI, HTTPException, UploadFile, File, Form
+from pydantic import BaseModel
+from PIL import Image, ImageOps
+# Argument parsing
+# NOTE: the parser is built and parse_args() is called at module level, so the
+# command line is parsed as soon as this module is imported. The resulting
+# `args` namespace is read by load_model() and by the request handlers.
+parser = argparse.ArgumentParser(description="Flux Image Edit Server with Nunchaku")
+# Network binding.
+parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
+parser.add_argument("--port", type=int, default=8000, help="Port to bind to")
+# Model selection. load_model() compares --model against this exact default
+# string to decide whether the GLM backend should use its own default checkpoint.
+parser.add_argument("--model", type=str, default="black-forest-labs/FLUX.1-Kontext-dev", help="Path or Repo ID of the base model")
+parser.add_argument("--optimized-model", type=str, default=None, help="Path to the optimized Nunchaku model safetensors file")
+parser.add_argument("--optimized-edit-model", type=str, default=None, help="Path to the optimized Nunchaku model safetensors file for editing (optional)")
+parser.add_argument("--backend", type=str, default="kontext", choices=["kontext", "flux2", "qwen", "glm", "zimage"], help="Backend to use: 'kontext', 'flux2', 'qwen', 'glm', or 'zimage'")
+# Per-request fallbacks used when a request omits the corresponding field.
+parser.add_argument("--steps", type=int, default=28, help="Default number of inference steps")
+parser.add_argument("--guidance-scale", type=float, default=3.5, help="Default guidance scale")
+# Backend-specific switches.
+parser.add_argument("--qwenimage", action="store_true", help="Use QwenImageBackend (T2I only) instead of full Qwen edit backend")
+parser.add_argument("--uma", action="store_true", help="Enable Unified Memory Architecture mode (load all to GPU, disable offload)")
+parser.add_argument(
+    "--nvfp4-text-encoder",
+    type=str,
+    default=None,
+    help=(
+        "Path to an NVFP4-pack-quantized HuggingFace text encoder "
+        "(compressed-tensors format). Currently honoured by the zimage backend; "
+        "swaps in vLLM's W4A4 NVFP4 CUTLASS GEMM for ~4x text-encoder VRAM savings."
+    ),
+)
+args = parser.parse_args()
+# FastAPI lifespan handler: calls load_model() once before the application
+# starts serving; nothing is torn down after the yield.
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # Startup logic
+    # load_model() is a plain synchronous call, so startup waits for it and an
+    # exception raised by it propagates out of the lifespan handler.
+    load_model()
+    yield
+    # Shutdown logic (if any) could go here
+app = FastAPI(lifespan=lifespan)
+# Global components
+# Edit output width/height are rounded down to a multiple of this value.
+IMAGE_DIMENSION_ALIGNMENT = 32
+# Both are assigned by load_model() from the pair returned by backend.load();
+# some backends return the same pipeline object for both.
+pipeline = None
+edit_pipeline = None
+# Held by the sleep/wake handlers and by the image handlers while they work,
+# so those operations never overlap.
+request_lock = asyncio.Lock()
+# True once the sleep handler has moved the models to the CPU, until wake-up.
+is_sleeping_flag = False
+# True while a sleep request is waiting for / holding the lock; image handlers
+# answer 503 while it is set, and a wake-up request clears it.
+sleep_requested = False
+def load_model():
+    global pipeline, edit_pipeline
+    try:
+        if args.backend == "kontext":
+            import KontextBackend
+            print(f"Initializing KontextBackend...")
+            backend = KontextBackend.KontextBackend(args.model, args.optimized_model)
+            pipeline, edit_pipeline = backend.load()
+        elif args.backend == "flux2":
+            import Flux2Backend
+            print(f"Initializing Flux2Backend...")
+            backend = Flux2Backend.Flux2Backend(args.model)
+            pipeline, edit_pipeline = backend.load()
+        elif args.backend == "glm":
+            import GlmBackend
+            print(f"Initializing GlmBackend...")
+            # Use provided model or default to the one in the snippet if args.model is generic
+            # The user might pass the specific GLM model via --model, or we default in GlmBackend.
+            # Let's pass args.model if it's not the default flux one, otherwise let GlmBackend use its default.
+            model_to_use = args.model if args.model != "black-forest-labs/FLUX.1-Kontext-dev" else "Disty0/GLM-Image-SDNQ-4bit-dynamic"
+            backend = GlmBackend.GlmBackend(model_to_use)
+            pipeline, edit_pipeline = backend.load()
+        elif args.backend.startswith("qwen"):
+            if args.qwenimage:
+                import QwenImageBackend
+                print(f"Initializing QwenImageBackend (T2I only)...")
+                backend = QwenImageBackend.QwenImageBackend(args.model, args.optimized_model)
+                pipeline, edit_pipeline = backend.load()
+            else:
+                import QwenBackend
+                print(f"Initializing QwenBackend...")
+                backend = QwenBackend.QwenBackend(args.model, args.optimized_model, optimized_edit_model_path=args.optimized_edit_model, uma=args.uma)
+                pipeline, edit_pipeline = backend.load()
+        elif args.backend == "zimage":
+            import ZImageTurboBackend
+            print(f"Initializing ZImageTurboBackend...")
+            backend = ZImageTurboBackend.ZImageTurboBackend(
+                args.model,
+                args.optimized_model,
+                uma=args.uma,
+                nvfp4_text_encoder_path=args.nvfp4_text_encoder,
+            )
+            pipeline, edit_pipeline = backend.load()
+        else:
+            raise ValueError(f"Unknown backend: {args.backend}")
+    except Exception as e:
+        print(f"Oh no! The model refused to wake up: {e}")
+        raise e
+    # Enable progress bar for diffusers
+    import diffusers.utils.logging
+    diffusers.utils.logging.enable_progress_bar()
+    diffusers.utils.logging.set_verbosity_info()
+    print("Model loaded successfully! Ready for editing quests!")
+def flush():
+    """Run a Python garbage-collection pass, then call ``torch.cuda.empty_cache()``."""
+    # Collect first so that tensors only kept alive by unreachable Python
+    # objects are freed before the CUDA cache is emptied.
+    gc.collect()
+    torch.cuda.empty_cache()
+# JSON request body accepted by POST /v1/images/generations.
+# (Documented with comments rather than a class docstring so the generated
+# schema description is left untouched.)
+class ImageGenerationRequest(BaseModel):
+    # Text prompt; the only required field.
+    prompt: str
+    # Number of images requested.
+    n: int = 1
+    # "<width>x<height>" string.
+    size: str = "1024x1024"
+    response_format: str = "b64_json"
+    # NOTE(review): how quality/style are used is not visible in this part of the file.
+    quality: str = "standard"
+    style: str = "vivid"
+    # Optional per-request overrides; presumably None falls back to the CLI
+    # defaults as in the edit endpoint -- TODO confirm in the handler.
+    num_inference_steps: Optional[int] = None
+    guidance_scale: Optional[float] = None
+    negative_prompt: Optional[str] = None
+    seed: Optional[int] = None
+@app.post("/v1/sleep")
+async def sleep_endpoint():
+    global is_sleeping_flag, sleep_requested
+    sleep_requested = True
+    try:
+        async with request_lock:
+            if not is_sleeping_flag and sleep_requested:
+                print("Sleep requested, moving models to CPU...")
+                for p in [pipeline, edit_pipeline]:
+                    if not p: continue
+                    for name, component in p.components.items():
+                        if isinstance(component, torch.nn.Module):
+                            # Special handling for Nunchaku which blocks .to() if offload is True
+                            if hasattr(component, "set_offload") and getattr(component, "offload", False):
+                                component.set_offload(False)
+                                component._nunchaku_was_offloaded = True
+                            try:
+                                component.to("cpu")
+                            except Exception as e:
+                                pass
+                flush()
+                is_sleeping_flag = True
+    finally:
+        sleep_requested = False
+    return {"status": "sleep completed", "is_sleeping": is_sleeping_flag}
+@app.post("/v1/wake_up")
+async def wake_up_endpoint():
+    global is_sleeping_flag, sleep_requested
+    sleep_requested = False
+    async with request_lock:
+        if is_sleeping_flag:
+            print("Waking up, restoring models to CUDA...")
+            for p in [pipeline, edit_pipeline]:
+                if not p: continue
+                excluded = getattr(p, "_exclude_from_cpu_offload", [])
+                for name, component in p.components.items():
+                    if isinstance(component, torch.nn.Module):
+                        if getattr(component, "_nunchaku_was_offloaded", False):
+                            component.set_offload(True, use_pin_memory=True, num_blocks_on_gpu=8)
+                            for attr in ["img_in", "txt_in", "txt_norm", "time_text_embed", "norm_out", "proj_out"]:
+                                if hasattr(component, attr):
+                                    try:
+                                        getattr(component, attr).to("cuda")
+                                    except Exception:
+                                        pass
+                            component._nunchaku_was_offloaded = False
+                        elif not hasattr(component, "_hf_hook") or name in excluded:
+                            try:
+                                component.to("cuda")
+                            except Exception:
+                                pass
+            is_sleeping_flag = False
+    return {"status": "awoken", "is_sleeping": False}
+# Status probe: reports the module-level is_sleeping_flag. It does not take
+# request_lock and does not report a sleep that is requested but not finished.
+@app.get("/v1/is_sleeping")
+async def is_sleeping_endpoint():
+    return {"is_sleeping": is_sleeping_flag}
+@app.get("/v1/memory_stats")
+async def memory_stats_endpoint():
+    """Lightweight introspection endpoint that returns PyTorch's CUDA allocator
+    snapshot. Used to diagnose VRAM/UMA bloat without restarting the server."""
+    stats = {}
+    if torch.cuda.is_available():
+        stats["allocated_gb"] = torch.cuda.memory_allocated() / 1e9
+        stats["reserved_gb"] = torch.cuda.memory_reserved() / 1e9
+        stats["max_allocated_gb"] = torch.cuda.max_memory_allocated() / 1e9
+        stats["max_reserved_gb"] = torch.cuda.max_memory_reserved() / 1e9
+        # Top allocations by size from the allocator snapshot (>=64 MiB)
+        try:
+            snap = torch.cuda.memory_snapshot()
+            blocks = []
+            for seg in snap:
+                for b in seg.get("blocks", []):
+                    if b.get("state") == "active_allocated" and b.get("size", 0) >= 64 * 1024 * 1024:
+                        blocks.append(b["size"])
+            blocks.sort(reverse=True)
+            stats["large_active_blocks_gb"] = [round(s / 1e9, 3) for s in blocks[:20]]
+            stats["large_active_blocks_total_gb"] = round(sum(blocks) / 1e9, 3)
+            stats["large_active_blocks_count"] = len(blocks)
+        except Exception as e:
+            stats["snapshot_error"] = str(e)
+        # Walk Python objects to find big tensors and group them
+        try:
+            import gc as _gc
+            seen = set()
+            big = []
+            for obj in _gc.get_objects():
+                try:
+                    if isinstance(obj, torch.Tensor) and obj.is_cuda:
+                        ptr = obj.data_ptr()
+                        if ptr in seen or ptr == 0:
+                            continue
+                        seen.add(ptr)
+                        sz = obj.element_size() * obj.numel()
+                        if sz >= 16 * 1024 * 1024:
+                            big.append((sz, tuple(obj.shape), str(obj.dtype)))
+                except Exception:
+                    continue
+            big.sort(reverse=True)
+            # Group by (shape, dtype)
+            from collections import Counter
+            grouped = Counter((shape, dtype) for _, shape, dtype in big)
+            stats["big_tensor_groups"] = [
+                {"shape": list(shape), "dtype": dtype, "count": cnt,
+                 "size_gb_each": round(
+                     (1 if shape == () else (lambda l: __import__('functools').reduce(lambda a, b: a*b, l, 1))(shape)) * (
+                         8 if 'int64' in dtype or 'float64' in dtype else
+                         4 if 'int32' in dtype or 'float32' in dtype else
+                         2 if 'bfloat16' in dtype or 'float16' in dtype else 1
+                     ) / 1e9, 4)}
+                for (shape, dtype), cnt in grouped.most_common(30)
+            ]
+            stats["big_tensor_count"] = len(big)
+            stats["big_tensor_total_gb"] = round(sum(s for s, _, _ in big) / 1e9, 3)
+        except Exception as e:
+            stats["walk_error"] = str(e)
+    return stats
+@app.post("/v1/images/edits")
+async def edit_image(
+    image: Union[List[UploadFile], UploadFile] = File(...),
+    prompt: str = Form(...),
+    n: int = Form(1),
+    size: str = Form("1024x1024"),
+    response_format: str = Form("b64_json"), # Default to b64_json
+    guidance_scale: Optional[float] = Form(None),
+    num_inference_steps: Optional[int] = Form(None),
+    negative_prompt: Optional[str] = Form(None),
+    seed: Optional[int] = Form(None)
+):
+    # Use CLI defaults if not provided
+    steps = num_inference_steps if num_inference_steps is not None else args.steps
+    cfg_scale = guidance_scale if guidance_scale is not None else args.guidance_scale
+    neg_prompt = negative_prompt if negative_prompt is not None else "" # Default empty for now, or maybe None?
+    generator = None
+    import random
+    if seed is None:
+        seed = random.randint(0, 2**32 - 1)
+    print(f"Using seed: {seed}")
+    generator = torch.Generator(device="cuda").manual_seed(seed)
+    if not edit_pipeline:
+        raise HTTPException(status_code=500, detail="Model not loaded")
+    if sleep_requested or is_sleeping_flag:
+        raise HTTPException(status_code=503, detail="Server is sleeping or trying to sleep.")
+    async with request_lock:
+        print(f"Received edit request: {prompt}")
+        # Processing the input image(s)
+        input_files = image if isinstance(image, list) else [image]
+        init_images = []
+        try:
+            for img_file in input_files:
+                await img_file.seek(0)
+                contents = await img_file.read()
+                img = Image.open(io.BytesIO(contents)).convert("RGB")
+                init_images.append(img)
+        except Exception as e:
+            raise HTTPException(status_code=400, detail=f"Invalid image file: {e}")
+        if not init_images:
+             raise HTTPException(status_code=400, detail="No images provided")
+        # Parse max target dimensions from requested size
+        try:
+            target_width, target_height = map(int, size.split("x"))
+        except ValueError:
+            target_width, target_height = 1024, 1024
+        # Calculate new dimensions preserving aspect ratio based on the first image
+        first_image = init_images[0]
+        orig_width, orig_height = first_image.size
+        scale = min(target_width / orig_width, target_height / orig_height)
+        new_width = int(orig_width * scale)
+        new_height = int(orig_height * scale)
+        # Ensure dimensions are aligned to 32 for compatibility (e.g. GLM-Image)
+        width = (new_width // IMAGE_DIMENSION_ALIGNMENT) * IMAGE_DIMENSION_ALIGNMENT
+        height = (new_height // IMAGE_DIMENSION_ALIGNMENT) * IMAGE_DIMENSION_ALIGNMENT
+        # Resize input images to match the calculated target size, padding if necessary
+        resized_images = []
+        for img in init_images:
+            if img.size != (width, height):
+                # Use ImageOps.pad to preserve aspect ratio and center in the target size
+                # This handles cases where subsequent images might have different ARs
+                img = ImageOps.pad(img, (width, height), method=Image.LANCZOS, color=(0, 0, 0))
+            resized_images.append(img)
+        # If single image, pass as item, if multiple, pass as list
+        # GLM pipeline has a bug where it checks len() on the input, so it must be a list
+        if len(resized_images) > 1 or args.backend == "glm":
+            image_input = resized_images
+        else:
+            image_input = resized_images[0]
+        response_images = []
+        try:
+            if args.backend.startswith("qwen"):
+                # Qwen specific parameters
+                # guidance_scale maps to true_cfg_scale
+                if args.qwenimage: # QwenImageBackend is T2I only, so it doesn't take an image
+                    generated_images = edit_pipeline(
+                        prompt=prompt,
+                        height=height,
+                        width=width,
+                        num_inference_steps=steps,
+                        true_cfg_scale=cfg_scale,
+                        num_images_per_prompt=n,
+                        generator=generator,
+                    ).images
+                else: # Full Qwen edit backend takes an image (or list of images now)
+                    generated_images = edit_pipeline(
+                        image=image_input,
+                        prompt=prompt,
+                        height=height,
+                        width=width,
+                        negative_prompt=neg_prompt,
+                        num_inference_steps=steps,
+                        true_cfg_scale=cfg_scale,
+                        num_images_per_prompt=n,
+                        generator=generator,
+                    ).images
+            else:
+                # Standard Flux/Kontext or GLM
+                # GLM I2I Fix: Manually move vision encoder to GPU because get_image_features escapes hooks
+                if args.backend == "glm" and hasattr(edit_pipeline, "vision_language_encoder"):
+                     print("Manually moving GLM Vision Encoder to GPU...")
+                     edit_pipeline.vision_language_encoder.to("cuda")
+                try:
+                    generated_images = edit_pipeline(
+                        image=image_input,
+                        prompt=prompt,
+                        height=height,
+                        width=width,
+                        num_inference_steps=steps,
+                        guidance_scale=cfg_scale,
+                        num_images_per_prompt=n,
+                        generator=generator,
+                    ).images
+                finally:
+                    if args.backend == "glm" and hasattr(edit_pipeline, "vision_language_encoder"):
+                         print("Moving GLM Vision Encoder back to CPU...")
+                         edit_pipeline.vision_language_encoder.to("cpu")
+            for img in generated_images:
+                buffered = io.BytesIO()
+                img.save(buffered, format="PNG")
+                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                if response_format == "b64_json":
+                    response_images.append({"b64_json": img_str})
+                else:
+                    # If url is requested we can't really do it without storage, so we fallback or error?
+                    # For now, let's just assume simple b64_json as per request
+                    response_images.append({"b64_json": img_str}) # Fallback
+        except Exception as e:
+            print(f"Error during editing: {e}")
+            print(traceback.format_exc())
+            raise HTTPException(status_code=500, detail=str(e))
+        finally:
+            flush()
+        return {
+            "created": int(time.time()),
+            "data": response_images
+        }
+@app.post("/v1/images/generations")
+async def generate_image(request: ImageGenerationRequest):
+    if not pipeline:
+        raise HTTPException(status_code=500, detail="Model not loaded")
+    if sleep_requested or is_sleeping_flag:
+        raise HTTPException(status_code=503, detail="Server is sleeping or trying to sleep.")
+    async with request_lock:
+        #print(f"Received generation request: {request.prompt}")
+        # Parse size
+        try:
+            width, height = map(int, request.size.split("x"))
+        except ValueError:
+            width, height = 1024, 1024
+        # Ensure dimensions are aligned to 32
+        width = (width // IMAGE_DIMENSION_ALIGNMENT) * IMAGE_DIMENSION_ALIGNMENT
+        height = (height // IMAGE_DIMENSION_ALIGNMENT) * IMAGE_DIMENSION_ALIGNMENT
+        response_images = []
+        try:
+            # Generate images (no image argument for txt2img!)
+            steps = request.num_inference_steps if request.num_inference_steps is not None else args.steps
+            cfg_scale = request.guidance_scale if request.guidance_scale is not None else args.guidance_scale
+            # negative_prompt not in standard request body in original snippet, but we added it to model
+            neg_prompt = request.negative_prompt if request.negative_prompt is not None else ""
+            generator = None
+            import random
+            seed = request.seed
+            if seed is None:
+                seed = random.randint(0, 2**32 - 1)
+            print(f"Using seed: {seed}")
+            generator = torch.Generator(device="cuda").manual_seed(seed)
+            if args.backend.startswith("qwen"):
+                 generated_images = pipeline(
+                    prompt=request.prompt,
+                    height=height,
+                    width=width,
+                    num_inference_steps=steps,
+                    true_cfg_scale=cfg_scale,
+                    num_images_per_prompt=request.n,
+                    negative_prompt=neg_prompt,
+                    generator=generator,
+                ).images
+            else:
+                generated_images = pipeline(
+                    prompt=request.prompt,
+                    height=height,
+                    width=width,
+                    num_inference_steps=steps,
+                    guidance_scale=cfg_scale,
+                    num_images_per_prompt=request.n,
+                    generator=generator,
+                    # Not passing negative_prompt here for generation unless we confirm support in standard Flux pipeline?
+                ).images
+            for img in generated_images:
+                buffered = io.BytesIO()
+                img.save(buffered, format="PNG")
+                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                response_images.append({"b64_json": img_str})
+        except Exception as e:
+            print(f"Error during generation: {e}")
+            print(traceback.format_exc())
+            raise HTTPException(status_code=500, detail=str(e))
+        finally:
+            flush()
+        return {
+            "created": int(time.time()),
+            "data": response_images
+        }
+# Script entry point: serve the FastAPI app on the host/port taken from args.
+if __name__ == "__main__":
+    uvicorn.run(app, host=args.host, port=args.port)

extras/ImageGenClient.py ADDED Viewed

	@@ -0,0 +1,150 @@

+import argparse
+import sys
+import os
+import base64
+import time
+import io
+import requests
+from PIL import Image
+# Oh, hello there! Nikola here, ready to help this little client talk to the big server!
+# It's like sending a messenger bird from our village to the capital!
+def main():
+    # Peeking at the arguments... gotta make sure we have all our supplies for the journey!
+    parser = argparse.ArgumentParser(description="ImageGen Client - A little seeker tool!")
+    parser.add_argument("--host", type=str, default="localhost", help="Where the server lives (Host)")
+    parser.add_argument("--port", type=int, default=8000, help="The door to knock on (Port)")
+    parser.add_argument("--num_images", type=int, default=1, help="How many pictures to paint?")
+    parser.add_argument("--image_folder", type=str, default="generated_images", help="Where to keep our treasures")
+    # Changing defaults to None so we can use input image size if needed!
+    parser.add_argument("--width", type=int, default=None, help="Canvas width (default: 1024 or input image size)")
+    parser.add_argument("--height", type=int, default=None, help="Canvas height (default: 1024 or input image size)")
+    # New shiny tools for our quest!
+    parser.add_argument("--input", type=str, default=None, help="Path to an input image (for image-to-image magic!)")
+    parser.add_argument("--max-size", type=int, default=1024, help="Max size for the input image (we don't want it to get too heavy for the bird!)")
+    args = parser.parse_args()
+    # Reading the prompt from the spirits... I mean, stdin!
+    # "What do you desire to see?" *sparkle*
+    print("Waiting for a prompt from stdin... (Type something and press Ctrl+D!)")
+    try:
+        prompt = sys.stdin.read().strip()
+    except Exception as e:
+        print(f"Oh no! The spirits were silent (stdin error): {e}")
+        return
+    if not prompt:
+        print("Aww, the prompt was empty! The canvas remains blank.")
+        return
+    print(f"Yay! We got a prompt: '{prompt}'")
+    # Restoring the canvas size variables from the journey's start!
+    final_width = args.width
+    final_height = args.height
+    # Prepare prompt and payload
+    url_gen = f"http://{args.host}:{args.port}/v1/images/generations"
+    url_edit = f"http://{args.host}:{args.port}/v1/images/edits"
+    try:
+        if args.input:
+            print(f"Oh! You brought a reference image: {args.input}. Let's go to the Editing Shrine!")
+            # Prepare for multipart upload
+            # We need to open the image file effectively
+            if not os.path.exists(args.input):
+                print(f"Eek! I can't find the image at {args.input}")
+                return
+            # Open image to ensure it's valid and memory-friendly resize if needed
+            with Image.open(args.input) as img:
+                img = img.convert("RGB")
+                w, h = img.size
+                max_dim = max(w, h)
+                if max_dim > args.max_size:
+                    scale = args.max_size / max_dim
+                    new_w = int(w * scale)
+                    new_h = int(h * scale)
+                    print(f"Resizing big image from {w}x{h} to {new_w}x{new_h}. Compact and cute!")
+                    img = img.resize((new_w, new_h), Image.LANCZOS)
+                if final_width is None: final_width = img.width
+                if final_height is None: final_height = img.height
+                # Save to buffer for upload
+                buffered = io.BytesIO()
+                img.save(buffered, format="PNG")
+                buffered.seek(0)
+                image_bytes = buffered.getvalue()
+            # Construct multipart payload
+            files = {
+                'image': ('input.png', image_bytes, 'image/png')
+            }
+            data = {
+                'prompt': prompt,
+                'n': args.num_images,
+                'size': f"{final_width}x{final_height}",
+                'response_format': 'b64_json',
+                'guidance_scale': 2.5 # Default specific to edit/kontext if needed
+            }
+            print(f"Sending input image to {url_edit}... *whoosh*")
+            response = requests.post(url_edit, files=files, data=data)
+        else:
+            # Standard Generation
+            print("Just a prompt? Off to the Creation Forge!")
+            if final_width is None: final_width = 1024
+            if final_height is None: final_height = 1024
+            payload = {
+                "prompt": prompt,
+                "n": args.num_images,
+                "size": f"{final_width}x{final_height}",
+                "response_format": "b64_json"
+            }
+            print(f"Sending prompt to {url_gen}... *sparkle*")
+            response = requests.post(url_gen, json=payload)
+        response.raise_for_status()
+        data = response.json()
+        # Making sure we have a chest for our treasures
+        if not os.path.exists(args.image_folder):
+            print(f"Creating a new treasure chest at {args.image_folder}...")
+            os.makedirs(args.image_folder)
+        # Unpacking the magic
+        images = data.get("data", [])
+        print(f"Ooh! The server sent back {len(images)} masterpieces!")
+        for i, img_data in enumerate(images):
+            # Decoding the spell
+            img_bytes = base64.b64decode(img_data["b64_json"])
+            timestamp = int(time.time())
+            filename = f"image_{timestamp}_{i}.png"
+            filepath = os.path.join(args.image_folder, filename)
+            with open(filepath, "wb") as f:
+                f.write(img_bytes)
+            print(f"Saved masterpiece #{i+1} to {filepath}! It sparkles!")
+    except requests.exceptions.ConnectionError:
+        print("Oh no! The server didn't answer. Is it sleeping? (Connection Refused)")
+        print("Maybe check if the host and port are correct? We tried: " + url)
+    except Exception as e:
+        print(f"Eek! Something went wrong on the journey: {e}")
+        # We'll give it a gentle hug and try to understand...
+        print("Don't worry, we can try again later!")
+# Run the client only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

extras/ImageGenServer.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import argparse
+import base64
+import io
+import time
+import torch
+import uvicorn
+import gc
+import asyncio
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from diffusers import FluxPipeline
+from nunchaku import NunchakuFluxTransformer2dModel
+# Argument parsing
+# NOTE: arguments are parsed at import time, so importing this module also
+# consumes sys.argv and requires --optimized-model to be present.
+parser = argparse.ArgumentParser(description="Flux Image Generation Server with Nunchaku")
+parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
+parser.add_argument("--port", type=int, default=8000, help="Port to bind to")
+parser.add_argument("--model", type=str, default="black-forest-labs/FLUX.1-dev", help="Path or Repo ID of the base model")
+parser.add_argument("--optimized-model", type=str, required=True, help="Path to the optimized Nunchaku model safetensors file")
+args = parser.parse_args()
+app = FastAPI()
+# Global components
+# pipeline stays None until load_model() assigns it.
+pipeline = None
+# Held by the generation handler for the whole request, so requests run one at a time.
+request_lock = asyncio.Lock()
+def load_model():
+    global pipeline
+    print(f"Loading base model from {args.model}...")
+    print(f"Loading optimized transformer from {args.optimized_model}...")
+    try:
+        # Load the optimized transformer
+        transformer = NunchakuFluxTransformer2dModel.from_pretrained(args.optimized_model)
+        # Load the pipeline with the optimized transformer
+        pipeline = FluxPipeline.from_pretrained(
+            args.model,
+            transformer=transformer,
+            torch_dtype=torch.bfloat16,
+        ).to("cuda")
+        pipeline.transformer.set_attention_backend("flash")
+        pipeline.enable_model_cpu_offload()
+        pipeline.enable_vae_tiling()
+        pipeline.enable_vae_slicing()
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        raise e
+    print("Model loaded successfully!")
+def flush():
+    """Run Python garbage collection, then empty the CUDA caching allocator."""
+    gc.collect()
+    torch.cuda.empty_cache()
+class ImageGenerationRequest(BaseModel):
+    """Request body accepted by POST /v1/images/generations."""
+    # Text prompt handed to the pipeline.
+    prompt: str
+    # Number of images; passed as num_images_per_prompt.
+    n: int = 1
+    # "WIDTHxHEIGHT" string parsed by the handler.
+    size: str = "1024x1024"
+    # The next three fields are accepted but not read by the handler in this
+    # file; responses always carry b64_json.
+    response_format: str = "b64_json"
+    quality: str = "standard"
+    style: str = "vivid"
+@app.on_event("startup")
+async def startup_event():
+    """Load the model once when the FastAPI application starts."""
+    load_model()
+@app.post("/v1/images/generations")
+async def generate_image(request: ImageGenerationRequest):
+    if not pipeline:
+        raise HTTPException(status_code=500, detail="Model not loaded")
+    async with request_lock:
+        print(f"Received request: {request.prompt}")
+        # Parse size
+        try:
+            width, height = map(int, request.size.split("x"))
+        except ValueError:
+            width, height = 1024, 1024
+        # Flux requires dimensions to be multiples of 16 (or 8 depending on VAE)
+        # Standard Flux dev usually works well with 1024x1024
+        # We'll ensure they are divisible by 16 just in case
+        width = (width // 16) * 16
+        height = (height // 16) * 16
+        images = []
+        try:
+            # Generate images
+            generated_images = pipeline(
+                request.prompt,
+                height=height,
+                width=width,
+                num_inference_steps=4, # Standard for Flux Dev
+                guidance_scale=3.5,     # Nunchaku example uses 3.5, previous code used 4.0. Let's stick to 3.5 or 4.0. Example says 3.5.
+                num_images_per_prompt=request.n
+            ).images
+            for image in generated_images:
+                buffered = io.BytesIO()
+                image.save(buffered, format="PNG")
+                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                images.append({"b64_json": img_str})
+        except Exception as e:
+            print(f"Error during generation: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
+        finally:
+            flush()
+        return {
+            "created": int(time.time()),
+            "data": images
+        }
+# Script entry point: serve the FastAPI app on the host/port taken from args.
+if __name__ == "__main__":
+    uvicorn.run(app, host=args.host, port=args.port)

extras/ImageGenServer_cpu.py ADDED Viewed

	@@ -0,0 +1,307 @@

+import argparse
+import base64
+import io
+import time
+import torch
+import uvicorn
+import numpy as np
+import gc
+import asyncio
+from fastapi import FastAPI, HTTPException, Request
+from accelerate import infer_auto_device_map, dispatch_model
+from pydantic import BaseModel
+from diffusers import (
+    Flux2Pipeline,
+    Flux2Transformer2DModel,
+    AutoencoderKLFlux2,
+    FlowMatchEulerDiscreteScheduler
+)
+from diffusers.pipelines.flux2.pipeline_flux2 import compute_empirical_mu, retrieve_timesteps
+from diffusers.pipelines.flux2.image_processor import Flux2ImageProcessor
+from transformers import Mistral3ForConditionalGeneration, AutoProcessor
+# Argument parsing
+parser = argparse.ArgumentParser(description="Flux2 Image Generation Server")
+parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
+parser.add_argument("--port", type=int, default=8000, help="Port to bind to")
+parser.add_argument("--model", type=str, default="black-forest-labs/FLUX.1-dev", help="Path or Repo ID of the model")
+args = parser.parse_args()
+app = FastAPI()
+# Global components
+text_encoder = None
+tokenizer = None
+transformer = None
+vae = None
+scheduler = None
+image_processor = None
+request_lock = asyncio.Lock()
+# Device maps
+text_encoder_map = None
+transformer_map = None
+vae_map = None
+GPU_MEMORY_FRACTION = 0.90
+def load_model():
+    """Load every Flux2 component onto the CPU and precompute device maps.
+
+    Fills the module-level globals: text encoder, tokenizer, transformer and
+    VAE (each loaded from its subfolder of ``args.model`` with
+    ``device_map="cpu"``), the scheduler, and the image processor. For the
+    text encoder, transformer and VAE it also computes a device map with
+    ``infer_auto_device_map`` under the hard-coded ``max_memory`` budget.
+    Any failure is printed and re-raised.
+    """
+    global text_encoder, tokenizer, transformer, vae, scheduler, image_processor
+    global text_encoder_map, transformer_map, vae_map
+    print(f"Loading model from {args.model}...")
+    try:
+        print("Loading Flux2 components...")
+        # Calculate max memory per GPU
+        # (automatic per-GPU calculation is disabled; the fixed budget below is used instead)
+        #max_memory = {}
+        #if torch.cuda.is_available():
+        #    for i in range(torch.cuda.device_count()):
+        #        total_mem = torch.cuda.get_device_properties(i).total_memory
+        #        max_memory[i] = int(total_mem * GPU_MEMORY_FRACTION)
+        # NOTE(review): these limits are tuned for one specific machine -- adjust per host.
+        max_memory = {
+          0: "5GB",   # leave a little headroom
+ #        1: "10GB",
+          "cpu": "120GB"  # your 128GB RAM minus OS
+        }
+        # Load Text Encoder (Mistral3) on CPU
+        print("Loading Text Encoder on CPU...")
+        text_encoder = Mistral3ForConditionalGeneration.from_pretrained(
+            args.model,
+            subfolder="text_encoder",
+            torch_dtype=torch.bfloat16,
+            device_map="cpu"
+        )
+        print("Calculating Text Encoder device map...")
+        # The map is only computed here; nothing is dispatched to it at load time.
+        text_encoder_map = infer_auto_device_map(text_encoder, max_memory=max_memory)
+        # Load Tokenizer on CPU
+        print("Loading Tokenizer on CPU...")
+        tokenizer = AutoProcessor.from_pretrained(
+            args.model,
+            subfolder="tokenizer",
+            device_map="cpu"
+        )
+        # Load Transformer on CPU
+        print("Loading Transformer on CPU...")
+        transformer = Flux2Transformer2DModel.from_pretrained(
+            args.model,
+            subfolder="transformer",
+            torch_dtype=torch.bfloat16,
+            device_map="cpu"
+        )
+        print("Calculating Transformer device map...")
+        transformer_map = infer_auto_device_map(transformer, max_memory=max_memory)
+        # Load VAE on CPU
+        print("Loading VAE on CPU...")
+        vae = AutoencoderKLFlux2.from_pretrained(
+            args.model,
+            subfolder="vae",
+            torch_dtype=torch.bfloat16,
+            device_map="cpu"
+        )
+        print("Calculating VAE device map...")
+        vae_map = infer_auto_device_map(vae, max_memory=max_memory)
+        # Initialize Scheduler
+        print("Initializing Scheduler...")
+        scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
+            args.model,
+            subfolder="scheduler"
+        )
+        # Initialize Image Processor
+        print("Initializing Image Processor...")
+        # VAE scale factor logic from pipeline
+        vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
+        # Doubled, presumably to account for the 2x2 latent patching -- TODO confirm.
+        image_processor = Flux2ImageProcessor(vae_scale_factor=vae_scale_factor * 2)
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        raise e
+    print("Model loaded successfully!")
+def flush():
+    """Run Python garbage collection, then empty the CUDA caching allocator."""
+    gc.collect()
+    torch.cuda.empty_cache()
+class ImageGenerationRequest(BaseModel):
+    """Request body accepted by POST /v1/images/generations."""
+    # Text prompt encoded by the text encoder.
+    prompt: str
+    # Number of images; the handler runs one full generation per image.
+    n: int = 1
+    # "WIDTHxHEIGHT" string parsed by the handler.
+    size: str = "1024x1024"
+    # The next three fields are accepted but not read by the handler in this
+    # file; responses always carry b64_json.
+    response_format: str = "b64_json"
+    quality: str = "standard"
+    style: str = "vivid"
+@app.on_event("startup")
+async def startup_event():
+    """Load all model components once when the FastAPI application starts."""
+    load_model()
+@app.post("/v1/images/generations")
+async def generate_image(request: ImageGenerationRequest):
+    if not transformer:
+        raise HTTPException(status_code=500, detail="Model not loaded")
+    async with request_lock:
+        print(f"Received request: {request.prompt}")
+        # Parse size
+        try:
+            width, height = map(int, request.size.split("x"))
+        except ValueError:
+            width, height = 1024, 1024
+        num_inference_steps = 28
+        guidance_scale = 4.0
+        max_sequence_length = 512
+        device = torch.device("cuda")
+        dtype = torch.bfloat16
+        images = []
+        # 1. Generate embeddings on CPU
+        print("Generating embeddings...")
+        flush()
+        prompt_embeds = Flux2Pipeline._get_mistral_3_small_prompt_embeds(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            prompt=request.prompt,
+#            device=torch.device("cpu"),
+            max_sequence_length=max_sequence_length
+        )
+#       prompt_embeds = prompt_embeds.to("cuda")
+        # 2. Prepare Latents
+        # Flux latents are turned into 2x2 patches and packed.
+        # This means the latent width and height has to be divisible by the patch size.
+        # So the vae scale factor is multiplied by the patch size to account for this
+        vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
+        height = height or 1024
+        width = width or 1024
+        # Resize to be divisible by vae_scale_factor * 2
+        height = 2 * (int(height) // (vae_scale_factor * 2))
+        width = 2 * (int(width) // (vae_scale_factor * 2))
+        num_channels_latents = transformer.config.in_channels // 4
+        shape = (1, num_channels_latents * 4, height // 2, width // 2)
+        # 3. Prepare IDs
+        # We need to prepare text_ids and latent_ids
+        # prompt_embeds shape: (batch_size, seq_len, hidden_dim)
+        batch_size, seq_len, _ = prompt_embeds.shape
+        # Repeat for num_images_per_prompt (assuming 1 for now per loop iteration)
+        # If request.n > 1, we loop outside or handle batching. Here we loop outside.
+        # Prepare text IDs
+        text_ids = Flux2Pipeline._prepare_text_ids(prompt_embeds).to(device)
+        for _ in range(request.n):
+            # Generate random latents
+            latents = torch.randn(shape, device=device, dtype=dtype)
+            # Prepare latent IDs
+            latent_ids = Flux2Pipeline._prepare_latent_ids(latents).to(device)
+            # Pack latents
+            packed_latents = Flux2Pipeline._pack_latents(latents)
+            # 4. Prepare Timesteps
+            sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+            image_seq_len = packed_latents.shape[1]
+            mu = compute_empirical_mu(image_seq_len=image_seq_len, num_steps=num_inference_steps)
+            timesteps, num_inference_steps = retrieve_timesteps(
+                scheduler,
+                num_inference_steps,
+                device,
+                sigmas=sigmas,
+                mu=mu,
+            )
+            # --- SWAP TRANSFORMER TO CUDA ---
+            print("Moving Transformer to CUDA...")
+            flush()
+            dispatch_model(transformer, device_map=transformer_map)
+            # 5. Denoising Loop
+            print("Starting denoising loop on CUDA...")
+            scheduler.set_begin_index(0)
+            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
+            guidance = guidance.expand(packed_latents.shape[0])
+            for i, t in enumerate(timesteps):
+                start_time = time.time()
+                # broadcast to batch dimension
+                timestep = t.expand(packed_latents.shape[0]).to(packed_latents.dtype)
+                noise_pred = transformer(
+                    hidden_states=packed_latents,
+                    timestep=timestep / 1000,
+                    guidance=guidance,
+                    encoder_hidden_states=prompt_embeds,
+                    txt_ids=text_ids,
+                    img_ids=latent_ids,
+                    return_dict=False,
+                )[0]
+                # step
+                packed_latents = scheduler.step(noise_pred, t, packed_latents, return_dict=False)[0]
+                step_time = time.time() - start_time
+                print(f"Step {i+1}/{num_inference_steps}: {step_time:.2f}s")
+            # --- SWAP TRANSFORMER TO CPU ---
+            print("Moving Transformer to CPU...")
+            transformer.to("cpu")
+            flush()
+            # --- SWAP VAE TO CUDA ---
+            print("Moving VAE to CUDA...")
+            dispatch_model(vae, device_map=vae_map)
+            # 6. Decode
+            print("Decoding on CUDA...")
+            # Move packed_latents to CUDA for decoding (already there, but ensuring)
+            packed_latents = packed_latents.to(device)
+            latent_ids = latent_ids.to(device)
+            latents = Flux2Pipeline._unpack_latents_with_ids(packed_latents, latent_ids)
+            latents_bn_mean = vae.bn.running_mean.view(1, -1, 1, 1).to(latents.device, latents.dtype)
+            latents_bn_std = torch.sqrt(vae.bn.running_var.view(1, -1, 1, 1) + vae.config.batch_norm_eps).to(
+                latents.device, latents.dtype
+            )
+            latents = latents * latents_bn_std + latents_bn_mean
+            latents = Flux2Pipeline._unpatchify_latents(latents)
+            image = vae.decode(latents, return_dict=False)[0]
+            image = image_processor.postprocess(image, output_type="pil")[0]
+            # --- SWAP VAE TO CPU ---
+            print("Moving VAE to CPU...")
+            vae.to("cpu")
+            # Convert to base64
+            buffered = io.BytesIO()
+            image.save(buffered, format="PNG")
+            img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+            images.append({"b64_json": img_str})
+        return {
+            "created": int(time.time()),
+            "data": images
+        }
+# Script entry point: serve the FastAPI app on the host/port taken from args.
+if __name__ == "__main__":
+    uvicorn.run(app, host=args.host, port=args.port)

extras/ImageGenServer_new.py ADDED Viewed

	@@ -0,0 +1,176 @@

+import argparse
+import base64
+import io
+import time
+import torch
+import uvicorn
+import gc
+import asyncio
+from typing import Optional
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from diffusers import FluxPipeline, FluxKontextPipeline
+from nunchaku import NunchakuFluxTransformer2dModel
+from PIL import Image
+# Argument parsing
+parser = argparse.ArgumentParser(description="Flux Image Generation Server with Nunchaku")
+parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
+parser.add_argument("--port", type=int, default=8000, help="Port to bind to")
+parser.add_argument("--model", type=str, default="black-forest-labs/FLUX.1-dev", help="Path or Repo ID of the base model")
+parser.add_argument("--optimized-model", type=str, required=True, help="Path to the optimized Nunchaku model safetensors file")
+args = parser.parse_args()
+app = FastAPI()
+# Global components
+pipeline = None
+img2img_pipeline = None
+request_lock = asyncio.Lock()
+def load_model():
+    global pipeline, img2img_pipeline
+    print(f"Loading base model from {args.model}...")
+    print(f"Loading optimized transformer from {args.optimized_model}...")
+    try:
+        # Load the optimized transformer
+        # Ensuring transformer is in bfloat16 to match the pipeline expectation
+        transformer = NunchakuFluxTransformer2dModel.from_pretrained(args.optimized_model)
+        # Load the pipeline with the optimized transformer
+        pipeline = FluxPipeline.from_pretrained(
+            args.model,
+            transformer=transformer,
+            torch_dtype=torch.bfloat16,
+        ).to("cuda")
+        # Load the Img2Img/Context pipeline sharing the same components
+        # We use strict component sharing to avoid VRAM duplication
+        print("Initializing FluxKontextPipeline for image inputs...")
+        # Since FluxKontextPipeline shares architecture with FluxPipeline, we can initialize it with the same components
+        img2img_pipeline = FluxKontextPipeline.from_pretrained(
+            args.model,
+            transformer=pipeline.transformer,
+            vae=pipeline.vae,
+            text_encoder=pipeline.text_encoder,
+            text_encoder_2=pipeline.text_encoder_2,
+            tokenizer=pipeline.tokenizer,
+            tokenizer_2=pipeline.tokenizer_2,
+            scheduler=pipeline.scheduler,
+            torch_dtype=torch.bfloat16
+        ).to("cuda")
+        # Enable CPU offload for the main pipeline.
+        # Since components are shared, this should handle memory management for both.
+        pipeline.enable_model_cpu_offload()
+        # img2img_pipeline.enable_model_cpu_offload() # Avoid double hook registration
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        raise e
+    print("Model loaded successfully!")
+def flush():
+    """Run Python garbage collection, then empty the CUDA caching allocator."""
+    gc.collect()
+    torch.cuda.empty_cache()
+class ImageGenerationRequest(BaseModel):
+    """Request body accepted by POST /v1/images/generations."""
+    # Text prompt handed to the pipeline.
+    prompt: str
+    # Number of images; passed as num_images_per_prompt.
+    n: int = 1
+    # "WIDTHxHEIGHT" string parsed by the handler.
+    size: str = "1024x1024"
+    # The next three fields are accepted but not read by the handler in this
+    # file; responses always carry b64_json.
+    response_format: str = "b64_json"
+    quality: str = "standard"
+    style: str = "vivid"
+    # Optional input image: base64 text, optionally prefixed with a data URI header.
+    image: Optional[str] = None # Base64 encoded image
+@app.on_event("startup")
+async def startup_event():
+    """Load both pipelines once when the FastAPI application starts."""
+    load_model()
+@app.post("/v1/images/generations")
+async def generate_image(request: ImageGenerationRequest):
+    if not pipeline:
+        raise HTTPException(status_code=500, detail="Model not loaded")
+    async with request_lock:
+        print(f"Received request: {request.prompt}")
+        # Parse size
+        try:
+            width, height = map(int, request.size.split("x"))
+        except ValueError:
+            width, height = 1024, 1024
+        # Flux requires dimensions to be multiples of 16 (or 8 depending on VAE)
+        # Standard Flux dev usually works well with 1024x1024
+        # We'll ensure they are divisible by 16 just in case
+        width = (width // 16) * 16
+        height = (height // 16) * 16
+        images = []
+        try:
+            input_image = None
+            if request.image:
+                try:
+                    # Handle data URI if present
+                    img_data = request.image
+                    if "," in img_data:
+                        img_data = img_data.split(",")[1]
+                    input_bytes = base64.b64decode(img_data)
+                    input_image = Image.open(io.BytesIO(input_bytes)).convert("RGB")
+                    # Resize input image to match request size
+                    input_image = input_image.resize((width, height), Image.LANCZOS)
+                    print(f"Processed input image of size {input_image.size}")
+                except Exception as e:
+                    print(f"Failed to decode input image: {e}")
+                    raise HTTPException(status_code=400, detail="Invalid image data")
+            # Generate images
+            if input_image:
+                # Use FluxKontextPipeline
+                print("Running FluxKontextPipeline...")
+                generated_images = pipeline(
+                    image=input_image,
+                    prompt=request.prompt,
+                    height=height,
+                    width=width,
+                    num_inference_steps=28,
+                    guidance_scale=2.5, # Recommended for Kontext
+                    num_images_per_prompt=request.n
+                ).images
+            else:
+                # Use standard FluxPipeline
+                print("Running FluxPipeline...")
+                generated_images = pipeline(
+                    request.prompt,
+                    height=height,
+                    width=width,
+                    num_inference_steps=28, # Standard for Flux Dev
+                    guidance_scale=3.5,     # Nunchaku example uses 3.5
+                    num_images_per_prompt=request.n
+                ).images
+            for image in generated_images:
+                buffered = io.BytesIO()
+                image.save(buffered, format="PNG")
+                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                images.append({"b64_json": img_str})
+        except Exception as e:
+            print(f"Error during generation: {e}")
+            raise HTTPException(status_code=500, detail=str(e))
+        finally:
+            flush()
+        return {
+            "created": int(time.time()),
+            "data": images
+        }
+# Script entry point: serve the FastAPI app with uvicorn on the host/port held in `args`.
+if __name__ == "__main__":
+    uvicorn.run(app, host=args.host, port=args.port)

extras/KontextBackend.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import torch
+from transformers import T5EncoderModel, BitsAndBytesConfig
+from diffusers import FluxKontextPipeline
+class KontextBackend:
+    def __init__(self, model_id, optimized_model_path=None):
+        self.model_id = model_id
+        self.optimized_model_path = optimized_model_path
+        self.pipeline = None
+    def load(self):
+        print(f"Loading Kontext backend from {self.model_id}...")
+        if self.optimized_model_path:
+            print(f"Loading optimized transformer from {self.optimized_model_path}...")
+            # Load the optimized transformer (Nunchaku style! *hyah!*)
+            try:
+                from nunchaku import NunchakuFluxTransformer2dModel
+            except ImportError:
+                 print("Oops, nunchaku not found! Please install it for optimized magic.")
+                 raise
+            transformer = NunchakuFluxTransformer2dModel.from_pretrained(self.optimized_model_path)
+            text_quant_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.bfloat16,
+                bnb_4bit_use_double_quant=True
+            )
+            text_encoder_2_4bit = T5EncoderModel.from_pretrained(
+                self.model_id,
+                subfolder="text_encoder_2",
+                quantization_config=text_quant_config,
+                torch_dtype=torch.bfloat16  # bfloat16 for your NVIDIA setup—faster magic!
+            )
+            # Load the pipeline with the optimized transformer
+            # We need FluxKontextPipeline for editing magic!
+            pipeline = FluxKontextPipeline.from_pretrained(
+                self.model_id,
+                text_encoder_2=text_encoder_2_4bit,
+                transformer=transformer,
+                torch_dtype=torch.bfloat16,
+            )
+        else:
+            print("No optimized model path provided for KontextBackend. Falling back to standard loading if possible, or maybe we should insist on one?")
+            # Original code implied usage of optimized model for Kontext was the main path, but let's support standard if needed,
+            # or minimally just load standard logic if that was the fallback.
+            # Looking at original code: "if args.optimized_model: ... else: ... Flux2Pipeline"
+            # Wait, the original code fell back to Flux2Pipeline if no optimized model was present!
+            # The user request says: "create KontextBackend.py that creates a pipeline from base and optional optimized paths"
+            # So KontextBackend *should* support both optimized and unoptimized? Or was the fallback in original code actually switching to Flux2?
+            # Original code:
+            # if args.optimized_model:
+            #    # Load Nunchaku stuff
+            #    pipeline = FluxKontextPipeline(...)
+            # else:
+            #    # Load standard stuff
+            #    pipeline = Flux2Pipeline(...)
+            #
+            # The USER request says: "KontextBackend.py that creates a pipeline from base and optional optimized paths".
+            # This implies if I choose "kontext" backend but don't provide optimized path, it should still load a FluxKontextPipeline (presumably unoptimized/standard).
+            # However, FluxKontextPipeline might expect specific components.
+            # Let's assume standard loading for FluxKontextPipeline if no optimized model is separate.
+            print(f"Loading standard FluxKontextPipeline from {self.model_id}...")
+            # Assuming standard 4-bit loading for memory savings similar to before
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True,
+            )
+            # Use basic from_pretrained
+            pipeline = FluxKontextPipeline.from_pretrained(
+                self.model_id,
+                torch_dtype=torch.bfloat16
+                # We might need quantization for components if memory is tight, but from_pretrained handles a lot.
+                # Let's keep it simple for now as we don't have the Nunchaku specific loading here.
+            )
+            # Actually, if we look at how specialized the optimized loading was, standard loading might just be:
+            # pipeline = FluxKontextPipeline.from_pretrained(model_id, torch_dtype=...)
+        self.pipeline = pipeline
+        self.pipeline.to("cuda")
+        # Additional setup if needed (like offload)
+        # self.pipeline.enable_model_cpu_offload() # User code had this for optimized path
+        return self.pipeline, self.pipeline

extras/NVFP4TextEncoder.py ADDED Viewed

	@@ -0,0 +1,324 @@

+"""
+NVFP4 text encoder loader for diffusers image pipelines.
+Loads a compressed-tensors NVFP4-pack-quantized HuggingFace causal LM and wraps
+it so it can be plugged into ``diffusers.ZImagePipeline`` (or any pipeline
+calling ``self.text_encoder(input_ids, attention_mask, output_hidden_states=True)``).
+Strategy:
+- Instantiate the HF model on the ``meta`` device (no real allocation).
+- Walk every ``torch.nn.Linear`` and swap it for vLLM's ``ReplicatedLinear`` with
+  ``CompressedTensorsConfig`` derived from the checkpoint's
+  ``quantization_config``. This registers ``weight_packed`` / ``weight_scale`` /
+  ``*_global_scale`` parameters in the exact layout vLLM's
+  ``CompressedTensorsW4A4Fp4`` scheme expects.
+- Materialise remaining (non-Linear) parameters (embeddings, RMSNorm, k/q norms)
+  on the target device & dtype.
+- Stream the safetensors file and dispatch each tensor through the registered
+  vLLM ``weight_loader`` (which handles layout swizzling on
+  ``process_weights_after_loading``).
+- Tie the LM head to the input embedding when ``config.tie_word_embeddings``.
+The result is a regular ``nn.Module`` matching the HF model's call signature
+(``forward(input_ids, attention_mask, output_hidden_states)``) -- usable directly
+as ``ZImagePipeline.text_encoder``.
+vLLM requires a minimal global context (distributed process group + model
+parallel state + active VllmConfig) even at TP=1 because ``ReplicatedLinear``
+queries the TP world size at construction. We bootstrap that lazily once.
+Forced kernel: we set ``VLLM_NVFP4_GEMM_BACKEND=cutlass`` to skip
+flashinfer-cutlass JIT (which needs the ``ninja`` binary on PATH). The vLLM
+CUTLASS kernel is built into the wheel.
+"""
+from __future__ import annotations
+import json
+import os
+from collections.abc import Iterator
+from typing import Optional
+import torch
+import torch.nn as nn
+# ----------------------------------------------------------------------------
+# One-time vLLM bootstrap (TP=1, no engine, just enough context for ReplicatedLinear)
+# ----------------------------------------------------------------------------
+_VLLM_BOOTSTRAPPED = False
+_VLLM_CONFIG_CTX = None  # holds the entered set_current_vllm_config context manager
+def _bootstrap_vllm_once() -> None:
+    """Initialise the bits of vLLM that ReplicatedLinear needs at TP=1.
+    Idempotent. Uses ``gloo`` so it works without NCCL/CUDA-aware MPI and even
+    when CUDA is busy with the diffusion transformer.
+    """
+    global _VLLM_BOOTSTRAPPED, _VLLM_CONFIG_CTX
+    if _VLLM_BOOTSTRAPPED:
+        return
+    # Force CUTLASS to avoid flashinfer-cutlass JIT (requires `ninja` on PATH).
+    os.environ.setdefault("VLLM_NVFP4_GEMM_BACKEND", "cutlass")
+    from vllm.config import VllmConfig
+    from vllm.config.vllm import set_current_vllm_config
+    from vllm.distributed import (
+        ensure_model_parallel_initialized,
+        init_distributed_environment,
+    )
+    # Pick a free port; world_size=1.
+    import socket
+    s = socket.socket()
+    s.bind(("127.0.0.1", 0))
+    port = s.getsockname()[1]
+    s.close()
+    if not torch.distributed.is_initialized():
+        init_distributed_environment(
+            world_size=1,
+            rank=0,
+            local_rank=0,
+            distributed_init_method=f"tcp://127.0.0.1:{port}",
+            backend="gloo",
+        )
+    # Enter a long-lived VllmConfig context. We never exit it -- the encoder
+    # may construct submodules lazily and ReplicatedLinear calls
+    # get_current_vllm_config() at init.
+    vc = VllmConfig()
+    _VLLM_CONFIG_CTX = set_current_vllm_config(vc)
+    _VLLM_CONFIG_CTX.__enter__()
+    ensure_model_parallel_initialized(1, 1)
+    _VLLM_BOOTSTRAPPED = True
+# ----------------------------------------------------------------------------
+# Module: linear replacement
+# ----------------------------------------------------------------------------
+def _replace_linears_with_replicated(
+    model: nn.Module, quant_config
+) -> None:
+    """Recursively swap every ``nn.Linear`` for vLLM ``ReplicatedLinear``.
+    Carries the ``prefix`` so quant_config's ``ignore`` patterns (e.g. ``lm_head``)
+    are correctly applied.
+    """
+    from vllm.model_executor.layers.linear import ReplicatedLinear
+    def _walk(parent: nn.Module, prefix: str) -> None:
+        for child_name, child in list(parent.named_children()):
+            qname = f"{prefix}.{child_name}" if prefix else child_name
+            if isinstance(child, nn.Linear):
+                new = ReplicatedLinear(
+                    input_size=child.in_features,
+                    output_size=child.out_features,
+                    bias=child.bias is not None,
+                    quant_config=quant_config,
+                    prefix=qname,
+                    return_bias=False,
+                    params_dtype=torch.bfloat16,
+                )
+                setattr(parent, child_name, new)
+            else:
+                _walk(child, qname)
+    _walk(model, prefix="")
+def _materialize_remaining_meta_params(
+    model: nn.Module, dtype: torch.dtype, device: torch.device
+) -> None:
+    """Replace any ``meta`` parameter with empty real storage.
+    Only touches parameters NOT already created on a real device by the
+    ReplicatedLinear swap above (i.e. embeddings, layernorms, biases).
+    """
+    for name, param in list(model.named_parameters(recurse=True)):
+        if param.device.type == "meta":
+            real = nn.Parameter(
+                torch.empty(param.shape, dtype=dtype, device=device),
+                requires_grad=False,
+            )
+            # Replace in the parent module
+            parent = model
+            *path, leaf = name.split(".")
+            for p in path:
+                parent = getattr(parent, p)
+            setattr(parent, leaf, real)
+    # Same for buffers (e.g. rotary inv_freq if registered as buffer on meta)
+    for name, buf in list(model.named_buffers(recurse=True)):
+        if buf.device.type == "meta":
+            real = torch.empty(buf.shape, dtype=buf.dtype, device=device)
+            parent = model
+            *path, leaf = name.split(".")
+            for p in path:
+                parent = getattr(parent, p)
+            parent.register_buffer(leaf, real, persistent=False)
+# ----------------------------------------------------------------------------
+# Weight loading
+# ----------------------------------------------------------------------------
+def _iter_safetensors(model_dir: str) -> Iterator[tuple[str, torch.Tensor]]:
+    """Yield (name, tensor) pairs from all *.safetensors shards in ``model_dir``."""
+    from safetensors import safe_open
+    # Single-file checkpoint or sharded? Prefer ``model.safetensors.index.json``.
+    index_path = os.path.join(model_dir, "model.safetensors.index.json")
+    if os.path.exists(index_path):
+        with open(index_path) as f:
+            index = json.load(f)
+        shards = sorted(set(index["weight_map"].values()))
+    else:
+        # Find all *.safetensors files in dir
+        shards = sorted(
+            fn for fn in os.listdir(model_dir) if fn.endswith(".safetensors")
+        )
+    for shard in shards:
+        path = os.path.join(model_dir, shard)
+        with safe_open(path, framework="pt") as f:
+            for key in f.keys():
+                yield key, f.get_tensor(key)
+def _load_weights_into_model(model: nn.Module, model_dir: str) -> None:
+    """Stream safetensors into the (already-structured) model.
+    Uses each ReplicatedLinear's registered ``weight_loader`` for quantised
+    params (which handles tensor-parallel sharding, even though TP=1 here it
+    keeps casts consistent). Other params (embeddings, layernorms, biases) are
+    copied directly.
+    """
+    # Strip vllm-omni-style "text_encoder." prefix if present; not applicable
+    # here since we load the standalone HF Qwen3 checkpoint where keys start
+    # with "model.layers..." / "model.embed_tokens..." / "lm_head...".
+    name_to_param: dict[str, nn.Parameter] = dict(model.named_parameters(recurse=True))
+    name_to_buffer: dict[str, torch.Tensor] = dict(model.named_buffers(recurse=True))
+    missing = set(name_to_param.keys())
+    unexpected = []
+    for key, tensor in _iter_safetensors(model_dir):
+        # Skip rotary inv_freq etc that aren't params (rare in modern HF saves)
+        if key in name_to_param:
+            param = name_to_param[key]
+            wl = getattr(param, "weight_loader", None)
+            if wl is not None:
+                wl(param, tensor.to(param.device))
+            else:
+                with torch.no_grad():
+                    param.data.copy_(tensor.to(param.device, dtype=param.dtype))
+            missing.discard(key)
+        elif key in name_to_buffer:
+            with torch.no_grad():
+                name_to_buffer[key].copy_(tensor.to(name_to_buffer[key].device))
+        else:
+            unexpected.append(key)
+    # Tied embeddings (lm_head.weight not in checkpoint when tie_word_embeddings=True)
+    cfg = getattr(model, "config", None)
+    if cfg is not None and getattr(cfg, "tie_word_embeddings", False):
+        try:
+            inp_emb = model.get_input_embeddings().weight
+            model.lm_head.weight = inp_emb  # share storage
+            missing.discard("lm_head.weight")
+        except Exception:
+            pass
+    if missing:
+        # It's OK if missing entries are *purely* lm_head.weight when tied; we
+        # already handled that above. Anything else is fatal-ish.
+        leftover = sorted(missing)
+        if leftover:
+            print(
+                f"[NVFP4TextEncoder] WARN: {len(leftover)} params missing from checkpoint; "
+                f"first 5: {leftover[:5]}"
+            )
+    if unexpected:
+        print(
+            f"[NVFP4TextEncoder] WARN: {len(unexpected)} keys in checkpoint unused; "
+            f"first 5: {unexpected[:5]}"
+        )
+def _process_weights_after_loading(model: nn.Module) -> None:
+    """Invoke vLLM's per-layer ``process_weights_after_loading`` for each
+    ReplicatedLinear (renames ``weight_packed`` -> ``weight``, computes ``alpha``,
+    swizzles scales for the CUTLASS kernel, etc.)."""
+    for module in model.modules():
+        qm = getattr(module, "quant_method", None)
+        if qm is not None and hasattr(qm, "process_weights_after_loading"):
+            qm.process_weights_after_loading(module)
+# ----------------------------------------------------------------------------
+# Public API
+# ----------------------------------------------------------------------------
+def load_nvfp4_text_encoder(
+    model_dir: str,
+    device: str | torch.device = "cuda",
+    dtype: torch.dtype = torch.bfloat16,
+) -> nn.Module:
+    """Load an NVFP4-quantised HuggingFace causal LM as a plug-in text encoder.
+    The steps below are order-dependent: skeleton -> linear swap -> materialise
+    -> move -> load -> post-process.
+    Args:
+        model_dir: path to the checkpoint directory containing ``config.json``
+            and ``model*.safetensors``. The config must carry a
+            ``quantization_config`` block with ``"format": "nvfp4-pack-quantized"``.
+            Only local files are consulted (``local_files_only=True``).
+        device: target device for all parameters and buffers.
+        dtype: dtype used for parameters that were still on ``meta`` after the
+            linear swap (the non-quantised ones).
+    Returns:
+        The model built by ``AutoModelForCausalLM.from_config``, in eval mode
+        with ``config.use_cache`` disabled, whose ``Linear`` layers have been
+        replaced by vLLM ``ReplicatedLinear`` modules.
+    Raises:
+        ValueError: ``config.json`` has no (or an empty) ``quantization_config``.
+    """
+    # vLLM needs its global context before any ReplicatedLinear is constructed.
+    _bootstrap_vllm_once()
+    from transformers import AutoConfig, AutoModelForCausalLM
+    from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
+        CompressedTensorsConfig,
+    )
+    from vllm.model_executor.models.transformers.utils import (
+        init_on_device_without_buffers,
+    )
+    hf_config = AutoConfig.from_pretrained(model_dir, local_files_only=True)
+    if not getattr(hf_config, "quantization_config", None):
+        raise ValueError(
+            f"{model_dir}/config.json has no `quantization_config`; "
+            "this loader only handles NVFP4-quantised checkpoints."
+        )
+    quant_config = CompressedTensorsConfig.from_config(hf_config.quantization_config)
+    # 1) Build the model skeleton with parameters on meta (no weight allocation).
+    with init_on_device_without_buffers("meta"):
+        model = AutoModelForCausalLM.from_config(hf_config)
+    # 2) Swap Linear -> ReplicatedLinear(quant_config), which registers the
+    #    quantised-layout parameters.
+    target_device = torch.device(device)
+    _replace_linears_with_replicated(model, quant_config)
+    # 3) Materialise any leftover meta parameters (embeddings, RMSNorms, ...)
+    _materialize_remaining_meta_params(model, dtype=dtype, device=target_device)
+    # 4) Move newly-created quantised params to target device (ReplicatedLinear
+    #    presumably creates them on the current default device, usually CPU --
+    #    TODO confirm).
+    model.to(target_device)
+    # 5) Load weights via per-param weight_loader.
+    _load_weights_into_model(model, model_dir)
+    # 6) Let vLLM swizzle scales / rename weight_packed->weight / compute alpha.
+    _process_weights_after_loading(model)
+    # 7) Match HF semantics for downstream pipelines.
+    model.eval()
+    model.config.use_cache = False
+    return model

extras/OmniImageEditServer.py ADDED Viewed

	@@ -0,0 +1,261 @@

+import argparse
+import base64
+import io
+import time
+import torch
+import uvicorn
+import gc
+import asyncio
+import os
+import sys
+import os
+import inspect
+# Make the OmniGen2 package importable: it is expected one level above this
+# script's directory, under packages/OmniGen2. The path is pushed to the front
+# of sys.path so it takes precedence for the `omnigen2` imports below.
+current_dir = os.path.dirname(os.path.abspath(__file__))
+project_root = os.path.dirname(current_dir)
+omnigen_path = os.path.join(project_root, "packages", "OmniGen2")
+sys.path.insert(0, omnigen_path)
+from typing import List, Optional
+from fastapi import FastAPI, HTTPException, UploadFile, File, Form
+from pydantic import BaseModel
+from PIL import Image, ImageOps
+# Import OmniGen2 and DFloat11 components
+from omnigen2.pipelines.omnigen2.pipeline_omnigen2 import OmniGen2Pipeline
+from omnigen2.models.transformers.transformer_omnigen2 import OmniGen2Transformer2DModel
+from transformers import CLIPProcessor, BitsAndBytesConfig, Qwen2_5_VLForConditionalGeneration
+from transformers.modeling_utils import no_init_weights
+# Yay! Nikola here, ready to bring the OmniGen2 magic to our village!
+# This server is like a new canvas for our artistic endeavors!
+# Argument parsing. NOTE: this runs at import time, so importing this module
+# parses the process's command line.
+parser = argparse.ArgumentParser(description="OmniGen2 Image Edit Server")
+parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
+parser.add_argument("--port", type=int, default=8000, help="Port to bind to")
+# The default model path is relative -- presumably resolved against the
+# process working directory; confirm against the launch scripts.
+parser.add_argument("--base-model", type=str, default="../models/OmniGen2", help="Path to base OmniGen2 model")
+parser.add_argument("--dtype", type=str, default='bf16', choices=['fp32', 'fp16', 'bf16'], help="Model precision")
+args = parser.parse_args()
+app = FastAPI()
+# Global components
+# `pipeline` is populated by load_model(); the request handlers reject requests while it is unset.
+pipeline = None
+# Held by each handler for the whole request, so only one request uses the pipeline at a time.
+request_lock = asyncio.Lock()
+def load_model():
+    global pipeline
+    print(f"Loading OmniGen2 from {args.base_model}...")
+    # Determine usage dtype
+    weight_dtype = torch.float32
+    if args.dtype == 'fp16':
+        weight_dtype = torch.float16
+    elif args.dtype == 'bf16':
+        weight_dtype = torch.bfloat16
+    try:
+        # Load the base pipeline (tokenizer, scheduler, etc.)
+        # processor needs to be loaded separately sometimes depending on library version,
+        # but following inference.py pattern:
+        # Manually load MLLM in 4-bit to save VRAM, yay!
+        print("Loading MLLM in 4-bit mode for extra village efficiency!")
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=weight_dtype,
+        )
+        mllm = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            args.base_model,
+            subfolder="mllm",
+            quantization_config=quantization_config,
+            torch_dtype=weight_dtype,
+        )
+        pipeline = OmniGen2Pipeline.from_pretrained(
+            args.base_model,
+            mllm=mllm,
+            processor=CLIPProcessor.from_pretrained(
+                args.base_model,
+                subfolder="processor",
+                use_fast=True
+            ),
+            torch_dtype=weight_dtype,
+            trust_remote_code=True,
+        ).to("cuda")
+        pipeline.enable_taylorseer = True
+        pipeline.transformer.set_attention_backend("flash")
+        print("Enabling CPU offload...")
+        #pipeline.enable_model_cpu_offload()
+        #pipeline.enable_sequential_cpu_offload()
+    except Exception as e:
+        print(f"Oh no! The OmniGen2 spirit refused to manifest: {e}")
+        raise e
+    print("OmniGen2 loaded successfully! Let's paint the village!")
+def flush():
+    gc.collect()
+    torch.cuda.empty_cache()
+# JSON body accepted by POST /v1/images/generations.
+class ImageGenerationRequest(BaseModel):
+    # Text prompt passed to the pipeline.
+    prompt: str
+    # Number of images, forwarded as num_images_per_prompt.
+    n: int = 1
+    # "<width>x<height>"; values that fail to parse fall back to 1024x1024 in the handler.
+    size: str = "1024x1024"
+    # The fields below are accepted but not read by the generation handler in
+    # this file (responses are always base64 PNG under "b64_json").
+    response_format: str = "b64_json"
+    quality: str = "standard"
+    style: str = "vivid"
+@app.on_event("startup")
+async def startup_event():
+    """Load the model when the FastAPI application starts."""
+    # load_model() is synchronous, so startup blocks until it returns or raises.
+    load_model()
+@app.post("/v1/images/edits")
+async def edit_image(
+    image: UploadFile = File(...),
+    prompt: str = Form(...),
+    n: int = Form(1),
+    size: str = Form("1024x1024"),
+    response_format: str = Form("b64_json"),
+    guidance_scale: float = Form(2.5), # Image guidance scale
+    strength: float = Form(1.0) # Using strength to map to something or just ignored?
+                                # OmniGen uses image_guidance_scale.
+                                # We can map strength to text_guidance_scale maybe?
+                                # Let's keep defaults for now from inference.py
+):
+    if not pipeline:
+        raise HTTPException(status_code=500, detail="Model not loaded")
+    async with request_lock:
+        print(f"Received edit request: {prompt}")
+        # Processing the input image
+        try:
+            contents = await image.read()
+            init_image = Image.open(io.BytesIO(contents)).convert("RGB")
+            init_image = ImageOps.exif_transpose(init_image)
+        except Exception as e:
+            raise HTTPException(status_code=400, detail=f"Invalid image file: {e}")
+        # Parse max target dimensions from requested size
+        try:
+            target_width, target_height = map(int, size.split("x"))
+        except ValueError:
+            target_width, target_height = 1024, 1024
+        # Calculate new dimensions preserving aspect ratio
+        orig_width, orig_height = init_image.size
+        scale = min(target_width / orig_width, target_height / orig_height)
+        new_width = int(orig_width * scale)
+        new_height = int(orig_height * scale)
+        # Enforce multiples of 16 for compatibility
+        width = (new_width // 16) * 16
+        height = (new_height // 16) * 16
+        response_images = []
+        try:
+            # Generate edits
+            # OmniGen2Pipeline signature from inference.py:
+            # prompt, input_images, width, height, num_inference_steps, ...
+            # Using defaults from inference.py for now
+            results = pipeline(
+                prompt=prompt,
+                input_images=[init_image],
+                width=width,
+                height=height,
+                num_inference_steps=26, # Standard for OmniGen2
+                max_sequence_length=1024,
+                text_guidance_scale=5.0, # Default per inference.py
+                image_guidance_scale=guidance_scale, # Map guidance_scale from request here
+                cfg_range=(0.0, 1.0),
+                negative_prompt="(((deformed))), blurry, over saturation, bad anatomy, disfigured, poorly drawn face, mutation, mutated, (extra_limb), (ugly), (poorly drawn hands), fused fingers, messy drawing, broken legs censor, censored, censor_bar",
+                num_images_per_prompt=n,
+                output_type="pil",
+            )
+            for img in results.images:
+                buffered = io.BytesIO()
+                img.save(buffered, format="PNG")
+                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                response_images.append({"b64_json": img_str})
+        except Exception as e:
+            print(f"Error during editing: {e}")
+            raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
+        finally:
+            flush()
+        return {
+            "created": int(time.time()),
+            "data": response_images
+        }
+@app.post("/v1/images/generations")
+async def generate_image(request: ImageGenerationRequest):
+    if not pipeline:
+        raise HTTPException(status_code=500, detail="Model not loaded")
+    async with request_lock:
+        print(f"Received generation request: {request.prompt}")
+        # Parse size
+        try:
+            width, height = map(int, request.size.split("x"))
+        except ValueError:
+            width, height = 1024, 1024
+        # Enforce multiples of 16 for compatibility
+        width = (width // 16) * 16
+        height = (height // 16) * 16
+        response_images = []
+        try:
+            # Generate images (input_images=None for txt2img)
+            results = pipeline(
+                prompt=request.prompt,
+                input_images=None,
+                width=width,
+                height=height,
+                num_inference_steps=26,
+                max_sequence_length=1024,
+                text_guidance_scale=5.0,
+                image_guidance_scale=2.0, # Default
+                cfg_range=(0.0, 1.0),
+                negative_prompt="(((deformed))), blurry, over saturation, bad anatomy, disfigured, poorly drawn face, mutation, mutated, (extra_limb), (ugly), (poorly drawn hands), fused fingers, messy drawing, broken legs censor, censored, censor_bar",
+                num_images_per_prompt=request.n,
+                output_type="pil",
+            )
+            for img in results.images:
+                buffered = io.BytesIO()
+                img.save(buffered, format="PNG")
+                img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                response_images.append({"b64_json": img_str})
+        except Exception as e:
+            print(f"Error during generation: {e}")
+            raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
+        finally:
+            flush()
+        return {
+            "created": int(time.time()),
+            "data": response_images
+        }
+# Script entry point: serve the FastAPI app with uvicorn on the parsed --host/--port.
+if __name__ == "__main__":
+    uvicorn.run(app, host=args.host, port=args.port)

extras/QwenBackend.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import torch
+from nunchaku.utils import get_gpu_memory, get_precision
+from nunchaku.models.transformers.transformer_qwenimage import NunchakuQwenImageTransformer2DModel
+class QwenBackend:
+    def __init__(self, model_id, optimized_model_path=None, optimized_edit_model_path=None, uma=False):
+        """Record configuration; nothing is loaded until ``load()``.
+        Args:
+            model_id: base model repo id or local path used for the pipelines.
+            optimized_model_path: path to the Nunchaku text-to-image transformer.
+            optimized_edit_model_path: optional path to a separate Nunchaku edit
+                transformer; when unset, ``load()`` reuses the text-to-image one.
+            uma: when true, ``load()`` loads the text encoder in 8-bit via
+                bitsandbytes.
+        """
+        self.model_id = model_id
+        self.optimized_model_path = optimized_model_path
+        self.optimized_edit_model_path = optimized_edit_model_path
+        self.uma = uma
+        self.pipeline = None
+        self.rank = 32 # Default taken from the upstream example (an earlier snippet used 128)
+        # NOTE(review): `rank` is not read in the visible part of this class -- confirm it is still needed.
+    def load(self):
+        print(f"Loading Qwen backend from {self.model_id}...")
+        if not self.optimized_model_path:
+             print("Warning: No optimized model path provided for QwenBackend. This requires the Nunchaku optimized model.")
+        # Scheduler config from example
+        import math
+        from diffusers import FlowMatchEulerDiscreteScheduler
+        scheduler_config = {
+            "base_image_seq_len": 256,
+            "base_shift": math.log(3),
+            "invert_sigmas": False,
+            "max_image_seq_len": 8192,
+            "max_shift": math.log(3),
+            "num_train_timesteps": 1000,
+            "shift": 1.0,
+            "shift_terminal": None,
+            "stochastic_sampling": False,
+            "time_shift_type": "exponential",
+            "use_beta_sigmas": False,
+            "use_dynamic_shifting": True,
+            "use_exponential_sigmas": False,
+            "use_karras_sigmas": False,
+        }
+        scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
+        # Load the base transformer (T2I)
+        print(f"Loading T2I NunchakuQwenImageTransformer2DModel from {self.optimized_model_path} with FA2...")
+        transformer_t2i = NunchakuQwenImageTransformer2DModel.from_pretrained(
+            self.optimized_model_path,
+            attn_implementation="flash_attention_2"
+        )
+        # Load the edit transformer
+        if self.optimized_edit_model_path:
+            print(f"Loading Edit NunchakuQwenImageTransformer2DModel from {self.optimized_edit_model_path} with FA2...")
+            transformer_edit = NunchakuQwenImageTransformer2DModel.from_pretrained(
+                self.optimized_edit_model_path,
+                attn_implementation="flash_attention_2"
+            )
+        else:
+            print(f"Using shared transformer for Edit pipeline...")
+            transformer_edit = transformer_t2i
+        print(f"Loading QwenImagePipeline from {self.model_id}...")
+        # Use QwenImagePipeline (T2I)
+        from diffusers import QwenImagePipeline, QwenImageEditPlusPipeline
+        text_encoder = None
+        if self.uma:
+            print("UMA mode: Loading text_encoder in 8-bit using BitsAndBytes...")
+            from transformers import BitsAndBytesConfig, AutoModel
+            bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+            text_encoder = AutoModel.from_pretrained(
+                self.model_id,
+                subfolder="text_encoder",
+                quantization_config=bnb_config,
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True
+            )
+        # 1. Load Edit Pipeline (To handle processor correctly)
+        print(f"Loading QwenImageEditPlusPipeline from {self.model_id}...")
+        pipeline_kwargs = {
+            "transformer": transformer_edit,
+            "scheduler": scheduler,
+            "torch_dtype": torch.bfloat16
+        }
+        if text_encoder is not None:
+            pipeline_kwargs["text_encoder"] = text_encoder
+        edit_pipeline = QwenImageEditPlusPipeline.from_pretrained(
+            self.model_id,
+            **pipeline_kwargs
+        )
+        # 2. Create T2I Pipeline sharing components (except transformer if separate)
+        print("Creating QwenImagePipeline (T2I) with shared components...")
+        # Ensure we have a text_encoder and tokenizer
+        if edit_pipeline.text_encoder is None:
+            print("Text encoder not found in edit_pipeline, loading manually...")
+            # Load from model_id or subfolder
+            if text_encoder is None:
+                from transformers import AutoModel
+                text_encoder = AutoModel.from_pretrained(self.model_id, subfolder="text_encoder", torch_dtype=torch.bfloat16, trust_remote_code=True)
+            # CRITICAL FIX: Assign it back to the pipeline!
+            edit_pipeline.register_modules(text_encoder=text_encoder)
+        else:
+            text_encoder = edit_pipeline.text_encoder
+        tokenizer = edit_pipeline.tokenizer
+        if tokenizer is None:
+             print("Tokenizer not found in edit_pipeline, loading manually...")
+             from transformers import AutoTokenizer
+             tokenizer = AutoTokenizer.from_pretrained(self.model_id, subfolder="tokenizer", trust_remote_code=True)
+             edit_pipeline.register_modules(tokenizer=tokenizer)
+        pipeline = QwenImagePipeline(
+            transformer=transformer_t2i,
+            scheduler=edit_pipeline.scheduler,
+            vae=edit_pipeline.vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+        )
+        # Manually assign processors if needed (though QwenImagePipeline creates its own image_processor)
+        # pipeline.feature_extractor = edit_pipeline.image_processor
+        # Logic for offloading / UMA
+        if self.uma:
+            print("UMA mode enabled: Text encoder loaded in 8-bit. Moving other components to GPU.")
+            # Note: 8-bit text encoder is already handled by bitsandbytes (on GPU or offloaded as needed, typically GPU).
+            # Explicitly move transformers to CUDA
+            print("Moving T2I Transformer to CUDA...")
+            transformer_t2i.to("cuda")
+            if transformer_edit != transformer_t2i:
+                print("Moving Edit Transformer to CUDA...")
+                transformer_edit.to("cuda")
+            # We need to ensure other components (VAE) are on CUDA.
+            if hasattr(edit_pipeline, "vae") and edit_pipeline.vae:
+                print("Moving VAE to CUDA...")
+                edit_pipeline.vae.to("cuda")
+            # Since we can't call pipeline.to("cuda") generally if 8-bit modules are present (sometimes safe, sometimes not),
+            # we manually handle it or trust loaded components.
+            pass
+            # Note: pipeline (T2I) shares components, so it should be on cuda too.
+        else:
+            print("Non-UMA mode: Using aggressive per-layer offloading.")
+            transformer_t2i.set_offload(
+                True, use_pin_memory=True, num_blocks_on_gpu=8
+            )
+            if self.optimized_edit_model_path:
+                transformer_edit.set_offload(
+                    True, use_pin_memory=True, num_blocks_on_gpu=8
+                )
+            edit_pipeline._exclude_from_cpu_offload.append("transformer")
+            edit_pipeline.enable_sequential_cpu_offload()
+            # The T2I pipeline (pipeline) also needs to handle offloading.
+            # If we manually loaded text_encoder, it might not be attached to edit_pipeline's offload hooks.
+            # We should enable sequential CPU offload for the T2I pipeline too.
+            pipeline.enable_sequential_cpu_offload()
+            if self.optimized_edit_model_path:
+                pass
+        self.pipeline = pipeline
+        self.edit_pipeline = edit_pipeline
+        return self.pipeline, self.edit_pipeline

extras/QwenImageBackend.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import torch
+from nunchaku.utils import get_gpu_memory, get_precision
+from nunchaku.models.transformers.transformer_qwenimage import NunchakuQwenImageTransformer2DModel
+class QwenImageBackend:
+    def __init__(self, model_id, optimized_model_path=None):
+        self.model_id = model_id
+        self.optimized_model_path = optimized_model_path
+        self.pipeline = None
+        self.rank = 32  # default rank as per example
+    def load(self):
+        print(f"Loading QwenImageBackend from {self.model_id}...")
+        # Scheduler config (same as QwenBackend)
+        import math
+        from diffusers import FlowMatchEulerDiscreteScheduler
+        scheduler_config = {
+            "base_image_seq_len": 256,
+            "base_shift": math.log(3),
+            "invert_sigmas": False,
+            "max_image_seq_len": 8192,
+            "max_shift": math.log(3),
+            "num_train_timesteps": 1000,
+            "shift": 1.0,
+            "shift_terminal": None,
+            "stochastic_sampling": False,
+            "time_shift_type": "exponential",
+            "use_beta_sigmas": False,
+            "use_dynamic_shifting": True,
+            "use_exponential_sigmas": False,
+            "use_karras_sigmas": False,
+        }
+        scheduler = FlowMatchEulerDiscreteScheduler.from_config(scheduler_config)
+        # Load transformer (optimized model)
+        print(f"Loading NunchakuQwenImageTransformer2DModel from {self.optimized_model_path}...")
+        transformer = NunchakuQwenImageTransformer2DModel.from_pretrained(self.optimized_model_path)
+        # Load T2I pipeline
+        from diffusers import QwenImagePipeline
+        pipeline = QwenImagePipeline.from_pretrained(
+            self.model_id,
+            transformer=transformer,
+            scheduler=scheduler,
+            torch_dtype=torch.bfloat16,
+        )
+        # Offloading logic (same as QwenBackend)
+        if get_gpu_memory() > 18:
+            print("GPU memory > 18GB, using cpu offload")
+            pipeline.enable_model_cpu_offload()
+        else:
+            print("GPU memory <= 18GB, using per-layer offloading for low VRAM")
+            transformer.set_offload(True, use_pin_memory=False, num_blocks_on_gpu=1)
+            pipeline._exclude_from_cpu_offload.append("transformer")
+            pipeline.enable_sequential_cpu_offload()
+        self.pipeline = pipeline
+        # For edit endpoint we reuse the same pipeline (ignores image)
+        return self.pipeline, self.pipeline

extras/ZImageTurboBackend.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import os
+import torch
+from diffusers import ZImagePipeline
+from nunchaku.models.transformers.transformer_zimage import NunchakuZImageTransformer2DModel
+from nunchaku.utils import get_gpu_memory
+class ZImageTurboBackend:
+    def __init__(
+        self,
+        model_id,
+        optimized_model_path=None,
+        optimized_edit_model_path=None,
+        uma=False,
+        nvfp4_text_encoder_path: str | None = None,
+    ):
+        self.model_id = model_id
+        self.optimized_model_path = optimized_model_path
+        self.pipeline = None
+        self.uma = uma
+        # Optional path to an NVFP4-pack-quantized Qwen3 text encoder. When set,
+        # we load the encoder via vLLM's CompressedTensorsW4A4Fp4 (CUTLASS NVFP4
+        # GEMM) instead of the bf16 text_encoder shipped inside the Z-Image
+        # base repo. Cuts encoder VRAM ~4x with negligible quality loss
+        # (cosine >0.999 vs the bf16 reference on Thor).
+        self.nvfp4_text_encoder_path = nvfp4_text_encoder_path
+    def _build_nvfp4_text_encoder(self):
+        """Load the NVFP4 text encoder if requested, returns (encoder, tokenizer) or (None, None)."""
+        if not self.nvfp4_text_encoder_path:
+            return None, None
+        print(
+            f"[ZImageTurboBackend] Loading NVFP4 text encoder from {self.nvfp4_text_encoder_path} "
+            "(vLLM CompressedTensorsW4A4Fp4 + CUTLASS NVFP4 GEMM)"
+        )
+        from NVFP4TextEncoder import load_nvfp4_text_encoder
+        from transformers import AutoTokenizer
+        encoder = load_nvfp4_text_encoder(
+            self.nvfp4_text_encoder_path,
+            device="cuda",
+            dtype=torch.bfloat16,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(self.nvfp4_text_encoder_path)
+        return encoder, tokenizer
+    def load(self):
+        print(f"Loading ZImageTurboBackend from {self.model_id}...")
+        print(f"Loading NunchakuZImageTransformer2DModel from {self.optimized_model_path}...")
+        # Load transformer (optimized model)
+        transformer = NunchakuZImageTransformer2DModel.from_pretrained(self.optimized_model_path)
+        # If requested, build the NVFP4 text encoder before constructing the pipeline so
+        # diffusers does not also load the bf16 text_encoder from disk (it would double VRAM).
+        nvfp4_encoder, nvfp4_tokenizer = self._build_nvfp4_text_encoder()
+        # Load pipeline
+        print("Initializing ZImagePipeline...")
+        pipeline_kwargs = dict(
+            transformer=transformer,
+            torch_dtype=torch.bfloat16,
+            low_cpu_mem_usage=False,  # standard for HF example
+        )
+        if nvfp4_encoder is not None:
+            # Pass our pre-built encoder so diffusers skips loading the bf16 subfolder.
+            pipeline_kwargs["text_encoder"] = nvfp4_encoder
+            if nvfp4_tokenizer is not None:
+                pipeline_kwargs["tokenizer"] = nvfp4_tokenizer
+        pipeline = ZImagePipeline.from_pretrained(self.model_id, **pipeline_kwargs)
+        gpu_mem = get_gpu_memory()
+        print(f"GPU memory available: {gpu_mem} GB")
+        # Enable Flash Attention 2
+        try:
+            if hasattr(pipeline.transformer, "set_attention_backend"):
+                pipeline.transformer.set_attention_backend("native")
+                print("Enabled Native SDPA for Z-Image transformer")
+            if hasattr(pipeline.vae, "set_attention_backend"):
+                pipeline.vae.set_attention_backend("native")
+                print("Enabled Native SDPA for Z-Image VAE")
+        except Exception as e:
+            print(f"Could not enable Flash Attention 2: {e}")
+        if self.uma:
+            print("UMA mode enabled: Loading all components to GPU and disabling offloads")
+            # When using the NVFP4 encoder, it is already on CUDA and its quantised parameters
+            # are not compatible with diffusers' generic .to() pathway (e.g. uint8 weight_packed).
+            # We move only the diffusers-managed components (vae, transformer if not nunchaku, ...).
+            if nvfp4_encoder is not None:
+                # Exclude text_encoder from blanket .to('cuda'); it is already on cuda.
+                excl = getattr(pipeline, "_exclude_from_cpu_offload", [])
+                if "text_encoder" not in excl:
+                    excl.append("text_encoder")
+                    pipeline._exclude_from_cpu_offload = excl
+                for name, comp in pipeline.components.items():
+                    if name == "text_encoder":
+                        continue
+                    if isinstance(comp, torch.nn.Module):
+                        try:
+                            comp.to("cuda")
+                        except Exception:
+                            pass
+            else:
+                pipeline.to("cuda")
+        elif gpu_mem <= 18:
+            print("GPU memory <= 18GB, using sequential cpu offload for low VRAM")
+            # The prompt requested sequential offloading without splitting layers for Nunchaku
+            pipeline._exclude_from_cpu_offload.append("transformer")
+            if nvfp4_encoder is not None:
+                # NVFP4 weights live entirely on CUDA; do not let accelerate move them.
+                pipeline._exclude_from_cpu_offload.append("text_encoder")
+            pipeline.enable_sequential_cpu_offload()
+            transformer.to("cuda")
+            if nvfp4_encoder is not None:
+                nvfp4_encoder.to("cuda")
+        else:
+            print("GPU memory > 18GB, using cpu offload")
+            if nvfp4_encoder is not None:
+                if not hasattr(pipeline, "_exclude_from_cpu_offload"):
+                    pipeline._exclude_from_cpu_offload = []
+                pipeline._exclude_from_cpu_offload.append("text_encoder")
+            pipeline.enable_model_cpu_offload()
+            if nvfp4_encoder is not None:
+                nvfp4_encoder.to("cuda")
+        self.pipeline = pipeline
+        # Return twice for pipeline and edit_pipeline (though Z-Image-Turbo is T2I only)
+        return self.pipeline, self.pipeline

extras/compress_mllm.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import argparse
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration
+from dfloat11 import compress_model
+def main():
+    parser = argparse.ArgumentParser("Compress OmniGen2 MLLM (Qwen2.5-VL) using DFloat11")
+    parser.add_argument(
+        '--model_path',
+        type=str,
+        required=True,
+        help='The path to the OmniGen2 model (containing "mllm" folder) or direct path to MLLM checkpoint'
+    )
+    parser.add_argument(
+        '--save_path',
+        type=str,
+        default='./OmniGen2-mllm-DF11',
+        help='The path to save the compressed model'
+    )
+    parser.add_argument(
+        '--save_single_file',
+        action='store_true',
+        help='Save the compressed model as a single .safetensors file'
+    )
+    parser.add_argument(
+        '--check_correctness',
+        action='store_true',
+        help='Check the correctness of the compressed weights during compression'
+    )
+    parser.add_argument(
+        '--block_range',
+        type=int,
+        nargs=2,
+        default=(0, 100),
+        help='The range of transformer blocks to compress (for parallel compression over multiple CPU cores)'
+    )
+    args = parser.parse_args()
+    # Determine MLLM path
+    import os
+    mllm_path = args.model_path
+    if os.path.isdir(os.path.join(args.model_path, "mllm")):
+        mllm_path = os.path.join(args.model_path, "mllm")
+    print(f"Loading MLLM from: {mllm_path}")
+    # Load the Qwen2.5-VL model in bfloat16 precision
+    # Use trust_remote_code=True same as in inference.py
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        mllm_path,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True
+    )
+    # Untie weights to avoid safetensors error about shared memory
+    # safetensors.torch.save_file dies if tensors share memory.
+    if hasattr(model, 'lm_head') and hasattr(model.lm_head, 'weight'):
+        print("Untying lm_head weights to avoid safetensors shared memory error...")
+        model.lm_head.weight = torch.nn.Parameter(model.lm_head.weight.clone())
+    # Compress the model using DFloat11 compression
+    # Pattern updated to match Qwen2.5-VL internal structure (model.language_model.layers...)
+    compress_model(
+        model=model,
+        pattern_dict={
+            r"model\.language_model\.layers\.\d+": (
+                "self_attn.q_proj",
+                "self_attn.k_proj",
+                "self_attn.v_proj",
+                "self_attn.o_proj",
+                "mlp.gate_proj",
+                "mlp.up_proj",
+                "mlp.down_proj",
+            ),
+        },
+        save_path=args.save_path,
+        save_single_file=args.save_single_file, # Force single file to use state_dict keys (model.language_model...)
+        check_correctness=args.check_correctness,
+        block_range=args.block_range,
+    )
+if __name__ == "__main__":
+    main()

extras/imagegen_zimage_turbo.sh ADDED Viewed

	@@ -0,0 +1,18 @@

+#!/bin/bash
+# Launch ImageEditServer.py on port 4500 with the "zimage" backend, the fp4 Nunchaku
+# Z-Image-Turbo checkpoint and the NVFP4 text encoder, in UMA mode.
+#source /home/olegk/venv/vllm/bin/activate
+# Stop here if the source directory is missing, rather than starting python from the wrong cwd.
+cd /home/olegk/Nikola/src/imagegen || exit 1
+# Force vLLM's built-in CUTLASS NVFP4 kernel (skips flashinfer-cutlass JIT which
+# needs the `ninja` binary on PATH). The kernel still uses the CUTLASS FP4 GEMM
+# path on Thor (sm_110).
+export VLLM_NVFP4_GEMM_BACKEND=cutlass
+python ImageEditServer.py \
+    --port 4500 \
+    --model /home/olegk/Nikola/models/Z-Image-Turbo \
+    --optimized-model /home/olegk/Nikola/models/nunchaku-z-image-turbo/svdq-fp4_r32-z-image-turbo.safetensors \
+    --backend zimage \
+    --steps 8 \
+    --guidance-scale 0.0 \
+    --uma \
+    --nvfp4-text-encoder /home/olegk/Nikola/models/Z-Image-Turbo-Text-Encoder-NVFP4

extras/imagegen_zimage_turbo_int4.sh ADDED Viewed

	@@ -0,0 +1,11 @@

+#!/bin/bash
+source /home/olegk/venv/vllm/bin/activate
+cd /home/olegk/Nikola/src/imagegen
+python ImageEditServer.py \
+    --port 4500 \
+    --model /home/olegk/Nikola/models/Z-Image-Turbo \
+    --optimized-model /home/olegk/Nikola/models/nunchaku-z-image-turbo/svdq-int4_r32-z-image-turbo.safetensors \
+    --backend zimage \
+    --steps 8 \
+    --guidance-scale 0.0 \
+    --uma

generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+    "bos_token_id": 151643,
+    "do_sample": true,
+    "eos_token_id": [
+        151645,
+        151643
+    ],
+    "pad_token_id": 151643,
+    "temperature": 0.6,
+    "top_k": 20,
+    "top_p": 0.95,
+    "transformers_version": "4.51.0"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eaad037756f1afd7cb847ff4b7c23db02ec56936bb30806903fb57d2b0b1588d
+size 2822178072

recipe.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+default_stage:
+  default_modifiers:
+    QuantizationModifier:
+      targets: [Linear]
+      ignore: [lm_head, 're:.*mlp.gate$', 're:.*mlp.shared_expert_gate$', 're:.*linear_attn.*',
+        're:model\.visual\..*', 're:model\.image_encoder\..*']
+      scheme: NVFP4
+      bypass_divisibility_checks: false

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}