""" Z-Image-Turbo GGUF API - Using stable-diffusion-cpp-python Optimized for CPU inference with quantized models """ import os import io import base64 import random import gc from pathlib import Path from PIL import Image from fastapi import FastAPI, HTTPException from fastapi.responses import HTMLResponse from pydantic import BaseModel import uvicorn from huggingface_hub import hf_hub_download app = FastAPI(title="Z-Image-Turbo GGUF API") # Global model sd_model = None MODELS_DIR = Path("/tmp/models") class GenerateRequest(BaseModel): prompt: str width: int = 512 height: int = 512 seed: int = -1 num_steps: int = 8 class GenerateResponse(BaseModel): image_base64: str seed: int status: str def download_models(): """Download GGUF models from HuggingFace""" MODELS_DIR.mkdir(parents=True, exist_ok=True) models = { "diffusion": { "repo": "leejet/Z-Image-Turbo-GGUF", "file": "z_image_turbo-Q4_0.gguf", # Q4 for balance of speed/quality "local": MODELS_DIR / "z_image_turbo.gguf" }, "llm": { "repo": "unsloth/Qwen3-4B-Instruct-2507-GGUF", "file": "Qwen3-4B-Instruct-2507-Q4_K_M.gguf", "local": MODELS_DIR / "qwen3_4b.gguf" }, "vae": { "repo": "Comfy-Org/z_image_turbo", # Z-Image VAE (same as FLUX) "file": "split_files/vae/ae.safetensors", "local": MODELS_DIR / "ae.safetensors" } } for name, model in models.items(): if not model["local"].exists(): print(f"Downloading {name} model...") hf_hub_download( repo_id=model["repo"], filename=model["file"], local_dir=MODELS_DIR, local_dir_use_symlinks=False ) # Rename to expected name downloaded = MODELS_DIR / model["file"] if downloaded.exists(): downloaded.rename(model["local"]) print(f"{name} downloaded!") else: print(f"{name} already exists") return models def load_model(): """Load the Z-Image GGUF model""" global sd_model if sd_model is None: print("Loading Z-Image-Turbo GGUF model...") from stable_diffusion_cpp import StableDiffusion models = download_models() sd_model = StableDiffusion( diffusion_model_path=str(models["diffusion"]["local"]), llm_path=str(models["llm"]["local"]), vae_path=str(models["vae"]["local"]), offload_params_to_cpu=True, diffusion_flash_attn=True, ) print("Model loaded!") return sd_model @app.get("/", response_class=HTMLResponse) async def root(): """Simple HTML interface""" return """ Z-Image-Turbo GGUF API

        <h1>🎨 Z-Image-Turbo API</h1>
        <p><strong>GGUF Quantized</strong> Generate images from text using AI - Optimized for CPU</p>
        <p>GGUF Quantized Model - Faster and lighter than full model. First run downloads ~6GB of models.</p>
    </body>
    </html>
""" @app.post("/generate", response_model=GenerateResponse) async def generate(request: GenerateRequest): """Generate an image from text prompt using GGUF model""" try: model = load_model() seed = request.seed if seed == -1: seed = random.randint(0, 2147483647) width = min(max(request.width, 256), 1024) height = min(max(request.height, 256), 1024) print(f"Generating: '{request.prompt[:50]}...' at {width}x{height}, seed={seed}") # Generate image using stable-diffusion-cpp output = model.generate_image( prompt=request.prompt, width=width, height=height, cfg_scale=1.0, # Low CFG for turbo models sample_steps=request.num_steps, seed=seed, ) print(f"Output type: {type(output)}") # Handle different output formats from stable-diffusion-cpp if isinstance(output, list): # Returns list of images, take first one img_data = output[0] else: img_data = output # Convert to PIL Image based on data type if isinstance(img_data, bytes): image = Image.open(io.BytesIO(img_data)) elif hasattr(img_data, 'data'): # Raw pixel data image = Image.frombytes('RGB', (width, height), img_data.data) elif hasattr(img_data, 'tobytes'): # NumPy array or similar import numpy as np arr = np.array(img_data) image = Image.fromarray(arr.astype('uint8')) elif isinstance(img_data, Image.Image): image = img_data else: # Try direct conversion image = Image.fromarray(img_data) # Convert to base64 buffer = io.BytesIO() image.save(buffer, format="PNG") image_base64 = base64.b64encode(buffer.getvalue()).decode() gc.collect() return GenerateResponse( image_base64=image_base64, seed=seed, status="success" ) except Exception as e: print(f"Error: {e}") import traceback traceback.print_exc() raise HTTPException(status_code=500, detail=str(e)) @app.get("/health") async def health(): return {"status": "ok", "model": "Z-Image-Turbo-GGUF"} if __name__ == "__main__": port = int(os.environ.get("PORT", 7860)) uvicorn.run(app, host="0.0.0.0", port=port)