""" Gemma Diffusion — live website builder (gradio.Server backend + custom frontend). ZeroGPU port. `gradio.Server` (a FastAPI subclass) gives us Gradio's queue + SSE streaming while we serve our own hand-written HTML/CSS/JS frontend. The single streaming endpoint `/generate` runs the block-diffusion model and yields JSON frames (one per denoising step) that the frontend renders side-by-side: the raw HTML canvas diffusing on the left, the live rendered page on the right. ZeroGPU specifics: - `import spaces` happens before `torch`. - The model is loaded once at module scope with `.to("cuda")` (ZeroGPU registers it). - The actual `model.generate` call lives inside the `@spaces.GPU` function `_gpu_stream`; the `gradio.Server` endpoint only marshals picklable CPU tensors in/out of it. Refs: - https://huggingface.co/blog/introducing-gradio-server - https://huggingface.co/docs/hub/spaces-zerogpu """ import glob import os import subprocess import sys # Set before torch is imported (transformers pulls torch in). os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") import spaces # must precede torch so ZeroGPU can patch it def _ensure_transformers(): """Install the bundled custom DiffusionGemma `transformers` wheel at runtime. Spaces installs `requirements.txt` *before* copying the repo files into the image, so the wheel can't be referenced by local path there. By the time this app runs the file is present in the working directory, so we install it here (only if a stock / no transformers is importable) before importing torch/transformers below. """ try: import transformers # noqa: F401 if hasattr(transformers, "DiffusionGemmaForBlockDiffusion") or hasattr( getattr(transformers, "models", object), "diffusion_gemma" ): return except Exception: pass wheels = sorted(glob.glob(os.path.join(os.path.dirname(os.path.abspath(__file__)), "transformers-*.whl"))) if not wheels: return print(f"[gdiff] Installing bundled transformers wheel: {os.path.basename(wheels[0])}", flush=True) subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", wheels[0]]) import importlib importlib.invalidate_caches() _ensure_transformers() import json import queue as queue_lib import re import threading import time as _time import torch from fastapi.responses import HTMLResponse from gradio import Server from transformers import AutoTokenizer, DiffusionGemmaForBlockDiffusion from transformers.generation.streamers import BaseStreamer HERE = os.path.dirname(os.path.abspath(__file__)) MODEL_PATH = os.environ.get("GDIFF_MODEL_PATH", "google/diffusiongemma-26B-A4B-it") HF_TOKEN = os.environ.get("HF_TOKEN") MAX_ITERS_CAP = 120 # hard cap on denoising steps per block # ZeroGPU: the 26B checkpoint (~49 GB bf16) needs the full backing card. GPU_SIZE = os.environ.get("GDIFF_GPU_SIZE", "xlarge") SYSTEM_PROMPT = ( "You are an expert front-end web developer with great visual taste. When asked to " "build or change a web page, respond with a SINGLE, complete, self-contained HTML5 " "document. Put all CSS in a