Spaces:

NoobNovel
/

DDIM_Image_Generation

Sleeping

File size: 11,064 Bytes

"""Gradio demo — DDIM Face Generation.

Single-page layout:
  - Top: title + generate controls + output
  - Middle: trajectory GIF + interpolation (collapsible)
  - Bottom: how it works / architecture description
"""
from __future__ import annotations

import argparse
import os
import tempfile
from typing import Optional

# Set ahead of ``import torch`` below — presumably torch reads this variable
# when it is imported, so keep the ordering (TODO confirm against the PyTorch
# MPS notes). ``setdefault`` leaves a value already present in the
# environment untouched.
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

import numpy as np
import torch
from PIL import Image

from sample import load_run
from utils.visualize import interpolate_latents, trajectory_to_gif, make_grid


# ---------------------------------------------------------------------------
# Global state — loaded once at startup
# ---------------------------------------------------------------------------
class State:
    """Bundle of the torch device and the objects returned by ``load_run``.

    Attributes set in ``__init__``:
        device: ``mps`` if available, otherwise ``cuda`` if available,
            otherwise ``cpu``.
        cfg, model, diffusion: the three values returned by ``load_run``.
        image_size, in_channels: copied from ``cfg`` for convenience.
    """

    def __init__(self, ckpt_path: str, prefer_ema: bool = True):
        # Backend preference order: MPS first, then CUDA, then plain CPU.
        if torch.backends.mps.is_available():
            backend = "mps"
        elif torch.cuda.is_available():
            backend = "cuda"
        else:
            backend = "cpu"
        self.device = torch.device(backend)

        loaded = load_run(ckpt_path, self.device, prefer_ema)
        self.cfg, self.model, self.diffusion = loaded

        # Cached because every callback needs them to build the noise shape.
        config = self.cfg
        self.image_size = config.image_size
        self.in_channels = config.in_channels


# None at import time; assigned in main() before the UI is built. build_ui()
# and the cb_* callbacks read it.
STATE: Optional[State] = None


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _seeded(seed: Optional[int]) -> torch.Generator:
    g = torch.Generator(device="cpu")
    if seed is not None and seed >= 0:
        g.manual_seed(int(seed))
    return g


def _grid_pil(samples: torch.Tensor, nrow: int) -> Image.Image:
    """Tile a batch of samples into a single PIL image.

    The batch is moved to the CPU, handed to ``make_grid`` together with
    ``nrow``, and the array it returns is wrapped with ``Image.fromarray``.
    """
    on_cpu = samples.cpu()
    grid = make_grid(on_cpu, nrow=nrow)
    return Image.fromarray(grid)


# ---------------------------------------------------------------------------
# Callbacks
# ---------------------------------------------------------------------------
def cb_generate(num: int, steps: int, seed: float) -> Image.Image:
    """Sample ``num`` images with DDIM (``eta=0.0``) and tile them in a grid.

    Args:
        num: Number of images to sample (value of the UI slider).
        steps: Number of DDIM sampling steps.
        seed: Value of the seed field. ``None`` (the field was cleared) or a
            negative value is forwarded to ``_seeded`` as "no fixed seed".

    Returns:
        A PIL image with the samples laid out ``ceil(sqrt(num))`` per row.
    """
    s = STATE
    count = int(num)
    # gr.Number passes None when the field is empty; int(None) would raise
    # TypeError, so an empty field is treated the same as "no seed".
    g = _seeded(None if seed is None else int(seed))
    shape = (count, s.in_channels, s.image_size, s.image_size)
    # The generator lives on the CPU, so the noise is drawn there and only
    # afterwards moved to the model's device.
    x_T = torch.randn(*shape, generator=g).to(s.device)
    with torch.no_grad():
        out = s.diffusion.ddim_sample(s.model, shape, num_steps=int(steps),
                                      eta=0.0, x_T=x_T, device=s.device)
    # Roughly square grid, sized from the same integer count used for sampling.
    nrow = int(np.ceil(np.sqrt(count)))
    return _grid_pil(out, nrow)


def cb_trajectory(steps: int, seed: float) -> str:
    """Sample one image with DDIM and save its trajectory as a GIF.

    Args:
        steps: Number of DDIM sampling steps.
        seed: Value of the seed field. ``None`` (the field was cleared) or a
            negative value is forwarded to ``_seeded`` as "no fixed seed".

    Returns:
        Path of the ``.gif`` file written by ``trajectory_to_gif``.
    """
    s = STATE
    # gr.Number passes None when the field is empty; int(None) would raise
    # TypeError, so an empty field is treated the same as "no seed".
    g = _seeded(None if seed is None else int(seed))
    shape = (1, s.in_channels, s.image_size, s.image_size)
    x_T = torch.randn(*shape, generator=g).to(s.device)
    with torch.no_grad():
        _, traj = s.diffusion.ddim_sample(
            s.model, shape, num_steps=int(steps), eta=0.0,
            x_T=x_T, device=s.device,
            return_trajectory=True, trajectory_stride=1,
        )
    # delete=False: the file must outlive this function because its path is
    # returned. The handle is closed before the GIF writer opens the path.
    with tempfile.NamedTemporaryFile(suffix=".gif", delete=False) as tmp:
        gif_path = tmp.name
    trajectory_to_gif(traj, gif_path, fps=12)
    return gif_path


def cb_interpolate(frames: int, steps: int, seed_a: float, seed_b: float) -> Image.Image:
    """Sample images from latents interpolated between two noise tensors.

    Args:
        frames: Number of interpolation frames (images in the output row).
        steps: Number of DDIM sampling steps.
        seed_a: Seed field for the first noise tensor.
        seed_b: Seed field for the second noise tensor. For both seeds,
            ``None`` (the field was cleared) or a negative value is forwarded
            to ``_seeded`` as "no fixed seed".

    Returns:
        A PIL image with all ``frames`` samples in a single row.
    """
    s = STATE
    n_frames = int(frames)
    shape_one = (1, s.in_channels, s.image_size, s.image_size)
    # gr.Number passes None when a field is empty; int(None) would raise
    # TypeError, so an empty field is treated the same as "no seed".
    gen_a = _seeded(None if seed_a is None else int(seed_a))
    gen_b = _seeded(None if seed_b is None else int(seed_b))
    z1 = torch.randn(*shape_one, generator=gen_a)
    z2 = torch.randn(*shape_one, generator=gen_b)
    # NOTE(review): squeeze(1) presumably drops a per-frame singleton batch
    # axis added by interpolate_latents — confirm against utils.visualize.
    latents = interpolate_latents(z1, z2, num_steps=n_frames).squeeze(1).to(s.device)
    with torch.no_grad():
        out = s.diffusion.ddim_sample(
            s.model, (n_frames, s.in_channels, s.image_size, s.image_size),
            num_steps=int(steps), eta=0.0, x_T=latents, device=s.device,
        )
    return _grid_pil(out, n_frames)


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
# Markdown text rendered by build_ui() inside the "How it works" accordion.
TECH_MD = """
## How it works

This demo runs a **DDIM (Denoising Diffusion Implicit Model)** trained from scratch — no pretrained weights, no diffusers library.

### The core idea
A diffusion model learns to reverse a noise process. During training, we take a real face and progressively corrupt it with Gaussian noise over T=1000 steps until it's pure noise. The model (a U-Net) learns to predict the noise added at each step. At inference, we start from pure random noise and run the reverse process — but with DDIM we can skip most steps, getting a good result in just 20–50 steps instead of 1000.

### Architecture

```
Input (noise + timestep t)
        │
   ┌────▼────┐
   │  U-Net  │   Channels: [64, 128, 256, 256]
   │         │   Self-attention at 8×8 and 16×16 resolution
   │  Time   │   Sinusoidal time embedding → MLP → injected at every ResBlock
   │ Embed   │   GroupNorm + SiLU activations throughout
   └────┬────┘
        │
   predicted ε (noise)
```

The U-Net has:
- **4 resolution levels** with strided conv downsampling / nearest-neighbour upsampling
- **Residual blocks** with time-step conditioning (FiLM-style additive injection)
- **Multi-head self-attention** at the two lowest resolutions (8×8, 16×16)
- **EMA weights** used for inference — a running exponential average of training weights that produces cleaner samples

### Training
- **Dataset:** CelebA-HQ — 30,000 aligned face photographs at 256×256, resized to 64×64
- **Hardware:** Apple Mac Mini M-series (MPS backend), no cloud GPU
- **Duration:** ~100 epochs, ~14 hours total
- **Optimizer:** AdamW (CPU-resident state to avoid MPS memory pressure)
- **Loss:** simple MSE between predicted and actual noise — `L = ||ε - ε_θ(x_t, t)||²`
- **Noise schedule:** linear β from 1×10⁻⁴ → 0.02 over T=1000 steps

### Sampling modes
| Mode | What it shows |
|------|--------------|
| **Generate** | New faces sampled from pure Gaussian noise via DDIM |
| **Trajectory** | The full denoising path animated as a GIF — from noise to face |
| **Interpolate** | Spherical linear interpolation (slerp) between two noise vectors, showing a smooth transition between two generated faces |

### DDIM speedup
Standard DDPM requires T=1000 sequential network passes. DDIM uses a non-Markovian sampler that achieves comparable quality in 20–50 steps — a **20–50× speedup** with no retraining.

### Built entirely from scratch
Every component is hand-written in PyTorch:
`attention.py` · `unet.py` · `diffusion.py` · `dataset.py` · `train.py`
No Hugging Face Diffusers, no guided-diffusion, no pre-trained encoders.
"""


def build_ui():
    """Build the single-page Gradio ``Blocks`` app and return it (not launched).

    Reads ``STATE.cfg.timesteps`` to size the step sliders, so ``STATE`` must
    already be populated when this is called. The three buttons are wired to
    ``cb_generate``, ``cb_trajectory`` and ``cb_interpolate``.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for
        ``queue()`` / ``launch()``.
    """
    # Imported here rather than at module top, so importing this module does
    # not itself require gradio.
    import gradio as gr

    s = STATE
    max_steps = min(s.cfg.timesteps, 100)   # cap at 100 for CPU

    with gr.Blocks(title="DDIM Face Generation") as demo:

        # Page header. The string starts at column 0 on purpose: it is
        # passed to Markdown verbatim.
        gr.Markdown("""
# 🧠 DDIM Face Generation
**Denoising Diffusion Implicit Model trained from scratch on CelebA-HQ.**
Generates novel human faces by reversing a learned noise process — no pretrained weights used.
> ⏱️ Running on CPU — generation takes ~30–60 seconds. Use **seed ≥ 0** to reproduce results.
        """)

        # ── Generate ──────────────────────────────────────────────────
        # Controls in the narrower left column, output image on the right.
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### ⚙️ Controls")
                num    = gr.Slider(1, 9, value=4, step=1, label="Number of faces")
                steps  = gr.Slider(10, max_steps, value=20, step=5,
                                   label="DDIM steps  (more = sharper, slower)")
                seed   = gr.Number(value=-1, label="Seed  (-1 = random each time)")
                gen_btn = gr.Button("✨ Generate Faces", variant="primary", size="lg")

            with gr.Column(scale=2):
                gr.Markdown("### 🖼️ Output")
                gen_out = gr.Image(label="Generated faces", type="pil",
                                   show_label=False, height=400)

        # Input order must match cb_generate(num, steps, seed).
        gen_btn.click(cb_generate, [num, steps, seed], gen_out)

        gr.Markdown("---")

        # ── Trajectory & Interpolation (accordion) ────────────────────
        with gr.Accordion("🎞️ Denoising Trajectory  (noise → face GIF)", open=False):
            gr.Markdown("Watch a single face emerge from pure Gaussian noise step by step.")
            with gr.Row():
                t_steps = gr.Slider(10, max_steps, value=20, step=5, label="Steps")
                t_seed  = gr.Number(value=42, label="Seed")
                t_btn   = gr.Button("Animate", variant="secondary")
            # type="filepath" because cb_trajectory returns the path of the GIF.
            t_out = gr.Image(label="Denoising trajectory", type="filepath")
            # Input order must match cb_trajectory(steps, seed).
            t_btn.click(cb_trajectory, [t_steps, t_seed], t_out)

        with gr.Accordion("🔀 Latent Interpolation  (face A → face B)", open=False):
            gr.Markdown(
                "Spherical linear interpolation (slerp) between two noise vectors — "
                "each column is a smooth blend between two independently sampled faces."
            )
            with gr.Row():
                i_frames = gr.Slider(4, 10, value=6, step=1, label="Frames")
                i_steps  = gr.Slider(10, max_steps, value=20, step=5, label="DDIM steps")
                i_seed_a = gr.Number(value=0,  label="Seed A")
                i_seed_b = gr.Number(value=7,  label="Seed B")
                i_btn    = gr.Button("Interpolate", variant="secondary")
            i_out = gr.Image(label="A ⟶ B interpolation", type="pil")
            # Input order must match cb_interpolate(frames, steps, seed_a, seed_b).
            i_btn.click(cb_interpolate, [i_frames, i_steps, i_seed_a, i_seed_b], i_out)

        gr.Markdown("---")

        # ── Tech description ──────────────────────────────────────────
        with gr.Accordion("📖 How it works — architecture, training & theory", open=False):
            gr.Markdown(TECH_MD)

        # Footer with the project link.
        gr.Markdown(
            "<div style='text-align:center;color:#888;font-size:0.85em'>"
            "Built from scratch · PyTorch · CelebA-HQ · Apple Silicon · "
            "<a href='https://github.com/Gh-Novel/DDIM_Image_Generation' target='_blank'>GitHub</a>"
            "</div>"
        )

    return demo


# ---------------------------------------------------------------------------
# Checkpoint used when --ckpt is not given: checkpoints/stage-64_best.pt next
# to this file, resolved to an absolute path so the working directory does not
# matter.
DEFAULT_CKPT = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "checkpoints",
    "stage-64_best.pt",
)


def parse_args():
    """Parse the command-line options of the demo.

    Options:
        --ckpt    checkpoint path (default: ``DEFAULT_CKPT``)
        --no-ema  flag, stored as ``no_ema``
        --share   flag, forwarded to ``launch(share=...)`` by ``main``
        --port    integer server port (default: 7860)

    Returns:
        The ``argparse.Namespace`` parsed from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ckpt", default=DEFAULT_CKPT)
    # Both boolean switches default to False and flip to True when present.
    for switch in ("--no-ema", "--share"):
        parser.add_argument(switch, action="store_true")
    parser.add_argument("--port", type=int, default=7860)
    namespace = parser.parse_args()
    return namespace


def main():
    """Parse CLI options, load the checkpoint into ``STATE`` and serve the UI."""
    global STATE
    opts = parse_args()

    # --no-ema inverts the default preference for EMA weights.
    use_ema = not opts.no_ema
    STATE = State(opts.ckpt, prefer_ema=use_ema)

    # build_ui() reads STATE, so it has to run after the assignment above.
    app = build_ui()
    app.queue()

    launch_options = dict(
        server_name="0.0.0.0",   # required for HF Spaces Docker
        server_port=opts.port,
        share=opts.share,
    )
    app.launch(**launch_options)


# Start the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()