Spaces:

TeamSAS
/

ImageColorizer

Paused

App Files Files Community

sivakum4 committed on Aug 9, 2025

Commit

2d3cd80

1 Parent(s): 3d2e89f

Initial commit

Browse files

Files changed (7) hide show

.DS_Store +0 -0
app.py +223 -0
checkpoints/checkpoint_epoch_015_20250808_154437.pt +3 -0
examples/Places365_test_00000287.jpg +0 -0
examples/Places365_test_00000314.jpg +0 -0
model.py +236 -0
requirements.txt +11 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,223 @@

+# app.py — Gradio-native metrics, clean UI, CUDA/CPU only
+import os, math, cv2, base64
+import torch, numpy as np, gradio as gr
+from PIL import Image
+# Optional (fine if missing)
+try:
+    import kornia.color as kc
+except Exception:
+    kc = None
+from skimage.metrics import peak_signal_noise_ratio as psnr_metric
+from skimage.metrics import structural_similarity as ssim_metric
+# ---------------- Device & Model (no MPS) ----------------
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # CUDA when available, otherwise CPU
+from model import ViTUNetColorizer
+CKPT = "checkpoints/checkpoint_epoch_015_20250808_154437.pt"  # relative path, resolved against the working directory
+model = None  # stays None when the checkpoint file does not exist
+if os.path.exists(CKPT):
+    model = ViTUNetColorizer(vit_model_name="vit_tiny_patch16_224").to(device)
+    state = torch.load(CKPT, map_location=device)  # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints
+    sd = state.get("generator_state_dict", state)  # use the "generator_state_dict" entry when present, else the loaded object itself
+    model.load_state_dict(sd)
+    model.eval()  # switch to evaluation mode; the app only runs inference
+# ---------------- Utils ----------------
+def is_grayscale(img: Image.Image) -> bool:
+    a = np.array(img)
+    if a.ndim == 2: return True
+    if a.ndim == 3 and a.shape[2] == 1: return True
+    if a.ndim == 3 and a.shape[2] == 3:
+        return np.allclose(a[...,0], a[...,1]) and np.allclose(a[...,1], a[...,2])
+    return False
+def to_L(rgb_np: np.ndarray):
+    # ViTUNetColorizer expects L in [0,1]
+    if kc is None:
+        gray = cv2.cvtColor(rgb_np, cv2.COLOR_RGB2GRAY).astype(np.float32)
+        L = gray / 100.0
+        return torch.from_numpy(L).unsqueeze(0).unsqueeze(0).float().to(device)
+    t = torch.from_numpy(rgb_np.astype(np.float32)/255.).permute(2,0,1).unsqueeze(0).to(device)
+    with torch.no_grad():
+        return kc.rgb_to_lab(t)[:,0:1]/100.0
+def lab_to_rgb(L, ab):
+    if kc is None:
+        lab = torch.cat([L*100.0, torch.clamp(ab, -1, 1)*110.0], dim=1)[0].permute(1,2,0).cpu().numpy()
+        lab = np.clip(lab, [0,-128,-128], [100,127,127]).astype(np.float32)
+        rgb = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)
+        return (np.clip(rgb,0,1)*255).astype(np.uint8)
+    lab = torch.cat([L*100.0, torch.clamp(ab, -1, 1)*110.0], dim=1)
+    with torch.no_grad():
+        rgb = kc.lab_to_rgb(lab)
+    return (torch.clamp(rgb,0,1)[0].permute(1,2,0).cpu().numpy()*255).astype(np.uint8)
+def pad_to_multiple(img_np, m=16):
+    h,w = img_np.shape[:2]
+    ph, pw = math.ceil(h/m)*m, math.ceil(w/m)*m
+    return cv2.copyMakeBorder(img_np,0,ph-h,0,pw-w,cv2.BORDER_CONSTANT,value=(0,0,0)), (h,w)
+def compute_metrics(pred, gt):
+    p = pred.astype(np.float32)/255.; g = gt.astype(np.float32)/255.
+    mae  = float(np.mean(np.abs(p-g)))
+    psnr = float(psnr_metric(g, p, data_range=1.0))
+    try:
+        ssim = float(ssim_metric(g, p, channel_axis=2, data_range=1.0, win_size=7))
+    except TypeError:
+        ssim = float(ssim_metric(g, p, multichannel=True, data_range=1.0, win_size=7))
+    return round(mae,4), round(psnr,2), round(ssim,4)
+# ---------------- Inference ----------------
+def infer(image: Image.Image, want_metrics: bool, sizing_mode: str, show_L: bool):
+    if image is None:
+        return None, None, None, None, None, "", ""
+    if model is None:
+        return None, None, None, None, None, "", "<div>Checkpoint not found in /checkpoints.</div>"
+    pil = image.convert("RGB")
+    rgb = np.array(pil)
+    w,h = pil.size
+    was_color = not is_grayscale(pil)
+    if sizing_mode == "Pad to keep size":
+        proc, (oh, ow) = pad_to_multiple(rgb, 16); back = (ow, oh)
+    else:
+        proc = cv2.resize(rgb, (256,256), interpolation=cv2.INTER_CUBIC); back = (w,h)
+    L = to_L(proc)
+    with torch.no_grad():
+        ab = model(L)
+    out = lab_to_rgb(L, ab)
+    if sizing_mode == "Pad to keep size":
+        out = out[:back[1], :back[0]]
+    else:
+        out = cv2.resize(out, back, interpolation=cv2.INTER_CUBIC)
+    # Metrics (Gradio-native numbers)
+    mae = psnr = ssim = None
+    if want_metrics:
+        mae, psnr, ssim = compute_metrics(out, np.array(pil))
+    # Optional L preview
+    extra_html = ""
+    if show_L:
+        L01 = np.clip(L[0,0].detach().cpu().numpy(),0,1)
+        L_vis = (L01*255).astype(np.uint8)
+        L_vis = cv2.cvtColor(L_vis, cv2.COLOR_GRAY2RGB)
+        _, buf = cv2.imencode(".png", cv2.cvtColor(L_vis, cv2.COLOR_RGB2BGR))
+        L_b64 = "data:image/png;base64," + base64.b64encode(buf).decode()
+        extra_html += f"<div><b>L-channel</b><br/><img style='max-height:140px;border-radius:12px' src='{L_b64}'/></div>"
+    # Subtle notice only if needed
+    if was_color:
+        extra_html += "<div style='opacity:.8;margin-top:8px'>We used a grayscale version of your image for colorization.</div>"
+    # Compare slider (HTML only; easy to remove if you want 100% Gradio)
+    _, bo = cv2.imencode(".jpg", cv2.cvtColor(np.array(pil), cv2.COLOR_RGB2BGR))
+    _, bc = cv2.imencode(".jpg", cv2.cvtColor(out,         cv2.COLOR_RGB2BGR))
+    so = "data:image/jpeg;base64," + base64.b64encode(bo).decode()
+    sc = "data:image/jpeg;base64," + base64.b64encode(bc).decode()
+    compare = f"""
+    <div style="position:relative;max-width:500px;margin:auto;border-radius:14px;overflow:hidden;box-shadow:0 8px 20px rgba(0,0,0,.2)">
+    <img src="{so}" style="width:100%;display:block"/>
+    <div id="cmpTop" style="position:absolute;top:0;left:0;height:100%;width:50%;overflow:hidden">
+        <img src="{sc}" style="width:100%;display:block"/>
+    </div>
+    <input id="cmpRange" type="range" min="0" max="100" value="50"
+            oninput="document.getElementById('cmpTop').style.width=this.value+'%';"
+            style="position:absolute;left:0;right:0;bottom:8px;width:60%;margin:auto"/>
+    </div>
+    """
+    return Image.fromarray(np.array(pil)), Image.fromarray(out), mae, psnr, ssim, compare, extra_html
+# ---------------- Theme (fallback-safe) ----------------
+def make_theme():
+    try:
+        from gradio.themes.utils import colors, fonts, sizes
+        return gr.themes.Soft(
+            primary_hue=colors.indigo,
+            neutral_hue=colors.gray,
+            font=fonts.GoogleFont("Inter"),
+        ).set(radius_size=sizes.radius_lg, spacing_size=sizes.spacing_md)
+    except Exception:
+        return gr.themes.Soft()
+THEME = make_theme()
+# ---------------- UI ----------------
+with gr.Blocks(theme=THEME, title="Neural Colorizer") as demo:
+    gr.Markdown("# 🎨 Neural Colorizer")
+    with gr.Row():
+        with gr.Column(scale=5):
+            img_in = gr.Image(
+                label="Upload grayscale or color image",
+                type="pil",
+                image_mode="RGB",
+                height=320,
+                sources=["upload", "clipboard"]
+            )
+            with gr.Row():
+                sizing = gr.Radio(
+                    ["Resize to 256", "Pad to keep size"],
+                    value="Resize to 256",
+                    label="Sizing"
+                )
+                show_L = gr.Checkbox(label="Show L-channel", value=False)
+            show_m = gr.Checkbox(label="Show metrics", value=True)
+            with gr.Row():
+                run = gr.Button("Colorize")
+                clr = gr.Button("Clear")
+            examples = gr.Examples(
+                examples=[os.path.join("examples", f) for f in os.listdir("examples")] if os.path.exists("examples") else [],
+                inputs=img_in,
+                examples_per_page=8,
+                label=None
+            )
+        with gr.Column(scale=7):
+            with gr.Row():
+                orig = gr.Image(label="Original", interactive=False, height=300, show_download_button=True)
+                out  = gr.Image(label="Result",   interactive=False, height=300, show_download_button=True)
+            # Pure Gradio metric fields
+            with gr.Row():
+                mae_box  = gr.Number(label="MAE",       interactive=False, precision=4)
+                psnr_box = gr.Number(label="PSNR (dB)", interactive=False, precision=2)
+                ssim_box = gr.Number(label="SSIM",      interactive=False, precision=4)
+            gr.Markdown("**Compare**")
+            compare = gr.HTML()
+            extras  = gr.HTML()
+    def _go(image, want_metrics, sizing_mode, show_L):
+        o, c, mae, psnr, ssim, cmp_html, extra = infer(image, want_metrics, sizing_mode, show_L)
+        if not want_metrics:
+            mae = psnr = ssim = None
+        return o, c, mae, psnr, ssim, cmp_html, extra
+    run.click(
+        _go,
+        inputs=[img_in, show_m, sizing, show_L],
+        outputs=[orig, out, mae_box, psnr_box, ssim_box, compare, extras]
+    )
+    def _clear():
+        return None, None, None, None, None, "", ""
+    clr.click(_clear, inputs=None, outputs=[orig, out, mae_box, psnr_box, ssim_box, compare, extras])
+if __name__ == "__main__":
+    # No queue, no API panel
+    try:
+        demo.launch(show_api=False)
+    except TypeError:  # presumably a Gradio release whose launch() lacks show_api; retry with defaults
+        demo.launch()

checkpoints/checkpoint_epoch_015_20250808_154437.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:160e9bdb21f474f0da5dd059866966e0fe74b3ae4008307f5e9b1e245b3019c1
+size 84569969

examples/Places365_test_00000287.jpg ADDED Viewed

examples/Places365_test_00000314.jpg ADDED Viewed

model.py ADDED Viewed

	@@ -0,0 +1,236 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import timm
+import json
+from torch.nn.utils import spectral_norm
+from torchinfo import summary
+class EncoderBlock(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(EncoderBlock, self).__init__()
+        self.conv_block = nn.Sequential(
+                            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
+                            nn.GroupNorm(8, out_channels),
+                            nn.LeakyReLU(0.01, inplace=True),
+                            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
+                            nn.GroupNorm(8, out_channels),
+                            nn.LeakyReLU(0.01, inplace=True),
+                        )
+        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
+    def forward(self, x):
+        features = self.conv_block(x)
+        pooled = self.pool(features)
+        return pooled, features
+class DecoderBlock(nn.Module):
+    def __init__(self, in_channels, skip_channels, out_channels):
+        super(DecoderBlock, self).__init__()
+        self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
+        self.ag = AttentionGate(F_g=in_channels // 2, F_l=skip_channels, F_int=in_channels // 4)
+        conv_in_channels = in_channels // 2 + skip_channels
+        self.conv_block = nn.Sequential(
+                            nn.Conv2d(conv_in_channels, out_channels, kernel_size=3, padding=1),
+                            nn.GroupNorm(8, out_channels),
+                            nn.LeakyReLU(0.01, inplace=True),
+                            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
+                            nn.GroupNorm(8, out_channels),
+                            nn.LeakyReLU(0.01, inplace=True),
+                        )
+    def forward(self, x, skip):
+        x = self.up(x)
+        skip = self.ag(x, skip)
+        x = torch.cat([x, skip], dim=1)
+        x = self.conv_block(x)
+        return x
+class AttentionGate(nn.Module):
+    def __init__(self, F_g, F_l, F_int):
+        super(AttentionGate, self).__init__()
+        self.W_g = nn.Sequential(
+                    nn.Conv2d(F_g, F_int, kernel_size=1, stride=1, padding=0, bias=True),
+                    nn.GroupNorm(8, F_int),
+                )
+        self.W_x = nn.Sequential(
+                    nn.Conv2d(F_l, F_int, kernel_size=1, stride=1, padding=0, bias=True),
+                    nn.GroupNorm(8, F_int),
+                )
+        self.psi = nn.Sequential(
+                    nn.Conv2d(F_int, 1, kernel_size=1, stride=1, padding=0, bias=True),
+                    nn.GroupNorm(1, 1),
+                    nn.Sigmoid(),
+                )
+        self.relu = nn.LeakyReLU(0.01, inplace=True)
+    def forward(self, g, x):
+        g1 = self.W_g(g)
+        x1 = self.W_x(x)
+        psi = self.relu(g1 + x1)
+        psi = self.psi(psi)
+        return x * psi
+class ViTUNetColorizer(nn.Module):
+    def __init__(self, vit_model_name="vit_tiny_patch16_224", freeze_vit_epochs=10):
+        super(ViTUNetColorizer, self).__init__()
+        self.vit = timm.create_model(vit_model_name, pretrained=True, num_classes=0)
+        self.vit_embed_dim = self.vit.embed_dim
+        self.vit.head = nn.Identity()
+        self.enc1 = EncoderBlock(1, 16)
+        self.enc2 = EncoderBlock(16, 32)
+        self.enc3 = EncoderBlock(32, 64)
+        self.enc4 = EncoderBlock(64, 128)
+        self.bottleneck_processor = nn.Sequential(
+            nn.Conv2d(128, 128, kernel_size=3, padding=1),
+            nn.GroupNorm(8, 128),
+            nn.LeakyReLU(0.01, inplace=True),
+            nn.AdaptiveAvgPool2d((14, 14)),
+        )
+        self.fusion_layer = nn.Sequential(
+            nn.Conv2d(128 + self.vit_embed_dim, 128, kernel_size=1), # type: ignore
+            nn.GroupNorm(8, 128),
+            nn.LeakyReLU(0.01, inplace=True),
+            nn.Conv2d(128, 128, kernel_size=3, padding=1),
+            nn.GroupNorm(8, 128),
+            nn.LeakyReLU(0.01, inplace=True),
+        )
+        self.dec4 = DecoderBlock(128, 64, 64)
+        self.dec3 = DecoderBlock(64, 32, 32)
+        self.dec2 = DecoderBlock(32, 16, 16)
+        self.final_conv = nn.Sequential(
+            nn.Conv2d(16, 8, kernel_size=3, padding=1),
+            nn.GroupNorm(8, 8),
+            nn.LeakyReLU(0.01, inplace=True),
+            nn.Conv2d(8, 2, kernel_size=1),
+            nn.Tanh(),
+        )
+        self.freeze_vit_epochs = freeze_vit_epochs
+        self.current_epoch = 0
+    def extract_vit_features(self, x):
+        B = x.shape[0]
+        x_3ch = x.repeat(1, 3, 1, 1)
+        if x_3ch.shape[-1] != 224:
+            x_3ch = F.interpolate(
+                x_3ch, size=(224, 224), mode="bicubic", align_corners=False
+            )
+        x_vit = self.vit.patch_embed(x_3ch) # type: ignore
+        if hasattr(self.vit, 'pos_embed') and self.vit.pos_embed is not None:
+            x_vit = x_vit + self.vit.pos_embed[:, 1:, :] # type: ignore
+        x_vit = self.vit.pos_drop(x_vit) # type: ignore
+        for block in self.vit.blocks: # type: ignore
+            x_vit = block(x_vit)
+        x_vit = self.vit.norm(x_vit) # type: ignore
+        x_vit = x_vit.transpose(1, 2).reshape(B, self.vit_embed_dim, 14, 14)
+        return x_vit
+    def forward(self, x):
+        x1, skip1 = self.enc1(x)
+        x2, skip2 = self.enc2(x1)
+        x3, skip3 = self.enc3(x2)
+        x4, skip4 = self.enc4(x3)
+        bottleneck = self.bottleneck_processor(x4)
+        vit_features = self.extract_vit_features(x)
+        fused = torch.cat([bottleneck, vit_features], dim=1)
+        fused = self.fusion_layer(fused)
+        fused = F.interpolate(fused, size=x3.shape[2:], mode="bilinear", align_corners=False)
+        d4 = self.dec4(fused, skip3)
+        d3 = self.dec3(d4, skip2)
+        d2 = self.dec2(d3, skip1)
+        out = self.final_conv(d2)
+        return out
+    def set_epoch(self, epoch):
+        self.current_epoch = epoch
+        requires_grad = epoch >= self.freeze_vit_epochs
+        for param in self.vit.parameters():
+            param.requires_grad = requires_grad
+    def get_param_groups(self, lr_decoder=1e-4, lr_vit=1e-5):
+        vit_params = []
+        decoder_params = []
+        for name, param in self.named_parameters():
+            if "vit" in name:
+                vit_params.append(param)
+            else:
+                decoder_params.append(param)
+        return [
+            {"params": decoder_params, "lr": lr_decoder},
+            {"params": vit_params, "lr": lr_vit},
+        ]
+class PatchDiscriminator(nn.Module):
+    def __init__(self, in_channels=3, n_filters=64):
+        super(PatchDiscriminator, self).__init__()
+        def discriminator_block(in_filters, out_filters, stride=2):
+            return [
+                spectral_norm(
+                    nn.Conv2d(
+                        in_filters, out_filters, kernel_size=4, stride=stride, padding=1
+                    )
+                ),
+                nn.LeakyReLU(0.01, inplace=True)
+            ]
+        self.model = nn.Sequential(
+            *discriminator_block(in_channels, n_filters),
+            *discriminator_block(n_filters, n_filters * 2),
+            *discriminator_block(n_filters * 2, n_filters * 4),
+            spectral_norm(nn.Conv2d(n_filters * 4, 1, kernel_size=4, padding=1))
+        )
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Conv2d):
+            nn.init.normal_(m.weight, 0.0, 0.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+    def forward(self, L, ab):
+        img_input = torch.cat((L, ab), dim=1)
+        return self.model(img_input)
+if __name__ == "__main__":
+    try:
+        with open("hyperparameters.json", "r") as f:
+            hparams = json.load(f)
+        resolution = hparams.get("resolution", 256)
+    except FileNotFoundError:
+        resolution = 256
+        print("Using default resolution: 256x256")
+    generator = ViTUNetColorizer()
+    generator_input_size = (1, 1, resolution, resolution)
+    summary(generator, input_size=generator_input_size)
+    discriminator = PatchDiscriminator()
+    discriminator_input_size = [(1, 1, resolution, resolution), (1, 2, resolution, resolution)]
+    summary(discriminator, input_size=discriminator_input_size)

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+gradio
+torch
+torchvision
+torchinfo
+numpy
+opencv-python-headless
+Pillow
+scikit-image
+kornia
+matplotlib
+timm