Library-Monika
/

SPACE-ANIME

Model card Files Files and versions

xet

Community

Plana-Archive committed 25 days ago

Commit

f19179c

verified ·

1 Parent(s): 658d9e6

Upload wd-tagger-heatmap-more-models/tagger/model.py with huggingface_hub

Browse files

Files changed (1) hide show

wd-tagger-heatmap-more-models/tagger/model.py +206 -0

wd-tagger-heatmap-more-models/tagger/model.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import math
+from pathlib import Path
+import colorcet as cc
+import cv2
+import numpy as np
+import timm
+import torch
+from PIL import Image
+from matplotlib.colors import LinearSegmentedColormap
+from timm.data import create_transform, resolve_data_config
+from timm.models import VisionTransformer
+from torch import Tensor, nn
+from torch.nn import functional as F
+from torchvision import transforms as T
+from .common import Heatmap, ImageLabels, LabelData, pil_make_grid
# Working directory: the folder containing this file when ``__file__`` is
# defined, otherwise (e.g. an interactive session) the current directory.
if "__file__" in locals():
    work_dir = Path(__file__).parent.resolve()
else:
    work_dir = Path.cwd().resolve()

# Scratch folder below the working directory; created as soon as this
# module-level code runs.
temp_dir = work_dir / "temp"
temp_dir.mkdir(parents=True, exist_ok=True)

# Caches keyed by Hugging Face repo id, so each model and each transform is
# only built once per process.
model_cache: dict[str, VisionTransformer] = {}
transform_cache: dict[str, T.Compose] = {}

# Default device: CUDA when available, CPU otherwise.
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class RGBtoBGR(nn.Module):
    """Transform that reorders the channel axis from RGB to BGR."""

    def forward(self, x: Tensor) -> Tensor:
        """Return ``x`` with channels 0 and 2 swapped.

        A 4-D input is indexed on dim 1 (after a leading batch dim); any other
        input is indexed on dim 0.
        """
        bgr_order = [2, 1, 0]
        if x.ndim != 4:
            return x[bgr_order, :, :]
        return x[:, bgr_order, :, :]
def model_device(model: nn.Module) -> torch.device:
    """Return the device on which ``model`` stores its tensors.

    The first parameter's device is used. A module without parameters falls
    back to its first buffer, and a module with neither reports CPU, instead
    of leaking a bare ``StopIteration`` to the caller.
    """
    first_param = next(model.parameters(), None)
    if first_param is not None:
        return first_param.device
    first_buffer = next(model.buffers(), None)
    if first_buffer is not None:
        return first_buffer.device
    return torch.device("cpu")
def load_model(repo_id: str) -> VisionTransformer:
    """Return the model for ``repo_id``, creating and caching it on first use.

    A newly created model is loaded through timm from the ``hf-hub:`` source
    with pretrained weights, put in eval mode and moved to ``torch_device``
    before being stored in ``model_cache``. A model already present in the
    cache is returned as-is.
    """
    cached = model_cache.get(repo_id)
    if cached is None:
        cached = timm.create_model("hf-hub:" + repo_id, pretrained=True).eval().to(torch_device)
        model_cache[repo_id] = cached
    return cached
def load_model_and_transform(repo_id: str) -> tuple[VisionTransformer, T.Compose]:
    """Return the cached model for ``repo_id`` plus its preprocessing pipeline.

    The model comes from ``model_cache``; when missing it is created through
    timm and put in eval mode. NOTE: unlike ``load_model`` this function does
    not move a newly created model to ``torch_device``, and both functions
    share ``model_cache``, so the device of the returned model depends on
    which of them populated the cache first.

    The transform is the timm transform resolved from the model's pretrained
    config, with an ``RGBtoBGR`` step appended at the end.
    """
    model = model_cache.get(repo_id)
    if model is None:
        model = timm.create_model("hf-hub:" + repo_id, pretrained=True).eval()
        model_cache[repo_id] = model

    transform = transform_cache.get(repo_id)
    if transform is None:
        data_config = resolve_data_config(model.pretrained_cfg, model=model)
        base_transform = create_transform(**data_config)
        # Append the channel swap after timm's own steps and remember the result.
        transform = T.Compose(base_transform.transforms + [RGBtoBGR()])
        transform_cache[repo_id] = transform

    return model, transform
def _filter_and_rank(scored: list, indices, threshold: float) -> dict:
    """Map name -> prob for ``indices`` whose prob exceeds ``threshold``, highest first."""
    kept = {name: prob for name, prob in (scored[i] for i in indices) if prob > threshold}
    return dict(sorted(kept.items(), key=lambda item: item[1], reverse=True))


def get_tags(
    probs: Tensor,
    labels: LabelData,
    gen_threshold: float,
    char_threshold: float,
) -> tuple[str, str, dict, dict, dict]:
    """Turn per-label probabilities into caption strings and label dicts.

    Args:
        probs: Probabilities aligned with ``labels.names``; converted with
            ``.numpy()``, so the tensor must support that call.
        labels: Label metadata providing ``names`` plus the ``rating``,
            ``general`` and ``character`` index collections.
        gen_threshold: General tags are kept when their probability is
            strictly greater than this value.
        char_threshold: Same cut-off for character tags.

    Returns:
        ``(caption, booru, rating_labels, char_labels, gen_labels)``.
        ``caption`` lists the kept general tags followed by the kept character
        tags, each group ordered by descending probability, joined by ``", "``
        with parentheses backslash-escaped. ``booru`` is ``caption`` with
        underscores replaced by spaces. The three dicts map tag name to
        probability; ratings are not thresholded.
    """
    # Pair every label name with its probability.
    scored = list(zip(labels.names, probs.numpy()))

    # Ratings are reported in full, without any threshold.
    rating_labels = dict(scored[i] for i in labels.rating)

    gen_labels = _filter_and_rank(scored, labels.general, gen_threshold)
    char_labels = _filter_and_rank(scored, labels.character, char_threshold)

    # General tags first, then character tags; each group keeps its own order.
    combined_names = [*gen_labels, *char_labels]

    # Escape parentheses so the caption can be used as a training caption.
    # "\\(" is the same runtime text as the former "\(", minus the invalid
    # escape sequence that newer Python versions warn about.
    caption = ", ".join(combined_names).replace("(", "\\(").replace(")", "\\)")
    booru = caption.replace("_", " ")
    return caption, booru, rating_labels, char_labels, gen_labels
@torch.no_grad()
def render_heatmap(
    image: Tensor,
    gradients: Tensor,
    image_feats: Tensor,
    image_probs: Tensor,
    image_labels: list[str],
    cmap: LinearSegmentedColormap = cc.m_linear_bmy_10_95_c71,
    pos_embed_dim: int = 784,
    image_size: tuple[int, int] = (448, 448),
    font_args: dict = {
        "fontFace": cv2.FONT_HERSHEY_SIMPLEX,
        "fontScale": 1,
        "color": (255, 255, 255),
        "thickness": 2,
        "lineType": cv2.LINE_AA,
    },
    partial_rows: bool = True,
) -> tuple[list[Heatmap], Image.Image]:
    """Render one annotated heatmap overlay per label, plus a grid of all of them.

    Args:
        image: Model input image. It is mapped to pixels with
            ``add(1).mul(127.5)``, which presumably expects values in
            [-1, 1] -- TODO confirm against the transform in use. It is blended
            with a BGR heatmap, so it is assumed to already be in BGR channel
            order -- TODO confirm against the caller.
        gradients: Per-label gradients with respect to ``image_feats``; they
            are averaged over dim 2 and used to weight the features.
        image_feats: Features the gradients were taken against.
        image_probs: One probability per entry of ``image_labels``.
        image_labels: Tag names, one per heatmap. Must not be empty: the
            heatmap side length is derived by dividing by its length.
        cmap: Colormap applied to the normalised heatmaps.
        pos_embed_dim: Unused; kept so existing callers keep working.
        image_size: Size the heatmaps are interpolated to.
        font_args: Keyword arguments forwarded to ``cv2.putText``. The default
            dict is shared between calls and is only read here, never mutated.
        partial_rows: Forwarded to ``pil_make_grid``.

    Returns:
        ``(heatmaps, grid)`` where ``heatmaps`` is sorted by descending score
        and ``grid`` is built by ``pil_make_grid`` from the sorted images.
    """
    num_labels = len(image_labels)

    # Weight the features by the averaged gradients, then collapse the last
    # (channel) axis to get one score per position and label.
    weighted = gradients.mean(2, keepdim=True).mul(image_feats.unsqueeze(0)).squeeze()
    position_scores = weighted.mean(-1)

    # Side length of the largest square that fits the positions per label;
    # only the trailing hmap_dim**2 positions are kept (leading extras dropped).
    hmap_dim = int(math.sqrt(position_scores.numel() / num_labels))
    position_scores = position_scores.reshape(num_labels, -1)
    image_hmaps = position_scores[..., -(hmap_dim ** 2):].reshape(num_labels, hmap_dim, hmap_dim)

    # Keep positive evidence only, then scale each map by its own peak.
    # A zero peak is replaced by 1 so an all-zero map stays zero instead of NaN.
    image_hmaps = image_hmaps.clamp_min(0)
    peak = image_hmaps.amax(dim=(1, 2), keepdim=True)
    image_hmaps = image_hmaps / torch.where(peak > 0, peak, torch.ones_like(peak))

    # Min-max normalise each map to 0-1, guarding constant maps the same way.
    low = image_hmaps.amin(dim=(1, 2), keepdim=True)
    span = image_hmaps.amax(dim=(1, 2), keepdim=True) - low
    image_hmaps = (image_hmaps - low) / torch.where(span > 0, span, torch.ones_like(span))

    # Interpolate to the input image size.
    image_hmaps = F.interpolate(image_hmaps.unsqueeze(1), size=image_size, mode="bilinear").squeeze(1)

    # The background pixels are the same for every label: convert them once.
    image_pixels = image.add(1).mul(127.5).squeeze().permute(1, 2, 0).cpu().numpy().astype(np.uint8)
    # One device-to-host transfer for all maps and scores.
    hmap_arrays = image_hmaps.cpu().numpy()
    scores = image_probs.cpu().tolist()

    hmap_imgs: list[Heatmap] = []
    for tag, hmap, score in zip(image_labels, hmap_arrays, scores):
        # Colormap output is RGB(A) bytes; drop alpha and switch to BGR for cv2.
        hmap_pixels = cmap(hmap, bytes=True)[:, :, :3]
        hmap_cv2 = cv2.cvtColor(hmap_pixels, cv2.COLOR_RGB2BGR)
        hmap_image = cv2.addWeighted(image_pixels, 0.5, hmap_cv2, 0.5, 0)
        if tag is not None:
            cv2.putText(hmap_image, tag, (10, 30), **font_args)
            cv2.putText(hmap_image, f"{score:.3f}", org=(10, 60), **font_args)
        hmap_pil = Image.fromarray(cv2.cvtColor(hmap_image, cv2.COLOR_BGR2RGB))
        hmap_imgs.append(Heatmap(tag, score, hmap_pil))

    hmap_imgs.sort(key=lambda x: x.score, reverse=True)
    hmap_grid = pil_make_grid([x.image for x in hmap_imgs], partial_rows=partial_rows)
    return hmap_imgs, hmap_grid
def process_heatmap(
    model: VisionTransformer,
    image: Tensor,
    labels: LabelData,
    threshold: float = 0.5,
    partial_rows: bool = True,
) -> tuple[list[Heatmap], Image.Image, ImageLabels]:
    """Run the model on ``image`` and build heatmaps for every confident tag.

    Args:
        model: Model exposing ``forward_features`` and ``forward_head``.
        image: Preprocessed input; moved to the model's device before the
            forward pass. ``squeeze(0)`` on the output suggests a batch of
            one -- TODO confirm against the caller.
        labels: Label metadata; ``names`` is indexed by output position.
        threshold: Tags with sigmoid probability strictly above this value get
            a heatmap; the same value is used as the general and character
            threshold for the tag lists.
        partial_rows: Forwarded to ``render_heatmap``.

    Returns:
        ``(heatmaps, grid, image_labels)``: the per-tag heatmaps and grid
        image produced by ``render_heatmap`` and an ``ImageLabels`` built from
        the ``get_tags`` results.
    """
    device = model_device(model)

    # Gradients are required here even if the caller disabled them.
    with torch.set_grad_enabled(True):
        features = model.forward_features(image.to(device))
        probs = F.sigmoid(model.forward_head(features)).squeeze(0)

        probs_mask = probs > threshold
        heatmap_probs = probs[probs_mask]
        label_indices = torch.nonzero(probs_mask, as_tuple=False).squeeze(1)
        tag_names = [labels.names[idx] for idx in label_indices.tolist()]

        # Each row of the identity selects one probability, so the batched
        # call returns the gradient of every selected tag in a single pass.
        eye = torch.eye(heatmap_probs.shape[0], device=device)
        grads = torch.autograd.grad(
            outputs=heatmap_probs,
            inputs=features,
            grad_outputs=eye,
            is_grads_batched=True,
            retain_graph=True,
        )
        # Take index 0 along dim 1 and re-insert it as a singleton dim.
        grads = grads[0].detach()[:, 0, :, :].unsqueeze(1)

    with torch.set_grad_enabled(False):
        hmap_imgs, hmap_grid = render_heatmap(
            image=image,
            gradients=grads,
            image_feats=features,
            image_probs=heatmap_probs,
            image_labels=tag_names,
            partial_rows=partial_rows,
        )
        caption, booru, ratings, character, general = get_tags(
            probs=probs.cpu(),
            labels=labels,
            gen_threshold=threshold,
            char_threshold=threshold,
        )
        tagged = ImageLabels(caption, booru, ratings, general, character)
    return hmap_imgs, hmap_grid, tagged