NoMoreCopyrightOrg
/

flux-test5

Model card Files Files and versions

xet

Community

refoundd committed on Mar 24, 2025

Commit

2861775

verified ·

1 Parent(s): 28c7227

Update handler.py

Browse files

Files changed (1) hide show

handler.py +145 -193

handler.py CHANGED Viewed

@@ -6,218 +6,148 @@ from PIL import Image
 from huggingface_inference_toolkit.logging import logger
 from pymongo.mongo_client import MongoClient
 from diffusers.utils import load_image
-import huggingface_hub
 import numpy as np
-import onnxruntime as rt
 import pandas as pd
 import time
-import subprocess
# Shell commands run at import time, in order: fetch NVIDIA's cuDNN 9.8.0
# local-repo package, register it with dpkg, copy its keyring, then install
# cuDNN through apt.
commands = [
    "wget https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2004-9.8.0_1.0-1_amd64.deb",
    "sudo dpkg -i cudnn-local-repo-ubuntu2004-9.8.0_1.0-1_amd64.deb",
    "sudo cp /var/cudnn-local-repo-ubuntu2004-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/",
    "sudo apt-get update",
    "sudo apt-get -y install cudnn",
    "sudo apt-get -y install cudnn-cuda-12",
]

# A failing command is reported and the loop moves on to the next one;
# nothing is re-raised.
for command in commands:
    print(f"Running command: {command}")
    try:
        # shell=True is required here: one of the commands relies on glob expansion.
        subprocess.run(command, shell=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error occurred while executing command: {e}")
    else:
        print(f"Command executed successfully: {command}")
# Hugging Face access token, read from the environment (empty string if unset).
HF_TOKEN = os.getenv("HF_TOKEN", "")

# Dataset v3 series of models:
VIT_LARGE_MODEL_DSV3_REPO = "SmilingWolf/wd-vit-large-tagger-v3"

# Files to download from the repos
MODEL_FILENAME = "model.onnx"
LABEL_FILENAME = "selected_tags.csv"
# Emoticon tags whose underscores are part of the tag itself and must be kept.
# https://github.com/toriato/stable-diffusion-webui-wd14-tagger/blob/a9eacb1eff904552d3012babfa28b57e1d3e295c/tagger/ui.py#L368
kaomojis = [
    "0_0",
    "(o)_(o)",
    "+_+",
    "+_-",
    "._.",
    "<o>_<o>",
    "<|>_<|>",
    "=_=",
    ">_<",
    "3_3",
    "6_9",
    ">_o",
    "@_@",
    "^_^",
    "o_o",
    "u_u",
    "x_x",
    "|_|",
    "||_||",
]


def load_labels(dataframe) -> tuple[list[str], list, list, list]:
    """Split a tag table into display names and per-category row indexes.

    Args:
        dataframe: Table with a ``name`` column (tag strings) and a
            ``category`` column (integer codes).

    Returns:
        A 4-tuple ``(tag_names, rating_indexes, general_indexes,
        character_indexes)``. ``tag_names`` has underscores replaced by
        spaces, except for tags listed in ``kaomojis``. The three index lists
        hold the row positions whose category equals 9, 0 and 4 respectively.
    """
    # Build the lookup set once instead of scanning the list for every tag.
    keep_verbatim = set(kaomojis)
    tag_names = [
        name if name in keep_verbatim else name.replace("_", " ")
        for name in dataframe["name"]
    ]

    categories = dataframe["category"]
    rating_indexes = list(np.where(categories == 9)[0])
    general_indexes = list(np.where(categories == 0)[0])
    character_indexes = list(np.where(categories == 4)[0])
    return tag_names, rating_indexes, general_indexes, character_indexes
def mcut_threshold(probs):
    """Compute the Maximum Cut (MCut) threshold for a set of probabilities.

    The probabilities are sorted in descending order, the largest gap between
    two neighbours is located, and the midpoint of that gap is returned.

    Largeron, C., Moulin, C., & Gery, M. (2012). MCut: A Thresholding Strategy
     for Multi-label Classification. In 11th International Symposium, IDA 2012
     (pp. 172-183).

    Args:
        probs: One-dimensional array-like of probabilities.

    Returns:
        The midpoint between the two sorted neighbours with the widest gap.

    Raises:
        ValueError: If fewer than two probabilities are given, since no gap
            exists to cut at.
    """
    # Accept lists/tuples as well as ndarrays.
    probs = np.asarray(probs)
    if probs.size < 2:
        raise ValueError("mcut_threshold needs at least two probabilities")

    sorted_probs = np.sort(probs)[::-1]
    difs = sorted_probs[:-1] - sorted_probs[1:]
    t = difs.argmax()
    return (sorted_probs[t] + sorted_probs[t + 1]) / 2
class Predictor:
    """Lazily loads an ONNX tagger model and scores PIL images against its tags."""

    def __init__(self):
        # Both stay None until load_model() has run once.
        self.model_target_size = None
        self.last_loaded_repo = None

    def download_model(self, model_repo):
        """Download the label CSV and ONNX model; return ``(csv_path, model_path)``."""
        # `token` replaces the older `use_auth_token` keyword, matching the
        # newer code in this file; an empty HF_TOKEN means "no token".
        token = HF_TOKEN or None
        csv_path = huggingface_hub.hf_hub_download(
            model_repo,
            LABEL_FILENAME,
            token=token,
        )
        model_path = huggingface_hub.hf_hub_download(
            model_repo,
            MODEL_FILENAME,
            token=token,
        )
        return csv_path, model_path

    def load_model(self, model_repo):
        """Load labels and the inference session for ``model_repo``.

        Does nothing when ``model_repo`` is the repo that is already loaded.
        """
        if model_repo == self.last_loaded_repo:
            return

        csv_path, model_path = self.download_model(model_repo)

        tags_df = pd.read_csv(csv_path)
        (
            self.tag_names,
            self.rating_indexes,
            self.general_indexes,
            self.character_indexes,
        ) = load_labels(tags_df)

        model = rt.InferenceSession(
            model_path,
            providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
        )
        # The input shape is unpacked as (batch, height, width, channels);
        # only the height is kept and later used for both sides of the resize.
        _, height, _, _ = model.get_inputs()[0].shape
        self.model_target_size = height

        self.last_loaded_repo = model_repo
        self.model = model

    def prepare_image(self, image):
        """Turn a PIL image into a ``(1, size, size, 3)`` float32 array.

        The image is flattened onto a white background, padded to a square,
        resized to the model's target size and its channel order reversed.
        """
        target_size = self.model_target_size

        # alpha_composite() only accepts RGBA input, so normalise the mode first.
        if image.mode != "RGBA":
            image = image.convert("RGBA")
        canvas = Image.new("RGBA", image.size, (255, 255, 255))
        canvas.alpha_composite(image)
        image = canvas.convert("RGB")

        # Pad image to square
        image_shape = image.size
        max_dim = max(image_shape)
        pad_left = (max_dim - image_shape[0]) // 2
        pad_top = (max_dim - image_shape[1]) // 2

        padded_image = Image.new("RGB", (max_dim, max_dim), (255, 255, 255))
        padded_image.paste(image, (pad_left, pad_top))

        # Resize
        if max_dim != target_size:
            padded_image = padded_image.resize(
                (target_size, target_size),
                Image.BICUBIC,
            )

        # Convert to numpy array
        image_array = np.asarray(padded_image, dtype=np.float32)

        # Convert PIL-native RGB to BGR
        image_array = image_array[:, :, ::-1]

        return np.expand_dims(image_array, axis=0)

    def predict(
        self,
        image,
        model_repo,
        general_thresh,
        general_mcut_enabled,
        character_thresh,
        character_mcut_enabled,
    ):
        """Score ``image`` and return a ``{tag: confidence}`` dict.

        The result contains every rating tag, plus the character and general
        tags whose confidence is strictly above their threshold. When an
        ``*_mcut_enabled`` flag is set, the matching threshold is replaced by
        the MCut threshold of that category (floored at 0.15 for characters).
        """
        self.load_model(model_repo)

        image = self.prepare_image(image)

        input_name = self.model.get_inputs()[0].name
        label_name = self.model.get_outputs()[0].name
        preds = self.model.run([label_name], {input_name: image})[0]

        labels = list(zip(self.tag_names, preds[0].astype(float)))

        # Ratings: every rating score is returned, no selection is made here.
        rating = dict(labels[i] for i in self.rating_indexes)

        # General tags: keep any whose confidence is above the threshold.
        general_names = [labels[i] for i in self.general_indexes]
        if general_mcut_enabled:
            general_probs = np.array([x[1] for x in general_names])
            general_thresh = mcut_threshold(general_probs)
        general_res = dict(x for x in general_names if x[1] > general_thresh)

        # Character tags: keep any whose confidence is above the threshold.
        character_names = [labels[i] for i in self.character_indexes]
        if character_mcut_enabled:
            character_probs = np.array([x[1] for x in character_names])
            character_thresh = mcut_threshold(character_probs)
            character_thresh = max(0.15, character_thresh)
        character_res = dict(x for x in character_names if x[1] > character_thresh)

        return {**rating, **character_res, **general_res}
 class EndpointHandler:
     def __init__(self, path=""):
-        self.predictor = Predictor()
-        self.model_repo = VIT_LARGE_MODEL_DSV3_REPO
-        uri = os.environ.get("MongoDB", "")
         self.client = MongoClient(uri)
         self.db = self.client['nomorecopyright']
@@ -244,18 +174,40 @@ class EndpointHandler:
         start_time=time.time()
         for document in data:
                 image=load_image(document.get('createdImage', 'https://nomorecopyright.com/default.jpg'))
-                image = image.convert("RGBA")
-                outputs = self.predictor.predict(
-                    image,
-                    self.model_repo,
-                    general_thresh=0.35,
-                    general_mcut_enabled=False,
-                    character_thresh=0.85,
-                    character_mcut_enabled=False,
                 )
                 saveQuery = {"_id": document.get('_id')}
                 # Update operation to add keywords with confidence scores
-                update_result = self.collection.update_one(saveQuery , {'$set': {'keywords': outputs}})
         end_time=time.time()
         print(f"Time taken: {end_time-start_time:.2f} seconds")
         return 'OK'

 from huggingface_inference_toolkit.logging import logger
 from pymongo.mongo_client import MongoClient
 from diffusers.utils import load_image
 import numpy as np
 import pandas as pd
 import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+import numpy as np
+import pandas as pd
+import timm
+import torch
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import HfHubHTTPError
+from PIL import Image
+from simple_parsing import field
+from timm.data import create_transform, resolve_data_config
+from torch import Tensor, nn
+from torch.nn import functional as F
# Hugging Face access token, read from the environment (empty string if unset).
HF_TOKEN = os.getenv("HF_TOKEN", "")

# Run on CUDA when torch reports it as available, otherwise on the CPU.
torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Short model key -> Hugging Face repo id.
MODEL_REPO_MAP = dict(vit="SmilingWolf/wd-vit-large-tagger-v3")
def pil_ensure_rgb(image: Image.Image) -> Image.Image:
    """Return ``image`` as an RGB image, flattening any alpha onto white.

    Images that are neither RGB nor RGBA (palette images etc.) are first
    converted: to RGBA when their ``info`` carries a ``transparency`` entry,
    to RGB otherwise. RGBA images are then composited over a white canvas.
    """
    if image.mode not in ("RGB", "RGBA"):
        target_mode = "RGBA" if "transparency" in image.info else "RGB"
        image = image.convert(target_mode)

    if image.mode != "RGBA":
        return image

    # White background under the (possibly transparent) image.
    background = Image.new("RGBA", image.size, (255, 255, 255))
    background.alpha_composite(image)
    return background.convert("RGB")
def pil_pad_square(image: Image.Image) -> Image.Image:
    """Centre ``image`` on a white square whose side is its larger dimension."""
    width, height = image.size
    side = max(width, height)

    # Integer offsets that centre the image on the square.
    offset = ((side - width) // 2, (side - height) // 2)

    square = Image.new("RGB", (side, side), (255, 255, 255))
    square.paste(image, offset)
    return square
@dataclass
class LabelData:
    """Tag names plus the row indexes belonging to each tag category."""

    # Tag names, in the order they appear in the label CSV.
    names: list[str]
    # Row indexes of rating tags (category 9 in the CSV).
    rating: list[np.int64]
    # Row indexes of general tags (category 0 in the CSV).
    general: list[np.int64]
    # Row indexes of character tags (category 4 in the CSV).
    character: list[np.int64]
def load_labels_hf(
    repo_id: str,
    revision: Optional[str] = None,
    token: Optional[str] = None,
) -> LabelData:
    """Download ``selected_tags.csv`` from a Hub repo and parse it.

    Args:
        repo_id: Hugging Face repo to download from.
        revision: Optional revision forwarded to ``hf_hub_download``.
        token: Optional access token forwarded to ``hf_hub_download``.

    Returns:
        A ``LabelData`` with all tag names and the row indexes of the rating
        (category 9), general (category 0) and character (category 4) tags.

    Raises:
        FileNotFoundError: If the download fails with ``HfHubHTTPError``.
    """
    try:
        downloaded = hf_hub_download(
            repo_id=repo_id,
            filename="selected_tags.csv",
            revision=revision,
            token=token,
        )
        csv_path = Path(downloaded).resolve()
    except HfHubHTTPError as e:
        raise FileNotFoundError(f"selected_tags.csv failed to download from {repo_id}") from e

    # Only the two columns used below are read.
    df: pd.DataFrame = pd.read_csv(csv_path, usecols=["name", "category"])
    category = df["category"]

    def rows_with(code):
        return list(np.where(category == code)[0])

    return LabelData(
        names=df["name"].tolist(),
        rating=rows_with(9),
        general=rows_with(0),
        character=rows_with(4),
    )
def get_tags(
    probs: Tensor,
    labels: "LabelData",
    gen_threshold: float,
    char_threshold: float,
):
    """Turn per-tag probabilities into captions and per-category score dicts.

    Args:
        probs: 1-D tensor of probabilities, one per entry of ``labels.names``.
        labels: Tag names plus the index lists ``rating``, ``general`` and
            ``character``.
        gen_threshold: General tags are kept when their score is strictly
            greater than this value.
        char_threshold: Character tags are kept when their score is strictly
            greater than this value.

    Returns:
        ``(caption, taglist, rating_labels, char_labels, gen_labels)``.
        ``caption`` is the kept general tags followed by the kept character
        tags, comma-separated. ``taglist`` is the caption with underscores
        replaced by spaces and parentheses backslash-escaped. The three dicts
        map tag name to score as a plain Python ``float``; the general and
        character dicts are ordered by descending score.
    """
    # detach() drops any autograd history and cpu() handles GPU tensors, so
    # this works however the caller produced `probs`. tolist() yields plain
    # Python floats, which serialise cleanly, unlike numpy float32 scalars.
    scores = probs.detach().cpu().tolist()
    scored = list(zip(labels.names, scores))

    # Ratings are returned in full, without thresholding.
    rating_labels = dict(scored[i] for i in labels.rating)

    def _select(indexes, threshold):
        """Keep entries scoring above ``threshold``, highest score first."""
        kept = dict(scored[i] for i in indexes if scored[i][1] > threshold)
        return dict(sorted(kept.items(), key=lambda item: item[1], reverse=True))

    gen_labels = _select(labels.general, gen_threshold)
    char_labels = _select(labels.character, char_threshold)

    # General tags first, then character tags; each group is already sorted.
    combined_names = [*gen_labels, *char_labels]

    # Convert to a string suitable for use as a training caption
    caption = ", ".join(combined_names)
    # Escaped backslashes: "\(" is an invalid escape sequence in Python.
    taglist = caption.replace("_", " ").replace("(", "\\(").replace(")", "\\)")

    return caption, taglist, rating_labels, char_labels, gen_labels
@dataclass
class ScriptOptions:
    """Tagger options: input image path, model key and score thresholds."""

    # Path of the image to tag; declared as a positional field with no default.
    image_file: Path = field(positional=True)
    # Key into MODEL_REPO_MAP selecting which tagger repo to load.
    model: str = field(default="vit")
    # Score threshold passed to get_tags() for general tags.
    gen_threshold: float = field(default=0.35)
    # Score threshold passed to get_tags() for character tags.
    char_threshold: float = field(default=0.75)
 class EndpointHandler:
     def __init__(self, path=""):
         """Load the tagger model, its labels and transform, then open MongoDB.

         Args:
             path: Not used by the statements visible in this section.

         Raises:
             RuntimeError: If the ``MongoDB`` environment variable is unset.
             ValueError: If the configured model key is not in MODEL_REPO_MAP.
         """
         # The connection string carries credentials, so it must come from the
         # environment only and never be hard-coded as a fallback in source.
         uri = os.environ.get("MongoDB", "")
         if not uri:
             raise RuntimeError(
                 "The 'MongoDB' environment variable must hold the MongoDB connection URI"
             )

         # NOTE(review): this stores the ScriptOptions class itself, not an
         # instance; the attributes read below resolve to the declared defaults.
         self.opts = ScriptOptions
         repo_id = MODEL_REPO_MAP.get(self.opts.model)
         if repo_id is None:
             raise ValueError(f"Unknown model '{self.opts.model}'")

         print(f"Loading model '{self.opts.model}' from '{repo_id}'...")
         self.model: nn.Module = timm.create_model("hf-hub:" + repo_id).eval()
         state_dict = timm.models.load_state_dict_from_hf(repo_id)
         self.model.load_state_dict(state_dict)

         print("Loading tag list...")
         self.labels: LabelData = load_labels_hf(repo_id=repo_id)

         print("Creating data transform...")
         # Must reference self.model: there is no local variable named `model`.
         self.transform = create_transform(
             **resolve_data_config(self.model.pretrained_cfg, model=self.model)
         )

         # move model to GPU, if available
         if torch_device.type != "cpu":
             self.model = self.model.to(torch_device)

         self.client = MongoClient(uri)
         self.db = self.client['nomorecopyright']
         start_time=time.time()
         for document in data:
                 image=load_image(document.get('createdImage', 'https://nomorecopyright.com/default.jpg'))
+                print("Loading image and preprocessing...")
+                # get image
+                # ensure image is RGB
+                img_input = pil_ensure_rgb(image)
+                # pad to square with white background
+                img_input = pil_pad_square(img_input)
+                # run the model's input transform to convert to tensor and rescale
+                inputs: Tensor = self.transform(img_input).unsqueeze(0)
+                # NCHW image RGB to BGR
+                inputs = inputs[:, [2, 1, 0]]
+                inputs = inputs.to(torch_device)
+                print("Running inference...")
+                outputs = self.model.forward(inputs)
+                # apply the final activation function (timm doesn't support doing this internally)
+                outputs = F.sigmoid(outputs)
+                # move inputs, outputs, and model back to to cpu if we were on GPU
+                if torch_device.type != "cpu":
+                    inputs = inputs.to("cpu")
+                    outputs = outputs.to("cpu")
+                print("Processing results...")
+                caption, taglist, ratings, character, general = get_tags(
+                    probs=outputs.squeeze(0),
+                    labels=self.labels,
+                    gen_threshold=self.opts.gen_threshold,
+                    char_threshold=self.opts.char_threshold,
                 )
+                results={**ratings, **character, **general}
+                print(results)
                 saveQuery = {"_id": document.get('_id')}
                 # Update operation to add keywords with confidence scores
+                update_result = self.collection.update_one(saveQuery , {'$set': {'keywords': results}})
         end_time=time.time()
         print(f"Time taken: {end_time-start_time:.2f} seconds")
         return 'OK'