Spaces:

IdlecloudX
/

Image_Inversion

Running

App Files Files Community

IdlecloudX committed 15 days ago

Commit

12cfca5

verified ·

1 Parent(s): f40029f

Upload handler.py

Browse files

Files changed (1) hide show

handler.py +279 -0

handler.py ADDED Viewed

	@@ -0,0 +1,279 @@

+import base64
+import io
+import json
+import logging
+import os
+import time
+from pathlib import Path
+from typing import Any
+import requests
+import timm
+import torch
+import torchvision.transforms as transforms
+from PIL import Image
class TaggingHead(torch.nn.Module):
    """Multi-label head: one linear layer followed by an element-wise sigmoid.

    Attributes:
        input_dim: Size of the incoming feature vector.
        num_classes: Number of independent tag probabilities produced.
        head: One-layer ``Sequential`` wrapping the linear projection.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.input_dim, self.num_classes = input_dim, num_classes
        # Kept as a Sequential named `head` so parameter names remain
        # `head.0.weight` / `head.0.bias` in the state dict.
        projection = torch.nn.Linear(input_dim, num_classes)
        self.head = torch.nn.Sequential(projection)

    def forward(self, x):
        """Return per-class probabilities in [0, 1] for the features ``x``."""
        return torch.sigmoid(self.head(x))
def get_tags(tags_file: Path) -> tuple[dict[str, int], int, int]:
    """Load the tag vocabulary and its general/character split from JSON.

    Args:
        tags_file: UTF-8 JSON file containing a ``tag_map`` entry and a
            ``tag_split`` entry with ``gen_tag_count`` and
            ``character_tag_count``.

    Returns:
        ``(tag_map, gen_tag_count, character_tag_count)`` exactly as stored
        in the file.

    Raises:
        KeyError: If any of the expected keys is missing.
    """
    payload = json.loads(tags_file.read_text(encoding="utf-8"))
    vocabulary = payload["tag_map"]
    split = payload["tag_split"]
    return vocabulary, split["gen_tag_count"], split["character_tag_count"]
def get_character_ip_mapping(mapping_file: Path):
    """Read the character-to-IP mapping JSON file and return it decoded.

    The decoded object is returned unchanged; no schema validation is done.
    """
    return json.loads(mapping_file.read_text(encoding="utf-8"))
def get_encoder():
    """Build the image backbone with its classification layer removed.

    The architecture is created from the
    ``SmilingWolf/wd-eva02-large-tagger-v3`` hub entry with
    ``pretrained=False``; weights are expected to be loaded by the caller.
    """
    backbone = timm.create_model(
        "hf_hub:SmilingWolf/wd-eva02-large-tagger-v3",
        pretrained=False,
    )
    # Zero classes: drop the original classifier so a custom head can follow.
    backbone.reset_classifier(0)
    return backbone
def get_decoder():
    """Create the tagging head: 1024 input features, 13461 output classes."""
    return TaggingHead(input_dim=1024, num_classes=13461)
def get_model():
    """Assemble the full tagger as ``Sequential(encoder, decoder)``.

    Submodule order matters: it fixes the ``0.*`` / ``1.*`` prefixes used by
    the state dict.
    """
    stages = (get_encoder(), get_decoder())
    return torch.nn.Sequential(*stages)
def load_model(weights_file, device):
    """Build the tagger, load its weights and return it ready for inference.

    Args:
        weights_file: Path to the saved state dict.
        device: Device the tensors are mapped to and the model is moved to.

    Returns:
        The model on ``device`` in evaluation mode.
    """
    tagger = get_model()
    # weights_only=True restricts unpickling to tensors and plain containers.
    checkpoint = torch.load(weights_file, map_location=device, weights_only=True)
    tagger.load_state_dict(checkpoint)
    tagger.to(device)
    tagger.eval()
    return tagger
def pure_pil_alpha_to_color_v2(
    image: Image.Image, color: tuple[int, int, int] = (255, 255, 255)
) -> Image.Image:
    """
    Flatten an image that has an alpha channel onto a solid RGB background.

    A new RGB canvas of the same size is filled with ``color`` and the image
    is pasted on top, using its 4th band (alpha) as the paste mask, so
    transparent areas end up showing ``color``.
    The input is expected to have four bands (e.g. mode "RGBA").
    """
    image.load()  # split() requires the pixel data to be loaded
    bands = image.split()
    alpha_band = bands[3]  # index 3 is the alpha channel
    canvas = Image.new("RGB", image.size, color)
    canvas.paste(image, mask=alpha_band)
    return canvas
def pil_to_rgb(image: Image.Image) -> Image.Image:
    """Return an RGB version of ``image`` with transparency flattened.

    Images in mode "RGBA" are composited via ``pure_pil_alpha_to_color_v2``
    (white background by default). Modes "P", "LA" and "PA" are first
    converted to "RGBA" and then composited the same way, so palette
    transparency and explicit alpha bands are honoured instead of being
    silently discarded. Every other mode is converted straight to "RGB".
    """
    if image.mode == "RGBA":
        return pure_pil_alpha_to_color_v2(image)
    # "LA"/"PA" carry an alpha band and "P" may carry palette transparency;
    # a plain convert("RGB") would drop that information.
    if image.mode in ("P", "LA", "PA"):
        return pure_pil_alpha_to_color_v2(image.convert("RGBA"))
    return image.convert("RGB")
class EndpointHandler:
    """Callable image tagger: loads the model once, then tags one image per call.

    Construction reads the weights, the tag vocabulary and the
    character-to-IP mapping from a model directory. Each call accepts an
    image (PIL image, URL or base64 payload), runs the model and returns the
    selected general tags, character tags and the IPs mapped from those
    characters.
    """

    def __init__(self, path: str):
        """Load model, preprocessing pipeline and lookup tables from ``path``.

        Args:
            path: Directory containing ``model_v0.9.pth``,
                ``tags_v0.9_13k.json`` and ``char_ip_map.json``.

        Raises:
            FileNotFoundError: If the directory or any required file is
                missing.
        """
        repo_path = Path(path)
        # Raise explicitly: an `assert` would be stripped under `python -O`.
        if not repo_path.is_dir():
            raise FileNotFoundError(f"Model directory not found: {repo_path}")
        weights_file = repo_path / "model_v0.9.pth"
        tags_file = repo_path / "tags_v0.9_13k.json"
        mapping_file = repo_path / "char_ip_map.json"
        required_files = (
            ("Model", weights_file),
            ("Tags", tags_file),
            ("Mapping", mapping_file),
        )
        for label, required in required_files:
            if not required.exists():
                raise FileNotFoundError(f"{label} file not found: {required}")

        self.device = self._pick_device()
        self.model = load_model(str(weights_file), self.device)
        self.transform = transforms.Compose(
            [
                transforms.Resize((448, 448)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ]
        )
        self.fetch_image_timeout = 5.0
        self.default_general_threshold = 0.3
        self.default_character_threshold = 0.85
        tag_map, self.gen_tag_count, self.character_tag_count = get_tags(tags_file)
        # Invert the tag_map for efficient index-to-tag lookups
        self.index_to_tag_map = {v: k for k, v in tag_map.items()}
        self.character_ip_mapping = get_character_ip_mapping(mapping_file)

    @staticmethod
    def _pick_device() -> str:
        """Return "cuda" only when CUDA is available *and* actually usable.

        Setting the ``FORCE_CPU`` environment variable to 1/true/yes/on
        (any letter case) forces "cpu".
        """
        flag = os.environ.get("FORCE_CPU", "0").strip().lower()
        if flag in {"1", "true", "yes", "on"} or not torch.cuda.is_available():
            return "cpu"
        try:
            # Probe that CUDA can actually be used (driver present)
            torch.zeros(1).to("cuda")
        except Exception:
            # Deliberate best-effort fallback, but leave a trace in the logs.
            logging.warning(
                "CUDA is reported as available but unusable; falling back to CPU",
                exc_info=True,
            )
            return "cpu"
        return "cuda"

    def _load_image(self, inputs: Any) -> "Image.Image | None":
        """Resolve ``inputs`` to a PIL image, or ``None`` if it names no image.

        Accepts a PIL image, or a dict with a ``url`` or base64 ``image``
        entry (``url`` wins when both are present). The consumed key is
        popped from the dict.
        """
        if isinstance(inputs, Image.Image):
            return inputs
        if not isinstance(inputs, dict):
            return None
        if image_url := inputs.pop("url", None):
            # NOTE(security): the URL comes straight from the request and is
            # fetched as-is; restrict allowed hosts upstream if this handler
            # is reachable by untrusted clients.
            with requests.get(image_url, timeout=self.fetch_image_timeout) as res:
                res.raise_for_status()
                # Use the fully downloaded, content-decoded body rather than
                # `res.raw`, which skips content-encoding handling and is
                # bound to the open connection.
                return Image.open(io.BytesIO(res.content))
        if image_base64_encoded := inputs.pop("image", None):
            return Image.open(io.BytesIO(base64.b64decode(image_base64_encoded)))
        return None

    def _select_tags(
        self,
        probs: torch.Tensor,
        *,
        mode: str,
        general_threshold: float,
        character_threshold: float,
        topk_general: int,
        topk_character: int,
    ) -> tuple[list[int], list[float]]:
        """Pick tag indices (general first, then character) and their scores.

        With ``mode == "topk"`` the best ``topk_general`` /
        ``topk_character`` entries of each slice are taken regardless of the
        thresholds; any other mode keeps entries strictly above the
        respective threshold. Returned indices refer to positions in
        ``probs``.
        """
        split = self.gen_tag_count
        gen_probs = probs[:split]
        char_probs = probs[split:]
        if mode == "topk":
            k_gen = max(0, min(topk_general, self.gen_tag_count, gen_probs.numel()))
            k_char = max(
                0, min(topk_character, self.character_tag_count, char_probs.numel())
            )
            # Empty placeholders live on the same device (and dtype) as
            # `probs`; CPU placeholders would make torch.cat fail on CUDA.
            gen_scores = char_scores = probs.new_empty(0)
            gen_idx = char_idx = torch.empty(
                0, dtype=torch.long, device=probs.device
            )
            if k_gen > 0:
                gen_scores, gen_idx = torch.topk(gen_probs, k_gen)
            if k_char > 0:
                char_scores, char_idx = torch.topk(char_probs, k_char)
                char_idx = char_idx + split
            indices = torch.cat((gen_idx, char_idx))
            scores = torch.cat((gen_scores, char_scores))
        else:
            gen_idx = (gen_probs > general_threshold).nonzero(as_tuple=True)[0]
            char_idx = (
                (char_probs > character_threshold).nonzero(as_tuple=True)[0] + split
            )
            indices = torch.cat((gen_idx, char_idx))
            # Gather on the model device, then move only the small result.
            scores = probs[indices]
        return indices.cpu().tolist(), scores.detach().float().cpu().tolist()

    def __call__(self, data: dict[str, Any]) -> dict[str, Any]:
        """Tag one image.

        Args:
            data: Request payload. The image is taken from ``data["inputs"]``
                (or from ``data`` itself when that key is absent) and may be
                a PIL image or a dict with ``url`` or base64 ``image``.
                Optional ``data["parameters"]`` keys: ``general_threshold``,
                ``character_threshold``, ``mode`` ("threshold" or "topk"),
                ``include_scores``, ``topk_general``, ``topk_character``.
                Consumed keys are popped from ``data``.

        Returns:
            Dict with ``feature``, ``character`` and ``ip`` tag lists,
            ``_timings`` and ``_params``; plus ``feature_scores`` and
            ``character_scores`` when ``include_scores`` is set.

        Raises:
            ValueError: If no image source is found in the payload.
        """
        inputs = data.pop("inputs", data)
        # perf_counter is monotonic, so durations survive wall-clock changes.
        fetch_start_time = time.perf_counter()
        image = self._load_image(inputs)
        if image is None:
            raise ValueError(f"No image or url provided: {data}")
        # remove alpha channel if it exists
        image = pil_to_rgb(image)
        fetch_time = time.perf_counter() - fetch_start_time

        # Tolerate an explicit `"parameters": null`.
        parameters = data.pop("parameters", None) or {}
        general_threshold = float(
            parameters.pop("general_threshold", self.default_general_threshold)
        )
        character_threshold = float(
            parameters.pop("character_threshold", self.default_character_threshold)
        )
        # Optional behavior controls
        mode = parameters.pop("mode", "threshold")  # "threshold" | "topk"
        include_scores = bool(parameters.pop("include_scores", False))
        topk_general = int(parameters.pop("topk_general", 25))
        topk_character = int(parameters.pop("topk_character", 10))

        inference_start_time = time.perf_counter()
        with torch.inference_mode():
            # Preprocess image on CPU
            image_tensor = self.transform(image).unsqueeze(0)
            # Pin memory and use non_blocking transfer only when using CUDA
            if self.device == "cuda":
                image_tensor = image_tensor.pin_memory().to(
                    self.device, non_blocking=True
                )
            else:
                image_tensor = image_tensor.to(self.device)
            probs = self.model(image_tensor)[0]  # Get probs for the single image
            indices, scores = self._select_tags(
                probs,
                mode=mode,
                general_threshold=general_threshold,
                character_threshold=character_threshold,
                topk_general=topk_general,
                topk_character=topk_character,
            )
        inference_time = time.perf_counter() - inference_start_time

        post_process_start_time = time.perf_counter()
        cur_gen_tags: list[str] = []
        cur_char_tags: list[str] = []
        gen_scores_out: dict[str, float] = {}
        char_scores_out: dict[str, float] = {}
        # Use the efficient pre-computed map for lookups
        for idx, score in zip(indices, scores):
            tag = self.index_to_tag_map[idx]
            if idx < self.gen_tag_count:
                tags, score_map = cur_gen_tags, gen_scores_out
            else:
                tags, score_map = cur_char_tags, char_scores_out
            tags.append(tag)
            if include_scores:
                score_map[tag] = score
        ip_tags = sorted(
            {
                ip
                for tag in cur_char_tags
                for ip in self.character_ip_mapping.get(tag, ())
            }
        )
        post_process_time = time.perf_counter() - post_process_start_time

        total_time = fetch_time + inference_time + post_process_time
        logging.info(
            f"Timing - Fetch: {fetch_time:.3f}s, Inference: {inference_time:.3f}s, Post-process: {post_process_time:.3f}s, Total: {total_time:.3f}s"
        )
        out: dict[str, Any] = {
            "feature": cur_gen_tags,
            "character": cur_char_tags,
            "ip": ip_tags,
            "_timings": {
                "fetch_s": round(fetch_time, 4),
                "inference_s": round(inference_time, 4),
                "post_process_s": round(post_process_time, 4),
                "total_s": round(total_time, 4),
            },
            "_params": {
                "mode": mode,
                "general_threshold": general_threshold,
                "character_threshold": character_threshold,
                "topk_general": topk_general,
                "topk_character": topk_character,
            },
        }
        if include_scores:
            out["feature_scores"] = gen_scores_out
            out["character_scores"] = char_scores_out
        return out