Upload 2 files

- app.py +300 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,300 @@
from transformers import CLIPImageProcessor, AutoModel
import torch
import json
import torch.nn as nn
from PIL import Image
import gradio as gr
import os
from huggingface_hub import login, snapshot_download

TITLE = "Danbooru Tagger"
DESCRIPTION = """
## Dataset
- Source: Cleaned Danbooru

## Metrics
- Validation Split: 10% of Dataset
- Validation Results:

### General
| Metric          | Value  |
|-----------------|--------|
| Macro F1        | 0.4678 |
| Macro Precision | 0.4605 |
| Macro Recall    | 0.5229 |
| Micro F1        | 0.6661 |
| Micro Precision | 0.6049 |
| Micro Recall    | 0.7411 |

### Character
| Metric          | Value  |
|-----------------|--------|
| Macro F1        | 0.8925 |
| Macro Precision | 0.9099 |
| Macro Recall    | 0.8935 |
| Micro F1        | 0.9232 |
| Micro Precision | 0.9264 |
| Micro Recall    | 0.9199 |

### Artist
| Metric          | Value  |
|-----------------|--------|
| Macro F1        | 0.7904 |
| Macro Precision | 0.8286 |
| Macro Recall    | 0.7904 |
| Micro F1        | 0.5989 |
| Micro Precision | 0.5975 |
| Micro Recall    | 0.6004 |
"""
# Tags whose underscores are part of the tag itself (kaomoji), so they are
# exempt from the underscore-to-space conversion applied in process_image.
kaomojis = [
    "0_0",
    "(o)_(o)",
    "+_+",
    "+_-",
    "._.",
    "<o>_<o>",
    "<|>_<|>",
    "=_=",
    ">_<",
    "3_3",
    "6_9",
    ">_o",
    "@_@",
    "^_^",
    "o_o",
    "u_u",
    "x_x",
    "|_|",
    "||_||",
]

device = torch.device('cpu')
dtype = torch.float32
# Authenticate so the model repo can be downloaded.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    raise ValueError("Environment variable HF_TOKEN not found.")

repo_id = "Johnny-Z/vit-e4"
repo_dir = snapshot_download(repo_id)
model = AutoModel.from_pretrained(repo_id, dtype=dtype, trust_remote_code=True, device_map=device)

processor = CLIPImageProcessor.from_pretrained(repo_id)
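# Note (assumption): the `dtype=` keyword to from_pretrained is the newer
# spelling; older transformers releases expect `torch_dtype=` instead.
# trust_remote_code=True executes the custom modeling code bundled in the repo.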
+
class MultiheadAttentionPoolingHead(nn.Module):
|
| 88 |
+
def __init__(self, input_size):
|
| 89 |
+
super().__init__()
|
| 90 |
+
|
| 91 |
+
self.map_probe = nn.Parameter(torch.randn(1, 1, input_size))
|
| 92 |
+
self.map_layernorm0 = nn.LayerNorm(input_size, eps=1e-08)
|
| 93 |
+
self.map_attention = torch.nn.MultiheadAttention(input_size, input_size // 64, batch_first=True)
|
| 94 |
+
self.map_layernorm1 = nn.LayerNorm(input_size, eps=1e-08)
|
| 95 |
+
self.map_ffn = nn.Sequential(
|
| 96 |
+
nn.Linear(input_size, input_size * 4),
|
| 97 |
+
nn.SiLU(),
|
| 98 |
+
nn.Linear(input_size * 4, input_size)
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
|
| 102 |
+
batch_size = hidden_state.shape[0]
|
| 103 |
+
probe = self.map_probe.repeat(batch_size, 1, 1)
|
| 104 |
+
|
| 105 |
+
hidden_state = self.map_layernorm0(hidden_state)
|
| 106 |
+
hidden_state = self.map_attention(probe, hidden_state, hidden_state)[0]
|
| 107 |
+
hidden_state = self.map_layernorm1(hidden_state)
|
| 108 |
+
|
| 109 |
+
residual = hidden_state
|
| 110 |
+
hidden_state = residual + self.map_ffn(hidden_state)
|
| 111 |
+
return hidden_state[:, 0]
|
| 112 |
+
|
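# The pooling head above collapses the backbone's token sequence into a single
# embedding: a learned probe vector cross-attends over all tokens as the query,
# an FFN with a residual connection refines the result, and the probe position
# is returned as the image representation.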
class MLP(nn.Module):
    def __init__(self, input_size, class_num):
        super().__init__()
        self.mlp_layer0 = nn.Sequential(
            nn.LayerNorm(input_size, eps=1e-08),
            nn.Linear(input_size, input_size // 2),
            nn.SiLU()
        )
        self.mlp_layer1 = nn.Linear(input_size // 2, class_num)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.mlp_layer0(x)
        x = self.mlp_layer1(x)
        x = self.sigmoid(x)
        return x
# Tag vocabularies, tag implications, and per-tag decision thresholds shipped
# with the model repo.
with open(os.path.join(repo_dir, 'general_tag_dict.json'), 'r', encoding='utf-8') as f:
    general_dict = json.load(f)

with open(os.path.join(repo_dir, 'character_tag_dict.json'), 'r', encoding='utf-8') as f:
    character_dict = json.load(f)

with open(os.path.join(repo_dir, 'artist_tag_dict.json'), 'r', encoding='utf-8') as f:
    artist_dict = json.load(f)

with open(os.path.join(repo_dir, 'implications_list.json'), 'r', encoding='utf-8') as f:
    implications_list = json.load(f)

with open(os.path.join(repo_dir, 'artist_threshold.json'), 'r', encoding='utf-8') as f:
    artist_thresholds = json.load(f)

with open(os.path.join(repo_dir, 'character_threshold.json'), 'r', encoding='utf-8') as f:
    character_thresholds = json.load(f)

with open(os.path.join(repo_dir, 'general_threshold.json'), 'r', encoding='utf-8') as f:
    general_thresholds = json.load(f)
# Attention-pooling head plus one classification head per tag category.
model_map = MultiheadAttentionPoolingHead(2048)
model_map.load_state_dict(torch.load(os.path.join(repo_dir, "map_head.pth"), map_location=device, weights_only=True))
model_map.to(device).to(dtype).eval()

general_class = 9775
mlp_general = MLP(2048, general_class)
mlp_general.load_state_dict(torch.load(os.path.join(repo_dir, "cls_predictor_general.pth"), map_location=device, weights_only=True))
mlp_general.to(device).to(dtype).eval()

character_class = 7568
mlp_character = MLP(2048, character_class)
mlp_character.load_state_dict(torch.load(os.path.join(repo_dir, "cls_predictor_character.pth"), map_location=device, weights_only=True))
mlp_character.to(device).to(dtype).eval()

artist_class = 13957
mlp_artist = MLP(2048, artist_class)
mlp_artist.load_state_dict(torch.load(os.path.join(repo_dir, "cls_predictor_artist.pth"), map_location=device, weights_only=True))
mlp_artist.to(device).to(dtype).eval()
def prediction_to_tag(prediction, tag_dict, class_num):
    # Candidates pass a coarse 0.2 cut first; each category is then filtered
    # by its per-tag threshold (default 0.75). Tag IDs are 1-based.
    prediction = prediction.view(class_num)
    predicted_ids = (prediction >= 0.2).nonzero(as_tuple=True)[0].cpu().numpy() + 1

    general = {}
    character = {}
    artist = {}
    date = {}
    rating = {}

    for tag, value in tag_dict.items():
        if value[2] in predicted_ids:
            tag_value = round(prediction[value[2] - 1].item(), 6)
            if value[1] == "general" and tag_value >= general_thresholds.get(tag, {}).get("Threshold", 0.75):
                general[tag] = tag_value
            elif value[1] == "character" and tag_value >= character_thresholds.get(tag, {}).get("Threshold", 0.75):
                character[tag] = tag_value
            elif value[1] == "artist" and tag_value >= artist_thresholds.get(tag, {}).get("Threshold", 0.75):
                artist[tag] = tag_value
            elif value[1] == "rating":
                rating[tag] = tag_value
            elif value[1] == "date":
                date[tag] = tag_value

    general = dict(sorted(general.items(), key=lambda item: item[1], reverse=True))
    character = dict(sorted(character.items(), key=lambda item: item[1], reverse=True))
    artist = dict(sorted(artist.items(), key=lambda item: item[1], reverse=True))

    # Keep only the highest-scoring date and rating tag.
    if date:
        date = {max(date, key=date.get): date[max(date, key=date.get)]}
    if rating:
        rating = {max(rating, key=rating.get): rating[max(rating, key=rating.get)]}

    return general, character, artist, date, rating
def process_image(image):
    try:
        # Flatten transparency onto a white background before preprocessing.
        image = image.convert('RGBA')
        background = Image.new('RGBA', image.size, (255, 255, 255, 255))
        image = Image.alpha_composite(background, image).convert('RGB')

        image_inputs = processor(images=[image], return_tensors="pt").to(device).to(dtype)

    except (OSError, IOError) as e:
        print(f"Error opening image: {e}")
        # Return empty values so Gradio still receives all six outputs.
        return "", {}, {}, {}, {}, {}
    with torch.no_grad():
        embedding = model(image_inputs.pixel_values)

        embedding = model_map(embedding)

        general_prediction = mlp_general(embedding)
        general_ = prediction_to_tag(general_prediction, general_dict, general_class)
        general_tags = general_[0]
        rating = general_[4]

        character_prediction = mlp_character(embedding)
        character_ = prediction_to_tag(character_prediction, character_dict, character_class)
        character_tags = character_[1]

        artist_prediction = mlp_artist(embedding)
        artist_ = prediction_to_tag(artist_prediction, artist_dict, artist_class)
        artist_tags = artist_[2]
        date = artist_[3]

        combined_tags = {**general_tags}

        # Drop tags implied by another predicted tag, then restore spaces in
        # tag names (kaomoji tags keep their literal underscores).
        tags_list = [tag for tag in combined_tags]
        remove_list = []
        for tag in tags_list:
            if tag in implications_list:
                for implication in implications_list[tag]:
                    remove_list.append(implication)
        tags_list = [tag for tag in tags_list if tag not in remove_list]
        tags_list = [tag.replace("_", " ") if tag not in kaomojis else tag for tag in tags_list]

        # Escape parentheses for prompt-style output.
        tags_str = ", ".join(tags_list).replace("(", r"\(").replace(")", r"\)")

        return tags_str, artist_tags, character_tags, general_tags, rating, date
def main():
    with gr.Blocks(title=TITLE) as demo:
        with gr.Column():
            gr.Markdown(
                value=f"<h1 style='text-align: center; margin-bottom: 1rem'>{TITLE}</h1>"
            )
            with gr.Row():
                with gr.Column(variant="panel"):
                    submit = gr.Button(value="Submit", variant="primary", size="lg")
                    image = gr.Image(type="pil", image_mode="RGBA", label="Input")
                    with gr.Row():
                        clear = gr.ClearButton(
                            components=[
                                image,
                            ],
                            variant="secondary",
                            size="lg",
                        )
                    gr.Markdown(value=DESCRIPTION)
                with gr.Column(variant="panel"):
                    tags_str = gr.Textbox(label="Output", lines=4)
                    with gr.Row():
                        rating = gr.Label(label="Rating")
                        date = gr.Label(label="Year")
                    artist_tags = gr.Label(label="Artist")
                    character_tags = gr.Label(label="Character")
                    general_tags = gr.Label(label="General")
                    clear.add(
                        [
                            tags_str,
                            artist_tags,
                            general_tags,
                            character_tags,
                            rating,
                            date,
                        ]
                    )

        submit.click(
            process_image,
            inputs=[
                image
            ],
            outputs=[tags_str, artist_tags, character_tags, general_tags, rating, date],
        )

    demo.queue(max_size=10)
    demo.launch()

if __name__ == "__main__":
    main()
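For reference, a client-side call against the running Space might look like the sketch below; the Space id and image path are placeholders, and the endpoint name assumes Gradio's default of deriving it from the handler's function name.

from gradio_client import Client, handle_file

client = Client("user/danbooru-tagger")  # hypothetical Space id
result = client.predict(
    handle_file("sample.png"),           # local image to tag
    api_name="/process_image",           # assumed default endpoint for the click handler
)
print(result[0])                         # the comma-separated tag string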
requirements.txt
ADDED
@@ -0,0 +1,7 @@
torch
transformers
Pillow
gradio
einops
timm
accelerate