Spaces:

Johnny-Z
/

dan_retrieval

Sleeping

App Files Community

Johnny-Z committed on Dec 17, 2025

Commit

f613759

verified ·

1 Parent(s): 7126f84

Upload app.py

Browse files

Files changed (1) hide show

app.py +57 -83

app.py CHANGED Viewed

@@ -13,41 +13,44 @@ from huggingface_hub import login, snapshot_download
 TITLE = "Danbooru Tagger"
 DESCRIPTION = """
 ## Dataset
-- Source: Cleaned Danbooru
-## Metrics
 - Validation Split: 10% of Dataset
-- Validation Results:
 ### General
 | Metric          | Value       |
 |-----------------|-------------|
-| Macro F1        | 0.4678      |
-| Macro Precision | 0.4605      |
-| Macro Recall    | 0.5229      |
-| Micro F1        | 0.6661      |
-| Micro Precision | 0.6049      |
-| Micro Recall    | 0.7411      |
 ### Character
 | Metric          | Value       |
 |-----------------|-------------|
-| Macro F1        | 0.8925      |
-| Macro Precision | 0.9099      |
-| Macro Recall    | 0.8935      |
-| Micro F1        | 0.9232      |
-| Micro Precision | 0.9264      |
-| Micro Recall    | 0.9199      |
 ### Artist
 | Metric          | Value       |
 |-----------------|-------------|
-| Macro F1        | 0.7904      |
-| Macro Precision | 0.8286      |
-| Macro Recall    | 0.7904      |
-| Micro F1        | 0.5989      |
-| Micro Precision | 0.5975      |
-| Micro Recall    | 0.6004      |
 """
 kaomojis = [
@@ -81,10 +84,10 @@ if hf_token:
 else:
     raise ValueError("environment variable HF_TOKEN not found.")
-repo = snapshot_download('Johnny-Z/vit-e4')
 model = AutoModel.from_pretrained(repo, dtype=dtype, trust_remote_code=True, device_map=device)
-index_dir = snapshot_download('Johnny-Z/dan_index', repo_type='dataset')
 processor = CLIPImageProcessor.from_pretrained(repo)
@@ -131,30 +134,11 @@ class MLP(nn.Module):
         x = self.sigmoid(x)
         return x
-class MLP_Retrieval(nn.Module):
-    def __init__(self, input_size, class_num):
-        super().__init__()
-        self.mlp_layer0 = nn.Sequential(
-            nn.Linear(input_size, input_size // 2),
-            nn.SiLU()
-        )
-        self.mlp_layer1 = nn.Linear(input_size // 2, class_num)
-    def forward(self, x):
-        x = self.mlp_layer0(x)
-        x = self.mlp_layer1(x)
-        x1, x2 = x[:, :15], x[:, 15:]
-        x1 = torch.softmax(x1, dim=1)
-        x2 = torch.softmax(x2, dim=1)
-        x = torch.cat([x1, x2], dim=1)
-        return x
 class MLP_R(nn.Module):
     def __init__(self, input_size):
         super().__init__()
         self.mlp_layer0 = nn.Sequential(
-            nn.Linear(input_size, 256),
         )
     def forward(self, x):
@@ -186,25 +170,21 @@ model_map = MultiheadAttentionPoolingHead(2048)
 model_map.load_state_dict(torch.load(os.path.join(repo, "map_head.pth"), map_location=device, weights_only=True))
 model_map.to(device).to(dtype).eval()
-general_class = 9775
 mlp_general = MLP(2048, general_class)
 mlp_general.load_state_dict(torch.load(os.path.join(repo, "cls_predictor_general.pth"), map_location=device, weights_only=True))
 mlp_general.to(device).to(dtype).eval()
-character_class = 7568
 mlp_character = MLP(2048, character_class)
 mlp_character.load_state_dict(torch.load(os.path.join(repo, "cls_predictor_character.pth"), map_location=device, weights_only=True))
 mlp_character.to(device).to(dtype).eval()
-artist_class = 13957
 mlp_artist = MLP(2048, artist_class)
 mlp_artist.load_state_dict(torch.load(os.path.join(repo, "cls_predictor_artist.pth"), map_location=device, weights_only=True))
 mlp_artist.to(device).to(dtype).eval()
-mlp_artist_retrieval = MLP_Retrieval(2048, artist_class)
-mlp_artist_retrieval.load_state_dict(torch.load(os.path.join(repo, "cls_predictor_artist_retrieval.pth"), map_location=device, weights_only=True))
-mlp_artist_retrieval.to(device).to(dtype).eval()
 mlp_r = MLP_R(2048)
 mlp_r.load_state_dict(torch.load(os.path.join(repo, "retrieval_head.pth"), map_location=device, weights_only=True))
 mlp_r.to(device).to(dtype).eval()
@@ -244,29 +224,6 @@ def prediction_to_tag(prediction, tag_dict, class_num):
     return general, character, artist, date, rating
-def prediction_to_retrieval(prediction, tag_dict, class_num, top_k):
-    prediction = prediction.view(class_num)
-    predicted_ids = (prediction>=0.005).nonzero(as_tuple=True)[0].cpu().numpy() + 1
-    artist = {}
-    date = {}
-    for tag, value in tag_dict.items():
-        if value[2] in predicted_ids:
-            tag_value = round(prediction[value[2] - 1].item(), 6)
-            if value[1] == "artist":
-                artist[tag] = tag_value
-            elif value[1] == "date":
-                date[tag] = tag_value
-    artist = dict(sorted(artist.items(), key=lambda item: item[1], reverse=True))
-    artist = dict(list(artist.items())[:top_k])
-    if date:
-        date = {max(date, key=date.get): date[max(date, key=date.get)]}
-    return artist, date
 def load_id_map(id_map_path):
     with open(id_map_path, "r") as f:
         id_map = json.load(f)
@@ -309,7 +266,7 @@ def search_index(query_vector, k=32, distance_threshold_min=0, distance_threshol
     return results
-def fetch_retrieval_image_urls(retrieval_results, sleep_sec=0.25, timeout=4.0):
     pairs = []
     for item in retrieval_results:
         oid = item.get("original_id")
@@ -332,9 +289,10 @@ def fetch_retrieval_image_urls(retrieval_results, sleep_sec=0.25, timeout=4.0):
                 url = "https:" + url
             elif url.startswith("/"):
                 url = "https://danbooru.donmai.us" + url
-            pairs.append((url, oid))
-        except Exception:
             pass
         finally:
@@ -363,7 +321,18 @@ def process_image(image, k, distance_threshold_min, distance_threshold_max):
         url_id_pairs = fetch_retrieval_image_urls(retrieval_results)
-        retrieval_gallery_items = [(url, f"https://danbooru.donmai.us/posts/{oid}") for url, oid in url_id_pairs]
         general_prediction = mlp_general(embedding)
         general_ = prediction_to_tag(general_prediction, general_dict, general_class)
@@ -374,10 +343,10 @@ def process_image(image, k, distance_threshold_min, distance_threshold_max):
         character_ = prediction_to_tag(character_prediction, character_dict, character_class)
         character_tags = character_[1]
-        artist_retrieval_prediction = mlp_artist_retrieval(embedding)
-        artist_retrieval_ = prediction_to_retrieval(artist_retrieval_prediction, artist_dict, artist_class, 10)
-        artist_tags = artist_retrieval_[0]
-        date = artist_retrieval_[1]
     combined_tags = {**general_tags}
@@ -400,6 +369,7 @@ def process_image(image, k, distance_threshold_min, distance_threshold_max):
         rating,
         date,
         retrieval_gallery_items,
     )
 def main():
@@ -414,7 +384,7 @@ def main():
                     image = gr.Image(type="pil", image_mode="RGBA", label="Input")
                     k_slider = gr.Slider(1, 100, value=32, step=1, label="Top K Results")
                     distance_min_slider = gr.Slider(0, 128, value=0, step=1, label="Min Distance Threshold")
-                    distance_max_slider = gr.Slider(0, 128, value=80, step=1, label="Max Distance Threshold")
                     with gr.Row():
                         clear = gr.ClearButton(
                             components=[
@@ -440,6 +410,8 @@ def main():
                     label="Retrieval Preview",
                     columns=5,
                 )
             clear.add(
                 [
                     tags_str,
@@ -449,6 +421,7 @@ def main():
                     rating,
                     date,
                     retrieval_gallery,
                 ]
             )
@@ -463,6 +436,7 @@ def main():
                 rating,
                 date,
                 retrieval_gallery,
             ],
         )

 TITLE = "Danbooru Tagger"
 DESCRIPTION = """
 ## Dataset
+- Source: Danbooru
+- Cutoff Date: 2025-11-27
 - Validation Split: 10% of Dataset
+## Validation Results
 ### General
+Tags Count: 11046
 | Metric          | Value       |
 |-----------------|-------------|
+| Macro F1        | 0.4439      |
+| Macro Precision | 0.4168      |
+| Macro Recall    | 0.4964      |
+| Micro F1        | 0.6595      |
+| Micro Precision | 0.5982      |
+| Micro Recall    | 0.7349      |
 ### Character
+Tags Count: 9148
 | Metric          | Value       |
 |-----------------|-------------|
+| Macro F1        | 0.8646      |
+| Macro Precision | 0.8897      |
+| Macro Recall    | 0.8492      |
+| Micro F1        | 0.9092      |
+| Micro Precision | 0.9195      |
+| Micro Recall    | 0.8991      |
 ### Artist
+Tags Count: 17171
 | Metric          | Value       |
 |-----------------|-------------|
+| Macro F1        | 0.8008      |
+| Macro Precision | 0.8669      |
+| Macro Recall    | 0.7641      |
+| Micro F1        | 0.8596      |
+| Micro Precision | 0.8948      |
+| Micro Recall    | 0.8271      |
 """
 kaomojis = [
 else:
     raise ValueError("environment variable HF_TOKEN not found.")
+repo = snapshot_download('Johnny-Z/danbooru_vfm')
 model = AutoModel.from_pretrained(repo, dtype=dtype, trust_remote_code=True, device_map=device)
+index_dir = snapshot_download('Johnny-Z/dan_index_a', repo_type='dataset')
 processor = CLIPImageProcessor.from_pretrained(repo)
         x = self.sigmoid(x)
         return x
 class MLP_R(nn.Module):
     def __init__(self, input_size):
         super().__init__()
         self.mlp_layer0 = nn.Sequential(
+            nn.Linear(input_size, 384),
         )
     def forward(self, x):
 model_map.load_state_dict(torch.load(os.path.join(repo, "map_head.pth"), map_location=device, weights_only=True))
 model_map.to(device).to(dtype).eval()
+general_class = 11046
 mlp_general = MLP(2048, general_class)
 mlp_general.load_state_dict(torch.load(os.path.join(repo, "cls_predictor_general.pth"), map_location=device, weights_only=True))
 mlp_general.to(device).to(dtype).eval()
+character_class = 9148
 mlp_character = MLP(2048, character_class)
 mlp_character.load_state_dict(torch.load(os.path.join(repo, "cls_predictor_character.pth"), map_location=device, weights_only=True))
 mlp_character.to(device).to(dtype).eval()
+artist_class = 17171
 mlp_artist = MLP(2048, artist_class)
 mlp_artist.load_state_dict(torch.load(os.path.join(repo, "cls_predictor_artist.pth"), map_location=device, weights_only=True))
 mlp_artist.to(device).to(dtype).eval()
 mlp_r = MLP_R(2048)
 mlp_r.load_state_dict(torch.load(os.path.join(repo, "retrieval_head.pth"), map_location=device, weights_only=True))
 mlp_r.to(device).to(dtype).eval()
     return general, character, artist, date, rating
 def load_id_map(id_map_path):
     with open(id_map_path, "r") as f:
         id_map = json.load(f)
     return results
+def fetch_retrieval_image_urls(retrieval_results, sleep_sec=0.1, timeout=2.0):
     pairs = []
     for item in retrieval_results:
         oid = item.get("original_id")
                 url = "https:" + url
             elif url.startswith("/"):
                 url = "https://danbooru.donmai.us" + url
+            dist = item.get("l2_distance")
+            pairs.append((url, oid, dist))
+        except Exception:
             pass
         finally:
         url_id_pairs = fetch_retrieval_image_urls(retrieval_results)
+        retrieval_gallery_items = [
+            (
+                url,
+                f"distance={dist:.3f} | id={oid}"
+            )
+            for url, oid, dist in url_id_pairs
+        ]
+        retrieval_links = "\n".join(
+            f"[id={oid}](https://danbooru.donmai.us/posts/{oid})"
+            for url, oid, dist in url_id_pairs
+        )
         general_prediction = mlp_general(embedding)
         general_ = prediction_to_tag(general_prediction, general_dict, general_class)
         character_ = prediction_to_tag(character_prediction, character_dict, character_class)
         character_tags = character_[1]
+        artist_prediction = mlp_artist(embedding)
+        artist_ = prediction_to_tag(artist_prediction, artist_dict, artist_class)
+        artist_tags = artist_[2]
+        date = artist_[3]
     combined_tags = {**general_tags}
         rating,
         date,
         retrieval_gallery_items,
+        retrieval_links,
     )
 def main():
                     image = gr.Image(type="pil", image_mode="RGBA", label="Input")
                     k_slider = gr.Slider(1, 100, value=32, step=1, label="Top K Results")
                     distance_min_slider = gr.Slider(0, 128, value=0, step=1, label="Min Distance Threshold")
+                    distance_max_slider = gr.Slider(0, 128, value=64, step=1, label="Max Distance Threshold")
                     with gr.Row():
                         clear = gr.ClearButton(
                             components=[
                     label="Retrieval Preview",
                     columns=5,
                 )
+            retrieval_links = gr.Markdown(label="Retrieval Links")
             clear.add(
                 [
                     tags_str,
                     rating,
                     date,
                     retrieval_gallery,
+                    retrieval_links,
                 ]
             )
                 rating,
                 date,
                 retrieval_gallery,
+                retrieval_links,
             ],
         )