Spaces:

throaway2854
/

AI_Video_Auto-Tagger

Running

App Files Files Community

throaway2854 committed 19 days ago

Commit

b00ebae

verified ·

1 Parent(s): cd33eb4

Update app.py

Browse files

Files changed (1) hide show

app.py +176 -48

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-from typing import Dict, Tuple
 import cv2
 import gradio as gr
@@ -17,6 +17,7 @@ combined (deduplicated) tags using a selected **WD-style tagging model**.
 - Extract every N-th frame (e.g., every 10th frame).
 - Control thresholds for **General Tags** and **Character Tags**.
 - All tags from all sampled frames are merged into **one unique, comma-separated string**.
 """
 DEFAULT_MODEL_REPO = "SmilingWolf/wd-eva02-large-tagger-v3"
@@ -218,6 +219,8 @@ class VideoTagger:
         frame_interval: int,
         general_thresh: float,
         character_thresh: float,
         progress=None,
     ) -> Tuple[str, Dict]:
         """
@@ -242,7 +245,7 @@ class VideoTagger:
         # Estimate total frames and how many will be processed
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0
         if total_frames <= 0:
-            total_frames = 1  # avoid division by zero / weird metadata
         frames_to_process = max(1, (total_frames + frame_interval - 1) // frame_interval)
@@ -298,8 +301,35 @@ class VideoTagger:
         # Merge character + general tags, sorted by score (desc)
         all_tags_with_scores = {**aggregated_general, **aggregated_character}
         sorted_tags = sorted(
-            all_tags_with_scores.items(),
             key=lambda kv: kv[1],
             reverse=True,
         )
@@ -313,12 +343,14 @@ class VideoTagger:
             "frames_processed": int(processed_frames),
             "estimated_total_frames": int(total_frames),
             "estimated_frames_to_process": int(frames_to_process),
-            "num_general_tags": len(aggregated_general),
-            "num_character_tags": len(aggregated_character),
-            "total_unique_tags": len(unique_tags),
             "frame_interval": int(frame_interval),
             "general_threshold": float(general_thresh),
             "character_threshold": float(character_thresh),
         }
         return combined_tags_str, debug_info
@@ -334,12 +366,57 @@ def get_tagger(model_repo: str) -> VideoTagger:
     return _tagger_cache[model_repo]
 def tag_video_interface(
     video_path: str,
     frame_interval: int,
     general_thresh: float,
     character_thresh: float,
     model_repo: str,
     progress=gr.Progress(track_tqdm=False),
 ):
     if video_path is None:
@@ -347,11 +424,17 @@ def tag_video_interface(
     try:
         tagger = get_tagger(model_repo)
         return tagger.tag_video(
             video_path=video_path,
             frame_interval=frame_interval,
             general_thresh=general_thresh,
             character_thresh=character_thresh,
             progress=progress,
         )
     except Exception as e:
@@ -362,60 +445,105 @@ with gr.Blocks(title=TITLE) as demo:
     gr.Markdown(f"## {TITLE}")
     gr.Markdown(DESCRIPTION)
-    with gr.Row():
-        with gr.Column():
-            video_input = gr.Video(
-                label="Video (.mp4 or .mov)",
-                sources=["upload"],
-                format="mp4",
-            )
-            model_choice = gr.Dropdown(
-                choices=MODEL_OPTIONS,
-                value=DEFAULT_MODEL_REPO,
-                label="Tagging Model",
-            )
-            frame_interval = gr.Slider(
-                minimum=1,
-                maximum=60,
-                step=1,
-                value=10,
-                label="Extract Every N Frames",
-                info="For example, 10 = use every 10th frame.",
-            )
-            general_thresh = gr.Slider(
-                minimum=0.0,
-                maximum=1.0,
-                step=0.01,
-                value=0.35,
-                label="General Tags Threshold",
-            )
-            character_thresh = gr.Slider(
-                minimum=0.0,
-                maximum=1.0,
-                step=0.01,
-                value=0.85,
-                label="Character Tags Threshold",
             )
-            run_button = gr.Button("Generate Tags", variant="primary")
-        with gr.Column():
-            combined_tags = gr.Textbox(
-                show_label=True,
-                label="Combined Unique Tags (All Frames)",
-                lines=6,
             )
-            debug_info = gr.JSON(
-                label="Details / Debug Info",
             )
     run_button.click(
         fn=tag_video_interface,
-        inputs=[video_input, frame_interval, general_thresh, character_thresh, model_choice],
         outputs=[combined_tags, debug_info],
     )

 import os
+from typing import Dict, Tuple, List, Set
 import cv2
 import gradio as gr
 - Extract every N-th frame (e.g., every 10th frame).
 - Control thresholds for **General Tags** and **Character Tags**.
 - All tags from all sampled frames are merged into **one unique, comma-separated string**.
+- Use the **Tag Control** tab to define tag substitutions and exclusions for the final output.
 """
 DEFAULT_MODEL_REPO = "SmilingWolf/wd-eva02-large-tagger-v3"
         frame_interval: int,
         general_thresh: float,
         character_thresh: float,
+        tag_substitutes: Dict[str, str],
+        tag_exclusions: Set[str],
         progress=None,
     ) -> Tuple[str, Dict]:
         """
         # Estimate total frames and how many will be processed
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0
         if total_frames <= 0:
+            total_frames = 1  # avoid division issues
         frames_to_process = max(1, (total_frames + frame_interval - 1) // frame_interval)
         # Merge character + general tags, sorted by score (desc)
         all_tags_with_scores = {**aggregated_general, **aggregated_character}
+        # Apply substitutions & exclusions BEFORE final dedup
+        adjusted_all_tags: Dict[str, float] = {}
+        # Normalize keys in substitutes/exclusions (strip whitespace)
+        normalized_subs = {k.strip(): v.strip() for k, v in tag_substitutes.items() if k and v}
+        normalized_exclusions = {t.strip() for t in tag_exclusions if t}
+        for tag, score in all_tags_with_scores.items():
+            original_tag = tag.strip()
+            # Skip if original tag is excluded
+            if original_tag in normalized_exclusions:
+                continue
+            # Apply substitution (if any)
+            new_tag = normalized_subs.get(original_tag, original_tag)
+            # Skip if substituted tag is excluded
+            if new_tag in normalized_exclusions:
+                continue
+            # Keep max score for each resulting tag
+            if new_tag not in adjusted_all_tags or score > adjusted_all_tags[new_tag]:
+                adjusted_all_tags[new_tag] = score
+        # Sort by score descending
         sorted_tags = sorted(
+            adjusted_all_tags.items(),
             key=lambda kv: kv[1],
             reverse=True,
         )
             "frames_processed": int(processed_frames),
             "estimated_total_frames": int(total_frames),
             "estimated_frames_to_process": int(frames_to_process),
+            "num_general_tags_raw": len(aggregated_general),
+            "num_character_tags_raw": len(aggregated_character),
+            "total_unique_tags_after_control": len(unique_tags),
             "frame_interval": int(frame_interval),
             "general_threshold": float(general_thresh),
             "character_threshold": float(character_thresh),
+            "num_substitution_rules": len(normalized_subs),
+            "num_exclusions": len(normalized_exclusions),
         }
         return combined_tags_str, debug_info
     return _tagger_cache[model_repo]
+def _normalize_tag_substitutes(data) -> Dict[str, str]:
+    """
+    Convert Dataframe (as array: list[list]) into {original: substitute}.
+
+    Args:
+        data: Rows of ``[original, substitute]`` cells, or ``None``.
+
+    Returns:
+        Mapping of stripped original tag -> stripped substitute tag. Rows
+        that are not a list/tuple, have fewer than two cells, or have an
+        empty cell on either side are ignored. If the same original tag
+        appears more than once, the last row wins.
+    """
+    def _cell_text(cell) -> str:
+        # None and NaN (the only value unequal to itself) both mean "empty".
+        if cell is None or cell != cell:
+            return ""
+        # Cells may arrive as numbers; coerce instead of crashing on .strip().
+        return str(cell).strip()
+
+    mapping: Dict[str, str] = {}
+    if data is None:
+        return mapping
+    # Expect data as list of [original, substitute]
+    for row in data:
+        if not isinstance(row, (list, tuple)) or len(row) < 2:
+            continue
+        orig = _cell_text(row[0])
+        sub = _cell_text(row[1])
+        if orig and sub:
+            mapping[orig] = sub
+    return mapping
+def _normalize_tag_exclusions(data) -> Set[str]:
+    """
+    Convert Dataframe (as array: list[list]) into set of tags to exclude.
+
+    Args:
+        data: Rows of ``[tag]`` cells (bare, un-nested values are also
+            accepted), or ``None``.
+
+    Returns:
+        Set of stripped, non-empty tag strings. Only the first cell of each
+        row is read; empty rows and empty cells are ignored.
+    """
+    exclusions: Set[str] = set()
+    if data is None:
+        return exclusions
+    # Expect data as list of [tag] rows
+    for row in data:
+        if isinstance(row, (list, tuple)):
+            if not row:
+                continue
+            val = row[0]
+        else:
+            val = row
+        # None and NaN (the only value unequal to itself) both mean "empty".
+        if val is None or val != val:
+            continue
+        # Cells may arrive as numbers; coerce instead of crashing on .strip().
+        val = str(val).strip()
+        if val:
+            exclusions.add(val)
+    return exclusions
 def tag_video_interface(
     video_path: str,
     frame_interval: int,
     general_thresh: float,
     character_thresh: float,
     model_repo: str,
+    tag_substitutes_df,
+    tag_exclusions_df,
     progress=gr.Progress(track_tqdm=False),
 ):
     if video_path is None:
     try:
         tagger = get_tagger(model_repo)
+        tag_substitutes = _normalize_tag_substitutes(tag_substitutes_df)
+        tag_exclusions = _normalize_tag_exclusions(tag_exclusions_df)
         return tagger.tag_video(
             video_path=video_path,
             frame_interval=frame_interval,
             general_thresh=general_thresh,
             character_thresh=character_thresh,
+            tag_substitutes=tag_substitutes,
+            tag_exclusions=tag_exclusions,
             progress=progress,
         )
     except Exception as e:
     gr.Markdown(f"## {TITLE}")
     gr.Markdown(DESCRIPTION)
+    with gr.Tabs():
+        # ---------------- TAB 1: TAGGING ----------------
+        with gr.Tab("Tagging"):
+            with gr.Row():
+                with gr.Column():
+                    video_input = gr.Video(
+                        label="Video (.mp4 or .mov)",
+                        sources=["upload"],
+                        format="mp4",
+                    )
+                    model_choice = gr.Dropdown(
+                        choices=MODEL_OPTIONS,
+                        value=DEFAULT_MODEL_REPO,
+                        label="Tagging Model",
+                    )
+                    frame_interval = gr.Slider(
+                        minimum=1,
+                        maximum=60,
+                        step=1,
+                        value=10,
+                        label="Extract Every N Frames",
+                        info="For example, 10 = use every 10th frame.",
+                    )
+                    general_thresh = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.01,
+                        value=0.35,
+                        label="General Tags Threshold",
+                    )
+                    character_thresh = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.01,
+                        value=0.85,
+                        label="Character Tags Threshold",
+                    )
+                    run_button = gr.Button("Generate Tags", variant="primary")
+                with gr.Column():
+                    combined_tags = gr.Textbox(
+                        label="Combined Unique Tags (All Frames)",
+                        lines=6,
+                        # Pass a value, not the type signature copied from the
+                        # docs: the annotation form is a SyntaxError here.
+                        buttons=["copy"],
+                    )
+                    debug_info = gr.JSON(
+                        label="Details / Debug Info",
+                    )
+        # ---------------- TAB 2: TAG CONTROL ----------------
+        with gr.Tab("Tag Control"):
+            gr.Markdown("### Tag Substitutes")
+            gr.Markdown(
+                "Add rows where **Original Tag** will be replaced by **Substitute Tag** "
+                "in the final combined output (after all frames are processed)."
             )
+            tag_substitutes_df = gr.Dataframe(
+                headers=["Original Tag", "Substitute Tag"],
+                datatype=["str", "str"],
+                row_count=3,
+                col_count=2,
+                type="array",
+                label="Tag Substitutes",
+                interactive=True,
+            )
+            gr.Markdown("### Tag Exclusions")
+            gr.Markdown(
+                "Add tags that should be **removed entirely** from the final combined output."
             )
+            tag_exclusions_df = gr.Dataframe(
+                headers=["Tag to Exclude"],
+                datatype=["str"],
+                row_count=3,
+                col_count=1,
+                type="array",
+                label="Tag Exclusions",
+                interactive=True,
             )
+    # Wiring the button AFTER all components are defined
     run_button.click(
         fn=tag_video_interface,
+        inputs=[
+            video_input,
+            frame_interval,
+            general_thresh,
+            character_thresh,
+            model_choice,
+            tag_substitutes_df,
+            tag_exclusions_df,
+        ],
         outputs=[combined_tags, debug_info],
     )