Spaces:

throaway2854
/

AI_Video_Auto-Tagger

Running

App Files Community

throaway2854 committed 19 days ago

Commit

53617e7

verified ·

1 Parent(s): f8ca038

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -9

app.py CHANGED Viewed

@@ -9,17 +9,28 @@ import onnxruntime as rt
 import pandas as pd
 from PIL import Image
-TITLE = "Video Tagger (SmilingWolf/wd-eva02-large-tagger-v3)"
 DESCRIPTION = """
 Upload a .mp4 or .mov video, choose how often to sample frames, and generate
-combined (deduplicated) tags using **SmilingWolf/wd-eva02-large-tagger-v3**.
 - Extract every N-th frame (e.g., every 10th frame).
 - Control thresholds for **General Tags** and **Character Tags**.
 - All tags from all sampled frames are merged into **one unique, comma-separated string**.
 """
-MODEL_REPO = "SmilingWolf/wd-eva02-large-tagger-v3"
 MODEL_FILENAME = "model.onnx"
 LABEL_FILENAME = "selected_tags.csv"
@@ -74,11 +85,11 @@ def load_labels(df: pd.DataFrame):
 class VideoTagger:
     """
-    Wraps wd-eva02-large-tagger-v3 ONNX model and tag metadata,
     and exposes helpers to tag PIL images and full videos.
     """
-    def __init__(self, model_repo: str = MODEL_REPO):
         self.model_repo = model_repo
         self.model = None
         self.model_target_size = None  # will be set from ONNX input shape
@@ -207,6 +218,7 @@ class VideoTagger:
         frame_interval: int,
         general_thresh: float,
         character_thresh: float,
     ) -> Tuple[str, Dict]:
         """
         Tag a video by sampling every N-th frame and aggregating tags.
@@ -220,10 +232,20 @@ class VideoTagger:
         frame_interval = max(int(frame_interval), 1)
         cap = cv2.VideoCapture(video_path)
         if not cap.isOpened():
             raise RuntimeError("Unable to open video file.")
         # Store max score seen for each tag across all frames
         aggregated_general: Dict[str, float] = {}
         aggregated_character: Dict[str, float] = {}
@@ -260,10 +282,20 @@ class VideoTagger:
                     processed_frames += 1
                 frame_idx += 1
         finally:
             cap.release()
         # Merge character + general tags, sorted by score (desc)
         all_tags_with_scores = {**aggregated_general, **aggregated_character}
         sorted_tags = sorted(
@@ -276,8 +308,11 @@ class VideoTagger:
         combined_tags_str = ", ".join(unique_tags)
         debug_info = {
             "frames_read": int(frame_idx),
             "frames_processed": int(processed_frames),
             "num_general_tags": len(aggregated_general),
             "num_character_tags": len(aggregated_character),
             "total_unique_tags": len(unique_tags),
@@ -289,8 +324,14 @@ class VideoTagger:
         return combined_tags_str, debug_info
-# Global model instance (loaded once per Space)
-video_tagger = VideoTagger()
 def tag_video_interface(
@@ -298,16 +339,20 @@ def tag_video_interface(
     frame_interval: int,
     general_thresh: float,
     character_thresh: float,
 ):
     if video_path is None:
         return "", {"error": "Please upload a video file."}
     try:
-        return video_tagger.tag_video(
             video_path=video_path,
             frame_interval=frame_interval,
             general_thresh=general_thresh,
             character_thresh=character_thresh,
         )
     except Exception as e:
         return "", {"error": str(e)}
@@ -325,6 +370,12 @@ with gr.Blocks(title=TITLE) as demo:
                 format="mp4",
             )
             frame_interval = gr.Slider(
                 minimum=1,
                 maximum=60,
@@ -363,7 +414,7 @@ with gr.Blocks(title=TITLE) as demo:
     run_button.click(
         fn=tag_video_interface,
-        inputs=[video_input, frame_interval, general_thresh, character_thresh],
         outputs=[combined_tags, debug_info],
     )

 import pandas as pd
 from PIL import Image
+TITLE = "Video Tagger (WD Tagger Variants)"
 DESCRIPTION = """
 Upload a .mp4 or .mov video, choose how often to sample frames, and generate
+combined (deduplicated) tags using a selected **WD-style tagging model**.
 - Extract every N-th frame (e.g., every 10th frame).
 - Control thresholds for **General Tags** and **Character Tags**.
 - All tags from all sampled frames are merged into **one unique, comma-separated string**.
 """
+DEFAULT_MODEL_REPO = "SmilingWolf/wd-eva02-large-tagger-v3"
+MODEL_OPTIONS = [
+    "SmilingWolf/wd-eva02-large-tagger-v3",
+    "SmilingWolf/wd-vit-large-tagger-v3",
+    "SmilingWolf/wd-vit-tagger-v3",
+    "SmilingWolf/wd-convnext-tagger-v3",
+    "SmilingWolf/wd-swinv2-tagger-v3",
+    "deepghs/idolsankaku-eva02-large-tagger-v1",
+    "deepghs/idolsankaku-swinv2-tagger-v1",
+]
 MODEL_FILENAME = "model.onnx"
 LABEL_FILENAME = "selected_tags.csv"
 class VideoTagger:
     """
+    Wraps a WD-style ONNX model and tag metadata,
     and exposes helpers to tag PIL images and full videos.
     """
+    def __init__(self, model_repo: str):
         self.model_repo = model_repo
         self.model = None
         self.model_target_size = None  # will be set from ONNX input shape
         frame_interval: int,
         general_thresh: float,
         character_thresh: float,
+        progress=None,
     ) -> Tuple[str, Dict]:
         """
         Tag a video by sampling every N-th frame and aggregating tags.
         frame_interval = max(int(frame_interval), 1)
+        if progress is not None:
+            progress(0.0, desc="Opening video...")
         cap = cv2.VideoCapture(video_path)
         if not cap.isOpened():
             raise RuntimeError("Unable to open video file.")
+        # Estimate total frames and how many will be processed
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0
+        if total_frames <= 0:
+            total_frames = 1  # avoid division by zero / weird metadata
+        frames_to_process = max(1, (total_frames + frame_interval - 1) // frame_interval)
         # Store max score seen for each tag across all frames
         aggregated_general: Dict[str, float] = {}
         aggregated_character: Dict[str, float] = {}
                     processed_frames += 1
+                    if progress is not None:
+                        ratio = min(processed_frames / frames_to_process, 0.99)
+                        progress(
+                            ratio,
+                            desc=f"Processing frame {processed_frames}/{frames_to_process}...",
+                        )
                 frame_idx += 1
         finally:
             cap.release()
+        if progress is not None:
+            progress(1.0, desc="Finalizing tags...")
         # Merge character + general tags, sorted by score (desc)
         all_tags_with_scores = {**aggregated_general, **aggregated_character}
         sorted_tags = sorted(
         combined_tags_str = ", ".join(unique_tags)
         debug_info = {
+            "model_repo": self.model_repo,
             "frames_read": int(frame_idx),
             "frames_processed": int(processed_frames),
+            "estimated_total_frames": int(total_frames),
+            "estimated_frames_to_process": int(frames_to_process),
             "num_general_tags": len(aggregated_general),
             "num_character_tags": len(aggregated_character),
             "total_unique_tags": len(unique_tags),
         return combined_tags_str, debug_info
+# Cache of VideoTagger instances per model repo
+_tagger_cache: Dict[str, VideoTagger] = {}
+def get_tagger(model_repo: str) -> VideoTagger:
+    if model_repo not in _tagger_cache:
+        _tagger_cache[model_repo] = VideoTagger(model_repo=model_repo)
+    return _tagger_cache[model_repo]
 def tag_video_interface(
     frame_interval: int,
     general_thresh: float,
     character_thresh: float,
+    model_repo: str,
+    progress=gr.Progress(track_tqdm=False),
 ):
     if video_path is None:
         return "", {"error": "Please upload a video file."}
     try:
+        tagger = get_tagger(model_repo)
+        return tagger.tag_video(
             video_path=video_path,
             frame_interval=frame_interval,
             general_thresh=general_thresh,
             character_thresh=character_thresh,
+            progress=progress,
         )
     except Exception as e:
         return "", {"error": str(e)}
                 format="mp4",
             )
+            model_choice = gr.Dropdown(
+                choices=MODEL_OPTIONS,
+                value=DEFAULT_MODEL_REPO,
+                label="Tagging Model",
+            )
             frame_interval = gr.Slider(
                 minimum=1,
                 maximum=60,
     run_button.click(
         fn=tag_video_interface,
+        inputs=[video_input, frame_interval, general_thresh, character_thresh, model_choice],
         outputs=[combined_tags, debug_info],
     )