throaway2854 committed
Commit d254ba7 · verified · 1 Parent(s): d012bfe

Update app.py

Files changed (1): app.py (+192 -50)
app.py CHANGED
@@ -120,7 +120,7 @@ class VideoTagger:
     and exposes helpers to tag PIL images and full videos.
     """
 
-    def __init__(self, model_repo: str):
+    def __init__(self, model_repo: str, batch_size: int = 16):
         self.model_repo = model_repo
         self.model = None
         self.model_target_size = None  # will be set from ONNX input shape
@@ -128,6 +128,7 @@ class VideoTagger:
         self.rating_indexes = None
         self.general_indexes = None
         self.character_indexes = None
+        self.batch_size = batch_size
 
     def _download_model_files(self) -> Tuple[str, str]:
        csv_path = huggingface_hub.hf_hub_download(
@@ -202,6 +203,92 @@ class VideoTagger:
         arr = np.expand_dims(arr, axis=0)
         return arr
 
+    def _prepare_frame_bgr(self, frame_bgr: np.ndarray) -> np.ndarray:
+        """
+        Fast path for OpenCV frames (BGR uint8).
+        Pads to square, resizes to model_target_size, converts to float32.
+
+        Returns: (H, W, 3) float32 array in BGR format (no batch dim).
+        """
+        self._load_model_if_needed()
+        target_size = self.model_target_size
+
+        h, w, _ = frame_bgr.shape
+        max_dim = max(h, w)
+
+        # Compute symmetric padding to make it square
+        pad_vert = max_dim - h
+        pad_horiz = max_dim - w
+        top = pad_vert // 2
+        bottom = pad_vert - top
+        left = pad_horiz // 2
+        right = pad_horiz - left
+
+        # Pad with white background (255, 255, 255) in BGR
+        frame_square = cv2.copyMakeBorder(
+            frame_bgr,
+            top, bottom, left, right,
+            borderType=cv2.BORDER_CONSTANT,
+            value=(255, 255, 255),
+        )
+
+        # Resize if needed
+        if max_dim != target_size:
+            frame_square = cv2.resize(
+                frame_square,
+                (target_size, target_size),
+                interpolation=cv2.INTER_AREA,
+            )
+
+        # To float32, no color channel reordering needed (already BGR)
+        arr = frame_square.astype(np.float32)
+        return arr  # (H, W, 3)
+
+    def _run_batch_and_aggregate(
+        self,
+        batch_tensors: List[np.ndarray],
+        general_thresh: float,
+        character_thresh: float,
+        aggregated_general: Dict[str, float],
+        aggregated_character: Dict[str, float],
+    ) -> int:
+        """
+        Run ONNX inference on a batch of preprocessed frames and
+        update aggregated_general / aggregated_character with max scores.
+
+        Returns: number of frames processed in this batch.
+        """
+        if not batch_tensors:
+            return 0
+
+        self._load_model_if_needed()
+        input_name = self.model.get_inputs()[0].name
+        output_name = self.model.get_outputs()[0].name
+
+        # Stack into shape (B, H, W, 3)
+        input_tensor = np.stack(batch_tensors, axis=0)  # float32
+
+        preds_batch = self.model.run([output_name], {input_name: input_tensor})[0]
+        # preds_batch: (B, num_tags)
+
+        for preds in preds_batch:
+            general_res, character_res = self._extract_tags_from_scores(
+                preds,
+                general_thresh=general_thresh,
+                character_thresh=character_thresh,
+            )
+
+            # Aggregate max score for each tag
+            for tag, score in general_res.items():
+                if tag not in aggregated_general or score > aggregated_general[tag]:
+                    aggregated_general[tag] = score
+
+            for tag, score in character_res.items():
+                if tag not in aggregated_character or score > aggregated_character[tag]:
+                    aggregated_character[tag] = score
+
+        return len(batch_tensors)
+
     def tag_image(
         self,
         image: Image.Image,
@@ -225,6 +312,7 @@ class VideoTagger:
 
         labels = list(zip(self.tag_names, preds))
 
+
         # General tags
         general_names = [labels[i] for i in self.general_indexes]
         general_res = {
@@ -243,6 +331,40 @@ class VideoTagger:
 
         return general_res, character_res
 
+    def _extract_tags_from_scores(
+        self,
+        preds: np.ndarray,
+        general_thresh: float,
+        character_thresh: float,
+    ) -> Tuple[Dict[str, float], Dict[str, float]]:
+        """
+        Given a 1D preds array (num_tags,), return dicts of general/character tags.
+        More efficient than rebuilding label tuples every time.
+        """
+        # Ensure numpy array of floats
+        preds = preds.astype(float)
+
+        general_res: Dict[str, float] = {}
+        character_res: Dict[str, float] = {}
+
+        # General tags
+        general_scores = preds[self.general_indexes]
+        general_idx_array = np.array(self.general_indexes)
+        general_mask = general_scores > general_thresh
+        for idx, score in zip(general_idx_array[general_mask], general_scores[general_mask]):
+            tag = self.tag_names[idx]
+            general_res[tag] = float(score)
+
+        # Character tags
+        character_scores = preds[self.character_indexes]
+        character_idx_array = np.array(self.character_indexes)
+        character_mask = character_scores > character_thresh
+        for idx, score in zip(character_idx_array[character_mask], character_scores[character_mask]):
+            tag = self.tag_names[idx]
+            character_res[tag] = float(score)
+
+        return general_res, character_res
+
     def tag_video(
         self,
         video_path: str,
@@ -265,6 +387,8 @@ class VideoTagger:
 
         frame_interval = max(int(frame_interval), 1)
 
+        self._load_model_if_needed()
+
         if progress is not None:
             progress(0.0, desc="Opening video...")
 
@@ -272,20 +396,20 @@ class VideoTagger:
         if not cap.isOpened():
             raise RuntimeError("Unable to open video file.")
 
-        # Estimate total frames and how many will be processed
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0
         if total_frames <= 0:
-            total_frames = 1  # avoid division issues
+            total_frames = 1
 
         frames_to_process = max(1, (total_frames + frame_interval - 1) // frame_interval)
 
-        # Store max score seen for each tag across all frames
         aggregated_general: Dict[str, float] = {}
         aggregated_character: Dict[str, float] = {}
 
         frame_idx = 0
         processed_frames = 0
 
+        batch_tensors: List[np.ndarray] = []
+
         try:
             while True:
                 ret, frame = cap.read()
@@ -294,38 +418,44 @@ class VideoTagger:
 
                 # Only process every N-th frame
                 if frame_idx % frame_interval == 0:
-                    # Convert OpenCV BGR frame -> PIL image with alpha
-                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                    pil_image = Image.fromarray(frame_rgb).convert("RGBA")
-
-                    general_res, character_res = self.tag_image(
-                        pil_image,
-                        general_thresh=general_thresh,
-                        character_thresh=character_thresh,
-                    )
-
-                    # Aggregate by keeping max score per tag
-                    for tag, score in general_res.items():
-                        if tag not in aggregated_general or score > aggregated_general[tag]:
-                            aggregated_general[tag] = score
-
-                    for tag, score in character_res.items():
-                        if tag not in aggregated_character or score > aggregated_character[tag]:
-                            aggregated_character[tag] = score
-
-                    processed_frames += 1
-
-                    if progress is not None:
-                        ratio = min(processed_frames / frames_to_process, 0.99)
-                        progress(
-                            ratio,
-                            desc=f"Processing frame {processed_frames}/{frames_to_process}...",
+                    # frame is BGR uint8 from OpenCV
+                    arr = self._prepare_frame_bgr(frame)  # (H, W, 3) float32
+                    batch_tensors.append(arr)
+
+                    # If batch is full, run inference
+                    if len(batch_tensors) >= self.batch_size:
+                        num_done = self._run_batch_and_aggregate(
+                            batch_tensors,
+                            general_thresh=general_thresh,
+                            character_thresh=character_thresh,
+                            aggregated_general=aggregated_general,
+                            aggregated_character=aggregated_character,
                         )
+                        processed_frames += num_done
+                        batch_tensors = []
+
+                        if progress is not None:
+                            ratio = min(processed_frames / frames_to_process, 0.99)
+                            progress(
+                                ratio,
+                                desc=f"Processing frames {processed_frames}/{frames_to_process}...",
+                            )
 
                 frame_idx += 1
         finally:
             cap.release()
 
+        # Process any leftover frames in the last partial batch
+        if batch_tensors:
+            num_done = self._run_batch_and_aggregate(
+                batch_tensors,
+                general_thresh=general_thresh,
+                character_thresh=character_thresh,
+                aggregated_general=aggregated_general,
+                aggregated_character=aggregated_character,
+            )
+            processed_frames += num_done
+
         if progress is not None:
             progress(1.0, desc="Finalizing tags...")
 
@@ -335,29 +465,23 @@ class VideoTagger:
         # Apply substitutions & exclusions BEFORE final dedup
         adjusted_all_tags: Dict[str, float] = {}
 
-        # Normalize keys in substitutes/exclusions (strip whitespace)
         normalized_subs = {k.strip(): v.strip() for k, v in tag_substitutes.items() if k and v}
         normalized_exclusions = {t.strip() for t in tag_exclusions if t}
 
         for tag, score in all_tags_with_scores.items():
             original_tag = tag.strip()
 
-            # Skip if original tag is excluded
             if original_tag in normalized_exclusions:
                 continue
 
-            # Apply substitution (if any)
             new_tag = normalized_subs.get(original_tag, original_tag)
 
-            # Skip if substituted tag is excluded
             if new_tag in normalized_exclusions:
                 continue
 
-            # Keep max score for each resulting tag
             if new_tag not in adjusted_all_tags or score > adjusted_all_tags[new_tag]:
                 adjusted_all_tags[new_tag] = score
 
-        # Sort by score descending
         sorted_tags = sorted(
             adjusted_all_tags.items(),
             key=lambda kv: kv[1],
@@ -381,6 +505,7 @@ class VideoTagger:
             "character_threshold": float(character_thresh),
             "num_substitution_rules": len(normalized_subs),
             "num_exclusions": len(normalized_exclusions),
+            "batch_size": int(self.batch_size),
         }
 
         return combined_tags_str, debug_info
@@ -447,6 +572,7 @@ def tag_video_interface(
     model_repo: str,
    tag_substitutes_df,
    tag_exclusions_df,
+    batch_size: int,
    progress=gr.Progress(track_tqdm=False),
 ):
     if video_path is None:
@@ -454,6 +580,7 @@ def tag_video_interface(
 
     try:
         tagger = get_tagger(model_repo)
+        tagger.batch_size = int(batch_size)
 
         tag_substitutes = _normalize_tag_substitutes(tag_substitutes_df)
         tag_exclusions = _normalize_tag_exclusions(tag_exclusions_df)
@@ -485,22 +612,13 @@ with gr.Blocks(title=TITLE) as demo:
                 sources=["upload"],
                 format="mp4",
             )
-
+
             model_choice = gr.Dropdown(
                 choices=MODEL_OPTIONS,
                 value=DEFAULT_MODEL_REPO,
                 label="Tagging Model",
             )
 
-            frame_interval = gr.Slider(
-                minimum=1,
-                maximum=60,
-                step=1,
-                value=10,
-                label="Extract Every N Frames",
-                info="For example, 10 = use every 10th frame.",
-            )
-
             general_thresh = gr.Slider(
                 minimum=0.0,
                 maximum=1.0,
@@ -508,7 +626,7 @@ with gr.Blocks(title=TITLE) as demo:
                 value=0.35,
                 label="General Tags Threshold",
             )
-
+
             character_thresh = gr.Slider(
                 minimum=0.0,
                 maximum=1.0,
@@ -516,9 +634,32 @@ with gr.Blocks(title=TITLE) as demo:
                 value=0.85,
                 label="Character Tags Threshold",
             )
-
+
+            gr.Markdown("### Processing")
+
+            frame_interval = gr.Slider(
+                minimum=1,
+                maximum=60,
+                step=1,
+                value=10,
+                label="Extract Every N Frames",
+                info="For example, 10 = use every 10th frame.",
+            )
+
+            batch_size = gr.Slider(
+                minimum=1,
+                maximum=32,
+                step=1,
+                value=8,
+                label="Batch Size",
+                info=(
+                    "Larger batch sizes may increase initial loading time but can significantly "
+                    "improve total processing speed, especially for longer videos or high frame counts."
+                ),
+            )
+
             run_button = gr.Button("Generate Tags", variant="primary")
-
+
             with gr.Column():
                 combined_tags = gr.Textbox(
                     label="Combined Unique Tags (All Frames)",
@@ -529,6 +670,7 @@ with gr.Blocks(title=TITLE) as demo:
                     label="Details / Debug Info",
                 )
 
+
    # ---------------- TAB 2: TAG CONTROL ----------------
    with gr.Tab("Tag Control"):
        gr.Markdown("### Tag Substitutes")
@@ -582,7 +724,6 @@ with gr.Blocks(title=TITLE) as demo:
            )
 
 
-    # Wiring the button AFTER all components are defined
    run_button.click(
        fn=tag_video_interface,
        inputs=[
@@ -593,6 +734,7 @@ with gr.Blocks(title=TITLE) as demo:
            model_choice,
            tag_substitutes_df,
            tag_exclusions_df,
+            batch_size,
        ],
        outputs=[combined_tags, debug_info],
    )
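
A minimal sketch of driving the new batched path outside the Gradio UI. It is not part of the commit: get_tagger, DEFAULT_MODEL_REPO, and tag_video's keyword names are assumptions read off their call sites in the diff above, not a documented API.

    # Usage sketch only -- names and keyword arguments inferred from the diff.
    from app import get_tagger, DEFAULT_MODEL_REPO

    tagger = get_tagger(DEFAULT_MODEL_REPO)
    tagger.batch_size = 8  # the new slider's default; __init__ defaults to 16

    combined_tags_str, debug_info = tagger.tag_video(
        video_path="clip.mp4",     # hypothetical input file
        frame_interval=10,         # tag every 10th frame
        general_thresh=0.35,
        character_thresh=0.85,
        tag_substitutes={},        # no renames
        tag_exclusions=set(),      # nothing filtered out
        progress=None,             # tag_video guards every progress call with "is not None"
    )
    print(combined_tags_str)
    print(debug_info)

Frames accumulate in batch_tensors and only reach self.model.run(...) once per batch_size frames, so larger batches mean fewer ONNX calls per video; that is where this commit's speedup comes from.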
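
The masked-index pattern in _extract_tags_from_scores can also be sanity-checked in isolation; the tag names and indexes below are made up for illustration.

    import numpy as np

    # Hypothetical label space: two "general" tags at indexes 1 and 2.
    tag_names = ["rating:safe", "sky", "tree", "1girl"]
    general_indexes = [1, 2]
    preds = np.array([0.90, 0.70, 0.20, 0.95])

    scores = preds[general_indexes]    # array([0.7, 0.2])
    idx = np.array(general_indexes)
    mask = scores > 0.35               # array([ True, False])
    general_res = {tag_names[i]: float(s) for i, s in zip(idx[mask], scores[mask])}
    print(general_res)                 # {'sky': 0.7}

Thresholding stays vectorized and tag_names is only touched for scores that pass, instead of zipping every tag with every prediction per frame as tag_image does.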