Darknsu committed (verified)
Commit a51395e · 1 Parent(s): f7a6c33

Upload 24 files

.gitattributes CHANGED
@@ -1,35 +1,35 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text

+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Sakib Reza
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,91 @@
- ---
- title: HATTAL
- emoji: 🐠
- colorFrom: indigo
- colorTo: green
- sdk: gradio
- sdk_version: 5.34.1
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # HAT: History-Augmented Anchor Transformer for Online Temporal Action Localization (ECCV 2024)
+ ### Sakib Reza, Yuexi Zhang, Mohsen Moghaddam, Octavia Camps
+ #### Northeastern University, Boston, United States
+ {reza.s,zhang.yuex,mohsen,o.camps}@northeastern.edu
+
+ ## [arXiv Preprint](https://arxiv.org/abs/2408.06437)
+
+ ## Updates
+ - Aug 22, 2024 - EGTEA pre-extracted features and config files for other datasets added
+ - Aug 14, 2024 - arXiv preprint added
+ - July 7, 2024 - Initial code release
+
+ ## Installation
+
+ ### Prerequisites
+ - Ubuntu 20.04
+ - Python 3.10.9
+ - CUDA 12.0
+
+ ### Requirements
+ - pytorch==2.0.0
+ - numpy==1.23.5
+ - h5py==3.9.0
+ - ...
+
+ To install all required libraries, execute the pip command below.
+ ```
+ pip install -r requirement.txt
+ ```
+
+ ## Training
+
+ ### Input Features
+ The Kinetics-pretrained I3D features for the EGTEA dataset can be downloaded from this [GDrive link](https://drive.google.com/drive/folders/1Zj1B2UZnjPgLrylhKOfu7m_9rkQFa14T?usp=sharing).
+ Place the downloaded files in `data/`.
+ Features for the other datasets are available from the following links:
+ - [EPIC-Kitchens 100](https://github.com/happyharrycn/actionformer_release)
+ - [THUMOS'14](https://github.com/YHKimGithub/OAT-OSN/)
+ - [MUSES](https://songbai.site/muses/)
+
+ ### Config Files
+ The configuration files for EGTEA are already provided in the repository. Configs for the other datasets can be downloaded from this [GDrive link](https://drive.google.com/drive/folders/19__GnM2HZCCDshED9kadsLNAI9XBvrFd?usp=sharing).
+
+ ### Training Model
+ To train the main HAT model, execute the command below.
+ ```
+ python main.py --mode=train --split=[split #]*
+ ```
+ For example:
+ ```
+ !python main.py --mode=train --batch_size=256 --epoch=1
+ ```
+ *If the dataset has splits (e.g., EGTEA has 4 splits)
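+
+ A minimal sketch for training all EGTEA splits in sequence (assuming splits are numbered 1-4; check the dataset config/opts file for the exact numbering):
+ ```
+ for s in 1 2 3 4; do
+     python main.py --mode=train --split=$s
+ done
+ ```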
+
+ To train the post-processing network (OSN), execute the commands below.
+ ```
+ python supnet.py --mode=make --inference_subset=train --split=[split #]
+ python supnet.py --mode=make --inference_subset=test --split=[split #]
+ python supnet.py --mode=train --split=[split #]
+ ```
+
+ ## Testing
+ To test HAT, execute the command below.
+ ```
+ python main.py --mode=test --split=[split #]
+ ```
+ For example:
+ ```
+ !python main.py --mode=test --batch_size=256 --epoch=1
+ ```
+
+ ## Citing HAT
+ Please cite our paper in your publications if it helps your research:
+
+ ```BibTeX
+ @inproceedings{reza2022history,
+ title={HAT: History-Augmented Anchor Transformer for Online Temporal Action Localization},
+ author={Reza, Sakib and Zhang, Yuexi and Moghaddam, Mohsen and Camps, Octavia},
+ booktitle={European Conference on Computer Vision},
+ pages={XXX--XXX},
+ year={2024},
+ organization={Springer}
+ }
+ ```
+
+ ## Acknowledgment
+ This repository is built on top of the repository of the baseline work [OAT-OSN](https://github.com/YHKimGithub/OAT-OSN/).
annotated video generate main.py ADDED
@@ -0,0 +1,924 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.parallel
6
+ import torch.nn.functional as F
7
+ import torch.optim as optim
8
+ import numpy as np
9
+ import opts_egtea as opts
10
+
11
+ import time
12
+ import h5py
13
+ from tqdm import tqdm
14
+ from iou_utils import *
15
+ from eval import evaluation_detection
16
+ from tensorboardX import SummaryWriter
17
+ from dataset import VideoDataSet, calc_iou
18
+ from models import MYNET, SuppressNet
19
+ from loss_func import cls_loss_func, cls_loss_func_, regress_loss_func
20
+ from loss_func import MultiCrossEntropyLoss
21
+ from functools import *
22
+
23
+ import matplotlib.pyplot as plt
24
+ import matplotlib.patches as patches
25
+ import cv2
26
+ from typing import List, Dict, Optional
27
+
28
+ # Visualization Configuration
32
+ VIS_CONFIG = {
33
+ 'frame_interval': 1.0,
34
+ 'max_frames': 20,
35
+ 'save_dir': './output/visualizations',
36
+ 'video_save_dir': './output/videos',
37
+ 'gt_color': '#1f77b4', # Blue for ground truth (RGB: 31, 119, 180)
38
+ 'pred_color': '#ff7f0e', # Orange for predictions (RGB: 255, 127, 14)
39
+ 'fontsize_label': 10,
40
+ 'fontsize_title': 14,
41
+ 'frame_highlight_both': 'green',
42
+ 'frame_highlight_gt': 'red',
43
+ 'frame_highlight_pred': 'black',
44
+ 'iou_threshold': 0.3,
45
+ 'frame_scale_factor': 0.8,
46
+ 'video_text_scale': 0.5, # Smaller text size
47
+ 'video_gt_text_color': (180, 119, 31), # BGR for OpenCV
48
+ 'video_pred_text_color': (14, 127, 255), # BGR for OpenCV
49
+ 'video_text_thickness': 1, # Thinner for smaller text
50
+ 'video_font_path': './fonts/Roboto-Regular.ttf', # Path to TrueType font
51
+ 'video_pred_text_y': 0.45, # Fraction of frame height (slightly above middle)
52
+ 'video_gt_text_y': 0.55, # Fraction of frame height (slightly below middle)
53
+ }
54
+
55
+ from PIL import Image, ImageDraw, ImageFont
56
+ import warnings
57
+
58
+ def annotate_video_with_actions(
59
+ video_id: str,
60
+ pred_segments: List[Dict],
61
+ gt_segments: List[Dict],
62
+ video_path: str,
63
+ save_dir: str = VIS_CONFIG['video_save_dir'],
64
+ text_scale: float = VIS_CONFIG['video_text_scale'],
65
+ gt_text_color: tuple = VIS_CONFIG['video_gt_text_color'],
66
+ pred_text_color: tuple = VIS_CONFIG['video_pred_text_color'],
67
+ text_thickness: int = VIS_CONFIG['video_text_thickness']
68
+ ) -> None:
69
+ """
70
+ Annotate a video with predicted and ground truth action labels overlaid on frames using a stylish font.
71
+
72
+ Args:
73
+ video_id: Video identifier (e.g., 'my_video').
74
+ pred_segments: List of predicted segments with 'label', 'start', 'end', 'duration', 'score'.
75
+ gt_segments: List of ground truth segments with 'label', 'start', 'end', 'duration'.
76
+ video_path: Path to the input video file.
77
+ save_dir: Directory to save the annotated video.
78
+ text_scale: Scale factor for text size.
79
+ gt_text_color: BGR color tuple for ground truth text.
80
+ pred_text_color: BGR color tuple for predicted text.
81
+ text_thickness: Thickness of text strokes.
82
+ """
83
+ os.makedirs(save_dir, exist_ok=True)
84
+
85
+ # Open input video
86
+ cap = cv2.VideoCapture(video_path)
87
+ if not cap.isOpened():
88
+ print(f"Error: Could not open video {video_path}. Skipping video annotation.")
89
+ return
90
+
91
+ # Get video properties
92
+ fps = cap.get(cv2.CAP_PROP_FPS)
93
+ frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
94
+ frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
95
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
96
+ print(f"Input Video: FPS={fps:.2f}, Resolution={frame_width}x{frame_height}, Total Frames={total_frames}")
97
+
98
+ # Define output video
99
+ output_path = os.path.join(save_dir, f"annotated_{video_id}_{opt['exp']}.avi")
100
+ fourcc = cv2.VideoWriter_fourcc(*'XVID')
101
+ out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
102
+
103
+ if not out.isOpened():
104
+ print(f"Error: Could not initialize video writer for {output_path}. Check codec availability.")
105
+ cap.release()
106
+ return
107
+
108
+ # Load font
109
+ font_path = VIS_CONFIG['video_font_path']
110
+ font_size = int(20 * text_scale) # Base size adjusted by scale
111
+ try:
112
+ font = ImageFont.truetype(font_path, font_size)
113
+ except IOError:
114
+ print(f"Warning: Font {font_path} not found. Falling back to OpenCV default font.")
115
+ font = None
116
+
117
+ frame_idx = 0
118
+ written_frames = 0
119
+ while cap.isOpened():
120
+ ret, frame = cap.read()
121
+ if not ret:
122
+ break
123
+
124
+ # Calculate current timestamp
125
+ timestamp = frame_idx / fps
126
+
127
+ # Find active GT actions
128
+ gt_labels = [seg['label'] for seg in gt_segments if seg['start'] <= timestamp <= seg['end']]
129
+ gt_text = "GT: " + ", ".join(gt_labels) if gt_labels else "GT: None"
130
+
131
+ # Find active predicted actions
132
+ pred_labels = [seg['label'] for seg in pred_segments if seg['start'] <= timestamp <= seg['end']]
133
+ pred_text = "Pred: " + ", ".join(pred_labels) if pred_labels else "Pred: None"
134
+
135
+ if font:
136
+ # Convert frame to PIL image
137
+ frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
138
+ pil_image = Image.fromarray(frame_rgb)
139
+ draw = ImageDraw.Draw(pil_image)
140
+
141
+ # Draw GT text (left-middle, slightly below center)
142
+ gt_y = int(frame_height * VIS_CONFIG['video_gt_text_y'])
143
+ draw.text((10, gt_y), gt_text, font=font, fill=(gt_text_color[2], gt_text_color[1], gt_text_color[0]))
144
+
145
+ # Draw predicted text (left-middle, slightly above center)
146
+ pred_y = int(frame_height * VIS_CONFIG['video_pred_text_y'])
147
+ draw.text((10, pred_y), pred_text, font=font, fill=(pred_text_color[2], pred_text_color[1], pred_text_color[0]))
148
+
149
+ # Convert back to OpenCV frame
150
+ frame = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
151
+ else:
152
+ # Fallback to OpenCV font
153
+ cv2.putText(
154
+ frame,
155
+ gt_text,
156
+ (10, int(frame_height * VIS_CONFIG['video_gt_text_y'])),
157
+ cv2.FONT_HERSHEY_DUPLEX, # Slightly more stylish than SIMPLEX
158
+ text_scale,
159
+ gt_text_color,
160
+ text_thickness,
161
+ cv2.LINE_AA
162
+ )
163
+ cv2.putText(
164
+ frame,
165
+ pred_text,
166
+ (10, int(frame_height * VIS_CONFIG['video_pred_text_y'])),
167
+ cv2.FONT_HERSHEY_DUPLEX,
168
+ text_scale,
169
+ pred_text_color,
170
+ text_thickness,
171
+ cv2.LINE_AA
172
+ )
173
+
174
+ # Write frame to output video
175
+ out.write(frame)
176
+ written_frames += 1
177
+ frame_idx += 1
178
+
179
+ # Release resources
180
+ cap.release()
181
+ out.release()
182
+ print(f"[✅ Saved Annotated Video]: {output_path}, Written Frames={written_frames}")
183
+ print("Note: If .avi is not playable, convert to .mp4 using FFmpeg:")
184
+ print(f"ffmpeg -i {output_path} -vcodec libx264 -acodec aac {output_path.replace('.avi', '.mp4')}")
185
+
186
+ def visualize_action_lengths(
187
+ video_id: str,
188
+ pred_segments: List[Dict],
189
+ gt_segments: List[Dict],
190
+ video_path: str,
191
+ duration: float,
192
+ save_dir: str = VIS_CONFIG['save_dir'],
193
+ frame_interval: float = VIS_CONFIG['frame_interval']
194
+ ) -> None:
195
+ """
196
+ Generate a visualization plot comparing ground truth and predicted action lengths with video frames.
197
+
198
+ Args:
199
+ video_id: Video identifier (e.g., 'my_video').
200
+ pred_segments: List of predicted segments with 'label', 'start', 'end', 'duration', 'score'.
201
+ gt_segments: List of ground truth segments with 'label', 'start', 'end', 'duration'.
202
+ video_path: Path to the input video file.
203
+ duration: Total duration of the video in seconds.
204
+ save_dir: Directory to save the output image.
205
+ frame_interval: Time interval between sampled frames (seconds).
206
+ """
207
+ os.makedirs(save_dir, exist_ok=True)
208
+
209
+ # Calculate frame sampling times
210
+ num_frames = int(duration / frame_interval) + 1
211
+ if num_frames > VIS_CONFIG['max_frames']:
212
+ frame_interval = duration / (VIS_CONFIG['max_frames'] - 1)
213
+ num_frames = VIS_CONFIG['max_frames']
214
+ print(f"Warning: Video duration ({duration:.1f}s) requires {num_frames} frames. Adjusted frame_interval to {frame_interval:.2f}s.")
215
+
216
+ frame_times = np.linspace(0, duration, num_frames, endpoint=False)
217
+
218
+ # Load video frames
219
+ frames = []
220
+ cap = cv2.VideoCapture(video_path)
221
+ if not cap.isOpened():
222
+ print(f"Warning: Could not open video {video_path}. Using placeholder frames.")
223
+ frames = [np.ones((100, 100, 3), dtype=np.uint8) * 255 for _ in frame_times]
224
+ else:
225
+ for t in frame_times:
226
+ cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000)
227
+ ret, frame = cap.read()
228
+ if ret:
229
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
230
+ # Resize frame to reduce memory usage
231
+ frame = cv2.resize(frame, (int(frame.shape[1] * 0.5), int(frame.shape[0] * 0.5)))
232
+ frames.append(frame)
233
+ else:
234
+ frames.append(np.ones((100, 100, 3), dtype=np.uint8) * 255)
235
+ cap.release()
236
+
237
+ # Initialize figure
238
+ fig = plt.figure(figsize=(num_frames * VIS_CONFIG['frame_scale_factor'], 6), constrained_layout=True)
239
+ gs = fig.add_gridspec(3, num_frames, height_ratios=[3, 1, 1])
240
+
241
+ # Plot frames
242
+ for i, (t, frame) in enumerate(zip(frame_times, frames)):
243
+ ax = fig.add_subplot(gs[0, i])
244
+
245
+ # Check if frame falls within GT or predicted segments
246
+ gt_hit = any(seg['start'] <= t <= seg['end'] for seg in gt_segments)
247
+ pred_hit = any(seg['start'] <= t <= seg['end'] for seg in pred_segments)
248
+
249
+ # Set border color
250
+ border_color = None
251
+ if gt_hit and pred_hit:
252
+ border_color = VIS_CONFIG['frame_highlight_both']
253
+ elif gt_hit:
254
+ border_color = VIS_CONFIG['frame_highlight_gt']
255
+ elif pred_hit:
256
+ border_color = VIS_CONFIG['frame_highlight_pred']
257
+
258
+ ax.imshow(frame)
259
+ ax.axis('off')
260
+ if border_color:
261
+ for spine in ax.spines.values():
262
+ spine.set_edgecolor(border_color)
263
+ spine.set_linewidth(2)
264
+
265
+ ax.set_title(f"{t:.1f}s", fontsize=VIS_CONFIG['fontsize_label'],
266
+ color=border_color if border_color else 'black')
267
+
268
+ # Plot ground truth bar
269
+ ax_gt = fig.add_subplot(gs[1, :])
270
+ ax_gt.set_xlim(0, duration)
271
+ ax_gt.set_ylim(0, 1)
272
+ ax_gt.axis('off')
273
+ ax_gt.text(-0.02 * duration, 0.5, "Ground Truth", fontsize=VIS_CONFIG['fontsize_title'],
274
+ va='center', ha='right', weight='bold')
275
+
276
+ for seg in gt_segments:
277
+ start, end = seg['start'], seg['end']
278
+ width = end - start
279
+ label = seg['label'][:10] + '...' if len(seg['label']) > 10 else seg['label']
280
+ ax_gt.add_patch(patches.Rectangle(
281
+ (start, 0.3), width, 0.4, facecolor=VIS_CONFIG['gt_color'],
282
+ edgecolor='black', alpha=0.8
283
+ ))
284
+ ax_gt.text((start + end) / 2, 0.5, label, ha='center', va='center',
285
+ fontsize=VIS_CONFIG['fontsize_label'], color='white')
286
+ ax_gt.text(start, 0.2, f"{start:.1f}", ha='center', fontsize=8, color='black')
287
+ ax_gt.text(end, 0.2, f"{end:.1f}", ha='center', fontsize=8, color='black')
288
+
289
+ # Plot prediction bar
290
+ ax_pred = fig.add_subplot(gs[2, :])
291
+ ax_pred.set_xlim(0, duration)
292
+ ax_pred.set_ylim(0, 1)
293
+ ax_pred.axis('off')
294
+ ax_pred.text(-0.02 * duration, 0.5, "Prediction", fontsize=VIS_CONFIG['fontsize_title'],
295
+ va='center', ha='right', weight='bold')
296
+
297
+ for seg in pred_segments:
298
+ start, end = seg['start'], seg['end']
299
+ width = end - start
300
+ label = seg['label'][:10] + '...' if len(seg['label']) > 10 else seg['label']
301
+ ax_pred.add_patch(patches.Rectangle(
302
+ (start, 0.3), width, 0.4, facecolor=VIS_CONFIG['pred_color'],
303
+ edgecolor='black', alpha=0.8
304
+ ))
305
+ ax_pred.text((start + end) / 2, 0.5, label, ha='center', va='center',
306
+ fontsize=VIS_CONFIG['fontsize_label'], color='white')
307
+ ax_pred.text(start, 0.8, f"{start:.1f}", ha='center', fontsize=8, color='black')
308
+ ax_pred.text(end, 0.8, f"{end:.1f}", ha='center', fontsize=8, color='black')
309
+
310
+ # Save plot
311
+ jpg_path = os.path.join(save_dir, f"viz_{video_id}_{opt['exp']}.png") # Use PNG
312
+ plt.savefig(jpg_path, dpi=100, bbox_inches='tight') # Lower DPI
313
+ print(f"[✅ Saved Visualization]: {jpg_path}")
314
+ plt.close()
315
+
316
+
317
+
318
+ def train_one_epoch(opt, model, train_dataset, optimizer, warmup=False):
319
+ train_loader = torch.utils.data.DataLoader(train_dataset,
320
+ batch_size=opt['batch_size'], shuffle=True,
321
+ num_workers=0, pin_memory=True, drop_last=False)
322
+ epoch_cost = 0
323
+ epoch_cost_cls = 0
324
+ epoch_cost_reg = 0
325
+ epoch_cost_snip = 0
326
+
327
+ total_iter = len(train_dataset) // opt['batch_size']
328
+ cls_loss = MultiCrossEntropyLoss(focal=True)
329
+ snip_loss = MultiCrossEntropyLoss(focal=True)
330
+ for n_iter, (input_data, cls_label, reg_label, snip_label) in enumerate(tqdm(train_loader)):
331
+ if warmup:
332
+ for g in optimizer.param_groups:
333
+ g['lr'] = n_iter * (opt['lr']) / total_iter
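+ # (Warmup ramps the learning rate linearly from 0 up to opt['lr'] over the iterations of the first epoch.)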
334
+
335
+ act_cls, act_reg, snip_cls = model(input_data.float().cuda())
336
+
337
+ act_cls.register_hook(partial(cls_loss.collect_grad, cls_label))
338
+ snip_cls.register_hook(partial(snip_loss.collect_grad, snip_label))
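+ # These backward hooks hand the gradients flowing into the logits to each
+ # MultiCrossEntropyLoss via collect_grad; presumably this drives its focal
+ # re-weighting (see loss_func.py for the exact behavior).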
339
+
340
+ cost_reg = 0
341
+ cost_cls = 0
342
+
343
+ loss = cls_loss_func_(cls_loss, cls_label, act_cls)
344
+ cost_cls = loss
345
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
346
+
347
+ loss = regress_loss_func(reg_label, act_reg)
348
+ cost_reg = loss
349
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
350
+
351
+ loss = cls_loss_func_(snip_loss, snip_label, snip_cls)
352
+ cost_snip = loss
353
+ epoch_cost_snip += cost_snip.detach().cpu().numpy()
354
+
355
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg + opt['gamma'] * cost_snip
356
+ epoch_cost += cost.detach().cpu().numpy()
357
+
358
+ optimizer.zero_grad()
359
+ cost.backward()
360
+ optimizer.step()
361
+
362
+ return n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip
363
+
364
+ def eval_one_epoch(opt, model, test_dataset):
365
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, test_dataset)
366
+
367
+ result_dict = eval_map_nms(opt, test_dataset, output_cls, output_reg, labels_cls, labels_reg)
368
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
369
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
370
+ json.dump(output_dict, outfile, indent=2)
371
+ outfile.close()
372
+
373
+ IoUmAP = evaluation_detection(opt, verbose=False)
374
+ IoUmAP_5 = sum(IoUmAP[0:]) / len(IoUmAP[0:])
375
+
376
+ return cls_loss, reg_loss, tot_loss, IoUmAP_5
377
+
378
+ def train(opt):
379
+ writer = SummaryWriter()
380
+ model = MYNET(opt).cuda()
381
+
382
+ rest_of_model_params = [param for name, param in model.named_parameters() if "history_unit" not in name]
383
+ optimizer = optim.Adam([{'params': model.history_unit.parameters(), 'lr': 1e-6}, {'params': rest_of_model_params}], lr=opt["lr"], weight_decay=opt["weight_decay"])
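+ # Two parameter groups: the history unit uses a small fixed learning rate (1e-6), the rest of the model uses opt['lr'].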
384
+ scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt["lr_step"])
385
+
386
+ train_dataset = VideoDataSet(opt, subset="train")
387
+ test_dataset = VideoDataSet(opt, subset=opt['inference_subset'])
388
+
389
+ warmup = False
390
+
391
+ for n_epoch in range(opt['epoch']):
392
+ if n_epoch >= 1:
393
+ warmup = False
394
+
395
+ n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip = train_one_epoch(opt, model, train_dataset, optimizer, warmup)
396
+
397
+ writer.add_scalars('data/cost', {'train': epoch_cost / (n_iter + 1)}, n_epoch)
398
+ print("training loss(epoch %d): %.03f, cls - %f, reg - %f, snip - %f, lr - %f" % (n_epoch,
399
+ epoch_cost / (n_iter + 1),
400
+ epoch_cost_cls / (n_iter + 1),
401
+ epoch_cost_reg / (n_iter + 1),
402
+ epoch_cost_snip / (n_iter + 1),
403
+ optimizer.param_groups[-1]["lr"]))
404
+
405
+ scheduler.step()
406
+ model.eval()
407
+
408
+ cls_loss, reg_loss, tot_loss, IoUmAP_5 = eval_one_epoch(opt, model, test_dataset)
409
+
410
+ writer.add_scalars('data/mAP', {'test': IoUmAP_5}, n_epoch)
411
+ print("testing loss(epoch %d): %.03f, cls - %f, reg - %f, mAP Avg - %f" % (n_epoch, tot_loss, cls_loss, reg_loss, IoUmAP_5))
412
+
413
+ state = {'epoch': n_epoch + 1, 'state_dict': model.state_dict()}
414
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_checkpoint_" + str(n_epoch + 1) + ".pth.tar")
415
+ if IoUmAP_5 > model.best_map:
416
+ model.best_map = IoUmAP_5
417
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_ckp_best.pth.tar")
418
+
419
+ model.train()
420
+
421
+ writer.close()
422
+ return model.best_map
423
+
424
+ def eval_frame(opt, model, dataset):
425
+ test_loader = torch.utils.data.DataLoader(dataset,
426
+ batch_size=opt['batch_size'], shuffle=False,
427
+ num_workers=0, pin_memory=True, drop_last=False)
428
+
429
+ labels_cls = {}
430
+ labels_reg = {}
431
+ output_cls = {}
432
+ output_reg = {}
433
+ for video_name in dataset.video_list:
434
+ labels_cls[video_name] = []
435
+ labels_reg[video_name] = []
436
+ output_cls[video_name] = []
437
+ output_reg[video_name] = []
438
+
439
+ start_time = time.time()
440
+ total_frames = 0
441
+ epoch_cost = 0
442
+ epoch_cost_cls = 0
443
+ epoch_cost_reg = 0
444
+
445
+ for n_iter, (input_data, cls_label, reg_label, _) in enumerate(tqdm(test_loader)):
446
+ act_cls, act_reg, _ = model(input_data.float().cuda())
447
+ cost_reg = 0
448
+ cost_cls = 0
449
+
450
+ loss = cls_loss_func(cls_label, act_cls)
451
+ cost_cls = loss
452
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
453
+
454
+ loss = regress_loss_func(reg_label, act_reg)
455
+ cost_reg = loss
456
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
457
+
458
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg
459
+ epoch_cost += cost.detach().cpu().numpy()
460
+
461
+ act_cls = torch.softmax(act_cls, dim=-1)
462
+
463
+ total_frames += input_data.size(0)
464
+
465
+ for b in range(0, input_data.size(0)):
466
+ video_name, st, ed, data_idx = dataset.inputs[n_iter * opt['batch_size'] + b]
467
+ output_cls[video_name] += [act_cls[b, :].detach().cpu().numpy()]
468
+ output_reg[video_name] += [act_reg[b, :].detach().cpu().numpy()]
469
+ labels_cls[video_name] += [cls_label[b, :].numpy()]
470
+ labels_reg[video_name] += [reg_label[b, :].numpy()]
471
+
472
+ end_time = time.time()
473
+ working_time = end_time - start_time
474
+
475
+ for video_name in dataset.video_list:
476
+ labels_cls[video_name] = np.stack(labels_cls[video_name], axis=0)
477
+ labels_reg[video_name] = np.stack(labels_reg[video_name], axis=0)
478
+ output_cls[video_name] = np.stack(output_cls[video_name], axis=0)
479
+ output_reg[video_name] = np.stack(output_reg[video_name], axis=0)
480
+
481
+ cls_loss = epoch_cost_cls / n_iter
482
+ reg_loss = epoch_cost_reg / n_iter
483
+ tot_loss = epoch_cost / n_iter
484
+
485
+ return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames
486
+
487
+ def eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
488
+ result_dict = {}
489
+ proposal_dict = []
490
+
491
+ num_class = opt["num_of_class"]
492
+ unit_size = opt['segment_size']
493
+ threshold = opt['threshold']
494
+ anchors = opt['anchors']
495
+
496
+ for video_name in dataset.video_list:
497
+ duration = dataset.video_len[video_name]
498
+ video_time = float(dataset.video_dict[video_name]["duration"])
499
+ frame_to_time = 100.0 * video_time / duration
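+ # Scale factor for converting feature-frame indices to seconds; applied below as idx * frame_to_time / 100.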
500
+
501
+ for idx in range(0, duration):
502
+ cls_anc = output_cls[video_name][idx]
503
+ reg_anc = output_reg[video_name][idx]
504
+
505
+ proposal_anc_dict = []
506
+ for anc_idx in range(0, len(anchors)):
507
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
508
+
509
+ if len(cls) == 0:
510
+ continue
511
+
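+ # Decode the anchor regression: reg[0] shifts the segment end relative to the current frame index,
+ # reg[1] rescales the anchor length log-linearly (length = anchor * exp(reg[1])), and start = end - length.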
512
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
513
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
514
+ st = ed - length
515
+
516
+ for cidx in range(0, len(cls)):
517
+ label = cls[cidx]
518
+ tmp_dict = {}
519
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
520
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
521
+ tmp_dict["label"] = dataset.label_name[label]
522
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
523
+ proposal_anc_dict.append(tmp_dict)
524
+
525
+ proposal_dict += proposal_anc_dict
526
+
527
+ proposal_dict = non_max_suppression(proposal_dict, overlapThresh=opt['soft_nms'])
528
+ result_dict[video_name] = proposal_dict
529
+ proposal_dict = []
530
+
531
+ return result_dict
532
+
533
+ def eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
534
+ model = SuppressNet(opt).cuda()
535
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
536
+ base_dict = checkpoint['state_dict']
537
+ model.load_state_dict(base_dict)
538
+ model.eval()
539
+
540
+ result_dict = {}
541
+ proposal_dict = []
542
+
543
+ num_class = opt["num_of_class"]
544
+ unit_size = opt['segment_size']
545
+ threshold = opt['threshold']
546
+ anchors = opt['anchors']
547
+
548
+ for video_name in dataset.video_list:
549
+ duration = dataset.video_len[video_name]
550
+ video_time = float(dataset.video_dict[video_name]["duration"])
551
+ frame_to_time = 100.0 * video_time / duration
552
+ conf_queue = torch.zeros((unit_size, num_class - 1))
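+ # Rolling window of per-class proposal confidences over the last `segment_size` frames; it is shifted
+ # every frame and fed to SuppressNet, whose output gates which proposals are kept.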
553
+
554
+ for idx in range(0, duration):
555
+ cls_anc = output_cls[video_name][idx]
556
+ reg_anc = output_reg[video_name][idx]
557
+
558
+ proposal_anc_dict = []
559
+ for anc_idx in range(0, len(anchors)):
560
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
561
+
562
+ if len(cls) == 0:
563
+ continue
564
+
565
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
566
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
567
+ st = ed - length
568
+
569
+ for cidx in range(0, len(cls)):
570
+ label = cls[cidx]
571
+ tmp_dict = {}
572
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
573
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
574
+ tmp_dict["label"] = dataset.label_name[label]
575
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
576
+ proposal_anc_dict.append(tmp_dict)
577
+
578
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
579
+
580
+ conf_queue[:-1, :] = conf_queue[1:, :].clone()
581
+ conf_queue[-1, :] = 0
582
+ for proposal in proposal_anc_dict:
583
+ cls_idx = dataset.label_name.index(proposal['label'])
584
+ conf_queue[-1, cls_idx] = proposal["score"]
585
+
586
+ minput = conf_queue.unsqueeze(0)
587
+ suppress_conf = model(minput.cuda())
588
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
589
+
590
+ for cls in range(0, num_class - 1):
591
+ if suppress_conf[cls] > opt['sup_threshold']:
592
+ for proposal in proposal_anc_dict:
593
+ if proposal['label'] == dataset.label_name[cls]:
594
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
595
+ proposal_dict.append(proposal)
596
+
597
+ result_dict[video_name] = proposal_dict
598
+ proposal_dict = []
599
+
600
+ return result_dict
601
+
602
+ def test_frame(opt, video_name=None):
603
+ model = MYNET(opt).cuda()
604
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
605
+ base_dict = checkpoint['state_dict']
606
+ model.load_state_dict(base_dict)
607
+ model.eval()
608
+
609
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
610
+ outfile = h5py.File(opt['frame_result_file'].format(opt['exp']), 'w')
611
+
612
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
613
+
614
+ print("testing loss: %f, cls_loss: %f, reg_loss: %f" % (tot_loss, cls_loss, reg_loss))
615
+
616
+ for video_name in dataset.video_list:
617
+ o_cls = output_cls[video_name]
618
+ o_reg = output_reg[video_name]
619
+ l_cls = labels_cls[video_name]
620
+ l_reg = labels_reg[video_name]
621
+
622
+ dset_predcls = outfile.create_dataset(video_name + '/pred_cls', o_cls.shape, maxshape=o_cls.shape, chunks=True, dtype=np.float32)
623
+ dset_predcls[:, :] = o_cls[:, :]
624
+ dset_predreg = outfile.create_dataset(video_name + '/pred_reg', o_reg.shape, maxshape=o_reg.shape, chunks=True, dtype=np.float32)
625
+ dset_predreg[:, :] = o_reg[:, :]
626
+ dset_labelcls = outfile.create_dataset(video_name + '/label_cls', l_cls.shape, maxshape=l_cls.shape, chunks=True, dtype=np.float32)
627
+ dset_labelcls[:, :] = l_cls[:, :]
628
+ dset_labelreg = outfile.create_dataset(video_name + '/label_reg', l_reg.shape, maxshape=l_reg.shape, chunks=True, dtype=np.float32)
629
+ dset_labelreg[:, :] = l_reg[:, :]
630
+ outfile.close()
631
+
632
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
633
+ return cls_loss, reg_loss, tot_loss
634
+
635
+ def patch_attention(m):
636
+ forward_orig = m.forward
637
+
638
+ def wrap(*args, **kwargs):
639
+ kwargs["need_weights"] = True
640
+ kwargs["average_attn_weights"] = False
641
+ return forward_orig(*args, **kwargs)
642
+
643
+ m.forward = wrap
644
+
645
+ class SaveOutput:
646
+ def __init__(self):
647
+ self.outputs = []
648
+
649
+ def __call__(self, module, module_in, module_out):
650
+ self.outputs.append(module_out[1])
651
+
652
+ def clear(self):
653
+ self.outputs = []
654
+
655
+ def test(opt, video_name=None):
656
+ model = MYNET(opt).cuda()
657
+ checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "_ckp_best.pth.tar")
658
+ base_dict = checkpoint['state_dict']
659
+ model.load_state_dict(base_dict)
660
+ model.eval()
661
+
662
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
663
+
664
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
665
+
666
+ if opt["pptype"] == "nms":
667
+ result_dict = eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
668
+ if opt["pptype"] == "net":
669
+ result_dict = eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
670
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
671
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
672
+ json.dump(output_dict, outfile, indent=2)
673
+ outfile.close()
674
+
675
+ mAP = evaluation_detection(opt)
676
+
677
+ # Compare predicted and ground truth action lengths
678
+ if video_name:
679
+ print("\nComparing Predicted and Ground Truth Action Lengths for Video:", video_name)
680
+ with open(opt["video_anno"].format(opt["split"]), 'r') as f:
681
+ anno_data = json.load(f)
682
+ gt_annotations = anno_data['database'][video_name]['annotations']
683
+ duration = anno_data['database'][video_name]['duration']
684
+
685
+ gt_segments = []
686
+ for anno in gt_annotations:
687
+ start, end = anno['segment']
688
+ label = anno['label']
689
+ duration_seg = end - start
690
+ gt_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration_seg})
691
+
692
+ pred_segments = []
693
+ for pred in result_dict[video_name]:
694
+ start, end = pred['segment']
695
+ label = pred['label']
696
+ score = pred['score']
697
+ duration_seg = end - start
698
+ pred_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration_seg, 'score': score})
699
+
700
+ # Print comparison table
701
+ matches = []
702
+ iou_threshold = VIS_CONFIG['iou_threshold']
703
+ used_gt_indices = set()
704
+ for pred in pred_segments:
705
+ best_iou = 0
706
+ best_gt_idx = None
707
+ for gt_idx, gt in enumerate(gt_segments):
708
+ if gt_idx in used_gt_indices:
709
+ continue
710
+ iou = calc_iou([pred['end'], pred['duration']], [gt['end'], gt['duration']])
711
+ if iou > best_iou and iou >= iou_threshold:
712
+ best_iou = iou
713
+ best_gt_idx = gt_idx
714
+ if best_gt_idx is not None:
715
+ matches.append({
716
+ 'pred': pred,
717
+ 'gt': gt_segments[best_gt_idx],
718
+ 'iou': best_iou
719
+ })
720
+ used_gt_indices.add(best_gt_idx)
721
+ else:
722
+ matches.append({'pred': pred, 'gt': None, 'iou': 0})
723
+
724
+ for gt_idx, gt in enumerate(gt_segments):
725
+ if gt_idx not in used_gt_indices:
726
+ matches.append({'pred': None, 'gt': gt, 'iou': 0})
727
+
728
+ print("\n{:<20} {:<30} {:<30} {:<15} {:<10}".format(
729
+ "Action Label", "Predicted Segment (s)", "Ground Truth Segment (s)", "Duration Diff (s)", "IoU"))
730
+ print("-" * 105)
731
+ for match in matches:
732
+ pred = match['pred']
733
+ gt = match['gt']
734
+ iou = match['iou']
735
+ if pred and gt:
736
+ label = pred['label'] if pred['label'] == gt['label'] else f"{pred['label']} (GT: {gt['label']})"
737
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
738
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
739
+ duration_diff = pred['duration'] - gt['duration']
740
+ print("{:<20} {:<30} {:<30} {:<15.2f} {:<10.2f}".format(
741
+ label, pred_str, gt_str, duration_diff, iou))
742
+ elif pred:
743
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
744
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
745
+ pred['label'], pred_str, "None", "N/A", iou))
746
+ elif gt:
747
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
748
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
749
+ gt['label'], "None", gt_str, "N/A", iou))
750
+
751
+ # Summarize
752
+ matched_count = sum(1 for m in matches if m['pred'] and m['gt'])
753
+ avg_duration_diff = np.mean([m['pred']['duration'] - m['gt']['duration'] for m in matches if m['pred'] and m['gt']]) if matched_count > 0 else 0
754
+ avg_iou = np.mean([m['iou'] for m in matches if m['iou'] > 0]) if any(m['iou'] > 0 for m in matches) else 0
755
+ print(f"\nSummary:")
756
+ print(f"- Total Predictions: {len(pred_segments)}")
757
+ print(f"- Total Ground Truth: {len(gt_segments)}")
758
+ print(f"- Matched Segments: {matched_count}")
759
+ print(f"- Average Duration Difference (Matched): {avg_duration_diff:.2f}s")
760
+ print(f"- Average IoU (Matched): {avg_iou:.2f}")
761
+
762
+ # Generate static visualization
763
+ video_path = opt.get('video_path', '')
764
+ if os.path.exists(video_path):
765
+ visualize_action_lengths(
766
+ video_id=video_name,
767
+ pred_segments=pred_segments,
768
+ gt_segments=gt_segments,
769
+ video_path=video_path,
770
+ duration=duration
771
+ )
772
+ # Generate annotated video
773
+ annotate_video_with_actions(
774
+ video_id=video_name,
775
+ pred_segments=pred_segments,
776
+ gt_segments=gt_segments,
777
+ video_path=video_path
778
+ )
779
+ else:
780
+ print(f"Warning: Video path {video_path} not found. Skipping visualization and video annotation.")
781
+
782
+ return mAP
783
+
784
+ def test_online(opt, video_name=None):
785
+ model = MYNET(opt).cuda()
786
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
787
+ base_dict = checkpoint['state_dict']
788
+ model.load_state_dict(base_dict)
789
+ model.eval()
790
+
791
+ sup_model = SuppressNet(opt).cuda()
792
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
793
+ base_dict = checkpoint['state_dict']
794
+ sup_model.load_state_dict(base_dict)
795
+ sup_model.eval()
796
+
797
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
798
+ test_loader = torch.utils.data.DataLoader(dataset,
799
+ batch_size=1, shuffle=False,
800
+ num_workers=0, pin_memory=True, drop_last=False)
801
+
802
+ result_dict = {}
803
+ proposal_dict = []
804
+
805
+ num_class = opt["num_of_class"]
806
+ unit_size = opt['segment_size']
807
+ threshold = opt['threshold']
808
+ anchors = opt['anchors']
809
+
810
+ start_time = time.time()
811
+ total_frames = 0
812
+
813
+ for video_name in dataset.video_list:
814
+ input_queue = torch.zeros((unit_size, opt['feat_dim']))
815
+ sup_queue = torch.zeros(((unit_size, num_class - 1)))
816
+
817
+ duration = dataset.video_len[video_name]
818
+ video_time = float(dataset.video_dict[video_name]["duration"])
819
+ frame_to_time = 100.0 * video_time / duration
820
+
821
+ for idx in range(0, duration):
822
+ total_frames += 1
823
+ input_queue[:-1, :] = input_queue[1:, :].clone()
824
+ input_queue[-1:, :] = dataset._get_base_data(video_name, idx, idx + 1)
825
+
826
+ minput = input_queue.unsqueeze(0)
827
+ act_cls, act_reg, _ = model(minput.cuda())
828
+ act_cls = torch.softmax(act_cls, dim=-1)
829
+
830
+ cls_anc = act_cls.squeeze(0).detach().cpu().numpy()
831
+ reg_anc = act_reg.squeeze(0).detach().cpu().numpy()
832
+
833
+ proposal_anc_dict = []
834
+ for anc_idx in range(0, len(anchors)):
835
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
836
+
837
+ if len(cls) == 0:
838
+ continue
839
+
840
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
841
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
842
+ st = ed - length
843
+
844
+ for cidx in range(0, len(cls)):
845
+ label = cls[cidx]
846
+ tmp_dict = {}
847
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
848
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
849
+ tmp_dict["label"] = dataset.label_name[label]
850
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
851
+ proposal_anc_dict.append(tmp_dict)
852
+
853
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
854
+
855
+ sup_queue[:-1, :] = sup_queue[1:, :].clone()
856
+ sup_queue[-1, :] = 0
857
+ for proposal in proposal_anc_dict:
858
+ cls_idx = dataset.label_name.index(proposal['label'])
859
+ sup_queue[-1, cls_idx] = proposal["score"]
860
+
861
+ minput = sup_queue.unsqueeze(0)
862
+ suppress_conf = sup_model(minput.cuda())
863
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
864
+
865
+ for cls in range(0, num_class - 1):
866
+ if suppress_conf[cls] > opt['sup_threshold']:
867
+ for proposal in proposal_anc_dict:
868
+ if proposal['label'] == dataset.label_name[cls]:
869
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
870
+ proposal_dict.append(proposal)
871
+
872
+ result_dict[video_name] = proposal_dict
873
+ proposal_dict = []
874
+
875
+ end_time = time.time()
876
+ working_time = end_time - start_time
877
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
878
+
879
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
880
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
881
+ json.dump(output_dict, outfile, indent=2)
882
+ outfile.close()
883
+
884
+ mAP = evaluation_detection(opt)
885
+ return mAP
886
+
887
+ def main(opt, video_name=None):
888
+ max_perf = 0
889
+ if not video_name and 'video_name' in opt:
890
+ video_name = opt['video_name']
891
+
892
+ if opt['mode'] == 'train':
893
+ max_perf = train(opt)
894
+ if opt['mode'] == 'test':
895
+ max_perf = test(opt, video_name=video_name)
896
+ if opt['mode'] == 'test_frame':
897
+ max_perf = test_frame(opt, video_name=video_name)
898
+ if opt['mode'] == 'test_online':
899
+ max_perf = test_online(opt, video_name=video_name)
900
+ if opt['mode'] == 'eval':
901
+ max_perf = evaluation_detection(opt)
902
+
903
+ return max_perf
904
+
905
+ if __name__ == '__main__':
906
+ opt = opts.parse_opt()
907
+ opt = vars(opt)
908
+ if not os.path.exists(opt["checkpoint_path"]):
909
+ os.makedirs(opt["checkpoint_path"])
910
+ opt_file = open(opt["checkpoint_path"] + "/" + opt["exp"] + "_opts.json", "w")
911
+ json.dump(opt, opt_file)
912
+ opt_file.close()
913
+
914
+ if opt['seed'] >= 0:
915
+ seed = opt['seed']
916
+ torch.manual_seed(seed)
917
+ np.random.seed(seed)
918
+
919
+ opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]
920
+
921
+ video_name = opt.get('video_name', None)
922
+ main(opt, video_name=video_name)
923
+ while(opt['wterm']):
924
+ pass
annotated video with bar main.py ADDED
The diff for this file is too large to render. See raw diff
 
dataset.py ADDED
@@ -0,0 +1,533 @@
1
+ import numpy as np
2
+ import h5py
3
+ import json
4
+ import torch
5
+ import torch.utils.data as data
6
+ import os
7
+ import pickle
8
+ from multiprocessing import Pool
9
+
10
+ def load_json(file):
11
+ with open(file) as json_file:
12
+ data = json.load(json_file)
13
+ return data
14
+
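+ # calc_iou: segments are encoded as [end, length] pairs (e.g. [10.0, 4.0] spans 6.0-10.0).
+ # Returns intersection over union of the two spans, with the union length floored at 1;
+ # the value goes negative when the segments do not overlap.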
15
+ def calc_iou(a, b):
16
+ st = a[0] - a[1]
17
+ ed = a[0]
18
+ target_st = b[0] - b[1]
19
+ target_ed = b[0]
20
+ sst = min(st, target_st)
21
+ led = max(ed, target_ed)
22
+ lst = max(st, target_st)
23
+ sed = min(ed, target_ed)
24
+ iou = (sed - lst) / max(led - sst, 1)
25
+ return iou
26
+
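+ # box_include: returns True when the target segment strictly contains y
+ # (both segments given as [end, length] pairs), i.e. the candidate lies entirely inside the target action.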
27
+ def box_include(y, target):
28
+ st = y[0] - y[1]
29
+ ed = y[0]
30
+ target_st = target[0] - target[1]
31
+ target_ed = target[0]
32
+ detection_point = target_st
33
+ if ed > detection_point and target_st < st and target_ed > ed:
34
+ return True
35
+ return False
36
+
37
+ class VideoDataSet(data.Dataset):
38
+ def __init__(self, opt, subset="train", video_name=None):
39
+ self.subset = subset
40
+ self.mode = opt["mode"]
41
+ self.predefined_fps = opt["predefined_fps"]
42
+ self.video_anno_path = opt["video_anno"].format(opt["split"])
43
+ self.video_len_path = opt["video_len_file"].format(self.subset + '_' + opt["setup"])
44
+ self.num_of_class = opt["num_of_class"]
45
+ self.segment_size = opt["segment_size"]
46
+ self.label_name = []
47
+ self.match_score = {}
48
+ self.match_score_end = {}
49
+ self.match_length = {}
50
+ self.gt_action = {}
51
+ self.cls_label = {}
52
+ self.reg_label = {}
53
+ self.snip_label = {}
54
+ self.inputs = []
55
+ self.inputs_all = []
56
+ self.data_rescale = opt["data_rescale"]
57
+ self.anchors = opt["anchors"]
58
+ self.pos_threshold = opt["pos_threshold"]
59
+ self.single_video_name = video_name
60
+
61
+ self._getDatasetDict()
62
+ self._loadFeaturelen(opt)
63
+ self._getMatchScore()
64
+ self._makeInputSeq()
65
+ self._loadPropLabel(opt['proposal_label_file'].format(self.subset + '_' + opt["setup"]))
66
+
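+ # Pre-extracted features can be stored in several formats (h5, pickle, npz, npz_i3d, pt); each branch
+ # below loads the per-video RGB features and, where available, the flow features for the split.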
67
+ if self.subset == "train":
68
+ if opt['data_format'] == "h5":
69
+ feature_rgb_file = h5py.File(opt["video_feature_rgb_train"], 'r')
70
+ self.feature_rgb_file = {}
71
+ keys = self.video_list
72
+ for vidx in range(len(keys)):
73
+ if keys[vidx] not in feature_rgb_file:
74
+ raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_rgb_train']}")
75
+ self.feature_rgb_file[keys[vidx]] = np.array(feature_rgb_file[keys[vidx]][:])
76
+ if opt['rgb_only']:
77
+ self.feature_flow_file = None
78
+ else:
79
+ self.feature_flow_file = {}
80
+ feature_flow_file = h5py.File(opt["video_feature_flow_train"], 'r')
81
+ for vidx in range(len(keys)):
82
+ if keys[vidx] not in feature_flow_file:
83
+ raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_flow_train']}")
84
+ self.feature_flow_file[keys[vidx]] = np.array(feature_flow_file[keys[vidx]][:])
85
+ elif opt['data_format'] == "pickle":
86
+ feature_All = pickle.load(open(opt["video_feature_all_train"], 'rb'))
87
+ self.feature_rgb_file = {}
88
+ self.feature_flow_file = {}
89
+ keys = self.video_list
90
+ for vidx in range(len(keys)):
91
+ if keys[vidx] not in feature_All:
92
+ raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_all_train']}")
93
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
94
+ self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
95
+ elif opt['data_format'] == "npz":
96
+ feature_All = {}
97
+ self.feature_rgb_file = {}
98
+ self.feature_flow_file = {}
99
+ for file in self.video_list:
100
+ feature_path = opt["video_feature_all_train"] + file + '.npz'
101
+ if not os.path.exists(feature_path):
102
+ raise ValueError(f"Feature file {feature_path} not found")
103
+ feature_All[file] = np.load(feature_path)['feats']
104
+ keys = self.video_list
105
+ for vidx in range(len(keys)):
106
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
107
+ self.feature_flow_file = None
108
+ elif opt['data_format'] == "npz_i3d":
109
+ feature_All = {}
110
+ self.feature_rgb_file = {}
111
+ self.feature_flow_file = {}
112
+ for file in self.video_list:
113
+ feature_path = opt["video_feature_all_train"] + file + '.npz'
114
+ if not os.path.exists(feature_path):
115
+ raise ValueError(f"Feature file {feature_path} not found")
116
+ feature_All[file] = np.load(feature_path)
117
+ keys = self.video_list
118
+ for vidx in range(len(keys)):
119
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
120
+ self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
121
+ elif opt['data_format'] == "pt":
122
+ feature_All = {}
123
+ self.feature_rgb_file = {}
124
+ self.feature_flow_file = {}
125
+ for file in self.video_list:
126
+ feature_path = opt["video_feature_all_train"] + file + '.pt'
127
+ if not os.path.exists(feature_path):
128
+ raise ValueError(f"Feature file {feature_path} not found")
129
+ feature_All[file] = torch.load(feature_path)
130
+ keys = self.video_list
131
+ for vidx in range(len(keys)):
132
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
133
+ self.feature_flow_file = None
134
+ else:
135
+ if opt['data_format'] == "h5":
136
+ feature_rgb_file = h5py.File(opt["video_feature_rgb_test"], 'r')
137
+ self.feature_rgb_file = {}
138
+ keys = self.video_list
139
+ for vidx in range(len(keys)):
140
+ if keys[vidx] not in feature_rgb_file:
141
+ raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_rgb_test']}")
142
+ self.feature_rgb_file[keys[vidx]] = np.array(feature_rgb_file[keys[vidx]][:])
143
+ if opt['rgb_only']:
144
+ self.feature_flow_file = None
145
+ else:
146
+ self.feature_flow_file = {}
147
+ feature_flow_file = h5py.File(opt["video_feature_flow_test"], 'r')
148
+ for vidx in range(len(keys)):
149
+ if keys[vidx] not in feature_flow_file:
150
+ raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_flow_test']}")
151
+ self.feature_flow_file[keys[vidx]] = np.array(feature_flow_file[keys[vidx]][:])
152
+ elif opt['data_format'] == "pickle":
153
+ feature_All = pickle.load(open(opt["video_feature_all_test"], 'rb'))
154
+ self.feature_rgb_file = {}
155
+ self.feature_flow_file = {}
156
+ keys = self.video_list
157
+ for vidx in range(len(keys)):
158
+ if keys[vidx] not in feature_All:
159
+ raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_all_test']}")
160
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
161
+ self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
162
+ elif opt['data_format'] == "npz":
163
+ feature_All = {}
164
+ self.feature_rgb_file = {}
165
+ self.feature_flow_file = {}
166
+ for file in self.video_list:
167
+ feature_path = opt["video_feature_all_test"] + file + '.npz'
168
+ if not os.path.exists(feature_path):
169
+ raise ValueError(f"Feature file {feature_path} not found")
170
+ feature_All[file] = np.load(feature_path)['feats']
171
+ keys = self.video_list
172
+ for vidx in range(len(keys)):
173
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
174
+ self.feature_flow_file = None
175
+ elif opt['data_format'] == "npz_i3d":
176
+ feature_All = {}
177
+ self.feature_rgb_file = {}
178
+ self.feature_flow_file = {}
179
+ for file in self.video_list:
180
+ feature_path = opt["video_feature_all_test"] + file + '.npz'
181
+ if not os.path.exists(feature_path):
182
+ raise ValueError(f"Feature file {feature_path} not found")
183
+ feature_All[file] = np.load(feature_path)
184
+ keys = self.video_list
185
+ for vidx in range(len(keys)):
186
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
187
+ self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
188
+ elif opt['data_format'] == "pt":
189
+ feature_All = {}
190
+ self.feature_rgb_file = {}
191
+ self.feature_flow_file = {}
192
+ for file in self.video_list:
193
+ feature_path = opt["video_feature_all_test"] + file + '.pt'
194
+ if not os.path.exists(feature_path):
195
+ raise ValueError(f"Feature file {feature_path} not found")
196
+ feature_All[file] = torch.load(feature_path)
197
+ keys = self.video_list
198
+ for vidx in range(len(keys)):
199
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
200
+ self.feature_flow_file = None
201
+
202
+ def _loadFeaturelen(self, opt):
203
+ if os.path.exists(self.video_len_path):
204
+ self.video_len = load_json(self.video_len_path)
205
+ return
206
+
207
+ self.video_len = {}
208
+ if self.subset == "train":
209
+ if opt['data_format'] == "h5":
210
+ feature_file = h5py.File(opt["video_feature_rgb_train"], 'r')
211
+ elif opt['data_format'] == "pickle":
212
+ feature_file = pickle.load(open(opt["video_feature_all_train"], 'rb'))
213
+ elif opt['data_format'] == "npz":
214
+ feature_file = {}
215
+ for file in self.video_list:
216
+ feature_file[file] = np.load(opt["video_feature_all_train"] + file + '.npz')['feats']
217
+ elif opt['data_format'] == "npz_i3d":
218
+ feature_file = {}
219
+ for file in self.video_list:
220
+ feature_file[file] = np.load(opt["video_feature_all_train"] + file + '.npz')
221
+ elif opt['data_format'] == "pt":
222
+ feature_file = {}
223
+ for file in self.video_list:
224
+ feature_file[file] = torch.load(opt["video_feature_all_train"] + file + '.pt')
225
+ else:
226
+ if opt['data_format'] == "h5":
227
+ feature_file = h5py.File(opt["video_feature_rgb_test"], 'r')
228
+ elif opt['data_format'] == "pickle":
229
+ feature_file = pickle.load(open(opt["video_feature_all_test"], 'rb'))
230
+ elif opt['data_format'] == "npz":
231
+ feature_file = {}
232
+ for file in self.video_list:
233
+ feature_file[file] = np.load(opt["video_feature_all_test"] + file + '.npz')['feats']
234
+ elif opt['data_format'] == "npz_i3d":
235
+ feature_file = {}
236
+ for file in self.video_list:
237
+ feature_file[file] = np.load(opt["video_feature_all_test"] + file + '.npz')
238
+ elif opt['data_format'] == "pt":
239
+ feature_file = {}
240
+ for file in self.video_list:
241
+ feature_file[file] = torch.load(opt["video_feature_all_test"] + file + '.pt')
242
+
243
+ keys = self.video_list
244
+ if opt['data_format'] == "h5":
245
+ for vidx in range(len(keys)):
246
+ self.video_len[keys[vidx]] = len(feature_file[keys[vidx]])
247
+ elif opt['data_format'] == "pickle":
248
+ for vidx in range(len(keys)):
249
+ self.video_len[keys[vidx]] = len(feature_file[keys[vidx]]['rgb'])
250
+ elif opt['data_format'] == "npz":
251
+ for vidx in range(len(keys)):
252
+ self.video_len[keys[vidx]] = len(feature_file[keys[vidx]])
253
+ elif opt['data_format'] == "npz_i3d":
254
+ for vidx in range(len(keys)):
255
+ self.video_len[keys[vidx]] = len(feature_file[keys[vidx]]['rgb'])
256
+ elif opt['data_format'] == "pt":
257
+ for vidx in range(len(keys)):
258
+ self.video_len[keys[vidx]] = len(feature_file[keys[vidx]])
259
+ outfile = open(self.video_len_path, "w")
260
+ json.dump(self.video_len, outfile, indent=2)
261
+ outfile.close()
262
+
263
+ def _getDatasetDict(self):
264
+ anno_database = load_json(self.video_anno_path)
265
+ anno_database = anno_database['database']
266
+ self.video_dict = {}
267
+ if self.single_video_name:
268
+ if self.single_video_name in anno_database:
269
+ video_info = anno_database[self.single_video_name]
270
+ video_subset = video_info['subset']
271
+ if self.subset == "full" or self.subset in video_subset:
272
+ self.video_dict[self.single_video_name] = video_info
273
+ for seg in video_info['annotations']:
274
+ if not seg['label'] in self.label_name:
275
+ self.label_name.append(seg['label'])
276
+ else:
277
+ raise ValueError(f"Video {self.single_video_name} not found in annotation database")
278
+ else:
279
+ for video_name in anno_database:
280
+ video_info = anno_database[video_name]
281
+ video_subset = anno_database[video_name]['subset']
282
+ if self.subset == "full" or self.subset in video_subset:
283
+ self.video_dict[video_name] = video_info
284
+ for seg in video_info['annotations']:
285
+ if not seg['label'] in self.label_name:
286
+ self.label_name.append(seg['label'])
287
+
288
+ # Ensure all 22 EGTEA action classes are included
289
+ expected_labels = [
290
+ 'Clean/Wipe', 'Close', 'Compress', 'Crack', 'Cut', 'Divide/Pull Apart',
291
+ 'Dry', 'Inspect/Read', 'Mix', 'Move Around', 'Open', 'Operate', 'Other',
292
+ 'Pour', 'Put', 'Squeeze', 'Take', 'Transfer', 'Turn off', 'Turn on', 'Wash',
293
+ 'Spread' # Assumed missing label; replace with actual label if known
294
+ ]
295
+ for label in expected_labels:
296
+ if label not in self.label_name:
297
+ self.label_name.append(label)
298
+
299
+ self.label_name.sort()
300
+ self.video_list = list(self.video_dict.keys())
301
+ print(f"Labels in dataset.label_name: {self.label_name}")
302
+ print(f"Number of labels: {len(self.label_name)}, Expected: {self.num_of_class-1}")
303
+ print(f"{self.subset} subset video numbers: {len(self.video_list)}")
304
+
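Note: _getDatasetDict above reads an ActivityNet-style annotation file. A minimal sketch of the expected JSON layout, with a hypothetical video id and segment times in seconds:

    import json

    anno = {
        "database": {
            "VIDEO_0001": {
                "subset": "train",
                "duration": 843.2,
                "annotations": [
                    {"segment": [12.4, 15.9], "label": "Take"},
                    {"segment": [16.0, 21.3], "label": "Cut"},
                ],
            }
        }
    }
    json.dump(anno, open("annotations_sketch.json", "w"), indent=2)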
305
+ def _getMatchScore(self):
306
+ self.action_end_count = torch.zeros(2)
307
+ for index in range(0, len(self.video_list)):
308
+ video_name = self.video_list[index]
309
+ video_info = self.video_dict[video_name]
310
+ video_labels = video_info['annotations']
311
+ gt_bbox = []
312
+ gt_edlen = []
313
+
314
+ second_to_frame = self.video_len[video_name] / float(video_info['duration'])
315
+ for j in range(len(video_labels)):
316
+ tmp_info = video_labels[j]
317
+ tmp_start = tmp_info['segment'][0] * second_to_frame
318
+ tmp_end = tmp_info['segment'][1] * second_to_frame
319
+ tmp_label = self.label_name.index(tmp_info['label'])
320
+ gt_bbox.append([tmp_start, tmp_end, tmp_label])
321
+ gt_edlen.append([gt_bbox[-1][1], gt_bbox[-1][1] - gt_bbox[-1][0], tmp_label])
322
+
323
+ gt_bbox = np.array(gt_bbox)
324
+ gt_edlen = np.array(gt_edlen)
325
+ self.gt_action[video_name] = gt_edlen
326
+
327
+ match_score = np.zeros((self.video_len[video_name], self.num_of_class - 1), dtype=np.float32)
328
+ for idx in range(gt_bbox.shape[0]):
329
+ ed = int(gt_bbox[idx, 1]) + 1
330
+ st = int(gt_bbox[idx, 0])
331
+ match_score[st:ed, int(gt_bbox[idx, 2])] = idx + 1
332
+ self.match_score[video_name] = match_score
333
+
334
+ def _makeInputSeq(self):
335
+ data_idx = 0
336
+ for index in range(0, len(self.video_list)):
337
+ video_name = self.video_list[index]
338
+ duration = self.match_score[video_name].shape[0]
339
+ for i in range(1, duration + 1):
340
+ st = i - self.segment_size
341
+ ed = i
342
+ self.inputs_all.append([video_name, st, ed, data_idx])
343
+ data_idx += 1
344
+
345
+ self.inputs = self.inputs_all.copy()
346
+ print(f"{self.subset} subset seg numbers: {len(self.inputs)}")
347
+
348
+ def _makePropLabelUnit(self, i):
349
+ video_name = self.inputs_all[i][0]
350
+ st = self.inputs_all[i][1]
351
+ ed = self.inputs_all[i][2]
352
+ cls_anc = []
353
+ reg_anc = []
354
+
355
+ for j in range(0, len(self.anchors)):
356
+ v1 = np.zeros(self.num_of_class)
357
+ v1[-1] = 1
358
+ v2 = np.zeros(2)
359
+ v2[-1] = -1e3
360
+ y_box = [ed - 1, self.anchors[j]]
361
+
362
+ subset_label = self._get_train_label_with_class(video_name, ed - self.anchors[j], ed)
363
+ idx_list = []
364
+ for ii in range(0, subset_label.shape[0]):
365
+ for jj in range(0, subset_label.shape[1]):
366
+ idx = int(subset_label[ii, jj])
367
+ if idx > 0 and idx - 1 not in idx_list:
368
+ idx_list.append(idx - 1)
369
+
370
+ for idx in idx_list:
371
+ target_box = self.gt_action[video_name][idx]
372
+ cls = int(target_box[2])
373
+ iou = calc_iou(y_box, target_box)
374
+ if iou >= self.pos_threshold or (j == len(self.anchors) - 1 and box_include(y_box, target_box)) or (j == 0 and box_include(target_box, y_box)):
375
+ v1[cls] = 1
376
+ v1[-1] = 0
377
+ v2[0] = 1.0 * (target_box[0] - y_box[0]) / self.anchors[j]
378
+ v2[1] = np.log(1.0 * max(1, target_box[1]) / y_box[1])
379
+
380
+ cls_anc.append(v1)
381
+ reg_anc.append(v2)
382
+
383
+ v0 = np.zeros(self.num_of_class)
384
+ v0[-1] = 1
385
+ segment_size = ed - st
386
+ y_box = [ed - 1, self.anchors[-1]]
387
+ subset_label = self._get_train_label_with_class(video_name, ed - self.anchors[-1], ed)
388
+ idx_list = []
389
+ for ii in range(0, subset_label.shape[0]):
390
+ for jj in range(0, subset_label.shape[1]):
391
+ idx = int(subset_label[ii, jj])
392
+ if idx > 0 and idx - 1 not in idx_list:
393
+ idx_list.append(idx - 1)
394
+
395
+ for idx in idx_list:
396
+ target_box = self.gt_action[video_name][idx]
397
+ cls = int(target_box[2])
398
+ iou = calc_iou(y_box, target_box)
399
+ if iou >= 0:
400
+ v0[cls] = 1
401
+ v0[-1] = 0
402
+
403
+ cls_anc = np.stack(cls_anc, axis=0)
404
+ reg_anc = np.stack(reg_anc, axis=0)
405
+ cls_snip = np.array(v0)
406
+ return cls_anc, reg_anc, cls_snip
407
+
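Note: a compact restatement of the anchor offset encoding built in _makePropLabelUnit and the matching decoding applied in eval_map_nms / eval_map_supnet (main.py). All quantities are in feature frames; anchor_end plays the role of y_box[0], and the inference code uses the current frame index in its place:

    import numpy as np

    def encode_offsets(anchor_end, anchor_len, gt_end, gt_len):
        # v2 assigned above for a positive anchor
        return np.array([(gt_end - anchor_end) / anchor_len,
                         np.log(max(1, gt_len) / anchor_len)])

    def decode_offsets(anchor_end, anchor_len, reg):
        # inverse mapping used when proposals are generated at test time
        end = anchor_end + anchor_len * reg[0]
        length = anchor_len * np.exp(reg[1])
        return end - length, end          # (start, end) in feature frames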
408
+ def _loadPropLabel(self, filename):
409
+ if os.path.exists(filename):
410
+ prop_label_file = h5py.File(filename, 'r')
411
+ self.cls_label = np.array(prop_label_file['cls_label'][:])
412
+ self.reg_label = np.array(prop_label_file['reg_label'][:])
413
+ self.snip_label = np.array(prop_label_file['snip_label'][:])
414
+ prop_label_file.close()
415
+ self.action_frame_count = np.sum(self.cls_label.reshape((-1, self.cls_label.shape[-1])), axis=0)
416
+ self.action_frame_count = torch.Tensor(self.action_frame_count)
417
+ return
418
+
419
+ pool = Pool(os.cpu_count() // 2)
420
+ labels = pool.map(self._makePropLabelUnit, range(0, len(self.inputs_all)))
421
+ pool.close()
422
+ pool.join()
423
+
424
+ cls_label = []
425
+ reg_label = []
426
+ snip_label = []
427
+ for i in range(0, len(labels)):
428
+ cls_label.append(labels[i][0])
429
+ reg_label.append(labels[i][1])
430
+ snip_label.append(labels[i][2])
431
+ self.cls_label = np.stack(cls_label, axis=0)
432
+ self.reg_label = np.stack(reg_label, axis=0)
433
+ self.snip_label = np.stack(snip_label, axis=0)
434
+
435
+ outfile = h5py.File(filename, 'w')
436
+ dset_cls = outfile.create_dataset('/cls_label', self.cls_label.shape, maxshape=self.cls_label.shape, chunks=True, dtype=np.float32)
437
+ dset_cls[:, :] = self.cls_label[:, :]
438
+ dset_reg = outfile.create_dataset('/reg_label', self.reg_label.shape, maxshape=self.reg_label.shape, chunks=True, dtype=np.float32)
439
+ dset_reg[:, :] = self.reg_label[:, :]
440
+ dset_snip = outfile.create_dataset('/snip_label', self.snip_label.shape, maxshape=self.snip_label.shape, chunks=True, dtype=np.float32)
441
+ dset_snip[:, :] = self.snip_label[:, :]
442
+ outfile.close()
443
+
444
+ return
445
+
446
+ def __getitem__(self, index):
447
+ video_name, st, ed, data_idx = self.inputs[index]
448
+ if st >= 0:
449
+ feature = self._get_base_data(video_name, st, ed)
450
+ else:
451
+ feature = self._get_base_data(video_name, 0, ed)
452
+ padfunc2d = torch.nn.ConstantPad2d((0, 0, -st, 0), 0)
453
+ feature = padfunc2d(feature)
454
+
455
+ cls_label = torch.Tensor(self.cls_label[data_idx])
456
+ reg_label = torch.Tensor(self.reg_label[data_idx])
457
+ snip_label = torch.Tensor(self.snip_label[data_idx])
458
+
459
+ return feature, cls_label, reg_label, snip_label
460
+
461
+ def _get_base_data(self, video_name, st, ed):
462
+ feature_rgb = self.feature_rgb_file[video_name]
463
+ feature_rgb = feature_rgb[st:ed, :]
464
+
465
+ if self.feature_flow_file is not None:
466
+ feature_flow = self.feature_flow_file[video_name]
467
+ feature_flow = feature_flow[st:ed, :]
468
+ feature = np.append(feature_rgb, feature_flow, axis=1)
469
+ else:
470
+ feature = feature_rgb
471
+ feature = torch.from_numpy(np.array(feature))
472
+
473
+ return feature
474
+
475
+ def _get_train_label_with_class(self, video_name, st, ed):
476
+ duration = len(self.match_score[video_name])
477
+ st_padding = 0
478
+ ed_padding = 0
479
+ if st < 0:
480
+ st_padding = -st
481
+ st = 0
482
+ if ed > duration:
483
+ ed_padding = ed - duration
484
+ ed = duration
485
+
486
+ match_score = torch.Tensor(self.match_score[video_name][st:ed])
487
+ if st_padding > 0:
488
+ padfunc2d = torch.nn.ConstantPad2d((0, 0, st_padding, 0), 0)
489
+ match_score = padfunc2d(match_score)
490
+ if ed_padding > 0:
491
+ padfunc2d = torch.nn.ConstantPad2d((0, 0, 0, ed_padding), 0)
492
+ match_score = padfunc2d(match_score)
493
+ return match_score
494
+
495
+ def __len__(self):
496
+ return len(self.inputs)
497
+
498
+ def reset_sample(self):
499
+ self.inputs = self.inputs_all.copy()
500
+
501
+ def select_sample(self, idx):
502
+ inputs = [self.inputs_all[i] for i in idx]
503
+ self.inputs = inputs.copy()
504
+ return
505
+
506
+ class SuppressDataSet(data.Dataset):
507
+ def __init__(self, opt, subset="train"):
508
+ self.subset = subset
509
+ self.mode = opt["mode"]
510
+ self.data_file = h5py.File(opt["suppress_label_file"].format(self.subset + "_" + opt['setup']), 'r')
511
+ self.video_list = list(self.data_file.keys())
512
+ self.inputs = []
513
+ for index in range(0, len(self.video_list)):
514
+ video_name = self.video_list[index]
515
+ duration = self.data_file[video_name + '/input'].shape[0]
516
+ for i in range(0, duration):
517
+ self.inputs.append([video_name, i])
518
+
519
+ print(f"{self.subset} subset seg numbers: {len(self.inputs)}")
520
+
521
+ def __getitem__(self, index):
522
+ video_name, idx = self.inputs[index]
523
+
524
+ input_seq = self.data_file[video_name + '/input'][idx]
525
+ label = self.data_file[video_name + '/label'][idx]
526
+
527
+ input_seq = torch.from_numpy(input_seq)
528
+ label = torch.from_numpy(label)
529
+
530
+ return input_seq, label
531
+
532
+ def __len__(self):
533
+ return len(self.inputs)
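Note: a minimal usage sketch for the dataset above. Here `opt` stands for the option dictionary assembled in main.py (its construction lives in opts_egtea and is not shown in this upload); the key names follow the loaders above:

    import torch
    from dataset import VideoDataSet

    def peek_one_batch(opt):
        train_set = VideoDataSet(opt, subset="train")
        loader = torch.utils.data.DataLoader(train_set, batch_size=opt['batch_size'],
                                             shuffle=True, num_workers=0, pin_memory=True)
        feature, cls_label, reg_label, snip_label = next(iter(loader))
        print(feature.shape, cls_label.shape, reg_label.shape, snip_label.shape)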
eval.py ADDED
@@ -0,0 +1,39 @@
1
+ # -*- coding: utf-8 -*-
2
+ import sys
3
+ sys.path.append('./Evaluation')
4
+ from eval_detection_gentime import ANETdetection
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+
8
+ def run_evaluation_detection(opt, ground_truth_filename, prediction_filename,
9
+ tiou_thresholds=np.linspace(0.5, 0.95, 10),
10
+ subset='validation', verbose=True):
11
+
12
+ anet_detection = ANETdetection(opt, ground_truth_filename, prediction_filename,
13
+ subset=subset, tiou_thresholds=tiou_thresholds,
14
+ verbose=verbose, check_status=False)
15
+ anet_detection.evaluate()
16
+
17
+ ap = anet_detection.ap
18
+ mAP = anet_detection.mAP
19
+ tdiff = anet_detection.tdiff
20
+
21
+ return (mAP, ap, tdiff)
22
+
23
+ def evaluation_detection(opt, verbose=True):
24
+
25
+ mAP, AP, tdiff = run_evaluation_detection(
26
+ opt,
27
+ opt["video_anno"].format(opt["split"]),
28
+ opt["result_file"].format(opt['exp']),
29
+ tiou_thresholds=np.linspace(0.1, 0.50, 5),
30
+ subset=opt['inference_subset'], verbose=verbose)
31
+
32
+ if verbose:
33
+ print('mAP')
34
+ print(mAP)
35
+ print('AEDT')
36
+ print(tdiff)
37
+
38
+ return mAP
39
+
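Note: evaluation_detection only touches a handful of option keys directly (plus whatever ANETdetection reads internally). A sketch with illustrative placeholder values, not the repository defaults:

    opt = {
        "video_anno": "./data/annotations_split{}.json",   # formatted with opt["split"]
        "split": 1,
        "result_file": "./output/result_{}.json",          # formatted with opt["exp"]
        "exp": "egtea_run1",
        "inference_subset": "validation",
    }
    # mAP = evaluation_detection(opt)   # mean AP over tIoU thresholds 0.1 .. 0.5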
feature_extractor.py ADDED
@@ -0,0 +1,29 @@
1
+ from models.i3d.extract_i3d import ExtractI3D
2
+ from utils.utils import build_cfg_path
3
+ from omegaconf import OmegaConf
4
+ import torch
5
+ from tqdm import tqdm
6
+ import os
7
+ import numpy as np
8
+
9
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
10
+ if device == 'cuda': print(torch.cuda.get_device_name(0))
11
+ # Select the feature type
12
+ feature_type = 'i3d'
13
+
14
+ # Load and patch the config
15
+ args = OmegaConf.load(build_cfg_path(feature_type))
16
+ args.step_size = 12
17
+ args.flow_type = 'raft' # 'pwc'
18
+
19
+ # Load the model
20
+ extractor = ExtractI3D(args)
21
+
22
+ args.video_paths = os.listdir('./Videos')
23
+
24
+ # Extract features
25
+ for video_path in tqdm(args.video_paths):
26
+ print(f'Extracting for {video_path}')
27
+ feature_dict = extractor.extract('./Videos/'+video_path)
28
+ np.savez('./I3D/'+video_path[:-4]+'.npz', **feature_dict)
29
+ [(print(k), print(v.shape)) for k, v in feature_dict.items()]
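Note: the per-video .npz files written above can be consumed by dataset.py with data_format == "npz_i3d" as long as they expose 'rgb' and 'flow' arrays. A quick sanity check, with a hypothetical file name:

    import numpy as np

    feats = np.load('./I3D/VIDEO_0001.npz')
    assert 'rgb' in feats and 'flow' in feats
    print(feats['rgb'].shape, feats['flow'].shape)   # typically (T, 1024) per stream for I3D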
main.py ADDED
@@ -0,0 +1,1234 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.parallel
6
+ import torch.nn.functional as F
7
+ import torch.optim as optim
8
+ import numpy as np
9
+ import opts_egtea as opts
10
+
11
+ import time
12
+ import h5py
13
+ from tqdm import tqdm
14
+ from iou_utils import *
15
+ from eval import evaluation_detection
16
+ from tensorboardX import SummaryWriter
17
+ from dataset import VideoDataSet, calc_iou
18
+ from models import MYNET, SuppressNet
19
+ from loss_func import cls_loss_func, cls_loss_func_, regress_loss_func
20
+ from loss_func import MultiCrossEntropyLoss
21
+ from functools import *
22
+
23
+ import matplotlib.pyplot as plt
24
+ import matplotlib.patches as patches
25
+ import cv2
26
+ from typing import List, Dict, Optional
27
+
28
+ from PIL import Image, ImageDraw, ImageFont
29
+ import warnings
30
+
31
+ # Visualization Configuration (Updated)
32
+ VIS_CONFIG = {
33
+ 'frame_interval': 1.0,
34
+ 'max_frames': 20,
35
+ 'save_dir': './output/visualizations',
36
+ 'video_save_dir': './output/videos',
37
+ 'gt_color': '#1f77b4', # Blue for ground truth (RGB: 31, 119, 180)
38
+ 'pred_color': '#ff7f0e', # Orange for predictions (RGB: 255, 127, 14)
39
+ 'fontsize_label': 10,
40
+ 'fontsize_title': 14,
41
+ 'frame_highlight_both': 'green',
42
+ 'frame_highlight_gt': 'red',
43
+ 'frame_highlight_pred': 'black',
44
+ 'iou_threshold': 0.3,
45
+ 'frame_scale_factor': 0.8,
46
+ 'video_text_scale': 0.5,
47
+ 'video_gt_text_color': (180, 119, 31), # BGR for OpenCV
48
+ 'video_pred_text_color': (14, 127, 255), # BGR for OpenCV
49
+ 'video_text_thickness': 1,
50
+ 'video_font_path': "./data/Poppins ExtraBold Italic 800.ttf",
51
+ 'video_font_fallback': '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
52
+ 'video_pred_text_y': 0.45,
53
+ 'video_gt_text_y': 0.55,
54
+ 'video_footer_height': 150, # Increased to accommodate labels
55
+ 'video_gt_bar_y': 0.5,
56
+ 'video_pred_bar_y': 0.8,
57
+ 'video_bar_height': 0.15,
58
+ 'video_bar_text_scale': 0.4,
59
+ 'min_segment_duration': 1.0,
60
+ 'video_frame_text_y': 0.05, # Position for frame number and FPS
61
+ 'video_bar_label_x': 10, # X-position for GT/Pred labels
62
+ 'video_bar_label_scale': 0.5,
63
+ 'scroll_window_duration': 30.0, # Duration of the visible time window (seconds)
64
+ 'scroll_speed': 0.5, # Seconds to advance the window per second of video
65
+ }
66
+
67
+
68
+ def annotate_video_with_actions(
69
+ video_id: str,
70
+ pred_segments: List[Dict],
71
+ gt_segments: List[Dict],
72
+ video_path: str,
73
+ save_dir: str = VIS_CONFIG['video_save_dir'],
74
+ text_scale: float = VIS_CONFIG['video_text_scale'],
75
+ gt_text_color: tuple = VIS_CONFIG['video_gt_text_color'],
76
+ pred_text_color: tuple = VIS_CONFIG['video_pred_text_color'],
77
+ text_thickness: int = VIS_CONFIG['video_text_thickness']
78
+ ) -> None:
79
+ """
80
+ Annotate a video with predicted and ground truth action labels, cumulative bars, frame number, and FPS.
81
+ Use fixed 20-second windows with a progress-style bar animation (bars fill up to the current playhead), resetting at each window boundary.
82
+ Assign different colors to different actions for GT and Pred bars, with reduced vertical gap.
83
+
84
+ Args:
85
+ video_id: Video identifier (e.g., 'my_video').
86
+ pred_segments: List of predicted segments with 'label', 'start', 'end', 'duration', 'score'.
87
+ gt_segments: List of ground truth segments with 'label', 'start', 'end', 'duration'.
88
+ video_path: Path to the input video file.
89
+ save_dir: Directory to save the annotated video.
90
+ text_scale: Scale factor for text size in video.
91
+ gt_text_color: BGR color tuple for ground truth text (fallback).
92
+ pred_text_color: BGR color tuple for predicted text (fallback).
93
+ text_thickness: Thickness of text strokes.
94
+ """
95
+ os.makedirs(save_dir, exist_ok=True)
96
+
97
+ # Open input video
98
+ cap = cv2.VideoCapture(video_path)
99
+ if not cap.isOpened():
100
+ print(f"Error: Could not open video {video_path}. Skipping video annotation.")
101
+ return
102
+
103
+ # Get video properties
104
+ fps = cap.get(cv2.CAP_PROP_FPS)
105
+ frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
106
+ frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
107
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
108
+ duration = total_frames / fps
109
+ print(f"Input Video: FPS={fps:.2f}, Resolution={frame_width}x{frame_height}, Total Frames={total_frames}, Duration={duration:.2f}s")
110
+
111
+ # Define output video with extended height for footer
112
+ footer_height = VIS_CONFIG['video_footer_height']
113
+ output_height = frame_height + footer_height
114
+ output_path = os.path.join(save_dir, f"annotated_{video_id}_{opt['exp']}.avi")
115
+ fourcc = cv2.VideoWriter_fourcc(*'XVID')
116
+ out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, output_height))
117
+
118
+ if not out.isOpened():
119
+ print(f"Error: Could not initialize video writer for {output_path}. Check codec availability.")
120
+ cap.release()
121
+ return
122
+
123
+ # Filter short segments
124
+ min_duration = VIS_CONFIG['min_segment_duration']
125
+ gt_segments = [seg for seg in gt_segments if seg['duration'] >= min_duration]
126
+ pred_segments = [seg for seg in pred_segments if seg['duration'] >= min_duration]
127
+ print(f"Filtered Segments: GT={len(gt_segments)}, Pred={len(pred_segments)} (min_duration={min_duration}s)")
128
+
129
+ # Create color mapping for actions
130
+ action_labels = set(seg['label'] for seg in gt_segments).union(seg['label'] for seg in pred_segments)
131
+ # Define a BGR color palette (20 distinct colors)
132
+ color_palette = [
133
+ (255, 0, 0), # Red
134
+ (0, 255, 0), # Green
135
+ (0, 0, 255), # Blue
136
+ (255, 255, 0), # Yellow
137
+ (255, 0, 255), # Magenta
138
+ (0, 255, 255), # Cyan
139
+ (128, 0, 0), # Maroon
140
+ (0, 128, 0), # Dark Green
141
+ (0, 0, 128), # Navy
142
+ (128, 128, 0), # Olive
143
+ (128, 0, 128), # Purple
144
+ (0, 128, 128), # Teal
145
+ (255, 165, 0), # Orange
146
+ (255, 192, 203), # Pink
147
+ (128, 128, 128), # Gray
148
+ (210, 105, 30), # Chocolate
149
+ (100, 149, 237), # Cornflower Blue
150
+ (154, 205, 50), # Yellow Green
151
+ (75, 0, 130), # Indigo
152
+ (245, 245, 220), # Beige
153
+ ]
154
+ action_color_map = {label: color_palette[i % len(color_palette)] for i, label in enumerate(action_labels)}
155
+ print(f"Action Color Mapping: {action_color_map}")
156
+
157
+ # Convert fallback colors to RGB for PIL
158
+ gt_color_rgb = (gt_text_color[2], gt_text_color[1], gt_text_color[0]) # BGR to RGB
159
+ pred_color_rgb = (pred_text_color[2], pred_text_color[1], pred_text_color[0]) # BGR to RGB
160
+
161
+ # Load font
162
+ font_path = VIS_CONFIG['video_font_path']
163
+ font_fallback = VIS_CONFIG['video_font_fallback']
164
+ font_size = int(20 * text_scale)
165
+ bar_font_size = int(20 * VIS_CONFIG['video_bar_text_scale'])
166
+ font = None
167
+ bar_font = None
168
+ if font_path:
169
+ try:
170
+ font = ImageFont.truetype(font_path, font_size)
171
+ bar_font = ImageFont.truetype(font_path, bar_font_size)
172
+ print(f"Using font: {font_path}")
173
+ except IOError:
174
+ print(f"Warning: Font {font_path} not found. Trying fallback font.")
175
+ if not font:
176
+ try:
177
+ font = ImageFont.truetype(font_fallback, font_size)
178
+ bar_font = ImageFont.truetype(font_fallback, bar_font_size)
179
+ print(f"Using fallback font: {font_fallback}")
180
+ except IOError:
181
+ print(f"Warning: Fallback font {font_fallback} not found. Using OpenCV default font.")
182
+ font = None
183
+ bar_font = None
184
+
185
+ # Fixed window configuration
186
+ window_size = 20.0 # 20-second windows
187
+ num_windows = int(np.ceil(duration / window_size))
188
+
189
+ frame_idx = 0
190
+ written_frames = 0
191
+ while cap.isOpened():
192
+ ret, frame = cap.read()
193
+ if not ret:
194
+ break
195
+
196
+ # Create extended frame with footer
197
+ extended_frame = np.zeros((output_height, frame_width, 3), dtype=np.uint8)
198
+ extended_frame[:frame_height, :, :] = frame
199
+ extended_frame[frame_height:, :, :] = 255 # White footer
200
+
201
+ # Calculate current timestamp
202
+ timestamp = frame_idx / fps
203
+
204
+ # Determine current window
205
+ window_idx = int(timestamp // window_size)
206
+ window_start = window_idx * window_size
207
+ window_end = min(window_start + window_size, duration)
208
+ window_duration = window_end - window_start
209
+ window_timestamp = timestamp - window_start # Relative timestamp within window
210
+
211
+ # Find active GT actions (for text overlay)
212
+ gt_labels = [seg['label'] for seg in gt_segments if seg['start'] <= timestamp <= seg['end']]
213
+ gt_text = "GT: " + ", ".join(gt_labels) if gt_labels else ""
214
+
215
+ # Find active predicted actions (for text overlay)
216
+ pred_labels = [seg['label'] for seg in pred_segments if seg['start'] <= timestamp <= seg['end']]
217
+ pred_text = "Pred: " + ", ".join(pred_labels) if pred_labels else ""
218
+
219
+ # Draw GT and prediction bars in footer (within current window, using original animation)
220
+ footer_y = frame_height
221
+ gt_bar_y = footer_y + int(0.2 * footer_height) # Reduced gap
222
+ pred_bar_y = footer_y + int(0.5 * footer_height) # Reduced gap
223
+ bar_height = int(VIS_CONFIG['video_bar_height'] * footer_height)
224
+
225
+ for seg in gt_segments:
226
+ if seg['start'] <= window_end and seg['end'] >= window_start:
227
+ start_t = max(seg['start'], window_start)
228
+ end_t = min(seg['end'], window_start + window_timestamp) # Original animation
229
+ start_x = int(((start_t - window_start) / window_duration) * frame_width)
230
+ end_x = int(((end_t - window_start) / window_duration) * frame_width)
231
+ if end_x > start_x:
232
+ cv2.rectangle(
233
+ extended_frame,
234
+ (start_x, gt_bar_y),
235
+ (end_x, gt_bar_y + bar_height),
236
+ action_color_map[seg['label']], # Action-specific color
237
+ -1
238
+ )
239
+
240
+ for seg in pred_segments:
241
+ if seg['start'] <= window_end and seg['end'] >= window_start:
242
+ start_t = max(seg['start'], window_start)
243
+ end_t = min(seg['end'], window_start + window_timestamp) # Original animation
244
+ start_x = int(((start_t - window_start) / window_duration) * frame_width)
245
+ end_x = int(((end_t - window_start) / window_duration) * frame_width)
246
+ if end_x > start_x:
247
+ cv2.rectangle(
248
+ extended_frame,
249
+ (start_x, pred_bar_y),
250
+ (end_x, pred_bar_y + bar_height),
251
+ action_color_map[seg['label']], # Action-specific color
252
+ -1
253
+ )
254
+
255
+ if font:
256
+ # Convert frame to PIL image
257
+ frame_rgb = cv2.cvtColor(extended_frame, cv2.COLOR_BGR2RGB)
258
+ pil_image = Image.fromarray(frame_rgb)
259
+ draw = ImageDraw.Draw(pil_image)
260
+
261
+ # Draw frame number and FPS at top center
262
+ frame_info = f"Frame: {frame_idx} | FPS: {fps:.2f}"
263
+ frame_text_bbox = draw.textbbox((0, 0), frame_info, font=font)
264
+ frame_text_width = frame_text_bbox[2] - frame_text_bbox[0]
265
+ frame_text_x = (frame_width - frame_text_width) // 2
266
+ draw.text((frame_text_x, 10), frame_info, font=font, fill=(0, 0, 0))
267
+
268
+ # Draw window timestamp range at top of footer
269
+ window_info = f"{window_start:.1f}s - {window_end:.1f}s"
270
+ window_text_bbox = draw.textbbox((0, 0), window_info, font=bar_font)
271
+ window_text_width = window_text_bbox[2] - window_text_bbox[0]
272
+ window_text_x = (frame_width - window_text_width) // 2
273
+ draw.text((window_text_x, footer_y + 10), window_info, font=bar_font, fill=(0, 0, 0))
274
+
275
+ # Draw GT text in video only if there are actions
276
+ if gt_text:
277
+ gt_y = int(frame_height * VIS_CONFIG['video_gt_text_y'])
278
+ draw.text((10, gt_y), gt_text, font=font, fill=gt_color_rgb)
279
+
280
+ # Draw predicted text in video only if there are actions
281
+ if pred_text:
282
+ pred_y = int(frame_height * VIS_CONFIG['video_pred_text_y'])
283
+ draw.text((10, pred_y), pred_text, font=font, fill=pred_color_rgb)
284
+
285
+ # Draw labels in bars
286
+ for seg in gt_segments:
287
+ if seg['start'] <= window_end and seg['end'] >= window_start:
288
+ label = seg['label'][:8] + '...' if len(seg['label']) > 8 else seg['label']
289
+ start_t = max(seg['start'], window_start)
290
+ end_t = min(seg['end'], window_start + window_timestamp)
291
+ start_x = int(((start_t - window_start) / window_duration) * frame_width)
292
+ end_x = int(((end_t - window_start) / window_duration) * frame_width)
293
+ if end_x - start_x >= 20:
294
+ draw.text(
295
+ ((start_x + end_x) / 2, gt_bar_y + bar_height / 2),
296
+ label,
297
+ font=bar_font,
298
+ fill=(255, 255, 255) # White for readability
299
+ )
300
+ action_color_rgb = (action_color_map[seg['label']][2], action_color_map[seg['label']][1], action_color_map[seg['label']][0])
301
+ draw.text((start_x, gt_bar_y - 10), f"{start_t:.1f}", font=bar_font, fill=action_color_rgb)
302
+ draw.text((end_x, gt_bar_y - 10), f"{end_t:.1f}", font=bar_font, fill=action_color_rgb)
303
+
304
+ for seg in pred_segments:
305
+ if seg['start'] <= window_end and seg['end'] >= window_start:
306
+ label = seg['label'][:8] + '...' if len(seg['label']) > 8 else seg['label']
307
+ start_t = max(seg['start'], window_start)
308
+ end_t = min(seg['end'], window_start + window_timestamp)
309
+ start_x = int(((start_t - window_start) / window_duration) * frame_width)
310
+ end_x = int(((end_t - window_start) / window_duration) * frame_width)
311
+ if end_x - start_x >= 20:
312
+ draw.text(
313
+ ((start_x + end_x) / 2, pred_bar_y + bar_height / 2),
314
+ label,
315
+ font=bar_font,
316
+ fill=(255, 255, 255) # White for readability
317
+ )
318
+ action_color_rgb = (action_color_map[seg['label']][2], action_color_map[seg['label']][1], action_color_map[seg['label']][0])
319
+ draw.text((start_x, pred_bar_y + bar_height + 10), f"{start_t:.1f}", font=bar_font, fill=action_color_rgb)
320
+ draw.text((end_x, pred_bar_y + bar_height + 10), f"{end_t:.1f}", font=bar_font, fill=action_color_rgb)
321
+
322
+ # Draw GT and Pred labels before bars
323
+ draw.text((10, gt_bar_y + bar_height / 2), "GT", font=bar_font, fill=gt_color_rgb)
324
+ draw.text((10, pred_bar_y + bar_height / 2), "Pred", font=bar_font, fill=pred_color_rgb)
325
+
326
+ # Convert back to OpenCV frame
327
+ extended_frame = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
328
+ else:
329
+ # Fallback to OpenCV font
330
+ frame_info = f"Frame: {frame_idx} | FPS: {fps:.2f}"
331
+ text_size, _ = cv2.getTextSize(frame_info, cv2.FONT_HERSHEY_DUPLEX, text_scale, text_thickness)
332
+ frame_text_x = (frame_width - text_size[0]) // 2
333
+ cv2.putText(
334
+ extended_frame,
335
+ frame_info,
336
+ (frame_text_x, 30),
337
+ cv2.FONT_HERSHEY_DUPLEX,
338
+ text_scale,
339
+ (0, 0, 0),
340
+ text_thickness,
341
+ cv2.LINE_AA
342
+ )
343
+ window_info = f"{window_start:.1f}s - {window_end:.1f}s"
344
+ window_text_size, _ = cv2.getTextSize(window_info, cv2.FONT_HERSHEY_DUPLEX, VIS_CONFIG['video_bar_text_scale'], 1)
345
+ window_text_x = (frame_width - window_text_size[0]) // 2
346
+ cv2.putText(
347
+ extended_frame,
348
+ window_info,
349
+ (window_text_x, footer_y + 20),
350
+ cv2.FONT_HERSHEY_DUPLEX,
351
+ VIS_CONFIG['video_bar_text_scale'],
352
+ (0, 0, 0),
353
+ 1,
354
+ cv2.LINE_AA
355
+ )
356
+ if gt_text:
357
+ cv2.putText(
358
+ extended_frame,
359
+ gt_text,
360
+ (10, int(frame_height * VIS_CONFIG['video_gt_text_y'])),
361
+ cv2.FONT_HERSHEY_DUPLEX,
362
+ text_scale,
363
+ gt_text_color,
364
+ text_thickness,
365
+ cv2.LINE_AA
366
+ )
367
+ if pred_text:
368
+ cv2.putText(
369
+ extended_frame,
370
+ pred_text,
371
+ (10, int(frame_height * VIS_CONFIG['video_pred_text_y'])),
372
+ cv2.FONT_HERSHEY_DUPLEX,
373
+ text_scale,
374
+ pred_text_color,
375
+ text_thickness,
376
+ cv2.LINE_AA
377
+ )
378
+ for seg in gt_segments:
379
+ if seg['start'] <= window_end and seg['end'] >= window_start:
380
+ label = seg['label'][:8] + '...' if len(seg['label']) > 8 else seg['label']
381
+ start_t = max(seg['start'], window_start)
382
+ end_t = min(seg['end'], window_start + window_timestamp)
383
+ start_x = int(((start_t - window_start) / window_duration) * frame_width)
384
+ end_x = int(((end_t - window_start) / window_duration) * frame_width)
385
+ if end_x - start_x >= 20:
386
+ cv2.putText(
387
+ extended_frame,
388
+ label,
389
+ (start_x + (end_x - start_x) // 2 - 20, gt_bar_y + bar_height // 2 + 5),
390
+ cv2.FONT_HERSHEY_DUPLEX,
391
+ VIS_CONFIG['video_bar_text_scale'],
392
+ (255, 255, 255),
393
+ 1,
394
+ cv2.LINE_AA
395
+ )
396
+ cv2.putText(
397
+ extended_frame,
398
+ f"{start_t:.1f}",
399
+ (start_x, gt_bar_y - 5),
400
+ cv2.FONT_HERSHEY_DUPLEX,
401
+ VIS_CONFIG['video_bar_text_scale'],
402
+ action_color_map[seg['label']],
403
+ 1,
404
+ cv2.LINE_AA
405
+ )
406
+ cv2.putText(
407
+ extended_frame,
408
+ f"{end_t:.1f}",
409
+ (end_x, gt_bar_y - 5),
410
+ cv2.FONT_HERSHEY_DUPLEX,
411
+ VIS_CONFIG['video_bar_text_scale'],
412
+ action_color_map[seg['label']],
413
+ 1,
414
+ cv2.LINE_AA
415
+ )
416
+ for seg in pred_segments:
417
+ if seg['start'] <= window_end and seg['end'] >= window_start:
418
+ label = seg['label'][:8] + '...' if len(seg['label']) > 8 else seg['label']
419
+ start_t = max(seg['start'], window_start)
420
+ end_t = min(seg['end'], window_start + window_timestamp)
421
+ start_x = int(((start_t - window_start) / window_duration) * frame_width)
422
+ end_x = int(((end_t - window_start) / window_duration) * frame_width)
423
+ if end_x - start_x >= 20:
424
+ cv2.putText(
425
+ extended_frame,
426
+ label,
427
+ (start_x + (end_x - start_x) // 2 - 20, pred_bar_y + bar_height // 2 + 5),
428
+ cv2.FONT_HERSHEY_DUPLEX,
429
+ VIS_CONFIG['video_bar_text_scale'],
430
+ (255, 255, 255),
431
+ 1,
432
+ cv2.LINE_AA
433
+ )
434
+ cv2.putText(
435
+ extended_frame,
436
+ f"{start_t:.1f}",
437
+ (start_x, pred_bar_y + bar_height + 15),
438
+ cv2.FONT_HERSHEY_DUPLEX,
439
+ VIS_CONFIG['video_bar_text_scale'],
440
+ action_color_map[seg['label']],
441
+ 1,
442
+ cv2.LINE_AA
443
+ )
444
+ cv2.putText(
445
+ extended_frame,
446
+ f"{end_t:.1f}",
447
+ (end_x, pred_bar_y + bar_height + 15),
448
+ cv2.FONT_HERSHEY_DUPLEX,
449
+ VIS_CONFIG['video_bar_text_scale'],
450
+ action_color_map[seg['label']],
451
+ 1,
452
+ cv2.LINE_AA
453
+ )
454
+ cv2.putText(
455
+ extended_frame,
456
+ "GT",
457
+ (10, gt_bar_y + bar_height // 2 + 5),
458
+ cv2.FONT_HERSHEY_DUPLEX,
459
+ VIS_CONFIG['video_bar_text_scale'],
460
+ gt_text_color,
461
+ 1,
462
+ cv2.LINE_AA
463
+ )
464
+ cv2.putText(
465
+ extended_frame,
466
+ "Pred",
467
+ (10, pred_bar_y + bar_height // 2 + 5),
468
+ cv2.FONT_HERSHEY_DUPLEX,
469
+ VIS_CONFIG['video_bar_text_scale'],
470
+ pred_text_color,
471
+ 1,
472
+ cv2.LINE_AA
473
+ )
474
+
475
+ # Write frame to output video
476
+ out.write(extended_frame)
477
+ written_frames += 1
478
+ frame_idx += 1
479
+
480
+ # Release resources
481
+ cap.release()
482
+ out.release()
483
+ print(f"[✅ Saved Annotated Video]: {output_path}, Written Frames={written_frames}")
484
+ print("Note: If .avi is not playable, convert to .mp4 using FFmpeg:")
485
+ print(f"ffmpeg -i {output_path} -vcodec libx264 -acodec aac {output_path.replace('.avi', '.mp4')}")
486
+
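A usage sketch for annotate_video_with_actions; the id, path, and segments below are hypothetical, and the segment dicts mirror those assembled in test() further down this file. Note that the function reads the module-level opt['exp'] when naming the output file, so it is intended to be called after the option dictionary has been built:

    gt = [{'label': 'Cut',  'start': 12.4, 'end': 15.9, 'duration': 3.5},
          {'label': 'Wash', 'start': 20.0, 'end': 27.5, 'duration': 7.5}]
    pred = [{'label': 'Cut', 'start': 12.0, 'end': 16.2, 'duration': 4.2, 'score': 0.81}]
    annotate_video_with_actions('VIDEO_0001', pred, gt,
                                video_path='./Videos/VIDEO_0001.mp4')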
487
+
488
+
489
+
490
+
491
+
492
+
493
+
494
+
495
+
496
+ def visualize_action_lengths(
497
+ video_id: str,
498
+ pred_segments: List[Dict],
499
+ gt_segments: List[Dict],
500
+ video_path: str,
501
+ duration: float,
502
+ save_dir: str = VIS_CONFIG['save_dir'],
503
+ frame_interval: float = VIS_CONFIG['frame_interval']
504
+ ) -> None:
505
+ """
506
+ Generate a visualization plot comparing ground truth and predicted action lengths with video frames.
507
+
508
+ Args:
509
+ video_id: Video identifier (e.g., 'my_video').
510
+ pred_segments: List of predicted segments with 'label', 'start', 'end', 'duration', 'score'.
511
+ gt_segments: List of ground truth segments with 'label', 'start', 'end', 'duration'.
512
+ video_path: Path to the input video file.
513
+ duration: Total duration of the video in seconds.
514
+ save_dir: Directory to save the output image.
515
+ frame_interval: Time interval between sampled frames (seconds).
516
+ """
517
+ os.makedirs(save_dir, exist_ok=True)
518
+
519
+ # Calculate frame sampling times
520
+ num_frames = int(duration / frame_interval) + 1
521
+ if num_frames > VIS_CONFIG['max_frames']:
522
+ frame_interval = duration / (VIS_CONFIG['max_frames'] - 1)
523
+ num_frames = VIS_CONFIG['max_frames']
524
+ print(f"Warning: Video duration ({duration:.1f}s) requires {num_frames} frames. Adjusted frame_interval to {frame_interval:.2f}s.")
525
+
526
+ frame_times = np.linspace(0, duration, num_frames, endpoint=False)
527
+
528
+ # Load video frames
529
+ frames = []
530
+ cap = cv2.VideoCapture(video_path)
531
+ if not cap.isOpened():
532
+ print(f"Warning: Could not open video {video_path}. Using placeholder frames.")
533
+ frames = [np.ones((100, 100, 3), dtype=np.uint8) * 255 for _ in frame_times]
534
+ else:
535
+ for t in frame_times:
536
+ cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000)
537
+ ret, frame = cap.read()
538
+ if ret:
539
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
540
+ # Resize frame to reduce memory usage
541
+ frame = cv2.resize(frame, (int(frame.shape[1] * 0.5), int(frame.shape[0] * 0.5)))
542
+ frames.append(frame)
543
+ else:
544
+ frames.append(np.ones((100, 100, 3), dtype=np.uint8) * 255)
545
+ cap.release()
546
+
547
+ # Initialize figure
548
+ fig = plt.figure(figsize=(num_frames * VIS_CONFIG['frame_scale_factor'], 6), constrained_layout=True)
549
+ gs = fig.add_gridspec(3, num_frames, height_ratios=[3, 1, 1])
550
+
551
+ # Plot frames
552
+ for i, (t, frame) in enumerate(zip(frame_times, frames)):
553
+ ax = fig.add_subplot(gs[0, i])
554
+
555
+ # Check if frame falls within GT or predicted segments
556
+ gt_hit = any(seg['start'] <= t <= seg['end'] for seg in gt_segments)
557
+ pred_hit = any(seg['start'] <= t <= seg['end'] for seg in pred_segments)
558
+
559
+ # Set border color
560
+ border_color = None
561
+ if gt_hit and pred_hit:
562
+ border_color = VIS_CONFIG['frame_highlight_both']
563
+ elif gt_hit:
564
+ border_color = VIS_CONFIG['frame_highlight_gt']
565
+ elif pred_hit:
566
+ border_color = VIS_CONFIG['frame_highlight_pred']
567
+
568
+ ax.imshow(frame)
569
+ ax.axis('off')
570
+ if border_color:
571
+ for spine in ax.spines.values():
572
+ spine.set_edgecolor(border_color)
573
+ spine.set_linewidth(2)
574
+
575
+ ax.set_title(f"{t:.1f}s", fontsize=VIS_CONFIG['fontsize_label'],
576
+ color=border_color if border_color else 'black')
577
+
578
+ # Plot ground truth bar
579
+ ax_gt = fig.add_subplot(gs[1, :])
580
+ ax_gt.set_xlim(0, duration)
581
+ ax_gt.set_ylim(0, 1)
582
+ ax_gt.axis('off')
583
+ ax_gt.text(-0.02 * duration, 0.5, "Ground Truth", fontsize=VIS_CONFIG['fontsize_title'],
584
+ va='center', ha='right', weight='bold')
585
+
586
+ for seg in gt_segments:
587
+ start, end = seg['start'], seg['end']
588
+ width = end - start
589
+ label = seg['label'][:10] + '...' if len(seg['label']) > 10 else seg['label']
590
+ ax_gt.add_patch(patches.Rectangle(
591
+ (start, 0.3), width, 0.4, facecolor=VIS_CONFIG['gt_color'],
592
+ edgecolor='black', alpha=0.8
593
+ ))
594
+ ax_gt.text((start + end) / 2, 0.5, label, ha='center', va='center',
595
+ fontsize=VIS_CONFIG['fontsize_label'], color='white')
596
+ ax_gt.text(start, 0.2, f"{start:.1f}", ha='center', fontsize=8, color='black')
597
+ ax_gt.text(end, 0.2, f"{end:.1f}", ha='center', fontsize=8, color='black')
598
+
599
+ # Plot prediction bar
600
+ ax_pred = fig.add_subplot(gs[2, :])
601
+ ax_pred.set_xlim(0, duration)
602
+ ax_pred.set_ylim(0, 1)
603
+ ax_pred.axis('off')
604
+ ax_pred.text(-0.02 * duration, 0.5, "Prediction", fontsize=VIS_CONFIG['fontsize_title'],
605
+ va='center', ha='right', weight='bold')
606
+
607
+ for seg in pred_segments:
608
+ start, end = seg['start'], seg['end']
609
+ width = end - start
610
+ label = seg['label'][:10] + '...' if len(seg['label']) > 10 else seg['label']
611
+ ax_pred.add_patch(patches.Rectangle(
612
+ (start, 0.3), width, 0.4, facecolor=VIS_CONFIG['pred_color'],
613
+ edgecolor='black', alpha=0.8
614
+ ))
615
+ ax_pred.text((start + end) / 2, 0.5, label, ha='center', va='center',
616
+ fontsize=VIS_CONFIG['fontsize_label'], color='white')
617
+ ax_pred.text(start, 0.8, f"{start:.1f}", ha='center', fontsize=8, color='black')
618
+ ax_pred.text(end, 0.8, f"{end:.1f}", ha='center', fontsize=8, color='black')
619
+
620
+ # Save plot
621
+ jpg_path = os.path.join(save_dir, f"viz_{video_id}_{opt['exp']}.png") # Use PNG
622
+ plt.savefig(jpg_path, dpi=100, bbox_inches='tight') # Lower DPI
623
+ print(f"[✅ Saved Visualization]: {jpg_path}")
624
+ plt.close()
625
+
626
+
627
+
628
+ def train_one_epoch(opt, model, train_dataset, optimizer, warmup=False):
629
+ train_loader = torch.utils.data.DataLoader(train_dataset,
630
+ batch_size=opt['batch_size'], shuffle=True,
631
+ num_workers=0, pin_memory=True, drop_last=False)
632
+ epoch_cost = 0
633
+ epoch_cost_cls = 0
634
+ epoch_cost_reg = 0
635
+ epoch_cost_snip = 0
636
+
637
+ total_iter = len(train_dataset) // opt['batch_size']
638
+ cls_loss = MultiCrossEntropyLoss(focal=True)
639
+ snip_loss = MultiCrossEntropyLoss(focal=True)
640
+ for n_iter, (input_data, cls_label, reg_label, snip_label) in enumerate(tqdm(train_loader)):
641
+ if warmup:
642
+ for g in optimizer.param_groups:
643
+ g['lr'] = n_iter * (opt['lr']) / total_iter
644
+
645
+ act_cls, act_reg, snip_cls = model(input_data.float().cuda())
646
+
647
+ act_cls.register_hook(partial(cls_loss.collect_grad, cls_label))
648
+ snip_cls.register_hook(partial(snip_loss.collect_grad, snip_label))
649
+
650
+ cost_reg = 0
651
+ cost_cls = 0
652
+
653
+ loss = cls_loss_func_(cls_loss, cls_label, act_cls)
654
+ cost_cls = loss
655
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
656
+
657
+ loss = regress_loss_func(reg_label, act_reg)
658
+ cost_reg = loss
659
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
660
+
661
+ loss = cls_loss_func_(snip_loss, snip_label, snip_cls)
662
+ cost_snip = loss
663
+ epoch_cost_snip += cost_snip.detach().cpu().numpy()
664
+
665
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg + opt['gamma'] * cost_snip
666
+ epoch_cost += cost.detach().cpu().numpy()
667
+
668
+ optimizer.zero_grad()
669
+ cost.backward()
670
+ optimizer.step()
671
+
672
+ return n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip
673
+
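For reference, the per-iteration objective assembled above and the warmup schedule are

    cost = alpha * L_cls + beta * L_reg + gamma * L_snip    (focal multi-class CE for L_cls and L_snip)
    lr(n_iter) = n_iter * opt['lr'] / total_iter            (warmup branch only)

Note that train() below initializes warmup to False and never enables it, so the linear ramp is inactive unless that flag is changed.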
674
+ def eval_one_epoch(opt, model, test_dataset):
675
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, test_dataset)
676
+
677
+ result_dict = eval_map_nms(opt, test_dataset, output_cls, output_reg, labels_cls, labels_reg)
678
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
679
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
680
+ json.dump(output_dict, outfile, indent=2)
681
+ outfile.close()
682
+
683
+ IoUmAP = evaluation_detection(opt, verbose=False)
684
+ IoUmAP_5 = sum(IoUmAP) / len(IoUmAP)
685
+
686
+ return cls_loss, reg_loss, tot_loss, IoUmAP_5
687
+
688
+ def train(opt):
689
+ writer = SummaryWriter()
690
+ model = MYNET(opt).cuda()
691
+
692
+ rest_of_model_params = [param for name, param in model.named_parameters() if "history_unit" not in name]
693
+ optimizer = optim.Adam([{'params': model.history_unit.parameters(), 'lr': 1e-6}, {'params': rest_of_model_params}], lr=opt["lr"], weight_decay=opt["weight_decay"])
694
+ scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt["lr_step"])
695
+
696
+ train_dataset = VideoDataSet(opt, subset="train")
697
+ test_dataset = VideoDataSet(opt, subset=opt['inference_subset'])
698
+
699
+ warmup = False
700
+
701
+ for n_epoch in range(opt['epoch']):
702
+ if n_epoch >= 1:
703
+ warmup = False
704
+
705
+ n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip = train_one_epoch(opt, model, train_dataset, optimizer, warmup)
706
+
707
+ writer.add_scalars('data/cost', {'train': epoch_cost / (n_iter + 1)}, n_epoch)
708
+ print("training loss(epoch %d): %.03f, cls - %f, reg - %f, snip - %f, lr - %f" % (n_epoch,
709
+ epoch_cost / (n_iter + 1),
710
+ epoch_cost_cls / (n_iter + 1),
711
+ epoch_cost_reg / (n_iter + 1),
712
+ epoch_cost_snip / (n_iter + 1),
713
+ optimizer.param_groups[-1]["lr"]))
714
+
715
+ scheduler.step()
716
+ model.eval()
717
+
718
+ cls_loss, reg_loss, tot_loss, IoUmAP_5 = eval_one_epoch(opt, model, test_dataset)
719
+
720
+ writer.add_scalars('data/mAP', {'test': IoUmAP_5}, n_epoch)
721
+ print("testing loss(epoch %d): %.03f, cls - %f, reg - %f, mAP Avg - %f" % (n_epoch, tot_loss, cls_loss, reg_loss, IoUmAP_5))
722
+
723
+ state = {'epoch': n_epoch + 1, 'state_dict': model.state_dict()}
724
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_checkpoint_" + str(n_epoch + 1) + ".pth.tar")
725
+ if IoUmAP_5 > model.best_map:
726
+ model.best_map = IoUmAP_5
727
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_ckp_best.pth.tar")
728
+
729
+ model.train()
730
+
731
+ writer.close()
732
+ return model.best_map
733
+
734
+ def eval_frame(opt, model, dataset):
735
+ test_loader = torch.utils.data.DataLoader(dataset,
736
+ batch_size=opt['batch_size'], shuffle=False,
737
+ num_workers=0, pin_memory=True, drop_last=False)
738
+
739
+ labels_cls = {}
740
+ labels_reg = {}
741
+ output_cls = {}
742
+ output_reg = {}
743
+ for video_name in dataset.video_list:
744
+ labels_cls[video_name] = []
745
+ labels_reg[video_name] = []
746
+ output_cls[video_name] = []
747
+ output_reg[video_name] = []
748
+
749
+ start_time = time.time()
750
+ total_frames = 0
751
+ epoch_cost = 0
752
+ epoch_cost_cls = 0
753
+ epoch_cost_reg = 0
754
+
755
+ for n_iter, (input_data, cls_label, reg_label, _) in enumerate(tqdm(test_loader)):
756
+ act_cls, act_reg, _ = model(input_data.float().cuda())
757
+ cost_reg = 0
758
+ cost_cls = 0
759
+
760
+ loss = cls_loss_func(cls_label, act_cls)
761
+ cost_cls = loss
762
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
763
+
764
+ loss = regress_loss_func(reg_label, act_reg)
765
+ cost_reg = loss
766
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
767
+
768
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg
769
+ epoch_cost += cost.detach().cpu().numpy()
770
+
771
+ act_cls = torch.softmax(act_cls, dim=-1)
772
+
773
+ total_frames += input_data.size(0)
774
+
775
+ for b in range(0, input_data.size(0)):
776
+ video_name, st, ed, data_idx = dataset.inputs[n_iter * opt['batch_size'] + b]
777
+ output_cls[video_name] += [act_cls[b, :].detach().cpu().numpy()]
778
+ output_reg[video_name] += [act_reg[b, :].detach().cpu().numpy()]
779
+ labels_cls[video_name] += [cls_label[b, :].numpy()]
780
+ labels_reg[video_name] += [reg_label[b, :].numpy()]
781
+
782
+ end_time = time.time()
783
+ working_time = end_time - start_time
784
+
785
+ for video_name in dataset.video_list:
786
+ labels_cls[video_name] = np.stack(labels_cls[video_name], axis=0)
787
+ labels_reg[video_name] = np.stack(labels_reg[video_name], axis=0)
788
+ output_cls[video_name] = np.stack(output_cls[video_name], axis=0)
789
+ output_reg[video_name] = np.stack(output_reg[video_name], axis=0)
790
+
791
+ cls_loss = epoch_cost_cls / (n_iter + 1)
792
+ reg_loss = epoch_cost_reg / (n_iter + 1)
793
+ tot_loss = epoch_cost / (n_iter + 1)
794
+
795
+ return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames
796
+
797
+ def eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
798
+ result_dict = {}
799
+ proposal_dict = []
800
+
801
+ num_class = opt["num_of_class"]
802
+ unit_size = opt['segment_size']
803
+ threshold = opt['threshold']
804
+ anchors = opt['anchors']
805
+
806
+ for video_name in dataset.video_list:
807
+ duration = dataset.video_len[video_name]
808
+ video_time = float(dataset.video_dict[video_name]["duration"])
809
+ frame_to_time = 100.0 * video_time / duration
810
+
811
+ for idx in range(0, duration):
812
+ cls_anc = output_cls[video_name][idx]
813
+ reg_anc = output_reg[video_name][idx]
814
+
815
+ proposal_anc_dict = []
816
+ for anc_idx in range(0, len(anchors)):
817
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
818
+
819
+ if len(cls) == 0:
820
+ continue
821
+
822
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
823
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
824
+ st = ed - length
825
+
826
+ for cidx in range(0, len(cls)):
827
+ label = cls[cidx]
828
+ tmp_dict = {}
829
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
830
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
831
+ tmp_dict["label"] = dataset.label_name[label]
832
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
833
+ proposal_anc_dict.append(tmp_dict)
834
+
835
+ proposal_dict += proposal_anc_dict
836
+
837
+ proposal_dict = non_max_suppression(proposal_dict, overlapThresh=opt['soft_nms'])
838
+ result_dict[video_name] = proposal_dict
839
+ proposal_dict = []
840
+
841
+ return result_dict
842
+
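Note: the frame-to-seconds conversion used when emitting proposals above reduces to a simple proportion; the factor of 100 carried by frame_to_time cancels out:

    def frame_to_seconds(frame_idx, video_seconds, num_feature_frames):
        # equivalent to frame_idx * frame_to_time / 100.0 in eval_map_nms
        return frame_idx * video_seconds / num_feature_frames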
843
+ def eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
844
+ model = SuppressNet(opt).cuda()
845
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
846
+ base_dict = checkpoint['state_dict']
847
+ model.load_state_dict(base_dict)
848
+ model.eval()
849
+
850
+ result_dict = {}
851
+ proposal_dict = []
852
+
853
+ num_class = opt["num_of_class"]
854
+ unit_size = opt['segment_size']
855
+ threshold = opt['threshold']
856
+ anchors = opt['anchors']
857
+
858
+ for video_name in dataset.video_list:
859
+ duration = dataset.video_len[video_name]
860
+ video_time = float(dataset.video_dict[video_name]["duration"])
861
+ frame_to_time = 100.0 * video_time / duration
862
+ conf_queue = torch.zeros((unit_size, num_class - 1))
863
+
864
+ for idx in range(0, duration):
865
+ cls_anc = output_cls[video_name][idx]
866
+ reg_anc = output_reg[video_name][idx]
867
+
868
+ proposal_anc_dict = []
869
+ for anc_idx in range(0, len(anchors)):
870
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
871
+
872
+ if len(cls) == 0:
873
+ continue
874
+
875
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
876
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
877
+ st = ed - length
878
+
879
+ for cidx in range(0, len(cls)):
880
+ label = cls[cidx]
881
+ tmp_dict = {}
882
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
883
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
884
+ tmp_dict["label"] = dataset.label_name[label]
885
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
886
+ proposal_anc_dict.append(tmp_dict)
887
+
888
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
889
+
890
+ conf_queue[:-1, :] = conf_queue[1:, :].clone()
891
+ conf_queue[-1, :] = 0
892
+ for proposal in proposal_anc_dict:
893
+ cls_idx = dataset.label_name.index(proposal['label'])
894
+ conf_queue[-1, cls_idx] = proposal["score"]
895
+
896
+ minput = conf_queue.unsqueeze(0)
897
+ suppress_conf = model(minput.cuda())
898
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
899
+
900
+ for cls in range(0, num_class - 1):
901
+ if suppress_conf[cls] > opt['sup_threshold']:
902
+ for proposal in proposal_anc_dict:
903
+ if proposal['label'] == dataset.label_name[cls]:
904
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
905
+ proposal_dict.append(proposal)
906
+
907
+ result_dict[video_name] = proposal_dict
908
+ proposal_dict = []
909
+
910
+ return result_dict
911
+
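Note: a compact sketch of the sliding confidence queue maintained in eval_map_supnet above (one row per recent frame, one column per action class). The sizes here are illustrative; the real ones come from opt['segment_size'] and opt['num_of_class']:

    import torch

    unit_size, num_class = 16, 22                     # illustrative, not the config values
    conf_queue = torch.zeros(unit_size, num_class - 1)

    def push_frame_scores(conf_queue, frame_scores):
        # shift history up by one frame and append the newest per-class scores
        conf_queue[:-1, :] = conf_queue[1:, :].clone()
        conf_queue[-1, :] = frame_scores              # 0 for classes with no surviving proposal
        return conf_queue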
912
+ def test_frame(opt, video_name=None):
913
+ model = MYNET(opt).cuda()
914
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
915
+ base_dict = checkpoint['state_dict']
916
+ model.load_state_dict(base_dict)
917
+ model.eval()
918
+
919
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
920
+ outfile = h5py.File(opt['frame_result_file'].format(opt['exp']), 'w')
921
+
922
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
923
+
924
+ print("testing loss: %f, cls_loss: %f, reg_loss: %f" % (tot_loss, cls_loss, reg_loss))
925
+
926
+ for video_name in dataset.video_list:
927
+ o_cls = output_cls[video_name]
928
+ o_reg = output_reg[video_name]
929
+ l_cls = labels_cls[video_name]
930
+ l_reg = labels_reg[video_name]
931
+
932
+ dset_predcls = outfile.create_dataset(video_name + '/pred_cls', o_cls.shape, maxshape=o_cls.shape, chunks=True, dtype=np.float32)
933
+ dset_predcls[:, :] = o_cls[:, :]
934
+ dset_predreg = outfile.create_dataset(video_name + '/pred_reg', o_reg.shape, maxshape=o_reg.shape, chunks=True, dtype=np.float32)
935
+ dset_predreg[:, :] = o_reg[:, :]
936
+ dset_labelcls = outfile.create_dataset(video_name + '/label_cls', l_cls.shape, maxshape=l_cls.shape, chunks=True, dtype=np.float32)
937
+ dset_labelcls[:, :] = l_cls[:, :]
938
+ dset_labelreg = outfile.create_dataset(video_name + '/label_reg', l_reg.shape, maxshape=l_reg.shape, chunks=True, dtype=np.float32)
939
+ dset_labelreg[:, :] = l_reg[:, :]
940
+ outfile.close()
941
+
942
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
943
+ return cls_loss, reg_loss, tot_loss
944
+
945
+ def patch_attention(m):
946
+ forward_orig = m.forward
947
+
948
+ def wrap(*args, **kwargs):
949
+ kwargs["need_weights"] = True
950
+ kwargs["average_attn_weights"] = False
951
+ return forward_orig(*args, **kwargs)
952
+
953
+ m.forward = wrap
954
+
955
+ class SaveOutput:
956
+ def __init__(self):
957
+ self.outputs = []
958
+
959
+ def __call__(self, module, module_in, module_out):
960
+ self.outputs.append(module_out[1])
961
+
962
+ def clear(self):
963
+ self.outputs = []
964
+
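Note: a usage sketch for patch_attention and SaveOutput above, assuming the model contains torch.nn.MultiheadAttention submodules (the attribute layout is hypothetical); after a forward pass, saver.outputs holds one attention-weight tensor per hooked layer:

    import torch

    def hook_attention_maps(model):
        saver = SaveOutput()
        for module in model.modules():
            if isinstance(module, torch.nn.MultiheadAttention):
                patch_attention(module)
                module.register_forward_hook(saver)
        return saver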
965
+ def test(opt, video_name=None):
966
+ model = MYNET(opt).cuda()
967
+ checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "_ckp_best.pth.tar")
968
+ base_dict = checkpoint['state_dict']
969
+ model.load_state_dict(base_dict)
970
+ model.eval()
971
+
972
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
973
+
974
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
975
+
976
+ if opt["pptype"] == "nms":
977
+ result_dict = eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
978
+ if opt["pptype"] == "net":
979
+ result_dict = eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
980
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
981
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
982
+ json.dump(output_dict, outfile, indent=2)
983
+ outfile.close()
984
+
985
+ mAP = evaluation_detection(opt)
986
+
987
+ # Compare predicted and ground truth action lengths
988
+ if video_name:
989
+ print("\nComparing Predicted and Ground Truth Action Lengths for Video:", video_name)
990
+ with open(opt["video_anno"].format(opt["split"]), 'r') as f:
991
+ anno_data = json.load(f)
992
+ gt_annotations = anno_data['database'][video_name]['annotations']
993
+ duration = anno_data['database'][video_name]['duration']
994
+
995
+ gt_segments = []
996
+ for anno in gt_annotations:
997
+ start, end = anno['segment']
998
+ label = anno['label']
999
+ duration_seg = end - start
1000
+ gt_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration_seg})
1001
+
1002
+ pred_segments = []
1003
+ for pred in result_dict[video_name]:
1004
+ start, end = pred['segment']
1005
+ label = pred['label']
1006
+ score = pred['score']
1007
+ duration_seg = end - start
1008
+ pred_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration_seg, 'score': score})
1009
+
1010
+ # Print comparison table
1011
+ matches = []
1012
+ iou_threshold = VIS_CONFIG['iou_threshold']
1013
+ used_gt_indices = set()
1014
+ for pred in pred_segments:
1015
+ best_iou = 0
1016
+ best_gt_idx = None
1017
+ for gt_idx, gt in enumerate(gt_segments):
1018
+ if gt_idx in used_gt_indices:
1019
+ continue
1020
+ iou = calc_iou([pred['end'], pred['duration']], [gt['end'], gt['duration']])
1021
+ if iou > best_iou and iou >= iou_threshold:
1022
+ best_iou = iou
1023
+ best_gt_idx = gt_idx
1024
+ if best_gt_idx is not None:
1025
+ matches.append({
1026
+ 'pred': pred,
1027
+ 'gt': gt_segments[best_gt_idx],
1028
+ 'iou': best_iou
1029
+ })
1030
+ used_gt_indices.add(best_gt_idx)
1031
+ else:
1032
+ matches.append({'pred': pred, 'gt': None, 'iou': 0})
1033
+
1034
+ for gt_idx, gt in enumerate(gt_segments):
1035
+ if gt_idx not in used_gt_indices:
1036
+ matches.append({'pred': None, 'gt': gt, 'iou': 0})
1037
+
1038
+ print("\n{:<20} {:<30} {:<30} {:<15} {:<10}".format(
1039
+ "Action Label", "Predicted Segment (s)", "Ground Truth Segment (s)", "Duration Diff (s)", "IoU"))
1040
+ print("-" * 105)
1041
+ for match in matches:
1042
+ pred = match['pred']
1043
+ gt = match['gt']
1044
+ iou = match['iou']
1045
+ if pred and gt:
1046
+ label = pred['label'] if pred['label'] == gt['label'] else f"{pred['label']} (GT: {gt['label']})"
1047
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
1048
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
1049
+ duration_diff = pred['duration'] - gt['duration']
1050
+ print("{:<20} {:<30} {:<30} {:<15.2f} {:<10.2f}".format(
1051
+ label, pred_str, gt_str, duration_diff, iou))
1052
+ elif pred:
1053
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
1054
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
1055
+ pred['label'], pred_str, "None", "N/A", iou))
1056
+ elif gt:
1057
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
1058
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
1059
+ gt['label'], "None", gt_str, "N/A", iou))
1060
+
1061
+ # Summarize
1062
+ matched_count = sum(1 for m in matches if m['pred'] and m['gt'])
1063
+ avg_duration_diff = np.mean([m['pred']['duration'] - m['gt']['duration'] for m in matches if m['pred'] and m['gt']]) if matched_count > 0 else 0
1064
+ avg_iou = np.mean([m['iou'] for m in matches if m['iou'] > 0]) if any(m['iou'] > 0 for m in matches) else 0
1065
+ print(f"\nSummary:")
1066
+ print(f"- Total Predictions: {len(pred_segments)}")
1067
+ print(f"- Total Ground Truth: {len(gt_segments)}")
1068
+ print(f"- Matched Segments: {matched_count}")
1069
+ print(f"- Average Duration Difference (Matched): {avg_duration_diff:.2f}s")
1070
+ print(f"- Average IoU (Matched): {avg_iou:.2f}")
1071
+
1072
+ # Generate static visualization
1073
+ video_path = opt.get('video_path', '')
1074
+ if os.path.exists(video_path):
1075
+ visualize_action_lengths(
1076
+ video_id=video_name,
1077
+ pred_segments=pred_segments,
1078
+ gt_segments=gt_segments,
1079
+ video_path=video_path,
1080
+ duration=duration
1081
+ )
1082
+ # Generate annotated video
1083
+ annotate_video_with_actions(
1084
+ video_id=video_name,
1085
+ pred_segments=pred_segments,
1086
+ gt_segments=gt_segments,
1087
+ video_path=video_path
1088
+ )
1089
+ else:
1090
+ print(f"Warning: Video path {video_path} not found. Skipping visualization and video annotation.")
1091
+
1092
+ return mAP
1093
+
1094
+ def test_online(opt, video_name=None):
1095
+ model = MYNET(opt).cuda()
1096
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
1097
+ base_dict = checkpoint['state_dict']
1098
+ model.load_state_dict(base_dict)
1099
+ model.eval()
1100
+
1101
+ sup_model = SuppressNet(opt).cuda()
1102
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
1103
+ base_dict = checkpoint['state_dict']
1104
+ sup_model.load_state_dict(base_dict)
1105
+ sup_model.eval()
1106
+
1107
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
1108
+ test_loader = torch.utils.data.DataLoader(dataset,
1109
+ batch_size=1, shuffle=False,
1110
+ num_workers=0, pin_memory=True, drop_last=False)
1111
+
1112
+ result_dict = {}
1113
+ proposal_dict = []
1114
+
1115
+ num_class = opt["num_of_class"]
1116
+ unit_size = opt['segment_size']
1117
+ threshold = opt['threshold']
1118
+ anchors = opt['anchors']
1119
+
1120
+ start_time = time.time()
1121
+ total_frames = 0
1122
+
1123
+ for video_name in dataset.video_list:
1124
+ input_queue = torch.zeros((unit_size, opt['feat_dim']))
1125
+ sup_queue = torch.zeros((unit_size, num_class - 1))
1126
+
1127
+ duration = dataset.video_len[video_name]
1128
+ video_time = float(dataset.video_dict[video_name]["duration"])
1129
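+ # `duration` is the number of feature frames; frame_to_time / 100 converts a frame index
+ # to seconds, i.e. t = idx * video_time / duration (the factor of 100 cancels below).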
+ frame_to_time = 100.0 * video_time / duration
1130
+
1131
+ for idx in range(0, duration):
1132
+ total_frames += 1
1133
+ input_queue[:-1, :] = input_queue[1:, :].clone()
1134
+ input_queue[-1:, :] = dataset._get_base_data(video_name, idx, idx + 1)
1135
+
1136
+ minput = input_queue.unsqueeze(0)
1137
+ act_cls, act_reg, _ = model(minput.cuda())
1138
+ act_cls = torch.softmax(act_cls, dim=-1)
1139
+
1140
+ cls_anc = act_cls.squeeze(0).detach().cpu().numpy()
1141
+ reg_anc = act_reg.squeeze(0).detach().cpu().numpy()
1142
+
1143
+ proposal_anc_dict = []
1144
+ for anc_idx in range(0, len(anchors)):
1145
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
1146
+
1147
+ if len(cls) == 0:
1148
+ continue
1149
+
1150
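+ # Decode the anchor regression output: reg[0] is the end offset in anchor units and
+ # reg[1] is a log-scale length, so end = idx + anchor * reg[0],
+ # length = anchor * exp(reg[1]), and the start follows as end - length.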
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
1151
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
1152
+ st = ed - length
1153
+
1154
+ for cidx in range(0, len(cls)):
1155
+ label = cls[cidx]
1156
+ tmp_dict = {}
1157
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
1158
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
1159
+ tmp_dict["label"] = dataset.label_name[label]
1160
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
1161
+ proposal_anc_dict.append(tmp_dict)
1162
+
1163
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
1164
+
1165
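+ # Sliding per-class confidence queue over the last `segment_size` frames: shift one step,
+ # write the newest proposal scores into the last row, and let SuppressNet decide which
+ # classes should emit a final proposal at this frame.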
+ sup_queue[:-1, :] = sup_queue[1:, :].clone()
1166
+ sup_queue[-1, :] = 0
1167
+ for proposal in proposal_anc_dict:
1168
+ cls_idx = dataset.label_name.index(proposal['label'])
1169
+ sup_queue[-1, cls_idx] = proposal["score"]
1170
+
1171
+ minput = sup_queue.unsqueeze(0)
1172
+ suppress_conf = sup_model(minput.cuda())
1173
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
1174
+
1175
+ for cls in range(0, num_class - 1):
1176
+ if suppress_conf[cls] > opt['sup_threshold']:
1177
+ for proposal in proposal_anc_dict:
1178
+ if proposal['label'] == dataset.label_name[cls]:
1179
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
1180
+ proposal_dict.append(proposal)
1181
+
1182
+ result_dict[video_name] = proposal_dict
1183
+ proposal_dict = []
1184
+
1185
+ end_time = time.time()
1186
+ working_time = end_time - start_time
1187
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
1188
+
1189
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
1190
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
1191
+ json.dump(output_dict, outfile, indent=2)
1192
+ outfile.close()
1193
+
1194
+ mAP = evaluation_detection(opt)
1195
+ return mAP
1196
+
1197
+ def main(opt, video_name=None):
1198
+ max_perf = 0
1199
+ if not video_name and 'video_name' in opt:
1200
+ video_name = opt['video_name']
1201
+
1202
+ if opt['mode'] == 'train':
1203
+ max_perf = train(opt)
1204
+ if opt['mode'] == 'test':
1205
+ max_perf = test(opt, video_name=video_name)
1206
+ if opt['mode'] == 'test_frame':
1207
+ max_perf = test_frame(opt, video_name=video_name)
1208
+ if opt['mode'] == 'test_online':
1209
+ max_perf = test_online(opt, video_name=video_name)
1210
+ if opt['mode'] == 'eval':
1211
+ max_perf = evaluation_detection(opt)
1212
+
1213
+ return max_perf
1214
+
1215
+ if __name__ == '__main__':
1216
+ opt = opts.parse_opt()
1217
+ opt = vars(opt)
1218
+ if not os.path.exists(opt["checkpoint_path"]):
1219
+ os.makedirs(opt["checkpoint_path"])
1220
+ opt_file = open(opt["checkpoint_path"] + "/" + opt["exp"] + "_opts.json", "w")
1221
+ json.dump(opt, opt_file)
1222
+ opt_file.close()
1223
+
1224
+ if opt['seed'] >= 0:
1225
+ seed = opt['seed']
1226
+ torch.manual_seed(seed)
1227
+ np.random.seed(seed)
1228
+
1229
+ opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]
1230
+
1231
+ video_name = opt.get('video_name', None)
1232
+ main(opt, video_name=video_name)
1233
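+ # If --wterm is set, busy-wait after finishing; presumably this keeps the terminal/window
+ # open until the process is killed manually.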
+ while opt['wterm']:
1234
+ pass
iou_utils.py ADDED
@@ -0,0 +1,65 @@
1
+ import numpy as np
2
+
3
+ def non_max_suppression(proposals, overlapThresh=0.3):
4
+ # if there are no intervals, return an empty list
5
+ if len(proposals) == 0:
6
+ return []
7
+
8
+ # initialize the list of picked indexes
9
+ pick = []
10
+
11
+ sorted_proposal = sorted(proposals, key=lambda proposal: proposal['score'], reverse=True)
12
+ idx = 0
13
+ total_proposal = len(sorted_proposal)
14
+ while idx < total_proposal:
15
+ proposal = sorted_proposal[idx]
16
+ st = proposal['segment'][0]
17
+ ed = proposal['segment'][1]
18
+ label = proposal['label']
19
+
20
+ delete_item = []
21
+ for j in range(idx+1, total_proposal):
22
+ target_proposal = sorted_proposal[j]
23
+ target_st = target_proposal['segment'][0]
24
+ target_ed = target_proposal['segment'][1]
25
+ target_label = target_proposal['label']
26
+
27
+ if(label == target_label):
28
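+ # [sst, led] is the union span (min start, max end) and [lst, sed] the intersection span
+ # (max start, min end); temporal IoU = intersection / union, with the union length
+ # clamped to at least 1 to avoid division by zero.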
+ sst = np.minimum(st, target_st)
29
+ led = np.maximum(ed, target_ed)
30
+ lst = np.maximum(st, target_st)
31
+ sed = np.minimum(ed, target_ed)
32
+
33
+ iou = (sed-lst) / max(led-sst,1)
34
+ if(iou > overlapThresh):
35
+ delete_item.append(target_proposal)
36
+
37
+ for item in delete_item:
38
+ sorted_proposal.remove(item)
39
+ total_proposal = len(sorted_proposal)
40
+ idx += 1
41
+
42
+ return sorted_proposal
43
+
44
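+ # Example of the expected proposal format (a minimal sketch; the label and scores are made up):
+ #   proposals = [
+ #       {'segment': [1.0, 3.5], 'score': 0.9, 'label': 'open_fridge'},
+ #       {'segment': [1.2, 3.8], 'score': 0.4, 'label': 'open_fridge'},
+ #   ]
+ #   kept = non_max_suppression(proposals, overlapThresh=0.3)
+ #   # only the higher-scoring of the two overlapping same-label proposals survives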
+
45
+ def check_overlap_proposal(proposal_list, new_proposal, overlapThresh=0.3):
46
+ for proposal in proposal_list:
47
+ st = proposal['segment'][0]
48
+ ed = proposal['segment'][1]
49
+ label = proposal['label']
50
+
51
+ new_st = new_proposal['segment'][0]
52
+ new_ed = new_proposal['segment'][1]
53
+ new_label = new_proposal['label']
54
+
55
+ if(label == new_label):
56
+ sst = np.minimum(st, new_st)
57
+ led = np.maximum(ed, new_ed)
58
+ lst = np.maximum(st, new_st)
59
+ sed = np.minimum(ed, new_ed)
60
+
61
+ iou = (sed-lst) / max(led-sst,1)
62
+ if(iou > overlapThresh):
63
+ return proposal
64
+
65
+ return None
loss_func.py ADDED
@@ -0,0 +1,374 @@
1
+ import torch
2
+ import numpy as np
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import torch.distributed as dist
6
+ from functools import partial
7
+
8
+ class MultiCrossEntropyLoss(nn.Module):
9
+ def __init__(self, focal=False, weight=None, reduce=True):
10
+ super(MultiCrossEntropyLoss, self).__init__()
11
+ self.num_classes = 23
12
+ self.focal = focal
13
+ self.weight= weight
14
+ self.reduce = reduce
15
+ self.gamma_ = torch.zeros(self.num_classes).cuda() + 0.025
16
+ self.gamma_f = 0.05
17
+
18
+ self.register_buffer('pos_grad', torch.zeros(self.num_classes-1).cuda())
19
+ self.register_buffer('neg_grad', torch.zeros(self.num_classes-1).cuda())
20
+ self.register_buffer('pos_neg', torch.ones(self.num_classes-1).cuda())
21
+
22
+ def forward(self, input, target):
23
+ target_sum = torch.sum(target, dim=1)
24
+ target_div = torch.where(target_sum != 0, target_sum, torch.ones_like(target_sum)).unsqueeze(1)
25
+ target = target/target_div
26
+ logsoftmax = nn.LogSoftmax(dim=1).to(input.device)
27
+ gamma = self.gamma_.clone()
28
+ gamma[:-1] = gamma[:-1] + self.gamma_f * (1 - self.pos_neg)
29
+
30
+ if not self.focal:
31
+ if self.weight is None:
32
+ output = torch.sum(-target * logsoftmax(input), 1)
33
+ else:
34
+ output = torch.sum(-target * logsoftmax(input) /self.weight, 1)
35
+ else:
36
+ softmax = nn.Softmax(dim=1).to(input.device)
37
+ p = softmax(input)
38
+
39
+ output = torch.sum(-target * (1 - p)**gamma * logsoftmax(input), 1)
40
+
41
+
42
+ if self.reduce:
43
+ return torch.mean(output)
44
+ else:
45
+ return output
46
+
47
+
48
+ def map_func(self, x, s):
49
+ min_val = torch.min(x)
50
+ max_val = torch.max(x)
51
+ mu = torch.mean(x)
52
+ x = (x - min_val) / (max_val - min_val)
53
+ return 1 / (1 + torch.exp(-s * (x - mu)))
54
+
55
+ def collect_grad(self, target, grad):
56
+ grad = torch.abs(grad.reshape(-1, grad.shape[-1])).cuda()
57
+ target = target.reshape(-1, target.shape[-1]).cuda()
58
+ pos_grad = torch.sum(grad * target, dim=0)[:-1]
59
+ neg_grad = torch.sum(grad * (1 - target), dim=0)[:-1]
60
+ self.pos_grad += pos_grad
61
+ self.neg_grad += neg_grad
62
+ self.pos_neg = torch.clamp(self.pos_grad / (self.neg_grad + 1e-10), min=0, max=1)
63
+ self.pos_neg = self.map_func(self.pos_neg, 1)
64
+
65
+
66
+ def cls_loss_func(y,output, use_focal=False, weight=None, reduce=True):
67
+ input_size=y.size()
68
+ y = y.float().cuda()
69
+ if weight is not None:
70
+ weight = weight.cuda()
71
+ loss_func = MultiCrossEntropyLoss(focal=True, weight=weight, reduce=reduce)
72
+
73
+ y=y.reshape(-1,y.size(-1))
74
+ output=output.reshape(-1,output.size(-1))
75
+ loss = loss_func(output,y)
76
+
77
+ if not reduce:
78
+ loss = loss.reshape(input_size[:-1])
79
+
80
+ return loss
81
+
82
+
83
+ def cls_loss_func_(loss_func, y,output, use_focal=False, weight=None, reduce=True):
84
+ input_size=y.size()
85
+ y = y.float().cuda()
86
+ if weight is not None:
87
+ weight = weight.cuda()
88
+
89
+ y=y.reshape(-1,y.size(-1))
90
+ output=output.reshape(-1,output.size(-1))
91
+ loss = loss_func(output,y)
92
+
93
+ if not reduce:
94
+ loss = loss.reshape(input_size[:-1])
95
+
96
+ return loss
97
+
98
+ def regress_loss_func(y,output):
99
+ y = y.float().cuda()
100
+ y=y.reshape(-1,y.size(-1))
101
+ output=output.reshape(-1,output.size(-1))
102
+
103
+ bgmask= y[:,1] < -1e2
104
+
105
+ fg_logits = output[~bgmask]
106
+ bg_logits = output[bgmask]
107
+
108
+ fg_target = y[~bgmask]
109
+ bg_target = y[bgmask]
110
+
111
+ loss = nn.functional.l1_loss(fg_logits,fg_target)
112
+
113
+ if(loss.isnan()):
114
+ return torch.tensor([0.0], requires_grad=True).cuda()
115
+ return loss
116
+
117
+
118
+ def suppress_loss_func(y,output):
119
+ y = y.float().cuda()
120
+ y=y.reshape(-1,y.size(-1))
121
+ output=output.reshape(-1,output.size(-1))
122
+
123
+ loss = nn.functional.binary_cross_entropy(output,y)
124
+
125
+ return loss
126
+
127
+
128
+ # import torch
129
+ # import numpy as np
130
+ # import torch.nn as nn
131
+ # import torch.nn.functional as F
132
+ # import torch.distributed as dist
133
+ # from functools import partial
134
+
135
+ # class MultiCrossEntropyLoss(nn.Module):
136
+ # def __init__(self, focal=False, weight=None, reduce=True):
137
+ # super(MultiCrossEntropyLoss, self).__init__()
138
+ # self.num_classes = 23
139
+ # self.focal = focal
140
+ # self.weight= weight
141
+ # self.reduce = reduce
142
+ # self.gamma_ = torch.zeros(self.num_classes).cuda() + 0.025
143
+ # self.gamma_f = 0.05
144
+
145
+ # self.register_buffer('pos_grad', torch.zeros(self.num_classes-1).cuda())
146
+ # self.register_buffer('neg_grad', torch.zeros(self.num_classes-1).cuda())
147
+ # self.register_buffer('pos_neg', torch.ones(self.num_classes-1).cuda())
148
+
149
+ # def forward(self, input, target):
150
+ # target_sum = torch.sum(target, dim=1)
151
+ # target_div = torch.where(target_sum != 0, target_sum, torch.ones_like(target_sum)).unsqueeze(1)
152
+ # target = target/target_div
153
+ # logsoftmax = nn.LogSoftmax(dim=1).to(input.device)
154
+ # gamma = self.gamma_.clone()
155
+ # gamma[:-1] = gamma[:-1] + self.gamma_f * (1 - self.pos_neg)
156
+
157
+ # if not self.focal:
158
+ # if self.weight is None:
159
+ # output = torch.sum(-target * logsoftmax(input), 1)
160
+ # else:
161
+ # output = torch.sum(-target * logsoftmax(input) /self.weight, 1)
162
+ # else:
163
+ # softmax = nn.Softmax(dim=1).to(input.device)
164
+ # p = softmax(input)
165
+
166
+ # output = torch.sum(-target * (1 - p)**gamma * logsoftmax(input), 1)
167
+
168
+
169
+ # if self.reduce:
170
+ # return torch.mean(output)
171
+ # else:
172
+ # return output
173
+
174
+
175
+ # def map_func(self, x, s):
176
+ # min_val = torch.min(x)
177
+ # max_val = torch.max(x)
178
+ # mu = torch.mean(x)
179
+ # x = (x - min_val) / (max_val - min_val)
180
+ # return 1 / (1 + torch.exp(-s * (x - mu)))
181
+
182
+ # def collect_grad(self, target, grad):
183
+ # grad = torch.abs(grad.reshape(-1, grad.shape[-1])).cuda()
184
+ # target = target.reshape(-1, target.shape[-1]).cuda()
185
+ # pos_grad = torch.sum(grad * target, dim=0)[:-1]
186
+ # neg_grad = torch.sum(grad * (1 - target), dim=0)[:-1]
187
+ # self.pos_grad += pos_grad
188
+ # self.neg_grad += neg_grad
189
+ # self.pos_neg = torch.clamp(self.pos_grad / (self.neg_grad + 1e-10), min=0, max=1)
190
+ # self.pos_neg = self.map_func(self.pos_neg, 1)
191
+
192
+
193
+ # def cls_loss_func(y,output, use_focal=False, weight=None, reduce=True):
194
+ # input_size=y.size()
195
+ # y = y.float().cuda()
196
+ # if weight is not None:
197
+ # weight = weight.cuda()
198
+ # loss_func = MultiCrossEntropyLoss(focal=True, weight=weight, reduce=reduce)
199
+
200
+ # y=y.reshape(-1,y.size(-1))
201
+ # output=output.reshape(-1,output.size(-1))
202
+ # loss = loss_func(output,y)
203
+
204
+ # if not reduce:
205
+ # loss = loss.reshape(input_size[:-1])
206
+
207
+ # return loss
208
+
209
+
210
+ # def cls_loss_func_(loss_func, y,output, use_focal=False, weight=None, reduce=True):
211
+ # input_size=y.size()
212
+ # y = y.float().cuda()
213
+ # if weight is not None:
214
+ # weight = weight.cuda()
215
+
216
+ # y=y.reshape(-1,y.size(-1))
217
+ # output=output.reshape(-1,output.size(-1))
218
+ # loss = loss_func(output,y)
219
+
220
+ # if not reduce:
221
+ # loss = loss.reshape(input_size[:-1])
222
+
223
+ # return loss
224
+
225
+ # def regress_loss_func(y,output):
226
+ # y = y.float().cuda()
227
+ # y=y.reshape(-1,y.size(-1))
228
+ # output=output.reshape(-1,output.size(-1))
229
+
230
+ # bgmask= y[:,1] < -1e2
231
+
232
+ # fg_logits = output[~bgmask]
233
+ # bg_logits = output[bgmask]
234
+
235
+ # fg_target = y[~bgmask]
236
+ # bg_target = y[bgmask]
237
+
238
+ # loss = nn.functional.l1_loss(fg_logits,fg_target)
239
+
240
+ # if(loss.isnan()):
241
+ # return torch.tensor([0.0], requires_grad=True).cuda()
242
+ # return loss
243
+
244
+
245
+ # def suppress_loss_func(y,output):
246
+ # y = y.float().cuda()
247
+ # y=y.reshape(-1,y.size(-1))
248
+ # output=output.reshape(-1,output.size(-1))
249
+
250
+ # loss = nn.functional.binary_cross_entropy(output,y)
251
+
252
+ # return loss
253
+
254
+
255
+
256
+ # import torch
257
+ # import numpy as np
258
+ # import torch.nn as nn
259
+ # import torch.nn.functional as F
260
+ # import torch.distributed as dist
261
+ # from functools import partial
262
+
263
+ # class MultiCrossEntropyLoss(nn.Module):
264
+ # def __init__(self, num_classes, focal=False, weight=None, reduce=True):
265
+ # super(MultiCrossEntropyLoss, self).__init__()
266
+ # self.num_classes = num_classes # Use the provided num_classes
267
+ # self.focal = focal
268
+ # self.weight = weight
269
+ # self.reduce = reduce
270
+ # self.gamma_ = torch.zeros(self.num_classes).cuda() + 0.025
271
+ # self.gamma_f = 0.05
272
+
273
+ # self.register_buffer('pos_grad', torch.zeros(self.num_classes-1).cuda())
274
+ # self.register_buffer('neg_grad', torch.zeros(self.num_classes-1).cuda())
275
+ # self.register_buffer('pos_neg', torch.ones(self.num_classes-1).cuda())
276
+
277
+ # def forward(self, input, target):
278
+ # target_sum = torch.sum(target, dim=1)
279
+ # target_div = torch.where(target_sum != 0, target_sum, torch.ones_like(target_sum)).unsqueeze(1)
280
+ # target = target / target_div
281
+ # logsoftmax = nn.LogSoftmax(dim=1).to(input.device)
282
+ # gamma = self.gamma_.clone()
283
+ # gamma[:-1] = gamma[:-1] + self.gamma_f * (1 - self.pos_neg)
284
+
285
+ # if not self.focal:
286
+ # if self.weight is None:
287
+ # output = torch.sum(-target * logsoftmax(input), 1)
288
+ # else:
289
+ # output = torch.sum(-target * logsoftmax(input) / self.weight, 1)
290
+ # else:
291
+ # softmax = nn.Softmax(dim=1).to(input.device)
292
+ # p = softmax(input)
293
+ # output = torch.sum(-target * (1 - p)**gamma * logsoftmax(input), 1)
294
+
295
+ # if self.reduce:
296
+ # return torch.mean(output)
297
+ # else:
298
+ # return output
299
+
300
+ # def map_func(self, x, s):
301
+ # min_val = torch.min(x)
302
+ # max_val = torch.max(x)
303
+ # mu = torch.mean(x)
304
+ # x = (x - min_val) / (max_val - min_val)
305
+ # return 1 / (1 + torch.exp(-s * (x - mu)))
306
+
307
+ # def collect_grad(self, target, grad):
308
+ # grad = torch.abs(grad.reshape(-1, grad.shape[-1])).cuda()
309
+ # target = target.reshape(-1, target.shape[-1]).cuda()
310
+ # pos_grad = torch.sum(grad * target, dim=0)[:-1]
311
+ # neg_grad = torch.sum(grad * (1 - target), dim=0)[:-1]
312
+ # self.pos_grad += pos_grad
313
+ # self.neg_grad += neg_grad
314
+ # self.pos_neg = torch.clamp(self.pos_grad / (self.neg_grad + 1e-10), min=0, max=1)
315
+ # self.pos_neg = self.map_func(self.pos_neg, 1)
316
+
317
+ # def cls_loss_func(y, output, use_focal=False, weight=None, reduce=True):
318
+ # input_size = y.size()
319
+ # y = y.float().cuda()
320
+ # if weight is not None:
321
+ # weight = weight.cuda()
322
+ # loss_func = MultiCrossEntropyLoss(num_classes=y.size(-1), focal=use_focal, weight=weight, reduce=reduce)
323
+
324
+ # y = y.reshape(-1, y.size(-1))
325
+ # output = output.reshape(-1, output.size(-1))
326
+ # loss = loss_func(output, y)
327
+
328
+ # if not reduce:
329
+ # loss = loss.reshape(input_size[:-1])
330
+
331
+ # return loss
332
+
333
+ # def cls_loss_func_(loss_func, y, output, use_focal=False, weight=None, reduce=True):
334
+ # input_size = y.size()
335
+ # y = y.float().cuda()
336
+ # if weight is not None:
337
+ # weight = weight.cuda()
338
+
339
+ # y = y.reshape(-1, y.size(-1))
340
+ # output = output.reshape(-1, output.size(-1))
341
+ # loss = loss_func(output, y)
342
+
343
+ # if not reduce:
344
+ # loss = loss.reshape(input_size[:-1])
345
+
346
+ # return loss
347
+
348
+ # def regress_loss_func(y, output):
349
+ # y = y.float().cuda()
350
+ # y = y.reshape(-1, y.size(-1))
351
+ # output = output.reshape(-1, output.size(-1))
352
+
353
+ # bgmask = y[:, 1] < -1e2
354
+
355
+ # fg_logits = output[~bgmask]
356
+ # bg_logits = output[bgmask]
357
+
358
+ # fg_target = y[~bgmask]
359
+ # bg_target = y[bgmask]
360
+
361
+ # loss = nn.functional.l1_loss(fg_logits, fg_target)
362
+
363
+ # if loss.isnan():
364
+ # return torch.tensor([0.0], requires_grad=True).cuda()
365
+ # return loss
366
+
367
+ # def suppress_loss_func(y, output):
368
+ # y = y.float().cuda()
369
+ # y = y.reshape(-1, y.size(-1))
370
+ # output = output.reshape(-1, output.size(-1))
371
+
372
+ # loss = nn.functional.binary_cross_entropy(output, y)
373
+
374
+ # return loss
main.py ADDED
@@ -0,0 +1,1144 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.parallel
6
+ import torch.nn.functional as F
7
+ import torch.optim as optim
8
+ import numpy as np
9
+ import opts_egtea as opts
10
+
11
+ import time
12
+ import h5py
13
+ from tqdm import tqdm
14
+ from iou_utils import *
15
+ from eval import evaluation_detection
16
+ from tensorboardX import SummaryWriter
17
+ from dataset import VideoDataSet, calc_iou
18
+ from models import MYNET, SuppressNet
19
+ from loss_func import cls_loss_func, cls_loss_func_, regress_loss_func
20
+ from loss_func import MultiCrossEntropyLoss
21
+ from functools import *
22
+
23
+ import matplotlib.pyplot as plt
24
+ import matplotlib.patches as patches
25
+ import cv2
26
+ from typing import List, Dict, Optional
27
+
28
+ from PIL import Image, ImageDraw, ImageFont
29
+ import warnings
30
+
31
+ # Visualization Configuration (Updated)
32
+ VIS_CONFIG = {
33
+ 'frame_interval': 1.0,
34
+ 'max_frames': 20,
35
+ 'save_dir': './output/visualizations',
36
+ 'video_save_dir': './output/videos',
37
+ 'gt_color': '#1f77b4', # Blue for ground truth (RGB: 31, 119, 180)
38
+ 'pred_color': '#ff7f0e', # Orange for predictions (RGB: 255, 127, 14)
39
+ 'fontsize_label': 10,
40
+ 'fontsize_title': 14,
41
+ 'frame_highlight_both': 'green',
42
+ 'frame_highlight_gt': 'red',
43
+ 'frame_highlight_pred': 'black',
44
+ 'iou_threshold': 0.3,
45
+ 'frame_scale_factor': 0.8,
46
+ 'video_text_scale': 0.5,
47
+ 'video_gt_text_color': (180, 119, 31), # BGR for OpenCV
48
+ 'video_pred_text_color': (14, 127, 255), # BGR for OpenCV
49
+ 'video_text_thickness': 1,
50
+ 'video_font_path': "./data/Poppins ExtraBold Italic 800.ttf",
51
+ 'video_font_fallback': '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
52
+ 'video_pred_text_y': 0.45,
53
+ 'video_gt_text_y': 0.55,
54
+ 'video_footer_height': 150, # Increased to accommodate labels
55
+ 'video_gt_bar_y': 0.5,
56
+ 'video_pred_bar_y': 0.8,
57
+ 'video_bar_height': 0.15,
58
+ 'video_bar_text_scale': 0.7,
59
+ 'min_segment_duration': 1.0,
60
+ 'video_frame_text_y': 0.05, # Position for frame number and FPS
61
+ 'video_bar_label_x': 10, # X-position for GT/Pred labels
62
+ 'video_bar_label_scale': 0.5,
63
+ 'scroll_window_duration': 30.0, # Duration of the visible time window (seconds)
64
+ 'scroll_speed': 0.5, # Seconds to advance the window per second of video
65
+ }
66
+
67
+
68
+ def annotate_video_with_actions(
69
+ video_id: str,
70
+ pred_segments: List[Dict],
71
+ gt_segments: List[Dict],
72
+ video_path: str,
73
+ save_dir: str = VIS_CONFIG['video_save_dir'],
74
+ text_scale: float = VIS_CONFIG['video_text_scale'] * 1.5, # Increased text size by 50%
75
+ gt_text_color: tuple = VIS_CONFIG['video_gt_text_color'],
76
+ pred_text_color: tuple = VIS_CONFIG['video_pred_text_color'],
77
+ text_thickness: int = VIS_CONFIG['video_text_thickness']
78
+ ) -> None:
79
+ """
80
+ Annotate a video with predicted and ground truth action labels, cumulative bars, frame number, and FPS.
81
+ Timeline bars are drawn over fixed 20-second windows and grow with the playhead, resetting at each window boundary.
82
+ Each action class is drawn in its own color; the bars themselves carry no labels or timestamps.
83
+ The "GT" and "Pred" labels sit on the left of the footer, and the bars start roughly 0.5 inches (48 pixels at 96 DPI) to their right.
84
+
85
+ Args:
86
+ video_id: Video identifier (e.g., 'my_video').
87
+ pred_segments: List of predicted segments with 'label', 'start', 'end', 'duration', 'score'.
88
+ gt_segments: List of ground truth segments with 'label', 'start', 'end', 'duration'.
89
+ video_path: Path to the input video file.
90
+ save_dir: Directory to save the annotated video.
91
+ text_scale: Scale factor for text size in video (increased).
92
+ gt_text_color: BGR color tuple for ground truth text.
93
+ pred_text_color: BGR color tuple for predicted text.
94
+ text_thickness: Thickness of text strokes.
95
+ """
96
+ os.makedirs(save_dir, exist_ok=True)
97
+
98
+ # Open input video
99
+ cap = cv2.VideoCapture(video_path)
100
+ if not cap.isOpened():
101
+ print(f"Error: Could not open video {video_path}. Skipping video annotation.")
102
+ return
103
+
104
+ # Get video properties
105
+ fps = cap.get(cv2.CAP_PROP_FPS)
106
+ frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
107
+ frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
108
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
109
+ duration = total_frames / fps
110
+ print(f"Input Video: FPS={fps:.2f}, Resolution={frame_width}x{frame_height}, Total Frames={total_frames}, Duration={duration:.2f}s")
111
+
112
+ # Define output video with extended height for footer
113
+ footer_height = VIS_CONFIG['video_footer_height']
114
+ output_height = frame_height + footer_height
115
+ output_path = os.path.join(save_dir, f"annotated_{video_id}_{opt['exp']}.avi")
116
+ fourcc = cv2.VideoWriter_fourcc(*'XVID')
117
+ out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, output_height))
118
+
119
+ if not out.isOpened():
120
+ print(f"Error: Could not initialize video writer for {output_path}. Check codec availability.")
121
+ cap.release()
122
+ return
123
+
124
+ # Filter short segments
125
+ min_duration = VIS_CONFIG['min_segment_duration']
126
+ gt_segments = [seg for seg in gt_segments if seg['duration'] >= min_duration]
127
+ pred_segments = [seg for seg in pred_segments if seg['duration'] >= min_duration]
128
+ print(f"Filtered Segments: GT={len(gt_segments)}, Pred={len(pred_segments)} (min_duration={min_duration}s)")
129
+
130
+ # Define color palette (BGR)
131
+ color_palette = [
132
+ (128, 0, 0), # Navy Blue
133
+ (60, 20, 220), # Crimson Red
134
+ (0, 128, 0), # Emerald Green
135
+ (128, 0, 128), # Royal Purple
136
+ (79, 69, 54), # Charcoal Gray
137
+ (128, 128, 0), # Teal
138
+ (0, 0, 128), # Maroon
139
+ (130, 0, 75), # Indigo
140
+ (34, 139, 34), # Forest Green
141
+ (0, 85, 204), # Burnt Orange
142
+ (149, 146, 209), # Dusty Rose
143
+ (235, 206, 135), # Sky Blue
144
+ (250, 230, 230), # Lavender
145
+ (191, 226, 159), # Seafoam Green
146
+ (185, 218, 255), # Peach
147
+ (255, 204, 204), # Periwinkle
148
+ (193, 182, 255), # Blush Pink
149
+ (201, 252, 189), # Mint Green
150
+ (144, 128, 112), # Slate Gray
151
+ (112, 25, 25), # Midnight Blue
152
+ (102, 51, 102), # Deep Plum
153
+ (0, 128, 128), # Olive Green
154
+ (171, 71, 0) # Cobalt Blue
155
+ ]
156
+
157
+ # Create color mapping for actions
158
+ action_labels = set(seg['label'] for seg in gt_segments).union(seg['label'] for seg in pred_segments)
159
+ action_color_map = {label: color_palette[i % len(color_palette)] for i, label in enumerate(action_labels)}
160
+ print(f"Action Color Mapping: {action_color_map}")
161
+
162
+ # Convert fallback colors to RGB for PIL
163
+ gt_color_rgb = (gt_text_color[2], gt_text_color[1], gt_text_color[0]) # BGR to RGB
164
+ pred_color_rgb = (pred_text_color[2], pred_text_color[1], pred_text_color[0]) # BGR to RGB
165
+
166
+ # Load font
167
+ font_path = VIS_CONFIG['video_font_path']
168
+ font_fallback = VIS_CONFIG['video_font_fallback']
169
+ font_size = int(20 * text_scale)
170
+ bar_font_size = int(20 * VIS_CONFIG['video_bar_text_scale'])
171
+ font = None
172
+ bar_font = None
173
+ if font_path:
174
+ try:
175
+ font = ImageFont.truetype(font_path, font_size)
176
+ bar_font = ImageFont.truetype(font_path, bar_font_size)
177
+ print(f"Using font: {font_path}")
178
+ except IOError:
179
+ print(f"Warning: Font {font_path} not found. Trying fallback font.")
180
+ if not font:
181
+ try:
182
+ font = ImageFont.truetype(font_fallback, font_size)
183
+ bar_font = ImageFont.truetype(font_fallback, bar_font_size)
184
+ print(f"Using fallback font: {font_fallback}")
185
+ except IOError:
186
+ print(f"Warning: Fallback font {font_fallback} not found. Using OpenCV default font.")
187
+ font = None
188
+ bar_font = None
189
+
190
+ # Fixed window configuration
191
+ window_size = 20.0 # 20-second windows
192
+ num_windows = int(np.ceil(duration / window_size))
193
+
194
+ # Define horizontal gap (0.5 inch = 48 pixels at 96 DPI)
195
+ text_bar_gap = 48 # Pixels
196
+ text_x = 10 # Fixed x-position for GT and Pred labels
197
+
198
+ frame_idx = 0
199
+ written_frames = 0
200
+ while cap.isOpened():
201
+ ret, frame = cap.read()
202
+ if not ret:
203
+ break
204
+
205
+ # Create extended frame with footer
206
+ extended_frame = np.zeros((output_height, frame_width, 3), dtype=np.uint8)
207
+ extended_frame[:frame_height, :, :] = frame
208
+ extended_frame[frame_height:, :, :] = 255 # White footer
209
+
210
+ # Calculate current timestamp
211
+ timestamp = frame_idx / fps
212
+
213
+ # Determine current window
214
+ window_idx = int(timestamp // window_size)
215
+ window_start = window_idx * window_size
216
+ window_end = min(window_start + window_size, duration)
217
+ window_duration = window_end - window_start
218
+ window_timestamp = timestamp - window_start # Relative timestamp within window
219
+
220
+ # Find active GT actions (for text overlay)
221
+ gt_labels = [seg['label'] for seg in gt_segments if seg['start'] <= timestamp <= seg['end']]
222
+ gt_text = "GT: " + ", ".join(gt_labels) if gt_labels else ""
223
+
224
+ # Find active predicted actions (for text overlay)
225
+ pred_labels = [seg['label'] for seg in pred_segments if seg['start'] <= timestamp <= seg['end']]
226
+ pred_text = "Pred: " + ", ".join(pred_labels) if pred_labels else ""
227
+
228
+ # Draw GT and prediction bars in footer (within current window, using original animation)
229
+ footer_y = frame_height
230
+ gt_bar_y = footer_y + int(0.2 * footer_height) # GT bar position
231
+ pred_bar_y = footer_y + int(0.5 * footer_height) # Pred bar position
232
+ bar_height = int(VIS_CONFIG['video_bar_height'] * footer_height)
233
+
234
+ # Calculate text width for GT and Pred labels to determine bar start
235
+ if font:
236
+ gt_text_bbox = bar_font.getbbox("GT")
237
+ pred_text_bbox = bar_font.getbbox("Pred")
238
+ gt_text_width = gt_text_bbox[2] - gt_text_bbox[0]
239
+ pred_text_width = pred_text_bbox[2] - pred_text_bbox[0]
240
+ else:
241
+ gt_text_size, _ = cv2.getTextSize("GT", cv2.FONT_HERSHEY_DUPLEX, VIS_CONFIG['video_bar_text_scale'], 1)
242
+ pred_text_size, _ = cv2.getTextSize("Pred", cv2.FONT_HERSHEY_DUPLEX, VIS_CONFIG['video_bar_text_scale'], 1)
243
+ gt_text_width = gt_text_size[0]
244
+ pred_text_width = pred_text_size[0]
245
+ max_text_width = max(gt_text_width, pred_text_width)
246
+ bar_start_x = text_x + max_text_width + text_bar_gap # Bars start after text + 0.5-inch gap
247
+ bar_width = frame_width - bar_start_x # Adjust bar width to fit remaining space
248
+
249
+ # Draw bars with action-specific colors
250
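+ # Segment times map to footer pixels as x = bar_start_x + ((t - window_start) / window_duration) * bar_width;
+ # the end time is clamped to the current playhead (window_start + window_timestamp) so each bar grows as the video plays.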
+ for seg in gt_segments:
251
+ if seg['start'] <= window_end and seg['end'] >= window_start:
252
+ start_t = max(seg['start'], window_start)
253
+ end_t = min(seg['end'], window_start + window_timestamp) # Original animation
254
+ start_x = bar_start_x + int(((start_t - window_start) / window_duration) * bar_width)
255
+ end_x = bar_start_x + int(((end_t - window_start) / window_duration) * bar_width)
256
+ if end_x > start_x:
257
+ cv2.rectangle(
258
+ extended_frame,
259
+ (start_x, gt_bar_y),
260
+ (end_x, gt_bar_y + bar_height),
261
+ action_color_map[seg['label']], # Action-specific color
262
+ -1
263
+ )
264
+
265
+ for seg in pred_segments:
266
+ if seg['start'] <= window_end and seg['end'] >= window_start:
267
+ start_t = max(seg['start'], window_start)
268
+ end_t = min(seg['end'], window_start + window_timestamp) # Original animation
269
+ start_x = bar_start_x + int(((start_t - window_start) / window_duration) * bar_width)
270
+ end_x = bar_start_x + int(((end_t - window_start) / window_duration) * bar_width)
271
+ if end_x > start_x:
272
+ cv2.rectangle(
273
+ extended_frame,
274
+ (start_x, pred_bar_y),
275
+ (end_x, pred_bar_y + bar_height),
276
+ action_color_map[seg['label']], # Action-specific color
277
+ -1
278
+ )
279
+
280
+ if font:
281
+ # Convert frame to PIL image
282
+ frame_rgb = cv2.cvtColor(extended_frame, cv2.COLOR_BGR2RGB)
283
+ pil_image = Image.fromarray(frame_rgb)
284
+ draw = ImageDraw.Draw(pil_image)
285
+
286
+ # Draw frame number and FPS at top center
287
+ frame_info = f"Frame: {frame_idx} | FPS: {fps:.2f}"
288
+ frame_text_bbox = draw.textbbox((0, 0), frame_info, font=font)
289
+ frame_text_width = frame_text_bbox[2] - frame_text_bbox[0]
290
+ frame_text_x = (frame_width - frame_text_width) // 2
291
+ draw.text((frame_text_x, 10), frame_info, font=font, fill=(0, 0, 0))
292
+
293
+ # Draw window timestamp range at top of footer
294
+ window_info = f"{window_start:.1f}s - {window_end:.1f}s"
295
+ window_text_bbox = draw.textbbox((0, 0), window_info, font=bar_font)
296
+ window_text_width = window_text_bbox[2] - window_text_bbox[0]
297
+ window_text_x = (frame_width - window_text_width) // 2
298
+ draw.text((window_text_x, footer_y + 10), window_info, font=bar_font, fill=(0, 0, 0))
299
+
300
+ # Draw GT text in video only if there are actions
301
+ if gt_text:
302
+ gt_y = int(frame_height * VIS_CONFIG['video_gt_text_y'])
303
+ draw.text((10, gt_y), gt_text, font=font, fill=gt_color_rgb)
304
+
305
+ # Draw predicted text in video only if there are actions
306
+ if pred_text:
307
+ pred_y = int(frame_height * VIS_CONFIG['video_pred_text_y'])
308
+ draw.text((10, pred_y), pred_text, font=font, fill=pred_color_rgb)
309
+
310
+ # Draw GT and Pred labels in footer
311
+ draw.text((text_x, gt_bar_y + bar_height // 2), "GT", font=bar_font, fill=gt_color_rgb)
312
+ draw.text((text_x, pred_bar_y + bar_height // 2), "Pred", font=bar_font, fill=pred_color_rgb)
313
+
314
+ # Convert back to OpenCV frame
315
+ extended_frame = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
316
+ else:
317
+ # Fallback to OpenCV font
318
+ frame_info = f"Frame: {frame_idx} | FPS: {fps:.2f}"
319
+ text_size, _ = cv2.getTextSize(frame_info, cv2.FONT_HERSHEY_DUPLEX, text_scale, text_thickness)
320
+ frame_text_x = (frame_width - text_size[0]) // 2
321
+ cv2.putText(
322
+ extended_frame,
323
+ frame_info,
324
+ (frame_text_x, 30),
325
+ cv2.FONT_HERSHEY_DUPLEX,
326
+ text_scale,
327
+ (0, 0, 0),
328
+ text_thickness,
329
+ cv2.LINE_AA
330
+ )
331
+ window_info = f"{window_start:.1f}s - {window_end:.1f}s"
332
+ window_text_size, _ = cv2.getTextSize(window_info, cv2.FONT_HERSHEY_DUPLEX, VIS_CONFIG['video_bar_text_scale'], 1)
333
+ window_text_x = (frame_width - window_text_size[0]) // 2
334
+ cv2.putText(
335
+ extended_frame,
336
+ window_info,
337
+ (window_text_x, footer_y + 20),
338
+ cv2.FONT_HERSHEY_DUPLEX,
339
+ VIS_CONFIG['video_bar_text_scale'],
340
+ (0, 0, 0),
341
+ 1,
342
+ cv2.LINE_AA
343
+ )
344
+ if gt_text:
345
+ cv2.putText(
346
+ extended_frame,
347
+ gt_text,
348
+ (10, int(frame_height * VIS_CONFIG['video_gt_text_y'])),
349
+ cv2.FONT_HERSHEY_DUPLEX,
350
+ text_scale,
351
+ gt_text_color,
352
+ text_thickness,
353
+ cv2.LINE_AA
354
+ )
355
+ if pred_text:
356
+ cv2.putText(
357
+ extended_frame,
358
+ pred_text,
359
+ (10, int(frame_height * VIS_CONFIG['video_pred_text_y'])),
360
+ cv2.FONT_HERSHEY_DUPLEX,
361
+ text_scale,
362
+ pred_text_color,
363
+ text_thickness,
364
+ cv2.LINE_AA
365
+ )
366
+ cv2.putText(
367
+ extended_frame,
368
+ "GT",
369
+ (text_x, gt_bar_y + bar_height // 2 + 5),
370
+ cv2.FONT_HERSHEY_DUPLEX,
371
+ VIS_CONFIG['video_bar_text_scale'],
372
+ gt_text_color,
373
+ 1,
374
+ cv2.LINE_AA
375
+ )
376
+ cv2.putText(
377
+ extended_frame,
378
+ "Pred",
379
+ (text_x, pred_bar_y + bar_height // 2 + 5),
380
+ cv2.FONT_HERSHEY_DUPLEX,
381
+ VIS_CONFIG['video_bar_text_scale'],
382
+ pred_text_color,
383
+ 1,
384
+ cv2.LINE_AA
385
+ )
386
+
387
+ # Write frame to output video
388
+ out.write(extended_frame)
389
+ written_frames += 1
390
+ frame_idx += 1
391
+
392
+ # Release resources
393
+ cap.release()
394
+ out.release()
395
+ print(f"[✅ Saved Annotated Video]: {output_path}, Written Frames={written_frames}")
396
+ print("Note: If .avi is not playable, convert to .mp4 using FFmpeg:")
397
+ print(f"ffmpeg -i {output_path} -vcodec libx264 -acodec aac {output_path.replace('.avi', '.mp4')}")
398
+
399
+
400
+
401
+
402
+
403
+
404
+
405
+
406
+ def visualize_action_lengths(
407
+ video_id: str,
408
+ pred_segments: List[Dict],
409
+ gt_segments: List[Dict],
410
+ video_path: str,
411
+ duration: float,
412
+ save_dir: str = VIS_CONFIG['save_dir'],
413
+ frame_interval: float = VIS_CONFIG['frame_interval']
414
+ ) -> None:
415
+ """
416
+ Generate a visualization plot comparing ground truth and predicted action lengths with video frames.
417
+
418
+ Args:
419
+ video_id: Video identifier (e.g., 'my_video').
420
+ pred_segments: List of predicted segments with 'label', 'start', 'end', 'duration', 'score'.
421
+ gt_segments: List of ground truth segments with 'label', 'start', 'end', 'duration'.
422
+ video_path: Path to the input video file.
423
+ duration: Total duration of the video in seconds.
424
+ save_dir: Directory to save the output image.
425
+ frame_interval: Time interval between sampled frames (seconds).
426
+ """
427
+ os.makedirs(save_dir, exist_ok=True)
428
+
429
+ # Calculate frame sampling times
430
+ num_frames = int(duration / frame_interval) + 1
431
+ if num_frames > VIS_CONFIG['max_frames']:
432
+ frame_interval = duration / (VIS_CONFIG['max_frames'] - 1)
433
+ num_frames = VIS_CONFIG['max_frames']
434
+ print(f"Warning: Video duration ({duration:.1f}s) requires {num_frames} frames. Adjusted frame_interval to {frame_interval:.2f}s.")
435
+
436
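+ # Sample frame timestamps uniformly over [0, duration); endpoint=False keeps the last sample just before the end.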
+ frame_times = np.linspace(0, duration, num_frames, endpoint=False)
437
+
438
+ # Load video frames
439
+ frames = []
440
+ cap = cv2.VideoCapture(video_path)
441
+ if not cap.isOpened():
442
+ print(f"Warning: Could not open video {video_path}. Using placeholder frames.")
443
+ frames = [np.ones((100, 100, 3), dtype=np.uint8) * 255 for _ in frame_times]
444
+ else:
445
+ for t in frame_times:
446
+ cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000)
447
+ ret, frame = cap.read()
448
+ if ret:
449
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
450
+ # Resize frame to reduce memory usage
451
+ frame = cv2.resize(frame, (int(frame.shape[1] * 0.5), int(frame.shape[0] * 0.5)))
452
+ frames.append(frame)
453
+ else:
454
+ frames.append(np.ones((100, 100, 3), dtype=np.uint8) * 255)
455
+ cap.release()
456
+
457
+ # Initialize figure
458
+ fig = plt.figure(figsize=(num_frames * VIS_CONFIG['frame_scale_factor'], 6), constrained_layout=True)
459
+ gs = fig.add_gridspec(3, num_frames, height_ratios=[3, 1, 1])
460
+
461
+ # Plot frames
462
+ for i, (t, frame) in enumerate(zip(frame_times, frames)):
463
+ ax = fig.add_subplot(gs[0, i])
464
+
465
+ # Check if frame falls within GT or predicted segments
466
+ gt_hit = any(seg['start'] <= t <= seg['end'] for seg in gt_segments)
467
+ pred_hit = any(seg['start'] <= t <= seg['end'] for seg in pred_segments)
468
+
469
+ # Set border color
470
+ border_color = None
471
+ if gt_hit and pred_hit:
472
+ border_color = VIS_CONFIG['frame_highlight_both']
473
+ elif gt_hit:
474
+ border_color = VIS_CONFIG['frame_highlight_gt']
475
+ elif pred_hit:
476
+ border_color = VIS_CONFIG['frame_highlight_pred']
477
+
478
+ ax.imshow(frame)
479
+ # axis('off') would also hide the spines, so remove only the ticks and keep the spines for the highlight border
+ ax.set_xticks([]); ax.set_yticks([])
480
+ if border_color:
481
+ for spine in ax.spines.values():
482
+ spine.set_edgecolor(border_color)
483
+ spine.set_linewidth(2)
+ else:
+ for spine in ax.spines.values():
+ spine.set_visible(False)
484
+
485
+ ax.set_title(f"{t:.1f}s", fontsize=VIS_CONFIG['fontsize_label'],
486
+ color=border_color if border_color else 'black')
487
+
488
+ # Plot ground truth bar
489
+ ax_gt = fig.add_subplot(gs[1, :])
490
+ ax_gt.set_xlim(0, duration)
491
+ ax_gt.set_ylim(0, 1)
492
+ ax_gt.axis('off')
493
+ ax_gt.text(-0.02 * duration, 0.5, "Ground Truth", fontsize=VIS_CONFIG['fontsize_title'],
494
+ va='center', ha='right', weight='bold')
495
+
496
+ for seg in gt_segments:
497
+ start, end = seg['start'], seg['end']
498
+ width = end - start
499
+ label = seg['label'][:10] + '...' if len(seg['label']) > 10 else seg['label']
500
+ ax_gt.add_patch(patches.Rectangle(
501
+ (start, 0.3), width, 0.4, facecolor=VIS_CONFIG['gt_color'],
502
+ edgecolor='black', alpha=0.8
503
+ ))
504
+ ax_gt.text((start + end) / 2, 0.5, label, ha='center', va='center',
505
+ fontsize=VIS_CONFIG['fontsize_label'], color='white')
506
+ ax_gt.text(start, 0.2, f"{start:.1f}", ha='center', fontsize=8, color='black')
507
+ ax_gt.text(end, 0.2, f"{end:.1f}", ha='center', fontsize=8, color='black')
508
+
509
+ # Plot prediction bar
510
+ ax_pred = fig.add_subplot(gs[2, :])
511
+ ax_pred.set_xlim(0, duration)
512
+ ax_pred.set_ylim(0, 1)
513
+ ax_pred.axis('off')
514
+ ax_pred.text(-0.02 * duration, 0.5, "Prediction", fontsize=VIS_CONFIG['fontsize_title'],
515
+ va='center', ha='right', weight='bold')
516
+
517
+ for seg in pred_segments:
518
+ start, end = seg['start'], seg['end']
519
+ width = end - start
520
+ label = seg['label'][:10] + '...' if len(seg['label']) > 10 else seg['label']
521
+ ax_pred.add_patch(patches.Rectangle(
522
+ (start, 0.3), width, 0.4, facecolor=VIS_CONFIG['pred_color'],
523
+ edgecolor='black', alpha=0.8
524
+ ))
525
+ ax_pred.text((start + end) / 2, 0.5, label, ha='center', va='center',
526
+ fontsize=VIS_CONFIG['fontsize_label'], color='white')
527
+ ax_pred.text(start, 0.8, f"{start:.1f}", ha='center', fontsize=8, color='black')
528
+ ax_pred.text(end, 0.8, f"{end:.1f}", ha='center', fontsize=8, color='black')
529
+
530
+ # Save plot
531
+ jpg_path = os.path.join(save_dir, f"viz_{video_id}_{opt['exp']}.png") # Use PNG
532
+ plt.savefig(jpg_path, dpi=100, bbox_inches='tight') # Lower DPI
533
+ print(f"[✅ Saved Visualization]: {jpg_path}")
534
+ plt.close()
535
+
536
+
537
+
538
+ def train_one_epoch(opt, model, train_dataset, optimizer, warmup=False):
539
+ train_loader = torch.utils.data.DataLoader(train_dataset,
540
+ batch_size=opt['batch_size'], shuffle=True,
541
+ num_workers=0, pin_memory=True, drop_last=False)
542
+ epoch_cost = 0
543
+ epoch_cost_cls = 0
544
+ epoch_cost_reg = 0
545
+ epoch_cost_snip = 0
546
+
547
+ total_iter = len(train_dataset) // opt['batch_size']
548
+ cls_loss = MultiCrossEntropyLoss(focal=True)
549
+ snip_loss = MultiCrossEntropyLoss(focal=True)
550
+ for n_iter, (input_data, cls_label, reg_label, snip_label) in enumerate(tqdm(train_loader)):
551
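+ # When warm-up is enabled, the learning rate ramps linearly from 0 to opt['lr'] over the epoch's iterations.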
+ if warmup:
552
+ for g in optimizer.param_groups:
553
+ g['lr'] = n_iter * (opt['lr']) / total_iter
554
+
555
+ act_cls, act_reg, snip_cls = model(input_data.float().cuda())
556
+
557
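+ # Backward hooks route the classification/snippet gradients into collect_grad(), which updates
+ # the per-class positive/negative gradient ratio that modulates the focal gamma.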
+ act_cls.register_hook(partial(cls_loss.collect_grad, cls_label))
558
+ snip_cls.register_hook(partial(snip_loss.collect_grad, snip_label))
559
+
560
+ cost_reg = 0
561
+ cost_cls = 0
562
+
563
+ loss = cls_loss_func_(cls_loss, cls_label, act_cls)
564
+ cost_cls = loss
565
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
566
+
567
+ loss = regress_loss_func(reg_label, act_reg)
568
+ cost_reg = loss
569
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
570
+
571
+ loss = cls_loss_func_(snip_loss, snip_label, snip_cls)
572
+ cost_snip = loss
573
+ epoch_cost_snip += cost_snip.detach().cpu().numpy()
574
+
575
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg + opt['gamma'] * cost_snip
576
+ epoch_cost += cost.detach().cpu().numpy()
577
+
578
+ optimizer.zero_grad()
579
+ cost.backward()
580
+ optimizer.step()
581
+
582
+ return n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip
583
+
584
+ def eval_one_epoch(opt, model, test_dataset):
585
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, test_dataset)
586
+
587
+ result_dict = eval_map_nms(opt, test_dataset, output_cls, output_reg, labels_cls, labels_reg)
588
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
589
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
590
+ json.dump(output_dict, outfile, indent=2)
591
+ outfile.close()
592
+
593
+ IoUmAP = evaluation_detection(opt, verbose=False)
594
+ IoUmAP_5 = sum(IoUmAP) / len(IoUmAP)  # mean mAP over all evaluated IoU thresholds
595
+
596
+ return cls_loss, reg_loss, tot_loss, IoUmAP_5
597
+
598
+ def train(opt):
599
+ writer = SummaryWriter()
600
+ model = MYNET(opt).cuda()
601
+
602
+ rest_of_model_params = [param for name, param in model.named_parameters() if "history_unit" not in name]
603
+ optimizer = optim.Adam([{'params': model.history_unit.parameters(), 'lr': 1e-6}, {'params': rest_of_model_params}], lr=opt["lr"], weight_decay=opt["weight_decay"])
604
+ scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt["lr_step"])
605
+
606
+ train_dataset = VideoDataSet(opt, subset="train")
607
+ test_dataset = VideoDataSet(opt, subset=opt['inference_subset'])
608
+
609
+ warmup = False
610
+
611
+ for n_epoch in range(opt['epoch']):
612
+ if n_epoch >= 1:
613
+ warmup = False
614
+
615
+ n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip = train_one_epoch(opt, model, train_dataset, optimizer, warmup)
616
+
617
+ writer.add_scalars('data/cost', {'train': epoch_cost / (n_iter + 1)}, n_epoch)
618
+ print("training loss(epoch %d): %.03f, cls - %f, reg - %f, snip - %f, lr - %f" % (n_epoch,
619
+ epoch_cost / (n_iter + 1),
620
+ epoch_cost_cls / (n_iter + 1),
621
+ epoch_cost_reg / (n_iter + 1),
622
+ epoch_cost_snip / (n_iter + 1),
623
+ optimizer.param_groups[-1]["lr"]))
624
+
625
+ scheduler.step()
626
+ model.eval()
627
+
628
+ cls_loss, reg_loss, tot_loss, IoUmAP_5 = eval_one_epoch(opt, model, test_dataset)
629
+
630
+ writer.add_scalars('data/mAP', {'test': IoUmAP_5}, n_epoch)
631
+ print("testing loss(epoch %d): %.03f, cls - %f, reg - %f, mAP Avg - %f" % (n_epoch, tot_loss, cls_loss, reg_loss, IoUmAP_5))
632
+
633
+ state = {'epoch': n_epoch + 1, 'state_dict': model.state_dict()}
634
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_checkpoint_" + str(n_epoch + 1) + ".pth.tar")
635
+ if IoUmAP_5 > model.best_map:
636
+ model.best_map = IoUmAP_5
637
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_ckp_best.pth.tar")
638
+
639
+ model.train()
640
+
641
+ writer.close()
642
+ return model.best_map
643
+
644
+ def eval_frame(opt, model, dataset):
645
+ test_loader = torch.utils.data.DataLoader(dataset,
646
+ batch_size=opt['batch_size'], shuffle=False,
647
+ num_workers=0, pin_memory=True, drop_last=False)
648
+
649
+ labels_cls = {}
650
+ labels_reg = {}
651
+ output_cls = {}
652
+ output_reg = {}
653
+ for video_name in dataset.video_list:
654
+ labels_cls[video_name] = []
655
+ labels_reg[video_name] = []
656
+ output_cls[video_name] = []
657
+ output_reg[video_name] = []
658
+
659
+ start_time = time.time()
660
+ total_frames = 0
661
+ epoch_cost = 0
662
+ epoch_cost_cls = 0
663
+ epoch_cost_reg = 0
664
+
665
+ for n_iter, (input_data, cls_label, reg_label, _) in enumerate(tqdm(test_loader)):
666
+ act_cls, act_reg, _ = model(input_data.float().cuda())
667
+ cost_reg = 0
668
+ cost_cls = 0
669
+
670
+ loss = cls_loss_func(cls_label, act_cls)
671
+ cost_cls = loss
672
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
673
+
674
+ loss = regress_loss_func(reg_label, act_reg)
675
+ cost_reg = loss
676
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
677
+
678
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg
679
+ epoch_cost += cost.detach().cpu().numpy()
680
+
681
+ act_cls = torch.softmax(act_cls, dim=-1)
682
+
683
+ total_frames += input_data.size(0)
684
+
685
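+ # dataset.inputs[i] holds (video_name, start, end, index) for the i-th sample, so each batch
+ # element can be routed back to its video before the per-video arrays are stacked below.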
+ for b in range(0, input_data.size(0)):
686
+ video_name, st, ed, data_idx = dataset.inputs[n_iter * opt['batch_size'] + b]
687
+ output_cls[video_name] += [act_cls[b, :].detach().cpu().numpy()]
688
+ output_reg[video_name] += [act_reg[b, :].detach().cpu().numpy()]
689
+ labels_cls[video_name] += [cls_label[b, :].numpy()]
690
+ labels_reg[video_name] += [reg_label[b, :].numpy()]
691
+
692
+ end_time = time.time()
693
+ working_time = end_time - start_time
694
+
695
+ for video_name in dataset.video_list:
696
+ labels_cls[video_name] = np.stack(labels_cls[video_name], axis=0)
697
+ labels_reg[video_name] = np.stack(labels_reg[video_name], axis=0)
698
+ output_cls[video_name] = np.stack(output_cls[video_name], axis=0)
699
+ output_reg[video_name] = np.stack(output_reg[video_name], axis=0)
700
+
701
+ cls_loss = epoch_cost_cls / (n_iter + 1)  # average over all batches (n_iter is the last 0-based index)
702
+ reg_loss = epoch_cost_reg / (n_iter + 1)
703
+ tot_loss = epoch_cost / (n_iter + 1)
704
+
705
+ return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames
706
+
707
+ def eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
708
+ result_dict = {}
709
+ proposal_dict = []
710
+
711
+ num_class = opt["num_of_class"]
712
+ unit_size = opt['segment_size']
713
+ threshold = opt['threshold']
714
+ anchors = opt['anchors']
715
+
716
+ for video_name in dataset.video_list:
717
+ duration = dataset.video_len[video_name]
718
+ video_time = float(dataset.video_dict[video_name]["duration"])
719
+ frame_to_time = 100.0 * video_time / duration
720
+
721
+ for idx in range(0, duration):
722
+ cls_anc = output_cls[video_name][idx]
723
+ reg_anc = output_reg[video_name][idx]
724
+
725
+ proposal_anc_dict = []
726
+ for anc_idx in range(0, len(anchors)):
727
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
728
+
729
+ if len(cls) == 0:
730
+ continue
731
+
732
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
733
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
734
+ st = ed - length
735
+
736
+ for cidx in range(0, len(cls)):
737
+ label = cls[cidx]
738
+ tmp_dict = {}
739
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
740
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
741
+ tmp_dict["label"] = dataset.label_name[label]
742
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
743
+ proposal_anc_dict.append(tmp_dict)
744
+
745
+ proposal_dict += proposal_anc_dict
746
+
747
+ proposal_dict = non_max_suppression(proposal_dict, overlapThresh=opt['soft_nms'])
748
+ result_dict[video_name] = proposal_dict
749
+ proposal_dict = []
750
+
751
+ return result_dict
752
+
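# --- Illustrative sketch (not part of the uploaded files) --------------------
# How eval_map_nms above decodes a single anchor prediction into a segment: the
# regressor emits an end offset and a log-length scale, both relative to the
# anchor size. The numbers below are made up purely to show the arithmetic.
import numpy as np

anchor = 8                           # one entry of opt['anchors']
idx = 100                            # current frame index
reg = [0.25, 0.1]                    # reg_anc[anc_idx] = (end offset, log length scale)

ed = idx + anchor * reg[0]           # 100 + 8 * 0.25 = 102.0 (predicted end frame)
length = anchor * np.exp(reg[1])     # 8 * e^0.1 ~= 8.84 (predicted length in frames)
st = ed - length                     # ~= 93.16 (predicted start frame)

frame_to_time = 100.0 * 60.0 / 1200  # e.g. a 60 s video represented by 1200 feature frames
print(st * frame_to_time / 100.0, ed * frame_to_time / 100.0)   # segment in seconds
# ------------------------------------------------------------------------------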
753
+ def eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
754
+ model = SuppressNet(opt).cuda()
755
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
756
+ base_dict = checkpoint['state_dict']
757
+ model.load_state_dict(base_dict)
758
+ model.eval()
759
+
760
+ result_dict = {}
761
+ proposal_dict = []
762
+
763
+ num_class = opt["num_of_class"]
764
+ unit_size = opt['segment_size']
765
+ threshold = opt['threshold']
766
+ anchors = opt['anchors']
767
+
768
+ for video_name in dataset.video_list:
769
+ duration = dataset.video_len[video_name]
770
+ video_time = float(dataset.video_dict[video_name]["duration"])
771
+ frame_to_time = 100.0 * video_time / duration
772
+ conf_queue = torch.zeros((unit_size, num_class - 1))
773
+
774
+ for idx in range(0, duration):
775
+ cls_anc = output_cls[video_name][idx]
776
+ reg_anc = output_reg[video_name][idx]
777
+
778
+ proposal_anc_dict = []
779
+ for anc_idx in range(0, len(anchors)):
780
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
781
+
782
+ if len(cls) == 0:
783
+ continue
784
+
785
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
786
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
787
+ st = ed - length
788
+
789
+ for cidx in range(0, len(cls)):
790
+ label = cls[cidx]
791
+ tmp_dict = {}
792
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
793
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
794
+ tmp_dict["label"] = dataset.label_name[label]
795
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
796
+ proposal_anc_dict.append(tmp_dict)
797
+
798
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
799
+
800
+ conf_queue[:-1, :] = conf_queue[1:, :].clone()
801
+ conf_queue[-1, :] = 0
802
+ for proposal in proposal_anc_dict:
803
+ cls_idx = dataset.label_name.index(proposal['label'])
804
+ conf_queue[-1, cls_idx] = proposal["score"]
805
+
806
+ minput = conf_queue.unsqueeze(0)
807
+ suppress_conf = model(minput.cuda())
808
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
809
+
810
+ for cls in range(0, num_class - 1):
811
+ if suppress_conf[cls] > opt['sup_threshold']:
812
+ for proposal in proposal_anc_dict:
813
+ if proposal['label'] == dataset.label_name[cls]:
814
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
815
+ proposal_dict.append(proposal)
816
+
817
+ result_dict[video_name] = proposal_dict
818
+ proposal_dict = []
819
+
820
+ return result_dict
821
+
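# --- Illustrative sketch (not part of the uploaded files) --------------------
# eval_map_supnet above keeps a rolling window of per-class proposal scores of
# shape (segment_size, num_of_class - 1) and lets SuppressNet decide which
# classes are confident enough to emit. A minimal stand-in for that queue update:
import torch

unit_size, n_fg_classes = 64, 22     # segment_size and num_of_class - 1 from opts_egtea.py
conf_queue = torch.zeros((unit_size, n_fg_classes))

def push_scores(queue, frame_scores):
    # Shift the window one step and write the newest per-class scores.
    queue[:-1, :] = queue[1:, :].clone()
    queue[-1, :] = 0
    for cls_idx, score in frame_scores.items():      # e.g. {3: 0.7} from this frame's proposals
        queue[-1, cls_idx] = score
    return queue

conf_queue = push_scores(conf_queue, {3: 0.7, 11: 0.2})
print(conf_queue.shape)              # torch.Size([64, 22]); unsqueeze(0) before feeding SuppressNet
# ------------------------------------------------------------------------------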
822
+ def test_frame(opt, video_name=None):
823
+ model = MYNET(opt).cuda()
824
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
825
+ base_dict = checkpoint['state_dict']
826
+ model.load_state_dict(base_dict)
827
+ model.eval()
828
+
829
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
830
+ outfile = h5py.File(opt['frame_result_file'].format(opt['exp']), 'w')
831
+
832
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
833
+
834
+ print("testing loss: %f, cls_loss: %f, reg_loss: %f" % (tot_loss, cls_loss, reg_loss))
835
+
836
+ for video_name in dataset.video_list:
837
+ o_cls = output_cls[video_name]
838
+ o_reg = output_reg[video_name]
839
+ l_cls = labels_cls[video_name]
840
+ l_reg = labels_reg[video_name]
841
+
842
+ dset_predcls = outfile.create_dataset(video_name + '/pred_cls', o_cls.shape, maxshape=o_cls.shape, chunks=True, dtype=np.float32)
843
+ dset_predcls[:, :] = o_cls[:, :]
844
+ dset_predreg = outfile.create_dataset(video_name + '/pred_reg', o_reg.shape, maxshape=o_reg.shape, chunks=True, dtype=np.float32)
845
+ dset_predreg[:, :] = o_reg[:, :]
846
+ dset_labelcls = outfile.create_dataset(video_name + '/label_cls', l_cls.shape, maxshape=l_cls.shape, chunks=True, dtype=np.float32)
847
+ dset_labelcls[:, :] = l_cls[:, :]
848
+ dset_labelreg = outfile.create_dataset(video_name + '/label_reg', l_reg.shape, maxshape=l_reg.shape, chunks=True, dtype=np.float32)
849
+ dset_labelreg[:, :] = l_reg[:, :]
850
+ outfile.close()
851
+
852
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
853
+ return cls_loss, reg_loss, tot_loss
854
+
855
+ def patch_attention(m):
856
+ forward_orig = m.forward
857
+
858
+ def wrap(*args, **kwargs):
859
+ kwargs["need_weights"] = True
860
+ kwargs["average_attn_weights"] = False
861
+ return forward_orig(*args, **kwargs)
862
+
863
+ m.forward = wrap
864
+
865
+ class SaveOutput:
866
+ def __init__(self):
867
+ self.outputs = []
868
+
869
+ def __call__(self, module, module_in, module_out):
870
+ self.outputs.append(module_out[1])
871
+
872
+ def clear(self):
873
+ self.outputs = []
874
+
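# --- Illustrative sketch (not part of the uploaded files) --------------------
# patch_attention and SaveOutput above are attention-inspection helpers that the
# script never calls. One plausible wiring (an assumption, not taken from the
# repository) is to hook every MultiheadAttention layer; a tiny encoder stands in
# for MYNET here. Requires PyTorch >= 1.11 for average_attn_weights.
import torch
import torch.nn as nn

demo = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=32, nhead=4), num_layers=2)
save_output = SaveOutput()
for module in demo.modules():
    if isinstance(module, nn.MultiheadAttention):
        patch_attention(module)                      # force per-head attention weights to be returned
        module.register_forward_hook(save_output)    # SaveOutput stores module_out[1] (the weights)

_ = demo(torch.randn(10, 2, 32))                     # seq_len x batch x d_model
print(len(save_output.outputs), save_output.outputs[0].shape)   # 2 layers, each (2, 4, 10, 10)
# ------------------------------------------------------------------------------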
875
+ def test(opt, video_name=None):
876
+ model = MYNET(opt).cuda()
877
+ checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "_ckp_best.pth.tar")
878
+ base_dict = checkpoint['state_dict']
879
+ model.load_state_dict(base_dict)
880
+ model.eval()
881
+
882
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
883
+
884
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
885
+
886
+ if opt["pptype"] == "nms":
887
+ result_dict = eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
888
+ if opt["pptype"] == "net":
889
+ result_dict = eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
890
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
891
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
892
+ json.dump(output_dict, outfile, indent=2)
893
+ outfile.close()
894
+
895
+ mAP = evaluation_detection(opt)
896
+
897
+ # Compare predicted and ground truth action lengths
898
+ if video_name:
899
+ print("\nComparing Predicted and Ground Truth Action Lengths for Video:", video_name)
900
+ with open(opt["video_anno"].format(opt["split"]), 'r') as f:
901
+ anno_data = json.load(f)
902
+ gt_annotations = anno_data['database'][video_name]['annotations']
903
+ duration = anno_data['database'][video_name]['duration']
904
+
905
+ gt_segments = []
906
+ for anno in gt_annotations:
907
+ start, end = anno['segment']
908
+ label = anno['label']
909
+ duration_seg = end - start
910
+ gt_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration_seg})
911
+
912
+ pred_segments = []
913
+ for pred in result_dict[video_name]:
914
+ start, end = pred['segment']
915
+ label = pred['label']
916
+ score = pred['score']
917
+ duration_seg = end - start
918
+ pred_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration_seg, 'score': score})
919
+
920
+ # Print comparison table
921
+ matches = []
922
+ iou_threshold = VIS_CONFIG['iou_threshold']
923
+ used_gt_indices = set()
924
+ for pred in pred_segments:
925
+ best_iou = 0
926
+ best_gt_idx = None
927
+ for gt_idx, gt in enumerate(gt_segments):
928
+ if gt_idx in used_gt_indices:
929
+ continue
930
+ iou = calc_iou([pred['end'], pred['duration']], [gt['end'], gt['duration']])
931
+ if iou > best_iou and iou >= iou_threshold:
932
+ best_iou = iou
933
+ best_gt_idx = gt_idx
934
+ if best_gt_idx is not None:
935
+ matches.append({
936
+ 'pred': pred,
937
+ 'gt': gt_segments[best_gt_idx],
938
+ 'iou': best_iou
939
+ })
940
+ used_gt_indices.add(best_gt_idx)
941
+ else:
942
+ matches.append({'pred': pred, 'gt': None, 'iou': 0})
943
+
944
+ for gt_idx, gt in enumerate(gt_segments):
945
+ if gt_idx not in used_gt_indices:
946
+ matches.append({'pred': None, 'gt': gt, 'iou': 0})
947
+
948
+ print("\n{:<20} {:<30} {:<30} {:<15} {:<10}".format(
949
+ "Action Label", "Predicted Segment (s)", "Ground Truth Segment (s)", "Duration Diff (s)", "IoU"))
950
+ print("-" * 105)
951
+ for match in matches:
952
+ pred = match['pred']
953
+ gt = match['gt']
954
+ iou = match['iou']
955
+ if pred and gt:
956
+ label = pred['label'] if pred['label'] == gt['label'] else f"{pred['label']} (GT: {gt['label']})"
957
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
958
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
959
+ duration_diff = pred['duration'] - gt['duration']
960
+ print("{:<20} {:<30} {:<30} {:<15.2f} {:<10.2f}".format(
961
+ label, pred_str, gt_str, duration_diff, iou))
962
+ elif pred:
963
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
964
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
965
+ pred['label'], pred_str, "None", "N/A", iou))
966
+ elif gt:
967
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
968
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
969
+ gt['label'], "None", gt_str, "N/A", iou))
970
+
971
+ # Summarize
972
+ matched_count = sum(1 for m in matches if m['pred'] and m['gt'])
973
+ avg_duration_diff = np.mean([m['pred']['duration'] - m['gt']['duration'] for m in matches if m['pred'] and m['gt']]) if matched_count > 0 else 0
974
+ avg_iou = np.mean([m['iou'] for m in matches if m['iou'] > 0]) if any(m['iou'] > 0 for m in matches) else 0
975
+ print(f"\nSummary:")
976
+ print(f"- Total Predictions: {len(pred_segments)}")
977
+ print(f"- Total Ground Truth: {len(gt_segments)}")
978
+ print(f"- Matched Segments: {matched_count}")
979
+ print(f"- Average Duration Difference (Matched): {avg_duration_diff:.2f}s")
980
+ print(f"- Average IoU (Matched): {avg_iou:.2f}")
981
+
982
+ # Generate static visualization
983
+ video_path = opt.get('video_path', '')
984
+ if os.path.exists(video_path):
985
+ visualize_action_lengths(
986
+ video_id=video_name,
987
+ pred_segments=pred_segments,
988
+ gt_segments=gt_segments,
989
+ video_path=video_path,
990
+ duration=duration
991
+ )
992
+ # Generate annotated video
993
+ annotate_video_with_actions(
994
+ video_id=video_name,
995
+ pred_segments=pred_segments,
996
+ gt_segments=gt_segments,
997
+ video_path=video_path
998
+ )
999
+ else:
1000
+ print(f"Warning: Video path {video_path} not found. Skipping visualization and video annotation.")
1001
+
1002
+ return mAP
1003
+
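# --- Illustrative sketch (not part of the uploaded files) --------------------
# The comparison block in test() above matches each prediction greedily to the
# unused ground truth with the highest IoU (>= VIS_CONFIG['iou_threshold']).
# A simplified stand-in using a plain [start, end] interval IoU; note the
# repository's calc_iou is instead called with (end, duration) pairs.
def interval_iou(a, b):
    inter = max(0.0, min(a[1], b[1]) - max(a[0], b[0]))
    union = (a[1] - a[0]) + (b[1] - b[0]) - inter
    return inter / union if union > 0 else 0.0

pred = [3.0, 7.1]                    # made-up predicted segment, in seconds
gts = [[3.2, 6.8], [10.0, 12.0]]     # made-up ground-truth segments
ious = [interval_iou(pred, gt) for gt in gts]
best = max(range(len(gts)), key=lambda i: ious[i])
print(best, round(ious[best], 2))    # 0 0.88
# ------------------------------------------------------------------------------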
1004
+ def test_online(opt, video_name=None):
1005
+ model = MYNET(opt).cuda()
1006
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
1007
+ base_dict = checkpoint['state_dict']
1008
+ model.load_state_dict(base_dict)
1009
+ model.eval()
1010
+
1011
+ sup_model = SuppressNet(opt).cuda()
1012
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
1013
+ base_dict = checkpoint['state_dict']
1014
+ sup_model.load_state_dict(base_dict)
1015
+ sup_model.eval()
1016
+
1017
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
1018
+ test_loader = torch.utils.data.DataLoader(dataset,
1019
+ batch_size=1, shuffle=False,
1020
+ num_workers=0, pin_memory=True, drop_last=False)
1021
+
1022
+ result_dict = {}
1023
+ proposal_dict = []
1024
+
1025
+ num_class = opt["num_of_class"]
1026
+ unit_size = opt['segment_size']
1027
+ threshold = opt['threshold']
1028
+ anchors = opt['anchors']
1029
+
1030
+ start_time = time.time()
1031
+ total_frames = 0
1032
+
1033
+ for video_name in dataset.video_list:
1034
+ input_queue = torch.zeros((unit_size, opt['feat_dim']))
1035
+ sup_queue = torch.zeros(((unit_size, num_class - 1)))
1036
+
1037
+ duration = dataset.video_len[video_name]
1038
+ video_time = float(dataset.video_dict[video_name]["duration"])
1039
+ frame_to_time = 100.0 * video_time / duration
1040
+
1041
+ for idx in range(0, duration):
1042
+ total_frames += 1
1043
+ input_queue[:-1, :] = input_queue[1:, :].clone()
1044
+ input_queue[-1:, :] = dataset._get_base_data(video_name, idx, idx + 1)
1045
+
1046
+ minput = input_queue.unsqueeze(0)
1047
+ act_cls, act_reg, _ = model(minput.cuda())
1048
+ act_cls = torch.softmax(act_cls, dim=-1)
1049
+
1050
+ cls_anc = act_cls.squeeze(0).detach().cpu().numpy()
1051
+ reg_anc = act_reg.squeeze(0).detach().cpu().numpy()
1052
+
1053
+ proposal_anc_dict = []
1054
+ for anc_idx in range(0, len(anchors)):
1055
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
1056
+
1057
+ if len(cls) == 0:
1058
+ continue
1059
+
1060
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
1061
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
1062
+ st = ed - length
1063
+
1064
+ for cidx in range(0, len(cls)):
1065
+ label = cls[cidx]
1066
+ tmp_dict = {}
1067
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
1068
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
1069
+ tmp_dict["label"] = dataset.label_name[label]
1070
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
1071
+ proposal_anc_dict.append(tmp_dict)
1072
+
1073
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
1074
+
1075
+ sup_queue[:-1, :] = sup_queue[1:, :].clone()
1076
+ sup_queue[-1, :] = 0
1077
+ for proposal in proposal_anc_dict:
1078
+ cls_idx = dataset.label_name.index(proposal['label'])
1079
+ sup_queue[-1, cls_idx] = proposal["score"]
1080
+
1081
+ minput = sup_queue.unsqueeze(0)
1082
+ suppress_conf = sup_model(minput.cuda())
1083
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
1084
+
1085
+ for cls in range(0, num_class - 1):
1086
+ if suppress_conf[cls] > opt['sup_threshold']:
1087
+ for proposal in proposal_anc_dict:
1088
+ if proposal['label'] == dataset.label_name[cls]:
1089
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
1090
+ proposal_dict.append(proposal)
1091
+
1092
+ result_dict[video_name] = proposal_dict
1093
+ proposal_dict = []
1094
+
1095
+ end_time = time.time()
1096
+ working_time = end_time - start_time
1097
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
1098
+
1099
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
1100
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
1101
+ json.dump(output_dict, outfile, indent=2)
1102
+ outfile.close()
1103
+
1104
+ mAP = evaluation_detection(opt)
1105
+ return mAP
1106
+
1107
+ def main(opt, video_name=None):
1108
+ max_perf = 0
1109
+ if not video_name and 'video_name' in opt:
1110
+ video_name = opt['video_name']
1111
+
1112
+ if opt['mode'] == 'train':
1113
+ max_perf = train(opt)
1114
+ if opt['mode'] == 'test':
1115
+ max_perf = test(opt, video_name=video_name)
1116
+ if opt['mode'] == 'test_frame':
1117
+ max_perf = test_frame(opt, video_name=video_name)
1118
+ if opt['mode'] == 'test_online':
1119
+ max_perf = test_online(opt, video_name=video_name)
1120
+ if opt['mode'] == 'eval':
1121
+ max_perf = evaluation_detection(opt)
1122
+
1123
+ return max_perf
1124
+
1125
+ if __name__ == '__main__':
1126
+ opt = opts.parse_opt()
1127
+ opt = vars(opt)
1128
+ if not os.path.exists(opt["checkpoint_path"]):
1129
+ os.makedirs(opt["checkpoint_path"])
1130
+ opt_file = open(opt["checkpoint_path"] + "/" + opt["exp"] + "_opts.json", "w")
1131
+ json.dump(opt, opt_file)
1132
+ opt_file.close()
1133
+
1134
+ if opt['seed'] >= 0:
1135
+ seed = opt['seed']
1136
+ torch.manual_seed(seed)
1137
+ np.random.seed(seed)
1138
+
1139
+ opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]
1140
+
1141
+ video_name = opt.get('video_name', None)
1142
+ main(opt, video_name=video_name)
1143
+ while(opt['wterm']):
1144
+ pass
models.py ADDED
@@ -0,0 +1,232 @@
1
+ import numpy as np
2
+ import torch
3
+ import math
4
+ from torch.autograd import Variable
5
+ import torch.nn.functional as F
6
+ import torch.nn as nn
7
+ from torch.nn import init
8
+ from torch.nn.functional import normalize
9
+
10
+
11
+ class PositionalEncoding(nn.Module):
12
+ def __init__(self,
13
+ emb_size: int,
14
+ dropout: float = 0.1,
15
+ maxlen: int = 750):
16
+ super(PositionalEncoding, self).__init__()
17
+ den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
18
+ pos = torch.arange(0, maxlen).reshape(maxlen, 1)
19
+ pos_embedding = torch.zeros((maxlen, emb_size))
20
+ pos_embedding[:, 0::2] = torch.sin(pos * den)
21
+ pos_embedding[:, 1::2] = torch.cos(pos * den)
22
+ pos_embedding = pos_embedding.unsqueeze(-2)
23
+ self.dropout = nn.Dropout(dropout)
24
+ self.register_buffer('pos_embedding', pos_embedding)
25
+
26
+ def forward(self, token_embedding: torch.Tensor):
27
+ return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
28
+
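# --- Illustrative sketch (not part of the uploaded files) --------------------
# PositionalEncoding above registers a (maxlen, 1, emb_size) buffer, so it expects
# sequence-first input of shape (seq_len, batch, emb_size) and broadcasts the
# encoding over the batch dimension. Quick shape check with made-up sizes:
import torch

pe = PositionalEncoding(emb_size=1024, dropout=0.1, maxlen=400)
tokens = torch.randn(64, 2, 1024)    # seq_len x batch x emb_size, as used by MYNET below
print(pe.pos_embedding.shape)        # torch.Size([400, 1, 1024])
print(pe(tokens).shape)              # torch.Size([64, 2, 1024])
# ------------------------------------------------------------------------------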
29
+ class HistoryUnit(torch.nn.Module):
30
+ def __init__(self, opt):
31
+ super(HistoryUnit, self).__init__()
32
+ self.n_feature=opt["feat_dim"]
33
+ n_class=opt["num_of_class"]
34
+ n_embedding_dim=opt["hidden_dim"]
35
+ n_hist_dec_head = 4
36
+ n_hist_dec_layer = 5
37
+ n_hist_dec_head_2 = 4
38
+ n_hist_dec_layer_2 = 2
39
+ self.anchors=opt["anchors"]
40
+ self.history_tokens = 16
41
+ self.short_window_size = 16
42
+ self.anchors_stride=[]
43
+ dropout=0.3
44
+ self.best_loss=1000000
45
+ self.best_map=0
46
+
47
+
48
+ self.history_positional_encoding = PositionalEncoding(n_embedding_dim, dropout, maxlen=400)
49
+
50
+ self.history_encoder_block1 = nn.TransformerDecoder(
51
+ nn.TransformerDecoderLayer(d_model=n_embedding_dim,
52
+ nhead=n_hist_dec_head,
53
+ dropout=dropout,
54
+ activation='gelu'),
55
+ n_hist_dec_layer,
56
+ nn.LayerNorm(n_embedding_dim))
57
+
58
+
59
+ self.history_encoder_block2 = nn.TransformerDecoder(
60
+ nn.TransformerDecoderLayer(d_model=n_embedding_dim,
61
+ nhead=n_hist_dec_head_2,
62
+ dropout=dropout,
63
+ activation='gelu'),
64
+ n_hist_dec_layer_2,
65
+ nn.LayerNorm(n_embedding_dim))
66
+
67
+
68
+
69
+ self.snip_head = nn.Sequential(nn.Linear(n_embedding_dim,n_embedding_dim//4), nn.ReLU())
70
+ self.snip_classifier = nn.Sequential(nn.Linear(self.history_tokens*n_embedding_dim//4, (self.history_tokens*n_embedding_dim//4)//4), nn.ReLU(), nn.Linear((self.history_tokens*n_embedding_dim//4)//4,n_class))
71
+
72
+
73
+ self.history_token = nn.Parameter(torch.zeros(self.history_tokens, 1, n_embedding_dim))
74
+ # self.history_token_extra = nn.Parameter(torch.zeros(self.history_tokens*2, 1, n_embedding_dim))
75
+
76
+ self.norm2 = nn.LayerNorm(n_embedding_dim)
77
+ self.dropout2 = nn.Dropout(0.1)
78
+
79
+
80
+ def forward(self, long_x, encoded_x):
81
+
82
+
83
+ ## History Encoder
84
+ hist_pe_x = self.history_positional_encoding(long_x)
85
+ history_token = self.history_token.expand(-1, hist_pe_x.shape[1], -1)
86
+ hist_encoded_x_1 = self.history_encoder_block1(history_token, hist_pe_x)
87
+ hist_encoded_x_2 = self.history_encoder_block2(hist_encoded_x_1, encoded_x)
88
+ hist_encoded_x_2 = hist_encoded_x_2 + self.dropout2(hist_encoded_x_1)
89
+ hist_encoded_x = self.norm2(hist_encoded_x_2)
90
+
91
+ ## Snippet Classification Head
92
+ snippet_feat = self.snip_head(hist_encoded_x_1)
93
+ snippet_feat = torch.flatten(snippet_feat.permute(1, 0, 2), start_dim=1)
94
+
95
+ snip_cls = self.snip_classifier(snippet_feat)
96
+
97
+ return hist_encoded_x, snip_cls
98
+
99
+
100
+
101
+ class MYNET(torch.nn.Module):
102
+ def __init__(self, opt):
103
+ super(MYNET, self).__init__()
104
+ self.n_feature=opt["feat_dim"]
105
+ n_class=opt["num_of_class"]
106
+ n_embedding_dim=opt["hidden_dim"]
107
+ n_enc_layer=opt["enc_layer"]
108
+ n_enc_head=opt["enc_head"]
109
+ n_dec_layer=opt["dec_layer"]
110
+ n_dec_head=opt["dec_head"]
111
+ n_comb_dec_head = 4
112
+ n_comb_dec_layer = 5
113
+ n_seglen=opt["segment_size"]
114
+ self.anchors=opt["anchors"]
115
+ self.history_tokens = 16
116
+ self.short_window_size = 16
117
+ self.anchors_stride=[]
118
+ dropout=0.3
119
+ self.best_loss=1000000
120
+ self.best_map=0
121
+
122
+ self.feature_reduction_rgb = nn.Linear(self.n_feature//2, n_embedding_dim//2)
123
+ self.feature_reduction_flow = nn.Linear(self.n_feature//2, n_embedding_dim//2)
124
+
125
+ self.positional_encoding = PositionalEncoding(n_embedding_dim, dropout, maxlen=400)
126
+
127
+ self.encoder = nn.TransformerEncoder(
128
+ nn.TransformerEncoderLayer(d_model=n_embedding_dim,
129
+ nhead=n_enc_head,
130
+ dropout=dropout,
131
+ activation='gelu'),
132
+ n_enc_layer,
133
+ nn.LayerNorm(n_embedding_dim))
134
+
135
+ self.decoder = nn.TransformerDecoder(
136
+ nn.TransformerDecoderLayer(d_model=n_embedding_dim,
137
+ nhead=n_dec_head,
138
+ dropout=dropout,
139
+ activation='gelu'),
140
+ n_dec_layer,
141
+ nn.LayerNorm(n_embedding_dim))
142
+
143
+ self.history_unit = HistoryUnit(opt)
144
+
145
+
146
+ self.history_anchor_decoder_block1 = nn.TransformerDecoder(
147
+ nn.TransformerDecoderLayer(d_model=n_embedding_dim,
148
+ nhead=n_comb_dec_head,
149
+ dropout=dropout,
150
+ activation='gelu'),
151
+ n_comb_dec_layer,
152
+ nn.LayerNorm(n_embedding_dim))
153
+
154
+
155
+ self.classifier = nn.Sequential(nn.Linear(n_embedding_dim,n_embedding_dim), nn.ReLU(), nn.Linear(n_embedding_dim,n_class))
156
+ self.regressor = nn.Sequential(nn.Linear(n_embedding_dim,n_embedding_dim), nn.ReLU(), nn.Linear(n_embedding_dim,2))
157
+
158
+
159
+ self.decoder_token = nn.Parameter(torch.zeros(len(self.anchors), 1, n_embedding_dim))
160
+
161
+
162
+ self.norm1 = nn.LayerNorm(n_embedding_dim)
163
+ self.dropout1 = nn.Dropout(0.1)
164
+
165
+ self.relu = nn.ReLU(True)
166
+ self.softmaxd1 = nn.Softmax(dim=-1)
167
+
168
+ def forward(self, inputs):
169
+ # base_x_rgb = self.feature_reduction_rgb(inputs[:,:,:self.n_feature//2])
170
+ # base_x_flow = self.feature_reduction_flow(inputs[:,:,self.n_feature//2:])
171
+ base_x_rgb = self.feature_reduction_rgb(inputs[:,:,:self.n_feature//2].float())
172
+ base_x_flow = self.feature_reduction_flow(inputs[:,:,self.n_feature//2:].float())
173
+ base_x = torch.cat([base_x_rgb,base_x_flow],dim=-1)
174
+
175
+ base_x = base_x.permute([1,0,2])  # seq_len x batch x featsize
176
+
177
+ short_x = base_x[-self.short_window_size:]
178
+
179
+ long_x = base_x[:-self.short_window_size]
180
+
181
+ ## Anchor Feature Generator
182
+ pe_x = self.positional_encoding(short_x)
183
+ encoded_x = self.encoder(pe_x)
184
+ decoder_token = self.decoder_token.expand(-1, encoded_x.shape[1], -1)
185
+ decoded_x = self.decoder(decoder_token, encoded_x)
186
187
+
188
+ ## Future-Supervised History Module
189
+ hist_encoded_x, snip_cls = self.history_unit(long_x, encoded_x)
190
+
191
+
192
+ ## History Driven Anchor Refinement
193
+ decoded_anchor_feat = self.history_anchor_decoder_block1(decoded_x, hist_encoded_x)
194
+ decoded_anchor_feat = decoded_anchor_feat + self.dropout1(decoded_x)
195
+ decoded_anchor_feat = self.norm1(decoded_anchor_feat)
196
+ decoded_anchor_feat = decoded_anchor_feat.permute([1, 0, 2])
197
+
198
+ # Prediction Module
199
+ anc_cls = self.classifier(decoded_anchor_feat)
200
+ anc_reg = self.regressor(decoded_anchor_feat)
201
+
202
+ return anc_cls, anc_reg, snip_cls
203
+
204
+
205
+ class SuppressNet(torch.nn.Module):
206
+ def __init__(self, opt):
207
+ super(SuppressNet, self).__init__()
208
+ n_class=opt["num_of_class"]-1
209
+ n_seglen=opt["segment_size"]
210
+ n_embedding_dim=2*n_seglen
211
+ dropout=0.3
212
+ self.best_loss=1000000
213
+ self.best_map=0
214
+ # FC layers for the 2 streams
215
+
216
+ self.mlp1 = nn.Linear(n_seglen, n_embedding_dim)
217
+ self.mlp2 = nn.Linear(n_embedding_dim, 1)
218
+ self.norm = nn.InstanceNorm1d(n_class)
219
+ self.relu = nn.ReLU(True)
220
+ self.sigmoid = nn.Sigmoid()
221
+
222
+ def forward(self, inputs):
223
+ #inputs - batch x seq_len x class
224
+
225
+ base_x = inputs.permute([0,2,1])
226
+ base_x = self.norm(base_x)
227
+ x = self.relu(self.mlp1(base_x))
228
+ x = self.sigmoid(self.mlp2(x))
229
+ x = x.squeeze(-1)
230
+
231
+ return x
232
+
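# --- Illustrative sketch (not part of the uploaded files) --------------------
# Minimal CPU smoke test of the two networks above. The opt dict is an assumption
# built from the defaults in opts_egtea.py (feat_dim=2048, hidden_dim=1024,
# num_of_class=23, segment_size=64, anchors='2,4,6,8,12,16').
import torch

opt = {"feat_dim": 2048, "hidden_dim": 1024, "num_of_class": 23, "segment_size": 64,
       "enc_layer": 3, "enc_head": 8, "dec_layer": 5, "dec_head": 4,
       "anchors": [2, 4, 6, 8, 12, 16]}

net = MYNET(opt).eval()
x = torch.randn(2, opt["segment_size"], opt["feat_dim"])            # batch x seq_len x feat_dim
with torch.no_grad():
    anc_cls, anc_reg, snip_cls = net(x)
print(anc_cls.shape, anc_reg.shape, snip_cls.shape)
# -> torch.Size([2, 6, 23]) torch.Size([2, 6, 2]) torch.Size([2, 23])

sup = SuppressNet(opt).eval()
conf = torch.rand(1, opt["segment_size"], opt["num_of_class"] - 1)  # batch x seq_len x classes
with torch.no_grad():
    print(sup(conf).shape)                                          # torch.Size([1, 22])
# ------------------------------------------------------------------------------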
opts_egtea.py ADDED
@@ -0,0 +1,62 @@
1
+ import argparse
2
+
3
+ def parse_opt():
4
+ parser = argparse.ArgumentParser()
5
+ # Overall settings
6
+ parser.add_argument('--mode', type=str, default='train')
7
+ parser.add_argument('--video_name', type=str, default=None, help='Name of the single video to evaluate')
8
+ parser.add_argument('--video_path', type=str, default='', help='Path to the input video file for visualization')
9
+ parser.add_argument('--checkpoint_path', type=str, default='./checkpoint')
10
+ parser.add_argument('--segment_size', type=int, default=64)
11
+ parser.add_argument('--anchors', type=str, default='2,4,6,8,12,16')
12
+ parser.add_argument('--seed', default=7, type=int, help='random seed for reproducibility')
13
+
14
+ # Overall Dataset settings
15
+ parser.add_argument('--num_of_class', type=int, default=23)
16
+ parser.add_argument('--data_format', type=str, default="npz_i3d")
17
+ parser.add_argument('--data_rescale', default=False, action='store_true')
18
+ parser.add_argument('--predefined_fps', default=None, type=float)
19
+ parser.add_argument('--rgb_only', default=False, action='store_true')
20
+ parser.add_argument('--video_anno', type=str, default="./data/egtea_annotations_split{}.json")
21
+ parser.add_argument('--video_feature_all_train', type=str, default="./data/I3D/")
22
+ parser.add_argument('--video_feature_all_test', type=str, default="./data/I3D/")
23
+ parser.add_argument('--setup', type=str, default="")
24
+ parser.add_argument('--exp', type=str, default="01")
25
+ parser.add_argument('--split', type=str, default="1")
26
+
27
+ # Network
28
+ parser.add_argument('--feat_dim', type=int, default=2048)
29
+ parser.add_argument('--hidden_dim', type=int, default=1024)
30
+ parser.add_argument('--out_dim', type=int, default=23)
31
+ parser.add_argument('--enc_layer', type=int, default=3)
32
+ parser.add_argument('--enc_head', type=int, default=8)
33
+ parser.add_argument('--dec_layer', type=int, default=5)
34
+ parser.add_argument('--dec_head', type=int, default=4)
35
+
36
+ # Training settings
37
+ parser.add_argument('--batch_size', type=int, default=128)
38
+ parser.add_argument('--lr', type=float, default=1e-4)
39
+ parser.add_argument('--weight_decay', type=float, default=1e-4)
40
+ parser.add_argument('--epoch', type=int, default=5)
41
+ parser.add_argument('--lr_step', type=int, default=3)
42
+
43
+ # Post processing
44
+ parser.add_argument('--alpha', type=float, default=1)
45
+ parser.add_argument('--beta', type=float, default=1)
46
+ parser.add_argument('--gamma', type=float, default=0.2)
47
+ parser.add_argument('--pptype', type=str, default="net")
48
+ parser.add_argument('--pos_threshold', type=float, default=0.5)
49
+ parser.add_argument('--sup_threshold', type=float, default=0.1)
50
+ parser.add_argument('--threshold', type=float, default=0.1)
51
+ parser.add_argument('--inference_subset', type=str, default="test")
52
+ parser.add_argument('--soft_nms', type=float, default=0.3)
53
+ parser.add_argument('--video_len_file', type=str, default="./output/video_len_{}.json")
54
+ parser.add_argument('--proposal_label_file', type=str, default="./output/proposal_label_{}.h5")
55
+ parser.add_argument('--suppress_label_file', type=str, default="./output/suppress_label_{}.h5")
56
+ parser.add_argument('--suppress_result_file', type=str, default="./output/suppress_result{}.h5")
57
+ parser.add_argument('--frame_result_file', type=str, default="./output/frame_result{}.h5")
58
+ parser.add_argument('--result_file', type=str, default="./output/result_proposal{}.json")
59
+ parser.add_argument('--wterm', default=False, action='store_true')  # with type=bool, any non-empty string (even "False") would parse as True
60
+
61
+ args = parser.parse_args()
62
+ return args
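# --- Illustrative sketch (not part of the uploaded files) --------------------
# parse_opt() reads sys.argv directly, so one way to smoke-test the options (and
# the anchor parsing done in main.py's __main__ block) is to override argv.
# Assumes this file is importable as opts_egtea from the working directory.
import sys
import opts_egtea as opts

sys.argv = ["main.py", "--mode", "test", "--exp", "01", "--split", "1"]
opt = vars(opts.parse_opt())
opt["anchors"] = [int(a) for a in opt["anchors"].split(",")]   # mirrors main.py
print(opt["mode"], opt["anchors"])                             # test [2, 4, 6, 8, 12, 16]
# ------------------------------------------------------------------------------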
requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ h5py
2
+ ipdb
3
+ scikit-learn  # the standalone 'sklearn' PyPI package is deprecated
4
+ matplotlib
5
+ tensorboardX
result image
main.py ADDED
@@ -0,0 +1,779 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.parallel
6
+ import torch.nn.functional as F
7
+ import torch.optim as optim
8
+ import numpy as np
9
+ import opts_egtea as opts
10
+
11
+ import time
12
+ import h5py
13
+ from tqdm import tqdm
14
+ from iou_utils import *
15
+ from eval import evaluation_detection
16
+ from tensorboardX import SummaryWriter
17
+ from dataset import VideoDataSet, calc_iou
18
+ from models import MYNET, SuppressNet
19
+ from loss_func import cls_loss_func, cls_loss_func_, regress_loss_func
20
+ from loss_func import MultiCrossEntropyLoss
21
+ from functools import *
22
+
23
+ import matplotlib.pyplot as plt
24
+ import matplotlib.patches as patches
25
+ import cv2
26
+ from typing import List, Dict, Optional
27
+
28
+ # Visualization Configuration
29
30
+ VIS_CONFIG = {
31
+ 'frame_interval': 1.0, # Sample frames every 1 second
32
+ 'max_frames': 20, # Maximum number of frames to display
33
+ 'save_dir': './output/visualizations',
34
+ 'gt_color': '#1f77b4', # Blue for ground truth
35
+ 'pred_color': '#ff7f0e', # Orange for predictions
36
+ 'fontsize_label': 10, # Reduced for better fit
37
+ 'fontsize_title': 14,
38
+ 'frame_highlight_both': 'green',
39
+ 'frame_highlight_gt': 'red',
40
+ 'frame_highlight_pred': 'black',
41
+ 'iou_threshold': 0.3,
42
+ 'frame_scale_factor': 0.8, # Reduced scaling for smaller figure
43
+ }
44
+
45
+ def visualize_action_lengths(
46
+ video_id: str,
47
+ pred_segments: List[Dict],
48
+ gt_segments: List[Dict],
49
+ video_path: str,
50
+ duration: float,
51
+ save_dir: str = VIS_CONFIG['save_dir'],
52
+ frame_interval: float = VIS_CONFIG['frame_interval']
53
+ ) -> None:
54
+ """
55
+ Generate a visualization plot comparing ground truth and predicted action lengths with video frames.
56
+
57
+ Args:
58
+ video_id: Video identifier (e.g., 'my_video').
59
+ pred_segments: List of predicted segments with 'label', 'start', 'end', 'duration', 'score'.
60
+ gt_segments: List of ground truth segments with 'label', 'start', 'end', 'duration'.
61
+ video_path: Path to the input video file.
62
+ duration: Total duration of the video in seconds.
63
+ save_dir: Directory to save the output image.
64
+ frame_interval: Time interval between sampled frames (seconds).
65
+ """
66
+ os.makedirs(save_dir, exist_ok=True)
67
+
68
+ # Calculate frame sampling times
69
+ num_frames = int(duration / frame_interval) + 1
70
+ if num_frames > VIS_CONFIG['max_frames']:
71
+ frame_interval = duration / (VIS_CONFIG['max_frames'] - 1)
72
+ num_frames = VIS_CONFIG['max_frames']
73
+ print(f"Warning: Video duration ({duration:.1f}s) requires {num_frames} frames. Adjusted frame_interval to {frame_interval:.2f}s.")
74
+
75
+ frame_times = np.linspace(0, duration, num_frames, endpoint=False)
76
+
77
+ # Load video frames
78
+ frames = []
79
+ cap = cv2.VideoCapture(video_path)
80
+ if not cap.isOpened():
81
+ print(f"Warning: Could not open video {video_path}. Using placeholder frames.")
82
+ frames = [np.ones((100, 100, 3), dtype=np.uint8) * 255 for _ in frame_times]
83
+ else:
84
+ for t in frame_times:
85
+ cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000)
86
+ ret, frame = cap.read()
87
+ if ret:
88
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
89
+ # Resize frame to reduce memory usage
90
+ frame = cv2.resize(frame, (int(frame.shape[1] * 0.5), int(frame.shape[0] * 0.5)))
91
+ frames.append(frame)
92
+ else:
93
+ frames.append(np.ones((100, 100, 3), dtype=np.uint8) * 255)
94
+ cap.release()
95
+
96
+ # Initialize figure
97
+ fig = plt.figure(figsize=(num_frames * VIS_CONFIG['frame_scale_factor'], 6), constrained_layout=True)
98
+ gs = fig.add_gridspec(3, num_frames, height_ratios=[3, 1, 1])
99
+
100
+ # Plot frames
101
+ for i, (t, frame) in enumerate(zip(frame_times, frames)):
102
+ ax = fig.add_subplot(gs[0, i])
103
+
104
+ # Check if frame falls within GT or predicted segments
105
+ gt_hit = any(seg['start'] <= t <= seg['end'] for seg in gt_segments)
106
+ pred_hit = any(seg['start'] <= t <= seg['end'] for seg in pred_segments)
107
+
108
+ # Set border color
109
+ border_color = None
110
+ if gt_hit and pred_hit:
111
+ border_color = VIS_CONFIG['frame_highlight_both']
112
+ elif gt_hit:
113
+ border_color = VIS_CONFIG['frame_highlight_gt']
114
+ elif pred_hit:
115
+ border_color = VIS_CONFIG['frame_highlight_pred']
116
+
117
+ ax.imshow(frame)
118
+ ax.set_xticks([]); ax.set_yticks([])  # keep the spines so the highlight border below stays visible (ax.axis('off') would hide it)
119
+ if border_color:
120
+ for spine in ax.spines.values():
121
+ spine.set_edgecolor(border_color)
122
+ spine.set_linewidth(2)
+ else:
+ for spine in ax.spines.values():
+ spine.set_visible(False)
123
+
124
+ ax.set_title(f"{t:.1f}s", fontsize=VIS_CONFIG['fontsize_label'],
125
+ color=border_color if border_color else 'black')
126
+
127
+ # Plot ground truth bar
128
+ ax_gt = fig.add_subplot(gs[1, :])
129
+ ax_gt.set_xlim(0, duration)
130
+ ax_gt.set_ylim(0, 1)
131
+ ax_gt.axis('off')
132
+ ax_gt.text(-0.02 * duration, 0.5, "Ground Truth", fontsize=VIS_CONFIG['fontsize_title'],
133
+ va='center', ha='right', weight='bold')
134
+
135
+ for seg in gt_segments:
136
+ start, end = seg['start'], seg['end']
137
+ width = end - start
138
+ label = seg['label'][:10] + '...' if len(seg['label']) > 10 else seg['label']
139
+ ax_gt.add_patch(patches.Rectangle(
140
+ (start, 0.3), width, 0.4, facecolor=VIS_CONFIG['gt_color'],
141
+ edgecolor='black', alpha=0.8
142
+ ))
143
+ ax_gt.text((start + end) / 2, 0.5, label, ha='center', va='center',
144
+ fontsize=VIS_CONFIG['fontsize_label'], color='white')
145
+ ax_gt.text(start, 0.2, f"{start:.1f}", ha='center', fontsize=8, color='black')
146
+ ax_gt.text(end, 0.2, f"{end:.1f}", ha='center', fontsize=8, color='black')
147
+
148
+ # Plot prediction bar
149
+ ax_pred = fig.add_subplot(gs[2, :])
150
+ ax_pred.set_xlim(0, duration)
151
+ ax_pred.set_ylim(0, 1)
152
+ ax_pred.axis('off')
153
+ ax_pred.text(-0.02 * duration, 0.5, "Prediction", fontsize=VIS_CONFIG['fontsize_title'],
154
+ va='center', ha='right', weight='bold')
155
+
156
+ for seg in pred_segments:
157
+ start, end = seg['start'], seg['end']
158
+ width = end - start
159
+ label = seg['label'][:10] + '...' if len(seg['label']) > 10 else seg['label']
160
+ ax_pred.add_patch(patches.Rectangle(
161
+ (start, 0.3), width, 0.4, facecolor=VIS_CONFIG['pred_color'],
162
+ edgecolor='black', alpha=0.8
163
+ ))
164
+ ax_pred.text((start + end) / 2, 0.5, label, ha='center', va='center',
165
+ fontsize=VIS_CONFIG['fontsize_label'], color='white')
166
+ ax_pred.text(start, 0.8, f"{start:.1f}", ha='center', fontsize=8, color='black')
167
+ ax_pred.text(end, 0.8, f"{end:.1f}", ha='center', fontsize=8, color='black')
168
+
169
+ # Save plot
170
+ out_path = os.path.join(save_dir, f"viz_{video_id}_{opt['exp']}.png")  # PNG output; opt is the module-level options dict
171
+ plt.savefig(out_path, dpi=100, bbox_inches='tight')  # moderate DPI keeps the file small
172
+ print(f"[✅ Saved Visualization]: {out_path}")
173
+ plt.close()
174
+
175
+
176
+
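# --- Illustrative sketch (not part of the uploaded files) --------------------
# The segment lists consumed by visualize_action_lengths above (and by the
# comparison table in test() further down) look like this; the labels and times
# are made up for illustration only:
gt_segments = [
    {"label": "Take bowl", "start": 3.2, "end": 6.8, "duration": 3.6},
]
pred_segments = [
    {"label": "Take bowl", "start": 3.0, "end": 7.1, "duration": 4.1, "score": 0.62},
]
# visualize_action_lengths(video_id, pred_segments, gt_segments, video_path, duration)
# also reads opt['exp'] from the module-level options dict when naming the output image.
# ------------------------------------------------------------------------------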
177
+ def train_one_epoch(opt, model, train_dataset, optimizer, warmup=False):
178
+ train_loader = torch.utils.data.DataLoader(train_dataset,
179
+ batch_size=opt['batch_size'], shuffle=True,
180
+ num_workers=0, pin_memory=True, drop_last=False)
181
+ epoch_cost = 0
182
+ epoch_cost_cls = 0
183
+ epoch_cost_reg = 0
184
+ epoch_cost_snip = 0
185
+
186
+ total_iter = len(train_dataset) // opt['batch_size']
187
+ cls_loss = MultiCrossEntropyLoss(focal=True)
188
+ snip_loss = MultiCrossEntropyLoss(focal=True)
189
+ for n_iter, (input_data, cls_label, reg_label, snip_label) in enumerate(tqdm(train_loader)):
190
+ if warmup:
191
+ for g in optimizer.param_groups:
192
+ g['lr'] = n_iter * (opt['lr']) / total_iter
193
+
194
+ act_cls, act_reg, snip_cls = model(input_data.float().cuda())
195
+
196
+ act_cls.register_hook(partial(cls_loss.collect_grad, cls_label))
197
+ snip_cls.register_hook(partial(snip_loss.collect_grad, snip_label))
198
+
199
+ cost_reg = 0
200
+ cost_cls = 0
201
+
202
+ loss = cls_loss_func_(cls_loss, cls_label, act_cls)
203
+ cost_cls = loss
204
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
205
+
206
+ loss = regress_loss_func(reg_label, act_reg)
207
+ cost_reg = loss
208
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
209
+
210
+ loss = cls_loss_func_(snip_loss, snip_label, snip_cls)
211
+ cost_snip = loss
212
+ epoch_cost_snip += cost_snip.detach().cpu().numpy()
213
+
214
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg + opt['gamma'] * cost_snip
215
+ epoch_cost += cost.detach().cpu().numpy()
216
+
217
+ optimizer.zero_grad()
218
+ cost.backward()
219
+ optimizer.step()
220
+
221
+ return n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip
222
+
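# --- Illustrative sketch (not part of the uploaded files) --------------------
# During the warmup pass above, the learning rate is ramped linearly from 0 to
# opt['lr'] over one epoch. With made-up numbers (lr=1e-4, 10 iterations/epoch):
base_lr, total_iter = 1e-4, 10
for n_iter in range(total_iter):
    lr = n_iter * base_lr / total_iter
    print(n_iter, f"{lr:.1e}")       # 0.0e+00, 1.0e-05, ..., 9.0e-05
# ------------------------------------------------------------------------------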
223
+ def eval_one_epoch(opt, model, test_dataset):
224
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, test_dataset)
225
+
226
+ result_dict = eval_map_nms(opt, test_dataset, output_cls, output_reg, labels_cls, labels_reg)
227
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
228
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
229
+ json.dump(output_dict, outfile, indent=2)
230
+ outfile.close()
231
+
232
+ IoUmAP = evaluation_detection(opt, verbose=False)
233
+ IoUmAP_5 = sum(IoUmAP) / len(IoUmAP)
234
+
235
+ return cls_loss, reg_loss, tot_loss, IoUmAP_5
236
+
237
+ def train(opt):
238
+ writer = SummaryWriter()
239
+ model = MYNET(opt).cuda()
240
+
241
+ rest_of_model_params = [param for name, param in model.named_parameters() if "history_unit" not in name]
242
+ optimizer = optim.Adam([{'params': model.history_unit.parameters(), 'lr': 1e-6}, {'params': rest_of_model_params}], lr=opt["lr"], weight_decay=opt["weight_decay"])
243
+ scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt["lr_step"])
244
+
245
+ train_dataset = VideoDataSet(opt, subset="train")
246
+ test_dataset = VideoDataSet(opt, subset=opt['inference_subset'])
247
+
248
+ warmup = False
249
+
250
+ for n_epoch in range(opt['epoch']):
251
+ if n_epoch >= 1:
252
+ warmup = False
253
+
254
+ n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip = train_one_epoch(opt, model, train_dataset, optimizer, warmup)
255
+
256
+ writer.add_scalars('data/cost', {'train': epoch_cost / (n_iter + 1)}, n_epoch)
257
+ print("training loss(epoch %d): %.03f, cls - %f, reg - %f, snip - %f, lr - %f" % (n_epoch,
258
+ epoch_cost / (n_iter + 1),
259
+ epoch_cost_cls / (n_iter + 1),
260
+ epoch_cost_reg / (n_iter + 1),
261
+ epoch_cost_snip / (n_iter + 1),
262
+ optimizer.param_groups[-1]["lr"]))
263
+
264
+ scheduler.step()
265
+ model.eval()
266
+
267
+ cls_loss, reg_loss, tot_loss, IoUmAP_5 = eval_one_epoch(opt, model, test_dataset)
268
+
269
+ writer.add_scalars('data/mAP', {'test': IoUmAP_5}, n_epoch)
270
+ print("testing loss(epoch %d): %.03f, cls - %f, reg - %f, mAP Avg - %f" % (n_epoch, tot_loss, cls_loss, reg_loss, IoUmAP_5))
271
+
272
+ state = {'epoch': n_epoch + 1, 'state_dict': model.state_dict()}
273
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_checkpoint_" + str(n_epoch + 1) + ".pth.tar")
274
+ if IoUmAP_5 > model.best_map:
275
+ model.best_map = IoUmAP_5
276
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_ckp_best.pth.tar")
277
+
278
+ model.train()
279
+
280
+ writer.close()
281
+ return model.best_map
282
+
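# --- Illustrative sketch (not part of the uploaded files) --------------------
# train() above gives the history unit its own fixed learning rate (1e-6) while
# the remaining parameter group inherits the top-level lr. The same pattern on a
# toy module (names below are placeholders, not from the repository):
import torch.nn as nn
import torch.optim as optim

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.history_unit = nn.Linear(8, 8)   # slow-moving part
        self.head = nn.Linear(8, 2)           # everything else

toy = Toy()
rest = [p for n, p in toy.named_parameters() if "history_unit" not in n]
optimizer = optim.Adam([{"params": toy.history_unit.parameters(), "lr": 1e-6},
                        {"params": rest}], lr=1e-4, weight_decay=1e-4)
print([g["lr"] for g in optimizer.param_groups])   # [1e-06, 0.0001]
# ------------------------------------------------------------------------------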
283
+ def eval_frame(opt, model, dataset):
284
+ test_loader = torch.utils.data.DataLoader(dataset,
285
+ batch_size=opt['batch_size'], shuffle=False,
286
+ num_workers=0, pin_memory=True, drop_last=False)
287
+
288
+ labels_cls = {}
289
+ labels_reg = {}
290
+ output_cls = {}
291
+ output_reg = {}
292
+ for video_name in dataset.video_list:
293
+ labels_cls[video_name] = []
294
+ labels_reg[video_name] = []
295
+ output_cls[video_name] = []
296
+ output_reg[video_name] = []
297
+
298
+ start_time = time.time()
299
+ total_frames = 0
300
+ epoch_cost = 0
301
+ epoch_cost_cls = 0
302
+ epoch_cost_reg = 0
303
+
304
+ for n_iter, (input_data, cls_label, reg_label, _) in enumerate(tqdm(test_loader)):
305
+ act_cls, act_reg, _ = model(input_data.float().cuda())
306
+ cost_reg = 0
307
+ cost_cls = 0
308
+
309
+ loss = cls_loss_func(cls_label, act_cls)
310
+ cost_cls = loss
311
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
312
+
313
+ loss = regress_loss_func(reg_label, act_reg)
314
+ cost_reg = loss
315
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
316
+
317
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg
318
+ epoch_cost += cost.detach().cpu().numpy()
319
+
320
+ act_cls = torch.softmax(act_cls, dim=-1)
321
+
322
+ total_frames += input_data.size(0)
323
+
324
+ for b in range(0, input_data.size(0)):
325
+ video_name, st, ed, data_idx = dataset.inputs[n_iter * opt['batch_size'] + b]
326
+ output_cls[video_name] += [act_cls[b, :].detach().cpu().numpy()]
327
+ output_reg[video_name] += [act_reg[b, :].detach().cpu().numpy()]
328
+ labels_cls[video_name] += [cls_label[b, :].numpy()]
329
+ labels_reg[video_name] += [reg_label[b, :].numpy()]
330
+
331
+ end_time = time.time()
332
+ working_time = end_time - start_time
333
+
334
+ for video_name in dataset.video_list:
335
+ labels_cls[video_name] = np.stack(labels_cls[video_name], axis=0)
336
+ labels_reg[video_name] = np.stack(labels_reg[video_name], axis=0)
337
+ output_cls[video_name] = np.stack(output_cls[video_name], axis=0)
338
+ output_reg[video_name] = np.stack(output_reg[video_name], axis=0)
339
+
340
+ cls_loss = epoch_cost_cls / n_iter
341
+ reg_loss = epoch_cost_reg / n_iter
342
+ tot_loss = epoch_cost / n_iter
343
+
344
+ return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames
345
+
346
+ def eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
347
+ result_dict = {}
348
+ proposal_dict = []
349
+
350
+ num_class = opt["num_of_class"]
351
+ unit_size = opt['segment_size']
352
+ threshold = opt['threshold']
353
+ anchors = opt['anchors']
354
+
355
+ for video_name in dataset.video_list:
356
+ duration = dataset.video_len[video_name]
357
+ video_time = float(dataset.video_dict[video_name]["duration"])
358
+ frame_to_time = 100.0 * video_time / duration
359
+
360
+ for idx in range(0, duration):
361
+ cls_anc = output_cls[video_name][idx]
362
+ reg_anc = output_reg[video_name][idx]
363
+
364
+ proposal_anc_dict = []
365
+ for anc_idx in range(0, len(anchors)):
366
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
367
+
368
+ if len(cls) == 0:
369
+ continue
370
+
371
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
372
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
373
+ st = ed - length
374
+
375
+ for cidx in range(0, len(cls)):
376
+ label = cls[cidx]
377
+ tmp_dict = {}
378
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
379
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
380
+ tmp_dict["label"] = dataset.label_name[label]
381
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
382
+ proposal_anc_dict.append(tmp_dict)
383
+
384
+ proposal_dict += proposal_anc_dict
385
+
386
+ proposal_dict = non_max_suppression(proposal_dict, overlapThresh=opt['soft_nms'])
387
+ result_dict[video_name] = proposal_dict
388
+ proposal_dict = []
389
+
390
+ return result_dict
391
+
392
+ def eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
393
+ model = SuppressNet(opt).cuda()
394
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
395
+ base_dict = checkpoint['state_dict']
396
+ model.load_state_dict(base_dict)
397
+ model.eval()
398
+
399
+ result_dict = {}
400
+ proposal_dict = []
401
+
402
+ num_class = opt["num_of_class"]
403
+ unit_size = opt['segment_size']
404
+ threshold = opt['threshold']
405
+ anchors = opt['anchors']
406
+
407
+ for video_name in dataset.video_list:
408
+ duration = dataset.video_len[video_name]
409
+ video_time = float(dataset.video_dict[video_name]["duration"])
410
+ frame_to_time = 100.0 * video_time / duration
411
+ conf_queue = torch.zeros((unit_size, num_class - 1))
412
+
413
+ for idx in range(0, duration):
414
+ cls_anc = output_cls[video_name][idx]
415
+ reg_anc = output_reg[video_name][idx]
416
+
417
+ proposal_anc_dict = []
418
+ for anc_idx in range(0, len(anchors)):
419
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
420
+
421
+ if len(cls) == 0:
422
+ continue
423
+
424
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
425
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
426
+ st = ed - length
427
+
428
+ for cidx in range(0, len(cls)):
429
+ label = cls[cidx]
430
+ tmp_dict = {}
431
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
432
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
433
+ tmp_dict["label"] = dataset.label_name[label]
434
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
435
+ proposal_anc_dict.append(tmp_dict)
436
+
437
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
438
+
439
+ conf_queue[:-1, :] = conf_queue[1:, :].clone()
440
+ conf_queue[-1, :] = 0
441
+ for proposal in proposal_anc_dict:
442
+ cls_idx = dataset.label_name.index(proposal['label'])
443
+ conf_queue[-1, cls_idx] = proposal["score"]
444
+
445
+ minput = conf_queue.unsqueeze(0)
446
+ suppress_conf = model(minput.cuda())
447
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
448
+
449
+ for cls in range(0, num_class - 1):
450
+ if suppress_conf[cls] > opt['sup_threshold']:
451
+ for proposal in proposal_anc_dict:
452
+ if proposal['label'] == dataset.label_name[cls]:
453
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
454
+ proposal_dict.append(proposal)
455
+
456
+ result_dict[video_name] = proposal_dict
457
+ proposal_dict = []
458
+
459
+ return result_dict
460
+
461
+ def test_frame(opt, video_name=None):
462
+ model = MYNET(opt).cuda()
463
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
464
+ base_dict = checkpoint['state_dict']
465
+ model.load_state_dict(base_dict)
466
+ model.eval()
467
+
468
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
469
+ outfile = h5py.File(opt['frame_result_file'].format(opt['exp']), 'w')
470
+
471
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
472
+
473
+ print("testing loss: %f, cls_loss: %f, reg_loss: %f" % (tot_loss, cls_loss, reg_loss))
474
+
475
+ for video_name in dataset.video_list:
476
+ o_cls = output_cls[video_name]
477
+ o_reg = output_reg[video_name]
478
+ l_cls = labels_cls[video_name]
479
+ l_reg = labels_reg[video_name]
480
+
481
+ dset_predcls = outfile.create_dataset(video_name + '/pred_cls', o_cls.shape, maxshape=o_cls.shape, chunks=True, dtype=np.float32)
482
+ dset_predcls[:, :] = o_cls[:, :]
483
+ dset_predreg = outfile.create_dataset(video_name + '/pred_reg', o_reg.shape, maxshape=o_reg.shape, chunks=True, dtype=np.float32)
484
+ dset_predreg[:, :] = o_reg[:, :]
485
+ dset_labelcls = outfile.create_dataset(video_name + '/label_cls', l_cls.shape, maxshape=l_cls.shape, chunks=True, dtype=np.float32)
486
+ dset_labelcls[:, :] = l_cls[:, :]
487
+ dset_labelreg = outfile.create_dataset(video_name + '/label_reg', l_reg.shape, maxshape=l_reg.shape, chunks=True, dtype=np.float32)
488
+ dset_labelreg[:, :] = l_reg[:, :]
489
+ outfile.close()
490
+
491
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
492
+ return cls_loss, reg_loss, tot_loss
493
+
494
+ def patch_attention(m):
495
+ forward_orig = m.forward
496
+
497
+ def wrap(*args, **kwargs):
498
+ kwargs["need_weights"] = True
499
+ kwargs["average_attn_weights"] = False
500
+ return forward_orig(*args, **kwargs)
501
+
502
+ m.forward = wrap
503
+
504
+ class SaveOutput:
505
+ def __init__(self):
506
+ self.outputs = []
507
+
508
+ def __call__(self, module, module_in, module_out):
509
+ self.outputs.append(module_out[1])
510
+
511
+ def clear(self):
512
+ self.outputs = []
513
+
514
+ def test(opt, video_name=None):
515
+ model = MYNET(opt).cuda()
516
+ checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "_ckp_best.pth.tar")
517
+ base_dict = checkpoint['state_dict']
518
+ model.load_state_dict(base_dict)
519
+ model.eval()
520
+
521
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
522
+
523
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
524
+
525
+ if opt["pptype"] == "nms":
526
+ result_dict = eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
527
+ if opt["pptype"] == "net":
528
+ result_dict = eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
529
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
530
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
531
+ json.dump(output_dict, outfile, indent=2)
532
+ outfile.close()
533
+
534
+ mAP = evaluation_detection(opt)
535
+
536
+ # Compare predicted and ground truth action lengths
537
+ if video_name:
538
+ print("\nComparing Predicted and Ground Truth Action Lengths for Video:", video_name)
539
+ # Load ground truth annotations
540
+ with open(opt["video_anno"].format(opt["split"]), 'r') as f:
541
+ anno_data = json.load(f)
542
+ gt_annotations = anno_data['database'][video_name]['annotations']
543
+ duration = anno_data['database'][video_name]['duration']
544
+
545
+ # Extract ground truth segments
546
+ gt_segments = []
547
+ for anno in gt_annotations:
548
+ start, end = anno['segment']
549
+ label = anno['label']
550
+ duration_seg = end - start
551
+ gt_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration_seg})
552
+
553
+ # Extract predicted segments
554
+ pred_segments = []
555
+ for pred in result_dict[video_name]:
556
+ start, end = pred['segment']
557
+ label = pred['label']
558
+ score = pred['score']
559
+ duration_seg = end - start
560
+ pred_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration_seg, 'score': score})
561
+
562
+ # Print comparison table
563
+ matches = []
564
+ iou_threshold = VIS_CONFIG['iou_threshold']
565
+ used_gt_indices = set()
566
+ for pred in pred_segments:
567
+ best_iou = 0
568
+ best_gt_idx = None
569
+ for gt_idx, gt in enumerate(gt_segments):
570
+ if gt_idx in used_gt_indices:
571
+ continue
572
+ iou = calc_iou([pred['end'], pred['duration']], [gt['end'], gt['duration']])
573
+ if iou > best_iou and iou >= iou_threshold:
574
+ best_iou = iou
575
+ best_gt_idx = gt_idx
576
+ if best_gt_idx is not None:
577
+ matches.append({
578
+ 'pred': pred,
579
+ 'gt': gt_segments[best_gt_idx],
580
+ 'iou': best_iou
581
+ })
582
+ used_gt_indices.add(best_gt_idx)
583
+ else:
584
+ matches.append({'pred': pred, 'gt': None, 'iou': 0})
585
+
586
+ for gt_idx, gt in enumerate(gt_segments):
587
+ if gt_idx not in used_gt_indices:
588
+ matches.append({'pred': None, 'gt': gt, 'iou': 0})
589
+
590
+ print("\n{:<20} {:<30} {:<30} {:<15} {:<10}".format(
591
+ "Action Label", "Predicted Segment (s)", "Ground Truth Segment (s)", "Duration Diff (s)", "IoU"))
592
+ print("-" * 105)
593
+ for match in matches:
594
+ pred = match['pred']
595
+ gt = match['gt']
596
+ iou = match['iou']
597
+ if pred and gt:
598
+ label = pred['label'] if pred['label'] == gt['label'] else f"{pred['label']} (GT: {gt['label']})"
599
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
600
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
601
+ duration_diff = pred['duration'] - gt['duration']
602
+ print("{:<20} {:<30} {:<30} {:<15.2f} {:<10.2f}".format(
603
+ label, pred_str, gt_str, duration_diff, iou))
604
+ elif pred:
605
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
606
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
607
+ pred['label'], pred_str, "None", "N/A", iou))
608
+ elif gt:
609
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
610
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
611
+ gt['label'], "None", gt_str, "N/A", iou))
612
+
613
+ # Summarize
614
+ matched_count = sum(1 for m in matches if m['pred'] and m['gt'])
615
+ avg_duration_diff = np.mean([m['pred']['duration'] - m['gt']['duration'] for m in matches if m['pred'] and m['gt']]) if matched_count > 0 else 0
616
+ avg_iou = np.mean([m['iou'] for m in matches if m['iou'] > 0]) if any(m['iou'] > 0 for m in matches) else 0
617
+ print(f"\nSummary:")
618
+ print(f"- Total Predictions: {len(pred_segments)}")
619
+ print(f"- Total Ground Truth: {len(gt_segments)}")
620
+ print(f"- Matched Segments: {matched_count}")
621
+ print(f"- Average Duration Difference (Matched): {avg_duration_diff:.2f}s")
622
+ print(f"- Average IoU (Matched): {avg_iou:.2f}")
623
+
624
+ # Generate visualization
625
+ video_path = opt.get('video_path', '')  # --video_path is defined in opts_egtea.py
626
+ if os.path.exists(video_path):
627
+ visualize_action_lengths(
628
+ video_id=video_name,
629
+ pred_segments=pred_segments,
630
+ gt_segments=gt_segments,
631
+ video_path=video_path,
632
+ duration=duration
633
+ )
634
+ else:
635
+ print(f"Warning: Video path {video_path} not found. Skipping visualization.")
636
+
637
+ return mAP
638
+
639
+ def test_online(opt, video_name=None):
640
+ model = MYNET(opt).cuda()
641
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
642
+ base_dict = checkpoint['state_dict']
643
+ model.load_state_dict(base_dict)
644
+ model.eval()
645
+
646
+ sup_model = SuppressNet(opt).cuda()
647
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
648
+ base_dict = checkpoint['state_dict']
649
+ sup_model.load_state_dict(base_dict)
650
+ sup_model.eval()
651
+
652
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
653
+ test_loader = torch.utils.data.DataLoader(dataset,
654
+ batch_size=1, shuffle=False,
655
+ num_workers=0, pin_memory=True, drop_last=False)
656
+
657
+ result_dict = {}
658
+ proposal_dict = []
659
+
660
+ num_class = opt["num_of_class"]
661
+ unit_size = opt['segment_size']
662
+ threshold = opt['threshold']
663
+ anchors = opt['anchors']
664
+
665
+ start_time = time.time()
666
+ total_frames = 0
667
+
668
+ for video_name in dataset.video_list:
669
+ input_queue = torch.zeros((unit_size, opt['feat_dim']))
670
+ sup_queue = torch.zeros(((unit_size, num_class - 1)))
671
+
672
+ duration = dataset.video_len[video_name]
673
+ video_time = float(dataset.video_dict[video_name]["duration"])
674
+ frame_to_time = 100.0 * video_time / duration
675
+
676
+ for idx in range(0, duration):
677
+ total_frames += 1
678
+ input_queue[:-1, :] = input_queue[1:, :].clone()
679
+ input_queue[-1:, :] = dataset._get_base_data(video_name, idx, idx + 1)
680
+
681
+ minput = input_queue.unsqueeze(0)
682
+ act_cls, act_reg, _ = model(minput.cuda())
683
+ act_cls = torch.softmax(act_cls, dim=-1)
684
+
685
+ cls_anc = act_cls.squeeze(0).detach().cpu().numpy()
686
+ reg_anc = act_reg.squeeze(0).detach().cpu().numpy()
687
+
688
+ proposal_anc_dict = []
689
+ for anc_idx in range(0, len(anchors)):
690
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
691
+
692
+ if len(cls) == 0:
693
+ continue
694
+
695
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
696
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
697
+ st = ed - length
698
+
699
+ for cidx in range(0, len(cls)):
700
+ label = cls[cidx]
701
+ tmp_dict = {}
702
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
703
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
704
+ tmp_dict["label"] = dataset.label_name[label]
705
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
706
+ proposal_anc_dict.append(tmp_dict)
707
+
708
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
709
+
710
+ sup_queue[:-1, :] = sup_queue[1:, :].clone()
711
+ sup_queue[-1, :] = 0
712
+ for proposal in proposal_anc_dict:
713
+ cls_idx = dataset.label_name.index(proposal['label'])
714
+ sup_queue[-1, cls_idx] = proposal["score"]
715
+
716
+ minput = sup_queue.unsqueeze(0)
717
+ suppress_conf = sup_model(minput.cuda())
718
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
719
+
720
+ for cls in range(0, num_class - 1):
721
+ if suppress_conf[cls] > opt['sup_threshold']:
722
+ for proposal in proposal_anc_dict:
723
+ if proposal['label'] == dataset.label_name[cls]:
724
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
725
+ proposal_dict.append(proposal)
726
+
727
+ result_dict[video_name] = proposal_dict
728
+ proposal_dict = []
729
+
730
+ end_time = time.time()
731
+ working_time = end_time - start_time
732
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
733
+
734
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
735
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
736
+ json.dump(output_dict, outfile, indent=2)
737
+ outfile.close()
738
+
739
+ mAP = evaluation_detection(opt)
740
+ return mAP
741
+
742
+ def main(opt, video_name=None):
743
+ max_perf = 0
744
+ if not video_name and 'video_name' in opt:
745
+ video_name = opt['video_name']
746
+
747
+ if opt['mode'] == 'train':
748
+ max_perf = train(opt)
749
+ if opt['mode'] == 'test':
750
+ max_perf = test(opt, video_name=video_name)
751
+ if opt['mode'] == 'test_frame':
752
+ max_perf = test_frame(opt, video_name=video_name)
753
+ if opt['mode'] == 'test_online':
754
+ max_perf = test_online(opt, video_name=video_name)
755
+ if opt['mode'] == 'eval':
756
+ max_perf = evaluation_detection(opt)
757
+
758
+ return max_perf
759
+
760
+ if __name__ == '__main__':
761
+ opt = opts.parse_opt()
762
+ opt = vars(opt)
763
+ if not os.path.exists(opt["checkpoint_path"]):
764
+ os.makedirs(opt["checkpoint_path"])
765
+ opt_file = open(opt["checkpoint_path"] + "/" + opt["exp"] + "_opts.json", "w")
766
+ json.dump(opt, opt_file)
767
+ opt_file.close()
768
+
769
+ if opt['seed'] >= 0:
770
+ seed = opt['seed']
771
+ torch.manual_seed(seed)
772
+ np.random.seed(seed)
773
+
774
+ opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]
775
+
776
+ video_name = opt.get('video_name', None)
777
+ main(opt, video_name=video_name)
778
+ while(opt['wterm']):
779
+ pass
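# Hedged usage sketch (assumption, not part of the uploaded files): running
# single-video evaluation with visualization, using flags defined in opts_egtea.py.
# The video id and path below are hypothetical placeholders.
#
#   python main.py --mode test --video_name <video_id> \
#       --video_path ./data/videos/<video_id>.mp4 --exp 01 --split 1
#
# --video_name restricts VideoDataSet to one clip; --video_path enables the
# frame-strip visualization saved under ./output/visualizations.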
result image
opts_egtea.py ADDED
@@ -0,0 +1,62 @@
1
+ import argparse
2
+
3
+ def parse_opt():
4
+ parser = argparse.ArgumentParser()
5
+ # Overall settings
6
+ parser.add_argument('--mode', type=str, default='train')
7
+ parser.add_argument('--video_name', type=str, default=None, help='Name of the single video to evaluate')
8
+ parser.add_argument('--video_path', type=str, default='', help='Path to the input video file for visualization')
9
+ parser.add_argument('--checkpoint_path', type=str, default='./checkpoint')
10
+ parser.add_argument('--segment_size', type=int, default=64)
11
+ parser.add_argument('--anchors', type=str, default='2,4,6,8,12,16')
12
+ parser.add_argument('--seed', default=7, type=int, help='random seed for reproducibility')
13
+
14
+ # Overall Dataset settings
15
+ parser.add_argument('--num_of_class', type=int, default=23)
16
+ parser.add_argument('--data_format', type=str, default="npz_i3d")
17
+ parser.add_argument('--data_rescale', default=False, action='store_true')
18
+ parser.add_argument('--predefined_fps', default=None, type=float)
19
+ parser.add_argument('--rgb_only', default=False, action='store_true')
20
+ parser.add_argument('--video_anno', type=str, default="./data/egtea_annotations_split{}.json")
21
+ parser.add_argument('--video_feature_all_train', type=str, default="./data/I3D/")
22
+ parser.add_argument('--video_feature_all_test', type=str, default="./data/I3D/")
23
+ parser.add_argument('--setup', type=str, default="")
24
+ parser.add_argument('--exp', type=str, default="01")
25
+ parser.add_argument('--split', type=str, default="1")
26
+
27
+ # Network
28
+ parser.add_argument('--feat_dim', type=int, default=2048)
29
+ parser.add_argument('--hidden_dim', type=int, default=1024)
30
+ parser.add_argument('--out_dim', type=int, default=23)
31
+ parser.add_argument('--enc_layer', type=int, default=3)
32
+ parser.add_argument('--enc_head', type=int, default=8)
33
+ parser.add_argument('--dec_layer', type=int, default=5)
34
+ parser.add_argument('--dec_head', type=int, default=4)
35
+
36
+ # Training settings
37
+ parser.add_argument('--batch_size', type=int, default=128)
38
+ parser.add_argument('--lr', type=float, default=1e-4)
39
+ parser.add_argument('--weight_decay', type=float, default=1e-4)
40
+ parser.add_argument('--epoch', type=int, default=5)
41
+ parser.add_argument('--lr_step', type=int, default=3)
42
+
43
+ # Post processing
44
+ parser.add_argument('--alpha', type=float, default=1)
45
+ parser.add_argument('--beta', type=float, default=1)
46
+ parser.add_argument('--gamma', type=float, default=0.2)
47
+ parser.add_argument('--pptype', type=str, default="net")
48
+ parser.add_argument('--pos_threshold', type=float, default=0.5)
49
+ parser.add_argument('--sup_threshold', type=float, default=0.1)
50
+ parser.add_argument('--threshold', type=float, default=0.1)
51
+ parser.add_argument('--inference_subset', type=str, default="test")
52
+ parser.add_argument('--soft_nms', type=float, default=0.3)
53
+ parser.add_argument('--video_len_file', type=str, default="./output/video_len_{}.json")
54
+ parser.add_argument('--proposal_label_file', type=str, default="./output/proposal_label_{}.h5")
55
+ parser.add_argument('--suppress_label_file', type=str, default="./output/suppress_label_{}.h5")
56
+ parser.add_argument('--suppress_result_file', type=str, default="./output/suppress_result{}.h5")
57
+ parser.add_argument('--frame_result_file', type=str, default="./output/frame_result{}.h5")
58
+ parser.add_argument('--result_file', type=str, default="./output/result_proposal{}.json")
59
+ parser.add_argument('--wterm', type=bool, default=False)
60
+
61
+ args = parser.parse_args()
62
+ return args
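# Hedged sketch (assumption, not part of the uploaded files): building the options
# dict programmatically instead of from the command line, mirroring what main.py's
# __main__ block does. The video name below is a hypothetical placeholder.
import sys
sys.argv = ['main.py', '--mode', 'test', '--video_name', '<video_id>', '--split', '2']
opt = vars(parse_opt())
opt['anchors'] = [int(a) for a in opt['anchors'].split(',')]  # e.g. [2, 4, 6, 8, 12, 16]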
rgb bar main.py ADDED
@@ -0,0 +1,1144 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.parallel
6
+ import torch.nn.functional as F
7
+ import torch.optim as optim
8
+ import numpy as np
9
+ import opts_egtea as opts
10
+
11
+ import time
12
+ import h5py
13
+ from tqdm import tqdm
14
+ from iou_utils import *
15
+ from eval import evaluation_detection
16
+ from tensorboardX import SummaryWriter
17
+ from dataset import VideoDataSet, calc_iou
18
+ from models import MYNET, SuppressNet
19
+ from loss_func import cls_loss_func, cls_loss_func_, regress_loss_func
20
+ from loss_func import MultiCrossEntropyLoss
21
+ from functools import *
22
+
23
+ import matplotlib.pyplot as plt
24
+ import matplotlib.patches as patches
25
+ import cv2
26
+ from typing import List, Dict, Optional
27
+
28
+ from PIL import Image, ImageDraw, ImageFont
29
+ import warnings
30
+
31
+ # Visualization Configuration (Updated)
32
+ VIS_CONFIG = {
33
+ 'frame_interval': 1.0,
34
+ 'max_frames': 20,
35
+ 'save_dir': './output/visualizations',
36
+ 'video_save_dir': './output/videos',
37
+ 'gt_color': '#1f77b4', # Blue for ground truth (RGB: 31, 119, 180)
38
+ 'pred_color': '#ff7f0e', # Orange for predictions (RGB: 255, 127, 14)
39
+ 'fontsize_label': 10,
40
+ 'fontsize_title': 14,
41
+ 'frame_highlight_both': 'green',
42
+ 'frame_highlight_gt': 'red',
43
+ 'frame_highlight_pred': 'black',
44
+ 'iou_threshold': 0.3,
45
+ 'frame_scale_factor': 0.8,
46
+ 'video_text_scale': 0.5,
47
+ 'video_gt_text_color': (180, 119, 31), # BGR for OpenCV
48
+ 'video_pred_text_color': (14, 127, 255), # BGR for OpenCV
49
+ 'video_text_thickness': 1,
50
+ 'video_font_path': "./data/Poppins ExtraBold Italic 800.ttf",
51
+ 'video_font_fallback': '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
52
+ 'video_pred_text_y': 0.45,
53
+ 'video_gt_text_y': 0.55,
54
+ 'video_footer_height': 150, # Increased to accommodate labels
55
+ 'video_gt_bar_y': 0.5,
56
+ 'video_pred_bar_y': 0.8,
57
+ 'video_bar_height': 0.15,
58
+ 'video_bar_text_scale': 0.7,
59
+ 'min_segment_duration': 1.0,
60
+ 'video_frame_text_y': 0.05, # Position for frame number and FPS
61
+ 'video_bar_label_x': 10, # X-position for GT/Pred labels
62
+ 'video_bar_label_scale': 0.5,
63
+ 'scroll_window_duration': 30.0, # Duration of the visible time window (seconds)
64
+ 'scroll_speed': 0.5, # Seconds to advance the window per second of video
65
+ }
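# Hedged helper (assumption, not part of the uploaded files): VIS_CONFIG stores the
# same colours twice, as matplotlib hex strings and as OpenCV BGR tuples. A small
# converter keeps the two in sync, e.g. hex_to_bgr('#1f77b4') == (180, 119, 31).
def hex_to_bgr(hex_color: str) -> tuple:
    hex_color = hex_color.lstrip('#')
    r, g, b = (int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
    return (b, g, r)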
66
+
67
+
68
+ def annotate_video_with_actions(
69
+ video_id: str,
70
+ pred_segments: List[Dict],
71
+ gt_segments: List[Dict],
72
+ video_path: str,
73
+ save_dir: str = VIS_CONFIG['video_save_dir'],
74
+ text_scale: float = VIS_CONFIG['video_text_scale'] * 1.5, # Increased text size by 50%
75
+ gt_text_color: tuple = VIS_CONFIG['video_gt_text_color'],
76
+ pred_text_color: tuple = VIS_CONFIG['video_pred_text_color'],
77
+ text_thickness: int = VIS_CONFIG['video_text_thickness']
78
+ ) -> None:
79
+ """
80
+ Annotate a video with predicted and ground truth action labels, cumulative bars, frame number, and FPS.
81
+ Use fixed 20-second windows with original bar animation, resetting bars at each window boundary.
82
+ Different colors for different action classes, no labels or timestamps on bars, increased text size.
83
+ GT and Pred text labels are on the left, with bars starting 0.5 inches (48 pixels) to the right.
84
+
85
+ Args:
86
+ video_id: Video identifier (e.g., 'my_video').
87
+ pred_segments: List of predicted segments with 'label', 'start', 'end', 'duration', 'score'.
88
+ gt_segments: List of ground truth segments with 'label', 'start', 'end', 'duration'.
89
+ video_path: Path to the input video file.
90
+ save_dir: Directory to save the annotated video.
91
+ text_scale: Scale factor for text size in video (increased).
92
+ gt_text_color: BGR color tuple for ground truth text.
93
+ pred_text_color: BGR color tuple for predicted text.
94
+ text_thickness: Thickness of text strokes.
95
+ """
96
+ os.makedirs(save_dir, exist_ok=True)
97
+
98
+ # Open input video
99
+ cap = cv2.VideoCapture(video_path)
100
+ if not cap.isOpened():
101
+ print(f"Error: Could not open video {video_path}. Skipping video annotation.")
102
+ return
103
+
104
+ # Get video properties
105
+ fps = cap.get(cv2.CAP_PROP_FPS)
106
+ frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
107
+ frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
108
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
109
+ duration = total_frames / fps
110
+ print(f"Input Video: FPS={fps:.2f}, Resolution={frame_width}x{frame_height}, Total Frames={total_frames}, Duration={duration:.2f}s")
111
+
112
+ # Define output video with extended height for footer
113
+ footer_height = VIS_CONFIG['video_footer_height']
114
+ output_height = frame_height + footer_height
115
+ output_path = os.path.join(save_dir, f"annotated_{video_id}_{opt['exp']}.avi")
116
+ fourcc = cv2.VideoWriter_fourcc(*'XVID')
117
+ out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, output_height))
118
+
119
+ if not out.isOpened():
120
+ print(f"Error: Could not initialize video writer for {output_path}. Check codec availability.")
121
+ cap.release()
122
+ return
123
+
124
+ # Filter short segments
125
+ min_duration = VIS_CONFIG['min_segment_duration']
126
+ gt_segments = [seg for seg in gt_segments if seg['duration'] >= min_duration]
127
+ pred_segments = [seg for seg in pred_segments if seg['duration'] >= min_duration]
128
+ print(f"Filtered Segments: GT={len(gt_segments)}, Pred={len(pred_segments)} (min_duration={min_duration}s)")
129
+
130
+ # Define color palette (BGR)
131
+ color_palette = [
132
+ (128, 0, 0), # Navy Blue
133
+ (60, 20, 220), # Crimson Red
134
+ (0, 128, 0), # Emerald Green
135
+ (128, 0, 128), # Royal Purple
136
+ (79, 69, 54), # Charcoal Gray
137
+ (128, 128, 0), # Teal
138
+ (0, 0, 128), # Maroon
139
+ (130, 0, 75), # Indigo
140
+ (34, 139, 34), # Forest Green
141
+ (0, 85, 204), # Burnt Orange
142
+ (149, 146, 209), # Dusty Rose
143
+ (235, 206, 135), # Sky Blue
144
+ (250, 230, 230), # Lavender
145
+ (191, 226, 159), # Seafoam Green
146
+ (185, 218, 255), # Peach
147
+ (255, 204, 204), # Periwinkle
148
+ (193, 182, 255), # Blush Pink
149
+ (201, 252, 189), # Mint Green
150
+ (144, 128, 112), # Slate Gray
151
+ (112, 25, 25), # Midnight Blue
152
+ (102, 51, 102), # Deep Plum
153
+ (0, 128, 128), # Olive Green
154
+ (171, 71, 0) # Cobalt Blue
155
+ ]
156
+
157
+ # Create color mapping for actions
158
+ action_labels = set(seg['label'] for seg in gt_segments).union(seg['label'] for seg in pred_segments)
159
+ action_color_map = {label: color_palette[i % len(color_palette)] for i, label in enumerate(action_labels)}
160
+ print(f"Action Color Mapping: {action_color_map}")
161
+
162
+ # Convert fallback colors to RGB for PIL
163
+ gt_color_rgb = (gt_text_color[2], gt_text_color[1], gt_text_color[0]) # BGR to RGB
164
+ pred_color_rgb = (pred_text_color[2], pred_text_color[1], pred_text_color[0]) # BGR to RGB
165
+
166
+ # Load font
167
+ font_path = VIS_CONFIG['video_font_path']
168
+ font_fallback = VIS_CONFIG['video_font_fallback']
169
+ font_size = int(20 * text_scale)
170
+ bar_font_size = int(20 * VIS_CONFIG['video_bar_text_scale'])
171
+ font = None
172
+ bar_font = None
173
+ if font_path:
174
+ try:
175
+ font = ImageFont.truetype(font_path, font_size)
176
+ bar_font = ImageFont.truetype(font_path, bar_font_size)
177
+ print(f"Using font: {font_path}")
178
+ except IOError:
179
+ print(f"Warning: Font {font_path} not found. Trying fallback font.")
180
+ if not font:
181
+ try:
182
+ font = ImageFont.truetype(font_fallback, font_size)
183
+ bar_font = ImageFont.truetype(font_fallback, bar_font_size)
184
+ print(f"Using fallback font: {font_fallback}")
185
+ except IOError:
186
+ print(f"Warning: Fallback font {font_fallback} not found. Using OpenCV default font.")
187
+ font = None
188
+ bar_font = None
189
+
190
+ # Fixed window configuration
191
+ window_size = 20.0 # 20-second windows
192
+ num_windows = int(np.ceil(duration / window_size))
193
+
194
+ # Define horizontal gap (0.5 inch = 48 pixels at 96 DPI)
195
+ text_bar_gap = 48 # Pixels
196
+ text_x = 10 # Fixed x-position for GT and Pred labels
197
+
198
+ frame_idx = 0
199
+ written_frames = 0
200
+ while cap.isOpened():
201
+ ret, frame = cap.read()
202
+ if not ret:
203
+ break
204
+
205
+ # Create extended frame with footer
206
+ extended_frame = np.zeros((output_height, frame_width, 3), dtype=np.uint8)
207
+ extended_frame[:frame_height, :, :] = frame
208
+ extended_frame[frame_height:, :, :] = 255 # White footer
209
+
210
+ # Calculate current timestamp
211
+ timestamp = frame_idx / fps
212
+
213
+ # Determine current window
214
+ window_idx = int(timestamp // window_size)
215
+ window_start = window_idx * window_size
216
+ window_end = min(window_start + window_size, duration)
217
+ window_duration = window_end - window_start
218
+ window_timestamp = timestamp - window_start # Relative timestamp within window
219
+
220
+ # Find active GT actions (for text overlay)
221
+ gt_labels = [seg['label'] for seg in gt_segments if seg['start'] <= timestamp <= seg['end']]
222
+ gt_text = "GT: " + ", ".join(gt_labels) if gt_labels else ""
223
+
224
+ # Find active predicted actions (for text overlay)
225
+ pred_labels = [seg['label'] for seg in pred_segments if seg['start'] <= timestamp <= seg['end']]
226
+ pred_text = "Pred: " + ", ".join(pred_labels) if pred_labels else ""
227
+
228
+ # Draw GT and prediction bars in footer (within current window, using original animation)
229
+ footer_y = frame_height
230
+ gt_bar_y = footer_y + int(0.2 * footer_height) # GT bar position
231
+ pred_bar_y = footer_y + int(0.5 * footer_height) # Pred bar position
232
+ bar_height = int(VIS_CONFIG['video_bar_height'] * footer_height)
233
+
234
+ # Calculate text width for GT and Pred labels to determine bar start
235
+ if font:
236
+ gt_text_bbox = bar_font.getbbox("GT")
237
+ pred_text_bbox = bar_font.getbbox("Pred")
238
+ gt_text_width = gt_text_bbox[2] - gt_text_bbox[0]
239
+ pred_text_width = pred_text_bbox[2] - pred_text_bbox[0]
240
+ else:
241
+ gt_text_size, _ = cv2.getTextSize("GT", cv2.FONT_HERSHEY_DUPLEX, VIS_CONFIG['video_bar_text_scale'], 1)
242
+ pred_text_size, _ = cv2.getTextSize("Pred", cv2.FONT_HERSHEY_DUPLEX, VIS_CONFIG['video_bar_text_scale'], 1)
243
+ gt_text_width = gt_text_size[0]
244
+ pred_text_width = pred_text_size[0]
245
+ max_text_width = max(gt_text_width, pred_text_width)
246
+ bar_start_x = text_x + max_text_width + text_bar_gap # Bars start after text + 0.5-inch gap
247
+ bar_width = frame_width - bar_start_x # Adjust bar width to fit remaining space
248
+
249
+ # Draw bars with action-specific colors
250
+ for seg in gt_segments:
251
+ if seg['start'] <= window_end and seg['end'] >= window_start:
252
+ start_t = max(seg['start'], window_start)
253
+ end_t = min(seg['end'], window_start + window_timestamp) # Original animation
254
+ start_x = bar_start_x + int(((start_t - window_start) / window_duration) * bar_width)
255
+ end_x = bar_start_x + int(((end_t - window_start) / window_duration) * bar_width)
256
+ if end_x > start_x:
257
+ cv2.rectangle(
258
+ extended_frame,
259
+ (start_x, gt_bar_y),
260
+ (end_x, gt_bar_y + bar_height),
261
+ action_color_map[seg['label']], # Action-specific color
262
+ -1
263
+ )
264
+
265
+ for seg in pred_segments:
266
+ if seg['start'] <= window_end and seg['end'] >= window_start:
267
+ start_t = max(seg['start'], window_start)
268
+ end_t = min(seg['end'], window_start + window_timestamp) # Original animation
269
+ start_x = bar_start_x + int(((start_t - window_start) / window_duration) * bar_width)
270
+ end_x = bar_start_x + int(((end_t - window_start) / window_duration) * bar_width)
271
+ if end_x > start_x:
272
+ cv2.rectangle(
273
+ extended_frame,
274
+ (start_x, pred_bar_y),
275
+ (end_x, pred_bar_y + bar_height),
276
+ action_color_map[seg['label']], # Action-specific color
277
+ -1
278
+ )
279
+
280
+ if font:
281
+ # Convert frame to PIL image
282
+ frame_rgb = cv2.cvtColor(extended_frame, cv2.COLOR_BGR2RGB)
283
+ pil_image = Image.fromarray(frame_rgb)
284
+ draw = ImageDraw.Draw(pil_image)
285
+
286
+ # Draw frame number and FPS at top center
287
+ frame_info = f"Frame: {frame_idx} | FPS: {fps:.2f}"
288
+ frame_text_bbox = draw.textbbox((0, 0), frame_info, font=font)
289
+ frame_text_width = frame_text_bbox[2] - frame_text_bbox[0]
290
+ frame_text_x = (frame_width - frame_text_width) // 2
291
+ draw.text((frame_text_x, 10), frame_info, font=font, fill=(0, 0, 0))
292
+
293
+ # Draw window timestamp range at top of footer
294
+ window_info = f"{window_start:.1f}s - {window_end:.1f}s"
295
+ window_text_bbox = draw.textbbox((0, 0), window_info, font=bar_font)
296
+ window_text_width = window_text_bbox[2] - window_text_bbox[0]
297
+ window_text_x = (frame_width - window_text_width) // 2
298
+ draw.text((window_text_x, footer_y + 10), window_info, font=bar_font, fill=(0, 0, 0))
299
+
300
+ # Draw GT text in video only if there are actions
301
+ if gt_text:
302
+ gt_y = int(frame_height * VIS_CONFIG['video_gt_text_y'])
303
+ draw.text((10, gt_y), gt_text, font=font, fill=gt_color_rgb)
304
+
305
+ # Draw predicted text in video only if there are actions
306
+ if pred_text:
307
+ pred_y = int(frame_height * VIS_CONFIG['video_pred_text_y'])
308
+ draw.text((10, pred_y), pred_text, font=font, fill=pred_color_rgb)
309
+
310
+ # Draw GT and Pred labels in footer
311
+ draw.text((text_x, gt_bar_y + bar_height // 2), "GT", font=bar_font, fill=gt_color_rgb)
312
+ draw.text((text_x, pred_bar_y + bar_height // 2), "Pred", font=bar_font, fill=pred_color_rgb)
313
+
314
+ # Convert back to OpenCV frame
315
+ extended_frame = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
316
+ else:
317
+ # Fallback to OpenCV font
318
+ frame_info = f"Frame: {frame_idx} | FPS: {fps:.2f}"
319
+ text_size, _ = cv2.getTextSize(frame_info, cv2.FONT_HERSHEY_DUPLEX, text_scale, text_thickness)
320
+ frame_text_x = (frame_width - text_size[0]) // 2
321
+ cv2.putText(
322
+ extended_frame,
323
+ frame_info,
324
+ (frame_text_x, 30),
325
+ cv2.FONT_HERSHEY_DUPLEX,
326
+ text_scale,
327
+ (0, 0, 0),
328
+ text_thickness,
329
+ cv2.LINE_AA
330
+ )
331
+ window_info = f"{window_start:.1f}s - {window_end:.1f}s"
332
+ window_text_size, _ = cv2.getTextSize(window_info, cv2.FONT_HERSHEY_DUPLEX, VIS_CONFIG['video_bar_text_scale'], 1)
333
+ window_text_x = (frame_width - window_text_size[0]) // 2
334
+ cv2.putText(
335
+ extended_frame,
336
+ window_info,
337
+ (window_text_x, footer_y + 20),
338
+ cv2.FONT_HERSHEY_DUPLEX,
339
+ VIS_CONFIG['video_bar_text_scale'],
340
+ (0, 0, 0),
341
+ 1,
342
+ cv2.LINE_AA
343
+ )
344
+ if gt_text:
345
+ cv2.putText(
346
+ extended_frame,
347
+ gt_text,
348
+ (10, int(frame_height * VIS_CONFIG['video_gt_text_y'])),
349
+ cv2.FONT_HERSHEY_DUPLEX,
350
+ text_scale,
351
+ gt_text_color,
352
+ text_thickness,
353
+ cv2.LINE_AA
354
+ )
355
+ if pred_text:
356
+ cv2.putText(
357
+ extended_frame,
358
+ pred_text,
359
+ (10, int(frame_height * VIS_CONFIG['video_pred_text_y'])),
360
+ cv2.FONT_HERSHEY_DUPLEX,
361
+ text_scale,
362
+ pred_text_color,
363
+ text_thickness,
364
+ cv2.LINE_AA
365
+ )
366
+ cv2.putText(
367
+ extended_frame,
368
+ "GT",
369
+ (text_x, gt_bar_y + bar_height // 2 + 5),
370
+ cv2.FONT_HERSHEY_DUPLEX,
371
+ VIS_CONFIG['video_bar_text_scale'],
372
+ gt_text_color,
373
+ 1,
374
+ cv2.LINE_AA
375
+ )
376
+ cv2.putText(
377
+ extended_frame,
378
+ "Pred",
379
+ (text_x, pred_bar_y + bar_height // 2 + 5),
380
+ cv2.FONT_HERSHEY_DUPLEX,
381
+ VIS_CONFIG['video_bar_text_scale'],
382
+ pred_text_color,
383
+ 1,
384
+ cv2.LINE_AA
385
+ )
386
+
387
+ # Write frame to output video
388
+ out.write(extended_frame)
389
+ written_frames += 1
390
+ frame_idx += 1
391
+
392
+ # Release resources
393
+ cap.release()
394
+ out.release()
395
+ print(f"[✅ Saved Annotated Video]: {output_path}, Written Frames={written_frames}")
396
+ print("Note: If .avi is not playable, convert to .mp4 using FFmpeg:")
397
+ print(f"ffmpeg -i {output_path} -vcodec libx264 -acodec aac {output_path.replace('.avi', '.mp4')}")
398
+
399
+
400
+
401
+
402
+
403
+
404
+
405
+
406
+ def visualize_action_lengths(
407
+ video_id: str,
408
+ pred_segments: List[Dict],
409
+ gt_segments: List[Dict],
410
+ video_path: str,
411
+ duration: float,
412
+ save_dir: str = VIS_CONFIG['save_dir'],
413
+ frame_interval: float = VIS_CONFIG['frame_interval']
414
+ ) -> None:
415
+ """
416
+ Generate a visualization plot comparing ground truth and predicted action lengths with video frames.
417
+
418
+ Args:
419
+ video_id: Video identifier (e.g., 'my_video').
420
+ pred_segments: List of predicted segments with 'label', 'start', 'end', 'duration', 'score'.
421
+ gt_segments: List of ground truth segments with 'label', 'start', 'end', 'duration'.
422
+ video_path: Path to the input video file.
423
+ duration: Total duration of the video in seconds.
424
+ save_dir: Directory to save the output image.
425
+ frame_interval: Time interval between sampled frames (seconds).
426
+ """
427
+ os.makedirs(save_dir, exist_ok=True)
428
+
429
+ # Calculate frame sampling times
430
+ num_frames = int(duration / frame_interval) + 1
431
+ if num_frames > VIS_CONFIG['max_frames']:
432
+ frame_interval = duration / (VIS_CONFIG['max_frames'] - 1)
433
+ num_frames = VIS_CONFIG['max_frames']
434
+ print(f"Warning: Video duration ({duration:.1f}s) requires {num_frames} frames. Adjusted frame_interval to {frame_interval:.2f}s.")
435
+
436
+ frame_times = np.linspace(0, duration, num_frames, endpoint=False)
437
+
438
+ # Load video frames
439
+ frames = []
440
+ cap = cv2.VideoCapture(video_path)
441
+ if not cap.isOpened():
442
+ print(f"Warning: Could not open video {video_path}. Using placeholder frames.")
443
+ frames = [np.ones((100, 100, 3), dtype=np.uint8) * 255 for _ in frame_times]
444
+ else:
445
+ for t in frame_times:
446
+ cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000)
447
+ ret, frame = cap.read()
448
+ if ret:
449
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
450
+ # Resize frame to reduce memory usage
451
+ frame = cv2.resize(frame, (int(frame.shape[1] * 0.5), int(frame.shape[0] * 0.5)))
452
+ frames.append(frame)
453
+ else:
454
+ frames.append(np.ones((100, 100, 3), dtype=np.uint8) * 255)
455
+ cap.release()
456
+
457
+ # Initialize figure
458
+ fig = plt.figure(figsize=(num_frames * VIS_CONFIG['frame_scale_factor'], 6), constrained_layout=True)
459
+ gs = fig.add_gridspec(3, num_frames, height_ratios=[3, 1, 1])
460
+
461
+ # Plot frames
462
+ for i, (t, frame) in enumerate(zip(frame_times, frames)):
463
+ ax = fig.add_subplot(gs[0, i])
464
+
465
+ # Check if frame falls within GT or predicted segments
466
+ gt_hit = any(seg['start'] <= t <= seg['end'] for seg in gt_segments)
467
+ pred_hit = any(seg['start'] <= t <= seg['end'] for seg in pred_segments)
468
+
469
+ # Set border color
470
+ border_color = None
471
+ if gt_hit and pred_hit:
472
+ border_color = VIS_CONFIG['frame_highlight_both']
473
+ elif gt_hit:
474
+ border_color = VIS_CONFIG['frame_highlight_gt']
475
+ elif pred_hit:
476
+ border_color = VIS_CONFIG['frame_highlight_pred']
477
+
478
+ ax.imshow(frame)
479
+ ax.axis('off')
480
+ if border_color:
481
+ for spine in ax.spines.values():
482
+ spine.set_edgecolor(border_color)
483
+ spine.set_linewidth(2)
484
+
485
+ ax.set_title(f"{t:.1f}s", fontsize=VIS_CONFIG['fontsize_label'],
486
+ color=border_color if border_color else 'black')
487
+
488
+ # Plot ground truth bar
489
+ ax_gt = fig.add_subplot(gs[1, :])
490
+ ax_gt.set_xlim(0, duration)
491
+ ax_gt.set_ylim(0, 1)
492
+ ax_gt.axis('off')
493
+ ax_gt.text(-0.02 * duration, 0.5, "Ground Truth", fontsize=VIS_CONFIG['fontsize_title'],
494
+ va='center', ha='right', weight='bold')
495
+
496
+ for seg in gt_segments:
497
+ start, end = seg['start'], seg['end']
498
+ width = end - start
499
+ label = seg['label'][:10] + '...' if len(seg['label']) > 10 else seg['label']
500
+ ax_gt.add_patch(patches.Rectangle(
501
+ (start, 0.3), width, 0.4, facecolor=VIS_CONFIG['gt_color'],
502
+ edgecolor='black', alpha=0.8
503
+ ))
504
+ ax_gt.text((start + end) / 2, 0.5, label, ha='center', va='center',
505
+ fontsize=VIS_CONFIG['fontsize_label'], color='white')
506
+ ax_gt.text(start, 0.2, f"{start:.1f}", ha='center', fontsize=8, color='black')
507
+ ax_gt.text(end, 0.2, f"{end:.1f}", ha='center', fontsize=8, color='black')
508
+
509
+ # Plot prediction bar
510
+ ax_pred = fig.add_subplot(gs[2, :])
511
+ ax_pred.set_xlim(0, duration)
512
+ ax_pred.set_ylim(0, 1)
513
+ ax_pred.axis('off')
514
+ ax_pred.text(-0.02 * duration, 0.5, "Prediction", fontsize=VIS_CONFIG['fontsize_title'],
515
+ va='center', ha='right', weight='bold')
516
+
517
+ for seg in pred_segments:
518
+ start, end = seg['start'], seg['end']
519
+ width = end - start
520
+ label = seg['label'][:10] + '...' if len(seg['label']) > 10 else seg['label']
521
+ ax_pred.add_patch(patches.Rectangle(
522
+ (start, 0.3), width, 0.4, facecolor=VIS_CONFIG['pred_color'],
523
+ edgecolor='black', alpha=0.8
524
+ ))
525
+ ax_pred.text((start + end) / 2, 0.5, label, ha='center', va='center',
526
+ fontsize=VIS_CONFIG['fontsize_label'], color='white')
527
+ ax_pred.text(start, 0.8, f"{start:.1f}", ha='center', fontsize=8, color='black')
528
+ ax_pred.text(end, 0.8, f"{end:.1f}", ha='center', fontsize=8, color='black')
529
+
530
+ # Save plot
531
+ jpg_path = os.path.join(save_dir, f"viz_{video_id}_{opt['exp']}.png") # Use PNG
532
+ plt.savefig(jpg_path, dpi=100, bbox_inches='tight') # Lower DPI
533
+ print(f"[✅ Saved Visualization]: {jpg_path}")
534
+ plt.close()
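# Hedged usage sketch (assumption, not part of the uploaded files): calling the plot
# helper directly with toy segments. Labels, paths and times are illustrative only.
# Note that the save path uses the module-level opt['exp'], so the options must have
# been parsed (as in the __main__ block) before this runs.
visualize_action_lengths(
    video_id="demo_video",
    pred_segments=[{"label": "Take", "start": 1.0, "end": 3.5, "duration": 2.5, "score": 0.8}],
    gt_segments=[{"label": "Take", "start": 0.8, "end": 3.2, "duration": 2.4}],
    video_path="./data/videos/demo_video.mp4",
    duration=10.0,
)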
535
+
536
+
537
+
538
+ def train_one_epoch(opt, model, train_dataset, optimizer, warmup=False):
539
+ train_loader = torch.utils.data.DataLoader(train_dataset,
540
+ batch_size=opt['batch_size'], shuffle=True,
541
+ num_workers=0, pin_memory=True, drop_last=False)
542
+ epoch_cost = 0
543
+ epoch_cost_cls = 0
544
+ epoch_cost_reg = 0
545
+ epoch_cost_snip = 0
546
+
547
+ total_iter = len(train_dataset) // opt['batch_size']
548
+ cls_loss = MultiCrossEntropyLoss(focal=True)
549
+ snip_loss = MultiCrossEntropyLoss(focal=True)
550
+ for n_iter, (input_data, cls_label, reg_label, snip_label) in enumerate(tqdm(train_loader)):
551
+ if warmup:
552
+ for g in optimizer.param_groups:
553
+ g['lr'] = n_iter * (opt['lr']) / total_iter
554
+
555
+ act_cls, act_reg, snip_cls = model(input_data.float().cuda())
556
+
557
+ act_cls.register_hook(partial(cls_loss.collect_grad, cls_label))
558
+ snip_cls.register_hook(partial(snip_loss.collect_grad, snip_label))
559
+
560
+ cost_reg = 0
561
+ cost_cls = 0
562
+
563
+ loss = cls_loss_func_(cls_loss, cls_label, act_cls)
564
+ cost_cls = loss
565
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
566
+
567
+ loss = regress_loss_func(reg_label, act_reg)
568
+ cost_reg = loss
569
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
570
+
571
+ loss = cls_loss_func_(snip_loss, snip_label, snip_cls)
572
+ cost_snip = loss
573
+ epoch_cost_snip += cost_snip.detach().cpu().numpy()
574
+
575
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg + opt['gamma'] * cost_snip
576
+ epoch_cost += cost.detach().cpu().numpy()
577
+
578
+ optimizer.zero_grad()
579
+ cost.backward()
580
+ optimizer.step()
581
+
582
+ return n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip
583
+
584
+ def eval_one_epoch(opt, model, test_dataset):
585
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, test_dataset)
586
+
587
+ result_dict = eval_map_nms(opt, test_dataset, output_cls, output_reg, labels_cls, labels_reg)
588
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
589
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
590
+ json.dump(output_dict, outfile, indent=2)
591
+ outfile.close()
592
+
593
+ IoUmAP = evaluation_detection(opt, verbose=False)
594
+ IoUmAP_5 = sum(IoUmAP[0:]) / len(IoUmAP[0:])
595
+
596
+ return cls_loss, reg_loss, tot_loss, IoUmAP_5
597
+
598
+ def train(opt):
599
+ writer = SummaryWriter()
600
+ model = MYNET(opt).cuda()
601
+
602
+ rest_of_model_params = [param for name, param in model.named_parameters() if "history_unit" not in name]
603
+ optimizer = optim.Adam([{'params': model.history_unit.parameters(), 'lr': 1e-6}, {'params': rest_of_model_params}], lr=opt["lr"], weight_decay=opt["weight_decay"])
604
+ scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt["lr_step"])
605
+
606
+ train_dataset = VideoDataSet(opt, subset="train")
607
+ test_dataset = VideoDataSet(opt, subset=opt['inference_subset'])
608
+
609
+ warmup = False
610
+
611
+ for n_epoch in range(opt['epoch']):
612
+ if n_epoch >= 1:
613
+ warmup = False
614
+
615
+ n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip = train_one_epoch(opt, model, train_dataset, optimizer, warmup)
616
+
617
+ writer.add_scalars('data/cost', {'train': epoch_cost / (n_iter + 1)}, n_epoch)
618
+ print("training loss(epoch %d): %.03f, cls - %f, reg - %f, snip - %f, lr - %f" % (n_epoch,
619
+ epoch_cost / (n_iter + 1),
620
+ epoch_cost_cls / (n_iter + 1),
621
+ epoch_cost_reg / (n_iter + 1),
622
+ epoch_cost_snip / (n_iter + 1),
623
+ optimizer.param_groups[-1]["lr"]))
624
+
625
+ scheduler.step()
626
+ model.eval()
627
+
628
+ cls_loss, reg_loss, tot_loss, IoUmAP_5 = eval_one_epoch(opt, model, test_dataset)
629
+
630
+ writer.add_scalars('data/mAP', {'test': IoUmAP_5}, n_epoch)
631
+ print("testing loss(epoch %d): %.03f, cls - %f, reg - %f, mAP Avg - %f" % (n_epoch, tot_loss, cls_loss, reg_loss, IoUmAP_5))
632
+
633
+ state = {'epoch': n_epoch + 1, 'state_dict': model.state_dict()}
634
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_checkpoint_" + str(n_epoch + 1) + ".pth.tar")
635
+ if IoUmAP_5 > model.best_map:
636
+ model.best_map = IoUmAP_5
637
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_ckp_best.pth.tar")
638
+
639
+ model.train()
640
+
641
+ writer.close()
642
+ return model.best_map
643
+
644
+ def eval_frame(opt, model, dataset):
645
+ test_loader = torch.utils.data.DataLoader(dataset,
646
+ batch_size=opt['batch_size'], shuffle=False,
647
+ num_workers=0, pin_memory=True, drop_last=False)
648
+
649
+ labels_cls = {}
650
+ labels_reg = {}
651
+ output_cls = {}
652
+ output_reg = {}
653
+ for video_name in dataset.video_list:
654
+ labels_cls[video_name] = []
655
+ labels_reg[video_name] = []
656
+ output_cls[video_name] = []
657
+ output_reg[video_name] = []
658
+
659
+ start_time = time.time()
660
+ total_frames = 0
661
+ epoch_cost = 0
662
+ epoch_cost_cls = 0
663
+ epoch_cost_reg = 0
664
+
665
+ for n_iter, (input_data, cls_label, reg_label, _) in enumerate(tqdm(test_loader)):
666
+ act_cls, act_reg, _ = model(input_data.float().cuda())
667
+ cost_reg = 0
668
+ cost_cls = 0
669
+
670
+ loss = cls_loss_func(cls_label, act_cls)
671
+ cost_cls = loss
672
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
673
+
674
+ loss = regress_loss_func(reg_label, act_reg)
675
+ cost_reg = loss
676
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
677
+
678
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg
679
+ epoch_cost += cost.detach().cpu().numpy()
680
+
681
+ act_cls = torch.softmax(act_cls, dim=-1)
682
+
683
+ total_frames += input_data.size(0)
684
+
685
+ for b in range(0, input_data.size(0)):
686
+ video_name, st, ed, data_idx = dataset.inputs[n_iter * opt['batch_size'] + b]
687
+ output_cls[video_name] += [act_cls[b, :].detach().cpu().numpy()]
688
+ output_reg[video_name] += [act_reg[b, :].detach().cpu().numpy()]
689
+ labels_cls[video_name] += [cls_label[b, :].numpy()]
690
+ labels_reg[video_name] += [reg_label[b, :].numpy()]
691
+
692
+ end_time = time.time()
693
+ working_time = end_time - start_time
694
+
695
+ for video_name in dataset.video_list:
696
+ labels_cls[video_name] = np.stack(labels_cls[video_name], axis=0)
697
+ labels_reg[video_name] = np.stack(labels_reg[video_name], axis=0)
698
+ output_cls[video_name] = np.stack(output_cls[video_name], axis=0)
699
+ output_reg[video_name] = np.stack(output_reg[video_name], axis=0)
700
+
701
+ cls_loss = epoch_cost_cls / (n_iter + 1)  # n_iter is the last batch index, so average over n_iter + 1 batches
702
+ reg_loss = epoch_cost_reg / (n_iter + 1)
703
+ tot_loss = epoch_cost / (n_iter + 1)
704
+
705
+ return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames
706
+
707
+ def eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
708
+ result_dict = {}
709
+ proposal_dict = []
710
+
711
+ num_class = opt["num_of_class"]
712
+ unit_size = opt['segment_size']
713
+ threshold = opt['threshold']
714
+ anchors = opt['anchors']
715
+
716
+ for video_name in dataset.video_list:
717
+ duration = dataset.video_len[video_name]
718
+ video_time = float(dataset.video_dict[video_name]["duration"])
719
+ frame_to_time = 100.0 * video_time / duration
720
+
721
+ for idx in range(0, duration):
722
+ cls_anc = output_cls[video_name][idx]
723
+ reg_anc = output_reg[video_name][idx]
724
+
725
+ proposal_anc_dict = []
726
+ for anc_idx in range(0, len(anchors)):
727
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
728
+
729
+ if len(cls) == 0:
730
+ continue
731
+
732
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
733
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
734
+ st = ed - length
735
+
736
+ for cidx in range(0, len(cls)):
737
+ label = cls[cidx]
738
+ tmp_dict = {}
739
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
740
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
741
+ tmp_dict["label"] = dataset.label_name[label]
742
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
743
+ proposal_anc_dict.append(tmp_dict)
744
+
745
+ proposal_dict += proposal_anc_dict
746
+
747
+ proposal_dict = non_max_suppression(proposal_dict, overlapThresh=opt['soft_nms'])
748
+ result_dict[video_name] = proposal_dict
749
+ proposal_dict = []
750
+
751
+ return result_dict
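# Hedged helper (assumption, not part of the uploaded files): the anchor decoding used
# above (and again in eval_map_supnet / test_online), factored out for clarity. Each
# anchor of size `anchor` at frame idx predicts an end offset and a log length scale.
def decode_anchor(idx, anchor, reg):
    ed = idx + anchor * reg[0]           # predicted end, in frames
    length = anchor * np.exp(reg[1])     # predicted length, in frames
    st = ed - length                     # predicted start
    return st, ed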
752
+
753
+ def eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
754
+ model = SuppressNet(opt).cuda()
755
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
756
+ base_dict = checkpoint['state_dict']
757
+ model.load_state_dict(base_dict)
758
+ model.eval()
759
+
760
+ result_dict = {}
761
+ proposal_dict = []
762
+
763
+ num_class = opt["num_of_class"]
764
+ unit_size = opt['segment_size']
765
+ threshold = opt['threshold']
766
+ anchors = opt['anchors']
767
+
768
+ for video_name in dataset.video_list:
769
+ duration = dataset.video_len[video_name]
770
+ video_time = float(dataset.video_dict[video_name]["duration"])
771
+ frame_to_time = 100.0 * video_time / duration
772
+ conf_queue = torch.zeros((unit_size, num_class - 1))
773
+
774
+ for idx in range(0, duration):
775
+ cls_anc = output_cls[video_name][idx]
776
+ reg_anc = output_reg[video_name][idx]
777
+
778
+ proposal_anc_dict = []
779
+ for anc_idx in range(0, len(anchors)):
780
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
781
+
782
+ if len(cls) == 0:
783
+ continue
784
+
785
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
786
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
787
+ st = ed - length
788
+
789
+ for cidx in range(0, len(cls)):
790
+ label = cls[cidx]
791
+ tmp_dict = {}
792
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
793
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
794
+ tmp_dict["label"] = dataset.label_name[label]
795
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
796
+ proposal_anc_dict.append(tmp_dict)
797
+
798
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
799
+
800
+ conf_queue[:-1, :] = conf_queue[1:, :].clone()
801
+ conf_queue[-1, :] = 0
802
+ for proposal in proposal_anc_dict:
803
+ cls_idx = dataset.label_name.index(proposal['label'])
804
+ conf_queue[-1, cls_idx] = proposal["score"]
805
+
806
+ minput = conf_queue.unsqueeze(0)
807
+ suppress_conf = model(minput.cuda())
808
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
809
+
810
+ for cls in range(0, num_class - 1):
811
+ if suppress_conf[cls] > opt['sup_threshold']:
812
+ for proposal in proposal_anc_dict:
813
+ if proposal['label'] == dataset.label_name[cls]:
814
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
815
+ proposal_dict.append(proposal)
816
+
817
+ result_dict[video_name] = proposal_dict
818
+ proposal_dict = []
819
+
820
+ return result_dict
821
+
822
+ def test_frame(opt, video_name=None):
823
+ model = MYNET(opt).cuda()
824
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
825
+ base_dict = checkpoint['state_dict']
826
+ model.load_state_dict(base_dict)
827
+ model.eval()
828
+
829
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
830
+ outfile = h5py.File(opt['frame_result_file'].format(opt['exp']), 'w')
831
+
832
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
833
+
834
+ print("testing loss: %f, cls_loss: %f, reg_loss: %f" % (tot_loss, cls_loss, reg_loss))
835
+
836
+ for video_name in dataset.video_list:
837
+ o_cls = output_cls[video_name]
838
+ o_reg = output_reg[video_name]
839
+ l_cls = labels_cls[video_name]
840
+ l_reg = labels_reg[video_name]
841
+
842
+ dset_predcls = outfile.create_dataset(video_name + '/pred_cls', o_cls.shape, maxshape=o_cls.shape, chunks=True, dtype=np.float32)
843
+ dset_predcls[:, :] = o_cls[:, :]
844
+ dset_predreg = outfile.create_dataset(video_name + '/pred_reg', o_reg.shape, maxshape=o_reg.shape, chunks=True, dtype=np.float32)
845
+ dset_predreg[:, :] = o_reg[:, :]
846
+ dset_labelcls = outfile.create_dataset(video_name + '/label_cls', l_cls.shape, maxshape=l_cls.shape, chunks=True, dtype=np.float32)
847
+ dset_labelcls[:, :] = l_cls[:, :]
848
+ dset_labelreg = outfile.create_dataset(video_name + '/label_reg', l_reg.shape, maxshape=l_reg.shape, chunks=True, dtype=np.float32)
849
+ dset_labelreg[:, :] = l_reg[:, :]
850
+ outfile.close()
851
+
852
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
853
+ return cls_loss, reg_loss, tot_loss
854
+
855
+ def patch_attention(m):
856
+ forward_orig = m.forward
857
+
858
+ def wrap(*args, **kwargs):
859
+ kwargs["need_weights"] = True
860
+ kwargs["average_attn_weights"] = False
861
+ return forward_orig(*args, **kwargs)
862
+
863
+ m.forward = wrap
864
+
865
+ class SaveOutput:
866
+ def __init__(self):
867
+ self.outputs = []
868
+
869
+ def __call__(self, module, module_in, module_out):
870
+ self.outputs.append(module_out[1])
871
+
872
+ def clear(self):
873
+ self.outputs = []
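# Hedged sketch (assumption, not part of the uploaded files): how patch_attention and
# SaveOutput could be wired together to capture per-head attention maps from any
# nn.MultiheadAttention layers inside MYNET. `model` is assumed to be a constructed MYNET.
save_output = SaveOutput()
hook_handles = []
for module in model.modules():
    if isinstance(module, torch.nn.MultiheadAttention):
        patch_attention(module)  # forces need_weights=True, average_attn_weights=False
        hook_handles.append(module.register_forward_hook(save_output))
# After a forward pass, save_output.outputs holds one attention-weight tensor per layer.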
874
+
875
+ def test(opt, video_name=None):
876
+ model = MYNET(opt).cuda()
877
+ checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "_ckp_best.pth.tar")
878
+ base_dict = checkpoint['state_dict']
879
+ model.load_state_dict(base_dict)
880
+ model.eval()
881
+
882
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
883
+
884
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
885
+
886
+ if opt["pptype"] == "nms":
887
+ result_dict = eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
888
+ if opt["pptype"] == "net":
889
+ result_dict = eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
890
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
891
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
892
+ json.dump(output_dict, outfile, indent=2)
893
+ outfile.close()
894
+
895
+ mAP = evaluation_detection(opt)
896
+
897
+ # Compare predicted and ground truth action lengths
898
+ if video_name:
899
+ print("\nComparing Predicted and Ground Truth Action Lengths for Video:", video_name)
900
+ with open(opt["video_anno"].format(opt["split"]), 'r') as f:
901
+ anno_data = json.load(f)
902
+ gt_annotations = anno_data['database'][video_name]['annotations']
903
+ duration = anno_data['database'][video_name]['duration']
904
+
905
+ gt_segments = []
906
+ for anno in gt_annotations:
907
+ start, end = anno['segment']
908
+ label = anno['label']
909
+ duration_seg = end - start
910
+ gt_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration_seg})
911
+
912
+ pred_segments = []
913
+ for pred in result_dict[video_name]:
914
+ start, end = pred['segment']
915
+ label = pred['label']
916
+ score = pred['score']
917
+ duration_seg = end - start
918
+ pred_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration_seg, 'score': score})
919
+
920
+ # Print comparison table
921
+ matches = []
922
+ iou_threshold = VIS_CONFIG['iou_threshold']
923
+ used_gt_indices = set()
924
+ for pred in pred_segments:
925
+ best_iou = 0
926
+ best_gt_idx = None
927
+ for gt_idx, gt in enumerate(gt_segments):
928
+ if gt_idx in used_gt_indices:
929
+ continue
930
+ iou = calc_iou([pred['end'], pred['duration']], [gt['end'], gt['duration']])
931
+ if iou > best_iou and iou >= iou_threshold:
932
+ best_iou = iou
933
+ best_gt_idx = gt_idx
934
+ if best_gt_idx is not None:
935
+ matches.append({
936
+ 'pred': pred,
937
+ 'gt': gt_segments[best_gt_idx],
938
+ 'iou': best_iou
939
+ })
940
+ used_gt_indices.add(best_gt_idx)
941
+ else:
942
+ matches.append({'pred': pred, 'gt': None, 'iou': 0})
943
+
944
+ for gt_idx, gt in enumerate(gt_segments):
945
+ if gt_idx not in used_gt_indices:
946
+ matches.append({'pred': None, 'gt': gt, 'iou': 0})
947
+
948
+ print("\n{:<20} {:<30} {:<30} {:<15} {:<10}".format(
949
+ "Action Label", "Predicted Segment (s)", "Ground Truth Segment (s)", "Duration Diff (s)", "IoU"))
950
+ print("-" * 105)
951
+ for match in matches:
952
+ pred = match['pred']
953
+ gt = match['gt']
954
+ iou = match['iou']
955
+ if pred and gt:
956
+ label = pred['label'] if pred['label'] == gt['label'] else f"{pred['label']} (GT: {gt['label']})"
957
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
958
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
959
+ duration_diff = pred['duration'] - gt['duration']
960
+ print("{:<20} {:<30} {:<30} {:<15.2f} {:<10.2f}".format(
961
+ label, pred_str, gt_str, duration_diff, iou))
962
+ elif pred:
963
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
964
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
965
+ pred['label'], pred_str, "None", "N/A", iou))
966
+ elif gt:
967
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
968
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
969
+ gt['label'], "None", gt_str, "N/A", iou))
970
+
971
+ # Summarize
972
+ matched_count = sum(1 for m in matches if m['pred'] and m['gt'])
973
+ avg_duration_diff = np.mean([m['pred']['duration'] - m['gt']['duration'] for m in matches if m['pred'] and m['gt']]) if matched_count > 0 else 0
974
+ avg_iou = np.mean([m['iou'] for m in matches if m['iou'] > 0]) if any(m['iou'] > 0 for m in matches) else 0
975
+ print(f"\nSummary:")
976
+ print(f"- Total Predictions: {len(pred_segments)}")
977
+ print(f"- Total Ground Truth: {len(gt_segments)}")
978
+ print(f"- Matched Segments: {matched_count}")
979
+ print(f"- Average Duration Difference (Matched): {avg_duration_diff:.2f}s")
980
+ print(f"- Average IoU (Matched): {avg_iou:.2f}")
981
+
982
+ # Generate static visualization
983
+ video_path = opt.get('video_path', '')
984
+ if os.path.exists(video_path):
985
+ visualize_action_lengths(
986
+ video_id=video_name,
987
+ pred_segments=pred_segments,
988
+ gt_segments=gt_segments,
989
+ video_path=video_path,
990
+ duration=duration
991
+ )
992
+ # Generate annotated video
993
+ annotate_video_with_actions(
994
+ video_id=video_name,
995
+ pred_segments=pred_segments,
996
+ gt_segments=gt_segments,
997
+ video_path=video_path
998
+ )
999
+ else:
1000
+ print(f"Warning: Video path {video_path} not found. Skipping visualization and video annotation.")
1001
+
1002
+ return mAP
1003
+
1004
+ def test_online(opt, video_name=None):
1005
+ model = MYNET(opt).cuda()
1006
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
1007
+ base_dict = checkpoint['state_dict']
1008
+ model.load_state_dict(base_dict)
1009
+ model.eval()
1010
+
1011
+ sup_model = SuppressNet(opt).cuda()
1012
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
1013
+ base_dict = checkpoint['state_dict']
1014
+ sup_model.load_state_dict(base_dict)
1015
+ sup_model.eval()
1016
+
1017
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
1018
+ test_loader = torch.utils.data.DataLoader(dataset,
1019
+ batch_size=1, shuffle=False,
1020
+ num_workers=0, pin_memory=True, drop_last=False)
1021
+
1022
+ result_dict = {}
1023
+ proposal_dict = []
1024
+
1025
+ num_class = opt["num_of_class"]
1026
+ unit_size = opt['segment_size']
1027
+ threshold = opt['threshold']
1028
+ anchors = opt['anchors']
1029
+
1030
+ start_time = time.time()
1031
+ total_frames = 0
1032
+
1033
+ for video_name in dataset.video_list:
1034
+ input_queue = torch.zeros((unit_size, opt['feat_dim']))
1035
+ sup_queue = torch.zeros((unit_size, num_class - 1))
1036
+
1037
+ duration = dataset.video_len[video_name]
1038
+ video_time = float(dataset.video_dict[video_name]["duration"])
1039
+ frame_to_time = 100.0 * video_time / duration
1040
+
1041
+ for idx in range(0, duration):
1042
+ total_frames += 1
1043
+ input_queue[:-1, :] = input_queue[1:, :].clone()
1044
+ input_queue[-1:, :] = dataset._get_base_data(video_name, idx, idx + 1)
1045
+
1046
+ minput = input_queue.unsqueeze(0)
1047
+ act_cls, act_reg, _ = model(minput.cuda())
1048
+ act_cls = torch.softmax(act_cls, dim=-1)
1049
+
1050
+ cls_anc = act_cls.squeeze(0).detach().cpu().numpy()
1051
+ reg_anc = act_reg.squeeze(0).detach().cpu().numpy()
1052
+
1053
+ proposal_anc_dict = []
1054
+ for anc_idx in range(0, len(anchors)):
1055
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
1056
+
1057
+ if len(cls) == 0:
1058
+ continue
1059
+
1060
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
1061
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
1062
+ st = ed - length
1063
+
1064
+ for cidx in range(0, len(cls)):
1065
+ label = cls[cidx]
1066
+ tmp_dict = {}
1067
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
1068
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
1069
+ tmp_dict["label"] = dataset.label_name[label]
1070
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
1071
+ proposal_anc_dict.append(tmp_dict)
1072
+
1073
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
1074
+
1075
+ sup_queue[:-1, :] = sup_queue[1:, :].clone()
1076
+ sup_queue[-1, :] = 0
1077
+ for proposal in proposal_anc_dict:
1078
+ cls_idx = dataset.label_name.index(proposal['label'])
1079
+ sup_queue[-1, cls_idx] = proposal["score"]
1080
+
1081
+ minput = sup_queue.unsqueeze(0)
1082
+ suppress_conf = sup_model(minput.cuda())
1083
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
1084
+
1085
+ for cls in range(0, num_class - 1):
1086
+ if suppress_conf[cls] > opt['sup_threshold']:
1087
+ for proposal in proposal_anc_dict:
1088
+ if proposal['label'] == dataset.label_name[cls]:
1089
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
1090
+ proposal_dict.append(proposal)
1091
+
1092
+ result_dict[video_name] = proposal_dict
1093
+ proposal_dict = []
1094
+
1095
+ end_time = time.time()
1096
+ working_time = end_time - start_time
1097
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
1098
+
1099
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
1100
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
1101
+ json.dump(output_dict, outfile, indent=2)
1102
+ outfile.close()
1103
+
1104
+ mAP = evaluation_detection(opt)
1105
+ return mAP
1106
+
1107
+ def main(opt, video_name=None):
1108
+ max_perf = 0
1109
+ if not video_name and 'video_name' in opt:
1110
+ video_name = opt['video_name']
1111
+
1112
+ if opt['mode'] == 'train':
1113
+ max_perf = train(opt)
1114
+ if opt['mode'] == 'test':
1115
+ max_perf = test(opt, video_name=video_name)
1116
+ if opt['mode'] == 'test_frame':
1117
+ max_perf = test_frame(opt, video_name=video_name)
1118
+ if opt['mode'] == 'test_online':
1119
+ max_perf = test_online(opt, video_name=video_name)
1120
+ if opt['mode'] == 'eval':
1121
+ max_perf = evaluation_detection(opt)
1122
+
1123
+ return max_perf
1124
+
1125
+ if __name__ == '__main__':
1126
+ opt = opts.parse_opt()
1127
+ opt = vars(opt)
1128
+ if not os.path.exists(opt["checkpoint_path"]):
1129
+ os.makedirs(opt["checkpoint_path"])
1130
+ opt_file = open(opt["checkpoint_path"] + "/" + opt["exp"] + "_opts.json", "w")
1131
+ json.dump(opt, opt_file)
1132
+ opt_file.close()
1133
+
1134
+ if opt['seed'] >= 0:
1135
+ seed = opt['seed']
1136
+ torch.manual_seed(seed)
1137
+ np.random.seed(seed)
1138
+
1139
+ opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]
1140
+
1141
+ video_name = opt.get('video_name', None)
1142
+ main(opt, video_name=video_name)
1143
+ while(opt['wterm']):
1144
+ pass
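# Hedged illustration (assumption, not part of the uploaded files): shape of the JSON
# written to opt['result_file'], in the ActivityNet-style "VERSION 1.3" layout that
# evaluation_detection consumes. The video id, label and numbers are made up.
example_output = {
    "version": "VERSION 1.3",
    "external_data": {},
    "results": {
        "example_video_id": [
            {"segment": [12.40, 15.85], "score": 0.91, "label": "Cut", "gentime": 15.90},
        ],
    },
}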
short main.py ADDED
@@ -0,0 +1,1040 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.parallel
6
+ import torch.nn.functional as F
7
+ import torch.optim as optim
8
+ import numpy as np
9
+ import opts_egtea as opts
10
+
11
+ import time
12
+ import h5py
13
+ from tqdm import tqdm
14
+ from iou_utils import *
15
+ from eval import evaluation_detection
16
+ from tensorboardX import SummaryWriter
17
+ from dataset import VideoDataSet, calc_iou
18
+ from models import MYNET, SuppressNet
19
+ from loss_func import cls_loss_func, cls_loss_func_, regress_loss_func
20
+ from loss_func import MultiCrossEntropyLoss
21
+ from functools import *
22
+
23
+ import matplotlib.pyplot as plt
24
+ import matplotlib.patches as patches
25
+ import cv2
26
+ from typing import List, Dict, Optional
27
+
28
+ from PIL import Image, ImageDraw, ImageFont
29
+ import warnings
30
+
31
+ # Visualization Configuration (Updated)
32
+ VIS_CONFIG = {
33
+ 'frame_interval': 1.0,
34
+ 'max_frames': 20,
35
+ 'save_dir': './output/visualizations',
36
+ 'video_save_dir': './output/videos',
37
+ 'gt_color': '#1f77b4', # Blue for ground truth (RGB: 31, 119, 180)
38
+ 'pred_color': '#ff7f0e', # Orange for predictions (RGB: 255, 127, 14)
39
+ 'fontsize_label': 10,
40
+ 'fontsize_title': 14,
41
+ 'frame_highlight_both': 'green',
42
+ 'frame_highlight_gt': 'red',
43
+ 'frame_highlight_pred': 'black',
44
+ 'iou_threshold': 0.3,
45
+ 'frame_scale_factor': 0.8,
46
+ 'video_text_scale': 0.5,
47
+ 'video_gt_text_color': (180, 119, 31), # BGR for OpenCV
48
+ 'video_pred_text_color': (14, 127, 255), # BGR for OpenCV
49
+ 'video_text_thickness': 1,
50
+ 'video_font_path': "./data/Poppins ExtraBold Italic 800.ttf",
51
+ 'video_font_fallback': '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
52
+ 'video_pred_text_y': 0.45,
53
+ 'video_gt_text_y': 0.55,
54
+ 'video_footer_height': 150, # Increased to accommodate labels
55
+ 'video_gt_bar_y': 0.5,
56
+ 'video_pred_bar_y': 0.8,
57
+ 'video_bar_height': 0.15,
58
+ 'video_bar_text_scale': 0.7,
59
+ 'min_segment_duration': 1.0,
60
+ 'video_frame_text_y': 0.05, # Position for frame number and FPS
61
+ 'video_bar_label_x': 10, # X-position for GT/Pred labels
62
+ 'video_bar_label_scale': 0.5,
63
+ 'scroll_window_duration': 30.0, # Duration of the visible time window (seconds)
64
+ 'scroll_speed': 0.5, # Seconds to advance the window per second of video
65
+ }
66
+
67
+
68
+ def annotate_video_with_actions(
69
+ video_id: str,
70
+ pred_segments: List[Dict],
71
+ gt_segments: List[Dict],
72
+ video_path: str,
73
+ save_dir: str = VIS_CONFIG['video_save_dir'],
74
+ text_scale: float = VIS_CONFIG['video_text_scale'] * 1.5, # Increased text size by 50%
75
+ gt_text_color: tuple = VIS_CONFIG['video_gt_text_color'],
76
+ pred_text_color: tuple = VIS_CONFIG['video_pred_text_color'],
77
+ text_thickness: int = VIS_CONFIG['video_text_thickness']
78
+ ) -> None:
79
+ """
80
+ Annotate a video with predicted and ground truth action labels, cumulative bars, frame number, and FPS.
81
+ Uses fixed 20-second timeline windows whose bars fill in as the video plays and reset at each window boundary.
82
+ Each action class gets its own bar color; the bars themselves carry no labels or timestamps, and the overlay text is enlarged.
83
+ The "GT" and "Pred" row labels sit on the left, with the bars starting 0.5 inches (48 pixels) to their right.
84
+
85
+ Args:
86
+ video_id: Video identifier (e.g., 'my_video').
87
+ pred_segments: List of predicted segments with 'label', 'start', 'end', 'duration', 'score'.
88
+ gt_segments: List of ground truth segments with 'label', 'start', 'end', 'duration'.
89
+ video_path: Path to the input video file.
90
+ save_dir: Directory to save the annotated video.
91
+ text_scale: Scale factor for text size in video (increased).
92
+ gt_text_color: BGR color tuple for ground truth text.
93
+ pred_text_color: BGR color tuple for predicted text.
94
+ text_thickness: Thickness of text strokes.
95
+ """
96
+ os.makedirs(save_dir, exist_ok=True)
97
+
98
+ # Open input video
99
+ cap = cv2.VideoCapture(video_path)
100
+ if not cap.isOpened():
101
+ print(f"Error: Could not open video {video_path}. Skipping video annotation.")
102
+ return
103
+
104
+ # Get video properties
105
+ fps = cap.get(cv2.CAP_PROP_FPS)
106
+ frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
107
+ frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
108
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
109
+ duration = total_frames / fps
110
+ print(f"Input Video: FPS={fps:.2f}, Resolution={frame_width}x{frame_height}, Total Frames={total_frames}, Duration={duration:.2f}s")
111
+
112
+ # Define output video with extended height for footer
113
+ footer_height = VIS_CONFIG['video_footer_height']
114
+ output_height = frame_height + footer_height
115
+ output_path = os.path.join(save_dir, f"annotated_{video_id}_{opt['exp']}.avi")
116
+ fourcc = cv2.VideoWriter_fourcc(*'XVID')
117
+ out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, output_height))
118
+
119
+ if not out.isOpened():
120
+ print(f"Error: Could not initialize video writer for {output_path}. Check codec availability.")
121
+ cap.release()
122
+ return
123
+
124
+ # Filter short segments
125
+ min_duration = VIS_CONFIG['min_segment_duration']
126
+ gt_segments = [seg for seg in gt_segments if seg['duration'] >= min_duration]
127
+ pred_segments = [seg for seg in pred_segments if seg['duration'] >= min_duration]
128
+ print(f"Filtered Segments: GT={len(gt_segments)}, Pred={len(pred_segments)} (min_duration={min_duration}s)")
129
+
130
+ # Define color palette (BGR)
131
+ color_palette = [
132
+ (128, 0, 0), # Navy Blue
133
+ (60, 20, 220), # Crimson Red
134
+ (0, 128, 0), # Emerald Green
135
+ (128, 0, 128), # Royal Purple
136
+ (79, 69, 54), # Charcoal Gray
137
+ (128, 128, 0), # Teal
138
+ (0, 0, 128), # Maroon
139
+ (130, 0, 75), # Indigo
140
+ (34, 139, 34), # Forest Green
141
+ (0, 85, 204), # Burnt Orange
142
+ (149, 146, 209), # Dusty Rose
143
+ (235, 206, 135), # Sky Blue
144
+ (250, 230, 230), # Lavender
145
+ (191, 226, 159), # Seafoam Green
146
+ (185, 218, 255), # Peach
147
+ (255, 204, 204), # Periwinkle
148
+ (193, 182, 255), # Blush Pink
149
+ (201, 252, 189), # Mint Green
150
+ (144, 128, 112), # Slate Gray
151
+ (112, 25, 25), # Midnight Blue
152
+ (102, 51, 102), # Deep Plum
153
+ (0, 128, 128), # Olive Green
154
+ (171, 71, 0) # Cobalt Blue
155
+ ]
156
+
157
+ # Create color mapping for actions
158
+ action_labels = set(seg['label'] for seg in gt_segments).union(seg['label'] for seg in pred_segments)
159
+ action_color_map = {label: color_palette[i % len(color_palette)] for i, label in enumerate(action_labels)}
160
+ print(f"Action Color Mapping: {action_color_map}")
161
+
162
+ # Convert fallback colors to RGB for PIL
163
+ gt_color_rgb = (gt_text_color[2], gt_text_color[1], gt_text_color[0]) # BGR to RGB
164
+ pred_color_rgb = (pred_text_color[2], pred_text_color[1], pred_text_color[0]) # BGR to RGB
165
+
166
+ # Load font
167
+ font_path = VIS_CONFIG['video_font_path']
168
+ font_fallback = VIS_CONFIG['video_font_fallback']
169
+ font_size = int(20 * text_scale)
170
+ bar_font_size = int(20 * VIS_CONFIG['video_bar_text_scale'])
171
+ font = None
172
+ bar_font = None
173
+ if font_path:
174
+ try:
175
+ font = ImageFont.truetype(font_path, font_size)
176
+ bar_font = ImageFont.truetype(font_path, bar_font_size)
177
+ print(f"Using font: {font_path}")
178
+ except IOError:
179
+ print(f"Warning: Font {font_path} not found. Trying fallback font.")
180
+ if not font:
181
+ try:
182
+ font = ImageFont.truetype(font_fallback, font_size)
183
+ bar_font = ImageFont.truetype(font_fallback, bar_font_size)
184
+ print(f"Using fallback font: {font_fallback}")
185
+ except IOError:
186
+ print(f"Warning: Fallback font {font_fallback} not found. Using OpenCV default font.")
187
+ font = None
188
+ bar_font = None
189
+
190
+ # Fixed window configuration
191
+ window_size = 20.0 # 20-second windows
192
+ num_windows = int(np.ceil(duration / window_size))
193
+
194
+ # Define horizontal gap (0.5 inch = 48 pixels at 96 DPI)
195
+ text_bar_gap = 48 # Pixels
196
+ text_x = 10 # Fixed x-position for GT and Pred labels
197
+
198
+ frame_idx = 0
199
+ written_frames = 0
200
+ while cap.isOpened():
201
+ ret, frame = cap.read()
202
+ if not ret:
203
+ break
204
+
205
+ # Create extended frame with footer
206
+ extended_frame = np.zeros((output_height, frame_width, 3), dtype=np.uint8)
207
+ extended_frame[:frame_height, :, :] = frame
208
+ extended_frame[frame_height:, :, :] = 255 # White footer
209
+
210
+ # Calculate current timestamp
211
+ timestamp = frame_idx / fps
212
+
213
+ # Determine current window
214
+ window_idx = int(timestamp // window_size)
215
+ window_start = window_idx * window_size
216
+ window_end = min(window_start + window_size, duration)
217
+ window_duration = window_end - window_start
218
+ window_timestamp = timestamp - window_start # Relative timestamp within window
219
+
220
+ # Find active GT actions (for text overlay)
221
+ gt_labels = [seg['label'] for seg in gt_segments if seg['start'] <= timestamp <= seg['end']]
222
+ gt_text = "GT: " + ", ".join(gt_labels) if gt_labels else ""
223
+
224
+ # Find active predicted actions (for text overlay)
225
+ pred_labels = [seg['label'] for seg in pred_segments if seg['start'] <= timestamp <= seg['end']]
226
+ pred_text = "Pred: " + ", ".join(pred_labels) if pred_labels else ""
227
+
228
+ # Draw GT and prediction bars in footer (within current window, using original animation)
229
+ footer_y = frame_height
230
+ gt_bar_y = footer_y + int(0.2 * footer_height) # GT bar position
231
+ pred_bar_y = footer_y + int(0.5 * footer_height) # Pred bar position
232
+ bar_height = int(VIS_CONFIG['video_bar_height'] * footer_height)
233
+
234
+ # Calculate text width for GT and Pred labels to determine bar start
235
+ if font:
236
+ gt_text_bbox = bar_font.getbbox("GT")
237
+ pred_text_bbox = bar_font.getbbox("Pred")
238
+ gt_text_width = gt_text_bbox[2] - gt_text_bbox[0]
239
+ pred_text_width = pred_text_bbox[2] - pred_text_bbox[0]
240
+ else:
241
+ gt_text_size, _ = cv2.getTextSize("GT", cv2.FONT_HERSHEY_DUPLEX, VIS_CONFIG['video_bar_text_scale'], 1)
242
+ pred_text_size, _ = cv2.getTextSize("Pred", cv2.FONT_HERSHEY_DUPLEX, VIS_CONFIG['video_bar_text_scale'], 1)
243
+ gt_text_width = gt_text_size[0]
244
+ pred_text_width = pred_text_size[0]
245
+ max_text_width = max(gt_text_width, pred_text_width)
246
+ bar_start_x = text_x + max_text_width + text_bar_gap # Bars start after text + 0.5-inch gap
247
+ bar_width = frame_width - bar_start_x # Adjust bar width to fit remaining space
248
+
249
+ # Draw bars with action-specific colors
250
+ for seg in gt_segments:
251
+ if seg['start'] <= window_end and seg['end'] >= window_start:
252
+ start_t = max(seg['start'], window_start)
253
+ end_t = min(seg['end'], window_start + window_timestamp) # Original animation
254
+ start_x = bar_start_x + int(((start_t - window_start) / window_duration) * bar_width)
255
+ end_x = bar_start_x + int(((end_t - window_start) / window_duration) * bar_width)
256
+ if end_x > start_x:
257
+ cv2.rectangle(
258
+ extended_frame,
259
+ (start_x, gt_bar_y),
260
+ (end_x, gt_bar_y + bar_height),
261
+ action_color_map[seg['label']], # Action-specific color
262
+ -1
263
+ )
264
+
265
+ for seg in pred_segments:
266
+ if seg['start'] <= window_end and seg['end'] >= window_start:
267
+ start_t = max(seg['start'], window_start)
268
+ end_t = min(seg['end'], window_start + window_timestamp) # Original animation
269
+ start_x = bar_start_x + int(((start_t - window_start) / window_duration) * bar_width)
270
+ end_x = bar_start_x + int(((end_t - window_start) / window_duration) * bar_width)
271
+ if end_x > start_x:
272
+ cv2.rectangle(
273
+ extended_frame,
274
+ (start_x, pred_bar_y),
275
+ (end_x, pred_bar_y + bar_height),
276
+ action_color_map[seg['label']], # Action-specific color
277
+ -1
278
+ )
279
+
280
+ if font:
281
+ # Convert frame to PIL image
282
+ frame_rgb = cv2.cvtColor(extended_frame, cv2.COLOR_BGR2RGB)
283
+ pil_image = Image.fromarray(frame_rgb)
284
+ draw = ImageDraw.Draw(pil_image)
285
+
286
+ # Draw frame number and FPS at top center
287
+ frame_info = f"Frame: {frame_idx} | FPS: {fps:.2f}"
288
+ frame_text_bbox = draw.textbbox((0, 0), frame_info, font=font)
289
+ frame_text_width = frame_text_bbox[2] - frame_text_bbox[0]
290
+ frame_text_x = (frame_width - frame_text_width) // 2
291
+ draw.text((frame_text_x, 10), frame_info, font=font, fill=(0, 0, 0))
292
+
293
+ # Draw window timestamp range at top of footer
294
+ window_info = f"{window_start:.1f}s - {window_end:.1f}s"
295
+ window_text_bbox = draw.textbbox((0, 0), window_info, font=bar_font)
296
+ window_text_width = window_text_bbox[2] - window_text_bbox[0]
297
+ window_text_x = (frame_width - window_text_width) // 2
298
+ draw.text((window_text_x, footer_y + 10), window_info, font=bar_font, fill=(0, 0, 0))
299
+
300
+ # Draw GT text in video only if there are actions
301
+ if gt_text:
302
+ gt_y = int(frame_height * VIS_CONFIG['video_gt_text_y'])
303
+ draw.text((10, gt_y), gt_text, font=font, fill=gt_color_rgb)
304
+
305
+ # Draw predicted text in video only if there are actions
306
+ if pred_text:
307
+ pred_y = int(frame_height * VIS_CONFIG['video_pred_text_y'])
308
+ draw.text((10, pred_y), pred_text, font=font, fill=pred_color_rgb)
309
+
310
+ # Draw GT and Pred labels in footer
311
+ draw.text((text_x, gt_bar_y + bar_height // 2), "GT", font=bar_font, fill=gt_color_rgb)
312
+ draw.text((text_x, pred_bar_y + bar_height // 2), "Pred", font=bar_font, fill=pred_color_rgb)
313
+
314
+ # Convert back to OpenCV frame
315
+ extended_frame = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
316
+ else:
317
+ # Fallback to OpenCV font
318
+ frame_info = f"Frame: {frame_idx} | FPS: {fps:.2f}"
319
+ text_size, _ = cv2.getTextSize(frame_info, cv2.FONT_HERSHEY_DUPLEX, text_scale, text_thickness)
320
+ frame_text_x = (frame_width - text_size[0]) // 2
321
+ cv2.putText(
322
+ extended_frame,
323
+ frame_info,
324
+ (frame_text_x, 30),
325
+ cv2.FONT_HERSHEY_DUPLEX,
326
+ text_scale,
327
+ (0, 0, 0),
328
+ text_thickness,
329
+ cv2.LINE_AA
330
+ )
331
+ window_info = f"{window_start:.1f}s - {window_end:.1f}s"
332
+ window_text_size, _ = cv2.getTextSize(window_info, cv2.FONT_HERSHEY_DUPLEX, VIS_CONFIG['video_bar_text_scale'], 1)
333
+ window_text_x = (frame_width - window_text_size[0]) // 2
334
+ cv2.putText(
335
+ extended_frame,
336
+ window_info,
337
+ (window_text_x, footer_y + 20),
338
+ cv2.FONT_HERSHEY_DUPLEX,
339
+ VIS_CONFIG['video_bar_text_scale'],
340
+ (0, 0, 0),
341
+ 1,
342
+ cv2.LINE_AA
343
+ )
344
+ if gt_text:
345
+ cv2.putText(
346
+ extended_frame,
347
+ gt_text,
348
+ (10, int(frame_height * VIS_CONFIG['video_gt_text_y'])),
349
+ cv2.FONT_HERSHEY_DUPLEX,
350
+ text_scale,
351
+ gt_text_color,
352
+ text_thickness,
353
+ cv2.LINE_AA
354
+ )
355
+ if pred_text:
356
+ cv2.putText(
357
+ extended_frame,
358
+ pred_text,
359
+ (10, int(frame_height * VIS_CONFIG['video_pred_text_y'])),
360
+ cv2.FONT_HERSHEY_DUPLEX,
361
+ text_scale,
362
+ pred_text_color,
363
+ text_thickness,
364
+ cv2.LINE_AA
365
+ )
366
+ cv2.putText(
367
+ extended_frame,
368
+ "GT",
369
+ (text_x, gt_bar_y + bar_height // 2 + 5),
370
+ cv2.FONT_HERSHEY_DUPLEX,
371
+ VIS_CONFIG['video_bar_text_scale'],
372
+ gt_text_color,
373
+ 1,
374
+ cv2.LINE_AA
375
+ )
376
+ cv2.putText(
377
+ extended_frame,
378
+ "Pred",
379
+ (text_x, pred_bar_y + bar_height // 2 + 5),
380
+ cv2.FONT_HERSHEY_DUPLEX,
381
+ VIS_CONFIG['video_bar_text_scale'],
382
+ pred_text_color,
383
+ 1,
384
+ cv2.LINE_AA
385
+ )
386
+
387
+ # Write frame to output video
388
+ out.write(extended_frame)
389
+ written_frames += 1
390
+ frame_idx += 1
391
+
392
+ # Release resources
393
+ cap.release()
394
+ out.release()
395
+ print(f"[✅ Saved Annotated Video]: {output_path}, Written Frames={written_frames}")
396
+ print("Note: If .avi is not playable, convert to .mp4 using FFmpeg:")
397
+ print(f"ffmpeg -i {output_path} -vcodec libx264 -acodec aac {output_path.replace('.avi', '.mp4')}")
398
+
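+ # Example call (video id, labels and path below are illustrative only; opt must already be
+ # populated, as in __main__ below, since the output filename uses opt['exp']):
+ # annotate_video_with_actions(
+ #     video_id='P01-R01',
+ #     pred_segments=[{'label': 'Take bowl', 'start': 3.2, 'end': 7.9, 'duration': 4.7, 'score': 0.81}],
+ #     gt_segments=[{'label': 'Take bowl', 'start': 3.0, 'end': 8.0, 'duration': 5.0}],
+ #     video_path='./data/videos/P01-R01.mp4')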
399
+
400
+
401
+
402
+
403
+
404
+
405
+
406
+ def visualize_action_lengths(
407
+ video_id: str,
408
+ pred_segments: List[Dict],
409
+ gt_segments: List[Dict],
410
+ video_path: str,
411
+ duration: float,
412
+ save_dir: str = VIS_CONFIG['save_dir'],
413
+ frame_interval: float = VIS_CONFIG['frame_interval']
414
+ ) -> None:
415
+ """
416
+ Generate a visualization plot comparing ground truth and predicted action lengths with video frames.
417
+
418
+ Args:
419
+ video_id: Video identifier (e.g., 'my_video').
420
+ pred_segments: List of predicted segments with 'label', 'start', 'end', 'duration', 'score'.
421
+ gt_segments: List of ground truth segments with 'label', 'start', 'end', 'duration'.
422
+ video_path: Path to the input video file.
423
+ duration: Total duration of the video in seconds.
424
+ save_dir: Directory to save the output image.
425
+ frame_interval: Time interval between sampled frames (seconds).
426
+ """
427
+ os.makedirs(save_dir, exist_ok=True)
428
+
429
+ # Calculate frame sampling times
430
+ num_frames = int(duration / frame_interval) + 1
431
+ if num_frames > VIS_CONFIG['max_frames']:
432
+ frame_interval = duration / (VIS_CONFIG['max_frames'] - 1)
433
+ num_frames = VIS_CONFIG['max_frames']
434
+ print(f"Warning: Video duration ({duration:.1f}s) exceeds the {num_frames}-frame sampling budget. Adjusted frame_interval to {frame_interval:.2f}s.")
435
+
436
+ frame_times = np.linspace(0, duration, num_frames, endpoint=False)
437
+
438
+ # Load video frames
439
+ frames = []
440
+ cap = cv2.VideoCapture(video_path)
441
+ if not cap.isOpened():
442
+ print(f"Warning: Could not open video {video_path}. Using placeholder frames.")
443
+ frames = [np.ones((100, 100, 3), dtype=np.uint8) * 255 for _ in frame_times]
444
+ else:
445
+ for t in frame_times:
446
+ cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000)
447
+ ret, frame = cap.read()
448
+ if ret:
449
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
450
+ # Resize frame to reduce memory usage
451
+ frame = cv2.resize(frame, (int(frame.shape[1] * 0.5), int(frame.shape[0] * 0.5)))
452
+ frames.append(frame)
453
+ else:
454
+ frames.append(np.ones((100, 100, 3), dtype=np.uint8) * 255)
455
+ cap.release()
456
+
457
+ # Initialize figure
458
+ fig = plt.figure(figsize=(num_frames * VIS_CONFIG['frame_scale_factor'], 6), constrained_layout=True)
459
+ gs = fig.add_gridspec(3, num_frames, height_ratios=[3, 1, 1])
460
+
461
+ # Plot frames
462
+ for i, (t, frame) in enumerate(zip(frame_times, frames)):
463
+ ax = fig.add_subplot(gs[0, i])
464
+
465
+ # Check if frame falls within GT or predicted segments
466
+ gt_hit = any(seg['start'] <= t <= seg['end'] for seg in gt_segments)
467
+ pred_hit = any(seg['start'] <= t <= seg['end'] for seg in pred_segments)
468
+
469
+ # Set border color
470
+ border_color = None
471
+ if gt_hit and pred_hit:
472
+ border_color = VIS_CONFIG['frame_highlight_both']
473
+ elif gt_hit:
474
+ border_color = VIS_CONFIG['frame_highlight_gt']
475
+ elif pred_hit:
476
+ border_color = VIS_CONFIG['frame_highlight_pred']
477
+
478
+ ax.imshow(frame)
479
+ ax.axis('off')
480
+ if border_color:
481
+ for spine in ax.spines.values():
482
+ spine.set_edgecolor(border_color)
483
+ spine.set_linewidth(2)
484
+
485
+ ax.set_title(f"{t:.1f}s", fontsize=VIS_CONFIG['fontsize_label'],
486
+ color=border_color if border_color else 'black')
487
+
488
+ # Plot ground truth bar
489
+ ax_gt = fig.add_subplot(gs[1, :])
490
+ ax_gt.set_xlim(0, duration)
491
+ ax_gt.set_ylim(0, 1)
492
+ ax_gt.axis('off')
493
+ ax_gt.text(-0.02 * duration, 0.5, "Ground Truth", fontsize=VIS_CONFIG['fontsize_title'],
494
+ va='center', ha='right', weight='bold')
495
+
496
+ for seg in gt_segments:
497
+ start, end = seg['start'], seg['end']
498
+ width = end - start
499
+ label = seg['label'][:10] + '...' if len(seg['label']) > 10 else seg['label']
500
+ ax_gt.add_patch(patches.Rectangle(
501
+ (start, 0.3), width, 0.4, facecolor=VIS_CONFIG['gt_color'],
502
+ edgecolor='black', alpha=0.8
503
+ ))
504
+ ax_gt.text((start + end) / 2, 0.5, label, ha='center', va='center',
505
+ fontsize=VIS_CONFIG['fontsize_label'], color='white')
506
+ ax_gt.text(start, 0.2, f"{start:.1f}", ha='center', fontsize=8, color='black')
507
+ ax_gt.text(end, 0.2, f"{end:.1f}", ha='center', fontsize=8, color='black')
508
+
509
+ # Plot prediction bar
510
+ ax_pred = fig.add_subplot(gs[2, :])
511
+ ax_pred.set_xlim(0, duration)
512
+ ax_pred.set_ylim(0, 1)
513
+ ax_pred.axis('off')
514
+ ax_pred.text(-0.02 * duration, 0.5, "Prediction", fontsize=VIS_CONFIG['fontsize_title'],
515
+ va='center', ha='right', weight='bold')
516
+
517
+ for seg in pred_segments:
518
+ start, end = seg['start'], seg['end']
519
+ width = end - start
520
+ label = seg['label'][:10] + '...' if len(seg['label']) > 10 else seg['label']
521
+ ax_pred.add_patch(patches.Rectangle(
522
+ (start, 0.3), width, 0.4, facecolor=VIS_CONFIG['pred_color'],
523
+ edgecolor='black', alpha=0.8
524
+ ))
525
+ ax_pred.text((start + end) / 2, 0.5, label, ha='center', va='center',
526
+ fontsize=VIS_CONFIG['fontsize_label'], color='white')
527
+ ax_pred.text(start, 0.8, f"{start:.1f}", ha='center', fontsize=8, color='black')
528
+ ax_pred.text(end, 0.8, f"{end:.1f}", ha='center', fontsize=8, color='black')
529
+
530
+ # Save plot
531
+ plot_path = os.path.join(save_dir, f"viz_{video_id}_{opt['exp']}.png")  # saved as PNG
532
+ plt.savefig(plot_path, dpi=100, bbox_inches='tight')  # modest DPI keeps the file small
533
+ print(f"[✅ Saved Visualization]: {plot_path}")
534
+ plt.close()
535
+
536
+
537
+
538
+
539
+
540
+ def eval_frame(opt, model, dataset):
541
+ test_loader = torch.utils.data.DataLoader(dataset,
542
+ batch_size=opt['batch_size'], shuffle=False,
543
+ num_workers=0, pin_memory=True, drop_last=False)
544
+
545
+ labels_cls = {}
546
+ labels_reg = {}
547
+ output_cls = {}
548
+ output_reg = {}
549
+ for video_name in dataset.video_list:
550
+ labels_cls[video_name] = []
551
+ labels_reg[video_name] = []
552
+ output_cls[video_name] = []
553
+ output_reg[video_name] = []
554
+
555
+ start_time = time.time()
556
+ total_frames = 0
557
+ epoch_cost = 0
558
+ epoch_cost_cls = 0
559
+ epoch_cost_reg = 0
560
+
561
+ for n_iter, (input_data, cls_label, reg_label, _) in enumerate(tqdm(test_loader)):
562
+ act_cls, act_reg, _ = model(input_data.float().cuda())
563
+ cost_reg = 0
564
+ cost_cls = 0
565
+
566
+ loss = cls_loss_func(cls_label, act_cls)
567
+ cost_cls = loss
568
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
569
+
570
+ loss = regress_loss_func(reg_label, act_reg)
571
+ cost_reg = loss
572
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
573
+
574
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg
575
+ epoch_cost += cost.detach().cpu().numpy()
576
+
577
+ act_cls = torch.softmax(act_cls, dim=-1)
578
+
579
+ total_frames += input_data.size(0)
580
+
581
+ for b in range(0, input_data.size(0)):
582
+ video_name, st, ed, data_idx = dataset.inputs[n_iter * opt['batch_size'] + b]
583
+ output_cls[video_name] += [act_cls[b, :].detach().cpu().numpy()]
584
+ output_reg[video_name] += [act_reg[b, :].detach().cpu().numpy()]
585
+ labels_cls[video_name] += [cls_label[b, :].numpy()]
586
+ labels_reg[video_name] += [reg_label[b, :].numpy()]
587
+
588
+ end_time = time.time()
589
+ working_time = end_time - start_time
590
+
591
+ for video_name in dataset.video_list:
592
+ labels_cls[video_name] = np.stack(labels_cls[video_name], axis=0)
593
+ labels_reg[video_name] = np.stack(labels_reg[video_name], axis=0)
594
+ output_cls[video_name] = np.stack(output_cls[video_name], axis=0)
595
+ output_reg[video_name] = np.stack(output_reg[video_name], axis=0)
596
+
597
+ cls_loss = epoch_cost_cls / (n_iter + 1)  # n_iter is the last batch index, so average over n_iter + 1 batches
598
+ reg_loss = epoch_cost_reg / (n_iter + 1)
599
+ tot_loss = epoch_cost / (n_iter + 1)
600
+
601
+ return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames
602
+
603
+ def eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
604
+ result_dict = {}
605
+ proposal_dict = []
606
+
607
+ num_class = opt["num_of_class"]
608
+ unit_size = opt['segment_size']
609
+ threshold = opt['threshold']
610
+ anchors = opt['anchors']
611
+
612
+ for video_name in dataset.video_list:
613
+ duration = dataset.video_len[video_name]
614
+ video_time = float(dataset.video_dict[video_name]["duration"])
615
+ frame_to_time = 100.0 * video_time / duration
616
+
617
+ for idx in range(0, duration):
618
+ cls_anc = output_cls[video_name][idx]
619
+ reg_anc = output_reg[video_name][idx]
620
+
621
+ proposal_anc_dict = []
622
+ for anc_idx in range(0, len(anchors)):
623
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
624
+
625
+ if len(cls) == 0:
626
+ continue
627
+
628
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
629
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
630
+ st = ed - length
631
+
632
+ for cidx in range(0, len(cls)):
633
+ label = cls[cidx]
634
+ tmp_dict = {}
635
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
636
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
637
+ tmp_dict["label"] = dataset.label_name[label]
638
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
639
+ proposal_anc_dict.append(tmp_dict)
640
+
641
+ proposal_dict += proposal_anc_dict
642
+
643
+ proposal_dict = non_max_suppression(proposal_dict, overlapThresh=opt['soft_nms'])
644
+ result_dict[video_name] = proposal_dict
645
+ proposal_dict = []
646
+
647
+ return result_dict
648
+
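+ # Shared proposal decoding (also used in eval_map_supnet and test_online): for frame idx and
+ # anchor size A with regression output (o, l), end = idx + A*o, length = A*exp(l), and
+ # start = end - length; frame indices are converted to seconds via frame_to_time / 100.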
649
+ def eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
650
+ model = SuppressNet(opt).cuda()
651
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
652
+ base_dict = checkpoint['state_dict']
653
+ model.load_state_dict(base_dict)
654
+ model.eval()
655
+
656
+ result_dict = {}
657
+ proposal_dict = []
658
+
659
+ num_class = opt["num_of_class"]
660
+ unit_size = opt['segment_size']
661
+ threshold = opt['threshold']
662
+ anchors = opt['anchors']
663
+
664
+ for video_name in dataset.video_list:
665
+ duration = dataset.video_len[video_name]
666
+ video_time = float(dataset.video_dict[video_name]["duration"])
667
+ frame_to_time = 100.0 * video_time / duration
668
+ conf_queue = torch.zeros((unit_size, num_class - 1))
669
+
670
+ for idx in range(0, duration):
671
+ cls_anc = output_cls[video_name][idx]
672
+ reg_anc = output_reg[video_name][idx]
673
+
674
+ proposal_anc_dict = []
675
+ for anc_idx in range(0, len(anchors)):
676
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
677
+
678
+ if len(cls) == 0:
679
+ continue
680
+
681
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
682
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
683
+ st = ed - length
684
+
685
+ for cidx in range(0, len(cls)):
686
+ label = cls[cidx]
687
+ tmp_dict = {}
688
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
689
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
690
+ tmp_dict["label"] = dataset.label_name[label]
691
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
692
+ proposal_anc_dict.append(tmp_dict)
693
+
694
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
695
+
696
+ conf_queue[:-1, :] = conf_queue[1:, :].clone()
697
+ conf_queue[-1, :] = 0
698
+ for proposal in proposal_anc_dict:
699
+ cls_idx = dataset.label_name.index(proposal['label'])
700
+ conf_queue[-1, cls_idx] = proposal["score"]
701
+
702
+ minput = conf_queue.unsqueeze(0)
703
+ suppress_conf = model(minput.cuda())
704
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
705
+
706
+ for cls in range(0, num_class - 1):
707
+ if suppress_conf[cls] > opt['sup_threshold']:
708
+ for proposal in proposal_anc_dict:
709
+ if proposal['label'] == dataset.label_name[cls]:
710
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
711
+ proposal_dict.append(proposal)
712
+
713
+ result_dict[video_name] = proposal_dict
714
+ proposal_dict = []
715
+
716
+ return result_dict
717
+
718
+ def test_frame(opt, video_name=None):
719
+ model = MYNET(opt).cuda()
720
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
721
+ base_dict = checkpoint['state_dict']
722
+ model.load_state_dict(base_dict)
723
+ model.eval()
724
+
725
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
726
+ outfile = h5py.File(opt['frame_result_file'].format(opt['exp']), 'w')
727
+
728
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
729
+
730
+ print("testing loss: %f, cls_loss: %f, reg_loss: %f" % (tot_loss, cls_loss, reg_loss))
731
+
732
+ for video_name in dataset.video_list:
733
+ o_cls = output_cls[video_name]
734
+ o_reg = output_reg[video_name]
735
+ l_cls = labels_cls[video_name]
736
+ l_reg = labels_reg[video_name]
737
+
738
+ dset_predcls = outfile.create_dataset(video_name + '/pred_cls', o_cls.shape, maxshape=o_cls.shape, chunks=True, dtype=np.float32)
739
+ dset_predcls[:, :] = o_cls[:, :]
740
+ dset_predreg = outfile.create_dataset(video_name + '/pred_reg', o_reg.shape, maxshape=o_reg.shape, chunks=True, dtype=np.float32)
741
+ dset_predreg[:, :] = o_reg[:, :]
742
+ dset_labelcls = outfile.create_dataset(video_name + '/label_cls', l_cls.shape, maxshape=l_cls.shape, chunks=True, dtype=np.float32)
743
+ dset_labelcls[:, :] = l_cls[:, :]
744
+ dset_labelreg = outfile.create_dataset(video_name + '/label_reg', l_reg.shape, maxshape=l_reg.shape, chunks=True, dtype=np.float32)
745
+ dset_labelreg[:, :] = l_reg[:, :]
746
+ outfile.close()
747
+
748
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
749
+ return cls_loss, reg_loss, tot_loss
750
+
751
+ def patch_attention(m):
752
+ forward_orig = m.forward
753
+
754
+ def wrap(*args, **kwargs):
755
+ kwargs["need_weights"] = True
756
+ kwargs["average_attn_weights"] = False
757
+ return forward_orig(*args, **kwargs)
758
+
759
+ m.forward = wrap
760
+
761
+ class SaveOutput:
762
+ def __init__(self):
763
+ self.outputs = []
764
+
765
+ def __call__(self, module, module_in, module_out):
766
+ self.outputs.append(module_out[1])
767
+
768
+ def clear(self):
769
+ self.outputs = []
770
+
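+ # patch_attention / SaveOutput are attention-inspection helpers: patch_attention rewraps a
+ # torch nn.MultiheadAttention forward so it returns per-head weights (need_weights=True,
+ # average_attn_weights=False), and SaveOutput, when registered as a forward hook, collects
+ # those weights from module_out[1]. Nothing in this file attaches them; they are available
+ # for manual attention debugging.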
771
+ def test(opt, video_name=None):
772
+ model = MYNET(opt).cuda()
773
+ checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "_ckp_best.pth.tar")
774
+ base_dict = checkpoint['state_dict']
775
+ model.load_state_dict(base_dict)
776
+ model.eval()
777
+
778
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
779
+
780
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
781
+
782
+ if opt["pptype"] == "nms":
783
+ result_dict = eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
784
+ if opt["pptype"] == "net":
785
+ result_dict = eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
786
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
787
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
788
+ json.dump(output_dict, outfile, indent=2)
789
+ outfile.close()
790
+
791
+ mAP = evaluation_detection(opt)
792
+
793
+ # Compare predicted and ground truth action lengths
794
+ if video_name:
795
+ print("\nComparing Predicted and Ground Truth Action Lengths for Video:", video_name)
796
+ with open(opt["video_anno"].format(opt["split"]), 'r') as f:
797
+ anno_data = json.load(f)
798
+ gt_annotations = anno_data['database'][video_name]['annotations']
799
+ duration = anno_data['database'][video_name]['duration']
800
+
801
+ gt_segments = []
802
+ for anno in gt_annotations:
803
+ start, end = anno['segment']
804
+ label = anno['label']
805
+ duration_seg = end - start
806
+ gt_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration_seg})
807
+
808
+ pred_segments = []
809
+ for pred in result_dict[video_name]:
810
+ start, end = pred['segment']
811
+ label = pred['label']
812
+ score = pred['score']
813
+ duration_seg = end - start
814
+ pred_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration_seg, 'score': score})
815
+
816
+ # Print comparison table
817
+ matches = []
818
+ iou_threshold = VIS_CONFIG['iou_threshold']
819
+ used_gt_indices = set()
820
+ for pred in pred_segments:
821
+ best_iou = 0
822
+ best_gt_idx = None
823
+ for gt_idx, gt in enumerate(gt_segments):
824
+ if gt_idx in used_gt_indices:
825
+ continue
826
+ iou = calc_iou([pred['end'], pred['duration']], [gt['end'], gt['duration']])
827
+ if iou > best_iou and iou >= iou_threshold:
828
+ best_iou = iou
829
+ best_gt_idx = gt_idx
830
+ if best_gt_idx is not None:
831
+ matches.append({
832
+ 'pred': pred,
833
+ 'gt': gt_segments[best_gt_idx],
834
+ 'iou': best_iou
835
+ })
836
+ used_gt_indices.add(best_gt_idx)
837
+ else:
838
+ matches.append({'pred': pred, 'gt': None, 'iou': 0})
839
+
840
+ for gt_idx, gt in enumerate(gt_segments):
841
+ if gt_idx not in used_gt_indices:
842
+ matches.append({'pred': None, 'gt': gt, 'iou': 0})
843
+
844
+ print("\n{:<20} {:<30} {:<30} {:<15} {:<10}".format(
845
+ "Action Label", "Predicted Segment (s)", "Ground Truth Segment (s)", "Duration Diff (s)", "IoU"))
846
+ print("-" * 105)
847
+ for match in matches:
848
+ pred = match['pred']
849
+ gt = match['gt']
850
+ iou = match['iou']
851
+ if pred and gt:
852
+ label = pred['label'] if pred['label'] == gt['label'] else f"{pred['label']} (GT: {gt['label']})"
853
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
854
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
855
+ duration_diff = pred['duration'] - gt['duration']
856
+ print("{:<20} {:<30} {:<30} {:<15.2f} {:<10.2f}".format(
857
+ label, pred_str, gt_str, duration_diff, iou))
858
+ elif pred:
859
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
860
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
861
+ pred['label'], pred_str, "None", "N/A", iou))
862
+ elif gt:
863
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
864
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
865
+ gt['label'], "None", gt_str, "N/A", iou))
866
+
867
+ # Summarize
868
+ matched_count = sum(1 for m in matches if m['pred'] and m['gt'])
869
+ avg_duration_diff = np.mean([m['pred']['duration'] - m['gt']['duration'] for m in matches if m['pred'] and m['gt']]) if matched_count > 0 else 0
870
+ avg_iou = np.mean([m['iou'] for m in matches if m['iou'] > 0]) if any(m['iou'] > 0 for m in matches) else 0
871
+ print(f"\nSummary:")
872
+ print(f"- Total Predictions: {len(pred_segments)}")
873
+ print(f"- Total Ground Truth: {len(gt_segments)}")
874
+ print(f"- Matched Segments: {matched_count}")
875
+ print(f"- Average Duration Difference (Matched): {avg_duration_diff:.2f}s")
876
+ print(f"- Average IoU (Matched): {avg_iou:.2f}")
877
+
878
+ # Generate static visualization
879
+ video_path = opt.get('video_path', '')
880
+ if os.path.exists(video_path):
881
+ visualize_action_lengths(
882
+ video_id=video_name,
883
+ pred_segments=pred_segments,
884
+ gt_segments=gt_segments,
885
+ video_path=video_path,
886
+ duration=duration
887
+ )
888
+ # Generate annotated video
889
+ annotate_video_with_actions(
890
+ video_id=video_name,
891
+ pred_segments=pred_segments,
892
+ gt_segments=gt_segments,
893
+ video_path=video_path
894
+ )
895
+ else:
896
+ print(f"Warning: Video path {video_path} not found. Skipping visualization and video annotation.")
897
+
898
+ return mAP
899
+
900
+ def test_online(opt, video_name=None):
901
+ model = MYNET(opt).cuda()
902
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
903
+ base_dict = checkpoint['state_dict']
904
+ model.load_state_dict(base_dict)
905
+ model.eval()
906
+
907
+ sup_model = SuppressNet(opt).cuda()
908
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
909
+ base_dict = checkpoint['state_dict']
910
+ sup_model.load_state_dict(base_dict)
911
+ sup_model.eval()
912
+
913
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
914
+ test_loader = torch.utils.data.DataLoader(dataset,
915
+ batch_size=1, shuffle=False,
916
+ num_workers=0, pin_memory=True, drop_last=False)
917
+
918
+ result_dict = {}
919
+ proposal_dict = []
920
+
921
+ num_class = opt["num_of_class"]
922
+ unit_size = opt['segment_size']
923
+ threshold = opt['threshold']
924
+ anchors = opt['anchors']
925
+
926
+ start_time = time.time()
927
+ total_frames = 0
928
+
929
+ for video_name in dataset.video_list:
930
+ input_queue = torch.zeros((unit_size, opt['feat_dim']))
931
+ sup_queue = torch.zeros((unit_size, num_class - 1))
932
+
933
+ duration = dataset.video_len[video_name]
934
+ video_time = float(dataset.video_dict[video_name]["duration"])
935
+ frame_to_time = 100.0 * video_time / duration
936
+
937
+ for idx in range(0, duration):
938
+ total_frames += 1
939
+ input_queue[:-1, :] = input_queue[1:, :].clone()
940
+ input_queue[-1:, :] = dataset._get_base_data(video_name, idx, idx + 1)
941
+
942
+ minput = input_queue.unsqueeze(0)
943
+ act_cls, act_reg, _ = model(minput.cuda())
944
+ act_cls = torch.softmax(act_cls, dim=-1)
945
+
946
+ cls_anc = act_cls.squeeze(0).detach().cpu().numpy()
947
+ reg_anc = act_reg.squeeze(0).detach().cpu().numpy()
948
+
949
+ proposal_anc_dict = []
950
+ for anc_idx in range(0, len(anchors)):
951
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
952
+
953
+ if len(cls) == 0:
954
+ continue
955
+
956
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
957
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
958
+ st = ed - length
959
+
960
+ for cidx in range(0, len(cls)):
961
+ label = cls[cidx]
962
+ tmp_dict = {}
963
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
964
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
965
+ tmp_dict["label"] = dataset.label_name[label]
966
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
967
+ proposal_anc_dict.append(tmp_dict)
968
+
969
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
970
+
971
+ sup_queue[:-1, :] = sup_queue[1:, :].clone()
972
+ sup_queue[-1, :] = 0
973
+ for proposal in proposal_anc_dict:
974
+ cls_idx = dataset.label_name.index(proposal['label'])
975
+ sup_queue[-1, cls_idx] = proposal["score"]
976
+
977
+ minput = sup_queue.unsqueeze(0)
978
+ suppress_conf = sup_model(minput.cuda())
979
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
980
+
981
+ for cls in range(0, num_class - 1):
982
+ if suppress_conf[cls] > opt['sup_threshold']:
983
+ for proposal in proposal_anc_dict:
984
+ if proposal['label'] == dataset.label_name[cls]:
985
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
986
+ proposal_dict.append(proposal)
987
+
988
+ result_dict[video_name] = proposal_dict
989
+ proposal_dict = []
990
+
991
+ end_time = time.time()
992
+ working_time = end_time - start_time
993
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
994
+
995
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
996
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
997
+ json.dump(output_dict, outfile, indent=2)
998
+ outfile.close()
999
+
1000
+ mAP = evaluation_detection(opt)
1001
+ return mAP
1002
+
1003
+ def main(opt, video_name=None):
1004
+ max_perf = 0
1005
+ if not video_name and 'video_name' in opt:
1006
+ video_name = opt['video_name']
1007
+
1008
+ if opt['mode'] == 'train':
1009
+ max_perf = train(opt)
1010
+ if opt['mode'] == 'test':
1011
+ max_perf = test(opt, video_name=video_name)
1012
+ if opt['mode'] == 'test_frame':
1013
+ max_perf = test_frame(opt, video_name=video_name)
1014
+ if opt['mode'] == 'test_online':
1015
+ max_perf = test_online(opt, video_name=video_name)
1016
+ if opt['mode'] == 'eval':
1017
+ max_perf = evaluation_detection(opt)
1018
+
1019
+ return max_perf
1020
+
1021
+ if __name__ == '__main__':
1022
+ opt = opts.parse_opt()
1023
+ opt = vars(opt)
1024
+ if not os.path.exists(opt["checkpoint_path"]):
1025
+ os.makedirs(opt["checkpoint_path"])
1026
+ opt_file = open(opt["checkpoint_path"] + "/" + opt["exp"] + "_opts.json", "w")
1027
+ json.dump(opt, opt_file)
1028
+ opt_file.close()
1029
+
1030
+ if opt['seed'] >= 0:
1031
+ seed = opt['seed']
1032
+ torch.manual_seed(seed)
1033
+ np.random.seed(seed)
1034
+
1035
+ opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]
1036
+
1037
+ video_name = opt.get('video_name', None)
1038
+ main(opt, video_name=video_name)
1039
+ while opt['wterm']:
1040
+ pass
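Both main.py variants build the prediction-vs-ground-truth table with the same greedy matcher: each prediction claims the unused ground-truth segment with the highest IoU above the 0.3 threshold, and leftover segments on either side are reported unmatched. A minimal sketch of that matcher in isolation, with `calc_iou` imported from dataset.py and called on [end, duration] pairs exactly as in the files above:

def greedy_match(pred_segments, gt_segments, calc_iou, iou_threshold=0.3):
    """Greedy one-to-one matching of predicted to ground-truth segments by IoU."""
    matches, used = [], set()
    for pred in pred_segments:
        best_iou, best_idx = 0.0, None
        for i, gt in enumerate(gt_segments):
            if i in used:
                continue
            iou = calc_iou([pred['end'], pred['duration']], [gt['end'], gt['duration']])
            if iou > best_iou and iou >= iou_threshold:
                best_iou, best_idx = iou, i
        if best_idx is not None:
            matches.append({'pred': pred, 'gt': gt_segments[best_idx], 'iou': best_iou})
            used.add(best_idx)
        else:
            matches.append({'pred': pred, 'gt': None, 'iou': 0.0})
    # unmatched ground-truth segments are reported with no prediction
    matches += [{'pred': None, 'gt': gt, 'iou': 0.0}
                for i, gt in enumerate(gt_segments) if i not in used]
    return matches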
single prediction and Gt print main.py ADDED
@@ -0,0 +1,613 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.parallel
6
+ import torch.nn.functional as F
7
+ import torch.optim as optim
8
+ import numpy as np
9
+ import opts_egtea as opts
10
+
11
+ import time
12
+ import h5py
13
+ from tqdm import tqdm
14
+ from iou_utils import *
15
+ from eval import evaluation_detection
16
+ from tensorboardX import SummaryWriter
17
+ from dataset import VideoDataSet, calc_iou # Import calc_iou explicitly
18
+ from models import MYNET, SuppressNet
19
+ from loss_func import cls_loss_func, cls_loss_func_, regress_loss_func
20
+ from loss_func import MultiCrossEntropyLoss
21
+ from functools import *
22
+
23
+ def train_one_epoch(opt, model, train_dataset, optimizer, warmup=False):
24
+ train_loader = torch.utils.data.DataLoader(train_dataset,
25
+ batch_size=opt['batch_size'], shuffle=True,
26
+ num_workers=0, pin_memory=True, drop_last=False)
27
+ epoch_cost = 0
28
+ epoch_cost_cls = 0
29
+ epoch_cost_reg = 0
30
+ epoch_cost_snip = 0
31
+
32
+ total_iter = len(train_dataset) // opt['batch_size']
33
+ cls_loss = MultiCrossEntropyLoss(focal=True)
34
+ snip_loss = MultiCrossEntropyLoss(focal=True)
35
+ for n_iter, (input_data, cls_label, reg_label, snip_label) in enumerate(tqdm(train_loader)):
36
+ if warmup:
37
+ for g in optimizer.param_groups:
38
+ g['lr'] = n_iter * (opt['lr']) / total_iter
39
+
40
+ act_cls, act_reg, snip_cls = model(input_data.float().cuda())
41
+
42
+ act_cls.register_hook(partial(cls_loss.collect_grad, cls_label))
43
+ snip_cls.register_hook(partial(snip_loss.collect_grad, snip_label))
44
+
45
+ cost_reg = 0
46
+ cost_cls = 0
47
+
48
+ loss = cls_loss_func_(cls_loss, cls_label, act_cls)
49
+ cost_cls = loss
50
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
51
+
52
+ loss = regress_loss_func(reg_label, act_reg)
53
+ cost_reg = loss
54
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
55
+
56
+ loss = cls_loss_func_(snip_loss, snip_label, snip_cls)
57
+ cost_snip = loss
58
+ epoch_cost_snip += cost_snip.detach().cpu().numpy()
59
+
60
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg + opt['gamma'] * cost_snip
61
+ epoch_cost += cost.detach().cpu().numpy()
62
+
63
+ optimizer.zero_grad()
64
+ cost.backward()
65
+ optimizer.step()
66
+
67
+ return n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip
68
+
69
+ def eval_one_epoch(opt, model, test_dataset):
70
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, test_dataset)
71
+
72
+ result_dict = eval_map_nms(opt, test_dataset, output_cls, output_reg, labels_cls, labels_reg)
73
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
74
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
75
+ json.dump(output_dict, outfile, indent=2)
76
+ outfile.close()
77
+
78
+ IoUmAP = evaluation_detection(opt, verbose=False)
79
+ IoUmAP_5 = sum(IoUmAP) / len(IoUmAP)  # average AP over the returned IoU thresholds
80
+
81
+ return cls_loss, reg_loss, tot_loss, IoUmAP_5
82
+
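+ # evaluation_detection returns per-IoU-threshold AP values; IoUmAP_5 is their mean and is the
+ # score train() uses to decide whether to refresh the "_ckp_best" checkpoint.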
83
+ def train(opt):
84
+ writer = SummaryWriter()
85
+ model = MYNET(opt).cuda()
86
+
87
+ rest_of_model_params = [param for name, param in model.named_parameters() if "history_unit" not in name]
88
+ optimizer = optim.Adam([{'params': model.history_unit.parameters(), 'lr': 1e-6}, {'params': rest_of_model_params}], lr=opt["lr"], weight_decay=opt["weight_decay"])
89
+ scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt["lr_step"])
90
+
91
+ train_dataset = VideoDataSet(opt, subset="train")
92
+ test_dataset = VideoDataSet(opt, subset=opt['inference_subset'])
93
+
94
+ warmup = False
95
+
96
+ for n_epoch in range(opt['epoch']):
97
+ if n_epoch >= 1:
98
+ warmup = False
99
+
100
+ n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip = train_one_epoch(opt, model, train_dataset, optimizer, warmup)
101
+
102
+ writer.add_scalars('data/cost', {'train': epoch_cost / (n_iter + 1)}, n_epoch)
103
+ print("training loss(epoch %d): %.03f, cls - %f, reg - %f, snip - %f, lr - %f" % (n_epoch,
104
+ epoch_cost / (n_iter + 1),
105
+ epoch_cost_cls / (n_iter + 1),
106
+ epoch_cost_reg / (n_iter + 1),
107
+ epoch_cost_snip / (n_iter + 1),
108
+ optimizer.param_groups[-1]["lr"]))
109
+
110
+ scheduler.step()
111
+ model.eval()
112
+
113
+ cls_loss, reg_loss, tot_loss, IoUmAP_5 = eval_one_epoch(opt, model, test_dataset)
114
+
115
+ writer.add_scalars('data/mAP', {'test': IoUmAP_5}, n_epoch)
116
+ print("testing loss(epoch %d): %.03f, cls - %f, reg - %f, mAP Avg - %f" % (n_epoch, tot_loss, cls_loss, reg_loss, IoUmAP_5))
117
+
118
+ state = {'epoch': n_epoch + 1, 'state_dict': model.state_dict()}
119
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_checkpoint_" + str(n_epoch + 1) + ".pth.tar")
120
+ if IoUmAP_5 > model.best_map:
121
+ model.best_map = IoUmAP_5
122
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_ckp_best.pth.tar")
123
+
124
+ model.train()
125
+
126
+ writer.close()
127
+ return model.best_map
128
+
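+ # The optimizer keeps `history_unit` parameters at a fixed 1e-6 learning rate while the rest of
+ # the model follows opt['lr'] with StepLR decay. Every epoch checkpoint is saved, and the
+ # "_ckp_best" copy is refreshed whenever the averaged detection mAP improves.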
129
+ def eval_frame(opt, model, dataset):
130
+ test_loader = torch.utils.data.DataLoader(dataset,
131
+ batch_size=opt['batch_size'], shuffle=False,
132
+ num_workers=0, pin_memory=True, drop_last=False)
133
+
134
+ labels_cls = {}
135
+ labels_reg = {}
136
+ output_cls = {}
137
+ output_reg = {}
138
+ for video_name in dataset.video_list:
139
+ labels_cls[video_name] = []
140
+ labels_reg[video_name] = []
141
+ output_cls[video_name] = []
142
+ output_reg[video_name] = []
143
+
144
+ start_time = time.time()
145
+ total_frames = 0
146
+ epoch_cost = 0
147
+ epoch_cost_cls = 0
148
+ epoch_cost_reg = 0
149
+
150
+ for n_iter, (input_data, cls_label, reg_label, _) in enumerate(tqdm(test_loader)):
151
+ act_cls, act_reg, _ = model(input_data.float().cuda())
152
+ cost_reg = 0
153
+ cost_cls = 0
154
+
155
+ loss = cls_loss_func(cls_label, act_cls)
156
+ cost_cls = loss
157
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
158
+
159
+ loss = regress_loss_func(reg_label, act_reg)
160
+ cost_reg = loss
161
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
162
+
163
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg
164
+ epoch_cost += cost.detach().cpu().numpy()
165
+
166
+ act_cls = torch.softmax(act_cls, dim=-1)
167
+
168
+ total_frames += input_data.size(0)
169
+
170
+ for b in range(0, input_data.size(0)):
171
+ video_name, st, ed, data_idx = dataset.inputs[n_iter * opt['batch_size'] + b]
172
+ output_cls[video_name] += [act_cls[b, :].detach().cpu().numpy()]
173
+ output_reg[video_name] += [act_reg[b, :].detach().cpu().numpy()]
174
+ labels_cls[video_name] += [cls_label[b, :].numpy()]
175
+ labels_reg[video_name] += [reg_label[b, :].numpy()]
176
+
177
+ end_time = time.time()
178
+ working_time = end_time - start_time
179
+
180
+ for video_name in dataset.video_list:
181
+ labels_cls[video_name] = np.stack(labels_cls[video_name], axis=0)
182
+ labels_reg[video_name] = np.stack(labels_reg[video_name], axis=0)
183
+ output_cls[video_name] = np.stack(output_cls[video_name], axis=0)
184
+ output_reg[video_name] = np.stack(output_reg[video_name], axis=0)
185
+
186
+ cls_loss = epoch_cost_cls / (n_iter + 1)  # average over n_iter + 1 batches
187
+ reg_loss = epoch_cost_reg / (n_iter + 1)
188
+ tot_loss = epoch_cost / (n_iter + 1)
189
+
190
+ return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames
191
+
192
+ def eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
193
+ result_dict = {}
194
+ proposal_dict = []
195
+
196
+ num_class = opt["num_of_class"]
197
+ unit_size = opt['segment_size']
198
+ threshold = opt['threshold']
199
+ anchors = opt['anchors']
200
+
201
+ for video_name in dataset.video_list:
202
+ duration = dataset.video_len[video_name]
203
+ video_time = float(dataset.video_dict[video_name]["duration"])
204
+ frame_to_time = 100.0 * video_time / duration
205
+
206
+ for idx in range(0, duration):
207
+ cls_anc = output_cls[video_name][idx]
208
+ reg_anc = output_reg[video_name][idx]
209
+
210
+ proposal_anc_dict = []
211
+ for anc_idx in range(0, len(anchors)):
212
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
213
+
214
+ if len(cls) == 0:
215
+ continue
216
+
217
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
218
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
219
+ st = ed - length
220
+
221
+ for cidx in range(0, len(cls)):
222
+ label = cls[cidx]
223
+ tmp_dict = {}
224
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
225
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
226
+ tmp_dict["label"] = dataset.label_name[label]
227
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
228
+ proposal_anc_dict.append(tmp_dict)
229
+
230
+ proposal_dict += proposal_anc_dict
231
+
232
+ proposal_dict = non_max_suppression(proposal_dict, overlapThresh=opt['soft_nms'])
233
+ result_dict[video_name] = proposal_dict
234
+ proposal_dict = []
235
+
236
+ return result_dict
237
+
238
+ def eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
239
+ model = SuppressNet(opt).cuda()
240
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
241
+ base_dict = checkpoint['state_dict']
242
+ model.load_state_dict(base_dict)
243
+ model.eval()
244
+
245
+ result_dict = {}
246
+ proposal_dict = []
247
+
248
+ num_class = opt["num_of_class"]
249
+ unit_size = opt['segment_size']
250
+ threshold = opt['threshold']
251
+ anchors = opt['anchors']
252
+
253
+ for video_name in dataset.video_list:
254
+ duration = dataset.video_len[video_name]
255
+ video_time = float(dataset.video_dict[video_name]["duration"])
256
+ frame_to_time = 100.0 * video_time / duration
257
+ conf_queue = torch.zeros((unit_size, num_class - 1))
258
+
259
+ for idx in range(0, duration):
260
+ cls_anc = output_cls[video_name][idx]
261
+ reg_anc = output_reg[video_name][idx]
262
+
263
+ proposal_anc_dict = []
264
+ for anc_idx in range(0, len(anchors)):
265
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
266
+
267
+ if len(cls) == 0:
268
+ continue
269
+
270
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
271
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
272
+ st = ed - length
273
+
274
+ for cidx in range(0, len(cls)):
275
+ label = cls[cidx]
276
+ tmp_dict = {}
277
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
278
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
279
+ tmp_dict["label"] = dataset.label_name[label]
280
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
281
+ proposal_anc_dict.append(tmp_dict)
282
+
283
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
284
+
285
+ conf_queue[:-1, :] = conf_queue[1:, :].clone()
286
+ conf_queue[-1, :] = 0
287
+ for proposal in proposal_anc_dict:
288
+ cls_idx = dataset.label_name.index(proposal['label'])
289
+ conf_queue[-1, cls_idx] = proposal["score"]
290
+
291
+ minput = conf_queue.unsqueeze(0)
292
+ suppress_conf = model(minput.cuda())
293
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
294
+
295
+ for cls in range(0, num_class - 1):
296
+ if suppress_conf[cls] > opt['sup_threshold']:
297
+ for proposal in proposal_anc_dict:
298
+ if proposal['label'] == dataset.label_name[cls]:
299
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
300
+ proposal_dict.append(proposal)
301
+
302
+ result_dict[video_name] = proposal_dict
303
+ proposal_dict = []
304
+
305
+ return result_dict
306
+
307
+ def test_frame(opt, video_name=None):
308
+ model = MYNET(opt).cuda()
309
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
310
+ base_dict = checkpoint['state_dict']
311
+ model.load_state_dict(base_dict)
312
+ model.eval()
313
+
314
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
315
+ outfile = h5py.File(opt['frame_result_file'].format(opt['exp']), 'w')
316
+
317
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
318
+
319
+ print("testing loss: %f, cls_loss: %f, reg_loss: %f" % (tot_loss, cls_loss, reg_loss))
320
+
321
+ for video_name in dataset.video_list:
322
+ o_cls = output_cls[video_name]
323
+ o_reg = output_reg[video_name]
324
+ l_cls = labels_cls[video_name]
325
+ l_reg = labels_reg[video_name]
326
+
327
+ dset_predcls = outfile.create_dataset(video_name + '/pred_cls', o_cls.shape, maxshape=o_cls.shape, chunks=True, dtype=np.float32)
328
+ dset_predcls[:, :] = o_cls[:, :]
329
+ dset_predreg = outfile.create_dataset(video_name + '/pred_reg', o_reg.shape, maxshape=o_reg.shape, chunks=True, dtype=np.float32)
330
+ dset_predreg[:, :] = o_reg[:, :]
331
+ dset_labelcls = outfile.create_dataset(video_name + '/label_cls', l_cls.shape, maxshape=l_cls.shape, chunks=True, dtype=np.float32)
332
+ dset_labelcls[:, :] = l_cls[:, :]
333
+ dset_labelreg = outfile.create_dataset(video_name + '/label_reg', l_reg.shape, maxshape=l_reg.shape, chunks=True, dtype=np.float32)
334
+ dset_labelreg[:, :] = l_reg[:, :]
335
+ outfile.close()
336
+
337
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
338
+ return cls_loss, reg_loss, tot_loss
339
+
340
+ def patch_attention(m):
341
+ forward_orig = m.forward
342
+
343
+ def wrap(*args, **kwargs):
344
+ kwargs["need_weights"] = True
345
+ kwargs["average_attn_weights"] = False
346
+ return forward_orig(*args, **kwargs)
347
+
348
+ m.forward = wrap
349
+
350
+ class SaveOutput:
351
+ def __init__(self):
352
+ self.outputs = []
353
+
354
+ def __call__(self, module, module_in, module_out):
355
+ self.outputs.append(module_out[1])
356
+
357
+ def clear(self):
358
+ self.outputs = []
359
+
360
+ def test(opt, video_name=None):
361
+ model = MYNET(opt).cuda()
362
+ checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "_ckp_best.pth.tar")
363
+ base_dict = checkpoint['state_dict']
364
+ model.load_state_dict(base_dict)
365
+ model.eval()
366
+
367
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
368
+
369
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
370
+
371
+ if opt["pptype"] == "nms":
372
+ result_dict = eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
373
+ if opt["pptype"] == "net":
374
+ result_dict = eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
375
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
376
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
377
+ json.dump(output_dict, outfile, indent=2)
378
+ outfile.close()
379
+
380
+ mAP = evaluation_detection(opt)
381
+
382
+ # New: Compare predicted and ground truth action lengths
383
+ if video_name:
384
+ print("\nComparing Predicted and Ground Truth Action Lengths for Video:", video_name)
385
+ # Load ground truth annotations
386
+ with open(opt["video_anno"].format(opt["split"]), 'r') as f:
387
+ anno_data = json.load(f)
388
+ gt_annotations = anno_data['database'][video_name]['annotations']
389
+
390
+ # Extract ground truth segments
391
+ gt_segments = []
392
+ for anno in gt_annotations:
393
+ start, end = anno['segment']
394
+ label = anno['label']
395
+ duration = end - start
396
+ gt_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration})
397
+
398
+ # Extract predicted segments from result_dict
399
+ pred_segments = []
400
+ for pred in result_dict[video_name]:
401
+ start, end = pred['segment']
402
+ label = pred['label']
403
+ score = pred['score']
404
+ duration = end - start
405
+ pred_segments.append({'label': label, 'start': start, 'end': end, 'duration': duration, 'score': score})
406
+
407
+ # Match predictions to ground truth using IoU
408
+ matches = []
409
+ iou_threshold = 0.3 # Same as evaluation default for matching
410
+ used_gt_indices = set()
411
+ for pred in pred_segments:
412
+ best_iou = 0
413
+ best_gt_idx = None
414
+ for gt_idx, gt in enumerate(gt_segments):
415
+ if gt_idx in used_gt_indices:
416
+ continue
417
+ iou = calc_iou([pred['end'], pred['duration']], [gt['end'], gt['duration']])
418
+ if iou > best_iou and iou >= iou_threshold:
419
+ best_iou = iou
420
+ best_gt_idx = gt_idx
421
+ if best_gt_idx is not None:
422
+ matches.append({
423
+ 'pred': pred,
424
+ 'gt': gt_segments[best_gt_idx],
425
+ 'iou': best_iou
426
+ })
427
+ used_gt_indices.add(best_gt_idx)
428
+ else:
429
+ matches.append({'pred': pred, 'gt': None, 'iou': 0})
430
+
431
+ # Include unmatched ground truth segments
432
+ for gt_idx, gt in enumerate(gt_segments):
433
+ if gt_idx not in used_gt_indices:
434
+ matches.append({'pred': None, 'gt': gt, 'iou': 0})
435
+
436
+ # Print comparison table
437
+ print("\n{:<20} {:<30} {:<30} {:<15} {:<10}".format(
438
+ "Action Label", "Predicted Segment (s)", "Ground Truth Segment (s)", "Duration Diff (s)", "IoU"))
439
+ print("-" * 105)
440
+ for match in matches:
441
+ pred = match['pred']
442
+ gt = match['gt']
443
+ iou = match['iou']
444
+ if pred and gt:
445
+ label = pred['label'] if pred['label'] == gt['label'] else f"{pred['label']} (GT: {gt['label']})"
446
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
447
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
448
+ duration_diff = pred['duration'] - gt['duration']
449
+ print("{:<20} {:<30} {:<30} {:<15.2f} {:<10.2f}".format(
450
+ label, pred_str, gt_str, duration_diff, iou))
451
+ elif pred:
452
+ pred_str = f"[{pred['start']:.2f}, {pred['end']:.2f}] ({pred['duration']:.2f}s)"
453
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
454
+ pred['label'], pred_str, "None", "N/A", iou))
455
+ elif gt:
456
+ gt_str = f"[{gt['start']:.2f}, {gt['end']:.2f}] ({gt['duration']:.2f}s)"
457
+ print("{:<20} {:<30} {:<30} {:<15} {:<10.2f}".format(
458
+ gt['label'], "None", gt_str, "N/A", iou))
459
+
460
+ # Summarize
461
+ matched_count = sum(1 for m in matches if m['pred'] and m['gt'])
462
+ avg_duration_diff = np.mean([m['pred']['duration'] - m['gt']['duration'] for m in matches if m['pred'] and m['gt']]) if matched_count else 0.0
463
+ avg_iou = np.mean([m['iou'] for m in matches if m['iou'] > 0]) if any(m['iou'] > 0 for m in matches) else 0.0
464
+ print(f"\nSummary:")
465
+ print(f"- Total Predictions: {len(pred_segments)}")
466
+ print(f"- Total Ground Truth: {len(gt_segments)}")
467
+ print(f"- Matched Segments: {matched_count}")
468
+ print(f"- Average Duration Difference (Matched): {avg_duration_diff:.2f}s")
469
+ print(f"- Average IoU (Matched): {avg_iou:.2f}")
470
+
471
+ return mAP
472
+
473
+ def test_online(opt, video_name=None):
474
+ model = MYNET(opt).cuda()
475
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
476
+ base_dict = checkpoint['state_dict']
477
+ model.load_state_dict(base_dict)
478
+ model.eval()
479
+
480
+ sup_model = SuppressNet(opt).cuda()
481
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
482
+ base_dict = checkpoint['state_dict']
483
+ sup_model.load_state_dict(base_dict)
484
+ sup_model.eval()
485
+
486
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
487
+ test_loader = torch.utils.data.DataLoader(dataset,
488
+ batch_size=1, shuffle=False,
489
+ num_workers=0, pin_memory=True, drop_last=False)
490
+
491
+ result_dict = {}
492
+ proposal_dict = []
493
+
494
+ num_class = opt["num_of_class"]
495
+ unit_size = opt['segment_size']
496
+ threshold = opt['threshold']
497
+ anchors = opt['anchors']
498
+
499
+ start_time = time.time()
500
+ total_frames = 0
501
+
502
+ for video_name in dataset.video_list:
503
+ input_queue = torch.zeros((unit_size, opt['feat_dim']))
504
+ sup_queue = torch.zeros((unit_size, num_class - 1))
505
+
506
+ duration = dataset.video_len[video_name]
507
+ video_time = float(dataset.video_dict[video_name]["duration"])
508
+ frame_to_time = 100.0 * video_time / duration
509
+
510
+ for idx in range(0, duration):
511
+ total_frames += 1
512
+ input_queue[:-1, :] = input_queue[1:, :].clone()
513
+ input_queue[-1:, :] = dataset._get_base_data(video_name, idx, idx + 1)
514
+
515
+ minput = input_queue.unsqueeze(0)
516
+ act_cls, act_reg, _ = model(minput.cuda())
517
+ act_cls = torch.softmax(act_cls, dim=-1)
518
+
519
+ cls_anc = act_cls.squeeze(0).detach().cpu().numpy()
520
+ reg_anc = act_reg.squeeze(0).detach().cpu().numpy()
521
+
522
+ proposal_anc_dict = []
523
+ for anc_idx in range(0, len(anchors)):
524
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
525
+
526
+ if len(cls) == 0:
527
+ continue
528
+
529
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
530
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
531
+ st = ed - length
532
+
533
+ for cidx in range(0, len(cls)):
534
+ label = cls[cidx]
535
+ tmp_dict = {}
536
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
537
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
538
+ tmp_dict["label"] = dataset.label_name[label]
539
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
540
+ proposal_anc_dict.append(tmp_dict)
541
+
542
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
543
+
544
+ sup_queue[:-1, :] = sup_queue[1:, :].clone()
545
+ sup_queue[-1, :] = 0
546
+ for proposal in proposal_anc_dict:
547
+ cls_idx = dataset.label_name.index(proposal['label'])
548
+ sup_queue[-1, cls_idx] = proposal["score"]
549
+
550
+ minput = sup_queue.unsqueeze(0)
551
+ suppress_conf = sup_model(minput.cuda())
552
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
553
+
554
+ for cls in range(0, num_class - 1):
555
+ if suppress_conf[cls] > opt['sup_threshold']:
556
+ for proposal in proposal_anc_dict:
557
+ if proposal['label'] == dataset.label_name[cls]:
558
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
559
+ proposal_dict.append(proposal)
560
+
561
+ result_dict[video_name] = proposal_dict
562
+ proposal_dict = []
563
+
564
+ end_time = time.time()
565
+ working_time = end_time - start_time
566
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
567
+
568
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
569
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
570
+ json.dump(output_dict, outfile, indent=2)
571
+ outfile.close()
572
+
573
+ mAP = evaluation_detection(opt)
574
+ return mAP
575
+
576
+ def main(opt, video_name=None):
577
+ max_perf = 0
578
+ if not video_name and 'video_name' in opt:
579
+ video_name = opt['video_name']
580
+
581
+ if opt['mode'] == 'train':
582
+ max_perf = train(opt)
583
+ if opt['mode'] == 'test':
584
+ max_perf = test(opt, video_name=video_name)
585
+ if opt['mode'] == 'test_frame':
586
+ max_perf = test_frame(opt, video_name=video_name)
587
+ if opt['mode'] == 'test_online':
588
+ max_perf = test_online(opt, video_name=video_name)
589
+ if opt['mode'] == 'eval':
590
+ max_perf = evaluation_detection(opt)
591
+
592
+ return max_perf
593
+
594
+ if __name__ == '__main__':
595
+ opt = opts.parse_opt()
596
+ opt = vars(opt)
597
+ if not os.path.exists(opt["checkpoint_path"]):
598
+ os.makedirs(opt["checkpoint_path"])
599
+ opt_file = open(opt["checkpoint_path"] + "/" + opt["exp"] + "_opts.json", "w")
600
+ json.dump(opt, opt_file)
601
+ opt_file.close()
602
+
603
+ if opt['seed'] >= 0:
604
+ seed = opt['seed']
605
+ torch.manual_seed(seed)
606
+ np.random.seed(seed)
607
+
608
+ opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]
609
+
610
+ video_name = opt.get('video_name', None)
611
+ main(opt, video_name=video_name)
612
+ while(opt['wterm']):
613
+ pass
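For reference, a minimal, self-contained sketch of the anchor decoding and the (end, length) IoU convention used in the post-processing above; all numbers are made up and independent of the uploaded checkpoints.

import numpy as np

# Toy values standing in for one anchor's outputs at frame idx.
idx, anchor = 120, 8              # current frame index and anchor length (in frames)
reg = np.array([0.25, 0.40])      # predicted (end offset, log length ratio) for this anchor

ed = idx + anchor * reg[0]        # predicted end frame
length = anchor * np.exp(reg[1])  # predicted length in frames
st = ed - length                  # predicted start frame

def iou_end_len(a, b):
    # Same (end, length) box convention as calc_iou in dataset.py.
    st_a, ed_a = a[0] - a[1], a[0]
    st_b, ed_b = b[0] - b[1], b[0]
    inter = min(ed_a, ed_b) - max(st_a, st_b)
    union = max(ed_a, ed_b) - min(st_a, st_b)
    return inter / max(union, 1)

gt = (123.0, 12.0)                # hypothetical ground-truth (end, length)
print(round(st, 2), round(ed, 2), round(iou_end_len((ed, length), gt), 2))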
dataset.py ADDED
@@ -0,0 +1,533 @@
1
+ import numpy as np
2
+ import h5py
3
+ import json
4
+ import torch
5
+ import torch.utils.data as data
6
+ import os
7
+ import pickle
8
+ from multiprocessing import Pool
9
+
10
+ def load_json(file):
11
+ with open(file) as json_file:
12
+ data = json.load(json_file)
13
+ return data
14
+
15
+ def calc_iou(a, b):
16
+ st = a[0] - a[1]
17
+ ed = a[0]
18
+ target_st = b[0] - b[1]
19
+ target_ed = b[0]
20
+ sst = min(st, target_st)
21
+ led = max(ed, target_ed)
22
+ lst = max(st, target_st)
23
+ sed = min(ed, target_ed)
24
+ iou = (sed - lst) / max(led - sst, 1)
25
+ return iou
26
+
27
+ def box_include(y, target):
28
+ st = y[0] - y[1]
29
+ ed = y[0]
30
+ target_st = target[0] - target[1]
31
+ target_ed = target[0]
32
+ detection_point = target_st
33
+ if ed > detection_point and target_st < st and target_ed > ed:
34
+ return True
35
+ return False
36
+
37
+ class VideoDataSet(data.Dataset):
38
+ def __init__(self, opt, subset="train", video_name=None):
39
+ self.subset = subset
40
+ self.mode = opt["mode"]
41
+ self.predefined_fps = opt["predefined_fps"]
42
+ self.video_anno_path = opt["video_anno"].format(opt["split"])
43
+ self.video_len_path = opt["video_len_file"].format(self.subset + '_' + opt["setup"])
44
+ self.num_of_class = opt["num_of_class"]
45
+ self.segment_size = opt["segment_size"]
46
+ self.label_name = []
47
+ self.match_score = {}
48
+ self.match_score_end = {}
49
+ self.match_length = {}
50
+ self.gt_action = {}
51
+ self.cls_label = {}
52
+ self.reg_label = {}
53
+ self.snip_label = {}
54
+ self.inputs = []
55
+ self.inputs_all = []
56
+ self.data_rescale = opt["data_rescale"]
57
+ self.anchors = opt["anchors"]
58
+ self.pos_threshold = opt["pos_threshold"]
59
+ self.single_video_name = video_name
60
+
61
+ self._getDatasetDict()
62
+ self._loadFeaturelen(opt)
63
+ self._getMatchScore()
64
+ self._makeInputSeq()
65
+ self._loadPropLabel(opt['proposal_label_file'].format(self.subset + '_' + opt["setup"]))
66
+
67
+ if self.subset == "train":
68
+ if opt['data_format'] == "h5":
69
+ feature_rgb_file = h5py.File(opt["video_feature_rgb_train"], 'r')
70
+ self.feature_rgb_file = {}
71
+ keys = self.video_list
72
+ for vidx in range(len(keys)):
73
+ if keys[vidx] not in feature_rgb_file:
74
+ raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_rgb_train']}")
75
+ self.feature_rgb_file[keys[vidx]] = np.array(feature_rgb_file[keys[vidx]][:])
76
+ if opt['rgb_only']:
77
+ self.feature_flow_file = None
78
+ else:
79
+ self.feature_flow_file = {}
80
+ feature_flow_file = h5py.File(opt["video_feature_flow_train"], 'r')
81
+ for vidx in range(len(keys)):
82
+ if keys[vidx] not in feature_flow_file:
83
+ raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_flow_train']}")
84
+ self.feature_flow_file[keys[vidx]] = np.array(feature_flow_file[keys[vidx]][:])
85
+ elif opt['data_format'] == "pickle":
86
+ feature_All = pickle.load(open(opt["video_feature_all_train"], 'rb'))
87
+ self.feature_rgb_file = {}
88
+ self.feature_flow_file = {}
89
+ keys = self.video_list
90
+ for vidx in range(len(keys)):
91
+ if keys[vidx] not in feature_All:
92
+ raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_all_train']}")
93
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
94
+ self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
95
+ elif opt['data_format'] == "npz":
96
+ feature_All = {}
97
+ self.feature_rgb_file = {}
98
+ self.feature_flow_file = {}
99
+ for file in self.video_list:
100
+ feature_path = opt["video_feature_all_train"] + file + '.npz'
101
+ if not os.path.exists(feature_path):
102
+ raise ValueError(f"Feature file {feature_path} not found")
103
+ feature_All[file] = np.load(feature_path)['feats']
104
+ keys = self.video_list
105
+ for vidx in range(len(keys)):
106
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
107
+ self.feature_flow_file = None
108
+ elif opt['data_format'] == "npz_i3d":
109
+ feature_All = {}
110
+ self.feature_rgb_file = {}
111
+ self.feature_flow_file = {}
112
+ for file in self.video_list:
113
+ feature_path = opt["video_feature_all_train"] + file + '.npz'
114
+ if not os.path.exists(feature_path):
115
+ raise ValueError(f"Feature file {feature_path} not found")
116
+ feature_All[file] = np.load(feature_path)
117
+ keys = self.video_list
118
+ for vidx in range(len(keys)):
119
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
120
+ self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
121
+ elif opt['data_format'] == "pt":
122
+ feature_All = {}
123
+ self.feature_rgb_file = {}
124
+ self.feature_flow_file = {}
125
+ for file in self.video_list:
126
+ feature_path = opt["video_feature_all_train"] + file + '.pt'
127
+ if not os.path.exists(feature_path):
128
+ raise ValueError(f"Feature file {feature_path} not found")
129
+ feature_All[file] = torch.load(feature_path)
130
+ keys = self.video_list
131
+ for vidx in range(len(keys)):
132
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
133
+ self.feature_flow_file = None
134
+ else:
135
+ if opt['data_format'] == "h5":
136
+ feature_rgb_file = h5py.File(opt["video_feature_rgb_test"], 'r')
137
+ self.feature_rgb_file = {}
138
+ keys = self.video_list
139
+ for vidx in range(len(keys)):
140
+ if keys[vidx] not in feature_rgb_file:
141
+ raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_rgb_test']}")
142
+ self.feature_rgb_file[keys[vidx]] = np.array(feature_rgb_file[keys[vidx]][:])
143
+ if opt['rgb_only']:
144
+ self.feature_flow_file = None
145
+ else:
146
+ self.feature_flow_file = {}
147
+ feature_flow_file = h5py.File(opt["video_feature_flow_test"], 'r')
148
+ for vidx in range(len(keys)):
149
+ if keys[vidx] not in feature_flow_file:
150
+ raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_flow_test']}")
151
+ self.feature_flow_file[keys[vidx]] = np.array(feature_flow_file[keys[vidx]][:])
152
+ elif opt['data_format'] == "pickle":
153
+ feature_All = pickle.load(open(opt["video_feature_all_test"], 'rb'))
154
+ self.feature_rgb_file = {}
155
+ self.feature_flow_file = {}
156
+ keys = self.video_list
157
+ for vidx in range(len(keys)):
158
+ if keys[vidx] not in feature_All:
159
+ raise ValueError(f"Features for video {keys[vidx]} not found in {opt['video_feature_all_test']}")
160
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
161
+ self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
162
+ elif opt['data_format'] == "npz":
163
+ feature_All = {}
164
+ self.feature_rgb_file = {}
165
+ self.feature_flow_file = {}
166
+ for file in self.video_list:
167
+ feature_path = opt["video_feature_all_test"] + file + '.npz'
168
+ if not os.path.exists(feature_path):
169
+ raise ValueError(f"Feature file {feature_path} not found")
170
+ feature_All[file] = np.load(feature_path)['feats']
171
+ keys = self.video_list
172
+ for vidx in range(len(keys)):
173
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
174
+ self.feature_flow_file = None
175
+ elif opt['data_format'] == "npz_i3d":
176
+ feature_All = {}
177
+ self.feature_rgb_file = {}
178
+ self.feature_flow_file = {}
179
+ for file in self.video_list:
180
+ feature_path = opt["video_feature_all_test"] + file + '.npz'
181
+ if not os.path.exists(feature_path):
182
+ raise ValueError(f"Feature file {feature_path} not found")
183
+ feature_All[file] = np.load(feature_path)
184
+ keys = self.video_list
185
+ for vidx in range(len(keys)):
186
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]]['rgb']
187
+ self.feature_flow_file[keys[vidx]] = feature_All[keys[vidx]]['flow']
188
+ elif opt['data_format'] == "pt":
189
+ feature_All = {}
190
+ self.feature_rgb_file = {}
191
+ self.feature_flow_file = {}
192
+ for file in self.video_list:
193
+ feature_path = opt["video_feature_all_test"] + file + '.pt'
194
+ if not os.path.exists(feature_path):
195
+ raise ValueError(f"Feature file {feature_path} not found")
196
+ feature_All[file] = torch.load(feature_path)
197
+ keys = self.video_list
198
+ for vidx in range(len(keys)):
199
+ self.feature_rgb_file[keys[vidx]] = feature_All[keys[vidx]][:]
200
+ self.feature_flow_file = None
201
+
202
+ def _loadFeaturelen(self, opt):
203
+ if os.path.exists(self.video_len_path):
204
+ self.video_len = load_json(self.video_len_path)
205
+ return
206
+
207
+ self.video_len = {}
208
+ if self.subset == "train":
209
+ if opt['data_format'] == "h5":
210
+ feature_file = h5py.File(opt["video_feature_rgb_train"], 'r')
211
+ elif opt['data_format'] == "pickle":
212
+ feature_file = pickle.load(open(opt["video_feature_all_train"], 'rb'))
213
+ elif opt['data_format'] == "npz":
214
+ feature_file = {}
215
+ for file in self.video_list:
216
+ feature_file[file] = np.load(opt["video_feature_all_train"] + file + '.npz')['feats']
217
+ elif opt['data_format'] == "npz_i3d":
218
+ feature_file = {}
219
+ for file in self.video_list:
220
+ feature_file[file] = np.load(opt["video_feature_all_train"] + file + '.npz')
221
+ elif opt['data_format'] == "pt":
222
+ feature_file = {}
223
+ for file in self.video_list:
224
+ feature_file[file] = torch.load(opt["video_feature_all_train"] + file + '.pt')
225
+ else:
226
+ if opt['data_format'] == "h5":
227
+ feature_file = h5py.File(opt["video_feature_rgb_test"], 'r')
228
+ elif opt['data_format'] == "pickle":
229
+ feature_file = pickle.load(open(opt["video_feature_all_test"], 'rb'))
230
+ elif opt['data_format'] == "npz":
231
+ feature_file = {}
232
+ for file in self.video_list:
233
+ feature_file[file] = np.load(opt["video_feature_all_test"] + file + '.npz')['feats']
234
+ elif opt['data_format'] == "npz_i3d":
235
+ feature_file = {}
236
+ for file in self.video_list:
237
+ feature_file[file] = np.load(opt["video_feature_all_test"] + file + '.npz')
238
+ elif opt['data_format'] == "pt":
239
+ feature_file = {}
240
+ for file in self.video_list:
241
+ feature_file[file] = torch.load(opt["video_feature_all_test"] + file + '.pt')
242
+
243
+ keys = self.video_list
244
+ if opt['data_format'] == "h5":
245
+ for vidx in range(len(keys)):
246
+ self.video_len[keys[vidx]] = len(feature_file[keys[vidx]])
247
+ elif opt['data_format'] == "pickle":
248
+ for vidx in range(len(keys)):
249
+ self.video_len[keys[vidx]] = len(feature_file[keys[vidx]]['rgb'])
250
+ elif opt['data_format'] == "npz":
251
+ for vidx in range(len(keys)):
252
+ self.video_len[keys[vidx]] = len(feature_file[keys[vidx]])
253
+ elif opt['data_format'] == "npz_i3d":
254
+ for vidx in range(len(keys)):
255
+ self.video_len[keys[vidx]] = len(feature_file[keys[vidx]]['rgb'])
256
+ elif opt['data_format'] == "pt":
257
+ for vidx in range(len(keys)):
258
+ self.video_len[keys[vidx]] = len(feature_file[keys[vidx]])
259
+ outfile = open(self.video_len_path, "w")
260
+ json.dump(self.video_len, outfile, indent=2)
261
+ outfile.close()
262
+
263
+ def _getDatasetDict(self):
264
+ anno_database = load_json(self.video_anno_path)
265
+ anno_database = anno_database['database']
266
+ self.video_dict = {}
267
+ if self.single_video_name:
268
+ if self.single_video_name in anno_database:
269
+ video_info = anno_database[self.single_video_name]
270
+ video_subset = video_info['subset']
271
+ if self.subset == "full" or self.subset in video_subset:
272
+ self.video_dict[self.single_video_name] = video_info
273
+ for seg in video_info['annotations']:
274
+ if not seg['label'] in self.label_name:
275
+ self.label_name.append(seg['label'])
276
+ else:
277
+ raise ValueError(f"Video {self.single_video_name} not found in annotation database")
278
+ else:
279
+ for video_name in anno_database:
280
+ video_info = anno_database[video_name]
281
+ video_subset = anno_database[video_name]['subset']
282
+ if self.subset == "full" or self.subset in video_subset:
283
+ self.video_dict[video_name] = video_info
284
+ for seg in video_info['annotations']:
285
+ if not seg['label'] in self.label_name:
286
+ self.label_name.append(seg['label'])
287
+
288
+ # Ensure all 22 EGTEA action classes are included
289
+ expected_labels = [
290
+ 'Clean/Wipe', 'Close', 'Compress', 'Crack', 'Cut', 'Divide/Pull Apart',
291
+ 'Dry', 'Inspect/Read', 'Mix', 'Move Around', 'Open', 'Operate', 'Other',
292
+ 'Pour', 'Put', 'Squeeze', 'Take', 'Transfer', 'Turn off', 'Turn on', 'Wash',
293
+ 'Spread' # Assumed missing label; replace with actual label if known
294
+ ]
295
+ for label in expected_labels:
296
+ if label not in self.label_name:
297
+ self.label_name.append(label)
298
+
299
+ self.label_name.sort()
300
+ self.video_list = list(self.video_dict.keys())
301
+ print(f"Labels in dataset.label_name: {self.label_name}")
302
+ print(f"Number of labels: {len(self.label_name)}, Expected: {self.num_of_class-1}")
303
+ print(f"{self.subset} subset video numbers: {len(self.video_list)}")
304
+
305
+ def _getMatchScore(self):
306
+ self.action_end_count = torch.zeros(2)
307
+ for index in range(0, len(self.video_list)):
308
+ video_name = self.video_list[index]
309
+ video_info = self.video_dict[video_name]
310
+ video_labels = video_info['annotations']
311
+ gt_bbox = []
312
+ gt_edlen = []
313
+
314
+ second_to_frame = self.video_len[video_name] / float(video_info['duration'])
315
+ for j in range(len(video_labels)):
316
+ tmp_info = video_labels[j]
317
+ tmp_start = tmp_info['segment'][0] * second_to_frame
318
+ tmp_end = tmp_info['segment'][1] * second_to_frame
319
+ tmp_label = self.label_name.index(tmp_info['label'])
320
+ gt_bbox.append([tmp_start, tmp_end, tmp_label])
321
+ gt_edlen.append([gt_bbox[-1][1], gt_bbox[-1][1] - gt_bbox[-1][0], tmp_label])
322
+
323
+ gt_bbox = np.array(gt_bbox)
324
+ gt_edlen = np.array(gt_edlen)
325
+ self.gt_action[video_name] = gt_edlen
326
+
327
+ match_score = np.zeros((self.video_len[video_name], self.num_of_class - 1), dtype=np.float32)
328
+ for idx in range(gt_bbox.shape[0]):
329
+ ed = int(gt_bbox[idx, 1]) + 1
330
+ st = int(gt_bbox[idx, 0])
331
+ match_score[st:ed, int(gt_bbox[idx, 2])] = idx + 1
332
+ self.match_score[video_name] = match_score
333
+
334
+ def _makeInputSeq(self):
335
+ data_idx = 0
336
+ for index in range(0, len(self.video_list)):
337
+ video_name = self.video_list[index]
338
+ duration = self.match_score[video_name].shape[0]
339
+ for i in range(1, duration + 1):
340
+ st = i - self.segment_size
341
+ ed = i
342
+ self.inputs_all.append([video_name, st, ed, data_idx])
343
+ data_idx += 1
344
+
345
+ self.inputs = self.inputs_all.copy()
346
+ print(f"{self.subset} subset seg numbers: {len(self.inputs)}")
347
+
348
+ def _makePropLabelUnit(self, i):
349
+ video_name = self.inputs_all[i][0]
350
+ st = self.inputs_all[i][1]
351
+ ed = self.inputs_all[i][2]
352
+ cls_anc = []
353
+ reg_anc = []
354
+
355
+ for j in range(0, len(self.anchors)):
356
+ v1 = np.zeros(self.num_of_class)
357
+ v1[-1] = 1
358
+ v2 = np.zeros(2)
359
+ v2[-1] = -1e3
360
+ y_box = [ed - 1, self.anchors[j]]
361
+
362
+ subset_label = self._get_train_label_with_class(video_name, ed - self.anchors[j], ed)
363
+ idx_list = []
364
+ for ii in range(0, subset_label.shape[0]):
365
+ for jj in range(0, subset_label.shape[1]):
366
+ idx = int(subset_label[ii, jj])
367
+ if idx > 0 and idx - 1 not in idx_list:
368
+ idx_list.append(idx - 1)
369
+
370
+ for idx in idx_list:
371
+ target_box = self.gt_action[video_name][idx]
372
+ cls = int(target_box[2])
373
+ iou = calc_iou(y_box, target_box)
374
+ if iou >= self.pos_threshold or (j == len(self.anchors) - 1 and box_include(y_box, target_box)) or (j == 0 and box_include(target_box, y_box)):
375
+ v1[cls] = 1
376
+ v1[-1] = 0
377
+ v2[0] = 1.0 * (target_box[0] - y_box[0]) / self.anchors[j]
378
+ v2[1] = np.log(1.0 * max(1, target_box[1]) / y_box[1])
379
+
380
+ cls_anc.append(v1)
381
+ reg_anc.append(v2)
382
+
383
+ v0 = np.zeros(self.num_of_class)
384
+ v0[-1] = 1
385
+ segment_size = ed - st
386
+ y_box = [ed - 1, self.anchors[-1]]
387
+ subset_label = self._get_train_label_with_class(video_name, ed - self.anchors[-1], ed)
388
+ idx_list = []
389
+ for ii in range(0, subset_label.shape[0]):
390
+ for jj in range(0, subset_label.shape[1]):
391
+ idx = int(subset_label[ii, jj])
392
+ if idx > 0 and idx - 1 not in idx_list:
393
+ idx_list.append(idx - 1)
394
+
395
+ for idx in idx_list:
396
+ target_box = self.gt_action[video_name][idx]
397
+ cls = int(target_box[2])
398
+ iou = calc_iou(y_box, target_box)
399
+ if iou >= 0:
400
+ v0[cls] = 1
401
+ v0[-1] = 0
402
+
403
+ cls_anc = np.stack(cls_anc, axis=0)
404
+ reg_anc = np.stack(reg_anc, axis=0)
405
+ cls_snip = np.array(v0)
406
+ return cls_anc, reg_anc, cls_snip
407
+
408
+ def _loadPropLabel(self, filename):
409
+ if os.path.exists(filename):
410
+ prop_label_file = h5py.File(filename, 'r')
411
+ self.cls_label = np.array(prop_label_file['cls_label'][:])
412
+ self.reg_label = np.array(prop_label_file['reg_label'][:])
413
+ self.snip_label = np.array(prop_label_file['snip_label'][:])
414
+ prop_label_file.close()
415
+ self.action_frame_count = np.sum(self.cls_label.reshape((-1, self.cls_label.shape[-1])), axis=0)
416
+ self.action_frame_count = torch.Tensor(self.action_frame_count)
417
+ return
418
+
419
+ pool = Pool(os.cpu_count() // 2)
420
+ labels = pool.map(self._makePropLabelUnit, range(0, len(self.inputs_all)))
421
+ pool.close()
422
+ pool.join()
423
+
424
+ cls_label = []
425
+ reg_label = []
426
+ snip_label = []
427
+ for i in range(0, len(labels)):
428
+ cls_label.append(labels[i][0])
429
+ reg_label.append(labels[i][1])
430
+ snip_label.append(labels[i][2])
431
+ self.cls_label = np.stack(cls_label, axis=0)
432
+ self.reg_label = np.stack(reg_label, axis=0)
433
+ self.snip_label = np.stack(snip_label, axis=0)
434
+
435
+ outfile = h5py.File(filename, 'w')
436
+ dset_cls = outfile.create_dataset('/cls_label', self.cls_label.shape, maxshape=self.cls_label.shape, chunks=True, dtype=np.float32)
437
+ dset_cls[:, :] = self.cls_label[:, :]
438
+ dset_reg = outfile.create_dataset('/reg_label', self.reg_label.shape, maxshape=self.reg_label.shape, chunks=True, dtype=np.float32)
439
+ dset_reg[:, :] = self.reg_label[:, :]
440
+ dset_snip = outfile.create_dataset('/snip_label', self.snip_label.shape, maxshape=self.snip_label.shape, chunks=True, dtype=np.float32)
441
+ dset_snip[:, :] = self.snip_label[:, :]
442
+ outfile.close()
443
+
444
+ return
445
+
446
+ def __getitem__(self, index):
447
+ video_name, st, ed, data_idx = self.inputs[index]
448
+ if st >= 0:
449
+ feature = self._get_base_data(video_name, st, ed)
450
+ else:
451
+ feature = self._get_base_data(video_name, 0, ed)
452
+ padfunc2d = torch.nn.ConstantPad2d((0, 0, -st, 0), 0)
453
+ feature = padfunc2d(feature)
454
+
455
+ cls_label = torch.Tensor(self.cls_label[data_idx])
456
+ reg_label = torch.Tensor(self.reg_label[data_idx])
457
+ snip_label = torch.Tensor(self.snip_label[data_idx])
458
+
459
+ return feature, cls_label, reg_label, snip_label
460
+
461
+ def _get_base_data(self, video_name, st, ed):
462
+ feature_rgb = self.feature_rgb_file[video_name]
463
+ feature_rgb = feature_rgb[st:ed, :]
464
+
465
+ if self.feature_flow_file is not None:
466
+ feature_flow = self.feature_flow_file[video_name]
467
+ feature_flow = feature_flow[st:ed, :]
468
+ feature = np.append(feature_rgb, feature_flow, axis=1)
469
+ else:
470
+ feature = feature_rgb
471
+ feature = torch.from_numpy(np.array(feature))
472
+
473
+ return feature
474
+
475
+ def _get_train_label_with_class(self, video_name, st, ed):
476
+ duration = len(self.match_score[video_name])
477
+ st_padding = 0
478
+ ed_padding = 0
479
+ if st < 0:
480
+ st_padding = -st
481
+ st = 0
482
+ if ed > duration:
483
+ ed_padding = ed - duration
484
+ ed = duration
485
+
486
+ match_score = torch.Tensor(self.match_score[video_name][st:ed])
487
+ if st_padding > 0:
488
+ padfunc2d = torch.nn.ConstantPad2d((0, 0, st_padding, 0), 0)
489
+ match_score = padfunc2d(match_score)
490
+ if ed_padding > 0:
491
+ padfunc2d = torch.nn.ConstantPad2d((0, 0, 0, ed_padding), 0)
492
+ match_score = padfunc2d(match_score)
493
+ return match_score
494
+
495
+ def __len__(self):
496
+ return len(self.inputs)
497
+
498
+ def reset_sample(self):
499
+ self.inputs = self.inputs_all.copy()
500
+
501
+ def select_sample(self, idx):
502
+ inputs = [self.inputs_all[i] for i in idx]
503
+ self.inputs = inputs.copy()
504
+ return
505
+
506
+ class SuppressDataSet(data.Dataset):
507
+ def __init__(self, opt, subset="train"):
508
+ self.subset = subset
509
+ self.mode = opt["mode"]
510
+ self.data_file = h5py.File(opt["suppress_label_file"].format(self.subset + "_" + opt['setup']), 'r')
511
+ self.video_list = list(self.data_file.keys())
512
+ self.inputs = []
513
+ for index in range(0, len(self.video_list)):
514
+ video_name = self.video_list[index]
515
+ duration = self.data_file[video_name + '/input'].shape[0]
516
+ for i in range(0, duration):
517
+ self.inputs.append([video_name, i])
518
+
519
+ print(f"{self.subset} subset seg numbers: {len(self.inputs)}")
520
+
521
+ def __getitem__(self, index):
522
+ video_name, idx = self.inputs[index]
523
+
524
+ input_seq = self.data_file[video_name + '/input'][idx]
525
+ label = self.data_file[video_name + '/label'][idx]
526
+
527
+ input_seq = torch.from_numpy(input_seq)
528
+ label = torch.from_numpy(label)
529
+
530
+ return input_seq, label
531
+
532
+ def __len__(self):
533
+ return len(self.inputs)
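A small sketch of the sliding-window indexing used by VideoDataSet.__getitem__: a window whose start index is negative is zero-padded at the front, mirroring the ConstantPad2d call above (toy feature tensor, hypothetical sizes).

import torch

segment_size, feat_dim = 8, 4
feats = torch.arange(20 * feat_dim, dtype=torch.float32).reshape(20, feat_dim)  # toy per-frame features

ed = 3                              # window ending after frame index 2
st = ed - segment_size              # -5: the window starts before the video does

window = feats[0:ed]                # keep only the frames that exist
pad = torch.nn.ConstantPad2d((0, 0, -st, 0), 0)  # prepend -st rows of zeros
window = pad(window)
print(window.shape)                 # torch.Size([8, 4]); the first 5 rows are zeros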
main.py ADDED
@@ -0,0 +1,523 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.parallel
6
+ import torch.nn.functional as F
7
+ import torch.optim as optim
8
+ import numpy as np
9
+ import opts_egtea as opts
10
+
11
+ import time
12
+ import h5py
13
+ from tqdm import tqdm
14
+ from iou_utils import *
15
+ from eval import evaluation_detection
16
+ from tensorboardX import SummaryWriter
17
+ from dataset import VideoDataSet
18
+ from models import MYNET, SuppressNet
19
+ from loss_func import cls_loss_func, cls_loss_func_, regress_loss_func
20
+ from loss_func import MultiCrossEntropyLoss
21
+ from functools import *
22
+
23
+ def train_one_epoch(opt, model, train_dataset, optimizer, warmup=False):
24
+ train_loader = torch.utils.data.DataLoader(train_dataset,
25
+ batch_size=opt['batch_size'], shuffle=True,
26
+ num_workers=0, pin_memory=True, drop_last=False)
27
+ epoch_cost = 0
28
+ epoch_cost_cls = 0
29
+ epoch_cost_reg = 0
30
+ epoch_cost_snip = 0
31
+
32
+ total_iter = len(train_dataset) // opt['batch_size']
33
+ cls_loss = MultiCrossEntropyLoss(focal=True)
34
+ snip_loss = MultiCrossEntropyLoss(focal=True)
35
+ for n_iter, (input_data, cls_label, reg_label, snip_label) in enumerate(tqdm(train_loader)):
36
+ if warmup:
37
+ for g in optimizer.param_groups:
38
+ g['lr'] = n_iter * (opt['lr']) / total_iter
39
+
40
+ act_cls, act_reg, snip_cls = model(input_data.float().cuda())
41
+
42
+ act_cls.register_hook(partial(cls_loss.collect_grad, cls_label))
43
+ snip_cls.register_hook(partial(snip_loss.collect_grad, snip_label))
44
+
45
+ cost_reg = 0
46
+ cost_cls = 0
47
+
48
+ loss = cls_loss_func_(cls_loss, cls_label, act_cls)
49
+ cost_cls = loss
50
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
51
+
52
+ loss = regress_loss_func(reg_label, act_reg)
53
+ cost_reg = loss
54
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
55
+
56
+ loss = cls_loss_func_(snip_loss, snip_label, snip_cls)
57
+ cost_snip = loss
58
+ epoch_cost_snip += cost_snip.detach().cpu().numpy()
59
+
60
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg + opt['gamma'] * cost_snip
61
+ epoch_cost += cost.detach().cpu().numpy()
62
+
63
+ optimizer.zero_grad()
64
+ cost.backward()
65
+ optimizer.step()
66
+
67
+ return n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip
68
+
69
+ def eval_one_epoch(opt, model, test_dataset):
70
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, test_dataset)
71
+
72
+ result_dict = eval_map_nms(opt, test_dataset, output_cls, output_reg, labels_cls, labels_reg)
73
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
74
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
75
+ json.dump(output_dict, outfile, indent=2)
76
+ outfile.close()
77
+
78
+ IoUmAP = evaluation_detection(opt, verbose=False)
79
+ IoUmAP_5 = sum(IoUmAP[0:]) / len(IoUmAP[0:])
80
+
81
+ return cls_loss, reg_loss, tot_loss, IoUmAP_5
82
+
83
+ def train(opt):
84
+ writer = SummaryWriter()
85
+ model = MYNET(opt).cuda()
86
+
87
+ rest_of_model_params = [param for name, param in model.named_parameters() if "history_unit" not in name]
88
+ optimizer = optim.Adam([{'params': model.history_unit.parameters(), 'lr': 1e-6}, {'params': rest_of_model_params}], lr=opt["lr"], weight_decay=opt["weight_decay"])
89
+ scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=opt["lr_step"])
90
+
91
+ train_dataset = VideoDataSet(opt, subset="train")
92
+ test_dataset = VideoDataSet(opt, subset=opt['inference_subset'])
93
+
94
+ warmup = False
95
+
96
+ for n_epoch in range(opt['epoch']):
97
+ if n_epoch >= 1:
98
+ warmup = False
99
+
100
+ n_iter, epoch_cost, epoch_cost_cls, epoch_cost_reg, epoch_cost_snip = train_one_epoch(opt, model, train_dataset, optimizer, warmup)
101
+
102
+ writer.add_scalars('data/cost', {'train': epoch_cost / (n_iter + 1)}, n_epoch)
103
+ print("training loss(epoch %d): %.03f, cls - %f, reg - %f, snip - %f, lr - %f" % (n_epoch,
104
+ epoch_cost / (n_iter + 1),
105
+ epoch_cost_cls / (n_iter + 1),
106
+ epoch_cost_reg / (n_iter + 1),
107
+ epoch_cost_snip / (n_iter + 1),
108
+ optimizer.param_groups[-1]["lr"]))
109
+
110
+ scheduler.step()
111
+ model.eval()
112
+
113
+ cls_loss, reg_loss, tot_loss, IoUmAP_5 = eval_one_epoch(opt, model, test_dataset)
114
+
115
+ writer.add_scalars('data/mAP', {'test': IoUmAP_5}, n_epoch)
116
+ print("testing loss(epoch %d): %.03f, cls - %f, reg - %f, mAP Avg - %f" % (n_epoch, tot_loss, cls_loss, reg_loss, IoUmAP_5))
117
+
118
+ state = {'epoch': n_epoch + 1, 'state_dict': model.state_dict()}
119
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_checkpoint_" + str(n_epoch + 1) + ".pth.tar")
120
+ if IoUmAP_5 > model.best_map:
121
+ model.best_map = IoUmAP_5
122
+ torch.save(state, opt["checkpoint_path"] + "/" + opt["exp"] + "_ckp_best.pth.tar")
123
+
124
+ model.train()
125
+
126
+ writer.close()
127
+ return model.best_map
128
+
129
+ def eval_frame(opt, model, dataset):
130
+ test_loader = torch.utils.data.DataLoader(dataset,
131
+ batch_size=opt['batch_size'], shuffle=False,
132
+ num_workers=0, pin_memory=True, drop_last=False)
133
+
134
+ labels_cls = {}
135
+ labels_reg = {}
136
+ output_cls = {}
137
+ output_reg = {}
138
+ for video_name in dataset.video_list:
139
+ labels_cls[video_name] = []
140
+ labels_reg[video_name] = []
141
+ output_cls[video_name] = []
142
+ output_reg[video_name] = []
143
+
144
+ start_time = time.time()
145
+ total_frames = 0
146
+ epoch_cost = 0
147
+ epoch_cost_cls = 0
148
+ epoch_cost_reg = 0
149
+
150
+ for n_iter, (input_data, cls_label, reg_label, _) in enumerate(tqdm(test_loader)):
151
+ act_cls, act_reg, _ = model(input_data.float().cuda())
152
+ cost_reg = 0
153
+ cost_cls = 0
154
+
155
+ loss = cls_loss_func(cls_label, act_cls)
156
+ cost_cls = loss
157
+ epoch_cost_cls += cost_cls.detach().cpu().numpy()
158
+
159
+ loss = regress_loss_func(reg_label, act_reg)
160
+ cost_reg = loss
161
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
162
+
163
+ cost = opt['alpha'] * cost_cls + opt['beta'] * cost_reg
164
+ epoch_cost += cost.detach().cpu().numpy()
165
+
166
+ act_cls = torch.softmax(act_cls, dim=-1)
167
+
168
+ total_frames += input_data.size(0)
169
+
170
+ for b in range(0, input_data.size(0)):
171
+ video_name, st, ed, data_idx = dataset.inputs[n_iter * opt['batch_size'] + b]
172
+ output_cls[video_name] += [act_cls[b, :].detach().cpu().numpy()]
173
+ output_reg[video_name] += [act_reg[b, :].detach().cpu().numpy()]
174
+ labels_cls[video_name] += [cls_label[b, :].numpy()]
175
+ labels_reg[video_name] += [reg_label[b, :].numpy()]
176
+
177
+ end_time = time.time()
178
+ working_time = end_time - start_time
179
+
180
+ for video_name in dataset.video_list:
181
+ labels_cls[video_name] = np.stack(labels_cls[video_name], axis=0)
182
+ labels_reg[video_name] = np.stack(labels_reg[video_name], axis=0)
183
+ output_cls[video_name] = np.stack(output_cls[video_name], axis=0)
184
+ output_reg[video_name] = np.stack(output_reg[video_name], axis=0)
185
+
186
+ cls_loss = epoch_cost_cls / n_iter
187
+ reg_loss = epoch_cost_reg / n_iter
188
+ tot_loss = epoch_cost / n_iter
189
+
190
+ return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames
191
+
192
+ def eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
193
+ result_dict = {}
194
+ proposal_dict = []
195
+
196
+ num_class = opt["num_of_class"]
197
+ unit_size = opt['segment_size']
198
+ threshold = opt['threshold']
199
+ anchors = opt['anchors']
200
+
201
+ for video_name in dataset.video_list:
202
+ duration = dataset.video_len[video_name]
203
+ video_time = float(dataset.video_dict[video_name]["duration"])
204
+ frame_to_time = 100.0 * video_time / duration
205
+
206
+ for idx in range(0, duration):
207
+ cls_anc = output_cls[video_name][idx]
208
+ reg_anc = output_reg[video_name][idx]
209
+
210
+ proposal_anc_dict = []
211
+ for anc_idx in range(0, len(anchors)):
212
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
213
+
214
+ if len(cls) == 0:
215
+ continue
216
+
217
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
218
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
219
+ st = ed - length
220
+
221
+ for cidx in range(0, len(cls)):
222
+ label = cls[cidx]
223
+ tmp_dict = {}
224
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
225
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
226
+ tmp_dict["label"] = dataset.label_name[label]
227
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
228
+ proposal_anc_dict.append(tmp_dict)
229
+
230
+ proposal_dict += proposal_anc_dict
231
+
232
+ proposal_dict = non_max_suppression(proposal_dict, overlapThresh=opt['soft_nms'])
233
+ result_dict[video_name] = proposal_dict
234
+ proposal_dict = []
235
+
236
+ return result_dict
237
+
238
+ def eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg):
239
+ model = SuppressNet(opt).cuda()
240
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
241
+ base_dict = checkpoint['state_dict']
242
+ model.load_state_dict(base_dict)
243
+ model.eval()
244
+
245
+ result_dict = {}
246
+ proposal_dict = []
247
+
248
+ num_class = opt["num_of_class"]
249
+ unit_size = opt['segment_size']
250
+ threshold = opt['threshold']
251
+ anchors = opt['anchors']
252
+
253
+ for video_name in dataset.video_list:
254
+ duration = dataset.video_len[video_name]
255
+ video_time = float(dataset.video_dict[video_name]["duration"])
256
+ frame_to_time = 100.0 * video_time / duration
257
+ conf_queue = torch.zeros((unit_size, num_class - 1))
258
+
259
+ for idx in range(0, duration):
260
+ cls_anc = output_cls[video_name][idx]
261
+ reg_anc = output_reg[video_name][idx]
262
+
263
+ proposal_anc_dict = []
264
+ for anc_idx in range(0, len(anchors)):
265
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
266
+
267
+ if len(cls) == 0:
268
+ continue
269
+
270
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
271
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
272
+ st = ed - length
273
+
274
+ for cidx in range(0, len(cls)):
275
+ label = cls[cidx]
276
+ tmp_dict = {}
277
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
278
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
279
+ tmp_dict["label"] = dataset.label_name[label]
280
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
281
+ proposal_anc_dict.append(tmp_dict)
282
+
283
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
284
+
285
+ conf_queue[:-1, :] = conf_queue[1:, :].clone()
286
+ conf_queue[-1, :] = 0
287
+ for proposal in proposal_anc_dict:
288
+ cls_idx = dataset.label_name.index(proposal['label'])
289
+ conf_queue[-1, cls_idx] = proposal["score"]
290
+
291
+ minput = conf_queue.unsqueeze(0)
292
+ suppress_conf = model(minput.cuda())
293
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
294
+
295
+ for cls in range(0, num_class - 1):
296
+ if suppress_conf[cls] > opt['sup_threshold']:
297
+ for proposal in proposal_anc_dict:
298
+ if proposal['label'] == dataset.label_name[cls]:
299
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
300
+ proposal_dict.append(proposal)
301
+
302
+ result_dict[video_name] = proposal_dict
303
+ proposal_dict = []
304
+
305
+ return result_dict
306
+
307
+ def test_frame(opt, video_name=None):
308
+ model = MYNET(opt).cuda()
309
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
310
+ base_dict = checkpoint['state_dict']
311
+ model.load_state_dict(base_dict)
312
+ model.eval()
313
+
314
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
315
+ outfile = h5py.File(opt['frame_result_file'].format(opt['exp']), 'w')
316
+
317
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
318
+
319
+ print("testing loss: %f, cls_loss: %f, reg_loss: %f" % (tot_loss, cls_loss, reg_loss))
320
+
321
+ for video_name in dataset.video_list:
322
+ o_cls = output_cls[video_name]
323
+ o_reg = output_reg[video_name]
324
+ l_cls = labels_cls[video_name]
325
+ l_reg = labels_reg[video_name]
326
+
327
+ dset_predcls = outfile.create_dataset(video_name + '/pred_cls', o_cls.shape, maxshape=o_cls.shape, chunks=True, dtype=np.float32)
328
+ dset_predcls[:, :] = o_cls[:, :]
329
+ dset_predreg = outfile.create_dataset(video_name + '/pred_reg', o_reg.shape, maxshape=o_reg.shape, chunks=True, dtype=np.float32)
330
+ dset_predreg[:, :] = o_reg[:, :]
331
+ dset_labelcls = outfile.create_dataset(video_name + '/label_cls', l_cls.shape, maxshape=l_cls.shape, chunks=True, dtype=np.float32)
332
+ dset_labelcls[:, :] = l_cls[:, :]
333
+ dset_labelreg = outfile.create_dataset(video_name + '/label_reg', l_reg.shape, maxshape=l_reg.shape, chunks=True, dtype=np.float32)
334
+ dset_labelreg[:, :] = l_reg[:, :]
335
+ outfile.close()
336
+
337
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
338
+ return cls_loss, reg_loss, tot_loss
339
+
340
+ def patch_attention(m):
341
+ forward_orig = m.forward
342
+
343
+ def wrap(*args, **kwargs):
344
+ kwargs["need_weights"] = True
345
+ kwargs["average_attn_weights"] = False
346
+ return forward_orig(*args, **kwargs)
347
+
348
+ m.forward = wrap
349
+
350
+ class SaveOutput:
351
+ def __init__(self):
352
+ self.outputs = []
353
+
354
+ def __call__(self, module, module_in, module_out):
355
+ self.outputs.append(module_out[1])
356
+
357
+ def clear(self):
358
+ self.outputs = []
359
+
360
+ def test(opt, video_name=None):
361
+ model = MYNET(opt).cuda()
362
+ checkpoint = torch.load(opt["checkpoint_path"] + "/" + opt['exp'] + "_ckp_best.pth.tar")
363
+ base_dict = checkpoint['state_dict']
364
+ model.load_state_dict(base_dict)
365
+ model.eval()
366
+
367
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
368
+
369
+ cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames = eval_frame(opt, model, dataset)
370
+
371
+ if opt["pptype"] == "nms":
372
+ result_dict = eval_map_nms(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
373
+ if opt["pptype"] == "net":
374
+ result_dict = eval_map_supnet(opt, dataset, output_cls, output_reg, labels_cls, labels_reg)
375
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
376
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
377
+ json.dump(output_dict, outfile, indent=2)
378
+ outfile.close()
379
+
380
+ mAP = evaluation_detection(opt)
381
+ return mAP
382
+
383
+ def test_online(opt, video_name=None):
384
+ model = MYNET(opt).cuda()
385
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best.pth.tar")
386
+ base_dict = checkpoint['state_dict']
387
+ model.load_state_dict(base_dict)
388
+ model.eval()
389
+
390
+ sup_model = SuppressNet(opt).cuda()
391
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")
392
+ base_dict = checkpoint['state_dict']
393
+ sup_model.load_state_dict(base_dict)
394
+ sup_model.eval()
395
+
396
+ dataset = VideoDataSet(opt, subset=opt['inference_subset'], video_name=video_name)
397
+ test_loader = torch.utils.data.DataLoader(dataset,
398
+ batch_size=1, shuffle=False,
399
+ num_workers=0, pin_memory=True, drop_last=False)
400
+
401
+ result_dict = {}
402
+ proposal_dict = []
403
+
404
+ num_class = opt["num_of_class"]
405
+ unit_size = opt['segment_size']
406
+ threshold = opt['threshold']
407
+ anchors = opt['anchors']
408
+
409
+ start_time = time.time()
410
+ total_frames = 0
411
+
412
+ for video_name in dataset.video_list:
413
+ input_queue = torch.zeros((unit_size, opt['feat_dim']))
414
+ sup_queue = torch.zeros((unit_size, num_class - 1))
415
+
416
+ duration = dataset.video_len[video_name]
417
+ video_time = float(dataset.video_dict[video_name]["duration"])
418
+ frame_to_time = 100.0 * video_time / duration
419
+
420
+ for idx in range(0, duration):
421
+ total_frames += 1
422
+ input_queue[:-1, :] = input_queue[1:, :].clone()
423
+ input_queue[-1:, :] = dataset._get_base_data(video_name, idx, idx + 1)
424
+
425
+ minput = input_queue.unsqueeze(0)
426
+ act_cls, act_reg, _ = model(minput.cuda())
427
+ act_cls = torch.softmax(act_cls, dim=-1)
428
+
429
+ cls_anc = act_cls.squeeze(0).detach().cpu().numpy()
430
+ reg_anc = act_reg.squeeze(0).detach().cpu().numpy()
431
+
432
+ proposal_anc_dict = []
433
+ for anc_idx in range(0, len(anchors)):
434
+ cls = np.argwhere(cls_anc[anc_idx][:-1] > opt['threshold']).reshape(-1)
435
+
436
+ if len(cls) == 0:
437
+ continue
438
+
439
+ ed = idx + anchors[anc_idx] * reg_anc[anc_idx][0]
440
+ length = anchors[anc_idx] * np.exp(reg_anc[anc_idx][1])
441
+ st = ed - length
442
+
443
+ for cidx in range(0, len(cls)):
444
+ label = cls[cidx]
445
+ tmp_dict = {}
446
+ tmp_dict["segment"] = [float(st * frame_to_time / 100.0), float(ed * frame_to_time / 100.0)]
447
+ tmp_dict["score"] = float(cls_anc[anc_idx][label])
448
+ tmp_dict["label"] = dataset.label_name[label]
449
+ tmp_dict["gentime"] = float(idx * frame_to_time / 100.0)
450
+ proposal_anc_dict.append(tmp_dict)
451
+
452
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
453
+
454
+ sup_queue[:-1, :] = sup_queue[1:, :].clone()
455
+ sup_queue[-1, :] = 0
456
+ for proposal in proposal_anc_dict:
457
+ cls_idx = dataset.label_name.index(proposal['label'])
458
+ sup_queue[-1, cls_idx] = proposal["score"]
459
+
460
+ minput = sup_queue.unsqueeze(0)
461
+ suppress_conf = sup_model(minput.cuda())
462
+ suppress_conf = suppress_conf.squeeze(0).detach().cpu().numpy()
463
+
464
+ for cls in range(0, num_class - 1):
465
+ if suppress_conf[cls] > opt['sup_threshold']:
466
+ for proposal in proposal_anc_dict:
467
+ if proposal['label'] == dataset.label_name[cls]:
468
+ if check_overlap_proposal(proposal_dict, proposal, overlapThresh=opt['soft_nms']) is None:
469
+ proposal_dict.append(proposal)
470
+
471
+ result_dict[video_name] = proposal_dict
472
+ proposal_dict = []
473
+
474
+ end_time = time.time()
475
+ working_time = end_time - start_time
476
+ print("working time : {}s, {}fps, {} frames".format(working_time, total_frames / working_time, total_frames))
477
+
478
+ output_dict = {"version": "VERSION 1.3", "results": result_dict, "external_data": {}}
479
+ outfile = open(opt["result_file"].format(opt['exp']), "w")
480
+ json.dump(output_dict, outfile, indent=2)
481
+ outfile.close()
482
+
483
+ mAP = evaluation_detection(opt)
484
+ return mAP
485
+
486
+ def main(opt, video_name=None):
487
+ max_perf = 0
488
+ if not video_name and 'video_name' in opt:
489
+ video_name = opt['video_name']
490
+
491
+ if opt['mode'] == 'train':
492
+ max_perf = train(opt)
493
+ if opt['mode'] == 'test':
494
+ max_perf = test(opt, video_name=video_name)
495
+ if opt['mode'] == 'test_frame':
496
+ max_perf = test_frame(opt, video_name=video_name)
497
+ if opt['mode'] == 'test_online':
498
+ max_perf = test_online(opt, video_name=video_name)
499
+ if opt['mode'] == 'eval':
500
+ max_perf = evaluation_detection(opt)
501
+
502
+ return max_perf
503
+
504
+ if __name__ == '__main__':
505
+ opt = opts.parse_opt()
506
+ opt = vars(opt)
507
+ if not os.path.exists(opt["checkpoint_path"]):
508
+ os.makedirs(opt["checkpoint_path"])
509
+ opt_file = open(opt["checkpoint_path"] + "/" + opt["exp"] + "_opts.json", "w")
510
+ json.dump(opt, opt_file)
511
+ opt_file.close()
512
+
513
+ if opt['seed'] >= 0:
514
+ seed = opt['seed']
515
+ torch.manual_seed(seed)
516
+ np.random.seed(seed)
517
+
518
+ opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]
519
+
520
+ video_name = opt.get('video_name', None)
521
+ main(opt, video_name=video_name)
522
+ while(opt['wterm']):
523
+ pass
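The JSON written by test() and test_online() follows the ActivityNet-style result layout produced above. A short sketch for inspecting it afterwards; the path is hypothetical and assumes exp "01" with the default --result_file template.

import json

with open("./output/result_proposal01.json") as f:   # hypothetical path for --exp 01
    output = json.load(f)

for video_name, proposals in output["results"].items():
    top = sorted(proposals, key=lambda p: p["score"], reverse=True)[:5]
    for p in top:
        st, ed = p["segment"]
        print(f"{video_name}: {p['label']} [{st:.2f}s, {ed:.2f}s] "
              f"score={p['score']:.3f} gentime={p['gentime']:.2f}s")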
opts_egtea.py ADDED
@@ -0,0 +1,198 @@
1
+ import argparse
2
+
3
+ def parse_opt():
4
+ parser = argparse.ArgumentParser()
5
+ # Overall settings
6
+ parser.add_argument(
7
+ '--mode',
8
+ type=str,
9
+ default='train')
10
+ parser.add_argument(
11
+ '--video_name',
12
+ type=str,
13
+ default=None,
14
+ help='Name of the single video to evaluate')
15
+ parser.add_argument(
16
+ '--checkpoint_path',
17
+ type=str,
18
+ default='./checkpoint')
19
+ parser.add_argument(
20
+ '--segment_size',
21
+ type=int,
22
+ default=64)
23
+ parser.add_argument(
24
+ '--anchors',
25
+ type=str,
26
+ default='2,4,6,8,12,16')
27
+ parser.add_argument(
28
+ '--seed',
29
+ default=7,
30
+ type=int,
31
+ help='random seed for reproducibility')
32
+
33
+ # Overall Dataset settings
34
+ parser.add_argument(
35
+ '--num_of_class',
36
+ type=int,
37
+ default=23)
38
+ parser.add_argument(
39
+ '--data_format',
40
+ type=str,
41
+ default="npz_i3d")
42
+ parser.add_argument(
43
+ '--data_rescale',
44
+ default=False,
45
+ action='store_true')
46
+ parser.add_argument(
47
+ '--predefined_fps',
48
+ default=None,
49
+ type=float)
50
+ parser.add_argument(
51
+ '--rgb_only',
52
+ default=False,
53
+ action='store_true')
54
+ parser.add_argument(
55
+ '--video_anno',
56
+ type=str,
57
+ default="./data/egtea_annotations_split{}.json")
58
+ parser.add_argument(
59
+ '--video_feature_all_train',
60
+ type=str,
61
+ default="./data/I3D/")
62
+ parser.add_argument(
63
+ '--video_feature_all_test',
64
+ type=str,
65
+ default="./data/I3D/")
66
+ parser.add_argument(
67
+ '--setup',
68
+ type=str,
69
+ default="")
70
+ parser.add_argument(
71
+ '--exp',
72
+ type=str,
73
+ default="01")
74
+ parser.add_argument(
75
+ '--split',
76
+ type=str,
77
+ default="1")
78
+
79
+ # Network
80
+ parser.add_argument(
81
+ '--feat_dim',
82
+ type=int,
83
+ default=2048)
84
+ parser.add_argument(
85
+ '--hidden_dim',
86
+ type=int,
87
+ default=1024)
88
+ parser.add_argument(
89
+ '--out_dim',
90
+ type=int,
91
+ default=23)
92
+ parser.add_argument(
93
+ '--enc_layer',
94
+ type=int,
95
+ default=3)
96
+ parser.add_argument(
97
+ '--enc_head',
98
+ type=int,
99
+ default=8)
100
+ parser.add_argument(
101
+ '--dec_layer',
102
+ type=int,
103
+ default=5)
104
+ parser.add_argument(
105
+ '--dec_head',
106
+ type=int,
107
+ default=4)
108
+
109
+ # Training settings
110
+ parser.add_argument(
111
+ '--batch_size',
112
+ type=int,
113
+ default=128)
114
+ parser.add_argument(
115
+ '--lr',
116
+ type=float,
117
+ default=1e-4)
118
+ parser.add_argument(
119
+ '--weight_decay',
120
+ type=float,
121
+ default=1e-4)
122
+ parser.add_argument(
123
+ '--epoch',
124
+ type=int,
125
+ default=5)
126
+ parser.add_argument(
127
+ '--lr_step',
128
+ type=int,
129
+ default=3)
130
+
131
+ # Post processing
132
+ parser.add_argument(
133
+ '--alpha',
134
+ type=float,
135
+ default=1)
136
+ parser.add_argument(
137
+ '--beta',
138
+ type=float,
139
+ default=1)
140
+ parser.add_argument(
141
+ '--gamma',
142
+ type=float,
143
+ default=0.2)
144
+ parser.add_argument(
145
+ '--pptype',
146
+ type=str,
147
+ default="net")
148
+ parser.add_argument(
149
+ '--pos_threshold',
150
+ type=float,
151
+ default=0.5)
152
+ parser.add_argument(
153
+ '--sup_threshold',
154
+ type=float,
155
+ default=0.1)
156
+ parser.add_argument(
157
+ '--threshold',
158
+ type=float,
159
+ default=0.1)
160
+ parser.add_argument(
161
+ '--inference_subset',
162
+ type=str,
163
+ default="test")
164
+ parser.add_argument(
165
+ '--soft_nms',
166
+ type=float,
167
+ default=0.3)
168
+ parser.add_argument(
169
+ '--video_len_file',
170
+ type=str,
171
+ default="./output/video_len_{}.json")
172
+ parser.add_argument(
173
+ '--proposal_label_file',
174
+ type=str,
175
+ default="./output/proposal_label_{}.h5")
176
+ parser.add_argument(
177
+ '--suppress_label_file',
178
+ type=str,
179
+ default="./output/suppress_label_{}.h5")
180
+ parser.add_argument(
181
+ '--suppress_result_file',
182
+ type=str,
183
+ default="./output/suppress_result{}.h5")
184
+ parser.add_argument(
185
+ '--frame_result_file',
186
+ type=str,
187
+ default="./output/frame_result{}.h5")
188
+ parser.add_argument(
189
+ '--result_file',
190
+ type=str,
191
+ default="./output/result_proposal{}.json")
192
+ parser.add_argument(
193
+ '--wterm',
194
+ default=False,
195
+ action='store_true')
196
+
197
+ args = parser.parse_args()
198
+ return args
supnet.py ADDED
@@ -0,0 +1,637 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import torchvision
5
+ import torch.nn.parallel
6
+ import torch.nn.functional as F
7
+ import torch.optim as optim
8
+ import numpy as np
9
+ import opts_egtea as opts
10
+ import time
11
+ import h5py
12
+ from iou_utils import *
13
+ from eval import evaluation_detection
14
+ from tensorboardX import SummaryWriter
15
+ from dataset import VideoDataSet, SuppressDataSet
16
+ from models import MYNET, SuppressNet
17
+ from loss_func import cls_loss_func, regress_loss_func, suppress_loss_func
18
+ from tqdm import tqdm
19
+
20
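+ # One training pass over the suppression dataset; returns the last batch index and the summed loss.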
+ def train_one_epoch(opt, model, train_dataset, optimizer):
21
+ train_loader = torch.utils.data.DataLoader(train_dataset,
22
+ batch_size=opt['batch_size'], shuffle=True,
23
+ num_workers=0, pin_memory=True,drop_last=False)
24
+ epoch_cost = 0
25
+
26
+ for n_iter,(input_data,label) in enumerate(tqdm(train_loader)):
27
+ suppress_conf = model(input_data.cuda())
28
+
29
+ loss = suppress_loss_func(label,suppress_conf)
30
+ epoch_cost+= loss.detach().cpu().numpy()
31
+
32
+ optimizer.zero_grad()
33
+ loss.backward()
34
+ optimizer.step()
35
+
36
+ return n_iter, epoch_cost
37
+
38
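+ # One evaluation pass (no optimizer step); returns the last batch index and the summed loss.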
+ def eval_one_epoch(opt, model, test_dataset):
39
+ test_loader = torch.utils.data.DataLoader(test_dataset,
40
+ batch_size=opt['batch_size'], shuffle=False,
41
+ num_workers=0, pin_memory=True,drop_last=False)
42
+ epoch_cost = 0
43
+
44
+ for n_iter,(input_data,label) in enumerate(tqdm(test_loader)):
45
+ suppress_conf = model(input_data.cuda())
46
+
47
+ loss = suppress_loss_func(label,suppress_conf)
48
+ epoch_cost+= loss.detach().cpu().numpy()
49
+
50
+ return n_iter, epoch_cost
51
+
52
+
53
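+ # Train SuppressNet with Adam + StepLR, evaluate each epoch, and checkpoint the best model by eval loss.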
+ def train(opt):
54
+ writer = SummaryWriter()
55
+ model = SuppressNet(opt).cuda()
56
+
57
+ optimizer = optim.Adam( model.parameters(),lr=opt["lr"],weight_decay = opt["weight_decay"])
58
+ scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size = opt["lr_step"])
59
+
60
+ train_dataset = SuppressDataSet(opt,subset="train")
61
+ test_dataset = SuppressDataSet(opt,subset=opt['inference_subset'])
62
+
63
+ for n_epoch in range(opt['epoch']):
64
+ n_iter, epoch_cost = train_one_epoch(opt, model, train_dataset, optimizer)
65
+
66
+ writer.add_scalars('sup_data/cost', {'train': epoch_cost/(n_iter+1)}, n_epoch)
67
+ print("training loss(epoch %d): %f, lr - %f"%(n_epoch,
68
+ epoch_cost/(n_iter+1),
69
+ optimizer.param_groups[0]["lr"]) )
70
+
71
+ scheduler.step()
72
+ model.eval()
73
+
74
+ n_iter, eval_cost = eval_one_epoch(opt, model,test_dataset)
75
+
76
+ writer.add_scalars('sup_data/eval', {'test': eval_cost/(n_iter+1)}, n_epoch)
77
+ print("testing loss(epoch %d): %f"%(n_epoch,eval_cost/(n_iter+1)))
78
+
79
+ state = {'epoch': n_epoch + 1,
80
+ 'state_dict': model.state_dict()}
81
+ torch.save(state, opt["checkpoint_path"]+"/checkpoint_suppress_"+str(n_epoch+1)+".pth.tar" )
82
+ if eval_cost < model.best_loss:
83
+ model.best_loss = eval_cost
84
+ torch.save(state, opt["checkpoint_path"]+"/ckp_best_suppress.pth.tar" )
85
+
86
+ model.train()
87
+
88
+ writer.close()
89
+ return
90
+
91
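+ # Run the proposal model frame by frame, accumulating per-video classification/regression outputs, labels, and losses.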
+ def eval_frame(opt, model, dataset):
92
+ test_loader = torch.utils.data.DataLoader(dataset,
93
+ batch_size=opt['batch_size'], shuffle=False,
94
+ num_workers=0, pin_memory=True,drop_last=False)
95
+
96
+ labels_cls={}
97
+ labels_reg={}
98
+ output_cls={}
99
+ output_reg={}
100
+ for video_name in dataset.video_list:
101
+ labels_cls[video_name]=[]
102
+ labels_reg[video_name]=[]
103
+ output_cls[video_name]=[]
104
+ output_reg[video_name]=[]
105
+
106
+ start_time = time.time()
107
+ total_frames =0
108
+ epoch_cost = 0
109
+ epoch_cost_cls = 0
110
+ epoch_cost_reg = 0
111
+
112
+ for n_iter,(input_data,cls_label,reg_label, _) in enumerate(tqdm(test_loader)):
113
+ act_cls, act_reg, _ = model(input_data.cuda())
114
+
115
+ cost_reg = 0
116
+ cost_cls = 0
117
+
118
+ loss = cls_loss_func(cls_label,act_cls)
119
+ cost_cls = loss
120
+
121
+ epoch_cost_cls+= cost_cls.detach().cpu().numpy()
122
+
123
+ loss = regress_loss_func(reg_label,act_reg)
124
+ cost_reg = loss
125
+ epoch_cost_reg += cost_reg.detach().cpu().numpy()
126
+
127
+ cost= opt['alpha']*cost_cls +opt['beta']*cost_reg
128
+
129
+ epoch_cost += cost.detach().cpu().numpy()
130
+
131
+ act_cls = torch.softmax(act_cls, dim=-1)
132
+
133
+ total_frames+=input_data.size(0)
134
+
135
+ for b in range(0,input_data.size(0)):
136
+ video_name, st, ed, data_idx = dataset.inputs[n_iter*opt['batch_size']+b]
137
+ output_cls[video_name]+=[act_cls[b,:].detach().cpu().numpy()]
138
+ output_reg[video_name]+=[act_reg[b,:].detach().cpu().numpy()]
139
+ labels_cls[video_name]+=[cls_label[b,:].numpy()]
140
+ labels_reg[video_name]+=[reg_label[b,:].numpy()]
141
+
142
+ end_time = time.time()
143
+ working_time = end_time-start_time
144
+
145
+ for video_name in dataset.video_list:
146
+ labels_cls[video_name]=np.stack(labels_cls[video_name], axis=0)
147
+ labels_reg[video_name]=np.stack(labels_reg[video_name], axis=0)
148
+ output_cls[video_name]=np.stack(output_cls[video_name], axis=0)
149
+ output_reg[video_name]=np.stack(output_reg[video_name], axis=0)
150
+
151
+ cls_loss = epoch_cost_cls / (n_iter + 1)
152
+ reg_loss = epoch_cost_reg / (n_iter + 1)
153
+ tot_loss = epoch_cost / (n_iter + 1)
154
+
155
+ return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames
156
+
157
+
158
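+ # Load the best SuppressNet checkpoint and write per-video predictions and labels to the suppress_result_file HDF5.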
+ def test(opt):
159
+ model = SuppressNet(opt).cuda()
160
+ checkpoint = torch.load(opt["checkpoint_path"] + "/ckp_best_suppress.pth.tar")  # path saved by train()
161
+ base_dict=checkpoint['state_dict']
162
+ model.load_state_dict(base_dict)
163
+ model.eval()
164
+
165
+ dataset = SuppressDataSet(opt,subset=opt['inference_subset'])
166
+
167
+ test_loader = torch.utils.data.DataLoader(dataset,
168
+ batch_size=opt['batch_size'], shuffle=False,
169
+ num_workers=0, pin_memory=True,drop_last=False)
170
+ labels={}
171
+ output={}
172
+ for video_name in dataset.video_list:
173
+ labels[video_name]=[]
174
+ output[video_name]=[]
175
+
176
+ for n_iter,(input_data,label) in enumerate(test_loader):
177
+ suppress_conf = model(input_data.cuda())
178
+
179
+ for b in range(0,input_data.size(0)):
180
+ video_name, idx = dataset.inputs[n_iter*opt['batch_size']+b]
181
+ output[video_name]+=[suppress_conf[b,:].detach().cpu().numpy()]
182
+ labels[video_name]+=[label[b,:].numpy()]
183
+
184
+ for video_name in dataset.video_list:
185
+ labels[video_name]=np.stack(labels[video_name], axis=0)
186
+ output[video_name]=np.stack(output[video_name], axis=0)
187
+
188
+ outfile = h5py.File(opt['suppress_result_file'].format(opt['exp']), 'w')
189
+
190
+ for video_name in dataset.video_list:
191
+ o=output[video_name]
192
+ l=labels[video_name]
193
+
194
+ dset_pred = outfile.create_dataset(video_name+'/pred', o.shape, maxshape=o.shape, chunks=True, dtype=np.float32)
195
+ dset_pred[:,:] = o[:,:]
196
+ dset_label = outfile.create_dataset(video_name+'/label', l.shape, maxshape=l.shape, chunks=True, dtype=np.float32)
197
+ dset_label[:,:] = l[:,:]
198
+ outfile.close()
199
+ print('complete')
200
+
201
+
202
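+ # Generate SuppressNet training data: decode anchor proposals from the frame-level model, run NMS, and save input/label tables to HDF5.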
+ def make_dataset(opt):
203
+
204
+ model = MYNET(opt).cuda()
205
+ checkpoint = torch.load(opt["checkpoint_path"]+"/"+opt['exp']+"_ckp_best.pth.tar")
206
+ base_dict=checkpoint['state_dict']
207
+ model.load_state_dict(base_dict)
208
+ model.eval()
209
+
210
+ dataset = VideoDataSet(opt,subset=opt['inference_subset'])
211
+
212
+ _, _, _, output_cls, output_reg, labels_cls, labels_reg, _, _ = eval_frame(opt, model,dataset)
213
+
214
+ proposal_dict=[]
215
+
216
+ outfile = h5py.File(opt['suppress_label_file'].format(opt['inference_subset']+'_'+opt['setup']), 'w')
217
+
218
+ num_class = opt["num_of_class"]-1
219
+ unit_size = opt['segment_size']
220
+ threshold=opt['threshold']
221
+ anchors=opt['anchors']
222
+
223
+ for video_name in dataset.video_list:
224
+ duration = dataset.video_len[video_name]
225
+
226
+ for idx in range(0,duration):
227
+ cls_anc = output_cls[video_name][idx]
228
+ reg_anc = output_reg[video_name][idx]
229
+
230
+ proposal_anc_dict=[]
231
+ for anc_idx in range(0,len(anchors)):
232
+ cls = np.argwhere(cls_anc[anc_idx][:-1]>opt['threshold']).reshape(-1)
233
+
234
+ if len(cls) == 0:
235
+ continue
236
+
237
+ ed= idx + anchors[anc_idx] * reg_anc[anc_idx][0]
238
+ length = anchors[anc_idx]* np.exp(reg_anc[anc_idx][1])
239
+ st= ed-length
240
+
241
+ for cidx in range(0,len(cls)):
242
+ label=cls[cidx]
243
+ tmp_dict={}
244
+ tmp_dict["segment"] = [st, ed]
245
+ tmp_dict["score"]= cls_anc[anc_idx][label]
246
+ tmp_dict["label"]=label
247
+ tmp_dict["gentime"]= idx
248
+ proposal_anc_dict.append(tmp_dict)
249
+
250
+ proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
251
+ proposal_dict+=proposal_anc_dict
252
+
253
+ nms_dict=non_max_suppression(proposal_dict, overlapThresh=opt['soft_nms'])
254
+
255
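+ # input_table: raw proposal confidences laid out over a unit_size sliding window per frame; label_table: 1 for the 3 frames from each NMS-kept proposal's generation time.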
+ input_table = np.zeros((duration,unit_size,num_class), dtype=np.float32)
256
+ label_table = np.zeros((duration,num_class), dtype=np.float32)
257
+
258
+ for proposal in proposal_dict:
259
+ idx = proposal["gentime"]
260
+ conf = proposal["score"]
261
+ cls = proposal["label"]
262
+ for i in range(0,unit_size):
263
+ if idx+i < duration:
264
+ input_table[idx+i,unit_size-1-i,cls]=conf
265
+
266
+ for proposal in nms_dict:
267
+ idx = proposal["gentime"]
268
+ cls = proposal["label"]
269
+ label_table[idx:idx+3,cls]=1
270
+
271
+ dset_input_table = outfile.create_dataset(video_name+'/input', input_table.shape, maxshape=input_table.shape, chunks=True, dtype=np.float32)
272
+ dset_label_table = outfile.create_dataset(video_name+'/label', label_table.shape, maxshape=label_table.shape, chunks=True, dtype=np.float32)
273
+
274
+ dset_input_table[:]=input_table
275
+ dset_label_table[:]=label_table
276
+
277
+ proposal_dict=[]
278
+
279
+ print('complete')
280
+ return
281
+
282
+
283
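+ # Dispatch on --mode: 'train', 'test', or 'make' (build the suppression dataset).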
+ def main(opt):
284
+ if opt['mode'] == 'train':
285
+ train(opt)
286
+ if opt['mode'] == 'test':
287
+ test(opt)
288
+ if opt['mode'] == 'make':
289
+ make_dataset(opt)
290
+
291
+ return
292
+
293
+ if __name__ == '__main__':
294
+ opt = opts.parse_opt()
295
+ opt = vars(opt)
296
+ if not os.path.exists(opt["checkpoint_path"]):
297
+ os.makedirs(opt["checkpoint_path"])
298
+ opt_file=open(opt["checkpoint_path"]+"/"+opt['exp']+"_opts.json","w")
299
+ json.dump(opt,opt_file)
300
+ opt_file.close()
301
+
302
+ if opt['seed'] >= 0:
303
+ seed = opt['seed']
304
+ torch.manual_seed(seed)
305
+ np.random.seed(seed)
306
+ #random.seed(seed)
307
+
308
+ opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]
309
+
310
+ main(opt)
311
+ while(opt['wterm']):
312
+ pass
313
+
314
+
315
+
316
+
317
+
318
+
319
+
320
+
321
+ # import os
322
+ # import json
323
+ # import torch
324
+ # import torchvision
325
+ # import torch.nn.parallel
326
+ # import torch.nn.functional as F
327
+ # import torch.optim as optim
328
+ # import numpy as np
329
+ # # import opts_egtea as opts
330
+ # import opts_thumos as opts
331
+ # import time
332
+ # import h5py
333
+ # from iou_utils import *
334
+ # from eval import evaluation_detection
335
+ # from tensorboardX import SummaryWriter
336
+ # from dataset import VideoDataSet, SuppressDataSet
337
+ # from models import MYNET, SuppressNet
338
+ # from loss_func import cls_loss_func, regress_loss_func, suppress_loss_func
339
+ # from tqdm import tqdm
340
+
341
+ # def train_one_epoch(opt, model, train_dataset, optimizer):
342
+ # train_loader = torch.utils.data.DataLoader(train_dataset,
343
+ # batch_size=opt['batch_size'], shuffle=True,
344
+ # num_workers=0, pin_memory=True,drop_last=False)
345
+ # epoch_cost = 0
346
+
347
+ # for n_iter,(input_data,label) in enumerate(tqdm(train_loader)):
348
+ # suppress_conf = model(input_data.cuda())
349
+
350
+ # loss = suppress_loss_func(label,suppress_conf)
351
+ # epoch_cost+= loss.detach().cpu().numpy()
352
+
353
+ # optimizer.zero_grad()
354
+ # loss.backward()
355
+ # optimizer.step()
356
+
357
+ # return n_iter, epoch_cost
358
+
359
+ # def eval_one_epoch(opt, model, test_dataset):
360
+ # test_loader = torch.utils.data.DataLoader(test_dataset,
361
+ # batch_size=opt['batch_size'], shuffle=False,
362
+ # num_workers=0, pin_memory=True,drop_last=False)
363
+ # epoch_cost = 0
364
+
365
+ # for n_iter,(input_data,label) in enumerate(tqdm(test_loader)):
366
+ # suppress_conf = model(input_data.cuda())
367
+
368
+ # loss = suppress_loss_func(label,suppress_conf)
369
+ # epoch_cost+= loss.detach().cpu().numpy()
370
+
371
+ # return n_iter, epoch_cost
372
+
373
+
374
+ # def train(opt):
375
+ # writer = SummaryWriter()
376
+ # model = SuppressNet(opt).cuda()
377
+
378
+ # optimizer = optim.Adam( model.parameters(),lr=opt["lr"],weight_decay = opt["weight_decay"])
379
+ # scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size = opt["lr_step"])
380
+
381
+ # train_dataset = SuppressDataSet(opt,subset="train")
382
+ # test_dataset = SuppressDataSet(opt,subset=opt['inference_subset'])
383
+
384
+ # for n_epoch in range(opt['epoch']):
385
+ # n_iter, epoch_cost = train_one_epoch(opt, model, train_dataset, optimizer)
386
+
387
+ # writer.add_scalars('sup_data/cost', {'train': epoch_cost/(n_iter+1)}, n_epoch)
388
+ # print("training loss(epoch %d): %f, lr - %f"%(n_epoch,
389
+ # epoch_cost/(n_iter+1),
390
+ # optimizer.param_groups[0]["lr"]) )
391
+
392
+ # scheduler.step()
393
+ # model.eval()
394
+
395
+ # n_iter, eval_cost = eval_one_epoch(opt, model,test_dataset)
396
+
397
+ # writer.add_scalars('sup_data/eval', {'test': eval_cost/(n_iter+1)}, n_epoch)
398
+ # print("testing loss(epoch %d): %f"%(n_epoch,eval_cost/(n_iter+1)))
399
+
400
+ # state = {'epoch': n_epoch + 1,
401
+ # 'state_dict': model.state_dict()}
402
+ # torch.save(state, opt["checkpoint_path"]+"/checkpoint_suppress_"+str(n_epoch+1)+".pth.tar" )
403
+ # if eval_cost < model.best_loss:
404
+ # model.best_loss = eval_cost
405
+ # torch.save(state, opt["checkpoint_path"]+"/ckp_best_suppress.pth.tar" )
406
+
407
+ # model.train()
408
+
409
+ # writer.close()
410
+ # return
411
+
412
+ # def eval_frame(opt, model, dataset):
413
+ # test_loader = torch.utils.data.DataLoader(dataset,
414
+ # batch_size=opt['batch_size'], shuffle=False,
415
+ # num_workers=0, pin_memory=True,drop_last=False)
416
+
417
+ # labels_cls={}
418
+ # labels_reg={}
419
+ # output_cls={}
420
+ # output_reg={}
421
+ # for video_name in dataset.video_list:
422
+ # labels_cls[video_name]=[]
423
+ # labels_reg[video_name]=[]
424
+ # output_cls[video_name]=[]
425
+ # output_reg[video_name]=[]
426
+
427
+ # start_time = time.time()
428
+ # total_frames =0
429
+ # epoch_cost = 0
430
+ # epoch_cost_cls = 0
431
+ # epoch_cost_reg = 0
432
+
433
+ # for n_iter,(input_data,cls_label,reg_label, _) in enumerate(tqdm(test_loader)):
434
+ # act_cls, act_reg, _ = model(input_data.cuda())
435
+
436
+ # cost_reg = 0
437
+ # cost_cls = 0
438
+
439
+ # loss = cls_loss_func(cls_label,act_cls)
440
+ # cost_cls = loss
441
+
442
+ # epoch_cost_cls+= cost_cls.detach().cpu().numpy()
443
+
444
+ # loss = regress_loss_func(reg_label,act_reg)
445
+ # cost_reg = loss
446
+ # epoch_cost_reg += cost_reg.detach().cpu().numpy()
447
+
448
+ # cost= opt['alpha']*cost_cls +opt['beta']*cost_reg
449
+
450
+ # epoch_cost += cost.detach().cpu().numpy()
451
+
452
+ # act_cls = torch.softmax(act_cls, dim=-1)
453
+
454
+ # total_frames+=input_data.size(0)
455
+
456
+ # for b in range(0,input_data.size(0)):
457
+ # video_name, st, ed, data_idx = dataset.inputs[n_iter*opt['batch_size']+b]
458
+ # output_cls[video_name]+=[act_cls[b,:].detach().cpu().numpy()]
459
+ # output_reg[video_name]+=[act_reg[b,:].detach().cpu().numpy()]
460
+ # labels_cls[video_name]+=[cls_label[b,:].numpy()]
461
+ # labels_reg[video_name]+=[reg_label[b,:].numpy()]
462
+
463
+ # end_time = time.time()
464
+ # working_time = end_time-start_time
465
+
466
+ # for video_name in dataset.video_list:
467
+ # labels_cls[video_name]=np.stack(labels_cls[video_name], axis=0)
468
+ # labels_reg[video_name]=np.stack(labels_reg[video_name], axis=0)
469
+ # output_cls[video_name]=np.stack(output_cls[video_name], axis=0)
470
+ # output_reg[video_name]=np.stack(output_reg[video_name], axis=0)
471
+
472
+ # cls_loss=epoch_cost_cls/n_iter
473
+ # reg_loss=epoch_cost_reg/n_iter
474
+ # tot_loss=epoch_cost/n_iter
475
+
476
+ # return cls_loss, reg_loss, tot_loss, output_cls, output_reg, labels_cls, labels_reg, working_time, total_frames
477
+
478
+
479
+ # def test(opt):
480
+ # model = SuppressNet(opt).cuda()
481
+ # checkpoint = torch.load(opt["checkpoint_path"]+"/" + opt['exp'] + "ckp_best_suppress.pth.tar")
482
+ # base_dict=checkpoint['state_dict']
483
+ # model.load_state_dict(base_dict)
484
+ # model.eval()
485
+
486
+ # dataset = SuppressDataSet(opt,subset=opt['inference_subset'])
487
+
488
+ # test_loader = torch.utils.data.DataLoader(dataset,
489
+ # batch_size=opt['batch_size'], shuffle=False,
490
+ # num_workers=0, pin_memory=True,drop_last=False)
491
+ # labels={}
492
+ # output={}
493
+ # for video_name in dataset.video_list:
494
+ # labels[video_name]=[]
495
+ # output[video_name]=[]
496
+
497
+ # for n_iter,(input_data,label) in enumerate(test_loader):
498
+ # suppress_conf = model(input_data.cuda())
499
+
500
+ # for b in range(0,input_data.size(0)):
501
+ # video_name, idx = dataset.inputs[n_iter*opt['batch_size']+b]
502
+ # output[video_name]+=[suppress_conf[b,:].detach().cpu().numpy()]
503
+ # labels[video_name]+=[label[b,:].numpy()]
504
+
505
+ # for video_name in dataset.video_list:
506
+ # labels[video_name]=np.stack(labels[video_name], axis=0)
507
+ # output[video_name]=np.stack(output[video_name], axis=0)
508
+
509
+ # outfile = h5py.File(opt['suppress_result_file'].format(opt['exp']), 'w')
510
+
511
+ # for video_name in dataset.video_list:
512
+ # o=output[video_name]
513
+ # l=labels[video_name]
514
+
515
+ # dset_pred = outfile.create_dataset(video_name+'/pred', o.shape, maxshape=o.shape, chunks=True, dtype=np.float32)
516
+ # dset_pred[:,:] = o[:,:]
517
+ # dset_label = outfile.create_dataset(video_name+'/label', l.shape, maxshape=l.shape, chunks=True, dtype=np.float32)
518
+ # dset_label[:,:] = l[:,:]
519
+ # outfile.close()
520
+ # print('complete')
521
+
522
+
523
+ # def make_dataset(opt):
524
+
525
+ # model = MYNET(opt).cuda()
526
+ # checkpoint = torch.load(opt["checkpoint_path"]+"/"+opt['exp']+"_ckp_best.pth.tar")
527
+ # base_dict=checkpoint['state_dict']
528
+ # model.load_state_dict(base_dict)
529
+ # model.eval()
530
+
531
+ # # Fix: Set the 'split' key to match 'inference_subset'
532
+ # opt['split'] = opt['inference_subset']
533
+
534
+ # dataset = VideoDataSet(opt,subset=opt['inference_subset'])
535
+
536
+ # _, _, _, output_cls, output_reg, labels_cls, labels_reg, _, _ = eval_frame(opt, model,dataset)
537
+
538
+ # proposal_dict=[]
539
+
540
+ # outfile = h5py.File(opt['suppress_label_file'].format(opt['inference_subset']+'_'+opt['setup']), 'w')
541
+
542
+ # num_class = opt["num_of_class"]-1
543
+ # unit_size = opt['segment_size']
544
+ # threshold=opt['threshold']
545
+ # anchors=opt['anchors']
546
+
547
+ # for video_name in dataset.video_list:
548
+ # duration = dataset.video_len[video_name]
549
+
550
+ # for idx in range(0,duration):
551
+ # cls_anc = output_cls[video_name][idx]
552
+ # reg_anc = output_reg[video_name][idx]
553
+
554
+ # proposal_anc_dict=[]
555
+ # for anc_idx in range(0,len(anchors)):
556
+ # cls = np.argwhere(cls_anc[anc_idx][:-1]>opt['threshold']).reshape(-1)
557
+
558
+ # if len(cls) == 0:
559
+ # continue
560
+
561
+ # ed= idx + anchors[anc_idx] * reg_anc[anc_idx][0]
562
+ # length = anchors[anc_idx]* np.exp(reg_anc[anc_idx][1])
563
+ # st= ed-length
564
+
565
+ # for cidx in range(0,len(cls)):
566
+ # label=cls[cidx]
567
+ # tmp_dict={}
568
+ # tmp_dict["segment"] = [st, ed]
569
+ # tmp_dict["score"]= cls_anc[anc_idx][label]
570
+ # tmp_dict["label"]=label
571
+ # tmp_dict["gentime"]= idx
572
+ # proposal_anc_dict.append(tmp_dict)
573
+
574
+ # proposal_anc_dict = non_max_suppression(proposal_anc_dict, overlapThresh=opt['soft_nms'])
575
+ # proposal_dict+=proposal_anc_dict
576
+
577
+ # nms_dict=non_max_suppression(proposal_dict, overlapThresh=opt['soft_nms'])
578
+
579
+ # input_table = np.zeros((duration,unit_size,num_class), dtype=np.float32)
580
+ # label_table = np.zeros((duration,num_class), dtype=np.float32)
581
+
582
+ # for proposal in proposal_dict:
583
+ # idx = proposal["gentime"]
584
+ # conf = proposal["score"]
585
+ # cls = proposal["label"]
586
+ # for i in range(0,unit_size):
587
+ # if idx+i < duration:
588
+ # input_table[idx+i,unit_size-1-i,cls]=conf
589
+
590
+ # for proposal in nms_dict:
591
+ # idx = proposal["gentime"]
592
+ # cls = proposal["label"]
593
+ # label_table[idx:idx+3,cls]=1
594
+
595
+ # dset_input_table = outfile.create_dataset(video_name+'/input', input_table.shape, maxshape=input_table.shape, chunks=True, dtype=np.float32)
596
+ # dset_label_table = outfile.create_dataset(video_name+'/label', label_table.shape, maxshape=label_table.shape, chunks=True, dtype=np.float32)
597
+
598
+ # dset_input_table[:]=input_table
599
+ # dset_label_table[:]=label_table
600
+
601
+ # proposal_dict=[]
602
+
603
+ # outfile.close() # Added missing close() call
604
+ # print('complete')
605
+ # return
606
+
607
+
608
+ # def main(opt):
609
+ # if opt['mode'] == 'train':
610
+ # train(opt)
611
+ # if opt['mode'] == 'test':
612
+ # test(opt)
613
+ # if opt['mode'] == 'make':
614
+ # make_dataset(opt)
615
+
616
+ # return
617
+
618
+ # if __name__ == '__main__':
619
+ # opt = opts.parse_opt()
620
+ # opt = vars(opt)
621
+ # if not os.path.exists(opt["checkpoint_path"]):
622
+ # os.makedirs(opt["checkpoint_path"])
623
+ # opt_file=open(opt["checkpoint_path"]+"/"+opt['exp']+"_opts.json","w")
624
+ # json.dump(opt,opt_file)
625
+ # opt_file.close()
626
+
627
+ # if opt['seed'] >= 0:
628
+ # seed = opt['seed']
629
+ # torch.manual_seed(seed)
630
+ # np.random.seed(seed)
631
+ # #random.seed(seed)
632
+
633
+ # opt['anchors'] = [int(item) for item in opt['anchors'].split(',')]
634
+
635
+ # main(opt)
636
+ # while(opt['wterm']):
637
+ # pass