ahmad walidurosyad committed on
Commit
9fd445b
·
1 Parent(s): a6408e4
backend/config.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import warnings
from enum import Enum, unique
warnings.filterwarnings('ignore')
import os
import torch
import logging
import platform
import stat
from fsplit.filesplit import Filesplit
import onnxruntime as ort

# Project version number.
VERSION = "1.1.1"
# ×××××××××××××××××××× [do not edit] start ××××××××××××××××××××
logging.disable(logging.DEBUG)  # silence DEBUG-level log output
logging.disable(logging.WARNING)  # silence WARNING-level log output
try:
    import torch_directml
    device = torch_directml.device(torch_directml.default_device())
    USE_DML = True
except Exception:  # was a bare `except:` — don't swallow KeyboardInterrupt/SystemExit
    USE_DML = False
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
LAMA_MODEL_PATH = os.path.join(BASE_DIR, 'models', 'big-lama')
STTN_MODEL_PATH = os.path.join(BASE_DIR, 'models', 'sttn', 'infer_model.pth')
VIDEO_INPAINT_MODEL_PATH = os.path.join(BASE_DIR, 'models', 'video')
MODEL_VERSION = 'V4'
DET_MODEL_BASE = os.path.join(BASE_DIR, 'models')
DET_MODEL_PATH = os.path.join(DET_MODEL_BASE, MODEL_VERSION, 'ch_det')

# The large model weights ship as split chunks; rebuild the merged file on
# first run if it is not already present.
if 'big-lama.pt' not in os.listdir(LAMA_MODEL_PATH):
    Filesplit().merge(input_dir=LAMA_MODEL_PATH)

if 'inference.pdiparams' not in os.listdir(DET_MODEL_PATH):
    Filesplit().merge(input_dir=DET_MODEL_PATH)

if 'ProPainter.pth' not in os.listdir(VIDEO_INPAINT_MODEL_PATH):
    Filesplit().merge(input_dir=VIDEO_INPAINT_MODEL_PATH)

# Choose the bundled ffmpeg binary for the current platform.
sys_str = platform.system()
if sys_str == "Windows":
    ffmpeg_bin = os.path.join('win_x64', 'ffmpeg.exe')
elif sys_str == "Linux":
    ffmpeg_bin = os.path.join('linux_x64', 'ffmpeg')
else:
    ffmpeg_bin = os.path.join('macos', 'ffmpeg')
FFMPEG_PATH = os.path.join(BASE_DIR, 'ffmpeg', ffmpeg_bin)

# NOTE(review): this chunk-merge runs on every platform and assumes the
# ffmpeg/win_x64 directory exists — confirm behavior on non-Windows installs.
if 'ffmpeg.exe' not in os.listdir(os.path.join(BASE_DIR, 'ffmpeg', 'win_x64')):
    Filesplit().merge(input_dir=os.path.join(BASE_DIR, 'ffmpeg', 'win_x64'))
# Make the bundled ffmpeg executable (rwx for user/group/other).
os.chmod(FFMPEG_PATH, stat.S_IRWXU + stat.S_IRWXG + stat.S_IRWXO)
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Collect the usable ONNX Runtime accelerators (DirectML/AMD/Intel/Apple/Nvidia),
# keeping only providers this project knows how to drive; CPU is excluded.
_SUPPORTED_ONNX_PROVIDERS = (
    "DmlExecutionProvider",       # DirectML, Windows GPUs
    "ROCMExecutionProvider",      # AMD ROCm
    "MIGraphXExecutionProvider",  # AMD MIGraphX
    "VitisAIExecutionProvider",   # AMD VitisAI (RyzenAI & Windows); perf similar to DirectML in tests
    "OpenVINOExecutionProvider",  # Intel GPU
    "MetalExecutionProvider",     # Apple macOS
    "CoreMLExecutionProvider",    # Apple macOS
    "CUDAExecutionProvider",      # Nvidia GPU
)
ONNX_PROVIDERS = [
    provider for provider in ort.get_available_providers()
    if provider in _SUPPORTED_ONNX_PROVIDERS
]
# ×××××××××××××××××××× [do not edit] end ××××××××××××××××××××
83
+
84
+
85
@unique
class InpaintMode(Enum):
    """Enumeration of the available image/video inpainting algorithms."""
    STTN = 'sttn'
    LAMA = 'lama'
    PROPAINTER = 'propainter'
    # Stable Diffusion Inpainting
    STABLE_DIFFUSION = 'sd'
    # DiffuEraser (diffusion-based)
    DIFFUERASER = 'diffueraser'
    # Flow-guided video inpainting
    E2FGVI = 'e2fgvi'
96
+
97
+
98
# ×××××××××××××××××××× [editable] start ××××××××××××××××××××
# Whether to encode with h264; enable if the output video must be shared to Android phones.
USE_H264 = True

# ×××××××××× general settings start ××××××××××
"""
MODE selects the inpainting algorithm:
- InpaintMode.STTN: works well on live-action video, fast, can skip subtitle detection
- InpaintMode.LAMA: works well on animation, medium speed, cannot skip subtitle detection
- InpaintMode.PROPAINTER: needs a lot of VRAM, slower, best for videos with very heavy motion
"""
# Default inpainting algorithm: sttn/lama/propainter/sd/diffueraser/e2fgvi
MODE = InpaintMode.STTN

# ×××××××××××××××××××× Stable Diffusion Settings ××××××××××××××××××××
SD_MODEL_PATH = 'backend/models/stable-diffusion-inpainting'
SD_STEPS = 50  # Inference steps
SD_GUIDANCE_SCALE = 7.5  # Classifier-free guidance
SD_PROMPT = "natural scene, high quality"  # Text prompt for guidance
SD_USE_FP16 = True  # Use half precision for faster inference

# ×××××××××××××××××××× DiffuEraser Settings ××××××××××××××××××××
DIFFUERASER_MODEL_PATH = 'backend/models/diffueraser'
DIFFUERASER_STEPS = 50  # Diffusion steps
DIFFUERASER_GUIDANCE = 7.5  # Guidance scale
DIFFUERASER_USE_SAM2 = False  # Auto-masking with SAM2
DIFFUERASER_MAX_LOAD_NUM = 80  # Max frames per batch

# ×××××××××××××××××××× E2FGVI Settings ××××××××××××××××××××
E2FGVI_MODEL_PATH = 'backend/models/e2fgvi'
E2FGVI_MAX_LOAD_NUM = 80  # Max frames per batch
E2FGVI_NEIGHBOR_LENGTH = 10  # Temporal window for flow
# [Pixel-difference threshold]
# Used to reject non-subtitle regions: a subtitle text box is normally wider than it
# is tall, so a box whose height exceeds its width by more than this many pixels is
# treated as a false detection.
THRESHOLD_HEIGHT_WIDTH_DIFFERENCE = 10
# Inflate the mask so an undersized detection box does not leave text edges/residue after inpainting.
SUBTITLE_AREA_DEVIATION_PIXEL = 20
# Two text boxes whose heights differ by at most this many pixels are treated as the same subtitle line.
THRESHOLD_HEIGHT_DIFFERENCE = 20
# Two subtitle boxes are considered the same box when both the X and Y offsets fall within these tolerances.
PIXEL_TOLERANCE_Y = 20  # allowed vertical deviation of a detection box, in pixels
PIXEL_TOLERANCE_X = 20  # allowed horizontal deviation of a detection box, in pixels
# ×××××××××× general settings end ××××××××××

# ×××××××××× InpaintMode.STTN settings start ××××××××××
# The parameters below only take effect when the STTN algorithm is selected.
"""
1. STTN_SKIP_DETECTION
   Meaning: skip subtitle detection entirely.
   Effect: True saves a lot of time, but may touch frames without subtitles or miss some subtitles.

2. STTN_NEIGHBOR_STRIDE
   Meaning: stride between reference frames; e.g. to fill frame 50 with stride 5 the
   algorithm uses frames 45, 40, ... as references.
   Effect: controls reference-frame density — a larger stride means fewer, more
   spread-out references; a smaller stride means more, denser references.

3. STTN_REFERENCE_LENGTH
   Meaning: number of surrounding frames STTN inspects for context around each frame being repaired.
   Effect: larger values raise VRAM usage and quality while lowering speed.

4. STTN_MAX_LOAD_NUM
   Meaning: maximum number of video frames STTN loads at once.
   Effect: larger is slower but produces better results.
   Note: keep STTN_MAX_LOAD_NUM larger than STTN_NEIGHBOR_STRIDE and STTN_REFERENCE_LENGTH.
"""
STTN_SKIP_DETECTION = True
# Reference-frame stride
STTN_NEIGHBOR_STRIDE = 5
# Number of reference frames
STTN_REFERENCE_LENGTH = 10
# Maximum number of frames STTN processes at the same time
STTN_MAX_LOAD_NUM = 50
# NOTE(review): the code below enforces max-load >= stride * reference-length (the
# product), while the doc above only asks for it to exceed each factor — confirm intent.
if STTN_MAX_LOAD_NUM < STTN_REFERENCE_LENGTH * STTN_NEIGHBOR_STRIDE:
    STTN_MAX_LOAD_NUM = STTN_REFERENCE_LENGTH * STTN_NEIGHBOR_STRIDE
# ×××××××××× InpaintMode.STTN settings end ××××××××××

# ×××××××××× InpaintMode.PROPAINTER settings start ××××××××××
# [Tune to your GPU VRAM] Max number of frames processed at once; higher = better results but more VRAM.
# 1280x720 video: 80 frames needs ~25 GB VRAM, 50 frames ~19 GB.
# 720x480 video: 80 frames needs ~8 GB VRAM, 50 frames ~7 GB.
PROPAINTER_MAX_LOAD_NUM = 70
# ×××××××××× InpaintMode.PROPAINTER settings end ××××××××××

# ×××××××××× InpaintMode.LAMA settings start ××××××××××
# Turbo mode: much faster but inpainting quality is not guaranteed; only removes the regions containing text.
LAMA_SUPER_FAST = False
# ×××××××××× InpaintMode.LAMA settings end ××××××××××
# ×××××××××××××××××××× [editable] end ××××××××××××××××××××
backend/scenedetect/detectors/motion_detector.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # PySceneDetect: Python-Based Video Scene Detector
4
+ # -------------------------------------------------------------------
5
+ # [ Site: https://scenedetect.com ]
6
+ # [ Docs: https://scenedetect.com/docs/ ]
7
+ # [ Github: https://github.com/Breakthrough/PySceneDetect/ ]
8
+ #
9
+ # Copyright (C) 2014-2023 Brandon Castellano <http://www.bcastell.com>.
10
+ # PySceneDetect is licensed under the BSD 3-Clause License; see the
11
+ # included LICENSE file, or visit one of the above pages for details.
12
+ #
13
+ """:class:`MotionDetector`, detects motion events using background subtraction, morphological
14
+ transforms, and thresholding."""
15
+
16
+ # Third-Party Library Imports
17
+ import cv2
18
+
19
+ # PySceneDetect Library Imports
20
+ from scenedetect.scene_detector import SparseSceneDetector
21
+
22
+
23
class MotionDetector(SparseSceneDetector):
    """Detects motion events in scenes containing a static background.

    Uses background subtraction followed by noise removal (via morphological
    opening) to generate a frame score compared against the set threshold.

    NOTE(review): this detector has not been ported to the v0.5 API —
    ``__init__`` raises ``NotImplementedError`` and the pre-port implementation
    is preserved below inside string literals for reference only.

    Attributes:
        threshold: floating point value compared to each frame's score, which
            represents average intensity change per pixel (lower values are
            more sensitive to motion changes). Default 0.5, must be > 0.0.
        num_frames_post_scene: Number of frames to include in each motion
            event after the frame score falls below the threshold, adding any
            subsequent motion events to the same scene.
        kernel_size: Size of morphological opening kernel for noise removal.
            Setting to -1 (default) will auto-compute based on video resolution
            (typically 3 for SD, 5-7 for HD). Must be an odd integer > 1.
    """

    def __init__(self, threshold=0.50, num_frames_post_scene=30, kernel_size=-1):
        """Initializes motion-based scene detector object."""
        # TODO: Requires porting to v0.5 API.
        raise NotImplementedError()
        """
        self.threshold = float(threshold)
        self.num_frames_post_scene = int(num_frames_post_scene)

        self.kernel_size = int(kernel_size)
        if self.kernel_size < 0:
            # Set kernel size when process_frame first runs based on
            # video resolution (480p = 3x3, 720p = 5x5, 1080p = 7x7).
            pass

        self.bg_subtractor = cv2.createBackgroundSubtractorMOG2(
            detectShadows = False )

        self.last_frame_score = 0.0

        self.in_motion_event = False
        self.first_motion_frame_index = -1
        self.last_motion_frame_index = -1
        """

    def process_frame(self, frame_num, frame_img):
        # TODO. Unported — always reports no motion events for now.
        """
        frame_grayscale = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        masked_frame = self.bg_subtractor.apply(frame_grayscale)

        kernel = numpy.ones((self.kernel_size, self.kernel_size), numpy.uint8)
        filtered_frame = cv2.morphologyEx(fgmask, cv2.MORPH_OPEN, kernel)

        frame_score = numpy.sum(filtered_frame) / float(
            filtered_frame.shape[0] * filtered_frame.shape[1] )
        """
        return []

    def post_process(self, frame_num):
        """Writes the last scene if the video ends while in a motion event.
        """

        # If the last fade detected was a fade out, we add a corresponding new
        # scene break to indicate the end of the scene. This is only done for
        # fade-outs, as a scene cut is already added when a fade-in is found.
        """
        if self.in_motion_event:
            # Write new scene based on first and last motion event frames.
            pass
        return self.in_motion_event
        """
        return []
backend/scenedetect/detectors/threshold_detector.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # PySceneDetect: Python-Based Video Scene Detector
4
+ # -------------------------------------------------------------------
5
+ # [ Site: https://scenedetect.com ]
6
+ # [ Docs: https://scenedetect.com/docs/ ]
7
+ # [ Github: https://github.com/Breakthrough/PySceneDetect/ ]
8
+ #
9
+ # Copyright (C) 2014-2023 Brandon Castellano <http://www.bcastell.com>.
10
+ # PySceneDetect is licensed under the BSD 3-Clause License; see the
11
+ # included LICENSE file, or visit one of the above pages for details.
12
+ #
13
+ """:class:`ThresholdDetector` uses a set intensity as a threshold to scene_detect cuts, which are
14
+ triggered when the average pixel intensity exceeds or falls below this threshold.
15
+
16
+ This detector is available from the command-line as the `scene_detect-threshold` command.
17
+ """
18
+
19
+ from enum import Enum
20
+ from logging import getLogger
21
+ from typing import List, Optional
22
+
23
+ import numpy
24
+
25
+ from backend.scenedetect.scene_detector import SceneDetector
26
+
27
+ logger = getLogger('pyscenedetect')
28
+
29
+ ##
30
+ ## ThresholdDetector Helper Functions
31
+ ##
32
+
33
+
34
+ def _compute_frame_average(frame: numpy.ndarray) -> float:
35
+ """Computes the average pixel value/intensity for all pixels in a frame.
36
+
37
+ The value is computed by adding up the 8-bit R, G, and B values for
38
+ each pixel, and dividing by the number of pixels multiplied by 3.
39
+
40
+ Arguments:
41
+ frame: Frame representing the RGB pixels to average.
42
+
43
+ Returns:
44
+ Average pixel intensity across all 3 channels of `frame`
45
+ """
46
+ num_pixel_values = float(frame.shape[0] * frame.shape[1] * frame.shape[2])
47
+ avg_pixel_value = numpy.sum(frame[:, :, :]) / num_pixel_values
48
+ return avg_pixel_value
49
+
50
+
51
+ ##
52
+ ## ThresholdDetector Class Implementation
53
+ ##
54
+
55
+
56
class ThresholdDetector(SceneDetector):
    """Detects fast cuts/slow fades in from and out to a given threshold level.

    Detects both fast cuts and slow fades so long as an appropriate threshold
    is chosen (especially taking into account the minimum grey/black level).
    """

    class Method(Enum):
        """Method for ThresholdDetector to use when comparing frame brightness to the threshold."""
        FLOOR = 0
        """Fade out happens when frame brightness falls below threshold."""
        CEILING = 1
        """Fade out happens when frame brightness rises above threshold."""

    # Metric key under which the per-frame average RGB value is stored.
    THRESHOLD_VALUE_KEY = 'average_rgb'

    def __init__(
        self,
        threshold: float = 12,
        min_scene_len: int = 15,
        fade_bias: float = 0.0,
        add_final_scene: bool = False,
        method: Method = Method.FLOOR,
        block_size=None,
    ):
        """
        Arguments:
            threshold: 8-bit intensity value that each pixel value (R, G, and B)
                must be <= to in order to trigger a fade in/out.
            min_scene_len: FrameTimecode object or integer greater than 0 of the
                minimum length, in frames, of a scene (or subsequent scene cut).
            fade_bias: Float between -1.0 and +1.0 representing the percentage of
                timecode skew for the start of a scene (-1.0 causing a cut at the
                fade-to-black, 0.0 in the middle, and +1.0 causing the cut to be
                right at the position where the threshold is passed).
            add_final_scene: Boolean indicating if the video ends on a fade-out to
                generate an additional scene at this timecode.
            method: How to treat `threshold` when detecting fade events.
            block_size: [DEPRECATED] DO NOT USE. For backwards compatibility.
        """
        # TODO(v0.7): Replace with DeprecationWarning that `block_size` will be removed in v0.8.
        if block_size is not None:
            logger.error('block_size is deprecated.')

        super().__init__()
        self.threshold = int(threshold)
        self.method = ThresholdDetector.Method(method)
        self.fade_bias = fade_bias
        self.min_scene_len = min_scene_len
        self.processed_frame = False
        self.last_scene_cut = None
        # Whether to add an additional scene or not when ending on a fade out
        # (as cuts are only added on fade ins; see post_process() for details).
        self.add_final_scene = add_final_scene
        # Where the last fade (threshold crossing) was detected.
        self.last_fade = {
            'frame': 0,   # frame number where the last detected fade is
            'type': None  # type of fade, can be either 'in' or 'out'
        }
        self._metric_keys = [ThresholdDetector.THRESHOLD_VALUE_KEY]

    def get_metrics(self) -> List[str]:
        # List of metric keys this detector writes to the stats manager.
        return self._metric_keys

    def process_frame(self, frame_num: int, frame_img: Optional[numpy.ndarray]) -> List[int]:
        """
        Args:
            frame_num (int): Frame number of frame that is being passed.
            frame_img (numpy.ndarray or None): Decoded frame image (numpy.ndarray) to perform
                scene detection with. Can be None *only* if the self.is_processing_required()
                method (inherited from the base SceneDetector class) returns True.
        Returns:
            List[int]: List of frames where scene cuts have been detected. There may be 0
            or more frames in the list, and not necessarily the same as frame_num.
        """

        # Initialize last scene cut point at the beginning of the frames of interest.
        if self.last_scene_cut is None:
            self.last_scene_cut = frame_num

        # Compare the # of pixels under threshold in current_frame & last_frame.
        # If absolute value of pixel intensity delta is above the threshold,
        # then we trigger a new scene cut/break.

        # List of cuts to return.
        cut_list = []

        # The metric used here to detect scene breaks is the percent of pixels
        # less than or equal to the threshold; however, since this differs on
        # user-supplied values, we supply the average pixel intensity as this
        # frame metric instead (to assist with manually selecting a threshold)
        if (self.stats_manager is not None) and (self.stats_manager.metrics_exist(
                frame_num, self._metric_keys)):
            frame_avg = self.stats_manager.get_metrics(frame_num, self._metric_keys)[0]
        else:
            frame_avg = _compute_frame_average(frame_img)
            if self.stats_manager is not None:
                self.stats_manager.set_metrics(frame_num, {self._metric_keys[0]: frame_avg})

        if self.processed_frame:
            if self.last_fade['type'] == 'in' and ((
                    (self.method == ThresholdDetector.Method.FLOOR and frame_avg < self.threshold) or
                    (self.method == ThresholdDetector.Method.CEILING and frame_avg >= self.threshold))):
                # Just faded out of a scene, wait for next fade in.
                self.last_fade['type'] = 'out'
                self.last_fade['frame'] = frame_num

            elif self.last_fade['type'] == 'out' and (
                    (self.method == ThresholdDetector.Method.FLOOR and frame_avg >= self.threshold) or
                    (self.method == ThresholdDetector.Method.CEILING and frame_avg < self.threshold)):
                # Only add the scene if min_scene_len frames have passed.
                if (frame_num - self.last_scene_cut) >= self.min_scene_len:
                    # Just faded into a new scene, compute timecode for the scene
                    # split based on the fade bias.
                    f_out = self.last_fade['frame']
                    f_split = int(
                        (frame_num + f_out + int(self.fade_bias * (frame_num - f_out))) / 2)
                    cut_list.append(f_split)
                    self.last_scene_cut = frame_num
                self.last_fade['type'] = 'in'
                self.last_fade['frame'] = frame_num
        else:
            # First processed frame: classify it as already faded out or in.
            self.last_fade['frame'] = 0
            if frame_avg < self.threshold:
                self.last_fade['type'] = 'out'
            else:
                self.last_fade['type'] = 'in'
        self.processed_frame = True
        return cut_list

    def post_process(self, frame_num: int):
        """Writes a final scene cut if the last detected fade was a fade-out.

        Only writes the scene cut if add_final_scene is true, and the last fade
        that was detected was a fade-out. There is no bias applied to this cut
        (since there is no corresponding fade-in) so it will be located at the
        exact frame where the fade-out crossed the detection threshold.
        """

        # If the last fade detected was a fade out, we add a corresponding new
        # scene break to indicate the end of the scene. This is only done for
        # fade-outs, as a scene cut is already added when a fade-in is found.
        cut_times = []
        if self.last_fade['type'] == 'out' and self.add_final_scene and (
                (self.last_scene_cut is None and frame_num >= self.min_scene_len) or
                (frame_num - self.last_scene_cut) >= self.min_scene_len):
            cut_times.append(self.last_fade['frame'])
        return cut_times
backend/tools/common_tools.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
# Known video file extensions (lower-case, with leading dot).
video_extensions = {
    '.mp4', '.m4a', '.m4v', '.f4v', '.f4a', '.m4b', '.m4r', '.f4b', '.mov',
    '.3gp', '.3gp2', '.3g2', '.3gpp', '.3gpp2', '.ogg', '.oga', '.ogv', '.ogx',
    '.wmv', '.wma', '.asf', '.webm', '.flv', '.avi', '.gifv', '.mkv', '.rm',
    '.rmvb', '.vob', '.dvd', '.mpg', '.mpeg', '.mp2', '.mpe', '.mpv', '.mpg',
    '.mpeg', '.m2v', '.svi', '.3gp', '.mxf', '.roq', '.nsv', '.flv', '.f4v',
    '.f4p', '.f4a', '.f4b'
}

# Known image file extensions (lower-case, with leading dot).
image_extensions = {
    '.jpg', '.jpeg', '.jpe', '.jif', '.jfif', '.jfi', '.png', '.gif',
    '.webp', '.tiff', '.tif', '.psd', '.raw', '.arw', '.cr2', '.nrw',
    '.k25', '.bmp', '.dib', '.heif', '.heic', '.ind', '.indd', '.indt',
    '.jp2', '.j2k', '.jpf', '.jpx', '.jpm', '.mj2', '.svg', '.svgz',
    '.ai', '.eps', '.ico'
}


def _extension_of(filename):
    """Return the lower-cased extension of *filename*, including the dot."""
    return os.path.splitext(filename)[-1].lower()


def is_video_file(filename):
    """True when *filename* carries a known video extension (case-insensitive)."""
    return _extension_of(filename) in video_extensions


def is_image_file(filename):
    """True when *filename* carries a known image extension (case-insensitive)."""
    return _extension_of(filename) in image_extensions


def is_video_or_image(filename):
    """True when *filename* looks like either a video or an image file."""
    extension = _extension_of(filename)
    return extension in video_extensions or extension in image_extensions
backend/tools/inpaint_tools.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import multiprocessing
2
+ import cv2
3
+ import numpy as np
4
+
5
+ from backend import config
6
+ from backend.inpaint.lama_inpaint import LamaInpaint
7
+
8
+
9
+ def batch_generator(data, max_batch_size):
10
+ """
11
+ 根据data大小,生成最大长度不超过max_batch_size的均匀批次数据
12
+ """
13
+ n_samples = len(data)
14
+ # 尝试找到一个比MAX_BATCH_SIZE小的batch_size,以使得所有的批次数量尽量接近
15
+ batch_size = max_batch_size
16
+ num_batches = n_samples // batch_size
17
+
18
+ # 处理最后一批可能不足batch_size的情况
19
+ # 如果最后一批少于其他批次,则减小batch_size尝试平衡每批的数量
20
+ while n_samples % batch_size < batch_size / 2.0 and batch_size > 1:
21
+ batch_size -= 1 # 减小批次大小
22
+ num_batches = n_samples // batch_size
23
+
24
+ # 生成前num_batches个批次
25
+ for i in range(num_batches):
26
+ yield data[i * batch_size:(i + 1) * batch_size]
27
+
28
+ # 将剩余的数据作为最后一个批次
29
+ last_batch_start = num_batches * batch_size
30
+ if last_batch_start < n_samples:
31
+ yield data[last_batch_start:]
32
+
33
+
34
def inference_task(batch_data):
    """Inpaint one batch of frames.

    Each item of *batch_data* is (frame_index, frame, subtitle_boxes); the
    result maps frame_index -> inpainted frame.
    """
    results = dict()
    for frame_index, frame, boxes in batch_data:
        mask = create_mask(frame.shape[:2], boxes)
        results[frame_index] = inpaint(frame, mask)
    return results
43
+
44
+
45
def parallel_inference(inputs, batch_size=None, pool_size=None):
    """Run inpainting inference in parallel while preserving input order.

    Args:
        inputs: list of (frame_index, frame, subtitle_boxes) tuples.
        batch_size: max frames per worker batch; derived from the pool size
            when omitted (previously passing None crashed batch_generator).
        pool_size: number of worker processes, defaults to the CPU count.

    Returns:
        List of (frame_index, inpainted_frame) pairs in input order.
    """
    if pool_size is None:
        pool_size = multiprocessing.cpu_count()
    if batch_size is None:
        # Spread the work roughly evenly across the pool.
        batch_size = max(1, len(inputs) // pool_size) if inputs else 1
    # Context manager tears the pool down automatically.
    with multiprocessing.Pool(processes=pool_size) as pool:
        batched_inputs = list(batch_generator(inputs, batch_size))
        # map() guarantees results come back in submission order.
        batch_results = pool.map(inference_task, batched_inputs)
    # Each batch result is a dict {index: frame}. Flatten to (index, frame)
    # pairs — iterating the dicts directly (as before) yielded only the keys
    # and silently dropped every inpainted frame.
    return [pair for result in batch_results for pair in result.items()]
59
+
60
+
61
def inpaint(img, mask):
    """Inpaint the masked region of *img* using the LAMA model.

    NOTE(review): a fresh LamaInpaint instance is constructed on every call —
    if model loading is expensive, consider caching one per process.
    """
    lama_inpaint_instance = LamaInpaint()
    img_inpainted = lama_inpaint_instance(img, mask)
    return img_inpainted
65
+
66
+
67
def inpaint_with_multiple_masks(censored_img, mask_list):
    """Apply LAMA inpainting once per mask, feeding each result into the next pass."""
    result = censored_img
    for mask in (mask_list or ()):
        result = inpaint(result, mask)
    return result
73
+
74
+
75
def create_mask(size, coords_list):
    """Build a binary (0/255) uint8 mask of shape *size* covering the given boxes.

    Each entry of *coords_list* is (xmin, xmax, ymin, ymax); every box is
    padded by config.SUBTITLE_AREA_DEVIATION_PIXEL on all sides so undersized
    detections do not leave text residue after inpainting.
    """
    mask = np.zeros(size, dtype="uint8")
    if not coords_list:
        return mask
    pad = config.SUBTITLE_AREA_DEVIATION_PIXEL
    for xmin, xmax, ymin, ymax in coords_list:
        # Clamp the padded top-left corner so it never leaves the image.
        top_left = (max(xmin - pad, 0), max(ymin - pad, 0))
        bottom_right = (xmax + pad, ymax + pad)
        cv2.rectangle(mask, top_left, bottom_right, (255, 255, 255), thickness=-1)
    return mask
92
+
93
+
94
def inpaint_video(video_path, sub_list):
    """Debug utility: inpaint the subtitled frames of *video_path* and dump PNGs.

    Args:
        video_path: path of the input video.
        sub_list: dict mapping 1-based frame index -> subtitle box list.

    NOTE(review): the output directory is hardcoded to a developer machine path.
    """
    def _flush(batch):
        # Run the pending batch through the worker pool and write results to disk.
        for frame_index, frame in parallel_inference(batch):
            file_name = f'/home/yao/Documents/Project/video-subtitle-remover/test/temp/{frame_index}.png'
            cv2.imwrite(file_name, frame)
            print(f"success write: {file_name}")
        batch.clear()

    index = 0
    frame_to_inpaint_list = []
    video_cap = cv2.VideoCapture(video_path)
    try:
        while True:
            ret, frame = video_cap.read()
            if not ret:
                break
            index += 1
            if index in sub_list.keys():
                frame_to_inpaint_list.append((index, frame, sub_list[index]))
            if len(frame_to_inpaint_list) > config.PROPAINTER_MAX_LOAD_NUM:
                _flush(frame_to_inpaint_list)
        # Bug fix: frames buffered after the last full batch were previously
        # dropped — flush the tail before finishing.
        if frame_to_inpaint_list:
            _flush(frame_to_inpaint_list)
    finally:
        video_cap.release()  # previously leaked the capture handle
    print('finished')
114
+
115
+
116
if __name__ == '__main__':
    # NOTE(review): presumably "spawn" is required so worker processes start
    # cleanly (e.g. with CUDA contexts) — confirm against the worker code.
    multiprocessing.set_start_method("spawn")
backend/tools/makedist.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from qpt.executor import CreateExecutableModule as CEM
4
+ from qpt.modules.cuda import CopyCUDAPackage
5
+ from qpt.smart_opt import set_default_pip_source
6
+ from qpt.kernel.qinterpreter import PYPI_PIP_SOURCE
7
+ from qpt.modules.package import CustomPackage, DEFAULT_DEPLOY_MODE
8
+
9
+
10
+
11
def main():
    """Build a distributable QPT package for the GUI, optionally bundling
    CUDA- or DirectML-enabled torch wheels selected via command-line flags."""
    WORK_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    LAUNCH_PATH = os.path.join(WORK_DIR, 'gui.py')
    SAVE_PATH = os.path.join(os.path.dirname(WORK_DIR), 'vsr_out')
    ICON_PATH = os.path.join(WORK_DIR, "design", "vsr.ico")

    # Parse command-line arguments.
    parser = argparse.ArgumentParser(description="打包程序")
    parser.add_argument(
        "--cuda",
        nargs="?",        # value is optional
        const="11.8",     # bare `--cuda` defaults to 11.8 (comment previously said 10.2)
        default=None,     # omitted entirely -> None
        help="是否包含CUDA模块,可指定版本,如 --cuda 或 --cuda=11.8"
    )
    parser.add_argument(
        "--directml",
        nargs="?",        # value is optional
        const=True,       # bare `--directml` enables it
        default=None,     # omitted entirely -> None
        help="是否使用DirectML加速,仅指定 --directml 即可启用"
    )

    args = parser.parse_args()

    sub_modules = []

    # Pin torch/torchvision to the wheel index matching the requested CUDA version.
    if args.cuda == "11.8":
        sub_modules.append(CustomPackage("torch==2.7.0 torchvision==0.22.0", deploy_mode=DEFAULT_DEPLOY_MODE, find_links=PYPI_PIP_SOURCE, opts="--index-url https://download.pytorch.org/whl/cu118 "))
    elif args.cuda == "12.6":
        sub_modules.append(CustomPackage("torch==2.7.0 torchvision==0.22.0", deploy_mode=DEFAULT_DEPLOY_MODE, find_links=PYPI_PIP_SOURCE, opts="--index-url https://download.pytorch.org/whl/cu126 "))
    elif args.cuda == "12.8":
        sub_modules.append(CustomPackage("torch==2.7.0 torchvision==0.22.0", deploy_mode=DEFAULT_DEPLOY_MODE, find_links=PYPI_PIP_SOURCE, opts="--index-url https://download.pytorch.org/whl/cu128 "))

    if args.directml:
        sub_modules.append(CustomPackage("torch_directml==0.2.5.dev240914", deploy_mode=DEFAULT_DEPLOY_MODE))

    # In CI (QPT_Action set) force the public PyPI source.
    if os.getenv("QPT_Action") == "True":
        set_default_pip_source(PYPI_PIP_SOURCE)

    module = CEM(
        work_dir=WORK_DIR,
        launcher_py_path=LAUNCH_PATH,
        save_path=SAVE_PATH,
        icon=ICON_PATH,
        hidden_terminal=False,
        requirements_file="./requirements.txt",
        sub_modules=sub_modules,
    )

    module.make()
62
+
63
+
64
if __name__ == '__main__':
    # Script entry point.
    main()
backend/tools/merge_video.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+
3
+
4
+ def merge_video(video_input_path0, video_input_path1, video_output_path):
5
+ """
6
+ 将两个视频文件安装水平方向合并
7
+ """
8
+ input_video_cap0 = cv2.VideoCapture(video_input_path0)
9
+ input_video_cap1 = cv2.VideoCapture(video_input_path1)
10
+ fps = input_video_cap1.get(cv2.CAP_PROP_FPS)
11
+ size = (int(input_video_cap1.get(cv2.CAP_PROP_FRAME_WIDTH)), int(input_video_cap1.get(cv2.CAP_PROP_FRAME_HEIGHT)) * 2)
12
+ video_writer = cv2.VideoWriter(video_output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)
13
+ while True:
14
+ ret0, frame0 = input_video_cap0.read()
15
+ ret1, frame1 = input_video_cap1.read()
16
+ if not ret1 and not ret0:
17
+ break
18
+ else:
19
+ show = cv2.vconcat([frame0, frame1])
20
+ video_writer.write(show)
21
+ video_writer.release()
22
+
23
+
24
if __name__ == '__main__':
    v0_path = '../../test/test4.mp4'
    v1_path = '../../test/test4_no_sub(1).mp4'
    video_out_path = '../../test/demo.mp4'
    merge_video(v0_path, v1_path, video_out_path)
    # ffmpeg recipe: convert mp4 to gif
    # ffmpeg -i demo3.mp4 -vf "scale=w=720:h=-1,fps=15,split[s0][s1];[s0]palettegen[p];[s1][p]paletteuse" -loop 0 -r 15 -f gif output.gif
    # Fixed width of 400 with proportional height:
    # ffmpeg - i input.avi -vf scale=400:-2
backend/tools/train/dataset_sttn.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import random
4
+ import torch
5
+ import torchvision.transforms as transforms
6
+ from torch.utils.data import DataLoader
7
+ from backend.tools.train.utils_sttn import ZipReader, create_random_shape_with_random_motion
8
+ from backend.tools.train.utils_sttn import Stack, ToTorchFormatTensor, GroupRandomHorizontalFlip
9
+
10
+
11
# Custom dataset serving (frames, masks) tensor pairs for STTN training.
class Dataset(torch.utils.data.Dataset):
    """Video dataset: frames read from per-video zip archives, paired with
    randomly generated moving masks."""

    def __init__(self, args: dict, split='train', debug=False):
        """
        Args:
            args: config dict; uses 'sample_length', 'w', 'h', 'data_root', 'name'.
            split: dataset split name ('train' by default).
            debug: when True, restrict to the first 100 videos.
        """
        self.args = args
        self.split = split
        self.sample_length = args['sample_length']  # frames per sample
        self.size = self.w, self.h = (args['w'], args['h'])  # target frame size

        # The split json maps video name -> frame count.
        with open(os.path.join(args['data_root'], args['name'], split+'.json'), 'r') as f:
            self.video_dict = json.load(f)
        self.video_names = list(self.video_dict.keys())
        if debug or split != 'train':
            # Keep evaluation/debug runs small.
            self.video_names = self.video_names[:100]

        # Convert a list of PIL images into a stacked torch tensor.
        self._to_tensors = transforms.Compose([
            Stack(),
            ToTorchFormatTensor(),
        ])

    def __len__(self):
        # Number of videos in this split.
        return len(self.video_names)

    def __getitem__(self, index):
        try:
            item = self.load_item(index)
        except Exception:  # was a bare except: don't swallow KeyboardInterrupt/SystemExit
            print('Loading error in video {}'.format(self.video_names[index]))
            item = self.load_item(0)  # fall back to the first sample
        return item

    def load_item(self, index):
        """Load one sample: (frame_tensors, mask_tensors)."""
        video_name = self.video_names[index]
        # Frame file names for every frame of the video.
        all_frames = [f"{str(i).zfill(5)}.jpg" for i in range(self.video_dict[video_name])]
        # Random moving masks, one per frame.
        all_masks = create_random_shape_with_random_motion(
            len(all_frames), imageHeight=self.h, imageWidth=self.w)
        # Pick the reference frame indexes for this sample.
        ref_index = get_ref_index(len(all_frames), self.sample_length)
        frames = []
        masks = []
        for idx in ref_index:
            # Read the frame from the zip archive, convert to RGB and resize.
            img = ZipReader.imread('{}/{}/JPEGImages/{}.zip'.format(
                self.args['data_root'], self.args['name'], video_name), all_frames[idx]).convert('RGB')
            img = img.resize(self.size)
            frames.append(img)
            masks.append(all_masks[idx])
        if self.split == 'train':
            # Random horizontal flip augmentation (applied to the whole group).
            frames = GroupRandomHorizontalFlip()(frames)
        frame_tensors = self._to_tensors(frames)*2.0 - 1.0  # rescale to [-1, 1]
        mask_tensors = self._to_tensors(masks)
        return frame_tensors, mask_tensors
73
+
74
+
75
def get_ref_index(length, sample_length):
    """Pick *sample_length* sorted frame indexes out of *length* frames.

    Half of the time the indexes are drawn at random; otherwise a contiguous
    window with a random start position is used.
    """
    use_scattered_frames = random.uniform(0, 1) > 0.5
    if use_scattered_frames:
        # Randomly scattered frames, sorted to keep temporal order.
        chosen = sorted(random.sample(range(length), sample_length))
    else:
        # A contiguous run of frames starting at a random pivot.
        start = random.randint(0, length - sample_length)
        chosen = [start + offset for offset in range(sample_length)]
    return chosen
backend/tools/train/loss_sttn.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class AdversarialLoss(nn.Module):
    """Adversarial loss for GAN training.

    Implements the objectives compared in https://arxiv.org/abs/1711.10337.
    Supported variants: 'nsgan' | 'lsgan' | 'hinge'.
    """

    def __init__(self, type='nsgan', target_real_label=1.0, target_fake_label=0.0):
        """
        type: which GAN loss to use ('nsgan' | 'lsgan' | 'hinge').
        target_real_label: target label value for real samples.
        target_fake_label: target label value for generated samples.

        Raises:
            ValueError: if ``type`` is not one of the supported variants.
        """
        super(AdversarialLoss, self).__init__()
        self.type = type  # which loss variant is active
        # Register the labels as buffers so they are saved/loaded with the
        # module and moved along with it by .to()/.cuda().
        self.register_buffer('real_label', torch.tensor(target_real_label))
        self.register_buffer('fake_label', torch.tensor(target_fake_label))

        # Pick the underlying criterion for the chosen variant.
        if type == 'nsgan':
            self.criterion = nn.BCELoss()  # binary cross entropy (non-saturating GAN)
        elif type == 'lsgan':
            self.criterion = nn.MSELoss()  # mean squared error (least-squares GAN)
        elif type == 'hinge':
            self.criterion = nn.ReLU()  # ReLU used to build the hinge loss
        else:
            # Fail fast instead of raising AttributeError on first use.
            raise ValueError("Unsupported GAN loss type: {!r}".format(type))

    def forward(self, outputs, is_real, is_disc=None):
        """Compute the loss.

        outputs: network (discriminator) outputs.
        is_real: True for real samples, False for generated ones.
        is_disc: True when optimizing the discriminator (hinge only).

        NOTE: defined as ``forward`` (instead of overriding ``__call__``) so
        nn.Module's hook machinery works; calling the instance behaves the same.
        """
        if self.type == 'hinge':
            if is_disc:
                # Discriminator hinge: max(0, 1 - out) for real, max(0, 1 + out) for fake.
                if is_real:
                    outputs = -outputs  # flip sign for real samples
                return self.criterion(1 + outputs).mean()
            else:
                # Generator hinge: -min(0, -out) = maximize discriminator output.
                return (-outputs).mean()
        else:
            # nsgan / lsgan: compare outputs against the real/fake target labels.
            labels = (self.real_label if is_real else self.fake_label).expand_as(
                outputs)
            loss = self.criterion(outputs, labels)
            return loss
backend/tools/train/train_sttn.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ from shutil import copyfile
5
+ import torch
6
+ import torch.multiprocessing as mp
7
+
8
+ from backend.tools.train.trainer_sttn import Trainer
9
+ from backend.tools.train.utils_sttn import (
10
+ get_world_size,
11
+ get_local_rank,
12
+ get_global_rank,
13
+ get_master_ip,
14
+ )
15
+
16
# Command-line interface: config path, model name, DDP rendezvous port, and an
# --exam flag that switches the trainer into quick debug mode.
parser = argparse.ArgumentParser(description='STTN')
parser.add_argument('-c', '--config', default='configs_sttn/youtube-vos.json', type=str)
parser.add_argument('-m', '--model', default='sttn', type=str)
parser.add_argument('-p', '--port', default='23455', type=str)
parser.add_argument('-e', '--exam', action='store_true')
args = parser.parse_args()
22
+
23
+
24
def main_worker(rank, config):
    """Per-process training entry point.

    rank: process index assigned by mp.spawn, or -1 when launched externally.
    config: configuration dict prepared by the ``__main__`` block.
    """
    # If no local_rank was provided, use the spawned rank for both local and
    # global rank (single-node mp.spawn case).
    if 'local_rank' not in config:
        config['local_rank'] = config['global_rank'] = rank

    # Distributed training setup.
    if config['distributed']:
        # Pin this process to the GPU matching its local rank.
        torch.cuda.set_device(int(config['local_rank']))
        # Join the process group through the NCCL backend.
        torch.distributed.init_process_group(
            backend='nccl',
            init_method=config['init_method'],
            world_size=config['world_size'],
            rank=config['global_rank'],
            group_name='mtorch'
        )
        # Report which GPU this process uses (global-local rank pair).
        print('using GPU {}-{} for training'.format(
            int(config['global_rank']), int(config['local_rank']))
        )

    # Checkpoint directory: <save_dir>/<model>_<config file stem>.
    config['save_dir'] = os.path.join(
        config['save_dir'], '{}_{}'.format(config['model'], os.path.basename(args.config).split('.')[0])
    )

    # Pick the compute device for this process.
    if torch.cuda.is_available():
        config['device'] = torch.device("cuda:{}".format(config['local_rank']))
    else:
        config['device'] = 'cpu'

    # Only the master process (or a non-distributed run) touches the filesystem.
    if (not config['distributed']) or config['global_rank'] == 0:
        # Create the checkpoint directory (ignore if it already exists).
        os.makedirs(config['save_dir'], exist_ok=True)
        # Keep a copy of the config file next to the checkpoints.
        config_path = os.path.join(
            config['save_dir'], config['config'].split('/')[-1]
        )
        if not os.path.isfile(config_path):
            copyfile(config['config'], config_path)
        print('[**] create folder {}'.format(config['save_dir']))

    # Build the trainer (debug mode when --exam was given) and run it.
    trainer = Trainer(config, debug=args.exam)
    trainer.train()
75
+
76
+
77
+ if __name__ == "__main__":
78
+ # 加载配置文件
79
+ config = json.load(open(args.config))
80
+ config['model'] = args.model # 设置模型名称
81
+ config['config'] = args.config # 设置配置文件路径
82
+
83
+ # 设置分布式训练的相关配置
84
+ config['world_size'] = get_world_size() # 获取全局进程数,即训练过程中参与计算的总GPU数量
85
+ config['init_method'] = f"tcp://{get_master_ip()}:{args.port}" # 设置初始化方法,包括主节点IP和端口
86
+ config['distributed'] = True if config['world_size'] > 1 else False # 根据世界规模确定是否启用分布式训练
87
+
88
+ # 设置分布式并行训练环境
89
+ if get_master_ip() == "127.0.0.1":
90
+ # 如果主节点IP是本机地址,那么手动启动多个分布式训练进程
91
+ mp.spawn(main_worker, nprocs=config['world_size'], args=(config,))
92
+ else:
93
+ # 如果是由其他工具如OpenMPI启动的多个进程,不需手动创建进程。
94
+ config['local_rank'] = get_local_rank() # 获取本地(单个节点)排名
95
+ config['global_rank'] = get_global_rank() # 获取全局排名
96
+ main_worker(-1, config) # 启动主工作函数
backend/tools/train/trainer_sttn.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ from tqdm import tqdm
4
+ import torch
5
+ import torch.nn as nn
6
+ from torch.utils.data import DataLoader
7
+ from torch.utils.data.distributed import DistributedSampler
8
+ from torch.nn.parallel import DistributedDataParallel as DDP
9
+ from tensorboardX import SummaryWriter
10
+
11
+ from backend.inpaint.sttn.auto_sttn import Discriminator
12
+ from backend.inpaint.sttn.auto_sttn import InpaintGenerator
13
+ from backend.tools.train.dataset_sttn import Dataset
14
+ from backend.tools.train.loss_sttn import AdversarialLoss
15
+
16
+
17
class Trainer:
    """GAN training driver for the STTN video-inpainting model.

    Wires together the dataset/dataloader, generator and discriminator,
    their Adam optimizers, adversarial + L1 losses, optional
    DistributedDataParallel wrapping, TensorBoard logging, and checkpoint
    save/load; ``train()`` runs the main loop.
    """

    def __init__(self, config, debug=False):
        # Trainer initialization.
        self.config = config  # full configuration dict
        self.epoch = 0  # current epoch
        self.iteration = 0  # current iteration count
        if debug:
            # Debug mode: save/validate/stop far more frequently.
            self.config['trainer']['save_freq'] = 5
            self.config['trainer']['valid_freq'] = 5
            self.config['trainer']['iterations'] = 5

        # Dataset and dataloader setup.
        self.train_dataset = Dataset(config['data_loader'], split='train', debug=debug)  # training set
        self.train_sampler = None  # no sampler unless distributed
        self.train_args = config['trainer']  # training hyperparameters
        if config['distributed']:
            # Distributed run: shard the dataset across ranks.
            self.train_sampler = DistributedSampler(
                self.train_dataset,
                num_replicas=config['world_size'],
                rank=config['global_rank']
            )
        self.train_loader = DataLoader(
            self.train_dataset,
            batch_size=self.train_args['batch_size'] // config['world_size'],
            shuffle=(self.train_sampler is None),  # shuffle only when no sampler
            num_workers=self.train_args['num_workers'],
            sampler=self.train_sampler
        )

        # Loss functions.
        self.adversarial_loss = AdversarialLoss(type=self.config['losses']['GAN_LOSS'])  # adversarial loss
        self.adversarial_loss = self.adversarial_loss.to(self.config['device'])  # move loss to device
        self.l1_loss = nn.L1Loss()  # L1 reconstruction loss

        # Generator and discriminator models.
        self.netG = InpaintGenerator()  # generator network
        self.netG = self.netG.to(self.config['device'])  # move to device
        self.netD = Discriminator(
            in_channels=3, use_sigmoid=config['losses']['GAN_LOSS'] != 'hinge'
        )
        self.netD = self.netD.to(self.config['device'])  # discriminator network
        # Optimizers (one per network, shared hyperparameters).
        self.optimG = torch.optim.Adam(
            self.netG.parameters(),  # generator parameters
            lr=config['trainer']['lr'],  # learning rate
            betas=(self.config['trainer']['beta1'], self.config['trainer']['beta2'])
        )
        self.optimD = torch.optim.Adam(
            self.netD.parameters(),  # discriminator parameters
            lr=config['trainer']['lr'],  # learning rate
            betas=(self.config['trainer']['beta1'], self.config['trainer']['beta2'])
        )
        self.load()  # resume from the latest checkpoint when available

        if config['distributed']:
            # Wrap both networks in DistributedDataParallel.
            self.netG = DDP(
                self.netG,
                device_ids=[self.config['local_rank']],
                output_device=self.config['local_rank'],
                broadcast_buffers=True,
                find_unused_parameters=False
            )
            self.netD = DDP(
                self.netD,
                device_ids=[self.config['local_rank']],
                output_device=self.config['local_rank'],
                broadcast_buffers=True,
                find_unused_parameters=False
            )

        # TensorBoard loggers (master process only).
        self.dis_writer = None  # discriminator writer
        self.gen_writer = None  # generator writer
        self.summary = {}  # running summary accumulators
        if self.config['global_rank'] == 0 or (not config['distributed']):
            # Master process (or non-distributed run) owns the writers.
            self.dis_writer = SummaryWriter(
                os.path.join(config['save_dir'], 'dis')
            )
            self.gen_writer = SummaryWriter(
                os.path.join(config['save_dir'], 'gen')
            )

    # Current learning rate (read off the generator's optimizer).
    def get_lr(self):
        return self.optimG.param_groups[0]['lr']

    # Step-decay learning-rate schedule.
    def adjust_learning_rate(self):
        # Decay by 10x every `niter` iterations, frozen after `niter_steady`.
        decay = 0.1 ** (min(self.iteration, self.config['trainer']['niter_steady']) // self.config['trainer']['niter'])
        new_lr = self.config['trainer']['lr'] * decay
        # Only touch the optimizers when the rate actually changes.
        if new_lr != self.get_lr():
            for param_group in self.optimG.param_groups:
                param_group['lr'] = new_lr
            for param_group in self.optimD.param_groups:
                param_group['lr'] = new_lr

    # Accumulate a scalar and log its 100-iteration average to TensorBoard.
    def add_summary(self, writer, name, val):
        # Accumulate the value on every call.
        if name not in self.summary:
            self.summary[name] = 0
        self.summary[name] += val
        # Flush the averaged value every 100 iterations.
        if writer is not None and self.iteration % 100 == 0:
            writer.add_scalar(name, self.summary[name] / 100, self.iteration)
            self.summary[name] = 0

    # Load netG, netD and optimizer state from the latest checkpoint.
    def load(self):
        model_path = self.config['save_dir']  # checkpoint directory
        # Prefer the epoch recorded in latest.ckpt when it exists.
        if os.path.isfile(os.path.join(model_path, 'latest.ckpt')):
            # The last line of latest.ckpt holds the newest epoch tag.
            latest_epoch = open(os.path.join(
                model_path, 'latest.ckpt'), 'r').read().splitlines()[-1]
        else:
            # Otherwise scan the saved *.pth files and take the newest.
            # NOTE(review): this is a lexicographic sort of file stems like
            # "gen_00005" — relies on zero-padding for correct ordering.
            ckpts = [os.path.basename(i).split('.pth')[0] for i in glob.glob(
                os.path.join(model_path, '*.pth'))]
            ckpts.sort()  # sort so the last entry is the newest
            latest_epoch = ckpts[-1] if len(ckpts) > 0 else None  # newest tag, if any
        if latest_epoch is not None:
            # Build the generator/discriminator/optimizer checkpoint paths.
            gen_path = os.path.join(
                model_path, 'gen_{}.pth'.format(str(latest_epoch).zfill(5)))
            dis_path = os.path.join(
                model_path, 'dis_{}.pth'.format(str(latest_epoch).zfill(5)))
            opt_path = os.path.join(
                model_path, 'opt_{}.pth'.format(str(latest_epoch).zfill(5)))
            # Only the master process announces the load.
            if self.config['global_rank'] == 0:
                print('Loading model from {}...'.format(gen_path))
            # Load generator weights.
            data = torch.load(gen_path, map_location=self.config['device'])
            self.netG.load_state_dict(data['netG'])
            # Load discriminator weights.
            data = torch.load(dis_path, map_location=self.config['device'])
            self.netD.load_state_dict(data['netD'])
            # Load optimizer state.
            data = torch.load(opt_path, map_location=self.config['device'])
            self.optimG.load_state_dict(data['optimG'])
            self.optimD.load_state_dict(data['optimD'])
            # Restore epoch and iteration counters.
            self.epoch = data['epoch']
            self.iteration = data['iteration']
        else:
            # No checkpoint found: warn and start from scratch.
            if self.config['global_rank'] == 0:
                print('Warning: There is no trained model found. An initialized model will be used.')

    # Save model parameters; called once per eval period.
    def save(self, it):
        # Only the master process (global rank 0) writes checkpoints.
        if self.config['global_rank'] == 0:
            # Generator state-dict path.
            gen_path = os.path.join(
                self.config['save_dir'], 'gen_{}.pth'.format(str(it).zfill(5)))
            # Discriminator state-dict path.
            dis_path = os.path.join(
                self.config['save_dir'], 'dis_{}.pth'.format(str(it).zfill(5)))
            # Optimizer state-dict path.
            opt_path = os.path.join(
                self.config['save_dir'], 'opt_{}.pth'.format(str(it).zfill(5)))

            # Announce the save.
            print('\nsaving model to {} ...'.format(gen_path))

            # Unwrap DataParallel/DDP to save the raw module weights.
            if isinstance(self.netG, torch.nn.DataParallel) or isinstance(self.netG, DDP):
                netG = self.netG.module
                netD = self.netD.module
            else:
                netG = self.netG
                netD = self.netD

            # Save generator and discriminator weights.
            torch.save({'netG': netG.state_dict()}, gen_path)
            torch.save({'netD': netD.state_dict()}, dis_path)
            # Save epoch, iteration and optimizer state together.
            torch.save({
                'epoch': self.epoch,
                'iteration': self.iteration,
                'optimG': self.optimG.state_dict(),
                'optimD': self.optimD.state_dict()
            }, opt_path)

            # Record the newest checkpoint tag in "latest.ckpt".
            # NOTE(review): shell `echo` is non-portable; a plain
            # open().write() would be safer — confirm before changing.
            os.system('echo {} > {}'.format(str(it).zfill(5),
                                            os.path.join(self.config['save_dir'], 'latest.ckpt')))

    # Training entry point.

    def train(self):
        # Progress range over the configured iteration budget.
        pbar = range(int(self.train_args['iterations']))
        # Only the global rank-0 process displays a tqdm bar.
        if self.config['global_rank'] == 0:
            pbar = tqdm(pbar, initial=self.iteration, dynamic_ncols=True, smoothing=0.01)

        # Main training loop: epochs until the iteration budget is spent.
        while True:
            self.epoch += 1  # epoch counter
            if self.config['distributed']:
                # Re-seed the sampler so each epoch shuffles differently per rank.
                self.train_sampler.set_epoch(self.epoch)

            # Run one epoch.
            self._train_epoch(pbar)
            # Stop once the iteration budget is exhausted.
            if self.iteration > self.train_args['iterations']:
                break
        # Training finished.
        print('\nEnd training....')

    # One epoch: forward passes, losses, and optimizer steps.

    def _train_epoch(self, pbar):
        device = self.config['device']  # target device

        # Iterate over training batches.
        for frames, masks in self.train_loader:
            # Update the learning rate for this step.
            self.adjust_learning_rate()
            # Advance the global iteration counter.
            self.iteration += 1

            # Move batch to the device.
            frames, masks = frames.to(device), masks.to(device)
            b, t, c, h, w = frames.size()  # batch, time, channels, height, width
            masked_frame = (frames * (1 - masks).float())  # zero out the masked region
            pred_img = self.netG(masked_frame, masks)  # generator fills the holes
            # Flatten the time dimension to match the network output layout.
            frames = frames.view(b * t, c, h, w)
            masks = masks.view(b * t, 1, h, w)
            comp_img = frames * (1. - masks) + masks * pred_img  # composite: real outside, predicted inside

            gen_loss = 0  # generator loss accumulator
            dis_loss = 0  # discriminator loss accumulator

            # Discriminator adversarial loss.
            real_vid_feat = self.netD(frames)  # discriminator on real frames
            fake_vid_feat = self.netD(comp_img.detach())  # on composites; detach blocks generator grads
            dis_real_loss = self.adversarial_loss(real_vid_feat, True, True)  # loss on real
            dis_fake_loss = self.adversarial_loss(fake_vid_feat, False, True)  # loss on fake
            dis_loss += (dis_real_loss + dis_fake_loss) / 2  # average the two terms
            # Log discriminator losses.
            self.add_summary(self.dis_writer, 'loss/dis_vid_fake', dis_fake_loss.item())
            self.add_summary(self.dis_writer, 'loss/dis_vid_real', dis_real_loss.item())
            # Discriminator optimization step.
            self.optimD.zero_grad()
            dis_loss.backward()
            self.optimD.step()

            # Generator adversarial loss.
            gen_vid_feat = self.netD(comp_img)
            gan_loss = self.adversarial_loss(gen_vid_feat, True, False)  # generator wants "real" verdicts
            gan_loss = gan_loss * self.config['losses']['adversarial_weight']  # weight the term
            gen_loss += gan_loss  # accumulate
            # Log the generator adversarial loss.
            self.add_summary(self.gen_writer, 'loss/gan_loss', gan_loss.item())

            # Generator L1 loss inside the holes.
            hole_loss = self.l1_loss(pred_img * masks, frames * masks)  # masked-region reconstruction
            # Normalize by mask coverage, then weight.
            hole_loss = hole_loss / torch.mean(masks) * self.config['losses']['hole_weight']
            gen_loss += hole_loss  # accumulate
            # Log hole_loss.
            self.add_summary(self.gen_writer, 'loss/hole_loss', hole_loss.item())

            # L1 loss outside the holes.
            valid_loss = self.l1_loss(pred_img * (1 - masks), frames * (1 - masks))
            # Normalize by the unmasked coverage, then weight.
            valid_loss = valid_loss / torch.mean(1 - masks) * self.config['losses']['valid_weight']
            gen_loss += valid_loss  # accumulate
            # Log valid_loss.
            self.add_summary(self.gen_writer, 'loss/valid_loss', valid_loss.item())

            # Generator optimization step.
            self.optimG.zero_grad()
            gen_loss.backward()
            self.optimG.step()

            # Console progress output (master only).
            if self.config['global_rank'] == 0:
                pbar.update(1)  # advance the progress bar
                pbar.set_description((  # show the current loss values
                    f"d: {dis_loss.item():.3f}; g: {gan_loss.item():.3f};"
                    f"hole: {hole_loss.item():.3f}; valid: {valid_loss.item():.3f}")
                )

            # Periodic checkpointing.
            if self.iteration % self.train_args['save_freq'] == 0:
                self.save(int(self.iteration // self.train_args['save_freq']))
            # Stop mid-epoch once the iteration budget is spent.
            if self.iteration > self.train_args['iterations']:
                break
319
+
backend/tools/train/utils_sttn.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import matplotlib.patches as patches
4
+ from matplotlib.path import Path
5
+ import io
6
+ import cv2
7
+ import random
8
+ import zipfile
9
+ import numpy as np
10
+ from PIL import Image, ImageOps
11
+ import torch
12
+ import matplotlib
13
+ from matplotlib import pyplot as plt
14
+ matplotlib.use('agg')
15
+
16
+
17
class ZipReader(object):
    """Reads images out of zip archives, caching open archive handles."""

    # Shared cache mapping archive path -> open ZipFile handle.
    file_dict = dict()

    def __init__(self):
        super(ZipReader, self).__init__()

    @staticmethod
    def build_file_dict(path):
        """Return the (cached) ZipFile handle for *path*, opening it on first use."""
        cache = ZipReader.file_dict
        if path not in cache:
            cache[path] = zipfile.ZipFile(path, 'r')
        return cache[path]

    @staticmethod
    def imread(path, image_name):
        """Read entry *image_name* from the archive at *path* as a PIL image."""
        archive = ZipReader.build_file_dict(path)
        raw = archive.read(image_name)
        return Image.open(io.BytesIO(raw))
39
+
40
+
41
class GroupRandomHorizontalFlip(object):
    """Horizontally flip every image in a group with probability 0.5.

    For optical-flow groups (``is_flow=True`` at construction) the
    horizontal component (every even-indexed image) is value-inverted
    after flipping so the flow stays consistent with the mirrored frames.
    """

    def __init__(self, is_flow=False):
        self.is_flow = is_flow

    def __call__(self, img_group, is_flow=False):
        # NOTE: the call-time `is_flow` argument is unused; the constructor
        # flag decides (kept for signature compatibility).
        if random.random() >= 0.5:
            return img_group
        flipped = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
        if self.is_flow:
            for i in range(0, len(flipped), 2):
                # Invert flow pixel values when flipping.
                flipped[i] = ImageOps.invert(flipped[i])
        return flipped
59
+
60
+
61
class Stack(object):
    """Stack a group of PIL images into one numpy array along axis 2."""

    def __init__(self, roll=False):
        # roll=True reverses the channel order (RGB -> BGR) for RGB input.
        self.roll = roll

    def __call__(self, img_group):
        mode = img_group[0].mode
        if mode == '1':
            # Promote binary images to 8-bit grayscale first.
            img_group = [img.convert('L') for img in img_group]
            mode = 'L'
        if mode == 'L':
            expanded = [np.expand_dims(x, 2) for x in img_group]
            return np.stack(expanded, axis=2)
        if mode == 'RGB':
            if self.roll:
                reversed_channels = [np.array(x)[:, :, ::-1] for x in img_group]
                return np.stack(reversed_channels, axis=2)
            return np.stack(img_group, axis=2)
        raise NotImplementedError(f"Image mode {mode}")
79
+
80
+
81
class ToTorchFormatTensor(object):
    """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255]
    to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """

    def __init__(self, div=True):
        # div=True rescales byte values into [0, 1]; otherwise keep [0, 255].
        self.div = div

    def __call__(self, pic):
        if isinstance(pic, np.ndarray):
            # numpy input: stacked group shaped [H, W, L, C] -> [L, C, H, W].
            tensor = torch.from_numpy(pic).permute(2, 3, 0, 1).contiguous()
        else:
            # PIL input: raw bytes -> HWC byte tensor -> CHW.
            tensor = torch.ByteTensor(
                torch.ByteStorage.from_buffer(pic.tobytes()))
            tensor = tensor.view(pic.size[1], pic.size[0], len(pic.mode))
            # HWC -> CHW (this transpose dominates loading time).
            tensor = tensor.transpose(0, 1).transpose(0, 2).contiguous()
        return tensor.float().div(255) if self.div else tensor.float()
102
+
103
+
104
def create_random_shape_with_random_motion(video_length, imageHeight=240, imageWidth=432):
    """Build a list of ``video_length`` 'L'-mode PIL mask images.

    A single random blob is generated; with probability 0.5 it stays fixed
    across all frames, otherwise it drifts with a random velocity.
    """
    # get a random shape
    height = random.randint(imageHeight//3, imageHeight-1)
    width = random.randint(imageWidth//3, imageWidth-1)
    edge_num = random.randint(6, 8)
    ratio = random.randint(6, 8)/10
    region = get_random_shape(
        edge_num=edge_num, ratio=ratio, height=height, width=width)
    region_width, region_height = region.size
    # get random position
    x, y = random.randint(
        0, imageHeight-region_height), random.randint(0, imageWidth-region_width)
    velocity = get_random_velocity(max_speed=3)
    m = Image.fromarray(np.zeros((imageHeight, imageWidth)).astype(np.uint8))
    # NOTE: x indexes image rows, y columns, hence the (y, x, ...) paste box.
    m.paste(region, (y, x, y+region.size[0], x+region.size[1]))
    masks = [m.convert('L')]
    # return fixed masks (stationary case, 50% of the time)
    if random.uniform(0, 1) > 0.5:
        return masks*video_length
    # return moving masks: shift the region frame by frame
    for _ in range(video_length-1):
        x, y, velocity = random_move_control_points(
            x, y, imageHeight, imageWidth, velocity, region.size, maxLineAcceleration=(3, 0.5), maxInitSpeed=3)
        m = Image.fromarray(
            np.zeros((imageHeight, imageWidth)).astype(np.uint8))
        m.paste(region, (y, x, y+region.size[0], x+region.size[1]))
        masks.append(m.convert('L'))
    return masks
132
+
133
+
134
def get_random_shape(edge_num=9, ratio=0.7, width=432, height=240):
    '''
    Draw a random closed bezier blob and return it as an 'L'-mode PIL image
    cropped to its bounding box.

    There is the initial point and 3 points per cubic bezier curve.
    Thus, the curve will only pass though n points, which will be the sharp edges.
    The other 2 modify the shape of the bezier curve.
    edge_num, Number of possibly sharp edges
    points_num, number of points in the Path
    ratio, (0, 1) magnitude of the perturbation from the unit circle,
    '''
    points_num = edge_num*3 + 1
    angles = np.linspace(0, 2*np.pi, points_num)
    codes = np.full(points_num, Path.CURVE4)
    codes[0] = Path.MOVETO
    # Using this instead of Path.CLOSEPOLY avoids an unnecessary straight line
    verts = np.stack((np.cos(angles), np.sin(angles))).T * \
        (2*ratio*np.random.random(points_num)+1-ratio)[:, None]
    verts[-1, :] = verts[0, :]
    path = Path(verts, codes)
    # draw the path into an off-screen matplotlib figure
    fig = plt.figure()
    ax = fig.add_subplot(111)
    patch = patches.PathPatch(path, facecolor='black', lw=2)
    ax.add_patch(patch)
    ax.set_xlim(np.min(verts)*1.1, np.max(verts)*1.1)
    ax.set_ylim(np.min(verts)*1.1, np.max(verts)*1.1)
    ax.axis('off')  # removes the axis to leave only the shape
    fig.canvas.draw()
    # convert the rendered figure into a numpy image
    # NOTE(review): fig.canvas.tostring_rgb is removed in newer matplotlib
    # (3.8+); buffer_rgba would be the modern replacement — confirm version.
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape((fig.canvas.get_width_height()[::-1] + (3,)))
    plt.close(fig)
    # postprocess: resize, binarize (shape -> 255, background -> 0)
    data = cv2.resize(data, (width, height))[:, :, 0]
    data = (1 - np.array(data > 0).astype(np.uint8))*255
    corrdinates = np.where(data > 0)
    xmin, xmax, ymin, ymax = np.min(corrdinates[0]), np.max(
        corrdinates[0]), np.min(corrdinates[1]), np.max(corrdinates[1])
    # crop to the blob's bounding box
    region = Image.fromarray(data).crop((ymin, xmin, ymax, xmax))
    return region
173
+
174
+
175
def random_accelerate(velocity, maxAcceleration, dist='uniform'):
    """Perturb a (speed, angle) velocity by a random acceleration.

    dist: 'uniform' draws from [-d, d]; 'guassian' (sic) draws from N(0, d/2).
    Raises NotImplementedError for any other distribution name.
    """
    speed, angle = velocity
    d_speed, d_angle = maxAcceleration
    if dist == 'uniform':
        return (speed + np.random.uniform(-d_speed, d_speed),
                angle + np.random.uniform(-d_angle, d_angle))
    if dist == 'guassian':
        return (speed + np.random.normal(0, d_speed / 2),
                angle + np.random.normal(0, d_angle / 2))
    raise NotImplementedError(
        f'Distribution type {dist} is not supported.')
188
+
189
+
190
def get_random_velocity(max_speed=3, dist='uniform'):
    """Sample a random (speed, angle) velocity.

    speed is drawn from [0, max_speed) ('uniform') or |N(0, max_speed/2)|
    ('guassian'); angle is uniform over [0, 2*pi).

    Raises:
        NotImplementedError: for any other distribution name.
    """
    if dist == 'uniform':
        # BUG FIX: np.random.uniform(max_speed) means uniform(low=max_speed,
        # high=1.0) due to numpy's (low, high) signature; the intended range
        # is [0, max_speed).
        speed = np.random.uniform(0, max_speed)
    elif dist == 'guassian':
        speed = np.abs(np.random.normal(0, max_speed / 2))
    else:
        raise NotImplementedError(
            f'Distribution type {dist} is not supported.')
    angle = np.random.uniform(0, 2 * np.pi)
    return (speed, angle)
200
+
201
+
202
def random_move_control_points(X, Y, imageHeight, imageWidth, lineVelocity, region_size, maxLineAcceleration=(3, 0.5), maxInitSpeed=3):
    """Advance the mask region one step along its velocity.

    Returns (new_X, new_Y, new_velocity) with the position clipped to keep
    the region inside the image; the velocity is randomly re-initialized
    whenever the unclipped position left the image bounds.
    """
    region_width, region_height = region_size
    speed, angle = lineVelocity
    # Move by the integer projection of the velocity onto each axis.
    X += int(speed * np.cos(angle))
    Y += int(speed * np.sin(angle))
    # Apply a random (gaussian) acceleration for the next step.
    lineVelocity = random_accelerate(
        lineVelocity, maxLineAcceleration, dist='guassian')
    max_x = imageHeight - region_height
    max_y = imageWidth - region_width
    out_of_bounds = X < 0 or X > max_x or Y < 0 or Y > max_y
    if out_of_bounds:
        # Bounced off an edge: restart with a fresh random velocity.
        lineVelocity = get_random_velocity(maxInitSpeed, dist='guassian')
    return np.clip(X, 0, max_x), np.clip(Y, 0, max_y), lineVelocity
214
+
215
+
216
def get_world_size():
    """Find OMPI world size without calling mpi functions
    :rtype: int
    """
    for var in ('PMI_SIZE', 'OMPI_COMM_WORLD_SIZE'):
        value = os.environ.get(var)
        if value is not None:
            # Empty string falls back to 1, matching `int(... or 1)`.
            return int(value or 1)
    # No MPI environment: one process per visible GPU.
    return torch.cuda.device_count()
226
+
227
+
228
def get_global_rank():
    """Find OMPI world rank without calling mpi functions
    :rtype: int
    """
    for var in ('PMI_RANK', 'OMPI_COMM_WORLD_RANK'):
        value = os.environ.get(var)
        if value is not None:
            # Empty string falls back to 0, matching `int(... or 0)`.
            return int(value or 0)
    return 0
238
+
239
+
240
def get_local_rank():
    """Find OMPI local rank without calling mpi functions
    :rtype: int
    """
    for var in ('MPI_LOCALRANKID', 'OMPI_COMM_WORLD_LOCAL_RANK'):
        value = os.environ.get(var)
        if value is not None:
            # Empty string falls back to 0, matching `int(... or 0)`.
            return int(value or 0)
    return 0
250
+
251
+
252
def get_master_ip():
    """Return the master-node IP for distributed init.

    Prefers Azure Batch / BatchAI environment variables; falls back to
    localhost for single-node runs.
    """
    az_batch = os.environ.get('AZ_BATCH_MASTER_NODE')
    if az_batch is not None:
        # AZ_BATCH_MASTER_NODE is "ip:port"; keep only the ip part.
        return az_batch.split(':')[0]
    az_batchai = os.environ.get('AZ_BATCHAI_MPI_MASTER_NODE')
    if az_batchai is not None:
        return az_batchai
    return "127.0.0.1"
259
+
260
if __name__ == '__main__':
    # Visual smoke test: generate a few random mask sequences and display
    # them with OpenCV (requires a display environment).
    trials = 10
    for _ in range(trials):
        video_length = 10
        # The returned masks are either stationary (50%) or moving (50%)
        masks = create_random_shape_with_random_motion(
            video_length, imageHeight=240, imageWidth=432)

        for m in masks:
            cv2.imshow('mask', np.array(m))
            cv2.waitKey(500)
271
+
docker/Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12

# System deps: libGL is required by OpenCV at import time.
RUN --mount=type=cache,target=/root/.cache,sharing=private \
    apt update && \
    apt install -y libgl1-mesa-glx && \
    true

ADD . /vsr
ARG CUDA_VERSION=11.8
ARG USE_DIRECTML=0

# CUDA build: install PaddlePaddle + CUDA-enabled torch wheels, then the
# project requirements (CUDA_VERSION "11.8" maps to the "cu118" wheel index).
RUN --mount=type=cache,target=/root/.cache,sharing=private \
    if [ "${USE_DIRECTML:-0}" != "1" ]; then \
    pip install paddlepaddle==3.0 && \
    pip install torch==2.7.0 torchvision==0.22.0 --index-url https://download.pytorch.org/whl/cu$(echo ${CUDA_VERSION} | tr -d '.') && \
    pip install -r /vsr/requirements.txt; \
    fi

# DirectML build: install torch_directml instead of the CUDA torch wheels.
RUN --mount=type=cache,target=/root/.cache,sharing=private \
    if [ "${USE_DIRECTML:-0}" = "1" ]; then \
    pip install paddlepaddle==3.0 && \
    pip install torch_directml==0.2.5.dev240914 && \
    pip install -r /vsr/requirements.txt; \
    fi

# Make the pip-installed cuDNN libraries visible to the dynamic linker.
ENV LD_LIBRARY_PATH=/usr/local/lib/python3.12/site-packages/nvidia/cudnn/lib/
WORKDIR /vsr
CMD ["python", "/vsr/backend/main.py"]
google_colabs/README.md ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Google Colab Gradio Interface
2
+
3
+ This folder contains two versions of the Google Colab notebook:
4
+
5
+ ## Files
6
+
7
+ ### 1. `Video_Subtitle_Remover_Gradio.ipynb` ⭐ **NEW - Recommended**
8
+ **Gradio Web Interface** - Easy-to-use browser-based UI
9
+
10
+ **Features:**
11
+ - 🖱️ Click-and-upload interface (no coding required)
12
+ - 🎨 Visual algorithm selection
13
+ - ⚙️ Adjustable parameters with sliders
14
+ - 📊 Real-time progress tracking
15
+ - 📥 One-click download
16
+
17
+ **Best for:**
18
+ - Users who prefer GUI
19
+ - Quick testing
20
+ - Non-technical users
21
+ - Multiple video processing
22
+
23
+ **Usage:**
24
+ 1. Open in Colab
25
+ 2. Run all cells
26
+ 3. Click the generated link
27
+ 4. Use web interface in browser
28
+
29
+ ---
30
+
31
+ ### 2. `Video_Subtitle_Remover.ipynb`
32
+ **Traditional Notebook** - Code-based approach
33
+
34
+ **Features:**
35
+ - Step-by-step execution
36
+ - Full control over parameters
37
+ - Good for understanding the process
38
+ - Batch processing scripts
39
+
40
+ **Best for:**
41
+ - Users comfortable with code
42
+ - Custom workflows
43
+ - Debugging
44
+ - Learning the internals
45
+
46
+ ---
47
+
48
+ ## Quick Start
49
+
50
+ ### For Gradio Interface (Recommended):
51
+
52
+ ```bash
53
+ 1. Open Video_Subtitle_Remover_Gradio.ipynb in Colab
54
+ 2. Runtime → Change runtime type → GPU
55
+ 3. Run all cells (Ctrl+F9)
56
+ 4. Click the gradio.live URL
57
+ 5. Upload video and click "Remove Subtitles"
58
+ ```
59
+
60
+ ### For Traditional Notebook:
61
+
62
+ ```bash
63
+ 1. Open Video_Subtitle_Remover.ipynb in Colab
64
+ 2. Runtime → Change runtime type → GPU
65
+ 3. Run cells step by step
66
+ 4. Configure settings in Step 5
67
+ 5. Run processing in Step 7
68
+ ```
69
+
70
+ ## Algorithm Recommendations
71
+
72
+ | Use Case | Algorithm | Quality | Speed |
73
+ |----------|-----------|---------|-------|
74
+ | **Best Quality** | DiffuEraser | ⭐⭐⭐⭐⭐ | ⭐⭐ |
75
+ | **Fastest** | STTN | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ |
76
+ | **Balanced** | Stable Diffusion | ⭐⭐⭐⭐ | ⭐⭐⭐ |
77
+ | **High Motion** | ProPainter | ⭐⭐⭐⭐⭐ | ⭐ |
78
+
79
+ ## System Requirements
80
+
81
+ - **GPU**: Required (T4/P100/V100)
82
+ - **Storage**: 10-20GB for models
83
+ - **VRAM**:
84
+ - STTN: 4GB
85
+ - DiffuEraser: 12GB
86
+ - Stable Diffusion: 8GB
87
+
88
+ ## Performance (Colab T4 GPU)
89
+
90
+ | Video | Algorithm | Time |
91
+ |-------|-----------|------|
92
+ | 1 min 720p | STTN | ~30s |
93
+ | 1 min 720p | DiffuEraser | ~3-5min |
94
+ | 5 min 720p | STTN | ~2min |
95
+ | 5 min 720p | DiffuEraser | ~15-20min |
96
+
97
+ ## Troubleshooting
98
+
99
+ ### Gradio not loading
100
+ - Wait 30-60 seconds for models to load
101
+ - Check if all cells ran successfully
102
+ - Restart runtime and try again
103
+
104
+ ### Out of Memory
105
+ - Reduce batch size in settings
106
+ - Use STTN instead of DiffuEraser
107
+ - Process shorter videos
108
+
109
+ ### Slow processing
110
+ - Use STTN for preview
111
+ - Enable GPU in Colab settings
112
+ - Consider Colab Pro for unlimited runtime
113
+
114
+ ## Links
115
+
116
+ - **GitHub**: https://github.com/YaoFANGUK/video-subtitle-remover
117
+ - **Documentation**: See `docs/` folder
118
+ - **Issues**: Report on GitHub
119
+
120
+ ## Tips
121
+
122
+ 1. **Start with STTN** to test quickly
123
+ 2. **Use DiffuEraser** for final high-quality output
124
+ 3. **Keep videos under 10 minutes** on free tier
125
+ 4. **Save to Google Drive** to avoid data loss
126
+ 5. **Monitor GPU usage** with `!nvidia-smi`
google_colabs/Video_Subtitle_Remover_Gradio.ipynb ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 🎬 Video Subtitle Remover - Gradio Interface\n",
8
+ "\n",
9
+ "**Easy-to-use web interface for removing hardcoded subtitles from videos**\n",
10
+ "\n",
11
+ "This notebook provides a Gradio web UI that runs in your browser.\n",
12
+ "\n",
13
+ "**Features:**\n",
14
+ "- 🖱️ Click-and-upload interface\n",
15
+ "- 🎨 Multiple AI algorithms (STTN, LAMA, DiffuEraser, etc.)\n",
16
+ "- ⚙️ Adjustable parameters\n",
17
+ "- 📊 Real-time progress\n",
18
+ "- 📥 Direct download\n",
19
+ "\n",
20
+ "**Requirements:**\n",
21
+ "- Google Colab with GPU (Runtime → Change runtime type → GPU)\n",
22
+ "- ~10-20GB storage (for models)"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "markdown",
27
+ "metadata": {},
28
+ "source": [
29
+ "## Step 1: Check GPU"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "!nvidia-smi"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "markdown",
43
+ "metadata": {},
44
+ "source": [
45
+ "## Step 2: Clone Repository"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "!git clone https://github.com/walidurrosyad/sub-remover.git\n",
55
+ "%cd sub-remover"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "markdown",
60
+ "metadata": {},
61
+ "source": [
62
+ "## Step 3: Install Dependencies"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": null,
68
+ "metadata": {},
69
+ "outputs": [],
70
+ "source": [
71
+ "# Core dependencies\n",
72
+ "!pip install -q filesplit==3.0.2 albumentations scikit-image imgaug pyclipper lmdb\n",
73
+ "!pip install -q PyYAML omegaconf tqdm easydict scikit-learn pandas webdataset\n",
74
+ "!pip install -q protobuf av einops paddleocr paddle2onnx onnxruntime-gpu\n",
75
+ "!pip install -q paddlepaddle-gpu==2.6.2\n",
76
+ "\n",
77
+ "# Gradio for web interface\n",
78
+ "!pip install -q gradio\n",
79
+ "\n",
80
+ "# Advanced models (required for Stable Diffusion and DiffuEraser; comment out to skip)\n",
81
+ "!pip install -q diffusers transformers accelerate\n",
82
+ "\n",
83
+ "print(\"✓ All dependencies installed!\")"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "markdown",
88
+ "metadata": {},
89
+ "source": [
90
+ "## Step 4: Launch Gradio Interface\n",
91
+ "\n",
92
+ "This will create a web interface you can use in your browser!\n",
93
+ "\n",
94
+ "**Click the public URL that appears below to access the interface.**"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": null,
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "# Launch Gradio interface\n",
104
+ "import sys\n",
105
+ "import os\n",
106
+ "\n",
107
+ "# Add paths\n",
108
+ "sys.path.insert(0, '/content/sub-remover')\n",
109
+ "sys.path.insert(0, '/content/sub-remover/backend')\n",
110
+ "\n",
111
+ "# Change to google_colabs directory to import gradio_app\n",
112
+ "os.chdir('/content/sub-remover/google_colabs')\n",
113
+ "\n",
114
+ "from gradio_app import create_interface\n",
115
+ "\n",
116
+ "demo = create_interface()\n",
117
+ "demo.launch(share=True, debug=True)"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "markdown",
122
+ "metadata": {},
123
+ "source": [
124
+ "## Alternative: Run Gradio in Notebook\n",
125
+ "\n",
126
+ "If the above doesn't work, run Gradio directly in the notebook:"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": null,
132
+ "metadata": {},
133
+ "outputs": [],
134
+ "source": [
135
+ "import sys\n",
136
+ "sys.path.insert(0, '/content/sub-remover')\n",
137
+ "sys.path.insert(0, '/content/sub-remover/backend')\n",
138
+ "\n",
139
+ "# Import and run\n",
140
+ "from google_colabs.gradio_app import create_interface\n",
141
+ "\n",
142
+ "demo = create_interface()\n",
143
+ "demo.launch(share=True, debug=True)"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "markdown",
148
+ "metadata": {},
149
+ "source": [
150
+ "## How to Use the Gradio Interface\n",
151
+ "\n",
152
+ "1. **Click the public URL** (looks like: https://xxxxx.gradio.live)\n",
153
+ "2. **Upload your video** using the upload button\n",
154
+ "3. **Select algorithm**:\n",
155
+ " - **DiffuEraser (Recommended)**: Best quality for subtitles\n",
156
+ " - **STTN (Fast)**: Quickest processing\n",
157
+ " - **Stable Diffusion**: High quality alternative\n",
158
+ "4. **Adjust settings** (optional) in \"Advanced Settings\"\n",
159
+ "5. **Click \"Remove Subtitles\"**\n",
160
+ "6. **Wait for processing** (progress shown)\n",
161
+ "7. **Download result** using the download button\n",
162
+ "\n",
163
+ "## Performance Guide\n",
164
+ "\n",
165
+ "### Colab T4 GPU (Free Tier)\n",
166
+ "\n",
167
+ "| Video Length | Algorithm | Time |\n",
168
+ "|--------------|-----------|------|\n",
169
+ "| 1 min 720p | STTN | ~30s |\n",
170
+ "| 1 min 720p | DiffuEraser | ~3-5min |\n",
171
+ "| 5 min 720p | STTN | ~2min |\n",
172
+ "| 5 min 720p | DiffuEraser | ~15-20min |\n",
173
+ "\n",
174
+ "### Tips\n",
175
+ "\n",
176
+ "- **Use STTN for preview**, DiffuEraser for final\n",
177
+ "- **Keep videos under 10 minutes** to avoid Colab timeout\n",
178
+ "- **Enable GPU** in Runtime settings\n",
179
+ "- **Reduce batch size** if you get OOM errors\n",
180
+ "\n",
181
+ "## Troubleshooting\n",
182
+ "\n",
183
+ "### \"Out of Memory\" Error\n",
184
+ "Reduce batch size in Advanced Settings:\n",
185
+ "- DiffuEraser: Set \"Max Frames per Batch\" to 40\n",
186
+ "- STTN: Set to 30\n",
187
+ "\n",
188
+ "### Gradio Not Loading\n",
189
+ "Restart runtime and run all cells again.\n",
190
+ "\n",
191
+ "### Slow Processing\n",
192
+ "- Use STTN algorithm for faster results\n",
193
+ "- Process shorter video clips\n",
194
+ "\n",
195
+ "### Session Timeout\n",
196
+ "Colab free tier has time limits. Process shorter videos or use Colab Pro."
197
+ ]
198
+ }
199
+ ],
200
+ "metadata": {
201
+ "accelerator": "GPU",
202
+ "colab": {
203
+ "gpuType": "T4",
204
+ "provenance": []
205
+ },
206
+ "kernelspec": {
207
+ "display_name": "Python 3",
208
+ "language": "python",
209
+ "name": "python3"
210
+ },
211
+ "language_info": {
212
+ "name": "python",
213
+ "version": "3.10.12"
214
+ }
215
+ },
216
+ "nbformat": 4,
217
+ "nbformat_minor": 0
218
+ }
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ albumentations~=1.4.10
2
+ filesplit==3.0.2
3
+ opencv-python==4.11.0.86
4
+ scikit-image==0.25.2
5
+ imgaug==0.4.0
6
+ pyclipper==1.3.0.post6
7
+ lmdb==1.6.2
8
+ PyYAML==6.0.2
9
+ omegaconf==2.3.0
10
+ tqdm==4.67.1
11
+ PySimpleGUI-4-foss==4.60.4.1
12
+ easydict==1.13
13
+ scikit-learn==1.6.1
14
+ pandas==2.2.3
15
+ webdataset==0.2.111
16
+ numpy==2.2.5
17
+ protobuf==6.30.2
18
+ av==14.3.0
19
+ einops==0.8.1
20
+ paddleocr==2.10.0
21
+ paddle2onnx==1.3.1
22
+ onnxruntime-gpu==1.20.1
23
+ onnxruntime-directml==1.20.1; sys_platform == 'win32'
24
+
25
+ # Advanced Inpainting Models
26
+ diffusers>=0.27.0 # For Stable Diffusion & DiffuEraser
27
+ transformers>=4.36.0 # Required by diffusers
28
+ accelerate>=0.25.0 # For faster inference
29
+ xformers>=0.0.23; sys_platform != 'darwin' # Memory optimization (not on macOS)