ahmad walidurosyad committed on
Commit
dd98968
·
0 Parent(s):
Files changed (2) hide show
  1. backend/config.py +184 -0
  2. google_colabs/README.md +126 -0
backend/config.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ from enum import Enum, unique
3
+ warnings.filterwarnings('ignore')
4
+ import os
5
+ import torch
6
+ import logging
7
+ import platform
8
+ import stat
9
+ from fsplit.filesplit import Filesplit
10
+ import onnxruntime as ort
11
+
12
# Project version number
VERSION = "1.1.1"
# ×××××××××××××××××××× [DO NOT MODIFY] start ××××××××××××××××××××
# Suppress every log record at WARNING level and below. logging.disable()
# keeps only the most recent threshold, so this single call also covers DEBUG.
logging.disable(logging.WARNING)
try:
    # Prefer a DirectML device when the optional torch_directml package is
    # importable and can hand out its default device.
    import torch_directml
    device = torch_directml.device(torch_directml.default_device())
    USE_DML = True
except Exception:
    # Best-effort fallback: torch_directml is missing or failed to initialise,
    # so use the first CUDA GPU when one is available, otherwise the CPU.
    # Catching Exception instead of a bare except lets KeyboardInterrupt and
    # SystemExit propagate.
    USE_DML = False
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
24
# Directory containing this config module; bundled assets are located below it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Root directory of all model files (also the base of the detection models).
DET_MODEL_BASE = os.path.join(BASE_DIR, 'models')
MODEL_VERSION = 'V4'
LAMA_MODEL_PATH = os.path.join(DET_MODEL_BASE, 'big-lama')
STTN_MODEL_PATH = os.path.join(DET_MODEL_BASE, 'sttn', 'infer_model.pth')
VIDEO_INPAINT_MODEL_PATH = os.path.join(DET_MODEL_BASE, 'video')
DET_MODEL_PATH = os.path.join(DET_MODEL_BASE, MODEL_VERSION, 'ch_det')

# Check each model directory for its complete model file; when it is missing,
# merge the split part files in that directory to rebuild it.
for _model_dir, _complete_file in (
    (LAMA_MODEL_PATH, 'big-lama.pt'),
    (DET_MODEL_PATH, 'inference.pdiparams'),
    (VIDEO_INPAINT_MODEL_PATH, 'ProPainter.pth'),
):
    if _complete_file not in os.listdir(_model_dir):
        fs = Filesplit()
        fs.merge(input_dir=_model_dir)
44
+
45
# Select the bundled ffmpeg executable for the current operating system.
# Anything that is neither Windows nor Linux falls back to the macOS binary.
sys_str = platform.system()
ffmpeg_bin = {
    "Windows": os.path.join('win_x64', 'ffmpeg.exe'),
    "Linux": os.path.join('linux_x64', 'ffmpeg'),
}.get(sys_str, os.path.join('macos', 'ffmpeg'))
FFMPEG_PATH = os.path.join(BASE_DIR, 'ffmpeg', ffmpeg_bin)

# Rebuild the Windows ffmpeg.exe from its split parts when it is missing.
# NOTE(review): this check runs on every platform, not only on Windows.
_win_ffmpeg_dir = os.path.join(BASE_DIR, 'ffmpeg', 'win_x64')
if 'ffmpeg.exe' not in os.listdir(_win_ffmpeg_dir):
    fs = Filesplit()
    fs.merge(input_dir=_win_ffmpeg_dir)
# Grant read/write/execute permission on ffmpeg to user, group and others.
os.chmod(FFMPEG_PATH, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
# NOTE(review): presumably allows duplicate OpenMP runtimes to coexist — confirm.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
61
+
62
# Whether ONNX can be used (DirectML/AMD/Intel): keep only the accelerated
# execution providers that onnxruntime reports as available.
_ACCELERATED_PROVIDERS = frozenset({
    "DmlExecutionProvider",       # DirectML, for Windows GPUs
    "ROCMExecutionProvider",      # AMD ROCm
    "MIGraphXExecutionProvider",  # AMD MIGraphX
    "VitisAIExecutionProvider",   # AMD VitisAI, for RyzenAI & Windows; measured performance seems about the same as DirectML
    "OpenVINOExecutionProvider",  # Intel GPU
    "MetalExecutionProvider",     # Apple macOS
    "CoreMLExecutionProvider",    # Apple macOS
    "CUDAExecutionProvider",      # Nvidia GPU
})
available_providers = ort.get_available_providers()
# "CPUExecutionProvider" is not in the accepted set, so it is filtered out
# together with every other unlisted provider; the reported order is kept.
ONNX_PROVIDERS = [
    name for name in available_providers if name in _ACCELERATED_PROVIDERS
]
# ×××××××××××××××××××× [DO NOT MODIFY] end ××××××××××××××××××××
83
+
84
+
85
@unique
class InpaintMode(Enum):
    """Enumeration of the available image inpainting (repainting) algorithms.

    Each member's value is the short string identifier of the algorithm;
    ``@unique`` guarantees that no two members share a value.
    """

    STTN = 'sttn'
    LAMA = 'lama'
    PROPAINTER = 'propainter'
    # Stable Diffusion Inpainting
    STABLE_DIFFUSION = 'sd'
    # DiffuEraser (diffusion-based)
    DIFFUERASER = 'diffueraser'
    # Flow-guided video inpainting
    E2FGVI = 'e2fgvi'
96
+
97
+
98
# ×××××××××××××××××××× [MODIFIABLE] start ××××××××××××××××××××
# Whether to encode with H.264; enable this if the generated video needs to be
# shared from an Android phone.
USE_H264 = True

# ×××××××××× General settings start ××××××××××
# Algorithm types selectable for MODE:
# - InpaintMode.STTN: works well on live-action video, fast, and can skip
#   subtitle detection.
# - InpaintMode.LAMA: works well on animated video, average speed, and cannot
#   skip subtitle detection.
# - InpaintMode.PROPAINTER: consumes a large amount of VRAM and is slower, but
#   works well on video with very intense motion.
#
# Default inpainting algorithm mode: sttn/lama/propainter/sd/diffueraser/e2fgvi
MODE = InpaintMode.STTN
111
+
112
# The model directories below are anchored at BASE_DIR (the directory that
# holds this config file), like the other model paths in this module, so they
# resolve correctly regardless of the current working directory.

# ×××××××××××××××××××× Stable Diffusion Settings ××××××××××××××××××××
SD_MODEL_PATH = os.path.join(BASE_DIR, 'models', 'stable-diffusion-inpainting')
SD_STEPS = 50  # Inference steps
SD_GUIDANCE_SCALE = 7.5  # Classifier-free guidance
SD_PROMPT = "natural scene, high quality"  # Text prompt for guidance
SD_USE_FP16 = True  # Use half precision for faster inference

# ×××××××××××××××××××× DiffuEraser Settings ××××××××××××××××××××
DIFFUERASER_MODEL_PATH = os.path.join(BASE_DIR, 'models', 'diffueraser')
DIFFUERASER_STEPS = 50  # Diffusion steps
DIFFUERASER_GUIDANCE = 7.5  # Guidance scale
DIFFUERASER_USE_SAM2 = False  # Auto-masking with SAM2
DIFFUERASER_MAX_LOAD_NUM = 80  # Max frames per batch

# ×××××××××××××××××××× E2FGVI Settings ××××××××××××××××××××
E2FGVI_MODEL_PATH = os.path.join(BASE_DIR, 'models', 'e2fgvi')
E2FGVI_MAX_LOAD_NUM = 80  # Max frames per batch
E2FGVI_NEIGHBOR_LENGTH = 10  # Temporal window for flow
130
# [Pixel deviation settings]
# Used to reject non-subtitle regions: a subtitle text box is normally wider
# than it is tall, so a box whose height exceeds its width by more than this
# many pixels is treated as a false detection.
THRESHOLD_HEIGHT_WIDTH_DIFFERENCE = 10
# Enlarges the mask so that an undersized auto-detected text box does not leave
# text edges or residue behind in the inpaint stage.
SUBTITLE_AREA_DEVIATION_PIXEL = 20
# Used to decide whether two text boxes belong to the same subtitle line: a
# height difference within this many pixels counts as the same line.
THRESHOLD_HEIGHT_DIFFERENCE = 20
# Used to decide whether two subtitle rectangles are similar: when both the
# X-axis and Y-axis deviations are within the thresholds below, they are
# considered the same text box.
PIXEL_TOLERANCE_Y = 20  # allowed vertical deviation of a detection box, in pixels
PIXEL_TOLERANCE_X = 20  # allowed horizontal deviation of a detection box, in pixels
# ×××××××××× General settings end ××××××××××
141
+
142
# ×××××××××× InpaintMode.STTN algorithm settings start ××××××××××
# The parameters below only take effect when the STTN algorithm is used.
#
# 1. STTN_SKIP_DETECTION
#    Meaning: whether to skip subtitle detection.
#    Effect: True skips subtitle detection, which saves a lot of time, but may
#    damage frames that have no subtitles or leave some subtitles unremoved.
#
# 2. STTN_NEIGHBOR_STRIDE
#    Meaning: stride between neighbouring frames. To fill the missing region of
#    frame 50 with STTN_NEIGHBOR_STRIDE=5, the algorithm uses frame 45,
#    frame 40, and so on as references.
#    Effect: controls how densely reference frames are picked; a larger stride
#    means fewer, more spread-out reference frames, a smaller stride means
#    more, more concentrated reference frames.
#
# 3. STTN_REFERENCE_LENGTH
#    Meaning: number of reference frames; STTN looks at several frames before
#    and after each frame to be repaired to gather context for the repair.
#    Effect: increasing it uses more VRAM and improves the result, but slows
#    processing down.
#
# 4. STTN_MAX_LOAD_NUM
#    Meaning: maximum number of video frames STTN loads at a time.
#    Effect: the larger the value, the slower the processing but the better
#    the result.
#    Note: STTN_MAX_LOAD_NUM must be greater than STTN_NEIGHBOR_STRIDE and
#    STTN_REFERENCE_LENGTH.
STTN_SKIP_DETECTION = True
# Reference frame stride
STTN_NEIGHBOR_STRIDE = 5
# Reference frame length (count)
STTN_REFERENCE_LENGTH = 10
# Maximum number of frames STTN processes at once; never allowed to be lower
# than STTN_REFERENCE_LENGTH * STTN_NEIGHBOR_STRIDE.
STTN_MAX_LOAD_NUM = max(50, STTN_REFERENCE_LENGTH * STTN_NEIGHBOR_STRIDE)
# ×××××××××× InpaintMode.STTN algorithm settings end ××××××××××
172
+
173
# ×××××××××× InpaintMode.PROPAINTER algorithm settings start ××××××××××
# [Set according to your own GPU VRAM size] Maximum number of images processed
# at the same time; a larger value gives better results but needs more VRAM.
# 1280x720p video: 80 needs 25G of VRAM, 50 needs 19G of VRAM
# 720x480p video: 80 needs 8G of VRAM, 50 needs 7G of VRAM
PROPAINTER_MAX_LOAD_NUM = 70
# ×××××××××× InpaintMode.PROPAINTER algorithm settings end ××××××××××

# ×××××××××× InpaintMode.LAMA algorithm settings start ××××××××××
# Whether to enable super-fast mode; when enabled, inpaint quality is not
# guaranteed and only the text inside text-containing regions is removed.
LAMA_SUPER_FAST = False
# ×××××××××× InpaintMode.LAMA algorithm settings end ××××××××××
# ×××××××××××××××××××× [MODIFIABLE] end ××××××××××××××××××××
google_colabs/README.md ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Google Colab Gradio Interface
2
+
3
+ This folder contains two versions of the Google Colab notebook:
4
+
5
+ ## Files
6
+
7
+ ### 1. `Video_Subtitle_Remover_Gradio.ipynb` ⭐ **NEW - Recommended**
8
+ **Gradio Web Interface** - Easy-to-use browser-based UI
9
+
10
+ **Features:**
11
+ - 🖱️ Click-and-upload interface (no coding required)
12
+ - 🎨 Visual algorithm selection
13
+ - ⚙️ Adjustable parameters with sliders
14
+ - 📊 Real-time progress tracking
15
+ - 📥 One-click download
16
+
17
+ **Best for:**
18
+ - Users who prefer GUI
19
+ - Quick testing
20
+ - Non-technical users
21
+ - Multiple video processing
22
+
23
+ **Usage:**
24
+ 1. Open in Colab
25
+ 2. Run all cells
26
+ 3. Click the generated link
27
+ 4. Use web interface in browser
28
+
29
+ ---
30
+
31
+ ### 2. `Video_Subtitle_Remover.ipynb`
32
+ **Traditional Notebook** - Code-based approach
33
+
34
+ **Features:**
35
+ - Step-by-step execution
36
+ - Full control over parameters
37
+ - Good for understanding the process
38
+ - Batch processing scripts
39
+
40
+ **Best for:**
41
+ - Users comfortable with code
42
+ - Custom workflows
43
+ - Debugging
44
+ - Learning the internals
45
+
46
+ ---
47
+
48
+ ## Quick Start
49
+
50
+ ### For Gradio Interface (Recommended):
51
+
52
+ ```text
53
+ 1. Open Video_Subtitle_Remover_Gradio.ipynb in Colab
54
+ 2. Runtime → Change runtime type → GPU
55
+ 3. Run all cells (Ctrl+F9)
56
+ 4. Click the gradio.live URL
57
+ 5. Upload video and click "Remove Subtitles"
58
+ ```
59
+
60
+ ### For Traditional Notebook:
61
+
62
+ ```text
63
+ 1. Open Video_Subtitle_Remover.ipynb in Colab
64
+ 2. Runtime → Change runtime type → GPU
65
+ 3. Run cells step by step
66
+ 4. Configure settings in Step 5
67
+ 5. Run processing in Step 7
68
+ ```
69
+
70
+ ## Algorithm Recommendations
71
+
72
+ | Use Case | Algorithm | Quality | Speed |
73
+ |----------|-----------|---------|-------|
74
+ | **Best Quality** | DiffuEraser | ⭐⭐⭐⭐⭐ | ⭐⭐ |
75
+ | **Fastest** | STTN | ⭐⭐⭐ | ⭐⭐⭐⭐⭐ |
76
+ | **Balanced** | Stable Diffusion | ⭐⭐⭐⭐ | ⭐⭐⭐ |
77
+ | **High Motion** | ProPainter | ⭐⭐⭐⭐⭐ | ⭐ |
78
+
79
+ ## System Requirements
80
+
81
+ - **GPU**: Required (T4/P100/V100)
82
+ - **Storage**: 10-20GB for models
83
+ - **VRAM**:
84
+ - STTN: 4GB
85
+ - DiffuEraser: 12GB
86
+ - Stable Diffusion: 8GB
87
+
88
+ ## Performance (Colab T4 GPU)
89
+
90
+ | Video | Algorithm | Time |
91
+ |-------|-----------|------|
92
+ | 1 min 720p | STTN | ~30s |
93
+ | 1 min 720p | DiffuEraser | ~3-5min |
94
+ | 5 min 720p | STTN | ~2min |
95
+ | 5 min 720p | DiffuEraser | ~15-20min |
96
+
97
+ ## Troubleshooting
98
+
99
+ ### Gradio not loading
100
+ - Wait 30-60 seconds for models to load
101
+ - Check if all cells ran successfully
102
+ - Restart runtime and try again
103
+
104
+ ### Out of Memory
105
+ - Reduce batch size in settings
106
+ - Use STTN instead of DiffuEraser
107
+ - Process shorter videos
108
+
109
+ ### Slow processing
110
+ - Use STTN for preview
111
+ - Enable GPU in Colab settings
112
+ - Consider Colab Pro for longer runtimes and better GPU availability
113
+
114
+ ## Links
115
+
116
+ - **GitHub**: https://github.com/YaoFANGUK/video-subtitle-remover
117
+ - **Documentation**: See `docs/` folder
118
+ - **Issues**: Report on GitHub
119
+
120
+ ## Tips
121
+
122
+ 1. **Start with STTN** to test quickly
123
+ 2. **Use DiffuEraser** for final high-quality output
124
+ 3. **Keep videos under 10 minutes** on free tier
125
+ 4. **Save to Google Drive** to avoid data loss
126
+ 5. **Monitor GPU usage** with `!nvidia-smi`