Spaces:
Build error
Build error
Upload 2 files
Browse files- app.py +571 -0
- requirements.txt +11 -0
app.py
ADDED
|
@@ -0,0 +1,571 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr
import subprocess
import os
import tempfile
import shutil
from pathlib import Path
import json
import datetime
import csv
from pydub import AudioSegment
import numpy as np
import torch
import gc
from dotenv import load_dotenv

# NOTE(review): shutil, Path, csv and numpy appear unused in this module — confirm before removing.

# Load environment variables
load_dotenv()

# Import NeMo for transcription (you'll need to install: pip install nemo_toolkit[asr])
# NEMO_AVAILABLE gates every auto-transcription code path below.
try:
    from nemo.collections.asr.models import ASRModel
    NEMO_AVAILABLE = True
except ImportError:
    NEMO_AVAILABLE = False
    print("Warning: NeMo not available. Auto-transcription will be disabled.")
| 27 |
+
class AutomatedSubtitleBurner:
    """Burn SRT subtitles into videos with FFmpeg, optionally auto-generating
    the SRT with an NVIDIA NeMo ASR model (when installed and configured)."""

    def __init__(self):
        # Scratch directory for extracted audio, SRT files and output videos.
        # NOTE(review): never cleaned up explicitly — consider shutil.rmtree on shutdown.
        self.temp_dir = tempfile.mkdtemp()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None  # ASRModel instance, or None when unavailable

        # Load transcription model if available; MODEL_NAME comes from the environment.
        if NEMO_AVAILABLE:
            try:
                MODEL_NAME = os.getenv('MODEL_NAME')
                if MODEL_NAME:
                    self.model = ASRModel.from_pretrained(model_name=MODEL_NAME)
                    self.model.eval()
            except Exception as e:
                # BUGFIX: the failure was silently swallowed (bound `e` was unused),
                # making a bad MODEL_NAME indistinguishable from "not configured".
                # Log it so the condition is diagnosable; still degrade gracefully.
                print(f"Warning: failed to load ASR model: {e}")
                self.model = None
| 43 |
+
def extract_audio_from_video(self, video_path):
    """Extract the audio track of *video_path* as a mono 16 kHz WAV.

    Returns the path of the extracted file, or None when extraction fails.
    """
    out_wav = os.path.join(self.temp_dir, "extracted_audio.wav")
    ffmpeg_args = [
        'ffmpeg', '-y', '-i', video_path,
        '-vn',                   # drop the video stream
        '-acodec', 'pcm_s16le',  # 16-bit PCM audio codec
        '-ar', '16000',          # 16 kHz sample rate (what the ASR model expects)
        '-ac', '1',              # down-mix to mono
        out_wav,
    ]
    try:
        subprocess.run(ffmpeg_args, capture_output=True, check=True)
    except Exception as e:
        print(f"Error extracting audio: {e}")
        return None
    return out_wav
| 65 |
+
def format_srt_time(self, seconds: float) -> str:
    """Converts seconds to SRT time format HH:MM:SS,mmm.

    Negative inputs are clamped to zero before formatting.
    """
    clamped = max(0.0, seconds)
    delta = datetime.timedelta(seconds=clamped)
    whole_seconds = int(delta.total_seconds())
    hours, rem = divmod(whole_seconds, 3600)
    minutes, secs = divmod(rem, 60)
    millis = delta.microseconds // 1000
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
| 79 |
+
def generate_srt_content(self, word_timestamps: list) -> str:
    """Generates SRT formatted string from word timestamps.

    Each entry must be a mapping with 'start', 'end' (seconds) and 'word'.
    One SRT cue is emitted per word, numbered from 1.
    """
    parts = []
    index = 1
    for ts in word_timestamps:
        parts.append(str(index))
        parts.append(f"{self.format_srt_time(ts['start'])} --> {self.format_srt_time(ts['end'])}")
        parts.append(ts['word'])
        parts.append("")  # blank separator line required between cues
        index += 1
    return "\n".join(parts)
| 92 |
+
def transcribe_audio(self, audio_path, progress_callback=None):
    """Transcribe audio to get word-level timestamps.

    Returns (srt_content, message); srt_content is None on failure.
    progress_callback, if given, is called as progress_callback(fraction, text).
    """
    if not self.model or not NEMO_AVAILABLE:
        return None, "Transcription model not available"

    # BUGFIX: this flag was previously assigned *inside* the try block, after
    # several statements that can raise (pydub load, preprocessing). An early
    # exception then made the finally clause raise NameError, masking the real
    # error. Initialize it before entering the try so cleanup is always safe.
    long_audio_settings_applied = False

    try:
        if progress_callback:
            progress_callback(0.1, "Loading audio...")

        # Load and preprocess audio
        audio = AudioSegment.from_file(audio_path)
        duration_sec = audio.duration_seconds

        if progress_callback:
            progress_callback(0.2, "Preprocessing audio...")

        # Ensure audio is in the 16 kHz mono format the model expects
        if audio.frame_rate != 16000:
            audio = audio.set_frame_rate(16000)
        if audio.channels != 1:
            audio = audio.set_channels(1)

        # Save preprocessed audio
        processed_path = os.path.join(self.temp_dir, "processed_audio.wav")
        audio.export(processed_path, format="wav")

        if progress_callback:
            progress_callback(0.3, "Starting transcription...")

        # Configure model for long audio if needed (local attention keeps
        # memory bounded on clips longer than 8 minutes).
        if duration_sec > 480:  # 8 minutes
            try:
                print("Applying long audio settings for transcription...")
                self.model.change_attention_model("rel_pos_local_attn", [256, 256])
                self.model.change_subsampling_conv_chunking_factor(1)
                long_audio_settings_applied = True
            except Exception as e:
                print(f"Warning: Could not apply long audio settings: {e}")

        # Move model to appropriate device and precision
        self.model.to(self.device)
        self.model.to(torch.bfloat16)

        if progress_callback:
            progress_callback(0.5, "Transcribing (this may take a while)...")

        # Transcribe with timestamps
        output = self.model.transcribe([processed_path], timestamps=True)

        if progress_callback:
            progress_callback(0.8, "Processing transcription results...")

        if not output or not output[0] or not hasattr(output[0], 'timestamp'):
            return None, "Transcription failed - no output generated"

        # Get word-level timestamps
        word_timestamps = output[0].timestamp.get('word', [])

        if not word_timestamps:
            return None, "No word-level timestamps generated"

        # Generate SRT content
        srt_content = self.generate_srt_content(word_timestamps)

        if progress_callback:
            progress_callback(1.0, "Transcription complete!")

        return srt_content, "Transcription successful!"

    except torch.cuda.OutOfMemoryError:
        return None, "CUDA out of memory. Please try a shorter video or use CPU."
    except Exception as e:
        return None, f"Transcription error: {str(e)}"
    finally:
        # Cleanup model settings and memory; runs on success and failure alike.
        try:
            if long_audio_settings_applied and self.model:
                self.model.change_attention_model("rel_pos")
                self.model.change_subsampling_conv_chunking_factor(-1)

            if self.model and self.device == 'cuda':
                self.model.cpu()
            gc.collect()
            if self.device == 'cuda':
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"Warning: Error during cleanup: {e}")
| 181 |
+
def auto_generate_srt(self, video_file, progress=gr.Progress()):
    """Automatically generate SRT from video.

    Extracts the audio track, runs the ASR model over it and returns a pair
    (srt_text, status_message); srt_text is "" on any failure.
    """
    if not video_file:
        return "", "Please provide a video file"

    if not self.model or not NEMO_AVAILABLE:
        return "", "Transcription model not available. Please install NeMo toolkit."

    try:
        progress(0.05, desc="Extracting audio from video...")

        wav_path = self.extract_audio_from_video(video_file)
        if not wav_path:
            return "", "Failed to extract audio from video"

        progress(0.1, desc="Audio extracted, starting transcription...")

        # Map the transcriber's 0..1 progress into the 0.1..0.9 band of ours.
        def forward_progress(fraction, desc):
            progress(0.1 + (fraction * 0.8), desc=desc)

        srt_text, status = self.transcribe_audio(wav_path, forward_progress)

        progress(0.95, desc="Finalizing...")

        if not srt_text:
            return "", status
        progress(1.0, desc="SRT generation complete!")
        return srt_text, status

    except Exception as e:
        return "", f"Error generating SRT: {str(e)}"
| 216 |
+
def create_styled_srt(self, srt_content, font_size=24, font_color="white",
                      outline_color="black", outline_width=1):
    """Create a styled SRT file with ASS-style formatting.

    Wraps every cue's text in a <font> tag carrying size and colour.
    outline_color/outline_width are accepted but not embedded in the tag
    (SRT has no portable outline markup).
    """
    source_lines = srt_content.strip().split('\n')
    result = []
    total = len(source_lines)
    idx = 0

    while idx < total:
        if not source_lines[idx].strip().isdigit():
            # Not the start of a cue — skip the line.
            idx += 1
            continue

        # Cue index line
        result.append(source_lines[idx])
        idx += 1

        # Timestamp line
        if idx < total:
            result.append(source_lines[idx])
            idx += 1

        # Gather this cue's text lines
        text_block = []
        while idx < total and source_lines[idx].strip() != "":
            text_block.append(source_lines[idx])
            idx += 1

        # Apply styling to the merged text
        if text_block:
            merged = ' '.join(text_block)
            result.append(f"<font size='{font_size}' color='{font_color}'>{merged}</font>")

        result.append("")  # blank separator between cues
    return '\n'.join(result)
| 251 |
+
def get_video_info(self, video_path):
    """Get video information using ffprobe.

    Returns (width, height, duration_seconds); falls back to
    (1920, 1080, 0) when probing fails or no video stream exists.
    """
    probe_cmd = [
        'ffprobe', '-v', 'quiet', '-print_format', 'json',
        '-show_format', '-show_streams', video_path
    ]
    try:
        completed = subprocess.run(probe_cmd, capture_output=True, text=True, check=True)
        metadata = json.loads(completed.stdout)

        # First video stream wins
        video_stream = next(
            (s for s in metadata['streams'] if s['codec_type'] == 'video'),
            None,
        )
        if video_stream is not None:
            return (int(video_stream['width']),
                    int(video_stream['height']),
                    float(video_stream.get('duration', 0)))
    except Exception as e:
        print(f"Error getting video info: {e}")

    return 1920, 1080, 0  # Default values
| 279 |
+
def burn_subtitles(self, video_file, srt_content, font_size=24, font_color="white",
                   position="bottom_center", outline_color="black", outline_width=1,
                   progress=gr.Progress()):
    """Burn subtitles into video using FFmpeg.

    Writes the (styled) SRT to a temp file, re-encodes the video with the
    `subtitles` filter, and returns (output_path, status_message) —
    (None, error_message) on failure.

    NOTE(review): `position` is accepted but unused — Alignment is hard-coded
    to 2 (bottom centre) in the force_style string; confirm intended.
    """

    if not video_file or not srt_content.strip():
        return None, "Please provide both video file and SRT content"

    try:
        progress(0.1, desc="Preparing files...")

        # Create temporary SRT file with inline <font> styling applied
        srt_path = os.path.join(self.temp_dir, "subtitles.srt")
        styled_srt = self.create_styled_srt(srt_content, font_size, font_color,
                                            outline_color, outline_width)

        with open(srt_path, 'w', encoding='utf-8') as f:
            f.write(styled_srt)

        progress(0.2, desc="Getting video information...")

        # Duration is needed below to turn FFmpeg's time= lines into a fraction
        width, height, duration = self.get_video_info(video_file)

        progress(0.3, desc="Starting subtitle burning...")

        # Output file
        output_filename = f"output_with_subtitles_{font_size}px.mp4"
        output_path = os.path.join(self.temp_dir, output_filename)

        # Build FFmpeg command with subtitle filter.
        # NOTE(review): srt_path is single-quoted but not escaped — a quote or
        # special char in the temp path would break the filter string; confirm.
        cmd = [
            'ffmpeg', '-y',  # Overwrite output files
            '-i', video_file,
            '-vf', f"""subtitles='{srt_path}':force_style='FontSize={font_size},PrimaryColour=&H{self.color_to_bgr_hex(font_color)},OutlineColour=&H{self.color_to_bgr_hex(outline_color)},Outline={outline_width},Alignment=2'""",
            '-c:a', 'copy',      # Copy audio without re-encoding
            '-c:v', 'libx264',   # Video codec
            '-preset', 'medium', # Encoding preset
            '-crf', '23',        # Quality setting
            output_path
        ]

        progress(0.4, desc="Processing video (this may take a while)...")

        # Run FFmpeg; stderr is read line-by-line for progress reporting.
        # NOTE(review): stdout is piped but never read — harmless here since
        # FFmpeg logs to stderr, but a chatty stdout could fill the pipe.
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True
        )

        # Monitor progress: parse "time=HH:MM:SS.xx" out of FFmpeg's stderr
        # and map it onto the 0.4..0.9 progress band.
        while True:
            output = process.stderr.readline()
            if output == '' and process.poll() is not None:
                break
            if output and 'time=' in output:
                # Try to extract time for progress
                try:
                    time_str = output.split('time=')[1].split()[0]
                    time_parts = time_str.split(':')
                    current_seconds = (float(time_parts[0]) * 3600 +
                                       float(time_parts[1]) * 60 +
                                       float(time_parts[2]))
                    if duration > 0:
                        prog = 0.4 + (current_seconds / duration) * 0.5
                        progress(min(prog, 0.9), desc=f"Processing: {time_str}")
                except:
                    # Malformed time token — skip this progress update.
                    pass

        progress(0.95, desc="Finalizing...")

        return_code = process.poll()
        if return_code == 0:
            progress(1.0, desc="Complete!")
            return output_path, "Video processed successfully!"
        else:
            # NOTE(review): stderr was already drained by the loop above, so
            # this read likely returns "" and the message loses the real error.
            error_output = process.stderr.read()
            return None, f"FFmpeg error: {error_output}"

    except Exception as e:
        return None, f"Error processing video: {str(e)}"
| 363 |
+
def color_to_bgr_hex(self, color):
    """Convert a colour name to a BGR hex string for FFmpeg/libass force_style.

    ASS colours are &HBBGGRR — byte-reversed relative to RGB, which is why
    'red' maps to '0000FF'. Lookup is case-insensitive; unknown names fall
    back to white.
    """
    color_map = {
        'white': 'FFFFFF',
        'black': '000000',
        'red': '0000FF',
        'green': '00FF00',
        'blue': 'FF0000',
        'yellow': '00FFFF',
        'cyan': 'FFFF00',
        'magenta': 'FF00FF',
        'orange': '0080FF',
        'purple': '800080',
        # BUGFIX: was 'FFB6C1' (the RGB value of light pink); in this BGR table
        # that renders lavender. BGR of pink (RGB FFC0CB) is CBC0FF.
        'pink': 'CBC0FF',
        'gray': '808080',
        'grey': '808080'
    }
    return color_map.get(color.lower(), 'FFFFFF')
| 382 |
+
def preview_subtitles(self, srt_content, font_size, font_color, position):
    """Generate a preview of how subtitles will look.

    Shows at most the first three cues, each as a header, quoted text and
    a one-line style summary.
    """
    if not srt_content.strip():
        return "No SRT content provided"

    raw = srt_content.strip().split('\n')
    out = []
    shown = 0
    pos = 0
    total = len(raw)

    while pos < total and shown < 3:
        header = raw[pos].strip()
        if not header.isdigit():
            pos += 1
            continue

        # Cue number consumed; next line should be the timestamp.
        pos += 1
        if pos < total:
            stamp = raw[pos].strip()
            pos += 1

            # Collect the cue's text lines up to the blank separator.
            collected = []
            while pos < total and raw[pos].strip() != "":
                collected.append(raw[pos].strip())
                pos += 1

            if collected:
                out.append(f"#{header} [{stamp}]")
                out.append(f"Text: \"{' '.join(collected)}\"")
                out.append(f"Style: {font_size}px {font_color} at {position}")
                out.append("---")
                shown += 1

    return '\n'.join(out) if out else "No valid subtitles found"
| 420 |
+
# Initialize the subtitle burner
# Module-level singleton; shared by the Gradio event handlers wired up in
# create_interface() and by the startup checks in the __main__ block.
burner = AutomatedSubtitleBurner()
| 422 |
+
|
| 423 |
+
# Create Gradio interface
# NOTE(review): several label strings below contain mojibake (mis-decoded
# emoji, e.g. "π¬"); they are preserved as-is — restore the originals from VCS.
def create_interface():
    """Build and return the Gradio Blocks UI wired to the module-level burner."""
    with gr.Blocks(title="Automated AI Subtitle Video Captions", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# π¬ Automated SRT Subtitle Video Burner")
        gr.Markdown("Upload a video and either auto-generate subtitles or paste your own SRT content!")

        if not NEMO_AVAILABLE:
            gr.Markdown("β οΈ **Note**: Auto-transcription is disabled. Install NeMo toolkit for automatic SRT generation.")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### π Input")
                video_input = gr.File(
                    label="Upload Video File",
                    file_types=[".mp4", ".avi", ".mov", ".mkv", ".wmv", ".flv"],
                    type="filepath"
                )

                with gr.Row():
                    # Button is rendered but disabled when no ASR model is loaded.
                    if NEMO_AVAILABLE and burner.model:
                        auto_generate_btn = gr.Button("π€ Auto-Generate SRT", variant="secondary")
                    else:
                        auto_generate_btn = gr.Button("π€ Auto-Generate SRT (Disabled)", variant="secondary", interactive=False)

                srt_input = gr.Textbox(
                    label="SRT Content (Auto-generated or Manual)",
                    placeholder="SRT content will appear here after auto-generation, or paste your own...",
                    lines=12,
                    max_lines=20
                )

            with gr.Column(scale=1):
                gr.Markdown("### π¨ Subtitle Styling")

                font_size = gr.Slider(
                    minimum=8,
                    maximum=72,
                    value=24,
                    step=1,
                    label="Font Size (px)"
                )

                font_color = gr.Dropdown(
                    choices=["white", "black", "red", "green", "blue", "yellow",
                             "cyan", "magenta", "orange", "purple", "pink", "gray"],
                    value="white",
                    label="Font Color"
                )

                # NOTE(review): position is collected here but burn_subtitles
                # hard-codes Alignment=2; the dropdown currently has no effect.
                position = gr.Dropdown(
                    choices=["top_left", "top_center", "top_right",
                             "center_left", "center", "center_right",
                             "bottom_left", "bottom_center", "bottom_right"],
                    value="bottom_center",
                    label="Position"
                )

                outline_color = gr.Dropdown(
                    choices=["black", "white", "red", "green", "blue", "yellow",
                             "cyan", "magenta", "orange", "purple", "pink", "gray"],
                    value="black",
                    label="Outline Color"
                )

                outline_width = gr.Slider(
                    minimum=0,
                    maximum=5,
                    value=1,
                    step=1,
                    label="Outline Width"
                )

        with gr.Row():
            with gr.Column():
                gr.Markdown("### ποΈ Preview")
                preview_output = gr.Textbox(
                    label="Subtitle Preview",
                    lines=8,
                    interactive=False
                )

                preview_btn = gr.Button("π Preview Subtitles", variant="secondary")

        with gr.Row():
            process_btn = gr.Button("π₯ Burn Subtitles to Video", variant="primary", size="lg")

        with gr.Row():
            with gr.Column():
                output_video = gr.File(label="Download Processed Video")
                status_output = gr.Textbox(label="Status", interactive=False)

        # Event handlers
        if NEMO_AVAILABLE and burner.model:
            auto_generate_btn.click(
                fn=burner.auto_generate_srt,
                inputs=[video_input],
                outputs=[srt_input, status_output],
                show_progress=True
            )

        preview_btn.click(
            fn=burner.preview_subtitles,
            inputs=[srt_input, font_size, font_color, position],
            outputs=preview_output
        )

        process_btn.click(
            fn=burner.burn_subtitles,
            inputs=[video_input, srt_input, font_size, font_color, position,
                    outline_color, outline_width],
            outputs=[output_video, status_output],
            show_progress=True
        )

        # Auto-preview when inputs change
        for input_component in [srt_input, font_size, font_color, position]:
            input_component.change(
                fn=burner.preview_subtitles,
                inputs=[srt_input, font_size, font_color, position],
                outputs=preview_output
            )

    return demo
| 547 |
+
if __name__ == "__main__":
    # Fail fast if FFmpeg is not on PATH — every code path below depends on it.
    # NOTE(review): the status prints originally carried emoji that arrived
    # mojibake'd ("β…"); rejoined on one line here — restore originals from VCS.
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
        print("β FFmpeg found!")
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("β FFmpeg not found! Please install FFmpeg and make sure it's in your PATH.")
        print("Download from: https://ffmpeg.org/download.html")
        # BUGFIX: use SystemExit instead of the site-injected exit() builtin,
        # which is not guaranteed to exist (e.g. under `python -S` or frozen apps).
        raise SystemExit(1)

    # Check transcription capability
    if NEMO_AVAILABLE and burner.model:
        print("β Auto-transcription enabled!")
    else:
        print("β οΈ Auto-transcription disabled. Install NeMo toolkit for automatic SRT generation:")
        print("pip install nemo_toolkit[asr]")

    # Launch the interface
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (required for Spaces/containers)
        server_port=7860,
        share=True,
        debug=True
    )
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Cython
git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]
numpy<2.0
gradio
spaces
ffmpeg
pydub
ffmpeg-python
python-dotenv==1.0.0
torch
torchaudio