Commit 2302206 · Parent(s): 58d3731 · Build error

Add video caption app with Whisper auto-captioning and styling options

Files changed:
- .gitignore +41 -0
- README.md +25 -0
- app.py +633 -0
- requirements.txt +9 -0
- setup.sh +24 -0
.gitignore ADDED
@@ -0,0 +1,41 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+dist/
+build/
+*.egg-info/
+
+# Virtual environments
+venv/
+env/
+ENV/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# Temporary files
+temp/
+tmp/
+*.temp
+*.tmp
+
+# OS-specific files
+.DS_Store
+Thumbs.db
+
+# Model weights/large files
+*.pt
+*.pth
+*.model
+
+# Logs
+logs/
+*.log
+
+# Testing
+.coverage
+htmlcov/
+.pytest_cache/
README.md CHANGED
@@ -9,4 +9,29 @@ app_file: app.py
 pinned: false
 ---
 
+# Video Caption Generator
+
+This tool allows you to add captions to your videos with precise control over styling and positioning. You can either auto-generate captions using Whisper AI speech recognition or provide your own captions in SRT, ASS, or VTT format.
+
+## Features
+
+- **Auto Caption Generation**: Extract and transcribe audio from your video using OpenAI's Whisper model
+- **Manual Caption Support**: Input your own captions in popular formats (SRT, ASS, VTT)
+- **Customizable Styling**: Control font, size, color, and positioning of captions
+- **High-Quality Output**: Burn captions directly into your video with FFmpeg
+
+## How to Use
+
+1. Upload your video file
+2. Choose whether to auto-generate captions or provide your own
+3. Customize font, size, color, and alignment
+4. Click "Generate Captioned Video" and wait for processing
+5. Download the resulting video with embedded captions
+
+Perfect for creating accessible content, adding subtitles to multilingual videos, or emphasizing important information in educational content.
+
+## Note
+
+Processing time depends on video length and complexity. Auto-caption generation utilizes Whisper and may take longer for larger files.
+
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
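The README's workflow compresses to two FFmpeg invocations wrapped around a Whisper transcription call. A minimal sketch of how those command lines are assembled (the file names `in.mp4`, `audio.wav`, `caps.ass`, `out.mp4` are illustrative, not the app's actual temp paths):

```python
import shlex

def build_extract_cmd(video_path: str, wav_path: str) -> list:
    # Mono 16 kHz PCM WAV matches what Whisper's audio loader expects.
    return ["ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le",
            "-ac", "1", "-ar", "16000", "-y", wav_path]

def build_burn_cmd(video_path: str, ass_path: str, out_path: str) -> list:
    # The libass-backed 'ass' filter burns styled subtitles into the video stream.
    return ["ffmpeg", "-i", video_path, "-vf", f"ass='{ass_path}'",
            "-c:v", "libx264", "-c:a", "aac", "-y", out_path]

extract = build_extract_cmd("in.mp4", "audio.wav")
burn = build_burn_cmd("in.mp4", "caps.ass", "out.mp4")
print(shlex.join(extract))
```

Building the commands as argument lists (rather than one shell string) sidesteps quoting problems with user-supplied file names; `app.py` below takes the same approach.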
app.py ADDED
@@ -0,0 +1,633 @@
+import os
+import tempfile
+import gradio as gr
+import ffmpeg
+import logging
+import whisper as openai_whisper  # Renamed to avoid potential conflicts
+import numpy as np
+import torch
+import datetime
+import subprocess
+import shlex
+from pathlib import Path
+import re  # For parsing ASS/SRT
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Define fonts directory - adapt for Hugging Face environment if needed
+FONTS_DIR = '/usr/share/fonts/truetype'  # Common Linux font location
+# Check common font locations for other OS if needed
+if not os.path.exists(FONTS_DIR) and os.path.exists('/System/Library/Fonts'):  # macOS
+    FONTS_DIR = '/System/Library/Fonts'
+elif not os.path.exists(FONTS_DIR) and os.path.exists(r'C:\Windows\Fonts'):  # Windows (raw string avoids invalid escape sequences)
+    FONTS_DIR = r'C:\Windows\Fonts'
+
+FONT_PATHS = {}
+ACCEPTABLE_FONTS = ['Arial', 'Helvetica', 'Times New Roman']  # Start with common fallbacks
+try:
+    if FONTS_DIR and os.path.exists(FONTS_DIR):
+        logger.info(f"Searching for fonts in: {FONTS_DIR}")
+        found_fonts = []
+        for root, dirs, files in os.walk(FONTS_DIR):
+            for file in files:
+                if file.lower().endswith(('.ttf', '.otf', '.ttc')):
+                    font_path = os.path.join(root, file)
+                    font_name = os.path.splitext(file)[0]
+                    # Basic name cleanup
+                    base_font_name = re.sub(r'[-_ ]?(bold|italic|regular|medium|light|condensed)?$', '', font_name, flags=re.IGNORECASE)
+                    if base_font_name not in FONT_PATHS:
+                        FONT_PATHS[base_font_name] = font_path
+                        found_fonts.append(base_font_name)
+        if found_fonts:
+            ACCEPTABLE_FONTS = sorted(list(set(found_fonts + ACCEPTABLE_FONTS)))
+            logger.info(f"Found system fonts: {ACCEPTABLE_FONTS}")
+        else:
+            logger.warning(f"No font files found in {FONTS_DIR}. Using defaults.")
+    else:
+        logger.warning(f"Font directory {FONTS_DIR} not found. Using defaults: {ACCEPTABLE_FONTS}")
+except Exception as e:
+    logger.warning(f"Could not load system fonts from {FONTS_DIR}: {e}. Using defaults: {ACCEPTABLE_FONTS}")
+
+# Global variable for Whisper model to avoid reloading
+whisper_model = None
+
+def generate_style_line(options):
+    """Generate ASS style line from options. Uses common defaults.
+    Ensure color format is correct (&HBBGGRRAA or &HAABBGGRR depending on FFmpeg build).
+    Using &HBBGGRR format for PrimaryColour based on common FFmpeg usage.
+    """
+    # Convert hex color picker (#FFFFFF) to ASS format (&HBBGGRR)
+    def hex_to_ass_bgr(hex_color):
+        hex_color = hex_color.lstrip('#')
+        if len(hex_color) == 6:
+            r, g, b = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
+            return f"&H{b:02X}{g:02X}{r:02X}"
+        return '&H00FFFFFF'  # Default to white if format is wrong
+
+    primary_color_ass = hex_to_ass_bgr(options.get('primary_color', '#FFFFFF'))
+
+    style_options = {
+        'Name': 'Default',
+        'Fontname': options.get('font_name', 'Arial'),  # Ensure this font is accessible to FFmpeg
+        'Fontsize': options.get('font_size', 24),
+        'PrimaryColour': primary_color_ass,
+        'SecondaryColour': '&H000000FF',  # Often unused, but good to define
+        'OutlineColour': '&H00000000',  # Black outline
+        'BackColour': '&H80000000',  # Semi-transparent black background/shadow
+        'Bold': 0,  # Use -1 for True, 0 for False in ASS
+        'Italic': 0,
+        'Underline': 0,
+        'StrikeOut': 0,
+        'ScaleX': 100,
+        'ScaleY': 100,
+        'Spacing': 0,
+        'Angle': 0,
+        'BorderStyle': 1,  # 1 = Outline + Shadow
+        'Outline': 2,  # Outline thickness
+        'Shadow': 1,  # Shadow distance
+        'Alignment': options.get('alignment', 2),  # 2 = Bottom Center
+        'MarginL': 10,
+        'MarginR': 10,
+        'MarginV': 10,  # Bottom margin
+        'Encoding': 1  # Default ANSI encoding
+    }
+    logger.info(f"Generated ASS Style Options: {style_options}")
+    return f"Style: {','.join(map(str, style_options.values()))}"
+
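The colour handling in `generate_style_line` is the subtle part: HTML colour pickers emit `#RRGGBB`, while ASS styles store the bytes reversed as `&HBBGGRR`. A standalone sketch of that conversion (same logic as the helper above, isolated for testing):

```python
def hex_to_ass_bgr(hex_color: str) -> str:
    # ASS stores colours blue-first (&HBBGGRR), the reverse of HTML #RRGGBB.
    hex_color = hex_color.lstrip('#')
    if len(hex_color) == 6:
        r, g, b = (int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
        return f"&H{b:02X}{g:02X}{r:02X}"
    return '&H00FFFFFF'  # fall back to white on malformed input

print(hex_to_ass_bgr('#FF8000'))  # orange: red FF, green 80, blue 00 -> &H0080FF
```

A colour that renders blue instead of red is the classic symptom of skipping this byte swap.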
+def transcribe_audio(audio_path, progress=None):
+    """Transcribe audio using Whisper ASR model."""
+    global whisper_model
+    logger.info(f"Starting transcription for: {audio_path}")
+    try:
+        if whisper_model is None:
+            safe_progress_update(progress, 0.1, "Loading Whisper model...")
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            logger.info(f"Using device: {device} for Whisper")
+            # Use a smaller model if only CPU is available to potentially speed things up
+            model_size = "base" if device == "cuda" else "tiny.en"  # or "tiny"
+            logger.info(f"Loading Whisper model size: {model_size}")
+            whisper_model = openai_whisper.load_model(model_size, device=device)
+            safe_progress_update(progress, 0.3, "Model loaded, processing audio...")
+
+        result = whisper_model.transcribe(audio_path, fp16=torch.cuda.is_available())
+        logger.info(f"Transcription result (first 100 chars): {str(result)[:100]}")
+        safe_progress_update(progress, 0.7, "Transcription complete, formatting captions...")
+        return result
+    except Exception:
+        logger.exception(f"Error transcribing audio: {audio_path}")  # Use logger.exception to include traceback
+        raise
+
+def format_time(seconds):
+    """Format time in ASS format (H:MM:SS.cc)."""
+    # ASS format uses H:MM:SS.xx (hundredths of a second)
+    hundredths = int((seconds % 1) * 100)
+    s = int(seconds) % 60
+    m = int(seconds / 60) % 60
+    h = int(seconds / 3600)
+    return f"{h}:{m:02d}:{s:02d}.{hundredths:02d}"
+
+def format_time_srt(seconds):
+    """Format time in SRT format (HH:MM:SS,ms)."""
+    ms = int((seconds % 1) * 1000)
+    s = int(seconds) % 60
+    m = int(seconds / 60) % 60
+    h = int(seconds / 3600)
+    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
+
+def generate_srt_from_transcript(segments):
+    """Convert whisper segments to SRT format."""
+    srt_content = ""
+    for i, segment in enumerate(segments):
+        start_time = format_time_srt(segment["start"])
+        end_time = format_time_srt(segment["end"])
+        text = segment["text"].strip()
+        srt_content += f"{i+1}\n{start_time} --> {end_time}\n{text}\n\n"
+    logger.info(f"Generated SRT (first 200 chars): {srt_content[:200]}")
+    return srt_content.strip()
+
+def generate_ass_dialogue_line(segment, style_name='Default'):
+    """Generate a single ASS dialogue line from a segment."""
+    start_time = format_time(segment["start"])
+    end_time = format_time(segment["end"])
+    text = segment["text"].strip().replace('\n', '\\N')  # Replace newline with ASS newline
+    # Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+    return f"Dialogue: 0,{start_time},{end_time},{style_name},,0,0,0,,{text}"
+
+def generate_ass_from_transcript(segments, style_options):
+    """Convert whisper segments to ASS format including style header."""
+    style_line = generate_style_line(style_options)
+    ass_header = f"""[Script Info]
+Title: Generated Captions
+ScriptType: v4.00+
+WrapStyle: 0
+PlayResX: 384
+PlayResY: 288
+ScaledBorderAndShadow: yes
+
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+{style_line}
+
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+"""
+    dialogue_lines = [generate_ass_dialogue_line(seg) for seg in segments]
+    full_ass_content = ass_header + "\n".join(dialogue_lines)
+    logger.info(f"Generated ASS (first 300 chars): {full_ass_content[:300]}")
+    return full_ass_content
+
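The SRT generation above is simple enough to verify in isolation: each Whisper segment becomes a numbered block with comma-separated millisecond timestamps. A self-contained sketch with hypothetical segment data (Whisper's real segments carry more keys, but only `start`, `end`, and `text` matter here):

```python
def format_time_srt(seconds: float) -> str:
    # SRT uses HH:MM:SS,mmm with a comma before the milliseconds.
    ms = int((seconds % 1) * 1000)
    s, m, h = int(seconds) % 60, int(seconds / 60) % 60, int(seconds / 3600)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

def segments_to_srt(segments) -> str:
    # Each segment becomes: index, "start --> end", text, blank line.
    blocks = []
    for i, seg in enumerate(segments, start=1):
        blocks.append(f"{i}\n{format_time_srt(seg['start'])} --> "
                      f"{format_time_srt(seg['end'])}\n{seg['text'].strip()}")
    return "\n\n".join(blocks)

srt = segments_to_srt([
    {"start": 0.0, "end": 2.5, "text": " Hello world."},
    {"start": 2.5, "end": 5.0, "text": " Second line."},
])
print(srt)
```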
+def extract_audio(video_path, output_path):
+    """Extract audio from video file using ffmpeg subprocess."""
+    logger.info(f"Attempting to extract audio from {video_path} to {output_path}")
+    try:
+        command = [
+            "ffmpeg", "-i", video_path,
+            "-vn",  # No video
+            "-acodec", "pcm_s16le",  # Standard WAV format
+            "-ac", "1",  # Mono
+            "-ar", "16000",  # 16kHz sample rate (common for ASR)
+            "-y",  # Overwrite output
+            output_path
+        ]
+        logger.info(f"Running audio extraction command: {' '.join(map(shlex.quote, command))}")
+        process = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            encoding='utf-8',  # Explicitly set encoding
+            check=False
+        )
+
+        if process.returncode != 0:
+            logger.error(f"FFmpeg audio extraction error (Code {process.returncode}):\nSTDOUT:\n{process.stdout}\nSTDERR:\n{process.stderr}")
+            return False, f"FFmpeg failed (Code {process.returncode}): {process.stderr[:500]}..."
+
+        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
+            logger.error(f"Audio extraction failed: Output file not created or empty. FFmpeg stderr: {process.stderr}")
+            return False, f"Output audio file not created or empty. FFmpeg stderr: {process.stderr[:500]}..."
+
+        logger.info(f"Audio extracted successfully to {output_path}, size: {os.path.getsize(output_path)} bytes")
+        return True, ""
+    except Exception as e:
+        logger.exception(f"Exception during audio extraction from {video_path}")
+        return False, str(e)
+
+def run_ffmpeg_with_subtitles(video_path, subtitle_path, output_path, style_options=None):
+    """Burn subtitles into video using ffmpeg subprocess.
+
+    Args:
+        video_path: Path to input video
+        subtitle_path: Path to ASS subtitle file
+        output_path: Path to save output video
+        style_options: Optional style parameters (not directly used, but kept for consistency)
+
+    Returns:
+        tuple: (success, error_message)
+    """
+    logger.info(f"Attempting to burn subtitles from {subtitle_path} into {video_path}")
+
+    # Check if the subtitle file exists and is not empty
+    if not os.path.exists(subtitle_path) or os.path.getsize(subtitle_path) == 0:
+        return False, f"Subtitle file {subtitle_path} does not exist or is empty"
+
+    # Check if the video file exists
+    if not os.path.exists(video_path):
+        return False, f"Video file {video_path} does not exist"
+
+    # Validate the video file using ffprobe
+    try:
+        probe_cmd = [
+            "ffprobe", "-v", "error",
+            "-select_streams", "v:0",
+            "-show_entries", "stream=codec_name,width,height",
+            "-of", "json",
+            video_path
+        ]
+        probe_result = subprocess.run(
+            probe_cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            encoding='utf-8'
+        )
+
+        if probe_result.returncode != 0:
+            logger.error(f"FFprobe validation failed: {probe_result.stderr}")
+            return False, f"FFprobe validation failed: {probe_result.stderr[:200]}..."
+    except Exception as e:
+        logger.exception(f"Exception during video validation: {video_path}")
+        return False, f"Video validation failed: {str(e)}"
+
+    try:
+        # The subtitle path needs to be properly escaped for the filter complex
+        # On Windows, backslashes need special handling
+        subtitle_path_esc = subtitle_path.replace('\\', '\\\\')
+
+        # Ensure paths are properly quoted for the shell command
+        command = [
+            "ffmpeg",
+            "-i", video_path,
+            "-vf", f"ass='{subtitle_path_esc}'",
+            "-c:v", "libx264",  # Use H.264 codec for broad compatibility
+            "-preset", "medium",  # Balance between speed and quality
+            "-crf", "23",  # Reasonable quality setting (lower is better)
+            "-c:a", "aac",  # Use AAC for audio
+            "-b:a", "128k",  # Decent audio bitrate
+            "-movflags", "+faststart",  # Optimize for web playback
+            "-y",  # Overwrite output if exists
+            output_path
+        ]
+
+        logger.info(f"Running subtitle burn command: {' '.join(map(shlex.quote, command))}")
+
+        process = subprocess.run(
+            command,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            encoding='utf-8',
+            check=False
+        )
+
+        if process.returncode != 0:
+            logger.error(f"FFmpeg subtitle burn error (Code {process.returncode}):\nSTDOUT:\n{process.stdout}\nSTDERR:\n{process.stderr}")
+            return False, f"FFmpeg failed (Code {process.returncode}): {process.stderr[:500]}..."
+
+        # Verify output file was created and is not empty
+        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
+            logger.error(f"Subtitle burning failed: Output file not created or empty. FFmpeg stderr: {process.stderr}")
+            return False, f"Output video file not created or empty. FFmpeg stderr: {process.stderr[:500]}..."
+
+        logger.info(f"Subtitles burned successfully, output: {output_path}, size: {os.path.getsize(output_path)} bytes")
+        return True, ""
+
+    except Exception as e:
+        logger.exception(f"Exception during subtitle burning: {video_path}")
+        return False, str(e)
+
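The `replace('\\', '\\\\')` above covers one hazard, but FFmpeg filtergraph syntax also treats `:` as an option separator, so Windows drive letters like `C:` can still break the `ass=` filter. A fuller escaping sketch (one level of escaping; depending on how the filtergraph is quoted, FFmpeg can require a second pass, so treat this as an assumption to verify against your FFmpeg build):

```python
def escape_for_ass_filter(path: str) -> str:
    # In an FFmpeg filtergraph, '\' is the escape character and ':' separates
    # filter options, so both must be escaped inside the subtitle filename.
    return path.replace('\\', '\\\\').replace(':', '\\:')

print(escape_for_ass_filter('C:\\subs\\captions.ass'))
```

POSIX paths without colons pass through unchanged, which is why the bug only surfaces on Windows.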
+def safe_progress_update(progress_callback, value, desc=""):
+    """Safely update progress without crashing if progress_callback is None or fails."""
+    if progress_callback is not None:
+        try:
+            progress_callback(value, desc)
+        except Exception as e:
+            # Avoid flooding logs for simple progress updates
+            # logger.warning(f"Progress update failed: {e}")
+            pass  # Silently ignore progress update errors
+
+def parse_srt_to_dialogue(srt_content):
+    """Basic SRT parser to list of dialogue events for ASS conversion."""
+    dialogue = []
+    # Regex to find index, timecodes, and text blocks
+    # Allows comma or period for milliseconds separator
+    pattern = re.compile(
+        r'^\s*(\d+)\s*$\n?'  # Index line
+        r'(\d{1,2}):(\d{2}):(\d{2})[,.](\d{3})\s*-->\s*'  # Start time
+        r'(\d{1,2}):(\d{2}):(\d{2})[,.](\d{3})\s*$\n'  # End time
+        r'(.*?)(?=\n\s*\n\d+\s*$|\Z)',  # Text block (non-greedy) until blank line and next index or end of string
+        re.DOTALL | re.MULTILINE
+    )
+
+    logger.info("Attempting to parse SRT/VTT content...")
+    matches_found = 0
+    last_index = 0
+    for match in pattern.finditer(srt_content):
+        matches_found += 1
+        try:
+            index = int(match.group(1))
+            sh, sm, ss, sms = map(int, match.group(2, 3, 4, 5))
+            eh, em, es, ems = map(int, match.group(6, 7, 8, 9))
+            start_sec = sh * 3600 + sm * 60 + ss + sms / 1000.0
+            end_sec = eh * 3600 + em * 60 + es + ems / 1000.0
+            text = match.group(10).strip().replace('\n', '\\N')  # Replace newline with ASS \N
+
+            # Basic validation
+            if end_sec < start_sec:
+                logger.warning(f"SRT parse warning: End time {end_sec} before start time {start_sec} at index {index}. Skipping.")
+                continue
+            if not text:
+                logger.warning(f"SRT parse warning: Empty text content at index {index}. Skipping.")
+                continue
+
+            dialogue.append({'start': start_sec, 'end': end_sec, 'text': text})
+            last_index = match.end()
+
+        except Exception as e:
+            logger.warning(f"Could not parse SRT block starting near index {match.group(1)}: {e}")
+
+    # Check if parsing consumed a reasonable amount of the input
+    if matches_found > 0 and last_index < len(srt_content) * 0.8:
+        logger.warning(f"SRT parsing finished early. Found {matches_found} blocks, but stopped near character {last_index} of {len(srt_content)}. Input format might be inconsistent.")
+    elif matches_found == 0 and len(srt_content) > 10:
+        logger.error(f"SRT parsing failed. No dialogue blocks found in content starting with: {srt_content[:100]}...")
+
+    logger.info(f"Parsed {len(dialogue)} dialogue events from SRT/VTT content.")
+    return dialogue
+
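The SRT regex above is dense; a stripped-down version (same pattern, minus the logging and ASS newline substitution) shows the shape of the events it produces on a two-block input:

```python
import re

# One regex pass per SRT block: index line, "HH:MM:SS,mmm --> HH:MM:SS,mmm",
# then the text up to the next blank-line-plus-index (or the end of input).
SRT_BLOCK = re.compile(
    r'^\s*(\d+)\s*$\n?'
    r'(\d{1,2}):(\d{2}):(\d{2})[,.](\d{3})\s*-->\s*'
    r'(\d{1,2}):(\d{2}):(\d{2})[,.](\d{3})\s*$\n'
    r'(.*?)(?=\n\s*\n\d+\s*$|\Z)',
    re.DOTALL | re.MULTILINE,
)

def parse_srt(srt: str):
    events = []
    for m in SRT_BLOCK.finditer(srt):
        sh, sm, ss, sms = map(int, m.group(2, 3, 4, 5))
        eh, em, es, ems = map(int, m.group(6, 7, 8, 9))
        events.append({
            'start': sh * 3600 + sm * 60 + ss + sms / 1000.0,
            'end': eh * 3600 + em * 60 + es + ems / 1000.0,
            'text': m.group(10).strip(),
        })
    return events

sample = "1\n00:00:01,000 --> 00:00:02,000\nHello\n\n2\n00:00:03,000 --> 00:00:04,500\nWorld\n"
events = parse_srt(sample)
print(events)
```

Accepting either `,` or `.` before the milliseconds is what lets the same parser handle both SRT and (simple) VTT timecodes.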
+def parse_ass_to_dialogue(ass_content):
+    """Basic ASS parser to extract dialogue events."""
+    dialogue = []
+    # Regex for ASS Dialogue line - make capturing groups non-optional where possible
+    # Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+    pattern = re.compile(
+        r'^Dialogue:\s*'
+        r'(?P<layer>\d+),\s*'
+        r'(?P<start>\d+:\d{2}:\d{2}\.\d{2}),\s*'
+        r'(?P<end>\d+:\d{2}:\d{2}\.\d{2}),\s*'
+        r'(?P<style>[^,]*),\s*'  # Style name
+        r'(?P<name>[^,]*),\s*'  # Actor name
+        r'(?P<marginL>\d+),\s*'
+        r'(?P<marginR>\d+),\s*'
+        r'(?P<marginV>\d+),\s*'
+        r'(?P<effect>[^,]*),\s*'  # Effect
+        r'(?P<text>.*?)$',  # Text (rest of line)
+        re.IGNORECASE
+    )
+
+    # Helper to convert H:MM:SS.xx to seconds
+    def time_to_seconds(time_str):
+        try:
+            parts = time_str.split(':')
+            h = int(parts[0])
+            m = int(parts[1])
+            s_parts = parts[2].split('.')
+            s = int(s_parts[0])
+            cs = int(s_parts[1])
+            return h * 3600 + m * 60 + s + cs / 100.0
+        except Exception as e:
+            logger.error(f"Failed to parse time string '{time_str}': {e}")
+            return 0.0  # Return 0 on failure to avoid crashing, but log it
+
+    logger.info("Attempting to parse ASS content...")
+    lines_parsed = 0
+    for line in ass_content.splitlines():
+        line = line.strip()
+        if not line.lower().startswith('dialogue:'):
+            continue
+
+        match = pattern.match(line)
+        if match:
+            lines_parsed += 1
+            try:
+                start_sec = time_to_seconds(match.group('start'))
+                end_sec = time_to_seconds(match.group('end'))
+                text = match.group('text').strip()  # Already handles \N from ASS spec
+
+                if end_sec < start_sec:
+                    logger.warning(f"ASS parse warning: End time {end_sec} before start time {start_sec} in line: '{line}'. Skipping.")
+                    continue
+                if not text:
+                    logger.warning(f"ASS parse warning: Empty text content in line: '{line}'. Skipping.")
+                    continue
+
+                dialogue.append({'start': start_sec, 'end': end_sec, 'text': text})
+            except Exception as e:
+                logger.warning(f"Could not parse ASS dialogue line: '{line}'. Error: {e}")
+        else:
+            logger.warning(f"ASS dialogue line did not match expected pattern: '{line}'")
+
+    if lines_parsed == 0 and len(ass_content) > 50:  # Check if content was substantial
+        logger.error(f"ASS parsing failed. No dialogue lines matched the expected pattern in content starting with: {ass_content[:200]}...")
+
+    logger.info(f"Parsed {len(dialogue)} dialogue events from {lines_parsed} matched ASS lines.")
+    return dialogue
+
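The `time_to_seconds` helper above inverts `format_time`: ASS timestamps are `H:MM:SS.cc` with centiseconds, not milliseconds. Isolated as a sketch (without the logging fallback the app adds):

```python
def ass_time_to_seconds(t: str) -> float:
    # ASS timestamps look like H:MM:SS.cc, where cc is centiseconds.
    h, m, rest = t.split(':')
    s, cs = rest.split('.')
    return int(h) * 3600 + int(m) * 60 + int(s) + int(cs) / 100.0

print(ass_time_to_seconds('0:01:05.50'))  # 65.5
```

Dividing by 1000 here (as SRT milliseconds would suggest) is an easy off-by-10x mistake; the two-digit fraction is the giveaway.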
+def process_video_with_captions(video, captions, caption_type, font_name, font_size,
+                                primary_color, alignment, auto_caption):
+    """Main processing function."""
+    progress = gr.Progress(track_tqdm=True)
+    temp_dir = None
+    try:
+        progress(0, desc="Initializing...")
+        temp_dir = tempfile.mkdtemp()
+        logger.info(f"Created temp dir: {temp_dir}")
+
+        video_path = os.path.join(temp_dir, "input_video.mp4")
+        output_path = os.path.join(temp_dir, "output_video.mp4")
+        # Removed initial_subtitle_path, only need final
+        final_ass_path = os.path.join(temp_dir, "captions_final.ass")
+
+        # --- Handle Video Input ---
+        progress(0.05, desc="Saving video...")
+        if hasattr(video, 'name') and video.name and os.path.exists(video.name):
+            import shutil
+            shutil.copy(video.name, video_path)
+            logger.info(f"Copied input video from Gradio temp file {video.name} to {video_path}")
+        elif isinstance(video, str) and os.path.exists(video):
+            import shutil
+            shutil.copy(video, video_path)
+            logger.info(f"Copied input video from path {video} to {video_path}")
+        else:
+            raise gr.Error("Could not access uploaded video file. Please try uploading again.")
+
+        # --- Prepare Styles ---
+        progress(0.1, desc="Preparing styles...")
+        generated_captions_display_text = ""
+        alignment_map = {"Bottom Center": 2, "Bottom Left": 1, "Bottom Right": 3}
+        style_options = {
+            'font_name': font_name,
+            'font_size': font_size,
+            'primary_color': primary_color,
+            'alignment': alignment_map.get(alignment, 2)
+        }
+
+        # --- Auto-Generate or Process Provided Captions ---
+        dialogue_events = []  # To hold {'start': float, 'end': float, 'text': str}
+
+        if auto_caption:
+            logger.info("Auto-generating captions...")
+            progress(0.15, desc="Extracting audio...")
+            audio_path = os.path.join(temp_dir, "audio.wav")
+            success, error_msg = extract_audio(video_path, audio_path)
+            if not success: raise gr.Error(f"Audio extraction failed: {error_msg}")
+
+            progress(0.25, desc="Transcribing audio...")
+            transcript = transcribe_audio(audio_path, progress=progress)
+            if not transcript or not transcript.get("segments"): raise gr.Error("No speech detected.")
+            dialogue_events = transcript["segments"]  # Use segments directly
+            progress(0.6, desc="Generating ASS captions...")
+
+        else:  # Use provided captions
+            logger.info(f"Using provided {caption_type} captions.")
+            if not captions or captions.strip() == "": raise gr.Error("Caption input is empty.")
+
+            progress(0.6, desc=f"Processing {caption_type} captions...")
+            if caption_type.lower() == 'ass':
+                logger.info("Parsing provided ASS content.")
+                dialogue_events = parse_ass_to_dialogue(captions)
+                if not dialogue_events:
+                    raise gr.Error("Could not parse dialogue lines from provided ASS content.")
+            elif caption_type.lower() in ['srt', 'vtt']:
+                logger.info(f"Parsing provided {caption_type} content.")
+                dialogue_events = parse_srt_to_dialogue(captions)
+                if not dialogue_events:
+                    raise gr.Error(f"Could not parse provided {caption_type} content.")
+            else:
+                raise gr.Error(f"Unsupported caption type: {caption_type}")
+
+        # --- Generate Final ASS File ---
+        if not dialogue_events:
+            raise gr.Error("No caption dialogue events found or generated.")
+
+        logger.info(f"Generating final ASS file with {len(dialogue_events)} events and UI styles.")
+        final_ass_content = generate_ass_from_transcript(dialogue_events, style_options)
+        generated_captions_display_text = final_ass_content  # Show the final generated ASS
+
+        with open(final_ass_path, 'w', encoding='utf-8') as f:
+            f.write(final_ass_content)
+        logger.info(f"Written final styled ASS to {final_ass_path}")
+
+        # Verify file creation
+        if not os.path.exists(final_ass_path) or os.path.getsize(final_ass_path) == 0:
+            raise gr.Error(f"Internal error: Failed to write final ASS file to {final_ass_path}")
+
+        # --- Burn Subtitles ---
+        progress(0.7, desc="Burning subtitles into video...")
+        success, error_msg = run_ffmpeg_with_subtitles(
+            video_path, final_ass_path, output_path, style_options
+        )
+        if not success:
+            logger.error(f"Subtitle burning failed. Video: {video_path}, ASS: {final_ass_path}")
+            raise gr.Error(f"FFmpeg failed to burn subtitles: {error_msg}")
+
+        progress(1.0, desc="Processing complete!")
+        logger.info(f"Output video generated: {output_path}")
+
+        return output_path, generated_captions_display_text
+
+    except Exception as e:
+        logger.exception("Error in process_video_with_captions")
|
| 544 |
+
if temp_dir and os.path.exists(temp_dir):
|
| 545 |
+
try:
|
| 546 |
+
files = os.listdir(temp_dir)
|
| 547 |
+
logger.error(f"Files in temp dir {temp_dir} during error: {files}")
|
| 548 |
+
except Exception as list_e:
|
| 549 |
+
logger.error(f"Could not list temp dir {temp_dir}: {list_e}")
|
| 550 |
+
if isinstance(e, gr.Error): raise e
|
| 551 |
+
else: raise gr.Error(f"An unexpected error occurred: {str(e)}")
|
| 552 |
+
|
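The manual-caption branch above relies on `parse_srt_to_dialogue`, which is defined earlier in app.py and is not part of this hunk. Purely as an illustration (not the app's actual implementation), a minimal parser producing the `{'start': float, 'end': float, 'text': str}` event shape noted in the code could look like this:

```python
import re

# Matches "HH:MM:SS,mmm --> HH:MM:SS.mmm" (SRT uses ",", VTT uses ".")
TIME_RE = re.compile(
    r"(\d{2}):(\d{2}):(\d{2})[,.](\d{3})\s*-->\s*(\d{2}):(\d{2}):(\d{2})[,.](\d{3})"
)

def parse_srt_to_dialogue(captions: str):
    """Parse SRT/VTT text into [{'start': float, 'end': float, 'text': str}, ...]."""
    events = []
    for block in re.split(r"\n\s*\n", captions.strip()):
        lines = block.strip().splitlines()
        for i, line in enumerate(lines):
            m = TIME_RE.search(line)
            if m:
                h1, m1, s1, ms1, h2, m2, s2, ms2 = (int(g) for g in m.groups())
                events.append({
                    "start": h1 * 3600 + m1 * 60 + s1 + ms1 / 1000.0,
                    "end": h2 * 3600 + m2 * 60 + s2 + ms2 / 1000.0,
                    # Everything after the timing line is caption text
                    "text": " ".join(lines[i + 1:]).strip(),
                })
                break
    return [e for e in events if e["text"]]
```

Accepting both `,` and `.` as the millisecond separator is what lets one parser handle the app's `srt` and `vtt` cases in a single code path.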
```python
# Function to toggle interactivity
def toggle_captions_input(auto_generate):
    """Toggle the interactivity of the captions input."""
    return gr.update(interactive=not auto_generate)

# --- Gradio Interface ---
with gr.Blocks(title="Video Caption Generator") as app:
    gr.Markdown("## Video Caption Generator")
    gr.Markdown("Upload a video, choose styling, and add captions. Use auto-generation or provide your own SRT/ASS/VTT.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("**Input & Options**")
            video_input = gr.Video(label="Upload Video")
            auto_caption = gr.Checkbox(label="Auto-generate captions (Overrides below)", value=False)
            captions_input = gr.Textbox(
                label="Or Enter Captions Manually",
                placeholder="1\n00:00:01,000 --> 00:00:05,000\nHello World\n\n2\n...",
                lines=8,
                interactive=True
            )
            caption_type = gr.Dropdown(
                choices=["srt", "ass", "vtt"],
                value="srt",
                label="Format (if providing captions manually)"
            )

            gr.Markdown("**Caption Styling** (Applied to auto-generated or converted ASS)")
            with gr.Row():
                font_name = gr.Dropdown(
                    choices=ACCEPTABLE_FONTS,
                    value=ACCEPTABLE_FONTS[0] if ACCEPTABLE_FONTS else "Arial",
                    label="Font"
                )
                font_size = gr.Slider(minimum=10, maximum=60, value=24, step=1, label="Font Size")
            with gr.Row():
                primary_color = gr.ColorPicker(value="#FFFFFF", label="Text Color")
                alignment = gr.Dropdown(
                    choices=["Bottom Center", "Bottom Left", "Bottom Right"],
                    value="Bottom Center",
                    label="Alignment"
                )

            process_btn = gr.Button("Generate Captioned Video", variant="primary")

        with gr.Column(scale=1):
            gr.Markdown("**Output**")
            video_output = gr.Video(label="Captioned Video")
            generated_captions_output = gr.Textbox(
                label="Generated Captions (ASS format if auto-generated)",
                lines=10,
                interactive=False
            )

    # Link checkbox to captions input interactivity
    auto_caption.change(
        fn=toggle_captions_input,
        inputs=[auto_caption],
        outputs=[captions_input]
    )

    # Define the main processing function call for the button
    process_btn.click(
        fn=process_video_with_captions,
        inputs=[
            video_input,
            captions_input,
            caption_type,
            font_name,
            font_size,
            primary_color,
            alignment,
            auto_caption
        ],
        outputs=[video_output, generated_captions_output],
        # api_name="generate_captions"
    )

# Launch the app
if __name__ == "__main__":
    app.launch(debug=True, share=False)  # Enable debug for local testing
```
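The `style_options` dict built in `process_video_with_captions` feeds `generate_ass_from_transcript`, which is also defined earlier in app.py. Two fiddly details any such generator must handle are worth sketching (illustrative helpers under assumed names, not the app's actual code): ASS stores colors as `&HAABBGGRR&` — alpha, blue, green, red, the reverse byte order of the ColorPicker's `#RRGGBB` — and ASS timestamps use `H:MM:SS.cc` with centisecond precision:

```python
def hex_to_ass_color(hex_rgb: str) -> str:
    """Convert a web '#RRGGBB' color to ASS '&HAABBGGRR&' (AA=00, fully opaque)."""
    digits = hex_rgb.lstrip("#")
    r, g, b = digits[0:2], digits[2:4], digits[4:6]
    return f"&H00{b}{g}{r}&".upper()

def ass_timestamp(seconds: float) -> str:
    """Format a float number of seconds as ASS 'H:MM:SS.cc' (centiseconds)."""
    cs = int(round(seconds * 100))
    h, rem = divmod(cs, 360000)
    m, rem = divmod(rem, 6000)
    s, cs = divmod(rem, 100)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

print(hex_to_ass_color("#FF0000"))  # -> &H000000FF&
print(ass_timestamp(3661.25))       # -> 1:01:01.25
```

Getting the BGR byte order wrong is a common bug here: passing `#FF0000` through unconverted would render blue text instead of red.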
requirements.txt (new file, 9 lines):

```text
gradio>=3.50.2
ffmpeg-python>=0.2.0
opencv-python-headless>=4.8.0
numpy>=1.22.0
openai-whisper>=20231117
tqdm>=4.66.0
torch>=2.0.0
transformers>=4.35.0
pathlib>=1.0.1
```
setup.sh (new file, 24 lines):

```bash
#!/bin/bash

# Install FFmpeg if not already installed
if ! command -v ffmpeg &> /dev/null
then
    echo "FFmpeg not found, installing..."
    apt-get update && apt-get install -y ffmpeg
else
    echo "FFmpeg is already installed"
fi

# Install FFprobe if not already installed (should come with FFmpeg but checking to be safe)
if ! command -v ffprobe &> /dev/null
then
    echo "FFprobe not found, installing..."
    apt-get update && apt-get install -y ffmpeg
else
    echo "FFprobe is already installed"
fi

# Make sure the script has appropriate permissions in case it needs execution
chmod -R 755 .

echo "Setup complete!"
```