Spaces:
Build error
Build error
Upload 2 files
Browse files- app.py +571 -0
- requirements.txt +11 -0
app.py
ADDED
|
@@ -0,0 +1,571 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr
import subprocess
import os
import tempfile
import shutil
from pathlib import Path
import json
import datetime
import csv
from pydub import AudioSegment
import numpy as np
import torch
import gc
from dotenv import load_dotenv

# NOTE(review): shutil, Path, csv and numpy appear unused in this module — confirm before removing.

# Load environment variables
load_dotenv()

# Import NeMo for transcription (you'll need to install: pip install nemo_toolkit[asr])
# NEMO_AVAILABLE gates every auto-transcription code path below.
try:
    from nemo.collections.asr.models import ASRModel
    NEMO_AVAILABLE = True
except ImportError:
    NEMO_AVAILABLE = False
    print("Warning: NeMo not available. Auto-transcription will be disabled.")
| 27 |
+
class AutomatedSubtitleBurner:
    """Burn SRT subtitles into videos with FFmpeg, optionally auto-generating
    the SRT with an NVIDIA NeMo ASR model (when installed and configured)."""

    def __init__(self):
        # Scratch directory for extracted audio, SRT files and output videos.
        # NOTE(review): never cleaned up explicitly — consider shutil.rmtree on shutdown.
        self.temp_dir = tempfile.mkdtemp()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None  # ASRModel instance, or None when unavailable

        # Load transcription model if available; MODEL_NAME comes from the environment.
        if NEMO_AVAILABLE:
            try:
                MODEL_NAME = os.getenv('MODEL_NAME')
                if MODEL_NAME:
                    self.model = ASRModel.from_pretrained(model_name=MODEL_NAME)
                    self.model.eval()
            except Exception as e:
                # BUGFIX: the failure was silently swallowed (bound `e` was unused),
                # making a bad MODEL_NAME indistinguishable from "not configured".
                # Log it so the condition is diagnosable; still degrade gracefully.
                print(f"Warning: failed to load ASR model: {e}")
                self.model = None
| 43 |
+
def extract_audio_from_video(self, video_path):
    """Extract the audio track of *video_path* as a mono 16 kHz WAV.

    Returns the path of the extracted file, or None when extraction fails.
    """
    out_wav = os.path.join(self.temp_dir, "extracted_audio.wav")
    ffmpeg_args = [
        'ffmpeg', '-y', '-i', video_path,
        '-vn',                   # drop the video stream
        '-acodec', 'pcm_s16le',  # 16-bit PCM audio codec
        '-ar', '16000',          # 16 kHz sample rate (what the ASR model expects)
        '-ac', '1',              # down-mix to mono
        out_wav,
    ]
    try:
        subprocess.run(ffmpeg_args, capture_output=True, check=True)
    except Exception as e:
        print(f"Error extracting audio: {e}")
        return None
    return out_wav
| 65 |
+
def format_srt_time(self, seconds: float) -> str:
    """Converts seconds to SRT time format HH:MM:SS,mmm.

    Negative inputs are clamped to zero before formatting.
    """
    clamped = max(0.0, seconds)
    delta = datetime.timedelta(seconds=clamped)
    whole_seconds = int(delta.total_seconds())
    hours, rem = divmod(whole_seconds, 3600)
    minutes, secs = divmod(rem, 60)
    millis = delta.microseconds // 1000
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
| 79 |
+
def generate_srt_content(self, word_timestamps: list) -> str:
    """Generates SRT formatted string from word timestamps.

    Each entry must be a mapping with 'start', 'end' (seconds) and 'word'.
    One SRT cue is emitted per word, numbered from 1.
    """
    parts = []
    index = 1
    for ts in word_timestamps:
        parts.append(str(index))
        parts.append(f"{self.format_srt_time(ts['start'])} --> {self.format_srt_time(ts['end'])}")
        parts.append(ts['word'])
        parts.append("")  # blank separator line required between cues
        index += 1
    return "\n".join(parts)
| 92 |
+
def transcribe_audio(self, audio_path, progress_callback=None):
    """Transcribe audio to get word-level timestamps.

    Returns (srt_content, message); srt_content is None on failure.
    progress_callback, if given, is called as progress_callback(fraction, text).
    """
    if not self.model or not NEMO_AVAILABLE:
        return None, "Transcription model not available"

    # BUGFIX: this flag was previously assigned *inside* the try block, after
    # several statements that can raise (pydub load, preprocessing). An early
    # exception then made the finally clause raise NameError, masking the real
    # error. Initialize it before entering the try so cleanup is always safe.
    long_audio_settings_applied = False

    try:
        if progress_callback:
            progress_callback(0.1, "Loading audio...")

        # Load and preprocess audio
        audio = AudioSegment.from_file(audio_path)
        duration_sec = audio.duration_seconds

        if progress_callback:
            progress_callback(0.2, "Preprocessing audio...")

        # Ensure audio is in the 16 kHz mono format the model expects
        if audio.frame_rate != 16000:
            audio = audio.set_frame_rate(16000)
        if audio.channels != 1:
            audio = audio.set_channels(1)

        # Save preprocessed audio
        processed_path = os.path.join(self.temp_dir, "processed_audio.wav")
        audio.export(processed_path, format="wav")

        if progress_callback:
            progress_callback(0.3, "Starting transcription...")

        # Configure model for long audio if needed (local attention keeps
        # memory bounded on clips longer than 8 minutes).
        if duration_sec > 480:  # 8 minutes
            try:
                print("Applying long audio settings for transcription...")
                self.model.change_attention_model("rel_pos_local_attn", [256, 256])
                self.model.change_subsampling_conv_chunking_factor(1)
                long_audio_settings_applied = True
            except Exception as e:
                print(f"Warning: Could not apply long audio settings: {e}")

        # Move model to appropriate device and precision
        self.model.to(self.device)
        self.model.to(torch.bfloat16)

        if progress_callback:
            progress_callback(0.5, "Transcribing (this may take a while)...")

        # Transcribe with timestamps
        output = self.model.transcribe([processed_path], timestamps=True)

        if progress_callback:
            progress_callback(0.8, "Processing transcription results...")

        if not output or not output[0] or not hasattr(output[0], 'timestamp'):
            return None, "Transcription failed - no output generated"

        # Get word-level timestamps
        word_timestamps = output[0].timestamp.get('word', [])

        if not word_timestamps:
            return None, "No word-level timestamps generated"

        # Generate SRT content
        srt_content = self.generate_srt_content(word_timestamps)

        if progress_callback:
            progress_callback(1.0, "Transcription complete!")

        return srt_content, "Transcription successful!"

    except torch.cuda.OutOfMemoryError:
        return None, "CUDA out of memory. Please try a shorter video or use CPU."
    except Exception as e:
        return None, f"Transcription error: {str(e)}"
    finally:
        # Cleanup model settings and memory; runs on success and failure alike.
        try:
            if long_audio_settings_applied and self.model:
                self.model.change_attention_model("rel_pos")
                self.model.change_subsampling_conv_chunking_factor(-1)

            if self.model and self.device == 'cuda':
                self.model.cpu()
            gc.collect()
            if self.device == 'cuda':
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"Warning: Error during cleanup: {e}")
| 181 |
+
def auto_generate_srt(self, video_file, progress=gr.Progress()):
    """Automatically generate SRT from video.

    Extracts the audio track, runs the ASR model over it and returns a pair
    (srt_text, status_message); srt_text is "" on any failure.
    """
    if not video_file:
        return "", "Please provide a video file"

    if not self.model or not NEMO_AVAILABLE:
        return "", "Transcription model not available. Please install NeMo toolkit."

    try:
        progress(0.05, desc="Extracting audio from video...")

        wav_path = self.extract_audio_from_video(video_file)
        if not wav_path:
            return "", "Failed to extract audio from video"

        progress(0.1, desc="Audio extracted, starting transcription...")

        # Map the transcriber's 0..1 progress into the 0.1..0.9 band of ours.
        def forward_progress(fraction, desc):
            progress(0.1 + (fraction * 0.8), desc=desc)

        srt_text, status = self.transcribe_audio(wav_path, forward_progress)

        progress(0.95, desc="Finalizing...")

        if not srt_text:
            return "", status
        progress(1.0, desc="SRT generation complete!")
        return srt_text, status

    except Exception as e:
        return "", f"Error generating SRT: {str(e)}"
| 216 |
+
def create_styled_srt(self, srt_content, font_size=24, font_color="white",
                      outline_color="black", outline_width=1):
    """Create a styled SRT file with ASS-style formatting.

    Wraps every cue's text in a <font> tag carrying size and colour.
    outline_color/outline_width are accepted but not embedded in the tag
    (SRT has no portable outline markup).
    """
    source_lines = srt_content.strip().split('\n')
    result = []
    total = len(source_lines)
    idx = 0

    while idx < total:
        if not source_lines[idx].strip().isdigit():
            # Not the start of a cue — skip the line.
            idx += 1
            continue

        # Cue index line
        result.append(source_lines[idx])
        idx += 1

        # Timestamp line
        if idx < total:
            result.append(source_lines[idx])
            idx += 1

        # Gather this cue's text lines
        text_block = []
        while idx < total and source_lines[idx].strip() != "":
            text_block.append(source_lines[idx])
            idx += 1

        # Apply styling to the merged text
        if text_block:
            merged = ' '.join(text_block)
            result.append(f"<font size='{font_size}' color='{font_color}'>{merged}</font>")

        result.append("")  # blank separator between cues
    return '\n'.join(result)
| 251 |
+
def get_video_info(self, video_path):
    """Get video information using ffprobe.

    Returns (width, height, duration_seconds); falls back to
    (1920, 1080, 0) when probing fails or no video stream exists.
    """
    probe_cmd = [
        'ffprobe', '-v', 'quiet', '-print_format', 'json',
        '-show_format', '-show_streams', video_path
    ]
    try:
        completed = subprocess.run(probe_cmd, capture_output=True, text=True, check=True)
        metadata = json.loads(completed.stdout)

        # First video stream wins
        video_stream = next(
            (s for s in metadata['streams'] if s['codec_type'] == 'video'),
            None,
        )
        if video_stream is not None:
            return (int(video_stream['width']),
                    int(video_stream['height']),
                    float(video_stream.get('duration', 0)))
    except Exception as e:
        print(f"Error getting video info: {e}")

    return 1920, 1080, 0  # Default values
| 279 |
+
def burn_subtitles(self, video_file, srt_content, font_size=24, font_color="white",
                   position="bottom_center", outline_color="black", outline_width=1,
                   progress=gr.Progress()):
    """Burn subtitles into video using FFmpeg.

    Writes the (styled) SRT to a temp file, re-encodes the video with the
    `subtitles` filter, and returns (output_path, status_message) —
    (None, error_message) on failure.

    NOTE(review): `position` is accepted but unused — Alignment is hard-coded
    to 2 (bottom centre) in the force_style string; confirm intended.
    """

    if not video_file or not srt_content.strip():
        return None, "Please provide both video file and SRT content"

    try:
        progress(0.1, desc="Preparing files...")

        # Create temporary SRT file with inline <font> styling applied
        srt_path = os.path.join(self.temp_dir, "subtitles.srt")
        styled_srt = self.create_styled_srt(srt_content, font_size, font_color,
                                            outline_color, outline_width)

        with open(srt_path, 'w', encoding='utf-8') as f:
            f.write(styled_srt)

        progress(0.2, desc="Getting video information...")

        # Duration is needed below to turn FFmpeg's time= lines into a fraction
        width, height, duration = self.get_video_info(video_file)

        progress(0.3, desc="Starting subtitle burning...")

        # Output file
        output_filename = f"output_with_subtitles_{font_size}px.mp4"
        output_path = os.path.join(self.temp_dir, output_filename)

        # Build FFmpeg command with subtitle filter.
        # NOTE(review): srt_path is single-quoted but not escaped — a quote or
        # special char in the temp path would break the filter string; confirm.
        cmd = [
            'ffmpeg', '-y',  # Overwrite output files
            '-i', video_file,
            '-vf', f"""subtitles='{srt_path}':force_style='FontSize={font_size},PrimaryColour=&H{self.color_to_bgr_hex(font_color)},OutlineColour=&H{self.color_to_bgr_hex(outline_color)},Outline={outline_width},Alignment=2'""",
            '-c:a', 'copy',      # Copy audio without re-encoding
            '-c:v', 'libx264',   # Video codec
            '-preset', 'medium', # Encoding preset
            '-crf', '23',        # Quality setting
            output_path
        ]

        progress(0.4, desc="Processing video (this may take a while)...")

        # Run FFmpeg; stderr is read line-by-line for progress reporting.
        # NOTE(review): stdout is piped but never read — harmless here since
        # FFmpeg logs to stderr, but a chatty stdout could fill the pipe.
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True
        )

        # Monitor progress: parse "time=HH:MM:SS.xx" out of FFmpeg's stderr
        # and map it onto the 0.4..0.9 progress band.
        while True:
            output = process.stderr.readline()
            if output == '' and process.poll() is not None:
                break
            if output and 'time=' in output:
                # Try to extract time for progress
                try:
                    time_str = output.split('time=')[1].split()[0]
                    time_parts = time_str.split(':')
                    current_seconds = (float(time_parts[0]) * 3600 +
                                       float(time_parts[1]) * 60 +
                                       float(time_parts[2]))
                    if duration > 0:
                        prog = 0.4 + (current_seconds / duration) * 0.5
                        progress(min(prog, 0.9), desc=f"Processing: {time_str}")
                except:
                    # Malformed time token — skip this progress update.
                    pass

        progress(0.95, desc="Finalizing...")

        return_code = process.poll()
        if return_code == 0:
            progress(1.0, desc="Complete!")
            return output_path, "Video processed successfully!"
        else:
            # NOTE(review): stderr was already drained by the loop above, so
            # this read likely returns "" and the message loses the real error.
            error_output = process.stderr.read()
            return None, f"FFmpeg error: {error_output}"

    except Exception as e:
        return None, f"Error processing video: {str(e)}"
| 363 |
+
def color_to_bgr_hex(self, color):
    """Convert a colour name to a BGR hex string for FFmpeg/libass force_style.

    ASS colours are &HBBGGRR — byte-reversed relative to RGB, which is why
    'red' maps to '0000FF'. Lookup is case-insensitive; unknown names fall
    back to white.
    """
    color_map = {
        'white': 'FFFFFF',
        'black': '000000',
        'red': '0000FF',
        'green': '00FF00',
        'blue': 'FF0000',
        'yellow': '00FFFF',
        'cyan': 'FFFF00',
        'magenta': 'FF00FF',
        'orange': '0080FF',
        'purple': '800080',
        # BUGFIX: was 'FFB6C1' (the RGB value of light pink); in this BGR table
        # that renders lavender. BGR of pink (RGB FFC0CB) is CBC0FF.
        'pink': 'CBC0FF',
        'gray': '808080',
        'grey': '808080'
    }
    return color_map.get(color.lower(), 'FFFFFF')
| 382 |
+
def preview_subtitles(self, srt_content, font_size, font_color, position):
    """Generate a preview of how subtitles will look.

    Shows at most the first three cues, each as a header, quoted text and
    a one-line style summary.
    """
    if not srt_content.strip():
        return "No SRT content provided"

    raw = srt_content.strip().split('\n')
    out = []
    shown = 0
    pos = 0
    total = len(raw)

    while pos < total and shown < 3:
        header = raw[pos].strip()
        if not header.isdigit():
            pos += 1
            continue

        # Cue number consumed; next line should be the timestamp.
        pos += 1
        if pos < total:
            stamp = raw[pos].strip()
            pos += 1

            # Collect the cue's text lines up to the blank separator.
            collected = []
            while pos < total and raw[pos].strip() != "":
                collected.append(raw[pos].strip())
                pos += 1

            if collected:
                out.append(f"#{header} [{stamp}]")
                out.append(f"Text: \"{' '.join(collected)}\"")
                out.append(f"Style: {font_size}px {font_color} at {position}")
                out.append("---")
                shown += 1

    return '\n'.join(out) if out else "No valid subtitles found"
| 420 |
+
# Initialize the subtitle burner
# Module-level singleton; shared by the Gradio event handlers wired up in
# create_interface() and by the startup checks in the __main__ block.
burner = AutomatedSubtitleBurner()
| 422 |
+
|
| 423 |
+
# Create Gradio interface
# NOTE(review): several label strings below contain mojibake (mis-decoded
# emoji, e.g. "π¬"); they are preserved as-is — restore the originals from VCS.
def create_interface():
    """Build and return the Gradio Blocks UI wired to the module-level burner."""
    with gr.Blocks(title="Automated AI Subtitle Video Captions", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# π¬ Automated SRT Subtitle Video Burner")
        gr.Markdown("Upload a video and either auto-generate subtitles or paste your own SRT content!")

        if not NEMO_AVAILABLE:
            gr.Markdown("β οΈ **Note**: Auto-transcription is disabled. Install NeMo toolkit for automatic SRT generation.")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### π Input")
                video_input = gr.File(
                    label="Upload Video File",
                    file_types=[".mp4", ".avi", ".mov", ".mkv", ".wmv", ".flv"],
                    type="filepath"
                )

                with gr.Row():
                    # Button is rendered but disabled when no ASR model is loaded.
                    if NEMO_AVAILABLE and burner.model:
                        auto_generate_btn = gr.Button("π€ Auto-Generate SRT", variant="secondary")
                    else:
                        auto_generate_btn = gr.Button("π€ Auto-Generate SRT (Disabled)", variant="secondary", interactive=False)

                srt_input = gr.Textbox(
                    label="SRT Content (Auto-generated or Manual)",
                    placeholder="SRT content will appear here after auto-generation, or paste your own...",
                    lines=12,
                    max_lines=20
                )

            with gr.Column(scale=1):
                gr.Markdown("### π¨ Subtitle Styling")

                font_size = gr.Slider(
                    minimum=8,
                    maximum=72,
                    value=24,
                    step=1,
                    label="Font Size (px)"
                )

                font_color = gr.Dropdown(
                    choices=["white", "black", "red", "green", "blue", "yellow",
                             "cyan", "magenta", "orange", "purple", "pink", "gray"],
                    value="white",
                    label="Font Color"
                )

                # NOTE(review): position is collected here but burn_subtitles
                # hard-codes Alignment=2; the dropdown currently has no effect.
                position = gr.Dropdown(
                    choices=["top_left", "top_center", "top_right",
                             "center_left", "center", "center_right",
                             "bottom_left", "bottom_center", "bottom_right"],
                    value="bottom_center",
                    label="Position"
                )

                outline_color = gr.Dropdown(
                    choices=["black", "white", "red", "green", "blue", "yellow",
                             "cyan", "magenta", "orange", "purple", "pink", "gray"],
                    value="black",
                    label="Outline Color"
                )

                outline_width = gr.Slider(
                    minimum=0,
                    maximum=5,
                    value=1,
                    step=1,
                    label="Outline Width"
                )

        with gr.Row():
            with gr.Column():
                gr.Markdown("### ποΈ Preview")
                preview_output = gr.Textbox(
                    label="Subtitle Preview",
                    lines=8,
                    interactive=False
                )

                preview_btn = gr.Button("π Preview Subtitles", variant="secondary")

        with gr.Row():
            process_btn = gr.Button("π₯ Burn Subtitles to Video", variant="primary", size="lg")

        with gr.Row():
            with gr.Column():
                output_video = gr.File(label="Download Processed Video")
                status_output = gr.Textbox(label="Status", interactive=False)

        # Event handlers
        if NEMO_AVAILABLE and burner.model:
            auto_generate_btn.click(
                fn=burner.auto_generate_srt,
                inputs=[video_input],
                outputs=[srt_input, status_output],
                show_progress=True
            )

        preview_btn.click(
            fn=burner.preview_subtitles,
            inputs=[srt_input, font_size, font_color, position],
            outputs=preview_output
        )

        process_btn.click(
            fn=burner.burn_subtitles,
            inputs=[video_input, srt_input, font_size, font_color, position,
                    outline_color, outline_width],
            outputs=[output_video, status_output],
            show_progress=True
        )

        # Auto-preview when inputs change
        for input_component in [srt_input, font_size, font_color, position]:
            input_component.change(
                fn=burner.preview_subtitles,
                inputs=[srt_input, font_size, font_color, position],
                outputs=preview_output
            )

    return demo
| 547 |
+
if __name__ == "__main__":
    # Fail fast if FFmpeg is not on PATH — every code path below depends on it.
    # NOTE(review): the status prints originally carried emoji that arrived
    # mojibake'd ("β…"); rejoined on one line here — restore originals from VCS.
    try:
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
        print("β FFmpeg found!")
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("β FFmpeg not found! Please install FFmpeg and make sure it's in your PATH.")
        print("Download from: https://ffmpeg.org/download.html")
        # BUGFIX: use SystemExit instead of the site-injected exit() builtin,
        # which is not guaranteed to exist (e.g. under `python -S` or frozen apps).
        raise SystemExit(1)

    # Check transcription capability
    if NEMO_AVAILABLE and burner.model:
        print("β Auto-transcription enabled!")
    else:
        print("β οΈ Auto-transcription disabled. Install NeMo toolkit for automatic SRT generation:")
        print("pip install nemo_toolkit[asr]")

    # Launch the interface
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (required for Spaces/containers)
        server_port=7860,
        share=True,
        debug=True
    )
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Cython
git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]
numpy<2.0
gradio
spaces
ffmpeg
pydub
ffmpeg-python
python-dotenv==1.0.0
torch
torchaudio