Real-Time-Video-Captioning

Sleeping

App Files Files Community

Varsha Dewangan committed on Jun 21, 2025

Commit

db2860e

1 Parent(s): 7b93c80

application file added

Browse files

Files changed (8) hide show

.gitignore +0 -0
README.md +1 -1
app.py +701 -0
dockerfile +30 -0
requirements.txt +15 -0
static/app.js +447 -0
static/style.css +1103 -0
templates/index.html +234 -0

.gitignore ADDED Viewed

Binary file (56 Bytes). View file

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: Real Time Video Captioning
-emoji: 🌖
 colorFrom: purple
 colorTo: yellow
 sdk: docker

 ---
 title: Real Time Video Captioning
+emoji: 🎥
 colorFrom: purple
 colorTo: yellow
 sdk: docker

app.py ADDED Viewed

	@@ -0,0 +1,701 @@

+import base64
+import numpy as np
+import torch
+from flask import Flask, render_template, request
+from flask_socketio import SocketIO, emit
+from PIL import Image, ImageEnhance, ImageFilter
+from io import BytesIO
+import logging
+import threading
+import time
+from transformers import BlipProcessor, BlipForConditionalGeneration
+from collections import deque
+import cv2
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import hashlib
+import json
+from datetime import datetime, timedelta
+import queue
# ---- 1. ENHANCED SETUP ----
import os
import secrets

# Suppress excessive logging from libraries
logging.getLogger('engineio').setLevel(logging.ERROR)
logging.getLogger('socketio').setLevel(logging.ERROR)

# --- Enhanced Configuration ---
FRAME_SKIP = 3  # Baseline "process every Nth frame"; handle_image adapts it to load
IMAGE_SIZE = 224  # Side length (pixels) frames are resized to before captioning
BUFFER_SIZE = 5  # maxlen of each client's caption_buffers deque
MIN_CONFIDENCE_DIFF = 0.03  # NOTE(review): not referenced by the code shown in this file
MAX_WORKERS = 6  # Number of threads in the caption worker pool
CACHE_SIZE = 500  # Capacity of the LRU caption cache
BATCH_SIZE = 4  # Only reported by /status in the code shown

# Advanced performance settings
ADAPTIVE_QUALITY = True  # Passed to smart_preprocess_image as enhance_quality
MIN_PROCESSING_INTERVAL = 0.1  # Minimum seconds between processed frames per client
SCENE_CHANGE_THRESHOLD = 0.15  # Fraction of differing hash bits that counts as a new scene
CAPTION_HISTORY_SIZE = 10  # maxlen of each client's caption_history deque

# --- Flask & SocketIO App Initialization ---
app = Flask(__name__)
# Never ship a hard-coded secret: take it from the environment and fall back
# to a random per-process key (sessions then do not survive a restart).
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY') or secrets.token_hex(32)
# NOTE(review): cors_allowed_origins="*" accepts connections from any origin;
# restrict it if the app is not meant to be embedded by third-party pages.
socketio = SocketIO(app, async_mode='threading', logger=False, engineio_logger=False,
                    cors_allowed_origins="*", ping_timeout=60, ping_interval=25)

# --- Enhanced AI Model Setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Worker pool that runs process_frame_advanced off the Socket.IO handler thread.
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS, thread_name_prefix="caption_worker")
# NOTE(review): created but never read or written by the code shown.
priority_queue = queue.PriorityQueue()
# Load the BLIP captioning model once, at import time.
try:
    print("Loading BLIP model with optimizations...")
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    model = model.to(device)
    model.eval()
    if device.type == 'cuda':
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False
        # The model is deliberately NOT passed through torch.jit.script: the
        # rest of this file calls model.generate(), a plain Python method that
        # a scripted module does not provide, so scripting would make startup
        # fail on every CUDA host.
        # `autocast` stays importable here because generate_caption_advanced
        # refers to it by name when USE_AMP is set.
        from torch.cuda.amp import autocast
        USE_AMP = True
        print("CUDA optimizations enabled")
    else:
        USE_AMP = False
    # Warm up the model so the first real frame does not pay one-time setup cost.
    dummy_image = Image.new('RGB', (IMAGE_SIZE, IMAGE_SIZE), color='black')
    dummy_inputs = processor(dummy_image, return_tensors="pt").to(device)
    with torch.no_grad():
        _ = model.generate(**dummy_inputs, max_length=10)
    print("Model warmed up successfully!")
except Exception as e:
    print(f"Error loading BLIP model: {e}")
    # Exit with a non-zero status so a supervisor (e.g. Docker) sees the failure;
    # the bare exit() helper reports success and is not guaranteed to exist.
    raise SystemExit(1)
+# --- Advanced Caching System ---
from collections import OrderedDict


class LRUCache:
    """Thread-safe least-recently-used cache with a fixed capacity.

    Entries live in ``self.cache`` (an ``OrderedDict`` ordered from least to
    most recently used). Every operation takes ``self.lock`` and runs in O(1).
    """

    def __init__(self, max_size):
        """Create an empty cache holding at most ``max_size`` entries.

        A ``max_size`` of zero or less yields a cache that stores nothing.
        """
        self.max_size = max_size
        self.cache = OrderedDict()
        self.lock = threading.Lock()

    def get(self, key):
        """Return the value stored for ``key`` (marking it most recently used), or ``None``."""
        with self.lock:
            try:
                value = self.cache[key]
            except KeyError:
                return None
            self.cache.move_to_end(key)
            return value

    def put(self, key, value):
        """Insert or overwrite ``key``, evicting least recently used entries when full."""
        with self.lock:
            if self.max_size <= 0:
                # Nothing can be stored; avoids evicting from an empty cache.
                return
            self.cache[key] = value
            self.cache.move_to_end(key)
            while len(self.cache) > self.max_size:
                self.cache.popitem(last=False)

    def clear(self):
        """Remove every entry."""
        with self.lock:
            self.cache.clear()
# --- Per-client state ---
# Each dict below is keyed by the Socket.IO session id. handle_connect fills
# in an entry per client and cleanup_client removes it again.
frame_counters = dict()       # sid -> number of frames received
processing_locks = dict()     # sid -> threading.Lock held while a frame is in flight
caption_buffers = dict()      # sid -> deque(maxlen=BUFFER_SIZE)
last_captions = dict()        # sid -> most recently emitted raw caption
processing_times = dict()     # sid -> deque of recent per-frame durations (seconds)
caption_history = dict()      # sid -> deque of {'caption', 'timestamp', 'scene_changed'}
last_processed_time = dict()  # sid -> time.time() of the last accepted frame
scene_features = dict()       # sid -> hash of the previous frame, for scene change detection

# --- Shared state ---
caption_cache = LRUCache(CACHE_SIZE)  # image hash -> caption
batch_queue = dict()                  # sid -> list (initialised empty per client)
+# --- Smart Performance Monitor ---
class AdvancedPerformanceMonitor:
    """Thread-safe aggregate counters for frame processing.

    ``log_frame`` records one incoming frame; ``get_stats`` summarises what
    has been recorded so far. All access to ``self.metrics`` is guarded by
    ``self.lock``.
    """

    def __init__(self):
        self.metrics = {
            'total_frames': 0,
            'processed_frames': 0,
            'cache_hits': 0,
            'cache_misses': 0,
            'batch_processed': 0,
            'scene_changes': 0,
            'processing_times': deque(maxlen=100),
            'start_time': time.time()
        }
        self.lock = threading.Lock()

    def log_frame(self, processing_time=None, cache_hit=False, batch_size=1, scene_change=False):
        """Record one frame.

        Args:
            processing_time: Seconds spent on the frame, or ``None`` when the
                frame was skipped without being processed.
            cache_hit: Whether the caption came from the cache. Only counted
                for processed frames, so skipped frames do not look like
                cache misses.
            batch_size: Frames handled together; values above 1 add to the
                batch counter.
            scene_change: Whether a scene change was detected on this frame.
        """
        with self.lock:
            self.metrics['total_frames'] += 1
            if processing_time is not None:
                self.metrics['processed_frames'] += 1
                self.metrics['processing_times'].append(processing_time)
                if batch_size > 1:
                    self.metrics['batch_processed'] += batch_size
                if cache_hit:
                    self.metrics['cache_hits'] += 1
                else:
                    self.metrics['cache_misses'] += 1
            if scene_change:
                self.metrics['scene_changes'] += 1

    def get_stats(self):
        """Return a summary dict; the same keys are present before and after any frames.

        Keys: ``avg_time`` (mean of the recent processing times, seconds),
        ``cache_hit_rate`` (hits / cache lookups), ``processing_fps``
        (``1 / avg_time``), ``efficiency`` (processed / total frames),
        ``total_frames``, ``scene_changes`` and ``batch_efficiency``
        (batched frames / processed frames). Ratios are 0 when their
        denominator is 0.
        """
        with self.lock:
            m = self.metrics
            times = m['processing_times']
            avg_processing_time = float(np.mean(times)) if times else 0.0
            processed = m['processed_frames']
            total = m['total_frames']
            lookups = m['cache_hits'] + m['cache_misses']
            return {
                "avg_time": avg_processing_time,
                "cache_hit_rate": m['cache_hits'] / lookups if lookups else 0.0,
                "processing_fps": 1.0 / avg_processing_time if avg_processing_time > 0 else 0.0,
                "efficiency": processed / total if total else 0.0,
                "total_frames": total,
                "scene_changes": m['scene_changes'],
                "batch_efficiency": m['batch_processed'] / processed if processed else 0.0,
            }


perf_monitor = AdvancedPerformanceMonitor()
+# --- Smart Image Preprocessing ---
def smart_preprocess_image(image, enhance_quality=True):
    """Return an RGB, centre-cropped, IMAGE_SIZE x IMAGE_SIZE copy of ``image``.

    When ``enhance_quality`` is true the image is first sharpened (x1.2),
    then given slightly more contrast (x1.1) and colour saturation (x1.05).
    Non-square input is cropped to its central square before resizing, so
    content outside that square is discarded.
    """
    rgb = image if image.mode == 'RGB' else image.convert('RGB')

    if enhance_quality:
        # Applied in this order: sharpness, contrast, colour.
        adjustments = (
            (ImageEnhance.Sharpness, 1.2),
            (ImageEnhance.Contrast, 1.1),
            (ImageEnhance.Color, 1.05),
        )
        for enhancer_cls, factor in adjustments:
            rgb = enhancer_cls(rgb).enhance(factor)

    width, height = rgb.size
    if width != height:
        side = min(width, height)
        x0 = (width - side) // 2
        y0 = (height - side) // 2
        rgb = rgb.crop((x0, y0, x0 + side, y0 + side))

    return rgb.resize((IMAGE_SIZE, IMAGE_SIZE), Image.LANCZOS)
def advanced_hash_image(image):
    """Return a string fingerprint of ``image``.

    The result is 256 characters of '0'/'1' (one per pixel of a 16x16
    greyscale thumbnail, '1' where the pixel is brighter than the thumbnail
    mean) followed by the first 8 hex digits of the MD5 of the thumbnail's
    Canny edge map.
    """
    thumbnail = image.resize((16, 16), Image.LANCZOS).convert('L')

    values = list(thumbnail.getdata())
    mean_value = sum(values) / len(values)
    brightness_bits = ''.join('1' if value > mean_value else '0' for value in values)

    edge_map = cv2.Canny(np.array(thumbnail), 50, 150)
    edge_digest = hashlib.md5(edge_map.tobytes()).hexdigest()[:8]

    return brightness_bits + edge_digest
def detect_scene_change(sid, current_features):
    """Report whether ``current_features`` differs enough from the client's previous frame.

    The stored features for ``sid`` are always replaced by ``current_features``.
    Returns True when there is no stored entry, when the two feature strings
    differ in length, or when the share of matching characters among the
    first 256 falls below ``1 - SCENE_CHANGE_THRESHOLD``.
    """
    if sid not in scene_features:
        changed = True
    else:
        previous = scene_features[sid]
        if len(current_features) != len(previous):
            changed = True
        else:
            # Hamming distance over the 256 leading characters.
            mismatches = sum(
                a != b for a, b in zip(current_features[:256], previous[:256])
            )
            similarity = 1 - (mismatches / 256)
            changed = similarity < (1 - SCENE_CHANGE_THRESHOLD)

    scene_features[sid] = current_features
    return changed
+# ---- 2. ENHANCED WEBSOCKET HANDLERS ----
@socketio.on('connect')
def handle_connect():
    """Create the per-session state for a newly connected client and greet it."""
    sid = request.sid
    print(f"Client connected: {sid}")

    # One fresh value per state dict, all keyed by this session id.
    fresh_state = (
        (frame_counters, 0),
        (processing_locks, threading.Lock()),
        (caption_buffers, deque(maxlen=BUFFER_SIZE)),
        (last_captions, ""),
        (processing_times, deque(maxlen=20)),
        (caption_history, deque(maxlen=CAPTION_HISTORY_SIZE)),
        (last_processed_time, 0),
        (scene_features, ""),
        (batch_queue, []),
    )
    for store, initial_value in fresh_state:
        store[sid] = initial_value

    # Tell the client it is connected and which device runs the model.
    emit('status', {'connected': True, 'device': str(device)})
@socketio.on('disconnect')
def handle_disconnect():
    """Log the disconnect and drop everything stored for that session."""
    sid = request.sid
    print(f"Client disconnected: {sid}")
    cleanup_client(sid)
def cleanup_client(sid):
    """Remove ``sid`` from every per-client state dict; unknown ids are ignored."""
    per_client_stores = (
        frame_counters, processing_locks, caption_buffers,
        last_captions, processing_times, caption_history,
        last_processed_time, scene_features, batch_queue,
    )
    for store in per_client_stores:
        store.pop(sid, None)
@socketio.on('image')
def handle_image(data_image):
    """Decide whether an incoming frame is processed, and queue it if so.

    A frame is dropped (and counted as skipped) when it falls on a skipped
    position, arrives sooner than MIN_PROCESSING_INTERVAL after the last
    accepted frame, or while a previous frame for the same client is still
    being processed. Accepted frames are handed to the worker pool, which
    releases the per-client lock when it finishes.

    Args:
        data_image: The frame payload sent by the client, forwarded
            unchanged to process_frame_advanced.
    """
    sid = request.sid
    # Initialise state if this client's connect event was never seen.
    if sid not in frame_counters:
        handle_connect()
    frame_counters[sid] += 1
    current_time = time.time()

    # Adaptive frame skipping: skip more when recent frames were slow,
    # fewer when they were fast.
    skip_factor = FRAME_SKIP
    recent_times = processing_times.get(sid)
    if recent_times:
        avg_time = np.mean(recent_times)
        if avg_time > 0.5:
            skip_factor = FRAME_SKIP * 2
        elif avg_time < 0.1:
            skip_factor = max(1, FRAME_SKIP // 2)
    if frame_counters[sid] % skip_factor != 0:
        perf_monitor.log_frame()  # Count skipped frames
        return

    # Rate limiting
    if current_time - last_processed_time.get(sid, 0) < MIN_PROCESSING_INTERVAL:
        perf_monitor.log_frame()  # Dropped frames count as skipped too
        return

    # Only one frame per client may be in flight at a time.
    lock = processing_locks[sid]
    if not lock.acquire(blocking=False):
        perf_monitor.log_frame()
        return

    last_processed_time[sid] = current_time
    try:
        executor.submit(process_frame_advanced, sid, data_image, 1)
    except Exception:
        # The worker never started, so it will never release the lock;
        # release it here or this client would be blocked forever.
        lock.release()
        raise
def process_frame_advanced(sid, data_image, priority=1):
    """Caption one frame for a client and emit the result (runs on a worker thread).

    Decodes the frame, preprocesses and fingerprints it, reuses a cached
    caption when the fingerprint is known and the scene has not changed,
    otherwise generates a new caption. A 'caption' event is emitted to the
    client's room when should_update_caption_advanced says so. The
    per-client processing lock (acquired by handle_image) is released on
    exit.

    Args:
        sid: Socket.IO session id of the client that sent the frame.
        data_image: Base64 image data, either a data URL
            ("data:image/jpeg;base64,....") or the bare base64 payload.
        priority: Accepted for interface compatibility; not used.
    """
    start_time = time.time()
    try:
        # Accept both "data:...;base64,<payload>" and a bare payload.
        encoded = data_image.split(',', 1)[-1]
        img = Image.open(BytesIO(base64.b64decode(encoded)))
        img = smart_preprocess_image(img, enhance_quality=ADAPTIVE_QUALITY)
        img_hash = advanced_hash_image(img)

        if sid not in frame_counters:
            # Client disconnected while the frame was queued; do not
            # re-create state for a session that no longer exists.
            return

        scene_changed = detect_scene_change(sid, img_hash)

        cached_caption = caption_cache.get(img_hash)
        if cached_caption and not scene_changed:
            caption = cached_caption
            cache_hit = True
        else:
            caption = generate_caption_advanced(img)
            caption_cache.put(img_hash, caption)
            cache_hit = False

        history = caption_history.get(sid)
        if history is None:
            # Client disconnected during caption generation.
            return

        if should_update_caption_advanced(sid, caption, scene_changed):
            history.append({
                'caption': caption,
                'timestamp': time.time(),
                'scene_changed': scene_changed
            })
            last_captions[sid] = caption
            contextual_caption = add_context_to_caption(sid, caption)
            print(f"New caption for {sid}: {contextual_caption}")
            socketio.emit('caption', {
                'caption': contextual_caption,
                'raw_caption': caption,
                'timestamp': time.time(),
                # NOTE(review): fixed placeholder values, not a model score.
                'confidence': 0.95 if not cache_hit else 1.0,
                'scene_changed': scene_changed,
                'processing_time': time.time() - start_time
            }, room=sid)

        # Update performance metrics
        processing_time = time.time() - start_time
        client_times = processing_times.get(sid)
        if client_times is not None:
            client_times.append(processing_time)
        perf_monitor.log_frame(processing_time, cache_hit, scene_change=scene_changed)

        # Periodic performance logging (default 1 keeps a vanished client quiet)
        if frame_counters.get(sid, 1) % 100 == 0:
            stats = perf_monitor.get_stats()
            print(f"Client {sid}: Avg: {stats['avg_time']:.3f}s, Cache: {stats['cache_hit_rate']:.2f}, "
                  f"Efficiency: {stats['efficiency']:.2f}, Scene changes: {stats['scene_changes']}")
    except Exception as e:
        print(f"Error processing frame for {sid}: {e}")
        socketio.emit('caption', {
            'caption': f"Processing error: {str(e)[:50]}...",
            'timestamp': time.time(),
            'confidence': 0.0,
            'error': True
        }, room=sid)
    finally:
        # Single lookup so a concurrent disconnect cannot slip in between
        # a membership test and the indexing.
        lock = processing_locks.get(sid)
        if lock is not None:
            lock.release()
def should_update_caption_advanced(sid, new_caption, scene_changed):
    """Decide whether ``new_caption`` should replace the caption last sent to ``sid``.

    Returns True on a scene change, when nothing usable was sent before,
    when the recent history is stuck on one caption and the new one differs,
    when the set of high-priority action words changed, when more than one
    tracked object word changed, or when the weighted word overlap between
    old and new caption is below 0.75.
    """
    if scene_changed or sid not in last_captions:
        return True

    previous = last_captions[sid]
    if not previous:
        return True
    if "error" in previous.lower() or previous == "Processing...":
        return True

    # Break out of a run of identical captions as soon as something new appears.
    history = caption_history.get(sid)
    if history is not None and len(history) > 1:
        tail = [entry['caption'] for entry in list(history)[-3:]]
        if len(set(tail)) == 1 and new_caption not in tail:
            return True

    old_words = set(previous.lower().split())
    new_words = set(new_caption.lower().split())

    high_priority = {'walking', 'running', 'sitting', 'standing', 'jumping', 'dancing',
                     'eating', 'drinking', 'driving', 'flying', 'swimming', 'climbing'}
    medium_priority = {'holding', 'wearing', 'looking', 'pointing', 'smiling', 'talking',
                       'reading', 'writing', 'playing', 'working', 'sleeping'}
    tracked_objects = {'car', 'bike', 'phone', 'book', 'cup', 'computer', 'dog', 'cat', 'bird'}

    # Any change in the main action is always worth showing.
    if (old_words & high_priority) != (new_words & high_priority):
        return True

    # So is more than one tracked object appearing or disappearing.
    if len((old_words & tracked_objects) ^ (new_words & tracked_objects)) > 1:
        return True

    if not (old_words | new_words):
        return True

    def weigh(words):
        # Action words count 3, secondary verbs 2, everything else 1.
        return sum(
            3 if word in high_priority else 2 if word in medium_priority else 1
            for word in words
        )

    denominator = weigh(old_words) + weigh(new_words)
    if denominator > 0:
        weighted_similarity = (2 * weigh(old_words & new_words)) / denominator
    else:
        weighted_similarity = 0
    return weighted_similarity < 0.75
def add_context_to_caption(sid, caption):
    """Append " (continuing)" when the caption repeats a recent caption's action words.

    Looks at up to the last three history entries for ``sid``, excluding the
    newest one, and compares their action words with those in ``caption``.
    Returns ``caption`` unchanged when the client has fewer than two history
    entries, when ``caption`` has no action word, or when nothing matches.
    """
    if sid not in caption_history or len(caption_history[sid]) < 2:
        return caption

    recent = [entry['caption'] for entry in list(caption_history[sid])[-3:]]
    action_words = {'walking', 'running', 'sitting', 'standing', 'eating', 'drinking'}

    current_actions = set(caption.lower().split()) & action_words
    if not current_actions:
        return caption

    same_action_seen = any(
        (set(earlier.lower().split()) & action_words) == current_actions
        for earlier in recent[:-1]
    )
    return f"{caption} (continuing)" if same_action_seen else caption
def generate_caption_advanced(image):
    """Generate a caption for a preprocessed image with the BLIP model.

    Args:
        image: Image accepted by the module-level ``processor``.

    Returns:
        The caption after enhance_caption_advanced, or the placeholder
        "Processing scene..." when anything in the pipeline raises.
    """
    # Imported here so this function does not depend on a file-level import.
    from contextlib import nullcontext

    try:
        inputs = processor(image, return_tensors="pt").to(device)
        # NOTE(review): do_sample=True makes output non-deterministic, so the
        # same frame can yield different captions between cache misses.
        generation_kwargs = {
            'max_length': 30,
            'min_length': 8,
            'num_beams': 5,
            'do_sample': True,
            'temperature': 0.8,
            'top_p': 0.95,
            'top_k': 50,
            'early_stopping': True,
            'no_repeat_ngram_size': 3,
            'length_penalty': 1.1,
            'repetition_penalty': 1.2
        }
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast and
        # needs no conditional import to be in scope.
        use_amp = USE_AMP and device.type == 'cuda'
        amp_context = torch.autocast(device_type='cuda') if use_amp else nullcontext()
        with amp_context, torch.no_grad():
            generated_ids = model.generate(**inputs, **generation_kwargs)
        caption = processor.decode(generated_ids[0], skip_special_tokens=True)
        return enhance_caption_advanced(caption)
    except Exception as e:
        print(f"Error in generate_caption_advanced: {e}")
        return "Processing scene..."
import hashlib
import random
import re


def _match_leading_case(original, replacement):
    """Return ``replacement`` capitalised the same way as the first letter of ``original``."""
    if original[:1].isupper():
        return replacement[:1].upper() + replacement[1:]
    return replacement


def enhance_caption_advanced(caption):
    """Post-process a raw model caption into display text.

    Steps: strip whitespace; drop one leading boilerplate prefix such as
    "a picture of "; apply a fixed list of phrase rewrites; capitalise the
    first letter; finally, with a 30% chance per action word, swap it for a
    synonym. The synonym choice is derived from the caption text itself, so
    the same caption always gives the same result and the global ``random``
    state is left untouched.

    Args:
        caption: Raw caption text.

    Returns:
        The rewritten caption, or "Analyzing scene..." for empty input.
    """
    caption = caption.strip()
    if not caption:
        return "Analyzing scene..."

    # Remove at most one boilerplate prefix (case-insensitive).
    prefixes_to_remove = (
        "a picture of ", "an image of ", "this is ", "there is ", "there are ",
        "the image shows ", "this image shows ", "a photo of ", "a photograph of "
    )
    caption_lower = caption.lower()
    for prefix in prefixes_to_remove:
        if caption_lower.startswith(prefix):
            caption = caption[len(prefix):]
            break

    preposition_swaps = {'on': 'at', 'in': 'within'}

    def _swap_preposition(match):
        # Rewrite only the trailing preposition. Replacing 'on'/'in' anywhere
        # in the phrase would also hit the "in" inside "sitting"/"standing".
        verb, preposition = match.group(3).rsplit(' ', 1)
        return f"{match.group(1)} {verb} {preposition_swaps[preposition.lower()]}"

    # NOTE(review): 'holding a' -> 'holding' and 'wearing a' -> 'wearing' drop
    # the article; kept as in the original, confirm that this is intended.
    replacements = (
        (r'\b(man|woman|person) (is )?(sitting on|standing in|walking on)\b', _swap_preposition),
        (r'\bholding a\b', 'holding'),
        (r'\bwearing a\b', 'wearing'),
        (r'\blooking at the\b', 'observing the'),
        (r'\bstanding next to\b', 'beside'),
        (r'\bwalking down\b', 'walking along'),
        (r'\bsitting at\b', 'seated at'),
    )
    for pattern, replacement in replacements:
        caption = re.sub(pattern, replacement, caption, flags=re.IGNORECASE)

    # Capitalize appropriately
    if caption and not caption[0].isupper():
        caption = caption[0].upper() + caption[1:]

    # Whole-word patterns, so e.g. "beholding" is left alone. The synonyms for
    # "looking" all stand in for "looking at", hence that pattern includes "at".
    action_variations = (
        (r'\bwalking\b', ['strolling', 'moving', 'walking']),
        (r'\bsitting\b', ['seated', 'resting', 'sitting']),
        (r'\bstanding\b', ['positioned', 'standing', 'upright']),
        (r'\bholding\b', ['grasping', 'carrying', 'holding']),
        (r'\blooking at\b', ['observing', 'viewing', 'watching', 'looking at']),
    )

    # A private generator seeded from a stable digest: str hash() is salted
    # per process, and random.seed() would disturb every other user of the
    # module-level generator.
    seed = int(hashlib.md5(caption.encode('utf-8')).hexdigest()[:8], 16)
    rng = random.Random(seed)
    for pattern, variations in action_variations:
        compiled = re.compile(pattern, re.IGNORECASE)
        if compiled.search(caption) and rng.random() < 0.3:  # 30% chance to vary
            chosen = rng.choice(variations)
            caption = compiled.sub(
                lambda match, chosen=chosen: _match_leading_case(match.group(0), chosen),
                caption,
            )
    return caption
+# ---- 3. ENHANCED FLASK ROUTES ----
@app.route('/')
def index():
    """Serve the single-page front end from templates/index.html."""
    page = render_template('index.html')
    return page
@app.route('/status')
def status():
    """Report connection count, device, configuration, performance and cache size."""
    performance = perf_monitor.get_stats()

    configuration = {
        'frame_skip': FRAME_SKIP,
        'image_size': IMAGE_SIZE,
        'buffer_size': BUFFER_SIZE,
        'cache_size': CACHE_SIZE,
        'batch_size': BATCH_SIZE,
        'adaptive_quality': ADAPTIVE_QUALITY
    }
    cache_info = {
        'size': len(caption_cache.cache),
        'max_size': CACHE_SIZE
    }
    # 'torch_script' is derived from the device type alone.
    optimizations = {
        'mixed_precision': USE_AMP,
        'torch_script': device.type == 'cuda',
        'thread_pool_size': MAX_WORKERS
    }

    return {
        'active_connections': len(frame_counters),
        'device': str(device),
        'configuration': configuration,
        'performance': performance,
        'cache_info': cache_info,
        'optimizations': optimizations
    }
@app.route('/metrics')
def metrics():
    """Return global statistics plus per-client metrics and basic system info.

    Clients with no recorded processing times are omitted from
    ``client_metrics``.
    """
    stats = perf_monitor.get_stats()

    # Socket handlers and worker threads add and remove clients concurrently,
    # so iterate over snapshots rather than the live dicts and deques.
    client_metrics = {}
    for sid, frame_count in list(frame_counters.items()):
        times = list(processing_times.get(sid, ()))
        if not times:
            continue
        client_metrics[sid] = {
            'frames_processed': frame_count,
            'avg_processing_time': float(np.mean(times)),
            'caption_history_size': len(caption_history.get(sid, [])),
            'last_caption': last_captions.get(sid, "None")
        }

    cuda_available = torch.cuda.is_available()
    return {
        'global_metrics': stats,
        'client_metrics': client_metrics,
        'system_info': {
            'device': str(device),
            'cuda_available': cuda_available,
            'cuda_memory': torch.cuda.get_device_properties(0).total_memory if cuda_available else None
        }
    }
@app.route('/clear_cache')
def clear_cache():
    """Empty the caption cache and report when it happened."""
    caption_cache.clear()
    response = {'status': 'cache_cleared', 'timestamp': time.time()}
    return response
@app.route('/config', methods=['GET', 'POST'])
def config():
    """Read (GET) or update (POST) the runtime-tunable settings.

    POST expects a JSON object with any of ``frame_skip`` (integer, clamped
    to at least 1), ``adaptive_quality`` (boolean, or a string such as
    "true"/"false") and ``scene_change_threshold`` (number between 0 and 1).
    All supplied values are validated before any is applied; invalid input
    yields a 400 response and changes nothing.
    """
    global FRAME_SKIP, ADAPTIVE_QUALITY, SCENE_CHANGE_THRESHOLD

    if request.method == 'POST':
        config_data = request.get_json(silent=True)
        if not isinstance(config_data, dict):
            return {'status': 'error', 'message': 'Expected a JSON object body'}, 400

        new_frame_skip = FRAME_SKIP
        new_adaptive_quality = ADAPTIVE_QUALITY
        new_threshold = SCENE_CHANGE_THRESHOLD
        try:
            if 'frame_skip' in config_data:
                new_frame_skip = max(1, int(config_data['frame_skip']))
            if 'scene_change_threshold' in config_data:
                new_threshold = float(config_data['scene_change_threshold'])
        except (TypeError, ValueError):
            return {'status': 'error',
                    'message': 'frame_skip and scene_change_threshold must be numeric'}, 400
        # Written so that NaN is rejected as well.
        if not 0.0 <= new_threshold <= 1.0:
            return {'status': 'error',
                    'message': 'scene_change_threshold must be between 0 and 1'}, 400
        if 'adaptive_quality' in config_data:
            raw_flag = config_data['adaptive_quality']
            if isinstance(raw_flag, str):
                # bool("false") is True, so interpret strings explicitly.
                new_adaptive_quality = raw_flag.strip().lower() in ('1', 'true', 'yes', 'on')
            else:
                new_adaptive_quality = bool(raw_flag)

        FRAME_SKIP = new_frame_skip
        ADAPTIVE_QUALITY = new_adaptive_quality
        SCENE_CHANGE_THRESHOLD = new_threshold
        return {'status': 'updated', 'config': {
            'frame_skip': FRAME_SKIP,
            'adaptive_quality': ADAPTIVE_QUALITY,
            'scene_change_threshold': SCENE_CHANGE_THRESHOLD
        }}

    return {
        'frame_skip': FRAME_SKIP,
        'adaptive_quality': ADAPTIVE_QUALITY,
        'scene_change_threshold': SCENE_CHANGE_THRESHOLD
    }
+# ---- 4. ENHANCED STARTUP ----
if __name__ == '__main__':
    # Print a startup banner summarising the effective settings, then serve.
    divider = "=" * 60
    banner_lines = [
        divider,
        "🚀 Starting Enhanced Real-Time Video Captioning Server",
        divider,
        f"📱 Device: {device}",
        f"🎯 Image Processing: {IMAGE_SIZE}x{IMAGE_SIZE}",
        f"⚡ Frame Skip: {FRAME_SKIP} (adaptive)",
        f"🧠 Mixed Precision: {USE_AMP}",
        f"🔄 Thread Pool: {MAX_WORKERS} workers",
        f"💾 Cache Size: {CACHE_SIZE} entries (LRU)",
        f"🎨 Quality Enhancement: {ADAPTIVE_QUALITY}",
        "🔍 Scene Change Detection: Enabled",
        divider,
    ]
    for banner_line in banner_lines:
        print(banner_line)
    socketio.run(app, host='0.0.0.0', port=5000, debug=False, allow_unsafe_werkzeug=True)

dockerfile ADDED Viewed

	@@ -0,0 +1,30 @@

+# Dockerfile
+# Use an official Python runtime as a parent image
+FROM python:3.11-slim
+# Set the working directory in the container
+WORKDIR /app
+# Copy the requirements file into the container
+COPY requirements.txt .
+# Install any needed packages specified in requirements.txt
+# --no-cache-dir ensures the image is smaller
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of your application code into the container
+COPY . .
+# Make port 5000 available to the world outside this container
+# Your app runs on port 5000 according to your script
+EXPOSE 5000
+# Define environment variables if needed
+ENV FLASK_APP=app.py
+# Start the application with its built-in Flask-SocketIO server (`python app.py`).
+# A gunicorn command is kept below, commented out, as a possible production alternative.
+# CMD ["gunicorn", "--workers", "1", "--threads", "4", "--bind", "0.0.0.0:5000", "--log-level", "info", "app:app"]
+CMD ["python", "app.py"]

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+numpy
+torch
+torchvision
+flask
+flask_socketio
+pillow
+opencv-python
+transformers
+accelerate
+opencv-contrib-python
+ultralytics
+gunicorn
+python-engineio
+gevent-websocket
+opencv-python-headless

static/app.js ADDED Viewed

	@@ -0,0 +1,447 @@

+document.addEventListener('DOMContentLoaded', () => {
+    // ---- 1. DOM Element References ----
+    const elements = {
+        webcam: document.getElementById('webcam'),
+        captionText: document.getElementById('caption-text'),
+        confidenceFill: document.getElementById('confidence-fill'),
+        confidenceText: document.getElementById('confidence-text'),
+        captionTimestamp: document.getElementById('caption-timestamp'),
+        startButton: document.getElementById('startButton'),
+        stopButton: document.getElementById('stopButton'),
+        muteButton: document.getElementById('muteButton'),
+        settingsButton: document.getElementById('settingsButton'),
+        fullscreenButton: document.getElementById('fullscreenButton'),
+        connectionStatus: document.getElementById('connection-status'),
+        fpsCounter: document.getElementById('fps-counter'),
+        recordingIndicator: document.getElementById('recording-indicator'),
+        latencyValue: document.getElementById('latency-value'),
+        accuracyValue: document.getElementById('accuracy-value'),
+        processedFrames: document.getElementById('processed-frames'),
+        captionsCount: document.getElementById('captions-count'),
+        historyList: document.getElementById('history-list'),
+        deviceInfo: document.getElementById('device-info'),
+        resolutionInfo: document.getElementById('resolution-info'),
+        cacheInfo: document.getElementById('cache-info'),
+        settingsModal: document.getElementById('settingsModal'),
+        closeSettings: document.getElementById('closeSettings'),
+        saveSettings: document.getElementById('saveSettings'),
+        resetSettings: document.getElementById('resetSettings'),
+        frameRateSelect: document.getElementById('frameRateSelect'),
+        qualitySlider: document.getElementById('qualitySlider'),
+        qualityValue: document.getElementById('qualityValue'),
+        audioToggle: document.getElementById('audioToggle'),
+        statusMessage: document.getElementById('status-message'),
+        toastContainer: document.getElementById('toastContainer')
+    };
+    // ---- 2. Application State & Settings ----
+    let socket;
+    let stream;
+    let frameSenderInterval;
+    let isCapturing = false;
+    let captionHistory = [];
+    let settings = {
+        frameRate: 15,
+        quality: 0.7,
+        audio: true
+    };
+    let performance = {
+        sentFrames: 0,
+        receivedFrames: 0,
+        captionsGenerated: 0,
+        totalConfidence: 0,
+        startTime: 0,
+        latencyBuffer: []
+    };
+    const LATENCY_BUFFER_SIZE = 20;
+    // ---- 3. Core Application Logic ----
+    /**
+     * Starts the video analysis process.
+     */
+    const startAnalysis = async () => {
+        if (isCapturing) return;
+        try {
+            // Get webcam stream
+            stream = await navigator.mediaDevices.getUserMedia({
+                video: {
+                    width: { ideal: 1280 },
+                    height: { ideal: 720 },
+                    frameRate: { ideal: 30 }
+                },
+                audio: false
+            });
+            elements.webcam.srcObject = stream;
+            await elements.webcam.play();
+            isCapturing = true;
+            // Update UI
+            updateUIForStartState();
+            connectSocket();
+        } catch (err) {
+            console.error("Error accessing webcam:", err);
+            showToast("Webcam Error", "Could not access the webcam. Please check permissions.", "error");
+            updateUIForStopState();
+        }
+    };
+    /**
+     * Stops the video analysis process.
+     */
+    const stopAnalysis = () => {
+        if (!isCapturing) return;
+        // Stop intervals and streams
+        clearInterval(frameSenderInterval);
+        frameSenderInterval = null;
+        stream?.getTracks().forEach(track => track.stop());
+        socket?.disconnect();
+        // Cancel any ongoing speech
+        window.speechSynthesis.cancel();
+        // Reset state
+        isCapturing = false;
+        elements.webcam.srcObject = null;
+        updateUIForStopState();
+        showToast("Analysis Stopped", "Real-time captioning has been turned off.", "info");
+    };
+    /**
+     * Connects to the WebSocket server and sets up event listeners.
+     */
+    const connectSocket = () => {
+        // Use the current host and port, but with the ws:// protocol
+        socket = io(window.location.origin, {
+            transports: ['websocket'],
+            upgrade: false
+        });
+        socket.on('connect', () => {
+            console.log('Connected to server! SID:', socket.id);
+            elements.connectionStatus.textContent = "Connected";
+            elements.connectionStatus.style.color = 'var(--success-color)';
+            showToast("Connected", "Successfully connected to the AI server.", "success");
+            startFrameSending();
+        });
+        socket.on('caption', handleCaption);
+        socket.on('disconnect', () => {
+            console.log('Disconnected from server.');
+            elements.connectionStatus.textContent = "Disconnected";
+            elements.connectionStatus.style.color = 'var(--danger-color)';
+            if (isCapturing) {
+                stopAnalysis();
+            }
+        });
+        socket.on('connect_error', (error) => {
+            console.error('Connection error:', error);
+            showToast("Connection Error", "Failed to connect to the server.", "error");
+            stopAnalysis();
+        });
+    };
+    /**
+     * Initializes the interval for sending video frames to the server.
+     */
+    const startFrameSending = () => {
+        const canvas = document.createElement('canvas');
+        const context = canvas.getContext('2d', { alpha: false });
+        frameSenderInterval = setInterval(() => {
+            if (!isCapturing || elements.webcam.paused || elements.webcam.ended) {
+                return;
+            }
+            // Match the server's expected image size
+            canvas.width = 384;
+            canvas.height = 384;
+            context.drawImage(elements.webcam, 0, 0, canvas.width, canvas.height);
+            const dataUrl = canvas.toDataURL('image/jpeg', settings.quality);
+            socket.emit('image', dataUrl);
+            performance.sentFrames++;
+            updatePerformanceUI();
+        }, 1000 / settings.frameRate);
+    };
+    // ---- 4. UI Update Functions ----
+    /**
+     * Handles incoming captions from the server.
+     * @param {object} data - The caption data from the server.
+     */
+    const handleCaption = (data) => {
+        performance.receivedFrames++;
+        performance.captionsGenerated++;
+        performance.totalConfidence += data.confidence;
+        // Update main caption display
+        elements.captionText.textContent = data.caption;
+        const confidencePercent = (data.confidence * 100).toFixed(0);
+        elements.confidenceFill.style.width = `${confidencePercent}%`;
+        elements.confidenceText.textContent = `${confidencePercent}%`;
+        const timestamp = new Date(data.timestamp * 1000);
+        elements.captionTimestamp.textContent = timestamp.toLocaleTimeString();
+        // Calculate latency
+        const latency = (Date.now() / 1000) - data.timestamp;
+        performance.latencyBuffer.push(latency);
+        if (performance.latencyBuffer.length > LATENCY_BUFFER_SIZE) {
+            performance.latencyBuffer.shift();
+        }
+        // Add to history
+        updateHistory(data.caption, confidencePercent, timestamp);
+        // Speak the caption
+        if (settings.audio) {
+            speakCaption(data.caption);
+        }
+    };
+    /**
+     * Updates the UI to reflect the "capturing started" state.
+     */
+    const updateUIForStartState = () => {
+        elements.startButton.disabled = true;
+        elements.stopButton.disabled = false;
+        elements.recordingIndicator.classList.add('active');
+        elements.statusMessage.textContent = "AI analysis is active...";
+        // Reset performance metrics
+        performance = {
+            sentFrames: 0,
+            receivedFrames: 0,
+            captionsGenerated: 0,
+            totalConfidence: 0,
+            startTime: Date.now(),
+            latencyBuffer: []
+        };
+        elements.resolutionInfo.textContent = `${elements.webcam.videoWidth}x${elements.webcam.videoHeight}`;
+        elements.historyList.innerHTML = '<div class="history-item"><div class="history-text">Waiting for captions...</div></div>';
+    };
+    /**
+     * Updates the UI to reflect the "capturing stopped" state.
+     */
+    const updateUIForStopState = () => {
+        elements.startButton.disabled = false;
+        elements.stopButton.disabled = true;
+        elements.recordingIndicator.classList.remove('active');
+        elements.statusMessage.textContent = "Ready to start analysis.";
+        elements.connectionStatus.textContent = "Disconnected";
+        elements.connectionStatus.style.color = 'var(--text-secondary)';
+        elements.fpsCounter.textContent = '0';
+        elements.latencyValue.textContent = '0ms';
+        elements.captionText.textContent = "Analysis stopped.";
+    };
+    /**
+     * Periodically updates performance metrics on the UI.
+     */
+    const updatePerformanceUI = () => {
+        const elapsedSeconds = (Date.now() - performance.startTime) / 1000;
+        if (elapsedSeconds === 0) return;
+        const fps = (performance.sentFrames / elapsedSeconds).toFixed(0);
+        elements.fpsCounter.textContent = fps;
+        const avgLatency = performance.latencyBuffer.reduce((a, b) => a + b, 0) / performance.latencyBuffer.length || 0;
+        elements.latencyValue.textContent = `${(avgLatency * 1000).toFixed(0)}ms`;
+        const avgConfidence = (performance.totalConfidence / performance.captionsGenerated * 100) || 0;
+        elements.accuracyValue.textContent = `${avgConfidence.toFixed(0)}%`;
+        elements.processedFrames.textContent = performance.receivedFrames;
+        elements.captionsCount.textContent = performance.captionsGenerated;
+    };
+    /**
+     * Adds a new caption to the history panel.
+     * @param {string} caption - The caption text.
+     * @param {string} confidence - The confidence percentage string.
+     * @param {Date} timestamp - The Date object for the caption.
+     */
+    const updateHistory = (caption, confidence, timestamp) => {
+        // Remove placeholder if it exists
+        if (captionHistory.length === 0) {
+            elements.historyList.innerHTML = '';
+        }
+        const historyItem = { caption, confidence, timestamp };
+        captionHistory.unshift(historyItem);
+        if (captionHistory.length > 20) { // Limit history size
+            captionHistory.pop();
+        }
+        const itemElement = document.createElement('div');
+        itemElement.className = 'history-item';
+        itemElement.innerHTML = `
+            <div class="history-text">${caption}</div>
+            <div class="history-meta">
+                <span class="history-confidence">${confidence}%</span>
+                <span class="history-time">${timestamp.toLocaleTimeString()}</span>
+            </div>
+        `;
+        elements.historyList.prepend(itemElement);
+        // Remove the last element if list is too long
+        if (elements.historyList.children.length > 20) {
+            elements.historyList.lastChild.remove();
+        }
+    };
+    /**
+     * Fetches and displays system status from the server.
+     */
+    const fetchStatus = async () => {
+        try {
+            const response = await fetch('/status');
+            const data = await response.json();
+            elements.deviceInfo.textContent = data.device.toUpperCase();
+            elements.cacheInfo.textContent = `${(data.performance.cache_hit_rate * 100).toFixed(0)}%`;
+        } catch (error) {
+            console.error("Error fetching server status:", error);
+            elements.deviceInfo.textContent = 'Error';
+        }
+    };
+    // ---- 5. Feature Logic (Audio, Settings, etc.) ----
+    /**
+     * Uses the Web Speech API to speak the provided text.
+     * @param {string} text - The text to be spoken.
+     */
+    const speakCaption = (text) => {
+        if (!text || text.toLowerCase().includes("processing")) return;
+        window.speechSynthesis.cancel(); // Interrupt previous speech for the latest update
+        const utterance = new SpeechSynthesisUtterance(text);
+        utterance.rate = 1.1;
+        utterance.pitch = 1.0;
+        utterance.volume = 0.8;
+        window.speechSynthesis.speak(utterance);
+    };
+    /**
+     * Toggles the settings modal visibility.
+     */
+    const toggleSettingsModal = () => {
+        elements.settingsModal.classList.toggle('active');
+    };
+    /**
+     * Saves the settings from the modal and applies them.
+     */
+    const saveSettings = () => {
+        settings.frameRate = parseInt(elements.frameRateSelect.value, 10);
+        settings.quality = parseFloat(elements.qualitySlider.value);
+        settings.audio = elements.audioToggle.checked;
+        toggleSettingsModal();
+        showToast("Settings Saved", "Your new settings have been applied.", "success");
+        // If capturing, restart the interval to apply new frame rate
+        if (isCapturing) {
+            clearInterval(frameSenderInterval);
+            startFrameSending();
+        }
+    };
+    /**
+     * Creates and displays a toast notification.
+     * @param {string} title - The title of the toast.
+     * @param {string} message - The message body of the toast.
+     * @param {string} type - The type of toast (success, error, info, warning).
+     */
+    const showToast = (title, message, type = 'info') => {
+        const toast = document.createElement('div');
+        toast.className = `toast ${type}`;
+        toast.innerHTML = `
+            <div class="toast-content">
+                <div class="toast-title">${title}</div>
+                <div class="toast-message">${message}</div>
+            </div>
+            <button class="toast-close">&times;</button>
+        `;
+        elements.toastContainer.appendChild(toast);
+        setTimeout(() => toast.classList.add('show'), 10);
+        const removeToast = () => {
+            toast.classList.remove('show');
+            setTimeout(() => toast.remove(), 500);
+        };
+        toast.querySelector('.toast-close').onclick = removeToast;
+        setTimeout(removeToast, 5000);
+    };
+    // ---- 6. Event Listeners ----
+    elements.startButton.addEventListener('click', startAnalysis);
+    elements.stopButton.addEventListener('click', stopAnalysis);
+    elements.settingsButton.addEventListener('click', toggleSettingsModal);
+    elements.closeSettings.addEventListener('click', toggleSettingsModal);
+    elements.saveSettings.addEventListener('click', saveSettings);
+    elements.muteButton.addEventListener('click', () => {
+        settings.audio = !settings.audio;
+        elements.audioToggle.checked = settings.audio;
+        elements.muteButton.classList.toggle('active', settings.audio);
+        showToast("Audio " + (settings.audio ? "Enabled" : "Disabled"), "", "info");
+    });
+    elements.fullscreenButton.addEventListener('click', () => {
+        if (!document.fullscreenElement) {
+            elements.webcam.parentElement.requestFullscreen();
+        } else {
+            document.exitFullscreen();
+        }
+    });
+    elements.qualitySlider.addEventListener('input', (e) => {
+        elements.qualityValue.textContent = `${Math.round(e.target.value * 100)}%`;
+    });
+    document.addEventListener('keydown', (e) => {
+        if (e.target.tagName === 'INPUT' || e.target.tagName === 'SELECT') return;
+        switch (e.code) {
+            case 'Space':
+                e.preventDefault();
+                isCapturing ? stopAnalysis() : startAnalysis();
+                break;
+            case 'KeyS':
+                e.preventDefault();
+                toggleSettingsModal();
+                break;
+            case 'KeyM':
+                e.preventDefault();
+                elements.muteButton.click();
+                break;
+            case 'KeyF':
+                 e.preventDefault();
+                elements.fullscreenButton.click();
+                break;
+        }
+    });
+    // ---- 7. Initialization ----
+    const init = () => {
+        updateUIForStopState();
+        fetchStatus();
+    };
+    init();
+});

static/style.css ADDED Viewed

	@@ -0,0 +1,1103 @@

+/* Reset and Base Styles */
+* { /* Global reset: removes default spacing and sizes every box by its border box. */
+    margin: 0;
+    padding: 0;
+    box-sizing: border-box;
+}
+:root { /* Design tokens, referenced through var(--...) in the rules below. */
+    --primary-color: #0066cc; /* accent and status colours */
+    --primary-hover: #0052a3;
+    --secondary-color: #6c757d;
+    --success-color: #28a745;
+    --danger-color: #dc3545;
+    --warning-color: #ffc107;
+    --info-color: #17a2b8;
+    --bg-primary: #0a0e1a; /* background surfaces */
+    --bg-secondary: #1a1f2e;
+    --bg-tertiary: #242938;
+    --bg-card: rgba(26, 31, 46, 0.8);
+    --bg-overlay: rgba(0, 0, 0, 0.7);
+    --text-primary: #ffffff; /* text colours */
+    --text-secondary: #b8c1d3;
+    --text-muted: #6c757d;
+    --border-color: rgba(255, 255, 255, 0.1); /* borders: resting and highlighted */
+    --border-active: rgba(0, 102, 204, 0.5);
+    --shadow-sm: 0 2px 4px rgba(0, 0, 0, 0.1); /* box-shadow scale */
+    --shadow-md: 0 4px 12px rgba(0, 0, 0, 0.15);
+    --shadow-lg: 0 8px 24px rgba(0, 0, 0, 0.25);
+    --shadow-xl: 0 12px 36px rgba(0, 0, 0, 0.35);
+    --radius-sm: 6px; /* border-radius scale */
+    --radius-md: 8px;
+    --radius-lg: 12px;
+    --radius-xl: 16px;
+    --transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1); /* default transition shorthand */
+    --transition-fast: all 0.15s ease;
+}
+body { /* Page-wide typography and the diagonal background gradient. */
+    font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+    background: linear-gradient(135deg, var(--bg-primary) 0%, #0f1419 50%, var(--bg-secondary) 100%);
+    color: var(--text-primary);
+    line-height: 1.6;
+    min-height: 100vh;
+    overflow-x: hidden; /* no horizontal page scrolling */
+    font-feature-settings: 'kern' 1, 'liga' 1;
+}
+/* App Container */
+.app-container { /* Full-height vertical flex column. */
+    min-height: 100vh;
+    display: flex;
+    flex-direction: column;
+}
+/* Header */
+.header { /* Translucent top bar that stays pinned while the page scrolls. */
+    background: var(--bg-card);
+    backdrop-filter: blur(20px);
+    border-bottom: 1px solid var(--border-color);
+    padding: 1rem 2rem;
+    position: sticky;
+    top: 0;
+    z-index: 100; /* below .modal (1000) and .toast-container (1100) */
+}
+.header-content { /* Logo on the left, stats on the right, centred up to 1400px. */
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    max-width: 1400px;
+    margin: 0 auto;
+}
+.logo-section {
+    display: flex;
+    align-items: center;
+    gap: 1rem;
+}
+.logo-icon { /* Gradient-filled glyph: the background is clipped to the text shape. */
+    font-size: 2rem;
+    background: linear-gradient(135deg, var(--primary-color), var(--info-color));
+    background-clip: text;
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent; /* lets the clipped background show through the glyphs */
+}
+.header h1 { /* Same gradient-text treatment as .logo-icon. */
+    font-size: 1.5rem;
+    font-weight: 700;
+    margin: 0;
+    background: linear-gradient(135deg, var(--primary-color), var(--info-color));
+    background-clip: text;
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+}
+.subtitle {
+    font-size: 0.875rem;
+    color: var(--text-secondary);
+    margin: 0;
+}
+.header-stats { /* Row of label/value pairs. */
+    display: flex;
+    gap: 2rem;
+}
+.stat-item {
+    text-align: center;
+}
+.stat-label { /* Small upper-case caption above the value. */
+    display: block;
+    font-size: 0.75rem;
+    color: var(--text-muted);
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+    margin-bottom: 0.25rem;
+}
+.stat-value {
+    display: block;
+    font-size: 1.25rem;
+    font-weight: 600;
+    color: var(--text-primary);
+}
+/* Main Content */
+.main-content { /* Two-column grid: flexible first column plus a fixed 350px second column. */
+    flex: 1; /* takes the remaining height inside a flex column parent */
+    display: grid;
+    grid-template-columns: 1fr 350px;
+    gap: 2rem;
+    padding: 2rem;
+    max-width: 1400px;
+    margin: 0 auto;
+    width: 100%;
+}
+.video-section { /* Vertical stack with 1.5rem between children. */
+    display: flex;
+    flex-direction: column;
+    gap: 1.5rem;
+}
+/* Video Container */
+.video-container { /* Rounded frame; also the positioning context for .video-overlay. */
+    position: relative;
+    background: var(--bg-tertiary);
+    border-radius: var(--radius-xl);
+    overflow: hidden; /* clips content to the rounded corners */
+    box-shadow: var(--shadow-xl);
+    border: 1px solid var(--border-color);
+    transition: var(--transition);
+}
+.video-container:hover { /* Adds a 1px accent ring on hover. */
+    box-shadow: var(--shadow-xl), 0 0 0 1px var(--border-active);
+}
+#webcam { /* Fixed-height video; object-fit: cover crops instead of letterboxing. */
+    width: 100%;
+    height: 480px;
+    object-fit: cover;
+    display: block;
+    background: #000;
+}
+/* Video Overlay */
+.video-overlay { /* Covers its positioned ancestor; first child at the top, last at the bottom. */
+    position: absolute;
+    inset: 0;
+    pointer-events: none; /* the overlay itself never intercepts pointer events */
+    display: flex;
+    flex-direction: column;
+    justify-content: space-between;
+}
+.overlay-top {
+    display: flex;
+    justify-content: space-between;
+    align-items: flex-start;
+    padding: 1rem;
+}
+.recording-indicator { /* Invisible until the 'active' class is added. */
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+    background: var(--bg-overlay);
+    backdrop-filter: blur(10px);
+    padding: 0.5rem 1rem;
+    border-radius: var(--radius-lg);
+    color: var(--danger-color);
+    font-size: 0.875rem;
+    font-weight: 600;
+    opacity: 0;
+    transition: var(--transition);
+}
+.recording-indicator.active { /* Fades in through the transition declared above. */
+    opacity: 1;
+}
+.recording-dot { /* 8px dot that blinks through the pulse animation. */
+    width: 8px;
+    height: 8px;
+    background: var(--danger-color);
+    border-radius: 50%;
+    animation: pulse 2s infinite;
+}
+@keyframes pulse { /* Fades to half opacity and back. */
+    0%, 100% { opacity: 1; }
+    50% { opacity: 0.5; }
+}
+.quality-indicator {
+    background: var(--bg-overlay);
+    backdrop-filter: blur(10px);
+    padding: 0.25rem 0.75rem;
+    border-radius: var(--radius-md);
+    font-size: 0.75rem;
+    font-weight: 600;
+    color: var(--success-color);
+}
+/* Caption Overlay */
+.caption-overlay { /* Bottom-to-top gradient that fades to transparent. */
+    padding: 2rem;
+    background: linear-gradient(
+        to top,
+        var(--bg-overlay) 0%,
+        rgba(0, 0, 0, 0.4) 70%,
+        transparent 100%
+    );
+    backdrop-filter: blur(10px);
+}
+.caption-content {
+    max-width: 600px;
+}
+#caption-text { /* Large caption line with a drop shadow. */
+    font-size: 1.5rem;
+    font-weight: 600;
+    line-height: 1.4;
+    margin-bottom: 1rem;
+    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.8);
+    min-height: 2.1rem; /* equals one line: 1.5rem x 1.4 line-height */
+    transition: var(--transition);
+}
+.caption-metadata { /* Row layout with 1rem gaps. */
+    display: flex;
+    align-items: center;
+    gap: 1rem;
+    font-size: 0.875rem;
+}
+.confidence-bar { /* Track of the confidence meter; takes the free width of the row. */
+    flex: 1;
+    height: 6px;
+    background: rgba(255, 255, 255, 0.2);
+    border-radius: 3px;
+    overflow: hidden;
+}
+.confidence-fill { /* Meter fill; starts at 0% and animates changes of its width. */
+    height: 100%;
+    background: linear-gradient(90deg, var(--danger-color), var(--warning-color), var(--success-color));
+    border-radius: 3px;
+    transition: width 0.5s ease;
+    width: 0%;
+}
+.confidence-text {
+    color: var(--text-secondary);
+    font-weight: 600;
+    min-width: 40px;
+}
+.timestamp {
+    color: var(--text-muted);
+    font-size: 0.75rem;
+}
+/* Controls */
+.controls-section { /* Row that pushes its groups apart and wraps when space runs out. */
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    gap: 1rem;
+    flex-wrap: wrap;
+}
+.main-controls, .advanced-controls {
+    display: flex;
+    gap: 1rem;
+}
+/* Button Styles */
+.btn { /* Shared button base; the colour variants below add the fill. */
+    display: inline-flex;
+    align-items: center;
+    gap: 0.5rem;
+    padding: 0.75rem 1.5rem;
+    border: none;
+    border-radius: var(--radius-md);
+    font-size: 0.875rem;
+    font-weight: 600;
+    cursor: pointer;
+    transition: var(--transition);
+    text-decoration: none;
+    white-space: nowrap;
+    position: relative; /* positioning context for the ::before sheen */
+    overflow: hidden; /* hides the sheen while it sits outside the button */
+}
+.btn::before { /* Sheen strip parked one full width to the left of the button. */
+    content: '';
+    position: absolute;
+    top: 0;
+    left: -100%;
+    width: 100%;
+    height: 100%;
+    background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.1), transparent);
+    transition: left 0.5s ease;
+}
+.btn:hover::before { /* Sweeps the sheen across to the right on hover. */
+    left: 100%;
+}
+.btn-primary {
+    background: linear-gradient(135deg, var(--primary-color), var(--primary-hover));
+    color: white;
+    box-shadow: var(--shadow-md);
+}
+.btn-primary:hover:not(:disabled) { /* Lift effect, skipped for disabled buttons. */
+    transform: translateY(-2px);
+    box-shadow: var(--shadow-lg);
+}
+.btn-secondary { /* Note: filled with the danger colour, not --secondary-color. */
+    background: linear-gradient(135deg, var(--danger-color), #b02e3c);
+    color: white;
+    box-shadow: var(--shadow-md);
+}
+.btn-secondary:hover:not(:disabled) {
+    transform: translateY(-2px);
+    box-shadow: var(--shadow-lg);
+}
+.btn-outline {
+    background: var(--bg-card);
+    color: var(--text-secondary);
+    border: 1px solid var(--border-color);
+}
+.btn-outline:hover {
+    background: var(--bg-tertiary);
+    color: var(--text-primary);
+    border-color: var(--border-active);
+}
+.btn:disabled { /* Dimmed, with the hover lift forced off. */
+    opacity: 0.5;
+    cursor: not-allowed;
+    transform: none !important;
+}
+.btn-icon {
+    font-size: 1rem;
+}
+/* Sidebar */
+.sidebar { /* Vertical stack of panels. */
+    display: flex;
+    flex-direction: column;
+    gap: 1.5rem;
+}
+.metrics-panel, .history-panel, .system-panel { /* Shared translucent card styling. */
+    background: var(--bg-card);
+    backdrop-filter: blur(20px);
+    border: 1px solid var(--border-color);
+    border-radius: var(--radius-lg);
+    padding: 1.5rem;
+    box-shadow: var(--shadow-md);
+}
+.metrics-panel h3, .history-panel h3, .system-panel h3 { /* Panel headings, laid out as a flex row. */
+    font-size: 1.125rem;
+    font-weight: 600;
+    margin-bottom: 1rem;
+    color: var(--text-primary);
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+}
+/* Metrics Grid */
+.metrics-grid { /* Two equal columns. */
+    display: grid;
+    grid-template-columns: 1fr 1fr;
+    gap: 1rem;
+}
+.metric-item {
+    text-align: center;
+    padding: 1rem;
+    background: var(--bg-tertiary);
+    border-radius: var(--radius-md);
+    border: 1px solid var(--border-color);
+    transition: var(--transition);
+}
+.metric-item:hover { /* Accent border and a 2px lift on hover. */
+    border-color: var(--border-active);
+    transform: translateY(-2px);
+}
+.metric-value {
+    font-size: 1.5rem;
+    font-weight: 700;
+    color: var(--primary-color);
+    margin-bottom: 0.25rem;
+}
+.metric-label {
+    font-size: 0.75rem;
+    color: var(--text-muted);
+    text-transform: uppercase;
+    letter-spacing: 0.5px;
+}
+/* History Panel */
+.history-list { /* Scrolls vertically once its content exceeds 300px. */
+    max-height: 300px;
+    overflow-y: auto;
+    display: flex;
+    flex-direction: column;
+    gap: 0.75rem;
+}
+.history-list::-webkit-scrollbar { /* Slim scrollbar; ::-webkit-scrollbar rules are WebKit/Blink-specific. */
+    width: 6px;
+}
+.history-list::-webkit-scrollbar-track {
+    background: var(--bg-tertiary);
+    border-radius: 3px;
+}
+.history-list::-webkit-scrollbar-thumb {
+    background: var(--border-color);
+    border-radius: 3px;
+}
+.history-list::-webkit-scrollbar-thumb:hover {
+    background: var(--border-active);
+}
+.history-item { /* One history entry; static/app.js creates these with this class. */
+    padding: 1rem;
+    background: var(--bg-tertiary);
+    border-radius: var(--radius-md);
+    border: 1px solid var(--border-color);
+    transition: var(--transition);
+}
+.history-item:hover {
+    border-color: var(--border-active);
+}
+.history-text {
+    font-size: 0.875rem;
+    margin-bottom: 0.5rem;
+    line-height: 1.4;
+}
+.history-meta { /* Confidence badge on the left, time on the right. */
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    font-size: 0.75rem;
+    color: var(--text-muted);
+}
+.history-confidence { /* Badge; always the success colour regardless of the value shown. */
+    padding: 0.25rem 0.5rem;
+    background: var(--success-color);
+    color: white;
+    border-radius: var(--radius-sm);
+    font-weight: 600;
+}
+/* System Info */
+.system-info { /* Vertical list of label/value rows. */
+    display: flex;
+    flex-direction: column;
+    gap: 0.75rem;
+}
+.info-item { /* Label on the left, value on the right. */
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    padding: 0.75rem;
+    background: var(--bg-tertiary);
+    border-radius: var(--radius-md);
+    border: 1px solid var(--border-color);
+}
+.info-label {
+    font-weight: 600;
+    color: var(--text-secondary);
+}
+.info-value {
+    color: var(--text-primary);
+    font-weight: 500;
+}
+/* Modal */
+/*
+ * Full-viewport backdrop for the modal. It is always laid out as a flex
+ * container and hidden through visibility/opacity rather than display:
+ * switching display between none and flex cancels transitions, so the fade
+ * declared here (and the slide on .modal-content) never ran. While hidden it
+ * is invisible, ignores pointer events and its controls cannot take focus.
+ */
+.modal {
+    position: fixed;
+    inset: 0;
+    background: rgba(0, 0, 0, 0.8);
+    backdrop-filter: blur(10px);
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    z-index: 1000;
+    opacity: 0;
+    visibility: hidden;
+    pointer-events: none;
+    transition: var(--transition);
+}
+/* Shown state: the 'active' class fades the backdrop in. */
+.modal.active {
+    opacity: 1;
+    visibility: visible;
+    pointer-events: auto;
+}
+.modal-content { /* Dialog card; rests 20px above its final position until the modal is active. */
+    background: var(--bg-secondary);
+    border-radius: var(--radius-xl);
+    border: 1px solid var(--border-color);
+    box-shadow: var(--shadow-xl);
+    width: 90%;
+    max-width: 500px;
+    max-height: 80vh;
+    overflow: hidden;
+    transform: translateY(-20px);
+    transition: var(--transition);
+}
+.modal.active .modal-content { /* Final position while the modal is active. */
+    transform: translateY(0);
+}
+.modal-header { /* Title on the left, close button on the right. */
+    padding: 1.5rem;
+    border-bottom: 1px solid var(--border-color);
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+}
+.modal-header h3 {
+    font-size: 1.25rem;
+    font-weight: 600;
+}
+.modal-close { /* Bare icon-style button: no background, border or padding. */
+    background: none;
+    border: none;
+    color: var(--text-muted);
+    font-size: 1.5rem;
+    cursor: pointer;
+    padding: 0;
+    transition: var(--transition);
+}
+.modal-close:hover {
+    color: var(--text-primary);
+}
+.modal-body { /* Scrolls on its own once taller than 400px. */
+    padding: 1.5rem;
+    max-height: 400px;
+    overflow-y: auto;
+}
+.modal-body::-webkit-scrollbar { /* Same slim scrollbar styling as .history-list. */
+    width: 6px;
+}
+.modal-body::-webkit-scrollbar-track {
+    background: var(--bg-tertiary);
+    border-radius: 3px;
+}
+.modal-body::-webkit-scrollbar-thumb {
+    background: var(--border-color);
+    border-radius: 3px;
+}
+.modal-body::-webkit-scrollbar-thumb:hover {
+    background: var(--border-active);
+}
+.modal-footer { /* Right-aligned row. */
+    padding: 1.5rem;
+    border-top: 1px solid var(--border-color);
+    display: flex;
+    justify-content: flex-end;
+    gap: 1rem;
+}
+/* Settings */
+.setting-group { /* One labelled control; spacing below every group except the last. */
+    margin-bottom: 1.5rem;
+}
+.setting-group:last-child {
+    margin-bottom: 0;
+}
+.setting-group label {
+    display: block;
+    font-weight: 600;
+    margin-bottom: 0.5rem;
+    color: var(--text-primary);
+}
+.setting-group select,
+.setting-group input[type="range"] { /* Shared full-width field styling. */
+    width: 100%;
+    padding: 0.75rem;
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-color);
+    border-radius: var(--radius-md);
+    color: var(--text-primary);
+    transition: var(--transition);
+}
+.setting-group select:focus,
+.setting-group input[type="range"]:focus { /* Replaces the default focus outline with an accent border. */
+    outline: none;
+    border-color: var(--border-active);
+}
+.setting-group select { /* Native arrow removed; an inline SVG chevron is drawn as the background instead. */
+    appearance: none;
+    background-image: url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='none' viewBox='0 0 20 20'%3e%3cpath stroke='%236b7280' stroke-linecap='round' stroke-linejoin='round' stroke-width='1.5' d='m6 8 4 4 4-4'/%3e%3c/svg%3e");
+    background-position: right 0.5rem center;
+    background-repeat: no-repeat;
+    background-size: 1.5em 1.5em;
+    padding-right: 2.5rem; /* keeps the text clear of the chevron */
+}
+.setting-group input[type="range"] { /* Later rule: overrides border-radius and background from the shared rule above. */
+    /* -webkit-appearance: none; */
+    height: 8px;
+    border-radius: 4px;
+    background: var(--bg-tertiary);
+    outline: none;
+}
+.setting-group input[type="range"]::-webkit-slider-thumb { /* Slider thumb, WebKit/Blink selector. */
+    -webkit-appearance: none;
+    appearance: none;
+    width: 20px;
+    height: 20px;
+    border-radius: 50%;
+    background: var(--primary-color);
+    cursor: pointer;
+    border: 2px solid white;
+    box-shadow: var(--shadow-sm);
+}
+.setting-group input[type="range"]::-moz-range-thumb { /* Same thumb, Firefox selector. */
+    width: 20px;
+    height: 20px;
+    border-radius: 50%;
+    background: var(--primary-color);
+    cursor: pointer;
+    border: 2px solid white;
+    box-shadow: var(--shadow-sm);
+}
+/* Toggle Switch */
+.toggle-switch { /* 50x24px wrapper and positioning context for the slider. */
+    position: relative;
+    display: inline-block;
+    width: 50px;
+    height: 24px;
+}
+.toggle-switch input { /* The real input is made invisible and zero-sized; .toggle-slider is what is drawn. */
+    opacity: 0;
+    width: 0;
+    height: 0;
+}
+.toggle-slider { /* Pill-shaped track that fills the wrapper. */
+    position: absolute;
+    cursor: pointer;
+    top: 0;
+    left: 0;
+    right: 0;
+    bottom: 0;
+    background: var(--bg-tertiary);
+    border: 1px solid var(--border-color);
+    transition: var(--transition);
+    border-radius: 24px;
+}
+.toggle-slider::before { /* The 18px knob, resting at the left edge. */
+    position: absolute;
+    content: "";
+    height: 18px;
+    width: 18px;
+    left: 2px;
+    top: 2px;
+    background: var(--text-muted);
+    transition: var(--transition);
+    border-radius: 50%;
+}
+.toggle-switch input:checked + .toggle-slider { /* Checked: track takes the primary colour. */
+    background: var(--primary-color);
+    border-color: var(--primary-color);
+}
+.toggle-switch input:checked + .toggle-slider::before { /* Checked: knob moves 26px to the right and turns white. */
+    transform: translateX(26px);
+    background: white;
+}
+/* Status Bar */
+.status-bar { /* Translucent bar with its content pushed to both ends. */
+    background: var(--bg-card);
+    backdrop-filter: blur(20px);
+    border-top: 1px solid var(--border-color);
+    padding: 0.75rem 2rem;
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    font-size: 0.875rem;
+}
+.status-left {
+    color: var(--text-primary);
+    font-weight: 500;
+}
+.status-right {
+    color: var(--text-muted);
+}
+.keyboard-hint { /* Hidden at widths of 768px and below (see the responsive rules). */
+    font-size: 0.75rem;
+}
+/* Toast Notifications */
+.toast-container { /* Fixed stack in the top-right corner. */
+    position: fixed;
+    top: 2rem;
+    right: 2rem;
+    z-index: 1100; /* above .modal (1000) */
+    display: flex;
+    flex-direction: column;
+    gap: 0.5rem;
+    pointer-events: none; /* the container ignores pointer events; .toast re-enables them */
+}
+.toast { /* Starts shifted 400px to the right; the 'show' class slides it in. */
+    background: var(--bg-secondary);
+    border: 1px solid var(--border-color);
+    border-radius: var(--radius-lg);
+    padding: 1rem 1.5rem;
+    box-shadow: var(--shadow-lg);
+    backdrop-filter: blur(20px);
+    display: flex;
+    align-items: center;
+    gap: 0.75rem;
+    min-width: 300px;
+    transform: translateX(400px);
+    transition: var(--transition);
+    pointer-events: auto;
+}
+.toast.show { /* Added by showToast in static/app.js shortly after the toast is inserted. */
+    transform: translateX(0);
+}
+.toast.hide { /* NOTE(review): showToast in static/app.js removes 'show' and never adds 'hide' - confirm this rule is still needed. */
+    transform: translateX(400px);
+    opacity: 0;
+}
+.toast-icon {
+    font-size: 1.25rem;
+    flex-shrink: 0;
+}
+.toast-content { /* Takes the remaining width between the neighbouring items. */
+    flex: 1;
+}
+.toast-title {
+    font-weight: 600;
+    font-size: 0.875rem;
+    margin-bottom: 0.25rem;
+    color: var(--text-primary);
+}
+.toast-message {
+    font-size: 0.8125rem;
+    color: var(--text-secondary);
+    line-height: 1.4;
+}
+.toast-close { /* Bare icon-style button. */
+    background: none;
+    border: none;
+    color: var(--text-muted);
+    cursor: pointer;
+    padding: 0;
+    font-size: 1rem;
+    transition: var(--transition);
+    flex-shrink: 0;
+}
+.toast-close:hover {
+    color: var(--text-primary);
+}
+/* Toast Types */
+.toast.success { /* Each type sets a coloured left stripe and a matching icon colour. */
+    border-left: 4px solid var(--success-color);
+}
+.toast.success .toast-icon {
+    color: var(--success-color);
+}
+.toast.error {
+    border-left: 4px solid var(--danger-color);
+}
+.toast.error .toast-icon {
+    color: var(--danger-color);
+}
+.toast.warning {
+    border-left: 4px solid var(--warning-color);
+}
+.toast.warning .toast-icon {
+    color: var(--warning-color);
+}
+.toast.info {
+    border-left: 4px solid var(--info-color);
+}
+.toast.info .toast-icon {
+    color: var(--info-color);
+}
+/* Loading States */
+.loading { /* Utility class: clips and positions the shimmer drawn by ::after. */
+    position: relative;
+    overflow: hidden;
+}
+.loading::after { /* Translucent highlight band, animated from left to right. */
+    content: '';
+    position: absolute;
+    top: 0;
+    left: -100%;
+    width: 100%;
+    height: 100%;
+    background: linear-gradient(
+        90deg,
+        transparent,
+        rgba(255, 255, 255, 0.1),
+        transparent
+    );
+    animation: loading-shimmer 2s infinite;
+}
+@keyframes loading-shimmer { /* Moves the band from fully left of the element to fully right. */
+    0% { left: -100%; }
+    100% { left: 100%; }
+}
+/* Responsive Design */
+@media (max-width: 1200px) { /* Up to 1200px: narrower second column and tighter spacing. */
+    .main-content {
+        grid-template-columns: 1fr 300px;
+        gap: 1.5rem;
+    }
+    .sidebar {
+        gap: 1rem;
+    }
+    .metrics-panel, .history-panel, .system-panel {
+        padding: 1rem;
+    }
+}
+@media (max-width: 768px) { /* Up to 768px: single-column layout. */
+    .header {
+        padding: 1rem;
+    }
+    .header-content { /* Header content stacks vertically. */
+        flex-direction: column;
+        gap: 1rem;
+        align-items: flex-start;
+    }
+    .header-stats {
+        gap: 1rem;
+        width: 100%;
+        justify-content: space-around;
+    }
+    .main-content { /* Grid collapses to one column. */
+        grid-template-columns: 1fr;
+        padding: 1rem;
+        gap: 1rem;
+    }
+    #webcam {
+        height: 300px;
+    }
+    .controls-section {
+        flex-direction: column;
+        align-items: stretch;
+    }
+    .main-controls, .advanced-controls {
+        justify-content: center;
+        flex-wrap: wrap;
+    }
+    .caption-overlay {
+        padding: 1rem;
+    }
+    #caption-text {
+        font-size: 1.25rem;
+    }
+    .metrics-grid {
+        grid-template-columns: 1fr;
+    }
+    .status-bar {
+        padding: 0.5rem 1rem;
+        flex-direction: column;
+        gap: 0.5rem;
+        text-align: center;
+    }
+    .keyboard-hint { /* Hidden at this width. */
+        display: none;
+    }
+    .toast {
+        min-width: 280px;
+        margin: 0 1rem;
+    }
+    .modal-content {
+        width: 95%;
+        margin: 1rem;
+    }
+}
+@media (max-width: 480px) { /* Up to 480px: smaller type, padding and video height. */
+    .header {
+        padding: 0.75rem;
+    }
+    .logo-section {
+        gap: 0.5rem;
+    }
+    .logo-icon {
+        font-size: 1.5rem;
+    }
+    .header h1 {
+        font-size: 1.25rem;
+    }
+    .subtitle {
+        font-size: 0.8125rem;
+    }
+    .main-content {
+        padding: 0.75rem;
+    }
+    #webcam {
+        height: 250px;
+    }
+    .btn {
+        padding: 0.625rem 1rem;
+        font-size: 0.8125rem;
+    }
+    .main-controls, .advanced-controls {
+        gap: 0.75rem;
+    }
+    #caption-text {
+        font-size: 1.125rem;
+    }
+    .caption-metadata { /* Metadata stacks vertically, so the bar gets an explicit full width. */
+        flex-direction: column;
+        align-items: flex-start;
+        gap: 0.5rem;
+    }
+    .confidence-bar {
+        width: 100%;
+    }
+    .toast {
+        min-width: 260px;
+    }
+}
+/* Dark mode enhancements */
+@media (prefers-color-scheme: dark) { /* Applies when the user's system requests a dark colour scheme. */
+    body { /* Same gradient as the base body rule with a different middle stop (#0a0d14 instead of #0f1419). */
+        background: linear-gradient(135deg, var(--bg-primary) 0%, #0a0d14 50%, var(--bg-secondary) 100%);
+    }
+}
+/* High contrast mode support */
+/*
+ * Applies when the user asks for increased contrast. The standard keyword for
+ * that request is `more`; `high` is not a valid value for this media feature,
+ * so the previous query never matched. Borders become more opaque and
+ * secondary text lighter.
+ */
+@media (prefers-contrast: more) {
+    :root {
+        --border-color: rgba(255, 255, 255, 0.3);
+        --border-active: var(--primary-color);
+        --text-secondary: #d1d5db;
+    }
+}
+/* Reduced motion support */
+@media (prefers-reduced-motion: reduce) { /* Applies when the user asks for reduced motion. */
+    * { /* Near-zero durations so animations and transitions finish almost instantly, once. */
+        animation-duration: 0.01ms !important;
+        animation-iteration-count: 1 !important;
+        transition-duration: 0.01ms !important;
+    }
+    .recording-dot { /* The looping pulse is switched off entirely. */
+        animation: none;
+    }
+    .loading::after { /* So is the shimmer. */
+        animation: none;
+    }
+}
+/* Print styles */
+@media print { /* Printing: hide the surrounding chrome and flatten the video frame. */
+    .header, .sidebar, .controls-section, .status-bar, .toast-container {
+        display: none;
+    }
+    .main-content {
+        grid-template-columns: 1fr;
+        padding: 0;
+    }
+    .video-container { /* Plain black border, no rounding or shadow. */
+        border: 2px solid #000;
+        border-radius: 0;
+        box-shadow: none;
+    }
+}

templates/index.html ADDED Viewed

	@@ -0,0 +1,234 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Real-Time AI Action Captioner Pro</title>
+    <!-- App stylesheet; URL is resolved by the template's url_for() helper -->
+    <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
+    <!-- Include Socket.IO client library (v4.5.2 from the CDN); it is loaded here,
+         ahead of the application script at the end of the body -->
+    <script src="https://cdn.socket.io/4.5.2/socket.io.min.js"></script>
+    <!-- Inter web font (weights 300 to 700) from Google Fonts, preceded by preconnect hints -->
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
+</head>
+<body>
+    <div class="app-container">
+        <!-- Header Section: logo, product name and subtitle on one side; live
+             status readouts (connection state and FPS) on the other -->
+        <header class="header">
+            <div class="header-content">
+                <div class="logo-section">
+                    <div class="logo-icon">🎥</div>
+                    <div>
+                        <h1>LiveSense AI</h1>
+                        <p class="subtitle">Real-Time Video Insights & Description</p>
+                    </div>
+                </div>
+                <!-- Initial placeholder values; presumably overwritten at runtime by static/app.js -->
+                <div class="header-stats">
+                    <div class="stat-item">
+                        <span class="stat-label">Status</span>
+                        <span class="stat-value" id="connection-status">Disconnected</span>
+                    </div>
+                    <div class="stat-item">
+                        <span class="stat-label">FPS</span>
+                        <span class="stat-value" id="fps-counter">0</span>
+                    </div>
+                </div>
+            </div>
+        </header>
+        <!-- Main Content -->
+        <main class="main-content">
+            <!-- Video section: webcam preview with its overlay (LIVE/quality badges,
+                 caption text and confidence readout), followed by the control buttons -->
+            <div class="video-section">
+                <!-- Video Container -->
+                <div class="video-container">
+                    <!-- No src here; the stream is presumably attached by static/app.js -->
+                    <video id="webcam" autoplay muted playsinline></video>
+                    <!-- Video Overlay Controls -->
+                    <div class="video-overlay">
+                        <div class="overlay-top">
+                            <div class="recording-indicator" id="recording-indicator">
+                                <div class="recording-dot"></div>
+                                <span>LIVE</span>
+                            </div>
+                            <div class="quality-indicator" id="quality-indicator">
+                                <span>HD</span>
+                            </div>
+                        </div>
+                        <!-- Caption Container: caption text plus confidence bar, percentage and timestamp -->
+                        <div class="caption-overlay">
+                            <div class="caption-content">
+                                <p id="caption-text">Ready to start captioning...</p>
+                                <div class="caption-metadata">
+                                    <div class="confidence-bar">
+                                        <div class="confidence-fill" id="confidence-fill"></div>
+                                    </div>
+                                    <span class="confidence-text" id="confidence-text">0%</span>
+                                    <span class="timestamp" id="caption-timestamp"></span>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+                <!-- Controls: start/stop in the main row, audio/settings/fullscreen in the secondary row -->
+                <div class="controls-section">
+                    <div class="main-controls">
+                        <button id="startButton" class="btn btn-primary">
+                            <span class="btn-icon">▶</span>
+                            Start Analysis
+                        </button>
+                        <!-- Starts out disabled -->
+                        <button id="stopButton" class="btn btn-secondary" disabled>
+                            <span class="btn-icon">⏹</span>
+                            Stop Analysis
+                        </button>
+                    </div>
+                    <div class="advanced-controls">
+                        <button id="muteButton" class="btn btn-outline">
+                            <span class="btn-icon">🔊</span>
+                            Audio
+                        </button>
+                        <button id="settingsButton" class="btn btn-outline">
+                            <span class="btn-icon">⚙️</span>
+                            Settings
+                        </button>
+                        <button id="fullscreenButton" class="btn btn-outline">
+                            <span class="btn-icon">⛶</span>
+                            Fullscreen
+                        </button>
+                    </div>
+                </div>
+            </div>
+            <!-- Sidebar: performance metrics, caption history and system information panels.
+                 All values below are initial placeholders; the id-bearing elements are
+                 presumably filled in at runtime by static/app.js -->
+            <aside class="sidebar">
+                <!-- Performance Metrics: latency, average confidence, frames processed, captions generated -->
+                <div class="metrics-panel">
+                    <h3>Performance Metrics</h3>
+                    <div class="metrics-grid">
+                        <div class="metric-item">
+                            <div class="metric-value" id="latency-value">0ms</div>
+                            <div class="metric-label">Latency</div>
+                        </div>
+                        <div class="metric-item">
+                            <div class="metric-value" id="accuracy-value">0%</div>
+                            <div class="metric-label">Avg Confidence</div>
+                        </div>
+                        <div class="metric-item">
+                            <div class="metric-value" id="processed-frames">0</div>
+                            <div class="metric-label">Frames Processed</div>
+                        </div>
+                        <div class="metric-item">
+                            <div class="metric-value" id="captions-count">0</div>
+                            <div class="metric-label">Captions Generated</div>
+                        </div>
+                    </div>
+                </div>
+                <!-- Recent Captions History: ships with a single "No captions yet" placeholder item -->
+                <div class="history-panel">
+                    <h3>Caption History</h3>
+                    <div class="history-list" id="history-list">
+                        <div class="history-item">
+                            <div class="history-text">No captions yet</div>
+                            <div class="history-meta">
+                                <span class="history-confidence">-</span>
+                                <span class="history-time">--:--</span>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+                <!-- System Info: device, model, resolution and cache hit rate; the model name is static text -->
+                <div class="system-panel">
+                    <h3>System Information</h3>
+                    <div class="system-info">
+                        <div class="info-item">
+                            <span class="info-label">Device:</span>
+                            <span class="info-value" id="device-info">Loading...</span>
+                        </div>
+                        <div class="info-item">
+                            <span class="info-label">Model:</span>
+                            <span class="info-value">BLIP-Base</span>
+                        </div>
+                        <div class="info-item">
+                            <span class="info-label">Resolution:</span>
+                            <span class="info-value" id="resolution-info">-</span>
+                        </div>
+                        <div class="info-item">
+                            <span class="info-label">Cache Hit Rate:</span>
+                            <span class="info-value" id="cache-info">0%</span>
+                        </div>
+                    </div>
+                </div>
+            </aside>
+        </main>
+        <!-- Settings Modal: frame rate, image quality, audio narration, confidence
+             threshold and low-light auto-pause controls, with reset/save actions.
+             Each label is bound to its control through for/id so the control has an
+             accessible name and clicking the label activates it. Element ids are
+             presumably looked up by static/app.js, so keep them stable. -->
+        <div class="modal" id="settingsModal">
+            <div class="modal-content">
+                <div class="modal-header">
+                    <h3>Settings</h3>
+                    <!-- Icon-only button: aria-label supplies the accessible name -->
+                    <button class="modal-close" id="closeSettings" aria-label="Close settings">&times;</button>
+                </div>
+                <div class="modal-body">
+                    <div class="setting-group">
+                        <label for="frameRateSelect">Frame Rate</label>
+                        <select id="frameRateSelect">
+                            <option value="10">10 FPS</option>
+                            <option value="15" selected>15 FPS</option>
+                            <option value="20">20 FPS</option>
+                            <option value="30">30 FPS</option>
+                        </select>
+                    </div>
+                    <div class="setting-group">
+                        <label for="qualitySlider">Image Quality</label>
+                        <input type="range" id="qualitySlider" min="0.3" max="0.9" step="0.1" value="0.7">
+                        <span id="qualityValue">70%</span>
+                    </div>
+                    <div class="setting-group">
+                        <label for="audioToggle">Audio Narration</label>
+                        <div class="toggle-switch">
+                            <input type="checkbox" id="audioToggle" checked>
+                            <span class="toggle-slider"></span>
+                        </div>
+                    </div>
+                    <div class="setting-group">
+                        <label for="confidenceSlider">Confidence Threshold</label>
+                        <input type="range" id="confidenceSlider" min="0.1" max="1.0" step="0.1" value="0.6">
+                        <span id="confidenceThreshold">60%</span>
+                    </div>
+                    <div class="setting-group">
+                        <label for="lowLightToggle">Auto-Pause on Low Light</label>
+                        <div class="toggle-switch">
+                            <input type="checkbox" id="lowLightToggle">
+                            <span class="toggle-slider"></span>
+                        </div>
+                    </div>
+                </div>
+                <div class="modal-footer">
+                    <button class="btn btn-outline" id="resetSettings">Reset to Default</button>
+                    <button class="btn btn-primary" id="saveSettings">Save Settings</button>
+                </div>
+            </div>
+        </div>
+        <!-- Status Bar: status message on the left (initial text below; presumably
+             updated by static/app.js) and a static keyboard-shortcut hint on the right -->
+        <div class="status-bar">
+            <div class="status-left">
+                <span id="status-message">Ready to start</span>
+            </div>
+            <div class="status-right">
+                <span class="keyboard-hint">Space: Start/Stop | M: Mute | S: Settings | F: Fullscreen</span>
+            </div>
+        </div>
+    </div>
+    <!-- Toast Notifications: empty container; toasts are presumably injected here at runtime -->
+    <div class="toast-container" id="toastContainer"></div>
+    <!-- Our application logic: placed last in the body, after all markup above
+         and after the Socket.IO client script included in the head -->
+    <script src="{{ url_for('static', filename='app.js') }}"></script>
+</body>
+</html>