Varsha Dewangan committed on
Commit
db2860e
·
1 Parent(s): 7b93c80

application file added

Browse files
Files changed (8) hide show
  1. .gitignore +0 -0
  2. README.md +1 -1
  3. app.py +701 -0
  4. dockerfile +30 -0
  5. requirements.txt +15 -0
  6. static/app.js +447 -0
  7. static/style.css +1103 -0
  8. templates/index.html +234 -0
.gitignore ADDED
Binary file (56 Bytes). View file
 
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Real Time Video Captioning
3
- emoji: 🌖
4
  colorFrom: purple
5
  colorTo: yellow
6
  sdk: docker
 
1
  ---
2
  title: Real Time Video Captioning
3
+ emoji: 🎥
4
  colorFrom: purple
5
  colorTo: yellow
6
  sdk: docker
app.py ADDED
@@ -0,0 +1,701 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import base64
3
+ import numpy as np
4
+ import torch
5
+ from flask import Flask, render_template, request
6
+ from flask_socketio import SocketIO, emit
7
+ from PIL import Image, ImageEnhance, ImageFilter
8
+ from io import BytesIO
9
+ import logging
10
+ import threading
11
+ import time
12
+ from transformers import BlipProcessor, BlipForConditionalGeneration
13
+ from collections import deque
14
+ import cv2
15
+ import asyncio
16
+ from concurrent.futures import ThreadPoolExecutor
17
+ import hashlib
18
+ import json
19
+ from datetime import datetime, timedelta
20
+ import queue
21
+
22
# ---- 1. ENHANCED SETUP ----
import os

# Suppress excessive logging from libraries
logging.getLogger('engineio').setLevel(logging.ERROR)
logging.getLogger('socketio').setLevel(logging.ERROR)

# --- Enhanced Configuration ---
FRAME_SKIP = 3  # Baseline: caption every Nth frame (handle_image adapts this)
IMAGE_SIZE = 224  # Square side, in pixels, that frames are resized to
BUFFER_SIZE = 5  # maxlen of each client's caption buffer
MIN_CONFIDENCE_DIFF = 0.03
MAX_WORKERS = 6  # Size of the caption worker thread pool
CACHE_SIZE = 500  # Maximum number of entries in the caption LRU cache
BATCH_SIZE = 4  # Reported by /status

# Advanced performance settings
ADAPTIVE_QUALITY = True  # Passed as enhance_quality to smart_preprocess_image
MIN_PROCESSING_INTERVAL = 0.1  # Minimum seconds between processed frames per client
SCENE_CHANGE_THRESHOLD = 0.15  # Fraction of differing hash bits that counts as a new scene
CAPTION_HISTORY_SIZE = 10  # maxlen of each client's caption history

# --- Flask & SocketIO App Initialization ---
app = Flask(__name__)
# Prefer a secret supplied by the deployment; the literal is kept only as a
# development fallback so existing setups keep working.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'your-very-secret-key!')
# NOTE(review): cors_allowed_origins="*" accepts connections from any origin.
socketio = SocketIO(app, async_mode='threading', logger=False, engineio_logger=False,
                    cors_allowed_origins="*", ping_timeout=60, ping_interval=25)
48
+
49
# --- Enhanced AI Model Setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Worker pool that runs caption inference off the Socket.IO handler threads.
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS, thread_name_prefix="caption_worker")
# Created for prioritised scheduling; nothing in this module reads it yet.
priority_queue = queue.PriorityQueue()

# Load the BLIP model, move it to the selected device and warm it up once.
try:
    print("Loading BLIP model with optimizations...")
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    model = model.to(device)
    model.eval()

    if device.type == 'cuda':
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False
        # The model is deliberately NOT passed through torch.jit.script: the
        # rest of this module calls model.generate(), which a scripted module
        # does not expose, and a scripting failure used to abort startup on
        # every CUDA host.
        from torch.cuda.amp import autocast, GradScaler
        USE_AMP = True
        scaler = GradScaler()
        print("CUDA optimizations enabled")
    else:
        USE_AMP = False

    # Warm up the model so the first real frame does not pay one-off setup costs.
    dummy_image = Image.new('RGB', (IMAGE_SIZE, IMAGE_SIZE), color='black')
    dummy_inputs = processor(dummy_image, return_tensors="pt").to(device)
    with torch.no_grad():
        _ = model.generate(**dummy_inputs, max_length=10)
    print("Model warmed up successfully!")

except Exception as e:
    print(f"Error loading BLIP model: {e}")
    # Exit with a non-zero status so supervisors/containers see the failure.
    raise SystemExit(1) from e
87
+
88
+ # --- Advanced Caching System ---
89
class LRUCache:
    """Thread-safe least-recently-used cache with a fixed capacity.

    Recency is tracked by the insertion order of ``self.cache`` (a plain
    ``dict``): the first key is the least recently used, the last key the most
    recently used. Every operation is O(1).
    """

    def __init__(self, max_size):
        """Create an empty cache holding at most ``max_size`` entries."""
        self.max_size = max_size
        self.cache = {}
        self.lock = threading.Lock()

    def get(self, key):
        """Return the value stored for ``key`` (marking it most recently used), or None."""
        with self.lock:
            if key not in self.cache:
                return None
            # Re-insert so the key moves to the "most recent" end of the dict.
            value = self.cache.pop(key)
            self.cache[key] = value
            return value

    def put(self, key, value):
        """Store ``value`` under ``key``, evicting the least recently used entry if full."""
        with self.lock:
            if self.max_size <= 0:
                # A zero-capacity cache stores nothing instead of raising.
                return
            if key in self.cache:
                del self.cache[key]
            elif len(self.cache) >= self.max_size:
                oldest = next(iter(self.cache))
                del self.cache[oldest]
            self.cache[key] = value

    def clear(self):
        """Remove every entry."""
        with self.lock:
            self.cache.clear()
121
+
122
# --- Advanced Frame Processing ---
# Per-client state. Every dict below is keyed by the Socket.IO session id;
# entries are created in handle_connect and removed in cleanup_client.
frame_counters: dict = {}       # frames received so far
processing_locks: dict = {}     # threading.Lock guarding one in-flight frame
caption_buffers: dict = {}      # deque(maxlen=BUFFER_SIZE)
last_captions: dict = {}        # most recent caption sent to the client
processing_times: dict = {}     # deque of recent per-frame processing times (seconds)
caption_history: dict = {}      # deque of {'caption', 'timestamp', 'scene_changed'}
last_processed_time: dict = {}  # time.time() of the last accepted frame
scene_features: dict = {}       # previous frame hash, for scene change detection

# Enhanced caching
caption_cache = LRUCache(CACHE_SIZE)  # image hash -> caption, shared by all clients
batch_queue: dict = {}                # per-client list, initialised empty on connect
135
+
136
+ # --- Smart Performance Monitor ---
137
class AdvancedPerformanceMonitor:
    """Thread-safe collector of frame-processing statistics."""

    def __init__(self):
        self.metrics = {
            'total_frames': 0,
            'processed_frames': 0,
            'cache_hits': 0,
            'cache_misses': 0,
            'batch_processed': 0,
            'scene_changes': 0,
            'processing_times': deque(maxlen=100),
            'start_time': time.time()
        }
        self.lock = threading.Lock()

    def log_frame(self, processing_time=None, cache_hit=False, batch_size=1, scene_change=False):
        """Record one frame.

        Args:
            processing_time: Seconds spent on the frame, or None when the frame
                was skipped without being processed.
            cache_hit: True when the caption came from the cache.
            batch_size: Frames handled together; only sizes above 1 are counted.
            scene_change: True when the frame was flagged as a new scene.
        """
        with self.lock:
            self.metrics['total_frames'] += 1
            processed = processing_time is not None
            if processed:
                self.metrics['processed_frames'] += 1
                self.metrics['processing_times'].append(processing_time)
                if batch_size > 1:
                    self.metrics['batch_processed'] += batch_size

            if cache_hit:
                self.metrics['cache_hits'] += 1
            elif processed:
                # Skipped frames never consulted the cache, so they are not misses.
                self.metrics['cache_misses'] += 1

            if scene_change:
                self.metrics['scene_changes'] += 1

    def get_stats(self):
        """Return a snapshot of the statistics.

        The returned dict always has the same keys, including before any frame
        has been processed (all rates are then 0).
        """
        with self.lock:
            times = list(self.metrics['processing_times'])
            total_frames = self.metrics['total_frames']
            processed_frames = self.metrics['processed_frames']
            cache_hits = self.metrics['cache_hits']
            cache_misses = self.metrics['cache_misses']
            batch_processed = self.metrics['batch_processed']
            scene_changes = self.metrics['scene_changes']

        avg_processing_time = sum(times) / len(times) if times else 0.0
        # Throughput of the processing step itself: frames per second of work.
        processing_fps = 1.0 / avg_processing_time if avg_processing_time > 0 else 0.0

        return {
            "avg_time": avg_processing_time,
            "cache_hit_rate": cache_hits / max(1, cache_hits + cache_misses),
            "processing_fps": processing_fps,
            "efficiency": processed_frames / max(1, total_frames),
            "total_frames": total_frames,
            "scene_changes": scene_changes,
            "batch_efficiency": batch_processed / max(1, processed_frames)
        }


perf_monitor = AdvancedPerformanceMonitor()
190
+
191
+ # --- Smart Image Preprocessing ---
192
def smart_preprocess_image(image, enhance_quality=True):
    """Return an RGB, centre-cropped, IMAGE_SIZE x IMAGE_SIZE copy of ``image``.

    When ``enhance_quality`` is true the image is first sharpened (1.2),
    given more contrast (1.1) and slightly more colour (1.05), in that order.
    """
    if image.mode != 'RGB':
        image = image.convert('RGB')

    if enhance_quality:
        enhancement_steps = (
            (ImageEnhance.Sharpness, 1.2),
            (ImageEnhance.Contrast, 1.1),
            (ImageEnhance.Color, 1.05),
        )
        for enhancer_cls, factor in enhancement_steps:
            image = enhancer_cls(image).enhance(factor)

    # Non-square frames are cropped to their central square before resizing,
    # so the resize below never distorts the aspect ratio.
    width, height = image.size
    if width != height:
        side = min(width, height)
        left = (width - side) // 2
        top = (height - side) // 2
        image = image.crop((left, top, left + side, top + side))

    return image.resize((IMAGE_SIZE, IMAGE_SIZE), Image.LANCZOS)
225
+
226
def advanced_hash_image(image):
    """Return a string hash of ``image``: 256 average-hash bits plus 8 hex chars.

    The first 256 characters are '0'/'1' flags telling whether each pixel of a
    16x16 grayscale thumbnail is brighter than the thumbnail's mean. The last
    8 characters are the start of the MD5 digest of the thumbnail's Canny edge map.
    """
    thumbnail = image.resize((16, 16), Image.LANCZOS).convert('L')

    values = list(thumbnail.getdata())
    mean_value = sum(values) / len(values)
    brightness_bits = ''.join('1' if value > mean_value else '0' for value in values)

    edge_map = cv2.Canny(np.array(thumbnail), 50, 150)
    edge_digest = hashlib.md5(edge_map.tobytes()).hexdigest()

    return brightness_bits + edge_digest[:8]
245
+
246
def detect_scene_change(sid, current_features):
    """Report whether ``current_features`` differs enough from the client's last frame.

    The client's stored features are always replaced by ``current_features``.
    Returns True for a client with no stored features, for features of a
    different length, or when the similarity over the first 256 characters
    falls below ``1 - SCENE_CHANGE_THRESHOLD``.
    """
    is_first = sid not in scene_features
    previous = None if is_first else scene_features[sid]
    scene_features[sid] = current_features

    if is_first or len(current_features) != len(previous):
        return True

    # Hamming distance over the bit part of the hash.
    mismatches = sum(
        new_char != old_char
        for new_char, old_char in zip(current_features[:256], previous[:256])
    )
    similarity = 1 - (mismatches / 256)
    return similarity < (1 - SCENE_CHANGE_THRESHOLD)
265
+
266
+ # ---- 2. ENHANCED WEBSOCKET HANDLERS ----
267
+
268
@socketio.on('connect')
def handle_connect():
    """Create fresh per-client state for the connecting session and greet it."""
    sid = request.sid
    print(f"Client connected: {sid}")

    # One (store, initial value) pair per piece of per-client state.
    initial_state = (
        (frame_counters, 0),
        (processing_locks, threading.Lock()),
        (caption_buffers, deque(maxlen=BUFFER_SIZE)),
        (last_captions, ""),
        (processing_times, deque(maxlen=20)),
        (caption_history, deque(maxlen=CAPTION_HISTORY_SIZE)),
        (last_processed_time, 0),
        (scene_features, ""),
        (batch_queue, []),
    )
    for store, value in initial_state:
        store[sid] = value

    # Tell the client it is connected and which device runs inference.
    emit('status', {'connected': True, 'device': str(device)})
287
+
288
@socketio.on('disconnect')
def handle_disconnect():
    """Log the disconnect and drop all state held for the session."""
    sid = request.sid
    print(f"Client disconnected: {sid}")
    cleanup_client(sid)
293
+
294
def cleanup_client(sid):
    """Remove ``sid`` from every per-client state dict; missing entries are ignored."""
    per_client_stores = (
        frame_counters, processing_locks, caption_buffers,
        last_captions, processing_times, caption_history,
        last_processed_time, scene_features, batch_queue,
    )
    for store in per_client_stores:
        store.pop(sid, None)
301
+
302
@socketio.on('image')
def handle_image(data_image):
    """Accept one frame from a client and, if allowed, queue it for captioning.

    A frame is dropped when it falls on a skipped position, arrives sooner
    than MIN_PROCESSING_INTERVAL after the last accepted frame, or while a
    previous frame for the same client is still being processed.

    Args:
        data_image: The encoded frame as sent by the client; it is passed
            unchanged to process_frame_advanced.
    """
    sid = request.sid

    # Initialize if not exists
    if sid not in frame_counters:
        handle_connect()

    frame_counters[sid] += 1
    current_time = time.time()

    # Adaptive frame skipping: slow processing skips more frames, fast
    # processing skips fewer.
    skip_factor = FRAME_SKIP
    recent_times = processing_times.get(sid)
    if recent_times:
        avg_time = np.mean(recent_times)
        if avg_time > 0.5:
            skip_factor = FRAME_SKIP * 2
        elif avg_time < 0.1:
            skip_factor = max(1, FRAME_SKIP // 2)

    if frame_counters[sid] % skip_factor != 0:
        perf_monitor.log_frame()  # Count skipped frames
        return

    # Rate limiting
    if current_time - last_processed_time.get(sid, 0) < MIN_PROCESSING_INTERVAL:
        return

    # The lock is taken here and released by process_frame_advanced, so at
    # most one frame per client is in flight.
    client_lock = processing_locks[sid]
    if not client_lock.acquire(blocking=False):
        return

    last_processed_time[sid] = current_time

    try:
        executor.submit(process_frame_advanced, sid, data_image, 1)
    except Exception:
        # The worker will never run, so nobody else would release the lock;
        # without this the client could never have another frame processed.
        client_lock.release()
        raise
340
+
341
# Placeholder strings returned by generate_caption_advanced and
# enhance_caption_advanced when no real caption was produced. They must not be
# cached, or one transient failure would be replayed for that image forever.
_PLACEHOLDER_CAPTIONS = frozenset({"Processing scene...", "Analyzing scene..."})


def process_frame_advanced(sid, data_image, priority=1):
    """Decode one frame, caption it and emit the result to the client.

    Runs on a worker thread. handle_image acquires the client's processing
    lock before submitting this function; the lock is released here.

    Args:
        sid: Socket.IO session id of the client that sent the frame.
        data_image: Base64 image text, optionally prefixed by a data-URL
            header ending in a comma.
        priority: Accepted for interface compatibility; not used.
    """
    start_time = time.time()
    # Remember the exact lock object so it is released even if the client's
    # state is removed (disconnect) while this frame is being processed.
    held_lock = processing_locks.get(sid)

    try:
        if sid not in frame_counters:
            # Client disconnected before the worker started; nothing to do.
            return

        # Decode image, accepting both "data:...;base64,<payload>" and a bare payload.
        header, separator, payload = data_image.partition(',')
        image_data = base64.b64decode(payload if separator else header)
        img = Image.open(BytesIO(image_data))

        # Smart preprocessing
        img = smart_preprocess_image(img, enhance_quality=ADAPTIVE_QUALITY)

        # Generate advanced hash
        img_hash = advanced_hash_image(img)

        # Scene change detection
        scene_changed = detect_scene_change(sid, img_hash)

        # Check cache first
        cached_caption = caption_cache.get(img_hash)
        if cached_caption and not scene_changed:
            caption = cached_caption
            cache_hit = True
        else:
            # Generate new caption
            caption = generate_caption_advanced(img)
            if caption not in _PLACEHOLDER_CAPTIONS:
                caption_cache.put(img_hash, caption)
            cache_hit = False

        # Smart caption updating with context
        if should_update_caption_advanced(sid, caption, scene_changed):
            # Add to caption history
            caption_history[sid].append({
                'caption': caption,
                'timestamp': time.time(),
                'scene_changed': scene_changed
            })

            last_captions[sid] = caption

            # Enhanced caption with context
            contextual_caption = add_context_to_caption(sid, caption)

            print(f"New caption for {sid}: {contextual_caption}")

            # Send enhanced response
            socketio.emit('caption', {
                'caption': contextual_caption,
                'raw_caption': caption,
                'timestamp': time.time(),
                'confidence': 0.95 if not cache_hit else 1.0,
                'scene_changed': scene_changed,
                'processing_time': time.time() - start_time
            }, room=sid)

        # Update performance metrics
        processing_time = time.time() - start_time
        processing_times[sid].append(processing_time)
        perf_monitor.log_frame(processing_time, cache_hit, scene_change=scene_changed)

        # Periodic performance logging
        if frame_counters[sid] % 100 == 0:
            stats = perf_monitor.get_stats()
            print(f"Client {sid}: Avg: {stats['avg_time']:.3f}s, Cache: {stats['cache_hit_rate']:.2f}, "
                  f"Efficiency: {stats['efficiency']:.2f}, Scene changes: {stats['scene_changes']}")

    except Exception as e:
        print(f"Error processing frame for {sid}: {e}")
        socketio.emit('caption', {
            'caption': f"Processing error: {str(e)[:50]}...",
            'timestamp': time.time(),
            'confidence': 0.0,
            'error': True
        }, room=sid)

    finally:
        if held_lock is not None and held_lock.locked():
            held_lock.release()
419
+
420
def should_update_caption_advanced(sid, new_caption, scene_changed):
    """Decide whether ``new_caption`` should replace the client's current caption.

    Returns True on a scene change, for a client without a usable previous
    caption, when the new caption breaks a run of identical captions, when the
    set of high-priority action words differs, when more than one object word
    differs, or when the weighted word overlap is below 0.75.
    """
    if scene_changed or sid not in last_captions:
        return True

    previous = last_captions[sid]

    # Empty, error-like or placeholder captions are always replaced.
    if not previous or "error" in previous.lower() or previous == "Processing...":
        return True

    # A different caption after a run of identical ones always gets through.
    if sid in caption_history and len(caption_history[sid]) > 1:
        recent = {entry['caption'] for entry in list(caption_history[sid])[-3:]}
        if len(recent) == 1 and new_caption not in recent:
            return True

    high_priority_words = {'walking', 'running', 'sitting', 'standing', 'jumping', 'dancing',
                           'eating', 'drinking', 'driving', 'flying', 'swimming', 'climbing'}
    medium_priority_words = {'holding', 'wearing', 'looking', 'pointing', 'smiling', 'talking',
                             'reading', 'writing', 'playing', 'working', 'sleeping'}
    objects_words = {'car', 'bike', 'phone', 'book', 'cup', 'computer', 'dog', 'cat', 'bird'}

    def weigh(words):
        """Sum word weights: 3 for high-priority, 2 for medium-priority, 1 otherwise."""
        total = 0
        for word in words:
            if word in high_priority_words:
                total += 3
            elif word in medium_priority_words:
                total += 2
            else:
                total += 1
        return total

    prev_words = set(previous.lower().split())
    next_words = set(new_caption.lower().split())

    # Any change in the main action is significant.
    if prev_words & high_priority_words != next_words & high_priority_words:
        return True

    # So is a change of more than one known object.
    changed_objects = (prev_words & objects_words) ^ (next_words & objects_words)
    if len(changed_objects) > 1:
        return True

    if not prev_words | next_words:
        return True

    combined_weight = weigh(prev_words) + weigh(next_words)
    if combined_weight > 0:
        weighted_similarity = (2 * weigh(prev_words & next_words)) / combined_weight
    else:
        weighted_similarity = 0

    return weighted_similarity < 0.75
478
+
479
def add_context_to_caption(sid, caption):
    """Append " (continuing)" when a recent earlier caption shows the same action.

    Looks at the client's last three history entries, excluding the newest
    one, and compares their action words with those in ``caption``. Returns
    ``caption`` unchanged when the client has fewer than two history entries,
    when ``caption`` has no action word, or when no earlier caption matches.
    """
    if sid not in caption_history or len(caption_history[sid]) < 2:
        return caption

    action_words = {'walking', 'running', 'sitting', 'standing', 'eating', 'drinking'}

    def actions_in(text):
        return set(text.lower().split()) & action_words

    current_actions = actions_in(caption)
    if not current_actions:
        return caption

    window = [entry['caption'] for entry in list(caption_history[sid])[-3:]]
    earlier = window[:-1]
    if any(actions_in(text) == current_actions for text in earlier):
        return f"{caption} (continuing)"

    return caption
497
+
498
def generate_caption_advanced(image):
    """Generate a caption for ``image`` with the BLIP model.

    Args:
        image: A PIL image, as produced by smart_preprocess_image.

    Returns:
        The caption after enhance_caption_advanced, or the placeholder
        "Processing scene..." if anything raised during generation.
    """
    try:
        inputs = processor(image, return_tensors="pt").to(device)

        # NOTE: do_sample=True means the same image can yield different captions.
        generation_kwargs = {
            'max_length': 30,
            'min_length': 8,
            'num_beams': 5,
            'do_sample': True,
            'temperature': 0.8,
            'top_p': 0.95,
            'top_k': 50,
            'early_stopping': True,
            'no_repeat_ngram_size': 3,
            'length_penalty': 1.1,
            'repetition_penalty': 1.2
        }

        # torch.autocast is always available, unlike the `autocast` name that
        # this module only imports on the CUDA path. It is a no-op when disabled.
        amp_enabled = bool(USE_AMP) and device.type == 'cuda'
        with torch.no_grad(), torch.autocast(device_type=device.type, enabled=amp_enabled):
            generated_ids = model.generate(**inputs, **generation_kwargs)

        caption = processor.decode(generated_ids[0], skip_special_tokens=True)
        return enhance_caption_advanced(caption)

    except Exception as e:
        print(f"Error in generate_caption_advanced: {e}")
        return "Processing scene..."
532
+
533
def enhance_caption_advanced(caption):
    """Clean up a raw model caption and add light wording variety.

    Steps: strip whitespace, drop one leading boilerplate prefix ("a picture
    of", ...), apply fixed phrase rewrites, optionally swap some action words
    for a synonym, and capitalize the first letter.

    The synonym choice is driven by a private random generator seeded from the
    caption text, so a given caption always maps to the same result and the
    global ``random`` state is left untouched.

    Args:
        caption: Raw caption text.

    Returns:
        The enhanced caption, or "Analyzing scene..." for blank input.
    """
    import random
    import re

    caption = caption.strip()
    if not caption:
        return "Analyzing scene..."

    # Remove one common leading prefix.
    prefixes_to_remove = (
        "a picture of ", "an image of ", "this is ", "there is ", "there are ",
        "the image shows ", "this image shows ", "a photo of ", "a photograph of "
    )
    caption_lower = caption.lower()
    for prefix in prefixes_to_remove:
        if caption_lower.startswith(prefix):
            caption = caption[len(prefix):]
            break

    preposition_swaps = {'on': 'at', 'in': 'within'}

    def swap_preposition(match):
        # Only the trailing preposition is rewritten; a plain str.replace on
        # the whole phrase would also hit the "in" inside "standing"/"sitting".
        verb, preposition = match.group(3).split(' ')
        return f"{match.group(1)} {verb} {preposition_swaps[preposition.lower()]}"

    replacements = (
        (r'\b(man|woman|person) (is )?(sitting on|standing in|walking on)\b', swap_preposition),
        (r'\bholding a\b', 'holding'),
        (r'\bwearing a\b', 'wearing'),
        (r'\blooking at the\b', 'observing the'),
        (r'\bstanding next to\b', 'beside'),
        (r'\bwalking down\b', 'walking along'),
        (r'\bsitting at\b', 'seated at'),
    )
    for pattern, replacement in replacements:
        caption = re.sub(pattern, replacement, caption, flags=re.IGNORECASE)

    # Whole-word patterns so e.g. "withholding" is never touched; "looking at"
    # is matched as a unit so a synonym never leaves a dangling or doubled "at".
    action_variations = (
        (r'\bwalking\b', ['strolling', 'moving', 'walking']),
        (r'\bsitting\b', ['seated', 'resting', 'sitting']),
        (r'\bstanding\b', ['positioned', 'standing', 'upright']),
        (r'\bholding\b', ['grasping', 'carrying', 'holding']),
        (r'\blooking at\b', ['observing', 'viewing', 'watching', 'looking at']),
    )

    # hash() of a str differs between interpreter runs, so derive the seed
    # from a digest of the text instead.
    seed = int(hashlib.md5(caption.encode('utf-8')).hexdigest(), 16) % 1000
    rng = random.Random(seed)

    for pattern, variations in action_variations:
        if re.search(pattern, caption, flags=re.IGNORECASE):
            if rng.random() < 0.3:  # 30% chance to vary
                caption = re.sub(pattern, rng.choice(variations), caption, flags=re.IGNORECASE)

    # Capitalize last so a synonym substituted at the start is capitalized too.
    if caption and not caption[0].isupper():
        caption = caption[0].upper() + caption[1:]

    return caption
593
+
594
+ # ---- 3. ENHANCED FLASK ROUTES ----
595
+
596
@app.route('/')
def index():
    """Serve the main page from the index.html template."""
    page = render_template('index.html')
    return page
600
+
601
@app.route('/status')
def status():
    """Return server status: connection count, configuration, performance and cache info."""
    performance = perf_monitor.get_stats()

    configuration = {
        'frame_skip': FRAME_SKIP,
        'image_size': IMAGE_SIZE,
        'buffer_size': BUFFER_SIZE,
        'cache_size': CACHE_SIZE,
        'batch_size': BATCH_SIZE,
        'adaptive_quality': ADAPTIVE_QUALITY
    }
    cache_info = {
        'size': len(caption_cache.cache),
        'max_size': CACHE_SIZE
    }
    optimizations = {
        'mixed_precision': USE_AMP,
        'torch_script': device.type == 'cuda',
        'thread_pool_size': MAX_WORKERS
    }

    return {
        'active_connections': len(frame_counters),
        'device': str(device),
        'configuration': configuration,
        'performance': performance,
        'cache_info': cache_info,
        'optimizations': optimizations
    }
627
+
628
@app.route('/metrics')
def metrics():
    """Return global statistics, per-client statistics and device information.

    Only clients with at least one recorded processing time are listed.
    """
    stats = perf_monitor.get_stats()

    # Client-specific metrics. Connect/disconnect handlers and worker threads
    # mutate the per-client dicts concurrently, so iterate over snapshots and
    # use .get() instead of indexing.
    client_metrics = {}
    for sid in list(frame_counters):
        samples = list(processing_times.get(sid) or ())
        if not samples:
            continue
        client_metrics[sid] = {
            'frames_processed': frame_counters.get(sid, 0),
            'avg_processing_time': float(np.mean(samples)),
            'caption_history_size': len(caption_history.get(sid, [])),
            'last_caption': last_captions.get(sid, "None")
        }

    cuda_available = torch.cuda.is_available()
    return {
        'global_metrics': stats,
        'client_metrics': client_metrics,
        'system_info': {
            'device': str(device),
            'cuda_available': cuda_available,
            'cuda_memory': torch.cuda.get_device_properties(0).total_memory if cuda_available else None
        }
    }
653
+
654
@app.route('/clear_cache')
def clear_cache():
    """Empty the shared caption cache and report when it happened."""
    caption_cache.clear()
    response = {'status': 'cache_cleared', 'timestamp': time.time()}
    return response
659
+
660
@app.route('/config', methods=['GET', 'POST'])
def config():
    """Read (GET) or update (POST) the runtime-tunable settings.

    POST expects a JSON object with any of ``frame_skip`` (int, clamped to at
    least 1), ``adaptive_quality`` (truthy/falsy) and
    ``scene_change_threshold`` (number between 0 and 1). All supplied values
    are validated before any of them is applied; invalid input yields a 400
    response and leaves the settings unchanged.
    """
    global FRAME_SKIP, ADAPTIVE_QUALITY, SCENE_CHANGE_THRESHOLD

    if request.method == 'POST':
        # silent=True: a missing or malformed JSON body gives None, not an exception.
        config_data = request.get_json(silent=True)
        if not isinstance(config_data, dict):
            return {'status': 'error', 'message': 'Expected a JSON object body'}, 400

        new_frame_skip = FRAME_SKIP
        new_adaptive_quality = ADAPTIVE_QUALITY
        new_threshold = SCENE_CHANGE_THRESHOLD
        try:
            if 'frame_skip' in config_data:
                new_frame_skip = max(1, int(config_data['frame_skip']))
            if 'adaptive_quality' in config_data:
                new_adaptive_quality = bool(config_data['adaptive_quality'])
            if 'scene_change_threshold' in config_data:
                new_threshold = float(config_data['scene_change_threshold'])
        except (TypeError, ValueError):
            return {'status': 'error', 'message': 'Invalid configuration value'}, 400

        # The threshold is compared against a fraction of differing hash bits,
        # so only values in [0, 1] are meaningful (this also rejects NaN).
        if not 0.0 <= new_threshold <= 1.0:
            return {'status': 'error',
                    'message': 'scene_change_threshold must be between 0 and 1'}, 400

        FRAME_SKIP = new_frame_skip
        ADAPTIVE_QUALITY = new_adaptive_quality
        SCENE_CHANGE_THRESHOLD = new_threshold

        return {'status': 'updated', 'config': {
            'frame_skip': FRAME_SKIP,
            'adaptive_quality': ADAPTIVE_QUALITY,
            'scene_change_threshold': SCENE_CHANGE_THRESHOLD
        }}

    return {
        'frame_skip': FRAME_SKIP,
        'adaptive_quality': ADAPTIVE_QUALITY,
        'scene_change_threshold': SCENE_CHANGE_THRESHOLD
    }
685
+
686
# ---- 4. ENHANCED STARTUP ----
if __name__ == '__main__':
    # Print the startup banner, then serve on all interfaces, port 5000.
    divider = "=" * 60
    banner_lines = [
        divider,
        "🚀 Starting Enhanced Real-Time Video Captioning Server",
        divider,
        f"📱 Device: {device}",
        f"🎯 Image Processing: {IMAGE_SIZE}x{IMAGE_SIZE}",
        f"⚡ Frame Skip: {FRAME_SKIP} (adaptive)",
        f"🧠 Mixed Precision: {USE_AMP}",
        f"🔄 Thread Pool: {MAX_WORKERS} workers",
        f"💾 Cache Size: {CACHE_SIZE} entries (LRU)",
        f"🎨 Quality Enhancement: {ADAPTIVE_QUALITY}",
        "🔍 Scene Change Detection: Enabled",
        divider,
    ]
    for line in banner_lines:
        print(line)

    socketio.run(app, host='0.0.0.0', port=5000, debug=False, allow_unsafe_werkzeug=True)
dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile
2
+
3
+ # Use an official Python runtime as a parent image
4
+ FROM python:3.11-slim
5
+
6
+ # Set the working directory in the container
7
+ WORKDIR /app
8
+
9
+ # Copy the requirements file into the container
10
+ COPY requirements.txt .
11
+
12
+ # Install any needed packages specified in requirements.txt
13
+ # --no-cache-dir ensures the image is smaller
14
+ RUN pip install --no-cache-dir -r requirements.txt
15
+
16
+ # Copy the rest of your application code into the container
17
+ COPY . .
18
+
19
+ # Make port 5000 available to the world outside this container
20
+ # Your app runs on port 5000 according to your script
21
+ EXPOSE 5000
22
+
23
+ # Define environment variables if needed
24
+ ENV FLASK_APP=app.py
25
+
26
+ # Command to run your application using gunicorn for production
27
+ # This is more robust than `flask run`
28
+ # CMD ["gunicorn", "--workers", "1", "--threads", "4", "--bind", "0.0.0.0:5000", "--log-level", "info", "app:app"]
29
+
30
+ CMD ["python", "app.py"]
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy
2
+ torch
3
+ torchvision
4
+ flask
5
+ flask_socketio
6
+ pillow
7
+ opencv-python
8
+ transformers
9
+ accelerate
10
+ opencv-contrib-python
11
+ ultralytics
12
+ gunicorn
13
+ python-engineio
14
+ gevent-websocket
15
+ opencv-python-headless
static/app.js ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ document.addEventListener('DOMContentLoaded', () => {
2
+ // ---- 1. DOM Element References ----
3
// Maps each property used by the script to the id of the DOM node it refers to.
const ELEMENT_IDS = {
    // Video area and live caption overlay
    webcam: 'webcam',
    captionText: 'caption-text',
    confidenceFill: 'confidence-fill',
    confidenceText: 'confidence-text',
    captionTimestamp: 'caption-timestamp',
    // Control buttons
    startButton: 'startButton',
    stopButton: 'stopButton',
    muteButton: 'muteButton',
    settingsButton: 'settingsButton',
    fullscreenButton: 'fullscreenButton',
    // Status and metrics read-outs
    connectionStatus: 'connection-status',
    fpsCounter: 'fps-counter',
    recordingIndicator: 'recording-indicator',
    latencyValue: 'latency-value',
    accuracyValue: 'accuracy-value',
    processedFrames: 'processed-frames',
    captionsCount: 'captions-count',
    historyList: 'history-list',
    deviceInfo: 'device-info',
    resolutionInfo: 'resolution-info',
    cacheInfo: 'cache-info',
    // Settings modal and its controls
    settingsModal: 'settingsModal',
    closeSettings: 'closeSettings',
    saveSettings: 'saveSettings',
    resetSettings: 'resetSettings',
    frameRateSelect: 'frameRateSelect',
    qualitySlider: 'qualitySlider',
    qualityValue: 'qualityValue',
    audioToggle: 'audioToggle',
    // Miscellaneous
    statusMessage: 'status-message',
    toastContainer: 'toastContainer'
};

// Resolve every id once, up front; a missing node shows up as `null` here.
const elements = Object.fromEntries(
    Object.entries(ELEMENT_IDS).map(([key, id]) => [key, document.getElementById(id)])
);

// ---- 2. Application State & Settings ----
let socket;               // Socket.IO connection, created by connectSocket()
let stream;               // MediaStream returned by getUserMedia()
let frameSenderInterval;  // handle of the setInterval that emits frames
let isCapturing = false;  // true between a successful start and the next stop
let captionHistory = [];  // most recent captions, newest first

// User-tunable options edited through the settings modal.
let settings = {
    frameRate: 15,
    quality: 0.7,
    audio: true
};

// Session counters. NOTE: inside this closure the name shadows the browser's
// global `performance` object.
let performance = {
    sentFrames: 0,
    receivedFrames: 0,
    captionsGenerated: 0,
    totalConfidence: 0,
    startTime: 0,
    latencyBuffer: []
};

// Number of latency samples kept for the rolling average.
const LATENCY_BUFFER_SIZE = 20;
60
+
61
+ // ---- 3. Core Application Logic ----
62
+
63
/**
 * Starts the video analysis process: requests the webcam, starts playback in
 * the <video> element, switches the UI to the "capturing" state and opens the
 * socket connection.
 *
 * On any failure the camera is released again, an error toast is shown and
 * the UI is returned to the stopped state.
 *
 * @returns {Promise<void>}
 */
const startAnalysis = async () => {
    // The start button is disabled for the whole time a start is in flight
    // (and while capturing), so it doubles as a re-entrancy guard: a second
    // click or Space press during the permission prompt must not open a
    // second camera stream.
    if (isCapturing || elements.startButton.disabled) return;
    elements.startButton.disabled = true;

    let pendingStream = null;
    try {
        // Get webcam stream
        pendingStream = await navigator.mediaDevices.getUserMedia({
            video: {
                width: { ideal: 1280 },
                height: { ideal: 720 },
                frameRate: { ideal: 30 }
            },
            audio: false
        });
        elements.webcam.srcObject = pendingStream;
        await elements.webcam.play();

        stream = pendingStream;
        isCapturing = true;

        // Update UI
        updateUIForStartState();
        connectSocket();
    } catch (err) {
        console.error("Error accessing webcam:", err);

        // If the stream was obtained but a later step failed, stop its tracks
        // so the camera is not left running with nothing consuming it.
        pendingStream?.getTracks().forEach((track) => track.stop());
        elements.webcam.srcObject = null;
        stream = null;
        isCapturing = false;

        showToast("Webcam Error", "Could not access the webcam. Please check permissions.", "error");
        updateUIForStopState(); // also re-enables the start button
    }
};
93
+
94
/**
 * Stops the video analysis process: stops frame sending, releases the camera,
 * closes the socket, silences speech output and resets the UI.
 *
 * Safe to call when nothing is running (it returns immediately).
 */
const stopAnalysis = () => {
    if (!isCapturing) return;

    // Clear the flag before tearing anything down. The socket's 'disconnect'
    // handler calls stopAnalysis() again whenever isCapturing is still true,
    // and it can run before socket.disconnect() below returns; with the flag
    // already cleared that nested call is a no-op instead of repeating the
    // teardown and showing the "Analysis Stopped" toast twice.
    isCapturing = false;

    // Stop intervals and streams
    clearInterval(frameSenderInterval);
    frameSenderInterval = null;
    stream?.getTracks().forEach((track) => track.stop());
    socket?.disconnect();

    // Cancel any ongoing speech (the API may be absent in some browsers)
    window.speechSynthesis?.cancel();

    // Reset state
    elements.webcam.srcObject = null;
    updateUIForStopState();
    showToast("Analysis Stopped", "Real-time captioning has been turned off.", "info");
};
115
+
116
/**
 * Opens the Socket.IO connection and registers the handlers for the
 * 'connect', 'caption', 'disconnect' and 'connect_error' events.
 */
const connectSocket = () => {
    // Connect back to the origin that served this page, using the WebSocket
    // transport only (no transport upgrade).
    const socketOptions = { transports: ['websocket'], upgrade: false };
    socket = io(window.location.origin, socketOptions);

    // Writes the connection label and its colour in one place.
    const showConnectionStatus = (label, color) => {
        elements.connectionStatus.textContent = label;
        elements.connectionStatus.style.color = color;
    };

    const onConnect = () => {
        console.log('Connected to server! SID:', socket.id);
        showConnectionStatus("Connected", 'var(--success-color)');
        showToast("Connected", "Successfully connected to the AI server.", "success");
        startFrameSending();
    };

    const onDisconnect = () => {
        console.log('Disconnected from server.');
        showConnectionStatus("Disconnected", 'var(--danger-color)');
        // Losing the connection while capturing ends the session.
        if (isCapturing) {
            stopAnalysis();
        }
    };

    const onConnectError = (error) => {
        console.error('Connection error:', error);
        showToast("Connection Error", "Failed to connect to the server.", "error");
        stopAnalysis();
    };

    socket.on('connect', onConnect);
    socket.on('caption', handleCaption);
    socket.on('disconnect', onDisconnect);
    socket.on('connect_error', onConnectError);
};
151
+
152
/**
 * Initializes the interval for sending video frames to the server.
 *
 * Each tick draws the current webcam frame onto an off-screen canvas,
 * encodes it as a JPEG data URL using `settings.quality`, and emits it as an
 * 'image' event. The tick rate is `settings.frameRate` frames per second.
 * Any sender that is already running is replaced.
 */
const startFrameSending = () => {
    // Never run two senders at once: this function is called both from the
    // socket's 'connect' handler and from saveSettings(), and an interval
    // whose handle gets overwritten could never be cleared again.
    clearInterval(frameSenderInterval);

    const canvas = document.createElement('canvas');
    // 384x384 is meant to match the server's expected image size. Sized once
    // here rather than on every tick, because assigning width/height resets
    // the canvas each time.
    canvas.width = 384;
    canvas.height = 384;
    const context = canvas.getContext('2d', { alpha: false });

    frameSenderInterval = setInterval(() => {
        if (!isCapturing || elements.webcam.paused || elements.webcam.ended) {
            return;
        }
        // Skip the frame instead of emitting while the socket is not
        // connected, so stale frames do not pile up for later delivery.
        if (!socket?.connected) {
            return;
        }

        context.drawImage(elements.webcam, 0, 0, canvas.width, canvas.height);
        const dataUrl = canvas.toDataURL('image/jpeg', settings.quality);
        socket.emit('image', dataUrl);
        performance.sentFrames++;
        updatePerformanceUI();
    }, 1000 / settings.frameRate);
};
175
+
176
+ // ---- 4. UI Update Functions ----
177
+
178
/**
 * Handles a 'caption' event from the server: updates counters, the caption
 * overlay, the latency samples and the history list, and optionally speaks
 * the caption.
 *
 * @param {object} data - The caption payload. This handler reads
 *   `data.caption`, `data.confidence` (multiplied by 100 for display) and
 *   `data.timestamp` (multiplied by 1000 to build a Date, i.e. treated as
 *   seconds).
 */
const handleCaption = (data) => {
    const { caption, confidence, timestamp: sentAtSeconds } = data;

    // Session counters
    performance.receivedFrames += 1;
    performance.captionsGenerated += 1;
    performance.totalConfidence += confidence;

    // Main caption display
    const confidencePercent = (confidence * 100).toFixed(0);
    const captionDate = new Date(sentAtSeconds * 1000);
    elements.captionText.textContent = caption;
    elements.confidenceFill.style.width = `${confidencePercent}%`;
    elements.confidenceText.textContent = `${confidencePercent}%`;
    elements.captionTimestamp.textContent = captionDate.toLocaleTimeString();

    // Rolling latency window: append the newest sample, drop the oldest once
    // the buffer exceeds LATENCY_BUFFER_SIZE entries.
    const samples = performance.latencyBuffer;
    samples.push((Date.now() / 1000) - sentAtSeconds);
    if (samples.length > LATENCY_BUFFER_SIZE) {
        samples.shift();
    }

    // History panel
    updateHistory(caption, confidencePercent, captionDate);

    // Spoken output, only when audio is enabled
    if (settings.audio) {
        speakCaption(caption);
    }
};
210
+
211
/**
 * Updates the UI to reflect the "capturing started" state and resets all
 * per-session data (performance counters and caption history).
 */
const updateUIForStartState = () => {
    elements.startButton.disabled = true;
    elements.stopButton.disabled = false;
    elements.recordingIndicator.classList.add('active');
    elements.statusMessage.textContent = "AI analysis is active...";

    // Reset performance metrics
    performance = {
        sentFrames: 0,
        receivedFrames: 0,
        captionsGenerated: 0,
        totalConfidence: 0,
        startTime: Date.now(),
        latencyBuffer: []
    };
    elements.resolutionInfo.textContent = `${elements.webcam.videoWidth}x${elements.webcam.videoHeight}`;

    // Reset the history together with its DOM list. updateHistory() only
    // clears the placeholder row below when captionHistory is empty, so
    // leaving entries from a previous session in the array would keep
    // "Waiting for captions..." in the list forever.
    captionHistory = [];
    elements.historyList.innerHTML = '<div class="history-item"><div class="history-text">Waiting for captions...</div></div>';
};
232
+
233
/**
 * Puts the controls, status read-outs and caption overlay into the
 * "capturing stopped" (idle) state.
 */
const updateUIForStopState = () => {
    const {
        startButton,
        stopButton,
        recordingIndicator,
        statusMessage,
        connectionStatus,
        fpsCounter,
        latencyValue,
        captionText
    } = elements;

    // Controls
    startButton.disabled = false;
    stopButton.disabled = true;
    recordingIndicator.classList.remove('active');

    // Status read-outs
    statusMessage.textContent = "Ready to start analysis.";
    connectionStatus.textContent = "Disconnected";
    connectionStatus.style.color = 'var(--text-secondary)';
    fpsCounter.textContent = '0';
    latencyValue.textContent = '0ms';

    // Caption overlay
    captionText.textContent = "Analysis stopped.";
};
247
+
248
/**
 * Refreshes the performance read-outs (FPS, average latency, average
 * confidence, processed frames, caption count) from the `performance`
 * counters. Called after every frame that is sent.
 */
const updatePerformanceUI = () => {
    const elapsedSeconds = (Date.now() - performance.startTime) / 1000;
    // No time has passed yet, so a rate cannot be computed.
    if (elapsedSeconds === 0) return;

    // Frames sent per second, averaged over the whole session.
    elements.fpsCounter.textContent = (performance.sentFrames / elapsedSeconds).toFixed(0);

    // Mean of the buffered latency samples (seconds); an empty buffer yields
    // NaN from 0/0, which falls back to 0.
    let latencyTotal = 0;
    for (const sample of performance.latencyBuffer) {
        latencyTotal += sample;
    }
    const meanLatency = (latencyTotal / performance.latencyBuffer.length) || 0;
    elements.latencyValue.textContent = `${(meanLatency * 1000).toFixed(0)}ms`;

    // Mean confidence as a percentage; falls back to 0 before the first caption.
    const meanConfidence = (performance.totalConfidence / performance.captionsGenerated * 100) || 0;
    elements.accuracyValue.textContent = `${meanConfidence.toFixed(0)}%`;

    elements.processedFrames.textContent = performance.receivedFrames;
    elements.captionsCount.textContent = performance.captionsGenerated;
};
267
+
268
/**
 * Adds a new caption to the top of the history panel and to `captionHistory`,
 * keeping both limited to the 20 most recent entries.
 *
 * @param {string} caption - The caption text.
 * @param {string} confidence - The confidence percentage string (without "%").
 * @param {Date} timestamp - The Date object for the caption.
 */
const updateHistory = (caption, confidence, timestamp) => {
    const MAX_HISTORY_ITEMS = 20;

    // Remove placeholder rows. Real entries always contain a .history-meta
    // element, so anything without one is a placeholder such as
    // "Waiting for captions...". Checking the DOM (rather than whether
    // captionHistory is empty) also works on a second session, when the
    // array may still hold entries from the previous one.
    for (const row of Array.from(elements.historyList.children)) {
        if (!row.querySelector('.history-meta')) {
            row.remove();
        }
    }

    captionHistory.unshift({ caption, confidence, timestamp });
    if (captionHistory.length > MAX_HISTORY_ITEMS) {
        captionHistory.pop();
    }

    // Build the row from nodes and textContent instead of an HTML string, so
    // caption text received from the server is never parsed as markup.
    const makeNode = (tag, className, text) => {
        const node = document.createElement(tag);
        node.className = className;
        if (text !== undefined) {
            node.textContent = text;
        }
        return node;
    };

    const itemElement = makeNode('div', 'history-item');
    const metaElement = makeNode('div', 'history-meta');
    metaElement.append(
        makeNode('span', 'history-confidence', `${confidence}%`),
        makeNode('span', 'history-time', timestamp.toLocaleTimeString())
    );
    itemElement.append(makeNode('div', 'history-text', caption), metaElement);
    elements.historyList.prepend(itemElement);

    // Trim the oldest rows. lastElementChild skips whitespace text nodes,
    // which lastChild could return instead of an actual row.
    while (elements.historyList.children.length > MAX_HISTORY_ITEMS) {
        elements.historyList.lastElementChild.remove();
    }
};
302
+
303
/**
 * Fetches system status from the server's `/status` endpoint and displays
 * the device name and cache hit rate.
 *
 * Reads `device` and `performance.cache_hit_rate` from the JSON response.
 * On any failure (network error, non-2xx response, unexpected payload) the
 * error is logged and the device field shows "Error".
 *
 * @returns {Promise<void>}
 */
const fetchStatus = async () => {
    try {
        const response = await fetch('/status');
        // fetch() only rejects on network failure; an HTTP error status must
        // be checked explicitly before the body is treated as status data.
        if (!response.ok) {
            throw new Error(`Status request failed with HTTP ${response.status}`);
        }
        const data = await response.json();
        elements.deviceInfo.textContent = data.device.toUpperCase();
        elements.cacheInfo.textContent = `${(data.performance.cache_hit_rate * 100).toFixed(0)}%`;
    } catch (error) {
        console.error("Error fetching server status:", error);
        elements.deviceInfo.textContent = 'Error';
    }
};
317
+
318
+ // ---- 5. Feature Logic (Audio, Settings, etc.) ----
319
+
320
/**
 * Speaks the given text through the Web Speech API, replacing whatever is
 * currently being spoken.
 *
 * Empty text and text containing "processing" (case-insensitive) are skipped.
 *
 * @param {string} text - The text to be spoken.
 */
const speakCaption = (text) => {
    const isSpeakable = Boolean(text) && !text.toLowerCase().includes("processing");
    if (!isSpeakable) return;

    const synth = window.speechSynthesis;
    // Interrupt the previous utterance so only the latest caption is heard.
    synth.cancel();

    const utterance = Object.assign(new SpeechSynthesisUtterance(text), {
        rate: 1.1,
        pitch: 1.0,
        volume: 0.8
    });
    synth.speak(utterance);
};
334
+
335
/**
 * Shows the settings modal if it is hidden and hides it if it is shown, by
 * flipping the modal's `active` class.
 */
const toggleSettingsModal = () => {
    const modal = elements.settingsModal;
    modal.classList.toggle('active');
};
341
+
342
/**
 * Saves the settings from the modal and applies them.
 *
 * Reads the frame rate, JPEG quality and audio toggle from the modal
 * controls, closes the modal, and restarts the frame sender when a capture
 * is running so the new frame rate takes effect.
 */
const saveSettings = () => {
    const requestedFrameRate = parseInt(elements.frameRateSelect.value, 10);
    const requestedQuality = parseFloat(elements.qualitySlider.value);

    // Keep the previous value when a control yields an unusable number: the
    // frame rate is used as a divisor for the sender interval delay, so NaN
    // or a non-positive rate would not produce a meaningful delay.
    if (Number.isFinite(requestedFrameRate) && requestedFrameRate > 0) {
        settings.frameRate = requestedFrameRate;
    }
    if (Number.isFinite(requestedQuality)) {
        settings.quality = requestedQuality;
    }
    settings.audio = elements.audioToggle.checked;

    // Keep the mute button in step with the audio setting, the same way the
    // mute button's own click handler does.
    elements.muteButton.classList.toggle('active', settings.audio);

    toggleSettingsModal();
    showToast("Settings Saved", "Your new settings have been applied.", "success");

    // If capturing, restart the interval to apply new frame rate
    if (isCapturing) {
        clearInterval(frameSenderInterval);
        startFrameSending();
    }
};
359
+
360
/**
 * Creates a toast notification, appends it to the toast container and
 * removes it again after five seconds or when its close button is clicked.
 *
 * `title` and `message` are inserted as HTML, so callers must only pass
 * trusted strings.
 *
 * @param {string} title - The title of the toast.
 * @param {string} message - The message body of the toast.
 * @param {string} type - The type of toast (success, error, info, warning);
 *   used as a CSS class. Defaults to 'info'.
 */
const showToast = (title, message, type = 'info') => {
    const toastEl = document.createElement('div');
    toastEl.className = `toast ${type}`;
    toastEl.innerHTML = `
        <div class="toast-content">
            <div class="toast-title">${title}</div>
            <div class="toast-message">${message}</div>
        </div>
        <button class="toast-close">&times;</button>
    `;
    elements.toastContainer.appendChild(toastEl);

    // Add the 'show' class shortly after insertion (presumably so the
    // entrance transition can animate - confirm against the toast CSS).
    setTimeout(() => toastEl.classList.add('show'), 10);

    // Hide first, then detach once the 500 ms hide delay has passed.
    function dismiss() {
        toastEl.classList.remove('show');
        setTimeout(() => toastEl.remove(), 500);
    }

    toastEl.querySelector('.toast-close').onclick = dismiss;
    setTimeout(dismiss, 5000);
};
388
+
389
+
390
+ // ---- 6. Event Listeners ----
391
// Start / stop
elements.startButton.addEventListener('click', startAnalysis);
elements.stopButton.addEventListener('click', stopAnalysis);

// Settings modal
elements.settingsButton.addEventListener('click', toggleSettingsModal);
elements.closeSettings.addEventListener('click', toggleSettingsModal);
elements.saveSettings.addEventListener('click', saveSettings);

// Mute button: flips the audio setting and mirrors it in the modal checkbox
// and in the button's 'active' class.
elements.muteButton.addEventListener('click', () => {
    settings.audio = !settings.audio;
    elements.audioToggle.checked = settings.audio;
    elements.muteButton.classList.toggle('active', settings.audio);
    showToast("Audio " + (settings.audio ? "Enabled" : "Disabled"), "", "info");
});

// Fullscreen button: toggles fullscreen on the video's parent element.
elements.fullscreenButton.addEventListener('click', () => {
    const pending = document.fullscreenElement
        ? document.exitFullscreen()
        : elements.webcam.parentElement.requestFullscreen();
    // Both calls return a promise that rejects when the browser refuses the
    // request; log that instead of leaving an unhandled rejection.
    Promise.resolve(pending).catch((err) => {
        console.error("Fullscreen toggle failed:", err);
    });
});

// Live percentage read-out next to the quality slider.
elements.qualitySlider.addEventListener('input', (e) => {
    elements.qualityValue.textContent = `${Math.round(e.target.value * 100)}%`;
});

// Keyboard shortcuts: Space = start/stop, S = settings, M = mute, F = fullscreen.
document.addEventListener('keydown', (e) => {
    // Do not react while the user is typing into or operating a form control.
    const tag = e.target.tagName;
    if (tag === 'INPUT' || tag === 'SELECT' || tag === 'TEXTAREA' || e.target.isContentEditable) return;

    // Leave browser/OS combinations such as Ctrl+S or Cmd+F alone; without
    // this check they would be swallowed by preventDefault() below.
    if (e.ctrlKey || e.metaKey || e.altKey) return;

    // A held-down key auto-repeats; toggling on every repeat would flicker
    // the modal or start and stop the capture in a loop.
    if (e.repeat) return;

    switch (e.code) {
        case 'Space':
            e.preventDefault();
            isCapturing ? stopAnalysis() : startAnalysis();
            break;
        case 'KeyS':
            e.preventDefault();
            toggleSettingsModal();
            break;
        case 'KeyM':
            e.preventDefault();
            elements.muteButton.click();
            break;
        case 'KeyF':
            e.preventDefault();
            elements.fullscreenButton.click();
            break;
    }
});
439
+
440
+ // ---- 7. Initialization ----
441
/**
 * One-time page setup: shows the idle UI state, then requests the server
 * status. The status request is not awaited.
 */
const init = () => {
    updateUIForStopState();
    fetchStatus();
};

// Run setup immediately; this code already executes inside the
// DOMContentLoaded handler.
init();
447
+ });
static/style.css ADDED
@@ -0,0 +1,1103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Reset and Base Styles */
2
+ * {
3
+ margin: 0;
4
+ padding: 0;
5
+ box-sizing: border-box;
6
+ }
7
+
8
+ :root {
9
+ --primary-color: #0066cc;
10
+ --primary-hover: #0052a3;
11
+ --secondary-color: #6c757d;
12
+ --success-color: #28a745;
13
+ --danger-color: #dc3545;
14
+ --warning-color: #ffc107;
15
+ --info-color: #17a2b8;
16
+
17
+ --bg-primary: #0a0e1a;
18
+ --bg-secondary: #1a1f2e;
19
+ --bg-tertiary: #242938;
20
+ --bg-card: rgba(26, 31, 46, 0.8);
21
+ --bg-overlay: rgba(0, 0, 0, 0.7);
22
+
23
+ --text-primary: #ffffff;
24
+ --text-secondary: #b8c1d3;
25
+ --text-muted: #6c757d;
26
+
27
+ --border-color: rgba(255, 255, 255, 0.1);
28
+ --border-active: rgba(0, 102, 204, 0.5);
29
+
30
+ --shadow-sm: 0 2px 4px rgba(0, 0, 0, 0.1);
31
+ --shadow-md: 0 4px 12px rgba(0, 0, 0, 0.15);
32
+ --shadow-lg: 0 8px 24px rgba(0, 0, 0, 0.25);
33
+ --shadow-xl: 0 12px 36px rgba(0, 0, 0, 0.35);
34
+
35
+ --radius-sm: 6px;
36
+ --radius-md: 8px;
37
+ --radius-lg: 12px;
38
+ --radius-xl: 16px;
39
+
40
+ --transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
41
+ --transition-fast: all 0.15s ease;
42
+ }
43
+
44
+ body {
45
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
46
+ background: linear-gradient(135deg, var(--bg-primary) 0%, #0f1419 50%, var(--bg-secondary) 100%);
47
+ color: var(--text-primary);
48
+ line-height: 1.6;
49
+ min-height: 100vh;
50
+ overflow-x: hidden;
51
+ font-feature-settings: 'kern' 1, 'liga' 1;
52
+ }
53
+
54
+ /* App Container */
55
+ .app-container {
56
+ min-height: 100vh;
57
+ display: flex;
58
+ flex-direction: column;
59
+ }
60
+
61
+ /* Header */
62
+ .header {
63
+ background: var(--bg-card);
64
+ backdrop-filter: blur(20px);
65
+ border-bottom: 1px solid var(--border-color);
66
+ padding: 1rem 2rem;
67
+ position: sticky;
68
+ top: 0;
69
+ z-index: 100;
70
+ }
71
+
72
+ .header-content {
73
+ display: flex;
74
+ justify-content: space-between;
75
+ align-items: center;
76
+ max-width: 1400px;
77
+ margin: 0 auto;
78
+ }
79
+
80
+ .logo-section {
81
+ display: flex;
82
+ align-items: center;
83
+ gap: 1rem;
84
+ }
85
+
86
+ .logo-icon {
87
+ font-size: 2rem;
88
+ background: linear-gradient(135deg, var(--primary-color), var(--info-color));
89
+ background-clip: text;
90
+ -webkit-background-clip: text;
91
+ -webkit-text-fill-color: transparent;
92
+ }
93
+
94
+ .header h1 {
95
+ font-size: 1.5rem;
96
+ font-weight: 700;
97
+ margin: 0;
98
+ background: linear-gradient(135deg, var(--primary-color), var(--info-color));
99
+ background-clip: text;
100
+ -webkit-background-clip: text;
101
+ -webkit-text-fill-color: transparent;
102
+ }
103
+
104
+ .subtitle {
105
+ font-size: 0.875rem;
106
+ color: var(--text-secondary);
107
+ margin: 0;
108
+ }
109
+
110
+ .header-stats {
111
+ display: flex;
112
+ gap: 2rem;
113
+ }
114
+
115
+ .stat-item {
116
+ text-align: center;
117
+ }
118
+
119
+ .stat-label {
120
+ display: block;
121
+ font-size: 0.75rem;
122
+ color: var(--text-muted);
123
+ text-transform: uppercase;
124
+ letter-spacing: 0.5px;
125
+ margin-bottom: 0.25rem;
126
+ }
127
+
128
+ .stat-value {
129
+ display: block;
130
+ font-size: 1.25rem;
131
+ font-weight: 600;
132
+ color: var(--text-primary);
133
+ }
134
+
135
+ /* Main Content */
136
+ .main-content {
137
+ flex: 1;
138
+ display: grid;
139
+ grid-template-columns: 1fr 350px;
140
+ gap: 2rem;
141
+ padding: 2rem;
142
+ max-width: 1400px;
143
+ margin: 0 auto;
144
+ width: 100%;
145
+ }
146
+
147
+ .video-section {
148
+ display: flex;
149
+ flex-direction: column;
150
+ gap: 1.5rem;
151
+ }
152
+
153
+ /* Video Container */
154
+ .video-container {
155
+ position: relative;
156
+ background: var(--bg-tertiary);
157
+ border-radius: var(--radius-xl);
158
+ overflow: hidden;
159
+ box-shadow: var(--shadow-xl);
160
+ border: 1px solid var(--border-color);
161
+ transition: var(--transition);
162
+ }
163
+
164
+ .video-container:hover {
165
+ box-shadow: var(--shadow-xl), 0 0 0 1px var(--border-active);
166
+ }
167
+
168
+ #webcam {
169
+ width: 100%;
170
+ height: 480px;
171
+ object-fit: cover;
172
+ display: block;
173
+ background: #000;
174
+ }
175
+
176
+ /* Video Overlay */
177
+ .video-overlay {
178
+ position: absolute;
179
+ inset: 0;
180
+ pointer-events: none;
181
+ display: flex;
182
+ flex-direction: column;
183
+ justify-content: space-between;
184
+ }
185
+
186
+ .overlay-top {
187
+ display: flex;
188
+ justify-content: space-between;
189
+ align-items: flex-start;
190
+ padding: 1rem;
191
+ }
192
+
193
+ .recording-indicator {
194
+ display: flex;
195
+ align-items: center;
196
+ gap: 0.5rem;
197
+ background: var(--bg-overlay);
198
+ backdrop-filter: blur(10px);
199
+ padding: 0.5rem 1rem;
200
+ border-radius: var(--radius-lg);
201
+ color: var(--danger-color);
202
+ font-size: 0.875rem;
203
+ font-weight: 600;
204
+ opacity: 0;
205
+ transition: var(--transition);
206
+ }
207
+
208
+ .recording-indicator.active {
209
+ opacity: 1;
210
+ }
211
+
212
+ .recording-dot {
213
+ width: 8px;
214
+ height: 8px;
215
+ background: var(--danger-color);
216
+ border-radius: 50%;
217
+ animation: pulse 2s infinite;
218
+ }
219
+
220
+ @keyframes pulse {
221
+ 0%, 100% { opacity: 1; }
222
+ 50% { opacity: 0.5; }
223
+ }
224
+
225
+ .quality-indicator {
226
+ background: var(--bg-overlay);
227
+ backdrop-filter: blur(10px);
228
+ padding: 0.25rem 0.75rem;
229
+ border-radius: var(--radius-md);
230
+ font-size: 0.75rem;
231
+ font-weight: 600;
232
+ color: var(--success-color);
233
+ }
234
+
235
+ /* Caption Overlay */
236
+ .caption-overlay {
237
+ padding: 2rem;
238
+ background: linear-gradient(
239
+ to top,
240
+ var(--bg-overlay) 0%,
241
+ rgba(0, 0, 0, 0.4) 70%,
242
+ transparent 100%
243
+ );
244
+ backdrop-filter: blur(10px);
245
+ }
246
+
247
+ .caption-content {
248
+ max-width: 600px;
249
+ }
250
+
251
+ #caption-text {
252
+ font-size: 1.5rem;
253
+ font-weight: 600;
254
+ line-height: 1.4;
255
+ margin-bottom: 1rem;
256
+ text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.8);
257
+ min-height: 2.1rem;
258
+ transition: var(--transition);
259
+ }
260
+
261
+ .caption-metadata {
262
+ display: flex;
263
+ align-items: center;
264
+ gap: 1rem;
265
+ font-size: 0.875rem;
266
+ }
267
+
268
+ .confidence-bar {
269
+ flex: 1;
270
+ height: 6px;
271
+ background: rgba(255, 255, 255, 0.2);
272
+ border-radius: 3px;
273
+ overflow: hidden;
274
+ }
275
+
276
+ .confidence-fill {
277
+ height: 100%;
278
+ background: linear-gradient(90deg, var(--danger-color), var(--warning-color), var(--success-color));
279
+ border-radius: 3px;
280
+ transition: width 0.5s ease;
281
+ width: 0%;
282
+ }
283
+
284
+ .confidence-text {
285
+ color: var(--text-secondary);
286
+ font-weight: 600;
287
+ min-width: 40px;
288
+ }
289
+
290
+ .timestamp {
291
+ color: var(--text-muted);
292
+ font-size: 0.75rem;
293
+ }
294
+
295
+ /* Controls */
296
+ .controls-section {
297
+ display: flex;
298
+ justify-content: space-between;
299
+ align-items: center;
300
+ gap: 1rem;
301
+ flex-wrap: wrap;
302
+ }
303
+
304
+ .main-controls, .advanced-controls {
305
+ display: flex;
306
+ gap: 1rem;
307
+ }
308
+
309
+ /* Button Styles */
310
+ .btn {
311
+ display: inline-flex;
312
+ align-items: center;
313
+ gap: 0.5rem;
314
+ padding: 0.75rem 1.5rem;
315
+ border: none;
316
+ border-radius: var(--radius-md);
317
+ font-size: 0.875rem;
318
+ font-weight: 600;
319
+ cursor: pointer;
320
+ transition: var(--transition);
321
+ text-decoration: none;
322
+ white-space: nowrap;
323
+ position: relative;
324
+ overflow: hidden;
325
+ }
326
+
327
+ .btn::before {
328
+ content: '';
329
+ position: absolute;
330
+ top: 0;
331
+ left: -100%;
332
+ width: 100%;
333
+ height: 100%;
334
+ background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.1), transparent);
335
+ transition: left 0.5s ease;
336
+ }
337
+
338
+ .btn:hover::before {
339
+ left: 100%;
340
+ }
341
+
342
+ .btn-primary {
343
+ background: linear-gradient(135deg, var(--primary-color), var(--primary-hover));
344
+ color: white;
345
+ box-shadow: var(--shadow-md);
346
+ }
347
+
348
+ .btn-primary:hover:not(:disabled) {
349
+ transform: translateY(-2px);
350
+ box-shadow: var(--shadow-lg);
351
+ }
352
+
353
+ .btn-secondary {
354
+ background: linear-gradient(135deg, var(--danger-color), #b02e3c);
355
+ color: white;
356
+ box-shadow: var(--shadow-md);
357
+ }
358
+
359
+ .btn-secondary:hover:not(:disabled) {
360
+ transform: translateY(-2px);
361
+ box-shadow: var(--shadow-lg);
362
+ }
363
+
364
+ .btn-outline {
365
+ background: var(--bg-card);
366
+ color: var(--text-secondary);
367
+ border: 1px solid var(--border-color);
368
+ }
369
+
370
+ .btn-outline:hover {
371
+ background: var(--bg-tertiary);
372
+ color: var(--text-primary);
373
+ border-color: var(--border-active);
374
+ }
375
+
376
+ .btn:disabled {
377
+ opacity: 0.5;
378
+ cursor: not-allowed;
379
+ transform: none !important;
380
+ }
381
+
382
+ .btn-icon {
383
+ font-size: 1rem;
384
+ }
385
+
386
+ /* Sidebar */
387
+ .sidebar {
388
+ display: flex;
389
+ flex-direction: column;
390
+ gap: 1.5rem;
391
+ }
392
+
393
+ .metrics-panel, .history-panel, .system-panel {
394
+ background: var(--bg-card);
395
+ backdrop-filter: blur(20px);
396
+ border: 1px solid var(--border-color);
397
+ border-radius: var(--radius-lg);
398
+ padding: 1.5rem;
399
+ box-shadow: var(--shadow-md);
400
+ }
401
+
402
+ .metrics-panel h3, .history-panel h3, .system-panel h3 {
403
+ font-size: 1.125rem;
404
+ font-weight: 600;
405
+ margin-bottom: 1rem;
406
+ color: var(--text-primary);
407
+ display: flex;
408
+ align-items: center;
409
+ gap: 0.5rem;
410
+ }
411
+
412
+ /* Metrics Grid */
413
+ .metrics-grid {
414
+ display: grid;
415
+ grid-template-columns: 1fr 1fr;
416
+ gap: 1rem;
417
+ }
418
+
419
+ .metric-item {
420
+ text-align: center;
421
+ padding: 1rem;
422
+ background: var(--bg-tertiary);
423
+ border-radius: var(--radius-md);
424
+ border: 1px solid var(--border-color);
425
+ transition: var(--transition);
426
+ }
427
+
428
+ .metric-item:hover {
429
+ border-color: var(--border-active);
430
+ transform: translateY(-2px);
431
+ }
432
+
433
+ .metric-value {
434
+ font-size: 1.5rem;
435
+ font-weight: 700;
436
+ color: var(--primary-color);
437
+ margin-bottom: 0.25rem;
438
+ }
439
+
440
+ .metric-label {
441
+ font-size: 0.75rem;
442
+ color: var(--text-muted);
443
+ text-transform: uppercase;
444
+ letter-spacing: 0.5px;
445
+ }
446
+
447
+ /* History Panel */
448
+ .history-list {
449
+ max-height: 300px;
450
+ overflow-y: auto;
451
+ display: flex;
452
+ flex-direction: column;
453
+ gap: 0.75rem;
454
+ }
455
+
456
+ .history-list::-webkit-scrollbar {
457
+ width: 6px;
458
+ }
459
+
460
+ .history-list::-webkit-scrollbar-track {
461
+ background: var(--bg-tertiary);
462
+ border-radius: 3px;
463
+ }
464
+
465
+ .history-list::-webkit-scrollbar-thumb {
466
+ background: var(--border-color);
467
+ border-radius: 3px;
468
+ }
469
+
470
+ .history-list::-webkit-scrollbar-thumb:hover {
471
+ background: var(--border-active);
472
+ }
473
+
474
+ .history-item {
475
+ padding: 1rem;
476
+ background: var(--bg-tertiary);
477
+ border-radius: var(--radius-md);
478
+ border: 1px solid var(--border-color);
479
+ transition: var(--transition);
480
+ }
481
+
482
+ .history-item:hover {
483
+ border-color: var(--border-active);
484
+ }
485
+
486
+ .history-text {
487
+ font-size: 0.875rem;
488
+ margin-bottom: 0.5rem;
489
+ line-height: 1.4;
490
+ }
491
+
492
+ .history-meta {
493
+ display: flex;
494
+ justify-content: space-between;
495
+ align-items: center;
496
+ font-size: 0.75rem;
497
+ color: var(--text-muted);
498
+ }
499
+
500
+ .history-confidence {
501
+ padding: 0.25rem 0.5rem;
502
+ background: var(--success-color);
503
+ color: white;
504
+ border-radius: var(--radius-sm);
505
+ font-weight: 600;
506
+ }
507
+
508
+ /* System Info */
509
+ .system-info {
510
+ display: flex;
511
+ flex-direction: column;
512
+ gap: 0.75rem;
513
+ }
514
+
515
+ .info-item {
516
+ display: flex;
517
+ justify-content: space-between;
518
+ align-items: center;
519
+ padding: 0.75rem;
520
+ background: var(--bg-tertiary);
521
+ border-radius: var(--radius-md);
522
+ border: 1px solid var(--border-color);
523
+ }
524
+
525
+ .info-label {
526
+ font-weight: 600;
527
+ color: var(--text-secondary);
528
+ }
529
+
530
+ .info-value {
531
+ color: var(--text-primary);
532
+ font-weight: 500;
533
+ }
534
+
535
+ /* Modal */
536
+ .modal {
537
+ position: fixed;
538
+ inset: 0;
539
+ background: rgba(0, 0, 0, 0.8);
540
+ backdrop-filter: blur(10px);
541
+ display: none;
542
+ align-items: center;
543
+ justify-content: center;
544
+ z-index: 1000;
545
+ opacity: 0;
546
+ transition: var(--transition);
547
+ }
548
+
549
+ .modal.active {
550
+ display: flex;
551
+ opacity: 1;
552
+ }
553
+
554
+ .modal-content {
555
+ background: var(--bg-secondary);
556
+ border-radius: var(--radius-xl);
557
+ border: 1px solid var(--border-color);
558
+ box-shadow: var(--shadow-xl);
559
+ width: 90%;
560
+ max-width: 500px;
561
+ max-height: 80vh;
562
+ overflow: hidden;
563
+ transform: translateY(-20px);
564
+ transition: var(--transition);
565
+ }
566
+
567
+ .modal.active .modal-content {
568
+ transform: translateY(0);
569
+ }
570
+
571
+ .modal-header {
572
+ padding: 1.5rem;
573
+ border-bottom: 1px solid var(--border-color);
574
+ display: flex;
575
+ justify-content: space-between;
576
+ align-items: center;
577
+ }
578
+
579
+ .modal-header h3 {
580
+ font-size: 1.25rem;
581
+ font-weight: 600;
582
+ }
583
+
584
+ .modal-close {
585
+ background: none;
586
+ border: none;
587
+ color: var(--text-muted);
588
+ font-size: 1.5rem;
589
+ cursor: pointer;
590
+ padding: 0;
591
+ transition: var(--transition);
592
+ }
593
+
594
+ .modal-close:hover {
595
+ color: var(--text-primary);
596
+ }
597
+
598
+ .modal-body {
599
+ padding: 1.5rem;
600
+ max-height: 400px;
601
+ overflow-y: auto;
602
+ }
603
+
604
+ .modal-body::-webkit-scrollbar {
605
+ width: 6px;
606
+ }
607
+
608
+ .modal-body::-webkit-scrollbar-track {
609
+ background: var(--bg-tertiary);
610
+ border-radius: 3px;
611
+ }
612
+
613
+ .modal-body::-webkit-scrollbar-thumb {
614
+ background: var(--border-color);
615
+ border-radius: 3px;
616
+ }
617
+
618
+ .modal-body::-webkit-scrollbar-thumb:hover {
619
+ background: var(--border-active);
620
+ }
621
+
622
+ .modal-footer {
623
+ padding: 1.5rem;
624
+ border-top: 1px solid var(--border-color);
625
+ display: flex;
626
+ justify-content: flex-end;
627
+ gap: 1rem;
628
+ }
629
+
630
+ /* Settings */
631
+ .setting-group {
632
+ margin-bottom: 1.5rem;
633
+ }
634
+
635
+ .setting-group:last-child {
636
+ margin-bottom: 0;
637
+ }
638
+
639
+ .setting-group label {
640
+ display: block;
641
+ font-weight: 600;
642
+ margin-bottom: 0.5rem;
643
+ color: var(--text-primary);
644
+ }
645
+
646
+ .setting-group select,
647
+ .setting-group input[type="range"] {
648
+ width: 100%;
649
+ padding: 0.75rem;
650
+ background: var(--bg-tertiary);
651
+ border: 1px solid var(--border-color);
652
+ border-radius: var(--radius-md);
653
+ color: var(--text-primary);
654
+ transition: var(--transition);
655
+ }
656
+
657
+ .setting-group select:focus,
658
+ .setting-group input[type="range"]:focus {
659
+ outline: none;
660
+ border-color: var(--border-active);
661
+ }
662
+
663
+ .setting-group select {
664
+ appearance: none;
665
+ background-image: url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='none' viewBox='0 0 20 20'%3e%3cpath stroke='%236b7280' stroke-linecap='round' stroke-linejoin='round' stroke-width='1.5' d='m6 8 4 4 4-4'/%3e%3c/svg%3e");
666
+ background-position: right 0.5rem center;
667
+ background-repeat: no-repeat;
668
+ background-size: 1.5em 1.5em;
669
+ padding-right: 2.5rem;
670
+ }
671
+
672
+ .setting-group input[type="range"] {
673
+ /* -webkit-appearance: none; */
674
+ height: 8px;
675
+ border-radius: 4px;
676
+ background: var(--bg-tertiary);
677
+ outline: none;
678
+ }
679
+
680
+ .setting-group input[type="range"]::-webkit-slider-thumb {
681
+ -webkit-appearance: none;
682
+ appearance: none;
683
+ width: 20px;
684
+ height: 20px;
685
+ border-radius: 50%;
686
+ background: var(--primary-color);
687
+ cursor: pointer;
688
+ border: 2px solid white;
689
+ box-shadow: var(--shadow-sm);
690
+ }
691
+
692
+ .setting-group input[type="range"]::-moz-range-thumb {
693
+ width: 20px;
694
+ height: 20px;
695
+ border-radius: 50%;
696
+ background: var(--primary-color);
697
+ cursor: pointer;
698
+ border: 2px solid white;
699
+ box-shadow: var(--shadow-sm);
700
+ }
701
+
702
+ /* Toggle Switch */
703
+ .toggle-switch {
704
+ position: relative;
705
+ display: inline-block;
706
+ width: 50px;
707
+ height: 24px;
708
+ }
709
+
710
+ .toggle-switch input {
711
+ opacity: 0;
712
+ width: 0;
713
+ height: 0;
714
+ }
715
+
716
+ .toggle-slider {
717
+ position: absolute;
718
+ cursor: pointer;
719
+ top: 0;
720
+ left: 0;
721
+ right: 0;
722
+ bottom: 0;
723
+ background: var(--bg-tertiary);
724
+ border: 1px solid var(--border-color);
725
+ transition: var(--transition);
726
+ border-radius: 24px;
727
+ }
728
+
729
+ .toggle-slider::before {
730
+ position: absolute;
731
+ content: "";
732
+ height: 18px;
733
+ width: 18px;
734
+ left: 2px;
735
+ top: 2px;
736
+ background: var(--text-muted);
737
+ transition: var(--transition);
738
+ border-radius: 50%;
739
+ }
740
+
741
+ .toggle-switch input:checked + .toggle-slider {
742
+ background: var(--primary-color);
743
+ border-color: var(--primary-color);
744
+ }
745
+
746
+ .toggle-switch input:checked + .toggle-slider::before {
747
+ transform: translateX(26px);
748
+ background: white;
749
+ }
750
+
751
+ /* Status Bar */
752
+ .status-bar {
753
+ background: var(--bg-card);
754
+ backdrop-filter: blur(20px);
755
+ border-top: 1px solid var(--border-color);
756
+ padding: 0.75rem 2rem;
757
+ display: flex;
758
+ justify-content: space-between;
759
+ align-items: center;
760
+ font-size: 0.875rem;
761
+ }
762
+
763
+ .status-left {
764
+ color: var(--text-primary);
765
+ font-weight: 500;
766
+ }
767
+
768
+ .status-right {
769
+ color: var(--text-muted);
770
+ }
771
+
772
+ .keyboard-hint {
773
+ font-size: 0.75rem;
774
+ }
775
+
776
+ /* Toast Notifications */
777
+ .toast-container {
778
+ position: fixed;
779
+ top: 2rem;
780
+ right: 2rem;
781
+ z-index: 1100;
782
+ display: flex;
783
+ flex-direction: column;
784
+ gap: 0.5rem;
785
+ pointer-events: none;
786
+ }
787
+
788
+ .toast {
789
+ background: var(--bg-secondary);
790
+ border: 1px solid var(--border-color);
791
+ border-radius: var(--radius-lg);
792
+ padding: 1rem 1.5rem;
793
+ box-shadow: var(--shadow-lg);
794
+ backdrop-filter: blur(20px);
795
+ display: flex;
796
+ align-items: center;
797
+ gap: 0.75rem;
798
+ min-width: 300px;
799
+ transform: translateX(400px);
800
+ transition: var(--transition);
801
+ pointer-events: auto;
802
+ }
803
+
804
+ .toast.show {
805
+ transform: translateX(0);
806
+ }
807
+
808
+ .toast.hide {
809
+ transform: translateX(400px);
810
+ opacity: 0;
811
+ }
812
+
813
+ .toast-icon {
814
+ font-size: 1.25rem;
815
+ flex-shrink: 0;
816
+ }
817
+
818
+ .toast-content {
819
+ flex: 1;
820
+ }
821
+
822
+ .toast-title {
823
+ font-weight: 600;
824
+ font-size: 0.875rem;
825
+ margin-bottom: 0.25rem;
826
+ color: var(--text-primary);
827
+ }
828
+
829
+ .toast-message {
830
+ font-size: 0.8125rem;
831
+ color: var(--text-secondary);
832
+ line-height: 1.4;
833
+ }
834
+
835
+ .toast-close {
836
+ background: none;
837
+ border: none;
838
+ color: var(--text-muted);
839
+ cursor: pointer;
840
+ padding: 0;
841
+ font-size: 1rem;
842
+ transition: var(--transition);
843
+ flex-shrink: 0;
844
+ }
845
+
846
+ .toast-close:hover {
847
+ color: var(--text-primary);
848
+ }
849
+
850
+ /* Toast Types */
851
+ .toast.success {
852
+ border-left: 4px solid var(--success-color);
853
+ }
854
+
855
+ .toast.success .toast-icon {
856
+ color: var(--success-color);
857
+ }
858
+
859
+ .toast.error {
860
+ border-left: 4px solid var(--danger-color);
861
+ }
862
+
863
+ .toast.error .toast-icon {
864
+ color: var(--danger-color);
865
+ }
866
+
867
+ .toast.warning {
868
+ border-left: 4px solid var(--warning-color);
869
+ }
870
+
871
+ .toast.warning .toast-icon {
872
+ color: var(--warning-color);
873
+ }
874
+
875
+ .toast.info {
876
+ border-left: 4px solid var(--info-color);
877
+ }
878
+
879
+ .toast.info .toast-icon {
880
+ color: var(--info-color);
881
+ }
882
+
883
+ /* Loading States */
884
+ .loading {
885
+ position: relative;
886
+ overflow: hidden;
887
+ }
888
+
889
+ .loading::after {
890
+ content: '';
891
+ position: absolute;
892
+ top: 0;
893
+ left: -100%;
894
+ width: 100%;
895
+ height: 100%;
896
+ background: linear-gradient(
897
+ 90deg,
898
+ transparent,
899
+ rgba(255, 255, 255, 0.1),
900
+ transparent
901
+ );
902
+ animation: loading-shimmer 2s infinite;
903
+ }
904
+
905
+ @keyframes loading-shimmer {
906
+ 0% { left: -100%; }
907
+ 100% { left: 100%; }
908
+ }
909
+
910
+ /* Responsive Design */
911
+ @media (max-width: 1200px) {
912
+ .main-content {
913
+ grid-template-columns: 1fr 300px;
914
+ gap: 1.5rem;
915
+ }
916
+
917
+ .sidebar {
918
+ gap: 1rem;
919
+ }
920
+
921
+ .metrics-panel, .history-panel, .system-panel {
922
+ padding: 1rem;
923
+ }
924
+ }
925
+
926
+ @media (max-width: 768px) {
927
+ .header {
928
+ padding: 1rem;
929
+ }
930
+
931
+ .header-content {
932
+ flex-direction: column;
933
+ gap: 1rem;
934
+ align-items: flex-start;
935
+ }
936
+
937
+ .header-stats {
938
+ gap: 1rem;
939
+ width: 100%;
940
+ justify-content: space-around;
941
+ }
942
+
943
+ .main-content {
944
+ grid-template-columns: 1fr;
945
+ padding: 1rem;
946
+ gap: 1rem;
947
+ }
948
+
949
+ #webcam {
950
+ height: 300px;
951
+ }
952
+
953
+ .controls-section {
954
+ flex-direction: column;
955
+ align-items: stretch;
956
+ }
957
+
958
+ .main-controls, .advanced-controls {
959
+ justify-content: center;
960
+ flex-wrap: wrap;
961
+ }
962
+
963
+ .caption-overlay {
964
+ padding: 1rem;
965
+ }
966
+
967
+ #caption-text {
968
+ font-size: 1.25rem;
969
+ }
970
+
971
+ .metrics-grid {
972
+ grid-template-columns: 1fr;
973
+ }
974
+
975
+ .status-bar {
976
+ padding: 0.5rem 1rem;
977
+ flex-direction: column;
978
+ gap: 0.5rem;
979
+ text-align: center;
980
+ }
981
+
982
+ .keyboard-hint {
983
+ display: none;
984
+ }
985
+
986
+ .toast {
987
+ min-width: 280px;
988
+ margin: 0 1rem;
989
+ }
990
+
991
+ .modal-content {
992
+ width: 95%;
993
+ margin: 1rem;
994
+ }
995
+ }
996
+
997
+ @media (max-width: 480px) {
998
+ .header {
999
+ padding: 0.75rem;
1000
+ }
1001
+
1002
+ .logo-section {
1003
+ gap: 0.5rem;
1004
+ }
1005
+
1006
+ .logo-icon {
1007
+ font-size: 1.5rem;
1008
+ }
1009
+
1010
+ .header h1 {
1011
+ font-size: 1.25rem;
1012
+ }
1013
+
1014
+ .subtitle {
1015
+ font-size: 0.8125rem;
1016
+ }
1017
+
1018
+ .main-content {
1019
+ padding: 0.75rem;
1020
+ }
1021
+
1022
+ #webcam {
1023
+ height: 250px;
1024
+ }
1025
+
1026
+ .btn {
1027
+ padding: 0.625rem 1rem;
1028
+ font-size: 0.8125rem;
1029
+ }
1030
+
1031
+ .main-controls, .advanced-controls {
1032
+ gap: 0.75rem;
1033
+ }
1034
+
1035
+ #caption-text {
1036
+ font-size: 1.125rem;
1037
+ }
1038
+
1039
+ .caption-metadata {
1040
+ flex-direction: column;
1041
+ align-items: flex-start;
1042
+ gap: 0.5rem;
1043
+ }
1044
+
1045
+ .confidence-bar {
1046
+ width: 100%;
1047
+ }
1048
+
1049
+ .toast {
1050
+ min-width: 260px;
1051
+ }
1052
+ }
1053
+
1054
+ /* Dark mode enhancements */
1055
+ @media (prefers-color-scheme: dark) {
1056
+ body {
1057
+ background: linear-gradient(135deg, var(--bg-primary) 0%, #0a0d14 50%, var(--bg-secondary) 100%);
1058
+ }
1059
+ }
1060
+
1061
+ /* High contrast mode support: the standardized media feature value is "more" ("high" is not a valid value, so that query never matched) */
1062
+ @media (prefers-contrast: more) {
1063
+ :root {
1064
+ --border-color: rgba(255, 255, 255, 0.3);
1065
+ --border-active: var(--primary-color);
1066
+ --text-secondary: #d1d5db;
1067
+ }
1068
+ }
1069
+
1070
+ /* Reduced motion support: ::before/::after are listed explicitly because the universal selector does not match pseudo-elements (e.g. the .toggle-slider::before knob transition) */
1071
+ @media (prefers-reduced-motion: reduce) {
1072
+ *, *::before, *::after {
1073
+ animation-duration: 0.01ms !important;
1074
+ animation-iteration-count: 1 !important;
1075
+ transition-duration: 0.01ms !important;
1076
+ }
1077
+
1078
+ .recording-dot {
1079
+ animation: none;
1080
+ }
1081
+
1082
+ .loading::after {
1083
+ animation: none;
1084
+ }
1085
+ }
1086
+
1087
+ /* Print styles */
1088
+ @media print {
1089
+ .header, .sidebar, .controls-section, .status-bar, .toast-container {
1090
+ display: none;
1091
+ }
1092
+
1093
+ .main-content {
1094
+ grid-template-columns: 1fr;
1095
+ padding: 0;
1096
+ }
1097
+
1098
+ .video-container {
1099
+ border: 2px solid #000;
1100
+ border-radius: 0;
1101
+ box-shadow: none;
1102
+ }
1103
+ }
templates/index.html ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Real-Time AI Action Captioner Pro</title>
7
+ <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
8
+ <!-- Include Socket.IO client library -->
9
+ <script src="https://cdn.socket.io/4.5.2/socket.io.min.js"></script>
10
+ <link rel="preconnect" href="https://fonts.googleapis.com">
11
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
12
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
13
+ </head>
14
+ <body>
15
+ <div class="app-container">
16
+ <!-- Header Section -->
17
+ <header class="header">
18
+ <div class="header-content">
19
+ <div class="logo-section">
20
+ <div class="logo-icon">🎥</div>
21
+ <div>
22
+ <h1>LiveSense AI</h1>
23
+ <p class="subtitle">Real-Time Video Insights & Description</p>
24
+ </div>
25
+ </div>
26
+ <div class="header-stats">
27
+ <div class="stat-item">
28
+ <span class="stat-label">Status</span>
29
+ <span class="stat-value" id="connection-status">Disconnected</span>
30
+ </div>
31
+ <div class="stat-item">
32
+ <span class="stat-label">FPS</span>
33
+ <span class="stat-value" id="fps-counter">0</span>
34
+ </div>
35
+ </div>
36
+ </div>
37
+ </header>
38
+
39
+ <!-- Main Content -->
40
+ <main class="main-content">
41
+ <div class="video-section">
42
+ <!-- Video Container -->
43
+ <div class="video-container">
44
+ <video id="webcam" autoplay muted playsinline></video>
45
+
46
+ <!-- Video Overlay Controls -->
47
+ <div class="video-overlay">
48
+ <div class="overlay-top">
49
+ <div class="recording-indicator" id="recording-indicator">
50
+ <div class="recording-dot"></div>
51
+ <span>LIVE</span>
52
+ </div>
53
+ <div class="quality-indicator" id="quality-indicator">
54
+ <span>HD</span>
55
+ </div>
56
+ </div>
57
+
58
+ <!-- Caption Container -->
59
+ <div class="caption-overlay">
60
+ <div class="caption-content">
61
+ <p id="caption-text">Ready to start captioning...</p>
62
+ <div class="caption-metadata">
63
+ <div class="confidence-bar">
64
+ <div class="confidence-fill" id="confidence-fill"></div>
65
+ </div>
66
+ <span class="confidence-text" id="confidence-text">0%</span>
67
+ <span class="timestamp" id="caption-timestamp"></span>
68
+ </div>
69
+ </div>
70
+ </div>
71
+ </div>
72
+ </div>
73
+
74
+ <!-- Controls -->
75
+ <div class="controls-section">
76
+ <div class="main-controls">
77
+ <button id="startButton" class="btn btn-primary">
78
+ <span class="btn-icon">▶</span>
79
+ Start Analysis
80
+ </button>
81
+ <button id="stopButton" class="btn btn-secondary" disabled>
82
+ <span class="btn-icon">⏹</span>
83
+ Stop Analysis
84
+ </button>
85
+ </div>
86
+
87
+ <div class="advanced-controls">
88
+ <button id="muteButton" class="btn btn-outline">
89
+ <span class="btn-icon">🔊</span>
90
+ Audio
91
+ </button>
92
+ <button id="settingsButton" class="btn btn-outline">
93
+ <span class="btn-icon">⚙️</span>
94
+ Settings
95
+ </button>
96
+ <button id="fullscreenButton" class="btn btn-outline">
97
+ <span class="btn-icon">⛶</span>
98
+ Fullscreen
99
+ </button>
100
+ </div>
101
+ </div>
102
+ </div>
103
+
104
+ <!-- Sidebar -->
105
+ <aside class="sidebar">
106
+ <!-- Performance Metrics -->
107
+ <div class="metrics-panel">
108
+ <h3>Performance Metrics</h3>
109
+ <div class="metrics-grid">
110
+ <div class="metric-item">
111
+ <div class="metric-value" id="latency-value">0ms</div>
112
+ <div class="metric-label">Latency</div>
113
+ </div>
114
+ <div class="metric-item">
115
+ <div class="metric-value" id="accuracy-value">0%</div>
116
+ <div class="metric-label">Avg Confidence</div>
117
+ </div>
118
+ <div class="metric-item">
119
+ <div class="metric-value" id="processed-frames">0</div>
120
+ <div class="metric-label">Frames Processed</div>
121
+ </div>
122
+ <div class="metric-item">
123
+ <div class="metric-value" id="captions-count">0</div>
124
+ <div class="metric-label">Captions Generated</div>
125
+ </div>
126
+ </div>
127
+ </div>
128
+
129
+ <!-- Recent Captions History -->
130
+ <div class="history-panel">
131
+ <h3>Caption History</h3>
132
+ <div class="history-list" id="history-list">
133
+ <div class="history-item">
134
+ <div class="history-text">No captions yet</div>
135
+ <div class="history-meta">
136
+ <span class="history-confidence">-</span>
137
+ <span class="history-time">--:--</span>
138
+ </div>
139
+ </div>
140
+ </div>
141
+ </div>
142
+
143
+ <!-- System Info -->
144
+ <div class="system-panel">
145
+ <h3>System Information</h3>
146
+ <div class="system-info">
147
+ <div class="info-item">
148
+ <span class="info-label">Device:</span>
149
+ <span class="info-value" id="device-info">Loading...</span>
150
+ </div>
151
+ <div class="info-item">
152
+ <span class="info-label">Model:</span>
153
+ <span class="info-value">BLIP-Base</span>
154
+ </div>
155
+ <div class="info-item">
156
+ <span class="info-label">Resolution:</span>
157
+ <span class="info-value" id="resolution-info">-</span>
158
+ </div>
159
+ <div class="info-item">
160
+ <span class="info-label">Cache Hit Rate:</span>
161
+ <span class="info-value" id="cache-info">0%</span>
162
+ </div>
163
+ </div>
164
+ </div>
165
+ </aside>
166
+ </main>
167
+
168
+ <!-- Settings Modal: exposed as a labelled dialog; every label is tied to its control via for/id so clicking the label text focuses or toggles the control -->
169
+ <div class="modal" id="settingsModal">
170
+ <div class="modal-content" role="dialog" aria-modal="true" aria-labelledby="settingsTitle">
171
+ <div class="modal-header">
172
+ <h3 id="settingsTitle">Settings</h3>
173
+ <button class="modal-close" id="closeSettings" aria-label="Close settings">&times;</button>
174
+ </div>
175
+ <div class="modal-body">
176
+ <div class="setting-group">
177
+ <label for="frameRateSelect">Frame Rate</label>
178
+ <select id="frameRateSelect">
179
+ <option value="10">10 FPS</option>
180
+ <option value="15" selected>15 FPS</option>
181
+ <option value="20">20 FPS</option>
182
+ <option value="30">30 FPS</option>
183
+ </select>
184
+ </div>
185
+ <div class="setting-group">
186
+ <label for="qualitySlider">Image Quality</label>
187
+ <input type="range" id="qualitySlider" min="0.3" max="0.9" step="0.1" value="0.7">
188
+ <span id="qualityValue">70%</span>
189
+ </div>
190
+ <div class="setting-group">
191
+ <label for="audioToggle">Audio Narration</label>
192
+ <div class="toggle-switch">
193
+ <input type="checkbox" id="audioToggle" checked>
194
+ <span class="toggle-slider"></span>
195
+ </div>
196
+ </div>
197
+ <div class="setting-group">
198
+ <label for="confidenceSlider">Confidence Threshold</label>
199
+ <input type="range" id="confidenceSlider" min="0.1" max="1.0" step="0.1" value="0.6">
200
+ <span id="confidenceThreshold">60%</span>
201
+ </div>
202
+ <div class="setting-group">
203
+ <label for="lowLightToggle">Auto-Pause on Low Light</label>
204
+ <div class="toggle-switch">
205
+ <input type="checkbox" id="lowLightToggle">
206
+ <span class="toggle-slider"></span>
207
+ </div>
208
+ </div>
209
+ </div>
210
+ <div class="modal-footer">
211
+ <button class="btn btn-outline" id="resetSettings">Reset to Default</button>
212
+ <button class="btn btn-primary" id="saveSettings">Save Settings</button>
213
+ </div>
214
+ </div>
215
+ </div>
216
+
217
+ <!-- Status Bar -->
218
+ <div class="status-bar">
219
+ <div class="status-left">
220
+ <span id="status-message">Ready to start</span>
221
+ </div>
222
+ <div class="status-right">
223
+ <span class="keyboard-hint">Space: Start/Stop | M: Mute | S: Settings | F: Fullscreen</span>
224
+ </div>
225
+ </div>
226
+ </div>
227
+
228
+ <!-- Toast Notifications: polite live region so toasts inserted here are announced by assistive technology -->
229
+ <div class="toast-container" id="toastContainer" role="status" aria-live="polite"></div>
230
+
231
+ <!-- Our application logic -->
232
+ <script src="{{ url_for('static', filename='app.js') }}"></script>
233
+ </body>
234
+ </html>