nailarais1 committed
Commit 0d78b31 · verified · 1 Parent(s): c9b9d71

Update app.py

Files changed (1): app.py +435 -593

app.py CHANGED
@@ -4,53 +4,73 @@ VoiceBridge.AI - Production Ready Universal Communication Platform
  Supporting: Blind, Deaf, Non-Verbal, Deaf-Blind Users
  """
 
- import gradio as gr
- import speech_recognition as sr
-
- # Prevent mic usage in headless mode
- recognizer = sr.Recognizer()
- microphone = None  # Disable default mic
-
- import pyttsx3
- import threading
- import time
- import json
- import tempfile
  import os
  import logging
  from datetime import datetime
  from pathlib import Path
  import torch
  from transformers import pipeline
- import cv2
- import numpy as np
- import requests
- from typing import Dict, List, Optional, Tuple
 
  # Configure logging
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  logger = logging.getLogger(__name__)
 
  class ProductionVoiceBridge:
      """
      Production-grade universal communication system for all disabilities
      """
-
-     def __init__(self):
          self.setup_directories()
          self.load_config()
-         self.initialize_engines()
          self.current_mode = "universal"
          self.user_preferences = {}
          self.conversation_history = []
          self.emergency_contacts = []
-
      def setup_directories(self):
          """Create necessary directories for production"""
          Path("data/conversations").mkdir(parents=True, exist_ok=True)
          Path("data/emergency").mkdir(parents=True, exist_ok=True)
          Path("data/user_profiles").mkdir(parents=True, exist_ok=True)
-
      def load_config(self):
          """Load production configuration"""
          self.config = {
@@ -66,103 +86,142 @@ class ProductionVoiceBridge:
                  "error": [100, 50, 100, 50, 100]
              }
          }
-
      def initialize_engines(self):
          """Initialize all AI engines and hardware interfaces"""
          try:
-             # Text-to-Speech Engine
-             self.tts_engine = pyttsx3.init()
-             voices = self.tts_engine.getProperty('voices')
-             self.tts_engine.setProperty('voice', voices[0].id)
-             self.tts_engine.setProperty('rate', 160)
-             self.tts_engine.setProperty('volume', 0.8)
-
-             # Speech Recognition
-             self.recognizer = sr.Recognizer()
-             self.microphone = sr.Microphone()
-             import speech_recognition as sr
-
-             with self.microphone as source:
-                 self.recognizer.adjust_for_ambient_noise(source, duration=1)
-
-             # AI Models with error handling
              self.load_ai_models()
-
              # Emergency system
              self.emergency_mode = False
              self.last_emergency_check = time.time()
-
-             logger.info("All engines initialized successfully")
-
          except Exception as e:
              logger.error(f"Engine initialization failed: {e}")
              raise
-
      def load_ai_models(self):
-         """Load AI models with fallbacks"""
          try:
              self.speech_to_text_model = pipeline(
                  "automatic-speech-recognition",
                  model="openai/whisper-base",
-                 device=-1  # CPU for reliability
              )
          except Exception as e:
-             logger.warning(f"Whisper model failed, using fallback: {e}")
              self.speech_to_text_model = None
-
          try:
              self.image_caption_model = pipeline(
                  "image-to-text",
                  model="Salesforce/blip-image-captioning-base",
                  device=-1
              )
          except Exception as e:
-             logger.warning(f"BLIP model failed, using fallback: {e}")
              self.image_caption_model = None
-
      # ==================== UNIVERSAL MODE ====================
-
      def universal_communication(self, input_data: dict) -> dict:
          """
          Universal communication handler that adapts to any input type
          """
          try:
              input_type = input_data.get('type', 'voice')
-
              if input_type == 'voice' and input_data.get('audio'):
                  return self.handle_voice_input(input_data['audio'])
-
              elif input_type == 'text' and input_data.get('text'):
                  return self.handle_text_input(input_data['text'])
-
              elif input_type == 'image' and input_data.get('image'):
                  return self.handle_image_input(input_data['image'])
-
              elif input_type == 'command':
                  return self.handle_system_command(input_data.get('command', ''))
-
              else:
                  return self.create_response(
                      "Please provide voice, text, or image input",
                      "error"
                  )
-
          except Exception as e:
-             logger.error(f"Universal communication error: {e}")
              return self.create_response(
                  "System error. Please try again or use emergency mode.",
                  "error"
              )
-
      def handle_voice_input(self, audio_path: str) -> dict:
          """Process voice input for deaf users and general transcription"""
          try:
-             # Convert speech to text
-             if self.speech_to_text_model:
-                 transcript = self.speech_to_text_model(audio_path)["text"]
              else:
                  transcript = self.fallback_speech_to_text(audio_path)
-
              # Check for emergency keywords
              if self.detect_emergency_keywords(transcript):
                  emergency_result = self.trigger_emergency_mode("voice_triggered")
@@ -172,27 +231,27 @@ class ProductionVoiceBridge:
                      audio=emergency_result.get('audio'),
                      visual_alert="🔴 EMERGENCY ACTIVATED"
                  )
-
              # Check for system commands
              if self.is_system_command(transcript):
                  return self.handle_system_command(transcript)
-
              # Regular communication
              self.add_to_conversation("User", transcript)
-
              return self.create_response(
                  transcript,
                  "transcription",
                  visual_alert=f"💬 New message: {transcript[:50]}..."
              )
-
          except Exception as e:
-             logger.error(f"Voice input error: {e}")
              return self.create_response(
                  "Could not process audio. Please try again.",
                  "error"
              )
-
      def handle_text_input(self, text: str) -> dict:
          """Process text input for non-verbal users"""
          try:
@@ -205,54 +264,65 @@ class ProductionVoiceBridge:
                      audio=emergency_result.get('audio'),
                      visual_alert="🔴 EMERGENCY"
                  )
-
              # Convert to speech
              audio_path = self.text_to_speech(text)
-
              self.add_to_conversation("User", text, "spoken")
-
              return self.create_response(
                  text,
                  "communication",
                  audio=audio_path,
                  visual_alert=f"🗣️ Speaking: {text[:30]}..."
              )
-
          except Exception as e:
-             logger.error(f"Text input error: {e}")
              return self.create_response(
                  "Could not process text. Please try again.",
                  "error"
              )
-
      def handle_image_input(self, image_path: str) -> dict:
          """Process image input for blind users"""
          try:
              if not self.image_caption_model:
                  description = "I see an image but cannot describe it in detail right now."
              else:
-                 description = self.image_caption_model(image_path)[0]['generated_text']
-                 # Enhance description
              description = self.enhance_scene_description(description)
-
              # Convert description to speech
              audio_path = self.text_to_speech(description)
-
              return self.create_response(
                  description,
                  "scene_description",
                  audio=audio_path
              )
-
          except Exception as e:
-             logger.error(f"Image input error: {e}")
              return self.create_response(
                  "Could not process image. Please try again.",
                  "error"
              )
-
      # ==================== DISABILITY-SPECIFIC MODES ====================
-
      def blind_mode(self, command: str = None, image_path: str = None) -> dict:
          """Voice-first interface for blind users"""
          if not command and not image_path:
@@ -261,10 +331,10 @@ class ProductionVoiceBridge:
                  "'read text' for text recognition, or 'help' for options."
              )
              return self.create_response(welcome_msg, "system", audio=self.text_to_speech(welcome_msg))
-
          if command:
              command = command.lower()
-
              if 'describe' in command or 'scene' in command or image_path:
                  if image_path:
                      return self.handle_image_input(image_path)
@@ -273,13 +343,13 @@ class ProductionVoiceBridge:
                          "Please capture an image using the camera",
                          "instruction"
                      )
-
              elif 'read' in command or 'text' in command:
                  return self.create_response(
                      "Please capture an image containing text",
                      "instruction"
                  )
-
              elif 'navigate' in command or 'direction' in command:
                  guidance = "Navigation assistance: Move forward carefully. Obstacle detection active."
                  return self.create_response(
@@ -287,41 +357,41 @@ class ProductionVoiceBridge:
                      "navigation",
                      audio=self.text_to_speech(guidance)
                  )
-
              elif 'help' in command:
-                 help_text = """
-                 Blind Mode Commands:
-                 • "Describe scene" - Describe surroundings using camera
-                 • "Read text" - Read text from images
-                 • "Navigate" - Get walking directions
-                 • "Emergency" - Immediate assistance
-                 • "Change mode" - Switch accessibility mode
-                 """
                  return self.create_response(help_text, "help", audio=self.text_to_speech(help_text))
-
              else:
                  response = "Command not recognized. Say 'help' for options."
                  return self.create_response(response, "error", audio=self.text_to_speech(response))
-
      def deaf_mode(self, audio_input: str = None, continuous: bool = False) -> dict:
          """Visual interface for deaf users with real-time transcription"""
          if audio_input:
              result = self.handle_voice_input(audio_input)
-
              # Add visual enhancements for deaf users
-             if result['type'] == 'transcription':
-                 result['visual_alert'] = f"👂 TRANSCRIPTION: {result['text'][:100]}..."
-
              # Check for important sounds
              if self.detect_important_sounds(audio_input):
                  result['visual_alert'] = "🔔 IMPORTANT SOUND DETECTED! " + result.get('visual_alert', '')
                  result['haptic_feedback'] = self.config['haptic_patterns']['notification']
-
              return result
          else:
              status = "Deaf mode active. Real-time transcription ready. Visual alerts enabled."
              return self.create_response(status, "system", visual_alert="👂 Deaf Mode Active")
-
      def non_verbal_mode(self, text: str = None, preset: str = None) -> dict:
          """Text-to-speech communication for non-verbal users"""
          if preset:
@@ -340,11 +410,11 @@ class ProductionVoiceBridge:
              text_to_speak = phrases.get(preset, preset)
          else:
              text_to_speak = text or "I need help"
-
          audio_path = self.text_to_speech(text_to_speak)
-
          self.add_to_conversation("User", text_to_speak, "spoken")
-
          return self.create_response(
              text_to_speak,
              "communication",
@@ -352,7 +422,7 @@ class ProductionVoiceBridge:
              visual_alert=f"🗣️ Speaking: {text_to_speak}",
              haptic_feedback=self.config['haptic_patterns']['confirmation']
          )
-
      def deaf_blind_mode(self, input_text: str = None, output_format: str = "haptic") -> dict:
          """Tactile communication for deaf-blind users"""
          if input_text:
@@ -374,14 +444,14 @@ class ProductionVoiceBridge:
          else:
              status = "Deaf-blind mode active. Use text input with haptic or braille output."
              return self.create_response(status, "system")
-
      # ==================== EMERGENCY SYSTEM ====================
-
      def trigger_emergency_mode(self, trigger_source: str = "manual") -> dict:
          """Activate emergency response system"""
          self.emergency_mode = True
          timestamp = datetime.now().isoformat()
-
          emergency_data = {
              "status": "EMERGENCY_ACTIVATED",
              "timestamp": timestamp,
@@ -390,7 +460,7 @@ class ProductionVoiceBridge:
              "actions_taken": [],
              "contacts_notified": []
          }
-
          # Notify emergency contacts
          for contact in self.emergency_contacts:
              try:
@@ -398,65 +468,86 @@ class ProductionVoiceBridge:
                  emergency_data["contacts_notified"].append(contact)
              except Exception as e:
                  logger.error(f"Failed to notify {contact}: {e}")
-
          # Create emergency audio message
          emergency_audio = self.text_to_speech(emergency_data["message"])
          emergency_data["audio"] = emergency_audio
-
          # Log emergency
          self.log_emergency(emergency_data)
-
          return emergency_data
-
      def notify_emergency_contact(self, contact: str, emergency_data: dict):
          """Notify emergency contact (simplified - in production would use SMS/email)"""
          logger.info(f"EMERGENCY NOTIFICATION to {contact}: {emergency_data['message']}")
          # In production: send SMS, email, or push notification
-
      # ==================== CORE ENGINE METHODS ====================
-
      def text_to_speech(self, text: str) -> str:
-         """Convert text to speech and return audio file path"""
          try:
              with tempfile.NamedTemporaryFile(delete=False, suffix='.wav', dir='data/') as tmp_file:
-                 self.tts_engine.save_to_file(text, tmp_file.name)
-                 self.tts_engine.runAndWait()
-                 return tmp_file.name
          except Exception as e:
-             logger.error(f"TTS error: {e}")
              return None
-
      def fallback_speech_to_text(self, audio_path: str) -> str:
          """Fallback speech recognition using speech_recognition library"""
          try:
              with sr.AudioFile(audio_path) as source:
                  audio = self.recognizer.record(source)
-                 return self.recognizer.recognize_google(audio)
          except Exception as e:
-             return f"Could not understand audio: {str(e)}"
-
      def detect_emergency_keywords(self, text: str) -> bool:
          """Detect emergency keywords in text"""
          emergency_words = [
-             'emergency', 'help', 'urgent', 'danger', 'dangerous',
              'accident', 'injured', 'hurt', 'pain', 'bleeding',
              'fire', 'police', 'ambulance', 'hospital', '911',
              'save me', 'help me', 'i need help'
          ]
          text_lower = text.lower()
          return any(word in text_lower for word in emergency_words)
-
      def detect_important_sounds(self, audio_path: str) -> bool:
-         """Detect important environmental sounds"""
-         # Simplified - in production would use audio analysis
-         # For now, use speech recognition to check for important words
          try:
              transcript = self.fallback_speech_to_text(audio_path)
              important_words = ['help', 'emergency', 'fire', 'watch out', 'danger']
              return any(word in transcript.lower() for word in important_words)
-         except:
              return False
-
      def text_to_vibration_pattern(self, text: str) -> List[int]:
          """Convert text to vibration pattern (simplified Morse code)"""
          morse_code = {
@@ -469,7 +560,7 @@ class ProductionVoiceBridge:
              '6': '-....', '7': '--...', '8': '---..', '9': '----.', '0': '-----',
              ' ': ' '
          }
-
          pattern = []
          for char in text.upper():
              if char in morse_code:
@@ -481,9 +572,9 @@ class ProductionVoiceBridge:
                          pattern.extend([300])  # Long vibration
                      pattern.extend([50])  # Gap between symbols
                  pattern.extend([200])  # Gap between letters
-
          return pattern
-
      def text_to_braille(self, text: str) -> str:
          """Convert text to braille unicode characters"""
          braille_map = {
@@ -493,11 +584,13 @@ class ProductionVoiceBridge:
              '1': '⠁', '2': '⠃', '3': '⠉', '4': '⠙', '5': '⠑', '6': '⠋', '7': '⠛', '8': '⠓', '9': '⠊', '0': '⠚',
              ' ': ' ', '.': '⠲', ',': '⠂', '!': '⠖', '?': '⠦'
          }
-
          return ''.join(braille_map.get(char.upper(), '?') for char in text)
-
      def enhance_scene_description(self, description: str) -> str:
          """Enhance AI-generated scene descriptions"""
          enhancements = {
              "indoor": "This appears to be an indoor setting. ",
              "outdoor": "This appears to be an outdoor area. ",
@@ -505,44 +598,46 @@ class ProductionVoiceBridge:
              "text": "There is text that could be read. ",
              "obstacle": "Be careful of potential obstacles. ",
          }
-
          enhanced = description
          desc_lower = description.lower()
-
          if any(word in desc_lower for word in ['room', 'indoor', 'inside', 'wall']):
              enhanced = enhancements["indoor"] + enhanced
          elif any(word in desc_lower for word in ['outdoor', 'outside', 'sky', 'tree']):
              enhanced = enhancements["outdoor"] + enhanced
-
          if any(word in desc_lower for word in ['person', 'people', 'man', 'woman']):
              enhanced = enhancements["people"] + enhanced
-
          if any(word in desc_lower for word in ['sign', 'text', 'letter', 'word']):
              enhanced = enhancements["text"] + enhanced
-
          return enhanced
-
      def is_system_command(self, text: str) -> bool:
          """Check if text contains system commands"""
          commands = ['mode', 'help', 'emergency', 'stop', 'cancel', 'reset']
          return any(command in text.lower() for command in commands)
-
      def handle_system_command(self, command: str) -> dict:
          """Handle system control commands"""
-         command = command.lower()
-
          if 'blind' in command:
              self.current_mode = "blind"
              response = "Blind mode activated. Voice navigation enabled."
          elif 'deaf' in command:
              self.current_mode = "deaf"
              response = "Deaf mode activated. Visual alerts enabled."
          elif 'non verbal' in command or 'mute' in command:
              self.current_mode = "non_verbal"
              response = "Non-verbal mode activated. Text-to-speech ready."
-         elif 'deaf blind' in command:
-             self.current_mode = "deaf_blind"
-             response = "Deaf-blind mode activated. Haptic feedback enabled."
          elif 'universal' in command:
              self.current_mode = "universal"
              response = "Universal mode activated."
@@ -550,9 +645,9 @@ class ProductionVoiceBridge:
              return self.trigger_emergency_mode("voice_command")
          else:
              response = f"Current mode: {self.current_mode}. Say 'help' for options."
-
          return self.create_response(response, "system", audio=self.text_to_speech(response))
-
      def add_to_conversation(self, speaker: str, text: str, message_type: str = "text"):
          """Add message to conversation history"""
          self.conversation_history.append({
@@ -561,11 +656,11 @@ class ProductionVoiceBridge:
              "text": text,
              "type": message_type
          })
-
          # Keep only last 100 messages
          if len(self.conversation_history) > 100:
              self.conversation_history = self.conversation_history[-100:]
-
      def log_emergency(self, emergency_data: dict):
          """Log emergency event"""
          try:
@@ -574,7 +669,7 @@ class ProductionVoiceBridge:
              json.dump(emergency_data, f, indent=2)
          except Exception as e:
              logger.error(f"Failed to log emergency: {e}")
-
      def create_response(self, text: str, response_type: str, **kwargs) -> dict:
          """Create standardized response object"""
          return {
@@ -591,352 +686,131 @@ class ProductionVoiceBridge:
 
  # ==================== GRADIO INTERFACE ====================
 
- def create_production_interface():
      """Create production-ready Gradio interface"""
-
-     # Initialize the system
-     voice_bridge = ProductionVoiceBridge()
-
-     # Custom CSS for accessibility
      custom_css = """
-     :root {
-         --primary-color: #2563eb;
-         --danger-color: #dc2626;
-         --success-color: #16a34a;
-         --warning-color: #d97706;
-     }
-
-     .accessible-btn {
-         min-height: 48px !important;
-         min-width: 48px !important;
-         padding: 12px 24px !important;
-         font-size: 16px !important;
-         margin: 4px !important;
-         border: 2px solid !important;
-     }
-
-     .emergency-btn {
-         background: linear-gradient(45deg, #dc2626, #ef4444) !important;
-         color: white !important;
-         font-weight: bold !important;
-         font-size: 20px !important;
-         animation: pulse 2s infinite !important;
-     }
-
-     @keyframes pulse {
-         0% { transform: scale(1); opacity: 1; }
-         50% { transform: scale(1.05); opacity: 0.9; }
-         100% { transform: scale(1); opacity: 1; }
-     }
-
-     .high-contrast {
-         filter: contrast(200%) !important;
-     }
-
-     .large-text { font-size: 18px !important; }
-     .x-large-text { font-size: 22px !important; }
-
-     .sr-only {
-         position: absolute !important;
-         width: 1px !important;
-         height: 1px !important;
-         padding: 0 !important;
-         margin: -1px !important;
-         overflow: hidden !important;
-         clip: rect(0, 0, 0, 0) !important;
-         white-space: nowrap !important;
-         border: 0 !important;
-     }
-
-     @media (max-width: 768px) {
-         .container {
-             padding: 8px !important;
-         }
-         .accessible-btn {
-             min-height: 54px !important;
-             min-width: 54px !important;
-             font-size: 18px !important;
-         }
-     }
      """
-
- with gr.Blocks(
664
- css=custom_css,
665
- theme=gr.themes.Soft(primary_hue="blue"),
666
- title="VoiceBridge AI - Universal Communication",
667
- head='<meta name="description" content="Accessibility communication platform for blind, deaf, non-verbal, and deaf-blind users">'
668
- ) as demo:
669
-
670
- # Screen reader announcement area
671
- sr_announcement = gr.Textbox(
672
- label="Screen Reader Announcements",
673
- elem_id="sr-announcement",
674
- visible=False
675
- )
676
-
677
- gr.Markdown("""
678
- # 🎯 VoiceBridge AI - Universal Communication Platform
679
- **Production-Ready Accessibility Solution for All Disabilities**
680
-
681
- *Supporting: 👁️ Blind Users • 👂 Deaf Users • 🤐 Non-Verbal Users • 👁️👂 Deaf-Blind Users*
682
- """)
683
-
684
- # System Status Bar
685
- with gr.Row():
686
- system_status = gr.Textbox(
687
- label="System Status",
688
- value="✅ System Ready - VoiceBridge AI Initialized",
689
- interactive=False,
690
- max_lines=1
691
- )
692
- current_mode_display = gr.Textbox(
693
- label="Current Mode",
694
- value="universal",
695
- interactive=False,
696
- max_lines=1
697
- )
698
-
699
- # Emergency Section (Always Visible)
700
  with gr.Row():
701
- emergency_btn = gr.Button(
702
- "🚨 ACTIVATE EMERGENCY MODE",
703
- elem_classes=["accessible-btn", "emergency-btn", "x-large-text"],
704
- scale=2
705
- )
706
- emergency_contact_input = gr.Textbox(
707
- label="Emergency Contact (Email/Phone)",
708
- placeholder="Enter emergency contact information...",
709
- scale=1
710
- )
711
-
712
- # Mode Selection
713
  with gr.Row():
714
- mode_selector = gr.Radio(
715
- choices=[
716
- ("Universal", "universal"),
717
- ("Blind", "blind"),
718
- ("Deaf", "deaf"),
719
- ("Non-Verbal", "non_verbal"),
720
- ("Deaf-Blind", "deaf_blind")
721
- ],
722
- label="Accessibility Mode",
723
- value="universal",
724
- elem_id="mode-selector"
725
- )
726
-
727
- # Universal Communication Tab
728
- with gr.Tab("🌐 Universal Communication", id="universal"):
729
  with gr.Row():
730
- with gr.Column(scale=1):
731
- universal_audio = gr.Audio(
732
- label="🎤 Speak (Voice Input)",
733
- type="filepath",
734
- sources=["microphone"]
735
- )
736
- universal_text = gr.Textbox(
737
- label="⌨️ Type to Speak",
738
- placeholder="Enter text to be spoken aloud...",
739
- lines=3
740
- )
741
- universal_image = gr.Image(
742
- label="📷 Capture Scene",
743
- type="filepath",
744
- sources=["webcam", "upload"]
745
- )
746
-
747
- process_universal = gr.Button(
748
- "Process Input",
749
- elem_classes="accessible-btn",
750
- size="lg"
751
- )
752
-
753
- with gr.Column(scale=1):
754
- universal_output = gr.Textbox(
755
- label="Output",
756
- lines=6,
757
- max_lines=10
758
- )
759
- universal_audio_output = gr.Audio(
760
- label="Audio Output",
761
- type="filepath",
762
- interactive=False
763
- )
764
- universal_alert = gr.Textbox(
765
- label="Visual Alerts",
766
- visible=False
767
- )
768
-
769
- # Blind Assistance Tab
770
- with gr.Tab("👁️ Blind Assistance", id="blind"):
771
  with gr.Row():
772
- with gr.Column(scale=1):
773
- blind_audio = gr.Audio(
774
- label="Voice Commands",
775
- type="filepath",
776
- sources=["microphone"]
777
- )
778
- blind_commands = gr.Radio(
779
- choices=[
780
- "describe scene",
781
- "read text",
782
- "navigate",
783
- "help"
784
- ],
785
- label="Quick Commands",
786
- value="describe scene"
787
- )
788
- blind_image = gr.Image(
789
- label="Camera Feed",
790
- type="filepath",
791
- sources=["webcam", "upload"]
792
- )
793
-
794
- process_blind = gr.Button(
795
- "Execute Command",
796
- elem_classes="accessible-btn"
797
- )
798
-
799
- with gr.Column(scale=1):
800
- blind_output = gr.Textbox(
801
- label="Scene Description",
802
- lines=5
803
- )
804
- blind_audio_output = gr.Audio(
805
- label="Audio Description",
806
- type="filepath"
807
- )
808
-
809
- # Deaf Assistance Tab
810
- with gr.Tab("👂 Deaf Assistance", id="deaf"):
811
  with gr.Row():
812
- with gr.Column(scale=1):
813
- deaf_audio = gr.Audio(
814
- label="Audio to Transcribe",
815
- type="filepath",
816
- sources=["microphone", "upload"]
817
- )
818
- continuous_listening = gr.Checkbox(
819
- label="Continuous Listening Mode",
820
- value=False
821
- )
822
-
823
- process_deaf = gr.Button(
824
- "Transcribe Audio",
825
- elem_classes="accessible-btn"
826
- )
827
-
828
- with gr.Column(scale=1):
829
- deaf_output = gr.Textbox(
830
- label="Transcription",
831
- lines=6
832
- )
833
- deaf_alerts = gr.Textbox(
834
- label="Sound Alerts",
835
- lines=2
836
- )
837
-
838
- # Non-Verbal Communication Tab
839
- with gr.Tab("🤐 Non-Verbal Communication", id="non_verbal"):
840
  with gr.Row():
841
- with gr.Column(scale=1):
842
- preset_phrases = gr.Radio(
843
- choices=[
844
- "greeting", "help", "medical", "emergency",
845
- "thanks", "yes", "no", "pain", "lost", "bathroom"
846
- ],
847
- label="Quick Phrases",
848
- value="greeting"
849
- )
850
- custom_phrase = gr.Textbox(
851
- label="Custom Message",
852
- placeholder="Or type your own message...",
853
- lines=2
854
- )
855
-
856
- speak_btn = gr.Button(
857
- "Speak Message",
858
- elem_classes="accessible-btn",
859
- size="lg"
860
- )
861
-
862
- with gr.Column(scale=1):
863
- spoken_text = gr.Textbox(
864
- label="Message",
865
- lines=3
866
- )
867
- message_audio = gr.Audio(
868
- label="Spoken Audio",
869
- type="filepath"
870
- )
871
-
872
- # Deaf-Blind Communication Tab
873
- with gr.Tab("👁️👂 Deaf-Blind Communication", id="deaf_blind"):
874
  with gr.Row():
875
- with gr.Column(scale=1):
876
- tactile_input = gr.Textbox(
877
- label="Message to Convert",
878
- placeholder="Enter text for tactile communication...",
879
- lines=3
880
- )
881
- output_format = gr.Radio(
882
- choices=["haptic", "braille"],
883
- label="Output Format",
884
- value="haptic"
885
- )
886
-
887
- convert_btn = gr.Button(
888
- "Convert to Tactile",
889
- elem_classes="accessible-btn"
890
- )
891
-
892
- with gr.Column(scale=1):
893
- braille_output = gr.Textbox(
894
- label="Braille Output",
895
- lines=3
896
- )
897
- vibration_pattern = gr.Textbox(
898
- label="Vibration Pattern",
899
- lines=2
900
- )
901
-
902
- # Feedback and Settings
903
- with gr.Tab("⚙️ Settings & Feedback", id="settings"):
904
  with gr.Row():
905
- with gr.Column(scale=1):
906
- gr.Markdown("### 🔧 Accessibility Settings")
907
  high_contrast = gr.Checkbox(label="High Contrast Mode", value=False)
908
  large_text = gr.Checkbox(label="Large Text Mode", value=False)
909
  voice_navigation = gr.Checkbox(label="Voice Navigation", value=True)
910
-
911
- gr.Markdown("### 📧 Feedback")
912
  feedback_email = gr.Textbox(label="Your Email (optional)")
913
- feedback_message = gr.Textbox(
914
- label="Feedback & Suggestions",
915
- placeholder="Help us improve VoiceBridge AI...",
916
- lines=4
917
- )
918
  submit_feedback = gr.Button("Submit Feedback", elem_classes="accessible-btn")
919
  feedback_status = gr.Textbox(label="Status", interactive=False)
920
-
921
- with gr.Column(scale=1):
922
- gr.Markdown("### 📊 System Information")
923
- conversation_history = gr.Textbox(
924
- label="Recent Conversation",
925
- lines=8,
926
- max_lines=10
927
- )
928
  clear_history = gr.Button("Clear History", elem_classes="accessible-btn")
929
  export_data = gr.Button("Export Data", elem_classes="accessible-btn")
930
-
931
- # ==================== EVENT HANDLERS ====================
932
-
933
  def handle_mode_change(mode):
934
  voice_bridge.current_mode = mode
935
  status_msg = f"Mode changed to: {mode}"
936
  voice_bridge.add_to_conversation("System", status_msg)
937
- return status_msg, status_msg # For both status displays
938
-
939
  def handle_universal_input(audio, text, image, mode):
 
940
  if audio:
941
  input_data = {'type': 'voice', 'audio': audio}
942
  elif text:
@@ -944,11 +818,10 @@ def create_production_interface():
              elif image:
                  input_data = {'type': 'image', 'image': image}
              else:
-                 return "Please provide input", None, None
-
              result = voice_bridge.universal_communication(input_data)
-             return result['text'], result.get('audio'), result.get('visual_alert', '')
-
          def handle_blind_assistance(audio, command, image):
              if audio:
                  transcript = voice_bridge.fallback_speech_to_text(audio)
@@ -957,124 +830,93 @@ def create_production_interface():
                  result = voice_bridge.blind_mode(command, image)
              else:
                  result = voice_bridge.blind_mode(command)
-
-             return result['text'], result.get('audio')
-
          def handle_deaf_assistance(audio, continuous):
              result = voice_bridge.deaf_mode(audio, continuous)
-             return result['text'], result.get('visual_alert', 'No important sounds detected')
-
          def handle_non_verbal(preset, custom):
-             text_to_speak = custom if custom else None
              result = voice_bridge.non_verbal_mode(text_to_speak, preset)
-             return result['text'], result.get('audio')
-
-         def handle_deaf_blind(input_text, output_format):
-             result = voice_bridge.deaf_blind_mode(input_text, output_format)
-             braille = result.get('braille', '')
-             pattern = str(result.get('haptic_feedback', []))
-             return result['text'], braille, pattern
-
-         def handle_emergency(contact):
-             if contact:
-                 voice_bridge.emergency_contacts.append(contact)
-             result = voice_bridge.trigger_emergency_mode("manual")
-             return result['message'], result.get('audio')
-
          def handle_feedback(email, message):
-             if not message.strip():
-                 return "Please enter feedback message"
-
-             # In production, this would save to database/send email
-             feedback_data = {
-                 'timestamp': datetime.now().isoformat(),
-                 'email': email,
-                 'message': message,
-                 'mode': voice_bridge.current_mode
              }
-
-             try:
-                 # Save feedback locally
-                 with open('data/feedback.json', 'a') as f:
-                     f.write(json.dumps(feedback_data) + '\n')
-
-                 return "✅ Thank you for your feedback! We'll review it soon. Contact: Naila.Rais@msftcommunity.com"
-             except Exception as e:
-                 return f"❌ Could not save feedback: {str(e)}"
-
-         # Connect event handlers
-         mode_selector.change(
-             handle_mode_change,
-             inputs=mode_selector,
-             outputs=[system_status, current_mode_display]
-         )
-
-         process_universal.click(
-             handle_universal_input,
-             inputs=[universal_audio, universal_text, universal_image, mode_selector],
-             outputs=[universal_output, universal_audio_output, universal_alert]
-         )
-
-         process_blind.click(
-             handle_blind_assistance,
-             inputs=[blind_audio, blind_commands, blind_image],
-             outputs=[blind_output, blind_audio_output]
-         )
-
-         process_deaf.click(
-             handle_deaf_assistance,
-             inputs=[deaf_audio, continuous_listening],
-             outputs=[deaf_output, deaf_alerts]
-         )
-
-         speak_btn.click(
-             handle_non_verbal,
-             inputs=[preset_phrases, custom_phrase],
-             outputs=[spoken_text, message_audio]
-         )
-
-         convert_btn.click(
-             handle_deaf_blind,
-             inputs=[tactile_input, output_format],
-             outputs=[tactile_input, braille_output, vibration_pattern]
-         )
-
-         emergency_btn.click(
-             handle_emergency,
-             inputs=emergency_contact_input,
-             outputs=[system_status, universal_audio_output]
-         )
-
-         submit_feedback.click(
-             handle_feedback,
-             inputs=[feedback_email, feedback_message],
-             outputs=feedback_status
-         )
-
-         # Initialize system
-         demo.load(
-             fn=lambda: ("System Ready - VoiceBridge AI Initialized", "universal"),
-             outputs=[system_status, current_mode_display]
-         )
-
      return demo
 
- # Production deployment
  if __name__ == "__main__":
-     try:
-         print("🚀 Starting VoiceBridge AI Production Server...")
-         print("📧 Support & Feedback: Naila.Rais@msftcommunity.com")
-         print("🌐 Access the app at the URL provided below")
-
-         demo = create_production_interface()
-         demo.launch(
-             server_name="0.0.0.0",
-             server_port=7860,
-             share=True,
-             debug=False,
-             show_error=True,
-             auth=("NailaR", "voicebridge2025") if os.getenv('PRODUCTION') else None
-         )
-     except Exception as e:
-         logger.error(f"Failed to start production server: {e}")
-         print(f"❌ Startup failed: {e}")

app.py (after changes)

  Supporting: Blind, Deaf, Non-Verbal, Deaf-Blind Users
  """
 
  import os
  import logging
+ import json
+ import tempfile
+ import time
  from datetime import datetime
  from pathlib import Path
+ from typing import List
+
+ import gradio as gr
+ import speech_recognition as sr
+ import pyttsx3
  import torch
  from transformers import pipeline
+
+ # Optional imports (may fail gracefully in some environments)
+ try:
+     import cv2
+     import numpy as np
+ except Exception:
+     cv2 = None
+     np = None
 
  # Configure logging
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  logger = logging.getLogger(__name__)
 
+
  class ProductionVoiceBridge:
      """
      Production-grade universal communication system for all disabilities
      """
+
+     def __init__(self, allow_microphone: bool = False):
+         """
+         If allow_microphone is False (the default), we do not force
+         initialization of the system microphone; this avoids failures in
+         headless environments such as Hugging Face Spaces.
+         """
+         self.allow_microphone = allow_microphone
          self.setup_directories()
          self.load_config()
          self.current_mode = "universal"
          self.user_preferences = {}
          self.conversation_history = []
          self.emergency_contacts = []
+         # initialize engines in try so failures are caught
+         self._init_defaults()
+         self.initialize_engines()
+
+     def _init_defaults(self):
+         # placeholders so other methods can rely on attributes
+         self.tts_engine = None
+         self.recognizer = None
+         self.microphone = None
+         self.speech_to_text_model = None
+         self.image_caption_model = None
+         self.emergency_mode = False
+         self.last_emergency_check = time.time()
+
      def setup_directories(self):
          """Create necessary directories for production"""
+         Path("data").mkdir(parents=True, exist_ok=True)
          Path("data/conversations").mkdir(parents=True, exist_ok=True)
          Path("data/emergency").mkdir(parents=True, exist_ok=True)
          Path("data/user_profiles").mkdir(parents=True, exist_ok=True)
+         Path("data/feedback").mkdir(parents=True, exist_ok=True)
+
      def load_config(self):
          """Load production configuration"""
          self.config = {
 
                  "error": [100, 50, 100, 50, 100]
              }
          }
+
      def initialize_engines(self):
          """Initialize all AI engines and hardware interfaces"""
          try:
+             # Text-to-Speech Engine (pyttsx3 is local and usually safe)
+             try:
+                 self.tts_engine = pyttsx3.init()
+                 voices = self.tts_engine.getProperty('voices') or []
+                 if voices:
+                     self.tts_engine.setProperty('voice', voices[0].id)
+                 self.tts_engine.setProperty('rate', 160)
+                 self.tts_engine.setProperty('volume', 0.8)
+             except Exception as e:
+                 logger.warning(f"TTS engine init failed: {e}")
+                 self.tts_engine = None
+
+             # Speech Recognition (recognizer only; microphone optional)
+             try:
+                 self.recognizer = sr.Recognizer()
+             except Exception as e:
+                 logger.warning(f"SpeechRecognition init failed: {e}")
+                 self.recognizer = None
+
+             # Only attempt to initialize Microphone if explicitly allowed
+             if self.allow_microphone:
+                 try:
+                     self.microphone = sr.Microphone()
+                     # adjust_for_ambient_noise can fail in headless; guard it
+                     try:
+                         with self.microphone as source:
+                             self.recognizer.adjust_for_ambient_noise(source, duration=1)
+                     except Exception as e:
+                         logger.warning(f"Ambient noise adjust failed: {e}")
+                 except Exception as e:
+                     logger.warning(f"Microphone not available: {e}")
+                     self.microphone = None
+             else:
+                 # keep microphone None in headless mode to avoid exceptions
+                 self.microphone = None
+
+             # AI Models with error handling (transformers pipelines)
              self.load_ai_models()
+
              # Emergency system
              self.emergency_mode = False
              self.last_emergency_check = time.time()
+
+             logger.info("All engines initialized (best-effort)")
+
          except Exception as e:
+             # Log full traceback then re-raise to make failure obvious in dev mode
              logger.error(f"Engine initialization failed: {e}")
+             logger.debug("Traceback:", exc_info=True)
              raise
+
      def load_ai_models(self):
+         """Load AI models with fallbacks. These can be heavy; fail gracefully."""
+         # Whisper (ASR) - optional; if not available will fall back to sr.Recognizer
          try:
+             # device selection: if CUDA available, use it, else CPU
+             device = 0 if torch.cuda.is_available() else -1
              self.speech_to_text_model = pipeline(
                  "automatic-speech-recognition",
                  model="openai/whisper-base",
+                 device=device
              )
+             logger.info("Whisper ASR model loaded")
          except Exception as e:
+             logger.warning(f"Whisper model failed to load: {e}")
              self.speech_to_text_model = None
+
+         # Image captioning - optional
          try:
              self.image_caption_model = pipeline(
                  "image-to-text",
                  model="Salesforce/blip-image-captioning-base",
                  device=-1
              )
+             logger.info("Image caption model loaded")
          except Exception as e:
+             logger.warning(f"Image caption model failed to load: {e}")
              self.image_caption_model = None
+
      # ==================== UNIVERSAL MODE ====================
+
      def universal_communication(self, input_data: dict) -> dict:
          """
          Universal communication handler that adapts to any input type
          """
          try:
              input_type = input_data.get('type', 'voice')
+
              if input_type == 'voice' and input_data.get('audio'):
                  return self.handle_voice_input(input_data['audio'])
+
              elif input_type == 'text' and input_data.get('text'):
                  return self.handle_text_input(input_data['text'])
+
              elif input_type == 'image' and input_data.get('image'):
                  return self.handle_image_input(input_data['image'])
+
              elif input_type == 'command':
                  return self.handle_system_command(input_data.get('command', ''))
+
              else:
                  return self.create_response(
                      "Please provide voice, text, or image input",
                      "error"
                  )
+
          except Exception as e:
+             logger.error(f"Universal communication error: {e}", exc_info=True)
              return self.create_response(
                  "System error. Please try again or use emergency mode.",
                  "error"
              )
+
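For orientation, this is roughly how the dispatcher above gets driven from a handler; the dict keys mirror the branches shown, while the instance and inputs here are illustrative only:

# Hypothetical caller sketch (headless instance, no microphone)
bridge = ProductionVoiceBridge(allow_microphone=False)
result = bridge.universal_communication({'type': 'text', 'text': 'Hello there'})
# create_response() returns a plain dict, so callers can unpack it safely
print(result['text'], result.get('audio'), result.get('visual_alert', ''))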
      def handle_voice_input(self, audio_path: str) -> dict:
          """Process voice input for deaf users and general transcription"""
          try:
+             transcript = ""
+             # If Hugging Face/transformers ASR available and audio path exists, try it
+             if self.speech_to_text_model and audio_path:
+                 try:
+                     # transformers pipelines accept file path
+                     out = self.speech_to_text_model(audio_path)
+                     transcript = out.get("text", "") if isinstance(out, dict) else str(out)
+                 except Exception as e:
+                     logger.warning(f"HF ASR failed, falling back to sr: {e}")
+                     transcript = self.fallback_speech_to_text(audio_path)
              else:
                  transcript = self.fallback_speech_to_text(audio_path)
+
+             if not transcript:
+                 transcript = ""
+
              # Check for emergency keywords
              if self.detect_emergency_keywords(transcript):
                  emergency_result = self.trigger_emergency_mode("voice_triggered")
 
                      audio=emergency_result.get('audio'),
                      visual_alert="🔴 EMERGENCY ACTIVATED"
                  )
+
              # Check for system commands
              if self.is_system_command(transcript):
                  return self.handle_system_command(transcript)
+
              # Regular communication
              self.add_to_conversation("User", transcript)
+
              return self.create_response(
                  transcript,
                  "transcription",
                  visual_alert=f"💬 New message: {transcript[:50]}..."
              )
+
          except Exception as e:
+             logger.error(f"Voice input error: {e}", exc_info=True)
              return self.create_response(
                  "Could not process audio. Please try again.",
                  "error"
              )
+
      def handle_text_input(self, text: str) -> dict:
          """Process text input for non-verbal users"""
          try:
 
                      audio=emergency_result.get('audio'),
                      visual_alert="🔴 EMERGENCY"
                  )
+
              # Convert to speech
              audio_path = self.text_to_speech(text)
+
              self.add_to_conversation("User", text, "spoken")
+
              return self.create_response(
                  text,
                  "communication",
                  audio=audio_path,
                  visual_alert=f"🗣️ Speaking: {text[:30]}..."
              )
+
          except Exception as e:
+             logger.error(f"Text input error: {e}", exc_info=True)
              return self.create_response(
                  "Could not process text. Please try again.",
                  "error"
              )
+
      def handle_image_input(self, image_path: str) -> dict:
          """Process image input for blind users"""
          try:
              if not self.image_caption_model:
                  description = "I see an image but cannot describe it in detail right now."
              else:
+                 try:
+                     caption_out = self.image_caption_model(image_path)
+                     # pipeline returns list or dict depending on version
+                     if isinstance(caption_out, list) and caption_out:
+                         description = caption_out[0].get('generated_text', '')
+                     elif isinstance(caption_out, dict):
+                         description = caption_out.get('generated_text', '') or caption_out.get('text', '')
+                     else:
+                         description = str(caption_out)
+                 except Exception as e:
+                     logger.warning(f"Image captioning failed: {e}")
+                     description = "I see an image but cannot describe it in detail right now."
+
              description = self.enhance_scene_description(description)
+
              # Convert description to speech
              audio_path = self.text_to_speech(description)
+
              return self.create_response(
                  description,
                  "scene_description",
                  audio=audio_path
              )
+
          except Exception as e:
+             logger.error(f"Image input error: {e}", exc_info=True)
              return self.create_response(
                  "Could not process image. Please try again.",
                  "error"
              )
+
      # ==================== DISABILITY-SPECIFIC MODES ====================
+
      def blind_mode(self, command: str = None, image_path: str = None) -> dict:
          """Voice-first interface for blind users"""
          if not command and not image_path:
 
                  "'read text' for text recognition, or 'help' for options."
              )
              return self.create_response(welcome_msg, "system", audio=self.text_to_speech(welcome_msg))
+
          if command:
              command = command.lower()
+
              if 'describe' in command or 'scene' in command or image_path:
                  if image_path:
                      return self.handle_image_input(image_path)
 
                          "Please capture an image using the camera",
                          "instruction"
                      )
+
              elif 'read' in command or 'text' in command:
                  return self.create_response(
                      "Please capture an image containing text",
                      "instruction"
                  )
+
              elif 'navigate' in command or 'direction' in command:
                  guidance = "Navigation assistance: Move forward carefully. Obstacle detection active."
                  return self.create_response(
 
                      "navigation",
                      audio=self.text_to_speech(guidance)
                  )
+
              elif 'help' in command:
+                 help_text = (
+                     "Blind Mode Commands:\n"
+                     "\"Describe scene\" - Describe surroundings using camera\n"
+                     "\"Read text\" - Read text from images\n"
+                     "\"Navigate\" - Get walking directions\n"
+                     "\"Emergency\" - Immediate assistance\n"
+                     "\"Change mode\" - Switch accessibility mode\n"
+                 )
                  return self.create_response(help_text, "help", audio=self.text_to_speech(help_text))
+
              else:
                  response = "Command not recognized. Say 'help' for options."
                  return self.create_response(response, "error", audio=self.text_to_speech(response))
+
      def deaf_mode(self, audio_input: str = None, continuous: bool = False) -> dict:
          """Visual interface for deaf users with real-time transcription"""
          if audio_input:
              result = self.handle_voice_input(audio_input)
+
              # Add visual enhancements for deaf users
+             if result.get('type') == 'transcription':
+                 result['visual_alert'] = f"👂 TRANSCRIPTION: {result.get('text', '')[:100]}..."
+
              # Check for important sounds
              if self.detect_important_sounds(audio_input):
                  result['visual_alert'] = "🔔 IMPORTANT SOUND DETECTED! " + result.get('visual_alert', '')
                  result['haptic_feedback'] = self.config['haptic_patterns']['notification']
+
              return result
          else:
              status = "Deaf mode active. Real-time transcription ready. Visual alerts enabled."
              return self.create_response(status, "system", visual_alert="👂 Deaf Mode Active")
+
      def non_verbal_mode(self, text: str = None, preset: str = None) -> dict:
          """Text-to-speech communication for non-verbal users"""
          if preset:
 
              text_to_speak = phrases.get(preset, preset)
          else:
              text_to_speak = text or "I need help"
+
          audio_path = self.text_to_speech(text_to_speak)
+
          self.add_to_conversation("User", text_to_speak, "spoken")
+
          return self.create_response(
              text_to_speak,
              "communication",
 
              visual_alert=f"🗣️ Speaking: {text_to_speak}",
              haptic_feedback=self.config['haptic_patterns']['confirmation']
          )
+
      def deaf_blind_mode(self, input_text: str = None, output_format: str = "haptic") -> dict:
          """Tactile communication for deaf-blind users"""
          if input_text:
 
          else:
              status = "Deaf-blind mode active. Use text input with haptic or braille output."
              return self.create_response(status, "system")
+
      # ==================== EMERGENCY SYSTEM ====================
+
      def trigger_emergency_mode(self, trigger_source: str = "manual") -> dict:
          """Activate emergency response system"""
          self.emergency_mode = True
          timestamp = datetime.now().isoformat()
+
          emergency_data = {
              "status": "EMERGENCY_ACTIVATED",
              "timestamp": timestamp,
 
              "actions_taken": [],
              "contacts_notified": []
          }
+
          # Notify emergency contacts
          for contact in self.emergency_contacts:
              try:
 
                  emergency_data["contacts_notified"].append(contact)
              except Exception as e:
                  logger.error(f"Failed to notify {contact}: {e}")
+
          # Create emergency audio message
          emergency_audio = self.text_to_speech(emergency_data["message"])
          emergency_data["audio"] = emergency_audio
+
          # Log emergency
          self.log_emergency(emergency_data)
+
          return emergency_data
+
      def notify_emergency_contact(self, contact: str, emergency_data: dict):
          """Notify emergency contact (simplified - in production would use SMS/email)"""
          logger.info(f"EMERGENCY NOTIFICATION to {contact}: {emergency_data['message']}")
          # In production: send SMS, email, or push notification
+
      # ==================== CORE ENGINE METHODS ====================
+
      def text_to_speech(self, text: str) -> str:
+         """Convert text to speech and return audio file path (best-effort)."""
+         if not text:
+             return None
+         # If no TTS engine, return None gracefully
+         if self.tts_engine is None:
+             logger.warning("TTS engine not available; returning None for audio path.")
+             return None
          try:
              with tempfile.NamedTemporaryFile(delete=False, suffix='.wav', dir='data/') as tmp_file:
+                 tmp_path = tmp_file.name
+                 # pyttsx3 uses save_to_file then runAndWait
+                 self.tts_engine.save_to_file(text, tmp_path)
+                 self.tts_engine.runAndWait()
+                 return tmp_path
          except Exception as e:
+             logger.error(f"TTS error: {e}", exc_info=True)
              return None
+
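A short usage sketch for the method above (the bridge instance is hypothetical): the returned path points at a temporary .wav under data/, and it is None whenever the engine is missing, so callers should branch on it:

audio_path = bridge.text_to_speech("Emergency mode activated")
if audio_path is not None:
    print(f"Spoken audio saved to {audio_path}")  # a temp .wav inside data/
else:
    print("TTS unavailable; showing text only")   # graceful degradation path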
      def fallback_speech_to_text(self, audio_path: str) -> str:
          """Fallback speech recognition using speech_recognition library"""
+         if not audio_path:
+             return ""
+         if self.recognizer is None:
+             logger.warning("Recognizer not available; cannot transcribe audio.")
+             return ""
          try:
              with sr.AudioFile(audio_path) as source:
                  audio = self.recognizer.record(source)
+             # Use Google Web Speech API (requires internet)
+             text = self.recognizer.recognize_google(audio)
+             return text
+         except sr.UnknownValueError:
+             return ""
+         except sr.RequestError as e:
+             logger.error(f"Speech recognition RequestError: {e}")
+             return ""
          except Exception as e:
+             logger.error(f"Fallback STT error: {e}", exc_info=True)
+             return ""
+
      def detect_emergency_keywords(self, text: str) -> bool:
          """Detect emergency keywords in text"""
+         if not text:
+             return False
          emergency_words = [
+             'emergency', 'help', 'urgent', 'danger', 'dangerous',
              'accident', 'injured', 'hurt', 'pain', 'bleeding',
              'fire', 'police', 'ambulance', 'hospital', '911',
              'save me', 'help me', 'i need help'
          ]
          text_lower = text.lower()
          return any(word in text_lower for word in emergency_words)
+
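The matcher above is a plain substring scan, so its behavior is easy to pin down; note that it also fires on embedded words such as 'helpful' (bridge is a hypothetical instance):

bridge.detect_emergency_keywords("I need help crossing the street")  # True, 'help' matches
bridge.detect_emergency_keywords("What a lovely morning")            # False, no keyword
bridge.detect_emergency_keywords("")                                 # False, early return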
      def detect_important_sounds(self, audio_path: str) -> bool:
+         """Detect important environmental sounds (simplified heuristic)"""
          try:
              transcript = self.fallback_speech_to_text(audio_path)
              important_words = ['help', 'emergency', 'fire', 'watch out', 'danger']
              return any(word in transcript.lower() for word in important_words)
+         except Exception:
              return False
+
      def text_to_vibration_pattern(self, text: str) -> List[int]:
          """Convert text to vibration pattern (simplified Morse code)"""
          morse_code = {
 
              '6': '-....', '7': '--...', '8': '---..', '9': '----.', '0': '-----',
              ' ': ' '
          }
+
          pattern = []
          for char in text.upper():
              if char in morse_code:
 
                          pattern.extend([300])  # Long vibration
                      pattern.extend([50])  # Gap between symbols
                  pattern.extend([200])  # Gap between letters
+
          return pattern
+
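The short-pulse branch of the symbol loop falls outside this hunk; assuming dots emit a 100 ms pulse to pair with the visible 300 ms dashes, a single letter flattens out as below (durations in ms, alternating pulse and gap):

# Hypothetical trace under that assumption: 'A' is '.-' in Morse, so
# text_to_vibration_pattern("A") would yield roughly
#   [100, 50, 300, 50, 200]
#    dot  gap dash gap letter-gap
pattern = bridge.text_to_vibration_pattern("A")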
      def text_to_braille(self, text: str) -> str:
          """Convert text to braille unicode characters"""
          braille_map = {
 
              '1': '⠁', '2': '⠃', '3': '⠉', '4': '⠙', '5': '⠑', '6': '⠋', '7': '⠛', '8': '⠓', '9': '⠊', '0': '⠚',
              ' ': ' ', '.': '⠲', ',': '⠂', '!': '⠖', '?': '⠦'
          }
+
          return ''.join(braille_map.get(char.upper(), '?') for char in text)
+
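The letter rows of braille_map fall outside this hunk, but assuming the standard six-dot cells for A-Z (the digit row shown reuses them: '8' and H share ⠓, '9' and I share ⠊), the converter behaves like this; anything absent from the map falls through to '?' via .get():

bridge.text_to_braille("Hi 5!")  # -> '⠓⠊ ⠑⠖' with standard cells H='⠓', I='⠊'
bridge.text_to_braille("~")      # -> '?' (not in the map)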
      def enhance_scene_description(self, description: str) -> str:
          """Enhance AI-generated scene descriptions"""
+         if not description:
+             return description
          enhancements = {
              "indoor": "This appears to be an indoor setting. ",
              "outdoor": "This appears to be an outdoor area. ",
 
              "text": "There is text that could be read. ",
              "obstacle": "Be careful of potential obstacles. ",
          }
+
          enhanced = description
          desc_lower = description.lower()
+
          if any(word in desc_lower for word in ['room', 'indoor', 'inside', 'wall']):
              enhanced = enhancements["indoor"] + enhanced
          elif any(word in desc_lower for word in ['outdoor', 'outside', 'sky', 'tree']):
              enhanced = enhancements["outdoor"] + enhanced
+
          if any(word in desc_lower for word in ['person', 'people', 'man', 'woman']):
              enhanced = enhancements["people"] + enhanced
+
          if any(word in desc_lower for word in ['sign', 'text', 'letter', 'word']):
              enhanced = enhancements["text"] + enhanced
+
          return enhanced
+
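A behavior sketch for the prefixing above: each matching keyword group prepends its sentence to the running string, so groups checked later end up in front:

bridge.enhance_scene_description("a living room with a sofa")
# 'room' matches the indoor list, so the result reads:
# "This appears to be an indoor setting. a living room with a sofa"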
      def is_system_command(self, text: str) -> bool:
          """Check if text contains system commands"""
+         if not text:
+             return False
          commands = ['mode', 'help', 'emergency', 'stop', 'cancel', 'reset']
          return any(command in text.lower() for command in commands)
+
      def handle_system_command(self, command: str) -> dict:
          """Handle system control commands"""
+         command = (command or "").lower()
+
+         if 'deaf blind' in command:
+             self.current_mode = "deaf_blind"
+             response = "Deaf-blind mode activated. Haptic feedback enabled."
+         elif 'blind' in command:
              self.current_mode = "blind"
              response = "Blind mode activated. Voice navigation enabled."
          elif 'deaf' in command:
              self.current_mode = "deaf"
              response = "Deaf mode activated. Visual alerts enabled."
          elif 'non verbal' in command or 'mute' in command:
              self.current_mode = "non_verbal"
              response = "Non-verbal mode activated. Text-to-speech ready."
          elif 'universal' in command:
              self.current_mode = "universal"
              response = "Universal mode activated."
 
              return self.trigger_emergency_mode("voice_command")
          else:
              response = f"Current mode: {self.current_mode}. Say 'help' for options."
+
          return self.create_response(response, "system", audio=self.text_to_speech(response))
+
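With the branch order above, the longest phrase is tested first ('deaf blind' contains both 'blind' and 'deaf'), so a combined-mode request resolves correctly instead of being captured by a shorter substring:

bridge.handle_system_command("switch to deaf blind mode")
assert bridge.current_mode == "deaf_blind"  # not "blind" or "deaf"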
      def add_to_conversation(self, speaker: str, text: str, message_type: str = "text"):
          """Add message to conversation history"""
          self.conversation_history.append({
 
              "text": text,
              "type": message_type
          })
+
          # Keep only last 100 messages
          if len(self.conversation_history) > 100:
              self.conversation_history = self.conversation_history[-100:]
+
      def log_emergency(self, emergency_data: dict):
          """Log emergency event"""
          try:
 
              json.dump(emergency_data, f, indent=2)
          except Exception as e:
              logger.error(f"Failed to log emergency: {e}")
+
      def create_response(self, text: str, response_type: str, **kwargs) -> dict:
          """Create standardized response object"""
          return {
 
  # ==================== GRADIO INTERFACE ====================
 
+ def create_production_interface(allow_microphone: bool = False):
690
  """Create production-ready Gradio interface"""
691
+
692
+ # Initialize the system; allow microphone only if requested
693
+ voice_bridge = ProductionVoiceBridge(allow_microphone=allow_microphone)
694
+
695
+ # Minimal custom CSS for accessibility (kept from your original)
696
  custom_css = """
697
+ :root { --primary-color: #2563eb; --danger-color: #dc2626; }
698
+ .accessible-btn { min-height:48px !important; padding:12px 18px !important; font-size:16px !important; }
699
+ .emergency-btn { background: linear-gradient(45deg,#dc2626,#ef4444) !important; color:white !important; font-weight:bold !important; }
700
+ .large-text { font-size:18px !important; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
701
  """
702
+
703
+ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="blue"),
704
+ title="VoiceBridge AI - Universal Communication") as demo:
705
+
706
+ gr.Markdown("# 🎯 VoiceBridge AI - Universal Communication Platform")
707
+
708
+ # Status row
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  with gr.Row():
710
+ system_status = gr.Textbox(label="System Status",
711
+ value=" System Ready - VoiceBridge AI Initialized",
712
+ interactive=False)
713
+ current_mode_display = gr.Textbox(label="Current Mode", value=voice_bridge.current_mode, interactive=False)
+
+         # Emergency
          with gr.Row():
+             emergency_btn = gr.Button("🚨 ACTIVATE EMERGENCY MODE",
+                                       elem_classes=["accessible-btn", "emergency-btn"])
+             emergency_contact_input = gr.Textbox(label="Emergency Contact (Email/Phone)",
+                                                  placeholder="Enter emergency contact information...")
+
+         # Mode selector
+         mode_selector = gr.Radio(choices=[("Universal", "universal"), ("Blind", "blind"),
+                                           ("Deaf", "deaf"), ("Non-Verbal", "non_verbal"),
+                                           ("Deaf-Blind", "deaf_blind")],
+                                  label="Accessibility Mode", value=voice_bridge.current_mode)
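+         # gr.Radio accepts (label, value) tuples in recent Gradio releases; the
+         # handlers receive the value (e.g. "deaf_blind"), not the display label.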
+
+         # Universal Tab
+         with gr.Tab("🌐 Universal Communication"):
              with gr.Row():
+                 with gr.Column():
+                     universal_audio = gr.Audio(label="🎤 Speak (Voice Input)", type="filepath",
+                                                sources=["microphone", "upload"])
+                     universal_text = gr.Textbox(label="⌨️ Type to Speak", lines=3)
+                     universal_image = gr.Image(label="📷 Capture Scene", type="filepath",
+                                                sources=["webcam", "upload"])
+                     process_universal = gr.Button("Process Input", elem_classes="accessible-btn")
+                 with gr.Column():
+                     universal_output = gr.Textbox(label="Output", lines=6)
+                     universal_audio_output = gr.Audio(label="Audio Output", type="filepath", interactive=False)
+                     universal_alert = gr.Textbox(label="Visual Alerts", visible=False)
+
+         # Blind Tab
+         with gr.Tab("👁️ Blind Assistance"):
              with gr.Row():
+                 with gr.Column():
+                     blind_audio = gr.Audio(label="Voice Commands", type="filepath",
+                                            sources=["microphone", "upload"])
+                     blind_commands = gr.Radio(choices=["describe scene", "read text", "navigate", "help"],
+                                               label="Quick Commands", value="describe scene")
+                     blind_image = gr.Image(label="Camera Feed", type="filepath",
+                                            sources=["webcam", "upload"])
+                     process_blind = gr.Button("Execute Command", elem_classes="accessible-btn")
+                 with gr.Column():
+                     blind_output = gr.Textbox(label="Scene Description", lines=5)
+                     blind_audio_output = gr.Audio(label="Audio Description", type="filepath")
+
+         # Deaf Tab
+         with gr.Tab("👂 Deaf Assistance"):
              with gr.Row():
+                 with gr.Column():
+                     deaf_audio = gr.Audio(label="Audio to Transcribe", type="filepath",
+                                           sources=["microphone", "upload"])
+                     continuous_listening = gr.Checkbox(label="Continuous Listening Mode", value=False)
+                     process_deaf = gr.Button("Transcribe Audio", elem_classes="accessible-btn")
+                 with gr.Column():
+                     deaf_output = gr.Textbox(label="Transcription", lines=6)
+                     deaf_alerts = gr.Textbox(label="Sound Alerts", lines=2)
+
+         # Non-verbal Tab
+         with gr.Tab("🤐 Non-Verbal Communication"):
              with gr.Row():
+                 with gr.Column():
+                     preset_phrases = gr.Radio(choices=["greeting", "help", "medical", "emergency", "thanks",
+                                                        "yes", "no", "pain", "lost", "bathroom"],
+                                               label="Quick Phrases", value="greeting")
+                     custom_phrase = gr.Textbox(label="Custom Message", lines=2)
+                     speak_btn = gr.Button("Speak Message", elem_classes="accessible-btn")
+                 with gr.Column():
+                     spoken_text = gr.Textbox(label="Message", lines=3)
+                     message_audio = gr.Audio(label="Spoken Audio", type="filepath")
+
+         # Deaf-Blind Tab
+         with gr.Tab("👁️👂 Deaf-Blind Communication"):
              with gr.Row():
+                 with gr.Column():
+                     tactile_input = gr.Textbox(label="Message to Convert", lines=3)
+                     output_format = gr.Radio(choices=["haptic", "braille"], label="Output Format", value="haptic")
+                     convert_btn = gr.Button("Convert to Tactile", elem_classes="accessible-btn")
+                 with gr.Column():
+                     braille_output = gr.Textbox(label="Braille Output", lines=3)
+                     vibration_pattern = gr.Textbox(label="Vibration Pattern", lines=2)
+
+         # Settings & Feedback
+         with gr.Tab("⚙️ Settings & Feedback"):
              with gr.Row():
+                 with gr.Column():
                      high_contrast = gr.Checkbox(label="High Contrast Mode", value=False)
                      large_text = gr.Checkbox(label="Large Text Mode", value=False)
                      voice_navigation = gr.Checkbox(label="Voice Navigation", value=True)
                      feedback_email = gr.Textbox(label="Your Email (optional)")
+                     feedback_message = gr.Textbox(label="Feedback & Suggestions", lines=4)
                      submit_feedback = gr.Button("Submit Feedback", elem_classes="accessible-btn")
                      feedback_status = gr.Textbox(label="Status", interactive=False)
+                 with gr.Column():
+                     conversation_history = gr.Textbox(label="Recent Conversation", lines=8, max_lines=10)
                      clear_history = gr.Button("Clear History", elem_classes="accessible-btn")
                      export_data = gr.Button("Export Data", elem_classes="accessible-btn")
+
+         # ---------------- Event handlers (single definitions, no duplicates) ----------------
+
          def handle_mode_change(mode):
              voice_bridge.current_mode = mode
              status_msg = f"Mode changed to: {mode}"
              voice_bridge.add_to_conversation("System", status_msg)
+             # Return the status message and the raw mode so the Current Mode
+             # display shows the mode itself, not the full status sentence.
+             return status_msg, mode
+
          def handle_universal_input(audio, text, image, mode):
+             # Prioritize audio > text > image
              if audio:
                  input_data = {'type': 'voice', 'audio': audio}
              elif text:
+                 input_data = {'type': 'text', 'text': text}
              elif image:
                  input_data = {'type': 'image', 'image': image}
              else:
+                 return "Please provide input", None, ""
              result = voice_bridge.universal_communication(input_data)
+             return result.get('text', ''), result.get('audio', None), result.get('visual_alert', '')
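+         # e.g. a typed message reaches universal_communication as
+         # {'type': 'text', 'text': 'hello'} (shape inferred from the other branches).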
+
          def handle_blind_assistance(audio, command, image):
              if audio:
                  transcript = voice_bridge.fallback_speech_to_text(audio)
+                 # Reconstructed: treat the transcript as the spoken command.
+                 result = voice_bridge.blind_mode(transcript, image)
+             elif image:
                  result = voice_bridge.blind_mode(command, image)
              else:
                  result = voice_bridge.blind_mode(command)
+             return result.get('text', ''), result.get('audio', None)
+
          def handle_deaf_assistance(audio, continuous):
              result = voice_bridge.deaf_mode(audio, continuous)
+             return result.get('text', ''), result.get('visual_alert', 'No important sounds detected')
+
          def handle_non_verbal(preset, custom):
+             # The preset radio carries a phrase key
+             text_to_speak = custom if custom and custom.strip() else None
              result = voice_bridge.non_verbal_mode(text_to_speak, preset)
+             return result.get('text', ''), result.get('audio', None)
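+         # A non-empty custom message takes priority; with text_to_speak=None,
+         # non_verbal_mode is expected to fall back to the selected preset phrase.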
+
+         def handle_deaf_blind(input_text, out_format):
+             result = voice_bridge.deaf_blind_mode(input_text, out_format)
+             braille = result.get('braille', '') if result else ''
+             pattern = result.get('haptic_feedback', []) if result else []
+             # Return braille text and the vibration pattern as a display string
+             return braille, str(pattern)
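+         # Illustrative output for "hello": braille "⠓⠑⠇⠇⠕" plus a duration list
+         # such as "[100, 50, 100]"; actual values depend on deaf_blind_mode.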
+
          def handle_feedback(email, message):
+             if not (message and message.strip()):
+                 return "Please enter feedback before submitting."
+             fb = {
+                 "timestamp": datetime.now().isoformat(),
+                 "email": email,
+                 "message": message
+             }
+             Path("data/feedback").mkdir(parents=True, exist_ok=True)
+             fname = f"data/feedback/feedback_{int(time.time())}.json"
+             with open(fname, "w") as f:
+                 json.dump(fb, f, indent=2)
+             return "Thank you! Feedback submitted."
+
+         def handle_clear_history():
+             voice_bridge.conversation_history.clear()
+             return "History cleared."
+
+         def handle_export_data():
+             export_path = "data/export_conversation.json"
+             with open(export_path, "w") as f:
+                 json.dump(voice_bridge.conversation_history, f, indent=2)
+             return f"Conversation exported to {export_path}"
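+         # Note: each export overwrites data/export_conversation.json rather
+         # than writing a new timestamped file.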
+
+         def handle_emergency(contact=None):
+             if contact:
+                 voice_bridge.emergency_contacts.append(contact)
+             result = voice_bridge.trigger_emergency_mode("manual_button")
+             # Only the status box is wired as an output below, so return the message alone.
+             return result.get("message", "")
+
+         # ---------------- Connect components ----------------
+
+         mode_selector.change(fn=handle_mode_change, inputs=mode_selector,
+                              outputs=[system_status, current_mode_display])
+
+         process_universal.click(fn=handle_universal_input,
+                                 inputs=[universal_audio, universal_text, universal_image, mode_selector],
+                                 outputs=[universal_output, universal_audio_output, universal_alert])
+
+         process_blind.click(fn=handle_blind_assistance, inputs=[blind_audio, blind_commands, blind_image],
+                             outputs=[blind_output, blind_audio_output])
+
+         process_deaf.click(fn=handle_deaf_assistance, inputs=[deaf_audio, continuous_listening],
+                            outputs=[deaf_output, deaf_alerts])
+
+         speak_btn.click(fn=handle_non_verbal, inputs=[preset_phrases, custom_phrase],
+                         outputs=[spoken_text, message_audio])
+
+         convert_btn.click(fn=handle_deaf_blind, inputs=[tactile_input, output_format],
+                           outputs=[braille_output, vibration_pattern])
+
+         submit_feedback.click(fn=handle_feedback, inputs=[feedback_email, feedback_message],
+                               outputs=feedback_status)
+
+         clear_history.click(fn=handle_clear_history, outputs=conversation_history)
+
+         export_data.click(fn=handle_export_data, outputs=feedback_status)
+
+         # Pass the contact component as an input: reading .value inside a lambda
+         # captures the initial value, not what the user has typed.
+         emergency_btn.click(fn=handle_emergency, inputs=emergency_contact_input, outputs=[system_status])
+
+         # initial load state
+         demo.load(fn=lambda: ("System Ready - VoiceBridge AI Initialized", voice_bridge.current_mode),
+                   outputs=[system_status, current_mode_display])
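+         # demo.load fires when a browser session starts, so both status boxes
+         # are repopulated on every page load.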

      return demo

+
+ # ==================== LAUNCH ====================
+
  if __name__ == "__main__":
+     # In most headless deployments (Hugging Face Spaces) you must NOT initialize the microphone.
+     # Set allow_microphone=True only if running on a device with a microphone and you want live mic support.
+     demo = create_production_interface(allow_microphone=False)
+     demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)), share=False)