SreekarB committed on
Commit
476583d
·
verified ·
1 Parent(s): 7fd3d01

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +101 -16
  2. requirements.txt +4 -1
app.py CHANGED
@@ -6,13 +6,34 @@ import time
6
  import wave
7
  import requests
8
  import json
 
9
  from gtts import gTTS
10
  import speech_recognition as sr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  # Conversation state
13
  conversation = []
14
 
15
- # Hugging Face API configuration
16
  HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
17
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
18
 
@@ -87,6 +108,29 @@ current_assessment = None
87
  current_item_index = 0
88
  assessment_results = []
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def get_ai_response(user_text, context=None):
91
  """Get AI response from Hugging Face API"""
92
  if not user_text:
@@ -148,10 +192,14 @@ def text_to_speech(text):
148
  return None
149
 
150
  def speech_to_text(audio):
151
- """Convert speech to text using SpeechRecognition"""
152
  if audio is None:
153
  return None
154
 
 
 
 
 
155
  # Extract audio data
156
  sample_rate, audio_data = audio
157
 
@@ -167,16 +215,10 @@ def speech_to_text(audio):
167
  wf.setframerate(sample_rate)
168
  wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
169
 
170
- # Use SpeechRecognition to transcribe
171
- recognizer = sr.Recognizer()
172
- with sr.AudioFile(temp_path) as source:
173
- audio_data = recognizer.record(source)
174
- text = recognizer.recognize_google(audio_data)
175
- return text
176
- except sr.UnknownValueError:
177
- return None
178
- except sr.RequestError:
179
- return "Sorry, I couldn't access the speech recognition service."
180
  except Exception as e:
181
  print(f"STT Error: {e}")
182
  return None
@@ -264,7 +306,6 @@ def process_assessment_audio(audio, assessment_type, item_index):
264
 
265
  elif assessment_type == "language":
266
  # Similar processing for language assessment
267
- # Not fully implemented - would follow similar pattern
268
  current_task = language_exercises["tasks"][item_index]
269
 
270
  result = {
@@ -304,6 +345,10 @@ def init_articulation_assessment():
304
  current_item_index = 0
305
  assessment_results = []
306
 
 
 
 
 
307
  instructions = articulation_exercises["instructions"]
308
  first_word = articulation_exercises["words"][0]["word"]
309
  message = f"{instructions}\n\nFirst word: {first_word}"
@@ -320,6 +365,10 @@ def init_language_assessment():
320
  current_item_index = 0
321
  assessment_results = []
322
 
 
 
 
 
323
  instructions = language_exercises["instructions"]
324
  first_prompt = language_exercises["tasks"][0]["prompt"]
325
  message = f"{instructions}\n\nFirst task: {first_prompt}"
@@ -366,6 +415,10 @@ def process_conversation_audio(audio):
366
  if audio is None:
367
  return None, "No audio detected. Please try again."
368
 
 
 
 
 
369
  # Convert speech to text
370
  transcript = speech_to_text(audio)
371
 
@@ -386,6 +439,10 @@ def initialize_conversation():
386
  global conversation
387
  conversation = []
388
 
 
 
 
 
389
  # Add welcome message
390
  welcome = "Hello! I'm your CASL 2 speech therapy assistant. How can I help you today?"
391
  conversation.append({"role": "assistant", "content": welcome})
@@ -395,6 +452,14 @@ def initialize_conversation():
395
 
396
  return welcome_audio, format_conversation()
397
 
 
 
 
 
 
 
 
 
398
  # Custom CSS
399
  custom_css = """
400
  :root {
@@ -462,6 +527,15 @@ button.secondary {
462
  border-radius: 8px;
463
  box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
464
  }
 
 
 
 
 
 
 
 
 
465
  """
466
 
467
  # Create Gradio interface with tabs for different modes
@@ -474,6 +548,9 @@ with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as de
474
  gr.Markdown("# CASL 2 - Speech Therapy Assessment")
475
  gr.Markdown("An interactive tool for speech therapists to assess and treat speech disorders")
476
 
 
 
 
477
  # Main tabs
478
  with gr.Tabs() as tabs:
479
  # Conversation Mode Tab
@@ -490,7 +567,9 @@ with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as de
490
  # Microphone input
491
  conv_audio_input = gr.Audio(
492
  label="🎤 SPEAK HERE",
493
- type="numpy"
 
 
494
  )
495
 
496
  # Right panel - Conversation
@@ -536,7 +615,9 @@ with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as de
536
  # Microphone input
537
  art_audio_input = gr.Audio(
538
  label="🎤 RECORD RESPONSE",
539
- type="numpy"
 
 
540
  )
541
 
542
  # Navigation
@@ -580,7 +661,9 @@ with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as de
580
  # Microphone input
581
  lang_audio_input = gr.Audio(
582
  label="🎤 RECORD RESPONSE",
583
- type="numpy"
 
 
584
  )
585
 
586
  # Navigation
@@ -630,6 +713,8 @@ with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as de
630
  **For therapists**: Use these tools during your sessions to supplement your professional assessment.
631
 
632
  **Privacy Note**: All audio recordings are processed securely and are not stored permanently.
 
 
633
  """)
634
 
635
  # Connect components - Conversation Mode
 
6
  import wave
7
  import requests
8
  import json
9
+ import torch
10
  from gtts import gTTS
11
  import speech_recognition as sr
12
+ import soundfile as sf
13
+ from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
14
+
15
+ # Set up speech-to-text model
16
+ device = "cuda" if torch.cuda.is_available() else "cpu"
17
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
18
+
19
+ # Use lightweight models suitable for Hugging Face Spaces
20
+ STT_MODEL_ID = "openai/whisper-small"
21
+ TTS_MODEL_ID = "microsoft/speecht5_tts"
22
+
23
+ # Initialize the speech recognition model (will load on first use to save memory)
24
+ speech_recognizer = None
25
+
26
+ # Initialize the text-to-speech model (will load on first use to save memory)
27
+ tts_processor = None
28
+ tts_model = None
29
+
30
+ # Flag to indicate if models are ready
31
+ models_loaded = False
32
 
33
  # Conversation state
34
  conversation = []
35
 
36
+ # Hugging Face API configuration for LLM
37
  HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
38
  HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
39
 
 
108
  current_item_index = 0
109
  assessment_results = []
110
 
111
+ def load_models():
112
+ """Load speech models on first use"""
113
+ global speech_recognizer, tts_processor, tts_model, models_loaded
114
+
115
+ try:
116
+ if speech_recognizer is None:
117
+ # Load lightweight Whisper model for STT
118
+ speech_recognizer = pipeline(
119
+ "automatic-speech-recognition",
120
+ model=STT_MODEL_ID,
121
+ torch_dtype=torch_dtype,
122
+ device=device,
123
+ )
124
+ print("Speech recognition model loaded")
125
+
126
+ # We'll use gTTS for TTS since it's more lightweight for Hugging Face Spaces
127
+ # But we'll keep the code structure to allow for future upgrades
128
+ models_loaded = True
129
+ return "Models loaded successfully"
130
+ except Exception as e:
131
+ print(f"Error loading models: {e}")
132
+ return f"Error loading models: {e}"
133
+
134
  def get_ai_response(user_text, context=None):
135
  """Get AI response from Hugging Face API"""
136
  if not user_text:
 
192
  return None
193
 
194
  def speech_to_text(audio):
195
+ """Convert speech to text using Whisper model"""
196
  if audio is None:
197
  return None
198
 
199
+ # Make sure models are loaded
200
+ if not models_loaded:
201
+ load_models()
202
+
203
  # Extract audio data
204
  sample_rate, audio_data = audio
205
 
 
215
  wf.setframerate(sample_rate)
216
  wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
217
 
218
+ # Use Whisper model to transcribe
219
+ result = speech_recognizer(temp_path)
220
+ text = result["text"]
221
+ return text
 
 
 
 
 
 
222
  except Exception as e:
223
  print(f"STT Error: {e}")
224
  return None
 
306
 
307
  elif assessment_type == "language":
308
  # Similar processing for language assessment
 
309
  current_task = language_exercises["tasks"][item_index]
310
 
311
  result = {
 
345
  current_item_index = 0
346
  assessment_results = []
347
 
348
+ # Make sure models are loaded
349
+ if not models_loaded:
350
+ load_models()
351
+
352
  instructions = articulation_exercises["instructions"]
353
  first_word = articulation_exercises["words"][0]["word"]
354
  message = f"{instructions}\n\nFirst word: {first_word}"
 
365
  current_item_index = 0
366
  assessment_results = []
367
 
368
+ # Make sure models are loaded
369
+ if not models_loaded:
370
+ load_models()
371
+
372
  instructions = language_exercises["instructions"]
373
  first_prompt = language_exercises["tasks"][0]["prompt"]
374
  message = f"{instructions}\n\nFirst task: {first_prompt}"
 
415
  if audio is None:
416
  return None, "No audio detected. Please try again."
417
 
418
+ # Make sure models are loaded
419
+ if not models_loaded:
420
+ load_models()
421
+
422
  # Convert speech to text
423
  transcript = speech_to_text(audio)
424
 
 
439
  global conversation
440
  conversation = []
441
 
442
+ # Make sure models are loaded
443
+ if not models_loaded:
444
+ load_models()
445
+
446
  # Add welcome message
447
  welcome = "Hello! I'm your CASL 2 speech therapy assistant. How can I help you today?"
448
  conversation.append({"role": "assistant", "content": welcome})
 
452
 
453
  return welcome_audio, format_conversation()
454
 
455
+ # Status message function
456
def get_status():
    """Return the status-bar text describing whether models are loaded.

    Reads the module-level ``models_loaded`` flag and picks one of two
    fixed user-facing messages.
    """
    ready_message = "Models loaded and ready. The app is working in speech-to-speech mode."
    pending_message = "Models will be loaded on first use. This may take a moment when you first record audio."
    return ready_message if models_loaded else pending_message
462
+
463
  # Custom CSS
464
  custom_css = """
465
  :root {
 
527
  border-radius: 8px;
528
  box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
529
  }
530
+
531
+ .status-bar {
532
+ margin-top: 1rem;
533
+ padding: 0.5rem;
534
+ background-color: #f5f5f5;
535
+ border-radius: 4px;
536
+ font-size: 0.9rem;
537
+ color: #666;
538
+ }
539
  """
540
 
541
  # Create Gradio interface with tabs for different modes
 
548
  gr.Markdown("# CASL 2 - Speech Therapy Assessment")
549
  gr.Markdown("An interactive tool for speech therapists to assess and treat speech disorders")
550
 
551
+ # Status bar
552
+ status_box = gr.Textbox(label="Status", value=get_status(), interactive=False, elem_classes="status-bar")
553
+
554
  # Main tabs
555
  with gr.Tabs() as tabs:
556
  # Conversation Mode Tab
 
567
  # Microphone input
568
  conv_audio_input = gr.Audio(
569
  label="🎤 SPEAK HERE",
570
+ type="numpy",
571
+ sources=["microphone"],
572
+ elem_id="conv_mic"
573
  )
574
 
575
  # Right panel - Conversation
 
615
  # Microphone input
616
  art_audio_input = gr.Audio(
617
  label="🎤 RECORD RESPONSE",
618
+ type="numpy",
619
+ sources=["microphone"],
620
+ elem_id="art_mic"
621
  )
622
 
623
  # Navigation
 
661
  # Microphone input
662
  lang_audio_input = gr.Audio(
663
  label="🎤 RECORD RESPONSE",
664
+ type="numpy",
665
+ sources=["microphone"],
666
+ elem_id="lang_mic"
667
  )
668
 
669
  # Navigation
 
713
  **For therapists**: Use these tools during your sessions to supplement your professional assessment.
714
 
715
  **Privacy Note**: All audio recordings are processed securely and are not stored permanently.
716
+
717
+ **Technical Note**: The first time you record audio, the app will load speech models which may take a moment.
718
  """)
719
 
720
  # Connect components - Conversation Mode
requirements.txt CHANGED
@@ -3,4 +3,7 @@ numpy>=1.19.0
3
  SpeechRecognition>=3.8.1
4
  requests>=2.25.1
5
  gTTS>=2.3.2
6
- Pillow>=8.0.0
 
 
 
 
3
  SpeechRecognition>=3.8.1
4
  requests>=2.25.1
5
  gTTS>=2.3.2
6
+ Pillow>=8.0.0
7
+ transformers>=4.27.0
8
+ torch>=1.13.0
9
+ soundfile>=0.12.1