Upload 2 files

- app.py: +101 -16
- requirements.txt: +4 -1
app.py
CHANGED
@@ -6,13 +6,34 @@ import time
 import wave
 import requests
 import json
+import torch
 from gtts import gTTS
 import speech_recognition as sr
+import soundfile as sf
+from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
+
+# Set up speech-to-text model
+device = "cuda" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+# Use lightweight models suitable for Hugging Face Spaces
+STT_MODEL_ID = "openai/whisper-small"
+TTS_MODEL_ID = "microsoft/speecht5_tts"
+
+# Initialize the speech recognition model (will load on first use to save memory)
+speech_recognizer = None
+
+# Initialize the text-to-speech model (will load on first use to save memory)
+tts_processor = None
+tts_model = None
+
+# Flag to indicate if models are ready
+models_loaded = False
 
 # Conversation state
 conversation = []
 
-# Hugging Face API configuration
+# Hugging Face API configuration for LLM
 HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
 HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
 
@@ -87,6 +108,29 @@ current_assessment = None
 current_item_index = 0
 assessment_results = []
 
+def load_models():
+    """Load speech models on first use"""
+    global speech_recognizer, tts_processor, tts_model, models_loaded
+
+    try:
+        if speech_recognizer is None:
+            # Load lightweight Whisper model for STT
+            speech_recognizer = pipeline(
+                "automatic-speech-recognition",
+                model=STT_MODEL_ID,
+                torch_dtype=torch_dtype,
+                device=device,
+            )
+            print("Speech recognition model loaded")
+
+        # We'll use gTTS for TTS since it's more lightweight for Hugging Face Spaces
+        # But we'll keep the code structure to allow for future upgrades
+        models_loaded = True
+        return "Models loaded successfully"
+    except Exception as e:
+        print(f"Error loading models: {e}")
+        return f"Error loading models: {e}"
+
 def get_ai_response(user_text, context=None):
     """Get AI response from Hugging Face API"""
     if not user_text:
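Note on the pipeline call above: passing a device string ("cuda"/"cpu") to pipeline() works in recent transformers releases, but older versions expect an integer index. A minimal, version-portable sketch (standalone; only the variable names are mine, the model id is from this diff):

    import torch
    from transformers import pipeline

    # Map the string device to the integer index older pipelines expect:
    # -1 selects CPU, 0 selects the first GPU.
    device_index = 0 if torch.cuda.is_available() else -1
    asr = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        device=device_index,
    )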
@@ -148,10 +192,14 @@ def text_to_speech(text):
         return None
 
 def speech_to_text(audio):
-    """Convert speech to text using Google Speech Recognition"""
+    """Convert speech to text using Whisper model"""
     if audio is None:
         return None
 
+    # Make sure models are loaded
+    if not models_loaded:
+        load_models()
+
     # Extract audio data
     sample_rate, audio_data = audio
 
@@ -167,16 +215,10 @@ def speech_to_text(audio):
         wf.setframerate(sample_rate)
         wf.writeframes((audio_data * 32767).astype(np.int16).tobytes())
 
-        # Use Google Speech Recognition to transcribe
-        recognizer = sr.Recognizer()
-        with sr.AudioFile(temp_path) as source:
-            audio_data = recognizer.record(source)
-        text = recognizer.recognize_google(audio_data)
-        return text
-    except sr.UnknownValueError:
-        return None
-    except sr.RequestError:
-        return "Sorry, I couldn't access the speech recognition service."
+        # Use Whisper model to transcribe
+        result = speech_recognizer(temp_path)
+        text = result["text"]
+        return text
     except Exception as e:
         print(f"STT Error: {e}")
         return None
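Note: the temp-WAV round-trip is one option, but the transformers ASR pipeline can also consume raw audio directly. Also worth double-checking: the `* 32767` scaling in the context lines assumes float input, while Gradio microphone audio is typically int16 already, so that multiplication can overflow. A sketch of the direct path (transcribe_array is a hypothetical helper; it assumes Gradio's `(sample_rate, np.ndarray)` tuple):

    import numpy as np

    def transcribe_array(audio, asr):
        """Feed Gradio's numpy audio straight to an ASR pipeline."""
        sample_rate, audio_data = audio
        # Gradio microphone audio is usually int16; scale to float32 in [-1, 1]
        if audio_data.dtype == np.int16:
            audio_data = audio_data.astype(np.float32) / 32768.0
        # Downmix stereo to mono if needed
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        return asr({"sampling_rate": sample_rate, "raw": audio_data})["text"]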
@@ -264,7 +306,6 @@ def process_assessment_audio(audio, assessment_type, item_index):
 
     elif assessment_type == "language":
         # Similar processing for language assessment
-        # Not fully implemented - would follow similar pattern
         current_task = language_exercises["tasks"][item_index]
 
         result = {
@@ -304,6 +345,10 @@ def init_articulation_assessment():
     current_item_index = 0
     assessment_results = []
 
+    # Make sure models are loaded
+    if not models_loaded:
+        load_models()
+
     instructions = articulation_exercises["instructions"]
     first_word = articulation_exercises["words"][0]["word"]
     message = f"{instructions}\n\nFirst word: {first_word}"
@@ -320,6 +365,10 @@ def init_language_assessment():
     current_item_index = 0
     assessment_results = []
 
+    # Make sure models are loaded
+    if not models_loaded:
+        load_models()
+
     instructions = language_exercises["instructions"]
     first_prompt = language_exercises["tasks"][0]["prompt"]
     message = f"{instructions}\n\nFirst task: {first_prompt}"
@@ -366,6 +415,10 @@ def process_conversation_audio(audio):
     if audio is None:
         return None, "No audio detected. Please try again."
 
+    # Make sure models are loaded
+    if not models_loaded:
+        load_models()
+
     # Convert speech to text
     transcript = speech_to_text(audio)
 
@@ -386,6 +439,10 @@ def initialize_conversation():
     global conversation
     conversation = []
 
+    # Make sure models are loaded
+    if not models_loaded:
+        load_models()
+
     # Add welcome message
     welcome = "Hello! I'm your CASL 2 speech therapy assistant. How can I help you today?"
     conversation.append({"role": "assistant", "content": welcome})
@@ -395,6 +452,14 @@ def initialize_conversation():
 
     return welcome_audio, format_conversation()
 
+# Status message function
+def get_status():
+    """Get current status of the app"""
+    if models_loaded:
+        return "Models loaded and ready. The app is working in speech-to-speech mode."
+    else:
+        return "Models will be loaded on first use. This may take a moment when you first record audio."
+
 # Custom CSS
 custom_css = """
 :root {
@@ -462,6 +527,15 @@ button.secondary {
     border-radius: 8px;
     box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
 }
+
+.status-bar {
+    margin-top: 1rem;
+    padding: 0.5rem;
+    background-color: #f5f5f5;
+    border-radius: 4px;
+    font-size: 0.9rem;
+    color: #666;
+}
 """
 
 # Create Gradio interface with tabs for different modes
@@ -474,6 +548,9 @@ with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as de
     gr.Markdown("# CASL 2 - Speech Therapy Assessment")
     gr.Markdown("An interactive tool for speech therapists to assess and treat speech disorders")
 
+    # Status bar
+    status_box = gr.Textbox(label="Status", value=get_status(), interactive=False, elem_classes="status-bar")
+
     # Main tabs
     with gr.Tabs() as tabs:
        # Conversation Mode Tab
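Note: status_box is filled from get_status() once, at UI build time, so it will keep showing the "will be loaded" message even after the models load. A hypothetical refresh (stop_recording is a gr.Audio event in recent Gradio; attaching it this way is a suggestion, not part of this commit):

    # Inside the gr.Blocks context where conv_audio_input and status_box exist:
    # refresh the status box after each recording completes.
    conv_audio_input.stop_recording(
        fn=get_status,
        inputs=None,
        outputs=status_box,
    )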
@@ -490,7 +567,9 @@ with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as de
             # Microphone input
             conv_audio_input = gr.Audio(
                 label="🎤 SPEAK HERE",
-                type="numpy"
+                type="numpy",
+                sources=["microphone"],
+                elem_id="conv_mic"
             )
 
             # Right panel - Conversation
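Note (applies to all three gr.Audio components in this diff): `sources=["microphone"]` is the Gradio 4.x keyword; Gradio 3.x used the singular `source="microphone"` and raises a TypeError on the 4.x form. The 3.x equivalent of the component above would be:

    import gradio as gr

    # Gradio 3.x spelling of the same component
    conv_audio_input = gr.Audio(
        label="🎤 SPEAK HERE",
        type="numpy",
        source="microphone",  # singular keyword in 3.x
        elem_id="conv_mic",
    )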
@@ -536,7 +615,9 @@ with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as de
             # Microphone input
             art_audio_input = gr.Audio(
                 label="🎤 RECORD RESPONSE",
-                type="numpy"
+                type="numpy",
+                sources=["microphone"],
+                elem_id="art_mic"
             )
 
             # Navigation
@@ -580,7 +661,9 @@ with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as de
             # Microphone input
             lang_audio_input = gr.Audio(
                 label="🎤 RECORD RESPONSE",
-                type="numpy"
+                type="numpy",
+                sources=["microphone"],
+                elem_id="lang_mic"
             )
 
             # Navigation
@@ -630,6 +713,8 @@ with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as de
     **For therapists**: Use these tools during your sessions to supplement your professional assessment.
 
     **Privacy Note**: All audio recordings are processed securely and are not stored permanently.
+
+    **Technical Note**: The first time you record audio, the app will load speech models which may take a moment.
     """)
 
     # Connect components - Conversation Mode
requirements.txt
CHANGED
@@ -3,4 +3,7 @@ numpy>=1.19.0
 SpeechRecognition>=3.8.1
 requests>=2.25.1
 gTTS>=2.3.2
-Pillow>=8.0.0
+Pillow>=8.0.0
+transformers>=4.27.0
+torch>=1.13.0
+soundfile>=0.12.1