import gradio as gr
import tempfile
import numpy as np
import os
import time
import wave
import requests
import json
import torch
from gtts import gTTS
import speech_recognition as sr
import soundfile as sf
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq

# Set up speech-to-text model
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Use lightweight models suitable for Hugging Face Spaces
STT_MODEL_ID = "openai/whisper-small"
TTS_MODEL_ID = "microsoft/speecht5_tts"

# Initialize the speech recognition model (will load on first use to save memory)
speech_recognizer = None

# Initialize the text-to-speech model (will load on first use to save memory)
tts_processor = None
tts_model = None

# Flag to indicate if models are ready
models_loaded = False

# Conversation state
conversation = []

# Hugging Face API configuration for LLM
HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
headers = {
    "Authorization": f"Bearer {HF_API_TOKEN}",
    "Content-Type": "application/json"
}

# Sample assessment data
articulation_exercises = {
    "title": "Articulation Assessment",
    "instructions": "Record the child pronouncing each target word. The system will analyze pronunciation accuracy.",
    "words": [
        {
            "word": "Sun",
            "target_sound": "s",
            "position": "initial",
            "imageUrl": "https://images.unsplash.com/photo-1477500292188-6f0d31f8cb2e?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
        },
        {
            "word": "Mouse",
            "target_sound": "s",
            "position": "final",
            "imageUrl": "https://images.unsplash.com/photo-1425082661705-1834bfd09dca?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
        },
        {
            "word": "Pencil",
            "target_sound": "s",
            "position": "medial",
            "imageUrl": "https://images.unsplash.com/photo-1583485088034-697b5bc54ccd?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
        },
        {
            "word": "Tree",
            "target_sound": "tr",
            "position": "initial",
            "imageUrl": "https://images.unsplash.com/photo-1502082553048-f009c37129b9?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
        },
        {
            "word": "Blue",
            "target_sound": "bl",
            "position": "initial",
            "imageUrl": "https://images.unsplash.com/photo-1557180295-76eee20ae8aa?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
        }
    ]
}
language_exercises = {
    "title": "Language Assessment",
    "instructions": "Assess receptive and expressive language skills with these tasks. Record the child's response to each prompt.",
    "tasks": [
        {
            "prompt": "Point to the item that you eat with.",
            "type": "following_directions",
            "options": ["Fork", "Book", "Shoe", "Car"],
            "correct": "Fork"
        },
        {
            "prompt": "What is the opposite of hot?",
            "type": "vocabulary",
            "correct": "Cold"
        },
        {
            "prompt": "Make a sentence using the word 'happy'.",
            "type": "sentence_formation",
            "evaluation": "subjective"
        }
    ]
}

# Current assessment state
current_assessment = None
current_item_index = 0
assessment_results = []


def load_models():
    """Load speech models on first use"""
    global speech_recognizer, tts_processor, tts_model, models_loaded

    try:
        if speech_recognizer is None:
            # Load lightweight Whisper model for STT
            speech_recognizer = pipeline(
                "automatic-speech-recognition",
                model=STT_MODEL_ID,
                torch_dtype=torch_dtype,
                device=device,
            )
            print("Speech recognition model loaded")

        # We'll use gTTS for TTS since it's more lightweight for Hugging Face Spaces
        # But we'll keep the code structure to allow for future upgrades

        models_loaded = True
        return "Models loaded successfully"
    except Exception as e:
        print(f"Error loading models: {e}")
        return f"Error loading models: {e}"


def get_ai_response(user_text, context=None):
    """Get AI response from Hugging Face API"""
    if not user_text:
        return "I couldn't understand what you said. Could you try again?"

    # Add user input to conversation history
    conversation.append({"role": "user", "content": user_text})

    # Prepare for API call
    system_prompt = "You are a speech therapy assistant for the CASL 2 assessment tool. Provide helpful, supportive feedback for speech exercises."
    if context:
        system_prompt += f" Current context: {context}"

    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(conversation)

    try:
        if not HF_API_TOKEN:
            response_text = "Please add a Hugging Face API token in the Space settings to enable AI responses."
        else:
            # The hosted text-generation endpoint expects a single prompt string,
            # so flatten the chat history before making the API call
            prompt = "\n".join(f"{msg['role']}: {msg['content']}" for msg in messages)
            payload = {
                "inputs": prompt,
                "parameters": {
                    "max_new_tokens": 100,
                    "temperature": 0.7,
                    "top_p": 0.9
                }
            }
            response = requests.post(HF_API_URL, headers=headers, json=payload)

            if response.status_code == 200:
                response_text = response.json()[0]["generated_text"]
            else:
                response_text = f"I'm having trouble connecting to my language model. Error: {response.status_code}"
    except Exception as e:
        response_text = f"An error occurred: {str(e)}"

    # Add assistant response to conversation history
    conversation.append({"role": "assistant", "content": response_text})

    return response_text
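# Note on the response parsing above: it assumes the hosted text-generation
# endpoint returns a JSON list of the form [{"generated_text": "..."}]. That
# endpoint typically echoes the prompt back at the start of "generated_text",
# so a stricter implementation could strip the prompt prefix before showing
# the reply; this is left as-is here to keep the original behavior.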
def text_to_speech(text):
    """Convert text to speech using gTTS"""
    try:
        # Create a temporary file
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp:
            filename = temp.name

        # Generate speech
        tts = gTTS(text=text, lang="en", slow=False)
        tts.save(filename)

        return filename
    except Exception as e:
        print(f"TTS Error: {e}")
        return None


def speech_to_text(audio):
    """Convert speech to text using Whisper model"""
    if audio is None:
        return None

    # Make sure models are loaded
    if not models_loaded:
        load_models()

    # Extract audio data
    sample_rate, audio_data = audio

    # Create a temporary WAV file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        # Gradio may deliver float samples in [-1, 1] or int16 samples;
        # only rescale floating-point data to avoid integer overflow
        if np.issubdtype(audio_data.dtype, np.floating):
            audio_data = (audio_data * 32767).astype(np.int16)
        else:
            audio_data = audio_data.astype(np.int16)

        # Save audio to file
        with wave.open(temp_path, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(sample_rate)
            wf.writeframes(audio_data.tobytes())

        # Use Whisper model to transcribe
        result = speech_recognizer(temp_path)
        text = result["text"]

        return text
    except Exception as e:
        print(f"STT Error: {e}")
        return None
    finally:
        # Clean up
        if os.path.exists(temp_path):
            os.unlink(temp_path)


def format_conversation():
    """Format the conversation history for display"""
    result = ""
    for msg in conversation:
        if msg["role"] != "system":  # Skip system messages
            prefix = "User: " if msg["role"] == "user" else "Assistant: "
            result += f"{prefix}{msg['content']}\n\n"
    return result


def analyze_speech(text, target):
    """Simple analysis of speech for assessment"""
    if not text or not target:
        return 0

    # Simple analysis - check if target word is in the transcribed text
    # In a real app, this would be more sophisticated
    if target.lower() in text.lower():
        # Simulate accuracy score (in a real app, use phonetic analysis)
        accuracy = np.random.uniform(70, 100)
    else:
        accuracy = np.random.uniform(0, 70)

    return accuracy
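# The accuracy score above is a simulated placeholder. As one illustration of
# the "more sophisticated" analysis the comments allude to, the sketch below
# scores a transcript by character-level similarity to the target word using
# difflib from the standard library. It is a hypothetical stand-in, not true
# phonetic analysis, and nothing in the app calls it.
def similarity_score(text, target):
    """Hypothetical helper: rough 0-100 similarity between target word and transcript"""
    import difflib

    words = text.lower().split()
    if not words or not target:
        return 0.0
    # Score each transcribed word against the target and keep the best match
    best = max(difflib.SequenceMatcher(None, target.lower(), w).ratio() for w in words)
    return best * 100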
def process_assessment_audio(audio, assessment_type, item_index):
    """Process recorded audio for assessment item"""
    global current_item_index, assessment_results

    if audio is None:
        return None, "No audio detected. Please try again.", item_index, None

    # Convert speech to text
    transcript = speech_to_text(audio)

    if not transcript:
        return None, "I couldn't understand the speech. Please try again.", item_index, None

    # Process based on assessment type
    if assessment_type == "articulation":
        current_word = articulation_exercises["words"][item_index]
        target_word = current_word["word"]

        accuracy = analyze_speech(transcript, target_word)

        result = {
            "word": target_word,
            "target_sound": current_word["target_sound"],
            "position": current_word["position"],
            "transcript": transcript,
            "accuracy": accuracy,
            "passed": accuracy > 70
        }
        assessment_results.append(result)

        # Get feedback from AI
        context = f"Assessment: Articulation. Target word: {target_word} with {current_word['target_sound']} sound in {current_word['position']} position. User said: {transcript}. Accuracy: {accuracy:.1f}%."
        feedback = get_ai_response(transcript, context)

        # Prepare for next item
        next_index = item_index + 1
        if next_index >= len(articulation_exercises["words"]):
            next_index = 0  # Reset or could end assessment

        result_display = f"""
**Word**: {target_word}
**Transcript**: {transcript}
**Accuracy**: {accuracy:.1f}%
**Result**: {"PASSED" if accuracy > 70 else "NEEDS PRACTICE"}

{feedback}
"""

        # Return audio response, result display, next item index, and image URL
        response_audio = text_to_speech(feedback)
        next_image = articulation_exercises["words"][next_index]["imageUrl"] if next_index < len(articulation_exercises["words"]) else None

        return response_audio, result_display, next_index, next_image

    elif assessment_type == "language":
        # Similar processing for language assessment
        current_task = language_exercises["tasks"][item_index]

        result = {
            "prompt": current_task["prompt"],
            "type": current_task["type"],
            "response": transcript,
        }
        assessment_results.append(result)

        # Get feedback from AI
        context = f"Assessment: Language. Task: {current_task['prompt']}. User said: {transcript}."
        feedback = get_ai_response(transcript, context)

        # Prepare for next item
        next_index = item_index + 1
        if next_index >= len(language_exercises["tasks"]):
            next_index = 0  # Reset or could end assessment

        result_display = f"""
**Prompt**: {current_task['prompt']}
**Response**: {transcript}

{feedback}
"""

        # Return audio response, result display, next item index
        response_audio = text_to_speech(feedback)

        return response_audio, result_display, next_index, None

    return None, "Unknown assessment type", item_index, None


def init_articulation_assessment():
    """Initialize articulation assessment"""
    global current_assessment, current_item_index, assessment_results

    current_assessment = "articulation"
    current_item_index = 0
    assessment_results = []

    # Make sure models are loaded
    if not models_loaded:
        load_models()

    instructions = articulation_exercises["instructions"]
    first_word = articulation_exercises["words"][0]["word"]

    message = f"{instructions}\n\nFirst word: {first_word}"
    audio_response = text_to_speech(message)
    current_image = articulation_exercises["words"][0]["imageUrl"]

    return audio_response, message, current_image, 0


def init_language_assessment():
    """Initialize language assessment"""
    global current_assessment, current_item_index, assessment_results

    current_assessment = "language"
    current_item_index = 0
    assessment_results = []

    # Make sure models are loaded
    if not models_loaded:
        load_models()

    instructions = language_exercises["instructions"]
    first_prompt = language_exercises["tasks"][0]["prompt"]

    message = f"{instructions}\n\nFirst task: {first_prompt}"
    audio_response = text_to_speech(message)

    return audio_response, message, None, 0


def update_art_item_indicator(idx):
    """Update articulation item indicator"""
    return f"{idx+1}/{len(articulation_exercises['words'])}"


def update_lang_item_indicator(idx):
    """Update language item indicator"""
    return f"{idx+1}/{len(language_exercises['tasks'])}"


def navigate_articulation(direction, current_idx):
    """Navigate through articulation items"""
    if direction == "prev":
        new_idx = max(0, current_idx - 1)
    else:  # next
        new_idx = min(len(articulation_exercises["words"]) - 1, current_idx + 1)

    current_word = articulation_exercises["words"][new_idx]
    message = f"Current word: {current_word['word']}"
    current_image = current_word["imageUrl"]

    return update_art_item_indicator(new_idx), message, current_image, new_idx
def navigate_language(direction, current_idx):
    """Navigate through language items"""
    if direction == "prev":
        new_idx = max(0, current_idx - 1)
    else:  # next
        new_idx = min(len(language_exercises["tasks"]) - 1, current_idx + 1)

    current_task = language_exercises["tasks"][new_idx]
    message = f"Current task: {current_task['prompt']}"

    return update_lang_item_indicator(new_idx), message, new_idx


def process_conversation_audio(audio):
    """Process recorded audio for conversation mode"""
    if audio is None:
        return None, "No audio detected. Please try again."

    # Make sure models are loaded
    if not models_loaded:
        load_models()

    # Convert speech to text
    transcript = speech_to_text(audio)

    if not transcript:
        return None, format_conversation() + "\nI couldn't understand your speech. Please try again."

    # Get AI response
    response = get_ai_response(transcript)

    # Convert response to speech
    audio_file = text_to_speech(response)

    # Return response
    return audio_file, format_conversation()


def initialize_conversation():
    """Initialize the conversation with a welcome message"""
    global conversation
    conversation = []

    # Make sure models are loaded
    if not models_loaded:
        load_models()

    # Add welcome message
    welcome = "Hello! I'm your CASL 2 speech therapy assistant. How can I help you today?"
    conversation.append({"role": "assistant", "content": welcome})

    # Generate speech
    welcome_audio = text_to_speech(welcome)

    return welcome_audio, format_conversation()


# Status message function
def get_status():
    """Get current status of the app"""
    if models_loaded:
        return "Models loaded and ready. The app is working in speech-to-speech mode."
    else:
        return "Models will be loaded on first use. This may take a moment when you first record audio."


# Custom CSS
custom_css = """
:root {
    --primary: #4a6fa5;
    --secondary: #6b96c3;
    --accent: #ff7e5f;
    --light: #f9f9f9;
    --dark: #333;
    --success: #4caf50;
    --warning: #ff9800;
    --error: #f44336;
}

.gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    max-width: 1200px;
    margin: auto;
}

.app-header {
    background-color: var(--primary);
    color: white;
    padding: 1rem;
    border-radius: 8px 8px 0 0;
    margin-bottom: 1rem;
}

.tab-nav {
    margin-bottom: 1rem;
}

.input-panel {
    background-color: white;
    border-radius: 8px;
    box-shadow: 0 2px 10px rgba(0, 0, 0, 0.08);
    padding: 1rem;
    margin-bottom: 1rem;
}

.output-panel {
    background-color: white;
    border-radius: 8px;
    box-shadow: 0 2px 10px rgba(0, 0, 0, 0.08);
    padding: 1rem;
}

button.primary {
    background-color: var(--primary);
    color: white;
}

button.secondary {
    background-color: var(--secondary);
    color: white;
}

.image-display {
    display: flex;
    justify-content: center;
    margin: 1rem 0;
}

.image-display img {
    max-width: 300px;
    border-radius: 8px;
    box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
}

.status-bar {
    margin-top: 1rem;
    padding: 0.5rem;
    background-color: #f5f5f5;
    border-radius: 4px;
    font-size: 0.9rem;
    color: #666;
}
"""

# Create Gradio interface with tabs for different modes
with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as demo:
    # Current state variables (in Gradio 3.50.0, State doesn't have a change event)
    current_item_idx = gr.State(0)

    # App header
    with gr.Column(elem_classes="app-header"):
        gr.Markdown("# CASL 2 - Speech Therapy Assessment")
        gr.Markdown("An interactive tool for speech therapists to assess and treat speech disorders")

    # Status bar
    status_box = gr.Textbox(label="Status", value=get_status(), interactive=False, elem_classes="status-bar")

    # Main tabs
    with gr.Tabs() as tabs:
        # Conversation Mode Tab
        with gr.TabItem("Conversation Assistant", elem_classes="tab-nav"):
            gr.Markdown("### General Conversation Mode")
            gr.Markdown("Have a natural conversation with the AI assistant for general questions and guidance")

            with gr.Row():
                # Left panel - Controls
                with gr.Column(scale=1, elem_classes="input-panel"):
                    # Start button
                    conv_start_button = gr.Button("Start Conversation", variant="primary")

                    # Microphone input
                    conv_audio_input = gr.Audio(
                        label="🎤 SPEAK HERE",
                        type="numpy",
                        sources=["microphone"],
                        elem_id="conv_mic"
                    )

                # Right panel - Conversation
                with gr.Column(scale=2, elem_classes="output-panel"):
                    # Conversation display
                    conv_display = gr.Textbox(
                        label="Conversation History",
                        lines=12,
                        value=""
                    )

                    # Audio playback
                    conv_audio_output = gr.Audio(
                        label="AI Response",
                        type="filepath",
                        autoplay=True
                    )

        # Articulation Assessment Tab
        with gr.TabItem("Articulation Assessment", elem_classes="tab-nav"):
            gr.Markdown("### Articulation Assessment")
            gr.Markdown("Evaluate production of speech sounds in various positions within words")

            with gr.Row():
                # Left panel - Controls & Current Word
                with gr.Column(scale=1, elem_classes="input-panel"):
                    # Start button
                    art_start_button = gr.Button("Start Assessment", variant="primary")

                    # Current word display
                    art_current_display = gr.Textbox(
                        label="Current Task",
                        lines=3
                    )

                    # Word image
                    art_image = gr.Image(
                        label="Word Image",
                        type="filepath",
                        elem_classes="image-display"
                    )

                    # Microphone input
                    art_audio_input = gr.Audio(
                        label="🎤 RECORD RESPONSE",
                        type="numpy",
                        sources=["microphone"],
                        elem_id="art_mic"
                    )

                    # Navigation
                    with gr.Row():
                        art_prev_button = gr.Button("◀ Previous")
                        art_item_indicator = gr.Textbox(label="Item", value="1/5", interactive=False)
                        art_next_button = gr.Button("Next ▶")

                # Right panel - Results
                with gr.Column(scale=2, elem_classes="output-panel"):
                    # Results display
                    art_result_display = gr.Markdown(
                        label="Assessment Results",
                        value="Start the assessment to see results."
                    )

                    # Audio feedback
                    art_audio_output = gr.Audio(
                        label="Speech Therapist Feedback",
                        type="filepath",
                        autoplay=True
                    )

        # Language Assessment Tab
        with gr.TabItem("Language Assessment", elem_classes="tab-nav"):
            gr.Markdown("### Language Assessment")
            gr.Markdown("Evaluate receptive and expressive language skills including vocabulary and grammar")

            with gr.Row():
                # Left panel - Controls & Current Task
                with gr.Column(scale=1, elem_classes="input-panel"):
                    # Start button
                    lang_start_button = gr.Button("Start Assessment", variant="primary")

                    # Current task display
                    lang_current_display = gr.Textbox(
                        label="Current Task",
                        lines=3
                    )

                    # Microphone input
                    lang_audio_input = gr.Audio(
                        label="🎤 RECORD RESPONSE",
                        type="numpy",
                        sources=["microphone"],
                        elem_id="lang_mic"
                    )

                    # Navigation
                    with gr.Row():
                        lang_prev_button = gr.Button("◀ Previous")
                        lang_item_indicator = gr.Textbox(label="Item", value="1/3", interactive=False)
                        lang_next_button = gr.Button("Next ▶")

                # Right panel - Results
                with gr.Column(scale=2, elem_classes="output-panel"):
                    # Results display
                    lang_result_display = gr.Markdown(
                        label="Assessment Results",
                        value="Start the assessment to see results."
                    )

                    # Audio feedback
                    lang_audio_output = gr.Audio(
                        label="Speech Therapist Feedback",
                        type="filepath",
                        autoplay=True
                    )

    # Instructions
    with gr.Accordion("How to use CASL 2", open=True):
        gr.Markdown("""
        ## CASL 2 Speech Therapy Assessment Tool

        This application provides three main functions:

        ### 1. Conversation Assistant
        - General conversation with an AI assistant
        - Ask questions about speech therapy, techniques, or general information
        - Get guidance on using the assessment tools

        ### 2. Articulation Assessment
        - Evaluate speech sound production
        - Record the patient pronouncing target words
        - Get automatic analysis and therapist feedback
        - Track progress over time

        ### 3. Language Assessment
        - Evaluate receptive and expressive language skills
        - Test vocabulary, following directions, and sentence formation
        - Record responses and get professional feedback

        **For therapists**: Use these tools during your sessions to supplement your professional assessment.

        **Privacy Note**: All audio recordings are processed securely and are not stored permanently.

        **Technical Note**: The first time you record audio, the app will load speech models which may take a moment.
        """)

    # Connect components - Conversation Mode
    conv_start_button.click(
        fn=initialize_conversation,
        outputs=[conv_audio_output, conv_display]
    )

    conv_audio_input.change(
        fn=process_conversation_audio,
        inputs=[conv_audio_input],
        outputs=[conv_audio_output, conv_display]
    )

    # Connect components - Articulation Assessment
    art_start_button.click(
        fn=init_articulation_assessment,
        outputs=[art_audio_output, art_current_display, art_image, current_item_idx]
    )

    art_audio_input.change(
        fn=process_assessment_audio,
        inputs=[art_audio_input, gr.Textbox(value="articulation", visible=False), current_item_idx],
        outputs=[art_audio_output, art_result_display, current_item_idx, art_image]
    )

    # Fixed navigation for Gradio 3.50.0
    art_next_button.click(
        fn=navigate_articulation,
        inputs=[gr.Textbox(value="next", visible=False), current_item_idx],
        outputs=[art_item_indicator, art_current_display, art_image, current_item_idx]
    )

    art_prev_button.click(
        fn=navigate_articulation,
        inputs=[gr.Textbox(value="prev", visible=False), current_item_idx],
        outputs=[art_item_indicator, art_current_display, art_image, current_item_idx]
    )

    # Connect components - Language Assessment
    lang_start_button.click(
        fn=init_language_assessment,
        outputs=[lang_audio_output, lang_current_display, gr.Image(visible=False), current_item_idx]
    )

    lang_audio_input.change(
        fn=process_assessment_audio,
        inputs=[lang_audio_input, gr.Textbox(value="language", visible=False), current_item_idx],
        outputs=[lang_audio_output, lang_result_display, current_item_idx, gr.Image(visible=False)]
    )

    # Fixed navigation for language assessment
    lang_next_button.click(
        fn=navigate_language,
        inputs=[gr.Textbox(value="next", visible=False), current_item_idx],
        outputs=[lang_item_indicator, lang_current_display, current_item_idx]
    )

    lang_prev_button.click(
        fn=navigate_language,
        inputs=[gr.Textbox(value="prev", visible=False), current_item_idx],
        outputs=[lang_item_indicator, lang_current_display, current_item_idx]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
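# ---------------------------------------------------------------------------
# Usage sketch (assumptions: running locally rather than on a Hugging Face
# Space, an app file named app.py, and the HF_API_TOKEN environment variable
# read above; the package list simply mirrors this file's imports):
#
#   pip install gradio transformers torch gTTS SpeechRecognition soundfile requests numpy
#   export HF_API_TOKEN=hf_...   # optional, enables Llama-2 feedback via the Inference API
#   python app.py
#
# Without HF_API_TOKEN the app still runs; get_ai_response() falls back to a
# message asking for a token instead of calling the hosted model.
# ---------------------------------------------------------------------------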