import gradio as gr
import tempfile
import numpy as np
import os
import wave
import requests
import torch
from gtts import gTTS
from transformers import pipeline

# Set up speech-to-text model
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Use lightweight models suitable for Hugging Face Spaces
STT_MODEL_ID = "openai/whisper-small"
TTS_MODEL_ID = "microsoft/speecht5_tts"

# Initialize the speech recognition model (will load on first use to save memory)
speech_recognizer = None

# Initialize the text-to-speech model (will load on first use to save memory)
tts_processor = None
tts_model = None

# Flag to indicate if models are ready
models_loaded = False

# Conversation state
conversation = []
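# NOTE: module-level state (conversation, assessment results) is shared
# across every connected session in a Gradio app; acceptable for a
# single-user demo, but per-session state belongs in gr.State.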

# Hugging Face API configuration for LLM
HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
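# On Spaces, add HF_API_TOKEN under Settings -> Repository secrets; secrets
# are exposed to the app as environment variables at runtime.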

headers = {
    "Authorization": f"Bearer {HF_API_TOKEN}",
    "Content-Type": "application/json"
}

# Sample assessment data
articulation_exercises = {
    "title": "Articulation Assessment",
    "instructions": "Record the child pronouncing each target word. The system will analyze pronunciation accuracy.",
    "words": [
        {
            "word": "Sun",
            "target_sound": "s",
            "position": "initial",
            "imageUrl": "https://images.unsplash.com/photo-1477500292188-6f0d31f8cb2e?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
        },
        {
            "word": "Mouse",
            "target_sound": "s",
            "position": "final",
            "imageUrl": "https://images.unsplash.com/photo-1425082661705-1834bfd09dca?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
        },
        {
            "word": "Pencil",
            "target_sound": "s",
            "position": "medial",
            "imageUrl": "https://images.unsplash.com/photo-1583485088034-697b5bc54ccd?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
        },
        {
            "word": "Tree",
            "target_sound": "tr",
            "position": "initial",
            "imageUrl": "https://images.unsplash.com/photo-1502082553048-f009c37129b9?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
        },
        {
            "word": "Blue",
            "target_sound": "bl",
            "position": "initial",
            "imageUrl": "https://images.unsplash.com/photo-1557180295-76eee20ae8aa?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
        }
    ]
}

language_exercises = {
    "title": "Language Assessment",
    "instructions": "Assess receptive and expressive language skills with these tasks. Record the child's response to each prompt.",
    "tasks": [
        {
            "prompt": "Point to the item that you eat with.",
            "type": "following_directions",
            "options": ["Fork", "Book", "Shoe", "Car"],
            "correct": "Fork"
        },
        {
            "prompt": "What is the opposite of hot?",
            "type": "vocabulary",
            "correct": "Cold"
        },
        {
            "prompt": "Make a sentence using the word 'happy'.",
            "type": "sentence_formation",
            "evaluation": "subjective"
        }
    ]
}

# Current assessment state
current_assessment = None
current_item_index = 0
assessment_results = []

def load_models():
    """Load speech models on first use"""
    global speech_recognizer, tts_processor, tts_model, models_loaded
    
    try:
        if speech_recognizer is None:
            # Load lightweight Whisper model for STT
            speech_recognizer = pipeline(
                "automatic-speech-recognition", 
                model=STT_MODEL_ID,
                torch_dtype=torch_dtype,
                device=device,
            )
            print("Speech recognition model loaded")
        
        # We'll use gTTS for TTS since it's more lightweight for Hugging Face Spaces
        # But we'll keep the code structure to allow for future upgrades
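        # Sketch of that upgrade path (assumes the transformers SpeechT5
        # classes; not executed here):
        #   from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
        #   tts_processor = SpeechT5Processor.from_pretrained(TTS_MODEL_ID)
        #   tts_model = SpeechT5ForTextToSpeech.from_pretrained(TTS_MODEL_ID)
        # SpeechT5 additionally needs a HiFi-GAN vocoder and speaker embeddings.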
        models_loaded = True
        return "Models loaded successfully"
    except Exception as e:
        print(f"Error loading models: {e}")
        return f"Error loading models: {e}"

def get_ai_response(user_text, context=None):
    """Get AI response from Hugging Face API"""
    if not user_text:
        return "I couldn't understand what you said. Could you try again?"
    
    # Add user input to conversation history
    conversation.append({"role": "user", "content": user_text})
    
    # Prepare for API call
    system_prompt = "You are a speech therapy assistant for the CASL 2 assessment tool. Provide helpful, supportive feedback for speech exercises."
    if context:
        system_prompt += f" Current context: {context}"
        
    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(conversation)
    
    try:
        if not HF_API_TOKEN:
            response_text = "Please add a Hugging Face API token in the Space settings to enable AI responses."
        else:
            # The Inference API text-generation endpoint expects a single
            # prompt string, so flatten the chat history into one block
            prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
            payload = {
                "inputs": prompt,
                "parameters": {
                    "max_new_tokens": 100,
                    "temperature": 0.7,
                    "top_p": 0.9,
                    "return_full_text": False
                }
            }
            
            response = requests.post(HF_API_URL, headers=headers, json=payload)
            
            if response.status_code == 200:
                response_text = response.json()[0]["generated_text"]
            else:
                response_text = f"I'm having trouble connecting to my language model. Error: {response.status_code}"
    except Exception as e:
        response_text = f"An error occurred: {str(e)}"
    
    # Add assistant response to conversation history
    conversation.append({"role": "assistant", "content": response_text})
    
    return response_text

def text_to_speech(text):
    """Convert text to speech using gTTS"""
    try:
        # Create a temporary file
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp:
            filename = temp.name
        
        # Generate speech
        tts = gTTS(text=text, lang="en", slow=False)
        tts.save(filename)
        
        return filename
    except Exception as e:
        print(f"TTS Error: {e}")
        return None
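# gTTS synthesizes via Google's online TTS endpoint, so the Space needs
# outbound network access; for fully offline TTS, use the SpeechT5 path
# sketched in load_models().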

def speech_to_text(audio):
    """Convert speech to text using Whisper model"""
    if audio is None:
        return None
    
    # Make sure models are loaded
    if not models_loaded:
        load_models()
    
    # Extract audio data
    sample_rate, audio_data = audio
    
    # Create a temporary WAV file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_path = temp_file.name
    
    try:
        # Save as 16-bit mono PCM; Gradio may deliver int16 or float audio
        if np.issubdtype(audio_data.dtype, np.integer):
            audio_data = audio_data.astype(np.float32) / 32768.0
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)  # downmix stereo to mono
        pcm16 = np.clip(audio_data * 32767, -32768, 32767).astype(np.int16)
        with wave.open(temp_path, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 16-bit audio
            wf.setframerate(sample_rate)
            wf.writeframes(pcm16.tobytes())
        
        # Use Whisper model to transcribe
        result = speech_recognizer(temp_path)
        text = result["text"]
        return text
    except Exception as e:
        print(f"STT Error: {e}")
        return None
    finally:
        # Clean up
        if os.path.exists(temp_path):
            os.unlink(temp_path)

def format_conversation():
    """Format the conversation history for display"""
    result = ""
    for msg in conversation:
        if msg["role"] != "system":  # Skip system messages
            prefix = "User: " if msg["role"] == "user" else "Assistant: "
            result += f"{prefix}{msg['content']}\n\n"
    return result

def analyze_speech(text, target):
    """Simple analysis of speech for assessment"""
    if not text or not target:
        return 0
    
    # Simple analysis - check if target word is in the transcribed text
    # In a real app, this would be more sophisticated
    if target.lower() in text.lower():
        # Simulate accuracy score (in a real app, use phonetic analysis)
        accuracy = np.random.uniform(70, 100)
    else:
        accuracy = np.random.uniform(0, 70)
        
    return accuracy
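# A less arbitrary scorer (sketch, standard library only): replace the
# random draw above with lexical similarity, e.g.
#   import difflib
#   accuracy = 100 * difflib.SequenceMatcher(
#       None, target.lower(), text.lower()).ratio()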

def process_assessment_audio(audio, assessment_type, item_index):
    """Process recorded audio for assessment item"""
    global current_item_index, assessment_results
    
    if audio is None:
        return None, f"No audio detected. Please try again.", item_index, None
    
    # Convert speech to text
    transcript = speech_to_text(audio)
    
    if not transcript:
        return None, "I couldn't understand the speech. Please try again.", item_index, None
    
    # Process based on assessment type
    if assessment_type == "articulation":
        current_word = articulation_exercises["words"][item_index]
        target_word = current_word["word"]
        accuracy = analyze_speech(transcript, target_word)
        
        result = {
            "word": target_word,
            "target_sound": current_word["target_sound"],
            "position": current_word["position"],
            "transcript": transcript,
            "accuracy": accuracy,
            "passed": accuracy > 70
        }
        
        assessment_results.append(result)
        
        # Get feedback from AI
        context = f"Assessment: Articulation. Target word: {target_word} with {current_word['target_sound']} sound in {current_word['position']} position. User said: {transcript}. Accuracy: {accuracy:.1f}%."
        feedback = get_ai_response(transcript, context)
        
        # Prepare for next item
        next_index = item_index + 1
        if next_index >= len(articulation_exercises["words"]):
            next_index = 0  # Reset or could end assessment
            
        result_display = f"""
        **Word**: {target_word}
        **Transcript**: {transcript}
        **Accuracy**: {accuracy:.1f}%
        **Result**: {"PASSED" if accuracy > 70 else "NEEDS PRACTICE"}
        
        {feedback}
        """
        
        # Return audio response, result display, next item index, and image URL
        response_audio = text_to_speech(feedback)
        next_image = articulation_exercises["words"][next_index]["imageUrl"] if next_index < len(articulation_exercises["words"]) else None
        return response_audio, result_display, next_index, next_image
        
    elif assessment_type == "language":
        # Similar processing for language assessment
        current_task = language_exercises["tasks"][item_index]
        
        result = {
            "prompt": current_task["prompt"],
            "type": current_task["type"],
            "response": transcript,
        }
        
        assessment_results.append(result)
        
        # Get feedback from AI
        context = f"Assessment: Language. Task: {current_task['prompt']}. User said: {transcript}."
        feedback = get_ai_response(transcript, context)
        
        # Prepare for next item
        next_index = item_index + 1
        if next_index >= len(language_exercises["tasks"]):
            next_index = 0  # Reset or could end assessment
            
        result_display = f"""
        **Prompt**: {current_task['prompt']}
        **Response**: {transcript}
        
        {feedback}
        """
        
        # Return audio response, result display, next item index
        response_audio = text_to_speech(feedback)
        return response_audio, result_display, next_index, None
    
    return None, "Unknown assessment type", item_index, None

def init_articulation_assessment():
    """Initialize articulation assessment"""
    global current_assessment, current_item_index, assessment_results
    current_assessment = "articulation"
    current_item_index = 0
    assessment_results = []
    
    # Make sure models are loaded
    if not models_loaded:
        load_models()
    
    instructions = articulation_exercises["instructions"]
    first_word = articulation_exercises["words"][0]["word"]
    message = f"{instructions}\n\nFirst word: {first_word}"
    
    audio_response = text_to_speech(message)
    current_image = articulation_exercises["words"][0]["imageUrl"]
    
    return audio_response, message, current_image, 0

def init_language_assessment():
    """Initialize language assessment"""
    global current_assessment, current_item_index, assessment_results
    current_assessment = "language"
    current_item_index = 0
    assessment_results = []
    
    # Make sure models are loaded
    if not models_loaded:
        load_models()
    
    instructions = language_exercises["instructions"]
    first_prompt = language_exercises["tasks"][0]["prompt"]
    message = f"{instructions}\n\nFirst task: {first_prompt}"
    
    audio_response = text_to_speech(message)
    
    return audio_response, message, None, 0

def update_art_item_indicator(idx):
    """Update articulation item indicator"""
    return f"{idx+1}/{len(articulation_exercises['words'])}"

def update_lang_item_indicator(idx):
    """Update language item indicator"""
    return f"{idx+1}/{len(language_exercises['tasks'])}"

def navigate_articulation(direction, current_idx):
    """Navigate through articulation items"""
    if direction == "prev":
        new_idx = max(0, current_idx - 1)
    else:  # next
        new_idx = min(len(articulation_exercises["words"]) - 1, current_idx + 1)
    
    current_word = articulation_exercises["words"][new_idx]
    message = f"Current word: {current_word['word']}"
    current_image = current_word["imageUrl"]
    
    return update_art_item_indicator(new_idx), message, current_image, new_idx

def navigate_language(direction, current_idx):
    """Navigate through language items"""
    if direction == "prev":
        new_idx = max(0, current_idx - 1)
    else:  # next
        new_idx = min(len(language_exercises["tasks"]) - 1, current_idx + 1)
    
    current_task = language_exercises["tasks"][new_idx]
    message = f"Current task: {current_task['prompt']}"
    
    return update_lang_item_indicator(new_idx), message, new_idx

def process_conversation_audio(audio):
    """Process recorded audio for conversation mode"""
    if audio is None:
        return None, "No audio detected. Please try again."
    
    # Make sure models are loaded
    if not models_loaded:
        load_models()
    
    # Convert speech to text
    transcript = speech_to_text(audio)
    
    if not transcript:
        return None, format_conversation() + "\nI couldn't understand your speech. Please try again."
    
    # Get AI response
    response = get_ai_response(transcript)
    
    # Convert response to speech
    audio_file = text_to_speech(response)
    
    # Return response
    return audio_file, format_conversation()

def initialize_conversation():
    """Initialize the conversation with a welcome message"""
    global conversation
    conversation = []
    
    # Make sure models are loaded
    if not models_loaded:
        load_models()
    
    # Add welcome message
    welcome = "Hello! I'm your CASL 2 speech therapy assistant. How can I help you today?"
    conversation.append({"role": "assistant", "content": welcome})
    
    # Generate speech
    welcome_audio = text_to_speech(welcome)
    
    return welcome_audio, format_conversation()

# Status message function
def get_status():
    """Get current status of the app"""
    if models_loaded:
        return "Models loaded and ready. The app is working in speech-to-speech mode."
    else:
        return "Models will be loaded on first use. This may take a moment when you first record audio."

# Custom CSS
custom_css = """
:root {
  --primary: #4a6fa5;
  --secondary: #6b96c3;
  --accent: #ff7e5f;
  --light: #f9f9f9;
  --dark: #333;
  --success: #4caf50;
  --warning: #ff9800;
  --error: #f44336;
}

.gradio-container {
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
  max-width: 1200px;
  margin: auto;
}

.app-header {
  background-color: var(--primary);
  color: white;
  padding: 1rem;
  border-radius: 8px 8px 0 0;
  margin-bottom: 1rem;
}

.tab-nav {
  margin-bottom: 1rem;
}

.input-panel {
  background-color: white;
  border-radius: 8px;
  box-shadow: 0 2px 10px rgba(0, 0, 0, 0.08);
  padding: 1rem;
  margin-bottom: 1rem;
}

.output-panel {
  background-color: white;
  border-radius: 8px;
  box-shadow: 0 2px 10px rgba(0, 0, 0, 0.08);
  padding: 1rem;
}

button.primary {
  background-color: var(--primary);
  color: white;
}

button.secondary {
  background-color: var(--secondary);
  color: white;
}

.image-display {
  display: flex;
  justify-content: center;
  margin: 1rem 0;
}

.image-display img {
  max-width: 300px;
  border-radius: 8px;
  box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
}

.status-bar {
  margin-top: 1rem;
  padding: 0.5rem;
  background-color: #f5f5f5;
  border-radius: 4px;
  font-size: 0.9rem;
  color: #666;
}
"""

# Create Gradio interface with tabs for different modes
with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as demo:
    # Current state variables (in Gradio 3.50.0, State doesn't have a change event)
    current_item_idx = gr.State(0)
    
    # App header
    with gr.Column(elem_classes="app-header"):
        gr.Markdown("# CASL 2 - Speech Therapy Assessment")
        gr.Markdown("An interactive tool for speech therapists to assess and treat speech disorders")
    
    # Status bar
    status_box = gr.Textbox(label="Status", value=get_status(), interactive=False, elem_classes="status-bar")
    
    # Main tabs
    with gr.Tabs() as tabs:
        # Conversation Mode Tab
        with gr.TabItem("Conversation Assistant", elem_classes="tab-nav"):
            gr.Markdown("### General Conversation Mode")
            gr.Markdown("Have a natural conversation with the AI assistant for general questions and guidance")
            
            with gr.Row():
                # Left panel - Controls
                with gr.Column(scale=1, elem_classes="input-panel"):
                    # Start button
                    conv_start_button = gr.Button("Start Conversation", variant="primary")
                    
                    # Microphone input
                    conv_audio_input = gr.Audio(
                        label="🎤 SPEAK HERE",
                        type="numpy",
                        sources=["microphone"],
                        elem_id="conv_mic"
                    )
                
                # Right panel - Conversation
                with gr.Column(scale=2, elem_classes="output-panel"):
                    # Conversation display
                    conv_display = gr.Textbox(
                        label="Conversation History",
                        lines=12,
                        value=""
                    )
                    
                    # Audio playback
                    conv_audio_output = gr.Audio(
                        label="AI Response",
                        type="filepath",
                        autoplay=True
                    )
        
        # Articulation Assessment Tab
        with gr.TabItem("Articulation Assessment", elem_classes="tab-nav"):
            gr.Markdown("### Articulation Assessment")
            gr.Markdown("Evaluate production of speech sounds in various positions within words")
            
            with gr.Row():
                # Left panel - Controls & Current Word
                with gr.Column(scale=1, elem_classes="input-panel"):
                    # Start button
                    art_start_button = gr.Button("Start Assessment", variant="primary")
                    
                    # Current word display
                    art_current_display = gr.Textbox(
                        label="Current Task",
                        lines=3
                    )
                    
                    # Word image
                    art_image = gr.Image(
                        label="Word Image",
                        type="filepath",
                        elem_classes="image-display"
                    )
                    
                    # Microphone input
                    art_audio_input = gr.Audio(
                        label="🎤 RECORD RESPONSE",
                        type="numpy",
                        sources=["microphone"],
                        elem_id="art_mic"
                    )
                    
                    # Navigation
                    with gr.Row():
                        art_prev_button = gr.Button("◀ Previous")
                        art_item_indicator = gr.Textbox(label="Item", value="1/5", interactive=False)
                        art_next_button = gr.Button("Next ▶")
                
                # Right panel - Results
                with gr.Column(scale=2, elem_classes="output-panel"):
                    # Results display
                    art_result_display = gr.Markdown(
                        label="Assessment Results",
                        value="Start the assessment to see results."
                    )
                    
                    # Audio feedback
                    art_audio_output = gr.Audio(
                        label="Speech Therapist Feedback",
                        type="filepath",
                        autoplay=True
                    )
        
        # Language Assessment Tab
        with gr.TabItem("Language Assessment", elem_classes="tab-nav"):
            gr.Markdown("### Language Assessment")
            gr.Markdown("Evaluate receptive and expressive language skills including vocabulary and grammar")
            
            with gr.Row():
                # Left panel - Controls & Current Task
                with gr.Column(scale=1, elem_classes="input-panel"):
                    # Start button
                    lang_start_button = gr.Button("Start Assessment", variant="primary")
                    
                    # Current task display
                    lang_current_display = gr.Textbox(
                        label="Current Task",
                        lines=3
                    )
                    
                    # Microphone input
                    lang_audio_input = gr.Audio(
                        label="🎤 RECORD RESPONSE",
                        type="numpy",
                        sources=["microphone"],
                        elem_id="lang_mic"
                    )
                    
                    # Navigation
                    with gr.Row():
                        lang_prev_button = gr.Button("◀ Previous")
                        lang_item_indicator = gr.Textbox(label="Item", value="1/3", interactive=False)
                        lang_next_button = gr.Button("Next ▶")
                
                # Right panel - Results
                with gr.Column(scale=2, elem_classes="output-panel"):
                    # Results display
                    lang_result_display = gr.Markdown(
                        label="Assessment Results",
                        value="Start the assessment to see results."
                    )
                    
                    # Audio feedback
                    lang_audio_output = gr.Audio(
                        label="Speech Therapist Feedback",
                        type="filepath",
                        autoplay=True
                    )
    
    # Instructions
    with gr.Accordion("How to use CASL 2", open=True):
        gr.Markdown("""
        ## CASL 2 Speech Therapy Assessment Tool
        
        This application provides three main functions:
        
        ### 1. Conversation Assistant
        - General conversation with an AI assistant
        - Ask questions about speech therapy, techniques, or general information
        - Get guidance on using the assessment tools
        
        ### 2. Articulation Assessment
        - Evaluate speech sound production
        - Record the patient pronouncing target words
        - Get automatic analysis and therapist feedback
        - Track progress over time
        
        ### 3. Language Assessment
        - Evaluate receptive and expressive language skills
        - Test vocabulary, following directions, and sentence formation
        - Record responses and get professional feedback
        
        **For therapists**: Use these tools during your sessions to supplement your professional assessment.
        
        **Privacy Note**: All audio recordings are processed securely and are not stored permanently.
        
        **Technical Note**: The first time you record audio, the app will load speech models which may take a moment.
        """)
    
    # Connect components - Conversation Mode
    conv_start_button.click(
        fn=initialize_conversation,
        outputs=[conv_audio_output, conv_display]
    )
    
    conv_audio_input.change(
        fn=process_conversation_audio,
        inputs=[conv_audio_input],
        outputs=[conv_audio_output, conv_display]
    )
    
    # Connect components - Articulation Assessment
    art_start_button.click(
        fn=init_articulation_assessment,
        outputs=[art_audio_output, art_current_display, art_image, current_item_idx]
    )
    
    art_audio_input.change(
        fn=process_assessment_audio,
        inputs=[art_audio_input, gr.Textbox(value="articulation", visible=False), current_item_idx],
        outputs=[art_audio_output, art_result_display, current_item_idx, art_image]
    )
    
    # Navigation for Gradio 3.50.0: constant arguments are passed through
    # invisible Textboxes because event handlers only accept components
    art_next_button.click(
        fn=navigate_articulation,
        inputs=[gr.Textbox(value="next", visible=False), current_item_idx],
        outputs=[art_item_indicator, art_current_display, art_image, current_item_idx]
    )
    
    art_prev_button.click(
        fn=navigate_articulation,
        inputs=[gr.Textbox(value="prev", visible=False), current_item_idx],
        outputs=[art_item_indicator, art_current_display, art_image, current_item_idx]
    )
    
    # Connect components - Language Assessment
    lang_start_button.click(
        fn=init_language_assessment,
        outputs=[lang_audio_output, lang_current_display, gr.Image(visible=False), current_item_idx]
    )
    
    lang_audio_input.change(
        fn=process_assessment_audio,
        inputs=[lang_audio_input, gr.Textbox(value="language", visible=False), current_item_idx],
        outputs=[lang_audio_output, lang_result_display, current_item_idx, gr.Image(visible=False)]
    )
    
    # Language navigation uses the same invisible-Textbox trick
    lang_next_button.click(
        fn=navigate_language,
        inputs=[gr.Textbox(value="next", visible=False), current_item_idx],
        outputs=[lang_item_indicator, lang_current_display, current_item_idx]
    )
    
    lang_prev_button.click(
        fn=navigate_language,
        inputs=[gr.Textbox(value="prev", visible=False), current_item_idx],
        outputs=[lang_item_indicator, lang_current_display, current_item_idx]
    )

# Launch the app
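# On Spaces, consider calling demo.queue() before launch so long-running
# model calls are not cut off by request timeouts.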
if __name__ == "__main__":
    demo.launch()