import gradio as gr
import tempfile
import numpy as np
import os
import wave
import requests
import torch
from gtts import gTTS
from transformers import pipeline
# Set up speech-to-text model
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# Use lightweight models suitable for Hugging Face Spaces
STT_MODEL_ID = "openai/whisper-small"
TTS_MODEL_ID = "microsoft/speecht5_tts"
# Initialize the speech recognition model (will load on first use to save memory)
speech_recognizer = None
# Initialize the text-to-speech model (will load on first use to save memory)
tts_processor = None
tts_model = None
# Flag to indicate if models are ready
models_loaded = False
# Conversation state
conversation = []
# Hugging Face API configuration for LLM
HF_API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
headers = {
"Authorization": f"Bearer {HF_API_TOKEN}",
"Content-Type": "application/json"
}
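# For reference, the text-generation Inference API is a plain JSON POST that
# returns a list with one {"generated_text": ...} dict; a minimal sketch,
# assuming HF_API_TOKEN above is set:
#
#   payload = {"inputs": "Hello", "parameters": {"max_new_tokens": 20}}
#   r = requests.post(HF_API_URL, headers=headers, json=payload)
#   print(r.json()[0]["generated_text"])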
# Sample assessment data
articulation_exercises = {
"title": "Articulation Assessment",
"instructions": "Record the child pronouncing each target word. The system will analyze pronunciation accuracy.",
"words": [
{
"word": "Sun",
"target_sound": "s",
"position": "initial",
"imageUrl": "https://images.unsplash.com/photo-1477500292188-6f0d31f8cb2e?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
},
{
"word": "Mouse",
"target_sound": "s",
"position": "final",
"imageUrl": "https://images.unsplash.com/photo-1425082661705-1834bfd09dca?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
},
{
"word": "Pencil",
"target_sound": "s",
"position": "medial",
"imageUrl": "https://images.unsplash.com/photo-1583485088034-697b5bc54ccd?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
},
{
"word": "Tree",
"target_sound": "tr",
"position": "initial",
"imageUrl": "https://images.unsplash.com/photo-1502082553048-f009c37129b9?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
},
{
"word": "Blue",
"target_sound": "bl",
"position": "initial",
"imageUrl": "https://images.unsplash.com/photo-1557180295-76eee20ae8aa?ixlib=rb-1.2.1&auto=format&fit=crop&w=300&q=80"
}
]
}
language_exercises = {
"title": "Language Assessment",
"instructions": "Assess receptive and expressive language skills with these tasks. Record the child's response to each prompt.",
"tasks": [
{
"prompt": "Point to the item that you eat with.",
"type": "following_directions",
"options": ["Fork", "Book", "Shoe", "Car"],
"correct": "Fork"
},
{
"prompt": "What is the opposite of hot?",
"type": "vocabulary",
"correct": "Cold"
},
{
"prompt": "Make a sentence using the word 'happy'.",
"type": "sentence_formation",
"evaluation": "subjective"
}
]
}
# Current assessment state
current_assessment = None
current_item_index = 0
assessment_results = []
def load_models():
"""Load speech models on first use"""
global speech_recognizer, tts_processor, tts_model, models_loaded
try:
if speech_recognizer is None:
# Load lightweight Whisper model for STT
speech_recognizer = pipeline(
"automatic-speech-recognition",
model=STT_MODEL_ID,
torch_dtype=torch_dtype,
device=device,
)
print("Speech recognition model loaded")
# We'll use gTTS for TTS since it's more lightweight for Hugging Face Spaces
# But we'll keep the code structure to allow for future upgrades
models_loaded = True
return "Models loaded successfully"
except Exception as e:
print(f"Error loading models: {e}")
return f"Error loading models: {e}"
def get_ai_response(user_text, context=None):
"""Get AI response from Hugging Face API"""
if not user_text:
return "I couldn't understand what you said. Could you try again?"
# Add user input to conversation history
conversation.append({"role": "user", "content": user_text})
# Prepare for API call
system_prompt = "You are a speech therapy assistant for the CASL 2 assessment tool. Provide helpful, supportive feedback for speech exercises."
if context:
system_prompt += f" Current context: {context}"
messages = [{"role": "system", "content": system_prompt}]
messages.extend(conversation)
try:
if not HF_API_TOKEN:
response_text = "Please add a Hugging Face API token in the Space settings to enable AI responses."
else:
            # The Inference API's text-generation task expects a single prompt
            # string, not a list of chat messages, so flatten the history here.
            # A fuller implementation would use the model's own chat template.
            prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
            prompt += "\nassistant:"
            payload = {
                "inputs": prompt,
                "parameters": {
                    "max_new_tokens": 100,
                    "temperature": 0.7,
                    "top_p": 0.9,
                    "return_full_text": False
                }
            }
            response = requests.post(HF_API_URL, headers=headers, json=payload, timeout=30)
            if response.status_code == 200:
                response_text = response.json()[0]["generated_text"].strip()
else:
response_text = f"I'm having trouble connecting to my language model. Error: {response.status_code}"
except Exception as e:
response_text = f"An error occurred: {str(e)}"
# Add assistant response to conversation history
conversation.append({"role": "assistant", "content": response_text})
return response_text
def text_to_speech(text):
"""Convert text to speech using gTTS"""
try:
# Create a temporary file
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp:
filename = temp.name
# Generate speech
tts = gTTS(text=text, lang="en", slow=False)
tts.save(filename)
return filename
except Exception as e:
print(f"TTS Error: {e}")
return None
def speech_to_text(audio):
"""Convert speech to text using Whisper model"""
if audio is None:
return None
# Make sure models are loaded
if not models_loaded:
load_models()
    # Unpack the (sample_rate, numpy array) tuple Gradio provides
    sample_rate, audio_data = audio
    # Create a temporary WAV file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_path = temp_file.name
    try:
        # Gradio may deliver float or int16 samples, mono or stereo; normalize
        # to mono int16 so the scaling below doesn't overflow already-int data
        audio_data = np.asarray(audio_data)
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        if np.issubdtype(audio_data.dtype, np.floating):
            audio_data = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
        else:
            audio_data = audio_data.astype(np.int16)
        # Save audio to file
        with wave.open(temp_path, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 16-bit PCM
            wf.setframerate(sample_rate)
            wf.writeframes(audio_data.tobytes())
# Use Whisper model to transcribe
result = speech_recognizer(temp_path)
text = result["text"]
return text
except Exception as e:
print(f"STT Error: {e}")
return None
finally:
# Clean up
if os.path.exists(temp_path):
os.unlink(temp_path)
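# Note: the ASR pipeline can also consume raw arrays directly, skipping the
# temp file; a sketch, assuming mono int16 samples as produced above:
#
#   result = speech_recognizer({"sampling_rate": sample_rate,
#                               "raw": audio_data.astype(np.float32) / 32767.0})
#
# The file route is kept here because ffmpeg then handles resampling.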
def format_conversation():
"""Format the conversation history for display"""
result = ""
for msg in conversation:
if msg["role"] != "system": # Skip system messages
prefix = "User: " if msg["role"] == "user" else "Assistant: "
result += f"{prefix}{msg['content']}\n\n"
return result
def analyze_speech(text, target):
"""Simple analysis of speech for assessment"""
if not text or not target:
return 0
# Simple analysis - check if target word is in the transcribed text
# In a real app, this would be more sophisticated
if target.lower() in text.lower():
# Simulate accuracy score (in a real app, use phonetic analysis)
accuracy = np.random.uniform(70, 100)
else:
accuracy = np.random.uniform(0, 70)
return accuracy
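# The substring check above is deliberately crude. As a sketch of a slightly
# sturdier scorer (not wired into the app), string similarity against each
# transcribed token gives a graded score; the name fuzzy_word_accuracy and
# the 0-100 scaling are illustrative choices, not part of the original design.
import difflib

def fuzzy_word_accuracy(text, target):
    """Return a 0-100 similarity score between the target word and the
    closest token in the transcript; difflib's ratio stands in for real
    phonetic analysis."""
    if not text or not target:
        return 0.0
    tokens = [t.strip(".,!?").lower() for t in text.split()]
    best = max(
        (difflib.SequenceMatcher(None, target.lower(), t).ratio() for t in tokens),
        default=0.0,
    )
    return best * 100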
def process_assessment_audio(audio, assessment_type, item_index):
"""Process recorded audio for assessment item"""
    global assessment_results
if audio is None:
return None, f"No audio detected. Please try again.", item_index, None
# Convert speech to text
transcript = speech_to_text(audio)
if not transcript:
return None, "I couldn't understand the speech. Please try again.", item_index, None
# Process based on assessment type
if assessment_type == "articulation":
current_word = articulation_exercises["words"][item_index]
target_word = current_word["word"]
accuracy = analyze_speech(transcript, target_word)
result = {
"word": target_word,
"target_sound": current_word["target_sound"],
"position": current_word["position"],
"transcript": transcript,
"accuracy": accuracy,
"passed": accuracy > 70
}
assessment_results.append(result)
# Get feedback from AI
context = f"Assessment: Articulation. Target word: {target_word} with {current_word['target_sound']} sound in {current_word['position']} position. User said: {transcript}. Accuracy: {accuracy:.1f}%."
feedback = get_ai_response(transcript, context)
# Prepare for next item
next_index = item_index + 1
if next_index >= len(articulation_exercises["words"]):
next_index = 0 # Reset or could end assessment
result_display = f"""
**Word**: {target_word}
**Transcript**: {transcript}
**Accuracy**: {accuracy:.1f}%
**Result**: {"PASSED" if accuracy > 70 else "NEEDS PRACTICE"}
{feedback}
"""
# Return audio response, result display, next item index, and image URL
response_audio = text_to_speech(feedback)
next_image = articulation_exercises["words"][next_index]["imageUrl"] if next_index < len(articulation_exercises["words"]) else None
return response_audio, result_display, next_index, next_image
elif assessment_type == "language":
# Similar processing for language assessment
current_task = language_exercises["tasks"][item_index]
result = {
"prompt": current_task["prompt"],
"type": current_task["type"],
"response": transcript,
}
assessment_results.append(result)
# Get feedback from AI
context = f"Assessment: Language. Task: {current_task['prompt']}. User said: {transcript}."
feedback = get_ai_response(transcript, context)
# Prepare for next item
next_index = item_index + 1
if next_index >= len(language_exercises["tasks"]):
next_index = 0 # Reset or could end assessment
result_display = f"""
**Prompt**: {current_task['prompt']}
**Response**: {transcript}
{feedback}
"""
# Return audio response, result display, next item index
response_audio = text_to_speech(feedback)
return response_audio, result_display, next_index, None
return None, "Unknown assessment type", item_index, None
def init_articulation_assessment():
"""Initialize articulation assessment"""
global current_assessment, current_item_index, assessment_results
current_assessment = "articulation"
current_item_index = 0
assessment_results = []
# Make sure models are loaded
if not models_loaded:
load_models()
instructions = articulation_exercises["instructions"]
first_word = articulation_exercises["words"][0]["word"]
message = f"{instructions}\n\nFirst word: {first_word}"
audio_response = text_to_speech(message)
current_image = articulation_exercises["words"][0]["imageUrl"]
return audio_response, message, current_image, 0
def init_language_assessment():
"""Initialize language assessment"""
global current_assessment, current_item_index, assessment_results
current_assessment = "language"
current_item_index = 0
assessment_results = []
# Make sure models are loaded
if not models_loaded:
load_models()
instructions = language_exercises["instructions"]
first_prompt = language_exercises["tasks"][0]["prompt"]
message = f"{instructions}\n\nFirst task: {first_prompt}"
audio_response = text_to_speech(message)
return audio_response, message, None, 0
def update_art_item_indicator(idx):
"""Update articulation item indicator"""
return f"{idx+1}/{len(articulation_exercises['words'])}"
def update_lang_item_indicator(idx):
"""Update language item indicator"""
return f"{idx+1}/{len(language_exercises['tasks'])}"
def navigate_articulation(direction, current_idx):
"""Navigate through articulation items"""
if direction == "prev":
new_idx = max(0, current_idx - 1)
else: # next
new_idx = min(len(articulation_exercises["words"]) - 1, current_idx + 1)
current_word = articulation_exercises["words"][new_idx]
message = f"Current word: {current_word['word']}"
current_image = current_word["imageUrl"]
return update_art_item_indicator(new_idx), message, current_image, new_idx
def navigate_language(direction, current_idx):
"""Navigate through language items"""
if direction == "prev":
new_idx = max(0, current_idx - 1)
else: # next
new_idx = min(len(language_exercises["tasks"]) - 1, current_idx + 1)
current_task = language_exercises["tasks"][new_idx]
message = f"Current task: {current_task['prompt']}"
return update_lang_item_indicator(new_idx), message, new_idx
def process_conversation_audio(audio):
"""Process recorded audio for conversation mode"""
if audio is None:
return None, "No audio detected. Please try again."
# Make sure models are loaded
if not models_loaded:
load_models()
# Convert speech to text
transcript = speech_to_text(audio)
if not transcript:
return None, format_conversation() + "\nI couldn't understand your speech. Please try again."
# Get AI response
response = get_ai_response(transcript)
# Convert response to speech
audio_file = text_to_speech(response)
# Return response
return audio_file, format_conversation()
def initialize_conversation():
"""Initialize the conversation with a welcome message"""
global conversation
conversation = []
# Make sure models are loaded
if not models_loaded:
load_models()
# Add welcome message
welcome = "Hello! I'm your CASL 2 speech therapy assistant. How can I help you today?"
conversation.append({"role": "assistant", "content": welcome})
# Generate speech
welcome_audio = text_to_speech(welcome)
return welcome_audio, format_conversation()
# Status message function
def get_status():
"""Get current status of the app"""
if models_loaded:
return "Models loaded and ready. The app is working in speech-to-speech mode."
else:
return "Models will be loaded on first use. This may take a moment when you first record audio."
# Custom CSS
custom_css = """
:root {
--primary: #4a6fa5;
--secondary: #6b96c3;
--accent: #ff7e5f;
--light: #f9f9f9;
--dark: #333;
--success: #4caf50;
--warning: #ff9800;
--error: #f44336;
}
.gradio-container {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
max-width: 1200px;
margin: auto;
}
.app-header {
background-color: var(--primary);
color: white;
padding: 1rem;
border-radius: 8px 8px 0 0;
margin-bottom: 1rem;
}
.tab-nav {
margin-bottom: 1rem;
}
.input-panel {
background-color: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.08);
padding: 1rem;
margin-bottom: 1rem;
}
.output-panel {
background-color: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.08);
padding: 1rem;
}
button.primary {
background-color: var(--primary);
color: white;
}
button.secondary {
background-color: var(--secondary);
color: white;
}
.image-display {
display: flex;
justify-content: center;
margin: 1rem 0;
}
.image-display img {
max-width: 300px;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
}
.status-bar {
margin-top: 1rem;
padding: 0.5rem;
background-color: #f5f5f5;
border-radius: 4px;
font-size: 0.9rem;
color: #666;
}
"""
# Create Gradio interface with tabs for different modes
with gr.Blocks(title="CASL 2 - Speech Therapy Assessment", css=custom_css) as demo:
# Current state variables (in Gradio 3.50.0, State doesn't have a change event)
current_item_idx = gr.State(0)
# App header
with gr.Column(elem_classes="app-header"):
gr.Markdown("# CASL 2 - Speech Therapy Assessment")
gr.Markdown("An interactive tool for speech therapists to assess and treat speech disorders")
# Status bar
status_box = gr.Textbox(label="Status", value=get_status(), interactive=False, elem_classes="status-bar")
# Main tabs
with gr.Tabs() as tabs:
# Conversation Mode Tab
with gr.TabItem("Conversation Assistant", elem_classes="tab-nav"):
gr.Markdown("### General Conversation Mode")
gr.Markdown("Have a natural conversation with the AI assistant for general questions and guidance")
with gr.Row():
# Left panel - Controls
with gr.Column(scale=1, elem_classes="input-panel"):
# Start button
conv_start_button = gr.Button("Start Conversation", variant="primary")
# Microphone input
conv_audio_input = gr.Audio(
label="🎤 SPEAK HERE",
type="numpy",
sources=["microphone"],
elem_id="conv_mic"
)
# Right panel - Conversation
with gr.Column(scale=2, elem_classes="output-panel"):
# Conversation display
conv_display = gr.Textbox(
label="Conversation History",
lines=12,
value=""
)
# Audio playback
conv_audio_output = gr.Audio(
label="AI Response",
type="filepath",
autoplay=True
)
# Articulation Assessment Tab
with gr.TabItem("Articulation Assessment", elem_classes="tab-nav"):
gr.Markdown("### Articulation Assessment")
gr.Markdown("Evaluate production of speech sounds in various positions within words")
with gr.Row():
# Left panel - Controls & Current Word
with gr.Column(scale=1, elem_classes="input-panel"):
# Start button
art_start_button = gr.Button("Start Assessment", variant="primary")
# Current word display
art_current_display = gr.Textbox(
label="Current Task",
lines=3
)
# Word image
art_image = gr.Image(
label="Word Image",
type="filepath",
elem_classes="image-display"
)
# Microphone input
art_audio_input = gr.Audio(
label="🎤 RECORD RESPONSE",
type="numpy",
sources=["microphone"],
elem_id="art_mic"
)
# Navigation
with gr.Row():
art_prev_button = gr.Button("◀ Previous")
art_item_indicator = gr.Textbox(label="Item", value="1/5", interactive=False)
art_next_button = gr.Button("Next ▶")
# Right panel - Results
with gr.Column(scale=2, elem_classes="output-panel"):
# Results display
art_result_display = gr.Markdown(
label="Assessment Results",
value="Start the assessment to see results."
)
# Audio feedback
art_audio_output = gr.Audio(
label="Speech Therapist Feedback",
type="filepath",
autoplay=True
)
# Language Assessment Tab
with gr.TabItem("Language Assessment", elem_classes="tab-nav"):
gr.Markdown("### Language Assessment")
gr.Markdown("Evaluate receptive and expressive language skills including vocabulary and grammar")
with gr.Row():
# Left panel - Controls & Current Task
with gr.Column(scale=1, elem_classes="input-panel"):
# Start button
lang_start_button = gr.Button("Start Assessment", variant="primary")
# Current task display
lang_current_display = gr.Textbox(
label="Current Task",
lines=3
)
# Microphone input
lang_audio_input = gr.Audio(
label="🎤 RECORD RESPONSE",
type="numpy",
sources=["microphone"],
elem_id="lang_mic"
)
# Navigation
with gr.Row():
lang_prev_button = gr.Button("◀ Previous")
lang_item_indicator = gr.Textbox(label="Item", value="1/3", interactive=False)
lang_next_button = gr.Button("Next ▶")
# Right panel - Results
with gr.Column(scale=2, elem_classes="output-panel"):
# Results display
lang_result_display = gr.Markdown(
label="Assessment Results",
value="Start the assessment to see results."
)
# Audio feedback
lang_audio_output = gr.Audio(
label="Speech Therapist Feedback",
type="filepath",
autoplay=True
)
# Instructions
with gr.Accordion("How to use CASL 2", open=True):
gr.Markdown("""
## CASL 2 Speech Therapy Assessment Tool
This application provides three main functions:
### 1. Conversation Assistant
- General conversation with an AI assistant
- Ask questions about speech therapy, techniques, or general information
- Get guidance on using the assessment tools
### 2. Articulation Assessment
- Evaluate speech sound production
- Record the patient pronouncing target words
- Get automatic analysis and therapist feedback
- Track progress over time
### 3. Language Assessment
- Evaluate receptive and expressive language skills
- Test vocabulary, following directions, and sentence formation
- Record responses and get professional feedback
**For therapists**: Use these tools during your sessions to supplement your professional assessment.
**Privacy Note**: All audio recordings are processed securely and are not stored permanently.
**Technical Note**: The first time you record audio, the app will load speech models which may take a moment.
""")
# Connect components - Conversation Mode
conv_start_button.click(
fn=initialize_conversation,
outputs=[conv_audio_output, conv_display]
)
conv_audio_input.change(
fn=process_conversation_audio,
inputs=[conv_audio_input],
outputs=[conv_audio_output, conv_display]
)
# Connect components - Articulation Assessment
art_start_button.click(
fn=init_articulation_assessment,
outputs=[art_audio_output, art_current_display, art_image, current_item_idx]
)
    art_audio_input.change(
        fn=lambda audio, idx: process_assessment_audio(audio, "articulation", idx),
        inputs=[art_audio_input, current_item_idx],
        outputs=[art_audio_output, art_result_display, current_item_idx, art_image]
    )
    # Navigation buttons bind their direction via a lambda instead of a
    # hidden Textbox, which keeps the layout free of dummy components
    art_next_button.click(
        fn=lambda idx: navigate_articulation("next", idx),
        inputs=[current_item_idx],
        outputs=[art_item_indicator, art_current_display, art_image, current_item_idx]
    )
    art_prev_button.click(
        fn=lambda idx: navigate_articulation("prev", idx),
        inputs=[current_item_idx],
        outputs=[art_item_indicator, art_current_display, art_image, current_item_idx]
    )
    # Connect components - Language Assessment
    # Language items have no image, but the shared callbacks return a
    # four-tuple, so route the unused slot to one hidden placeholder image
    lang_hidden_image = gr.Image(visible=False)
    lang_start_button.click(
        fn=init_language_assessment,
        outputs=[lang_audio_output, lang_current_display, lang_hidden_image, current_item_idx]
    )
    lang_audio_input.change(
        fn=lambda audio, idx: process_assessment_audio(audio, "language", idx),
        inputs=[lang_audio_input, current_item_idx],
        outputs=[lang_audio_output, lang_result_display, current_item_idx, lang_hidden_image]
    )
    lang_next_button.click(
        fn=lambda idx: navigate_language("next", idx),
        inputs=[current_item_idx],
        outputs=[lang_item_indicator, lang_current_display, current_item_idx]
    )
    lang_prev_button.click(
        fn=lambda idx: navigate_language("prev", idx),
        inputs=[current_item_idx],
        outputs=[lang_item_indicator, lang_current_display, current_item_idx]
    )
# Launch the app
if __name__ == "__main__":
demo.launch()