speechkid-api / simulate.py
Nanny7's picture
🚀 Force deploy new standalone AI engine
f3c6fc0
Raw
History Blame Contribute Delete
16.8 kB
"""
Speech Analysis MVP - Real-Time Simulation Script
This script provides an interactive terminal interface for testing
the speech analysis system in real-time using your microphone.
Author: Speech AI Project
"""
import os
import sys
import time
import threading
import numpy as np
import sounddevice as sd
from scipy.io import wavfile
# Import our score engine
from score_engine import score_pronunciation
# =============================================================================
# Configuration
# =============================================================================
# Sample rate used as the default for both recording and the WAV file written.
SAMPLE_RATE = 44100
RECORD_DURATION = 3 # seconds
# Scratch WAV file each recording is saved to before it is handed to the scorer.
TEMP_FILE = "temp_test.wav"
# Directory passed to score_pronunciation() as its third argument.
AUDIO_DIR = "audio_data"
# Available words for practice
AVAILABLE_WORDS = ["shalom", "shemesh", "shir", "shshshsh", "shuk", "geshem"]
# Colors for terminal output (ANSI escape codes)
class Colors:
    """ANSI escape sequences used to colour and style terminal output.

    Each attribute is a plain string that is interpolated into printed text;
    ``END`` resets whatever styling was switched on before it.
    """

    # Bright foreground colours
    HEADER = "\x1b[95m"
    BLUE = "\x1b[94m"
    CYAN = "\x1b[96m"
    GREEN = "\x1b[92m"
    YELLOW = "\x1b[93m"
    RED = "\x1b[91m"

    # Text attributes
    BOLD = "\x1b[1m"
    UNDERLINE = "\x1b[4m"

    # Reset
    END = "\x1b[0m"
# =============================================================================
# Terminal UI Functions
# =============================================================================
def clear_screen():
    """Wipe the terminal by running the platform's clear command."""
    # os.name is 'nt' on Windows, where the command is `cls`; otherwise `clear`.
    command = 'clear'
    if os.name == 'nt':
        command = 'cls'
    os.system(command)
def print_header():
    """Print the application banner inside a double-line box.

    The border and title rows are built from one width so the box sides
    line up, and the box-drawing / emoji characters are written as proper
    Unicode rather than mis-decoded byte sequences.
    """
    inner_width = 62
    title = "🎤 SPEECH ANALYSIS - 'SH' SOUND TRAINER 🎤"
    # Each emoji counts as one character but usually occupies two terminal
    # columns, so centre within two columns fewer to keep the right edge aligned.
    title_row = title.center(inner_width - 2)
    border = '═' * inner_width

    print(f"\n{Colors.CYAN}{Colors.BOLD}")
    print(f"╔{border}╗")
    print(f"║{title_row}║")
    print(f"╚{border}╝")
    print(f"{Colors.END}")
def print_box(title: str, content: list, color: str = Colors.CYAN):
    """Print a titled, single-line-bordered box around some text rows.

    Args:
        title: Heading centred in the top section of the box.
        content: Text rows printed left-aligned below the heading; may be
            empty, in which case only the heading section is drawn.
        color: ANSI colour sequence applied to the border.
    """
    # Widest text row decides the box width. default=0 keeps an empty
    # `content` list from raising ValueError inside max().
    widest = max(len(title), max((len(line) for line in content), default=0))
    max_len = widest + 4
    horizontal = '─' * max_len
    text_width = max_len - 2  # one space of margin on each side

    print(f"\n{color}┌{horizontal}┐{Colors.END}")
    print(f"{color}│{Colors.BOLD} {title.center(text_width)} {Colors.END}{color}│{Colors.END}")
    print(f"{color}├{horizontal}┤{Colors.END}")
    for line in content:
        print(f"{color}│{Colors.END} {line.ljust(text_width)} {color}│{Colors.END}")
    print(f"{color}└{horizontal}┘{Colors.END}")
def print_word_menu():
    """List every practice word with its menu number, then the exit entry."""
    bold, reset = Colors.BOLD, Colors.END
    print(f"\n{Colors.YELLOW}Available words to practice:{reset}\n")
    # Menu numbers are 1-based; 0 is reserved for "Exit".
    for position, word in enumerate(AVAILABLE_WORDS):
        print(f" {bold}{position + 1}.{reset} {word}")
    print(f"\n {bold}0.{reset} Exit")
    print()
def get_word_choice() -> "str | None":
    """Prompt until the user picks a valid menu entry.

    Returns:
        The chosen word from AVAILABLE_WORDS, or None when the user enters
        0 to exit.
    """
    prompt = f"{Colors.CYAN}Enter your choice (1-{len(AVAILABLE_WORDS)}): {Colors.END}"
    while True:
        # Strip so that stray whitespace (e.g. "0 ") is not treated as a
        # different, invalid answer.
        choice = input(prompt).strip()
        if choice == '0':
            return None
        try:
            idx = int(choice) - 1
        except ValueError:
            print(f"{Colors.RED}Please enter a number.{Colors.END}")
            continue
        if 0 <= idx < len(AVAILABLE_WORDS):
            return AVAILABLE_WORDS[idx]
        print(f"{Colors.RED}Invalid choice. Please try again.{Colors.END}")
def countdown_display(seconds: int):
    """Print a countdown and then the "recording now" banner.

    Args:
        seconds: Number the countdown starts from. Each step is shown for
            0.7 s, so the total wait is shorter than `seconds` real seconds.
    """
    print(f"\n{Colors.YELLOW}Get ready to say the word...{Colors.END}")
    # Honour the argument instead of always counting down from 3.
    for remaining in range(seconds, 0, -1):
        print(f" {Colors.BOLD}{remaining}...{Colors.END}", end='', flush=True)
        time.sleep(0.7)
    print(f"\n\n{Colors.GREEN}{Colors.BOLD}🔴 RECORDING NOW! Speak clearly...{Colors.END}\n")
def recording_progress(duration: int):
    """Animate a progress bar for roughly `duration` seconds.

    The bar is redrawn in place every 0.1 s using a carriage return.

    Args:
        duration: Length of the recording in seconds. Whole numbers behave
            as before; fractional values are rounded to the nearest tenth.
    """
    bar_length = 40
    # One redraw per tenth of a second; int(round()) lets a float duration
    # through instead of failing inside range().
    ticks = int(round(duration * 10))
    for i in range(ticks):
        progress = (i + 1) / ticks
        filled = int(bar_length * progress)
        bar = '█' * filled + '░' * (bar_length - filled)
        remaining = duration - (i / 10)
        print(f"\r [{Colors.GREEN}{bar}{Colors.END}] {remaining:.1f}s remaining", end='', flush=True)
        time.sleep(0.1)

    # The final text is shorter than "<n>s remaining"; pad it to the widest
    # countdown text so no stale characters are left on the line.
    widest_tail = len(f"{float(duration):.1f}s remaining")
    done_text = "Done!".ljust(widest_tail)
    print(f"\r [{Colors.GREEN}{'█' * bar_length}{Colors.END}] {done_text}")
    print(f"\n{Colors.CYAN}Processing your recording...{Colors.END}")
# =============================================================================
# Audio Recording Functions
# =============================================================================
def record_audio(duration: int = RECORD_DURATION, sample_rate: int = SAMPLE_RATE) -> np.ndarray:
    """
    Capture mono audio from the default input device.

    Args:
        duration: How long to record, in seconds
        sample_rate: Sample rate requested from the device

    Returns:
        The recording flattened to a one-dimensional float32 array
    """
    frame_count = int(duration * sample_rate)

    # Kick off the capture; it runs in the background so the progress bar
    # below can be drawn while audio is being collected.
    buffer = sd.rec(frame_count, samplerate=sample_rate, channels=1, dtype='float32')

    recording_progress(duration)

    # Block until the capture has actually finished before reading the buffer.
    sd.wait()
    return buffer.flatten()
def save_audio(audio: np.ndarray, filepath: str, sample_rate: int = SAMPLE_RATE):
    """Write floating-point audio to `filepath` as a 16-bit PCM WAV file.

    Args:
        audio: Samples expected in the range [-1.0, 1.0].
        filepath: Destination path of the WAV file.
        sample_rate: Sample rate stored in the WAV header.
    """
    # Clamp before scaling: a sample outside [-1, 1] would otherwise exceed
    # the int16 range and wrap around instead of saturating.
    clipped = np.clip(audio, -1.0, 1.0)
    audio_int16 = (clipped * 32767).astype(np.int16)
    wavfile.write(filepath, sample_rate, audio_int16)
# =============================================================================
# Result Display Functions
# =============================================================================
# Maps the error/diagnosis code found in a scoring result to the label shown
# to the user: an English name followed by a Hebrew gloss in parentheses.
# The Hebrew text is stored as real Unicode rather than mis-decoded bytes.
ERROR_TYPE_LABELS = {
    # New phoneme pipeline diagnosis codes
    "CORRECT": "CORRECT (תקין)",
    "ERROR_OMISSION": "OMISSION (השמטה)",
    "ERROR_S_SUBSTITUTION": "S SUBSTITUTION (החלפה ל-ס)",
    "ERROR_LATERAL_LISP": "LATERAL LISP (ש' צידית/רטובה)",
    "UNCLEAR_DISTORTION": "UNCLEAR (עיוות)",
    # Legacy pipeline codes
    "wet_ch": "WET CH (ש' רטובה)",
    "s_substitution": "S SUBSTITUTION (החלפה ל-ס)",
    "lateral_sh": "LATERAL SH (ש' צידית)",
    "omission": "OMISSION (השמטה)",
    "distortion": "DISTORTION (עיוות)",
}
def _print_shared_spectral_details(details: dict):
    """Print the four spectral measurements that both pipelines report."""
    print(f" • Centroid: {details.get('centroid_hz', 0)} Hz")
    print(f" • S-Band Energy: {details.get('s_band_percent', 0.0):.1f}%")
    print(f" • Lateral Energy: {details.get('lateral_percent', 0.0):.1f}%")
    print(f" • Sub-3kHz Energy: {details.get('sub3k_percent', 0.0):.1f}%")


def _print_alignment(alignment: dict):
    """Print the alignment section; prints nothing when there is no data."""
    shin = alignment.get('shin') or {}
    segments = alignment.get('segments') or []
    if not shin and not segments:
        return
    # The heading is printed whenever there is anything to list beneath it.
    print(f"\n {Colors.CYAN}Alignment:{Colors.END}")
    if shin:
        print(f" • Shin segment: {shin.get('start_sec', 0):.3f}s - {shin.get('end_sec', 0):.3f}s ({shin.get('duration', 0)*1000:.0f}ms)")
        print(f" • Alignment confidence: {shin.get('score', 0):.2f}")
    if segments:
        seg_str = " ".join(
            f"{s['char']}[{s['start']:.2f}-{s['end']:.2f}]"
            for s in segments
        )
        print(f" • All segments: {seg_str}")


def display_result(result: dict, word: str):
    """Print a formatted report for one scoring result.

    Args:
        result: Scoring result. Must contain 'score', 'status',
            'error_type', 'feedback' and 'details'; 'pipeline' (defaults to
            'legacy') and 'alignment' are optional.
        word: The practised word, shown upper-cased in the report heading.
    """
    score = result['score']
    status = result['status']
    error_type = result['error_type']
    feedback = result['feedback']
    details = result['details']

    # Score band decides the colour and the marker shown next to the score.
    if score >= 90:
        score_color, emoji = Colors.GREEN, "🌟"
    elif score >= 70:
        score_color, emoji = Colors.YELLOW, "✓"
    elif score >= 50:
        score_color, emoji = Colors.YELLOW, "⚠"
    else:
        score_color, emoji = Colors.RED, "✗"

    passed = status == "PASS"
    status_color = Colors.GREEN if passed else Colors.RED
    rule = f"{Colors.BOLD}{'═' * 60}{Colors.END}"

    # Heading
    print(f"\n{rule}")
    print(f"{Colors.BOLD} RESULTS FOR: '{word.upper()}'{Colors.END}")
    print(rule)

    # Score and status; anything other than "PASS" is reported as FAIL.
    print(f"\n {emoji} {Colors.BOLD}SCORE:{Colors.END} {score_color}{Colors.BOLD}{score}/100{Colors.END}")
    if passed:
        print(f" ✓ {Colors.GREEN}{Colors.BOLD}STATUS: PASS{Colors.END}")
    else:
        print(f" ✗ {Colors.RED}{Colors.BOLD}STATUS: FAIL{Colors.END}")

    # Error type is shown for every detected error, not only on FAIL.
    # A missing/None value is treated like "none" instead of crashing in
    # the .replace() fallback below.
    if error_type and error_type != "none":
        error_color = Colors.RED if status == "FAIL" else Colors.YELLOW
        error_display = ERROR_TYPE_LABELS.get(error_type, str(error_type).replace('_', ' ').upper())
        print(f"\n {error_color}{Colors.BOLD}⚠ ERROR TYPE: {error_display}{Colors.END}")

    # Feedback is printed on its own line, unwrapped (it may be Hebrew/RTL).
    print(f"\n {status_color}{Colors.BOLD}💬 FEEDBACK:{Colors.END}")
    print(f" {feedback}")

    # Which scoring pipeline produced the result.
    pipeline = result.get('pipeline', 'legacy')
    pipeline_label = "Phoneme (Wav2Vec2)" if pipeline == "phoneme" else "Legacy (DTW)"
    print(f"\n {Colors.CYAN}Pipeline: {pipeline_label}{Colors.END}")

    # Technical details differ per pipeline; four spectral lines are shared.
    print(f"\n {Colors.CYAN}Technical Details:{Colors.END}")
    if pipeline == "phoneme":
        _print_shared_spectral_details(details)
        print(f" • High/Mid Ratio: {details.get('high_mid_ratio', 0.0):.3f}")
        print(f" • Bandwidth: {details.get('bandwidth_mean', 0.0):.1f} Hz")
        print(f" • Spectral Skewness: {details.get('spectral_skewness', 0.0):.4f}")
        # `or {}` also covers an explicit None stored under 'alignment'.
        _print_alignment(result.get('alignment') or {})
    else:
        print(f" • Distance: {details.get('distance', 0.0):.4f} (raw: {details.get('raw_distance', 0.0):.4f}, modifier: x{details.get('modifier', 1.0):.1f})")
        _print_shared_spectral_details(details)
        print(f" • Spectral Flatness: {details.get('spectral_flatness', 0.0):.4f}")
        print(f" • Amp Modulation: {details.get('amp_modulation', 0.0):.3f}")
        print(f" • Fricative Frames: {details.get('fricative_frames', 0)}")
        print(f" • Stable: {'Yes' if details.get('is_stable', False) else 'No'}")

    print(f"\n{rule}")
def display_comparison(word: str):
    """Print a short articulation cheat-sheet for the selected word.

    Args:
        word: The word being practised; only used in the heading.
    """
    # Bullets and the dash are real Unicode characters rather than
    # mis-decoded byte sequences.
    print(f"\n{Colors.CYAN}Quick Reference for '{word}':{Colors.END}")
    print(f" • {Colors.GREEN}Good 'Sh':{Colors.END} Tongue back, lips rounded, soft smooth airflow")
    print(f" • {Colors.RED}Bad 'S':{Colors.END} Tongue forward, teeth close, sharp airflow")
    print(f" • {Colors.RED}Bad 'wet CH':{Colors.END} Slushy/saliva sound — keep tongue centered, blow dry air")
# =============================================================================
# Main Simulation Loop
# =============================================================================
def _record_and_score(word: str) -> bool:
    """Record one attempt at `word`, score it and print the outcome.

    Returns:
        False when recording or saving the audio failed (nothing was
        scored); True otherwise, including when scoring itself reported or
        raised an error.
    """
    try:
        audio = record_audio(RECORD_DURATION)
        save_audio(audio, TEMP_FILE)
    except Exception as e:
        print(f"\n{Colors.RED}Error recording audio: {e}{Colors.END}")
        print(f"{Colors.YELLOW}Make sure your microphone is connected and working.{Colors.END}")
        return False

    try:
        result = score_pronunciation(TEMP_FILE, word, AUDIO_DIR)
        if result['status'] == 'ERROR':
            print(f"\n{Colors.RED}Error: {result['feedback']}{Colors.END}")
        else:
            display_result(result, word)
    except Exception as e:
        # Interactive boundary: report and keep the session alive.
        print(f"\n{Colors.RED}Error scoring recording: {e}{Colors.END}")
    return True


def _remove_temp_file():
    """Delete the scratch recording, ignoring a missing or locked file."""
    try:
        os.remove(TEMP_FILE)
    except OSError:
        pass


def run_simulation():
    """Run the interactive practice loop until the user chooses to exit.

    Each round: pick a word, record it, show the score, then offer one
    immediate retry of the same word, a return to the word menu, or exit.
    The scratch recording is removed when the loop ends for any reason.
    """
    clear_screen()
    print_header()
    try:
        while True:
            print_word_menu()

            word = get_word_choice()
            if word is None:
                print(f"\n{Colors.CYAN}Thanks for practicing! Goodbye! 👋{Colors.END}\n")
                break

            print(f"\n{Colors.BOLD}Selected word: {Colors.CYAN}{word.upper()}{Colors.END}")
            display_comparison(word)

            input(f"\n{Colors.YELLOW}Press ENTER when you're ready to record...{Colors.END}")
            countdown_display(3)

            if not _record_and_score(word):
                # Recording failed: go straight back to the word menu.
                input(f"\n{Colors.CYAN}Press ENTER to try again...{Colors.END}")
                continue

            print(f"\n{Colors.CYAN}Options:{Colors.END}")
            print(f" {Colors.BOLD}1.{Colors.END} Try '{word}' again")
            print(f" {Colors.BOLD}2.{Colors.END} Choose a different word")
            print(f" {Colors.BOLD}0.{Colors.END} Exit")
            choice = input(f"\n{Colors.CYAN}Your choice: {Colors.END}").strip()

            if choice == '0':
                print(f"\n{Colors.CYAN}Thanks for practicing! Goodbye! 👋{Colors.END}\n")
                break

            if choice == '1':
                # One retry of the same word, reported exactly like the
                # first attempt (including scorer-reported errors).
                print(f"\n{Colors.YELLOW}Let's try '{word}' again!{Colors.END}")
                input(f"\n{Colors.YELLOW}Press ENTER when you're ready to record...{Colors.END}")
                countdown_display(3)
                _record_and_score(word)
                input(f"\n{Colors.CYAN}Press ENTER to continue...{Colors.END}")

            # Any answer other than 0 ends up back at a fresh word menu.
            clear_screen()
            print_header()
    finally:
        # Runs on normal exit and on Ctrl+C / unexpected errors alike.
        _remove_temp_file()
# =============================================================================
# Entry Point
# =============================================================================
def _find_input_device(sd_module, max_devices: int = 64):
    """Probe device indices in order and return the first usable input.

    Args:
        sd_module: The imported sounddevice module.
        max_devices: Upper bound on the indices probed.

    Returns:
        ``(index, name)`` of the first device reporting at least one input
        channel and a non-empty name, or None when no such device is found.
    """
    for i in range(max_devices):
        try:
            dev = sd_module.query_devices(i)
            if dev.get('max_input_channels', 0) > 0:
                name = dev.get('name', f'Device {i}')
                return (i, name) if name else None
        except Exception:
            # Probing stops at the first index that cannot be queried.
            break
    return None


def main():
    """Check that a microphone is available, then start the simulation.

    Exits with status 1 when sounddevice cannot be imported or no input
    device can be found.
    """
    # NOTE: the file also imports sounddevice at module level, so a missing
    # package would already fail there; this check only takes effect if that
    # import is made lazy.
    try:
        import sounddevice as sd
    except ImportError:
        print(f"{Colors.RED}Error: 'sounddevice' is not installed.{Colors.END}")
        print("Please install it with: pip install sounddevice")
        sys.exit(1)

    # Prefer the system default input; fall back to scanning the device list.
    try:
        default_input = sd.query_devices(kind='input')
        print(f"{Colors.GREEN}✓ Microphone detected: {default_input['name']}{Colors.END}")
    except Exception:
        try:
            found = _find_input_device(sd)
            if found is None:
                raise RuntimeError("No input device in device list")
            input_idx, input_name = found
            try:
                # Best effort: make the discovered device the default input
                # while keeping the current output setting.
                # NOTE(review): sd.default.device may not be a plain
                # list/tuple; confirm out_idx really is the output index.
                out_idx = sd.default.device[1] if isinstance(sd.default.device, (list, tuple)) else sd.default.device
                sd.default.device = (input_idx, out_idx)
            except Exception:
                pass
            print(f"{Colors.GREEN}✓ Microphone detected: {input_name}{Colors.END}")
        except Exception as e2:
            print(f"{Colors.RED}Error: No microphone detected.{Colors.END}")
            print(f"Details: {e2!s}")
            print(f"{Colors.YELLOW}On Windows: Set default microphone in Settings > System > Sound > Input.{Colors.END}")
            sys.exit(1)

    run_simulation()


if __name__ == "__main__":
    main()