Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- Dockerfile +38 -33
- kid_coach_pipeline.py +845 -150
- main.py +239 -71
- requirements (1).txt +27 -0
Dockerfile
CHANGED
|
@@ -1,42 +1,47 @@
|
|
| 1 |
-
#
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
ffmpeg \
|
|
|
|
| 15 |
git \
|
| 16 |
-
wget \
|
| 17 |
&& rm -rf /var/lib/apt/lists/*
|
| 18 |
|
| 19 |
-
# Set
|
| 20 |
-
RUN ln -s /usr/bin/python3.10 /usr/bin/python
|
| 21 |
-
|
| 22 |
-
# 2. Setup User
|
| 23 |
WORKDIR /app
|
| 24 |
-
RUN useradd -m -u 1000 user
|
| 25 |
-
RUN chown -R user:user /app
|
| 26 |
-
USER user
|
| 27 |
-
ENV PATH="/home/user/.local/bin:$PATH"
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
COPY
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
pip install --no-cache-dir "torch==2.1.2" "torchaudio==2.1.2" --index-url https://download.pytorch.org/whl/cu118 && \
|
| 35 |
-
# Install the rest
|
| 36 |
-
pip install --no-cache-dir -r requirements.txt
|
| 37 |
|
| 38 |
-
#
|
| 39 |
-
|
|
|
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
|
|
|
|
|
|
| 1 |
+
# Production Dockerfile for Public Speaking Coach API
|
| 2 |
+
# Optimized for Hugging Face Spaces or any cloud deployment
|
| 3 |
+
|
| 4 |
+
FROM python:3.10-slim
|
| 5 |
+
|
| 6 |
+
# Set environment variables
|
| 7 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 8 |
+
PYTHONDONTWRITEBYTECODE=1 \
|
| 9 |
+
PIP_NO_CACHE_DIR=1 \
|
| 10 |
+
PIP_DISABLE_PIP_VERSION_CHECK=1
|
| 11 |
+
|
| 12 |
+
# Install system dependencies
|
| 13 |
+
RUN apt-get update && apt-get install -y \
|
| 14 |
ffmpeg \
|
| 15 |
+
libsndfile1 \
|
| 16 |
git \
|
|
|
|
| 17 |
&& rm -rf /var/lib/apt/lists/*
|
| 18 |
|
| 19 |
+
# Set working directory
|
|
|
|
|
|
|
|
|
|
| 20 |
WORKDIR /app
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
# Copy requirements first (for better caching)
|
| 23 |
+
COPY requirements.txt .
|
| 24 |
+
|
| 25 |
+
# Install Python dependencies
|
| 26 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 27 |
+
|
| 28 |
+
# Download language tool data (for grammar checking)
|
| 29 |
+
RUN python -c "import language_tool_python; language_tool_python.LanguageTool('en-US')" || true
|
| 30 |
+
|
| 31 |
+
# Copy application code
|
| 32 |
+
COPY kid_coach_pipeline.py .
|
| 33 |
+
COPY main.py .
|
| 34 |
+
|
| 35 |
+
# Create directory for temporary files
|
| 36 |
+
RUN mkdir -p /tmp/uploads
|
| 37 |
|
| 38 |
+
# Expose port
|
| 39 |
+
EXPOSE 7860
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
+
# Health check
|
| 42 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
| 43 |
+
CMD python -c "import requests; requests.get('http://localhost:7860/health')"
|
| 44 |
|
| 45 |
+
# Run the application
|
| 46 |
+
# Use port 7860 for Hugging Face Spaces compatibility
|
| 47 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
kid_coach_pipeline.py
CHANGED
|
@@ -1,178 +1,873 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import re
|
| 3 |
-
import
|
|
|
|
|
|
|
| 4 |
import torch
|
| 5 |
-
import torchaudio
|
| 6 |
import librosa
|
| 7 |
import numpy as np
|
| 8 |
-
|
| 9 |
-
from
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
-
#
|
| 24 |
-
self.
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
}
|
| 28 |
-
|
| 29 |
-
def _analyze_metrics(self, transcript_chunks, duration):
|
| 30 |
-
"""Calculates WPM, Fillers, and Stats"""
|
| 31 |
-
full_text = " ".join([c['text'] for c in transcript_chunks]).strip()
|
| 32 |
-
words = full_text.split()
|
| 33 |
-
total_words = len(words)
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
clean = re.sub(r'[^\w\s]', '', w.lower())
|
| 42 |
-
if clean in self.filler_words:
|
| 43 |
-
fillers_found.append(clean)
|
| 44 |
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
return {
|
| 48 |
-
"full_text": full_text,
|
| 49 |
-
"wpm": round(wpm, 1),
|
| 50 |
-
"duration": round(duration, 2),
|
| 51 |
-
"fillers_count": len(fillers_found),
|
| 52 |
-
"fillers_list": list(set(fillers_found)),
|
| 53 |
-
"filler_pct": round(filler_pct, 1)
|
| 54 |
-
}
|
| 55 |
-
|
| 56 |
-
def _generate_coaching_feedback(self, metrics):
|
| 57 |
-
"""Loads LLM, generates feedback, then unloads it"""
|
| 58 |
-
print("🧠 Loading AI Coach...")
|
| 59 |
try:
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
)
|
| 72 |
-
|
| 73 |
-
prompt = f"""
|
| 74 |
-
You are a kind, professional Public Speaking Coach.
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
|
| 87 |
-
|
| 88 |
-
""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
|
|
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
|
|
|
| 95 |
|
| 96 |
-
#
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
| 100 |
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
transcriber = pipeline(
|
| 116 |
-
"automatic-speech-recognition",
|
| 117 |
-
model="openai/whisper-large-v3",
|
| 118 |
-
device=self.device,
|
| 119 |
-
torch_dtype=self.torch_dtype,
|
| 120 |
-
chunk_length_s=30
|
| 121 |
-
)
|
| 122 |
-
|
| 123 |
-
# Run transcription with timestamps
|
| 124 |
-
result = transcriber(audio_path, return_timestamps=True)
|
| 125 |
-
|
| 126 |
-
# Cleanup Transcription Model
|
| 127 |
-
del transcriber
|
| 128 |
-
gc.collect()
|
| 129 |
-
torch.cuda.empty_cache()
|
| 130 |
-
|
| 131 |
-
if not result['text']:
|
| 132 |
-
return {"error": "No speech detected."}
|
| 133 |
-
|
| 134 |
-
# Calculate Audio Duration for WPM
|
| 135 |
-
duration = librosa.get_duration(path=audio_path)
|
| 136 |
-
|
| 137 |
-
# 2. METRICS
|
| 138 |
-
print("📊 Analyzing...")
|
| 139 |
-
# Transformers output format is different, we adapt here
|
| 140 |
-
transcript_chunks = result.get('chunks', [{'text': result['text']}])
|
| 141 |
-
metrics = self._analyze_metrics(transcript_chunks, duration)
|
| 142 |
-
|
| 143 |
-
# 3. DIARIZATION (Quick check for multiple speakers)
|
| 144 |
-
print("🗣️ Checking Speakers...")
|
| 145 |
-
try:
|
| 146 |
-
diar = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=self.hf_token)
|
| 147 |
-
diar.to(torch.device(self.device))
|
| 148 |
-
wav, sr = torchaudio.load(audio_path)
|
| 149 |
-
d_result = diar({"waveform": wav, "sample_rate": sr})
|
| 150 |
-
speaker_count = len(d_result.labels())
|
| 151 |
-
del diar
|
| 152 |
-
gc.collect()
|
| 153 |
-
torch.cuda.empty_cache()
|
| 154 |
-
except:
|
| 155 |
-
speaker_count = 1
|
| 156 |
-
|
| 157 |
-
metrics["speaker_count"] = speaker_count
|
| 158 |
-
|
| 159 |
-
# 4. LLM COACH
|
| 160 |
-
print("🧠 Coaching...")
|
| 161 |
-
feedback = self._generate_coaching_feedback(metrics)
|
| 162 |
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
"
|
| 166 |
-
|
| 167 |
-
"
|
| 168 |
-
"
|
| 169 |
-
"
|
| 170 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
},
|
| 172 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
}
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
| 175 |
except Exception as e:
|
| 176 |
import traceback
|
| 177 |
traceback.print_exc()
|
| 178 |
-
return {"error": str(e)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Production-Ready Public Speaking Coach Engine
|
| 3 |
+
Supports all ages with comprehensive speech analysis
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
import os
|
| 7 |
+
import io
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
import warnings
|
| 11 |
import re
|
| 12 |
+
from typing import Dict, List, Any, Optional
|
| 13 |
+
from dataclasses import dataclass, asdict
|
| 14 |
+
|
| 15 |
import torch
|
|
|
|
| 16 |
import librosa
|
| 17 |
import numpy as np
|
| 18 |
+
import soundfile as sf
|
| 19 |
+
from scipy.signal import medfilt
|
| 20 |
+
from scipy.stats import zscore
|
| 21 |
+
import textstat
|
| 22 |
+
|
| 23 |
+
# Suppress warnings
|
| 24 |
+
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
| 25 |
+
logging.getLogger("whisper").setLevel(logging.ERROR)
|
| 26 |
+
logging.getLogger("transformers").setLevel(logging.ERROR)
|
| 27 |
+
warnings.filterwarnings("ignore")
|
| 28 |
+
|
| 29 |
+
# Validate Whisper installation
|
| 30 |
+
try:
|
| 31 |
+
import whisper
|
| 32 |
+
if not hasattr(whisper, "load_model"):
|
| 33 |
+
raise ImportError("Wrong whisper library installed")
|
| 34 |
+
except ImportError:
|
| 35 |
+
print("\n❌ CRITICAL: Install correct whisper library:")
|
| 36 |
+
print(" pip uninstall -y whisper && pip install openai-whisper")
|
| 37 |
+
exit(1)
|
| 38 |
+
|
| 39 |
+
# Import grammar checker (lazy load to avoid startup delay)
|
| 40 |
+
GRAMMAR_TOOL = None
|
| 41 |
+
|
| 42 |
+
def get_grammar_tool():
|
| 43 |
+
"""Lazy load grammar checker"""
|
| 44 |
+
global GRAMMAR_TOOL
|
| 45 |
+
if GRAMMAR_TOOL is None:
|
| 46 |
+
try:
|
| 47 |
+
import language_tool_python
|
| 48 |
+
GRAMMAR_TOOL = language_tool_python.LanguageTool('en-US')
|
| 49 |
+
except Exception as e:
|
| 50 |
+
logging.warning(f"Grammar tool not available: {e}")
|
| 51 |
+
GRAMMAR_TOOL = False
|
| 52 |
+
return GRAMMAR_TOOL if GRAMMAR_TOOL else None
|
| 53 |
+
|
| 54 |
+
# JSON Serialization Helper
|
| 55 |
+
class NumpyEncoder(json.JSONEncoder):
|
| 56 |
+
"""Handles numpy types in JSON serialization"""
|
| 57 |
+
def default(self, obj):
|
| 58 |
+
if isinstance(obj, (np.integer, np.int64)):
|
| 59 |
+
return int(obj)
|
| 60 |
+
if isinstance(obj, (np.floating, np.float32, np.float64)):
|
| 61 |
+
return float(obj)
|
| 62 |
+
if isinstance(obj, np.ndarray):
|
| 63 |
+
return obj.tolist()
|
| 64 |
+
return super().default(obj)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@dataclass
|
| 68 |
+
class AnalysisResult:
|
| 69 |
+
"""Structured result for type safety"""
|
| 70 |
+
overall_score: int
|
| 71 |
+
fluency_score: int
|
| 72 |
+
confidence_score: int
|
| 73 |
+
content_score: int
|
| 74 |
+
grammar_score: int
|
| 75 |
+
|
| 76 |
+
transcription: str
|
| 77 |
+
word_count: int
|
| 78 |
+
duration_seconds: float
|
| 79 |
+
|
| 80 |
+
filler_words: Dict[str, int]
|
| 81 |
+
repeated_phrases: List[Dict[str, Any]]
|
| 82 |
+
long_pauses: List[Dict[str, float]]
|
| 83 |
+
|
| 84 |
+
pace_analysis: Dict[str, Any]
|
| 85 |
+
tone_analysis: Dict[str, Any]
|
| 86 |
+
grammar_issues: List[Dict[str, str]]
|
| 87 |
+
|
| 88 |
+
strengths: List[str]
|
| 89 |
+
improvements: List[str]
|
| 90 |
+
coaching_feedback: str
|
| 91 |
+
|
| 92 |
+
def to_dict(self):
|
| 93 |
+
return asdict(self)
|
| 94 |
|
| 95 |
+
|
| 96 |
+
class PublicSpeakingCoach:
|
| 97 |
+
"""
|
| 98 |
+
Complete speech analysis engine for public speaking coaching
|
| 99 |
+
Features:
|
| 100 |
+
- Transcription with word-level timestamps
|
| 101 |
+
- Filler word detection
|
| 102 |
+
- Silence/pause analysis
|
| 103 |
+
- Repeated phrase detection
|
| 104 |
+
- Tone & confidence analysis
|
| 105 |
+
- Grammar checking
|
| 106 |
+
- Content quality analysis
|
| 107 |
+
- AI-powered coaching feedback
|
| 108 |
+
"""
|
| 109 |
+
|
| 110 |
+
def __init__(self, whisper_model_size: str = "base"):
|
| 111 |
+
"""
|
| 112 |
+
Initialize the coach engine
|
| 113 |
+
|
| 114 |
+
Args:
|
| 115 |
+
whisper_model_size: Whisper model size (tiny/base/small/medium)
|
| 116 |
+
base = good balance, small = better accuracy
|
| 117 |
+
"""
|
| 118 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 119 |
+
print(f"🚀 Initializing Public Speaking Coach on {self.device}...")
|
| 120 |
+
|
| 121 |
+
# Load Whisper for transcription
|
| 122 |
+
print(f" Loading Whisper ({whisper_model_size})...")
|
| 123 |
+
self.whisper = whisper.load_model(whisper_model_size, device=self.device)
|
| 124 |
|
| 125 |
+
# Linguistic patterns
|
| 126 |
+
self.filler_patterns = {
|
| 127 |
+
"um": r"\bum+h*\b",
|
| 128 |
+
"uh": r"\buh+h*\b",
|
| 129 |
+
"like": r"\blike\b",
|
| 130 |
+
"you know": r"\byou know\b",
|
| 131 |
+
"so": r"\bso+\b",
|
| 132 |
+
"actually": r"\bactually\b",
|
| 133 |
+
"basically": r"\bbasically\b",
|
| 134 |
+
"literally": r"\bliterally\b",
|
| 135 |
+
"i mean": r"\bi mean\b",
|
| 136 |
+
"kind of": r"\bkind of\b",
|
| 137 |
+
"sort of": r"\bsort of\b"
|
| 138 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
| 140 |
+
self.power_words = {
|
| 141 |
+
"evidence", "data", "research", "proven", "significantly",
|
| 142 |
+
"innovative", "transform", "achieve", "success", "solution",
|
| 143 |
+
"effective", "results", "impact", "value", "opportunity",
|
| 144 |
+
"believe", "imagine", "discover", "realize", "understand"
|
| 145 |
+
}
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
+
print("✅ Coach Engine Ready!")
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def analyze_speech(self, audio_path: str) -> Dict[str, Any]:
|
| 151 |
+
"""
|
| 152 |
+
Main analysis pipeline
|
| 153 |
+
|
| 154 |
+
Args:
|
| 155 |
+
audio_path: Path to audio file
|
| 156 |
+
|
| 157 |
+
Returns:
|
| 158 |
+
Comprehensive analysis results as dictionary
|
| 159 |
+
"""
|
| 160 |
+
# Validation
|
| 161 |
+
if not os.path.exists(audio_path):
|
| 162 |
+
return {"error": "Audio file not found"}
|
| 163 |
+
|
| 164 |
+
print(f"🎤 Analyzing: {os.path.basename(audio_path)}")
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
try:
|
| 167 |
+
# Load audio
|
| 168 |
+
audio, sr = self._load_audio(audio_path)
|
| 169 |
+
duration = len(audio) / sr
|
| 170 |
+
|
| 171 |
+
if duration < 1.0:
|
| 172 |
+
return {"error": "Audio too short (minimum 1 second)"}
|
| 173 |
+
|
| 174 |
+
print(f" Duration: {duration:.1f}s")
|
| 175 |
+
|
| 176 |
+
# Step 1: Transcription with timestamps
|
| 177 |
+
print(" 📝 Transcribing...")
|
| 178 |
+
transcript_data = self._transcribe_with_timestamps(audio)
|
| 179 |
+
|
| 180 |
+
if not transcript_data['text'].strip():
|
| 181 |
+
return {"error": "No speech detected"}
|
| 182 |
+
|
| 183 |
+
# Step 2: Filler word analysis
|
| 184 |
+
print(" 🔍 Detecting filler words...")
|
| 185 |
+
filler_analysis = self._detect_fillers(
|
| 186 |
+
transcript_data['text'],
|
| 187 |
+
transcript_data['words']
|
| 188 |
)
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
+
# Step 3: Pause analysis
|
| 191 |
+
print(" ⏸️ Analyzing pauses...")
|
| 192 |
+
pause_analysis = self._analyze_pauses(
|
| 193 |
+
transcript_data['words'],
|
| 194 |
+
duration
|
| 195 |
+
)
|
| 196 |
|
| 197 |
+
# Step 4: Repeated phrase detection
|
| 198 |
+
print(" 🔁 Detecting repetitions...")
|
| 199 |
+
repetition_analysis = self._detect_repetitions(
|
| 200 |
+
transcript_data['words']
|
| 201 |
+
)
|
| 202 |
|
| 203 |
+
# Step 5: Pace analysis
|
| 204 |
+
print(" ⚡ Analyzing pace...")
|
| 205 |
+
pace_analysis = self._analyze_pace(
|
| 206 |
+
transcript_data['words'],
|
| 207 |
+
duration
|
| 208 |
+
)
|
| 209 |
|
| 210 |
+
# Step 6: Tone & confidence analysis
|
| 211 |
+
print(" 🎵 Analyzing tone & confidence...")
|
| 212 |
+
tone_analysis = self._analyze_tone_confidence(audio, sr)
|
| 213 |
|
| 214 |
+
# Step 7: Grammar check
|
| 215 |
+
print(" ✍️ Checking grammar...")
|
| 216 |
+
grammar_analysis = self._check_grammar(transcript_data['text'])
|
| 217 |
|
| 218 |
+
# Step 8: Content quality analysis
|
| 219 |
+
print(" 📊 Evaluating content...")
|
| 220 |
+
content_analysis = self._analyze_content(
|
| 221 |
+
transcript_data['text'],
|
| 222 |
+
transcript_data['words']
|
| 223 |
+
)
|
| 224 |
|
| 225 |
+
# Step 9: Generate scores
|
| 226 |
+
print(" 🎯 Calculating scores...")
|
| 227 |
+
scores = self._calculate_scores(
|
| 228 |
+
filler_analysis,
|
| 229 |
+
pause_analysis,
|
| 230 |
+
repetition_analysis,
|
| 231 |
+
pace_analysis,
|
| 232 |
+
tone_analysis,
|
| 233 |
+
grammar_analysis,
|
| 234 |
+
content_analysis
|
| 235 |
+
)
|
| 236 |
|
| 237 |
+
# Step 10: Generate coaching feedback
|
| 238 |
+
print(" 🤖 Generating coaching...")
|
| 239 |
+
coaching = self._generate_coaching(
|
| 240 |
+
scores,
|
| 241 |
+
filler_analysis,
|
| 242 |
+
pause_analysis,
|
| 243 |
+
repetition_analysis,
|
| 244 |
+
pace_analysis,
|
| 245 |
+
tone_analysis,
|
| 246 |
+
grammar_analysis,
|
| 247 |
+
content_analysis
|
| 248 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
+
# Compile final result
|
| 251 |
+
result = {
|
| 252 |
+
"overall_score": scores['overall'],
|
| 253 |
+
"scores": {
|
| 254 |
+
"fluency": scores['fluency'],
|
| 255 |
+
"confidence": scores['confidence'],
|
| 256 |
+
"content": scores['content'],
|
| 257 |
+
"grammar": scores['grammar']
|
| 258 |
+
},
|
| 259 |
+
"transcription": {
|
| 260 |
+
"text": transcript_data['text'],
|
| 261 |
+
"word_count": len(transcript_data['words']),
|
| 262 |
+
"duration_seconds": round(duration, 2)
|
| 263 |
},
|
| 264 |
+
"fluency_analysis": {
|
| 265 |
+
"filler_words": filler_analysis,
|
| 266 |
+
"repeated_phrases": repetition_analysis,
|
| 267 |
+
"long_pauses": pause_analysis['long_pauses']
|
| 268 |
+
},
|
| 269 |
+
"pace_analysis": pace_analysis,
|
| 270 |
+
"tone_analysis": tone_analysis,
|
| 271 |
+
"grammar_analysis": grammar_analysis,
|
| 272 |
+
"content_analysis": content_analysis,
|
| 273 |
+
"coaching": coaching
|
| 274 |
}
|
| 275 |
+
|
| 276 |
+
print("✅ Analysis complete!")
|
| 277 |
+
return result
|
| 278 |
+
|
| 279 |
except Exception as e:
|
| 280 |
import traceback
|
| 281 |
traceback.print_exc()
|
| 282 |
+
return {"error": f"Analysis failed: {str(e)}"}
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def _load_audio(self, path: str) -> tuple:
|
| 286 |
+
"""Load and normalize audio to 16kHz mono"""
|
| 287 |
+
try:
|
| 288 |
+
audio, sr = librosa.load(path, sr=16000, mono=True)
|
| 289 |
+
# Normalize to prevent clipping
|
| 290 |
+
audio = librosa.util.normalize(audio)
|
| 291 |
+
return audio, sr
|
| 292 |
+
except Exception as e:
|
| 293 |
+
raise ValueError(f"Failed to load audio: {e}")
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def _transcribe_with_timestamps(self, audio: np.ndarray) -> Dict:
|
| 297 |
+
"""Transcribe with word-level timestamps"""
|
| 298 |
+
result = self.whisper.transcribe(
|
| 299 |
+
audio,
|
| 300 |
+
language='en',
|
| 301 |
+
word_timestamps=True,
|
| 302 |
+
fp16=(self.device == "cuda")
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
+
words = []
|
| 306 |
+
for segment in result['segments']:
|
| 307 |
+
if 'words' in segment:
|
| 308 |
+
for word_info in segment['words']:
|
| 309 |
+
words.append({
|
| 310 |
+
'word': word_info['word'].strip(),
|
| 311 |
+
'start': word_info['start'],
|
| 312 |
+
'end': word_info['end'],
|
| 313 |
+
'confidence': word_info.get('probability', 1.0)
|
| 314 |
+
})
|
| 315 |
+
|
| 316 |
+
return {
|
| 317 |
+
'text': result['text'].strip(),
|
| 318 |
+
'words': words
|
| 319 |
+
}
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def _detect_fillers(self, text: str, words: List[Dict]) -> Dict:
|
| 323 |
+
"""Detect filler words with counts and positions"""
|
| 324 |
+
text_lower = text.lower()
|
| 325 |
+
filler_counts = {}
|
| 326 |
+
filler_positions = []
|
| 327 |
+
|
| 328 |
+
for filler_name, pattern in self.filler_patterns.items():
|
| 329 |
+
matches = list(re.finditer(pattern, text_lower, re.IGNORECASE))
|
| 330 |
+
count = len(matches)
|
| 331 |
+
if count > 0:
|
| 332 |
+
filler_counts[filler_name] = count
|
| 333 |
+
for match in matches:
|
| 334 |
+
filler_positions.append({
|
| 335 |
+
'filler': filler_name,
|
| 336 |
+
'position': match.start()
|
| 337 |
+
})
|
| 338 |
+
|
| 339 |
+
total_fillers = sum(filler_counts.values())
|
| 340 |
+
total_words = len(words)
|
| 341 |
+
filler_rate = (total_fillers / total_words * 100) if total_words > 0 else 0
|
| 342 |
+
|
| 343 |
+
return {
|
| 344 |
+
'total_count': total_fillers,
|
| 345 |
+
'rate_percentage': round(filler_rate, 2),
|
| 346 |
+
'breakdown': filler_counts,
|
| 347 |
+
'positions': filler_positions
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
def _analyze_pauses(self, words: List[Dict], duration: float) -> Dict:
|
| 352 |
+
"""Analyze pause patterns"""
|
| 353 |
+
if len(words) < 2:
|
| 354 |
+
return {'long_pauses': [], 'average_pause': 0, 'silence_percentage': 0}
|
| 355 |
+
|
| 356 |
+
pauses = []
|
| 357 |
+
long_pauses = []
|
| 358 |
+
|
| 359 |
+
for i in range(len(words) - 1):
|
| 360 |
+
pause_duration = words[i+1]['start'] - words[i]['end']
|
| 361 |
+
if pause_duration > 0:
|
| 362 |
+
pauses.append(pause_duration)
|
| 363 |
+
if pause_duration > 2.0: # Long pause threshold
|
| 364 |
+
long_pauses.append({
|
| 365 |
+
'duration': round(pause_duration, 2),
|
| 366 |
+
'after_word': words[i]['word'],
|
| 367 |
+
'timestamp': round(words[i]['end'], 2)
|
| 368 |
+
})
|
| 369 |
+
|
| 370 |
+
avg_pause = np.mean(pauses) if pauses else 0
|
| 371 |
+
total_pause_time = sum(pauses)
|
| 372 |
+
silence_pct = (total_pause_time / duration * 100) if duration > 0 else 0
|
| 373 |
+
|
| 374 |
+
return {
|
| 375 |
+
'long_pauses': long_pauses,
|
| 376 |
+
'long_pause_count': len(long_pauses),
|
| 377 |
+
'average_pause_seconds': round(avg_pause, 2),
|
| 378 |
+
'silence_percentage': round(silence_pct, 2)
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
def _detect_repetitions(self, words: List[Dict]) -> List[Dict]:
|
| 383 |
+
"""Detect repeated phrases (2-5 words)"""
|
| 384 |
+
repetitions = []
|
| 385 |
+
word_list = [w['word'].lower().strip('.,!?') for w in words]
|
| 386 |
+
|
| 387 |
+
# Check for n-gram repetitions (2-5 words)
|
| 388 |
+
for n in range(2, 6):
|
| 389 |
+
seen = {}
|
| 390 |
+
for i in range(len(word_list) - n + 1):
|
| 391 |
+
phrase = ' '.join(word_list[i:i+n])
|
| 392 |
+
if phrase in seen:
|
| 393 |
+
repetitions.append({
|
| 394 |
+
'phrase': phrase,
|
| 395 |
+
'count': seen[phrase] + 1,
|
| 396 |
+
'length': n
|
| 397 |
+
})
|
| 398 |
+
seen[phrase] += 1
|
| 399 |
+
else:
|
| 400 |
+
seen[phrase] = 1
|
| 401 |
+
|
| 402 |
+
# Remove duplicates and sort by count
|
| 403 |
+
unique_reps = {}
|
| 404 |
+
for rep in repetitions:
|
| 405 |
+
key = rep['phrase']
|
| 406 |
+
if key not in unique_reps or rep['count'] > unique_reps[key]['count']:
|
| 407 |
+
unique_reps[key] = rep
|
| 408 |
+
|
| 409 |
+
return sorted(unique_reps.values(), key=lambda x: x['count'], reverse=True)[:10]
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
def _analyze_pace(self, words: List[Dict], duration: float) -> Dict:
|
| 413 |
+
"""Analyze speaking pace"""
|
| 414 |
+
word_count = len(words)
|
| 415 |
+
wpm = (word_count / duration * 60) if duration > 0 else 0
|
| 416 |
+
|
| 417 |
+
# Determine pace category
|
| 418 |
+
if wpm < 100:
|
| 419 |
+
pace_category = "Too Slow"
|
| 420 |
+
pace_feedback = "Consider speaking slightly faster for better engagement"
|
| 421 |
+
elif wpm < 130:
|
| 422 |
+
pace_category = "Good"
|
| 423 |
+
pace_feedback = "Your pace is comfortable and easy to follow"
|
| 424 |
+
elif wpm < 160:
|
| 425 |
+
pace_category = "Optimal"
|
| 426 |
+
pace_feedback = "Excellent pacing - clear and engaging"
|
| 427 |
+
elif wpm < 180:
|
| 428 |
+
pace_category = "Fast"
|
| 429 |
+
pace_feedback = "Speaking quickly but still understandable"
|
| 430 |
+
else:
|
| 431 |
+
pace_category = "Too Fast"
|
| 432 |
+
pace_feedback = "Try slowing down to ensure clarity"
|
| 433 |
+
|
| 434 |
+
# Calculate pace variance (consistency)
|
| 435 |
+
if len(words) > 10:
|
| 436 |
+
segment_size = max(5, len(words) // 10)
|
| 437 |
+
segment_paces = []
|
| 438 |
+
for i in range(0, len(words) - segment_size, segment_size):
|
| 439 |
+
segment = words[i:i+segment_size]
|
| 440 |
+
seg_duration = segment[-1]['end'] - segment[0]['start']
|
| 441 |
+
if seg_duration > 0:
|
| 442 |
+
seg_wpm = len(segment) / seg_duration * 60
|
| 443 |
+
segment_paces.append(seg_wpm)
|
| 444 |
+
|
| 445 |
+
pace_variance = np.std(segment_paces) if len(segment_paces) > 1 else 0
|
| 446 |
+
consistency = "High" if pace_variance < 20 else "Medium" if pace_variance < 40 else "Low"
|
| 447 |
+
else:
|
| 448 |
+
pace_variance = 0
|
| 449 |
+
consistency = "N/A"
|
| 450 |
+
|
| 451 |
+
return {
|
| 452 |
+
'words_per_minute': round(wpm, 1),
|
| 453 |
+
'category': pace_category,
|
| 454 |
+
'consistency': consistency,
|
| 455 |
+
'pace_variance': round(pace_variance, 1),
|
| 456 |
+
'feedback': pace_feedback
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
def _analyze_tone_confidence(self, audio: np.ndarray, sr: int) -> Dict:
|
| 461 |
+
"""Analyze tone variation and confidence indicators"""
|
| 462 |
+
# Pitch analysis (fundamental frequency)
|
| 463 |
+
try:
|
| 464 |
+
f0 = librosa.yin(
|
| 465 |
+
audio.astype(np.float64),
|
| 466 |
+
fmin=80, # Male range
|
| 467 |
+
fmax=400 # Female range
|
| 468 |
+
)
|
| 469 |
+
f0_clean = f0[f0 > 0]
|
| 470 |
+
|
| 471 |
+
if len(f0_clean) > 0:
|
| 472 |
+
avg_pitch = np.mean(f0_clean)
|
| 473 |
+
pitch_std = np.std(f0_clean)
|
| 474 |
+
pitch_range = np.ptp(f0_clean)
|
| 475 |
+
|
| 476 |
+
# Pitch variation indicates expressiveness
|
| 477 |
+
if pitch_std < 20:
|
| 478 |
+
expressiveness = "Monotone"
|
| 479 |
+
expression_score = 40
|
| 480 |
+
elif pitch_std < 40:
|
| 481 |
+
expressiveness = "Moderate Variation"
|
| 482 |
+
expression_score = 70
|
| 483 |
+
else:
|
| 484 |
+
expressiveness = "Expressive"
|
| 485 |
+
expression_score = 95
|
| 486 |
+
else:
|
| 487 |
+
avg_pitch = 0
|
| 488 |
+
pitch_std = 0
|
| 489 |
+
pitch_range = 0
|
| 490 |
+
expressiveness = "Unknown"
|
| 491 |
+
expression_score = 50
|
| 492 |
+
except Exception as e:
|
| 493 |
+
logging.warning(f"Pitch analysis failed: {e}")
|
| 494 |
+
avg_pitch = 0
|
| 495 |
+
pitch_std = 0
|
| 496 |
+
pitch_range = 0
|
| 497 |
+
expressiveness = "Unknown"
|
| 498 |
+
expression_score = 50
|
| 499 |
+
|
| 500 |
+
# Energy/Volume analysis
|
| 501 |
+
rms = librosa.feature.rms(y=audio)[0]
|
| 502 |
+
avg_energy = np.mean(rms)
|
| 503 |
+
energy_std = np.std(rms)
|
| 504 |
+
|
| 505 |
+
# Volume consistency
|
| 506 |
+
if energy_std < 0.02:
|
| 507 |
+
volume_consistency = "Very Consistent"
|
| 508 |
+
elif energy_std < 0.05:
|
| 509 |
+
volume_consistency = "Consistent"
|
| 510 |
+
else:
|
| 511 |
+
volume_consistency = "Varied"
|
| 512 |
+
|
| 513 |
+
# Confidence estimation (based on volume stability and pitch)
|
| 514 |
+
confidence_score = 50 # Base
|
| 515 |
+
if energy_std < 0.03: # Stable volume
|
| 516 |
+
confidence_score += 15
|
| 517 |
+
if 150 < avg_pitch < 250: # Comfortable pitch range
|
| 518 |
+
confidence_score += 15
|
| 519 |
+
if pitch_std > 20: # Some variation (engaged)
|
| 520 |
+
confidence_score += 20
|
| 521 |
+
|
| 522 |
+
confidence_score = min(100, max(0, confidence_score))
|
| 523 |
+
|
| 524 |
+
return {
|
| 525 |
+
'expressiveness': expressiveness,
|
| 526 |
+
'expression_score': expression_score,
|
| 527 |
+
'average_pitch_hz': round(float(avg_pitch), 1),
|
| 528 |
+
'pitch_variation_hz': round(float(pitch_std), 1),
|
| 529 |
+
'volume_consistency': volume_consistency,
|
| 530 |
+
'confidence_score': round(confidence_score, 1)
|
| 531 |
+
}
|
| 532 |
+
|
| 533 |
+
|
| 534 |
+
def _check_grammar(self, text: str) -> Dict:
    """Run the LanguageTool checker over the transcript.

    Returns a dict with:
        issue_count: total number of findings
        issues: at most the first 20 findings (type, message, context,
                up to 3 replacement suggestions each)
        available: False when the tool is missing or the check raised
    """
    # Shared "checker could not run" result shape.
    unavailable = {'issue_count': 0, 'issues': [], 'available': False}

    tool = get_grammar_tool()
    if tool is None:
        return unavailable

    try:
        matches = tool.check(text)
        # Cap the report at 20 findings, 3 suggestions apiece.
        issues = [
            {
                'type': m.ruleId,
                'message': m.message,
                'context': m.context,
                'suggestions': m.replacements[:3],
            }
            for m in matches[:20]
        ]
        return {
            'issue_count': len(matches),
            'issues': issues,
            'available': True,
        }
    except Exception as e:
        logging.warning(f"Grammar check failed: {e}")
        return unavailable
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
def _analyze_content(self, text: str, words: List[Dict]) -> Dict:
    """Analyze content quality and complexity.

    Args:
        text: Full transcript string.
        words: Word-level transcription entries; each dict has a 'word' key.

    Returns:
        Dict with readability, power-word usage, vocabulary diversity and
        approximate sentence-structure metrics.
    """
    # Readability metrics. textstat can raise on degenerate input, so fall
    # back to neutral mid-range defaults. Fixed: was a bare `except:` which
    # also swallowed KeyboardInterrupt/SystemExit.
    try:
        flesch_score = textstat.flesch_reading_ease(text)
        grade_level = textstat.text_standard(text, float_output=True)
    except Exception:
        flesch_score = 50
        grade_level = 8

    # Interpret Flesch score into a human-readable band.
    if flesch_score >= 90:
        readability = "Very Easy"
    elif flesch_score >= 70:
        readability = "Easy"
    elif flesch_score >= 50:
        readability = "Moderate"
    elif flesch_score >= 30:
        readability = "Difficult"
    else:
        readability = "Very Difficult"

    # Power word usage (case-insensitive, trailing punctuation stripped).
    word_list = [w['word'].lower().strip('.,!?') for w in words]
    power_word_count = sum(1 for w in word_list if w in self.power_words)
    power_word_rate = (power_word_count / len(words) * 100) if len(words) > 0 else 0

    # Vocabulary diversity: unique words as a percentage of total words.
    unique_words = len(set(word_list))
    vocab_diversity = (unique_words / len(words) * 100) if len(words) > 0 else 0

    # Sentence structure (approximated from terminal punctuation; max(1, ...)
    # guards the division when the transcript has no punctuation).
    sentence_count = max(1, text.count('.') + text.count('!') + text.count('?'))
    avg_sentence_length = len(words) / sentence_count

    return {
        'readability_score': round(flesch_score, 1),
        'readability_level': readability,
        'grade_level': round(grade_level, 1),
        'power_words_used': power_word_count,
        'power_word_rate': round(power_word_rate, 2),
        'vocabulary_diversity': round(vocab_diversity, 1),
        'unique_word_count': unique_words,
        'average_sentence_length': round(avg_sentence_length, 1)
    }
|
| 616 |
+
|
| 617 |
+
|
| 618 |
+
def _calculate_scores(
|
| 619 |
+
self,
|
| 620 |
+
filler_analysis: Dict,
|
| 621 |
+
pause_analysis: Dict,
|
| 622 |
+
repetition_analysis: List,
|
| 623 |
+
pace_analysis: Dict,
|
| 624 |
+
tone_analysis: Dict,
|
| 625 |
+
grammar_analysis: Dict,
|
| 626 |
+
content_analysis: Dict
|
| 627 |
+
) -> Dict:
|
| 628 |
+
"""Calculate comprehensive scores"""
|
| 629 |
+
|
| 630 |
+
# Fluency Score (0-100)
|
| 631 |
+
fluency = 100
|
| 632 |
+
fluency -= min(30, filler_analysis['rate_percentage'] * 5) # Filler penalty
|
| 633 |
+
fluency -= min(20, pause_analysis['long_pause_count'] * 5) # Long pause penalty
|
| 634 |
+
fluency -= min(15, len(repetition_analysis) * 3) # Repetition penalty
|
| 635 |
+
|
| 636 |
+
# Pace bonus/penalty
|
| 637 |
+
wpm = pace_analysis['words_per_minute']
|
| 638 |
+
if 130 <= wpm <= 160:
|
| 639 |
+
fluency += 5 # Optimal pace bonus
|
| 640 |
+
elif wpm < 100 or wpm > 180:
|
| 641 |
+
fluency -= 10 # Poor pace penalty
|
| 642 |
+
|
| 643 |
+
fluency = max(0, min(100, fluency))
|
| 644 |
+
|
| 645 |
+
# Confidence Score (from tone analysis)
|
| 646 |
+
confidence = tone_analysis['confidence_score']
|
| 647 |
+
|
| 648 |
+
# Content Score (0-100)
|
| 649 |
+
content = 50 # Base
|
| 650 |
+
content += min(30, content_analysis['power_word_rate'] * 3) # Power words
|
| 651 |
+
content += min(20, content_analysis['vocabulary_diversity'] / 5) # Diversity
|
| 652 |
+
|
| 653 |
+
# Readability bonus/penalty
|
| 654 |
+
flesch = content_analysis['readability_score']
|
| 655 |
+
if 50 <= flesch <= 70:
|
| 656 |
+
content += 10
|
| 657 |
+
|
| 658 |
+
content = max(0, min(100, content))
|
| 659 |
+
|
| 660 |
+
# Grammar Score (0-100)
|
| 661 |
+
if grammar_analysis['available']:
|
| 662 |
+
grammar = max(0, 100 - grammar_analysis['issue_count'] * 2)
|
| 663 |
+
else:
|
| 664 |
+
grammar = 85 # Default if unavailable
|
| 665 |
+
|
| 666 |
+
# Overall Score (weighted average)
|
| 667 |
+
overall = (
|
| 668 |
+
fluency * 0.35 +
|
| 669 |
+
confidence * 0.25 +
|
| 670 |
+
content * 0.25 +
|
| 671 |
+
grammar * 0.15
|
| 672 |
+
)
|
| 673 |
+
|
| 674 |
+
return {
|
| 675 |
+
'overall': round(overall),
|
| 676 |
+
'fluency': round(fluency),
|
| 677 |
+
'confidence': round(confidence),
|
| 678 |
+
'content': round(content),
|
| 679 |
+
'grammar': round(grammar)
|
| 680 |
+
}
|
| 681 |
+
|
| 682 |
+
|
| 683 |
+
def _generate_coaching(
    self,
    scores: Dict,
    filler_analysis: Dict,
    pause_analysis: Dict,
    repetition_analysis: List,
    pace_analysis: Dict,
    tone_analysis: Dict,
    grammar_analysis: Dict,
    content_analysis: Dict
) -> Dict:
    """Generate personalized coaching feedback.

    Builds three lists from threshold checks on the analysis dicts:
    strengths (capped at 5), improvements (capped at 5), and next steps
    (delegated to _generate_next_steps), plus one overall message keyed
    on the overall score band.
    """

    strengths = []
    improvements = []

    # Analyze strengths — each check appends a fixed praise message.
    if scores['fluency'] >= 80:
        strengths.append("Excellent fluency - your speech flows naturally")

    if filler_analysis['rate_percentage'] < 2:
        strengths.append("Minimal use of filler words - very professional")

    if pace_analysis['words_per_minute'] >= 130 and pace_analysis['words_per_minute'] <= 160:
        strengths.append("Perfect speaking pace - clear and engaging")

    if tone_analysis['expression_score'] >= 80:
        strengths.append("Great vocal expressiveness - keeps audience engaged")

    if content_analysis['power_word_rate'] >= 3:
        strengths.append("Strong use of impactful vocabulary")

    if scores['confidence'] >= 75:
        strengths.append("Confident delivery with strong vocal presence")

    # Identify improvements — mirror image of the strength thresholds.
    if filler_analysis['rate_percentage'] >= 5:
        improvements.append(
            f"Reduce filler words ({filler_analysis['rate_percentage']:.1f}% of speech). "
            "Try pausing silently instead of using 'um' or 'uh'"
        )

    if pause_analysis['long_pause_count'] >= 3:
        improvements.append(
            f"You have {pause_analysis['long_pause_count']} long pauses. "
            "Practice smoother transitions between thoughts"
        )

    if len(repetition_analysis) >= 3:
        # repetition_analysis is assumed sorted so [0] is the most-repeated
        # phrase — TODO confirm against the producer of this list.
        top_rep = repetition_analysis[0]
        improvements.append(
            f"You repeated '{top_rep['phrase']}' {top_rep['count']} times. "
            "Vary your phrasing for more engaging delivery"
        )

    # Pace advice only for clearly-too-slow or clearly-too-fast deliveries;
    # the 120-170 wpm middle band gets no pace comment.
    wpm = pace_analysis['words_per_minute']
    if wpm < 120:
        improvements.append(
            "Your pace is quite slow. Try speaking 10-15% faster to maintain energy"
        )
    elif wpm > 170:
        improvements.append(
            "You're speaking very quickly. Slow down 10-15% to ensure clarity"
        )

    if tone_analysis['expression_score'] < 60:
        improvements.append(
            "Add more vocal variety. Practice emphasizing key words and varying your pitch"
        )

    if grammar_analysis['available'] and grammar_analysis['issue_count'] >= 5:
        improvements.append(
            f"Found {grammar_analysis['issue_count']} grammar issues. "
            "Review your script and practice correct phrasing"
        )

    if content_analysis['vocabulary_diversity'] < 40:
        improvements.append(
            "Expand your vocabulary. Using more diverse words makes speeches more engaging"
        )

    # Generate overall feedback message from the overall score band.
    overall_score = scores['overall']

    if overall_score >= 90:
        overall_feedback = (
            "🌟 Outstanding performance! Your speech demonstrates excellent "
            "command of public speaking fundamentals. You're ready for any audience!"
        )
    elif overall_score >= 75:
        overall_feedback = (
            "👏 Strong performance! You have solid public speaking skills. "
            "Focus on the improvement areas to reach the next level."
        )
    elif overall_score >= 60:
        overall_feedback = (
            "✅ Good effort! You have a foundation to build on. "
            "Work on the suggested improvements and keep practicing."
        )
    else:
        overall_feedback = (
            "💪 Keep practicing! Public speaking is a skill that improves with practice. "
            "Focus on one improvement area at a time and you'll see progress."
        )

    # Add default messages if lists are empty, so the UI never shows a blank
    # strengths/improvements section.
    if not strengths:
        strengths.append("You completed the speech - that takes courage!")

    if not improvements:
        improvements.append("Keep practicing to maintain your excellent skills")

    return {
        'overall_feedback': overall_feedback,
        'strengths': strengths[:5],  # Top 5
        'improvements': improvements[:5],  # Top 5
        'next_steps': self._generate_next_steps(scores, improvements)
    }
|
| 801 |
+
|
| 802 |
+
|
| 803 |
+
def _generate_next_steps(self, scores: Dict, improvements: List[str]) -> List[str]:
|
| 804 |
+
"""Generate actionable next steps"""
|
| 805 |
+
steps = []
|
| 806 |
+
|
| 807 |
+
# Prioritize based on weakest scores
|
| 808 |
+
score_items = [
|
| 809 |
+
('fluency', scores['fluency']),
|
| 810 |
+
('confidence', scores['confidence']),
|
| 811 |
+
('content', scores['content']),
|
| 812 |
+
('grammar', scores['grammar'])
|
| 813 |
+
]
|
| 814 |
+
score_items.sort(key=lambda x: x[1])
|
| 815 |
+
|
| 816 |
+
weakest = score_items[0][0]
|
| 817 |
+
|
| 818 |
+
if weakest == 'fluency':
|
| 819 |
+
steps.append("Practice speaking without filler words - try the 'silent pause' technique")
|
| 820 |
+
steps.append("Record yourself daily and track filler word reduction")
|
| 821 |
+
elif weakest == 'confidence':
|
| 822 |
+
steps.append("Work on vocal projection exercises to build confidence")
|
| 823 |
+
steps.append("Practice power poses before speaking to boost confidence")
|
| 824 |
+
elif weakest == 'content':
|
| 825 |
+
steps.append("Build your vocabulary by learning 2-3 power words per week")
|
| 826 |
+
steps.append("Study speeches by great speakers and note their word choices")
|
| 827 |
+
elif weakest == 'grammar':
|
| 828 |
+
steps.append("Review common grammar rules and practice correct phrasing")
|
| 829 |
+
steps.append("Have someone proofread your speeches before delivery")
|
| 830 |
+
|
| 831 |
+
steps.append("Practice this speech 3 more times and compare your progress")
|
| 832 |
+
|
| 833 |
+
return steps[:4]
|
| 834 |
+
|
| 835 |
+
|
| 836 |
+
# ================= TEST RUNNER =================
# Standalone smoke test: synthesizes a dummy WAV if none exists, runs the
# full analysis pipeline on it, and prints the JSON result.
if __name__ == "__main__":
    print("\n" + "="*60)
    print("PUBLIC SPEAKING COACH - ENGINE TEST")
    print("="*60 + "\n")

    test_file = "test_speech.wav"

    # Generate test audio if needed (two superposed sine tones as a stand-in
    # for speech; enough to exercise the pipeline end to end).
    if not os.path.exists(test_file):
        print("⚠️ No test file found. Generating dummy audio...")
        sr = 16000          # sample rate in Hz
        duration = 5        # seconds
        t = np.linspace(0, duration, sr * duration)
        # Simulate speech-like audio with varying frequency
        audio = 0.3 * np.sin(2 * np.pi * 200 * t) + 0.2 * np.sin(2 * np.pi * 300 * t)
        sf.write(test_file, audio, sr)
        print(f"✅ Created {test_file}\n")

    try:
        # Initialize coach (loads the Whisper "base" model)
        coach = PublicSpeakingCoach(whisper_model_size="base")

        # Analyze
        result = coach.analyze_speech(test_file)

        # Display results; NumpyEncoder handles numpy scalars in the payload.
        print("\n" + "="*60)
        print("ANALYSIS RESULTS")
        print("="*60)
        print(json.dumps(result, indent=2, cls=NumpyEncoder))

        print("\n✅ Engine test completed successfully!")

    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
|
main.py
CHANGED
|
@@ -1,105 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import shutil
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import uvicorn
|
| 4 |
-
import
|
| 5 |
-
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 6 |
from fastapi.middleware.cors import CORSMiddleware
|
| 7 |
-
from
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
app.add_middleware(
|
| 12 |
CORSMiddleware,
|
| 13 |
-
allow_origins=["*"],
|
|
|
|
| 14 |
allow_methods=["*"],
|
| 15 |
allow_headers=["*"],
|
| 16 |
)
|
| 17 |
|
| 18 |
-
# Global
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
@app.on_event("startup")
|
| 22 |
async def startup_event():
|
| 23 |
-
|
|
|
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
print("❌ CRITICAL: HF_TOKEN not found in environment variables!")
|
| 29 |
|
| 30 |
-
print("🚀 Initializing KidCoach Engine (Production Mode)...")
|
| 31 |
try:
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
except Exception as e:
|
| 35 |
-
print(f"❌
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
"""
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
try:
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
except Exception as e:
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
-
@app.post("/coach")
|
| 59 |
-
async def coach_audio(file: UploadFile = File(...)):
|
| 60 |
-
global engine
|
| 61 |
-
if not engine:
|
| 62 |
-
raise HTTPException(status_code=500, detail="AI Engine is not initialized")
|
| 63 |
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
| 67 |
|
| 68 |
-
try:
|
| 69 |
-
# Write upload to disk
|
| 70 |
-
with open(raw_filename, "wb") as buffer:
|
| 71 |
-
shutil.copyfileobj(file.file, buffer)
|
| 72 |
|
| 73 |
-
|
| 74 |
-
print(f"🔄 Processing file: {file.filename}")
|
| 75 |
-
success = convert_to_wav(raw_filename, clean_wav_filename)
|
| 76 |
-
|
| 77 |
-
if not success:
|
| 78 |
-
raise HTTPException(status_code=400, detail="Audio file unreadable. Please upload MP3, WAV, or M4A.")
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
-
if "error" in result:
|
| 85 |
-
print(f"Pipeline Error: {result['error']}")
|
| 86 |
-
raise HTTPException(status_code=500, detail=result["error"])
|
| 87 |
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
except Exception as e:
|
| 93 |
-
print(f"Server Error: {e}")
|
| 94 |
-
raise HTTPException(status_code=500, detail="Internal Processing Error")
|
| 95 |
-
|
| 96 |
-
finally:
|
| 97 |
-
# 4. Cleanup temp files to save disk space
|
| 98 |
-
if os.path.exists(raw_filename):
|
| 99 |
-
os.remove(raw_filename)
|
| 100 |
-
if os.path.exists(clean_wav_filename):
|
| 101 |
-
os.remove(clean_wav_filename)
|
| 102 |
|
| 103 |
if __name__ == "__main__":
|
| 104 |
-
#
|
| 105 |
-
uvicorn.run(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Production FastAPI Server for Public Speaking Coach
|
| 3 |
+
Handles audio uploads and returns comprehensive analysis
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
import os
|
| 7 |
import shutil
|
| 8 |
+
import tempfile
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
import uvicorn
|
| 13 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException, status
|
|
|
|
| 14 |
from fastapi.middleware.cors import CORSMiddleware
|
| 15 |
+
from fastapi.responses import JSONResponse
|
| 16 |
+
from pydantic import BaseModel
|
| 17 |
|
| 18 |
+
from kid_coach_pipeline import PublicSpeakingCoach
|
| 19 |
|
| 20 |
+
# ================= APP CONFIGURATION =================

app = FastAPI(
    title="Public Speaking Coach API",
    description="AI-powered speech analysis and coaching for all ages",
    version="2.0.0"
)

# CORS Configuration - Adjust for production
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Change to specific domains in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global engine instance; set by startup_event, None while loading or after
# a failed startup (endpoints check this before analyzing).
coach_engine: Optional[PublicSpeakingCoach] = None

# Supported audio formats (extensions accepted by the /coach endpoint)
SUPPORTED_FORMATS = {
    '.wav', '.mp3', '.m4a', '.flac', '.ogg',
    '.wma', '.aac', '.mp4', '.webm'
}

# Maximum file size (50MB) enforced after reading the upload
MAX_FILE_SIZE = 50 * 1024 * 1024
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# ================= RESPONSE MODELS =================
|
| 51 |
+
|
| 52 |
+
class HealthResponse(BaseModel):
    """Health check response returned by / and /health."""
    # "online" at the root endpoint; "healthy" or "degraded" from /health
    status: str
    # True once the coach engine loaded successfully at startup
    engine_loaded: bool
    # File extensions accepted by the /coach endpoint
    supported_formats: list
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class ErrorResponse(BaseModel):
    """Error response format.

    NOTE(review): declared but not referenced by the handlers below, which
    build their JSON bodies inline — kept for API schema/documentation use.
    """
    # Short human-readable error description
    error: str
    # Optional extra context about the failure
    detail: Optional[str] = None
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# ================= STARTUP/SHUTDOWN =================
|
| 66 |
|
| 67 |
@app.on_event("startup")
async def startup_event():
    """Initialize the coach engine on server start.

    On failure the server still starts, but coach_engine stays None and the
    analysis endpoints respond 503 until it is fixed.
    NOTE(review): @app.on_event is deprecated in newer FastAPI releases in
    favor of lifespan handlers — consider migrating.
    """
    global coach_engine

    print("\n" + "="*60)
    print("🚀 PUBLIC SPEAKING COACH API - STARTING")
    print("="*60)

    try:
        # Loads the Whisper "base" model (and any other pipeline resources).
        print("\n📦 Loading AI models...")
        coach_engine = PublicSpeakingCoach(whisper_model_size="base")
        print("✅ Coach engine ready!")
        print("\n" + "="*60)
        print("🎤 API is ready to analyze speeches!")
        print("="*60 + "\n")

    except Exception as e:
        # Deliberate best-effort: keep serving health endpoints even when
        # model loading fails.
        print(f"\n❌ STARTUP FAILED: {e}")
        print("Server will start but analysis will not work.\n")
        coach_engine = None
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
@app.on_event("shutdown")
async def shutdown_event():
    """Cleanup on server shutdown (currently only logs; the engine holds no
    resources that require explicit release here)."""
    print("\n👋 Shutting down Public Speaking Coach API...")
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# ================= ENDPOINTS =================
|
| 97 |
+
|
| 98 |
+
@app.get("/", response_model=HealthResponse)
async def root():
    """Root endpoint - basic API info and engine status."""
    payload = {
        "status": "online",
        "engine_loaded": coach_engine is not None,
        "supported_formats": list(SUPPORTED_FORMATS),
    }
    return payload
|
| 106 |
+
|
| 107 |
|
| 108 |
+
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint: "healthy" when the engine is loaded,
    "degraded" otherwise."""
    engine_up = coach_engine is not None
    state = "healthy" if coach_engine else "degraded"
    return {
        "status": state,
        "engine_loaded": engine_up,
        "supported_formats": list(SUPPORTED_FORMATS),
    }
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
@app.post("/coach")
async def analyze_speech(file: UploadFile = File(...)):
    """
    Main endpoint: Upload audio file and receive speech analysis

    Args:
        file: Audio file (wav, mp3, m4a, flac, ogg, etc.)

    Returns:
        Comprehensive speech analysis with scores and coaching feedback

    Raises:
        400: Invalid file format or corrupted audio
        413: File too large
        500: Analysis failed
        503: Engine not loaded
    """

    # Check if engine is loaded (startup may have failed; see startup_event)
    if coach_engine is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coach engine not initialized. Please contact administrator."
        )

    # Validate file exists
    if not file:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="No file provided"
        )

    # Validate filename (needed below to derive the extension)
    if not file.filename:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid filename"
        )

    # Get file extension
    file_ext = Path(file.filename).suffix.lower()

    # Validate format against the module-level whitelist
    if file_ext not in SUPPORTED_FORMATS:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Unsupported format '{file_ext}'. Supported: {', '.join(SUPPORTED_FORMATS)}"
        )

    # Create temporary file; None until written so the finally-block cleanup
    # is safe on early failures.
    temp_file = None

    try:
        # Read file content fully into memory (size enforced just below)
        content = await file.read()

        # Check file size
        if len(content) > MAX_FILE_SIZE:
            raise HTTPException(
                status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
                detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
            )

        # Create temporary file with proper extension (the analyzer relies
        # on the suffix to decode the format)
        with tempfile.NamedTemporaryFile(
            delete=False,
            suffix=file_ext
        ) as temp:
            temp.write(content)
            temp_file = temp.name

        print(f"\n📁 Processing: {file.filename} ({len(content) / 1024:.1f} KB)")

        # Run analysis (synchronous; blocks this request until done)
        result = coach_engine.analyze_speech(temp_file)

        # Check for analysis errors reported in-band by the pipeline
        if "error" in result:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=result["error"]
            )

        print(f"✅ Analysis complete: Score {result['overall_score']}/100")

        return JSONResponse(content=result)

    except HTTPException:
        # Re-raise HTTP exceptions untouched so their status codes survive
        raise

    except Exception as e:
        # Log unexpected errors with a full traceback, surface as 500
        import traceback
        print(f"\n❌ ANALYSIS ERROR:")
        traceback.print_exc()

        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Analysis failed: {str(e)}"
        )

    finally:
        # Cleanup temporary file on every path (success, HTTP error, crash)
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                print(f"⚠️ Failed to delete temp file: {e}")
|
| 227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
|
| 229 |
+
@app.post("/analyze")
async def analyze_speech_alias(file: UploadFile = File(...)):
    """Alias endpoint for /coach (for compatibility).

    Delegates directly so both routes share the same validation, analysis,
    and temp-file cleanup.
    """
    return await analyze_speech(file)
|
| 233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
|
| 235 |
+
# ================= ERROR HANDLERS =================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
+
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc):
    """Custom HTTP exception handler.

    Normalizes every HTTPException into a JSON body of the form
    {"error": <detail>, "status_code": <code>}.
    """
    return JSONResponse(
        status_code=exc.status_code,
        content={
            "error": exc.detail,
            "status_code": exc.status_code
        }
    )
|
| 247 |
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
+
@app.exception_handler(Exception)
async def general_exception_handler(request, exc):
    """Catch-all exception handler.

    Prints the full traceback server-side and returns a generic 500 JSON
    body. NOTE(review): str(exc) is echoed to the client, which may leak
    internal details — consider omitting it in production.
    """
    import traceback
    traceback.print_exc()

    return JSONResponse(
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        content={
            "error": "Internal server error",
            "detail": str(exc)
        }
    )
|
| 262 |
|
| 263 |
+
|
| 264 |
+
# ================= MAIN =================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
if __name__ == "__main__":
    # For local development only; in production the app is served by the
    # container's uvicorn invocation instead.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
        log_level="info"
    )
|
requirements (1).txt
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core Dependencies - Tested & Compatible
|
| 2 |
+
torch==2.1.0
|
| 3 |
+
torchaudio==2.1.0
|
| 4 |
+
openai-whisper==20231117
|
| 5 |
+
|
| 6 |
+
# Audio Processing
|
| 7 |
+
librosa==0.10.1
|
| 8 |
+
soundfile==0.12.1
|
| 9 |
+
scipy==1.11.4
|
| 10 |
+
numpy==1.24.3
|
| 11 |
+
|
| 12 |
+
# Text Analysis
|
| 13 |
+
textstat==0.7.3
|
| 14 |
+
language-tool-python==2.8.0
|
| 15 |
+
|
| 16 |
+
# API Framework
|
| 17 |
+
fastapi==0.109.0
|
| 18 |
+
uvicorn[standard]==0.27.0
|
| 19 |
+
python-multipart==0.0.6
|
| 20 |
+
|
| 21 |
+
# LLM Integration (lightweight, no GPU needed)
|
| 22 |
+
transformers==4.36.0
|
| 23 |
+
sentencepiece==0.1.99
|
| 24 |
+
|
| 25 |
+
# Utilities
|
| 26 |
+
pydantic==2.5.3
|
| 27 |
+
python-dotenv==1.0.0
|