akpande2 commited on
Commit
1ba09d2
·
verified ·
1 Parent(s): 8e3cd10

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +38 -33
  2. kid_coach_pipeline.py +845 -150
  3. main.py +239 -71
  4. requirements (1).txt +27 -0
Dockerfile CHANGED
@@ -1,42 +1,47 @@
1
- # Use NVIDIA CUDA 11.8 base
2
- FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
3
-
4
- # Setup environment
5
- ENV DEBIAN_FRONTEND=noninteractive \
6
- PYTHONUNBUFFERED=1 \
7
- PIP_CACHE_DIR=/var/cache/pip
8
-
9
- # 1. Install System Dependencies (FFmpeg is still needed for converting audio)
10
- RUN apt-get update && apt-get install -y --no-install-recommends \
11
- python3.10 \
12
- python3-pip \
13
- python3-dev \
14
  ffmpeg \
 
15
  git \
16
- wget \
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
- # Set python3.10 as default
20
- RUN ln -s /usr/bin/python3.10 /usr/bin/python
21
-
22
- # 2. Setup User
23
  WORKDIR /app
24
- RUN useradd -m -u 1000 user
25
- RUN chown -R user:user /app
26
- USER user
27
- ENV PATH="/home/user/.local/bin:$PATH"
28
 
29
- # 3. Install Python Dependencies
30
- COPY --chown=user requirements.txt requirements.txt
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- RUN pip install --no-cache-dir --upgrade pip && \
33
- # Install Torch first to handle the index URL correctly
34
- pip install --no-cache-dir "torch==2.1.2" "torchaudio==2.1.2" --index-url https://download.pytorch.org/whl/cu118 && \
35
- # Install the rest
36
- pip install --no-cache-dir -r requirements.txt
37
 
38
- # 4. Copy Code
39
- COPY --chown=user . .
 
40
 
41
- # 5. Launch
42
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
# Production Dockerfile for Public Speaking Coach API
# Optimized for Hugging Face Spaces or any cloud deployment

FROM python:3.10-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Install system dependencies.
# --no-install-recommends keeps the image slim (ffmpeg pulls large extras otherwise).
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    git \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements first (for better layer caching)
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download LanguageTool data (for grammar checking); best-effort, never fails the build
RUN python -c "import language_tool_python; language_tool_python.LanguageTool('en-US')" || true

# Copy application code
COPY kid_coach_pipeline.py .
COPY main.py .

# Create directory for temporary files
RUN mkdir -p /tmp/uploads

# Expose port
EXPOSE 7860

# Health check.
# Fix: the original `requests.get(...)` alone only failed on connection errors —
# a 5xx from /health still exited 0. raise_for_status() makes non-2xx fail the check,
# and a timeout prevents the check from hanging past Docker's own timeout.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import requests; requests.get('http://localhost:7860/health', timeout=5).raise_for_status()"

# Run the application
# Use port 7860 for Hugging Face Spaces compatibility
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
kid_coach_pipeline.py CHANGED
@@ -1,178 +1,873 @@
 
 
 
 
 
1
  import os
 
 
 
 
2
  import re
3
- import gc
 
 
4
  import torch
5
- import torchaudio
6
  import librosa
7
  import numpy as np
8
- from pyannote.audio import Pipeline
9
- from transformers import (
10
- AutoTokenizer,
11
- AutoModelForCausalLM,
12
- BitsAndBytesConfig,
13
- pipeline
14
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- class KidCoachEngine:
17
- def __init__(self, hf_token: str):
18
- self.hf_token = hf_token
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
20
- self.torch_dtype = torch.float16 if self.device == "cuda" else torch.float32
21
- self.llm_id = "microsoft/Phi-3-mini-4k-instruct"
 
 
 
22
 
23
- # Filler words database
24
- self.filler_words = {
25
- 'um', 'uh', 'er', 'ah', 'like', 'you know', 'basically',
26
- 'literally', 'actually', 'mean', 'right', 'okay', 'sort of'
 
 
 
 
 
 
 
 
 
27
  }
28
-
29
- def _analyze_metrics(self, transcript_chunks, duration):
30
- """Calculates WPM, Fillers, and Stats"""
31
- full_text = " ".join([c['text'] for c in transcript_chunks]).strip()
32
- words = full_text.split()
33
- total_words = len(words)
34
 
35
- wpm = (total_words / duration) * 60 if duration > 0 else 0
36
-
37
- # Filler Density
38
- fillers_found = []
39
- for w in words:
40
- # Strip punctuation
41
- clean = re.sub(r'[^\w\s]', '', w.lower())
42
- if clean in self.filler_words:
43
- fillers_found.append(clean)
44
 
45
- filler_pct = (len(fillers_found)/total_words)*100 if total_words > 0 else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- return {
48
- "full_text": full_text,
49
- "wpm": round(wpm, 1),
50
- "duration": round(duration, 2),
51
- "fillers_count": len(fillers_found),
52
- "fillers_list": list(set(fillers_found)),
53
- "filler_pct": round(filler_pct, 1)
54
- }
55
-
56
- def _generate_coaching_feedback(self, metrics):
57
- """Loads LLM, generates feedback, then unloads it"""
58
- print("🧠 Loading AI Coach...")
59
  try:
60
- bnb_config = BitsAndBytesConfig(
61
- load_in_4bit=True,
62
- bnb_4bit_compute_dtype=torch.float16
63
- )
64
- tokenizer = AutoTokenizer.from_pretrained(self.llm_id, token=self.hf_token)
65
- model = AutoModelForCausalLM.from_pretrained(
66
- self.llm_id,
67
- quantization_config=bnb_config,
68
- device_map="auto",
69
- token=self.hf_token,
70
- trust_remote_code=True
 
 
 
 
 
 
 
 
 
 
71
  )
72
-
73
- prompt = f"""
74
- You are a kind, professional Public Speaking Coach.
75
 
76
- SPEECH DATA:
77
- - Transcript: "{metrics['full_text'][:1500]}..."
78
- - Speed: {metrics['wpm']} WPM (Target: 130-150)
79
- - Filler Words: {metrics['fillers_count']} found ({metrics['filler_pct']}%)
 
 
80
 
81
- TASK:
82
- 1. Give a score out of 10.
83
- 2. List 2 Strengths.
84
- 3. List 1 specific Improvement (Pace, Fillers, or Grammar).
85
- 4. Give a "Pro Tip".
86
 
87
- Format nicely with emojis. Keep it concise.
88
- """
 
 
 
 
89
 
90
- messages = [{"role": "user", "content": prompt}]
91
- input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(self.device)
 
92
 
93
- outputs = model.generate(input_ids, max_new_tokens=400, temperature=0.7)
94
- feedback = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
 
95
 
96
- # CLEANUP LLM
97
- del model, tokenizer
98
- gc.collect()
99
- torch.cuda.empty_cache()
 
 
100
 
101
- return feedback
 
 
 
 
 
 
 
 
 
 
102
 
103
- except Exception as e:
104
- return f"Coach Error: {str(e)}"
105
-
106
- def process_pipeline(self, audio_path):
107
- if not self.hf_token:
108
- return {"error": "HF_TOKEN missing in server secrets"}
109
-
110
- try:
111
- # 1. TRANSCRIPTION (Using HuggingFace Transformers - THE SAFE WAY)
112
- print("🎧 Transcribing...")
113
- # We use openai/whisper-large-v3 directly via Transformers
114
- # This avoids all the C++ build errors of faster-whisper
115
- transcriber = pipeline(
116
- "automatic-speech-recognition",
117
- model="openai/whisper-large-v3",
118
- device=self.device,
119
- torch_dtype=self.torch_dtype,
120
- chunk_length_s=30
121
- )
122
-
123
- # Run transcription with timestamps
124
- result = transcriber(audio_path, return_timestamps=True)
125
-
126
- # Cleanup Transcription Model
127
- del transcriber
128
- gc.collect()
129
- torch.cuda.empty_cache()
130
-
131
- if not result['text']:
132
- return {"error": "No speech detected."}
133
-
134
- # Calculate Audio Duration for WPM
135
- duration = librosa.get_duration(path=audio_path)
136
-
137
- # 2. METRICS
138
- print("📊 Analyzing...")
139
- # Transformers output format is different, we adapt here
140
- transcript_chunks = result.get('chunks', [{'text': result['text']}])
141
- metrics = self._analyze_metrics(transcript_chunks, duration)
142
-
143
- # 3. DIARIZATION (Quick check for multiple speakers)
144
- print("🗣️ Checking Speakers...")
145
- try:
146
- diar = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=self.hf_token)
147
- diar.to(torch.device(self.device))
148
- wav, sr = torchaudio.load(audio_path)
149
- d_result = diar({"waveform": wav, "sample_rate": sr})
150
- speaker_count = len(d_result.labels())
151
- del diar
152
- gc.collect()
153
- torch.cuda.empty_cache()
154
- except:
155
- speaker_count = 1
156
-
157
- metrics["speaker_count"] = speaker_count
158
-
159
- # 4. LLM COACH
160
- print("🧠 Coaching...")
161
- feedback = self._generate_coaching_feedback(metrics)
162
 
163
- return {
164
- "transcript": metrics['full_text'],
165
- "stats": {
166
- "wpm": metrics['wpm'],
167
- "duration": metrics['duration'],
168
- "fillers_count": metrics['fillers_count'],
169
- "filler_percentage": metrics['filler_pct'],
170
- "speakers_detected": speaker_count
 
 
 
 
 
171
  },
172
- "coach_feedback": feedback
 
 
 
 
 
 
 
 
 
173
  }
174
-
 
 
 
175
  except Exception as e:
176
  import traceback
177
  traceback.print_exc()
178
- return {"error": str(e)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Production-Ready Public Speaking Coach Engine
3
+ Supports all ages with comprehensive speech analysis
4
+ """
5
+
6
  import os
7
+ import io
8
+ import json
9
+ import logging
10
+ import warnings
11
  import re
12
+ from typing import Dict, List, Any, Optional
13
+ from dataclasses import dataclass, asdict
14
+
15
  import torch
 
16
  import librosa
17
  import numpy as np
18
+ import soundfile as sf
19
+ from scipy.signal import medfilt
20
+ from scipy.stats import zscore
21
+ import textstat
22
+
23
+ # Suppress warnings
24
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
25
+ logging.getLogger("whisper").setLevel(logging.ERROR)
26
+ logging.getLogger("transformers").setLevel(logging.ERROR)
27
+ warnings.filterwarnings("ignore")
28
+
29
# Validate Whisper installation: the unrelated PyPI package "whisper" shadows
# "openai-whisper" and lacks load_model, so fail fast with a clear remedy.
try:
    import whisper
    if not hasattr(whisper, "load_model"):
        raise ImportError("Wrong whisper library installed")
except ImportError:
    print("\n❌ CRITICAL: Install correct whisper library:")
    print("   pip uninstall -y whisper && pip install openai-whisper")
    # Fix: use SystemExit instead of exit() — the exit() builtin is injected
    # by the `site` module and is not guaranteed to exist in all environments
    # (e.g. when run with python -S or frozen).
    raise SystemExit(1)
38
+
39
# Grammar checker handle, created on first use to avoid slowing startup.
GRAMMAR_TOOL = None

def get_grammar_tool():
    """Return the shared LanguageTool instance, or None if unavailable.

    The tool is built lazily on the first call; a failed initialization is
    remembered (sentinel ``False``) so later calls do not retry the import.
    """
    global GRAMMAR_TOOL
    if GRAMMAR_TOOL is not None:
        # Already resolved: either a live tool or the False "unavailable" sentinel.
        return GRAMMAR_TOOL or None
    try:
        import language_tool_python
        GRAMMAR_TOOL = language_tool_python.LanguageTool('en-US')
    except Exception as e:
        logging.warning(f"Grammar tool not available: {e}")
        GRAMMAR_TOOL = False
    return GRAMMAR_TOOL or None
53
+
54
# JSON Serialization Helper
class NumpyEncoder(json.JSONEncoder):
    """JSONEncoder that maps numpy scalars and arrays onto native Python types."""

    def default(self, obj):
        # Arrays become nested lists; scalar wrappers become builtin int/float.
        # Anything else is deferred to the base class, which raises TypeError.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, (np.integer, np.int64)):
            return int(obj)
        if isinstance(obj, (np.floating, np.float32, np.float64)):
            return float(obj)
        return super().default(obj)
65
+
66
+
67
@dataclass
class AnalysisResult:
    """Structured result for type safety.

    NOTE(review): this dataclass mirrors the dict assembled by
    analyze_speech but is not constructed anywhere in the visible code —
    presumably intended for typed consumers; verify against callers.
    """
    # Aggregate scores on a 0-100 scale (overall is a weighted blend).
    overall_score: int
    fluency_score: int
    confidence_score: int
    content_score: int
    grammar_score: int

    # Raw transcription output.
    transcription: str
    word_count: int
    duration_seconds: float

    # Fluency details: filler counts, repeated n-grams, pauses over threshold.
    filler_words: Dict[str, int]
    repeated_phrases: List[Dict[str, Any]]
    long_pauses: List[Dict[str, float]]

    # Per-dimension analysis payloads.
    pace_analysis: Dict[str, Any]
    tone_analysis: Dict[str, Any]
    grammar_issues: List[Dict[str, str]]

    # Human-readable coaching output.
    strengths: List[str]
    improvements: List[str]
    coaching_feedback: str

    def to_dict(self):
        """Return the result as a plain, JSON-serializable dict."""
        return asdict(self)
94
 
95
+
96
+ class PublicSpeakingCoach:
97
+ """
98
+ Complete speech analysis engine for public speaking coaching
99
+ Features:
100
+ - Transcription with word-level timestamps
101
+ - Filler word detection
102
+ - Silence/pause analysis
103
+ - Repeated phrase detection
104
+ - Tone & confidence analysis
105
+ - Grammar checking
106
+ - Content quality analysis
107
+ - AI-powered coaching feedback
108
+ """
109
+
110
    def __init__(self, whisper_model_size: str = "base"):
        """
        Initialize the coach engine

        Args:
            whisper_model_size: Whisper model size (tiny/base/small/medium)
                base = good balance, small = better accuracy
        """
        # Prefer GPU when available; inference is far faster on CUDA.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🚀 Initializing Public Speaking Coach on {self.device}...")

        # Load Whisper for transcription (blocks until weights are loaded,
        # downloading them on first run).
        print(f" Loading Whisper ({whisper_model_size})...")
        self.whisper = whisper.load_model(whisper_model_size, device=self.device)

        # Linguistic patterns: one regex per filler, keyed by display name.
        # "um+h*" / "uh+h*" also match drawn-out variants like "ummm"/"uhh".
        # NOTE(review): "like"/"so"/"actually" match every occurrence,
        # including legitimate non-filler uses — counts may over-report.
        self.filler_patterns = {
            "um": r"\bum+h*\b",
            "uh": r"\buh+h*\b",
            "like": r"\blike\b",
            "you know": r"\byou know\b",
            "so": r"\bso+\b",
            "actually": r"\bactually\b",
            "basically": r"\bbasically\b",
            "literally": r"\bliterally\b",
            "i mean": r"\bi mean\b",
            "kind of": r"\bkind of\b",
            "sort of": r"\bsort of\b"
        }

        # Vocabulary treated as persuasive/impactful by _analyze_content.
        self.power_words = {
            "evidence", "data", "research", "proven", "significantly",
            "innovative", "transform", "achieve", "success", "solution",
            "effective", "results", "impact", "value", "opportunity",
            "believe", "imagine", "discover", "realize", "understand"
        }

        print("✅ Coach Engine Ready!")
148
+
149
+
150
    def analyze_speech(self, audio_path: str) -> Dict[str, Any]:
        """
        Main analysis pipeline

        Args:
            audio_path: Path to audio file

        Returns:
            Comprehensive analysis results as dictionary. On failure the
            dict contains a single "error" key instead of raising — callers
            must check for it.
        """
        # Validation
        if not os.path.exists(audio_path):
            return {"error": "Audio file not found"}

        print(f"🎤 Analyzing: {os.path.basename(audio_path)}")

        try:
            # Load audio (resampled to 16 kHz mono by _load_audio).
            audio, sr = self._load_audio(audio_path)
            duration = len(audio) / sr

            if duration < 1.0:
                return {"error": "Audio too short (minimum 1 second)"}

            print(f" Duration: {duration:.1f}s")

            # Step 1: Transcription with word-level timestamps — every later
            # step consumes this text and/or the timestamped word list.
            print(" 📝 Transcribing...")
            transcript_data = self._transcribe_with_timestamps(audio)

            if not transcript_data['text'].strip():
                return {"error": "No speech detected"}

            # Step 2: Filler word analysis
            print(" 🔍 Detecting filler words...")
            filler_analysis = self._detect_fillers(
                transcript_data['text'],
                transcript_data['words']
            )

            # Step 3: Pause analysis (gaps between consecutive word timestamps)
            print(" ⏸️ Analyzing pauses...")
            pause_analysis = self._analyze_pauses(
                transcript_data['words'],
                duration
            )

            # Step 4: Repeated phrase detection (2-5 word n-grams)
            print(" 🔁 Detecting repetitions...")
            repetition_analysis = self._detect_repetitions(
                transcript_data['words']
            )

            # Step 5: Pace analysis (words-per-minute and consistency)
            print(" ⚡ Analyzing pace...")
            pace_analysis = self._analyze_pace(
                transcript_data['words'],
                duration
            )

            # Step 6: Tone & confidence analysis (pitch/energy on raw audio)
            print(" 🎵 Analyzing tone & confidence...")
            tone_analysis = self._analyze_tone_confidence(audio, sr)

            # Step 7: Grammar check (no-op result if LanguageTool unavailable)
            print(" ✍️ Checking grammar...")
            grammar_analysis = self._check_grammar(transcript_data['text'])

            # Step 8: Content quality analysis (readability, vocabulary)
            print(" 📊 Evaluating content...")
            content_analysis = self._analyze_content(
                transcript_data['text'],
                transcript_data['words']
            )

            # Step 9: Generate scores — folds all prior analyses into
            # fluency/confidence/content/grammar plus a weighted overall.
            print(" 🎯 Calculating scores...")
            scores = self._calculate_scores(
                filler_analysis,
                pause_analysis,
                repetition_analysis,
                pace_analysis,
                tone_analysis,
                grammar_analysis,
                content_analysis
            )

            # Step 10: Generate coaching feedback (human-readable summary)
            print(" 🤖 Generating coaching...")
            coaching = self._generate_coaching(
                scores,
                filler_analysis,
                pause_analysis,
                repetition_analysis,
                pace_analysis,
                tone_analysis,
                grammar_analysis,
                content_analysis
            )

            # Compile final result
            result = {
                "overall_score": scores['overall'],
                "scores": {
                    "fluency": scores['fluency'],
                    "confidence": scores['confidence'],
                    "content": scores['content'],
                    "grammar": scores['grammar']
                },
                "transcription": {
                    "text": transcript_data['text'],
                    "word_count": len(transcript_data['words']),
                    "duration_seconds": round(duration, 2)
                },
                "fluency_analysis": {
                    "filler_words": filler_analysis,
                    "repeated_phrases": repetition_analysis,
                    "long_pauses": pause_analysis['long_pauses']
                },
                "pace_analysis": pace_analysis,
                "tone_analysis": tone_analysis,
                "grammar_analysis": grammar_analysis,
                "content_analysis": content_analysis,
                "coaching": coaching
            }

            print("✅ Analysis complete!")
            return result

        except Exception as e:
            # Catch-all API boundary: log the traceback server-side and
            # return the error as data rather than propagating.
            import traceback
            traceback.print_exc()
            return {"error": f"Analysis failed: {str(e)}"}
283
+
284
+
285
+ def _load_audio(self, path: str) -> tuple:
286
+ """Load and normalize audio to 16kHz mono"""
287
+ try:
288
+ audio, sr = librosa.load(path, sr=16000, mono=True)
289
+ # Normalize to prevent clipping
290
+ audio = librosa.util.normalize(audio)
291
+ return audio, sr
292
+ except Exception as e:
293
+ raise ValueError(f"Failed to load audio: {e}")
294
+
295
+
296
+ def _transcribe_with_timestamps(self, audio: np.ndarray) -> Dict:
297
+ """Transcribe with word-level timestamps"""
298
+ result = self.whisper.transcribe(
299
+ audio,
300
+ language='en',
301
+ word_timestamps=True,
302
+ fp16=(self.device == "cuda")
303
+ )
304
+
305
+ words = []
306
+ for segment in result['segments']:
307
+ if 'words' in segment:
308
+ for word_info in segment['words']:
309
+ words.append({
310
+ 'word': word_info['word'].strip(),
311
+ 'start': word_info['start'],
312
+ 'end': word_info['end'],
313
+ 'confidence': word_info.get('probability', 1.0)
314
+ })
315
+
316
+ return {
317
+ 'text': result['text'].strip(),
318
+ 'words': words
319
+ }
320
+
321
+
322
+ def _detect_fillers(self, text: str, words: List[Dict]) -> Dict:
323
+ """Detect filler words with counts and positions"""
324
+ text_lower = text.lower()
325
+ filler_counts = {}
326
+ filler_positions = []
327
+
328
+ for filler_name, pattern in self.filler_patterns.items():
329
+ matches = list(re.finditer(pattern, text_lower, re.IGNORECASE))
330
+ count = len(matches)
331
+ if count > 0:
332
+ filler_counts[filler_name] = count
333
+ for match in matches:
334
+ filler_positions.append({
335
+ 'filler': filler_name,
336
+ 'position': match.start()
337
+ })
338
+
339
+ total_fillers = sum(filler_counts.values())
340
+ total_words = len(words)
341
+ filler_rate = (total_fillers / total_words * 100) if total_words > 0 else 0
342
+
343
+ return {
344
+ 'total_count': total_fillers,
345
+ 'rate_percentage': round(filler_rate, 2),
346
+ 'breakdown': filler_counts,
347
+ 'positions': filler_positions
348
+ }
349
+
350
+
351
+ def _analyze_pauses(self, words: List[Dict], duration: float) -> Dict:
352
+ """Analyze pause patterns"""
353
+ if len(words) < 2:
354
+ return {'long_pauses': [], 'average_pause': 0, 'silence_percentage': 0}
355
+
356
+ pauses = []
357
+ long_pauses = []
358
+
359
+ for i in range(len(words) - 1):
360
+ pause_duration = words[i+1]['start'] - words[i]['end']
361
+ if pause_duration > 0:
362
+ pauses.append(pause_duration)
363
+ if pause_duration > 2.0: # Long pause threshold
364
+ long_pauses.append({
365
+ 'duration': round(pause_duration, 2),
366
+ 'after_word': words[i]['word'],
367
+ 'timestamp': round(words[i]['end'], 2)
368
+ })
369
+
370
+ avg_pause = np.mean(pauses) if pauses else 0
371
+ total_pause_time = sum(pauses)
372
+ silence_pct = (total_pause_time / duration * 100) if duration > 0 else 0
373
+
374
+ return {
375
+ 'long_pauses': long_pauses,
376
+ 'long_pause_count': len(long_pauses),
377
+ 'average_pause_seconds': round(avg_pause, 2),
378
+ 'silence_percentage': round(silence_pct, 2)
379
+ }
380
+
381
+
382
+ def _detect_repetitions(self, words: List[Dict]) -> List[Dict]:
383
+ """Detect repeated phrases (2-5 words)"""
384
+ repetitions = []
385
+ word_list = [w['word'].lower().strip('.,!?') for w in words]
386
+
387
+ # Check for n-gram repetitions (2-5 words)
388
+ for n in range(2, 6):
389
+ seen = {}
390
+ for i in range(len(word_list) - n + 1):
391
+ phrase = ' '.join(word_list[i:i+n])
392
+ if phrase in seen:
393
+ repetitions.append({
394
+ 'phrase': phrase,
395
+ 'count': seen[phrase] + 1,
396
+ 'length': n
397
+ })
398
+ seen[phrase] += 1
399
+ else:
400
+ seen[phrase] = 1
401
+
402
+ # Remove duplicates and sort by count
403
+ unique_reps = {}
404
+ for rep in repetitions:
405
+ key = rep['phrase']
406
+ if key not in unique_reps or rep['count'] > unique_reps[key]['count']:
407
+ unique_reps[key] = rep
408
+
409
+ return sorted(unique_reps.values(), key=lambda x: x['count'], reverse=True)[:10]
410
+
411
+
412
+ def _analyze_pace(self, words: List[Dict], duration: float) -> Dict:
413
+ """Analyze speaking pace"""
414
+ word_count = len(words)
415
+ wpm = (word_count / duration * 60) if duration > 0 else 0
416
+
417
+ # Determine pace category
418
+ if wpm < 100:
419
+ pace_category = "Too Slow"
420
+ pace_feedback = "Consider speaking slightly faster for better engagement"
421
+ elif wpm < 130:
422
+ pace_category = "Good"
423
+ pace_feedback = "Your pace is comfortable and easy to follow"
424
+ elif wpm < 160:
425
+ pace_category = "Optimal"
426
+ pace_feedback = "Excellent pacing - clear and engaging"
427
+ elif wpm < 180:
428
+ pace_category = "Fast"
429
+ pace_feedback = "Speaking quickly but still understandable"
430
+ else:
431
+ pace_category = "Too Fast"
432
+ pace_feedback = "Try slowing down to ensure clarity"
433
+
434
+ # Calculate pace variance (consistency)
435
+ if len(words) > 10:
436
+ segment_size = max(5, len(words) // 10)
437
+ segment_paces = []
438
+ for i in range(0, len(words) - segment_size, segment_size):
439
+ segment = words[i:i+segment_size]
440
+ seg_duration = segment[-1]['end'] - segment[0]['start']
441
+ if seg_duration > 0:
442
+ seg_wpm = len(segment) / seg_duration * 60
443
+ segment_paces.append(seg_wpm)
444
+
445
+ pace_variance = np.std(segment_paces) if len(segment_paces) > 1 else 0
446
+ consistency = "High" if pace_variance < 20 else "Medium" if pace_variance < 40 else "Low"
447
+ else:
448
+ pace_variance = 0
449
+ consistency = "N/A"
450
+
451
+ return {
452
+ 'words_per_minute': round(wpm, 1),
453
+ 'category': pace_category,
454
+ 'consistency': consistency,
455
+ 'pace_variance': round(pace_variance, 1),
456
+ 'feedback': pace_feedback
457
+ }
458
+
459
+
460
    def _analyze_tone_confidence(self, audio: np.ndarray, sr: int) -> Dict:
        """Analyze tone variation and confidence indicators.

        Uses pitch (YIN f0) for expressiveness and RMS energy for volume
        stability; confidence is a heuristic blend of both.
        NOTE(review): sr is accepted but not used — librosa.yin/rms here run
        on their defaults; confirm the 16 kHz assumption from _load_audio.
        """
        # Pitch analysis (fundamental frequency)
        try:
            f0 = librosa.yin(
                audio.astype(np.float64),
                fmin=80,   # Male range
                fmax=400   # Female range
            )
            # Drop unvoiced/invalid frames before computing statistics.
            f0_clean = f0[f0 > 0]

            if len(f0_clean) > 0:
                avg_pitch = np.mean(f0_clean)
                pitch_std = np.std(f0_clean)
                pitch_range = np.ptp(f0_clean)

                # Pitch variation indicates expressiveness
                if pitch_std < 20:
                    expressiveness = "Monotone"
                    expression_score = 40
                elif pitch_std < 40:
                    expressiveness = "Moderate Variation"
                    expression_score = 70
                else:
                    expressiveness = "Expressive"
                    expression_score = 95
            else:
                # No voiced frames: neutral fallbacks.
                avg_pitch = 0
                pitch_std = 0
                pitch_range = 0
                expressiveness = "Unknown"
                expression_score = 50
        except Exception as e:
            # Same neutral fallbacks if pitch extraction itself fails.
            logging.warning(f"Pitch analysis failed: {e}")
            avg_pitch = 0
            pitch_std = 0
            pitch_range = 0
            expressiveness = "Unknown"
            expression_score = 50

        # Energy/Volume analysis
        rms = librosa.feature.rms(y=audio)[0]
        # NOTE(review): avg_energy and pitch_range are computed but never
        # used or returned — dead values, or intended for a future metric?
        avg_energy = np.mean(rms)
        energy_std = np.std(rms)

        # Volume consistency (lower RMS std-dev = steadier delivery)
        if energy_std < 0.02:
            volume_consistency = "Very Consistent"
        elif energy_std < 0.05:
            volume_consistency = "Consistent"
        else:
            volume_consistency = "Varied"

        # Confidence estimation (based on volume stability and pitch):
        # base 50, +15 stable volume, +15 comfortable pitch band,
        # +20 some pitch variation (sounds engaged). Thresholds are
        # heuristic, not calibrated.
        confidence_score = 50  # Base
        if energy_std < 0.03:  # Stable volume
            confidence_score += 15
        if 150 < avg_pitch < 250:  # Comfortable pitch range
            confidence_score += 15
        if pitch_std > 20:  # Some variation (engaged)
            confidence_score += 20

        confidence_score = min(100, max(0, confidence_score))

        return {
            'expressiveness': expressiveness,
            'expression_score': expression_score,
            'average_pitch_hz': round(float(avg_pitch), 1),
            'pitch_variation_hz': round(float(pitch_std), 1),
            'volume_consistency': volume_consistency,
            'confidence_score': round(confidence_score, 1)
        }
532
+
533
+
534
+ def _check_grammar(self, text: str) -> Dict:
535
+ """Check grammar using language-tool-python"""
536
+ grammar_tool = get_grammar_tool()
537
+
538
+ if grammar_tool is None:
539
+ return {
540
+ 'issue_count': 0,
541
+ 'issues': [],
542
+ 'available': False
543
+ }
544
+
545
+ try:
546
+ matches = grammar_tool.check(text)
547
+ issues = []
548
+
549
+ for match in matches[:20]: # Limit to top 20
550
+ issues.append({
551
+ 'type': match.ruleId,
552
+ 'message': match.message,
553
+ 'context': match.context,
554
+ 'suggestions': match.replacements[:3]
555
+ })
556
+
557
+ return {
558
+ 'issue_count': len(matches),
559
+ 'issues': issues,
560
+ 'available': True
561
+ }
562
+ except Exception as e:
563
+ logging.warning(f"Grammar check failed: {e}")
564
+ return {
565
+ 'issue_count': 0,
566
+ 'issues': [],
567
+ 'available': False
568
+ }
569
+
570
+
571
+ def _analyze_content(self, text: str, words: List[Dict]) -> Dict:
572
+ """Analyze content quality and complexity"""
573
+ # Readability metrics
574
+ try:
575
+ flesch_score = textstat.flesch_reading_ease(text)
576
+ grade_level = textstat.text_standard(text, float_output=True)
577
+ except:
578
+ flesch_score = 50
579
+ grade_level = 8
580
+
581
+ # Interpret Flesch score
582
+ if flesch_score >= 90:
583
+ readability = "Very Easy"
584
+ elif flesch_score >= 70:
585
+ readability = "Easy"
586
+ elif flesch_score >= 50:
587
+ readability = "Moderate"
588
+ elif flesch_score >= 30:
589
+ readability = "Difficult"
590
+ else:
591
+ readability = "Very Difficult"
592
+
593
+ # Power word usage
594
+ word_list = [w['word'].lower().strip('.,!?') for w in words]
595
+ power_word_count = sum(1 for w in word_list if w in self.power_words)
596
+ power_word_rate = (power_word_count / len(words) * 100) if len(words) > 0 else 0
597
+
598
+ # Vocabulary diversity
599
+ unique_words = len(set(word_list))
600
+ vocab_diversity = (unique_words / len(words) * 100) if len(words) > 0 else 0
601
+
602
+ # Sentence structure (approximate from punctuation)
603
+ sentence_count = max(1, text.count('.') + text.count('!') + text.count('?'))
604
+ avg_sentence_length = len(words) / sentence_count
605
+
606
+ return {
607
+ 'readability_score': round(flesch_score, 1),
608
+ 'readability_level': readability,
609
+ 'grade_level': round(grade_level, 1),
610
+ 'power_words_used': power_word_count,
611
+ 'power_word_rate': round(power_word_rate, 2),
612
+ 'vocabulary_diversity': round(vocab_diversity, 1),
613
+ 'unique_word_count': unique_words,
614
+ 'average_sentence_length': round(avg_sentence_length, 1)
615
+ }
616
+
617
+
618
+ def _calculate_scores(
619
+ self,
620
+ filler_analysis: Dict,
621
+ pause_analysis: Dict,
622
+ repetition_analysis: List,
623
+ pace_analysis: Dict,
624
+ tone_analysis: Dict,
625
+ grammar_analysis: Dict,
626
+ content_analysis: Dict
627
+ ) -> Dict:
628
+ """Calculate comprehensive scores"""
629
+
630
+ # Fluency Score (0-100)
631
+ fluency = 100
632
+ fluency -= min(30, filler_analysis['rate_percentage'] * 5) # Filler penalty
633
+ fluency -= min(20, pause_analysis['long_pause_count'] * 5) # Long pause penalty
634
+ fluency -= min(15, len(repetition_analysis) * 3) # Repetition penalty
635
+
636
+ # Pace bonus/penalty
637
+ wpm = pace_analysis['words_per_minute']
638
+ if 130 <= wpm <= 160:
639
+ fluency += 5 # Optimal pace bonus
640
+ elif wpm < 100 or wpm > 180:
641
+ fluency -= 10 # Poor pace penalty
642
+
643
+ fluency = max(0, min(100, fluency))
644
+
645
+ # Confidence Score (from tone analysis)
646
+ confidence = tone_analysis['confidence_score']
647
+
648
+ # Content Score (0-100)
649
+ content = 50 # Base
650
+ content += min(30, content_analysis['power_word_rate'] * 3) # Power words
651
+ content += min(20, content_analysis['vocabulary_diversity'] / 5) # Diversity
652
+
653
+ # Readability bonus/penalty
654
+ flesch = content_analysis['readability_score']
655
+ if 50 <= flesch <= 70:
656
+ content += 10
657
+
658
+ content = max(0, min(100, content))
659
+
660
+ # Grammar Score (0-100)
661
+ if grammar_analysis['available']:
662
+ grammar = max(0, 100 - grammar_analysis['issue_count'] * 2)
663
+ else:
664
+ grammar = 85 # Default if unavailable
665
+
666
+ # Overall Score (weighted average)
667
+ overall = (
668
+ fluency * 0.35 +
669
+ confidence * 0.25 +
670
+ content * 0.25 +
671
+ grammar * 0.15
672
+ )
673
+
674
+ return {
675
+ 'overall': round(overall),
676
+ 'fluency': round(fluency),
677
+ 'confidence': round(confidence),
678
+ 'content': round(content),
679
+ 'grammar': round(grammar)
680
+ }
681
+
682
+
683
+ def _generate_coaching(
684
+ self,
685
+ scores: Dict,
686
+ filler_analysis: Dict,
687
+ pause_analysis: Dict,
688
+ repetition_analysis: List,
689
+ pace_analysis: Dict,
690
+ tone_analysis: Dict,
691
+ grammar_analysis: Dict,
692
+ content_analysis: Dict
693
+ ) -> Dict:
694
+ """Generate personalized coaching feedback"""
695
+
696
+ strengths = []
697
+ improvements = []
698
+
699
+ # Analyze strengths
700
+ if scores['fluency'] >= 80:
701
+ strengths.append("Excellent fluency - your speech flows naturally")
702
+
703
+ if filler_analysis['rate_percentage'] < 2:
704
+ strengths.append("Minimal use of filler words - very professional")
705
+
706
+ if pace_analysis['words_per_minute'] >= 130 and pace_analysis['words_per_minute'] <= 160:
707
+ strengths.append("Perfect speaking pace - clear and engaging")
708
+
709
+ if tone_analysis['expression_score'] >= 80:
710
+ strengths.append("Great vocal expressiveness - keeps audience engaged")
711
+
712
+ if content_analysis['power_word_rate'] >= 3:
713
+ strengths.append("Strong use of impactful vocabulary")
714
+
715
+ if scores['confidence'] >= 75:
716
+ strengths.append("Confident delivery with strong vocal presence")
717
+
718
+ # Identify improvements
719
+ if filler_analysis['rate_percentage'] >= 5:
720
+ improvements.append(
721
+ f"Reduce filler words ({filler_analysis['rate_percentage']:.1f}% of speech). "
722
+ "Try pausing silently instead of using 'um' or 'uh'"
723
+ )
724
+
725
+ if pause_analysis['long_pause_count'] >= 3:
726
+ improvements.append(
727
+ f"You have {pause_analysis['long_pause_count']} long pauses. "
728
+ "Practice smoother transitions between thoughts"
729
+ )
730
+
731
+ if len(repetition_analysis) >= 3:
732
+ top_rep = repetition_analysis[0]
733
+ improvements.append(
734
+ f"You repeated '{top_rep['phrase']}' {top_rep['count']} times. "
735
+ "Vary your phrasing for more engaging delivery"
736
+ )
737
+
738
+ wpm = pace_analysis['words_per_minute']
739
+ if wpm < 120:
740
+ improvements.append(
741
+ "Your pace is quite slow. Try speaking 10-15% faster to maintain energy"
742
+ )
743
+ elif wpm > 170:
744
+ improvements.append(
745
+ "You're speaking very quickly. Slow down 10-15% to ensure clarity"
746
+ )
747
+
748
+ if tone_analysis['expression_score'] < 60:
749
+ improvements.append(
750
+ "Add more vocal variety. Practice emphasizing key words and varying your pitch"
751
+ )
752
+
753
+ if grammar_analysis['available'] and grammar_analysis['issue_count'] >= 5:
754
+ improvements.append(
755
+ f"Found {grammar_analysis['issue_count']} grammar issues. "
756
+ "Review your script and practice correct phrasing"
757
+ )
758
+
759
+ if content_analysis['vocabulary_diversity'] < 40:
760
+ improvements.append(
761
+ "Expand your vocabulary. Using more diverse words makes speeches more engaging"
762
+ )
763
+
764
+ # Generate overall feedback message
765
+ overall_score = scores['overall']
766
+
767
+ if overall_score >= 90:
768
+ overall_feedback = (
769
+ "🌟 Outstanding performance! Your speech demonstrates excellent "
770
+ "command of public speaking fundamentals. You're ready for any audience!"
771
+ )
772
+ elif overall_score >= 75:
773
+ overall_feedback = (
774
+ "👏 Strong performance! You have solid public speaking skills. "
775
+ "Focus on the improvement areas to reach the next level."
776
+ )
777
+ elif overall_score >= 60:
778
+ overall_feedback = (
779
+ "✅ Good effort! You have a foundation to build on. "
780
+ "Work on the suggested improvements and keep practicing."
781
+ )
782
+ else:
783
+ overall_feedback = (
784
+ "💪 Keep practicing! Public speaking is a skill that improves with practice. "
785
+ "Focus on one improvement area at a time and you'll see progress."
786
+ )
787
+
788
+ # Add default messages if lists are empty
789
+ if not strengths:
790
+ strengths.append("You completed the speech - that takes courage!")
791
+
792
+ if not improvements:
793
+ improvements.append("Keep practicing to maintain your excellent skills")
794
+
795
+ return {
796
+ 'overall_feedback': overall_feedback,
797
+ 'strengths': strengths[:5], # Top 5
798
+ 'improvements': improvements[:5], # Top 5
799
+ 'next_steps': self._generate_next_steps(scores, improvements)
800
+ }
801
+
802
+
803
+ def _generate_next_steps(self, scores: Dict, improvements: List[str]) -> List[str]:
804
+ """Generate actionable next steps"""
805
+ steps = []
806
+
807
+ # Prioritize based on weakest scores
808
+ score_items = [
809
+ ('fluency', scores['fluency']),
810
+ ('confidence', scores['confidence']),
811
+ ('content', scores['content']),
812
+ ('grammar', scores['grammar'])
813
+ ]
814
+ score_items.sort(key=lambda x: x[1])
815
+
816
+ weakest = score_items[0][0]
817
+
818
+ if weakest == 'fluency':
819
+ steps.append("Practice speaking without filler words - try the 'silent pause' technique")
820
+ steps.append("Record yourself daily and track filler word reduction")
821
+ elif weakest == 'confidence':
822
+ steps.append("Work on vocal projection exercises to build confidence")
823
+ steps.append("Practice power poses before speaking to boost confidence")
824
+ elif weakest == 'content':
825
+ steps.append("Build your vocabulary by learning 2-3 power words per week")
826
+ steps.append("Study speeches by great speakers and note their word choices")
827
+ elif weakest == 'grammar':
828
+ steps.append("Review common grammar rules and practice correct phrasing")
829
+ steps.append("Have someone proofread your speeches before delivery")
830
+
831
+ steps.append("Practice this speech 3 more times and compare your progress")
832
+
833
+ return steps[:4]
834
+
835
+
836
# ================= TEST RUNNER =================
if __name__ == "__main__":
    banner = "=" * 60
    print("\n" + banner)
    print("PUBLIC SPEAKING COACH - ENGINE TEST")
    print(banner + "\n")

    test_file = "test_speech.wav"

    # Synthesize a short two-tone clip when no fixture file is available.
    if not os.path.exists(test_file):
        print("⚠️ No test file found. Generating dummy audio...")
        sr = 16000
        duration = 5
        t = np.linspace(0, duration, sr * duration)
        # Simulate speech-like audio with varying frequency
        audio = 0.3 * np.sin(2 * np.pi * 200 * t) + 0.2 * np.sin(2 * np.pi * 300 * t)
        sf.write(test_file, audio, sr)
        print(f"✅ Created {test_file}\n")

    try:
        coach = PublicSpeakingCoach(whisper_model_size="base")
        result = coach.analyze_speech(test_file)

        print("\n" + banner)
        print("ANALYSIS RESULTS")
        print(banner)
        print(json.dumps(result, indent=2, cls=NumpyEncoder))

        print("\n✅ Engine test completed successfully!")
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
main.py CHANGED
@@ -1,105 +1,273 @@
 
 
 
 
 
1
  import os
2
  import shutil
 
 
 
 
3
  import uvicorn
4
- import subprocess
5
- from fastapi import FastAPI, UploadFile, File, HTTPException
6
  from fastapi.middleware.cors import CORSMiddleware
7
- from kid_coach_pipeline import KidCoachEngine
 
8
 
9
- app = FastAPI(title="Public Speaking Coach API")
10
 
 
 
 
 
 
 
 
 
 
11
  app.add_middleware(
12
  CORSMiddleware,
13
- allow_origins=["*"],
 
14
  allow_methods=["*"],
15
  allow_headers=["*"],
16
  )
17
 
18
- # Global Engine Instance
19
- engine = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  @app.on_event("startup")
22
  async def startup_event():
23
- global engine
 
24
 
25
- # 1. Get Token from Secrets
26
- hf_token = os.environ.get("HF_TOKEN")
27
- if not hf_token:
28
- print("❌ CRITICAL: HF_TOKEN not found in environment variables!")
29
 
30
- print("🚀 Initializing KidCoach Engine (Production Mode)...")
31
  try:
32
- engine = KidCoachEngine(hf_token=hf_token)
33
- print("✅ Engine Ready! Waiting for audio...")
 
 
 
 
 
34
  except Exception as e:
35
- print(f"❌ Engine initialization failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- def convert_to_wav(input_path, output_path):
 
 
 
 
 
 
 
 
 
 
 
38
  """
39
- Sanitizes audio for AI processing.
40
- Converts any input (m4a, mp3, webm) to 16kHz Mono WAV.
 
 
 
 
 
 
 
 
 
 
 
41
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  try:
43
- command = [
44
- "ffmpeg",
45
- "-i", input_path,
46
- "-ar", "16000", # Standard AI Sample Rate
47
- "-ac", "1", # Mono
48
- "-c:a", "pcm_s16le", # Raw WAV
49
- output_path,
50
- "-y"
51
- ]
52
- subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
53
- return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  except Exception as e:
55
- print(f"❌ FFmpeg error: {e}")
56
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
- @app.post("/coach")
59
- async def coach_audio(file: UploadFile = File(...)):
60
- global engine
61
- if not engine:
62
- raise HTTPException(status_code=500, detail="AI Engine is not initialized")
63
 
64
- # 1. Save Raw File
65
- raw_filename = f"raw_{file.filename}"
66
- clean_wav_filename = f"clean_{file.filename}.wav"
 
67
 
68
- try:
69
- # Write upload to disk
70
- with open(raw_filename, "wb") as buffer:
71
- shutil.copyfileobj(file.file, buffer)
72
 
73
- # 2. Convert to Pristine WAV
74
- print(f"🔄 Processing file: {file.filename}")
75
- success = convert_to_wav(raw_filename, clean_wav_filename)
76
-
77
- if not success:
78
- raise HTTPException(status_code=400, detail="Audio file unreadable. Please upload MP3, WAV, or M4A.")
79
 
80
- # 3. Run The Full AI Pipeline
81
- # This calls our robust logic in kid_coach_pipeline.py
82
- result = engine.process_pipeline(clean_wav_filename)
 
 
 
 
 
 
 
83
 
84
- if "error" in result:
85
- print(f"Pipeline Error: {result['error']}")
86
- raise HTTPException(status_code=500, detail=result["error"])
87
 
88
- return result
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- except HTTPException as he:
91
- raise he
92
- except Exception as e:
93
- print(f"Server Error: {e}")
94
- raise HTTPException(status_code=500, detail="Internal Processing Error")
95
-
96
- finally:
97
- # 4. Cleanup temp files to save disk space
98
- if os.path.exists(raw_filename):
99
- os.remove(raw_filename)
100
- if os.path.exists(clean_wav_filename):
101
- os.remove(clean_wav_filename)
102
 
103
  if __name__ == "__main__":
104
- # Hugging Face Spaces expects port 7860
105
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
1
+ """
2
+ Production FastAPI Server for Public Speaking Coach
3
+ Handles audio uploads and returns comprehensive analysis
4
+ """
5
+
6
  import os
7
  import shutil
8
+ import tempfile
9
+ from pathlib import Path
10
+ from typing import Optional
11
+
12
  import uvicorn
13
+ from fastapi import FastAPI, UploadFile, File, HTTPException, status
 
14
  from fastapi.middleware.cors import CORSMiddleware
15
+ from fastapi.responses import JSONResponse
16
+ from pydantic import BaseModel
17
 
18
+ from kid_coach_pipeline import PublicSpeakingCoach
19
 
20
# ================= APP CONFIGURATION =================

app = FastAPI(
    title="Public Speaking Coach API",
    description="AI-powered speech analysis and coaching for all ages",
    version="2.0.0",
)

# Wide-open CORS for development; lock allow_origins down to the known
# frontend domains before a production rollout.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Populated by the startup hook; None means analysis is unavailable.
coach_engine: Optional[PublicSpeakingCoach] = None

# File extensions the upload endpoint will accept.
SUPPORTED_FORMATS = {
    '.wav', '.mp3', '.m4a', '.flac', '.ogg',
    '.wma', '.aac', '.mp4', '.webm',
}

# Uploads larger than this byte count are rejected with HTTP 413 (50 MB).
MAX_FILE_SIZE = 50 * 1024 * 1024
48
+
49
+
50
+ # ================= RESPONSE MODELS =================
51
+
52
class HealthResponse(BaseModel):
    """Shape of the `/` and `/health` responses."""
    status: str                # e.g. "online", "healthy" or "degraded"
    engine_loaded: bool        # True once the coach engine finished loading
    supported_formats: list    # accepted upload extensions


class ErrorResponse(BaseModel):
    """Uniform error envelope emitted by the exception handlers."""
    error: str
    detail: Optional[str] = None
63
+
64
+
65
+ # ================= STARTUP/SHUTDOWN =================
66
 
67
@app.on_event("startup")
async def startup_event():
    """Load the coach engine once at server start.

    On failure the server still comes up, but `coach_engine` stays None and
    the analysis endpoints answer 503.
    """
    global coach_engine

    rule = "=" * 60
    print("\n" + rule)
    print("🚀 PUBLIC SPEAKING COACH API - STARTING")
    print(rule)

    try:
        print("\n📦 Loading AI models...")
        coach_engine = PublicSpeakingCoach(whisper_model_size="base")
        print("✅ Coach engine ready!")
        print("\n" + rule)
        print("🎤 API is ready to analyze speeches!")
        print(rule + "\n")
    except Exception as e:
        print(f"\nSTARTUP FAILED: {e}")
        print("Server will start but analysis will not work.\n")
        coach_engine = None
88
+
89
+
90
@app.on_event("shutdown")
async def shutdown_event():
    """Log shutdown; no resources need explicit release."""
    print("\n👋 Shutting down Public Speaking Coach API...")
94
+
95
+
96
+ # ================= ENDPOINTS =================
97
+
98
@app.get("/", response_model=HealthResponse)
async def root():
    """Root endpoint - basic API/engine status."""
    payload = {
        "status": "online",
        "engine_loaded": coach_engine is not None,
        "supported_formats": list(SUPPORTED_FORMATS),
    }
    return payload


@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Liveness probe; 'degraded' means the engine never loaded."""
    payload = {
        "status": "healthy" if coach_engine else "degraded",
        "engine_loaded": coach_engine is not None,
        "supported_formats": list(SUPPORTED_FORMATS),
    }
    return payload
116
+
117
+
118
@app.post("/coach")
async def analyze_speech(file: UploadFile = File(...)):
    """
    Main endpoint: Upload audio file and receive speech analysis

    Args:
        file: Audio file (wav, mp3, m4a, flac, ogg, etc.)

    Returns:
        Comprehensive speech analysis with scores and coaching feedback

    Raises:
        400: Invalid file format or corrupted audio
        413: File too large
        500: Analysis failed
        503: Engine not loaded
    """
    # Refuse early when the model failed to load at startup.
    if coach_engine is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coach engine not initialized. Please contact administrator."
        )

    # Defensive checks on the upload itself.
    if not file:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="No file provided"
        )

    if not file.filename:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid filename"
        )

    # Validate the extension against the allow-list before touching disk.
    file_ext = Path(file.filename).suffix.lower()
    if file_ext not in SUPPORTED_FORMATS:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Unsupported format '{file_ext}'. Supported: {', '.join(SUPPORTED_FORMATS)}"
        )

    temp_file = None

    try:
        # Whole body is read into memory; bounded by MAX_FILE_SIZE below.
        content = await file.read()

        if len(content) > MAX_FILE_SIZE:
            raise HTTPException(
                status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
                detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
            )

        # Keep the original extension so downstream decoders can sniff the
        # container format from the temp-file name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp:
            temp.write(content)
            temp_file = temp.name

        print(f"\n📁 Processing: {file.filename} ({len(content) / 1024:.1f} KB)")

        result = coach_engine.analyze_speech(temp_file)

        # The engine signals decode/analysis problems via an 'error' key.
        if "error" in result:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=result["error"]
            )

        # FIX: use .get() so a result that lacks 'overall_score' cannot turn
        # a successful analysis into a 500 via KeyError in this log line.
        print(f"✅ Analysis complete: Score {result.get('overall_score', 'N/A')}/100")

        return JSONResponse(content=result)

    except HTTPException:
        # Re-raise HTTP exceptions untouched.
        raise

    except Exception as e:
        # Log unexpected errors with full traceback, then map to 500.
        import traceback
        print(f"\n❌ ANALYSIS ERROR:")
        traceback.print_exc()

        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Analysis failed: {str(e)}"
        )

    finally:
        # Best-effort temp cleanup; never mask the real response/exception.
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                print(f"⚠️ Failed to delete temp file: {e}")
227
 
 
 
 
 
 
228
 
229
@app.post("/analyze")
async def analyze_speech_alias(file: UploadFile = File(...)):
    """Compatibility alias: behaves exactly like POST /coach."""
    return await analyze_speech(file)
233
 
 
 
 
 
234
 
235
+ # ================= ERROR HANDLERS =================
 
 
 
 
 
236
 
237
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc):
    """Serialize HTTPExceptions into the API's flat error envelope."""
    body = {
        "error": exc.detail,
        "status_code": exc.status_code,
    }
    return JSONResponse(status_code=exc.status_code, content=body)


@app.exception_handler(Exception)
async def general_exception_handler(request, exc):
    """Last-resort handler: log the traceback, answer with a generic 500."""
    import traceback
    traceback.print_exc()

    return JSONResponse(
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        content={
            "error": "Internal server error",
            "detail": str(exc),
        },
    )
 
263
+
264
+ # ================= MAIN =================
 
 
 
 
 
 
 
 
 
 
265
 
266
if __name__ == "__main__":
    # Local development entry point; containers launch uvicorn directly.
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")
requirements (1).txt ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Dependencies - Tested & Compatible
2
+ torch==2.1.0
3
+ torchaudio==2.1.0
4
+ openai-whisper==20231117
5
+
6
+ # Audio Processing
7
+ librosa==0.10.1
8
+ soundfile==0.12.1
9
+ scipy==1.11.4
10
+ numpy==1.24.3
11
+
12
+ # Text Analysis
13
+ textstat==0.7.3
14
+ language-tool-python==2.8.0
15
+
16
+ # API Framework
17
+ fastapi==0.109.0
18
+ uvicorn[standard]==0.27.0
19
+ python-multipart==0.0.6
20
+
21
+ # LLM Integration (lightweight, no GPU needed)
22
+ transformers==4.36.0
23
+ sentencepiece==0.1.99
24
+
25
+ # Utilities
26
+ pydantic==2.5.3
27
+ python-dotenv==1.0.0