akpande2 committed on
Commit
1a64a8e
·
verified ·
1 Parent(s): a371aa9

Update kid_coach_pipeline.py

Browse files
Files changed (1) hide show
  1. kid_coach_pipeline.py +147 -178
kid_coach_pipeline.py CHANGED
@@ -1,205 +1,174 @@
1
  import os
2
- import torch
3
- import torchaudio
4
  import re
5
  import gc
 
 
6
  import numpy as np
7
- from collections import Counter
8
- from google.colab import files
9
  from faster_whisper import WhisperModel
10
  from pyannote.audio import Pipeline
11
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
12
 
13
- # ================= CONFIGURATION =================
14
- HF_TOKEN = "PASTE_YOUR_TOKEN_HERE"
15
-
16
- # Audio Settings
17
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
18
- COMPUTE_TYPE = "float16" # Use "int8" for T4 GPU
19
-
20
- # LLM Settings (The "Coach")
21
- # We use Phi-3-mini because it's tiny, smart, and fits easily alongside Whisper/Pyannote
22
- # Alternative: "meta-llama/Meta-Llama-3-8B-Instruct" (Requires 16GB VRAM)
23
- LLM_MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
24
-
25
- print(f"Running on: {DEVICE}")
26
-
27
- # ================= 1. ANALYZER ENGINE =================
28
-
29
- class SpeechAnalyzer:
30
- def __init__(self):
31
  self.filler_words = {
32
- 'um', 'uh', 'er', 'ah', 'like', 'you know', 'sort of', 'kind of',
33
- 'basically', 'literally', 'actually', 'mean', 'right', 'okay'
34
  }
35
-
36
- def analyze_transcript(self, transcript):
37
- """Calculates advanced metrics from the raw transcript list."""
 
 
 
38
 
39
- full_text = " ".join([t['text'] for t in transcript])
40
- total_words = len(full_text.split())
41
  if total_words == 0: return None
42
 
43
- # 1. Pace (WPM)
44
- duration = transcript[-1]['end'] - transcript[0]['start']
45
- wpm = (total_words / duration) * 60
46
-
47
- # 2. Filler Word Density
 
 
48
  fillers_found = []
49
- for word in full_text.lower().split():
50
- clean_word = re.sub(r'[^\w\s]', '', word)
51
- if clean_word in self.filler_words:
52
- fillers_found.append(clean_word)
53
-
54
- filler_percentage = (len(fillers_found) / total_words) * 100
55
-
56
- # 3. Silence / Pause Analysis
57
- pauses = []
58
- for i in range(len(transcript) - 1):
59
- gap = transcript[i+1]['start'] - transcript[i]['end']
60
- if gap > 0.5: # Pauses longer than 0.5s
61
- pauses.append(gap)
62
 
63
- avg_pause = np.mean(pauses) if pauses else 0
64
- awkward_silences = len([p for p in pauses if p > 3.0]) # >3s is awkward
65
-
66
- # 4. Repetitive Phrases (N-grams)
67
- words = full_text.lower().split()
68
- bigrams = zip(words, words[1:])
69
- counts = Counter(bigrams)
70
- # Filter for phrases repeated 3+ times
71
- repetitions = [f"{k[0]} {k[1]}" for k, v in counts.items() if v >= 3]
72
-
73
  return {
 
74
  "wpm": round(wpm, 1),
75
- "fillers": fillers_found,
76
- "filler_pct": round(filler_percentage, 1),
77
- "pauses_count": len(pauses),
78
- "avg_pause": round(avg_pause, 2),
79
- "awkward_pauses": awkward_silences,
80
- "repetitions": repetitions,
81
- "full_text": full_text
82
  }
83
 
84
- # ================= 2. PIPELINE LOGIC =================
85
-
86
- def load_llm_coach():
87
- """Loads the LLM in 4-bit mode to save VRAM."""
88
- print(f"\n🧠 Loading AI Coach ({LLM_MODEL_ID})...")
89
-
90
- bnb_config = BitsAndBytesConfig(
91
- load_in_4bit=True,
92
- bnb_4bit_compute_dtype=torch.float16
93
- )
94
-
95
- tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, token=HF_TOKEN)
96
- model = AutoModelForCausalLM.from_pretrained(
97
- LLM_MODEL_ID,
98
- quantization_config=bnb_config,
99
- device_map="auto",
100
- token=HF_TOKEN,
101
- trust_remote_code=True
102
- )
103
- return model, tokenizer
104
-
105
- def generate_coach_feedback(model, tokenizer, metrics):
106
- """Generates human-like feedback using the LLM."""
107
-
108
- prompt = f"""
109
- You are an expert Public Speaking Coach. Analyze the following speech data and give constructive, encouraging, and specific feedback.
110
-
111
- SPEECH DATA:
112
- - Transcript: "{metrics['full_text'][:1000]}..." (truncated)
113
- - Speaking Rate: {metrics['wpm']} Words Per Minute (Ideal is 130-150)
114
- - Filler Words Used: {len(metrics['fillers'])} ({metrics['filler_pct']}%) -> Found: {list(set(metrics['fillers']))}
115
- - Awkward Pauses (>3s): {metrics['awkward_pauses']}
116
- - Repetitive Phrases: {metrics['repetitions']}
117
-
118
- TASK:
119
- 1. Give a score out of 10.
120
- 2. Highlight 2 strengths.
121
- 3. Highlight 2 areas for improvement (specifically regarding pace, fillers, or clarity).
122
- 4. Give one "Pro Tip" for their next speech.
123
-
124
- Keep the tone professional yet encouraging.
125
- """
126
-
127
- messages = [{"role": "user", "content": prompt}]
128
- input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")
129
-
130
- outputs = model.generate(input_ids, max_new_tokens=500, temperature=0.7)
131
- response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
132
- return response
133
-
134
- # ================= 3. MAIN RUNNER =================
135
 
136
- def run_public_speaking_coach(audio_file):
137
- # --- A. SPEECH PROCESSING ---
138
- print("\n[1/3] 🎧 Analyzing Audio (Whisper + Pyannote)...")
139
-
140
- # Load Whisper
141
- asr_model = WhisperModel("large-v3", device=DEVICE, compute_type=COMPUTE_TYPE)
142
-
143
- # Transcribe with Word Timestamps (Crucial for pause detection)
144
- segments, _ = asr_model.transcribe(audio_file, word_timestamps=True, vad_filter=True)
145
-
146
- # Flatten words
147
- all_words = []
148
- for segment in segments:
149
- for word in segment.words:
150
- all_words.append({
151
- "start": word.start,
152
- "end": word.end,
153
- "text": word.word.strip()
154
- })
 
 
 
 
 
 
 
 
 
 
155
 
156
- # Clean up Whisper to free VRAM for the LLM
157
- del asr_model
158
- gc.collect()
159
- torch.cuda.empty_cache()
160
-
161
- # --- B. METRICS ANALYSIS ---
162
- print("[2/3] 📊 Calculating Metrics...")
163
- analyzer = SpeechAnalyzer()
164
- metrics = analyzer.analyze_transcript(all_words)
165
-
166
- if not metrics:
167
- return "Error: No speech detected."
168
 
169
- # --- C. LLM COACHING ---
170
- print("[3/3] 🧠 Generating Feedback...")
171
- llm, tokenizer = load_llm_coach()
172
- feedback = generate_coach_feedback(llm, tokenizer, metrics)
173
-
174
- # Clean up LLM
175
- del llm, tokenizer
176
- gc.collect()
177
- torch.cuda.empty_cache()
178
-
179
- return metrics, feedback
180
 
181
- # ================= EXECUTION =================
182
- if __name__ == "__main__":
183
- if "PASTE" in HF_TOKEN:
184
- print("❌ ERROR: Paste your Hugging Face token at the top.")
185
- else:
186
- print("⬇️ UPLOAD AUDIO FILE ⬇️")
187
- uploaded = files.upload()
188
- filename = list(uploaded.keys())[0]
189
-
190
  try:
191
- metrics, feedback = run_public_speaking_coach(filename)
 
 
 
192
 
193
- print("\n" + "="*50)
194
- print("🎤 SPEECH ANALYSIS REPORT")
195
- print("="*50)
196
- print(f"⏱️ Speaking Rate: {metrics['wpm']} WPM")
197
- print(f"🤐 Silence Score: {metrics['avg_pause']}s avg pause")
198
- print(f"🤔 Fillers: {len(metrics['fillers'])} detected ({metrics['filler_pct']}%)")
199
- print("-" * 50)
200
- print("🤖 COACH FEEDBACK:")
201
- print(feedback)
202
- print("="*50)
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  except Exception as e:
205
- print(f"\n❌ Error: {e}")
 
 
 
1
  import os
 
 
2
  import re
3
  import gc
4
+ import torch
5
+ import torchaudio
6
  import numpy as np
 
 
7
  from faster_whisper import WhisperModel
8
  from pyannote.audio import Pipeline
9
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
10
 
11
class KidCoachEngine:
    """End-to-end speech-coaching pipeline.

    Transcribes an audio file with faster-whisper, computes delivery
    metrics (WPM, filler-word density), optionally counts speakers via
    pyannote diarization, and generates encouraging feedback with a
    4-bit quantized Phi-3 LLM. Heavy models are loaded and unloaded
    per stage to keep peak VRAM low.
    """

    def __init__(self, hf_token: str):
        """
        Args:
            hf_token: Hugging Face access token (needed for the gated
                pyannote diarization model and model downloads).
        """
        self.hf_token = hf_token
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # float16 requires a GPU; int8 keeps CPU inference feasible.
        self.compute_type = "float16" if self.device == "cuda" else "int8"
        self.llm_id = "microsoft/Phi-3-mini-4k-instruct"

        # Filler words database. Multi-word entries ('you know',
        # 'sort of') are matched as phrases against the full text,
        # not token-by-token (see _analyze_text_metrics).
        self.filler_words = {
            'um', 'uh', 'er', 'ah', 'like', 'you know', 'basically',
            'literally', 'actually', 'mean', 'right', 'okay', 'sort of'
        }

    def _analyze_text_metrics(self, transcript_segments):
        """Internal helper to calculate stats.

        Args:
            transcript_segments: chronological list of dicts with
                'start'/'end' (seconds) and 'text' keys.

        Returns:
            Metrics dict, or None when no words were transcribed.
        """
        full_text = " ".join([s['text'] for s in transcript_segments])
        words = full_text.split()
        total_words = len(words)

        if total_words == 0:
            return None

        # Calculate duration: first segment start to last segment end.
        start = transcript_segments[0]['start']
        end = transcript_segments[-1]['end']
        duration = end - start
        wpm = (total_words / duration) * 60 if duration > 0 else 0

        # Filler density.
        # BUGFIX: the original compared each single token against the
        # whole set, so multi-word fillers like 'you know' could never
        # match. Split the set and scan phrases over the full text.
        single_fillers = {f for f in self.filler_words if ' ' not in f}
        phrase_fillers = self.filler_words - single_fillers

        fillers_found = []
        for w in words:
            clean = re.sub(r'[^\w\s]', '', w.lower())
            if clean in single_fillers:
                fillers_found.append(clean)

        clean_text = re.sub(r'[^\w\s]', '', full_text.lower())
        for phrase in phrase_fillers:
            # \b prevents matching inside larger words; count every hit.
            fillers_found.extend(
                re.findall(r'\b' + re.escape(phrase) + r'\b', clean_text)
            )

        return {
            "full_text": full_text,
            "wpm": round(wpm, 1),
            "duration": round(duration, 2),
            "fillers_count": len(fillers_found),
            # sorted() for deterministic output (set order is arbitrary)
            "fillers_list": sorted(set(fillers_found)),
            "filler_pct": round((len(fillers_found) / total_words) * 100, 1)
        }

    def _generate_coaching_feedback(self, metrics):
        """Loads LLM, generates feedback, then unloads it to save RAM.

        Args:
            metrics: dict produced by _analyze_text_metrics.

        Returns:
            Feedback string, or an apologetic error message on failure.
        """
        print("🧠 Loading AI Coach...")
        try:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16
            )
            tokenizer = AutoTokenizer.from_pretrained(self.llm_id, token=self.hf_token)
            model = AutoModelForCausalLM.from_pretrained(
                self.llm_id,
                quantization_config=bnb_config,
                device_map="auto",
                token=self.hf_token,
                trust_remote_code=True
            )

            prompt = f"""
You are a friendly, encouraging Public Speaking Coach for students.

SPEECH DATA:
- Transcript: "{metrics['full_text'][:1500]}..."
- Speed: {metrics['wpm']} WPM (Target: 130-150)
- Filler Words: {metrics['fillers_count']} found ({metrics['filler_pct']}%)

TASK:
1. Give a score out of 10.
2. Mention 2 things they did great.
3. Mention 1 thing to practice (Speed, Fillers, or Clarity).
4. Give a fun "Pro Tip".

Keep it short, motivating, and easy to read.
"""

            messages = [{"role": "user", "content": prompt}]
            input_ids = tokenizer.apply_chat_template(
                messages, return_tensors="pt", add_generation_prompt=True
            ).to(self.device)

            # BUGFIX: temperature is ignored under greedy decoding;
            # do_sample=True makes the 0.7 temperature actually apply.
            outputs = model.generate(
                input_ids, max_new_tokens=500, temperature=0.7, do_sample=True
            )
            feedback = tokenizer.decode(
                outputs[0][input_ids.shape[-1]:], skip_special_tokens=True
            )

            # CLEANUP LLM IMMEDIATELY to free VRAM for later requests.
            del model, tokenizer
            gc.collect()
            torch.cuda.empty_cache()

            return feedback

        except Exception as e:
            return f"Coach is taking a nap (LLM Error): {str(e)}"

    def process_pipeline(self, audio_path):
        """The Main Function called by API.

        Args:
            audio_path: path to the uploaded audio file.

        Returns:
            dict with 'transcript', 'stats' and 'coach_feedback', or
            {'error': ...} on failure (never raises).
        """
        if not self.hf_token:
            return {"error": "HF_TOKEN missing in server secrets"}

        try:
            # 1. TRANSCRIPTION (Faster-Whisper)
            print("🎧 Transcribing...")
            asr = WhisperModel("large-v3", device=self.device, compute_type=self.compute_type)
            segments, _ = asr.transcribe(audio_path, word_timestamps=True, vad_filter=True)

            transcript_data = []
            for s in segments:
                # We save detailed word data for future timeline mapping if needed
                transcript_data.append({
                    "start": s.start,
                    "end": s.end,
                    "text": s.text.strip()
                })

            # Cleanup Whisper before loading the next model.
            del asr
            gc.collect()
            torch.cuda.empty_cache()

            if not transcript_data:
                return {"error": "No speech detected in audio."}

            # 2. METRICS
            print("📊 Analyzing...")
            metrics = self._analyze_text_metrics(transcript_data)
            # BUGFIX: segments can exist yet contain only empty text, in
            # which case metrics is None; the original crashed below.
            if not metrics:
                return {"error": "No speech detected in audio."}

            # 3. DIARIZATION (Optional Check)
            # We run a quick check to see if there are multiple speakers.
            # Note: We load/unload this to save VRAM.
            print("🗣️ Checking Speakers...")
            try:
                diar = Pipeline.from_pretrained(
                    "pyannote/speaker-diarization-3.1", use_auth_token=self.hf_token
                )
                diar.to(torch.device(self.device))
                wav, sr = torchaudio.load(audio_path)
                d_result = diar({"waveform": wav, "sample_rate": sr})
                speaker_count = len(d_result.labels())
                del diar
                gc.collect()
                torch.cuda.empty_cache()
            # BUGFIX: bare `except:` also swallowed KeyboardInterrupt/SystemExit.
            except Exception:
                speaker_count = 1  # Fallback if Diarization fails

            metrics["speaker_count"] = speaker_count

            # 4. LLM COACH
            print("🧠 Coaching...")
            feedback = self._generate_coaching_feedback(metrics)

            # Final Result Construction
            return {
                "transcript": metrics['full_text'],
                "stats": {
                    "wpm": metrics['wpm'],
                    "duration": metrics['duration'],
                    "fillers_count": metrics['fillers_count'],
                    "filler_percentage": metrics['filler_pct'],
                    "speakers_detected": speaker_count
                },
                "coach_feedback": feedback
            }

        except Exception as e:
            import traceback
            traceback.print_exc()
            return {"error": str(e)}