Spaces:
Sleeping
Sleeping
Update kid_coach_pipeline.py
Browse files- kid_coach_pipeline.py +147 -178
kid_coach_pipeline.py
CHANGED
|
@@ -1,205 +1,174 @@
|
|
| 1 |
import os
|
| 2 |
-
import torch
|
| 3 |
-
import torchaudio
|
| 4 |
import re
|
| 5 |
import gc
|
|
|
|
|
|
|
| 6 |
import numpy as np
|
| 7 |
-
from collections import Counter
|
| 8 |
-
from google.colab import files
|
| 9 |
from faster_whisper import WhisperModel
|
| 10 |
from pyannote.audio import Pipeline
|
| 11 |
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
#
|
| 21 |
-
# We use Phi-3-mini because it's tiny, smart, and fits easily alongside Whisper/Pyannote
|
| 22 |
-
# Alternative: "meta-llama/Meta-Llama-3-8B-Instruct" (Requires 16GB VRAM)
|
| 23 |
-
LLM_MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
|
| 24 |
-
|
| 25 |
-
print(f"Running on: {DEVICE}")
|
| 26 |
-
|
| 27 |
-
# ================= 1. ANALYZER ENGINE =================
|
| 28 |
-
|
| 29 |
-
class SpeechAnalyzer:
|
| 30 |
-
def __init__(self):
|
| 31 |
self.filler_words = {
|
| 32 |
-
'um', 'uh', 'er', 'ah', 'like', 'you know', '
|
| 33 |
-
'
|
| 34 |
}
|
| 35 |
-
|
| 36 |
-
def
|
| 37 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
full_text = " ".join([t['text'] for t in transcript])
|
| 40 |
-
total_words = len(full_text.split())
|
| 41 |
if total_words == 0: return None
|
| 42 |
|
| 43 |
-
#
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
| 48 |
fillers_found = []
|
| 49 |
-
for
|
| 50 |
-
|
| 51 |
-
if
|
| 52 |
-
fillers_found.append(
|
| 53 |
-
|
| 54 |
-
filler_percentage = (len(fillers_found) / total_words) * 100
|
| 55 |
-
|
| 56 |
-
# 3. Silence / Pause Analysis
|
| 57 |
-
pauses = []
|
| 58 |
-
for i in range(len(transcript) - 1):
|
| 59 |
-
gap = transcript[i+1]['start'] - transcript[i]['end']
|
| 60 |
-
if gap > 0.5: # Pauses longer than 0.5s
|
| 61 |
-
pauses.append(gap)
|
| 62 |
|
| 63 |
-
avg_pause = np.mean(pauses) if pauses else 0
|
| 64 |
-
awkward_silences = len([p for p in pauses if p > 3.0]) # >3s is awkward
|
| 65 |
-
|
| 66 |
-
# 4. Repetitive Phrases (N-grams)
|
| 67 |
-
words = full_text.lower().split()
|
| 68 |
-
bigrams = zip(words, words[1:])
|
| 69 |
-
counts = Counter(bigrams)
|
| 70 |
-
# Filter for phrases repeated 3+ times
|
| 71 |
-
repetitions = [f"{k[0]} {k[1]}" for k, v in counts.items() if v >= 3]
|
| 72 |
-
|
| 73 |
return {
|
|
|
|
| 74 |
"wpm": round(wpm, 1),
|
| 75 |
-
"
|
| 76 |
-
"
|
| 77 |
-
"
|
| 78 |
-
"
|
| 79 |
-
"awkward_pauses": awkward_silences,
|
| 80 |
-
"repetitions": repetitions,
|
| 81 |
-
"full_text": full_text
|
| 82 |
}
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
token=HF_TOKEN,
|
| 101 |
-
trust_remote_code=True
|
| 102 |
-
)
|
| 103 |
-
return model, tokenizer
|
| 104 |
-
|
| 105 |
-
def generate_coach_feedback(model, tokenizer, metrics):
|
| 106 |
-
"""Generates human-like feedback using the LLM."""
|
| 107 |
-
|
| 108 |
-
prompt = f"""
|
| 109 |
-
You are an expert Public Speaking Coach. Analyze the following speech data and give constructive, encouraging, and specific feedback.
|
| 110 |
-
|
| 111 |
-
SPEECH DATA:
|
| 112 |
-
- Transcript: "{metrics['full_text'][:1000]}..." (truncated)
|
| 113 |
-
- Speaking Rate: {metrics['wpm']} Words Per Minute (Ideal is 130-150)
|
| 114 |
-
- Filler Words Used: {len(metrics['fillers'])} ({metrics['filler_pct']}%) -> Found: {list(set(metrics['fillers']))}
|
| 115 |
-
- Awkward Pauses (>3s): {metrics['awkward_pauses']}
|
| 116 |
-
- Repetitive Phrases: {metrics['repetitions']}
|
| 117 |
-
|
| 118 |
-
TASK:
|
| 119 |
-
1. Give a score out of 10.
|
| 120 |
-
2. Highlight 2 strengths.
|
| 121 |
-
3. Highlight 2 areas for improvement (specifically regarding pace, fillers, or clarity).
|
| 122 |
-
4. Give one "Pro Tip" for their next speech.
|
| 123 |
-
|
| 124 |
-
Keep the tone professional yet encouraging.
|
| 125 |
-
"""
|
| 126 |
-
|
| 127 |
-
messages = [{"role": "user", "content": prompt}]
|
| 128 |
-
input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")
|
| 129 |
-
|
| 130 |
-
outputs = model.generate(input_ids, max_new_tokens=500, temperature=0.7)
|
| 131 |
-
response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
|
| 132 |
-
return response
|
| 133 |
-
|
| 134 |
-
# ================= 3. MAIN RUNNER =================
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
gc.collect()
|
| 159 |
-
torch.cuda.empty_cache()
|
| 160 |
-
|
| 161 |
-
# --- B. METRICS ANALYSIS ---
|
| 162 |
-
print("[2/3] 📊 Calculating Metrics...")
|
| 163 |
-
analyzer = SpeechAnalyzer()
|
| 164 |
-
metrics = analyzer.analyze_transcript(all_words)
|
| 165 |
-
|
| 166 |
-
if not metrics:
|
| 167 |
-
return "Error: No speech detected."
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
# Clean up LLM
|
| 175 |
-
del llm, tokenizer
|
| 176 |
-
gc.collect()
|
| 177 |
-
torch.cuda.empty_cache()
|
| 178 |
-
|
| 179 |
-
return metrics, feedback
|
| 180 |
|
| 181 |
-
# ================= EXECUTION =================
|
| 182 |
-
if __name__ == "__main__":
|
| 183 |
-
if "PASTE" in HF_TOKEN:
|
| 184 |
-
print("❌ ERROR: Paste your Hugging Face token at the top.")
|
| 185 |
-
else:
|
| 186 |
-
print("⬇️ UPLOAD AUDIO FILE ⬇️")
|
| 187 |
-
uploaded = files.upload()
|
| 188 |
-
filename = list(uploaded.keys())[0]
|
| 189 |
-
|
| 190 |
try:
|
| 191 |
-
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
print(feedback)
|
| 202 |
-
print("="*50)
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
except Exception as e:
|
| 205 |
-
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
|
|
|
|
|
|
| 2 |
import re
|
| 3 |
import gc
|
| 4 |
+
import torch
|
| 5 |
+
import torchaudio
|
| 6 |
import numpy as np
|
|
|
|
|
|
|
| 7 |
from faster_whisper import WhisperModel
|
| 8 |
from pyannote.audio import Pipeline
|
| 9 |
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
| 10 |
|
| 11 |
+
class KidCoachEngine:
    """Speech-coaching pipeline for students.

    Stages (see process_pipeline):
      1. Transcribe audio with faster-whisper.
      2. Compute delivery metrics (WPM, duration, filler-word density).
      3. Count speakers via pyannote diarization (best-effort).
      4. Generate coaching feedback with a 4-bit quantized Phi-3-mini LLM.

    Each heavy model is loaded, used, and freed immediately so peak
    VRAM/RAM stays low enough for small hosted GPUs.
    """

    def __init__(self, hf_token: str):
        # Token is required for the gated repos (pyannote diarization, LLM).
        self.hf_token = hf_token
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # float16 on GPU; int8 keeps faster-whisper usable on CPU-only hosts.
        self.compute_type = "float16" if self.device == "cuda" else "int8"
        self.llm_id = "microsoft/Phi-3-mini-4k-instruct"

        # Filler words database (single tokens and multi-word phrases).
        self.filler_words = {
            'um', 'uh', 'er', 'ah', 'like', 'you know', 'basically',
            'literally', 'actually', 'mean', 'right', 'okay', 'sort of'
        }

    def _analyze_text_metrics(self, transcript_segments):
        """Internal helper to calculate stats.

        transcript_segments: list of {"start": float, "end": float, "text": str}
        (as built by process_pipeline). Returns a metrics dict, or None when
        the segments contain no words.
        """
        full_text = " ".join([s['text'] for s in transcript_segments])
        words = full_text.split()
        total_words = len(words)
        if total_words == 0: return None

        # Calculate Duration (first segment start -> last segment end).
        start = transcript_segments[0]['start']
        end = transcript_segments[-1]['end']
        duration = end - start
        wpm = (total_words / duration) * 60 if duration > 0 else 0

        # Filler Density. Strip punctuation so "um," still matches "um".
        clean_words = [re.sub(r'[^\w\s]', '', w.lower()) for w in words]
        single_fillers = {f for f in self.filler_words if ' ' not in f}
        multi_fillers = [f for f in self.filler_words if ' ' in f]

        fillers_found = [w for w in clean_words if w in single_fillers]
        # FIX: multi-word fillers ('you know', 'sort of') can never equal a
        # single token, so match them against the joined text using word
        # boundaries (\b stops 'resort of' from matching 'sort of').
        clean_text = " ".join(clean_words)
        for phrase in multi_fillers:
            fillers_found.extend(
                re.findall(r'\b' + re.escape(phrase) + r'\b', clean_text)
            )

        return {
            "full_text": full_text,
            "wpm": round(wpm, 1),
            "duration": round(duration, 2),
            "fillers_count": len(fillers_found),
            "fillers_list": list(set(fillers_found)),
            "filler_pct": round((len(fillers_found) / total_words) * 100, 1)
        }

    def _generate_coaching_feedback(self, metrics):
        """Loads LLM, generates feedback, then unloads it to save RAM."""
        print("🧠 Loading AI Coach...")
        try:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16
            )
            tokenizer = AutoTokenizer.from_pretrained(self.llm_id, token=self.hf_token)
            model = AutoModelForCausalLM.from_pretrained(
                self.llm_id,
                quantization_config=bnb_config,
                device_map="auto",
                token=self.hf_token,
                trust_remote_code=True
            )

            prompt = f"""
You are a friendly, encouraging Public Speaking Coach for students.

SPEECH DATA:
- Transcript: "{metrics['full_text'][:1500]}..."
- Speed: {metrics['wpm']} WPM (Target: 130-150)
- Filler Words: {metrics['fillers_count']} found ({metrics['filler_pct']}%)

TASK:
1. Give a score out of 10.
2. Mention 2 things they did great.
3. Mention 1 thing to practice (Speed, Fillers, or Clarity).
4. Give a fun "Pro Tip".

Keep it short, motivating, and easy to read.
"""

            messages = [{"role": "user", "content": prompt}]
            input_ids = tokenizer.apply_chat_template(
                messages, return_tensors="pt", add_generation_prompt=True
            ).to(self.device)

            # FIX: transformers ignores `temperature` (with a warning) under
            # greedy decoding; sampling must be requested explicitly.
            outputs = model.generate(
                input_ids, max_new_tokens=500, temperature=0.7, do_sample=True
            )
            feedback = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

            # CLEANUP LLM IMMEDIATELY
            del model, tokenizer
            gc.collect()
            torch.cuda.empty_cache()

            return feedback

        except Exception as e:
            # Feedback is best-effort: report the failure rather than abort
            # the whole pipeline.
            return f"Coach is taking a nap (LLM Error): {str(e)}"

    def process_pipeline(self, audio_path):
        """The Main Function called by API.

        Returns a JSON-serializable dict: {"error": ...} on failure, else
        {"transcript", "stats", "coach_feedback"}.
        """
        if not self.hf_token:
            return {"error": "HF_TOKEN missing in server secrets"}

        try:
            # 1. TRANSCRIPTION (Faster-Whisper)
            print("🎧 Transcribing...")
            asr = WhisperModel("large-v3", device=self.device, compute_type=self.compute_type)
            segments, _ = asr.transcribe(audio_path, word_timestamps=True, vad_filter=True)

            transcript_data = []
            for s in segments:
                # We save detailed word data for future timeline mapping if needed
                transcript_data.append({
                    "start": s.start,
                    "end": s.end,
                    "text": s.text.strip()
                })

            # Cleanup Whisper
            del asr
            gc.collect()
            torch.cuda.empty_cache()

            if not transcript_data:
                return {"error": "No speech detected in audio."}

            # 2. METRICS
            print("📊 Analyzing...")
            metrics = self._analyze_text_metrics(transcript_data)
            # FIX: segments can exist yet contain no words; the analyzer
            # returns None then, and metrics["speaker_count"] below would
            # raise TypeError instead of a clean error payload.
            if not metrics:
                return {"error": "No speech detected in audio."}

            # 3. DIARIZATION (Optional Check)
            # We run a quick check to see if there are multiple speakers.
            # Note: We load/unload this to save VRAM.
            print("🗣️ Checking Speakers...")
            try:
                diar = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=self.hf_token)
                diar.to(torch.device(self.device))
                wav, sr = torchaudio.load(audio_path)
                d_result = diar({"waveform": wav, "sample_rate": sr})
                speaker_count = len(d_result.labels())
                del diar
                gc.collect()
                torch.cuda.empty_cache()
            except Exception:
                # FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt / SystemExit.
                speaker_count = 1  # Fallback if Diarization fails

            metrics["speaker_count"] = speaker_count

            # 4. LLM COACH
            print("🧠 Coaching...")
            feedback = self._generate_coaching_feedback(metrics)

            # Final Result Construction
            return {
                "transcript": metrics['full_text'],
                "stats": {
                    "wpm": metrics['wpm'],
                    "duration": metrics['duration'],
                    "fillers_count": metrics['fillers_count'],
                    "filler_percentage": metrics['filler_pct'],
                    "speakers_detected": speaker_count
                },
                "coach_feedback": feedback
            }

        except Exception as e:
            import traceback
            traceback.print_exc()
            return {"error": str(e)}
|