Spaces:
Sleeping
Sleeping
Update kid_coach_pipeline.py
Browse files- kid_coach_pipeline.py +147 -178
kid_coach_pipeline.py
CHANGED
|
@@ -1,205 +1,174 @@
|
|
| 1 |
import os
|
| 2 |
-
import torch
|
| 3 |
-
import torchaudio
|
| 4 |
import re
|
| 5 |
import gc
|
|
|
|
|
|
|
| 6 |
import numpy as np
|
| 7 |
-
from collections import Counter
|
| 8 |
-
from google.colab import files
|
| 9 |
from faster_whisper import WhisperModel
|
| 10 |
from pyannote.audio import Pipeline
|
| 11 |
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
#
|
| 21 |
-
# We use Phi-3-mini because it's tiny, smart, and fits easily alongside Whisper/Pyannote
|
| 22 |
-
# Alternative: "meta-llama/Meta-Llama-3-8B-Instruct" (Requires 16GB VRAM)
|
| 23 |
-
LLM_MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
|
| 24 |
-
|
| 25 |
-
print(f"Running on: {DEVICE}")
|
| 26 |
-
|
| 27 |
-
# ================= 1. ANALYZER ENGINE =================
|
| 28 |
-
|
| 29 |
-
class SpeechAnalyzer:
|
| 30 |
-
def __init__(self):
|
| 31 |
self.filler_words = {
|
| 32 |
-
'um', 'uh', 'er', 'ah', 'like', 'you know', '
|
| 33 |
-
'
|
| 34 |
}
|
| 35 |
-
|
| 36 |
-
def
|
| 37 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
full_text = " ".join([t['text'] for t in transcript])
|
| 40 |
-
total_words = len(full_text.split())
|
| 41 |
if total_words == 0: return None
|
| 42 |
|
| 43 |
-
#
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
| 48 |
fillers_found = []
|
| 49 |
-
for
|
| 50 |
-
|
| 51 |
-
if
|
| 52 |
-
fillers_found.append(
|
| 53 |
-
|
| 54 |
-
filler_percentage = (len(fillers_found) / total_words) * 100
|
| 55 |
-
|
| 56 |
-
# 3. Silence / Pause Analysis
|
| 57 |
-
pauses = []
|
| 58 |
-
for i in range(len(transcript) - 1):
|
| 59 |
-
gap = transcript[i+1]['start'] - transcript[i]['end']
|
| 60 |
-
if gap > 0.5: # Pauses longer than 0.5s
|
| 61 |
-
pauses.append(gap)
|
| 62 |
|
| 63 |
-
avg_pause = np.mean(pauses) if pauses else 0
|
| 64 |
-
awkward_silences = len([p for p in pauses if p > 3.0]) # >3s is awkward
|
| 65 |
-
|
| 66 |
-
# 4. Repetitive Phrases (N-grams)
|
| 67 |
-
words = full_text.lower().split()
|
| 68 |
-
bigrams = zip(words, words[1:])
|
| 69 |
-
counts = Counter(bigrams)
|
| 70 |
-
# Filter for phrases repeated 3+ times
|
| 71 |
-
repetitions = [f"{k[0]} {k[1]}" for k, v in counts.items() if v >= 3]
|
| 72 |
-
|
| 73 |
return {
|
|
|
|
| 74 |
"wpm": round(wpm, 1),
|
| 75 |
-
"
|
| 76 |
-
"
|
| 77 |
-
"
|
| 78 |
-
"
|
| 79 |
-
"awkward_pauses": awkward_silences,
|
| 80 |
-
"repetitions": repetitions,
|
| 81 |
-
"full_text": full_text
|
| 82 |
}
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
token=HF_TOKEN,
|
| 101 |
-
trust_remote_code=True
|
| 102 |
-
)
|
| 103 |
-
return model, tokenizer
|
| 104 |
-
|
| 105 |
-
def generate_coach_feedback(model, tokenizer, metrics):
|
| 106 |
-
"""Generates human-like feedback using the LLM."""
|
| 107 |
-
|
| 108 |
-
prompt = f"""
|
| 109 |
-
You are an expert Public Speaking Coach. Analyze the following speech data and give constructive, encouraging, and specific feedback.
|
| 110 |
-
|
| 111 |
-
SPEECH DATA:
|
| 112 |
-
- Transcript: "{metrics['full_text'][:1000]}..." (truncated)
|
| 113 |
-
- Speaking Rate: {metrics['wpm']} Words Per Minute (Ideal is 130-150)
|
| 114 |
-
- Filler Words Used: {len(metrics['fillers'])} ({metrics['filler_pct']}%) -> Found: {list(set(metrics['fillers']))}
|
| 115 |
-
- Awkward Pauses (>3s): {metrics['awkward_pauses']}
|
| 116 |
-
- Repetitive Phrases: {metrics['repetitions']}
|
| 117 |
-
|
| 118 |
-
TASK:
|
| 119 |
-
1. Give a score out of 10.
|
| 120 |
-
2. Highlight 2 strengths.
|
| 121 |
-
3. Highlight 2 areas for improvement (specifically regarding pace, fillers, or clarity).
|
| 122 |
-
4. Give one "Pro Tip" for their next speech.
|
| 123 |
-
|
| 124 |
-
Keep the tone professional yet encouraging.
|
| 125 |
-
"""
|
| 126 |
-
|
| 127 |
-
messages = [{"role": "user", "content": prompt}]
|
| 128 |
-
input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")
|
| 129 |
-
|
| 130 |
-
outputs = model.generate(input_ids, max_new_tokens=500, temperature=0.7)
|
| 131 |
-
response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
|
| 132 |
-
return response
|
| 133 |
-
|
| 134 |
-
# ================= 3. MAIN RUNNER =================
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
gc.collect()
|
| 159 |
-
torch.cuda.empty_cache()
|
| 160 |
-
|
| 161 |
-
# --- B. METRICS ANALYSIS ---
|
| 162 |
-
print("[2/3] 📊 Calculating Metrics...")
|
| 163 |
-
analyzer = SpeechAnalyzer()
|
| 164 |
-
metrics = analyzer.analyze_transcript(all_words)
|
| 165 |
-
|
| 166 |
-
if not metrics:
|
| 167 |
-
return "Error: No speech detected."
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
# Clean up LLM
|
| 175 |
-
del llm, tokenizer
|
| 176 |
-
gc.collect()
|
| 177 |
-
torch.cuda.empty_cache()
|
| 178 |
-
|
| 179 |
-
return metrics, feedback
|
| 180 |
|
| 181 |
-
# ================= EXECUTION =================
|
| 182 |
-
if __name__ == "__main__":
|
| 183 |
-
if "PASTE" in HF_TOKEN:
|
| 184 |
-
print("❌ ERROR: Paste your Hugging Face token at the top.")
|
| 185 |
-
else:
|
| 186 |
-
print("⬇️ UPLOAD AUDIO FILE ⬇️")
|
| 187 |
-
uploaded = files.upload()
|
| 188 |
-
filename = list(uploaded.keys())[0]
|
| 189 |
-
|
| 190 |
try:
|
| 191 |
-
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
print(feedback)
|
| 202 |
-
print("="*50)
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
except Exception as e:
|
| 205 |
-
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
|
|
|
|
|
|
| 2 |
import re
|
| 3 |
import gc
|
| 4 |
+
import torch
|
| 5 |
+
import torchaudio
|
| 6 |
import numpy as np
|
|
|
|
|
|
|
| 7 |
from faster_whisper import WhisperModel
|
| 8 |
from pyannote.audio import Pipeline
|
| 9 |
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
| 10 |
|
| 11 |
+
class KidCoachEngine:
    """Speech-coaching pipeline for students.

    Stages (see process_pipeline):
      1. Transcribe audio with faster-whisper.
      2. Compute delivery metrics (WPM, duration, filler-word density).
      3. Count speakers via pyannote diarization (best-effort).
      4. Generate coaching feedback with a 4-bit quantized Phi-3-mini LLM.

    Each heavy model is loaded, used, and freed immediately so peak
    VRAM/RAM stays low enough for small hosted GPUs.
    """

    def __init__(self, hf_token: str):
        # Token is required for the gated repos (pyannote diarization, LLM).
        self.hf_token = hf_token
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # float16 on GPU; int8 keeps faster-whisper usable on CPU-only hosts.
        self.compute_type = "float16" if self.device == "cuda" else "int8"
        self.llm_id = "microsoft/Phi-3-mini-4k-instruct"

        # Filler words database (single tokens and multi-word phrases).
        self.filler_words = {
            'um', 'uh', 'er', 'ah', 'like', 'you know', 'basically',
            'literally', 'actually', 'mean', 'right', 'okay', 'sort of'
        }

    def _analyze_text_metrics(self, transcript_segments):
        """Internal helper to calculate stats.

        transcript_segments: list of {"start": float, "end": float, "text": str}
        (as built by process_pipeline). Returns a metrics dict, or None when
        the segments contain no words.
        """
        full_text = " ".join([s['text'] for s in transcript_segments])
        words = full_text.split()
        total_words = len(words)
        if total_words == 0: return None

        # Calculate Duration (first segment start -> last segment end).
        start = transcript_segments[0]['start']
        end = transcript_segments[-1]['end']
        duration = end - start
        wpm = (total_words / duration) * 60 if duration > 0 else 0

        # Filler Density. Strip punctuation so "um," still matches "um".
        clean_words = [re.sub(r'[^\w\s]', '', w.lower()) for w in words]
        single_fillers = {f for f in self.filler_words if ' ' not in f}
        multi_fillers = [f for f in self.filler_words if ' ' in f]

        fillers_found = [w for w in clean_words if w in single_fillers]
        # FIX: multi-word fillers ('you know', 'sort of') can never equal a
        # single token, so match them against the joined text using word
        # boundaries (\b stops 'resort of' from matching 'sort of').
        clean_text = " ".join(clean_words)
        for phrase in multi_fillers:
            fillers_found.extend(
                re.findall(r'\b' + re.escape(phrase) + r'\b', clean_text)
            )

        return {
            "full_text": full_text,
            "wpm": round(wpm, 1),
            "duration": round(duration, 2),
            "fillers_count": len(fillers_found),
            "fillers_list": list(set(fillers_found)),
            "filler_pct": round((len(fillers_found) / total_words) * 100, 1)
        }

    def _generate_coaching_feedback(self, metrics):
        """Loads LLM, generates feedback, then unloads it to save RAM."""
        print("🧠 Loading AI Coach...")
        try:
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16
            )
            tokenizer = AutoTokenizer.from_pretrained(self.llm_id, token=self.hf_token)
            model = AutoModelForCausalLM.from_pretrained(
                self.llm_id,
                quantization_config=bnb_config,
                device_map="auto",
                token=self.hf_token,
                trust_remote_code=True
            )

            prompt = f"""
You are a friendly, encouraging Public Speaking Coach for students.

SPEECH DATA:
- Transcript: "{metrics['full_text'][:1500]}..."
- Speed: {metrics['wpm']} WPM (Target: 130-150)
- Filler Words: {metrics['fillers_count']} found ({metrics['filler_pct']}%)

TASK:
1. Give a score out of 10.
2. Mention 2 things they did great.
3. Mention 1 thing to practice (Speed, Fillers, or Clarity).
4. Give a fun "Pro Tip".

Keep it short, motivating, and easy to read.
"""

            messages = [{"role": "user", "content": prompt}]
            input_ids = tokenizer.apply_chat_template(
                messages, return_tensors="pt", add_generation_prompt=True
            ).to(self.device)

            # FIX: transformers ignores `temperature` (with a warning) under
            # greedy decoding; sampling must be requested explicitly.
            outputs = model.generate(
                input_ids, max_new_tokens=500, temperature=0.7, do_sample=True
            )
            feedback = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

            # CLEANUP LLM IMMEDIATELY
            del model, tokenizer
            gc.collect()
            torch.cuda.empty_cache()

            return feedback

        except Exception as e:
            # Feedback is best-effort: report the failure rather than abort
            # the whole pipeline.
            return f"Coach is taking a nap (LLM Error): {str(e)}"

    def process_pipeline(self, audio_path):
        """The Main Function called by API.

        Returns a JSON-serializable dict: {"error": ...} on failure, else
        {"transcript", "stats", "coach_feedback"}.
        """
        if not self.hf_token:
            return {"error": "HF_TOKEN missing in server secrets"}

        try:
            # 1. TRANSCRIPTION (Faster-Whisper)
            print("🎧 Transcribing...")
            asr = WhisperModel("large-v3", device=self.device, compute_type=self.compute_type)
            segments, _ = asr.transcribe(audio_path, word_timestamps=True, vad_filter=True)

            transcript_data = []
            for s in segments:
                # We save detailed word data for future timeline mapping if needed
                transcript_data.append({
                    "start": s.start,
                    "end": s.end,
                    "text": s.text.strip()
                })

            # Cleanup Whisper
            del asr
            gc.collect()
            torch.cuda.empty_cache()

            if not transcript_data:
                return {"error": "No speech detected in audio."}

            # 2. METRICS
            print("📊 Analyzing...")
            metrics = self._analyze_text_metrics(transcript_data)
            # FIX: segments can exist yet contain no words; the analyzer
            # returns None then, and metrics["speaker_count"] below would
            # raise TypeError instead of a clean error payload.
            if not metrics:
                return {"error": "No speech detected in audio."}

            # 3. DIARIZATION (Optional Check)
            # We run a quick check to see if there are multiple speakers.
            # Note: We load/unload this to save VRAM.
            print("🗣️ Checking Speakers...")
            try:
                diar = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=self.hf_token)
                diar.to(torch.device(self.device))
                wav, sr = torchaudio.load(audio_path)
                d_result = diar({"waveform": wav, "sample_rate": sr})
                speaker_count = len(d_result.labels())
                del diar
                gc.collect()
                torch.cuda.empty_cache()
            except Exception:
                # FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt / SystemExit.
                speaker_count = 1  # Fallback if Diarization fails

            metrics["speaker_count"] = speaker_count

            # 4. LLM COACH
            print("🧠 Coaching...")
            feedback = self._generate_coaching_feedback(metrics)

            # Final Result Construction
            return {
                "transcript": metrics['full_text'],
                "stats": {
                    "wpm": metrics['wpm'],
                    "duration": metrics['duration'],
                    "fillers_count": metrics['fillers_count'],
                    "filler_percentage": metrics['filler_pct'],
                    "speakers_detected": speaker_count
                },
                "coach_feedback": feedback
            }

        except Exception as e:
            import traceback
            traceback.print_exc()
            return {"error": str(e)}
|