akpande2 commited on
Commit
1ba09d2
·
verified ·
1 Parent(s): 8e3cd10

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +38 -33
  2. kid_coach_pipeline.py +845 -150
  3. main.py +239 -71
  4. requirements (1).txt +27 -0
Dockerfile CHANGED
@@ -1,42 +1,47 @@
1
- # Use NVIDIA CUDA 11.8 base
2
- FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
3
-
4
- # Setup environment
5
- ENV DEBIAN_FRONTEND=noninteractive \
6
- PYTHONUNBUFFERED=1 \
7
- PIP_CACHE_DIR=/var/cache/pip
8
-
9
- # 1. Install System Dependencies (FFmpeg is still needed for converting audio)
10
- RUN apt-get update && apt-get install -y --no-install-recommends \
11
- python3.10 \
12
- python3-pip \
13
- python3-dev \
14
  ffmpeg \
 
15
  git \
16
- wget \
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
- # Set python3.10 as default
20
- RUN ln -s /usr/bin/python3.10 /usr/bin/python
21
-
22
- # 2. Setup User
23
  WORKDIR /app
24
- RUN useradd -m -u 1000 user
25
- RUN chown -R user:user /app
26
- USER user
27
- ENV PATH="/home/user/.local/bin:$PATH"
28
 
29
- # 3. Install Python Dependencies
30
- COPY --chown=user requirements.txt requirements.txt
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- RUN pip install --no-cache-dir --upgrade pip && \
33
- # Install Torch first to handle the index URL correctly
34
- pip install --no-cache-dir "torch==2.1.2" "torchaudio==2.1.2" --index-url https://download.pytorch.org/whl/cu118 && \
35
- # Install the rest
36
- pip install --no-cache-dir -r requirements.txt
37
 
38
- # 4. Copy Code
39
- COPY --chown=user . .
 
40
 
41
- # 5. Launch
42
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
# Production Dockerfile for Public Speaking Coach API
# Optimized for Hugging Face Spaces or any cloud deployment

FROM python:3.10-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Install system dependencies.
# --no-install-recommends keeps the image slim (ffmpeg pulls large extras otherwise).
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    git \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy requirements first (for better layer caching)
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Pre-download LanguageTool data (for grammar checking); best-effort, never fails the build
RUN python -c "import language_tool_python; language_tool_python.LanguageTool('en-US')" || true

# Copy application code
COPY kid_coach_pipeline.py .
COPY main.py .

# Create directory for temporary files
RUN mkdir -p /tmp/uploads

# Expose port
EXPOSE 7860

# Health check.
# Fix: the original `requests.get(...)` alone only failed on connection errors —
# a 5xx from /health still exited 0. raise_for_status() makes non-2xx fail the check,
# and a timeout prevents the check from hanging past Docker's own timeout.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import requests; requests.get('http://localhost:7860/health', timeout=5).raise_for_status()"

# Run the application
# Use port 7860 for Hugging Face Spaces compatibility
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
kid_coach_pipeline.py CHANGED
@@ -1,178 +1,873 @@
 
 
 
 
 
1
  import os
 
 
 
 
2
  import re
3
- import gc
 
 
4
  import torch
5
- import torchaudio
6
  import librosa
7
  import numpy as np
8
- from pyannote.audio import Pipeline
9
- from transformers import (
10
- AutoTokenizer,
11
- AutoModelForCausalLM,
12
- BitsAndBytesConfig,
13
- pipeline
14
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- class KidCoachEngine:
17
- def __init__(self, hf_token: str):
18
- self.hf_token = hf_token
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
20
- self.torch_dtype = torch.float16 if self.device == "cuda" else torch.float32
21
- self.llm_id = "microsoft/Phi-3-mini-4k-instruct"
 
 
 
22
 
23
- # Filler words database
24
- self.filler_words = {
25
- 'um', 'uh', 'er', 'ah', 'like', 'you know', 'basically',
26
- 'literally', 'actually', 'mean', 'right', 'okay', 'sort of'
 
 
 
 
 
 
 
 
 
27
  }
28
-
29
- def _analyze_metrics(self, transcript_chunks, duration):
30
- """Calculates WPM, Fillers, and Stats"""
31
- full_text = " ".join([c['text'] for c in transcript_chunks]).strip()
32
- words = full_text.split()
33
- total_words = len(words)
34
 
35
- wpm = (total_words / duration) * 60 if duration > 0 else 0
36
-
37
- # Filler Density
38
- fillers_found = []
39
- for w in words:
40
- # Strip punctuation
41
- clean = re.sub(r'[^\w\s]', '', w.lower())
42
- if clean in self.filler_words:
43
- fillers_found.append(clean)
44
 
45
- filler_pct = (len(fillers_found)/total_words)*100 if total_words > 0 else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- return {
48
- "full_text": full_text,
49
- "wpm": round(wpm, 1),
50
- "duration": round(duration, 2),
51
- "fillers_count": len(fillers_found),
52
- "fillers_list": list(set(fillers_found)),
53
- "filler_pct": round(filler_pct, 1)
54
- }
55
-
56
- def _generate_coaching_feedback(self, metrics):
57
- """Loads LLM, generates feedback, then unloads it"""
58
- print("🧠 Loading AI Coach...")
59
  try:
60
- bnb_config = BitsAndBytesConfig(
61
- load_in_4bit=True,
62
- bnb_4bit_compute_dtype=torch.float16
63
- )
64
- tokenizer = AutoTokenizer.from_pretrained(self.llm_id, token=self.hf_token)
65
- model = AutoModelForCausalLM.from_pretrained(
66
- self.llm_id,
67
- quantization_config=bnb_config,
68
- device_map="auto",
69
- token=self.hf_token,
70
- trust_remote_code=True
 
 
 
 
 
 
 
 
 
 
71
  )
72
-
73
- prompt = f"""
74
- You are a kind, professional Public Speaking Coach.
75
 
76
- SPEECH DATA:
77
- - Transcript: "{metrics['full_text'][:1500]}..."
78
- - Speed: {metrics['wpm']} WPM (Target: 130-150)
79
- - Filler Words: {metrics['fillers_count']} found ({metrics['filler_pct']}%)
 
 
80
 
81
- TASK:
82
- 1. Give a score out of 10.
83
- 2. List 2 Strengths.
84
- 3. List 1 specific Improvement (Pace, Fillers, or Grammar).
85
- 4. Give a "Pro Tip".
86
 
87
- Format nicely with emojis. Keep it concise.
88
- """
 
 
 
 
89
 
90
- messages = [{"role": "user", "content": prompt}]
91
- input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(self.device)
 
92
 
93
- outputs = model.generate(input_ids, max_new_tokens=400, temperature=0.7)
94
- feedback = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
 
95
 
96
- # CLEANUP LLM
97
- del model, tokenizer
98
- gc.collect()
99
- torch.cuda.empty_cache()
 
 
100
 
101
- return feedback
 
 
 
 
 
 
 
 
 
 
102
 
103
- except Exception as e:
104
- return f"Coach Error: {str(e)}"
105
-
106
- def process_pipeline(self, audio_path):
107
- if not self.hf_token:
108
- return {"error": "HF_TOKEN missing in server secrets"}
109
-
110
- try:
111
- # 1. TRANSCRIPTION (Using HuggingFace Transformers - THE SAFE WAY)
112
- print("🎧 Transcribing...")
113
- # We use openai/whisper-large-v3 directly via Transformers
114
- # This avoids all the C++ build errors of faster-whisper
115
- transcriber = pipeline(
116
- "automatic-speech-recognition",
117
- model="openai/whisper-large-v3",
118
- device=self.device,
119
- torch_dtype=self.torch_dtype,
120
- chunk_length_s=30
121
- )
122
-
123
- # Run transcription with timestamps
124
- result = transcriber(audio_path, return_timestamps=True)
125
-
126
- # Cleanup Transcription Model
127
- del transcriber
128
- gc.collect()
129
- torch.cuda.empty_cache()
130
-
131
- if not result['text']:
132
- return {"error": "No speech detected."}
133
-
134
- # Calculate Audio Duration for WPM
135
- duration = librosa.get_duration(path=audio_path)
136
-
137
- # 2. METRICS
138
- print("📊 Analyzing...")
139
- # Transformers output format is different, we adapt here
140
- transcript_chunks = result.get('chunks', [{'text': result['text']}])
141
- metrics = self._analyze_metrics(transcript_chunks, duration)
142
-
143
- # 3. DIARIZATION (Quick check for multiple speakers)
144
- print("🗣️ Checking Speakers...")
145
- try:
146
- diar = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=self.hf_token)
147
- diar.to(torch.device(self.device))
148
- wav, sr = torchaudio.load(audio_path)
149
- d_result = diar({"waveform": wav, "sample_rate": sr})
150
- speaker_count = len(d_result.labels())
151
- del diar
152
- gc.collect()
153
- torch.cuda.empty_cache()
154
- except:
155
- speaker_count = 1
156
-
157
- metrics["speaker_count"] = speaker_count
158
-
159
- # 4. LLM COACH
160
- print("🧠 Coaching...")
161
- feedback = self._generate_coaching_feedback(metrics)
162
 
163
- return {
164
- "transcript": metrics['full_text'],
165
- "stats": {
166
- "wpm": metrics['wpm'],
167
- "duration": metrics['duration'],
168
- "fillers_count": metrics['fillers_count'],
169
- "filler_percentage": metrics['filler_pct'],
170
- "speakers_detected": speaker_count
 
 
 
 
 
171
  },
172
- "coach_feedback": feedback
 
 
 
 
 
 
 
 
 
173
  }
174
-
 
 
 
175
  except Exception as e:
176
  import traceback
177
  traceback.print_exc()
178
- return {"error": str(e)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Production-Ready Public Speaking Coach Engine
3
+ Supports all ages with comprehensive speech analysis
4
+ """
5
+
6
  import os
7
+ import io
8
+ import json
9
+ import logging
10
+ import warnings
11
  import re
12
+ from typing import Dict, List, Any, Optional
13
+ from dataclasses import dataclass, asdict
14
+
15
  import torch
 
16
  import librosa
17
  import numpy as np
18
+ import soundfile as sf
19
+ from scipy.signal import medfilt
20
+ from scipy.stats import zscore
21
+ import textstat
22
+
23
+ # Suppress warnings
24
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
25
+ logging.getLogger("whisper").setLevel(logging.ERROR)
26
+ logging.getLogger("transformers").setLevel(logging.ERROR)
27
+ warnings.filterwarnings("ignore")
28
+
29
# Validate Whisper installation: the unrelated PyPI package "whisper" shadows
# "openai-whisper" and lacks load_model, so fail fast with a clear remedy.
try:
    import whisper
    if not hasattr(whisper, "load_model"):
        raise ImportError("Wrong whisper library installed")
except ImportError:
    print("\n❌ CRITICAL: Install correct whisper library:")
    print("   pip uninstall -y whisper && pip install openai-whisper")
    # Fix: use SystemExit instead of exit() — the exit() builtin is injected
    # by the `site` module and is not guaranteed to exist in all environments
    # (e.g. when run with python -S or frozen).
    raise SystemExit(1)
38
+
39
# Grammar checker handle, created on first use to avoid slowing startup.
GRAMMAR_TOOL = None

def get_grammar_tool():
    """Return the shared LanguageTool instance, or None if unavailable.

    The tool is built lazily on the first call; a failed initialization is
    remembered (sentinel ``False``) so later calls do not retry the import.
    """
    global GRAMMAR_TOOL
    if GRAMMAR_TOOL is not None:
        # Already resolved: either a live tool or the False "unavailable" sentinel.
        return GRAMMAR_TOOL or None
    try:
        import language_tool_python
        GRAMMAR_TOOL = language_tool_python.LanguageTool('en-US')
    except Exception as e:
        logging.warning(f"Grammar tool not available: {e}")
        GRAMMAR_TOOL = False
    return GRAMMAR_TOOL or None
53
+
54
# JSON Serialization Helper
class NumpyEncoder(json.JSONEncoder):
    """JSONEncoder that maps numpy scalars and arrays onto native Python types."""

    def default(self, obj):
        # Arrays become nested lists; scalar wrappers become builtin int/float.
        # Anything else is deferred to the base class, which raises TypeError.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, (np.integer, np.int64)):
            return int(obj)
        if isinstance(obj, (np.floating, np.float32, np.float64)):
            return float(obj)
        return super().default(obj)
65
+
66
+
67
@dataclass
class AnalysisResult:
    """Structured result for type safety.

    NOTE(review): this dataclass mirrors the dict assembled by
    analyze_speech but is not constructed anywhere in the visible code —
    presumably intended for typed consumers; verify against callers.
    """
    # Aggregate scores on a 0-100 scale (overall is a weighted blend).
    overall_score: int
    fluency_score: int
    confidence_score: int
    content_score: int
    grammar_score: int

    # Raw transcription output.
    transcription: str
    word_count: int
    duration_seconds: float

    # Fluency details: filler counts, repeated n-grams, pauses over threshold.
    filler_words: Dict[str, int]
    repeated_phrases: List[Dict[str, Any]]
    long_pauses: List[Dict[str, float]]

    # Per-dimension analysis payloads.
    pace_analysis: Dict[str, Any]
    tone_analysis: Dict[str, Any]
    grammar_issues: List[Dict[str, str]]

    # Human-readable coaching output.
    strengths: List[str]
    improvements: List[str]
    coaching_feedback: str

    def to_dict(self):
        """Return the result as a plain, JSON-serializable dict."""
        return asdict(self)
94
 
95
+
96
+ class PublicSpeakingCoach:
97
+ """
98
+ Complete speech analysis engine for public speaking coaching
99
+ Features:
100
+ - Transcription with word-level timestamps
101
+ - Filler word detection
102
+ - Silence/pause analysis
103
+ - Repeated phrase detection
104
+ - Tone & confidence analysis
105
+ - Grammar checking
106
+ - Content quality analysis
107
+ - AI-powered coaching feedback
108
+ """
109
+
110
    def __init__(self, whisper_model_size: str = "base"):
        """
        Initialize the coach engine

        Args:
            whisper_model_size: Whisper model size (tiny/base/small/medium)
                base = good balance, small = better accuracy
        """
        # Prefer GPU when available; inference is far faster on CUDA.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🚀 Initializing Public Speaking Coach on {self.device}...")

        # Load Whisper for transcription (blocks until weights are loaded,
        # downloading them on first run).
        print(f" Loading Whisper ({whisper_model_size})...")
        self.whisper = whisper.load_model(whisper_model_size, device=self.device)

        # Linguistic patterns: one regex per filler, keyed by display name.
        # "um+h*" / "uh+h*" also match drawn-out variants like "ummm"/"uhh".
        # NOTE(review): "like"/"so"/"actually" match every occurrence,
        # including legitimate non-filler uses — counts may over-report.
        self.filler_patterns = {
            "um": r"\bum+h*\b",
            "uh": r"\buh+h*\b",
            "like": r"\blike\b",
            "you know": r"\byou know\b",
            "so": r"\bso+\b",
            "actually": r"\bactually\b",
            "basically": r"\bbasically\b",
            "literally": r"\bliterally\b",
            "i mean": r"\bi mean\b",
            "kind of": r"\bkind of\b",
            "sort of": r"\bsort of\b"
        }

        # Vocabulary treated as persuasive/impactful by _analyze_content.
        self.power_words = {
            "evidence", "data", "research", "proven", "significantly",
            "innovative", "transform", "achieve", "success", "solution",
            "effective", "results", "impact", "value", "opportunity",
            "believe", "imagine", "discover", "realize", "understand"
        }

        print("✅ Coach Engine Ready!")
148
+
149
+
150
    def analyze_speech(self, audio_path: str) -> Dict[str, Any]:
        """
        Main analysis pipeline

        Args:
            audio_path: Path to audio file

        Returns:
            Comprehensive analysis results as dictionary. On failure the
            dict contains a single "error" key instead of raising — callers
            must check for it.
        """
        # Validation
        if not os.path.exists(audio_path):
            return {"error": "Audio file not found"}

        print(f"🎤 Analyzing: {os.path.basename(audio_path)}")

        try:
            # Load audio (resampled to 16 kHz mono by _load_audio).
            audio, sr = self._load_audio(audio_path)
            duration = len(audio) / sr

            if duration < 1.0:
                return {"error": "Audio too short (minimum 1 second)"}

            print(f" Duration: {duration:.1f}s")

            # Step 1: Transcription with word-level timestamps — every later
            # step consumes this text and/or the timestamped word list.
            print(" 📝 Transcribing...")
            transcript_data = self._transcribe_with_timestamps(audio)

            if not transcript_data['text'].strip():
                return {"error": "No speech detected"}

            # Step 2: Filler word analysis
            print(" 🔍 Detecting filler words...")
            filler_analysis = self._detect_fillers(
                transcript_data['text'],
                transcript_data['words']
            )

            # Step 3: Pause analysis (gaps between consecutive word timestamps)
            print(" ⏸️ Analyzing pauses...")
            pause_analysis = self._analyze_pauses(
                transcript_data['words'],
                duration
            )

            # Step 4: Repeated phrase detection (2-5 word n-grams)
            print(" 🔁 Detecting repetitions...")
            repetition_analysis = self._detect_repetitions(
                transcript_data['words']
            )

            # Step 5: Pace analysis (words-per-minute and consistency)
            print(" ⚡ Analyzing pace...")
            pace_analysis = self._analyze_pace(
                transcript_data['words'],
                duration
            )

            # Step 6: Tone & confidence analysis (pitch/energy on raw audio)
            print(" 🎵 Analyzing tone & confidence...")
            tone_analysis = self._analyze_tone_confidence(audio, sr)

            # Step 7: Grammar check (no-op result if LanguageTool unavailable)
            print(" ✍️ Checking grammar...")
            grammar_analysis = self._check_grammar(transcript_data['text'])

            # Step 8: Content quality analysis (readability, vocabulary)
            print(" 📊 Evaluating content...")
            content_analysis = self._analyze_content(
                transcript_data['text'],
                transcript_data['words']
            )

            # Step 9: Generate scores — folds all prior analyses into
            # fluency/confidence/content/grammar plus a weighted overall.
            print(" 🎯 Calculating scores...")
            scores = self._calculate_scores(
                filler_analysis,
                pause_analysis,
                repetition_analysis,
                pace_analysis,
                tone_analysis,
                grammar_analysis,
                content_analysis
            )

            # Step 10: Generate coaching feedback (human-readable summary)
            print(" 🤖 Generating coaching...")
            coaching = self._generate_coaching(
                scores,
                filler_analysis,
                pause_analysis,
                repetition_analysis,
                pace_analysis,
                tone_analysis,
                grammar_analysis,
                content_analysis
            )

            # Compile final result
            result = {
                "overall_score": scores['overall'],
                "scores": {
                    "fluency": scores['fluency'],
                    "confidence": scores['confidence'],
                    "content": scores['content'],
                    "grammar": scores['grammar']
                },
                "transcription": {
                    "text": transcript_data['text'],
                    "word_count": len(transcript_data['words']),
                    "duration_seconds": round(duration, 2)
                },
                "fluency_analysis": {
                    "filler_words": filler_analysis,
                    "repeated_phrases": repetition_analysis,
                    "long_pauses": pause_analysis['long_pauses']
                },
                "pace_analysis": pace_analysis,
                "tone_analysis": tone_analysis,
                "grammar_analysis": grammar_analysis,
                "content_analysis": content_analysis,
                "coaching": coaching
            }

            print("✅ Analysis complete!")
            return result

        except Exception as e:
            # Catch-all API boundary: log the traceback server-side and
            # return the error as data rather than propagating.
            import traceback
            traceback.print_exc()
            return {"error": f"Analysis failed: {str(e)}"}
283
+
284
+
285
+ def _load_audio(self, path: str) -> tuple:
286
+ """Load and normalize audio to 16kHz mono"""
287
+ try:
288
+ audio, sr = librosa.load(path, sr=16000, mono=True)
289
+ # Normalize to prevent clipping
290
+ audio = librosa.util.normalize(audio)
291
+ return audio, sr
292
+ except Exception as e:
293
+ raise ValueError(f"Failed to load audio: {e}")
294
+
295
+
296
+ def _transcribe_with_timestamps(self, audio: np.ndarray) -> Dict:
297
+ """Transcribe with word-level timestamps"""
298
+ result = self.whisper.transcribe(
299
+ audio,
300
+ language='en',
301
+ word_timestamps=True,
302
+ fp16=(self.device == "cuda")
303
+ )
304
+
305
+ words = []
306
+ for segment in result['segments']:
307
+ if 'words' in segment:
308
+ for word_info in segment['words']:
309
+ words.append({
310
+ 'word': word_info['word'].strip(),
311
+ 'start': word_info['start'],
312
+ 'end': word_info['end'],
313
+ 'confidence': word_info.get('probability', 1.0)
314
+ })
315
+
316
+ return {
317
+ 'text': result['text'].strip(),
318
+ 'words': words
319
+ }
320
+
321
+
322
+ def _detect_fillers(self, text: str, words: List[Dict]) -> Dict:
323
+ """Detect filler words with counts and positions"""
324
+ text_lower = text.lower()
325
+ filler_counts = {}
326
+ filler_positions = []
327
+
328
+ for filler_name, pattern in self.filler_patterns.items():
329
+ matches = list(re.finditer(pattern, text_lower, re.IGNORECASE))
330
+ count = len(matches)
331
+ if count > 0:
332
+ filler_counts[filler_name] = count
333
+ for match in matches:
334
+ filler_positions.append({
335
+ 'filler': filler_name,
336
+ 'position': match.start()
337
+ })
338
+
339
+ total_fillers = sum(filler_counts.values())
340
+ total_words = len(words)
341
+ filler_rate = (total_fillers / total_words * 100) if total_words > 0 else 0
342
+
343
+ return {
344
+ 'total_count': total_fillers,
345
+ 'rate_percentage': round(filler_rate, 2),
346
+ 'breakdown': filler_counts,
347
+ 'positions': filler_positions
348
+ }
349
+
350
+
351
+ def _analyze_pauses(self, words: List[Dict], duration: float) -> Dict:
352
+ """Analyze pause patterns"""
353
+ if len(words) < 2:
354
+ return {'long_pauses': [], 'average_pause': 0, 'silence_percentage': 0}
355
+
356
+ pauses = []
357
+ long_pauses = []
358
+
359
+ for i in range(len(words) - 1):
360
+ pause_duration = words[i+1]['start'] - words[i]['end']
361
+ if pause_duration > 0:
362
+ pauses.append(pause_duration)
363
+ if pause_duration > 2.0: # Long pause threshold
364
+ long_pauses.append({
365
+ 'duration': round(pause_duration, 2),
366
+ 'after_word': words[i]['word'],
367
+ 'timestamp': round(words[i]['end'], 2)
368
+ })
369
+
370
+ avg_pause = np.mean(pauses) if pauses else 0
371
+ total_pause_time = sum(pauses)
372
+ silence_pct = (total_pause_time / duration * 100) if duration > 0 else 0
373
+
374
+ return {
375
+ 'long_pauses': long_pauses,
376
+ 'long_pause_count': len(long_pauses),
377
+ 'average_pause_seconds': round(avg_pause, 2),
378
+ 'silence_percentage': round(silence_pct, 2)
379
+ }
380
+
381
+
382
+ def _detect_repetitions(self, words: List[Dict]) -> List[Dict]:
383
+ """Detect repeated phrases (2-5 words)"""
384
+ repetitions = []
385
+ word_list = [w['word'].lower().strip('.,!?') for w in words]
386
+
387
+ # Check for n-gram repetitions (2-5 words)
388
+ for n in range(2, 6):
389
+ seen = {}
390
+ for i in range(len(word_list) - n + 1):
391
+ phrase = ' '.join(word_list[i:i+n])
392
+ if phrase in seen:
393
+ repetitions.append({
394
+ 'phrase': phrase,
395
+ 'count': seen[phrase] + 1,
396
+ 'length': n
397
+ })
398
+ seen[phrase] += 1
399
+ else:
400
+ seen[phrase] = 1
401
+
402
+ # Remove duplicates and sort by count
403
+ unique_reps = {}
404
+ for rep in repetitions:
405
+ key = rep['phrase']
406
+ if key not in unique_reps or rep['count'] > unique_reps[key]['count']:
407
+ unique_reps[key] = rep
408
+
409
+ return sorted(unique_reps.values(), key=lambda x: x['count'], reverse=True)[:10]
410
+
411
+
412
+ def _analyze_pace(self, words: List[Dict], duration: float) -> Dict:
413
+ """Analyze speaking pace"""
414
+ word_count = len(words)
415
+ wpm = (word_count / duration * 60) if duration > 0 else 0
416
+
417
+ # Determine pace category
418
+ if wpm < 100:
419
+ pace_category = "Too Slow"
420
+ pace_feedback = "Consider speaking slightly faster for better engagement"
421
+ elif wpm < 130:
422
+ pace_category = "Good"
423
+ pace_feedback = "Your pace is comfortable and easy to follow"
424
+ elif wpm < 160:
425
+ pace_category = "Optimal"
426
+ pace_feedback = "Excellent pacing - clear and engaging"
427
+ elif wpm < 180:
428
+ pace_category = "Fast"
429
+ pace_feedback = "Speaking quickly but still understandable"
430
+ else:
431
+ pace_category = "Too Fast"
432
+ pace_feedback = "Try slowing down to ensure clarity"
433
+
434
+ # Calculate pace variance (consistency)
435
+ if len(words) > 10:
436
+ segment_size = max(5, len(words) // 10)
437
+ segment_paces = []
438
+ for i in range(0, len(words) - segment_size, segment_size):
439
+ segment = words[i:i+segment_size]
440
+ seg_duration = segment[-1]['end'] - segment[0]['start']
441
+ if seg_duration > 0:
442
+ seg_wpm = len(segment) / seg_duration * 60
443
+ segment_paces.append(seg_wpm)
444
+
445
+ pace_variance = np.std(segment_paces) if len(segment_paces) > 1 else 0
446
+ consistency = "High" if pace_variance < 20 else "Medium" if pace_variance < 40 else "Low"
447
+ else:
448
+ pace_variance = 0
449
+ consistency = "N/A"
450
+
451
+ return {
452
+ 'words_per_minute': round(wpm, 1),
453
+ 'category': pace_category,
454
+ 'consistency': consistency,
455
+ 'pace_variance': round(pace_variance, 1),
456
+ 'feedback': pace_feedback
457
+ }
458
+
459
+
460
    def _analyze_tone_confidence(self, audio: np.ndarray, sr: int) -> Dict:
        """Analyze tone variation and confidence indicators.

        Uses pitch (YIN f0) for expressiveness and RMS energy for volume
        stability; confidence is a heuristic blend of both.
        NOTE(review): sr is accepted but not used — librosa.yin/rms here run
        on their defaults; confirm the 16 kHz assumption from _load_audio.
        """
        # Pitch analysis (fundamental frequency)
        try:
            f0 = librosa.yin(
                audio.astype(np.float64),
                fmin=80,   # Male range
                fmax=400   # Female range
            )
            # Drop unvoiced/invalid frames before computing statistics.
            f0_clean = f0[f0 > 0]

            if len(f0_clean) > 0:
                avg_pitch = np.mean(f0_clean)
                pitch_std = np.std(f0_clean)
                pitch_range = np.ptp(f0_clean)

                # Pitch variation indicates expressiveness
                if pitch_std < 20:
                    expressiveness = "Monotone"
                    expression_score = 40
                elif pitch_std < 40:
                    expressiveness = "Moderate Variation"
                    expression_score = 70
                else:
                    expressiveness = "Expressive"
                    expression_score = 95
            else:
                # No voiced frames: neutral fallbacks.
                avg_pitch = 0
                pitch_std = 0
                pitch_range = 0
                expressiveness = "Unknown"
                expression_score = 50
        except Exception as e:
            # Same neutral fallbacks if pitch extraction itself fails.
            logging.warning(f"Pitch analysis failed: {e}")
            avg_pitch = 0
            pitch_std = 0
            pitch_range = 0
            expressiveness = "Unknown"
            expression_score = 50

        # Energy/Volume analysis
        rms = librosa.feature.rms(y=audio)[0]
        # NOTE(review): avg_energy and pitch_range are computed but never
        # used or returned — dead values, or intended for a future metric?
        avg_energy = np.mean(rms)
        energy_std = np.std(rms)

        # Volume consistency (lower RMS std-dev = steadier delivery)
        if energy_std < 0.02:
            volume_consistency = "Very Consistent"
        elif energy_std < 0.05:
            volume_consistency = "Consistent"
        else:
            volume_consistency = "Varied"

        # Confidence estimation (based on volume stability and pitch):
        # base 50, +15 stable volume, +15 comfortable pitch band,
        # +20 some pitch variation (sounds engaged). Thresholds are
        # heuristic, not calibrated.
        confidence_score = 50  # Base
        if energy_std < 0.03:  # Stable volume
            confidence_score += 15
        if 150 < avg_pitch < 250:  # Comfortable pitch range
            confidence_score += 15
        if pitch_std > 20:  # Some variation (engaged)
            confidence_score += 20

        confidence_score = min(100, max(0, confidence_score))

        return {
            'expressiveness': expressiveness,
            'expression_score': expression_score,
            'average_pitch_hz': round(float(avg_pitch), 1),
            'pitch_variation_hz': round(float(pitch_std), 1),
            'volume_consistency': volume_consistency,
            'confidence_score': round(confidence_score, 1)
        }
532
+
533
+
534
+ def _check_grammar(self, text: str) -> Dict:
535
+ """Check grammar using language-tool-python"""
536
+ grammar_tool = get_grammar_tool()
537
+
538
+ if grammar_tool is None:
539
+ return {
540
+ 'issue_count': 0,
541
+ 'issues': [],
542
+ 'available': False
543
+ }
544
+
545
+ try:
546
+ matches = grammar_tool.check(text)
547
+ issues = []
548
+
549
+ for match in matches[:20]: # Limit to top 20
550
+ issues.append({
551
+ 'type': match.ruleId,
552
+ 'message': match.message,
553
+ 'context': match.context,
554
+ 'suggestions': match.replacements[:3]
555
+ })
556
+
557
+ return {
558
+ 'issue_count': len(matches),
559
+ 'issues': issues,
560
+ 'available': True
561
+ }
562
+ except Exception as e:
563
+ logging.warning(f"Grammar check failed: {e}")
564
+ return {
565
+ 'issue_count': 0,
566
+ 'issues': [],
567
+ 'available': False
568
+ }
569
+
570
+
571
+ def _analyze_content(self, text: str, words: List[Dict]) -> Dict:
572
+ """Analyze content quality and complexity"""
573
+ # Readability metrics
574
+ try:
575
+ flesch_score = textstat.flesch_reading_ease(text)
576
+ grade_level = textstat.text_standard(text, float_output=True)
577
+ except:
578
+ flesch_score = 50
579
+ grade_level = 8
580
+
581
+ # Interpret Flesch score
582
+ if flesch_score >= 90:
583
+ readability = "Very Easy"
584
+ elif flesch_score >= 70:
585
+ readability = "Easy"
586
+ elif flesch_score >= 50:
587
+ readability = "Moderate"
588
+ elif flesch_score >= 30:
589
+ readability = "Difficult"
590
+ else:
591
+ readability = "Very Difficult"
592
+
593
+ # Power word usage
594
+ word_list = [w['word'].lower().strip('.,!?') for w in words]
595
+ power_word_count = sum(1 for w in word_list if w in self.power_words)
596
+ power_word_rate = (power_word_count / len(words) * 100) if len(words) > 0 else 0
597
+
598
+ # Vocabulary diversity
599
+ unique_words = len(set(word_list))
600
+ vocab_diversity = (unique_words / len(words) * 100) if len(words) > 0 else 0
601
+
602
+ # Sentence structure (approximate from punctuation)
603
+ sentence_count = max(1, text.count('.') + text.count('!') + text.count('?'))
604
+ avg_sentence_length = len(words) / sentence_count
605
+
606
+ return {
607
+ 'readability_score': round(flesch_score, 1),
608
+ 'readability_level': readability,
609
+ 'grade_level': round(grade_level, 1),
610
+ 'power_words_used': power_word_count,
611
+ 'power_word_rate': round(power_word_rate, 2),
612
+ 'vocabulary_diversity': round(vocab_diversity, 1),
613
+ 'unique_word_count': unique_words,
614
+ 'average_sentence_length': round(avg_sentence_length, 1)
615
+ }
616
+
617
+
618
+ def _calculate_scores(
619
+ self,
620
+ filler_analysis: Dict,
621
+ pause_analysis: Dict,
622
+ repetition_analysis: List,
623
+ pace_analysis: Dict,
624
+ tone_analysis: Dict,
625
+ grammar_analysis: Dict,
626
+ content_analysis: Dict
627
+ ) -> Dict:
628
+ """Calculate comprehensive scores"""
629
+
630
+ # Fluency Score (0-100)
631
+ fluency = 100
632
+ fluency -= min(30, filler_analysis['rate_percentage'] * 5) # Filler penalty
633
+ fluency -= min(20, pause_analysis['long_pause_count'] * 5) # Long pause penalty
634
+ fluency -= min(15, len(repetition_analysis) * 3) # Repetition penalty
635
+
636
+ # Pace bonus/penalty
637
+ wpm = pace_analysis['words_per_minute']
638
+ if 130 <= wpm <= 160:
639
+ fluency += 5 # Optimal pace bonus
640
+ elif wpm < 100 or wpm > 180:
641
+ fluency -= 10 # Poor pace penalty
642
+
643
+ fluency = max(0, min(100, fluency))
644
+
645
+ # Confidence Score (from tone analysis)
646
+ confidence = tone_analysis['confidence_score']
647
+
648
+ # Content Score (0-100)
649
+ content = 50 # Base
650
+ content += min(30, content_analysis['power_word_rate'] * 3) # Power words
651
+ content += min(20, content_analysis['vocabulary_diversity'] / 5) # Diversity
652
+
653
+ # Readability bonus/penalty
654
+ flesch = content_analysis['readability_score']
655
+ if 50 <= flesch <= 70:
656
+ content += 10
657
+
658
+ content = max(0, min(100, content))
659
+
660
+ # Grammar Score (0-100)
661
+ if grammar_analysis['available']:
662
+ grammar = max(0, 100 - grammar_analysis['issue_count'] * 2)
663
+ else:
664
+ grammar = 85 # Default if unavailable
665
+
666
+ # Overall Score (weighted average)
667
+ overall = (
668
+ fluency * 0.35 +
669
+ confidence * 0.25 +
670
+ content * 0.25 +
671
+ grammar * 0.15
672
+ )
673
+
674
+ return {
675
+ 'overall': round(overall),
676
+ 'fluency': round(fluency),
677
+ 'confidence': round(confidence),
678
+ 'content': round(content),
679
+ 'grammar': round(grammar)
680
+ }
681
+
682
+
683
+ def _generate_coaching(
684
+ self,
685
+ scores: Dict,
686
+ filler_analysis: Dict,
687
+ pause_analysis: Dict,
688
+ repetition_analysis: List,
689
+ pace_analysis: Dict,
690
+ tone_analysis: Dict,
691
+ grammar_analysis: Dict,
692
+ content_analysis: Dict
693
+ ) -> Dict:
694
+ """Generate personalized coaching feedback"""
695
+
696
+ strengths = []
697
+ improvements = []
698
+
699
+ # Analyze strengths
700
+ if scores['fluency'] >= 80:
701
+ strengths.append("Excellent fluency - your speech flows naturally")
702
+
703
+ if filler_analysis['rate_percentage'] < 2:
704
+ strengths.append("Minimal use of filler words - very professional")
705
+
706
+ if pace_analysis['words_per_minute'] >= 130 and pace_analysis['words_per_minute'] <= 160:
707
+ strengths.append("Perfect speaking pace - clear and engaging")
708
+
709
+ if tone_analysis['expression_score'] >= 80:
710
+ strengths.append("Great vocal expressiveness - keeps audience engaged")
711
+
712
+ if content_analysis['power_word_rate'] >= 3:
713
+ strengths.append("Strong use of impactful vocabulary")
714
+
715
+ if scores['confidence'] >= 75:
716
+ strengths.append("Confident delivery with strong vocal presence")
717
+
718
+ # Identify improvements
719
+ if filler_analysis['rate_percentage'] >= 5:
720
+ improvements.append(
721
+ f"Reduce filler words ({filler_analysis['rate_percentage']:.1f}% of speech). "
722
+ "Try pausing silently instead of using 'um' or 'uh'"
723
+ )
724
+
725
+ if pause_analysis['long_pause_count'] >= 3:
726
+ improvements.append(
727
+ f"You have {pause_analysis['long_pause_count']} long pauses. "
728
+ "Practice smoother transitions between thoughts"
729
+ )
730
+
731
+ if len(repetition_analysis) >= 3:
732
+ top_rep = repetition_analysis[0]
733
+ improvements.append(
734
+ f"You repeated '{top_rep['phrase']}' {top_rep['count']} times. "
735
+ "Vary your phrasing for more engaging delivery"
736
+ )
737
+
738
+ wpm = pace_analysis['words_per_minute']
739
+ if wpm < 120:
740
+ improvements.append(
741
+ "Your pace is quite slow. Try speaking 10-15% faster to maintain energy"
742
+ )
743
+ elif wpm > 170:
744
+ improvements.append(
745
+ "You're speaking very quickly. Slow down 10-15% to ensure clarity"
746
+ )
747
+
748
+ if tone_analysis['expression_score'] < 60:
749
+ improvements.append(
750
+ "Add more vocal variety. Practice emphasizing key words and varying your pitch"
751
+ )
752
+
753
+ if grammar_analysis['available'] and grammar_analysis['issue_count'] >= 5:
754
+ improvements.append(
755
+ f"Found {grammar_analysis['issue_count']} grammar issues. "
756
+ "Review your script and practice correct phrasing"
757
+ )
758
+
759
+ if content_analysis['vocabulary_diversity'] < 40:
760
+ improvements.append(
761
+ "Expand your vocabulary. Using more diverse words makes speeches more engaging"
762
+ )
763
+
764
+ # Generate overall feedback message
765
+ overall_score = scores['overall']
766
+
767
+ if overall_score >= 90:
768
+ overall_feedback = (
769
+ "🌟 Outstanding performance! Your speech demonstrates excellent "
770
+ "command of public speaking fundamentals. You're ready for any audience!"
771
+ )
772
+ elif overall_score >= 75:
773
+ overall_feedback = (
774
+ "👏 Strong performance! You have solid public speaking skills. "
775
+ "Focus on the improvement areas to reach the next level."
776
+ )
777
+ elif overall_score >= 60:
778
+ overall_feedback = (
779
+ "✅ Good effort! You have a foundation to build on. "
780
+ "Work on the suggested improvements and keep practicing."
781
+ )
782
+ else:
783
+ overall_feedback = (
784
+ "💪 Keep practicing! Public speaking is a skill that improves with practice. "
785
+ "Focus on one improvement area at a time and you'll see progress."
786
+ )
787
+
788
+ # Add default messages if lists are empty
789
+ if not strengths:
790
+ strengths.append("You completed the speech - that takes courage!")
791
+
792
+ if not improvements:
793
+ improvements.append("Keep practicing to maintain your excellent skills")
794
+
795
+ return {
796
+ 'overall_feedback': overall_feedback,
797
+ 'strengths': strengths[:5], # Top 5
798
+ 'improvements': improvements[:5], # Top 5
799
+ 'next_steps': self._generate_next_steps(scores, improvements)
800
+ }
801
+
802
+
803
+ def _generate_next_steps(self, scores: Dict, improvements: List[str]) -> List[str]:
804
+ """Generate actionable next steps"""
805
+ steps = []
806
+
807
+ # Prioritize based on weakest scores
808
+ score_items = [
809
+ ('fluency', scores['fluency']),
810
+ ('confidence', scores['confidence']),
811
+ ('content', scores['content']),
812
+ ('grammar', scores['grammar'])
813
+ ]
814
+ score_items.sort(key=lambda x: x[1])
815
+
816
+ weakest = score_items[0][0]
817
+
818
+ if weakest == 'fluency':
819
+ steps.append("Practice speaking without filler words - try the 'silent pause' technique")
820
+ steps.append("Record yourself daily and track filler word reduction")
821
+ elif weakest == 'confidence':
822
+ steps.append("Work on vocal projection exercises to build confidence")
823
+ steps.append("Practice power poses before speaking to boost confidence")
824
+ elif weakest == 'content':
825
+ steps.append("Build your vocabulary by learning 2-3 power words per week")
826
+ steps.append("Study speeches by great speakers and note their word choices")
827
+ elif weakest == 'grammar':
828
+ steps.append("Review common grammar rules and practice correct phrasing")
829
+ steps.append("Have someone proofread your speeches before delivery")
830
+
831
+ steps.append("Practice this speech 3 more times and compare your progress")
832
+
833
+ return steps[:4]
834
+
835
+
836
# ================= TEST RUNNER =================
if __name__ == "__main__":
    banner = "=" * 60
    print("\n" + banner)
    print("PUBLIC SPEAKING COACH - ENGINE TEST")
    print(banner + "\n")

    test_file = "test_speech.wav"

    # Synthesize a short two-tone clip when no fixture file is available.
    if not os.path.exists(test_file):
        print("⚠️ No test file found. Generating dummy audio...")
        sr = 16000
        duration = 5
        t = np.linspace(0, duration, sr * duration)
        # Simulate speech-like audio with varying frequency
        audio = 0.3 * np.sin(2 * np.pi * 200 * t) + 0.2 * np.sin(2 * np.pi * 300 * t)
        sf.write(test_file, audio, sr)
        print(f"✅ Created {test_file}\n")

    try:
        coach = PublicSpeakingCoach(whisper_model_size="base")
        result = coach.analyze_speech(test_file)

        print("\n" + banner)
        print("ANALYSIS RESULTS")
        print(banner)
        print(json.dumps(result, indent=2, cls=NumpyEncoder))

        print("\n✅ Engine test completed successfully!")
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
main.py CHANGED
@@ -1,105 +1,273 @@
 
 
 
 
 
1
  import os
2
  import shutil
 
 
 
 
3
  import uvicorn
4
- import subprocess
5
- from fastapi import FastAPI, UploadFile, File, HTTPException
6
  from fastapi.middleware.cors import CORSMiddleware
7
- from kid_coach_pipeline import KidCoachEngine
 
8
 
9
- app = FastAPI(title="Public Speaking Coach API")
10
 
 
 
 
 
 
 
 
 
 
11
  app.add_middleware(
12
  CORSMiddleware,
13
- allow_origins=["*"],
 
14
  allow_methods=["*"],
15
  allow_headers=["*"],
16
  )
17
 
18
- # Global Engine Instance
19
- engine = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  @app.on_event("startup")
22
  async def startup_event():
23
- global engine
 
24
 
25
- # 1. Get Token from Secrets
26
- hf_token = os.environ.get("HF_TOKEN")
27
- if not hf_token:
28
- print("❌ CRITICAL: HF_TOKEN not found in environment variables!")
29
 
30
- print("🚀 Initializing KidCoach Engine (Production Mode)...")
31
  try:
32
- engine = KidCoachEngine(hf_token=hf_token)
33
- print("✅ Engine Ready! Waiting for audio...")
 
 
 
 
 
34
  except Exception as e:
35
- print(f"❌ Engine initialization failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- def convert_to_wav(input_path, output_path):
 
 
 
 
 
 
 
 
 
 
 
38
  """
39
- Sanitizes audio for AI processing.
40
- Converts any input (m4a, mp3, webm) to 16kHz Mono WAV.
 
 
 
 
 
 
 
 
 
 
 
41
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  try:
43
- command = [
44
- "ffmpeg",
45
- "-i", input_path,
46
- "-ar", "16000", # Standard AI Sample Rate
47
- "-ac", "1", # Mono
48
- "-c:a", "pcm_s16le", # Raw WAV
49
- output_path,
50
- "-y"
51
- ]
52
- subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
53
- return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  except Exception as e:
55
- print(f"❌ FFmpeg error: {e}")
56
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
- @app.post("/coach")
59
- async def coach_audio(file: UploadFile = File(...)):
60
- global engine
61
- if not engine:
62
- raise HTTPException(status_code=500, detail="AI Engine is not initialized")
63
 
64
- # 1. Save Raw File
65
- raw_filename = f"raw_{file.filename}"
66
- clean_wav_filename = f"clean_{file.filename}.wav"
 
67
 
68
- try:
69
- # Write upload to disk
70
- with open(raw_filename, "wb") as buffer:
71
- shutil.copyfileobj(file.file, buffer)
72
 
73
- # 2. Convert to Pristine WAV
74
- print(f"🔄 Processing file: {file.filename}")
75
- success = convert_to_wav(raw_filename, clean_wav_filename)
76
-
77
- if not success:
78
- raise HTTPException(status_code=400, detail="Audio file unreadable. Please upload MP3, WAV, or M4A.")
79
 
80
- # 3. Run The Full AI Pipeline
81
- # This calls our robust logic in kid_coach_pipeline.py
82
- result = engine.process_pipeline(clean_wav_filename)
 
 
 
 
 
 
 
83
 
84
- if "error" in result:
85
- print(f"Pipeline Error: {result['error']}")
86
- raise HTTPException(status_code=500, detail=result["error"])
87
 
88
- return result
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- except HTTPException as he:
91
- raise he
92
- except Exception as e:
93
- print(f"Server Error: {e}")
94
- raise HTTPException(status_code=500, detail="Internal Processing Error")
95
-
96
- finally:
97
- # 4. Cleanup temp files to save disk space
98
- if os.path.exists(raw_filename):
99
- os.remove(raw_filename)
100
- if os.path.exists(clean_wav_filename):
101
- os.remove(clean_wav_filename)
102
 
103
  if __name__ == "__main__":
104
- # Hugging Face Spaces expects port 7860
105
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
1
+ """
2
+ Production FastAPI Server for Public Speaking Coach
3
+ Handles audio uploads and returns comprehensive analysis
4
+ """
5
+
6
  import os
7
  import shutil
8
+ import tempfile
9
+ from pathlib import Path
10
+ from typing import Optional
11
+
12
  import uvicorn
13
+ from fastapi import FastAPI, UploadFile, File, HTTPException, status
 
14
  from fastapi.middleware.cors import CORSMiddleware
15
+ from fastapi.responses import JSONResponse
16
+ from pydantic import BaseModel
17
 
18
+ from kid_coach_pipeline import PublicSpeakingCoach
19
 
20
# ================= APP CONFIGURATION =================

app = FastAPI(
    title="Public Speaking Coach API",
    description="AI-powered speech analysis and coaching for all ages",
    version="2.0.0",
)

# Wide-open CORS for development; lock allow_origins down to the known
# frontend domains before a production rollout.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Populated by the startup hook; None means analysis is unavailable.
coach_engine: Optional[PublicSpeakingCoach] = None

# File extensions the upload endpoint will accept.
SUPPORTED_FORMATS = {
    '.wav', '.mp3', '.m4a', '.flac', '.ogg',
    '.wma', '.aac', '.mp4', '.webm',
}

# Uploads larger than this byte count are rejected with HTTP 413 (50 MB).
MAX_FILE_SIZE = 50 * 1024 * 1024
48
+
49
+
50
+ # ================= RESPONSE MODELS =================
51
+
52
class HealthResponse(BaseModel):
    """Shape of the `/` and `/health` responses."""
    status: str                # e.g. "online", "healthy" or "degraded"
    engine_loaded: bool        # True once the coach engine finished loading
    supported_formats: list    # accepted upload extensions


class ErrorResponse(BaseModel):
    """Uniform error envelope emitted by the exception handlers."""
    error: str
    detail: Optional[str] = None
63
+
64
+
65
+ # ================= STARTUP/SHUTDOWN =================
66
 
67
@app.on_event("startup")
async def startup_event():
    """Load the coach engine once at server start.

    On failure the server still comes up, but `coach_engine` stays None and
    the analysis endpoints answer 503.
    """
    global coach_engine

    rule = "=" * 60
    print("\n" + rule)
    print("🚀 PUBLIC SPEAKING COACH API - STARTING")
    print(rule)

    try:
        print("\n📦 Loading AI models...")
        coach_engine = PublicSpeakingCoach(whisper_model_size="base")
        print("✅ Coach engine ready!")
        print("\n" + rule)
        print("🎤 API is ready to analyze speeches!")
        print(rule + "\n")
    except Exception as e:
        print(f"\nSTARTUP FAILED: {e}")
        print("Server will start but analysis will not work.\n")
        coach_engine = None
88
+
89
+
90
@app.on_event("shutdown")
async def shutdown_event():
    """Log shutdown; no resources need explicit release."""
    print("\n👋 Shutting down Public Speaking Coach API...")
94
+
95
+
96
+ # ================= ENDPOINTS =================
97
+
98
@app.get("/", response_model=HealthResponse)
async def root():
    """Root endpoint - basic API/engine status."""
    payload = {
        "status": "online",
        "engine_loaded": coach_engine is not None,
        "supported_formats": list(SUPPORTED_FORMATS),
    }
    return payload


@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Liveness probe; 'degraded' means the engine never loaded."""
    payload = {
        "status": "healthy" if coach_engine else "degraded",
        "engine_loaded": coach_engine is not None,
        "supported_formats": list(SUPPORTED_FORMATS),
    }
    return payload
116
+
117
+
118
@app.post("/coach")
async def analyze_speech(file: UploadFile = File(...)):
    """
    Main endpoint: Upload audio file and receive speech analysis

    Args:
        file: Audio file (wav, mp3, m4a, flac, ogg, etc.)

    Returns:
        Comprehensive speech analysis with scores and coaching feedback

    Raises:
        400: Invalid file format or corrupted audio
        413: File too large
        500: Analysis failed
        503: Engine not loaded
    """
    # Refuse early when the model failed to load at startup.
    if coach_engine is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coach engine not initialized. Please contact administrator."
        )

    # Defensive checks on the upload itself.
    if not file:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="No file provided"
        )

    if not file.filename:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Invalid filename"
        )

    # Validate the extension against the allow-list before touching disk.
    file_ext = Path(file.filename).suffix.lower()
    if file_ext not in SUPPORTED_FORMATS:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Unsupported format '{file_ext}'. Supported: {', '.join(SUPPORTED_FORMATS)}"
        )

    temp_file = None

    try:
        # Whole body is read into memory; bounded by MAX_FILE_SIZE below.
        content = await file.read()

        if len(content) > MAX_FILE_SIZE:
            raise HTTPException(
                status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
                detail=f"File too large. Maximum size: {MAX_FILE_SIZE // (1024*1024)}MB"
            )

        # Keep the original extension so downstream decoders can sniff the
        # container format from the temp-file name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp:
            temp.write(content)
            temp_file = temp.name

        print(f"\n📁 Processing: {file.filename} ({len(content) / 1024:.1f} KB)")

        result = coach_engine.analyze_speech(temp_file)

        # The engine signals decode/analysis problems via an 'error' key.
        if "error" in result:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=result["error"]
            )

        # FIX: use .get() so a result that lacks 'overall_score' cannot turn
        # a successful analysis into a 500 via KeyError in this log line.
        print(f"✅ Analysis complete: Score {result.get('overall_score', 'N/A')}/100")

        return JSONResponse(content=result)

    except HTTPException:
        # Re-raise HTTP exceptions untouched.
        raise

    except Exception as e:
        # Log unexpected errors with full traceback, then map to 500.
        import traceback
        print(f"\n❌ ANALYSIS ERROR:")
        traceback.print_exc()

        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Analysis failed: {str(e)}"
        )

    finally:
        # Best-effort temp cleanup; never mask the real response/exception.
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except Exception as e:
                print(f"⚠️ Failed to delete temp file: {e}")
227
 
 
 
 
 
 
228
 
229
@app.post("/analyze")
async def analyze_speech_alias(file: UploadFile = File(...)):
    """Compatibility alias: behaves exactly like POST /coach."""
    return await analyze_speech(file)
233
 
 
 
 
 
234
 
235
+ # ================= ERROR HANDLERS =================
 
 
 
 
 
236
 
237
@app.exception_handler(HTTPException)
async def http_exception_handler(request, exc):
    """Serialize HTTPExceptions into the API's flat error envelope."""
    body = {
        "error": exc.detail,
        "status_code": exc.status_code,
    }
    return JSONResponse(status_code=exc.status_code, content=body)


@app.exception_handler(Exception)
async def general_exception_handler(request, exc):
    """Last-resort handler: log the traceback, answer with a generic 500."""
    import traceback
    traceback.print_exc()

    return JSONResponse(
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        content={
            "error": "Internal server error",
            "detail": str(exc),
        },
    )
 
263
+
264
+ # ================= MAIN =================
 
 
 
 
 
 
 
 
 
 
265
 
266
if __name__ == "__main__":
    # Local development entry point; containers launch uvicorn directly.
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")
requirements (1).txt ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Dependencies - Tested & Compatible
2
+ torch==2.1.0
3
+ torchaudio==2.1.0
4
+ openai-whisper==20231117
5
+
6
+ # Audio Processing
7
+ librosa==0.10.1
8
+ soundfile==0.12.1
9
+ scipy==1.11.4
10
+ numpy==1.24.3
11
+
12
+ # Text Analysis
13
+ textstat==0.7.3
14
+ language-tool-python==2.8.0
15
+
16
+ # API Framework
17
+ fastapi==0.109.0
18
+ uvicorn[standard]==0.27.0
19
+ python-multipart==0.0.6
20
+
21
+ # LLM Integration (lightweight, no GPU needed)
22
+ transformers==4.36.0
23
+ sentencepiece==0.1.99
24
+
25
+ # Utilities
26
+ pydantic==2.5.3
27
+ python-dotenv==1.0.0