New: implemented many changes; phone-level detection (10% complete): WORKING
Browse files- Docs/QUICK_START.md +2 -0
- api/__init__.py +4 -0
- api/routes.py +389 -0
- api/schemas.py +115 -0
- api/streaming.py +305 -0
- app.py +47 -15
- config.py +12 -0
- data/therapy_recommendations.json +60 -0
- diagnosis/ai_engine/detect_stuttering.py +3 -0
- inference/inference_pipeline.py +250 -336
- models/error_taxonomy.py +333 -0
- models/phoneme_mapper.py +364 -0
- models/speech_pathology_model.py +64 -16
- tests/__init__.py +4 -0
- tests/integration_tests.py +249 -0
- tests/performance_tests.py +344 -0
- ui/gradio_interface.py +43 -23
Docs/QUICK_START.md
CHANGED
|
@@ -327,6 +327,7 @@ with Pool(4) as pool:
|
|
| 327 |
## API Reference
|
| 328 |
|
| 329 |
### Main Method
|
|
|
|
| 330 |
```python
|
| 331 |
analyze_audio(
|
| 332 |
audio_path: str, # Path to .wav file
|
|
@@ -336,6 +337,7 @@ analyze_audio(
|
|
| 336 |
```
|
| 337 |
|
| 338 |
### Utility Methods
|
|
|
|
| 339 |
```python
|
| 340 |
# Phonetic similarity (0-1)
|
| 341 |
_calculate_phonetic_similarity(char1: str, char2: str) -> float
|
|
|
|
| 327 |
## API Reference
|
| 328 |
|
| 329 |
### Main Method
|
| 330 |
+
|
| 331 |
```python
|
| 332 |
analyze_audio(
|
| 333 |
audio_path: str, # Path to .wav file
|
|
|
|
| 337 |
```
|
| 338 |
|
| 339 |
### Utility Methods
|
| 340 |
+
|
| 341 |
```python
|
| 342 |
# Phonetic similarity (0-1)
|
| 343 |
_calculate_phonetic_similarity(char1: str, char2: str) -> float
|
api/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
API module for speech pathology diagnosis endpoints.
|
| 3 |
+
"""
|
| 4 |
+
|
api/routes.py
ADDED
|
@@ -0,0 +1,389 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
REST API routes for Speech Pathology Diagnosis.
|
| 3 |
+
|
| 4 |
+
This module provides FastAPI endpoints for batch file analysis,
|
| 5 |
+
session management, and health checks.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import os
|
| 10 |
+
import time
|
| 11 |
+
import tempfile
|
| 12 |
+
import uuid
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Optional, List, Dict, Any
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
|
| 17 |
+
from fastapi import APIRouter, UploadFile, File, HTTPException, Query
|
| 18 |
+
from fastapi.responses import JSONResponse
|
| 19 |
+
|
| 20 |
+
from api.schemas import (
|
| 21 |
+
BatchDiagnosisResponse,
|
| 22 |
+
FrameDiagnosis,
|
| 23 |
+
ErrorReport,
|
| 24 |
+
SummaryMetrics,
|
| 25 |
+
SessionListResponse,
|
| 26 |
+
HealthResponse,
|
| 27 |
+
ErrorDetailSchema,
|
| 28 |
+
FluencyInfo,
|
| 29 |
+
ArticulationInfo
|
| 30 |
+
)
|
| 31 |
+
from models.phoneme_mapper import PhonemeMapper
|
| 32 |
+
from models.error_taxonomy import ErrorMapper, ErrorType, SeverityLevel
|
| 33 |
+
from inference.inference_pipeline import InferencePipeline
|
| 34 |
+
from config import AudioConfig, default_audio_config
|
| 35 |
+
|
| 36 |
+
# Module-level logger for this routes module.
logger = logging.getLogger(__name__)

# Create router
router = APIRouter(prefix="/diagnose", tags=["diagnosis"])

# In-memory session storage (in production, use Redis or database)
# NOTE(review): unbounded dict — grows for the lifetime of the process.
sessions: Dict[str, BatchDiagnosisResponse] = {}

# Global instances (will be injected)
# Populated by initialize_routes(); endpoints treat a None pipeline
# as "service unavailable" (503).
inference_pipeline: Optional[InferencePipeline] = None
phoneme_mapper: Optional[PhonemeMapper] = None
error_mapper: Optional[ErrorMapper] = None
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def initialize_routes(
    pipeline: InferencePipeline,
    mapper: Optional[PhonemeMapper] = None,
    error_mapper_instance: Optional[ErrorMapper] = None
):
    """
    Initialize routes with dependencies.

    When `mapper` or `error_mapper_instance` is supplied it is used
    directly; otherwise a default instance is constructed. Construction
    failures are logged and leave the corresponding dependency disabled
    (set to None) so endpoints can degrade gracefully.

    Args:
        pipeline: InferencePipeline instance
        mapper: Optional PhonemeMapper instance
        error_mapper_instance: Optional ErrorMapper instance
    """
    global inference_pipeline, phoneme_mapper, error_mapper

    inference_pipeline = pipeline

    if mapper is not None:
        # BUG FIX: previously an explicitly supplied mapper was ignored
        # (only the fallback branch assigned the global).
        phoneme_mapper = mapper
    else:
        try:
            phoneme_mapper = PhonemeMapper(
                frame_duration_ms=default_audio_config.chunk_duration_ms,
                sample_rate=default_audio_config.sample_rate
            )
            logger.info("✅ PhonemeMapper initialized")
        except Exception as e:
            logger.warning(f"⚠️ PhonemeMapper not available: {e}")
            phoneme_mapper = None

    if error_mapper_instance is not None:
        # BUG FIX: previously a supplied ErrorMapper was ignored as well.
        error_mapper = error_mapper_instance
    else:
        try:
            error_mapper = ErrorMapper()
            logger.info("✅ ErrorMapper initialized")
        except Exception as e:
            logger.error(f"❌ ErrorMapper failed to initialize: {e}")
            error_mapper = None
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
@router.post("/file", response_model=BatchDiagnosisResponse)
async def diagnose_file(
    audio: UploadFile = File(...),
    text: Optional[str] = Query(None, description="Expected text/transcript for phoneme mapping"),
    session_id: Optional[str] = Query(None, description="Optional session ID")
):
    """
    Analyze audio file for speech pathology errors.

    Performs complete phoneme-level analysis:
    - Extracts Wav2Vec2 features
    - Classifies fluency and articulation per frame
    - Maps phonemes to frames
    - Detects errors and generates therapy recommendations

    Args:
        audio: Audio file (WAV, MP3, etc.)
        text: Optional expected text for phoneme mapping
        session_id: Optional session ID (auto-generated if not provided)

    Returns:
        BatchDiagnosisResponse with detailed error analysis

    Raises:
        HTTPException: 503 if the inference pipeline is not loaded,
            500 if diagnosis fails for any other reason.
    """
    if inference_pipeline is None:
        raise HTTPException(status_code=503, detail="Inference pipeline not loaded")

    start_time = time.time()

    # Generate session ID
    if not session_id:
        session_id = str(uuid.uuid4())

    # Save uploaded file
    temp_file = None
    try:
        # SECURITY FIX: keep only the basename of the client-supplied
        # filename so a crafted name (e.g. "../../x") cannot escape the
        # temp directory (path traversal).
        safe_name = os.path.basename(audio.filename or "upload")
        temp_dir = tempfile.gettempdir()
        os.makedirs(temp_dir, exist_ok=True)
        temp_file = os.path.join(temp_dir, f"diagnosis_{session_id}_{safe_name}")

        # Save file
        content = await audio.read()
        with open(temp_file, "wb") as f:
            f.write(content)

        file_size_mb = len(content) / 1024 / 1024
        logger.info(f"📂 Saved file: {temp_file} ({file_size_mb:.2f} MB)")

        # Run inference
        logger.info("🔄 Running phone-level inference...")
        result = inference_pipeline.predict_phone_level(
            temp_file,
            return_timestamps=True
        )

        # Map phonemes to frames if text provided
        frame_phonemes = []
        if text and phoneme_mapper:
            try:
                frame_phonemes = phoneme_mapper.map_text_to_frames(
                    text,
                    num_frames=result.num_frames,
                    audio_duration=result.duration
                )
                logger.info(f"✅ Mapped {len(frame_phonemes)} phonemes to frames")
            except Exception as e:
                logger.warning(f"⚠️ Phoneme mapping failed: {e}, using empty phonemes")
                frame_phonemes = [''] * result.num_frames
        else:
            frame_phonemes = [''] * result.num_frames
            if not text:
                logger.warning("⚠️ No text provided, phoneme mapping skipped")

        # Process frame predictions with error mapping
        frame_diagnoses = []
        error_reports = []
        error_count = 0

        for i, frame_pred in enumerate(result.frame_predictions):
            # Get phoneme for this frame (empty string when not mapped)
            phoneme = frame_phonemes[i] if i < len(frame_phonemes) else ''

            # Combine fluency and articulation into the 8-class system:
            # class = articulation_class + (4 if stutter else 0)
            class_id = frame_pred.articulation_class
            if frame_pred.fluency_label == 'stutter':
                class_id += 4  # Add 4 for stutter classes (4-7)

            # Map the combined class to a detailed error description
            error_detail = None
            if error_mapper:
                try:
                    error_detail_obj = error_mapper.map_classifier_output(
                        class_id=class_id,
                        confidence=frame_pred.confidence,
                        phoneme=phoneme if phoneme else 'unknown',
                        fluency_label=frame_pred.fluency_label
                    )

                    # Record which frame produced this error
                    error_detail_obj.frame_indices = [i]

                    # Only non-NORMAL classifications become reported errors
                    if error_detail_obj.error_type != ErrorType.NORMAL:
                        error_detail = ErrorDetailSchema(
                            phoneme=error_detail_obj.phoneme,
                            error_type=error_detail_obj.error_type.value,
                            wrong_sound=error_detail_obj.wrong_sound,
                            severity=error_detail_obj.severity,
                            confidence=error_detail_obj.confidence,
                            therapy=error_detail_obj.therapy,
                            frame_indices=[i]
                        )
                        error_count += 1

                        severity_level = error_mapper.get_severity_level(error_detail_obj.severity)
                        error_reports.append(ErrorReport(
                            frame_id=i,
                            timestamp=frame_pred.time,
                            phoneme=error_detail_obj.phoneme,
                            error=error_detail,
                            severity_level=severity_level.value
                        ))
                except Exception as e:
                    logger.warning(f"Error mapping failed for frame {i}: {e}")

            # Derive a human-readable severity level for the frame.
            # error_detail is only ever set inside the `if error_mapper:`
            # branch above, so the mapper can be used unconditionally here
            # (the original redundant ternary was removed).
            severity_level_str = "none"
            if error_detail is not None:
                severity_level_str = error_mapper.get_severity_level(error_detail.severity).value

            frame_diagnoses.append(FrameDiagnosis(
                frame_id=i,
                timestamp=frame_pred.time,
                phoneme=phoneme if phoneme else 'unknown',
                fluency=FluencyInfo(
                    label=frame_pred.fluency_label,
                    # fluency_prob is P(stutter); flip it for 'normal' frames
                    confidence=frame_pred.fluency_prob if frame_pred.fluency_label == 'stutter' else (1.0 - frame_pred.fluency_prob)
                ),
                articulation=ArticulationInfo(
                    label=frame_pred.articulation_label,
                    confidence=frame_pred.confidence,
                    class_id=frame_pred.articulation_class
                ),
                error=error_detail,
                severity_level=severity_level_str,
                confidence=frame_pred.confidence
            ))

        # Calculate summary metrics
        fluency_scores = [1.0 - fp.fluency_prob for fp in result.frame_predictions]  # Convert stutter prob to fluency
        avg_fluency = sum(fluency_scores) / len(fluency_scores) if fluency_scores else 0.0

        # Articulation score: percentage of normal frames
        normal_frames = sum(1 for fp in result.frame_predictions if fp.articulation_class == 0)
        articulation_score = normal_frames / result.num_frames if result.num_frames > 0 else 0.0

        summary = SummaryMetrics(
            fluency_score=avg_fluency,
            fluency_percentage=avg_fluency * 100.0,
            articulation_score=articulation_score,
            error_count=error_count,
            error_rate=error_count / result.num_frames if result.num_frames > 0 else 0.0
        )

        # Generate therapy plan (unique therapy recommendations, first-seen order)
        therapy_plan = []
        if error_mapper:
            seen_therapies = set()
            for error_report in error_reports:
                if error_report.error.therapy and error_report.error.therapy not in seen_therapies:
                    therapy_plan.append(error_report.error.therapy)
                    seen_therapies.add(error_report.error.therapy)

        processing_time_ms = (time.time() - start_time) * 1000

        # Create response
        response = BatchDiagnosisResponse(
            session_id=session_id,
            filename=audio.filename or "unknown",
            duration=result.duration,
            total_frames=result.num_frames,
            error_count=error_count,
            errors=error_reports,
            frame_diagnoses=frame_diagnoses,
            summary=summary,
            therapy_plan=therapy_plan,
            processing_time_ms=processing_time_ms,
            created_at=datetime.now()
        )

        # Store in sessions
        sessions[session_id] = response

        logger.info(f"✅ Diagnosis complete: {error_count} errors, {processing_time_ms:.0f}ms")

        return response

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"❌ Diagnosis failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Diagnosis failed: {str(e)}")

    finally:
        # Cleanup temp file
        if temp_file and os.path.exists(temp_file):
            try:
                os.remove(temp_file)
                logger.debug(f"🧹 Cleaned up: {temp_file}")
            except Exception as e:
                logger.warning(f"Could not clean up {temp_file}: {e}")
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
@router.get("/results/{session_id}", response_model=BatchDiagnosisResponse)
async def get_results(session_id: str):
    """
    Retrieve a previously computed diagnosis for a session.

    Args:
        session_id: Session identifier

    Returns:
        BatchDiagnosisResponse

    Raises:
        HTTPException: 404 when the session is unknown.
    """
    cached = sessions.get(session_id)
    if cached is None:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")
    return cached
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
@router.get("/results", response_model=SessionListResponse)
async def list_results(limit: int = Query(10, ge=1, le=100)):
    """
    List cached diagnosis sessions (up to `limit` entries).

    Args:
        limit: Maximum number of sessions to return

    Returns:
        SessionListResponse with session metadata
    """
    selected = list(sessions.items())[:limit]
    session_list = [
        {
            "session_id": sid,
            "filename": response.filename,
            "duration": response.duration,
            "error_count": response.error_count,
            "created_at": response.created_at.isoformat(),
            "processing_time_ms": response.processing_time_ms,
        }
        for sid, response in selected
    ]

    return SessionListResponse(
        sessions=session_list,
        total=len(sessions)
    )
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
@router.delete("/results/{session_id}")
async def delete_results(session_id: str):
    """
    Remove a cached diagnosis session.

    Args:
        session_id: Session identifier

    Returns:
        Success message

    Raises:
        HTTPException: 404 when the session is unknown.
    """
    removed = sessions.pop(session_id, None)
    if removed is None:
        raise HTTPException(status_code=404, detail=f"Session {session_id} not found")

    logger.info(f"🗑️ Deleted session: {session_id}")

    return {"status": "success", "message": f"Session {session_id} deleted"}
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
@router.get("/health", response_model=HealthResponse)
async def health_check():
    """
    Health check endpoint.

    Reports service status, whether the inference pipeline is loaded,
    and the uptime measured from the first health check call.

    Returns:
        HealthResponse with service status
    """
    # Record the timestamp of the first health check exactly once; the
    # redundant function-local `import time` was removed (time is already
    # imported at module level) and the getattr/hasattr dance simplified.
    if not hasattr(health_check, '_start_time'):
        health_check._start_time = time.time()

    uptime = time.time() - health_check._start_time

    return HealthResponse(
        status="healthy" if inference_pipeline is not None else "degraded",
        version="2.0.0",
        model_loaded=inference_pipeline is not None,
        uptime_seconds=uptime
    )
|
| 389 |
+
|
api/schemas.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pydantic schemas for Speech Pathology Diagnosis API.
|
| 3 |
+
|
| 4 |
+
This module defines request and response models for REST API and WebSocket endpoints.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import List, Optional, Dict, Any
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class FluencyInfo(BaseModel):
    """Fluency classification information (per frame)."""
    label: str = Field(..., description="Fluency label: 'normal' or 'stutter'")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence score (0-1)")


class ArticulationInfo(BaseModel):
    """Articulation classification information (per frame)."""
    label: str = Field(..., description="Articulation label: 'normal', 'substitution', 'omission', 'distortion'")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence score (0-1)")
    # class_id mirrors `label` numerically for downstream consumers
    class_id: int = Field(..., ge=0, le=3, description="Class ID: 0=normal, 1=substitution, 2=omission, 3=distortion")


class ErrorDetailSchema(BaseModel):
    """Error detail schema for API responses."""
    phoneme: str = Field(..., description="Expected phoneme symbol")
    error_type: str = Field(..., description="Error type: normal, substitution, omission, distortion")
    wrong_sound: Optional[str] = Field(None, description="For substitutions, the incorrect phoneme produced")
    severity: float = Field(..., ge=0.0, le=1.0, description="Severity score (0-1)")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Model confidence (0-1)")
    therapy: str = Field(..., description="Therapy recommendation")
    frame_indices: List[int] = Field(default_factory=list, description="Frame indices where error occurs")


class FrameDiagnosis(BaseModel):
    """Diagnosis for a single frame (combines fluency + articulation)."""
    frame_id: int = Field(..., description="Frame index")
    timestamp: float = Field(..., ge=0.0, description="Timestamp in seconds")
    phoneme: str = Field(..., description="Expected phoneme for this frame")
    fluency: FluencyInfo = Field(..., description="Fluency classification")
    articulation: ArticulationInfo = Field(..., description="Articulation classification")
    # None when the frame is classified as normal
    error: Optional[ErrorDetailSchema] = Field(None, description="Error details if error detected")
    severity_level: str = Field(..., description="Severity level: none, low, medium, high")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Overall confidence")


class ErrorReport(BaseModel):
    """Detailed error report for a frame (only emitted for non-normal frames)."""
    frame_id: int = Field(..., description="Frame index")
    timestamp: float = Field(..., ge=0.0, description="Timestamp in seconds")
    phoneme: str = Field(..., description="Expected phoneme")
    error: ErrorDetailSchema = Field(..., description="Error details")
    severity_level: str = Field(..., description="Severity level: none, low, medium, high")


class SummaryMetrics(BaseModel):
    """Summary metrics aggregated over all frames of one analysis."""
    fluency_score: float = Field(..., ge=0.0, le=1.0, description="Average fluency score (0=stutter, 1=normal)")
    fluency_percentage: float = Field(..., ge=0.0, le=100.0, description="Fluency percentage")
    articulation_score: float = Field(..., ge=0.0, le=1.0, description="Average articulation correctness")
    error_count: int = Field(..., ge=0, description="Total number of errors detected")
    error_rate: float = Field(..., ge=0.0, le=1.0, description="Error rate (errors/total_frames)")


class BatchDiagnosisResponse(BaseModel):
    """Response for batch file diagnosis (the /diagnose/file endpoint)."""
    session_id: str = Field(..., description="Session identifier")
    filename: str = Field(..., description="Processed filename")
    duration: float = Field(..., ge=0.0, description="Audio duration in seconds")
    total_frames: int = Field(..., ge=0, description="Total number of frames analyzed")
    error_count: int = Field(..., ge=0, description="Number of errors detected")
    errors: List[ErrorReport] = Field(default_factory=list, description="List of error reports")
    frame_diagnoses: List[FrameDiagnosis] = Field(default_factory=list, description="All frame diagnoses")
    summary: SummaryMetrics = Field(..., description="Summary metrics")
    therapy_plan: List[str] = Field(default_factory=list, description="Therapy recommendations")
    processing_time_ms: float = Field(..., ge=0.0, description="Processing time in milliseconds")
    created_at: datetime = Field(default_factory=datetime.now, description="Analysis timestamp")


class StreamingDiagnosisRequest(BaseModel):
    """Request for streaming diagnosis."""
    audio_chunk: bytes = Field(..., description="Audio chunk data (320 samples for 20ms @ 16kHz)")
    sample_rate: int = Field(16000, description="Sample rate in Hz")
    session_id: str = Field(..., description="Session identifier")
    frame_index: Optional[int] = Field(None, description="Frame index for tracking")


class StreamingDiagnosisResponse(BaseModel):
    """Response for streaming diagnosis (single frame)."""
    session_id: str = Field(..., description="Session identifier")
    frame_id: int = Field(..., description="Frame index")
    timestamp: float = Field(..., ge=0.0, description="Timestamp in seconds")
    phoneme: str = Field(..., description="Expected phoneme")
    fluency: FluencyInfo = Field(..., description="Fluency classification")
    articulation: ArticulationInfo = Field(..., description="Articulation classification")
    error: Optional[ErrorDetailSchema] = Field(None, description="Error details if error detected")
    severity_level: str = Field(..., description="Severity level")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Overall confidence")
    latency_ms: float = Field(..., ge=0.0, description="Processing latency in milliseconds")


class SessionListResponse(BaseModel):
    """Response for listing sessions."""
    sessions: List[Dict[str, Any]] = Field(..., description="List of session metadata")
    total: int = Field(..., ge=0, description="Total number of sessions")


class HealthResponse(BaseModel):
    """Health check response."""
    status: str = Field(..., description="Service status")
    version: str = Field(..., description="API version")
    model_loaded: bool = Field(..., description="Whether model is loaded")
    uptime_seconds: float = Field(..., ge=0.0, description="Service uptime in seconds")
|
| 115 |
+
|
api/streaming.py
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
WebSocket streaming for real-time speech pathology diagnosis.
|
| 3 |
+
|
| 4 |
+
This module provides WebSocket endpoint for streaming audio analysis
|
| 5 |
+
with <50ms latency per frame requirement.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import time
|
| 10 |
+
import uuid
|
| 11 |
+
import numpy as np
|
| 12 |
+
from typing import Optional, Dict
|
| 13 |
+
from collections import deque
|
| 14 |
+
from datetime import datetime
|
| 15 |
+
|
| 16 |
+
from fastapi import WebSocket, WebSocketDisconnect, HTTPException
|
| 17 |
+
|
| 18 |
+
from api.schemas import StreamingDiagnosisResponse, FluencyInfo, ArticulationInfo, ErrorDetailSchema
|
| 19 |
+
from models.phoneme_mapper import PhonemeMapper
|
| 20 |
+
from models.error_taxonomy import ErrorMapper, ErrorType
|
| 21 |
+
from inference.inference_pipeline import InferencePipeline
|
| 22 |
+
from config import AudioConfig, default_audio_config
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class StreamingBuffer:
    """
    Sliding-window buffer for streaming audio.

    Accumulates incoming samples and exposes fixed-size analysis
    windows, advancing by a configurable hop between frames.
    """

    def __init__(self, window_size_samples: int, hop_size_samples: int):
        """
        Initialize streaming buffer.

        Args:
            window_size_samples: Size of analysis window in samples
            hop_size_samples: Hop size between windows in samples
        """
        self.window_size_samples = window_size_samples
        self.hop_size_samples = hop_size_samples
        # Bounded deque: old samples fall off automatically once the
        # buffer exceeds one window plus one hop.
        self.buffer = deque(maxlen=window_size_samples + hop_size_samples)
        self.frame_index = 0

        logger.debug(f"StreamingBuffer initialized: window={window_size_samples}, hop={hop_size_samples}")

    def add_chunk(self, audio_chunk: np.ndarray) -> bool:
        """
        Append incoming samples to the buffer.

        Args:
            audio_chunk: Audio samples to add

        Returns:
            True if buffer has enough data for a frame, False otherwise
        """
        self.buffer.extend(audio_chunk)
        ready = len(self.buffer) >= self.window_size_samples
        return ready

    def get_frame(self) -> Optional[np.ndarray]:
        """
        Return the most recent full analysis window, if available.

        Returns:
            Audio frame array if ready, None otherwise
        """
        window = self.window_size_samples
        if len(self.buffer) >= window:
            # Take the newest window-sized slice of the buffer
            return np.array(list(self.buffer)[-window:])
        return None

    def slide(self):
        """Advance the buffer position by one hop."""
        # Drop the oldest hop_size_samples (bounded by buffer length;
        # the min() already guarantees popleft() never underflows).
        drop = min(self.hop_size_samples, len(self.buffer))
        for _ in range(drop):
            self.buffer.popleft()
        self.frame_index += 1
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# Global instances (will be injected)
inference_pipeline: Optional[InferencePipeline] = None
phoneme_mapper: Optional[PhonemeMapper] = None
error_mapper: Optional[ErrorMapper] = None

# Active streaming sessions
streaming_sessions: Dict[str, Dict] = {}


def initialize_streaming(
    pipeline: InferencePipeline,
    mapper: Optional[PhonemeMapper] = None,
    error_mapper_instance: Optional[ErrorMapper] = None
):
    """
    Initialize streaming with dependencies.

    Args:
        pipeline: InferencePipeline instance
        mapper: Optional PhonemeMapper instance. When omitted, a default
            mapper is built from the audio configuration.
        error_mapper_instance: Optional ErrorMapper instance. When omitted,
            a default ErrorMapper is constructed.
    """
    global inference_pipeline, phoneme_mapper, error_mapper

    inference_pipeline = pipeline

    # BUG FIX: previously a caller-supplied mapper / error mapper was
    # silently ignored — only the `is None` branches assigned the globals.
    # Honor injected instances first; fall back to defaults otherwise.
    if mapper is not None:
        phoneme_mapper = mapper
    else:
        try:
            phoneme_mapper = PhonemeMapper(
                frame_duration_ms=default_audio_config.chunk_duration_ms,
                sample_rate=default_audio_config.sample_rate
            )
            logger.info("✅ PhonemeMapper initialized for streaming")
        except Exception as e:
            # PhonemeMapper is optional for streaming; degrade gracefully.
            logger.warning(f"⚠️ PhonemeMapper not available: {e}")
            phoneme_mapper = None

    if error_mapper_instance is not None:
        error_mapper = error_mapper_instance
    else:
        try:
            error_mapper = ErrorMapper()
            logger.info("✅ ErrorMapper initialized for streaming")
        except Exception as e:
            logger.error(f"❌ ErrorMapper failed to initialize: {e}")
            error_mapper = None
async def handle_streaming_websocket(websocket: WebSocket, session_id: Optional[str] = None):
    """
    Handle WebSocket connection for streaming diagnosis.

    Receives raw audio chunks (assumed 16-bit PCM, mono — TODO confirm the
    client contract), maintains a sliding-window buffer, runs phone-level
    inference on every complete window, and sends one JSON diagnosis
    message per analyzed frame.

    Args:
        websocket: WebSocket connection
        session_id: Optional session ID (auto-generated if not provided)
    """
    if inference_pipeline is None:
        await websocket.close(code=1003, reason="Inference pipeline not loaded")
        return

    # Generate session ID
    if not session_id:
        session_id = str(uuid.uuid4())

    # Accept connection
    await websocket.accept()
    logger.info(f"🔌 WebSocket connected: session_id={session_id}")

    # Initialize sliding-window buffer from the inference configuration.
    sample_rate = inference_pipeline.audio_config.sample_rate
    window_size_samples = int(
        inference_pipeline.inference_config.window_size_ms * sample_rate / 1000
    )
    hop_size_samples = int(
        inference_pipeline.inference_config.hop_size_ms * sample_rate / 1000
    )
    buffer = StreamingBuffer(window_size_samples, hop_size_samples)

    # Session metadata
    streaming_sessions[session_id] = {
        "session_id": session_id,
        "connected_at": datetime.now(),
        "frame_count": 0,
        "total_latency_ms": 0.0
    }

    frame_index = 0

    try:
        while True:
            try:
                # Receive audio chunk
                data = await websocket.receive_bytes()

                # Convert bytes to numpy array.
                # Assuming 16-bit PCM, mono, 16kHz
                audio_chunk = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0
                buffer.add_chunk(audio_chunk)

                # FIX: drain every complete frame and fetch each frame only
                # once. The previous code called get_frame() twice per chunk
                # (each call copies the whole buffer) and analyzed at most one
                # frame per received chunk, so chunks larger than one hop made
                # the bounded buffer silently drop audio.
                while (frame := buffer.get_frame()) is not None:
                    frame_start_time = time.time()

                    try:
                        result = inference_pipeline.predict_phone_level(
                            frame,
                            return_timestamps=False
                        )

                        if result.frame_predictions:
                            frame_pred = result.frame_predictions[0]  # Single frame result

                            # Stuttered frames map onto the upper class range.
                            class_id = frame_pred.articulation_class
                            if frame_pred.fluency_label == 'stutter':
                                class_id += 4

                            error_detail = None
                            phoneme = ''  # Streaming doesn't have text input

                            if error_mapper:
                                try:
                                    error_detail_obj = error_mapper.map_classifier_output(
                                        class_id=class_id,
                                        confidence=frame_pred.confidence,
                                        phoneme=phoneme,
                                        fluency_label=frame_pred.fluency_label
                                    )

                                    if error_detail_obj.error_type != ErrorType.NORMAL:
                                        error_detail = ErrorDetailSchema(
                                            phoneme=error_detail_obj.phoneme,
                                            error_type=error_detail_obj.error_type.value,
                                            wrong_sound=error_detail_obj.wrong_sound,
                                            severity=error_detail_obj.severity,
                                            confidence=error_detail_obj.confidence,
                                            therapy=error_detail_obj.therapy,
                                            frame_indices=[frame_index]
                                        )
                                except Exception as e:
                                    logger.warning(f"Error mapping failed: {e}")

                            # Calculate latency
                            latency_ms = (time.time() - frame_start_time) * 1000

                            # Get severity level
                            severity_level = "none"
                            if error_detail and error_mapper:
                                severity_level = error_mapper.get_severity_level(error_detail.severity).value

                            # Create response
                            response = StreamingDiagnosisResponse(
                                session_id=session_id,
                                frame_id=frame_index,
                                timestamp=frame_index * (inference_pipeline.inference_config.hop_size_ms / 1000.0),
                                phoneme=phoneme,
                                fluency=FluencyInfo(
                                    label=frame_pred.fluency_label,
                                    confidence=frame_pred.fluency_prob if frame_pred.fluency_label == 'stutter' else (1.0 - frame_pred.fluency_prob)
                                ),
                                articulation=ArticulationInfo(
                                    label=frame_pred.articulation_label,
                                    confidence=frame_pred.confidence,
                                    class_id=frame_pred.articulation_class
                                ),
                                error=error_detail,
                                severity_level=severity_level,
                                confidence=frame_pred.confidence,
                                latency_ms=latency_ms
                            )

                            # Send response
                            await websocket.send_json(response.model_dump())

                            # Update session stats
                            stats = streaming_sessions[session_id]
                            stats["frame_count"] += 1
                            stats["total_latency_ms"] += latency_ms

                            # Soft real-time budget check
                            if latency_ms > 50.0:
                                logger.warning(f"⚠️ Latency exceeded 50ms: {latency_ms:.1f}ms")

                    except Exception as e:
                        logger.error(f"❌ Inference failed: {e}", exc_info=True)
                        await websocket.send_json({
                            "error": f"Inference failed: {str(e)}",
                            "frame_id": frame_index
                        })

                    # FIX: always advance the window, even when inference fails
                    # or yields no predictions; otherwise the same frame would
                    # be retried forever and the drain loop could not terminate.
                    buffer.slide()
                    frame_index += 1

            except WebSocketDisconnect:
                # FIX: WebSocketDisconnect subclasses Exception, so the broad
                # handler below used to swallow it and then call send_json on
                # a closed socket. Re-raise so the outer handler logs cleanly.
                raise
            except Exception as e:
                logger.error(f"❌ Error processing chunk: {e}", exc_info=True)
                await websocket.send_json({
                    "error": f"Processing failed: {str(e)}",
                    "frame_id": frame_index
                })

    except WebSocketDisconnect:
        logger.info(f"🔌 WebSocket disconnected: session_id={session_id}")
    except Exception as e:
        logger.error(f"❌ WebSocket error: {e}", exc_info=True)
    finally:
        # Cleanup session
        if session_id in streaming_sessions:
            session_data = streaming_sessions[session_id]
            avg_latency = session_data["total_latency_ms"] / session_data["frame_count"] if session_data["frame_count"] > 0 else 0.0
            logger.info(f"📊 Session {session_id} stats: {session_data['frame_count']} frames, "
                        f"avg_latency={avg_latency:.1f}ms")
            del streaming_sessions[session_id]
app.py
CHANGED
|
@@ -173,33 +173,65 @@ async def diagnose_speech(
|
|
| 173 |
|
| 174 |
# Run inference
|
| 175 |
logger.info("🔄 Running inference pipeline...")
|
| 176 |
-
|
|
|
|
| 177 |
temp_file,
|
| 178 |
-
return_timestamps=True
|
| 179 |
-
apply_smoothing=True
|
| 180 |
)
|
| 181 |
|
| 182 |
processing_time_ms = (time.time() - start_time) * 1000
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
# Format response
|
| 185 |
response = {
|
| 186 |
"status": "success",
|
| 187 |
"fluency_metrics": {
|
| 188 |
-
"mean_fluency":
|
| 189 |
-
"fluency_percentage":
|
| 190 |
-
"fluent_frames_ratio":
|
| 191 |
-
"
|
| 192 |
-
"
|
| 193 |
-
"max": result.fluency_metrics.get("max", 0.0),
|
| 194 |
-
"median": result.fluency_metrics.get("median", 0.0)
|
| 195 |
},
|
| 196 |
"articulation_results": {
|
| 197 |
-
"total_frames":
|
| 198 |
-
"frame_duration_ms":
|
| 199 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
},
|
| 201 |
-
"confidence":
|
| 202 |
-
"confidence_percentage":
|
| 203 |
"processing_time_ms": processing_time_ms
|
| 204 |
}
|
| 205 |
|
|
|
|
| 173 |
|
| 174 |
# Run inference
|
| 175 |
logger.info("🔄 Running inference pipeline...")
|
| 176 |
+
# Use new phone-level prediction
|
| 177 |
+
result = inference_pipeline.predict_phone_level(
|
| 178 |
temp_file,
|
| 179 |
+
return_timestamps=True
|
|
|
|
| 180 |
)
|
| 181 |
|
| 182 |
processing_time_ms = (time.time() - start_time) * 1000
|
| 183 |
|
| 184 |
+
# Extract metrics from new PhoneLevelResult format
|
| 185 |
+
aggregate = result.aggregate
|
| 186 |
+
mean_fluency_stutter = aggregate.get("fluency_score", 0.0)
|
| 187 |
+
fluency_percentage = (1.0 - mean_fluency_stutter) * 100 # Convert stutter prob to fluency percentage
|
| 188 |
+
|
| 189 |
+
# Count fluent frames
|
| 190 |
+
fluent_frames = sum(1 for fp in result.frame_predictions if fp.fluency_label == 'normal')
|
| 191 |
+
fluent_frames_ratio = fluent_frames / result.num_frames if result.num_frames > 0 else 0.0
|
| 192 |
+
|
| 193 |
+
# Extract articulation class distribution
|
| 194 |
+
articulation_class_counts = {}
|
| 195 |
+
for fp in result.frame_predictions:
|
| 196 |
+
label = fp.articulation_label
|
| 197 |
+
articulation_class_counts[label] = articulation_class_counts.get(label, 0) + 1
|
| 198 |
+
|
| 199 |
+
# Get dominant articulation class
|
| 200 |
+
dominant_articulation = aggregate.get("articulation_label", "normal")
|
| 201 |
+
|
| 202 |
+
# Calculate average confidence
|
| 203 |
+
avg_confidence = sum(fp.confidence for fp in result.frame_predictions) / result.num_frames if result.num_frames > 0 else 0.0
|
| 204 |
+
|
| 205 |
# Format response
|
| 206 |
response = {
|
| 207 |
"status": "success",
|
| 208 |
"fluency_metrics": {
|
| 209 |
+
"mean_fluency": fluency_percentage / 100.0,
|
| 210 |
+
"fluency_percentage": fluency_percentage,
|
| 211 |
+
"fluent_frames_ratio": fluent_frames_ratio,
|
| 212 |
+
"fluent_frames_percentage": fluent_frames_ratio * 100,
|
| 213 |
+
"stutter_probability": mean_fluency_stutter
|
|
|
|
|
|
|
| 214 |
},
|
| 215 |
"articulation_results": {
|
| 216 |
+
"total_frames": result.num_frames,
|
| 217 |
+
"frame_duration_ms": int(inference_pipeline.inference_config.hop_size_ms),
|
| 218 |
+
"dominant_class": aggregate.get("articulation_class", 0),
|
| 219 |
+
"dominant_label": dominant_articulation,
|
| 220 |
+
"class_distribution": articulation_class_counts,
|
| 221 |
+
"frame_predictions": [
|
| 222 |
+
{
|
| 223 |
+
"time": fp.time,
|
| 224 |
+
"fluency_prob": fp.fluency_prob,
|
| 225 |
+
"fluency_label": fp.fluency_label,
|
| 226 |
+
"articulation_class": fp.articulation_class,
|
| 227 |
+
"articulation_label": fp.articulation_label,
|
| 228 |
+
"confidence": fp.confidence
|
| 229 |
+
}
|
| 230 |
+
for fp in result.frame_predictions
|
| 231 |
+
]
|
| 232 |
},
|
| 233 |
+
"confidence": avg_confidence,
|
| 234 |
+
"confidence_percentage": avg_confidence * 100,
|
| 235 |
"processing_time_ms": processing_time_ms
|
| 236 |
}
|
| 237 |
|
config.py
CHANGED
|
@@ -92,12 +92,24 @@ class InferenceConfig:
|
|
| 92 |
Reduces jitter in frame-level predictions.
|
| 93 |
batch_size: Number of chunks to process in parallel during inference.
|
| 94 |
Higher values = faster but more memory usage.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
"""
|
| 96 |
fluency_threshold: float = 0.5
|
| 97 |
articulation_threshold: float = 0.6
|
| 98 |
min_chunk_duration_ms: int = 10
|
| 99 |
smoothing_window: int = 5
|
| 100 |
batch_size: int = 32
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
|
| 103 |
@dataclass
|
|
|
|
| 92 |
Reduces jitter in frame-level predictions.
|
| 93 |
batch_size: Number of chunks to process in parallel during inference.
|
| 94 |
Higher values = faster but more memory usage.
|
| 95 |
+
window_size_ms: Size of sliding window in milliseconds (default: 1000ms = 1 second).
|
| 96 |
+
Minimum for Wav2Vec2 stability.
|
| 97 |
+
hop_size_ms: Hop size between windows in milliseconds (default: 10ms).
|
| 98 |
+
Controls temporal resolution (100 frames/second).
|
| 99 |
+
frame_rate: Frames per second (calculated from hop_size_ms).
|
| 100 |
+
minimum_audio_length: Minimum audio length in seconds (must be >= window_size_ms).
|
| 101 |
+
phone_level_strategy: Strategy for phone-level analysis ("sliding_window").
|
| 102 |
"""
|
| 103 |
fluency_threshold: float = 0.5
|
| 104 |
articulation_threshold: float = 0.6
|
| 105 |
min_chunk_duration_ms: int = 10
|
| 106 |
smoothing_window: int = 5
|
| 107 |
batch_size: int = 32
|
| 108 |
+
window_size_ms: int = 1000 # 1 second minimum for Wav2Vec2
|
| 109 |
+
hop_size_ms: int = 10 # 10ms for phone-level resolution
|
| 110 |
+
frame_rate: float = 100.0 # 100 frames per second (1/hop_size_ms)
|
| 111 |
+
minimum_audio_length: float = 1.0 # Must be >= window_size_ms
|
| 112 |
+
phone_level_strategy: str = "sliding_window"
|
| 113 |
|
| 114 |
|
| 115 |
@dataclass
|
data/therapy_recommendations.json
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"substitutions": {
|
| 3 |
+
"/s/→/θ/": "Lisp - Use tongue tip placement behind upper teeth. Practice /s/ in isolation, then in words. Use mirror feedback to ensure tongue is not protruding.",
|
| 4 |
+
"/s/→/ʃ/": "Sibilant confusion - Practice /s/ vs /sh/ distinction. Focus on tongue position: /s/ has tongue tip up, /sh/ has tongue body back.",
|
| 5 |
+
"/s/→/z/": "Voicing error - Practice voiceless /s/ vs voiced /z/. Place hand on throat to feel vibration difference.",
|
| 6 |
+
"/r/→/w/": "Rhotacism - Practice tongue position: curl tongue back, avoid lip rounding. Start with /r/ in isolation, then CV syllables (ra, re, ri, ro, ru).",
|
| 7 |
+
"/r/→/l/": "Rhotacism - Focus on tongue tip position vs. tongue body placement. /r/ uses tongue body, /l/ uses tongue tip.",
|
| 8 |
+
"/r/→/ɹ/": "Rhotacism variant - Practice standard /r/ production with proper tongue curl.",
|
| 9 |
+
"/l/→/w/": "Liquid substitution - Practice lateral tongue placement with tongue tip up to alveolar ridge.",
|
| 10 |
+
"/l/→/j/": "Liquid substitution - Focus on tongue tip contact for /l/ vs. tongue body for /j/.",
|
| 11 |
+
"/k/→/t/": "Velar to alveolar substitution - Practice back tongue placement for /k/. Use mirror to see tongue position.",
|
| 12 |
+
"/k/→/p/": "Velar to bilabial substitution - Practice velar placement: tongue back, soft palate contact.",
|
| 13 |
+
"/g/→/d/": "Velar to alveolar substitution - Practice voiced velar /g/ with tongue back position.",
|
| 14 |
+
"/g/→/b/": "Velar to bilabial substitution - Practice velar placement for /g/.",
|
| 15 |
+
"/θ/→/f/": "Th-fronting - Practice tongue tip placement between teeth for /θ/. Use mirror to ensure correct position.",
|
| 16 |
+
"/θ/→/s/": "Th-fronting - Practice interdental placement for /θ/ vs. alveolar /s/.",
|
| 17 |
+
"/ð/→/v/": "Voiced th-fronting - Practice interdental placement for /ð/ vs. labiodental /v/.",
|
| 18 |
+
"/ð/→/z/": "Voiced th-fronting - Practice interdental /ð/ vs. alveolar /z/.",
|
| 19 |
+
"/ʃ/→/s/": "Sh-sound confusion - Practice /sh/ with tongue body back vs. /s/ with tongue tip up.",
|
| 20 |
+
"/ʃ/→/tʃ/": "Fricative to affricate - Practice sustained /sh/ vs. stop-release /ch/.",
|
| 21 |
+
"/tʃ/→/ʃ/": "Affricate to fricative - Practice stop component of /ch/ before fricative release.",
|
| 22 |
+
"/tʃ/→/ts/": "Affricate substitution - Practice /ch/ with proper tongue placement and air release.",
|
| 23 |
+
"generic": "Substitution error for {phoneme}. Practice correct articulator placement with mirror feedback. Start in isolation, then syllables, then words."
|
| 24 |
+
},
|
| 25 |
+
"omissions": {
|
| 26 |
+
"/r/": "Practice /r/ in isolation, then in CV syllables (ra, re, ri, ro, ru). Focus on tongue curl and lip position. Use visual cues and mirror feedback.",
|
| 27 |
+
"/l/": "Lateral tongue placement - practice with tongue tip up to alveolar ridge. Start with /l/ in isolation, then blend into words.",
|
| 28 |
+
"/s/": "Practice /s/ with tongue tip placement, use mirror to check position. Start in isolation, then fricative-only words (sss, see, say).",
|
| 29 |
+
"/k/": "Practice velar /k/ with tongue back position. Use mirror to see tongue placement. Start with /k/ in isolation.",
|
| 30 |
+
"/g/": "Practice voiced velar /g/ with tongue back. Start in isolation, then CV syllables.",
|
| 31 |
+
"/t/": "Practice alveolar /t/ with tongue tip contact. Use mirror feedback for placement.",
|
| 32 |
+
"/d/": "Practice voiced alveolar /d/ with tongue tip contact.",
|
| 33 |
+
"/p/": "Practice bilabial /p/ with lip closure. Use mirror to ensure proper closure.",
|
| 34 |
+
"/b/": "Practice voiced bilabial /b/ with lip closure.",
|
| 35 |
+
"/f/": "Practice labiodental /f/ with lower lip to upper teeth contact.",
|
| 36 |
+
"/v/": "Practice voiced labiodental /v/ with lip-teeth contact.",
|
| 37 |
+
"/θ/": "Practice interdental /θ/ with tongue tip between teeth.",
|
| 38 |
+
"/ð/": "Practice voiced interdental /ð/ with tongue tip between teeth.",
|
| 39 |
+
"/ʃ/": "Practice /sh/ with tongue body back and lip rounding.",
|
| 40 |
+
"/tʃ/": "Practice /ch/ with stop then fricative release.",
|
| 41 |
+
"/dʒ/": "Practice /j/ (as in judge) with stop then fricative release.",
|
| 42 |
+
"generic": "Omission error for {phoneme}. Say the sound separately first, then blend into syllables, then words. Use visual cues and mirror feedback."
|
| 43 |
+
},
|
| 44 |
+
"distortions": {
|
| 45 |
+
"/s/": "Sibilant clarity - use mirror feedback, ensure tongue tip is up and air stream is central. Practice sustained /s/ sound.",
|
| 46 |
+
"/z/": "Voiced sibilant clarity - practice with voicing, ensure proper tongue placement.",
|
| 47 |
+
"/ʃ/": "Fricative voicing exercise - practice /sh/ vs /s/ distinction. Focus on tongue body position.",
|
| 48 |
+
"/tʃ/": "Affricate clarity - practice stop component then fricative release. Ensure proper timing.",
|
| 49 |
+
"/r/": "Rhotacism - practice tongue position and lip rounding control. Use mirror to see tongue curl.",
|
| 50 |
+
"/l/": "Lateral clarity - ensure tongue tip is up and air flows over sides of tongue.",
|
| 51 |
+
"/θ/": "Interdental clarity - practice with tongue tip between teeth, ensure air stream is correct.",
|
| 52 |
+
"/ð/": "Voiced interdental clarity - practice with voicing and proper tongue placement.",
|
| 53 |
+
"/k/": "Velar stop clarity - practice with proper tongue back placement and release.",
|
| 54 |
+
"/g/": "Voiced velar stop clarity - practice with voicing and tongue placement.",
|
| 55 |
+
"/t/": "Alveolar stop clarity - practice with tongue tip contact and clean release.",
|
| 56 |
+
"/d/": "Voiced alveolar stop clarity - practice with voicing and proper contact.",
|
| 57 |
+
"generic": "Distortion error for {phoneme}. Use mirror feedback and watch articulator position carefully. Practice in isolation first, then in words."
|
| 58 |
+
}
|
| 59 |
+
}
|
| 60 |
+
|
diagnosis/ai_engine/detect_stuttering.py
CHANGED
|
@@ -30,6 +30,9 @@ from scipy.spatial import ConvexHull
|
|
| 30 |
from scipy.stats import pearsonr
|
| 31 |
from difflib import SequenceMatcher
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
logger = logging.getLogger(__name__)
|
| 34 |
|
| 35 |
# === CONFIGURATION ===
|
|
|
|
| 30 |
from scipy.stats import pearsonr
|
| 31 |
from difflib import SequenceMatcher
|
| 32 |
|
| 33 |
+
import warnings
|
| 34 |
+
warnings.filterwarnings("ignore", message="CUDA requested but not available")
|
| 35 |
+
|
| 36 |
logger = logging.getLogger(__name__)
|
| 37 |
|
| 38 |
# === CONFIGURATION ===
|
inference/inference_pipeline.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
Inference Pipeline for Speech Pathology Diagnosis
|
| 3 |
|
| 4 |
This module provides the inference pipeline for real-time and batch processing
|
| 5 |
-
of audio for fluency and articulation analysis.
|
| 6 |
"""
|
| 7 |
|
| 8 |
import logging
|
|
@@ -13,7 +13,8 @@ import soundfile as sf
|
|
| 13 |
from typing import Dict, List, Optional, Tuple, Union
|
| 14 |
from pathlib import Path
|
| 15 |
import time
|
| 16 |
-
from dataclasses import dataclass
|
|
|
|
| 17 |
|
| 18 |
from models.speech_pathology_model import SpeechPathologyClassifier, load_speech_pathology_model
|
| 19 |
from config import AudioConfig, ModelConfig, InferenceConfig
|
|
@@ -22,60 +23,50 @@ logger = logging.getLogger(__name__)
|
|
| 22 |
|
| 23 |
|
| 24 |
@dataclass
|
| 25 |
-
class
|
| 26 |
"""
|
| 27 |
-
Container for
|
| 28 |
|
| 29 |
Attributes:
|
| 30 |
-
|
|
|
|
|
|
|
| 31 |
articulation_class: Class index (0-3)
|
| 32 |
-
|
| 33 |
-
articulation_probs: Probabilities for all 4 classes
|
| 34 |
confidence: Overall confidence score
|
| 35 |
-
timestamp_ms: Timestamp in milliseconds (for streaming)
|
| 36 |
-
frame_index: Frame index (for streaming)
|
| 37 |
"""
|
| 38 |
-
|
|
|
|
|
|
|
| 39 |
articulation_class: int
|
| 40 |
-
|
| 41 |
-
articulation_probs: List[float]
|
| 42 |
confidence: float
|
| 43 |
-
timestamp_ms: Optional[float] = None
|
| 44 |
-
frame_index: Optional[int] = None
|
| 45 |
|
| 46 |
|
| 47 |
@dataclass
|
| 48 |
-
class
|
| 49 |
"""
|
| 50 |
-
Container for
|
| 51 |
|
| 52 |
Attributes:
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
confidence: Overall confidence
|
| 56 |
-
timestamps: List of timestamps in milliseconds
|
| 57 |
-
frame_duration_ms: Duration of each frame in milliseconds
|
| 58 |
"""
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
frame_duration_ms: float
|
| 64 |
|
| 65 |
|
| 66 |
class InferencePipeline:
|
| 67 |
"""
|
| 68 |
-
Inference pipeline for speech pathology diagnosis.
|
| 69 |
-
|
| 70 |
-
Handles both batch processing (full audio files) and streaming
|
| 71 |
-
(real-time chunk-by-chunk) inference with phone-level granularity.
|
| 72 |
|
| 73 |
-
|
| 74 |
-
-
|
| 75 |
-
-
|
| 76 |
-
-
|
| 77 |
-
-
|
| 78 |
-
- Temporal smoothing of predictions
|
| 79 |
"""
|
| 80 |
|
| 81 |
def __init__(
|
|
@@ -93,9 +84,6 @@ class InferencePipeline:
|
|
| 93 |
audio_config: Audio processing configuration
|
| 94 |
model_config: Model configuration
|
| 95 |
inference_config: Inference configuration
|
| 96 |
-
|
| 97 |
-
Raises:
|
| 98 |
-
RuntimeError: If model cannot be loaded or initialized
|
| 99 |
"""
|
| 100 |
# Load configurations
|
| 101 |
from config import default_audio_config, default_model_config, default_inference_config
|
|
@@ -104,18 +92,17 @@ class InferencePipeline:
|
|
| 104 |
self.model_config = model_config or default_model_config
|
| 105 |
self.inference_config = inference_config or default_inference_config
|
| 106 |
|
| 107 |
-
logger.info("Initializing InferencePipeline...")
|
| 108 |
-
logger.info(f"
|
| 109 |
-
|
| 110 |
-
logger.info(f"
|
| 111 |
-
f"articulation_threshold={self.inference_config.articulation_threshold}")
|
| 112 |
|
| 113 |
# Initialize or use provided model
|
| 114 |
if model is None:
|
| 115 |
logger.info("Loading SpeechPathologyClassifier...")
|
| 116 |
self.model = load_speech_pathology_model(
|
| 117 |
model_name=self.model_config.model_name,
|
| 118 |
-
classifier_hidden_dims=
|
| 119 |
dropout=self.model_config.dropout,
|
| 120 |
device=self.model_config.device,
|
| 121 |
use_fp16=self.model_config.use_fp16
|
|
@@ -127,16 +114,16 @@ class InferencePipeline:
|
|
| 127 |
# Get processor for audio preprocessing
|
| 128 |
self.processor = self.model.processor
|
| 129 |
|
| 130 |
-
# Calculate
|
| 131 |
-
self.
|
| 132 |
-
self.
|
| 133 |
)
|
| 134 |
self.hop_size_samples = int(
|
| 135 |
-
self.
|
| 136 |
)
|
| 137 |
|
| 138 |
-
logger.info(f"
|
| 139 |
-
|
| 140 |
logger.info("✅ InferencePipeline initialized successfully")
|
| 141 |
|
| 142 |
def preprocess_audio(
|
|
@@ -153,335 +140,276 @@ class InferencePipeline:
|
|
| 153 |
|
| 154 |
Returns:
|
| 155 |
Preprocessed audio array normalized to [-1, 1] range
|
| 156 |
-
|
| 157 |
-
Raises:
|
| 158 |
-
ValueError: If audio cannot be loaded or processed
|
| 159 |
"""
|
| 160 |
target_sr = target_sr or self.audio_config.sample_rate
|
| 161 |
|
| 162 |
# Load audio if path provided
|
| 163 |
if isinstance(audio, (str, Path)):
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
raise ValueError(f"Cannot load audio file: {e}") from e
|
| 170 |
else:
|
| 171 |
-
audio_array = audio
|
| 172 |
-
# Resample if needed
|
| 173 |
if len(audio_array.shape) > 1:
|
| 174 |
-
audio_array =
|
| 175 |
|
| 176 |
-
# Normalize to [-1, 1]
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
audio_array = audio_array / max_val
|
| 181 |
-
logger.debug("Normalized audio to [-1, 1] range")
|
| 182 |
|
| 183 |
return audio_array
|
| 184 |
|
| 185 |
-
def
|
| 186 |
self,
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
apply_smoothing: bool = True
|
| 190 |
-
) -> BatchPredictionResult:
|
| 191 |
"""
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
Processes audio in overlapping frames for phone-level analysis.
|
| 195 |
|
| 196 |
Args:
|
| 197 |
-
|
| 198 |
-
return_timestamps: Whether to include timestamps for each frame
|
| 199 |
-
apply_smoothing: Whether to apply temporal smoothing
|
| 200 |
|
| 201 |
Returns:
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
ValueError: If audio cannot be processed
|
| 206 |
-
RuntimeError: If inference fails
|
| 207 |
"""
|
| 208 |
-
|
| 209 |
-
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
|
|
|
|
|
|
| 217 |
|
| 218 |
-
#
|
| 219 |
-
|
| 220 |
-
timestamps = []
|
| 221 |
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
-
|
| 229 |
-
if len(frame) < self.inference_config.min_chunk_duration_ms * \
|
| 230 |
-
self.audio_config.sample_rate / 1000:
|
| 231 |
-
continue
|
| 232 |
|
| 233 |
-
#
|
| 234 |
-
|
|
|
|
|
|
|
| 235 |
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
predictions.append(pred_result)
|
| 243 |
-
if return_timestamps:
|
| 244 |
-
timestamps.append(frame_timestamp_ms)
|
| 245 |
-
frame_idx += 1
|
| 246 |
-
except Exception as e:
|
| 247 |
-
logger.warning(f"Failed to predict frame {frame_idx}: {e}")
|
| 248 |
-
continue
|
| 249 |
-
|
| 250 |
-
if not predictions:
|
| 251 |
-
raise ValueError("No valid frames extracted from audio")
|
| 252 |
-
|
| 253 |
-
logger.info(f"Processed {len(predictions)} frames")
|
| 254 |
-
|
| 255 |
-
# Apply temporal smoothing if requested
|
| 256 |
-
if apply_smoothing and len(predictions) > 1:
|
| 257 |
-
predictions = self._apply_temporal_smoothing(predictions)
|
| 258 |
-
|
| 259 |
-
# Aggregate results
|
| 260 |
-
result = self._aggregate_predictions(predictions, timestamps)
|
| 261 |
-
|
| 262 |
-
elapsed_time = time.time() - start_time
|
| 263 |
-
logger.info(f"Batch prediction completed in {elapsed_time:.2f}s "
|
| 264 |
-
f"({duration_seconds/elapsed_time:.1f}x real-time)")
|
| 265 |
-
|
| 266 |
-
return result
|
| 267 |
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
-
def
|
| 273 |
self,
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
) -> PredictionResult:
|
| 278 |
"""
|
| 279 |
-
Predict fluency and articulation
|
| 280 |
-
|
| 281 |
-
Designed for real-time processing with <200ms latency requirement.
|
| 282 |
|
| 283 |
Args:
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
timestamp_ms: Optional timestamp in milliseconds
|
| 287 |
|
| 288 |
Returns:
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
Raises:
|
| 292 |
-
ValueError: If chunk is invalid
|
| 293 |
-
RuntimeError: If inference fails
|
| 294 |
"""
|
|
|
|
|
|
|
| 295 |
try:
|
| 296 |
-
# Preprocess
|
| 297 |
-
|
|
|
|
| 298 |
|
| 299 |
-
|
| 300 |
-
expected_samples = self.frame_size_samples
|
| 301 |
-
if len(chunk) < expected_samples * 0.5: # Allow some tolerance
|
| 302 |
-
logger.warning(f"Chunk size {len(chunk)} is smaller than expected {expected_samples}")
|
| 303 |
|
| 304 |
-
#
|
| 305 |
-
if
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
-
#
|
| 311 |
-
|
| 312 |
-
chunk,
|
| 313 |
-
frame_index=frame_index,
|
| 314 |
-
timestamp_ms=timestamp_ms
|
| 315 |
-
)
|
| 316 |
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
|
| 351 |
-
#
|
| 352 |
-
|
|
|
|
| 353 |
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
)
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
articulation_class_name=outputs["articulation_class_name"],
|
| 365 |
-
articulation_probs=outputs["articulation_probs"],
|
| 366 |
-
confidence=outputs["confidence"],
|
| 367 |
-
timestamp_ms=timestamp_ms,
|
| 368 |
-
frame_index=frame_index
|
| 369 |
-
)
|
| 370 |
|
| 371 |
-
def
|
| 372 |
self,
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
|
|
|
| 376 |
"""
|
| 377 |
-
|
| 378 |
|
| 379 |
Args:
|
| 380 |
-
|
| 381 |
-
|
|
|
|
| 382 |
|
| 383 |
Returns:
|
| 384 |
-
|
| 385 |
"""
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
if len(predictions) <= window_size:
|
| 389 |
-
return predictions
|
| 390 |
-
|
| 391 |
-
smoothed = []
|
| 392 |
-
for i in range(len(predictions)):
|
| 393 |
-
# Get window indices
|
| 394 |
-
start_idx = max(0, i - window_size // 2)
|
| 395 |
-
end_idx = min(len(predictions), i + window_size // 2 + 1)
|
| 396 |
-
window_preds = predictions[start_idx:end_idx]
|
| 397 |
-
|
| 398 |
-
# Average fluency scores
|
| 399 |
-
avg_fluency = np.mean([p.fluency_score for p in window_preds])
|
| 400 |
-
|
| 401 |
-
# Average articulation probabilities
|
| 402 |
-
avg_articulation_probs = np.mean(
|
| 403 |
-
[p.articulation_probs for p in window_preds],
|
| 404 |
-
axis=0
|
| 405 |
-
)
|
| 406 |
-
|
| 407 |
-
# Get most likely class from averaged probabilities
|
| 408 |
-
articulation_class = int(np.argmax(avg_articulation_probs))
|
| 409 |
-
articulation_class_name = self.model.get_articulation_class_name(articulation_class)
|
| 410 |
-
|
| 411 |
-
# Calculate confidence
|
| 412 |
-
confidence = (avg_fluency + avg_articulation_probs[articulation_class]) / 2.0
|
| 413 |
-
|
| 414 |
-
smoothed.append(PredictionResult(
|
| 415 |
-
fluency_score=float(avg_fluency),
|
| 416 |
-
articulation_class=articulation_class,
|
| 417 |
-
articulation_class_name=articulation_class_name,
|
| 418 |
-
articulation_probs=avg_articulation_probs.tolist(),
|
| 419 |
-
confidence=float(confidence),
|
| 420 |
-
timestamp_ms=predictions[i].timestamp_ms,
|
| 421 |
-
frame_index=predictions[i].frame_index
|
| 422 |
-
))
|
| 423 |
-
|
| 424 |
-
return smoothed
|
| 425 |
|
| 426 |
-
def
|
| 427 |
self,
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
|
|
|
| 431 |
"""
|
| 432 |
-
|
| 433 |
|
| 434 |
Args:
|
| 435 |
-
|
| 436 |
-
|
|
|
|
| 437 |
|
| 438 |
Returns:
|
| 439 |
-
|
| 440 |
"""
|
| 441 |
-
if
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
# Calculate fluency metrics
|
| 445 |
-
fluency_scores = [p.fluency_score for p in predictions]
|
| 446 |
-
fluency_metrics = {
|
| 447 |
-
"mean": float(np.mean(fluency_scores)),
|
| 448 |
-
"std": float(np.std(fluency_scores)),
|
| 449 |
-
"min": float(np.min(fluency_scores)),
|
| 450 |
-
"max": float(np.max(fluency_scores)),
|
| 451 |
-
"median": float(np.median(fluency_scores)),
|
| 452 |
-
"fluent_frames_ratio": float(np.mean([s >= self.inference_config.fluency_threshold
|
| 453 |
-
for s in fluency_scores]))
|
| 454 |
-
}
|
| 455 |
-
|
| 456 |
-
# Articulation scores per frame
|
| 457 |
-
articulation_scores = [
|
| 458 |
-
{
|
| 459 |
-
"class": p.articulation_class,
|
| 460 |
-
"class_name": p.articulation_class_name,
|
| 461 |
-
"probs": p.articulation_probs,
|
| 462 |
-
"confidence": p.confidence
|
| 463 |
-
}
|
| 464 |
-
for p in predictions
|
| 465 |
-
]
|
| 466 |
|
| 467 |
-
#
|
| 468 |
-
|
| 469 |
|
| 470 |
-
#
|
| 471 |
-
if
|
| 472 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
|
| 474 |
-
return
|
| 475 |
-
fluency_metrics=fluency_metrics,
|
| 476 |
-
articulation_scores=articulation_scores,
|
| 477 |
-
confidence=overall_confidence,
|
| 478 |
-
timestamps=timestamps or [],
|
| 479 |
-
frame_duration_ms=self.audio_config.chunk_duration_ms
|
| 480 |
-
)
|
| 481 |
|
| 482 |
|
| 483 |
def create_inference_pipeline(
|
| 484 |
-
|
| 485 |
audio_config: Optional[AudioConfig] = None,
|
| 486 |
model_config: Optional[ModelConfig] = None,
|
| 487 |
inference_config: Optional[InferenceConfig] = None
|
|
@@ -490,31 +418,17 @@ def create_inference_pipeline(
|
|
| 490 |
Factory function to create an InferencePipeline instance.
|
| 491 |
|
| 492 |
Args:
|
| 493 |
-
|
| 494 |
-
audio_config:
|
| 495 |
-
model_config:
|
| 496 |
-
inference_config:
|
| 497 |
|
| 498 |
Returns:
|
| 499 |
-
InferencePipeline instance
|
| 500 |
"""
|
| 501 |
-
model = None
|
| 502 |
-
if model_path:
|
| 503 |
-
logger.info(f"Loading model from: {model_path}")
|
| 504 |
-
model_config = model_config or ModelConfig()
|
| 505 |
-
model = load_speech_pathology_model(
|
| 506 |
-
model_name=model_config.model_name,
|
| 507 |
-
classifier_hidden_dims=model_config.classifier_hidden_dims,
|
| 508 |
-
dropout=model_config.dropout,
|
| 509 |
-
device=model_config.device,
|
| 510 |
-
use_fp16=model_config.use_fp16,
|
| 511 |
-
model_path=model_path
|
| 512 |
-
)
|
| 513 |
-
|
| 514 |
return InferencePipeline(
|
| 515 |
model=model,
|
| 516 |
audio_config=audio_config,
|
| 517 |
model_config=model_config,
|
| 518 |
inference_config=inference_config
|
| 519 |
)
|
| 520 |
-
|
|
|
|
| 2 |
Inference Pipeline for Speech Pathology Diagnosis
|
| 3 |
|
| 4 |
This module provides the inference pipeline for real-time and batch processing
|
| 5 |
+
of audio for fluency and articulation analysis using sliding window approach.
|
| 6 |
"""
|
| 7 |
|
| 8 |
import logging
|
|
|
|
| 13 |
from typing import Dict, List, Optional, Tuple, Union
|
| 14 |
from pathlib import Path
|
| 15 |
import time
|
| 16 |
+
from dataclasses import dataclass, field
|
| 17 |
+
from collections import deque
|
| 18 |
|
| 19 |
from models.speech_pathology_model import SpeechPathologyClassifier, load_speech_pathology_model
|
| 20 |
from config import AudioConfig, ModelConfig, InferenceConfig
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
@dataclass
class FramePrediction:
    """
    Prediction for one analysis frame (one sliding window).

    Attributes:
        time: Timestamp in seconds (center of the analysis window)
        fluency_prob: Probability that the frame contains a stutter (0-1)
        fluency_label: Either 'normal' or 'stutter'
        articulation_class: Articulation class index (0-3)
        articulation_label: Human-readable articulation class name
        confidence: Overall confidence score for this frame
    """
    time: float
    fluency_prob: float
    fluency_label: str
    articulation_class: int
    articulation_label: str
    confidence: float
|
|
|
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
@dataclass
class PhoneLevelResult:
    """
    Full phone-level prediction result for one audio clip.

    Attributes:
        frame_predictions: Per-frame predictions, in temporal order
        aggregate: Clip-level summary statistics (fluency score,
            dominant articulation class, frame count, duration)
        duration: Audio duration in seconds
        num_frames: Number of analysis frames produced
    """
    frame_predictions: List[FramePrediction]
    aggregate: Dict[str, Union[float, int, str]]
    duration: float
    num_frames: int
|
|
|
|
| 59 |
|
| 60 |
|
| 61 |
class InferencePipeline:
|
| 62 |
"""
|
| 63 |
+
Inference pipeline for speech pathology diagnosis using sliding window approach.
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
+
Architecture:
|
| 66 |
+
- 1-second sliding windows (minimum for Wav2Vec2)
|
| 67 |
+
- 10ms hop size for phone-level resolution
|
| 68 |
+
- Wav2Vec2 feature extraction per window
|
| 69 |
+
- Multi-task classifier for fluency + articulation
|
|
|
|
| 70 |
"""
|
| 71 |
|
| 72 |
def __init__(
|
|
|
|
| 84 |
audio_config: Audio processing configuration
|
| 85 |
model_config: Model configuration
|
| 86 |
inference_config: Inference configuration
|
|
|
|
|
|
|
|
|
|
| 87 |
"""
|
| 88 |
# Load configurations
|
| 89 |
from config import default_audio_config, default_model_config, default_inference_config
|
|
|
|
| 92 |
self.model_config = model_config or default_model_config
|
| 93 |
self.inference_config = inference_config or default_inference_config
|
| 94 |
|
| 95 |
+
logger.info("Initializing InferencePipeline (sliding window)...")
|
| 96 |
+
logger.info(f"Window size: {self.inference_config.window_size_ms}ms")
|
| 97 |
+
logger.info(f"Hop size: {self.inference_config.hop_size_ms}ms")
|
| 98 |
+
logger.info(f"Frame rate: {self.inference_config.frame_rate} fps")
|
|
|
|
| 99 |
|
| 100 |
# Initialize or use provided model
|
| 101 |
if model is None:
|
| 102 |
logger.info("Loading SpeechPathologyClassifier...")
|
| 103 |
self.model = load_speech_pathology_model(
|
| 104 |
model_name=self.model_config.model_name,
|
| 105 |
+
classifier_hidden_dims=[512, 256], # 1024 → 512 → 256
|
| 106 |
dropout=self.model_config.dropout,
|
| 107 |
device=self.model_config.device,
|
| 108 |
use_fp16=self.model_config.use_fp16
|
|
|
|
| 114 |
# Get processor for audio preprocessing
|
| 115 |
self.processor = self.model.processor
|
| 116 |
|
| 117 |
+
# Calculate window and hop sizes in samples
|
| 118 |
+
self.window_size_samples = int(
|
| 119 |
+
self.inference_config.window_size_ms * self.audio_config.sample_rate / 1000
|
| 120 |
)
|
| 121 |
self.hop_size_samples = int(
|
| 122 |
+
self.inference_config.hop_size_ms * self.audio_config.sample_rate / 1000
|
| 123 |
)
|
| 124 |
|
| 125 |
+
logger.info(f"Window size: {self.window_size_samples} samples")
|
| 126 |
+
logger.info(f"Hop size: {self.hop_size_samples} samples")
|
| 127 |
logger.info("✅ InferencePipeline initialized successfully")
|
| 128 |
|
| 129 |
def preprocess_audio(
|
|
|
|
| 140 |
|
| 141 |
Returns:
|
| 142 |
Preprocessed audio array normalized to [-1, 1] range
|
|
|
|
|
|
|
|
|
|
| 143 |
"""
|
| 144 |
target_sr = target_sr or self.audio_config.sample_rate
|
| 145 |
|
| 146 |
# Load audio if path provided
|
| 147 |
if isinstance(audio, (str, Path)):
|
| 148 |
+
audio_path = Path(audio)
|
| 149 |
+
if not audio_path.exists():
|
| 150 |
+
raise ValueError(f"Audio file not found: {audio_path}")
|
| 151 |
+
|
| 152 |
+
audio_array, sr = librosa.load(str(audio_path), sr=target_sr, mono=True)
|
|
|
|
| 153 |
else:
|
| 154 |
+
audio_array = np.array(audio, dtype=np.float32)
|
|
|
|
| 155 |
if len(audio_array.shape) > 1:
|
| 156 |
+
audio_array = np.mean(audio_array, axis=0) # Convert to mono
|
| 157 |
|
| 158 |
+
# Normalize to [-1, 1]
|
| 159 |
+
max_val = np.abs(audio_array).max()
|
| 160 |
+
if max_val > 0:
|
| 161 |
+
audio_array = audio_array / max_val
|
|
|
|
|
|
|
| 162 |
|
| 163 |
return audio_array
|
| 164 |
|
| 165 |
+
def get_phone_level_features(
|
| 166 |
self,
|
| 167 |
+
audio: np.ndarray
|
| 168 |
+
) -> Tuple[torch.Tensor, np.ndarray]:
|
|
|
|
|
|
|
| 169 |
"""
|
| 170 |
+
Extract phone-level features using sliding window approach.
|
|
|
|
|
|
|
| 171 |
|
| 172 |
Args:
|
| 173 |
+
audio: Preprocessed audio array (16kHz, mono, normalized)
|
|
|
|
|
|
|
| 174 |
|
| 175 |
Returns:
|
| 176 |
+
Tuple of (frame_features, frame_times):
|
| 177 |
+
- frame_features: Tensor of shape (num_frames, 1024)
|
| 178 |
+
- frame_times: Array of timestamps in seconds
|
|
|
|
|
|
|
| 179 |
"""
|
| 180 |
+
num_samples = len(audio)
|
| 181 |
+
num_windows = max(1, (num_samples - self.window_size_samples) // self.hop_size_samples + 1)
|
| 182 |
|
| 183 |
+
frame_features_list = []
|
| 184 |
+
frame_times = []
|
| 185 |
+
|
| 186 |
+
logger.info(f"Extracting features from {num_windows} windows...")
|
| 187 |
+
|
| 188 |
+
for i in range(num_windows):
|
| 189 |
+
start_sample = i * self.hop_size_samples
|
| 190 |
+
end_sample = min(start_sample + self.window_size_samples, num_samples)
|
| 191 |
|
| 192 |
+
# Extract window
|
| 193 |
+
window = audio[start_sample:end_sample]
|
|
|
|
| 194 |
|
| 195 |
+
# Pad if necessary (at the end of audio)
|
| 196 |
+
if len(window) < self.window_size_samples:
|
| 197 |
+
padding = self.window_size_samples - len(window)
|
| 198 |
+
window = np.pad(window, (0, padding), mode='constant')
|
| 199 |
+
|
| 200 |
+
# Convert to tensor
|
| 201 |
+
audio_tensor = torch.from_numpy(window).float()
|
| 202 |
+
|
| 203 |
+
# Process through feature extractor
|
| 204 |
+
with torch.no_grad():
|
| 205 |
+
inputs = self.processor(
|
| 206 |
+
audio_tensor,
|
| 207 |
+
sampling_rate=self.audio_config.sample_rate,
|
| 208 |
+
return_tensors="pt",
|
| 209 |
+
padding=True
|
| 210 |
+
)
|
| 211 |
|
| 212 |
+
input_values = inputs.input_values.to(self.model.device)
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
+
# Extract Wav2Vec2 features
|
| 215 |
+
wav2vec2_outputs = self.model.wav2vec2_model(
|
| 216 |
+
input_values=input_values
|
| 217 |
+
)
|
| 218 |
|
| 219 |
+
# Get features: (batch_size, seq_len, 1024)
|
| 220 |
+
features = wav2vec2_outputs.last_hidden_state
|
| 221 |
+
|
| 222 |
+
# Pool to single vector: mean over sequence length
|
| 223 |
+
pooled_features = torch.mean(features, dim=1) # (batch_size, 1024)
|
| 224 |
+
frame_features_list.append(pooled_features.cpu())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
|
| 226 |
+
# Calculate timestamp (center of window)
|
| 227 |
+
frame_time = (start_sample + self.window_size_samples / 2) / self.audio_config.sample_rate
|
| 228 |
+
frame_times.append(frame_time)
|
| 229 |
+
|
| 230 |
+
# Stack all features
|
| 231 |
+
frame_features = torch.cat(frame_features_list, dim=0) # (num_frames, 1024)
|
| 232 |
+
frame_times = np.array(frame_times)
|
| 233 |
+
|
| 234 |
+
logger.info(f"Extracted {len(frame_features)} frame features")
|
| 235 |
+
|
| 236 |
+
return frame_features, frame_times
|
| 237 |
|
| 238 |
+
def predict_phone_level(
|
| 239 |
self,
|
| 240 |
+
audio: Union[np.ndarray, str, Path],
|
| 241 |
+
return_timestamps: bool = True
|
| 242 |
+
) -> PhoneLevelResult:
|
|
|
|
| 243 |
"""
|
| 244 |
+
Predict fluency and articulation at phone-level resolution.
|
|
|
|
|
|
|
| 245 |
|
| 246 |
Args:
|
| 247 |
+
audio: Audio array, file path, or Path object
|
| 248 |
+
return_timestamps: Whether to include timestamps in results
|
|
|
|
| 249 |
|
| 250 |
Returns:
|
| 251 |
+
PhoneLevelResult with frame-level predictions and aggregates
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
"""
|
| 253 |
+
start_time = time.time()
|
| 254 |
+
|
| 255 |
try:
|
| 256 |
+
# Preprocess audio
|
| 257 |
+
audio_array = self.preprocess_audio(audio)
|
| 258 |
+
duration = len(audio_array) / self.audio_config.sample_rate
|
| 259 |
|
| 260 |
+
logger.info(f"Processing audio: {duration:.2f}s")
|
|
|
|
|
|
|
|
|
|
| 261 |
|
| 262 |
+
# Check minimum length
|
| 263 |
+
if duration < self.inference_config.minimum_audio_length:
|
| 264 |
+
logger.warning(f"Audio shorter than minimum ({duration:.2f}s < {self.inference_config.minimum_audio_length}s), "
|
| 265 |
+
f"padding to minimum")
|
| 266 |
+
min_samples = int(self.inference_config.minimum_audio_length * self.audio_config.sample_rate)
|
| 267 |
+
if len(audio_array) < min_samples:
|
| 268 |
+
padding = min_samples - len(audio_array)
|
| 269 |
+
audio_array = np.pad(audio_array, (0, padding), mode='constant')
|
| 270 |
+
duration = len(audio_array) / self.audio_config.sample_rate
|
| 271 |
|
| 272 |
+
# Extract phone-level features
|
| 273 |
+
frame_features, frame_times = self.get_phone_level_features(audio_array)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
+
# Move features to device
|
| 276 |
+
frame_features = frame_features.to(self.model.device)
|
| 277 |
+
|
| 278 |
+
# Predict using classifier
|
| 279 |
+
self.model.eval()
|
| 280 |
+
with torch.no_grad():
|
| 281 |
+
# Pass through shared layers and heads
|
| 282 |
+
shared_features = self.model.classifier_head.shared_layers(frame_features)
|
| 283 |
+
|
| 284 |
+
# Get predictions from all heads
|
| 285 |
+
fluency_logits = self.model.classifier_head.fluency_head(shared_features)
|
| 286 |
+
articulation_logits = self.model.classifier_head.articulation_head(shared_features)
|
| 287 |
+
full_logits = self.model.classifier_head.full_head(shared_features)
|
| 288 |
+
|
| 289 |
+
# Apply softmax
|
| 290 |
+
fluency_probs = torch.softmax(fluency_logits, dim=-1) # (num_frames, 2)
|
| 291 |
+
articulation_probs = torch.softmax(articulation_logits, dim=-1) # (num_frames, 4)
|
| 292 |
+
full_probs = torch.softmax(full_logits, dim=-1) # (num_frames, 8)
|
| 293 |
+
|
| 294 |
+
# Convert to numpy
|
| 295 |
+
fluency_probs = fluency_probs.cpu().numpy()
|
| 296 |
+
articulation_probs = articulation_probs.cpu().numpy()
|
| 297 |
+
full_probs = full_probs.cpu().numpy()
|
| 298 |
+
|
| 299 |
+
# Create frame predictions
|
| 300 |
+
frame_predictions = []
|
| 301 |
+
for i in range(len(frame_features)):
|
| 302 |
+
# Fluency: class 0 = normal, class 1 = stutter
|
| 303 |
+
fluency_prob_stutter = fluency_probs[i, 1]
|
| 304 |
+
fluency_label = 'stutter' if fluency_prob_stutter > self.inference_config.fluency_threshold else 'normal'
|
| 305 |
+
|
| 306 |
+
# Articulation: get class with highest probability
|
| 307 |
+
articulation_class = int(np.argmax(articulation_probs[i]))
|
| 308 |
+
articulation_label = self.model.get_articulation_class_name(articulation_class)
|
| 309 |
+
|
| 310 |
+
# Confidence: average of max probabilities
|
| 311 |
+
confidence = (np.max(fluency_probs[i]) + np.max(articulation_probs[i])) / 2.0
|
| 312 |
+
|
| 313 |
+
frame_pred = FramePrediction(
|
| 314 |
+
time=frame_times[i] if return_timestamps else 0.0,
|
| 315 |
+
fluency_prob=float(fluency_prob_stutter),
|
| 316 |
+
fluency_label=fluency_label,
|
| 317 |
+
articulation_class=articulation_class,
|
| 318 |
+
articulation_label=articulation_label,
|
| 319 |
+
confidence=float(confidence)
|
| 320 |
+
)
|
| 321 |
+
frame_predictions.append(frame_pred)
|
| 322 |
|
| 323 |
+
# Aggregate statistics
|
| 324 |
+
fluency_scores = [fp.fluency_prob for fp in frame_predictions]
|
| 325 |
+
articulation_classes = [fp.articulation_class for fp in frame_predictions]
|
| 326 |
|
| 327 |
+
aggregate = {
|
| 328 |
+
'fluency_score': float(np.mean(fluency_scores)),
|
| 329 |
+
'articulation_class': int(np.bincount(articulation_classes).argmax()),
|
| 330 |
+
'articulation_label': self.model.get_articulation_class_name(
|
| 331 |
+
int(np.bincount(articulation_classes).argmax())
|
| 332 |
+
),
|
| 333 |
+
'num_frames': len(frame_predictions),
|
| 334 |
+
'duration': duration
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
elapsed_time = time.time() - start_time
|
| 338 |
+
logger.info(f"Phone-level prediction completed in {elapsed_time:.2f}s "
|
| 339 |
+
f"({duration/elapsed_time:.1f}x real-time)")
|
| 340 |
+
|
| 341 |
+
return PhoneLevelResult(
|
| 342 |
+
frame_predictions=frame_predictions,
|
| 343 |
+
aggregate=aggregate,
|
| 344 |
+
duration=duration,
|
| 345 |
+
num_frames=len(frame_predictions)
|
| 346 |
)
|
| 347 |
+
|
| 348 |
+
except Exception as e:
|
| 349 |
+
logger.error(f"Phone-level prediction failed: {e}", exc_info=True)
|
| 350 |
+
raise RuntimeError(f"Phone-level prediction failed: {e}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
|
| 352 |
+
def predict_batch(
|
| 353 |
self,
|
| 354 |
+
audio_path: Union[str, Path],
|
| 355 |
+
return_timestamps: bool = True,
|
| 356 |
+
apply_smoothing: bool = True
|
| 357 |
+
) -> PhoneLevelResult:
|
| 358 |
"""
|
| 359 |
+
Predict on audio file (batch processing).
|
| 360 |
|
| 361 |
Args:
|
| 362 |
+
audio_path: Path to audio file
|
| 363 |
+
return_timestamps: Whether to include timestamps
|
| 364 |
+
apply_smoothing: Whether to apply temporal smoothing (not implemented yet)
|
| 365 |
|
| 366 |
Returns:
|
| 367 |
+
PhoneLevelResult
|
| 368 |
"""
|
| 369 |
+
return self.predict_phone_level(audio_path, return_timestamps=return_timestamps)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
|
| 371 |
+
def predict_streaming_chunk(
|
| 372 |
self,
|
| 373 |
+
chunk: np.ndarray,
|
| 374 |
+
buffer: Optional[deque] = None,
|
| 375 |
+
timestamp: Optional[float] = None
|
| 376 |
+
) -> Optional[FramePrediction]:
|
| 377 |
"""
|
| 378 |
+
Predict on streaming audio chunk.
|
| 379 |
|
| 380 |
Args:
|
| 381 |
+
chunk: Audio chunk array
|
| 382 |
+
buffer: Optional buffer for maintaining sliding window
|
| 383 |
+
timestamp: Optional timestamp for the chunk
|
| 384 |
|
| 385 |
Returns:
|
| 386 |
+
FramePrediction if enough data accumulated, None otherwise
|
| 387 |
"""
|
| 388 |
+
if buffer is None:
|
| 389 |
+
buffer = deque(maxlen=self.window_size_samples)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
|
| 391 |
+
# Add chunk to buffer
|
| 392 |
+
buffer.extend(chunk)
|
| 393 |
|
| 394 |
+
# Check if we have enough data for a window
|
| 395 |
+
if len(buffer) >= self.window_size_samples:
|
| 396 |
+
# Extract window
|
| 397 |
+
window = np.array(list(buffer)[-self.window_size_samples:])
|
| 398 |
+
|
| 399 |
+
# Process window
|
| 400 |
+
try:
|
| 401 |
+
result = self.predict_phone_level(window, return_timestamps=False)
|
| 402 |
+
if result.frame_predictions:
|
| 403 |
+
return result.frame_predictions[-1] # Return latest frame
|
| 404 |
+
except Exception as e:
|
| 405 |
+
logger.warning(f"Streaming prediction failed: {e}")
|
| 406 |
+
return None
|
| 407 |
|
| 408 |
+
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
|
| 410 |
|
| 411 |
def create_inference_pipeline(
|
| 412 |
+
model: Optional[SpeechPathologyClassifier] = None,
|
| 413 |
audio_config: Optional[AudioConfig] = None,
|
| 414 |
model_config: Optional[ModelConfig] = None,
|
| 415 |
inference_config: Optional[InferenceConfig] = None
|
|
|
|
| 418 |
Factory function to create an InferencePipeline instance.
|
| 419 |
|
| 420 |
Args:
|
| 421 |
+
model: Optional pre-initialized model
|
| 422 |
+
audio_config: Optional audio configuration
|
| 423 |
+
model_config: Optional model configuration
|
| 424 |
+
inference_config: Optional inference configuration
|
| 425 |
|
| 426 |
Returns:
|
| 427 |
+
Initialized InferencePipeline instance
|
| 428 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
return InferencePipeline(
|
| 430 |
model=model,
|
| 431 |
audio_config=audio_config,
|
| 432 |
model_config=model_config,
|
| 433 |
inference_config=inference_config
|
| 434 |
)
|
|
|
models/error_taxonomy.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Error Taxonomy for Speech Pathology Analysis
|
| 3 |
+
|
| 4 |
+
This module defines error types, severity levels, and therapy recommendations
|
| 5 |
+
for phoneme-level error detection.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import json
|
| 10 |
+
from enum import Enum
|
| 11 |
+
from typing import Optional, Dict, List
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from dataclasses import dataclass, field
|
| 14 |
+
from pydantic import BaseModel, Field
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class ErrorType(str, Enum):
    """Closed set of articulation error categories.

    Inherits from ``str`` so members compare equal to their plain string
    values and serialize directly in JSON/API payloads.
    """
    NORMAL = "normal"            # correct production, no error
    SUBSTITUTION = "substitution"  # wrong phoneme produced in place of target
    OMISSION = "omission"          # target phoneme dropped entirely
    DISTORTION = "distortion"      # target attempted but produced imprecisely
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class SeverityLevel(str, Enum):
    """Coarse severity buckets for detected errors.

    String-valued so levels serialize directly and compare equal to their
    plain string forms.
    """
    NONE = "none"      # no error present
    LOW = "low"        # severity score below 0.3
    MEDIUM = "medium"  # severity score in [0.3, 0.7)
    HIGH = "high"      # severity score 0.7 and above
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass
class ErrorDetail:
    """
    Detailed error record for a single phoneme.

    Attributes:
        phoneme: Expected phoneme symbol (e.g., '/s/')
        error_type: Error category (NORMAL, SUBSTITUTION, OMISSION, DISTORTION)
        wrong_sound: For substitutions, the incorrect phoneme produced
            (e.g., '/θ/'); None otherwise
        severity: Severity score in [0.0, 1.0]
        confidence: Model confidence in the detection, in [0.0, 1.0]
        therapy: Therapy recommendation text
        frame_indices: Frame indices at which this error occurs
    """
    phoneme: str
    error_type: ErrorType
    wrong_sound: Optional[str] = None
    severity: float = 0.0
    confidence: float = 0.0
    therapy: str = ""
    frame_indices: List[int] = field(default_factory=list)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class ErrorDetailPydantic(BaseModel):
    """Pydantic mirror of ``ErrorDetail`` used for API serialization.

    Field semantics match the dataclass; ``severity`` and ``confidence``
    are validated to the [0.0, 1.0] range.
    """
    phoneme: str
    error_type: str
    wrong_sound: Optional[str] = None
    severity: float = Field(ge=0.0, le=1.0)
    confidence: float = Field(ge=0.0, le=1.0)
    therapy: str
    frame_indices: List[int] = Field(default_factory=list)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class ErrorMapper:
|
| 70 |
+
"""
|
| 71 |
+
Maps classifier outputs to error types and generates therapy recommendations.
|
| 72 |
+
|
| 73 |
+
Classifier output mapping (8 classes):
|
| 74 |
+
- Class 0: Normal articulation, normal fluency
|
| 75 |
+
- Class 1: Substitution, normal fluency
|
| 76 |
+
- Class 2: Omission, normal fluency
|
| 77 |
+
- Class 3: Distortion, normal fluency
|
| 78 |
+
- Class 4: Normal articulation, stutter
|
| 79 |
+
- Class 5: Substitution, stutter
|
| 80 |
+
- Class 6: Omission, stutter
|
| 81 |
+
- Class 7: Distortion, stutter
|
| 82 |
+
"""
|
| 83 |
+
|
| 84 |
+
def __init__(self, therapy_db_path: Optional[str] = None):
|
| 85 |
+
"""
|
| 86 |
+
Initialize the ErrorMapper.
|
| 87 |
+
|
| 88 |
+
Args:
|
| 89 |
+
therapy_db_path: Path to therapy recommendations JSON file.
|
| 90 |
+
If None, uses default location: data/therapy_recommendations.json
|
| 91 |
+
"""
|
| 92 |
+
self.therapy_db: Dict = {}
|
| 93 |
+
|
| 94 |
+
# Default path
|
| 95 |
+
if therapy_db_path is None:
|
| 96 |
+
therapy_db_path = Path(__file__).parent.parent / "data" / "therapy_recommendations.json"
|
| 97 |
+
else:
|
| 98 |
+
therapy_db_path = Path(therapy_db_path)
|
| 99 |
+
|
| 100 |
+
# Load therapy database
|
| 101 |
+
try:
|
| 102 |
+
if therapy_db_path.exists():
|
| 103 |
+
with open(therapy_db_path, 'r', encoding='utf-8') as f:
|
| 104 |
+
self.therapy_db = json.load(f)
|
| 105 |
+
logger.info(f"✅ Loaded therapy database from {therapy_db_path}")
|
| 106 |
+
else:
|
| 107 |
+
logger.warning(f"Therapy database not found at {therapy_db_path}, using defaults")
|
| 108 |
+
self.therapy_db = self._get_default_therapy_db()
|
| 109 |
+
except Exception as e:
|
| 110 |
+
logger.error(f"Failed to load therapy database: {e}, using defaults")
|
| 111 |
+
self.therapy_db = self._get_default_therapy_db()
|
| 112 |
+
|
| 113 |
+
# Common substitution mappings (phoneme → likely wrong sound)
|
| 114 |
+
self.substitution_map: Dict[str, List[str]] = {
|
| 115 |
+
'/s/': ['/θ/', '/ʃ/', '/z/'], # lisp, sh-sound, voicing
|
| 116 |
+
'/r/': ['/w/', '/l/', '/ɹ/'], # rhotacism variants
|
| 117 |
+
'/l/': ['/w/', '/j/'], # liquid substitutions
|
| 118 |
+
'/k/': ['/t/', '/p/'], # velar → alveolar/bilabial
|
| 119 |
+
'/g/': ['/d/', '/b/'], # velar → alveolar/bilabial
|
| 120 |
+
'/θ/': ['/f/', '/s/'], # th → f or s
|
| 121 |
+
'/ð/': ['/v/', '/z/'], # voiced th → v or z
|
| 122 |
+
'/ʃ/': ['/s/', '/tʃ/'], # sh → s or ch
|
| 123 |
+
'/tʃ/': ['/ʃ/', '/ts/'], # ch → sh or ts
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
def map_classifier_output(
|
| 127 |
+
self,
|
| 128 |
+
class_id: int,
|
| 129 |
+
confidence: float,
|
| 130 |
+
phoneme: str,
|
| 131 |
+
fluency_label: str = "normal"
|
| 132 |
+
) -> ErrorDetail:
|
| 133 |
+
"""
|
| 134 |
+
Map classifier output to ErrorDetail.
|
| 135 |
+
|
| 136 |
+
Args:
|
| 137 |
+
class_id: Classifier output class (0-7)
|
| 138 |
+
confidence: Model confidence (0.0-1.0)
|
| 139 |
+
phoneme: Expected phoneme symbol
|
| 140 |
+
fluency_label: Fluency label ("normal" or "stutter")
|
| 141 |
+
|
| 142 |
+
Returns:
|
| 143 |
+
ErrorDetail object with error information
|
| 144 |
+
"""
|
| 145 |
+
# Determine error type from class_id
|
| 146 |
+
if class_id == 0 or class_id == 4:
|
| 147 |
+
error_type = ErrorType.NORMAL
|
| 148 |
+
elif class_id == 1 or class_id == 5:
|
| 149 |
+
error_type = ErrorType.SUBSTITUTION
|
| 150 |
+
elif class_id == 2 or class_id == 6:
|
| 151 |
+
error_type = ErrorType.OMISSION
|
| 152 |
+
elif class_id == 3 or class_id == 7:
|
| 153 |
+
error_type = ErrorType.DISTORTION
|
| 154 |
+
else:
|
| 155 |
+
logger.warning(f"Unknown class_id: {class_id}, defaulting to NORMAL")
|
| 156 |
+
error_type = ErrorType.NORMAL
|
| 157 |
+
|
| 158 |
+
# Calculate severity from confidence
|
| 159 |
+
# Higher confidence in error = higher severity
|
| 160 |
+
if error_type == ErrorType.NORMAL:
|
| 161 |
+
severity = 0.0
|
| 162 |
+
else:
|
| 163 |
+
severity = confidence # Use confidence as severity proxy
|
| 164 |
+
|
| 165 |
+
# Get wrong sound for substitutions
|
| 166 |
+
wrong_sound = None
|
| 167 |
+
if error_type == ErrorType.SUBSTITUTION:
|
| 168 |
+
wrong_sound = self._map_substitution(phoneme, confidence)
|
| 169 |
+
|
| 170 |
+
# Get therapy recommendation
|
| 171 |
+
therapy = self.get_therapy(error_type, phoneme, wrong_sound)
|
| 172 |
+
|
| 173 |
+
return ErrorDetail(
|
| 174 |
+
phoneme=phoneme,
|
| 175 |
+
error_type=error_type,
|
| 176 |
+
wrong_sound=wrong_sound,
|
| 177 |
+
severity=severity,
|
| 178 |
+
confidence=confidence,
|
| 179 |
+
therapy=therapy
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
def _map_substitution(self, phoneme: str, confidence: float) -> Optional[str]:
|
| 183 |
+
"""
|
| 184 |
+
Map substitution error to likely wrong sound.
|
| 185 |
+
|
| 186 |
+
Args:
|
| 187 |
+
phoneme: Expected phoneme
|
| 188 |
+
confidence: Model confidence
|
| 189 |
+
|
| 190 |
+
Returns:
|
| 191 |
+
Most likely wrong phoneme, or None if unknown
|
| 192 |
+
"""
|
| 193 |
+
if phoneme in self.substitution_map:
|
| 194 |
+
# Return first (most common) substitution
|
| 195 |
+
return self.substitution_map[phoneme][0]
|
| 196 |
+
return None
|
| 197 |
+
|
| 198 |
+
def get_therapy(
|
| 199 |
+
self,
|
| 200 |
+
error_type: ErrorType,
|
| 201 |
+
phoneme: str,
|
| 202 |
+
wrong_sound: Optional[str] = None
|
| 203 |
+
) -> str:
|
| 204 |
+
"""
|
| 205 |
+
Get therapy recommendation for an error.
|
| 206 |
+
|
| 207 |
+
Args:
|
| 208 |
+
error_type: Type of error
|
| 209 |
+
phoneme: Expected phoneme
|
| 210 |
+
wrong_sound: For substitutions, the wrong sound produced
|
| 211 |
+
|
| 212 |
+
Returns:
|
| 213 |
+
Therapy recommendation text
|
| 214 |
+
"""
|
| 215 |
+
if error_type == ErrorType.NORMAL:
|
| 216 |
+
return "No therapy needed - production is correct."
|
| 217 |
+
|
| 218 |
+
# Build lookup key
|
| 219 |
+
if error_type == ErrorType.SUBSTITUTION and wrong_sound:
|
| 220 |
+
key = f"{phoneme}→{wrong_sound}"
|
| 221 |
+
if "substitutions" in self.therapy_db and key in self.therapy_db["substitutions"]:
|
| 222 |
+
return self.therapy_db["substitutions"][key]
|
| 223 |
+
|
| 224 |
+
# Fallback to generic recommendations
|
| 225 |
+
if error_type == ErrorType.SUBSTITUTION:
|
| 226 |
+
if "substitutions" in self.therapy_db and "generic" in self.therapy_db["substitutions"]:
|
| 227 |
+
return self.therapy_db["substitutions"]["generic"].replace("{phoneme}", phoneme)
|
| 228 |
+
return f"Substitution error for {phoneme}. Practice correct articulator placement."
|
| 229 |
+
|
| 230 |
+
elif error_type == ErrorType.OMISSION:
|
| 231 |
+
if "omissions" in self.therapy_db and phoneme in self.therapy_db["omissions"]:
|
| 232 |
+
return self.therapy_db["omissions"][phoneme]
|
| 233 |
+
if "omissions" in self.therapy_db and "generic" in self.therapy_db["omissions"]:
|
| 234 |
+
return self.therapy_db["omissions"]["generic"].replace("{phoneme}", phoneme)
|
| 235 |
+
return f"Omission error for {phoneme}. Practice saying the sound separately first."
|
| 236 |
+
|
| 237 |
+
elif error_type == ErrorType.DISTORTION:
|
| 238 |
+
if "distortions" in self.therapy_db and phoneme in self.therapy_db["distortions"]:
|
| 239 |
+
return self.therapy_db["distortions"][phoneme]
|
| 240 |
+
if "distortions" in self.therapy_db and "generic" in self.therapy_db["distortions"]:
|
| 241 |
+
return self.therapy_db["distortions"]["generic"].replace("{phoneme}", phoneme)
|
| 242 |
+
return f"Distortion error for {phoneme}. Use mirror feedback and watch articulator position."
|
| 243 |
+
|
| 244 |
+
return "Consult with speech-language pathologist for personalized therapy plan."
|
| 245 |
+
|
| 246 |
+
def get_severity_level(self, severity: float) -> SeverityLevel:
|
| 247 |
+
"""
|
| 248 |
+
Convert severity score to severity level.
|
| 249 |
+
|
| 250 |
+
Args:
|
| 251 |
+
severity: Severity score (0.0-1.0)
|
| 252 |
+
|
| 253 |
+
Returns:
|
| 254 |
+
SeverityLevel enum
|
| 255 |
+
"""
|
| 256 |
+
if severity == 0.0:
|
| 257 |
+
return SeverityLevel.NONE
|
| 258 |
+
elif severity < 0.3:
|
| 259 |
+
return SeverityLevel.LOW
|
| 260 |
+
elif severity < 0.7:
|
| 261 |
+
return SeverityLevel.MEDIUM
|
| 262 |
+
else:
|
| 263 |
+
return SeverityLevel.HIGH
|
| 264 |
+
|
| 265 |
+
    def _get_default_therapy_db(self) -> Dict:
        """Get default therapy database if file not found.

        Returns:
            Dict keyed by error category ("substitutions", "omissions",
            "distortions"). Each category maps a phoneme pattern to a therapy
            tip string, plus a "generic" fallback whose "{phoneme}"
            placeholder is substituted by the caller.
        """
        return {
            # Substitution entries are keyed "/target/→/produced/".
            "substitutions": {
                "/s/→/θ/": "Lisp - Use tongue tip placement behind upper teeth. Practice /s/ in isolation.",
                "/r/→/w/": "Rhotacism - Practice tongue position: curl tongue back, avoid lip rounding.",
                "/r/→/l/": "Rhotacism - Focus on tongue tip position vs. tongue body placement.",
                "generic": "Substitution error for {phoneme}. Practice correct articulator placement with mirror feedback."
            },
            # Omission entries are keyed by the omitted phoneme alone.
            "omissions": {
                "/r/": "Practice /r/ in isolation, then in CV syllables (ra, re, ri, ro, ru).",
                "/l/": "Lateral tongue placement - practice with tongue tip up to alveolar ridge.",
                "/s/": "Practice /s/ with tongue tip placement, use mirror to check position.",
                "generic": "Omission error for {phoneme}. Say the sound separately first, then blend into words."
            },
            # Distortion entries are keyed by the distorted phoneme alone.
            "distortions": {
                "/s/": "Sibilant clarity - use mirror feedback, ensure tongue tip is up and air stream is central.",
                "/ʃ/": "Fricative voicing exercise - practice /sh/ vs /s/ distinction.",
                "/r/": "Rhotacism - practice tongue position and lip rounding control.",
                "generic": "Distortion error for {phoneme}. Use mirror feedback and watch articulator position carefully."
            }
        }
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
# Unit test function
|
| 290 |
+
def test_error_mapper():
    """Test the ErrorMapper."""
    print("Testing ErrorMapper...")

    mapper = ErrorMapper()

    # Case 1: classifier class 0 → no error
    result = mapper.map_classifier_output(0, 0.95, "/k/")
    assert result.error_type == ErrorType.NORMAL
    assert result.severity == 0.0
    print(f"✅ Normal error: {result.error_type}, therapy: {result.therapy[:50]}...")

    # Case 2: classifier class 1 → substitution
    result = mapper.map_classifier_output(1, 0.78, "/s/")
    assert result.error_type == ErrorType.SUBSTITUTION
    assert result.wrong_sound is not None
    print(f"✅ Substitution error: {result.error_type}, wrong_sound: {result.wrong_sound}")
    print(f"   Therapy: {result.therapy[:80]}...")

    # Case 3: classifier class 2 → omission
    result = mapper.map_classifier_output(2, 0.85, "/r/")
    assert result.error_type == ErrorType.OMISSION
    print(f"✅ Omission error: {result.error_type}")
    print(f"   Therapy: {result.therapy[:80]}...")

    # Case 4: classifier class 3 → distortion
    result = mapper.map_classifier_output(3, 0.65, "/s/")
    assert result.error_type == ErrorType.DISTORTION
    print(f"✅ Distortion error: {result.error_type}")
    print(f"   Therapy: {result.therapy[:80]}...")

    # Case 5: severity score → severity bucket, table-driven
    expected_levels = [
        (0.0, SeverityLevel.NONE),
        (0.2, SeverityLevel.LOW),
        (0.5, SeverityLevel.MEDIUM),
        (0.8, SeverityLevel.HIGH),
    ]
    for score, level in expected_levels:
        assert mapper.get_severity_level(score) == level
    print("✅ Severity level mapping correct")

    print("\n✅ All tests passed!")
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
# Run the module's self-test when executed as a script.
if __name__ == "__main__":
    test_error_mapper()
|
| 333 |
+
|
models/phoneme_mapper.py
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Phoneme Mapper for Speech Pathology Analysis
|
| 3 |
+
|
| 4 |
+
This module provides grapheme-to-phoneme (G2P) conversion and alignment
|
| 5 |
+
of phonemes to audio frames for phone-level error detection.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
from typing import List, Tuple, Optional, Dict
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
import numpy as np
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
import g2p_en
|
| 15 |
+
G2P_AVAILABLE = True
|
| 16 |
+
except ImportError:
|
| 17 |
+
G2P_AVAILABLE = False
|
| 18 |
+
logging.warning("g2p_en not available. Install with: pip install g2p-en")
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class PhonemeSegment:
    """A phoneme together with its time span and frame span in the audio.

    Fields:
        phoneme:     phoneme symbol, e.g. '/r/' or '/k/'
        start_time:  segment start in seconds
        end_time:    segment end in seconds
        duration:    end_time - start_time, in seconds
        frame_start: index of the first frame covered
        frame_end:   index one past the last frame covered (exclusive)
    """
    phoneme: str
    start_time: float
    end_time: float
    duration: float
    frame_start: int
    frame_end: int
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class PhonemeMapper:
    """
    Maps text to phonemes and aligns them to audio frames.

    Uses g2p_en library for English grapheme-to-phoneme conversion.
    Aligns phonemes to 20ms frames for phone-level analysis.

    Example (illustrative — actual g2p_en output uses ARPAbet symbols
    such as '/R/', '/OW1/'):
        >>> mapper = PhonemeMapper()
        >>> phonemes = mapper.text_to_phonemes("robot")
        >>> # Returns: [('/r/', 0.0), ('/o/', 0.1), ('/b/', 0.2), ('/o/', 0.3), ('/t/', 0.4)]
        >>> frame_phonemes = mapper.align_phonemes_to_frames(phonemes, num_frames=25, frame_duration_ms=20)
        >>> # Returns: ['/r/', '/r/', '/r/', '/o/', '/o/', '/b/', '/b/', ...]
    """

    def __init__(self, frame_duration_ms: int = 20, sample_rate: int = 16000):
        """
        Initialize the PhonemeMapper.

        Args:
            frame_duration_ms: Duration of each frame in milliseconds (default: 20ms)
            sample_rate: Audio sample rate in Hz (default: 16000)

        Raises:
            ImportError: If g2p_en is not available
        """
        if not G2P_AVAILABLE:
            raise ImportError(
                "g2p_en library is required. Install with: pip install g2p-en"
            )

        try:
            self.g2p = g2p_en.G2p()
            logger.info("✅ G2P model loaded successfully")
        except Exception as e:
            logger.error(f"❌ Failed to load G2P model: {e}")
            raise

        self.frame_duration_ms = frame_duration_ms
        self.frame_duration_s = frame_duration_ms / 1000.0
        self.sample_rate = sample_rate

        # Average phoneme duration (typical English: 50-100ms)
        # We'll use 80ms as default, but adjust based on text length
        self.avg_phoneme_duration_ms = 80
        self.avg_phoneme_duration_s = self.avg_phoneme_duration_ms / 1000.0

        logger.info(f"PhonemeMapper initialized: frame_duration={frame_duration_ms}ms, "
                    f"avg_phoneme_duration={self.avg_phoneme_duration_ms}ms")

    def text_to_phonemes(
        self,
        text: str,
        duration: Optional[float] = None
    ) -> List[Tuple[str, float]]:
        """
        Convert text to phonemes with timing information.

        Args:
            text: Input text string (e.g., "robot", "cat")
            duration: Optional audio duration in seconds. If provided, phonemes
                     are distributed evenly across this duration. If None, uses
                     estimated duration based on phoneme count.

        Returns:
            List of tuples: [(phoneme, start_time), ...]
            - phoneme: Phoneme symbol with slashes (e.g., '/r/', '/k/')
            - start_time: Start time in seconds

        Example:
            >>> mapper = PhonemeMapper()
            >>> phonemes = mapper.text_to_phonemes("cat")
            >>> # Returns: [('/k/', 0.0), ('/æ/', 0.08), ('/t/', 0.16)]
        """
        if not text or not text.strip():
            logger.warning("Empty text provided, returning empty phoneme list")
            return []

        try:
            # Convert to phonemes using g2p_en
            phoneme_list = self.g2p(text.lower().strip())

            # Filter out punctuation, whitespace, and empty tokens.
            # BUGFIX: g2p_en emits punctuation (',', '.', ...) as standalone
            # tokens; a whitespace-only filter let them through and they were
            # wrapped as bogus phonemes like '/,/', skewing the even timing
            # distribution. Keep only tokens with at least one alphanumeric
            # character (ARPAbet symbols like 'R', 'OW1' all qualify).
            phoneme_list = [
                p for p in phoneme_list
                if p and p.strip() and any(c.isalnum() for c in p)
            ]

            if not phoneme_list:
                logger.warning(f"No phonemes extracted from text: '{text}'")
                return []

            # Add slashes if not present
            formatted_phonemes = []
            for p in phoneme_list:
                if not p.startswith('/'):
                    p = '/' + p
                if not p.endswith('/'):
                    p = p + '/'
                formatted_phonemes.append(p)

            logger.debug(f"Extracted {len(formatted_phonemes)} phonemes from '{text}': {formatted_phonemes}")

            # Calculate timing
            if duration is None:
                # Estimate duration: avg_phoneme_duration * num_phonemes
                total_duration = len(formatted_phonemes) * self.avg_phoneme_duration_s
            else:
                total_duration = duration

            # Distribute phonemes evenly across duration
            if len(formatted_phonemes) == 1:
                phoneme_duration = total_duration
            else:
                phoneme_duration = total_duration / len(formatted_phonemes)

            # Create phoneme-time pairs
            phoneme_times = []
            for i, phoneme in enumerate(formatted_phonemes):
                start_time = i * phoneme_duration
                phoneme_times.append((phoneme, start_time))

            logger.info(f"Converted '{text}' to {len(phoneme_times)} phonemes over {total_duration:.2f}s")

            return phoneme_times

        except Exception as e:
            logger.error(f"Error converting text to phonemes: {e}", exc_info=True)
            raise RuntimeError(f"Failed to convert text to phonemes: {e}") from e

    def align_phonemes_to_frames(
        self,
        phoneme_times: List[Tuple[str, float]],
        num_frames: int,
        frame_duration_ms: Optional[int] = None
    ) -> List[str]:
        """
        Align phonemes to audio frames.

        Each frame gets assigned the phoneme that overlaps with its time window.
        If multiple phonemes overlap, uses the one with the most overlap.

        Args:
            phoneme_times: List of (phoneme, start_time) tuples from text_to_phonemes()
            num_frames: Total number of frames in the audio
            frame_duration_ms: Optional frame duration override

        Returns:
            List of phonemes, one per frame: ['/r/', '/r/', '/o/', '/b/', ...]

        Example:
            >>> mapper = PhonemeMapper()
            >>> phonemes = [('/k/', 0.0), ('/æ/', 0.08), ('/t/', 0.16)]
            >>> frames = mapper.align_phonemes_to_frames(phonemes, num_frames=15, frame_duration_ms=20)
            >>> # Returns: ['/k/', '/k/', '/k/', '/k/', '/æ/', '/æ/', '/æ/', '/æ/', '/t/', ...]
        """
        if not phoneme_times:
            logger.warning("No phonemes provided, returning empty frame list")
            return [''] * num_frames

        frame_duration_s = (frame_duration_ms / 1000.0) if frame_duration_ms else self.frame_duration_s

        # Calculate phoneme end times (assume equal duration for simplicity)
        phoneme_segments = []
        for i, (phoneme, start_time) in enumerate(phoneme_times):
            if i < len(phoneme_times) - 1:
                end_time = phoneme_times[i + 1][1]
            else:
                # Last phoneme: estimate duration from the spacing of the
                # first two phonemes, or fall back to the configured average
                if len(phoneme_times) > 1:
                    avg_duration = phoneme_times[1][1] - phoneme_times[0][1]
                else:
                    avg_duration = self.avg_phoneme_duration_s
                end_time = start_time + avg_duration

            phoneme_segments.append(PhonemeSegment(
                phoneme=phoneme,
                start_time=start_time,
                end_time=end_time,
                duration=end_time - start_time,
                frame_start=-1,  # Not used by this method; see get_phoneme_boundaries()
                frame_end=-1
            ))

        # Map each frame to a phoneme
        frame_phonemes = []
        for frame_idx in range(num_frames):
            frame_start_time = frame_idx * frame_duration_s
            frame_end_time = (frame_idx + 1) * frame_duration_s
            frame_center_time = frame_start_time + (frame_duration_s / 2.0)

            # Find phoneme with most overlap
            best_phoneme = ''
            max_overlap = 0.0

            for seg in phoneme_segments:
                # Calculate overlap
                overlap_start = max(frame_start_time, seg.start_time)
                overlap_end = min(frame_end_time, seg.end_time)
                overlap = max(0.0, overlap_end - overlap_start)

                if overlap > max_overlap:
                    max_overlap = overlap
                    best_phoneme = seg.phoneme

            # If no overlap (frame lies past the last phoneme), use the
            # phoneme whose center is nearest to the frame center
            if not best_phoneme:
                closest_seg = min(
                    phoneme_segments,
                    key=lambda s: abs(frame_center_time - (s.start_time + s.duration / 2))
                )
                best_phoneme = closest_seg.phoneme

            frame_phonemes.append(best_phoneme)

        logger.debug(f"Aligned {len(phoneme_times)} phonemes to {num_frames} frames")

        return frame_phonemes

    def get_phoneme_boundaries(
        self,
        phoneme_times: List[Tuple[str, float]],
        duration: float
    ) -> List[PhonemeSegment]:
        """
        Get detailed phoneme boundary information.

        Args:
            phoneme_times: List of (phoneme, start_time) tuples
            duration: Total audio duration in seconds

        Returns:
            List of PhonemeSegment objects with timing and frame information
        """
        segments = []

        for i, (phoneme, start_time) in enumerate(phoneme_times):
            if i < len(phoneme_times) - 1:
                end_time = phoneme_times[i + 1][1]
            else:
                end_time = duration

            frame_start = int(start_time / self.frame_duration_s)
            frame_end = int(end_time / self.frame_duration_s)

            segments.append(PhonemeSegment(
                phoneme=phoneme,
                start_time=start_time,
                end_time=end_time,
                duration=end_time - start_time,
                frame_start=frame_start,
                frame_end=frame_end
            ))

        return segments

    def map_text_to_frames(
        self,
        text: str,
        num_frames: int,
        audio_duration: Optional[float] = None
    ) -> List[str]:
        """
        Complete pipeline: text → phonemes → frame alignment.

        Args:
            text: Input text string
            num_frames: Number of audio frames
            audio_duration: Optional audio duration in seconds

        Returns:
            List of phonemes, one per frame (empty strings if no phonemes
            could be extracted from the text)
        """
        # Convert text to phonemes
        phoneme_times = self.text_to_phonemes(text, duration=audio_duration)

        if not phoneme_times:
            return [''] * num_frames

        # Align to frames
        frame_phonemes = self.align_phonemes_to_frames(phoneme_times, num_frames)

        return frame_phonemes
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
# Unit test function
|
| 327 |
+
def test_phoneme_mapper():
    """Test the PhonemeMapper with example text."""
    print("Testing PhonemeMapper...")

    try:
        pm = PhonemeMapper(frame_duration_ms=20)

        # Case 1: phoneme extraction from a simple word
        print("\n1. Testing 'robot':")
        word_phonemes = pm.text_to_phonemes("robot")
        print(f"   Phonemes: {word_phonemes}")
        assert len(word_phonemes) > 0, "Should extract phonemes"

        # Case 2: alignment of phonemes onto a fixed frame grid
        print("\n2. Testing frame alignment:")
        aligned = pm.align_phonemes_to_frames(word_phonemes, num_frames=25)
        print(f"   Frame phonemes (first 10): {aligned[:10]}")
        assert len(aligned) == 25, "Should have 25 frames"

        # Case 3: the end-to-end text → frames pipeline
        print("\n3. Testing complete pipeline with 'cat':")
        cat_frames = pm.map_text_to_frames("cat", num_frames=15)
        print(f"   Frame phonemes: {cat_frames}")
        assert len(cat_frames) == 15, "Should have 15 frames"

        print("\n✅ All tests passed!")

    except ImportError as e:
        print(f"❌ G2P library not available: {e}")
        print("   Install with: pip install g2p-en")
    except Exception as e:
        print(f"❌ Test failed: {e}")
        raise
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
# Run the module's self-test when executed as a script.
if __name__ == "__main__":
    test_phoneme_mapper()
|
| 364 |
+
|
models/speech_pathology_model.py
CHANGED
|
@@ -11,7 +11,7 @@ import logging
|
|
| 11 |
import torch
|
| 12 |
import torch.nn as nn
|
| 13 |
from torch.nn import functional as F
|
| 14 |
-
from transformers import Wav2Vec2Model,
|
| 15 |
from typing import Dict, Optional, Tuple, List
|
| 16 |
import os
|
| 17 |
|
|
@@ -51,7 +51,7 @@ class MultiTaskClassifierHead(nn.Module):
|
|
| 51 |
|
| 52 |
self.num_articulation_classes = num_articulation_classes
|
| 53 |
|
| 54 |
-
# Build shared feature layers
|
| 55 |
layers = []
|
| 56 |
prev_dim = input_dim
|
| 57 |
|
|
@@ -67,15 +67,15 @@ class MultiTaskClassifierHead(nn.Module):
|
|
| 67 |
self.shared_layers = nn.Sequential(*layers)
|
| 68 |
shared_output_dim = prev_dim
|
| 69 |
|
| 70 |
-
# Fluency head
|
| 71 |
self.fluency_head = nn.Sequential(
|
| 72 |
nn.Linear(shared_output_dim, 64),
|
| 73 |
nn.ReLU(),
|
| 74 |
nn.Dropout(dropout),
|
| 75 |
-
nn.Linear(64,
|
| 76 |
)
|
| 77 |
|
| 78 |
-
# Articulation head
|
| 79 |
self.articulation_head = nn.Sequential(
|
| 80 |
nn.Linear(shared_output_dim, 64),
|
| 81 |
nn.ReLU(),
|
|
@@ -83,6 +83,14 @@ class MultiTaskClassifierHead(nn.Module):
|
|
| 83 |
nn.Linear(64, num_articulation_classes), # 4 classes
|
| 84 |
)
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
logger.info(
|
| 87 |
f"Initialized MultiTaskClassifierHead: "
|
| 88 |
f"input_dim={input_dim}, hidden_dims={hidden_dims}, "
|
|
@@ -124,18 +132,23 @@ class MultiTaskClassifierHead(nn.Module):
|
|
| 124 |
shared_features = self.shared_layers(pooled_features)
|
| 125 |
|
| 126 |
# Task-specific heads
|
| 127 |
-
fluency_logits = self.fluency_head(shared_features)
|
| 128 |
-
articulation_logits = self.articulation_head(shared_features)
|
|
|
|
| 129 |
|
| 130 |
# Apply activations
|
| 131 |
-
fluency_probs =
|
| 132 |
-
articulation_probs = F.softmax(articulation_logits, dim=-1)
|
|
|
|
| 133 |
|
| 134 |
return {
|
| 135 |
"fluency_logits": fluency_logits,
|
| 136 |
"articulation_logits": articulation_logits,
|
|
|
|
| 137 |
"fluency_probs": fluency_probs,
|
| 138 |
"articulation_probs": articulation_probs,
|
|
|
|
|
|
|
| 139 |
}
|
| 140 |
|
| 141 |
|
|
@@ -210,13 +223,15 @@ class SpeechPathologyClassifier(nn.Module):
|
|
| 210 |
# Load Wav2Vec2 model and processor
|
| 211 |
hf_token = os.getenv("HF_TOKEN")
|
| 212 |
|
| 213 |
-
logger.info("Loading Wav2Vec2 model and
|
| 214 |
self.wav2vec2_model = Wav2Vec2Model.from_pretrained(
|
| 215 |
model_name,
|
| 216 |
token=hf_token if hf_token else None
|
| 217 |
)
|
| 218 |
|
| 219 |
-
|
|
|
|
|
|
|
| 220 |
model_name,
|
| 221 |
token=hf_token if hf_token else None
|
| 222 |
)
|
|
@@ -281,16 +296,49 @@ class SpeechPathologyClassifier(nn.Module):
|
|
| 281 |
- articulation_probs: Articulation class probabilities
|
| 282 |
- wav2vec2_features: Raw Wav2Vec2 features (for debugging)
|
| 283 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
# Extract features using Wav2Vec2
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
|
| 291 |
# Get last hidden state (features)
|
| 292 |
features = wav2vec2_outputs.last_hidden_state # (batch_size, seq_len, feature_dim)
|
| 293 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
# Pass through classifier head
|
| 295 |
outputs = self.classifier_head(features, attention_mask)
|
| 296 |
|
|
|
|
| 11 |
import torch
|
| 12 |
import torch.nn as nn
|
| 13 |
from torch.nn import functional as F
|
| 14 |
+
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor, Wav2Vec2Config
|
| 15 |
from typing import Dict, Optional, Tuple, List
|
| 16 |
import os
|
| 17 |
|
|
|
|
| 51 |
|
| 52 |
self.num_articulation_classes = num_articulation_classes
|
| 53 |
|
| 54 |
+
# Build shared feature layers: 1024 → 512 → 256
|
| 55 |
layers = []
|
| 56 |
prev_dim = input_dim
|
| 57 |
|
|
|
|
| 67 |
self.shared_layers = nn.Sequential(*layers)
|
| 68 |
shared_output_dim = prev_dim
|
| 69 |
|
| 70 |
+
# Fluency head: 256 → 64 → 2 (stutter/normal)
|
| 71 |
self.fluency_head = nn.Sequential(
|
| 72 |
nn.Linear(shared_output_dim, 64),
|
| 73 |
nn.ReLU(),
|
| 74 |
nn.Dropout(dropout),
|
| 75 |
+
nn.Linear(64, 2), # 2 classes: stutter/normal
|
| 76 |
)
|
| 77 |
|
| 78 |
+
# Articulation head: 256 → 64 → 4 (normal/sub/omit/dist)
|
| 79 |
self.articulation_head = nn.Sequential(
|
| 80 |
nn.Linear(shared_output_dim, 64),
|
| 81 |
nn.ReLU(),
|
|
|
|
| 83 |
nn.Linear(64, num_articulation_classes), # 4 classes
|
| 84 |
)
|
| 85 |
|
| 86 |
+
# Full combined head: 256 → 128 → 8 (all classes combined)
|
| 87 |
+
self.full_head = nn.Sequential(
|
| 88 |
+
nn.Linear(shared_output_dim, 128),
|
| 89 |
+
nn.ReLU(),
|
| 90 |
+
nn.Dropout(dropout),
|
| 91 |
+
nn.Linear(128, 8), # 8 classes (combined fluency + articulation)
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
logger.info(
|
| 95 |
f"Initialized MultiTaskClassifierHead: "
|
| 96 |
f"input_dim={input_dim}, hidden_dims={hidden_dims}, "
|
|
|
|
| 132 |
shared_features = self.shared_layers(pooled_features)
|
| 133 |
|
| 134 |
# Task-specific heads
|
| 135 |
+
fluency_logits = self.fluency_head(shared_features) # (batch, 2)
|
| 136 |
+
articulation_logits = self.articulation_head(shared_features) # (batch, 4)
|
| 137 |
+
full_logits = self.full_head(shared_features) # (batch, 8)
|
| 138 |
|
| 139 |
# Apply activations
|
| 140 |
+
fluency_probs = F.softmax(fluency_logits, dim=-1) # (batch, 2)
|
| 141 |
+
articulation_probs = F.softmax(articulation_logits, dim=-1) # (batch, 4)
|
| 142 |
+
full_probs = F.softmax(full_logits, dim=-1) # (batch, 8)
|
| 143 |
|
| 144 |
return {
|
| 145 |
"fluency_logits": fluency_logits,
|
| 146 |
"articulation_logits": articulation_logits,
|
| 147 |
+
"full_logits": full_logits,
|
| 148 |
"fluency_probs": fluency_probs,
|
| 149 |
"articulation_probs": articulation_probs,
|
| 150 |
+
"full_probs": full_probs,
|
| 151 |
+
"shared_features": shared_features,
|
| 152 |
}
|
| 153 |
|
| 154 |
|
|
|
|
| 223 |
# Load Wav2Vec2 model and processor
|
| 224 |
hf_token = os.getenv("HF_TOKEN")
|
| 225 |
|
| 226 |
+
logger.info("Loading Wav2Vec2 model and feature extractor...")
|
| 227 |
self.wav2vec2_model = Wav2Vec2Model.from_pretrained(
|
| 228 |
model_name,
|
| 229 |
token=hf_token if hf_token else None
|
| 230 |
)
|
| 231 |
|
| 232 |
+
# Use FeatureExtractor instead of Processor for feature extraction tasks
|
| 233 |
+
# Processor includes tokenizer which requires vocab file (not available for pre-trained models)
|
| 234 |
+
self.processor = Wav2Vec2FeatureExtractor.from_pretrained(
|
| 235 |
model_name,
|
| 236 |
token=hf_token if hf_token else None
|
| 237 |
)
|
|
|
|
| 296 |
- articulation_probs: Articulation class probabilities
|
| 297 |
- wav2vec2_features: Raw Wav2Vec2 features (for debugging)
|
| 298 |
"""
|
| 299 |
+
# #region agent log
|
| 300 |
+
try:
|
| 301 |
+
with open(r'c:\Users\kpanfas\Desktop\zlaqa\slaq-version-d-to-a\zlaqa-version-b\ai-enginee\zlaqa-version-b-ai-enginee\.cursor\debug.log', 'a') as f:
|
| 302 |
+
import json, time
|
| 303 |
+
f.write(json.dumps({"sessionId":"debug-session","runId":"run1","hypothesisId":"D","location":"speech_pathology_model.py:288","message":"Before Wav2Vec2 forward","data":{"input_values_shape":list(input_values.shape)},"timestamp":int(time.time()*1000)}) + '\n')
|
| 304 |
+
except: pass
|
| 305 |
+
# #endregion
|
| 306 |
+
|
| 307 |
# Extract features using Wav2Vec2
|
| 308 |
+
try:
|
| 309 |
+
with torch.no_grad() if not self.training else torch.enable_grad():
|
| 310 |
+
wav2vec2_outputs = self.wav2vec2_model(
|
| 311 |
+
input_values=input_values,
|
| 312 |
+
attention_mask=attention_mask
|
| 313 |
+
)
|
| 314 |
+
except Exception as e:
|
| 315 |
+
# #region agent log
|
| 316 |
+
try:
|
| 317 |
+
with open(r'c:\Users\kpanfas\Desktop\zlaqa\slaq-version-d-to-a\zlaqa-version-b\ai-enginee\zlaqa-version-b-ai-enginee\.cursor\debug.log', 'a') as f:
|
| 318 |
+
import json, time
|
| 319 |
+
f.write(json.dumps({"sessionId":"debug-session","runId":"run1","hypothesisId":"D","location":"speech_pathology_model.py:288","message":"Wav2Vec2 forward exception","data":{"error":str(e),"error_type":type(e).__name__,"input_shape":list(input_values.shape)},"timestamp":int(time.time()*1000)}) + '\n')
|
| 320 |
+
except: pass
|
| 321 |
+
# #endregion
|
| 322 |
+
raise
|
| 323 |
|
| 324 |
# Get last hidden state (features)
|
| 325 |
features = wav2vec2_outputs.last_hidden_state # (batch_size, seq_len, feature_dim)
|
| 326 |
|
| 327 |
+
# #region agent log
|
| 328 |
+
try:
|
| 329 |
+
with open(r'c:\Users\kpanfas\Desktop\zlaqa\slaq-version-d-to-a\zlaqa-version-b\ai-enginee\zlaqa-version-b-ai-enginee\.cursor\debug.log', 'a') as f:
|
| 330 |
+
import json, time
|
| 331 |
+
f.write(json.dumps({"sessionId":"debug-session","runId":"run1","hypothesisId":"D","location":"speech_pathology_model.py:297","message":"After Wav2Vec2 forward","data":{"features_shape":list(features.shape),"seq_len":features.shape[1] if len(features.shape) > 1 else 0},"timestamp":int(time.time()*1000)}) + '\n')
|
| 332 |
+
except: pass
|
| 333 |
+
# #endregion
|
| 334 |
+
|
| 335 |
+
# Safety check: ensure sequence length is valid (at least 1)
|
| 336 |
+
if features.shape[1] < 1:
|
| 337 |
+
raise ValueError(
|
| 338 |
+
f"Wav2Vec2 output sequence length is too short: {features.shape[1]}. "
|
| 339 |
+
f"Input was {input_values.shape}. Try using longer audio segments (>= 500ms)."
|
| 340 |
+
)
|
| 341 |
+
|
| 342 |
# Pass through classifier head
|
| 343 |
outputs = self.classifier_head(features, attention_mask)
|
| 344 |
|
tests/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test module for speech pathology diagnosis system.
|
| 3 |
+
"""
|
| 4 |
+
|
tests/integration_tests.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Integration tests for speech pathology diagnosis API.
|
| 3 |
+
|
| 4 |
+
Tests API endpoints, error mapping, and therapy recommendations.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
import numpy as np
|
| 9 |
+
import tempfile
|
| 10 |
+
import soundfile as sf
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import json
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def test_phoneme_mapping():
    """Test phoneme mapping functionality."""
    logger.info("Testing phoneme mapping...")

    try:
        from models.phoneme_mapper import PhonemeMapper

        pm = PhonemeMapper(frame_duration_ms=20)

        # Case 1: phoneme extraction from a simple word
        extracted = pm.text_to_phonemes("robot")
        assert len(extracted) > 0, "Should extract phonemes"
        logger.info(f"✅ 'robot' → {len(extracted)} phonemes: {[p[0] for p in extracted]}")

        # Case 2: alignment onto a fixed frame grid
        aligned = pm.align_phonemes_to_frames(extracted, num_frames=25)
        assert len(aligned) == 25, "Should have 25 frames"
        logger.info(f"✅ Aligned to {len(aligned)} frames")

        # Case 3: end-to-end text → frames pipeline
        cat_frames = pm.map_text_to_frames("cat", num_frames=15)
        assert len(cat_frames) == 15, "Should have 15 frames"
        logger.info(f"✅ 'cat' → {len(cat_frames)} frame phonemes")

        return True

    except ImportError as e:
        logger.warning(f"⚠️ G2P library not available: {e}")
        return False
    except Exception as e:
        logger.error(f"❌ Phoneme mapping test failed: {e}")
        return False
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def test_error_taxonomy():
    """Verify classifier-output → error-type mapping and severity bucketing."""
    logger.info("Testing error taxonomy...")

    try:
        from models.error_taxonomy import ErrorMapper, ErrorType, SeverityLevel

        taxonomy = ErrorMapper()

        # Class 0 must map to a zero-severity NORMAL result.
        normal = taxonomy.map_classifier_output(0, 0.95, "/k/")
        assert normal.error_type == ErrorType.NORMAL
        assert normal.severity == 0.0
        logger.info(f"✅ Normal error mapping: {normal.error_type}")

        # Class 1 is a substitution and must carry the substituted sound.
        substitution = taxonomy.map_classifier_output(1, 0.78, "/s/")
        assert substitution.error_type == ErrorType.SUBSTITUTION
        assert substitution.wrong_sound is not None
        logger.info(f"✅ Substitution error: {substitution.error_type}, wrong_sound={substitution.wrong_sound}")
        logger.info(f" Therapy: {substitution.therapy[:60]}...")

        # Class 2 is an omission.
        omission = taxonomy.map_classifier_output(2, 0.85, "/r/")
        assert omission.error_type == ErrorType.OMISSION
        logger.info(f"✅ Omission error: {omission.error_type}")
        logger.info(f" Therapy: {omission.therapy[:60]}...")

        # Class 3 is a distortion.
        distortion = taxonomy.map_classifier_output(3, 0.65, "/s/")
        assert distortion.error_type == ErrorType.DISTORTION
        logger.info(f"✅ Distortion error: {distortion.error_type}")
        logger.info(f" Therapy: {distortion.therapy[:60]}...")

        # Severity score → bucket boundaries.
        for score, expected_level in (
            (0.0, SeverityLevel.NONE),
            (0.2, SeverityLevel.LOW),
            (0.5, SeverityLevel.MEDIUM),
            (0.8, SeverityLevel.HIGH),
        ):
            assert taxonomy.get_severity_level(score) == expected_level
        logger.info("✅ Severity level mapping correct")

        return True

    except Exception as e:
        logger.error(f"❌ Error taxonomy test failed: {e}")
        return False
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def test_batch_diagnosis_endpoint(pipeline, phoneme_mapper, error_mapper):
    """Exercise the batch diagnosis path end-to-end on synthetic audio.

    Args:
        pipeline: InferencePipeline used for phone-level prediction.
        phoneme_mapper: PhonemeMapper aligning transcript phonemes to frames.
        error_mapper: ErrorMapper converting class ids to error details.

    Returns:
        True when inference and error mapping completed; False on any
        failure (logged).
    """
    # BUG FIX: `ErrorType` is referenced below but was never imported in this
    # function's scope (the module only imports it inside other tests' try
    # blocks), so this test always failed with NameError. Also hoist the `os`
    # import out of the `finally` clause.
    from models.error_taxonomy import ErrorType
    import os

    logger.info("Testing batch diagnosis endpoint...")

    try:
        # Generate test audio: 2 s of a 440 Hz sine tone at 16 kHz.
        duration = 2.0
        sample_rate = 16000
        num_samples = int(duration * sample_rate)
        audio = 0.5 * np.sin(2 * np.pi * 440 * np.linspace(0, duration, num_samples))
        audio = audio.astype(np.float32)

        # Reserve a temp path, then write AFTER the handle is closed so the
        # write succeeds on platforms that lock open files (Windows).
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            temp_path = f.name
        sf.write(temp_path, audio, sample_rate)

        try:
            # Run inference
            result = pipeline.predict_phone_level(temp_path, return_timestamps=True)

            # Align a reference transcript's phonemes onto the model frames.
            text = "test audio"
            frame_phonemes = phoneme_mapper.map_text_to_frames(
                text,
                num_frames=result.num_frames,
                audio_duration=result.duration
            )

            # Convert per-frame classifier output into error details.
            # Stutter frames are shifted by 4 class ids — assumes the
            # classifier reserves ids 4+ for fluency errors (confirm against
            # ErrorMapper's taxonomy).
            errors = []
            for i, frame_pred in enumerate(result.frame_predictions):
                class_id = frame_pred.articulation_class
                if frame_pred.fluency_label == 'stutter':
                    class_id += 4

                error_detail = error_mapper.map_classifier_output(
                    class_id=class_id,
                    confidence=frame_pred.confidence,
                    phoneme=frame_phonemes[i] if i < len(frame_phonemes) else '',
                    fluency_label=frame_pred.fluency_label
                )

                # Only keep frames flagged as actual errors.
                if error_detail.error_type != ErrorType.NORMAL:
                    errors.append(error_detail)

            logger.info(f"✅ Batch diagnosis: {result.num_frames} frames, {len(errors)} errors detected")

            return True

        finally:
            # Always clean up the temp wav file.
            if os.path.exists(temp_path):
                os.remove(temp_path)

    except Exception as e:
        logger.error(f"❌ Batch diagnosis test failed: {e}")
        return False
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def test_therapy_recommendations():
    """Check that each covered error type yields a non-empty therapy text."""
    logger.info("Testing therapy recommendations...")

    try:
        from models.error_taxonomy import ErrorMapper, ErrorType

        therapy_source = ErrorMapper()

        # (phoneme, error type, optional substituted sound) combinations.
        cases = (
            ("/s/", ErrorType.SUBSTITUTION, "/θ/"),
            ("/r/", ErrorType.OMISSION, None),
            ("/s/", ErrorType.DISTORTION, None),
        )

        for phoneme, error_type, wrong_sound in cases:
            therapy = therapy_source.get_therapy(error_type, phoneme, wrong_sound)
            assert therapy and len(therapy) > 0, f"Therapy should not be empty for {phoneme}"
            logger.info(f"✅ {phoneme} {error_type.value}: {therapy[:50]}...")

        return True

    except Exception as e:
        logger.error(f"❌ Therapy recommendations test failed: {e}")
        return False
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def run_all_integration_tests():
    """Run every integration test and return (all_passed, per-test results)."""
    logger.info("=" * 60)
    logger.info("Running Integration Tests")
    logger.info("=" * 60)

    results = {}

    # Standalone tests that need no model pipeline.
    standalone = (
        ("phoneme_mapping", "\n1. Phoneme Mapping Test", test_phoneme_mapping),
        ("error_taxonomy", "\n2. Error Taxonomy Test", test_error_taxonomy),
        ("therapy_recommendations", "\n3. Therapy Recommendations Test", test_therapy_recommendations),
    )
    for result_key, banner, test_fn in standalone:
        logger.info(banner)
        results[result_key] = test_fn()

    # The batch diagnosis test needs the full inference stack; skip
    # gracefully when the pipeline cannot be constructed.
    try:
        from inference.inference_pipeline import create_inference_pipeline
        from models.phoneme_mapper import PhonemeMapper
        from models.error_taxonomy import ErrorMapper

        logger.info("\n4. Batch Diagnosis Test")
        results["batch_diagnosis"] = test_batch_diagnosis_endpoint(
            create_inference_pipeline(), PhonemeMapper(), ErrorMapper()
        )
    except Exception as e:
        logger.warning(f"⚠️ Batch diagnosis test skipped: {e}")
        results["batch_diagnosis"] = False

    # Summary
    logger.info("\n" + "=" * 60)
    logger.info("Integration Test Summary")
    logger.info("=" * 60)

    for test_name, passed in results.items():
        status = "✅ PASSED" if passed else "❌ FAILED"
        logger.info(f"{status}: {test_name}")

    return all(results.values()), results
| 236 |
+
|
| 237 |
+
|
| 238 |
+
if __name__ == "__main__":
    # Script entry point: run the suite and exit with a CI-friendly status.
    import sys

    logging.basicConfig(level=logging.INFO)

    all_passed, results = run_all_integration_tests()

    # FIX: use sys.exit instead of the interactive-only exit() helper, which
    # is injected by the site module and absent under `python -S` / frozen apps.
    if all_passed:
        logger.info("\n✅ All integration tests passed!")
        sys.exit(0)
    else:
        logger.error("\n❌ Some integration tests failed!")
        sys.exit(1)
| 249 |
+
|
tests/performance_tests.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Performance tests for speech pathology diagnosis system.
|
| 3 |
+
|
| 4 |
+
Tests latency requirements:
|
| 5 |
+
- File batch: <200ms per file
|
| 6 |
+
- Per-frame: <50ms
|
| 7 |
+
- WebSocket roundtrip: <100ms
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import time
|
| 11 |
+
import numpy as np
|
| 12 |
+
import logging
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import asyncio
|
| 15 |
+
from typing import Dict, List
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def generate_test_audio(duration_seconds: float = 1.0, sample_rate: int = 16000) -> np.ndarray:
    """Create a synthetic 440 Hz sine tone for latency testing.

    Args:
        duration_seconds: Length of the clip in seconds.
        sample_rate: Sampling rate in Hz.

    Returns:
        float32 mono waveform with peak amplitude 0.5.
    """
    total_samples = int(duration_seconds * sample_rate)
    timeline = np.linspace(0, duration_seconds, total_samples)
    tone = 0.5 * np.sin(2 * np.pi * 440 * timeline)  # pure 440 Hz test tone
    return tone.astype(np.float32)
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def test_batch_latency(pipeline, num_files: int = 10) -> Dict[str, float]:
    """Measure end-to-end file inference latency against the 200 ms target.

    Args:
        pipeline: InferencePipeline instance.
        num_files: Number of synthetic test files to process.

    Returns:
        Latency statistics plus a `passed` flag (avg < 200 ms), or an
        `error` entry when no run succeeded.
    """
    # FIX: these imports previously ran inside the per-file loop; hoist them
    # so the loop body only pays for file I/O and inference.
    import os
    import tempfile
    import soundfile as sf

    logger.info(f"Testing batch latency with {num_files} files...")

    latencies = []

    for i in range(num_files):
        # Generate test audio
        audio = generate_test_audio(duration_seconds=1.0)

        # Reserve the temp path first, then write AFTER the handle is closed
        # so the write works on platforms that lock open files (Windows).
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
            temp_path = f.name
        sf.write(temp_path, audio, 16000)

        try:
            start_time = time.time()
            result = pipeline.predict_phone_level(temp_path, return_timestamps=True)
            latency_ms = (time.time() - start_time) * 1000
            latencies.append(latency_ms)

            logger.info(f"  File {i+1}: {latency_ms:.1f}ms ({result.num_frames} frames)")
        except Exception as e:
            logger.error(f"  File {i+1} failed: {e}")
        finally:
            if os.path.exists(temp_path):
                os.remove(temp_path)

    if not latencies:
        return {"error": "No successful runs"}

    avg_latency = sum(latencies) / len(latencies)
    max_latency = max(latencies)
    min_latency = min(latencies)

    result = {
        "avg_latency_ms": avg_latency,
        "max_latency_ms": max_latency,
        "min_latency_ms": min_latency,
        "num_files": len(latencies),
        "target_ms": 200.0,
        "passed": avg_latency < 200.0
    }

    logger.info(f"✅ Batch latency test: avg={avg_latency:.1f}ms, max={max_latency:.1f}ms, "
                f"target=200ms, passed={result['passed']}")

    return result
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def test_frame_latency(pipeline, num_frames: int = 100) -> Dict[str, float]:
    """Measure repeated single-window inference latency (target: <50 ms).

    Args:
        pipeline: InferencePipeline instance.
        num_frames: Number of timed inference runs.

    Returns:
        Latency statistics (avg/min/max/p95) and a pass flag.
    """
    logger.info(f"Testing frame latency with {num_frames} frames...")

    # One second of audio is enough for a single inference window.
    audio = generate_test_audio(duration_seconds=1.0)

    latencies = []

    for run_index in range(num_frames):
        started = time.time()
        try:
            pipeline.predict_phone_level(audio, return_timestamps=False)
            latencies.append((time.time() - started) * 1000)
        except Exception as e:
            logger.error(f" Frame {run_index+1} failed: {e}")

    if not latencies:
        return {"error": "No successful runs"}

    ordered = sorted(latencies)
    avg_latency = sum(latencies) / len(latencies)
    p95_latency = ordered[int(len(ordered) * 0.95)]

    result = {
        "avg_latency_ms": avg_latency,
        "max_latency_ms": ordered[-1],
        "min_latency_ms": ordered[0],
        "p95_latency_ms": p95_latency,
        "num_frames": len(latencies),
        "target_ms": 50.0,
        "passed": avg_latency < 50.0
    }

    logger.info(f"✅ Frame latency test: avg={avg_latency:.1f}ms, p95={p95_latency:.1f}ms, "
                f"target=50ms, passed={result['passed']}")

    return result
| 150 |
+
|
| 151 |
+
|
| 152 |
+
async def test_websocket_latency(websocket_url: str, num_chunks: int = 50) -> Dict[str, float]:
    """Measure WebSocket send/receive roundtrip latency (target: <100 ms).

    Args:
        websocket_url: WebSocket endpoint URL.
        num_chunks: Number of 20 ms audio chunks to send.

    Returns:
        Latency statistics and a pass flag, or an `error` entry when the
        websockets library is missing or the connection fails.
    """
    try:
        import websockets

        logger.info(f"Testing WebSocket latency with {num_chunks} chunks...")

        latencies = []

        async with websockets.connect(websocket_url) as websocket:
            # One chunk = 20 ms @ 16 kHz (320 samples), sent as 16-bit PCM.
            # FIX: removed an unused `chunk_samples` local.
            audio_chunk = generate_test_audio(duration_seconds=0.02)
            chunk_bytes = (audio_chunk * 32768).astype(np.int16).tobytes()

            for i in range(num_chunks):
                start_time = time.time()

                # Send chunk, then block until the matching response arrives;
                # the roundtrip time is the measured latency.
                await websocket.send(chunk_bytes)
                response = await websocket.recv()

                latency_ms = (time.time() - start_time) * 1000
                latencies.append(latency_ms)

                if i % 10 == 0:
                    logger.info(f" Chunk {i+1}: {latency_ms:.1f}ms")

        if not latencies:
            return {"error": "No successful runs"}

        avg_latency = sum(latencies) / len(latencies)
        max_latency = max(latencies)
        p95_latency = sorted(latencies)[int(len(latencies) * 0.95)]

        result = {
            "avg_latency_ms": avg_latency,
            "max_latency_ms": max_latency,
            "p95_latency_ms": p95_latency,
            "num_chunks": len(latencies),
            "target_ms": 100.0,
            "passed": avg_latency < 100.0
        }

        logger.info(f"✅ WebSocket latency test: avg={avg_latency:.1f}ms, p95={p95_latency:.1f}ms, "
                    f"target=100ms, passed={result['passed']}")

        return result

    except ImportError:
        logger.warning("websockets library not available, skipping WebSocket test")
        return {"error": "websockets library not available"}
    except Exception as e:
        logger.error(f"WebSocket test failed: {e}")
        return {"error": str(e)}
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def test_concurrent_connections(pipeline, num_connections: int = 10) -> "Dict[str, Any]":
    """Simulate concurrent inference requests via a thread pool.

    Args:
        pipeline: InferencePipeline instance. NOTE(review): assumes the
            pipeline is thread-safe — confirm before trusting these numbers.
        num_connections: Number of simultaneous requests.

    Returns:
        Success/failure counts, average latency, and throughput.
    """
    # FIX: the return annotation used `Any`, which is not imported at module
    # level (only `from typing import Dict, List`), so importing this module
    # raised NameError. A string annotation defers evaluation and keeps the API.
    logger.info(f"Testing {num_connections} concurrent connections...")

    import concurrent.futures

    def process_audio(request_index: int):
        """Run one inference and report its latency or failure reason."""
        try:
            audio = generate_test_audio(duration_seconds=0.5)
            started = time.time()
            result = pipeline.predict_phone_level(audio, return_timestamps=False)
            latency_ms = (time.time() - started) * 1000
            return {"success": True, "latency_ms": latency_ms, "frames": result.num_frames}
        except Exception as e:
            return {"success": False, "error": str(e)}

    start_time = time.time()

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_connections) as executor:
        futures = [executor.submit(process_audio, i) for i in range(num_connections)]
        results = [f.result() for f in concurrent.futures.as_completed(futures)]

    total_time = time.time() - start_time

    successful = sum(1 for r in results if r.get("success", False))
    avg_latency = sum(r["latency_ms"] for r in results if r.get("success", False)) / successful if successful > 0 else 0.0

    result = {
        "total_connections": num_connections,
        "successful": successful,
        "failed": num_connections - successful,
        "total_time_seconds": total_time,
        "avg_latency_ms": avg_latency,
        "throughput_per_second": successful / total_time if total_time > 0 else 0.0
    }

    logger.info(f"✅ Concurrent test: {successful}/{num_connections} successful, "
                f"avg_latency={avg_latency:.1f}ms, throughput={result['throughput_per_second']:.1f}/s")

    return result
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def run_all_performance_tests(pipeline, websocket_url: "Optional[str]" = None) -> "Dict[str, Any]":
    """Run batch, frame, concurrency and (optionally) WebSocket latency tests.

    Args:
        pipeline: InferencePipeline instance.
        websocket_url: Optional WebSocket URL; the streaming test runs only
            when this is provided.

    Returns:
        Mapping of test name to its result dictionary.
    """
    # FIX: `Optional` and `Any` appeared in the signature but were never
    # imported (the module imports only `Dict, List` from typing), so merely
    # importing this module raised NameError. String annotations defer
    # evaluation without changing the API.
    logger.info("=" * 60)
    logger.info("Running Performance Tests")
    logger.info("=" * 60)

    results = {}

    # Test 1: Batch latency
    logger.info("\n1. Batch File Latency Test")
    results["batch_latency"] = test_batch_latency(pipeline)

    # Test 2: Frame latency
    logger.info("\n2. Per-Frame Latency Test")
    results["frame_latency"] = test_frame_latency(pipeline)

    # Test 3: Concurrent connections
    logger.info("\n3. Concurrent Connections Test")
    results["concurrent"] = test_concurrent_connections(pipeline, num_connections=10)

    # Test 4: WebSocket latency (if URL provided)
    if websocket_url:
        logger.info("\n4. WebSocket Latency Test")
        results["websocket_latency"] = asyncio.run(test_websocket_latency(websocket_url))

    # Summary
    logger.info("\n" + "=" * 60)
    logger.info("Performance Test Summary")
    logger.info("=" * 60)

    if "batch_latency" in results and results["batch_latency"].get("passed"):
        logger.info("✅ Batch latency: PASSED")
    else:
        logger.warning("❌ Batch latency: FAILED")

    if "frame_latency" in results and results["frame_latency"].get("passed"):
        logger.info("✅ Frame latency: PASSED")
    else:
        logger.warning("❌ Frame latency: FAILED")

    if "websocket_latency" in results and results["websocket_latency"].get("passed"):
        logger.info("✅ WebSocket latency: PASSED")
    elif "websocket_latency" in results:
        logger.warning("❌ WebSocket latency: FAILED")

    return results
| 326 |
+
|
| 327 |
+
|
| 328 |
+
if __name__ == "__main__":
    # Ad-hoc entry point: build a pipeline and dump all timing results.
    logging.basicConfig(level=logging.INFO)

    try:
        import json

        from inference.inference_pipeline import create_inference_pipeline

        perf_results = run_all_performance_tests(create_inference_pipeline())

        print("\nTest Results:")
        print(json.dumps(perf_results, indent=2))

    except Exception as e:
        logger.error(f"Test failed: {e}", exc_info=True)
ui/gradio_interface.py
CHANGED
|
@@ -163,21 +163,21 @@ def analyze_speech(
|
|
| 163 |
try:
|
| 164 |
with open(r'c:\Users\kpanfas\Desktop\zlaqa\slaq-version-d-to-a\zlaqa-version-b\ai-enginee\zlaqa-version-b-ai-enginee\.cursor\debug.log', 'a') as f:
|
| 165 |
import json
|
| 166 |
-
f.write(json.dumps({"sessionId":"debug-session","runId":"run1","hypothesisId":"B","location":"gradio_interface.py:139","message":"After predict_batch call","data":{"success":True},"timestamp":int(time.time()*1000)}) + '\n')
|
| 167 |
except: pass
|
| 168 |
# #endregion
|
| 169 |
|
| 170 |
# Calculate processing time
|
| 171 |
processing_time_ms = (time.time() - start_time) * 1000
|
| 172 |
|
| 173 |
-
# Extract metrics
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
|
| 178 |
-
#
|
| 179 |
-
|
| 180 |
-
fluent_frames_percentage =
|
| 181 |
|
| 182 |
# Format fluency score with color coding
|
| 183 |
if fluency_percentage >= 80:
|
|
@@ -203,10 +203,22 @@ def analyze_speech(
|
|
| 203 |
"""
|
| 204 |
|
| 205 |
# Format articulation issues
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
-
# Format confidence
|
| 209 |
-
confidence_percentage = result.confidence * 100
|
| 210 |
confidence_html = f"""
|
| 211 |
<div style='text-align: center; padding: 10px;'>
|
| 212 |
<h3 style='color: #2196F3; font-size: 32px; margin: 5px 0;'>
|
|
@@ -223,7 +235,7 @@ def analyze_speech(
|
|
| 223 |
⏱️ Processing Time: <strong>{processing_time_ms:.0f}ms</strong>
|
| 224 |
</p>
|
| 225 |
<p style='color: #999; font-size: 12px;'>
|
| 226 |
-
Analyzed {
|
| 227 |
</p>
|
| 228 |
</div>
|
| 229 |
"""
|
|
@@ -232,24 +244,32 @@ def analyze_speech(
|
|
| 232 |
json_output = {
|
| 233 |
"status": "success",
|
| 234 |
"fluency_metrics": {
|
| 235 |
-
"mean_fluency":
|
| 236 |
"fluency_percentage": fluency_percentage,
|
| 237 |
-
"fluent_frames_ratio":
|
| 238 |
"fluent_frames_percentage": fluent_frames_percentage,
|
| 239 |
-
"
|
| 240 |
-
"min": fluency_metrics.get("min", 0.0),
|
| 241 |
-
"max": fluency_metrics.get("max", 0.0),
|
| 242 |
-
"median": fluency_metrics.get("median", 0.0)
|
| 243 |
},
|
| 244 |
"articulation_results": {
|
| 245 |
-
"total_frames":
|
| 246 |
-
"
|
| 247 |
-
"
|
|
|
|
| 248 |
},
|
| 249 |
-
"confidence":
|
| 250 |
"confidence_percentage": confidence_percentage,
|
| 251 |
"processing_time_ms": processing_time_ms,
|
| 252 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
}
|
| 254 |
|
| 255 |
logger.info(f"✅ Analysis complete: fluency={fluency_percentage:.1f}%, "
|
|
|
|
| 163 |
try:
|
| 164 |
with open(r'c:\Users\kpanfas\Desktop\zlaqa\slaq-version-d-to-a\zlaqa-version-b\ai-enginee\zlaqa-version-b-ai-enginee\.cursor\debug.log', 'a') as f:
|
| 165 |
import json
|
| 166 |
+
f.write(json.dumps({"sessionId":"debug-session","runId":"run1","hypothesisId":"B","location":"gradio_interface.py:139","message":"After predict_batch call","data":{"success":True,"num_frames":result.num_frames},"timestamp":int(time.time()*1000)}) + '\n')
|
| 167 |
except: pass
|
| 168 |
# #endregion
|
| 169 |
|
| 170 |
# Calculate processing time
|
| 171 |
processing_time_ms = (time.time() - start_time) * 1000
|
| 172 |
|
| 173 |
+
# Extract metrics from new PhoneLevelResult format
|
| 174 |
+
aggregate = result.aggregate
|
| 175 |
+
mean_fluency_stutter = aggregate.get("fluency_score", 0.0)
|
| 176 |
+
fluency_percentage = (1.0 - mean_fluency_stutter) * 100 # Convert stutter prob to fluency percentage
|
| 177 |
|
| 178 |
+
# Count fluent frames
|
| 179 |
+
fluent_frames = sum(1 for fp in result.frame_predictions if fp.fluency_label == 'normal')
|
| 180 |
+
fluent_frames_percentage = (fluent_frames / result.num_frames * 100) if result.num_frames > 0 else 0.0
|
| 181 |
|
| 182 |
# Format fluency score with color coding
|
| 183 |
if fluency_percentage >= 80:
|
|
|
|
| 203 |
"""
|
| 204 |
|
| 205 |
# Format articulation issues
|
| 206 |
+
articulation_class = aggregate.get("articulation_class", 0)
|
| 207 |
+
articulation_label = aggregate.get("articulation_label", "normal")
|
| 208 |
+
articulation_text = f"**Dominant Class:** {articulation_label.capitalize()}\n\n"
|
| 209 |
+
articulation_text += f"**Frame Breakdown:**\n"
|
| 210 |
+
class_counts = {}
|
| 211 |
+
for fp in result.frame_predictions:
|
| 212 |
+
label = fp.articulation_label
|
| 213 |
+
class_counts[label] = class_counts.get(label, 0) + 1
|
| 214 |
+
for label, count in sorted(class_counts.items(), key=lambda x: x[1], reverse=True):
|
| 215 |
+
percentage = (count / result.num_frames * 100) if result.num_frames > 0 else 0.0
|
| 216 |
+
articulation_text += f"- {label.capitalize()}: {count} frames ({percentage:.1f}%)\n"
|
| 217 |
+
|
| 218 |
+
# Calculate average confidence
|
| 219 |
+
avg_confidence = sum(fp.confidence for fp in result.frame_predictions) / result.num_frames if result.num_frames > 0 else 0.0
|
| 220 |
+
confidence_percentage = avg_confidence * 100
|
| 221 |
|
|
|
|
|
|
|
| 222 |
confidence_html = f"""
|
| 223 |
<div style='text-align: center; padding: 10px;'>
|
| 224 |
<h3 style='color: #2196F3; font-size: 32px; margin: 5px 0;'>
|
|
|
|
| 235 |
⏱️ Processing Time: <strong>{processing_time_ms:.0f}ms</strong>
|
| 236 |
</p>
|
| 237 |
<p style='color: #999; font-size: 12px;'>
|
| 238 |
+
Analyzed {result.num_frames} frames ({result.duration:.2f}s audio)
|
| 239 |
</p>
|
| 240 |
</div>
|
| 241 |
"""
|
|
|
|
| 244 |
json_output = {
|
| 245 |
"status": "success",
|
| 246 |
"fluency_metrics": {
|
| 247 |
+
"mean_fluency": fluency_percentage / 100.0,
|
| 248 |
"fluency_percentage": fluency_percentage,
|
| 249 |
+
"fluent_frames_ratio": fluent_frames / result.num_frames if result.num_frames > 0 else 0.0,
|
| 250 |
"fluent_frames_percentage": fluent_frames_percentage,
|
| 251 |
+
"stutter_probability": mean_fluency_stutter
|
|
|
|
|
|
|
|
|
|
| 252 |
},
|
| 253 |
"articulation_results": {
|
| 254 |
+
"total_frames": result.num_frames,
|
| 255 |
+
"dominant_class": articulation_class,
|
| 256 |
+
"dominant_label": articulation_label,
|
| 257 |
+
"class_distribution": class_counts
|
| 258 |
},
|
| 259 |
+
"confidence": avg_confidence,
|
| 260 |
"confidence_percentage": confidence_percentage,
|
| 261 |
"processing_time_ms": processing_time_ms,
|
| 262 |
+
"frame_predictions": [
|
| 263 |
+
{
|
| 264 |
+
"time": fp.time,
|
| 265 |
+
"fluency_prob": fp.fluency_prob,
|
| 266 |
+
"fluency_label": fp.fluency_label,
|
| 267 |
+
"articulation_class": fp.articulation_class,
|
| 268 |
+
"articulation_label": fp.articulation_label,
|
| 269 |
+
"confidence": fp.confidence
|
| 270 |
+
}
|
| 271 |
+
for fp in result.frame_predictions[:20] # First 20 frames for preview
|
| 272 |
+
]
|
| 273 |
}
|
| 274 |
|
| 275 |
logger.info(f"✅ Analysis complete: fluency={fluency_percentage:.1f}%, "
|