mutisya committed
Commit e2ccb09 · verified · 1 parent: 5e05084

Deploy Polyglot backend with quantized models

.dockerignore ADDED
@@ -0,0 +1,12 @@
+ .cache
+ nltk_data
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ *.so
+ *.egg
+ *.egg-info
+ dist
+ build
Dockerfile ADDED
@@ -0,0 +1,47 @@
+ FROM python:3.11-slim
+
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     ffmpeg \
+     libsndfile1 \
+     sox \
+     espeak \
+     espeak-data \
+     libespeak1 \
+     libespeak-dev \
+     wget \
+     gnupg \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY app ./app
+ COPY preload_models.py .
+
+ # Set environment variables for caching
+ ENV HF_HOME=/app/.cache
+ ENV TRANSFORMERS_CACHE=/app/.cache
+ ENV NLTK_DATA=/app/nltk_data
+ ENV PYTHONPATH=/app
+ ENV PORT=7860
+
+ # Create cache directories
+ RUN mkdir -p $HF_HOME && chmod -R 777 $HF_HOME
+ RUN mkdir -p $NLTK_DATA && chmod -R 777 $NLTK_DATA
+
+ # Download models using HF token from environment
+ # HuggingFace Spaces automatically provides HUGGING_FACE_HUB_TOKEN
+ ARG HUGGING_FACE_HUB_TOKEN
+ RUN python preload_models.py $HUGGING_FACE_HUB_TOKEN || echo "Model preload skipped - will download on first use"
+
+ # Expose port 7860 (HuggingFace Spaces standard)
+ EXPOSE 7860
+
+ # Run the application
+ CMD ["uvicorn", "app.main:socket_app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,40 @@
- ---
- title: Polyglot Backend Quant
- emoji: 📊
- colorFrom: yellow
- colorTo: green
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Polyglot Translation Backend
+ emoji: 🌍
+ colorFrom: blue
+ colorTo: green
+ sdk: docker
+ pinned: false
+ license: mit
+ app_port: 7860
+ ---
+
+ # Polyglot Translation Backend - Quantized Models
+
+ Real-time speech transcription and translation API with Socket.IO for WebSocket communication. This version uses INT8 quantized models for improved performance and reduced memory footprint.
+
+ ## Features
+
+ - **Real-time Speech Recognition**: Support for English, Swahili, Kikuyu, Kamba, Kimeru, Luo, and Somali
+ - **Translation**: Multi-language translation using NLLB models
+ - **Text-to-Speech**: Generate speech in multiple languages
+ - **WebSocket Support**: Real-time communication via Socket.IO
+ - **Model Quantization**: INT8 dynamic quantization for faster inference
+
+ ## API Endpoints
+
+ - `GET /health` - Health check endpoint
+ - `WebSocket /` - Socket.IO connection for real-time communication
+
+ ## Environment
+
+ This Space requires a HuggingFace token for model access. The token is automatically provided by HuggingFace Spaces when configured as a secret.
+
+ ## Technical Details
+
+ - **Framework**: FastAPI with Socket.IO
+ - **Models**:
+   - ASR: Whisper (English) and Wav2Vec2-BERT (African languages)
+   - Translation: NLLB-600M fine-tuned model
+   - TTS: VITS models for each language
+ - **Optimization**: INT8 dynamic quantization via PyTorch
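The README describes token-authenticated Socket.IO access. A minimal client sketch, assuming the `python-socketio` client package; the Space URL and token below are placeholders, and the auth-dict shape matches the `auth_data` path in `app/auth.py` further down this commit:

```python
import socketio

# Hypothetical values; replace with your own Space URL and HF token.
BACKEND_URL = "https://your-space.hf.space"
HF_TOKEN = "hf_xxx"

sio = socketio.Client()

@sio.event
def connect():
    print("connected")
    # test_echo is a server event defined in app/main.py below
    sio.emit("test_echo", {"ping": 1})

@sio.on("test_echo_response")
def on_echo(data):
    print("echo:", data)
    sio.disconnect()

# The server's connect handler reads the token from the Socket.IO auth dict.
sio.connect(BACKEND_URL, auth={"token": HF_TOKEN})
sio.wait()
```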
app/__init__.py ADDED
@@ -0,0 +1 @@
+ # Backend application package
app/auth.py ADDED
@@ -0,0 +1,310 @@
+ """
+ Authentication module for HuggingFace token validation
+ """
+ import os
+ from typing import Optional
+ from fastapi import HTTPException, status, Request
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+ from fastapi.security.utils import get_authorization_scheme_param
+
+
+ def is_local_development() -> bool:
+     """
+     Detect if the application is running in local development mode.
+     This checks multiple indicators to determine if auth should be disabled.
+     """
+     # Method 1: Explicit disable auth flag
+     disable_auth = os.getenv('DISABLE_AUTH', '').lower()
+     if disable_auth in ['true', '1', 'yes']:
+         return True
+
+     # Method 2: Check ENVIRONMENT variable
+     environment = os.getenv('ENVIRONMENT', '').lower()
+     if environment in ['development', 'dev', 'local']:
+         return True
+
+     # Method 3: Check DEBUG flag
+     debug = os.getenv('DEBUG', '').lower()
+     if debug in ['true', '1', 'yes']:
+         return True
+
+     # Method 4: Check if running on localhost/development ports
+     host = os.getenv('HOST', '')
+     port = os.getenv('PORT', '')
+     if host in ['localhost', '127.0.0.1', '0.0.0.0'] and port == '7860':
+         return True
+
+     # Method 5: Check for presence of local development files
+     local_indicators = [
+         '.env.local',
+         'docker-compose.local.yml',
+         'Dockerfile.local'
+     ]
+     for indicator in local_indicators:
+         if os.path.exists(indicator):
+             return True
+
+     # Method 6: Check if we're in a Docker container with local development setup
+     if os.path.exists('/.dockerenv'):
+         # We're in Docker, check if it's local development
+         if os.getenv('ALLOW_ALL_ORIGINS', '').lower() == 'true':
+             return True
+
+     return False
+
+
+ class HuggingFaceTokenAuth:
+     """HuggingFace token authentication handler"""
+
+     def __init__(self):
+         self.bearer = HTTPBearer(auto_error=False)
+         self.is_local = is_local_development()
+
+         if self.is_local:
+             print("🔓 RUNNING IN LOCAL DEVELOPMENT MODE - AUTH DISABLED")
+             print("   Environment indicators:")
+             print(f"   - DISABLE_AUTH: {os.getenv('DISABLE_AUTH', 'not set')}")
+             print(f"   - ENVIRONMENT: {os.getenv('ENVIRONMENT', 'not set')}")
+             print(f"   - DEBUG: {os.getenv('DEBUG', 'not set')}")
+             print(f"   - HOST: {os.getenv('HOST', 'not set')}")
+             print(f"   - PORT: {os.getenv('PORT', 'not set')}")
+             print(f"   - ALLOW_ALL_ORIGINS: {os.getenv('ALLOW_ALL_ORIGINS', 'not set')}")
+             print(f"   - Docker container: {os.path.exists('/.dockerenv')}")
+             print(f"   - .env.local exists: {os.path.exists('.env.local')}")
+         else:
+             print("🔒 RUNNING IN PRODUCTION MODE - AUTH REQUIRED")
+
+     def verify_token(self, token: str) -> bool:
+         """
+         Verify if the token is a valid HuggingFace token.
+         In local development mode, always returns True.
+         """
+         # Skip token validation in local development
+         if self.is_local:
+             print("🔓 Local development mode: skipping token validation")
+             return True
+
+         try:
+             if not token:
+                 return False
+
+             if not isinstance(token, str):
+                 print(f"❌ Token is not a string: {type(token)}")
+                 return False
+
+             # HuggingFace tokens start with 'hf_'
+             if not token.startswith('hf_'):
+                 print(f"❌ Token does not start with 'hf_': {token[:10]}...")
+                 return False
+
+             # Additional validation can be added here
+             # For example, you could make a request to HuggingFace API
+             # to validate the token, but that would add latency
+
+             return True
+
+         except Exception as e:
+             print(f"❌ Error in verify_token: {e}")
+             return False
+
+     def get_token_from_request(self, request: Request) -> Optional[str]:
+         """Extract token from various sources in the request"""
+
+         # Method 1: Authorization header
+         authorization = request.headers.get("Authorization")
+         if authorization:
+             scheme, token = get_authorization_scheme_param(authorization)
+             if scheme.lower() == "bearer":
+                 return token
+
+         # Method 2: Query parameter (for WebSocket initial handshake)
+         token = request.query_params.get("token")
+         if token:
+             return token
+
+         # Method 3: Custom header (alternative)
+         token = request.headers.get("X-HF-Token")
+         if token:
+             return token
+
+         return None
+
+     async def authenticate_request(self, request: Request) -> bool:
+         """Authenticate a request using HuggingFace token"""
+         token = self.get_token_from_request(request)
+
+         if not token:
+             return False
+
+         return self.verify_token(token)
+
+
+ # Global instance
+ hf_auth = HuggingFaceTokenAuth()
+
+
+ async def require_hf_token(request: Request) -> str:
+     """
+     FastAPI dependency that requires a valid HuggingFace token.
+     In local development mode, returns a dummy token.
+     Returns the token if valid, raises HTTPException if not.
+     """
+     # Skip authentication in local development
+     if hf_auth.is_local:
+         print("🔓 Local development mode: bypassing HF token requirement")
+         return "local-development-bypass"
+
+     token = hf_auth.get_token_from_request(request)
+
+     if not token:
+         raise HTTPException(
+             status_code=status.HTTP_401_UNAUTHORIZED,
+             detail="HuggingFace token required. Please provide a valid token in Authorization header.",
+             headers={"WWW-Authenticate": "Bearer"},
+         )
+
+     if not hf_auth.verify_token(token):
+         raise HTTPException(
+             status_code=status.HTTP_401_UNAUTHORIZED,
+             detail="Invalid HuggingFace token. Token must start with 'hf_'.",
+             headers={"WWW-Authenticate": "Bearer"},
+         )
+
+     return token
+
+
+ async def optional_hf_token(request: Request) -> Optional[str]:
+     """
+     FastAPI dependency that optionally validates HuggingFace token.
+     In local development mode, returns a dummy token if no real token provided.
+     Returns the token if present and valid, None otherwise.
+     Useful for endpoints that work with or without authentication.
+     """
+     # In local development, always return a token
+     if hf_auth.is_local:
+         token = hf_auth.get_token_from_request(request)
+         if token and hf_auth.verify_token(token):
+             return token
+         else:
+             print("🔓 Local development mode: providing dummy token for optional auth")
+             return "local-development-bypass"
+
+     token = hf_auth.get_token_from_request(request)
+
+     if not token:
+         return None
+
+     if hf_auth.verify_token(token):
+         return token
+
+     return None
+
+
+ def authenticate_websocket_connect(environ: dict) -> bool:
+     """
+     Authenticate WebSocket connection using token from various sources.
+     In local development mode, always returns True.
+     This is called during the Socket.IO connect event.
+     """
+     # Skip authentication in local development
+     if hf_auth.is_local:
+         print("🔓 Local development mode: bypassing WebSocket authentication")
+         return True
+
+     try:
+         print("=== WEBSOCKET ENVIRON AUTHENTICATION ===")
+         print(f"Environ type: {type(environ)}")
+
+         if not isinstance(environ, dict):
+             print(f"❌ Environ is not a dict: {type(environ)}")
+             return False
+
+         # Method 1: Check query parameters
+         query_string = environ.get('QUERY_STRING', '')
+         print(f"Query string: {query_string}")
+         if query_string:
+             from urllib.parse import parse_qs
+             query_params = parse_qs(query_string)
+             print(f"Parsed query params: {query_params}")
+             tokens = query_params.get('token', [])
+             if tokens:
+                 token = tokens[0]
+                 print(f"Found token in query: {token[:10]}...")
+                 if hf_auth.verify_token(token):
+                     print("✓ Token validated via query params")
+                     return True
+
+         # Method 2: Check headers
+         auth_header = environ.get('HTTP_AUTHORIZATION', '')
+         print(f"Authorization header: {auth_header[:20] if auth_header else 'None'}...")
+         if auth_header:
+             if auth_header.startswith('Bearer '):
+                 token = auth_header[7:]  # Remove 'Bearer ' prefix
+                 print(f"Found token in Authorization header: {token[:10]}...")
+                 if hf_auth.verify_token(token):
+                     print("✓ Token validated via Authorization header")
+                     return True
+
+         # Method 3: Check custom header
+         hf_token_header = environ.get('HTTP_X_HF_TOKEN', '')
+         print(f"X-HF-Token header: {hf_token_header[:10] if hf_token_header else 'None'}...")
+         if hf_token_header:
+             if hf_auth.verify_token(hf_token_header):
+                 print("✓ Token validated via X-HF-Token header")
+                 return True
+
+         print("❌ No valid token found in environ")
+         print(f"Available environ keys: {list(environ.keys())}")
+         return False
+
+     except Exception as e:
+         print(f"❌ Error in authenticate_websocket_connect: {e}")
+         import traceback
+         traceback.print_exc()
+         return False
+
+
+ def authenticate_websocket_auth_data(auth_data: dict) -> bool:
+     """
+     Authenticate WebSocket connection using auth data from Socket.IO.
+     In local development mode, always returns True.
+     This is called when the client sends auth data in the connection.
+     """
+     # Skip authentication in local development
+     if hf_auth.is_local:
+         print("🔓 Local development mode: bypassing WebSocket auth data validation")
+         return True
+
+     try:
+         print("=== WEBSOCKET AUTH DATA AUTHENTICATION ===")
+         print(f"Auth data received: {auth_data}")
+         print(f"Auth data type: {type(auth_data)}")
+
+         if not auth_data:
+             print("❌ No auth data provided")
+             return False
+
+         if not isinstance(auth_data, dict):
+             print(f"❌ Auth data is not a dict: {type(auth_data)}")
+             return False
+
+         # Check for token in auth data
+         token = auth_data.get('token')
+         if token:
+             print(f"Found token in auth data: {token[:10]}...")
+             if hf_auth.verify_token(token):
+                 print("✓ Token validated via auth data")
+                 return True
+             else:
+                 print("❌ Invalid token in auth data")
+         else:
+             print("❌ No token in auth data")
+             print(f"Available keys in auth data: {list(auth_data.keys())}")
+
+         return False
+
+     except Exception as e:
+         print(f"❌ Error in authenticate_websocket_auth_data: {e}")
+         import traceback
+         traceback.print_exc()
+         return False
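For reference, a minimal sketch of how the two HTTP dependencies above would be wired into routes. The `/protected` and `/open` paths are hypothetical, not part of this commit:

```python
from typing import Optional

from fastapi import FastAPI, Depends

from app.auth import require_hf_token, optional_hf_token

app = FastAPI()

@app.get("/protected")  # hypothetical route: rejects requests without a valid hf_ token
async def protected(token: str = Depends(require_hf_token)):
    return {"token_prefix": token[:6]}

@app.get("/open")  # hypothetical route: works with or without a token
async def open_endpoint(token: Optional[str] = Depends(optional_hf_token)):
    return {"authenticated": token is not None}
```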
app/config/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """
+ Configuration package for Polyglot backend
+ """
+
+ from .cors import cors_config
+
+ __all__ = ["cors_config"]
app/config/cors.py ADDED
@@ -0,0 +1,295 @@
+ """
+ CORS Configuration Module
+
+ Centralized CORS configuration supporting multiple deployment environments.
+ """
+
+ import os
+ import re
+ from typing import List, Optional
+ from enum import Enum
+
+
+ class Environment(str, Enum):
+     """Deployment environment types"""
+     LOCAL = "local"
+     DEVELOPMENT = "development"
+     STAGING = "staging"
+     PRODUCTION = "production"
+
+
+ class CORSConfig:
+     """CORS configuration manager"""
+
+     # Default origins for local development
+     DEFAULT_LOCAL_ORIGINS = [
+         "http://localhost:3000",  # React/Next.js dev server
+         "http://localhost:3001",  # Polyglot frontend (Vite)
+         "http://localhost:3002",  # Lessons UI (Vite)
+         "http://localhost:3003",  # Podium (Vite)
+         "http://localhost:3004",  # Podium alternative port
+         "http://localhost:5173",  # Vite dev server
+         "http://localhost:7860",  # Backend self-reference
+         "http://localhost:8080",  # Alternative dev server
+         "http://127.0.0.1:3000",  # IPv4 localhost variant
+         "http://127.0.0.1:3001",  # IPv4 localhost variant
+         "http://127.0.0.1:3002",  # IPv4 localhost variant
+         "http://127.0.0.1:3003",  # IPv4 localhost variant
+         "http://127.0.0.1:3004",  # IPv4 localhost variant
+         "http://127.0.0.1:5173",  # IPv4 localhost variant
+         "http://127.0.0.1:7860",  # IPv4 localhost variant
+     ]
+
+     # Default patterns for production deployments
+     DEFAULT_PRODUCTION_PATTERNS = [
+         r"^https://.*\.tafiti\.dev$",  # Tafiti production/staging
+         r"^https://.*\.vercel\.app$",  # Vercel deployments
+         r"^https://.*\.hf\.space$",  # HuggingFace Spaces
+         r"^https://milimani\.tafiti-api\.org$",  # Production API
+     ]
+
+     # Mobile app protocols
+     MOBILE_PROTOCOLS = [
+         "capacitor://localhost",  # Capacitor apps
+         "ionic://localhost",  # Ionic apps
+         "http://localhost",  # Mobile WebView
+     ]
+
+     def __init__(self):
+         self.environment = self._get_environment()
+         self.allowed_origins = self._build_allowed_origins()
+         self.allow_all = self._should_allow_all()
+         self.origin_patterns = self._build_origin_patterns()
+
+     def _get_environment(self) -> Environment:
+         """Get current deployment environment"""
+         env_str = os.getenv("ENVIRONMENT", "local").lower()
+
+         try:
+             return Environment(env_str)
+         except ValueError:
+             print(f"⚠️ Unknown environment '{env_str}', defaulting to 'local'")
+             return Environment.LOCAL
+
+     def _should_allow_all(self) -> bool:
+         """Check if CORS should allow all origins (insecure, dev only)"""
+         allow_all = os.getenv("CORS_ALLOW_ALL", "false").lower()
+
+         if allow_all == "true":
+             if self.environment == Environment.PRODUCTION:
+                 print("❌ ERROR: CORS_ALLOW_ALL=true is not allowed in production")
+                 return False
+             else:
+                 print("⚠️ WARNING: CORS allowing all origins - INSECURE, use only for development")
+                 return True
+
+         return False
+
+     def _build_allowed_origins(self) -> List[str]:
+         """Build list of allowed origins from environment and defaults"""
+         origins = []
+
+         # Get custom origins from environment variable
+         custom_origins_str = os.getenv("CORS_ALLOWED_ORIGINS", "")
+
+         if custom_origins_str:
+             # Parse comma-separated origins
+             custom_origins = [
+                 origin.strip()
+                 for origin in custom_origins_str.split(",")
+                 if origin.strip()
+             ]
+             origins.extend(custom_origins)
+             print(f"✓ Loaded {len(custom_origins)} custom CORS origins from environment")
+
+         # Add defaults based on environment
+         if self.environment == Environment.LOCAL:
+             origins.extend(self.DEFAULT_LOCAL_ORIGINS)
+             print(f"✓ Added {len(self.DEFAULT_LOCAL_ORIGINS)} default local origins")
+
+         # Always include mobile protocols in non-production
+         if self.environment != Environment.PRODUCTION:
+             origins.extend(self.MOBILE_PROTOCOLS)
+             print(f"✓ Added {len(self.MOBILE_PROTOCOLS)} mobile protocol origins")
+
+         # Remove duplicates while preserving order
+         seen = set()
+         unique_origins = []
+         for origin in origins:
+             if origin not in seen:
+                 seen.add(origin)
+                 unique_origins.append(origin)
+
+         return unique_origins
+
+     def _build_origin_patterns(self) -> List[re.Pattern]:
+         """Build regex patterns for origin matching"""
+         patterns = []
+
+         # Get custom patterns from environment
+         custom_patterns_str = os.getenv("CORS_ALLOWED_PATTERNS", "")
+
+         if custom_patterns_str:
+             custom_pattern_strs = [
+                 p.strip()
+                 for p in custom_patterns_str.split(",")
+                 if p.strip()
+             ]
+
+             for pattern_str in custom_pattern_strs:
+                 try:
+                     patterns.append(re.compile(pattern_str))
+                 except re.error as e:
+                     print(f"⚠️ Invalid regex pattern '{pattern_str}': {e}")
+
+             print(f"✓ Loaded {len(patterns)} custom CORS patterns from environment")
+
+         # Add default production patterns if in production/staging/development
+         if self.environment in [Environment.PRODUCTION, Environment.STAGING, Environment.DEVELOPMENT]:
+             for pattern_str in self.DEFAULT_PRODUCTION_PATTERNS:
+                 patterns.append(re.compile(pattern_str))
+
+             print(f"✓ Added {len(self.DEFAULT_PRODUCTION_PATTERNS)} default production patterns")
+
+         # Add localhost pattern for development
+         if self.environment == Environment.LOCAL:
+             patterns.append(re.compile(r"^http://localhost:\d+$"))
+             patterns.append(re.compile(r"^http://127\.0\.0\.1:\d+$"))
+             print("✓ Added localhost wildcard patterns for development")
+
+         return patterns
+
+     def is_origin_allowed(self, origin: str) -> bool:
+         """
+         Check if an origin is allowed based on explicit list or patterns
+
+         Args:
+             origin: Origin to check (e.g., "https://app.tafiti.dev")
+
+         Returns:
+             True if origin is allowed, False otherwise
+         """
+         # If allow_all is enabled (dev only)
+         if self.allow_all:
+             return True
+
+         # Check explicit origins list
+         if origin in self.allowed_origins:
+             return True
+
+         # Check against patterns
+         for pattern in self.origin_patterns:
+             if pattern.match(origin):
+                 return True
+
+         return False
+
+     def get_cors_middleware_config(self) -> dict:
+         """Get configuration dict for FastAPI CORSMiddleware"""
+         if self.allow_all:
+             return {
+                 "allow_origins": ["*"],
+                 "allow_credentials": False,  # Cannot use credentials with wildcard
+                 "allow_methods": ["*"],
+                 "allow_headers": ["*"],
+             }
+
+         # Build origin regex for pattern matching
+         if self.origin_patterns:
+             # Combine all patterns into a single regex
+             combined_pattern = "|".join(f"({p.pattern})" for p in self.origin_patterns)
+
+             return {
+                 "allow_origins": self.allowed_origins,
+                 "allow_origin_regex": combined_pattern,
+                 "allow_credentials": True,
+                 "allow_methods": ["*"],
+                 "allow_headers": ["*"],
+             }
+         else:
+             return {
+                 "allow_origins": self.allowed_origins,
+                 "allow_credentials": True,
+                 "allow_methods": ["*"],
+                 "allow_headers": ["*"],
+             }
+
+     def get_socketio_cors_origins(self):
+         """
+         Get CORS origins for Socket.IO
+
+         Socket.IO doesn't support regex patterns, so we need to provide an explicit list.
+         For production, this means we need to enumerate common origins.
+         """
+         if self.allow_all:
+             return "*"
+
+         # For Socket.IO, we can only provide explicit origins
+         # In production, we may need to enumerate common subdomains
+         socketio_origins = self.allowed_origins.copy()
+
+         # Add common production subdomains if using production patterns
+         if self.environment in [Environment.PRODUCTION, Environment.STAGING]:
+             # These should be added to CORS_ALLOWED_ORIGINS for Socket.IO support
+             production_origins = [
+                 "https://app.tafiti.dev",
+                 "https://www.tafiti.dev",
+                 "https://polyglot.tafiti.dev",
+                 "https://podium.tafiti.dev",
+                 "https://milimani.tafiti-api.org",
+                 "https://polyglot-ashy-beta.vercel.app",
+                 "https://lessons-silk.vercel.app",
+                 "https://lessons.tafiti.dev",
+                 "https://podium-chi.vercel.app",
+             ]
+             for origin in production_origins:
+                 if origin not in socketio_origins:
+                     socketio_origins.append(origin)
+
+         return socketio_origins
+
+     def print_config_summary(self):
+         """Print CORS configuration summary for debugging"""
+         print("\n" + "="*70)
+         print("CORS CONFIGURATION SUMMARY")
+         print("="*70)
+         print(f"Environment: {self.environment.value}")
+         print(f"Allow All: {self.allow_all}")
+         print(f"\nExplicit Origins ({len(self.allowed_origins)}):")
+         for origin in self.allowed_origins:
+             print(f"  • {origin}")
+
+         if self.origin_patterns:
+             print(f"\nOrigin Patterns ({len(self.origin_patterns)}):")
+             for pattern in self.origin_patterns:
+                 print(f"  • {pattern.pattern}")
+
+         print("\nExample Origins That Would Be Allowed:")
+         test_origins = [
+             "http://localhost:3001",
+             "http://localhost:3002",
+             "http://localhost:3003",
+             "http://localhost:3004",
+             "http://localhost:5173",
+             "https://app.tafiti.dev",
+             "https://polyglot.tafiti.dev",
+             "https://podium.tafiti.dev",
+             "https://polyglot.vercel.app",
+             "https://lessons-silk.vercel.app",
+             "https://podium-chi.vercel.app",
+             "https://polyglot-ashy-beta.vercel.app",
+             "https://mutisya-translator.hf.space",
+             "https://milimani.tafiti-api.org",
+             "capacitor://localhost",
+             "https://example.com",
+         ]
+
+         for test_origin in test_origins:
+             allowed = "✓" if self.is_origin_allowed(test_origin) else "✗"
+             print(f"  {allowed} {test_origin}")
+
+         print("="*70 + "\n")
+
+
+ # Global CORS configuration instance
+ cors_config = CORSConfig()
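A short sketch of how the environment variables feed this config. The `example.com` values are illustrative, not deployment settings from this commit; note the variables must be set before the module is imported, since `cors_config` is instantiated at import time:

```python
import os

# Illustrative values only.
os.environ["ENVIRONMENT"] = "production"
os.environ["CORS_ALLOWED_ORIGINS"] = "https://app.example.com"
os.environ["CORS_ALLOWED_PATTERNS"] = r"^https://.*\.example\.com$"

from app.config.cors import CORSConfig

cfg = CORSConfig()
print(cfg.is_origin_allowed("https://app.example.com"))      # True (explicit origin)
print(cfg.is_origin_allowed("https://staging.example.com"))  # True (custom pattern)
print(cfg.is_origin_allowed("http://evil.test"))             # False
```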
app/main.py ADDED
@@ -0,0 +1,345 @@
+ import os
+ import asyncio
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request, Depends
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.staticfiles import StaticFiles
+ from contextlib import asynccontextmanager
+ import logging
+ import socketio
+ import engineio
+ import re
+
+ from app.routers import sessions, mobile, watch, learning
+ from app.services.session_manager import SessionManager
+ from app.services.transcription_service import TranscriptionService
+ from app.services.translation_service import TranslationService
+ from app.services.tts_service import TTSService
+ from app.services.websocket_manager import WebSocketManager
+ from app.auth import require_hf_token, optional_hf_token, authenticate_websocket_connect, authenticate_websocket_auth_data
+ from app.config.cors import cors_config
+
+ class ChunkArrayTruncateFilter(logging.Filter):
+     """Custom logging filter to truncate long arrays in Socket.IO logs for better readability"""
+
+     def filter(self, record):
+         if hasattr(record, 'msg') and isinstance(record.msg, str):
+             # More aggressive approach to truncate audioData arrays
+             # Pattern to match: "audioData":[numbers,numbers,numbers,...]
+             audiodata_pattern = r'"audioData":\[([0-9,-]+(?:,[0-9,-]+)*)\]'
+
+             def truncate_audiodata(match):
+                 array_content = match.group(1)
+                 # Split by comma and get first 10 items
+                 items = array_content.split(',')
+                 if len(items) > 10:
+                     truncated = ','.join(items[:10])
+                     return f'"audioData":[{truncated}, ...] (truncated {len(items)-10} more items)'
+                 return match.group(0)
+
+             record.msg = re.sub(audiodata_pattern, truncate_audiodata, record.msg)
+
+             # Also handle any other large numeric arrays in brackets
+             # Pattern for arrays with more than 20 numbers
+             large_numeric_array_pattern = r'(\[)([0-9,-]+(?:,[0-9,-]+){20,})(\])'
+
+             def truncate_large_numeric_array(match):
+                 prefix = match.group(1)
+                 array_content = match.group(2)
+                 suffix = match.group(3)
+
+                 # Split by comma and get first 10 items
+                 items = array_content.split(',')
+                 if len(items) > 10:
+                     truncated = ','.join(items[:10])
+                     return f'{prefix}{truncated}, ... (truncated {len(items)-10} more){suffix}'
+                 return match.group(0)
+
+             record.msg = re.sub(large_numeric_array_pattern, truncate_large_numeric_array, record.msg)
+
+             # Truncate other field types
+             for field_name in ['chunk', 'wavChunk', 'data']:
+                 field_pattern = rf'"{field_name}":\[([0-9,-]+(?:,[0-9,-]+)*)\]'
+                 def make_truncate_field(fname):
+                     def truncate_field(match):
+                         array_content = match.group(1)
+                         items = array_content.split(',')
+                         if len(items) > 10:
+                             truncated = ','.join(items[:10])
+                             return f'"{fname}":[{truncated}, ...] (truncated {len(items)-10} more)'
+                         return match.group(0)
+                     return truncate_field
+
+                 record.msg = re.sub(field_pattern, make_truncate_field(field_name), record.msg)
+
+         return True
+
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # Initialize services
+     print("=== INITIALIZING BACKEND SERVICES ===")
+     try:
+         print("Initializing transcription service...")
+         await transcription_service.initialize()
+         print("✓ Transcription service initialized")
+
+         print("Initializing translation service...")
+         await translation_service.initialize()
+         print("✓ Translation service initialized")
+
+         print("Initializing TTS service...")
+         await tts_service.initialize()
+         print("✓ TTS service initialized")
+
+         print("=== ALL SERVICES INITIALIZED SUCCESSFULLY ===")
+
+         # Start background loading of additional models after successful startup
+         print("=== STARTING BACKGROUND MODEL LOADING ===")
+         transcription_service.start_background_loading()
+         tts_service.start_background_loading()
+         print("=== BACKGROUND MODEL LOADING INITIATED ===")
+
+         # Print CORS configuration summary
+         cors_config.print_config_summary()
+
+     except Exception as e:
+         print(f"❌ SERVICE INITIALIZATION FAILED: {e}")
+         import traceback
+         traceback.print_exc()
+         raise
+
+     yield
+
+     # Cleanup
+     print("=== CLEANING UP SERVICES ===")
+     await transcription_service.cleanup()
+     await translation_service.cleanup()
+     await tts_service.cleanup()
+     print("=== CLEANUP COMPLETE ===")
+
+ app = FastAPI(
+     title="Real-time Transcription & Translation API",
+     description="Backend API for real-time speech transcription and translation",
+     version="1.0.0",
+     lifespan=lifespan
+ )
+
+ # CORS middleware with environment-based configuration
+ cors_middleware_config = cors_config.get_cors_middleware_config()
+ print(f"Configuring CORS middleware with keys: {list(cors_middleware_config.keys())}")
+
+ app.add_middleware(
+     CORSMiddleware,
+     **cors_middleware_config
+ )
+
+ # Initialize services - using PyTorch models for better compatibility
+ session_manager = SessionManager()
+ transcription_service = TranscriptionService()
+ translation_service = TranslationService()
+ tts_service = TTSService()
+ websocket_manager = WebSocketManager(
+     session_manager=session_manager,
+     transcription_service=transcription_service,
+     translation_service=translation_service,
+     tts_service=tts_service
+ )
+
+ # Include routers
+ app.include_router(sessions.router, prefix="/api")
+ app.include_router(mobile.router, prefix="/api")
+ app.include_router(watch.router, prefix="/api")
+ app.include_router(learning.router)
+
+ # Set the session manager in the router
+ sessions.session_manager = session_manager
+ sessions.translation_service = translation_service
+ sessions.tts_service = tts_service
+ sessions.transcription_service = transcription_service
+
+ # Set the mobile router
+ mobile.translation_service = translation_service
+ mobile.tts_service = tts_service
+ mobile.transcription_service = transcription_service
+
+ # Set the watch router
+ watch.translation_service = translation_service
+ watch.tts_service = tts_service
+ watch.transcription_service = transcription_service
+
+
+ # Configure logging with custom filter to truncate chunk arrays
+ chunk_filter = ChunkArrayTruncateFilter()
+
+ sio_logger = logging.getLogger('socketio')
+ sio_logger.setLevel(logging.INFO)  # Show info logs with truncated arrays
+ sio_logger.addFilter(chunk_filter)
+
+ engineio_logger = logging.getLogger('engineio')
+ engineio_logger.setLevel(logging.INFO)  # Show info logs with truncated arrays
+ engineio_logger.addFilter(chunk_filter)
+
+ # Also apply filter to the root logger to catch any other verbose logging
+ root_logger = logging.getLogger()
+ root_logger.addFilter(chunk_filter)
+
+ # Configure Engine.IO payload limits for large audio chunks
+ engineio.payload.Payload.max_decode_packets = 250
+
+ # Socket.IO setup with environment-based CORS
+ socketio_cors_origins = cors_config.get_socketio_cors_origins()
+ print(f"Configuring Socket.IO CORS: {len(socketio_cors_origins) if isinstance(socketio_cors_origins, list) else 'all'} origins")
+
+ sio = socketio.AsyncServer(
+     async_mode='asgi',
+     cors_allowed_origins=socketio_cors_origins,
+     cors_credentials=not cors_config.allow_all,  # Cannot use credentials with wildcard
+     logger=True,  # Re-enabled with custom filtering
+     engineio_logger=True,  # Re-enabled with custom filtering
+     always_connect=False  # This ensures the connect event is called for authentication
+ )
+
+ # Set the socketio instance in websocket manager
+ websocket_manager.set_socketio(sio)
+
+ socket_app = socketio.ASGIApp(sio, app)
+
+ @app.get("/health")
+ async def health_check(token: str = Depends(optional_hf_token)):
+     """Health check endpoint - optionally authenticated"""
+     from app.auth import hf_auth
+
+     auth_status = "bypassed (local development)" if hf_auth.is_local else "authenticated"
+     if not hf_auth.is_local and not token:
+         auth_status = "unauthenticated"
+
+     return {
+         "status": "healthy",
+         "message": "Translation service is running",
+         "auth_status": auth_status,
+         "local_development": hf_auth.is_local,
+         "auth_bypassed": hf_auth.is_local,
+         "token_prefix": token[:10] + "..." if token and token != "local-development-bypass" else "local-bypass" if hf_auth.is_local else None,
+         "environment": {
+             "ENVIRONMENT": os.getenv('ENVIRONMENT', 'not set'),
+             "DEBUG": os.getenv('DEBUG', 'not set'),
+             "DISABLE_AUTH": os.getenv('DISABLE_AUTH', 'not set'),
+             "HOST": os.getenv('HOST', 'not set'),
+             "PORT": os.getenv('PORT', 'not set')
+         },
+         "services": {
+             "transcription": transcription_service is not None,
+             "translation": translation_service is not None,
+             "tts": tts_service is not None,
+             "sessions": session_manager is not None
+         }
+     }
+
+ @sio.event
+ async def connect(sid, environ=None, auth=None):
+     """Handle Socket.IO connection with authentication"""
+     try:
+         print("=== WEBSOCKET CONNECTION ATTEMPT ===")
+         print(f"SID: {sid}")
+         print(f"Auth data: {auth}")
+         print(f"Environ type: {type(environ)}")
+         print(f"Environ data: {environ}")
+
+         # Ensure environ is a dict
+         if environ is None:
+             environ = {}
+
+         print(f"Query string: {environ.get('QUERY_STRING', 'None')}")
+         print(f"Headers: {[k for k in environ.keys() if k.startswith('HTTP_')] if isinstance(environ, dict) else 'environ not dict'}")
+
+         # Check authentication from multiple sources
+         authenticated = False
+         auth_method = None
+
+         # Method 1: Check auth data from client
+         if auth and authenticate_websocket_auth_data(auth):
+             authenticated = True
+             auth_method = "auth_data"
+             print("✓ Authenticated via auth data")
+
+         # Method 2: Check environment (headers, query params)
+         elif environ and isinstance(environ, dict) and authenticate_websocket_connect(environ):
+             authenticated = True
+             auth_method = "environ"
+             print("✓ Authenticated via headers/query")
+
+         # TEMPORARY: Allow connections for debugging (remove in production)
+         # This helps identify if the issue is authentication or something else
+         if not authenticated:
+             print("⚠️ Authentication failed, but allowing for debugging")
+             if isinstance(environ, dict):
+                 print(f"Available environ keys: {list(environ.keys())}")
+             # The next two lines temporarily allow unauthenticated connections for
+             # debugging; remove them to enforce authentication
+             authenticated = True
+             auth_method = "debug_bypass"
+
+         if not authenticated:
+             print("❌ Authentication failed - disconnecting")
+             await sio.disconnect(sid)
+             return False
+
+         print(f"✓ WebSocket connection authenticated successfully via {auth_method}")
+         return True
+
+     except Exception as e:
+         print(f"❌ Error in connect handler: {e}")
+         import traceback
+         traceback.print_exc()
+         try:
+             await sio.disconnect(sid)
+         except Exception:
+             pass
+         return False
+
+ @sio.event
+ async def disconnect(sid):
+     await websocket_manager.handle_disconnect(sid)
+
+ @sio.event
+ async def join_session(sid, data):
+     await websocket_manager.handle_join_session(sid, data)
+
+ @sio.event
+ async def join_hub(sid, data):
+     await websocket_manager.handle_join_hub(sid, data)
+
+ @sio.event
+ async def leave_session(sid, data):
+     await websocket_manager.handle_leave_session(sid, data)
+
+ @sio.event
+ async def audio_chunk(sid, data):
+     await websocket_manager.handle_audio_chunk(sid, data)
+
+ @sio.event
+ async def speaking_status(sid, data):
+     await websocket_manager.handle_speaking_status(sid, data)
+
+ @sio.event
+ async def test_echo(sid, data):
+     """Test event to verify WebSocket communication"""
+     await sio.emit('test_echo_response', data, room=sid)
+
+ @sio.event
+ async def update_participant_language(sid, data):
+     """Update participant's language (affects speech recognition)"""
+     await websocket_manager.handle_update_participant_language(sid, data)
+
+ @sio.event
+ async def update_session_languages(sid, data):
+     """Update session's languages (affects translation targets)"""
+     await websocket_manager.handle_update_session_languages(sid, data)
+
+ # Serve static files (for frontend)
+ if os.path.exists("../frontend/dist"):
+     app.mount("/", StaticFiles(directory="../frontend/dist", html=True), name="static")
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run("main:socket_app", host="0.0.0.0", port=7860, reload=True)
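The truncation filter is the least obvious piece above, so here is a standalone check of its first regex. The pattern is copied rather than imported, since importing `app.main` constructs all services at import time:

```python
import re

# Mirrors the "audioData" pattern used by ChunkArrayTruncateFilter above.
audiodata_pattern = r'"audioData":\[([0-9,-]+(?:,[0-9,-]+)*)\]'

def truncate_audiodata(match: "re.Match") -> str:
    items = match.group(1).split(',')
    if len(items) > 10:
        head = ','.join(items[:10])
        return f'"audioData":[{head}, ...] (truncated {len(items) - 10} more items)'
    return match.group(0)

msg = '{"audioData":[' + ','.join(str(i) for i in range(30)) + ']}'
print(re.sub(audiodata_pattern, truncate_audiodata, msg))
# {"audioData":[0,1,2,3,4,5,6,7,8,9, ...] (truncated 20 more items)}
```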
app/models/__init__.py ADDED
@@ -0,0 +1,77 @@
+ from pydantic import BaseModel, Field
+ from typing import List, Dict, Optional
+ from enum import Enum
+
+ class LanguageCode(str, Enum):
+     ENGLISH = "eng"
+     SWAHILI = "swa"
+     KIKUYU = "kik"
+     KAMBA = "kam"
+     KIMERU = "mer"
+     LUO = "luo"
+     SOMALI = "som"
+
+ class Language(BaseModel):
+     code: LanguageCode
+     name: str
+     display_name: str
+
+ class ParticipantCreate(BaseModel):
+     name: str
+     language: LanguageCode
+
+ class Participant(BaseModel):
+     id: str
+     name: str
+     language: Language
+     is_organizer: bool = False
+     is_speaking: bool = False
+     is_connected: bool = False
+
+ class SessionCreate(BaseModel):
+     name: str
+     organizer_name: str
+     languages: List[LanguageCode]
+     enable_tts: bool = True  # Enable TTS by default for backward compatibility
+
+ class Session(BaseModel):
+     id: str
+     name: str
+     organizer_name: str
+     participants: List[Participant] = []
+     languages: List[Language] = []
+     qr_code_url: Optional[str] = None
+     is_active: bool = True
+     enable_tts: bool = True  # TTS enabled by default
+
+ class Message(BaseModel):
+     id: str
+     session_id: str
+     speaker_id: str
+     speaker_name: str
+     original_text: str
+     original_language: Language
+     translations: Dict[str, str] = {}
+     is_transcribing: bool = False
+
+ class TranscriptionUpdate(BaseModel):
+     message_id: str
+     text: str
+     is_complete: bool
+     confidence: Optional[float] = None
+
+ class TranslationUpdate(BaseModel):
+     message_id: str
+     target_language: LanguageCode
+     translated_text: str
+
+ class AudioChunk(BaseModel):
+     session_id: str
+     participant_id: str
+     audio_data: bytes
+
+ class WebSocketMessage(BaseModel):
+     type: str
+     data: Dict
+     session_id: str
+     participant_id: Optional[str] = None
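A quick usage sketch of the schema above. The names and IDs are made up; `model_dump()` assumes Pydantic v2 (on v1, use `.dict()` instead):

```python
from app.models import Language, LanguageCode, Participant, SessionCreate

swahili = Language(code=LanguageCode.SWAHILI, name="Swahili", display_name="Kiswahili")
speaker = Participant(id="p-1", name="Amina", language=swahili, is_organizer=True)

new_session = SessionCreate(
    name="Community meeting",
    organizer_name="Amina",
    languages=[LanguageCode.ENGLISH, LanguageCode.SWAHILI],
)
print(speaker.model_dump())
print(new_session.model_dump())
```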
app/routers/__init__.py ADDED
@@ -0,0 +1 @@
+ # Routers package
app/routers/add_phase_endpoints.py ADDED
@@ -0,0 +1,490 @@
1
+ # Script to add remaining Phase 1-3 endpoints to learning.py
2
+
3
+ endpoints_code = """
4
+
5
+ @router.post("/vocabulary/add")
6
+ async def add_vocabulary_to_practice(
7
+ vocab_request: VocabularyAddRequest,
8
+ request: Request,
9
+ token: Optional[str] = Depends(optional_hf_token)
10
+ ):
11
+ \"\"\"Add a vocabulary word to user's practice queue with FSRS initialization\"\"\"
12
+ try:
13
+ user_id = token if token else 'anonymous'
14
+
15
+ vocab = learning_service.get_vocabulary(vocab_request.vocab_id)
16
+ if not vocab:
17
+ raise HTTPException(status_code=404, detail="Vocabulary not found")
18
+
19
+ fsrs_data = {
20
+ 'difficulty': 0.3,
21
+ 'stability': 2.5,
22
+ 'retrievability': 1.0,
23
+ 'review_count': 0,
24
+ 'last_review': None,
25
+ 'next_review': datetime.utcnow().isoformat() + 'Z',
26
+ 'lapses': 0,
27
+ 'state': 'new'
28
+ }
29
+
30
+ user_vocab = {
31
+ 'vocabulary_id': vocab_request.vocab_id,
32
+ 'swahili': vocab.get('swahili', ''),
33
+ 'english': vocab.get('english', ''),
34
+ 'part_of_speech': vocab.get('part_of_speech', 'unknown'),
35
+ 'added_at': datetime.utcnow().isoformat() + 'Z',
36
+ 'added_from': vocab_request.source_lesson_id,
37
+ 'fsrs': fsrs_data,
38
+ 'mastery_level': 0,
39
+ 'times_reviewed': 0,
40
+ 'times_correct': 0,
41
+ 'accuracy': 0.0
42
+ }
43
+
44
+ success = learning_service.update_vocabulary_progress(
45
+ user_id, str(vocab_request.vocab_id), user_vocab
46
+ )
47
+
48
+ if success:
49
+ return {"success": True, "vocabulary": user_vocab}
50
+ else:
51
+ raise HTTPException(status_code=500, detail="Failed to add vocabulary")
52
+ except HTTPException:
53
+ raise
54
+ except Exception as e:
55
+ logger.error(f"Error adding vocabulary: {e}")
56
+ raise HTTPException(status_code=500, detail="Failed to add vocabulary")
57
+
58
+
59
+ def calculate_next_review_fsrs(fsrs: Dict, grade: int) -> Dict:
60
+ \"\"\"Implement FSRS algorithm\"\"\"
61
+ from datetime import timedelta
62
+
63
+ difficulty = fsrs['difficulty']
64
+ stability = fsrs['stability']
65
+
66
+ if grade == 0:
67
+ new_difficulty = min(difficulty + 0.2, 1.0)
68
+ elif grade == 2:
69
+ new_difficulty = min(difficulty + 0.1, 1.0)
70
+ elif grade == 4:
71
+ new_difficulty = max(difficulty - 0.1, 0.0)
72
+ else:
73
+ new_difficulty = difficulty
74
+
75
+ if grade == 0:
76
+ new_stability = stability * 0.5
77
+ state = 'relearning'
78
+ interval_minutes = 10
79
+ elif grade == 2:
80
+ new_stability = stability * 1.2
81
+ state = 'review'
82
+ interval_minutes = int(new_stability * 24 * 60)
83
+ elif grade == 3:
84
+ new_stability = stability * 2.5
85
+ state = 'review'
86
+ interval_minutes = int(new_stability * 24 * 60)
87
+ else:
88
+ new_stability = stability * 4.0
89
+ state = 'review'
90
+ interval_minutes = int(new_stability * 24 * 60)
91
+
92
+ next_review = datetime.utcnow() + timedelta(minutes=interval_minutes)
93
+
94
+ return {
95
+ 'difficulty': new_difficulty,
96
+ 'stability': new_stability,
97
+ 'retrievability': 0.9 if grade >= 2 else 0.0,
98
+ 'review_count': fsrs['review_count'] + 1,
99
+ 'last_review': datetime.utcnow().isoformat() + 'Z',
100
+ 'next_review': next_review.isoformat() + 'Z',
101
+ 'lapses': fsrs['lapses'],
102
+ 'state': state,
103
+ 'interval_days': interval_minutes / (24 * 60)
104
+ }
105
+
106
+
107
+ def calculate_mastery_level(vocab: Dict) -> int:
108
+ \"\"\"Calculate mastery level (0-5)\"\"\"
109
+ accuracy = vocab['accuracy']
110
+ reviews = vocab['times_reviewed']
111
+ stability = vocab['fsrs']['stability']
112
+
113
+ if reviews == 0:
114
+ return 0
115
+ elif reviews >= 40 and accuracy >= 98 and stability >= 90:
116
+ return 5
117
+ elif reviews >= 20 and accuracy >= 95 and stability >= 30:
118
+ return 4
119
+ elif reviews < 5 or accuracy < 70:
120
+ return 1
121
+ elif reviews < 10 or accuracy < 85:
122
+ return 2
123
+ else:
124
+ return 3
127
+
128
+
129
+ @router.post("/vocabulary/review")
130
+ async def record_vocabulary_review_fsrs(
131
+ review_request: VocabularyReviewRequest,
132
+ request: Request,
133
+ token: Optional[str] = Depends(optional_hf_token)
134
+ ):
135
+ \"\"\"Record vocabulary review and update FSRS parameters\"\"\"
136
+ try:
137
+ user_id = token if token else 'anonymous'
138
+ progress = learning_service.get_user_progress(user_id)
139
+
140
+ if not progress or str(review_request.vocab_id) not in progress.get('vocabulary_progress', {}):
141
+ raise HTTPException(status_code=404, detail="Vocabulary not in practice queue")
142
+
143
+ vocab = progress['vocabulary_progress'][str(review_request.vocab_id)]
144
+ fsrs = vocab['fsrs']
145
+
146
+ grade_map = {'again': 0, 'hard': 2, 'good': 3, 'easy': 4}
147
+ grade = grade_map.get(review_request.rating, 3)
148
+
149
+ new_fsrs = calculate_next_review_fsrs(fsrs, grade)
150
+
151
+ vocab['fsrs'] = new_fsrs
152
+ vocab['times_reviewed'] += 1
153
+ if grade >= 2:
154
+ vocab['times_correct'] += 1
155
+ else:
156
+ vocab['fsrs']['lapses'] += 1
157
+
158
+ vocab['accuracy'] = (vocab['times_correct'] / vocab['times_reviewed']) * 100 if vocab['times_reviewed'] > 0 else 0
159
+ vocab['mastery_level'] = calculate_mastery_level(vocab)
160
+ vocab['last_reviewed_at'] = datetime.utcnow().isoformat() + 'Z'
161
+
162
+ if 'vocabulary_reviewed' not in progress['overall_stats']:
163
+ progress['overall_stats']['vocabulary_reviewed'] = 0
164
+ progress['overall_stats']['vocabulary_reviewed'] += 1
165
+
166
+ learning_service.save_user_progress(user_id, progress)
167
+
168
+ return {
169
+ "success": True,
170
+ "vocabulary": vocab,
171
+ "next_review": new_fsrs['next_review'],
172
+ "interval_days": new_fsrs['interval_days']
173
+ }
174
+ except HTTPException:
175
+ raise
176
+ except Exception as e:
177
+ logger.error(f"Error recording vocabulary review: {e}")
178
+ raise HTTPException(status_code=500, detail="Failed to record review")
179
+
180
+
181
+ @router.get("/vocabulary/stats")
182
+ async def get_vocabulary_stats(
183
+ request: Request,
184
+ token: Optional[str] = Depends(optional_hf_token)
185
+ ):
186
+ \"\"\"Get vocabulary mastery statistics\"\"\"
187
+ try:
188
+ user_id = token if token else 'anonymous'
189
+ progress = learning_service.get_user_progress(user_id)
190
+
191
+ if not progress:
192
+ return {
193
+ "total_words": 0,
194
+ "in_practice": 0,
195
+ "mastery_breakdown": {str(i): 0 for i in range(6)},
196
+ "average_accuracy": 0
197
+ }
198
+
199
+ vocab_progress = progress.get('vocabulary_progress', {})
200
+ mastery_breakdown = {str(i): 0 for i in range(6)}
201
+ total_accuracy = 0
202
+ total_with_reviews = 0
203
+
204
+ for vocab_data in vocab_progress.values():
205
+ level = vocab_data.get('mastery_level', 0)
206
+ mastery_breakdown[str(level)] += 1
207
+
208
+ if vocab_data.get('times_reviewed', 0) > 0:
209
+ total_accuracy += vocab_data.get('accuracy', 0)
210
+ total_with_reviews += 1
211
+
212
+ avg_accuracy = total_accuracy / total_with_reviews if total_with_reviews > 0 else 0
213
+
214
+ return {
215
+ "total_words": len(vocab_progress),
216
+ "in_practice": len(vocab_progress),
217
+ "mastery_breakdown": mastery_breakdown,
218
+ "average_accuracy": round(avg_accuracy, 1),
219
+ "total_reviews": sum(v.get('times_reviewed', 0) for v in vocab_progress.values())
220
+ }
221
+ except Exception as e:
222
+ logger.error(f"Error getting vocabulary stats: {e}")
223
+ raise HTTPException(status_code=500, detail="Failed to get stats")
224
+
225
+
226
+ @router.get("/vocabulary/library")
227
+ async def get_vocabulary_library(
228
+ lesson_id: Optional[int] = None,
229
+ level: Optional[str] = None,
230
+ search: Optional[str] = None,
231
+ request: Request = None,
232
+ token: Optional[str] = Depends(optional_hf_token)
233
+ ):
234
+ \"\"\"Browse all vocabulary with filters\"\"\"
235
+ try:
236
+ user_id = token if token else 'anonymous'
237
+
238
+ all_vocab = learning_service.get_all_vocabulary()
239
+ progress = learning_service.get_user_progress(user_id)
240
+ user_vocab = progress.get('vocabulary_progress', {}) if progress else {}
241
+
242
+ filtered_vocab = all_vocab
243
+
244
+ if lesson_id:
245
+ filtered_vocab = [v for v in filtered_vocab if v.get('lesson_id') == lesson_id]
246
+
247
+ if level:
248
+ filtered_vocab = [v for v in filtered_vocab if v.get('level') == level]
249
+
250
+ if search:
251
+ search_lower = search.lower()
252
+ filtered_vocab = [v for v in filtered_vocab
253
+ if search_lower in v.get('swahili', '').lower()
254
+ or search_lower in v.get('english', '').lower()]
255
+
256
+ for vocab in filtered_vocab:
257
+ vocab_id = str(vocab.get('vocabulary_id') or vocab.get('id'))
258
+ if vocab_id in user_vocab:
259
+ vocab['status'] = 'practicing'
260
+ vocab['mastery_level'] = user_vocab[vocab_id].get('mastery_level', 0)
261
+ vocab['accuracy'] = user_vocab[vocab_id].get('accuracy', 0)
262
+ vocab['next_review'] = user_vocab[vocab_id].get('fsrs', {}).get('next_review')
263
+ else:
264
+ vocab['status'] = 'not_practicing'
265
+ vocab['mastery_level'] = 0
266
+
267
+ return {
268
+ "vocabulary": filtered_vocab,
269
+ "total": len(filtered_vocab),
270
+ "filters_applied": {
271
+ "lesson_id": lesson_id,
272
+ "level": level,
273
+ "search": search
274
+ }
275
+ }
276
+ except Exception as e:
277
+ logger.error(f"Error getting vocabulary library: {e}")
278
+ raise HTTPException(status_code=500, detail="Failed to get vocabulary")
279
+
280
+
281
+ # Reading Comprehension
282
+
283
+ class ComprehensionAnswer(BaseModel):
284
+ question_id: str
285
+ answer: str
286
+
287
+
288
+ class ComprehensionSubmission(BaseModel):
289
+ lesson_id: int
290
+ passage_id: str
291
+ answers: List[ComprehensionAnswer]
292
+
293
+
294
+ @router.post("/comprehension/submit")
295
+ async def submit_comprehension_answers(
296
+ submission: ComprehensionSubmission,
297
+ request: Request,
298
+ token: Optional[str] = Depends(optional_hf_token)
299
+ ):
300
+ \"\"\"Submit reading comprehension answers and get scoring\"\"\"
301
+ try:
302
+ user_id = token if token else 'anonymous'
303
+
304
+ lesson = learning_service.get_lesson(submission.lesson_id)
305
+ if not lesson:
306
+ raise HTTPException(status_code=404, detail="Lesson not found")
307
+
308
+ passage = None
309
+ for p in lesson.get('reading_passages', []):
310
+ if p['passage_id'] == submission.passage_id:
311
+ passage = p
312
+ break
313
+
314
+ if not passage:
315
+ raise HTTPException(status_code=404, detail="Passage not found")
316
+
317
+ results = []
318
+ correct_count = 0
319
+
320
+ for submitted in submission.answers:
321
+ question_id = submitted.question_id
322
+ user_answer = submitted.answer.strip().lower()
323
+
324
+ question = None
325
+ for q in passage['comprehension_questions']:
326
+ if q['question_id'] == question_id:
327
+ question = q
328
+ break
329
+
330
+ if not question:
331
+ continue
332
+
333
+ correct_answers = [ans.strip().lower() for ans in question.get('correct_answers', [])]
334
+ is_correct = user_answer in correct_answers
335
+
336
+ if is_correct:
337
+ correct_count += 1
338
+
339
+ results.append({
340
+ "question_id": question_id,
341
+ "correct": is_correct,
342
+ "user_answer": user_answer,
343
+ "correct_answer": question['correct_answers'][0] if correct_answers else None,
344
+ "explanation": question.get('explanation')
345
+ })
346
+
347
+ score = (correct_count / len(submission.answers)) * 100 if submission.answers else 0
348
+
349
+ progress = learning_service.get_user_progress(user_id)
350
+ if not progress:
351
+ progress = learning_service.create_default_progress(user_id)
352
+
353
+ if 'comprehension_scores' not in progress:
354
+ progress['comprehension_scores'] = {}
355
+
356
+ progress['comprehension_scores'][f"{submission.lesson_id}_{submission.passage_id}"] = {
357
+ "score": score,
358
+ "completed_at": datetime.utcnow().isoformat() + 'Z',
359
+ "attempts": progress['comprehension_scores'].get(f"{submission.lesson_id}_{submission.passage_id}", {}).get('attempts', 0) + 1
360
+ }
361
+
362
+ learning_service.save_user_progress(user_id, progress)
363
+
364
+ return {
365
+ "results": results,
366
+ "score": round(score, 1),
367
+ "correct": correct_count,
368
+ "total": len(submission.answers)
369
+ }
370
+ except HTTPException:
371
+ raise
372
+ except Exception as e:
373
+ logger.error(f"Error submitting comprehension: {e}")
374
+ raise HTTPException(status_code=500, detail="Failed to submit comprehension")
375
+
376
+
377
+ # Task Scenarios
378
+
379
+ class ScenarioProgressUpdate(BaseModel):
380
+ turn_id: str
381
+ choice_id: str
382
+
383
+
384
+ @router.get("/scenarios/{scenario_id}")
385
+ async def get_scenario(
386
+ scenario_id: str,
387
+ request: Request,
388
+ token: Optional[str] = Depends(optional_hf_token)
389
+ ):
390
+ \"\"\"Get task scenario with branching dialogue\"\"\"
391
+ try:
392
+ user_id = token if token else 'anonymous'
393
+
394
+ scenario = learning_service.get_scenario(scenario_id)
395
+ if not scenario:
396
+ raise HTTPException(status_code=404, detail="Scenario not found")
397
+
398
+ progress = learning_service.get_user_progress(user_id)
399
+ scenario_progress = None
400
+
401
+ if progress and 'scenario_progress' in progress:
402
+ scenario_progress = progress['scenario_progress'].get(scenario_id)
403
+
404
+ return {
405
+ "scenario": scenario,
406
+ "user_progress": scenario_progress
407
+ }
408
+ except HTTPException:
409
+ raise
410
+ except Exception as e:
411
+ logger.error(f"Error getting scenario: {e}")
412
+ raise HTTPException(status_code=500, detail="Failed to get scenario")
413
+
414
+
415
+ @router.post("/scenarios/{scenario_id}/progress")
416
+ async def update_scenario_progress(
417
+ scenario_id: str,
418
+ progress_update: ScenarioProgressUpdate,
419
+ request: Request,
420
+ token: Optional[str] = Depends(optional_hf_token)
421
+ ):
422
+ \"\"\"Update scenario progress with user choice\"\"\"
423
+ try:
424
+ user_id = token if token else 'anonymous'
425
+
426
+ scenario = learning_service.get_scenario(scenario_id)
427
+ if not scenario:
428
+ raise HTTPException(status_code=404, detail="Scenario not found")
429
+
430
+ progress = learning_service.get_user_progress(user_id)
431
+ if not progress:
432
+ progress = learning_service.create_default_progress(user_id)
433
+
434
+ if 'scenario_progress' not in progress:
435
+ progress['scenario_progress'] = {}
436
+
437
+ if scenario_id not in progress['scenario_progress']:
438
+ progress['scenario_progress'][scenario_id] = {
439
+ "started_at": datetime.utcnow().isoformat() + 'Z',
440
+ "turns": [],
441
+ "completed": False
442
+ }
443
+
444
+ progress['scenario_progress'][scenario_id]['turns'].append({
445
+ "turn_id": progress_update.turn_id,
446
+ "choice_id": progress_update.choice_id,
447
+ "timestamp": datetime.utcnow().isoformat() + 'Z'
448
+ })
449
+
450
+ turns_count = len(progress['scenario_progress'][scenario_id]['turns'])
451
+ if turns_count >= scenario.get('required_turns', 6):
452
+ progress['scenario_progress'][scenario_id]['completed'] = True
453
+ progress['scenario_progress'][scenario_id]['completed_at'] = datetime.utcnow().isoformat() + 'Z'
454
+
455
+ learning_service.save_user_progress(user_id, progress)
456
+
457
+ return {
458
+ "success": True,
459
+ "progress": progress['scenario_progress'][scenario_id]
460
+ }
461
+ except HTTPException:
462
+ raise
463
+ except Exception as e:
464
+ logger.error(f"Error updating scenario progress: {e}")
465
+ raise HTTPException(status_code=500, detail="Failed to update scenario")
466
+
467
+
468
+ @router.get("/scenarios")
469
+ async def list_scenarios(
470
+ request: Request,
471
+ token: Optional[str] = Depends(optional_hf_token)
472
+ ):
473
+ \"\"\"Get list of all available scenarios\"\"\"
474
+ try:
475
+ scenarios = learning_service.get_all_scenarios()
476
+ return {
477
+ "success": True,
478
+ "scenarios": scenarios,
479
+ "total": len(scenarios)
480
+ }
481
+ except Exception as e:
482
+ logger.error(f"Error listing scenarios: {e}")
483
+ raise HTTPException(status_code=500, detail="Failed to list scenarios")
484
+ """
485
+
486
+ # Append to learning.py
487
+ with open('C:/repos/polyglot/backend/app/routers/learning.py', 'a', encoding='utf-8') as f:
488
+ f.write(endpoints_code)
489
+
490
+ print("Successfully added all remaining Phase 1-3 endpoints!")
app/routers/learning.py ADDED
@@ -0,0 +1,1020 @@
1
+ """
2
+ Learning API Router - REST endpoints for language learning functionality
3
+
4
+ Provides endpoints for:
5
+ - Fetching lesson catalog and individual lessons
6
+ - Managing user progress
7
+ - Recording lesson completion and scores
8
+ - Achievement tracking
9
+ """
10
+
11
+ from fastapi import APIRouter, HTTPException, Depends, Request, File, UploadFile
12
+ from fastapi.responses import Response
13
+ from pydantic import BaseModel
14
+ from typing import List, Dict, Optional, Any
15
+ from datetime import datetime
16
+ import logging
17
+ import io
18
+
19
+ from app.services.learning_data_service import LearningDataService
20
+ from app.auth import optional_hf_token
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ router = APIRouter(prefix="/api/learning", tags=["learning"])
25
+
26
+ # Initialize data service
27
+ learning_service = LearningDataService()
28
+
29
+
30
+ # ==================== Request/Response Models ====================
31
+
32
+ class LessonProgressUpdate(BaseModel):
33
+ lesson_id: int
34
+ status: str # 'in_progress' or 'completed'
35
+ score: Optional[int] = None
36
+ pronunciation_score: Optional[float] = None
37
+ listening_score: Optional[float] = None
38
+ comprehension_score: Optional[float] = None
39
+ time_spent_seconds: Optional[int] = None
40
+ steps_completed: Optional[int] = None
41
+ steps_skipped: Optional[int] = None
42
+
43
+
44
+ class VocabularyReview(BaseModel):
45
+ vocabulary_id: int
46
+ swahili: str
47
+ is_correct: bool
48
+ mastery_level: Optional[int] = None
49
+
50
+
51
+ class AchievementCheck(BaseModel):
52
+ achievement_id: str
53
+ progress: int
54
+ target: int
55
+
56
+
57
+ # ==================== Lesson Endpoints ====================
58
+
59
+ @router.get("/lessons")
60
+ async def get_lessons(language: Optional[str] = 'swahili', request: Request = None, token: Optional[str] = Depends(optional_hf_token)):
61
+ """
62
+ Get catalog of all available lessons for a specific language
63
+
64
+ Args:
65
+ language: Language code (swahili, kamba, maasai)
66
+
67
+ Returns the lesson index with metadata for all lessons
68
+ """
69
+ try:
70
+ index = learning_service.get_lessons_index(language)
71
+ if not index:
72
+ raise HTTPException(status_code=404, detail=f"Lessons catalog not found for {language}")
73
+
74
+ return {
75
+ "success": True,
76
+ "lessons": index.get('lessons', []),
77
+ "learning_paths": index.get('learning_paths', {}),
78
+ "metadata": index.get('metadata', {})
79
+ }
80
+ except HTTPException:
81
+ raise
82
+ except Exception as e:
83
+ logger.error(f"Error fetching lessons for {language}: {e}")
84
+ raise HTTPException(status_code=500, detail="Failed to fetch lessons")
85
+
86
+
87
+ @router.get("/lessons/{lesson_id}")
88
+ async def get_lesson(lesson_id: int, language: Optional[str] = 'swahili', request: Request = None, token: Optional[str] = Depends(optional_hf_token)):
89
+ """
90
+ Get detailed lesson content including vocabulary, dialogue, and exercises
91
+
92
+ Args:
93
+ lesson_id: ID of the lesson to fetch
94
+ language: Language code (swahili, kamba, maasai)
95
+ """
96
+ try:
97
+ lesson = learning_service.get_lesson(lesson_id, language)
98
+ if not lesson:
99
+ raise HTTPException(status_code=404, detail=f"Lesson {lesson_id} not found for {language}")
100
+
101
+ return {
102
+ "success": True,
103
+ "lesson": lesson
104
+ }
105
+ except HTTPException:
106
+ raise
107
+ except Exception as e:
108
+ logger.error(f"Error fetching lesson {lesson_id} for {language}: {e}")
109
+ raise HTTPException(status_code=500, detail="Failed to fetch lesson")
110
+
111
+
112
+ # ==================== User Progress Endpoints ====================
113
+
114
+ @router.get("/progress")
115
+ async def get_user_progress(request: Request, token: Optional[str] = Depends(optional_hf_token)):
116
+ """
117
+ Get user's learning progress
118
+
119
+ Returns overall stats, lesson progress, vocabulary progress, and achievements
120
+ """
121
+ try:
122
+ # Use authenticated user ID or default for anonymous users
123
+ user_id = token if token else 'anonymous'
124
+
125
+ progress = learning_service.get_user_progress(user_id)
126
+ if not progress:
127
+ raise HTTPException(status_code=500, detail="Failed to load user progress")
128
+
129
+ return {
130
+ "success": True,
131
+ "progress": progress
132
+ }
133
+ except HTTPException:
134
+ raise
135
+ except Exception as e:
136
+ logger.error(f"Error fetching user progress: {e}")
137
+ raise HTTPException(status_code=500, detail="Failed to fetch user progress")
138
+
139
+
140
+ @router.post("/progress/lesson")
141
+ async def update_lesson_progress(
142
+ progress_update: LessonProgressUpdate,
143
+ request: Request,
144
+ token: Optional[str] = Depends(optional_hf_token)
145
+ ):
146
+ """
147
+ Update progress for a specific lesson
148
+
149
+ Records completion status, scores, and time spent on a lesson
150
+ """
151
+ try:
152
+ user_id = token if token else 'anonymous'
153
+
154
+ # Build progress update dict
155
+ update_data = {
156
+ 'lesson_id': progress_update.lesson_id,
157
+ 'status': progress_update.status
158
+ }
159
+
160
+ # Add optional fields if provided
161
+ if progress_update.score is not None:
162
+ update_data['latest_score'] = progress_update.score
163
+
164
+ # Track best score
165
+ user_progress = learning_service.get_user_progress(user_id)
166
+ if user_progress:
167
+ lesson_key = str(progress_update.lesson_id)
168
+ current_best = user_progress.get('lesson_progress', {}).get(lesson_key, {}).get('best_score', 0)
169
+ update_data['best_score'] = max(current_best, progress_update.score)
170
+
171
+ if progress_update.pronunciation_score is not None:
172
+ update_data['pronunciation_score'] = progress_update.pronunciation_score
173
+
174
+ if progress_update.listening_score is not None:
175
+ update_data['listening_score'] = progress_update.listening_score
176
+
177
+ if progress_update.comprehension_score is not None:
178
+ update_data['comprehension_score'] = progress_update.comprehension_score
179
+
180
+ if progress_update.time_spent_seconds is not None:
181
+ update_data['time_spent_seconds'] = progress_update.time_spent_seconds
182
+
183
+ if progress_update.steps_completed is not None:
184
+ update_data['steps_completed'] = progress_update.steps_completed
185
+
186
+ if progress_update.steps_skipped is not None:
187
+ update_data['steps_skipped'] = progress_update.steps_skipped
188
+
189
+ # Add completion timestamp if status is completed
190
+ if progress_update.status == 'completed':
191
+ update_data['completed_at'] = datetime.utcnow().isoformat() + 'Z'
192
+
193
+ # Increment attempts
194
+ user_progress = learning_service.get_user_progress(user_id)
195
+ if user_progress:
196
+ lesson_key = str(progress_update.lesson_id)
197
+ current_attempts = user_progress.get('lesson_progress', {}).get(lesson_key, {}).get('attempts', 0)
198
+ update_data['attempts'] = current_attempts + 1
199
+
200
+ # Save to file
201
+ success = learning_service.update_lesson_progress(
202
+ user_id,
203
+ progress_update.lesson_id,
204
+ update_data
205
+ )
206
+
207
+ if not success:
208
+ raise HTTPException(status_code=500, detail="Failed to save progress")
209
+
210
+ return {
211
+ "success": True,
212
+ "message": "Lesson progress updated"
213
+ }
214
+ except HTTPException:
215
+ raise
216
+ except Exception as e:
217
+ logger.error(f"Error updating lesson progress: {e}")
218
+ raise HTTPException(status_code=500, detail="Failed to update lesson progress")
219
+
220
+
221
+ @router.post("/progress/vocabulary")
222
+ async def record_vocabulary_review(
223
+ review: VocabularyReview,
224
+ request: Request,
225
+ token: Optional[str] = Depends(optional_hf_token)
226
+ ):
227
+ """
228
+ Record a vocabulary review/practice session
229
+
230
+ Updates mastery level and review statistics for a vocabulary word
231
+ """
232
+ try:
233
+ user_id = token if token else 'anonymous'
234
+
235
+ # Get current vocabulary progress
236
+ user_progress = learning_service.get_user_progress(user_id)
237
+ if not user_progress:
238
+ raise HTTPException(status_code=500, detail="Failed to load user progress")
239
+
240
+ vocab_key = str(review.vocabulary_id)
241
+ vocab_progress = user_progress.get('vocabulary_progress', {}).get(vocab_key, {
242
+ 'vocabulary_id': review.vocabulary_id,
243
+ 'swahili': review.swahili,
244
+ 'mastery_level': 0,
245
+ 'times_reviewed': 0,
246
+ 'times_correct': 0,
247
+ 'ease_factor': 2.5,
248
+ 'interval_days': 0
249
+ })
250
+
251
+ # Update review counts
252
+ vocab_progress['times_reviewed'] = vocab_progress.get('times_reviewed', 0) + 1
253
+ if review.is_correct:
254
+ vocab_progress['times_correct'] = vocab_progress.get('times_correct', 0) + 1
255
+
256
+ # Update mastery level if provided
257
+ if review.mastery_level is not None:
258
+ vocab_progress['mastery_level'] = review.mastery_level
259
+
260
+ # Update timestamps
261
+ vocab_progress['last_reviewed_at'] = datetime.utcnow().isoformat() + 'Z'
262
+
263
+ # Calculate next review date using simple spaced repetition
264
+ # (simplified version - could use SuperMemo SM-2 algorithm)
265
+ interval_days = vocab_progress.get('interval_days', 0)
266
+ if review.is_correct:
267
+ interval_days = max(1, interval_days * 2) # Double the interval
268
+ else:
269
+ interval_days = 1 # Reset to 1 day if incorrect
270
+
271
+ vocab_progress['interval_days'] = interval_days
272
+
273
+ from datetime import timedelta
274
+ next_review = datetime.utcnow() + timedelta(days=interval_days)
275
+ vocab_progress['next_review_at'] = next_review.isoformat() + 'Z'
276
+
277
+ # Save to file
278
+ success = learning_service.update_vocabulary_progress(
279
+ user_id,
280
+ review.vocabulary_id,
281
+ vocab_progress
282
+ )
283
+
284
+ if not success:
285
+ raise HTTPException(status_code=500, detail="Failed to save vocabulary progress")
286
+
287
+ return {
288
+ "success": True,
289
+ "message": "Vocabulary review recorded",
290
+ "next_review_at": vocab_progress['next_review_at']
291
+ }
292
+ except HTTPException:
293
+ raise
294
+ except Exception as e:
295
+ logger.error(f"Error recording vocabulary review: {e}")
296
+ raise HTTPException(status_code=500, detail="Failed to record vocabulary review")
297
+
298
+
299
+ # ==================== Achievement Endpoints ====================
300
+
301
+ @router.get("/achievements")
302
+ async def get_achievements(request: Request, token: Optional[str] = Depends(optional_hf_token)):
303
+ """
304
+ Get all available achievements and user's progress on them
305
+ """
306
+ try:
307
+ # Get achievements configuration
308
+ achievements_config = learning_service.get_achievements()
309
+ if not achievements_config:
310
+ raise HTTPException(status_code=404, detail="Achievements not found")
311
+
312
+ # Get user progress
313
+ user_id = token if token else 'anonymous'
314
+ user_progress = learning_service.get_user_progress(user_id)
315
+
316
+ # Merge achievement definitions with user progress
317
+ user_achievements = user_progress.get('achievements', {}) if user_progress else {}
318
+
319
+ achievements_with_progress = []
320
+ for achievement in achievements_config.get('achievements', []):
321
+ achievement_id = achievement['achievement_id']
322
+ achievement_data = {
323
+ **achievement,
324
+ 'unlocked': False,
325
+ 'progress': 0
326
+ }
327
+
328
+ # Add user progress if available
329
+ if achievement_id in user_achievements:
330
+ achievement_data.update(user_achievements[achievement_id])
331
+
332
+ achievements_with_progress.append(achievement_data)
333
+
334
+ return {
335
+ "success": True,
336
+ "achievements": achievements_with_progress,
337
+ "tiers": achievements_config.get('tiers', {})
338
+ }
339
+ except HTTPException:
340
+ raise
341
+ except Exception as e:
342
+ logger.error(f"Error fetching achievements: {e}")
343
+ raise HTTPException(status_code=500, detail="Failed to fetch achievements")
344
+
345
+
346
+ @router.post("/achievements/check")
347
+ async def check_achievement(
348
+ achievement: AchievementCheck,
349
+ request: Request,
350
+ token: Optional[str] = Depends(optional_hf_token)
351
+ ):
352
+ """
353
+ Check and potentially unlock an achievement
354
+
355
+ Updates achievement progress and unlocks if target is reached
356
+ """
357
+ try:
358
+ user_id = token if token else 'anonymous'
359
+
360
+ success = learning_service.unlock_achievement(
361
+ user_id,
362
+ achievement.achievement_id,
363
+ achievement.progress,
364
+ achievement.target
365
+ )
366
+
367
+ if not success:
368
+ raise HTTPException(status_code=500, detail="Failed to update achievement")
369
+
370
+ is_unlocked = achievement.progress >= achievement.target
371
+
372
+ return {
373
+ "success": True,
374
+ "unlocked": is_unlocked,
375
+ "achievement_id": achievement.achievement_id
376
+ }
377
+ except HTTPException:
378
+ raise
379
+ except Exception as e:
380
+ logger.error(f"Error checking achievement: {e}")
381
+ raise HTTPException(status_code=500, detail="Failed to check achievement")
382
+
383
+
384
+ # ==================== Statistics Endpoints ====================
385
+
386
+ @router.get("/stats")
387
+ async def get_user_stats(request: Request, token: Optional[str] = Depends(optional_hf_token)):
388
+ """
389
+ Get user's overall learning statistics
390
+
391
+ Returns aggregated stats like total XP, streak, lessons completed, etc.
392
+ """
393
+ try:
394
+ user_id = token if token else 'anonymous'
395
+ progress = learning_service.get_user_progress(user_id)
396
+
397
+ if not progress:
398
+ raise HTTPException(status_code=500, detail="Failed to load user progress")
399
+
400
+ return {
401
+ "success": True,
402
+ "stats": progress.get('overall_stats', {}),
403
+ "daily_stats": progress.get('daily_stats', {})
404
+ }
405
+ except HTTPException:
406
+ raise
407
+ except Exception as e:
408
+ logger.error(f"Error fetching user stats: {e}")
409
+ raise HTTPException(status_code=500, detail="Failed to fetch user stats")
410
+
411
+
412
+ # ==================== TTS and ASR Endpoints ====================
413
+
414
+ class TTSRequest(BaseModel):
415
+ text: str
416
+ language: str
417
+ messageId: Optional[str] = None
418
+
419
+
420
+ @router.post("/tts/generate")
421
+ async def generate_tts(
422
+ tts_request: TTSRequest,
423
+ request: Request
424
+ ):
425
+ """
426
+ Generate TTS audio for lesson text
427
+ """
428
+ try:
429
+ from app.main import tts_service
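+ # imported inside the handler, presumably to avoid a circular import with app.main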
430
+
431
+ # Generate TTS audio
432
+ audio_data = await tts_service.generate_speech(
433
+ text=tts_request.text,
434
+ language_code=tts_request.language
435
+ )
436
+
437
+ if not audio_data:
438
+ raise HTTPException(status_code=500, detail="Failed to generate TTS audio")
439
+
440
+ # Return audio as WAV file
441
+ return Response(
442
+ content=audio_data,
443
+ media_type="audio/wav",
444
+ headers={
445
+ "Content-Disposition": f"inline; filename=tts_{tts_request.messageId or 'audio'}.wav"
446
+ }
447
+ )
448
+ except Exception as e:
449
+ logger.error(f"Error generating TTS: {e}")
450
+ raise HTTPException(status_code=500, detail=f"Failed to generate TTS: {str(e)}")
451
+
452
+
453
+ @router.post("/transcribe")
454
+ async def transcribe_audio(
455
+ request: Request,
456
+ audio: UploadFile = File(...)
457
+ ):
458
+ """
459
+ Transcribe audio for pronunciation practice
460
+ """
461
+ try:
462
+ from app.main import transcription_service
463
+
464
+ # Read audio file
465
+ audio_bytes = await audio.read()
466
+
467
+ # Get language from form data (default to Swahili)
468
+ form = await request.form()
469
+ language = form.get('language', 'swa')
470
+
471
+ # Transcribe
472
+ text = await transcription_service.transcribe_audio(
473
+ audio_data=audio_bytes,
474
+ language_code=language
475
+ )
476
+
477
+ return {
478
+ "success": True,
479
+ "text": text,
480
+ "language": language
481
+ }
482
+ except Exception as e:
483
+ logger.error(f"Error transcribing audio: {e}")
484
+ raise HTTPException(status_code=500, detail=f"Failed to transcribe: {str(e)}")
485
+
486
+
487
+ # ==================== Phase 1-3 Endpoints ====================
488
+
489
+ # Vocabulary Management
490
+
491
+ class VocabularyAddRequest(BaseModel):
492
+ vocab_id: int
493
+ source_lesson_id: Optional[int] = None
494
+
495
+
496
+ class VocabularyReviewRequest(BaseModel):
497
+ vocab_id: int
498
+ rating: str # 'again', 'hard', 'good', 'easy'
499
+
500
+
501
+ @router.get("/vocabulary/due")
502
+ async def get_due_vocabulary(
503
+ request: Request,
504
+ token: Optional[str] = Depends(optional_hf_token)
505
+ ):
506
+ """Get vocabulary words due for FSRS review"""
507
+ try:
508
+ user_id = token if token else 'anonymous'
509
+ progress = learning_service.get_user_progress(user_id)
510
+
511
+ if not progress:
512
+ return {"due_words": [], "total_due": 0}
513
+
514
+ vocab_progress = progress.get('vocabulary_progress', {})
515
+ now = datetime.utcnow()
516
+ due_words = []
517
+
518
+ for vocab_id, vocab_data in vocab_progress.items():
519
+ next_review_str = vocab_data.get('fsrs', {}).get('next_review')
520
+ if not next_review_str:
521
+ continue
522
+
523
+ next_review = datetime.fromisoformat(next_review_str.rstrip('Z'))
524
+
525
+ if next_review <= now:
526
+ hours_overdue = (now - next_review).total_seconds() / 3600
527
+ vocab_data['priority'] = 1000 - hours_overdue
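+ # higher priority = less overdue; the reverse sort below puts the least-overdue words first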
528
+ due_words.append(vocab_data)
529
+
530
+ due_words.sort(key=lambda x: x.get('priority', 0), reverse=True)
531
+
532
+ return {
533
+ "due_words": due_words,
534
+ "total_due": len(due_words),
535
+ "timestamp": now.isoformat() + 'Z'
536
+ }
537
+ except Exception as e:
538
+ logger.error(f"Error getting due vocabulary: {e}")
539
+ raise HTTPException(status_code=500, detail="Failed to get due vocabulary")
540
+
541
+
542
+ @router.post("/vocabulary/add")
543
+ async def add_vocabulary_to_practice(
544
+ vocab_request: VocabularyAddRequest,
545
+ request: Request,
546
+ token: Optional[str] = Depends(optional_hf_token)
547
+ ):
548
+ """Add a vocabulary word to user's practice queue with FSRS initialization"""
549
+ try:
550
+ user_id = token if token else 'anonymous'
551
+
552
+ vocab = learning_service.get_vocabulary(vocab_request.vocab_id)
553
+ if not vocab:
554
+ raise HTTPException(status_code=404, detail="Vocabulary not found")
555
+
556
+ fsrs_data = {
557
+ 'difficulty': 0.3,
558
+ 'stability': 2.5,
559
+ 'retrievability': 1.0,
560
+ 'review_count': 0,
561
+ 'last_review': None,
562
+ 'next_review': datetime.utcnow().isoformat() + 'Z',
563
+ 'lapses': 0,
564
+ 'state': 'new'
565
+ }
566
+
567
+ user_vocab = {
568
+ 'vocabulary_id': vocab_request.vocab_id,
569
+ 'swahili': vocab.get('swahili', ''),
570
+ 'english': vocab.get('english', ''),
571
+ 'part_of_speech': vocab.get('part_of_speech', 'unknown'),
572
+ 'added_at': datetime.utcnow().isoformat() + 'Z',
573
+ 'added_from': vocab_request.source_lesson_id,
574
+ 'fsrs': fsrs_data,
575
+ 'mastery_level': 0,
576
+ 'times_reviewed': 0,
577
+ 'times_correct': 0,
578
+ 'accuracy': 0.0
579
+ }
580
+
581
+ success = learning_service.update_vocabulary_progress(
582
+ user_id, str(vocab_request.vocab_id), user_vocab
583
+ )
584
+
585
+ if success:
586
+ return {"success": True, "vocabulary": user_vocab}
587
+ else:
588
+ raise HTTPException(status_code=500, detail="Failed to add vocabulary")
589
+ except HTTPException:
590
+ raise
591
+ except Exception as e:
592
+ logger.error(f"Error adding vocabulary: {e}")
593
+ raise HTTPException(status_code=500, detail="Failed to add vocabulary")
594
+
595
+
596
+ def calculate_next_review_fsrs(fsrs: Dict, grade: int) -> Dict:
597
+ """Implement FSRS algorithm"""
598
+ from datetime import timedelta
599
+
600
+ difficulty = fsrs['difficulty']
601
+ stability = fsrs['stability']
602
+
603
+ if grade == 0:
604
+ new_difficulty = min(difficulty + 0.2, 1.0)
605
+ elif grade == 2:
606
+ new_difficulty = min(difficulty + 0.1, 1.0)
607
+ elif grade == 4:
608
+ new_difficulty = max(difficulty - 0.1, 0.0)
609
+ else:
610
+ new_difficulty = difficulty
611
+
612
+ if grade == 0:
613
+ new_stability = stability * 0.5
614
+ state = 'relearning'
615
+ interval_minutes = 10
616
+ elif grade == 2:
617
+ new_stability = stability * 1.2
618
+ state = 'review'
619
+ interval_minutes = int(new_stability * 24 * 60)
620
+ elif grade == 3:
621
+ new_stability = stability * 2.5
622
+ state = 'review'
623
+ interval_minutes = int(new_stability * 24 * 60)
624
+ else:
625
+ new_stability = stability * 4.0
626
+ state = 'review'
627
+ interval_minutes = int(new_stability * 24 * 60)
628
+
629
+ next_review = datetime.utcnow() + timedelta(minutes=interval_minutes)
630
+
631
+ return {
632
+ 'difficulty': new_difficulty,
633
+ 'stability': new_stability,
634
+ 'retrievability': 0.9 if grade >= 2 else 0.0,
635
+ 'review_count': fsrs['review_count'] + 1,
636
+ 'last_review': datetime.utcnow().isoformat() + 'Z',
637
+ 'next_review': next_review.isoformat() + 'Z',
638
+ 'lapses': fsrs['lapses'],
639
+ 'state': state,
640
+ 'interval_days': interval_minutes / (24 * 60)
641
+ }
642
+
643
+
644
+ def calculate_mastery_level(vocab: Dict) -> int:
645
+ """Calculate mastery level (0-5)"""
646
+ accuracy = vocab['accuracy']
647
+ reviews = vocab['times_reviewed']
648
+ stability = vocab['fsrs']['stability']
649
+
650
+ if reviews == 0:
651
+ return 0
652
+ elif reviews >= 40 and accuracy >= 98 and stability >= 90:
653
+ return 5
654
+ elif reviews >= 20 and accuracy >= 95 and stability >= 30:
655
+ return 4
656
+ elif reviews < 5 or accuracy < 70:
657
+ return 1
658
+ elif reviews < 10 or accuracy < 85:
659
+ return 2
660
+ else:
661
+ return 3
664
+
665
+
666
+ @router.post("/vocabulary/review")
667
+ async def record_vocabulary_review_fsrs(
668
+ review_request: VocabularyReviewRequest,
669
+ request: Request,
670
+ token: Optional[str] = Depends(optional_hf_token)
671
+ ):
672
+ """Record vocabulary review and update FSRS parameters"""
673
+ try:
674
+ user_id = token if token else 'anonymous'
675
+ progress = learning_service.get_user_progress(user_id)
676
+
677
+ if not progress or str(review_request.vocab_id) not in progress.get('vocabulary_progress', {}):
678
+ raise HTTPException(status_code=404, detail="Vocabulary not in practice queue")
679
+
680
+ vocab = progress['vocabulary_progress'][str(review_request.vocab_id)]
681
+ fsrs = vocab['fsrs']
682
+
683
+ grade_map = {'again': 0, 'hard': 2, 'good': 3, 'easy': 4}
684
+ grade = grade_map.get(review_request.rating, 3)
685
+
686
+ new_fsrs = calculate_next_review_fsrs(fsrs, grade)
687
+
688
+ vocab['fsrs'] = new_fsrs
689
+ vocab['times_reviewed'] += 1
690
+ if grade >= 2:
691
+ vocab['times_correct'] += 1
692
+ else:
693
+ vocab['fsrs']['lapses'] += 1
694
+
695
+ vocab['accuracy'] = (vocab['times_correct'] / vocab['times_reviewed']) * 100 if vocab['times_reviewed'] > 0 else 0
696
+ vocab['mastery_level'] = calculate_mastery_level(vocab)
697
+ vocab['last_reviewed_at'] = datetime.utcnow().isoformat() + 'Z'
698
+
699
+ if 'vocabulary_reviewed' not in progress['overall_stats']:
700
+ progress['overall_stats']['vocabulary_reviewed'] = 0
701
+ progress['overall_stats']['vocabulary_reviewed'] += 1
702
+
703
+ learning_service.save_user_progress(user_id, progress)
704
+
705
+ return {
706
+ "success": True,
707
+ "vocabulary": vocab,
708
+ "next_review": new_fsrs['next_review'],
709
+ "interval_days": new_fsrs['interval_days']
710
+ }
711
+ except HTTPException:
712
+ raise
713
+ except Exception as e:
714
+ logger.error(f"Error recording vocabulary review: {e}")
715
+ raise HTTPException(status_code=500, detail="Failed to record review")
716
+
717
+
718
+ @router.get("/vocabulary/stats")
719
+ async def get_vocabulary_stats(
720
+ request: Request,
721
+ token: Optional[str] = Depends(optional_hf_token)
722
+ ):
723
+ """Get vocabulary mastery statistics"""
724
+ try:
725
+ user_id = token if token else 'anonymous'
726
+ progress = learning_service.get_user_progress(user_id)
727
+
728
+ if not progress:
729
+ return {
730
+ "total_words": 0,
731
+ "in_practice": 0,
732
+ "mastery_breakdown": {str(i): 0 for i in range(6)},
733
+ "average_accuracy": 0
734
+ }
735
+
736
+ vocab_progress = progress.get('vocabulary_progress', {})
737
+ mastery_breakdown = {str(i): 0 for i in range(6)}
738
+ total_accuracy = 0
739
+ total_with_reviews = 0
740
+
741
+ for vocab_data in vocab_progress.values():
742
+ level = vocab_data.get('mastery_level', 0)
743
+ mastery_breakdown[str(level)] += 1
744
+
745
+ if vocab_data.get('times_reviewed', 0) > 0:
746
+ total_accuracy += vocab_data.get('accuracy', 0)
747
+ total_with_reviews += 1
748
+
749
+ avg_accuracy = total_accuracy / total_with_reviews if total_with_reviews > 0 else 0
750
+
751
+ return {
752
+ "total_words": len(vocab_progress),
753
+ "in_practice": len(vocab_progress),
754
+ "mastery_breakdown": mastery_breakdown,
755
+ "average_accuracy": round(avg_accuracy, 1),
756
+ "total_reviews": sum(v.get('times_reviewed', 0) for v in vocab_progress.values())
757
+ }
758
+ except Exception as e:
759
+ logger.error(f"Error getting vocabulary stats: {e}")
760
+ raise HTTPException(status_code=500, detail="Failed to get stats")
761
+
762
+
763
+ @router.get("/vocabulary/library")
764
+ async def get_vocabulary_library(
765
+ lesson_id: Optional[int] = None,
766
+ level: Optional[str] = None,
767
+ search: Optional[str] = None,
768
+ request: Request = None,
769
+ token: Optional[str] = Depends(optional_hf_token)
770
+ ):
771
+ """Browse all vocabulary with filters"""
772
+ try:
773
+ user_id = token if token else 'anonymous'
774
+
775
+ all_vocab = learning_service.get_all_vocabulary()
776
+ progress = learning_service.get_user_progress(user_id)
777
+ user_vocab = progress.get('vocabulary_progress', {}) if progress else {}
778
+
779
+ filtered_vocab = all_vocab
780
+
781
+ if lesson_id:
782
+ filtered_vocab = [v for v in filtered_vocab if v.get('lesson_id') == lesson_id]
783
+
784
+ if level:
785
+ filtered_vocab = [v for v in filtered_vocab if v.get('level') == level]
786
+
787
+ if search:
788
+ search_lower = search.lower()
789
+ filtered_vocab = [v for v in filtered_vocab
790
+ if search_lower in v.get('swahili', '').lower()
791
+ or search_lower in v.get('english', '').lower()]
792
+
793
+ for vocab in filtered_vocab:
794
+ vocab_id = str(vocab.get('vocabulary_id') or vocab.get('id'))
795
+ if vocab_id in user_vocab:
796
+ vocab['status'] = 'practicing'
797
+ vocab['mastery_level'] = user_vocab[vocab_id].get('mastery_level', 0)
798
+ vocab['accuracy'] = user_vocab[vocab_id].get('accuracy', 0)
799
+ vocab['next_review'] = user_vocab[vocab_id].get('fsrs', {}).get('next_review')
800
+ else:
801
+ vocab['status'] = 'not_practicing'
802
+ vocab['mastery_level'] = 0
803
+
804
+ return {
805
+ "vocabulary": filtered_vocab,
806
+ "total": len(filtered_vocab),
807
+ "filters_applied": {
808
+ "lesson_id": lesson_id,
809
+ "level": level,
810
+ "search": search
811
+ }
812
+ }
813
+ except Exception as e:
814
+ logger.error(f"Error getting vocabulary library: {e}")
815
+ raise HTTPException(status_code=500, detail="Failed to get vocabulary")
816
+
817
+
818
+ # Reading Comprehension
819
+
820
+ class ComprehensionAnswer(BaseModel):
821
+ question_id: str
822
+ answer: str
823
+
824
+
825
+ class ComprehensionSubmission(BaseModel):
826
+ lesson_id: int
827
+ passage_id: str
828
+ answers: List[ComprehensionAnswer]
829
+
830
+
831
+ @router.post("/comprehension/submit")
832
+ async def submit_comprehension_answers(
833
+ submission: ComprehensionSubmission,
834
+ request: Request,
835
+ token: Optional[str] = Depends(optional_hf_token)
836
+ ):
837
+ """Submit reading comprehension answers and get scoring"""
838
+ try:
839
+ user_id = token if token else 'anonymous'
840
+
841
+ lesson = learning_service.get_lesson(submission.lesson_id)
842
+ if not lesson:
843
+ raise HTTPException(status_code=404, detail="Lesson not found")
844
+
845
+ passage = None
846
+ for p in lesson.get('reading_passages', []):
847
+ if p['passage_id'] == submission.passage_id:
848
+ passage = p
849
+ break
850
+
851
+ if not passage:
852
+ raise HTTPException(status_code=404, detail="Passage not found")
853
+
854
+ results = []
855
+ correct_count = 0
856
+
857
+ for submitted in submission.answers:
858
+ question_id = submitted.question_id
859
+ user_answer = submitted.answer.strip().lower()
860
+
861
+ question = None
862
+ for q in passage['comprehension_questions']:
863
+ if q['question_id'] == question_id:
864
+ question = q
865
+ break
866
+
867
+ if not question:
868
+ continue
869
+
870
+ correct_answers = [ans.strip().lower() for ans in question.get('correct_answers', [])]
871
+ is_correct = user_answer in correct_answers
872
+
873
+ if is_correct:
874
+ correct_count += 1
875
+
876
+ results.append({
877
+ "question_id": question_id,
878
+ "correct": is_correct,
879
+ "user_answer": user_answer,
880
+ "correct_answer": question['correct_answers'][0] if correct_answers else None,
881
+ "explanation": question.get('explanation')
882
+ })
883
+
884
+ score = (correct_count / len(submission.answers)) * 100 if submission.answers else 0
885
+
886
+ progress = learning_service.get_user_progress(user_id)
887
+ if not progress:
888
+ progress = learning_service.create_default_progress(user_id)
889
+
890
+ if 'comprehension_scores' not in progress:
891
+ progress['comprehension_scores'] = {}
892
+
893
+ progress['comprehension_scores'][f"{submission.lesson_id}_{submission.passage_id}"] = {
894
+ "score": score,
895
+ "completed_at": datetime.utcnow().isoformat() + 'Z',
896
+ "attempts": progress['comprehension_scores'].get(f"{submission.lesson_id}_{submission.passage_id}", {}).get('attempts', 0) + 1
897
+ }
898
+
899
+ learning_service.save_user_progress(user_id, progress)
900
+
901
+ return {
902
+ "results": results,
903
+ "score": round(score, 1),
904
+ "correct": correct_count,
905
+ "total": len(submission.answers)
906
+ }
907
+ except HTTPException:
908
+ raise
909
+ except Exception as e:
910
+ logger.error(f"Error submitting comprehension: {e}")
911
+ raise HTTPException(status_code=500, detail="Failed to submit comprehension")
912
+
913
+
914
+ # Task Scenarios
915
+
916
+ class ScenarioProgressUpdate(BaseModel):
917
+ turn_id: str
918
+ choice_id: str
919
+
920
+
921
+ @router.get("/scenarios/{scenario_id}")
922
+ async def get_scenario(
923
+ scenario_id: str,
924
+ request: Request,
925
+ token: Optional[str] = Depends(optional_hf_token)
926
+ ):
927
+ """Get task scenario with branching dialogue"""
928
+ try:
929
+ user_id = token if token else 'anonymous'
930
+
931
+ scenario = learning_service.get_scenario(scenario_id)
932
+ if not scenario:
933
+ raise HTTPException(status_code=404, detail="Scenario not found")
934
+
935
+ progress = learning_service.get_user_progress(user_id)
936
+ scenario_progress = None
937
+
938
+ if progress and 'scenario_progress' in progress:
939
+ scenario_progress = progress['scenario_progress'].get(scenario_id)
940
+
941
+ return {
942
+ "scenario": scenario,
943
+ "user_progress": scenario_progress
944
+ }
945
+ except HTTPException:
946
+ raise
947
+ except Exception as e:
948
+ logger.error(f"Error getting scenario: {e}")
949
+ raise HTTPException(status_code=500, detail="Failed to get scenario")
950
+
951
+
952
+ @router.post("/scenarios/{scenario_id}/progress")
953
+ async def update_scenario_progress(
954
+ scenario_id: str,
955
+ progress_update: ScenarioProgressUpdate,
956
+ request: Request,
957
+ token: Optional[str] = Depends(optional_hf_token)
958
+ ):
959
+ """Update scenario progress with user choice"""
960
+ try:
961
+ user_id = token if token else 'anonymous'
962
+
963
+ scenario = learning_service.get_scenario(scenario_id)
964
+ if not scenario:
965
+ raise HTTPException(status_code=404, detail="Scenario not found")
966
+
967
+ progress = learning_service.get_user_progress(user_id)
968
+ if not progress:
969
+ progress = learning_service.create_default_progress(user_id)
970
+
971
+ if 'scenario_progress' not in progress:
972
+ progress['scenario_progress'] = {}
973
+
974
+ if scenario_id not in progress['scenario_progress']:
975
+ progress['scenario_progress'][scenario_id] = {
976
+ "started_at": datetime.utcnow().isoformat() + 'Z',
977
+ "turns": [],
978
+ "completed": False
979
+ }
980
+
981
+ progress['scenario_progress'][scenario_id]['turns'].append({
982
+ "turn_id": progress_update.turn_id,
983
+ "choice_id": progress_update.choice_id,
984
+ "timestamp": datetime.utcnow().isoformat() + 'Z'
985
+ })
986
+
987
+ turns_count = len(progress['scenario_progress'][scenario_id]['turns'])
988
+ if turns_count >= scenario.get('required_turns', 6):
989
+ progress['scenario_progress'][scenario_id]['completed'] = True
990
+ progress['scenario_progress'][scenario_id]['completed_at'] = datetime.utcnow().isoformat() + 'Z'
991
+
992
+ learning_service.save_user_progress(user_id, progress)
993
+
994
+ return {
995
+ "success": True,
996
+ "progress": progress['scenario_progress'][scenario_id]
997
+ }
998
+ except HTTPException:
999
+ raise
1000
+ except Exception as e:
1001
+ logger.error(f"Error updating scenario progress: {e}")
1002
+ raise HTTPException(status_code=500, detail="Failed to update scenario")
1003
+
1004
+
1005
+ @router.get("/scenarios")
1006
+ async def list_scenarios(
1007
+ request: Request,
1008
+ token: Optional[str] = Depends(optional_hf_token)
1009
+ ):
1010
+ """Get list of all available scenarios"""
1011
+ try:
1012
+ scenarios = learning_service.get_all_scenarios()
1013
+ return {
1014
+ "success": True,
1015
+ "scenarios": scenarios,
1016
+ "total": len(scenarios)
1017
+ }
1018
+ except Exception as e:
1019
+ logger.error(f"Error listing scenarios: {e}")
1020
+ raise HTTPException(status_code=500, detail="Failed to list scenarios")
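Taken together, the vocabulary endpoints above form the practice loop: add a word, record ratings as the user drills it, and poll for due items. A hypothetical client session (the `requests` dependency and the localhost URL are assumptions; paths and payload fields come from the router code):

```python
import requests  # assumed client-side dependency

BASE = "http://localhost:7860/api/learning"

# Put word 42 into the practice queue with fresh FSRS state.
requests.post(f"{BASE}/vocabulary/add", json={"vocab_id": 42, "source_lesson_id": 1})

# Record a review; the response carries the next due date.
r = requests.post(f"{BASE}/vocabulary/review", json={"vocab_id": 42, "rating": "good"})
print(r.json()["next_review"])  # ISO-8601 timestamp from the FSRS update

# Fetch everything whose next_review is already in the past.
due = requests.get(f"{BASE}/vocabulary/due").json()
print(due["total_due"])
```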
app/routers/mobile.py ADDED
@@ -0,0 +1,536 @@
1
+ from fastapi import APIRouter, HTTPException, File, UploadFile, Form, Query, Depends
2
+ from fastapi.responses import Response
3
+ from typing import Optional
4
+ from pydantic import BaseModel
5
+ import base64
6
+ import json
7
+ import uuid
8
+ import datetime
9
+ from app.services.translation_service import TranslationService
10
+ from app.services.tts_service import TTSService
11
+ from app.services.transcription_service import TranscriptionService
12
+ from app.auth import require_hf_token
13
+
14
+ router = APIRouter()
15
+
16
+ # Service instances - these will be injected by main app
17
+ translation_service = None
18
+ tts_service = None
19
+ transcription_service = None
20
+
21
+ # Mobile-specific data models
22
+ class MobileSessionRequest(BaseModel):
23
+ user_name: str
24
+ default_source_lang: str = "eng"
25
+ default_target_lang: str = "swa"
26
+
27
+ class MobileSessionResponse(BaseModel):
28
+ session_id: str
29
+ participant_id: str
30
+ user_name: str
31
+ source_language: str
32
+ target_language: str
33
+
34
+ class MobileTranscribeRequest(BaseModel):
35
+ participant_id: str
36
+ source_language: str
37
+ target_language: str
38
+ is_final_chunk: bool = False
39
+
40
+ class MobileLanguageUpdateRequest(BaseModel):
41
+ participant_id: str
42
+ source_language: str
43
+ target_language: str
44
+
45
+ # In-memory session storage (in production, use Redis or database)
46
+ mobile_sessions = {}
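The dict above keeps sessions in process memory, which the inline comment already flags as non-production: each replica would hold its own sessions and a restart drops them all. A hedged sketch of the Redis-backed variant the comment suggests, not part of this commit (assumes the `redis` package; key names and TTL are illustrative):

```python
# Illustrative only: a Redis store for mobile sessions.
import json

import redis

r = redis.Redis(host="localhost", port=6379, decode_responses=True)

def save_mobile_session(session: dict) -> None:
    # Expire idle sessions after 24h so the store cannot grow without bound.
    r.setex(f"mobile_session:{session['session_id']}", 86400, json.dumps(session))

def load_mobile_session(session_id: str) -> dict | None:
    raw = r.get(f"mobile_session:{session_id}")
    return json.loads(raw) if raw else None
```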
47
+
48
+ @router.post("/mobile/session/create", response_model=MobileSessionResponse)
49
+ async def create_mobile_session(
50
+ user_name: str = Form(...),
51
+ default_source_lang: str = Form("eng"),
52
+ default_target_lang: str = Form("swa"),
53
+ token: str = Depends(require_hf_token)
54
+ ):
55
+ """Create a mobile-specific single-user session"""
56
+ try:
57
+ print(f"=== MOBILE SESSION CREATE REQUEST ===")
58
+ print(f"User name: {user_name}")
59
+ print(f"Source language: {default_source_lang}")
60
+ print(f"Target language: {default_target_lang}")
61
+
62
+ # Validate inputs
63
+ if not user_name or user_name.strip() == "":
64
+ raise HTTPException(status_code=400, detail="User name is required")
65
+
66
+ # Validate language codes
67
+ valid_languages = ["eng", "swa", "kik", "kam", "mer", "luo", "som"]
68
+ if default_source_lang not in valid_languages:
69
+ print(f"Invalid source language: {default_source_lang}, defaulting to 'eng'")
70
+ default_source_lang = "eng"
71
+ if default_target_lang not in valid_languages:
72
+ print(f"Invalid target language: {default_target_lang}, defaulting to 'swa'")
73
+ default_target_lang = "swa"
74
+
75
+ session_id = f"mobile-{uuid.uuid4().hex[:8]}"
76
+ participant_id = f"user-{uuid.uuid4().hex[:8]}"
77
+
78
+ # Store session data
79
+ mobile_sessions[session_id] = {
80
+ "session_id": session_id,
81
+ "participant_id": participant_id,
82
+ "user_name": user_name.strip(),
83
+ "source_language": default_source_lang,
84
+ "target_language": default_target_lang,
85
+ "created_at": datetime.datetime.now().isoformat()
86
+ }
87
+
88
+ print(f"Created session: {session_id} for user: {user_name}")
89
+ print(f"Total sessions: {len(mobile_sessions)}")
90
+
91
+ response = MobileSessionResponse(
92
+ session_id=session_id,
93
+ participant_id=participant_id,
94
+ user_name=user_name.strip(),
95
+ source_language=default_source_lang,
96
+ target_language=default_target_lang
97
+ )
98
+
99
+ print(f"Returning response: {response}")
100
+ return response
101
+
102
+ except HTTPException:
103
+ raise
104
+ except Exception as e:
105
+ print(f"ERROR creating mobile session: {e}")
106
+ import traceback
107
+ traceback.print_exc()
108
+ raise HTTPException(status_code=500, detail=f"Failed to create mobile session: {str(e)}")
109
+
110
+ @router.get("/mobile/session/{session_id}")
111
+ async def get_mobile_session(session_id: str, token: str = Depends(require_hf_token)):
112
+ """Get mobile session details"""
113
+ if session_id not in mobile_sessions:
114
+ raise HTTPException(status_code=404, detail="Session not found")
115
+
116
+ return mobile_sessions[session_id]
117
+
118
+ @router.put("/mobile/session/{session_id}/languages")
119
+ async def update_session_languages(
120
+ session_id: str,
121
+ participant_id: str = Form(...),
122
+ source_language: str = Form(...),
123
+ target_language: str = Form(...)
124
+ ):
125
+ """Update the default languages for a mobile session"""
126
+ try:
127
+ if session_id not in mobile_sessions:
128
+ raise HTTPException(status_code=404, detail="Session not found")
129
+
130
+ session = mobile_sessions[session_id]
131
+ if session["participant_id"] != participant_id:
132
+ raise HTTPException(status_code=403, detail="Invalid participant")
133
+
134
+ # Update session languages
135
+ session["source_language"] = source_language
136
+ session["target_language"] = target_language
137
+
138
+ return {
139
+ "success": True,
140
+ "session_id": session_id,
141
+ "source_language": source_language,
142
+ "target_language": target_language
143
+ }
144
+
145
+ except HTTPException:
+ raise
+ except Exception as e:
146
+ raise HTTPException(status_code=500, detail=f"Failed to update languages: {str(e)}")
147
+
148
+ @router.post("/mobile/session/{session_id}/transcribe-realtime")
149
+ async def transcribe_realtime(
150
+ session_id: str,
151
+ audio: UploadFile = File(...),
152
+ participant_id: str = Form(...),
153
+ source_language: str = Form(...),
154
+ target_language: str = Form(...),
155
+ is_final_chunk: bool = Form(False),
156
+ chunk_sequence: int = Form(0)
157
+ ):
158
+ """Real-time transcription endpoint for mobile with streaming support"""
159
+ try:
160
+ if session_id not in mobile_sessions:
161
+ raise HTTPException(status_code=404, detail="Session not found")
162
+
163
+ session = mobile_sessions[session_id]
164
+ if session["participant_id"] != participant_id:
165
+ raise HTTPException(status_code=403, detail="Invalid participant")
166
+
167
+ # Read audio file
168
+ audio_data = await audio.read()
169
+
170
+ # Generate unique message ID for this chunk sequence
171
+ message_id = f"msg-{participant_id}-{chunk_sequence}"
172
+
173
+ # Initialize response data
174
+ response_data = {
175
+ "success": True,
176
+ "message_id": message_id,
177
+ "chunk_sequence": chunk_sequence,
178
+ "original_text": "",
179
+ "original_language": source_language,
180
+ "is_final_chunk": is_final_chunk,
181
+ "is_interim": not is_final_chunk,
182
+ "session_id": session_id,
183
+ "translated_text": None,
184
+ "target_language": target_language,
185
+ "has_audio": False,
186
+ "audio_base64": None,
187
+ "audio_format": None
188
+ }
189
+
190
+ # Process transcription
191
+ if transcription_service:
192
+ try:
193
+ # Use streaming transcription if available
194
+ if hasattr(transcription_service, 'process_realtime_chunk'):
195
+ transcription_result = await transcription_service.process_realtime_chunk(
196
+ audio_data, source_language, participant_id, is_final_chunk
197
+ )
198
+ else:
199
+ # Fallback to regular transcription
200
+ transcription_result = await transcription_service.transcribe_audio(
201
+ audio_data, source_language
202
+ )
203
+
204
+ response_data["original_text"] = transcription_result or ""
205
+
206
+ # Only process translation and TTS for final chunks with actual text
207
+ if is_final_chunk and transcription_result and transcription_result.strip():
208
+ if translation_service:
209
+ try:
210
+ translated_text = await translation_service.translate_text(
211
+ transcription_result, source_language, target_language
212
+ )
213
+ response_data["translated_text"] = translated_text
214
+
215
+ # Generate TTS audio in target language
216
+ if tts_service and translated_text:
217
+ try:
218
+ tts_audio = await tts_service.generate_speech(
219
+ translated_text, target_language, output_format="wav"
220
+ )
221
+
222
+ if tts_audio:
223
+ response_data.update({
224
+ "has_audio": True,
225
+ "audio_base64": base64.b64encode(tts_audio).decode('utf-8'),
226
+ "audio_format": "wav"
227
+ })
228
+ except Exception as tts_error:
229
+ print(f"TTS generation failed: {tts_error}")
230
+ # Continue without TTS
231
+
232
+ except Exception as translation_error:
233
+ print(f"Translation failed: {translation_error}")
234
+ # Continue without translation
235
+
236
+ except Exception as transcription_error:
237
+ print(f"Transcription failed: {transcription_error}")
238
+ response_data["original_text"] = ""
239
+
240
+ return response_data
241
+
242
+ else:
243
+ raise HTTPException(status_code=500, detail="Transcription service not available")
244
+
245
+ except HTTPException:
+ raise
+ except Exception as e:
246
+ raise HTTPException(status_code=500, detail=f"Real-time transcription failed: {str(e)}")
247
+
248
+ @router.post("/mobile/session/{session_id}/stream-audio")
249
+ async def stream_audio_chunk(
250
+ session_id: str,
251
+ participant_id: str = Form(...),
252
+ audio_chunk: UploadFile = File(...),
253
+ source_language: str = Form(...),
254
+ target_language: str = Form(...),
255
+ chunk_index: int = Form(0),
256
+ is_speaking: bool = Form(True),
257
+ force_complete: bool = Form(False)
258
+ ):
259
+ """Stream audio chunks for continuous processing"""
260
+ try:
261
+ if session_id not in mobile_sessions:
262
+ raise HTTPException(status_code=404, detail="Session not found")
263
+
264
+ session = mobile_sessions[session_id]
265
+ if session["participant_id"] != participant_id:
266
+ raise HTTPException(status_code=403, detail="Invalid participant")
267
+
268
+ audio_data = await audio_chunk.read()
269
+
270
+ # Use streaming approach similar to WebSocket
271
+ interim_text = ""
272
+ if transcription_service:
273
+ try:
274
+ if hasattr(transcription_service, 'process_audio_chunk'):
275
+ result = await transcription_service.process_audio_chunk(
276
+ audio_data,
277
+ source_language,
278
+ participant_id,
279
+ has_voice_activity=is_speaking,
280
+ progress_callback=None, # No callback for HTTP
281
+ sentence_callback=None # No callback for HTTP
282
+ )
283
+ interim_text = result or ""
284
+ else:
285
+ # Fallback to regular transcription for interim results
286
+ interim_text = await transcription_service.transcribe_audio(
287
+ audio_data, source_language
288
+ ) or ""
289
+ except Exception as e:
290
+ print(f"Streaming transcription error: {e}")
291
+ interim_text = ""
292
+
293
+ return {
294
+ "success": True,
295
+ "chunk_index": chunk_index,
296
+ "session_id": session_id,
297
+ "interim_text": interim_text,
298
+ "is_speaking": is_speaking,
299
+ "force_complete": force_complete
300
+ }
301
+ else:
302
+ raise HTTPException(status_code=500, detail="Transcription service not available")
303
+
304
+ except HTTPException:
+ raise
+ except Exception as e:
305
+ raise HTTPException(status_code=500, detail=f"Audio streaming failed: {str(e)}")
306
+
307
+ @router.get("/mobile/session/{session_id}/realtime-status")
308
+ async def get_realtime_status(session_id: str, participant_id: str = Query(...)):
309
+ """Get current real-time processing status"""
310
+ try:
311
+ if session_id not in mobile_sessions:
312
+ raise HTTPException(status_code=404, detail="Session not found")
313
+
314
+ session = mobile_sessions[session_id]
315
+ if session["participant_id"] != participant_id:
316
+ raise HTTPException(status_code=403, detail="Invalid participant")
317
+
318
+ # Check if transcription service has any pending messages
319
+ pending_messages = []
320
+ if transcription_service:
321
+ try:
322
+ if hasattr(transcription_service, 'get_participant_status'):
323
+ pending_messages = transcription_service.get_participant_status(participant_id)
324
+ else:
325
+ pending_messages = []
326
+ except Exception as e:
327
+ print(f"Error getting participant status: {e}")
328
+ pending_messages = []
329
+
330
+ return {
331
+ "session_id": session_id,
332
+ "participant_id": participant_id,
333
+ "is_active": True,
334
+ "pending_messages": pending_messages,
335
+ "current_languages": {
336
+ "source": session["source_language"],
337
+ "target": session["target_language"]
338
+ },
339
+ "service_status": {
340
+ "transcription": transcription_service is not None,
341
+ "translation": translation_service is not None,
342
+ "tts": tts_service is not None
343
+ }
344
+ }
345
+
346
+ except HTTPException:
+ raise
+ except Exception as e:
347
+ raise HTTPException(status_code=500, detail=f"Status check failed: {str(e)}")
348
+
349
+ @router.post("/mobile/session/{session_id}/transcribe-with-languages")
350
+ async def transcribe_with_languages_legacy(
351
+ session_id: str,
352
+ audio: UploadFile = File(...),
353
+ participant_id: str = Form(...),
354
+ source_language: str = Form(...),
355
+ target_language: str = Form(...),
356
+ is_final_chunk: bool = Form(False)
357
+ ):
358
+ """Legacy endpoint - transcribe audio with specific source/target languages for mobile"""
359
+ try:
360
+ if session_id not in mobile_sessions:
361
+ raise HTTPException(status_code=404, detail="Session not found")
362
+
363
+ session = mobile_sessions[session_id]
364
+ if session["participant_id"] != participant_id:
365
+ raise HTTPException(status_code=403, detail="Invalid participant")
366
+
367
+ # Read audio file
368
+ audio_data = await audio.read()
369
+
370
+ # Generate unique message ID
371
+ message_id = f"msg-{uuid.uuid4().hex[:8]}"
372
+
373
+ # Initialize response
374
+ response_data = {
375
+ "success": True,
376
+ "message_id": message_id,
377
+ "original_text": "",
378
+ "original_language": source_language,
379
+ "translated_text": None,
380
+ "target_language": target_language,
381
+ "has_audio": False,
382
+ "is_final_chunk": is_final_chunk,
383
+ "audio_base64": None
384
+ }
385
+
386
+ # Process transcription in source language
387
+ if transcription_service:
388
+ try:
389
+ transcription_result = await transcription_service.transcribe_audio(
390
+ audio_data, source_language
391
+ )
392
+ response_data["original_text"] = transcription_result or ""
393
+
394
+ # Process translation to target language
395
+ if translation_service and transcription_result and transcription_result.strip():
396
+ try:
397
+ translated_text = await translation_service.translate_text(
398
+ transcription_result, source_language, target_language
399
+ )
400
+ response_data["translated_text"] = translated_text
401
+
402
+ # Generate TTS audio in target language
403
+ if tts_service and translated_text:
404
+ try:
405
+ tts_audio = await tts_service.generate_speech(
406
+ translated_text, target_language, output_format="wav"
407
+ )
408
+
409
+ if tts_audio:
410
+ response_data["has_audio"] = True
411
+ response_data["audio_base64"] = base64.b64encode(tts_audio).decode('utf-8')
412
+ except Exception as tts_error:
413
+ print(f"TTS generation failed: {tts_error}")
414
+
415
+ except Exception as translation_error:
416
+ print(f"Translation failed: {translation_error}")
417
+
418
+ except Exception as transcription_error:
419
+ print(f"Transcription failed: {transcription_error}")
420
+
421
+ return response_data
422
+ else:
423
+ raise HTTPException(status_code=500, detail="Transcription service not available")
424
+
425
+ except HTTPException:
+ raise
+ except Exception as e:
426
+ raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
427
+
428
+ @router.post("/mobile/translate")
429
+ async def translate_text_mobile(
430
+ text: str = Form(...),
431
+ source_lang: str = Form(...),
432
+ target_lang: str = Form(...)
433
+ ):
434
+ """Mobile-friendly text translation endpoint"""
435
+ try:
436
+ if not translation_service:
437
+ raise HTTPException(status_code=500, detail="Translation service not initialized")
438
+
439
+ # Map common language codes to internal format
440
+ lang_mapping = {
441
+ "english": "eng", "en": "eng",
442
+ "swahili": "swa", "sw": "swa",
443
+ "kikuyu": "kik", "ki": "kik",
444
+ "kamba": "kam", "kam": "kam",
445
+ "kimeru": "mer", "mer": "mer",
446
+ "luo": "luo", "luo": "luo",
447
+ "somali": "som", "so": "som"
448
+ }
449
+
450
+ source_code = lang_mapping.get(source_lang.lower(), source_lang.lower())
451
+ target_code = lang_mapping.get(target_lang.lower(), target_lang.lower())
452
+
453
+ translated_text = await translation_service.translate_text(text, source_code, target_code)
454
+
455
+ return {
456
+ "success": True,
457
+ "original_text": text,
458
+ "translated_text": translated_text or text,
459
+ "source_language": source_code,
460
+ "target_language": target_code
461
+ }
462
+
463
+ except HTTPException:
+ raise
+ except Exception as e:
464
+ raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
465
+
466
+ @router.get("/mobile/languages")
467
+ async def get_supported_languages():
468
+ """Get list of supported languages for mobile app"""
469
+ return {
470
+ "supported_languages": [
471
+ {"code": "eng", "name": "English", "display_name": "English (eng)"},
472
+ {"code": "swa", "name": "Swahili", "display_name": "Swahili (swa)"},
473
+ {"code": "kik", "name": "Kikuyu", "display_name": "Kikuyu (kik)"},
474
+ {"code": "kam", "name": "Kamba", "display_name": "Kamba (kam)"},
475
+ {"code": "mer", "name": "Kimeru", "display_name": "Kimeru (mer)"},
476
+ {"code": "luo", "name": "Luo", "display_name": "Luo (luo)"},
477
+ {"code": "som", "name": "Somali", "display_name": "Somali (som)"}
478
+ ]
479
+ }
480
+
481
+ @router.get("/mobile/test")
482
+ async def test_mobile_endpoints():
483
+ """Test endpoint for mobile app connectivity"""
484
+ return {
485
+ "status": "Mobile API is working",
486
+ "endpoints": [
487
+ "/mobile/session/create",
488
+ "/mobile/session/{session_id}",
489
+ "/mobile/session/{session_id}/languages",
490
+ "/mobile/session/{session_id}/transcribe-realtime",
491
+ "/mobile/session/{session_id}/stream-audio",
492
+ "/mobile/session/{session_id}/realtime-status",
493
+ "/mobile/session/{session_id}/transcribe-with-languages",
494
+ "/mobile/translate",
495
+ "/mobile/languages",
496
+ "/mobile/test"
497
+ ],
498
+ "timestamp": datetime.datetime.now().isoformat(),
499
+ "services_available": {
500
+ "transcription": transcription_service is not None,
501
+ "translation": translation_service is not None,
502
+ "tts": tts_service is not None
503
+ },
504
+ "active_sessions": len(mobile_sessions),
505
+ "session_list": list(mobile_sessions.keys())
506
+ }
507
+
508
+ @router.post("/mobile/test-session")
509
+ async def test_session_creation(
510
+ test_user: str = Form("TestUser"),
511
+ test_source: str = Form("eng"),
512
+ test_target: str = Form("swa")
513
+ ):
514
+ """Test session creation with debug info"""
515
+ try:
516
+ print(f"=== TEST SESSION CREATE ===")
517
+ print(f"Received: user={test_user}, source={test_source}, target={test_target}")
518
+
519
+ session_id = f"test-{uuid.uuid4().hex[:8]}"
520
+
521
+ return {
522
+ "success": True,
523
+ "test_session_id": session_id,
524
+ "received_params": {
525
+ "user": test_user,
526
+ "source": test_source,
527
+ "target": test_target
528
+ },
529
+ "form_processing": "OK"
530
+ }
531
+ except Exception as e:
532
+ print(f"Test session error: {e}")
533
+ return {
534
+ "success": False,
535
+ "error": str(e)
536
+ }
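
A minimal client sketch for the mobile endpoints above, for orientation only. It assumes the Space is reachable at http://localhost:7860 and that require_hf_token (defined in app.auth, which is not part of this excerpt) accepts a standard Bearer token header; both the base URL and the header format are assumptions.

import requests

BASE_URL = "http://localhost:7860"            # assumed deployment URL
HEADERS = {"Authorization": "Bearer hf_xxx"}  # hypothetical auth header

# Create a single-user mobile session; the endpoint takes form fields.
resp = requests.post(
    f"{BASE_URL}/mobile/session/create",
    data={"user_name": "Asha",
          "default_source_lang": "eng",
          "default_target_lang": "swa"},
    headers=HEADERS,
)
session = resp.json()
print(session["session_id"], session["participant_id"])

# Translate text; the endpoint maps full names ("english") or short codes ("sw").
resp = requests.post(
    f"{BASE_URL}/mobile/translate",
    data={"text": "Good morning", "source_lang": "english", "target_lang": "sw"},
)
print(resp.json()["translated_text"])
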
app/routers/sessions.py ADDED
@@ -0,0 +1,200 @@
1
+ from fastapi import APIRouter, HTTPException, Response, Depends
2
+ from typing import List
3
+ from pydantic import BaseModel
4
+ import qrcode
5
+ import io
6
+ from app.models import Session, SessionCreate
7
+ from app.auth import require_hf_token, optional_hf_token
8
+
9
+ router = APIRouter()
10
+
11
+ # This will be set by the main app
12
+ session_manager = None
13
+
14
+ # Initialize services (these will be injected by main app)
15
+ transcription_service = None
16
+ translation_service = None
17
+ tts_service = None
18
+
19
+ class TextTranslationRequest(BaseModel):
20
+ text: str
21
+ source_language: str
22
+ target_language: str
23
+
24
+ class TextTranslationResponse(BaseModel):
25
+ original_text: str
26
+ translated_text: str
27
+ source_language: str
28
+ target_language: str
29
+
30
+ @router.post("/sessions", response_model=Session)
31
+ async def create_session(session_data: SessionCreate, token: str = Depends(require_hf_token)):
32
+ """Create a new transcription session"""
33
+ try:
34
+ session = await session_manager.create_session(session_data)
35
+ return session
36
+ except Exception as e:
37
+ raise HTTPException(status_code=500, detail=str(e))
38
+
39
+ @router.get("/sessions", response_model=List[Session])
40
+ async def get_all_sessions(token: str = Depends(require_hf_token)):
41
+ """Get all active sessions"""
42
+ try:
43
+ sessions = await session_manager.get_all_sessions()
44
+ return sessions
45
+ except Exception as e:
46
+ raise HTTPException(status_code=500, detail=str(e))
47
+
48
+ @router.get("/sessions/{session_id}", response_model=Session)
49
+ async def get_session(session_id: str, token: str = Depends(require_hf_token)):
50
+ """Get specific session by ID or short code"""
51
+ session = await session_manager.get_session(session_id)
52
+ if not session:
53
+ raise HTTPException(status_code=404, detail="Session not found")
54
+ return session
55
+
56
+ @router.get("/sessions/{session_id}/short-code")
57
+ async def get_session_short_code(session_id: str, token: str = Depends(require_hf_token)):
58
+ """Get short code for a session"""
59
+ session = await session_manager.get_session(session_id)
60
+ if not session:
61
+ raise HTTPException(status_code=404, detail="Session not found")
62
+
63
+ short_code = session_manager.get_short_code(session.id)
64
+ return {"session_id": session.id, "short_code": short_code}
65
+
66
+ @router.delete("/sessions/{session_id}")
67
+ async def delete_session(session_id: str, token: str = Depends(require_hf_token)):
68
+ """Delete a session"""
69
+ success = await session_manager.delete_session(session_id)
70
+ if not success:
71
+ raise HTTPException(status_code=404, detail="Session not found")
72
+ return {"message": "Session deleted successfully"}
73
+
74
+ @router.post("/sessions/{session_id}/languages/{language_code}")
75
+ async def add_language_to_session(session_id: str, language_code: str, token: str = Depends(require_hf_token)):
76
+ """Add a language to a session"""
77
+ from app.models import LanguageCode
78
+
79
+ # Convert string to LanguageCode enum
80
+ try:
81
+ lang_code_enum = LanguageCode(language_code)
82
+ except ValueError:
83
+ raise HTTPException(status_code=400, detail=f"Invalid language code: {language_code}")
84
+
85
+ success = await session_manager.add_language_to_session(session_id, lang_code_enum)
86
+ if success:
87
+ session = await session_manager.get_session(session_id)
88
+ return {"message": f"Language {language_code} added to session", "session": session}
89
+ else:
90
+ # Check if session exists
91
+ session = await session_manager.get_session(session_id)
92
+ if not session:
93
+ raise HTTPException(status_code=404, detail="Session not found")
94
+ return {"message": f"Language {language_code} already exists in session", "session": session}
95
+
96
+ @router.post("/translate", response_model=TextTranslationResponse)
97
+ async def translate_text(request: TextTranslationRequest, token: str = Depends(require_hf_token)):
98
+ """Translate text from source language to target language"""
99
+ try:
100
+ # Map language codes to proper names
101
+ lang_map = {
102
+ 'eng': 'English',
103
+ 'swa': 'Swahili',
104
+ 'kik': 'Kikuyu',
105
+ 'kam': 'Kamba',
106
+ 'mer': 'Kimeru',
107
+ 'luo': 'Luo',
108
+ 'som': 'Somali'
109
+ }
110
+
111
+ source_lang_name = lang_map.get(request.source_language.lower(), request.source_language)
112
+ target_lang_name = lang_map.get(request.target_language.lower(), request.target_language)
113
+
114
+ # Perform translation
115
+ translated_text = await translation_service.translate_text(
116
+ text=request.text,
117
+ source_lang=source_lang_name,
118
+ target_lang=target_lang_name
119
+ )
120
+
121
+ return TextTranslationResponse(
122
+ original_text=request.text,
123
+ translated_text=translated_text,
124
+ source_language=request.source_language,
125
+ target_language=request.target_language
126
+ )
127
+
128
+ except Exception as e:
129
+ raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
130
+
131
+ @router.get("/test")
132
+ async def test_endpoint(token: str = Depends(optional_hf_token)):
133
+ """Test endpoint to verify API is working"""
134
+ auth_status = "authenticated" if token else "public"
135
+ return {
136
+ "status": "API is working",
137
+ "sessions_count": len(session_manager.sessions),
138
+ "auth_status": auth_status
139
+ }
140
+
141
+ @router.get("/test/translation")
142
+ async def test_translation(token: str = Depends(require_hf_token)):
143
+ """Test translation service directly"""
144
+ try:
145
+ # Test English to Swahili translation
146
+ result = await translation_service.translate_text("Hello, how are you?", "English", "Swahili")
147
+
148
+ return {
149
+ "status": "Translation test completed",
150
+ "original": "Hello, how are you?",
151
+ "translated": result,
152
+ "source_lang": "English",
153
+ "target_lang": "Swahili"
154
+ }
155
+ except Exception as e:
156
+ return {"status": "Translation test failed", "error": str(e)}
157
+
158
+ @router.get("/test/tts")
159
+ async def test_tts(token: str = Depends(require_hf_token)):
160
+ """Test TTS service directly"""
161
+ try:
162
+ # Test TTS generation
163
+ audio_data = await tts_service.generate_speech("Hello world", "eng")
164
+
165
+ return {
166
+ "status": "TTS test completed",
167
+ "text": "Hello world",
168
+ "language": "eng",
169
+ "audio_generated": audio_data is not None,
170
+ "audio_size": len(audio_data) if audio_data else 0
171
+ }
172
+ except Exception as e:
173
+ return {"status": "TTS test failed", "error": str(e)}
174
+
175
+ @router.get("/sessions/{session_id}/qr-code")
176
+ async def get_session_qr_code(session_id: str, token: str = Depends(require_hf_token)):
177
+ """Generate QR code for session"""
178
+ if session_manager is None:
179
+ raise HTTPException(status_code=500, detail="Session manager not initialized")
180
+
181
+ session = await session_manager.get_session(session_id)
182
+
183
+ if not session:
184
+ raise HTTPException(status_code=404, detail="Session not found")
185
+
186
+ # Generate QR code with session join URL - use your HF space URL
187
+ join_url = f"https://mutisya-realtime-translator-5-27-25-v2.hf.space/?join={session_id}"
188
+
189
+ qr = qrcode.QRCode(version=1, box_size=10, border=5)
190
+ qr.add_data(join_url)
191
+ qr.make(fit=True)
192
+
193
+ img = qr.make_image(fill_color="black", back_color="white")
194
+
195
+ # Convert to bytes
196
+ img_buffer = io.BytesIO()
197
+ img.save(img_buffer, format='PNG')
198
+ img_buffer.seek(0)
199
+
200
+ return Response(content=img_buffer.getvalue(), media_type="image/png")
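
A sketch of the session and QR-code flow defined above. The SessionCreate payload fields (name, organizer_name, languages, enable_tts) are inferred from how SessionManager.create_session consumes them later in this commit; the actual Pydantic model lives in app.models and may differ. Base URL and auth header are the same assumptions as in the mobile sketch.

import requests

BASE_URL = "http://localhost:7860"
HEADERS = {"Authorization": "Bearer hf_xxx"}  # hypothetical

payload = {
    "name": "Clinic visit",
    "organizer_name": "Asha",
    "languages": ["eng", "swa"],  # assumed to coerce into LanguageCode values
    "enable_tts": True,
}
session = requests.post(f"{BASE_URL}/sessions", json=payload, headers=HEADERS).json()

# Download the join QR code for this session as a PNG.
png = requests.get(f"{BASE_URL}/sessions/{session['id']}/qr-code", headers=HEADERS)
with open("join_qr.png", "wb") as f:
    f.write(png.content)
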
app/routers/watch.py ADDED
@@ -0,0 +1,152 @@
1
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
2
+ from fastapi.responses import Response
3
+ from typing import Optional
4
+ import io
5
+ import base64
6
+ from app.services.transcription_service import TranscriptionService
7
+ from app.services.translation_service import TranslationService
8
+ from app.services.tts_service import TTSService
9
+ from app.models import LanguageCode
10
+ from pydantic import BaseModel
11
+ from app.auth import require_hf_token
12
+
13
+ router = APIRouter()
14
+
15
+ class WatchTranslationRequest(BaseModel):
16
+ source_language: str
17
+ target_language: str
18
+ audio_base64: str
19
+
20
+ class WatchTranslationResponse(BaseModel):
21
+ original_text: str
22
+ original_language: str
23
+ translated_text: str
24
+ target_language: str
25
+ translated_audio_base64: str
26
+ success: bool
27
+ error: Optional[str] = None
28
+
29
+ # Initialize services (these will be injected by main app)
30
+ transcription_service = None
31
+ translation_service = None
32
+ tts_service = None
33
+
34
+ @router.post("/watch/translate", response_model=WatchTranslationResponse)
35
+ async def watch_translate_audio(request: WatchTranslationRequest, token: str = Depends(require_hf_token)):
36
+ """
37
+ Process audio for watch app translation
38
+ - Transcribe audio using source language model
39
+ - Translate text to target language
40
+ - Generate TTS audio for target language
41
+ - Return all data to watch app
42
+ """
43
+ try:
44
+ # Validate languages
45
+ source_lang = request.source_language.lower()
46
+ target_lang = request.target_language.lower()
47
+
48
+ if source_lang not in ['eng', 'swa', 'kik', 'kam', 'mer', 'luo', 'som']:
49
+ raise HTTPException(status_code=400, detail=f"Unsupported source language: {source_lang}")
50
+
51
+ if target_lang not in ['eng', 'swa', 'kik', 'kam', 'mer', 'luo', 'som']:
52
+ raise HTTPException(status_code=400, detail=f"Unsupported target language: {target_lang}")
53
+
54
+ # Decode base64 audio
55
+ try:
56
+ audio_data = base64.b64decode(request.audio_base64)
57
+ print(f"Decoded audio data: {len(audio_data)} bytes")
58
+ except Exception as e:
59
+ raise HTTPException(status_code=400, detail=f"Invalid base64 audio data: {str(e)}")
60
+
61
+ # Step 1: Transcribe audio
62
+ print(f"Transcribing audio with {source_lang} model...")
63
+ transcribed_text = await transcription_service.transcribe_audio(audio_data, source_lang)
64
+
65
+ if not transcribed_text or transcribed_text.strip() == "":
66
+ return WatchTranslationResponse(
67
+ original_text="",
68
+ original_language=source_lang,
69
+ translated_text="No speech detected",
70
+ target_language=target_lang,
71
+ translated_audio_base64="",
72
+ success=False,
73
+ error="No speech detected in audio"
74
+ )
75
+
76
+ print(f"Transcribed text: {transcribed_text}")
77
+
78
+ # Step 2: Translate text (skip if source and target are the same)
79
+ if source_lang == target_lang:
80
+ translated_text = transcribed_text
81
+ else:
82
+ print(f"Translating from {source_lang} to {target_lang}...")
83
+
84
+ # Convert language codes to full names for translation service
85
+ lang_name_map = {
86
+ 'eng': 'English',
87
+ 'swa': 'Swahili',
88
+ 'kik': 'Kikuyu',
89
+ 'kam': 'Kamba',
90
+ 'mer': 'Kimeru',
91
+ 'luo': 'Luo',
92
+ 'som': 'Somali'
93
+ }
94
+
95
+ source_lang_name = lang_name_map.get(source_lang, 'English')
96
+ target_lang_name = lang_name_map.get(target_lang, 'Swahili')
97
+
98
+ translated_text = await translation_service.translate_text(
99
+ transcribed_text,
100
+ source_lang_name,
101
+ target_lang_name
102
+ )
103
+
104
+ print(f"Translated text: {translated_text}")
105
+
106
+ # Step 3: Generate TTS audio for translated text (Android-compatible WAV format)
107
+ print(f"Generating TTS audio for {target_lang} in WAV format for Android...")
108
+ tts_audio_data = await tts_service.generate_speech(translated_text, target_lang, output_format="wav")
109
+
110
+ # Encode TTS audio as base64
111
+ tts_audio_base64 = ""
112
+ if tts_audio_data:
113
+ tts_audio_base64 = base64.b64encode(tts_audio_data).decode('utf-8')
114
+ print(f"TTS audio generated: {len(tts_audio_data)} bytes, base64: {len(tts_audio_base64)} chars")
115
+ else:
116
+ print("TTS audio generation failed - no data returned")
117
+
118
+ return WatchTranslationResponse(
119
+ original_text=transcribed_text,
120
+ original_language=source_lang,
121
+ translated_text=translated_text,
122
+ target_language=target_lang,
123
+ translated_audio_base64=tts_audio_base64,
124
+ success=True
125
+ )
126
+
127
+ except HTTPException:
+ raise
+ except Exception as e:
128
+ print(f"Error in watch translation: {str(e)}")
129
+ import traceback
130
+ traceback.print_exc()
131
+
132
+ return WatchTranslationResponse(
133
+ original_text="",
134
+ original_language=request.source_language,
135
+ translated_text="",
136
+ target_language=request.target_language,
137
+ translated_audio_base64="",
138
+ success=False,
139
+ error=str(e)
140
+ )
141
+
142
+ @router.get("/watch/test")
143
+ async def test_watch_endpoint(token: str = Depends(require_hf_token)):
144
+ """Test endpoint for watch app connectivity"""
145
+ return {
146
+ "status": "Watch API is working",
147
+ "services": {
148
+ "transcription": transcription_service is not None,
149
+ "translation": translation_service is not None,
150
+ "tts": tts_service is not None
151
+ }
152
+ }
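
A round-trip sketch for /watch/translate: read a recorded WAV, base64-encode it as the request model expects, and decode the returned TTS audio. Base URL and auth header are assumed as in the earlier sketches.

import base64
import requests

BASE_URL = "http://localhost:7860"
HEADERS = {"Authorization": "Bearer hf_xxx"}  # hypothetical

with open("recording.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    f"{BASE_URL}/watch/translate",
    json={"source_language": "eng",
          "target_language": "swa",
          "audio_base64": audio_b64},
    headers=HEADERS,
).json()

if resp["success"]:
    print("Heard:", resp["original_text"], "->", resp["translated_text"])
    # The TTS audio comes back base64-encoded in Android-compatible WAV format.
    with open("translated.wav", "wb") as f:
        f.write(base64.b64decode(resp["translated_audio_base64"]))
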
app/services/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Services package
app/services/learning_data_service.py ADDED
@@ -0,0 +1,415 @@
1
+ """
2
+ Learning Data Service - File-based data access for language learning prototype
3
+
4
+ This service provides access to lesson data, user progress, and achievements
5
+ using JSON files stored in the data/learning directory at the project root.
6
+ """
7
+
8
+ import json
9
+ import os
10
+ from pathlib import Path
11
+ from typing import Dict, List, Optional, Any
12
+ from datetime import datetime
13
+ import logging
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class LearningDataService:
19
+ """Service for managing language learning data using JSON files"""
20
+
21
+ def __init__(self):
22
+ # Get the data directory relative to this file
23
+ self.data_dir = Path(__file__).parent.parent.parent / "data" / "learning"
24
+ self.lessons_dir = self.data_dir / "lessons"
25
+ self.users_dir = self.data_dir / "users"
26
+
27
+ # Ensure directories exist
28
+ self.users_dir.mkdir(parents=True, exist_ok=True)
29
+
30
+ logger.info(f"Learning data directory: {self.data_dir}")
31
+ logger.info(f"Lessons directory: {self.lessons_dir}")
32
+ logger.info(f"Users directory: {self.users_dir}")
33
+
34
+ # ==================== Lesson Data ====================
35
+
36
+ def get_lessons_index(self, language: str = 'swahili') -> Optional[Dict]:
37
+ """Load the lessons index/catalog for a specific language"""
38
+ try:
39
+ # Map language codes to folder names
40
+ language_map = {
41
+ 'swahili': 'swahili',
42
+ 'swa': 'swahili',
43
+ 'kamba': 'kamba',
44
+ 'kam': 'kamba',
45
+ 'maasai': 'maasai',
46
+ 'mas': 'maasai'
47
+ }
48
+
49
+ language_folder = language_map.get(language.lower(), 'swahili')
50
+ index_path = self.lessons_dir / language_folder / "index.json"
51
+
52
+ logger.info(f"Loading lessons index for language '{language}' -> folder '{language_folder}' at {index_path}")
53
+
54
+ if not index_path.exists():
55
+ logger.warning(f"Lessons index not found at {index_path}")
56
+ logger.info(f"Lessons dir contents: {list(self.lessons_dir.iterdir())}")
57
+ return None
58
+
59
+ with open(index_path, 'r', encoding='utf-8') as f:
60
+ data = json.load(f)
61
+ logger.info(f"Successfully loaded {len(data.get('lessons', []))} lessons for {language}")
62
+ return data
63
+ except Exception as e:
64
+ logger.error(f"Error loading lessons index for {language}: {e}")
65
+ return None
66
+
67
+ def get_lesson(self, lesson_id: int, language: str = 'swahili') -> Optional[Dict]:
68
+ """Load a specific lesson by ID for a specific language"""
69
+ try:
70
+ # Map language codes to folder names
71
+ language_map = {
72
+ 'swahili': 'swahili',
73
+ 'swa': 'swahili',
74
+ 'kamba': 'kamba',
75
+ 'kam': 'kamba',
76
+ 'maasai': 'maasai',
77
+ 'mas': 'maasai'
78
+ }
79
+
80
+ language_folder = language_map.get(language.lower(), 'swahili')
81
+
82
+ # First get the index to find the lesson file
83
+ index = self.get_lessons_index(language)
84
+ if not index:
85
+ return None
86
+
87
+ # Find the lesson in the index
88
+ lesson_meta = None
89
+ for lesson in index.get('lessons', []):
90
+ if lesson['lesson_id'] == lesson_id:
91
+ lesson_meta = lesson
92
+ break
93
+
94
+ if not lesson_meta:
95
+ logger.warning(f"Lesson {lesson_id} not found in index for {language}")
96
+ return None
97
+
98
+ # Load the lesson file
99
+ lesson_path = self.lessons_dir / language_folder / lesson_meta['file']
100
+ if not lesson_path.exists():
101
+ logger.warning(f"Lesson file not found: {lesson_path}")
102
+ return None
103
+
104
+ with open(lesson_path, 'r', encoding='utf-8') as f:
105
+ return json.load(f)
106
+ except Exception as e:
107
+ logger.error(f"Error loading lesson {lesson_id} for {language}: {e}")
108
+ return None
109
+
110
+ def get_available_lessons(self) -> List[Dict]:
111
+ """Get list of available lessons (not planned)"""
112
+ try:
113
+ index = self.get_lessons_index()
114
+ if not index:
115
+ return []
116
+
117
+ available = [
118
+ lesson for lesson in index.get('lessons', [])
119
+ if lesson.get('status') == 'available'
120
+ ]
121
+ return available
122
+ except Exception as e:
123
+ logger.error(f"Error getting available lessons: {e}")
124
+ return []
125
+
126
+ # ==================== Achievements ====================
127
+
128
+ def get_achievements(self) -> Optional[Dict]:
129
+ """Load achievements configuration"""
130
+ try:
131
+ achievements_path = self.data_dir / "achievements.json"
132
+ if not achievements_path.exists():
133
+ logger.warning(f"Achievements file not found at {achievements_path}")
134
+ return None
135
+
136
+ with open(achievements_path, 'r', encoding='utf-8') as f:
137
+ return json.load(f)
138
+ except Exception as e:
139
+ logger.error(f"Error loading achievements: {e}")
140
+ return None
141
+
142
+ # ==================== User Progress ====================
143
+
144
+ def get_user_progress(self, user_id: str) -> Optional[Dict]:
145
+ """Load user progress data"""
146
+ try:
147
+ user_file = self.users_dir / f"user-{user_id}.json"
148
+ if not user_file.exists():
149
+ # Return default progress structure for new users
150
+ return self._create_default_user_progress(user_id)
151
+
152
+ with open(user_file, 'r', encoding='utf-8') as f:
153
+ return json.load(f)
154
+ except Exception as e:
155
+ logger.error(f"Error loading user progress for {user_id}: {e}")
156
+ return None
157
+
158
+ def save_user_progress(self, user_id: str, progress_data: Dict) -> bool:
159
+ """Save user progress data"""
160
+ try:
161
+ user_file = self.users_dir / f"user-{user_id}.json"
162
+
163
+ # Update last_active timestamp
164
+ if 'profile' in progress_data:
165
+ progress_data['profile']['last_active'] = datetime.utcnow().isoformat() + 'Z'
166
+
167
+ with open(user_file, 'w', encoding='utf-8') as f:
168
+ json.dump(progress_data, f, indent=2, ensure_ascii=False)
169
+
170
+ logger.info(f"Saved progress for user {user_id}")
171
+ return True
172
+ except Exception as e:
173
+ logger.error(f"Error saving user progress for {user_id}: {e}")
174
+ return False
175
+
176
+ def update_lesson_progress(
177
+ self,
178
+ user_id: str,
179
+ lesson_id: int,
180
+ progress_update: Dict
181
+ ) -> bool:
182
+ """Update progress for a specific lesson"""
183
+ try:
184
+ user_progress = self.get_user_progress(user_id)
185
+ if not user_progress:
186
+ return False
187
+
188
+ # Initialize lesson_progress if it doesn't exist
189
+ if 'lesson_progress' not in user_progress:
190
+ user_progress['lesson_progress'] = {}
191
+
192
+ lesson_key = str(lesson_id)
193
+
194
+ # Update or create lesson progress
195
+ if lesson_key in user_progress['lesson_progress']:
196
+ user_progress['lesson_progress'][lesson_key].update(progress_update)
197
+ else:
198
+ user_progress['lesson_progress'][lesson_key] = progress_update
199
+
200
+ return self.save_user_progress(user_id, user_progress)
201
+ except Exception as e:
202
+ logger.error(f"Error updating lesson progress: {e}")
203
+ return False
204
+
205
+ def update_vocabulary_progress(
206
+ self,
207
+ user_id: str,
208
+ vocab_id: int,
209
+ vocab_update: Dict
210
+ ) -> bool:
211
+ """Update progress for a specific vocabulary word"""
212
+ try:
213
+ user_progress = self.get_user_progress(user_id)
214
+ if not user_progress:
215
+ return False
216
+
217
+ # Initialize vocabulary_progress if it doesn't exist
218
+ if 'vocabulary_progress' not in user_progress:
219
+ user_progress['vocabulary_progress'] = {}
220
+
221
+ vocab_key = str(vocab_id)
222
+
223
+ # Update or create vocabulary progress
224
+ if vocab_key in user_progress['vocabulary_progress']:
225
+ user_progress['vocabulary_progress'][vocab_key].update(vocab_update)
226
+ else:
227
+ user_progress['vocabulary_progress'][vocab_key] = vocab_update
228
+
229
+ return self.save_user_progress(user_id, user_progress)
230
+ except Exception as e:
231
+ logger.error(f"Error updating vocabulary progress: {e}")
232
+ return False
233
+
234
+ def unlock_achievement(
235
+ self,
236
+ user_id: str,
237
+ achievement_id: str,
238
+ progress: int,
239
+ target: int
240
+ ) -> bool:
241
+ """Unlock or update progress on an achievement"""
242
+ try:
243
+ user_progress = self.get_user_progress(user_id)
244
+ if not user_progress:
245
+ return False
246
+
247
+ # Initialize achievements if it doesn't exist
248
+ if 'achievements' not in user_progress:
249
+ user_progress['achievements'] = {}
250
+
251
+ # Update achievement
252
+ achievement_data = {
253
+ 'achievement_id': achievement_id,
254
+ 'unlocked': progress >= target,
255
+ 'progress': progress,
256
+ 'target': target
257
+ }
258
+
259
+ # Add unlock timestamp if newly unlocked
260
+ if achievement_data['unlocked'] and achievement_id not in user_progress['achievements']:
261
+ achievement_data['unlocked_at'] = datetime.utcnow().isoformat() + 'Z'
262
+ elif achievement_data['unlocked'] and achievement_id in user_progress['achievements']:
263
+ # Preserve original unlock time
264
+ if 'unlocked_at' in user_progress['achievements'][achievement_id]:
265
+ achievement_data['unlocked_at'] = user_progress['achievements'][achievement_id]['unlocked_at']
266
+ else:
267
+ achievement_data['unlocked_at'] = datetime.utcnow().isoformat() + 'Z'
268
+
269
+ user_progress['achievements'][achievement_id] = achievement_data
270
+
271
+ return self.save_user_progress(user_id, user_progress)
272
+ except Exception as e:
273
+ logger.error(f"Error unlocking achievement: {e}")
274
+ return False
275
+
276
+ # ==================== Helper Methods ====================
277
+
278
+ def _create_default_user_progress(self, user_id: str) -> Dict:
279
+ """Create default progress structure for a new user"""
280
+ return {
281
+ 'user_id': user_id,
282
+ 'profile': {
283
+ 'user_id': user_id,
284
+ 'learning_language': 'swa',
285
+ 'native_language': 'eng',
286
+ 'created_at': datetime.utcnow().isoformat() + 'Z',
287
+ 'last_active': datetime.utcnow().isoformat() + 'Z'
288
+ },
289
+ 'overall_stats': {
290
+ 'level': 'beginner',
291
+ 'total_xp': 0,
292
+ 'next_level_xp': 1000,
293
+ 'current_streak': 0,
294
+ 'longest_streak': 0,
295
+ 'lessons_completed': 0,
296
+ 'vocabulary_learned': 0,
297
+ 'vocabulary_mastered': 0,
298
+ 'total_practice_time_seconds': 0,
299
+ 'pronunciation_avg_score': 0.0,
300
+ 'listening_avg_score': 0.0,
301
+ 'reading_avg_score': 0.0
302
+ },
303
+ 'daily_stats': {},
304
+ 'lesson_progress': {},
305
+ 'vocabulary_progress': {},
306
+ 'achievements': {},
307
+ 'session_history': []
308
+ }
309
+
310
+ def create_default_progress(self, user_id: str) -> Dict:
311
+ """Public method to create default progress structure"""
312
+ progress = self._create_default_user_progress(user_id)
313
+ # Add Phase 1-3 specific fields
314
+ progress['overall_stats']['vocabulary_reviewed'] = 0
315
+ progress['comprehension_scores'] = {}
316
+ progress['scenario_progress'] = {}
317
+ return progress
318
+
319
+ # ==================== Phase 1-3 Methods ====================
320
+
321
+ def get_vocabulary(self, vocab_id: int) -> Optional[Dict]:
322
+ """Get a single vocabulary word by ID from any lesson"""
323
+ try:
324
+ lessons_index = self.get_lessons_index()
325
+ if not lessons_index:
326
+ return None
327
+
328
+ # Search through all lessons
329
+ for lesson_meta in lessons_index.get('lessons', []):
330
+ lesson = self.get_lesson(lesson_meta['lesson_id'])
331
+ if lesson and 'vocabulary' in lesson:
332
+ for vocab in lesson['vocabulary']:
333
+ # Support both 'id' and 'vocabulary_id' fields
334
+ vocab_item_id = vocab.get('vocabulary_id') or vocab.get('id')
335
+ if vocab_item_id == vocab_id:
336
+ # Add lesson context
337
+ vocab['lesson_id'] = lesson['lesson_id']
338
+ vocab['lesson_title'] = lesson.get('title', '')
339
+ return vocab
340
+
341
+ logger.warning(f"Vocabulary {vocab_id} not found in any lesson")
342
+ return None
343
+ except Exception as e:
344
+ logger.error(f"Error getting vocabulary {vocab_id}: {e}")
345
+ return None
346
+
347
+ def get_all_vocabulary(self) -> List[Dict]:
348
+ """Get all vocabulary words from all lessons"""
349
+ try:
350
+ all_vocab = []
351
+ lessons_index = self.get_lessons_index()
352
+ if not lessons_index:
353
+ return all_vocab
354
+
355
+ for lesson_meta in lessons_index.get('lessons', []):
356
+ lesson = self.get_lesson(lesson_meta['lesson_id'])
357
+ if lesson and 'vocabulary' in lesson:
358
+ for vocab in lesson['vocabulary']:
359
+ # Add lesson context
360
+ vocab_copy = vocab.copy()
361
+ vocab_copy['lesson_id'] = lesson['lesson_id']
362
+ vocab_copy['lesson_title'] = lesson.get('title', '')
363
+ vocab_copy['lesson_level'] = lesson.get('difficulty_level', 1)
364
+ all_vocab.append(vocab_copy)
365
+
366
+ return all_vocab
367
+ except Exception as e:
368
+ logger.error(f"Error getting all vocabulary: {e}")
369
+ return []
370
+
371
+ def get_scenario(self, scenario_id: str) -> Optional[Dict]:
372
+ """Load a task scenario by ID"""
373
+ try:
374
+ scenarios_dir = self.data_dir / "scenarios"
375
+ scenario_path = scenarios_dir / f"{scenario_id}.json"
376
+
377
+ if not scenario_path.exists():
378
+ logger.warning(f"Scenario file not found: {scenario_path}")
379
+ return None
380
+
381
+ with open(scenario_path, 'r', encoding='utf-8') as f:
382
+ return json.load(f)
383
+ except Exception as e:
384
+ logger.error(f"Error loading scenario {scenario_id}: {e}")
385
+ return None
386
+
387
+ def get_all_scenarios(self) -> List[Dict]:
388
+ """Get list of all available scenarios"""
389
+ try:
390
+ scenarios_dir = self.data_dir / "scenarios"
391
+ if not scenarios_dir.exists():
392
+ return []
393
+
394
+ scenarios = []
395
+ for scenario_file in scenarios_dir.glob("*.json"):
396
+ try:
397
+ with open(scenario_file, 'r', encoding='utf-8') as f:
398
+ scenario_data = json.load(f)
399
+ # Add just metadata, not full dialogue tree
400
+ scenarios.append({
401
+ 'scenario_id': scenario_data.get('scenario_id'),
402
+ 'title': scenario_data.get('title'),
403
+ 'title_en': scenario_data.get('title_en'),
404
+ 'level': scenario_data.get('level'),
405
+ 'estimated_duration_minutes': scenario_data.get('estimated_duration_minutes'),
406
+ 'learning_goals': scenario_data.get('learning_goals', [])
407
+ })
408
+ except Exception as e:
409
+ logger.error(f"Error loading scenario {scenario_file}: {e}")
410
+ continue
411
+
412
+ return scenarios
413
+ except Exception as e:
414
+ logger.error(f"Error getting all scenarios: {e}")
415
+ return []
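
A local usage sketch for LearningDataService. It assumes the lesson JSON files under data/learning exist as described in the module docstring; user progress files are created lazily on first save.

from app.services.learning_data_service import LearningDataService

svc = LearningDataService()

# 'swa' resolves to the swahili lesson folder via the internal language map.
index = svc.get_lessons_index("swa")
lesson = svc.get_lesson(1, "swahili")  # loads the file listed in index.json

# Record a completed lesson; a default progress file is created for new users.
svc.update_lesson_progress("demo-user", 1, {"status": "completed", "score": 0.9})
print(svc.get_user_progress("demo-user")["lesson_progress"]["1"])
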
app/services/quantization_utils.py ADDED
@@ -0,0 +1,124 @@
1
+ """
2
+ Dynamic INT8 Quantization utilities for ASR models.
3
+
4
+ This module provides utilities to apply PyTorch dynamic quantization to
5
+ Hugging Face transformer models, specifically optimized for ASR models like
6
+ Whisper and Wav2Vec2-BERT.
7
+ """
8
+
9
+ import torch
10
+ from torch.quantization import quantize_dynamic
11
+ from transformers import PreTrainedModel
12
+ import time
13
+
14
+
15
+ def apply_dynamic_int8_quantization(model: PreTrainedModel, model_type: str = "auto") -> PreTrainedModel:
16
+ """
17
+ Apply dynamic INT8 quantization to a Hugging Face model.
18
+
19
+ Dynamic quantization converts model weights to INT8 ahead of time and quantizes
20
+ activations to INT8 on the fly during inference, reducing model size and improving
21
+ inference speed with minimal accuracy loss.
22
+
23
+ Args:
24
+ model: The Hugging Face model to quantize
25
+ model_type: Type of model ("whisper", "wav2vec2-bert", or "auto")
26
+
27
+ Returns:
28
+ Quantized model
29
+
30
+ References:
31
+ - PyTorch Quantization: https://pytorch.org/docs/stable/quantization.html
32
+ - Dynamic Quantization for NLP: https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html
33
+ """
34
+ print(f"\n{'='*60}")
35
+ print(f"Applying Dynamic INT8 Quantization to {model_type} model")
36
+ print(f"{'='*60}")
37
+
38
+ # Get model size before quantization
39
+ param_size = 0
40
+ for param in model.parameters():
41
+ param_size += param.nelement() * param.element_size()
42
+ buffer_size = 0
43
+ for buffer in model.buffers():
44
+ buffer_size += buffer.nelement() * buffer.element_size()
45
+ size_before_mb = (param_size + buffer_size) / 1024**2
46
+
47
+ print(f"Model size before quantization: {size_before_mb:.2f} MB")
48
+
49
+ # Start quantization timer
50
+ start_time = time.time()
51
+
52
+ try:
53
+ # Dynamic quantization targets:
54
+ # - torch.nn.Linear: Most common layer type in transformers
55
+ # - torch.nn.LSTM/GRU/RNN: For sequential models (if present)
56
+ #
57
+ # Note: We use qint8 (quantized int8) which converts weights to INT8
58
+ # and performs INT8 arithmetic for linear layers during inference
59
+ quantized_model = quantize_dynamic(
60
+ model,
61
+ {torch.nn.Linear}, # Quantize all Linear layers
62
+ dtype=torch.qint8 # Use 8-bit integer quantization
63
+ )
64
+
65
+ # Estimate size after quantization (note: packed INT8 weights are not visible via model.parameters(), so this estimate is approximate)
66
+ param_size_q = 0
67
+ for param in quantized_model.parameters():
68
+ param_size_q += param.nelement() * param.element_size()
69
+ buffer_size_q = 0
70
+ for buffer in quantized_model.buffers():
71
+ buffer_size_q += buffer.nelement() * buffer.element_size()
72
+ size_after_mb = (param_size_q + buffer_size_q) / 1024**2
73
+
74
+ quantization_time = time.time() - start_time
75
+ size_reduction = ((size_before_mb - size_after_mb) / size_before_mb) * 100
76
+
77
+ print(f"✓ Quantization successful!")
78
+ print(f" - Model size after quantization: {size_after_mb:.2f} MB")
79
+ print(f" - Size reduction: {size_reduction:.1f}%")
80
+ print(f" - Quantization time: {quantization_time:.2f}s")
81
+ print(f"{'='*60}\n")
82
+
83
+ return quantized_model
84
+
85
+ except Exception as e:
86
+ print(f"✗ Quantization failed: {e}")
87
+ print(f" Returning original unquantized model")
88
+ print(f"{'='*60}\n")
89
+ return model
90
+
91
+
92
+ def get_quantization_stats(model: PreTrainedModel) -> dict:
93
+ """
94
+ Get statistics about a model's quantization status.
95
+
96
+ Args:
97
+ model: The model to analyze
98
+
99
+ Returns:
100
+ Dictionary with quantization statistics
101
+ """
102
+ stats = {
103
+ "is_quantized": False,
104
+ "quantized_layers": 0,
105
+ "total_layers": 0,
106
+ "size_mb": 0.0
107
+ }
108
+
109
+ # Count quantized vs regular layers
110
+ for name, module in model.named_modules():
111
+ if isinstance(module, (torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU)) or hasattr(module, '_packed_params'):
112
+ stats["total_layers"] += 1
113
+
114
+ # Quantized dynamic layers are no longer torch.nn.Linear but expose _packed_params
115
+ if hasattr(module, '_packed_params'):
116
+ stats["quantized_layers"] += 1
117
+ stats["is_quantized"] = True
118
+
119
+ # Calculate model size
120
+ param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
121
+ buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
122
+ stats["size_mb"] = (param_size + buffer_size) / 1024**2
123
+
124
+ return stats
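
A usage sketch for the utilities above, quantizing the English Whisper checkpoint that the ASR config later in this commit points at. The model choice is illustrative; any model containing torch.nn.Linear layers is handled the same way, and dynamic INT8 quantization targets CPU inference.

from transformers import WhisperForConditionalGeneration

from app.services.quantization_utils import (
    apply_dynamic_int8_quantization,
    get_quantization_stats,
)

# Load on CPU, then swap Linear layers for dynamically quantized equivalents.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base.en")
model = apply_dynamic_int8_quantization(model, model_type="whisper")

print(get_quantization_stats(model))
# Expected shape: {'is_quantized': ..., 'quantized_layers': ..., 'total_layers': ..., 'size_mb': ...}
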
app/services/session_manager.py ADDED
@@ -0,0 +1,180 @@
1
+ import uuid
2
+ import random
3
+ import string
4
+ from typing import Dict, List, Optional
5
+ from app.models import Session, SessionCreate, Participant, Language, LanguageCode
6
+
7
+ def generate_short_code(length: int = 8) -> str:
8
+ """Generate a random short code using uppercase letters and digits"""
9
+ # Use uppercase letters and digits only; look-alike characters (O/0, I/1) are removed below
10
+ alphabet = string.ascii_uppercase + string.digits
11
+ # Remove confusing characters
12
+ alphabet = alphabet.replace('O', '').replace('0', '').replace('I', '').replace('1', '')
13
+ return ''.join(random.choice(alphabet) for _ in range(length))
14
+
15
+ # Language mappings
16
+ LANGUAGE_MAP = {
17
+ LanguageCode.ENGLISH: Language(code=LanguageCode.ENGLISH, name="English", display_name="English (eng)"),
18
+ LanguageCode.SWAHILI: Language(code=LanguageCode.SWAHILI, name="Swahili", display_name="Swahili (swa)"),
19
+ LanguageCode.KIKUYU: Language(code=LanguageCode.KIKUYU, name="Kikuyu", display_name="Kikuyu (kik)"),
20
+ LanguageCode.KAMBA: Language(code=LanguageCode.KAMBA, name="Kamba", display_name="Kamba (kam)"),
21
+ LanguageCode.KIMERU: Language(code=LanguageCode.KIMERU, name="Kimeru", display_name="Kimeru (mer)"),
22
+ LanguageCode.LUO: Language(code=LanguageCode.LUO, name="Luo", display_name="Luo (luo)"),
23
+ LanguageCode.SOMALI: Language(code=LanguageCode.SOMALI, name="Somali", display_name="Somali (som)"),
24
+ }
25
+
26
+ class SessionManager:
27
+ def __init__(self):
28
+ self.sessions: Dict[str, Session] = {}
29
+ self.participant_sessions: Dict[str, str] = {} # participant_id -> session_id
30
+ self.short_code_to_id: Dict[str, str] = {} # short_code -> session_id
31
+ self.id_to_short_code: Dict[str, str] = {} # session_id -> short_code
32
+
33
+ async def create_session(self, session_data: SessionCreate) -> Session:
34
+ session_id = str(uuid.uuid4())
35
+
36
+ # Generate unique short code
37
+ short_code = generate_short_code(8)
38
+ while short_code in self.short_code_to_id:
39
+ # Extremely unlikely collision, but regenerate if needed
40
+ short_code = generate_short_code(8)
41
+
42
+ # Convert language codes to Language objects
43
+ languages = [LANGUAGE_MAP[lang_code] for lang_code in session_data.languages]
44
+
45
+ session = Session(
46
+ id=session_id,
47
+ name=session_data.name,
48
+ organizer_name=session_data.organizer_name,
49
+ languages=languages,
50
+ participants=[],
51
+ is_active=True,
52
+ enable_tts=session_data.enable_tts
53
+ )
54
+
55
+ self.sessions[session_id] = session
56
+ self.short_code_to_id[short_code] = session_id
57
+ self.id_to_short_code[session_id] = short_code
58
+ return session
59
+
60
+ async def get_session(self, session_id_or_code: str) -> Optional[Session]:
61
+ """Get session by full UUID or short code"""
62
+ # Try as full UUID first
63
+ session = self.sessions.get(session_id_or_code)
64
+ if session:
65
+ return session
66
+
67
+ # Try as short code
68
+ session_id = self.short_code_to_id.get(session_id_or_code.upper())
69
+ if session_id:
70
+ return self.sessions.get(session_id)
71
+
72
+ return None
73
+
74
+ def get_short_code(self, session_id: str) -> str:
75
+ """Get short code for a session ID"""
76
+ return self.id_to_short_code.get(session_id, session_id)
77
+
78
+ async def get_all_sessions(self) -> List[Session]:
79
+ return list(self.sessions.values())
80
+
81
+ async def add_participant(self, session_id: str, participant_name: str, language_code: LanguageCode) -> Optional[Participant]:
82
+ session = await self.get_session(session_id)
83
+ if not session:
84
+ return None
85
+
86
+ participant_id = str(uuid.uuid4())
87
+ language = LANGUAGE_MAP[language_code]
88
+
89
+ # Check if the participant's language is already in the session languages
90
+ language_exists = any(lang.code == language_code for lang in session.languages)
91
+ if not language_exists:
92
+ print(f"Adding new language {language.name} ({language_code.value}) to session {session_id}")
93
+ session.languages.append(language)
94
+
95
+ participant = Participant(
96
+ id=participant_id,
97
+ name=participant_name,
98
+ language=language,
99
+ is_organizer=len(session.participants) == 0, # First participant is organizer
100
+ is_speaking=False,
101
+ is_connected=True
102
+ )
103
+
104
+ session.participants.append(participant)
105
+ self.participant_sessions[participant_id] = session_id
106
+
107
+ print(f"Participant {participant_name} added to session. Session now has {len(session.languages)} languages: {[lang.name for lang in session.languages]}")
108
+
109
+ return participant
110
+
111
+ async def remove_participant(self, participant_id: str) -> bool:
112
+ session_id = self.participant_sessions.get(participant_id)
113
+ if not session_id:
114
+ return False
115
+
116
+ session = await self.get_session(session_id)
117
+ if not session:
118
+ return False
119
+
120
+ # Remove participant from session
121
+ session.participants = [p for p in session.participants if p.id != participant_id]
122
+ del self.participant_sessions[participant_id]
123
+
124
+ return True
125
+
126
+ async def update_participant_speaking_status(self, participant_id: str, is_speaking: bool) -> bool:
127
+ session_id = self.participant_sessions.get(participant_id)
128
+ if not session_id:
129
+ return False
130
+
131
+ session = await self.get_session(session_id)
132
+ if not session:
133
+ return False
134
+
135
+ for participant in session.participants:
136
+ if participant.id == participant_id:
137
+ participant.is_speaking = is_speaking
138
+ return True
139
+
140
+ return False
141
+
142
+ async def get_participant_session_id(self, participant_id: str) -> Optional[str]:
143
+ return self.participant_sessions.get(participant_id)
144
+
145
+ async def add_language_to_session(self, session_id: str, language_code: LanguageCode) -> bool:
146
+ """Add a language to the session if it doesn't already exist"""
147
+ session = await self.get_session(session_id)
148
+ if not session:
149
+ return False
150
+
151
+ language = LANGUAGE_MAP[language_code]
152
+
153
+ # Check if the language is already in the session languages
154
+ language_exists = any(lang.code == language_code for lang in session.languages)
155
+ if not language_exists:
156
+ print(f"Adding new language {language.name} ({language_code.value}) to session {session_id}")
157
+ session.languages.append(language)
158
+ print(f"Session {session_id} now has {len(session.languages)} languages: {[lang.name for lang in session.languages]}")
159
+ return True
160
+ else:
161
+ print(f"Language {language.name} ({language_code.value}) already exists in session {session_id}")
162
+ return False
163
+
164
+ async def delete_session(self, session_id: str) -> bool:
165
+ if session_id in self.sessions:
166
+ # Remove all participants from tracking
167
+ session = self.sessions[session_id]
168
+ for participant in session.participants:
169
+ if participant.id in self.participant_sessions:
170
+ del self.participant_sessions[participant.id]
171
+
172
+ # Remove short code mapping
173
+ short_code = self.id_to_short_code.get(session_id)
174
+ if short_code:
175
+ del self.short_code_to_id[short_code]
176
+ del self.id_to_short_code[session_id]
177
+
178
+ del self.sessions[session_id]
179
+ return True
180
+ return False
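The session manager above keeps two lookup tables in sync (participant_id -> session_id, plus the short-code maps cleaned up in delete_session). A minimal usage sketch of that lifecycle follows; `manager`, the session id, and the LanguageCode members used here are illustrative assumptions, not part of this diff:

```python
# Hypothetical walk-through of the SessionManager participant lifecycle.
# Assumes an already-created session and LanguageCode members ENG/SWA (illustrative).
async def demo(manager: "SessionManager", session_id: str) -> None:
    # The first participant to join a session is flagged as the organizer.
    alice = await manager.add_participant(session_id, "Alice", LanguageCode.ENG)
    bob = await manager.add_participant(session_id, "Bob", LanguageCode.SWA)
    assert alice.is_organizer and not bob.is_organizer

    # A language is appended to session.languages at most once;
    # re-adding an existing one returns False.
    await manager.add_language_to_session(session_id, LanguageCode.SWA)

    # Speaking status and removal are keyed by participant id.
    await manager.update_participant_speaking_status(alice.id, True)
    await manager.remove_participant(bob.id)

    # delete_session also clears participant tracking and short-code mappings.
    await manager.delete_session(session_id)
```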
app/services/transcription_service.py ADDED
@@ -0,0 +1,736 @@
1
+ import asyncio
2
+ import io
3
+ import wave
4
+ import numpy as np
5
+ import time
6
+ from typing import Any, Callable, Dict, Optional
7
+ from transformers import pipeline
8
+ import torch
9
+ from app.models import LanguageCode
10
+ import os
11
+ from app.services.quantization_utils import apply_dynamic_int8_quantization, get_quantization_stats
12
+
13
+ # Silero VAD imports
14
+ try:
15
+ import silero_vad
16
+ SILERO_VAD_AVAILABLE = True
17
+ except ImportError:
18
+ SILERO_VAD_AVAILABLE = False
19
+ print("Warning: silero-vad not installed. Falling back to RMS-based VAD.")
20
+
21
+ class TranscriptionService:
22
+ def __init__(self):
23
+ self.asr_pipelines: Dict[str, Any] = {}
24
+ self.device = 0 if torch.cuda.is_available() else -1
25
+
26
+ # Model configurations - using original mutisya models with updated config
27
+ self.asr_config = {
28
+ "eng": {"model_repo": "openai/whisper-base.en", "model_type": "whisper"},
29
+ "swa": {"model_repo": "mutisya/w2v-bert-2.0-asr-swh-superv-v25-37-1", "model_type": "wav2vec2-bert", "trust_remote_code": True},
30
+ "kik": {"model_repo": "mutisya/w2v-bert-2.0-asr-kik-superv-v25-37-1", "model_type": "wav2vec2-bert", "trust_remote_code": True},
31
+ "kam": {"model_repo": "mutisya/w2v-bert-2.0-asr-kam-superv-v25-36-1", "model_type": "wav2vec2-bert", "trust_remote_code": True},
32
+ "mer": {"model_repo": "mutisya/w2v-bert-2.0-asr-mer-superv-v25-36-1", "model_type": "wav2vec2-bert", "trust_remote_code": True},
33
+ "luo": {"model_repo": "mutisya/w2v-bert-2.0-asr-luo-superv-v25-36-1", "model_type": "wav2vec2-bert", "trust_remote_code": True},
34
+ "som": {"model_repo": "mutisya/w2v-bert-2.0-asr-som-superv-v25-36-1", "model_type": "wav2vec2-bert", "trust_remote_code": True}
35
+ }
36
+
37
+ self.preload_languages = ["eng"]
38
+ self.background_loading_task = None
39
+ self.models_loading_status = {}
40
+
41
+ # Enhanced audio buffering for VAD-based sentence detection
42
+ self.candidate_audio_buffers: Dict[str, np.ndarray] = {} # participant_id -> accumulated audio samples (stored as float32 arrays, not bytes)
43
+ self.candidate_text_cache: Dict[str, str] = {} # participant_id -> current candidate text
44
+ self.silence_counters: Dict[str, int] = {} # participant_id -> consecutive silence chunks
45
+ self.sentence_finalized: Dict[str, bool] = {} # participant_id -> whether current sentence is already finalized
46
+
47
+ # VAD parameters - made more lenient for better detection
48
+ self.silence_threshold = 1 # Number of consecutive silent chunks before sentence break (1 second for natural pauses)
49
+ self.min_sentence_length = 0.03 # Minimum sentence length in seconds (very short)
50
+
51
+ # Silero VAD initialization
52
+ self.vad_model = None
53
+ self.vad_sample_rate = 16000
54
+ self.vad_available = SILERO_VAD_AVAILABLE
55
+
56
+ # Quantization configuration
57
+ # Set ENABLE_INT8_QUANTIZATION=true in environment to enable quantization
58
+ self.enable_quantization = os.getenv('ENABLE_INT8_QUANTIZATION', 'true').lower() == 'true'
59
+ print(f"INT8 Quantization: {'ENABLED' if self.enable_quantization else 'DISABLED'}")
60
+
61
+ async def initialize(self):
62
+ """Initialize ASR models for preloaded languages and Silero VAD"""
63
+ # Initialize Silero VAD model
64
+ if self.vad_available:
65
+ try:
66
+ print("Loading Silero VAD model...")
67
+ self.vad_model = silero_vad.load_silero_vad(onnx=False)
68
+ print("✓ Silero VAD model loaded successfully")
69
+ except Exception as e:
70
+ print(f"Failed to load Silero VAD model: {e}")
71
+ print("Falling back to RMS-based VAD")
72
+ self.vad_available = False
73
+
74
+ # Initialize ASR models
75
+ for lang_code in self.preload_languages:
76
+ if lang_code in self.asr_config:
77
+ try:
78
+ model_config = self.asr_config[lang_code]
79
+ pipeline_obj = self._load_and_quantize_pipeline(lang_code, model_config)
80
+ self.asr_pipelines[lang_code] = pipeline_obj
81
+ except Exception as e:
82
+ print(f"Failed to load ASR model for {lang_code}: {e}")
83
+
84
+ def _load_and_quantize_pipeline(self, lang_code: str, model_config: dict):
85
+ """Load ASR pipeline and optionally apply INT8 quantization"""
86
+ # Build pipeline parameters
87
+ pipeline_params = {
88
+ "task": "automatic-speech-recognition",
89
+ "model": model_config["model_repo"],
90
+ "device": self.device,
91
+ "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32
92
+ }
93
+
94
+ # Add trust_remote_code if specified
95
+ if model_config.get("trust_remote_code", False):
96
+ pipeline_params["trust_remote_code"] = True
97
+
98
+ print(f"Loading ASR model for {lang_code}: {model_config['model_repo']}")
99
+ pipeline_obj = pipeline(**pipeline_params)
100
+
101
+ # Apply quantization if enabled
102
+ if self.enable_quantization:
103
+ try:
104
+ # Get the underlying model from the pipeline
105
+ model = pipeline_obj.model
106
+ model_type = model_config.get("model_type", "auto")
107
+
108
+ # Apply dynamic INT8 quantization
109
+ quantized_model = apply_dynamic_int8_quantization(model, model_type)
110
+
111
+ # Replace the model in the pipeline
112
+ pipeline_obj.model = quantized_model
113
+
114
+ # Print quantization stats
115
+ stats = get_quantization_stats(quantized_model)
116
+ print(f"✓ {lang_code} model quantized: {stats['quantized_layers']}/{stats['total_layers']} layers, {stats['size_mb']:.2f} MB")
117
+
118
+ except Exception as e:
119
+ print(f"Warning: Could not quantize {lang_code} model: {e}")
120
+ print("Continuing with unquantized model")
121
+
122
+ return pipeline_obj
123
+
124
+ async def ensure_model_loaded(self, language_code: str):
125
+ """Load ASR model for language if not already loaded"""
126
+ if language_code not in self.asr_pipelines and language_code in self.asr_config:
127
+ try:
128
+ model_config = self.asr_config[language_code]
129
+ pipeline_obj = self._load_and_quantize_pipeline(language_code, model_config)
130
+ self.asr_pipelines[language_code] = pipeline_obj
131
+ except Exception as e:
132
+ print(f"Failed to load ASR model for {language_code}: {e}")
133
+ raise
134
+
135
+ async def process_audio_chunk(self, audio_data: bytes, language_code: str, participant_id: str,
136
+ has_voice_activity: bool = True,
137
+ progress_callback: Optional[Callable] = None,
138
+ sentence_callback: Optional[Callable] = None,
139
+ debug_callback: Optional[Callable] = None) -> str:
140
+ """Process audio chunk with VAD-based sentence detection"""
141
+ try:
142
+ # Initialize buffers if needed
143
+ if participant_id not in self.candidate_audio_buffers:
144
+ # Store as numpy array, not bytes, to avoid multiple WAV header issues
145
+ self.candidate_audio_buffers[participant_id] = np.array([], dtype=np.float32)
146
+ self.candidate_text_cache[participant_id] = ""
147
+ self.silence_counters[participant_id] = 0
148
+ self.sentence_finalized[participant_id] = False
149
+
150
+ # Convert current chunk to numpy array for processing
151
+ current_chunk_array = self._bytes_to_audio_array(audio_data)
152
+ if len(current_chunk_array) == 0:
153
+ print(f"WARNING: Received empty audio chunk for participant {participant_id}")
154
+ return self.candidate_text_cache.get(participant_id, "")
155
+
156
+ print(f"DEBUG: Received audio chunk - bytes: {len(audio_data)}, samples: {len(current_chunk_array)}, "
157
+ f"duration: {len(current_chunk_array)/16000:.3f}s, "
158
+ f"first 4 bytes: {audio_data[:4]}")
159
+
160
+ # DO NOT normalize individual chunks - this causes audio distortion
161
+ # We'll normalize the entire accumulated audio buffer before transcription
162
+ current_chunk_array = current_chunk_array.astype(np.float32)
163
+
164
+ # Get existing accumulated audio array (now stored as numpy array)
165
+ existing_array = self.candidate_audio_buffers[participant_id]
166
+ if len(existing_array) > 0:
167
+ # Concatenate with existing audio (like stream = np.concatenate([stream, y]))
168
+ combined_array = np.concatenate([existing_array, current_chunk_array])
169
+ else:
170
+ combined_array = current_chunk_array
171
+
172
+ # Store as numpy array to avoid WAV header accumulation issues
173
+ self.candidate_audio_buffers[participant_id] = combined_array
174
+
175
+ # For debug callback, convert to bytes (this adds ONE WAV header)
176
+ combined_bytes = self._audio_array_to_bytes(combined_array)
177
+
178
+ # Update silence counter based on voice activity
179
+ if not has_voice_activity:
180
+ self.silence_counters[participant_id] += 1
181
+ else:
182
+ self.silence_counters[participant_id] = 0
183
+
184
+ # Check if we should finalize sentence due to prolonged silence
185
+ should_finalize = (self.silence_counters[participant_id] >= self.silence_threshold and
186
+ len(combined_array) > 0 and
187
+ not self.sentence_finalized[participant_id])
188
+
189
+ if should_finalize:
190
+ return await self._finalize_candidate_sentence(
191
+ language_code, participant_id, sentence_callback
192
+ )
193
+
194
+ # Always run transcription on the accumulated audio
195
+ audio_duration_sec = len(combined_array) / 16000.0 # 16kHz sample rate
196
+
197
+ # Minimum duration check - ignore very short audio bursts
198
+ MIN_CHUNK_DURATION = 0.3 # 300ms minimum
199
+ if audio_duration_sec < MIN_CHUNK_DURATION:
200
+ print(f"Audio chunk too short: {audio_duration_sec:.3f}s < {MIN_CHUNK_DURATION}s, skipping transcription")
201
+ if progress_callback:
202
+ cached_text = self.candidate_text_cache.get(participant_id, "")
203
+ await progress_callback(cached_text, False)
204
+ return self.candidate_text_cache.get(participant_id, "")
205
+
206
+ # Force finalization if buffer gets too long (prevent infinite accumulation)
207
+ if audio_duration_sec > 15.0 and not self.sentence_finalized[participant_id]: # Force completion after 15 seconds
208
+ return await self._finalize_candidate_sentence(
209
+ language_code, participant_id, sentence_callback
210
+ )
211
+
212
+ # Run voice activity detection on the accumulated audio before transcription
213
+ has_voice_in_buffer = self.has_meaningful_voice_activity(combined_bytes)
214
+
215
+ if not has_voice_in_buffer:
216
+ # Still send progress update with cached text to maintain UI state
217
+ if progress_callback:
218
+ cached_text = self.candidate_text_cache.get(participant_id, "")
219
+ await progress_callback(cached_text, False)
220
+ return self.candidate_text_cache.get(participant_id, "")
221
+
222
+ # Run transcription
223
+ await self.ensure_model_loaded(language_code)
224
+
225
+ # Double-check voice activity before running expensive ASR
226
+ has_voice_for_asr = self.has_voice_activity(combined_bytes)
227
+ if not has_voice_for_asr:
228
+ print(f"ASR: No voice activity detected in audio buffer for participant {participant_id}, skipping ASR execution")
229
+ # Return cached text and send progress update
230
+ if progress_callback:
231
+ cached_text = self.candidate_text_cache.get(participant_id, "")
232
+ await progress_callback(cached_text, False)
233
+ return self.candidate_text_cache.get(participant_id, "")
234
+
235
+ if language_code not in self.asr_pipelines:
236
+ raise ValueError(f"ASR model not available for language: {language_code}")
237
+
238
+ print(f"ASR: Running transcription for participant {participant_id} with {len(combined_array)/16000:.2f}s of audio")
239
+ pipeline_obj = self.asr_pipelines[language_code]
240
+
241
+ # Normalize the ENTIRE accumulated audio buffer before transcription
242
+ # This prevents audio distortion from per-chunk normalization
243
+ normalized_array = combined_array.astype(np.float32)
244
+ max_val = np.max(np.abs(normalized_array))
245
+ if max_val > 0:
246
+ normalized_array = normalized_array / max_val
247
+
248
+ # Track transcription latency
249
+ transcription_start_time = time.time()
250
+
251
+ # For wav2vec2 models, request word timestamps
252
+ model_type = self.asr_config[language_code].get("model_type", "whisper")
253
+ if model_type in ["wav2vec2-bert", "wav2vec2"]:
254
+ result = pipeline_obj(
255
+ {"sampling_rate": 16000, "raw": normalized_array},
256
+ return_timestamps="word"
257
+ )
258
+ else:
259
+ # Whisper model - chunked long-form decoding (chunk/stride below) helps curb hallucinations
260
+ # Note: HuggingFace pipeline uses different parameter names than OpenAI Whisper
261
+ result = pipeline_obj(
262
+ {"sampling_rate": 16000, "raw": normalized_array},
263
+ return_timestamps=True,
264
+ chunk_length_s=30, # Process in 30s chunks
265
+ stride_length_s=5 # 5s stride for context
266
+ )
267
+
268
+ transcription_latency_ms = (time.time() - transcription_start_time) * 1000
269
+
270
+ candidate_text = result.get("text", "").strip()
271
+ word_timestamps = result.get("chunks", []) if model_type in ["wav2vec2-bert", "wav2vec2"] else None
272
+
273
+ # Send debug information if callback provided (for wav2vec2 models only)
274
+ if debug_callback and word_timestamps is not None:
275
+ debug_info = {
276
+ "text": candidate_text,
277
+ "timestamps": word_timestamps,
278
+ "audio_data": combined_bytes,
279
+ "audio_duration": audio_duration_sec,
280
+ "model_type": model_type,
281
+ "transcription_latency_ms": transcription_latency_ms
282
+ }
283
+ await debug_callback(debug_info)
284
+
285
+ # Filter out common ASR artifacts and very short responses
286
+ artifacts = [
287
+ "thank you", "thanks", "bye", ".", ",", "?", "!",
288
+ "um", "uh", "ah", "hmm", "mm", "mhm",
289
+ "you", "the", "a", "an", "and", "but", "or",
290
+ "music", "laughter", "applause", "[music]", "[laughter]",
291
+ # Common Whisper hallucinations:
292
+ "subscribe", "subtitles", "amara", "www", "http",
293
+ "please subscribe", "like and subscribe",
294
+ "thank you for watching", "don't forget to subscribe",
295
+ "[blank_audio]", "[noise]", "[silence]",
296
+ ]
297
+
298
+ # Check if the result is likely an artifact
299
+ is_artifact = (
300
+ len(candidate_text) < 3 or # Very short
301
+ candidate_text.lower() in artifacts or # Common artifacts
302
+ (len(candidate_text.split()) == 1 and len(candidate_text) < 6) # Single very short word (explicit grouping)
303
+ )
304
+
305
+ if is_artifact:
306
+ # Keep the previous cached text instead of updating with artifact
307
+ candidate_text = self.candidate_text_cache.get(participant_id, "")
308
+
309
+ # Cache the current candidate text
310
+ self.candidate_text_cache[participant_id] = candidate_text
311
+
312
+ # Force completion if we have a reasonable amount of text and some silence
313
+ word_count = len(candidate_text.split()) if candidate_text else 0
314
+ if (word_count >= 3 and self.silence_counters[participant_id] >= 2 and
315
+ not self.sentence_finalized[participant_id]): # At least 3 words and 2 silent chunks
316
+ return await self._finalize_candidate_sentence(
317
+ language_code, participant_id, sentence_callback
318
+ )
319
+
320
+ # Always send progress update
321
+ if progress_callback:
322
+ await progress_callback(candidate_text, False)
323
+
324
+ return candidate_text
325
+
326
+ except Exception as e:
327
+ print(f"TranscriptionService: Error processing audio chunk: {e}")
328
+ import traceback
329
+ traceback.print_exc()
330
+ # Even on error, try to send cached text
331
+ if progress_callback:
332
+ cached_text = self.candidate_text_cache.get(participant_id, "")
333
+ await progress_callback(cached_text, False)
334
+ return self.candidate_text_cache.get(participant_id, "")
335
+
336
+ async def _finalize_candidate_sentence(self, language_code: str, participant_id: str,
337
+ sentence_callback: Optional[Callable] = None) -> str:
338
+ """Finalize the current candidate sentence and clear buffers"""
339
+ try:
340
+ # Check if sentence was already finalized
341
+ if self.sentence_finalized.get(participant_id, False):
342
+ print(f"Sentence for participant {participant_id} already finalized, skipping duplicate")
343
+ return self.candidate_text_cache.get(participant_id, "")
344
+
345
+ final_text = self.candidate_text_cache.get(participant_id, "")
346
+ final_audio_array = self.candidate_audio_buffers.get(participant_id, np.array([], dtype=np.float32))
347
+
348
+ # Convert audio array to bytes for VAD check and callback
349
+ final_audio_bytes = self._audio_array_to_bytes(final_audio_array) if len(final_audio_array) > 0 else b''
350
+
351
+ if final_text and len(final_text.strip()) > 0:
352
+ # Run VAD check on the final accumulated buffer before sending for translation
353
+ if len(final_audio_bytes) > 0:
354
+ has_voice_in_final = self.has_meaningful_voice_activity(final_audio_bytes)
355
+ if not has_voice_in_final:
356
+ print(f"Finalize: No voice activity in final buffer for participant {participant_id}, discarding sentence: '{final_text}'")
357
+ # Clear buffers without sending to translation
358
+ self.candidate_audio_buffers[participant_id] = np.array([], dtype=np.float32)
359
+ self.candidate_text_cache[participant_id] = ""
360
+ self.silence_counters[participant_id] = 0
361
+ self.sentence_finalized[participant_id] = False
362
+ return ""
363
+
364
+ # Mark as finalized BEFORE calling the callback to prevent race conditions
365
+ self.sentence_finalized[participant_id] = True
366
+
367
+ # Send to sentence callback for translation
368
+ if sentence_callback and len(final_audio_bytes) > 0:
369
+ print(f"Finalizing sentence for participant {participant_id}: '{final_text}'")
370
+ await sentence_callback(final_text, final_audio_bytes)
371
+
372
+ # Clear buffers for next sentence
373
+ self.candidate_audio_buffers[participant_id] = np.array([], dtype=np.float32)
374
+ self.candidate_text_cache[participant_id] = ""
375
+ self.silence_counters[participant_id] = 0
376
+ self.sentence_finalized[participant_id] = False # Reset for next sentence
377
+
378
+ return final_text
379
+
380
+ except Exception as e:
381
+ print(f"Error finalizing sentence: {e}")
382
+ import traceback
383
+ traceback.print_exc()
384
+ # Reset finalized flag on error
385
+ self.sentence_finalized[participant_id] = False
386
+ return ""
387
+
388
+ def has_voice_activity(self, audio_data: bytes, threshold: float = 0.5) -> bool:
389
+ """Voice Activity Detection using Silero VAD (with RMS fallback)"""
390
+ try:
391
+ audio_array = self._bytes_to_audio_array(audio_data)
392
+ if len(audio_array) == 0:
393
+ print("VAD: No audio array, returning False")
394
+ return False
395
+
396
+ # Normalize audio to float32 range [-1, 1]
397
+ audio_array = audio_array.astype(np.float32)
398
+ if np.max(np.abs(audio_array)) > 0:
399
+ audio_array /= np.max(np.abs(audio_array))
400
+
401
+ # Use Silero VAD if available
402
+ if self.vad_available and self.vad_model is not None:
403
+ try:
404
+ # Silero VAD expects 512 samples (32ms) or 1536 samples (96ms) for 16kHz
405
+ # Process audio in chunks and average the probabilities
406
+ frame_size = 512 # 32ms at 16kHz
407
+ num_samples = len(audio_array)
408
+
409
+ # If audio is too short, pad it
410
+ if num_samples < frame_size:
411
+ audio_array = np.pad(audio_array, (0, frame_size - num_samples), mode='constant')
412
+ num_samples = frame_size
413
+
414
+ # Process in frames and collect probabilities
415
+ speech_probs = []
416
+ for i in range(0, num_samples, frame_size):
417
+ frame = audio_array[i:i + frame_size]
418
+ if len(frame) < frame_size:
419
+ # Pad last frame if needed
420
+ frame = np.pad(frame, (0, frame_size - len(frame)), mode='constant')
421
+
422
+ # Convert to torch tensor
423
+ frame_tensor = torch.from_numpy(frame).float()
424
+
425
+ # Get speech probability from Silero VAD
426
+ with torch.no_grad():
427
+ prob = self.vad_model(frame_tensor, self.vad_sample_rate).item()
428
+ speech_probs.append(prob)
429
+
430
+ # Average probability across all frames
431
+ speech_prob = np.mean(speech_probs)
432
+ has_voice = speech_prob > threshold
433
+
434
+ print(f"VAD: Silero speech_prob={speech_prob:.4f} (avg of {len(speech_probs)} frames), threshold={threshold}, RESULT={has_voice}")
435
+
436
+ return has_voice
437
+
438
+ except Exception as e:
439
+ print(f"Silero VAD error: {e}, falling back to RMS-based VAD")
440
+ # Fall through to RMS-based VAD below
441
+
442
+ # Fallback: RMS-based VAD (original implementation)
443
+ rms_threshold = 0.002
444
+ rms = np.sqrt(np.mean(audio_array ** 2))
445
+ peak = np.max(np.abs(audio_array))
446
+ audio_std = np.std(audio_array)
447
+ zero_crossing_rate = np.sum(np.diff(np.sign(audio_array)) != 0) / len(audio_array)
448
+
449
+ has_voice_rms = rms > rms_threshold
450
+ has_voice_peak = peak > rms_threshold * 3
451
+ has_voice_variation = audio_std > rms_threshold * 0.8
452
+ has_voice_zcr = zero_crossing_rate > 0.008
453
+
454
+ has_voice = has_voice_rms or (has_voice_peak and has_voice_variation) or has_voice_zcr
455
+
456
+ print(f"VAD: RMS-based - RMS={rms:.6f}({has_voice_rms}), peak={peak:.6f}({has_voice_peak}), std={audio_std:.6f}({has_voice_variation}), zcr={zero_crossing_rate:.6f}({has_voice_zcr}), RESULT={has_voice}")
457
+
458
+ return has_voice
459
+
460
+ except Exception as e:
461
+ print(f"Error in VAD: {e}")
462
+ return True # Default to assuming voice activity on error
463
+
464
+ def has_meaningful_voice_activity(self, audio_data: bytes, threshold: float = 0.005) -> bool:
465
+ """Stricter VAD check specifically for pre-transcription filtering"""
466
+ try:
467
+ audio_array = self._bytes_to_audio_array(audio_data)
468
+ if len(audio_array) == 0:
469
+ return False
470
+
471
+ # Normalize audio
472
+ audio_array = audio_array.astype(np.float32)
473
+ if np.max(np.abs(audio_array)) > 0:
474
+ audio_array /= np.max(np.abs(audio_array))
475
+
476
+ # Calculate features with higher thresholds for meaningful speech
477
+ rms = np.sqrt(np.mean(audio_array ** 2))
478
+ peak = np.max(np.abs(audio_array))
479
+ audio_std = np.std(audio_array)
480
+ zero_crossing_rate = np.sum(np.diff(np.sign(audio_array)) != 0) / len(audio_array)
481
+
482
+ # Higher thresholds for meaningful speech detection
483
+ has_meaningful_voice = (
484
+ rms > threshold and
485
+ peak > threshold * 2 and
486
+ audio_std > threshold * 0.5 and
487
+ zero_crossing_rate > 0.015 # Higher ZCR threshold for meaningful speech
488
+ )
489
+
490
+ return has_meaningful_voice
491
+
492
+ except Exception as e:
493
+ print(f"Error in meaningful VAD: {e}")
494
+ return False # Default to no meaningful voice on error
495
+
496
+ async def force_complete_sentence(self, participant_id: str, language_code: str, sentence_callback: Optional[Callable] = None) -> str:
497
+ """Force complete any pending sentence for a participant"""
498
+ try:
499
+ # Check if sentence was already finalized
500
+ if self.sentence_finalized.get(participant_id, False):
501
+ print(f"Force completion: Sentence for participant {participant_id} already finalized, skipping")
502
+ return ""
503
+
504
+ if participant_id in self.candidate_text_cache:
505
+ cached_text = self.candidate_text_cache[participant_id]
506
+
507
+ if cached_text and len(cached_text.strip()) > 0:
508
+ result = await self._finalize_candidate_sentence(language_code, participant_id, sentence_callback)
509
+ return result
510
+
511
+ return ""
512
+
513
+ except Exception as e:
514
+ print(f"Error in force_complete_sentence: {e}")
515
+ import traceback
516
+ traceback.print_exc()
517
+ return ""
518
+
519
+ async def transcribe_audio(self, audio_data: bytes, language_code: str, callback: Optional[Callable] = None) -> str:
520
+ """Transcribe audio data to text"""
521
+ try:
522
+ # Check for voice activity before running ASR
523
+ has_voice = self.has_voice_activity(audio_data)
524
+ if not has_voice:
525
+ print("ASR: No voice activity detected in audio data, skipping transcription")
526
+ return ""
527
+
528
+ await self.ensure_model_loaded(language_code)
529
+
530
+ if language_code not in self.asr_pipelines:
531
+ raise ValueError(f"ASR model not available for language: {language_code}")
532
+
533
+ # Convert audio bytes to numpy array
534
+ audio_array = self._bytes_to_audio_array(audio_data)
535
+
536
+ print(f"ASR: Running transcription with {len(audio_array)/16000:.2f}s of audio")
537
+ # Transcribe
538
+ pipeline_obj = self.asr_pipelines[language_code]
539
+ result = pipeline_obj({"sampling_rate": 16000, "raw": audio_array})
540
+
541
+ text = result.get("text", "")
542
+
543
+ if callback:
544
+ await callback(text)
545
+
546
+ return text
547
+
548
+ except Exception as e:
549
+ print(f"TranscriptionService: Transcription error: {e}")
550
+ import traceback
551
+ traceback.print_exc()
552
+ return ""
553
+
554
+ def _bytes_to_audio_array(self, audio_data: bytes) -> np.ndarray:
555
+ """Convert audio bytes to numpy array (supports WAV, WebM/Opus)"""
556
+ try:
557
+ # Detect format by checking magic bytes
558
+ is_webm = audio_data[:4] == b'\x1a\x45\xdf\xa3' # WebM/Matroska magic bytes
559
+ is_wav = audio_data[:4] == b'RIFF'
560
+
561
+ import sys
562
+ print(f"_bytes_to_audio_array: length={len(audio_data)}, first 4 bytes={audio_data[:4]}, is_wav={is_wav}", flush=True)
563
+ sys.stdout.flush()
564
+
565
+ # Handle raw PCM (16-bit, 48kHz from extendable-media-recorder)
566
+ # This is the most common case for microphone input
567
+ if not is_wav and not is_webm and len(audio_data) > 0:
568
+ try:
569
+ # Assume 16-bit PCM at 48kHz (browser's native rate)
570
+ audio_array = np.frombuffer(audio_data, dtype=np.int16)
571
+
572
+ # Check if this looks like valid audio data (not NaN, reasonable range)
573
+ if len(audio_array) > 0 and not np.isnan(audio_array).any():
574
+ print(f"Raw PCM: {len(audio_array)} samples, assuming 48kHz 16-bit", flush=True)
575
+
576
+ # Convert to float32 and normalize
577
+ audio_float = audio_array.astype(np.float32) / 32768.0
578
+
579
+ # Resample from 48kHz to 16kHz
580
+ import librosa
581
+ audio_array = librosa.resample(audio_float, orig_sr=48000, target_sr=16000)
582
+ print(f"Resampled to 16kHz: {len(audio_array)} samples", flush=True)
583
+
584
+ return audio_array
585
+ except Exception as pcm_error:
586
+ print(f"TranscriptionService: Raw PCM decoding error: {pcm_error}", flush=True)
587
+ # Fall through to other methods
588
+
589
+ if is_webm:
590
+ # Decode WebM/Opus using pydub (requires ffmpeg)
591
+ try:
592
+ from pydub import AudioSegment
593
+ audio_io = io.BytesIO(audio_data)
594
+ audio_segment = AudioSegment.from_file(audio_io, format="webm")
595
+
596
+ # Convert to mono 16kHz
597
+ audio_segment = audio_segment.set_channels(1)
598
+ audio_segment = audio_segment.set_frame_rate(16000)
599
+
600
+ # Convert to numpy array
601
+ samples = np.array(audio_segment.get_array_of_samples(), dtype=np.int16)
602
+ # Normalize to float32 [-1, 1]
603
+ audio_array = samples.astype(np.float32) / 32768.0
604
+ return audio_array
605
+ except Exception as webm_error:
606
+ print(f"TranscriptionService: WebM decoding error: {webm_error}")
607
+ # Fall through to other methods
608
+
609
+ if is_wav:
610
+ # Decode WAV format (first chunk from frontend includes WAV header with sample rate)
611
+ try:
612
+ audio_io = io.BytesIO(audio_data)
613
+ with wave.open(audio_io, 'rb') as wav_file:
614
+ sample_rate = wav_file.getframerate()
615
+ channels = wav_file.getnchannels()
616
+ sample_width = wav_file.getsampwidth()
617
+
618
+ print(f"WAV format: {sample_rate}Hz, {channels} channel(s), {sample_width*8}-bit", flush=True)
619
+
620
+ frames = wav_file.readframes(-1)
621
+ audio_array = np.frombuffer(frames, dtype=np.int16)
622
+
623
+ # Resample if needed
624
+ if sample_rate != 16000:
625
+ print(f"WARNING: Resampling from {sample_rate}Hz to 16000Hz", flush=True)
626
+ import librosa
627
+ # Convert to float first
628
+ audio_float = audio_array.astype(np.float32) / 32768.0
629
+ # Resample
630
+ audio_array = librosa.resample(audio_float, orig_sr=sample_rate, target_sr=16000)
631
+ print(f"Resampled: {len(audio_array)} samples at 16kHz", flush=True)
632
+ else:
633
+ # Convert to float32 and normalize
634
+ audio_array = audio_array.astype(np.float32) / 32768.0
635
+
636
+ print(f"Returning audio array: {len(audio_array)} samples", flush=True)
637
+ return audio_array
638
+ except Exception as wav_error:
639
+ print(f"TranscriptionService: WAV decoding error: {wav_error}")
640
+ import traceback
641
+ traceback.print_exc()
642
+
643
+ # Fallback: assume raw float32 audio data
644
+ try:
645
+ audio_array = np.frombuffer(audio_data, dtype=np.float32)
646
+ return audio_array
647
+ except Exception:
648
+ pass
649
+
650
+ # Last resort: return empty array
651
+ return np.array([], dtype=np.float32)
652
+
653
+ except Exception as e:
654
+ print(f"TranscriptionService: Audio conversion error: {e}")
655
+ return np.array([], dtype=np.float32)
656
+
657
+ def _audio_array_to_bytes(self, audio_array: np.ndarray) -> bytes:
658
+ """Convert numpy audio array back to WAV bytes for storage"""
659
+ try:
660
+ # Ensure float32 format
661
+ if audio_array.dtype != np.float32:
662
+ audio_array = audio_array.astype(np.float32)
663
+
664
+ # Convert to 16-bit PCM for WAV storage
665
+ audio_int16 = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16) # clip to [-1, 1] to avoid int16 overflow
666
+
667
+ # Create WAV bytes
668
+ wav_buffer = io.BytesIO()
669
+ with wave.open(wav_buffer, 'wb') as wav_file:
670
+ wav_file.setnchannels(1) # Mono
671
+ wav_file.setsampwidth(2) # 16-bit
672
+ wav_file.setframerate(16000) # 16kHz
673
+ wav_file.writeframes(audio_int16.tobytes())
674
+
675
+ return wav_buffer.getvalue()
676
+
677
+ except Exception as e:
678
+ print(f"Error converting audio array to bytes: {e}")
679
+ return b''
680
+
681
+ def clear_participant_buffers(self, participant_id: str):
682
+ """Clear all buffers for a participant (e.g., when they stop speaking or disconnect)"""
683
+ if participant_id in self.candidate_audio_buffers:
684
+ del self.candidate_audio_buffers[participant_id]
685
+ if participant_id in self.candidate_text_cache:
686
+ del self.candidate_text_cache[participant_id]
687
+ if participant_id in self.silence_counters:
688
+ del self.silence_counters[participant_id]
689
+ if participant_id in self.sentence_finalized:
690
+ del self.sentence_finalized[participant_id]
691
+
692
+ async def load_remaining_models_in_background(self):
693
+ """Load all remaining ASR models in the background after startup"""
694
+ try:
695
+ print("ASR: Starting background loading of additional language models...")
696
+ for lang_code in self.asr_config.keys():
697
+ if lang_code not in self.preload_languages and lang_code not in self.asr_pipelines:
698
+ try:
699
+ print(f"ASR: Background loading model for {lang_code}...")
700
+ self.models_loading_status[lang_code] = "loading"
701
+
702
+ model_config = self.asr_config[lang_code]
703
+ # Use quantization helper for background loading too
704
+ pipeline_obj = self._load_and_quantize_pipeline(lang_code, model_config)
705
+ self.asr_pipelines[lang_code] = pipeline_obj
706
+ self.models_loading_status[lang_code] = "loaded"
707
+ print(f"ASR: Successfully loaded model for {lang_code} in background")
708
+
709
+ # Add a small delay between loading models to prevent overwhelming the system
710
+ await asyncio.sleep(2)
711
+ except Exception as e:
712
+ print(f"ASR: Failed to load model for {lang_code} in background: {e}")
713
+ self.models_loading_status[lang_code] = "failed"
714
+
715
+ print("ASR: Background loading of all language models complete")
716
+ print(f"ASR: Loaded models: {list(self.asr_pipelines.keys())}")
717
+ except Exception as e:
718
+ print(f"ASR: Error in background model loading: {e}")
719
+
720
+ def start_background_loading(self):
721
+ """Start background loading of models as a non-blocking task"""
722
+ if self.background_loading_task is None:
723
+ self.background_loading_task = asyncio.create_task(self.load_remaining_models_in_background())
724
+ print("ASR: Background model loading task started")
725
+
726
+ async def cleanup(self):
727
+ """Cleanup resources"""
728
+ # Cancel background loading if still running
729
+ if self.background_loading_task and not self.background_loading_task.done():
730
+ self.background_loading_task.cancel()
731
+ try:
732
+ await self.background_loading_task
733
+ except asyncio.CancelledError:
734
+ pass
735
+
736
+ self.asr_pipelines.clear()
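`apply_dynamic_int8_quantization` and `get_quantization_stats` come from `app/services/quantization_utils.py`, which is not part of this excerpt. Below is a minimal sketch of what the helper plausibly wraps, using PyTorch's standard dynamic-quantization API; the function name and the Linear-only target set are assumptions, not the committed implementation:

```python
# Sketch only: dynamic INT8 quantization via PyTorch's built-in API.
# Weights of the selected layer types are stored as int8 and activations are
# quantized on the fly at inference time; this targets CPU execution, which
# matches the service default (device = -1 when no GPU is available).
import torch

def apply_dynamic_int8_quantization_sketch(model: torch.nn.Module) -> torch.nn.Module:
    return torch.quantization.quantize_dynamic(
        model.eval(),        # quantize in eval mode
        {torch.nn.Linear},   # layer types to convert (assumed: Linear only)
        dtype=torch.qint8,   # int8 weights
    )
```

Dynamic quantization pays off most on Linear-heavy encoder models such as the Wav2Vec2-BERT checkpoints loaded above, which is consistent with this commit enabling `ENABLE_INT8_QUANTIZATION` by default.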
app/services/transcription_service.py.bak ADDED
@@ -0,0 +1,726 @@
1
+ import asyncio
2
+ import io
3
+ import wave
4
+ import numpy as np
5
+ import time
6
+ from typing import Any, Callable, Dict, Optional
7
+ from transformers import pipeline
8
+ import torch
9
+ from app.models import LanguageCode
10
+ from app.services.performance_mixin import track_performance
11
+
12
+ # Silero VAD imports
13
+ try:
14
+ import silero_vad
15
+ SILERO_VAD_AVAILABLE = True
16
+ except ImportError:
17
+ SILERO_VAD_AVAILABLE = False
18
+ print("Warning: silero-vad not installed. Falling back to RMS-based VAD.")
19
+
20
+ class TranscriptionService:
21
+ def __init__(self):
22
+ self.asr_pipelines: Dict[str, Any] = {}
23
+ self.device = 0 if torch.cuda.is_available() else -1
24
+
25
+ # Model configurations - using original mutisya models with updated config
26
+ self.asr_config = {
27
+ "eng": {"model_repo": "openai/whisper-base.en", "model_type": "whisper"},
28
+ "swa": {"model_repo": "mutisya/w2v-bert-2.0-asr-swh-superv-v25-37-1", "model_type": "wav2vec2-bert", "trust_remote_code": True},
29
+ "kik": {"model_repo": "mutisya/w2v-bert-2.0-asr-kik-superv-v25-37-1", "model_type": "wav2vec2-bert", "trust_remote_code": True},
30
+ "kam": {"model_repo": "mutisya/w2v-bert-2.0-asr-kam-superv-v25-36-1", "model_type": "wav2vec2-bert", "trust_remote_code": True},
31
+ "mer": {"model_repo": "mutisya/w2v-bert-2.0-asr-mer-superv-v25-36-1", "model_type": "wav2vec2-bert", "trust_remote_code": True},
32
+ "luo": {"model_repo": "mutisya/w2v-bert-2.0-asr-luo-superv-v25-36-1", "model_type": "wav2vec2-bert", "trust_remote_code": True},
33
+ "som": {"model_repo": "mutisya/w2v-bert-2.0-asr-som-superv-v25-36-1", "model_type": "wav2vec2-bert", "trust_remote_code": True}
34
+ }
35
+
36
+ self.preload_languages = ["eng"]
37
+ self.background_loading_task = None
38
+ self.models_loading_status = {}
39
+
40
+ # Enhanced audio buffering for VAD-based sentence detection
41
+ self.candidate_audio_buffers: Dict[str, np.ndarray] = {} # participant_id -> accumulated audio samples (stored as float32 arrays, not bytes)
42
+ self.candidate_text_cache: Dict[str, str] = {} # participant_id -> current candidate text
43
+ self.silence_counters: Dict[str, int] = {} # participant_id -> consecutive silence chunks
44
+ self.sentence_finalized: Dict[str, bool] = {} # participant_id -> whether current sentence is already finalized
45
+
46
+ # VAD parameters - made more lenient for better detection
47
+ self.silence_threshold = 1 # Number of consecutive silent chunks before sentence break (1 second for natural pauses)
48
+ self.min_sentence_length = 0.03 # Minimum sentence length in seconds (very short)
49
+
50
+ # Silero VAD initialization
51
+ self.vad_model = None
52
+ self.vad_sample_rate = 16000
53
+ self.vad_available = SILERO_VAD_AVAILABLE
54
+
55
+ async def initialize(self):
56
+ """Initialize ASR models for preloaded languages and Silero VAD"""
57
+ # Initialize Silero VAD model
58
+ if self.vad_available:
59
+ try:
60
+ print("Loading Silero VAD model...")
61
+ self.vad_model = silero_vad.load_silero_vad(onnx=False)
62
+ print("✓ Silero VAD model loaded successfully")
63
+ except Exception as e:
64
+ print(f"Failed to load Silero VAD model: {e}")
65
+ print("Falling back to RMS-based VAD")
66
+ self.vad_available = False
67
+
68
+ # Initialize ASR models
69
+ for lang_code in self.preload_languages:
70
+ if lang_code in self.asr_config:
71
+ try:
72
+ model_config = self.asr_config[lang_code]
73
+ # Build pipeline parameters
74
+ pipeline_params = {
75
+ "task": "automatic-speech-recognition",
76
+ "model": model_config["model_repo"],
77
+ "device": self.device,
78
+ "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32
79
+ }
80
+
81
+ # Add trust_remote_code if specified
82
+ if model_config.get("trust_remote_code", False):
83
+ pipeline_params["trust_remote_code"] = True
84
+
85
+ pipeline_obj = pipeline(**pipeline_params)
86
+ self.asr_pipelines[lang_code] = pipeline_obj
87
+ except Exception as e:
88
+ print(f"Failed to load ASR model for {lang_code}: {e}")
89
+
90
+ async def ensure_model_loaded(self, language_code: str):
91
+ """Load ASR model for language if not already loaded"""
92
+ if language_code not in self.asr_pipelines and language_code in self.asr_config:
93
+ try:
94
+ model_config = self.asr_config[language_code]
95
+ # Build pipeline parameters
96
+ pipeline_params = {
97
+ "task": "automatic-speech-recognition",
98
+ "model": model_config["model_repo"],
99
+ "device": self.device,
100
+ "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32
101
+ }
102
+
103
+ # Add trust_remote_code if specified
104
+ if model_config.get("trust_remote_code", False):
105
+ pipeline_params["trust_remote_code"] = True
106
+
107
+ pipeline_obj = pipeline(**pipeline_params)
108
+ self.asr_pipelines[language_code] = pipeline_obj
109
+ except Exception as e:
110
+ print(f"Failed to load ASR model for {language_code}: {e}")
111
+ raise
112
+
113
+ async def process_audio_chunk(self, audio_data: bytes, language_code: str, participant_id: str,
114
+ has_voice_activity: bool = True,
115
+ progress_callback: Optional[Callable] = None,
116
+ sentence_callback: Optional[Callable] = None,
117
+ debug_callback: Optional[Callable] = None) -> str:
118
+ """Process audio chunk with VAD-based sentence detection"""
119
+ try:
120
+ # Initialize buffers if needed
121
+ if participant_id not in self.candidate_audio_buffers:
122
+ # Store as numpy array, not bytes, to avoid multiple WAV header issues
123
+ self.candidate_audio_buffers[participant_id] = np.array([], dtype=np.float32)
124
+ self.candidate_text_cache[participant_id] = ""
125
+ self.silence_counters[participant_id] = 0
126
+ self.sentence_finalized[participant_id] = False
127
+
128
+ # Convert current chunk to numpy array for processing
129
+ current_chunk_array = self._bytes_to_audio_array(audio_data)
130
+ if len(current_chunk_array) == 0:
131
+ print(f"WARNING: Received empty audio chunk for participant {participant_id}")
132
+ return self.candidate_text_cache.get(participant_id, "")
133
+
134
+ print(f"DEBUG: Received audio chunk - bytes: {len(audio_data)}, samples: {len(current_chunk_array)}, "
135
+ f"duration: {len(current_chunk_array)/16000:.3f}s, "
136
+ f"first 4 bytes: {audio_data[:4]}")
137
+
138
+ # DO NOT normalize individual chunks - this causes audio distortion
139
+ # We'll normalize the entire accumulated audio buffer before transcription
140
+ current_chunk_array = current_chunk_array.astype(np.float32)
141
+
142
+ # Get existing accumulated audio array (now stored as numpy array)
143
+ existing_array = self.candidate_audio_buffers[participant_id]
144
+ if len(existing_array) > 0:
145
+ # Concatenate with existing audio (like stream = np.concatenate([stream, y]))
146
+ combined_array = np.concatenate([existing_array, current_chunk_array])
147
+ else:
148
+ combined_array = current_chunk_array
149
+
150
+ # Store as numpy array to avoid WAV header accumulation issues
151
+ self.candidate_audio_buffers[participant_id] = combined_array
152
+
153
+ # For debug callback, convert to bytes (this adds ONE WAV header)
154
+ combined_bytes = self._audio_array_to_bytes(combined_array)
155
+
156
+ # Update silence counter based on voice activity
157
+ if not has_voice_activity:
158
+ self.silence_counters[participant_id] += 1
159
+ else:
160
+ self.silence_counters[participant_id] = 0
161
+
162
+ # Check if we should finalize sentence due to prolonged silence
163
+ should_finalize = (self.silence_counters[participant_id] >= self.silence_threshold and
164
+ len(combined_array) > 0 and
165
+ not self.sentence_finalized[participant_id])
166
+
167
+ if should_finalize:
168
+ return await self._finalize_candidate_sentence(
169
+ language_code, participant_id, sentence_callback
170
+ )
171
+
172
+ # Always run transcription on the accumulated audio
173
+ audio_duration_sec = len(combined_array) / 16000.0 # 16kHz sample rate
174
+
175
+ # Minimum duration check - ignore very short audio bursts
176
+ MIN_CHUNK_DURATION = 0.3 # 300ms minimum
177
+ if audio_duration_sec < MIN_CHUNK_DURATION:
178
+ print(f"Audio chunk too short: {audio_duration_sec:.3f}s < {MIN_CHUNK_DURATION}s, skipping transcription")
179
+ if progress_callback:
180
+ cached_text = self.candidate_text_cache.get(participant_id, "")
181
+ await progress_callback(cached_text, False)
182
+ return self.candidate_text_cache.get(participant_id, "")
183
+
184
+ # Force finalization if buffer gets too long (prevent infinite accumulation)
185
+ if audio_duration_sec > 15.0 and not self.sentence_finalized[participant_id]: # Force completion after 15 seconds
186
+ return await self._finalize_candidate_sentence(
187
+ language_code, participant_id, sentence_callback
188
+ )
189
+
190
+ # Run voice activity detection on the accumulated audio before transcription
191
+ has_voice_in_buffer = self.has_meaningful_voice_activity(combined_bytes)
192
+
193
+ if not has_voice_in_buffer:
194
+ # Still send progress update with cached text to maintain UI state
195
+ if progress_callback:
196
+ cached_text = self.candidate_text_cache.get(participant_id, "")
197
+ await progress_callback(cached_text, False)
198
+ return self.candidate_text_cache.get(participant_id, "")
199
+
200
+ # Run transcription
201
+ await self.ensure_model_loaded(language_code)
202
+
203
+ # Double-check voice activity before running expensive ASR
204
+ has_voice_for_asr = self.has_voice_activity(combined_bytes)
205
+ if not has_voice_for_asr:
206
+ print(f"ASR: No voice activity detected in audio buffer for participant {participant_id}, skipping ASR execution")
207
+ # Return cached text and send progress update
208
+ if progress_callback:
209
+ cached_text = self.candidate_text_cache.get(participant_id, "")
210
+ await progress_callback(cached_text, False)
211
+ return self.candidate_text_cache.get(participant_id, "")
212
+
213
+ if language_code not in self.asr_pipelines:
214
+ raise ValueError(f"ASR model not available for language: {language_code}")
215
+
216
+ print(f"ASR: Running transcription for participant {participant_id} with {len(combined_array)/16000:.2f}s of audio")
217
+ pipeline_obj = self.asr_pipelines[language_code]
218
+
219
+ # Normalize the ENTIRE accumulated audio buffer before transcription
220
+ # This prevents audio distortion from per-chunk normalization
221
+ normalized_array = combined_array.astype(np.float32)
222
+ max_val = np.max(np.abs(normalized_array))
223
+ if max_val > 0:
224
+ normalized_array = normalized_array / max_val
225
+
226
+ # Track transcription latency
227
+ transcription_start_time = time.time()
228
+
229
+ # For wav2vec2 models, request word timestamps
230
+ model_type = self.asr_config[language_code].get("model_type", "whisper")
231
+ if model_type in ["wav2vec2-bert", "wav2vec2"]:
232
+ result = pipeline_obj(
233
+ {"sampling_rate": 16000, "raw": normalized_array},
234
+ return_timestamps="word"
235
+ )
236
+ else:
237
+ # Whisper model - add anti-hallucination parameters
238
+ # Note: HuggingFace pipeline uses different parameter names than OpenAI Whisper
239
+ result = pipeline_obj(
240
+ {"sampling_rate": 16000, "raw": normalized_array},
241
+ return_timestamps=True,
242
+ chunk_length_s=30, # Process in 30s chunks
243
+ stride_length_s=5 # 5s stride for context
244
+ )
245
+
246
+ transcription_latency_ms = (time.time() - transcription_start_time) * 1000
247
+
248
+ candidate_text = result.get("text", "").strip()
249
+ word_timestamps = result.get("chunks", []) if model_type in ["wav2vec2-bert", "wav2vec2"] else None
250
+
251
+ # Send debug information if callback provided (for wav2vec2 models only)
252
+ if debug_callback and word_timestamps is not None:
253
+ debug_info = {
254
+ "text": candidate_text,
255
+ "timestamps": word_timestamps,
256
+ "audio_data": combined_bytes,
257
+ "audio_duration": audio_duration_sec,
258
+ "model_type": model_type,
259
+ "transcription_latency_ms": transcription_latency_ms
260
+ }
261
+ await debug_callback(debug_info)
262
+
263
+ # Filter out common ASR artifacts and very short responses
264
+ artifacts = [
265
+ "thank you", "thanks", "bye", ".", ",", "?", "!",
266
+ "um", "uh", "ah", "hmm", "mm", "mhm",
267
+ "you", "the", "a", "an", "and", "but", "or",
268
+ "music", "laughter", "applause", "[music]", "[laughter]",
269
+ # Common Whisper hallucinations:
270
+ "subscribe", "subtitles", "amara", "www", "http",
271
+ "please subscribe", "like and subscribe",
272
+ "thank you for watching", "don't forget to subscribe",
273
+ "[blank_audio]", "[noise]", "[silence]",
274
+ ]
275
+
276
+ # Check if the result is likely an artifact
277
+ is_artifact = (
278
+ len(candidate_text) < 3 or # Very short
279
+ candidate_text.lower() in artifacts or # Common artifacts
280
+ len(candidate_text.split()) == 1 and len(candidate_text) < 6 # Single very short word
281
+ )
282
+
283
+ if is_artifact:
284
+ # Keep the previous cached text instead of updating with artifact
285
+ candidate_text = self.candidate_text_cache.get(participant_id, "")
286
+
287
+ # Cache the current candidate text
288
+ self.candidate_text_cache[participant_id] = candidate_text
289
+
290
+ # Force completion if we have a reasonable amount of text and some silence
291
+ word_count = len(candidate_text.split()) if candidate_text else 0
292
+ if (word_count >= 3 and self.silence_counters[participant_id] >= 2 and
293
+ not self.sentence_finalized[participant_id]): # At least 3 words and 2 silent chunks
294
+ return await self._finalize_candidate_sentence(
295
+ language_code, participant_id, sentence_callback
296
+ )
297
+
298
+ # Always send progress update
299
+ if progress_callback:
300
+ await progress_callback(candidate_text, False)
301
+
302
+ return candidate_text
303
+
304
+ except Exception as e:
305
+ print(f"TranscriptionService: Error processing audio chunk: {e}")
306
+ import traceback
307
+ traceback.print_exc()
308
+ # Even on error, try to send cached text
309
+ if progress_callback:
310
+ cached_text = self.candidate_text_cache.get(participant_id, "")
311
+ await progress_callback(cached_text, False)
312
+ return self.candidate_text_cache.get(participant_id, "")
313
+
314
+ async def _finalize_candidate_sentence(self, language_code: str, participant_id: str,
315
+ sentence_callback: Optional[Callable] = None) -> str:
316
+ """Finalize the current candidate sentence and clear buffers"""
317
+ try:
318
+ # Check if sentence was already finalized
319
+ if self.sentence_finalized.get(participant_id, False):
320
+ print(f"Sentence for participant {participant_id} already finalized, skipping duplicate")
321
+ return self.candidate_text_cache.get(participant_id, "")
322
+
323
+ final_text = self.candidate_text_cache.get(participant_id, "")
324
+ final_audio_array = self.candidate_audio_buffers.get(participant_id, np.array([], dtype=np.float32))
325
+
326
+ # Convert audio array to bytes for VAD check and callback
327
+ final_audio_bytes = self._audio_array_to_bytes(final_audio_array) if len(final_audio_array) > 0 else b''
328
+
329
+ if final_text and len(final_text.strip()) > 0:
330
+ # Run VAD check on the final accumulated buffer before sending for translation
331
+ if len(final_audio_bytes) > 0:
332
+ has_voice_in_final = self.has_meaningful_voice_activity(final_audio_bytes)
333
+ if not has_voice_in_final:
334
+ print(f"Finalize: No voice activity in final buffer for participant {participant_id}, discarding sentence: '{final_text}'")
335
+ # Clear buffers without sending to translation
336
+ self.candidate_audio_buffers[participant_id] = np.array([], dtype=np.float32)
337
+ self.candidate_text_cache[participant_id] = ""
338
+ self.silence_counters[participant_id] = 0
339
+ self.sentence_finalized[participant_id] = False
340
+ return ""
341
+
342
+ # Mark as finalized BEFORE calling the callback to prevent race conditions
343
+ self.sentence_finalized[participant_id] = True
344
+
345
+ # Send to sentence callback for translation
346
+ if sentence_callback and len(final_audio_bytes) > 0:
347
+ print(f"Finalizing sentence for participant {participant_id}: '{final_text}'")
348
+ await sentence_callback(final_text, final_audio_bytes)
349
+
350
+ # Clear buffers for next sentence
351
+ self.candidate_audio_buffers[participant_id] = np.array([], dtype=np.float32)
352
+ self.candidate_text_cache[participant_id] = ""
353
+ self.silence_counters[participant_id] = 0
354
+ self.sentence_finalized[participant_id] = False # Reset for next sentence
355
+
356
+ return final_text
357
+
358
+ except Exception as e:
359
+ print(f"Error finalizing sentence: {e}")
360
+ import traceback
361
+ traceback.print_exc()
362
+ # Reset finalized flag on error
363
+ self.sentence_finalized[participant_id] = False
364
+ return ""
365
+
366
+ def has_voice_activity(self, audio_data: bytes, threshold: float = 0.5) -> bool:
367
+ """Voice Activity Detection using Silero VAD (with RMS fallback)"""
368
+ try:
369
+ audio_array = self._bytes_to_audio_array(audio_data)
370
+ if len(audio_array) == 0:
371
+ print("VAD: No audio array, returning False")
372
+ return False
373
+
374
+ # Normalize audio to float32 range [-1, 1]
375
+ audio_array = audio_array.astype(np.float32)
376
+ if np.max(np.abs(audio_array)) > 0:
377
+ audio_array /= np.max(np.abs(audio_array))
378
+
379
+ # Use Silero VAD if available
380
+ if self.vad_available and self.vad_model is not None:
381
+ try:
382
+ # Silero VAD expects 512 samples (32ms) or 1536 samples (96ms) for 16kHz
383
+ # Process audio in chunks and average the probabilities
384
+ frame_size = 512 # 32ms at 16kHz
385
+ num_samples = len(audio_array)
386
+
387
+ # If audio is too short, pad it
388
+ if num_samples < frame_size:
389
+ audio_array = np.pad(audio_array, (0, frame_size - num_samples), mode='constant')
390
+ num_samples = frame_size
391
+
392
+ # Process in frames and collect probabilities
393
+ speech_probs = []
394
+ for i in range(0, num_samples, frame_size):
395
+ frame = audio_array[i:i + frame_size]
396
+ if len(frame) < frame_size:
397
+ # Pad last frame if needed
398
+ frame = np.pad(frame, (0, frame_size - len(frame)), mode='constant')
399
+
400
+ # Convert to torch tensor
401
+ frame_tensor = torch.from_numpy(frame).float()
402
+
403
+ # Get speech probability from Silero VAD
404
+ with torch.no_grad():
405
+ prob = self.vad_model(frame_tensor, self.vad_sample_rate).item()
406
+ speech_probs.append(prob)
407
+
408
+ # Average probability across all frames
409
+ speech_prob = np.mean(speech_probs)
410
+ has_voice = speech_prob > threshold
411
+
412
+ print(f"VAD: Silero speech_prob={speech_prob:.4f} (avg of {len(speech_probs)} frames), threshold={threshold}, RESULT={has_voice}")
413
+
414
+ return has_voice
415
+
416
+ except Exception as e:
417
+ print(f"Silero VAD error: {e}, falling back to RMS-based VAD")
418
+ # Fall through to RMS-based VAD below
419
+
420
+ # Fallback: RMS-based VAD (original implementation)
421
+ rms_threshold = 0.002
422
+ rms = np.sqrt(np.mean(audio_array ** 2))
423
+ peak = np.max(np.abs(audio_array))
424
+ audio_std = np.std(audio_array)
425
+ zero_crossing_rate = np.sum(np.diff(np.sign(audio_array)) != 0) / len(audio_array)
426
+
427
+ has_voice_rms = rms > rms_threshold
428
+ has_voice_peak = peak > rms_threshold * 3
429
+ has_voice_variation = audio_std > rms_threshold * 0.8
430
+ has_voice_zcr = zero_crossing_rate > 0.008
431
+
432
+ has_voice = has_voice_rms or (has_voice_peak and has_voice_variation) or has_voice_zcr
433
+
434
+ print(f"VAD: RMS-based - RMS={rms:.6f}({has_voice_rms}), peak={peak:.6f}({has_voice_peak}), std={audio_std:.6f}({has_voice_variation}), zcr={zero_crossing_rate:.6f}({has_voice_zcr}), RESULT={has_voice}")
435
+
436
+ return has_voice
437
+
438
+ except Exception as e:
439
+ print(f"Error in VAD: {e}")
440
+ return True # Default to assuming voice activity on error
441
+
442
+ def has_meaningful_voice_activity(self, audio_data: bytes, threshold: float = 0.005) -> bool:
443
+ """Stricter VAD check specifically for pre-transcription filtering"""
444
+ try:
445
+ audio_array = self._bytes_to_audio_array(audio_data)
446
+ if len(audio_array) == 0:
447
+ return False
448
+
449
+ # Normalize audio
450
+ audio_array = audio_array.astype(np.float32)
451
+ if np.max(np.abs(audio_array)) > 0:
452
+ audio_array /= np.max(np.abs(audio_array))
453
+
454
+ # Calculate features with higher thresholds for meaningful speech
455
+ rms = np.sqrt(np.mean(audio_array ** 2))
456
+ peak = np.max(np.abs(audio_array))
457
+ audio_std = np.std(audio_array)
458
+ zero_crossing_rate = np.sum(np.diff(np.sign(audio_array)) != 0) / len(audio_array)
459
+
460
+ # Higher thresholds for meaningful speech detection
461
+ has_meaningful_voice = (
462
+ rms > threshold and
463
+ peak > threshold * 2 and
464
+ audio_std > threshold * 0.5 and
465
+ zero_crossing_rate > 0.015 # Higher ZCR threshold for meaningful speech
466
+ )
467
+
468
+ return has_meaningful_voice
469
+
470
+ except Exception as e:
471
+ print(f"Error in meaningful VAD: {e}")
472
+ return False # Default to no meaningful voice on error
473
+
474
+ async def force_complete_sentence(self, participant_id: str, language_code: str, sentence_callback: Optional[Callable] = None) -> str:
475
+ """Force complete any pending sentence for a participant"""
476
+ try:
477
+ # Check if sentence was already finalized
478
+ if self.sentence_finalized.get(participant_id, False):
479
+ print(f"Force completion: Sentence for participant {participant_id} already finalized, skipping")
480
+ return ""
481
+
482
+ if participant_id in self.candidate_text_cache:
483
+ cached_text = self.candidate_text_cache[participant_id]
484
+
485
+ if cached_text and len(cached_text.strip()) > 0:
486
+ result = await self._finalize_candidate_sentence(language_code, participant_id, sentence_callback)
487
+ return result
488
+
489
+ return ""
490
+
491
+ except Exception as e:
492
+ print(f"Error in force_complete_sentence: {e}")
493
+ import traceback
494
+ traceback.print_exc()
495
+ return ""
496
+
497
+ @track_performance("transcription", "transcribe_audio")
498
+ async def transcribe_audio(self, audio_data: bytes, language_code: str, callback: Optional[Callable] = None) -> str:
499
+ """Transcribe audio data to text"""
500
+ try:
501
+ # Check for voice activity before running ASR
502
+ has_voice = self.has_voice_activity(audio_data)
503
+ if not has_voice:
504
+ print(f"ASR: No voice activity detected in audio data, skipping transcription")
505
+ return ""
506
+
507
+ await self.ensure_model_loaded(language_code)
508
+
509
+ if language_code not in self.asr_pipelines:
510
+ raise ValueError(f"ASR model not available for language: {language_code}")
511
+
512
+ # Convert audio bytes to numpy array
513
+ audio_array = self._bytes_to_audio_array(audio_data)
514
+
515
+ print(f"ASR: Running transcription with {len(audio_array)/16000:.2f}s of audio")
516
+ # Transcribe
517
+ pipeline_obj = self.asr_pipelines[language_code]
518
+ result = pipeline_obj({"sampling_rate": 16000, "raw": audio_array})
519
+
520
+ text = result.get("text", "")
521
+
522
+ if callback:
523
+ await callback(text)
524
+
525
+ return text
526
+
527
+ except Exception as e:
528
+ print(f"TranscriptionService: Transcription error: {e}")
529
+ import traceback
530
+ traceback.print_exc()
531
+ return ""
532
+
533
+ def _bytes_to_audio_array(self, audio_data: bytes) -> np.ndarray:
534
+ """Convert audio bytes to numpy array (supports WAV, WebM/Opus)"""
535
+ try:
536
+ # Detect format by checking magic bytes
537
+ is_webm = audio_data[:4] == b'\x1a\x45\xdf\xa3' # WebM/Matroska magic bytes
538
+ is_wav = audio_data[:4] == b'RIFF'
539
+
540
+ import sys
541
+ print(f"_bytes_to_audio_array: length={len(audio_data)}, first 4 bytes={audio_data[:4]}, is_wav={is_wav}", flush=True)
542
+ sys.stdout.flush()
543
+
544
+ # Handle raw PCM (16-bit, 48kHz from extendable-media-recorder)
545
+ # This is the most common case now that we strip WAV headers in frontend
546
+ if not is_wav and not is_webm and len(audio_data) > 0:
547
+ try:
548
+ # Assume 16-bit PCM at 48kHz (browser's native rate)
549
+ audio_array = np.frombuffer(audio_data, dtype=np.int16)
550
+
551
+ # Check if this looks like valid audio data (not NaN, reasonable range)
552
+ if len(audio_array) > 0 and not np.isnan(audio_array).any():
553
+ print(f"Raw PCM: {len(audio_array)} samples, assuming 48kHz 16-bit", flush=True)
554
+
555
+ # Convert to float32 and normalize
556
+ audio_float = audio_array.astype(np.float32) / 32768.0
557
+
558
+ # Resample from 48kHz to 16kHz
559
+ import librosa
560
+ audio_array = librosa.resample(audio_float, orig_sr=48000, target_sr=16000)
561
+ print(f"Resampled to 16kHz: {len(audio_array)} samples", flush=True)
562
+
563
+ return audio_array
564
+ except Exception as pcm_error:
565
+ print(f"TranscriptionService: Raw PCM decoding error: {pcm_error}", flush=True)
566
+ # Fall through to other methods
567
+
568
+ if is_webm:
569
+ # Decode WebM/Opus using pydub (requires ffmpeg)
570
+ try:
571
+ from pydub import AudioSegment
572
+ audio_io = io.BytesIO(audio_data)
573
+ audio_segment = AudioSegment.from_file(audio_io, format="webm")
574
+
575
+ # Convert to mono 16kHz
576
+ audio_segment = audio_segment.set_channels(1)
577
+ audio_segment = audio_segment.set_frame_rate(16000)
578
+
579
+ # Convert to numpy array
580
+ samples = np.array(audio_segment.get_array_of_samples(), dtype=np.int16)
581
+ # Normalize to float32 [-1, 1]
582
+ audio_array = samples.astype(np.float32) / 32768.0
583
+ return audio_array
584
+ except Exception as webm_error:
585
+ print(f"TranscriptionService: WebM decoding error: {webm_error}")
586
+ # Fall through to other methods
587
+
588
+ if is_wav:
589
+ # Decode WAV format
590
+ try:
591
+ audio_io = io.BytesIO(audio_data)
592
+ with wave.open(audio_io, 'rb') as wav_file:
593
+ sample_rate = wav_file.getframerate()
594
+ channels = wav_file.getnchannels()
595
+ sample_width = wav_file.getsampwidth()
596
+
597
+ print(f"WAV format: {sample_rate}Hz, {channels} channel(s), {sample_width*8}-bit", flush=True)
598
+
599
+ frames = wav_file.readframes(-1)
600
+ audio_array = np.frombuffer(frames, dtype=np.int16)
601
+
602
+ # Resample if needed
603
+ if sample_rate != 16000:
604
+ print(f"WARNING: Resampling from {sample_rate}Hz to 16000Hz", flush=True)
605
+ import librosa
606
+ # Convert to float first
607
+ audio_float = audio_array.astype(np.float32) / 32768.0
608
+ # Resample
609
+ audio_array = librosa.resample(audio_float, orig_sr=sample_rate, target_sr=16000)
610
+ print(f"Resampled: {len(audio_array)} samples at 16kHz", flush=True)
611
+ else:
612
+ # Convert to float32 and normalize
613
+ audio_array = audio_array.astype(np.float32) / 32768.0
614
+
615
+ print(f"Returning audio array: {len(audio_array)} samples", flush=True)
616
+ return audio_array
617
+ except Exception as wav_error:
618
+ print(f"TranscriptionService: WAV decoding error: {wav_error}")
619
+ import traceback
620
+ traceback.print_exc()
621
+
622
+ # Fallback: assume raw float32 audio data
623
+ try:
624
+ audio_array = np.frombuffer(audio_data, dtype=np.float32)
625
+ return audio_array
626
+ except Exception:
627
+ pass
628
+
629
+ # Last resort: return empty array
630
+ return np.array([], dtype=np.float32)
631
+
632
+ except Exception as e:
633
+ print(f"TranscriptionService: Audio conversion error: {e}")
634
+ return np.array([], dtype=np.float32)
635
+
636
+ def _audio_array_to_bytes(self, audio_array: np.ndarray) -> bytes:
637
+ """Convert numpy audio array back to WAV bytes for storage"""
638
+ try:
639
+ # Ensure float32 format
640
+ if audio_array.dtype != np.float32:
641
+ audio_array = audio_array.astype(np.float32)
642
+
643
+ # Convert to 16-bit PCM for WAV storage
644
+ audio_int16 = (audio_array * 32767).astype(np.int16)
645
+
646
+ # Create WAV bytes
647
+ wav_buffer = io.BytesIO()
648
+ with wave.open(wav_buffer, 'wb') as wav_file:
649
+ wav_file.setnchannels(1) # Mono
650
+ wav_file.setsampwidth(2) # 16-bit
651
+ wav_file.setframerate(16000) # 16kHz
652
+ wav_file.writeframes(audio_int16.tobytes())
653
+
654
+ return wav_buffer.getvalue()
655
+
656
+ except Exception as e:
657
+ print(f"Error converting audio array to bytes: {e}")
658
+ return b''
659
+
660
+ def clear_participant_buffers(self, participant_id: str):
661
+ """Clear all buffers for a participant (e.g., when they stop speaking or disconnect)"""
662
+ if participant_id in self.candidate_audio_buffers:
663
+ del self.candidate_audio_buffers[participant_id]
664
+ if participant_id in self.candidate_text_cache:
665
+ del self.candidate_text_cache[participant_id]
666
+ if participant_id in self.silence_counters:
667
+ del self.silence_counters[participant_id]
668
+ if participant_id in self.sentence_finalized:
669
+ del self.sentence_finalized[participant_id]
670
+
671
+ async def load_remaining_models_in_background(self):
672
+ """Load all remaining ASR models in the background after startup"""
673
+ try:
674
+ print("ASR: Starting background loading of additional language models...")
675
+ for lang_code in self.asr_config.keys():
676
+ if lang_code not in self.preload_languages and lang_code not in self.asr_pipelines:
677
+ try:
678
+ print(f"ASR: Background loading model for {lang_code}...")
679
+ self.models_loading_status[lang_code] = "loading"
680
+
681
+ model_config = self.asr_config[lang_code]
682
+ # Build pipeline parameters
683
+ pipeline_params = {
684
+ "task": "automatic-speech-recognition",
685
+ "model": model_config["model_repo"],
686
+ "device": self.device,
687
+ "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32
688
+ }
689
+
690
+ # Add trust_remote_code if specified
691
+ if model_config.get("trust_remote_code", False):
692
+ pipeline_params["trust_remote_code"] = True
693
+
694
+ pipeline_obj = pipeline(**pipeline_params)
695
+ self.asr_pipelines[lang_code] = pipeline_obj
696
+ self.models_loading_status[lang_code] = "loaded"
697
+ print(f"ASR: Successfully loaded model for {lang_code} in background")
698
+
699
+ # Add a small delay between loading models to prevent overwhelming the system
700
+ await asyncio.sleep(2)
701
+ except Exception as e:
702
+ print(f"ASR: Failed to load model for {lang_code} in background: {e}")
703
+ self.models_loading_status[lang_code] = "failed"
704
+
705
+ print("ASR: Background loading of all language models complete")
706
+ print(f"ASR: Loaded models: {list(self.asr_pipelines.keys())}")
707
+ except Exception as e:
708
+ print(f"ASR: Error in background model loading: {e}")
709
+
710
+ def start_background_loading(self):
711
+ """Start background loading of models as a non-blocking task"""
712
+ if self.background_loading_task is None:
713
+ self.background_loading_task = asyncio.create_task(self.load_remaining_models_in_background())
714
+ print("ASR: Background model loading task started")
715
+
716
+ async def cleanup(self):
717
+ """Cleanup resources"""
718
+ # Cancel background loading if still running
719
+ if self.background_loading_task and not self.background_loading_task.done():
720
+ self.background_loading_task.cancel()
721
+ try:
722
+ await self.background_loading_task
723
+ except asyncio.CancelledError:
724
+ pass
725
+
726
+ self.asr_pipelines.clear()
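Below is a minimal usage sketch for the streaming service defined above. The import path, the no-argument constructor, and the file name `utterance.wav` are assumptions; the method names come from the code in this diff.

```python
# Hedged sketch: drive TranscriptionService end to end (paths/names assumed).
import asyncio
from app.services.transcription_service import TranscriptionService

async def main():
    service = TranscriptionService()
    service.start_background_loading()      # non-blocking warm-up of remaining models
    with open("utterance.wav", "rb") as f:  # 16 kHz mono WAV avoids resampling
        audio = f.read()
    # ensure_model_loaded() is called internally before inference
    text = await service.transcribe_audio(audio, "swa")
    print(text)
    await service.cleanup()                 # also cancels background loading

asyncio.run(main())
```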
app/services/transcription_service_onnx.py ADDED
@@ -0,0 +1,682 @@
1
+ import asyncio
2
+ import io
3
+ import wave
4
+ import numpy as np
5
+ from typing import Dict, Optional, Callable
6
+ from collections import OrderedDict
7
+ import onnxruntime as ort
8
+ from transformers import AutoProcessor, WhisperProcessor
9
+ from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
10
+ import os
11
+ from app.models import LanguageCode
12
+
13
+ class ONNXTranscriptionService:
14
+ def __init__(self):
15
+ self.asr_models: Dict[str, any] = {}
16
+ self.processors: Dict[str, any] = {}
17
+ self.max_asr_models = 2 # Memory management - keep max 2 models loaded
18
+ self.model_cache = OrderedDict() # LRU cache for models
19
+
20
+ # GPU optimization - detect and configure providers
21
+ available_providers = ort.get_available_providers()
22
+ print(f"ONNX ASR: Available providers: {available_providers}")
23
+
24
+ if 'CUDAExecutionProvider' in available_providers:
25
+ # Configure CUDA provider with optimizations
26
+ cuda_provider_options = {
27
+ 'device_id': 0,
28
+ 'arena_extend_strategy': 'kNextPowerOfTwo',
29
+ 'gpu_mem_limit': int(0.8 * 1024 * 1024 * 1024), # 0.8 GB CUDA arena cap (not 80% of GPU memory)
30
+ 'cudnn_conv_algo_search': 'EXHAUSTIVE',
31
+ 'do_copy_in_default_stream': True,
32
+ 'enable_tracing': True, # Enable tracing for better diagnostics
33
+ }
34
+
35
+ # Include TensorRT if available, then CUDA, then CPU
36
+ provider_list = []
37
+ if 'TensorrtExecutionProvider' in available_providers:
38
+ provider_list.append('TensorrtExecutionProvider')
39
+ provider_list.append(('CUDAExecutionProvider', cuda_provider_options))
40
+ provider_list.append('CPUExecutionProvider')
41
+
42
+ self.providers = provider_list
43
+ print(f"ONNX ASR: Using GPU acceleration with providers: {[p[0] if isinstance(p, tuple) else p for p in provider_list]}")
44
+ print(f"ONNX ASR: GPU memory limit: {cuda_provider_options['gpu_mem_limit'] // (1024**3)}GB")
45
+ else:
46
+ self.providers = ['CPUExecutionProvider']
47
+ print("ONNX ASR: CUDA not available, using CPU execution")
48
+
49
+ print(f"ONNX ASR: Configured providers: {[p[0] if isinstance(p, tuple) else p for p in self.providers]}")
50
+
51
+ # ONNX Model configurations - using pre-converted ONNX models from HuggingFace
52
+ self.asr_config = {
53
+ "eng": {"model_repo": "mutisya/whisper-medium-en-onnx", "model_type": "whisper", "use_onnx": True}, # Pre-converted ONNX model
54
+ "swa": {"model_repo": "mutisya/w2v-bert-2.0-asr-onnx-swh-v25-37-1", "model_type": "wav2vec2-bert", "use_onnx": True},
55
+ "kik": {"model_repo": "mutisya/w2v-bert-2.0-asr-onnx-kik-v25-37-1", "model_type": "wav2vec2-bert", "use_onnx": True},
56
+ "kam": {"model_repo": "mutisya/w2v-bert-2.0-asr-onnx-kam-v25-37-1", "model_type": "wav2vec2-bert", "use_onnx": True},
57
+ "mer": {"model_repo": "mutisya/w2v-bert-2.0-asr-onnx-mer-v25-37-1", "model_type": "wav2vec2-bert", "use_onnx": True},
58
+ "luo": {"model_repo": "mutisya/w2v-bert-2.0-asr-onnx-luo-v25-37-1", "model_type": "wav2vec2-bert", "use_onnx": True},
59
+ "som": {"model_repo": "mutisya/w2v-bert-2.0-asr-onnx-som-v25-37-1", "model_type": "wav2vec2-bert", "use_onnx": True}
60
+ }
61
+
62
+ # Alternative model configurations for different performance tiers
63
+ self.alternative_models = {
64
+ "eng_small": {"model_repo": "mutisya/whisper-small-en-onnx", "model_type": "whisper", "use_onnx": True},
65
+ "eng_base": {"model_repo": "mutisya/whisper-base-en-onnx", "model_type": "whisper", "use_onnx": True},
66
+ "eng_medium": {"model_repo": "mutisya/whisper-medium-en-onnx", "model_type": "whisper", "use_onnx": True}
67
+ }
68
+
69
+ self.preload_languages = ["eng"]
70
+
71
+ # Current model performance mode (small, base, medium)
72
+ # Can be configured via environment variable WHISPER_MODEL_SIZE
73
+ self.performance_mode = os.getenv("WHISPER_MODEL_SIZE", "medium").lower()
74
+
75
+ # Enhanced audio buffering for VAD-based sentence detection
76
+ self.candidate_audio_buffers: Dict[str, bytes] = {}
77
+ self.candidate_text_cache: Dict[str, str] = {}
78
+ self.silence_counters: Dict[str, int] = {}
79
+ self.sentence_finalized: Dict[str, bool] = {}
80
+
81
+ # VAD parameters
82
+ self.silence_threshold = 2
83
+ self.min_sentence_length = 0.03
84
+
85
+ def set_performance_mode(self, mode: str):
86
+ """Set the performance mode for English models (small, base, medium)"""
87
+ if mode in ["small", "base", "medium"]:
88
+ self.performance_mode = mode
89
+ # Update the English model configuration based on performance mode
90
+ if f"eng_{mode}" in self.alternative_models:
91
+ self.asr_config["eng"] = self.alternative_models[f"eng_{mode}"]
92
+ # Clear cached English model to force reload with new configuration
93
+ if "eng" in self.model_cache:
94
+ del self.model_cache["eng"]
95
+ if "eng" in self.asr_models:
96
+ del self.asr_models["eng"]
97
+ if "eng" in self.processors:
98
+ del self.processors["eng"]
99
+ print(f"Performance mode set to {mode}. English model will be reloaded on next use.")
100
+ else:
101
+ print(f"Warning: No model configuration found for performance mode {mode}")
102
+ else:
103
+ print(f"Invalid performance mode: {mode}. Must be one of: small, base, medium")
104
+
105
+ async def initialize(self):
106
+ """Initialize ASR models for preloaded languages"""
107
+ print(f"ONNX ASR: Initializing with providers: {self.providers}")
108
+
109
+ # Apply performance mode to English model configuration
110
+ if self.performance_mode in ["small", "base", "medium"]:
111
+ if f"eng_{self.performance_mode}" in self.alternative_models:
112
+ self.asr_config["eng"] = self.alternative_models[f"eng_{self.performance_mode}"]
113
+ print(f"Using Whisper {self.performance_mode} model for English")
114
+ else:
115
+ print(f"Warning: Performance mode {self.performance_mode} not available, using default medium")
116
+
117
+ for lang_code in self.preload_languages:
118
+ if lang_code in self.asr_config:
119
+ try:
120
+ await self.ensure_model_loaded(lang_code)
121
+ except Exception as e:
122
+ print(f"Failed to load ASR model for {lang_code}: {e}")
123
+
124
+ async def ensure_model_loaded(self, language_code: str):
125
+ """Load ASR model for language if not already loaded with LRU cache"""
126
+ if language_code in self.model_cache:
127
+ # Move to end (most recently used)
128
+ self.model_cache.move_to_end(language_code)
129
+ return
130
+
131
+ if language_code not in self.asr_config:
132
+ raise ValueError(f"Language {language_code} not supported")
133
+
134
+ model_config = self.asr_config[language_code]
135
+
136
+ # Check if we need to evict old models
137
+ while len(self.model_cache) >= self.max_asr_models:
138
+ # Remove least recently used model
139
+ old_lang, _ = self.model_cache.popitem(last=False)
140
+ if old_lang in self.asr_models:
141
+ del self.asr_models[old_lang]
142
+ if old_lang in self.processors:
143
+ del self.processors[old_lang]
144
+ print(f"ONNX ASR: Evicted model for {old_lang} (LRU cache)")
145
+
146
+ try:
147
+ if model_config.get("use_onnx", False):
148
+ # Load ONNX model
149
+ print(f"ONNX ASR: Loading ONNX model for {language_code}")
150
+
151
+ # Special handling for Whisper models
152
+ if model_config.get("model_type") == "whisper":
153
+ print(f"ONNX ASR: Loading Whisper ONNX model from {model_config['model_repo']}")
154
+
155
+ # Get authentication token for private repos
156
+ import os
157
+ auth_token = os.getenv('HUGGING_FACE_HUB_TOKEN') or os.getenv('HF_TOKEN')
158
+
159
+ # Load pre-converted Whisper ONNX model using Optimum
160
+ load_kwargs = {
161
+ # export=False because we're using pre-converted models
162
+ "export": False,
163
+ # use_cache=True because our models now include past key value variants for optimization
164
+ "use_cache": True,
165
+ # Add authentication token for private repos
166
+ "token": auth_token
167
+ }
168
+
169
+ # Configure providers - pass all available providers to Optimum
170
+ provider_names = [p[0] if isinstance(p, tuple) else p for p in self.providers]
171
+ load_kwargs["providers"] = provider_names
172
+ print(f"ONNX ASR: Whisper using providers: {provider_names}")
173
+
174
+ # Add subfolder if specified (for models that store ONNX in subfolders)
175
+ if "subfolder" in model_config:
176
+ load_kwargs["subfolder"] = model_config["subfolder"]
177
+
178
+ model = ORTModelForSpeechSeq2Seq.from_pretrained(
179
+ model_config["model_repo"],
180
+ **load_kwargs
181
+ )
182
+
183
+ # Load Whisper processor with authentication token
184
+ processor = WhisperProcessor.from_pretrained(
185
+ model_config["model_repo"],
186
+ token=auth_token
187
+ )
188
+
189
+ # Configure for English transcription
190
+ model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
191
+ language="en",
192
+ task="transcribe"
193
+ )
194
+
195
+ self.asr_models[language_code] = model
196
+ self.processors[language_code] = processor
197
+
198
+ print(f"ONNX ASR: Successfully loaded Whisper ONNX model for {language_code}")
199
+
200
+ else:
201
+ # Original wav2vec2-bert model loading logic
202
+ # Create ONNX session with optimizations and verbose logging
203
+ session_options = ort.SessionOptions()
204
+ session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
205
+
206
+ # Enable verbose logging to diagnose operator assignments
207
+ session_options.log_severity_level = 1 # INFO level (0=VERBOSE, 1=INFO, 2=WARNING) for detailed logs
208
+ session_options.logid = "ONNX_ASR" # Prefix for log identification
209
+
210
+ # Use configured providers with optimizations
211
+ providers = self.providers
212
+ print(f"ONNX ASR: wav2vec2-bert using providers: {[p[0] if isinstance(p, tuple) else p for p in providers]}")
213
+
214
+ # Get authentication token for private repos
215
+ import os
216
+ auth_token = os.getenv('HUGGING_FACE_HUB_TOKEN') or os.getenv('HF_TOKEN')
217
+
218
+ # Download model files from HuggingFace Hub with authentication
219
+ from huggingface_hub import hf_hub_download
220
+ onnx_path = hf_hub_download(
221
+ repo_id=model_config["model_repo"],
222
+ filename="model.onnx",
223
+ token=auth_token
224
+ )
225
+
226
+ session = ort.InferenceSession(onnx_path, providers=providers, sess_options=session_options)
227
+
228
+ # Load processor for preprocessing with authentication
229
+ processor = AutoProcessor.from_pretrained(
230
+ model_config["model_repo"],
231
+ token=auth_token
232
+ )
233
+
234
+ self.asr_models[language_code] = session
235
+ self.processors[language_code] = processor
236
+
237
+ print(f"ONNX ASR: Successfully loaded ONNX model for {language_code}")
238
+
239
+ else:
240
+ # This service is ONNX-only - no PyTorch fallback
241
+ raise ValueError(f"Language {language_code} is not configured for ONNX models. Set 'use_onnx': True in config.")
242
+
243
+ # Add to cache
244
+ self.model_cache[language_code] = True
245
+
246
+ except Exception as e:
247
+ print(f"Failed to load ASR model for {language_code}: {e}")
248
+ raise
249
+
250
+ async def process_audio_chunk(self, audio_data: bytes, language_code: str, participant_id: str,
251
+ has_voice_activity: bool = True,
252
+ progress_callback: Optional[Callable] = None,
253
+ sentence_callback: Optional[Callable] = None) -> str:
254
+ """Process audio chunk with VAD-based sentence detection using ONNX models"""
255
+ try:
256
+ # Initialize buffers if needed
257
+ if participant_id not in self.candidate_audio_buffers:
258
+ self.candidate_audio_buffers[participant_id] = b''
259
+ self.candidate_text_cache[participant_id] = ""
260
+ self.silence_counters[participant_id] = 0
261
+ self.sentence_finalized[participant_id] = False
262
+
263
+ # Convert current chunk to numpy array for processing
264
+ current_chunk_array = self._bytes_to_audio_array(audio_data)
265
+ if len(current_chunk_array) == 0:
266
+ return self.candidate_text_cache.get(participant_id, "")
267
+
268
+ # Normalize the audio chunk
269
+ current_chunk_array = current_chunk_array.astype(np.float32)
270
+ if np.max(np.abs(current_chunk_array)) > 0:
271
+ current_chunk_array /= np.max(np.abs(current_chunk_array))
272
+
273
+ # Get existing accumulated audio array
274
+ existing_buffer = self.candidate_audio_buffers[participant_id]
275
+ if len(existing_buffer) > 0:
276
+ existing_array = self._bytes_to_audio_array(existing_buffer)
277
+ if len(existing_array) > 0:
278
+ combined_array = np.concatenate([existing_array, current_chunk_array])
279
+ else:
280
+ combined_array = current_chunk_array
281
+ else:
282
+ combined_array = current_chunk_array
283
+
284
+ # Convert back to bytes for storage
285
+ combined_bytes = self._audio_array_to_bytes(combined_array)
286
+ self.candidate_audio_buffers[participant_id] = combined_bytes
287
+
288
+ # Update silence counter based on voice activity
289
+ if not has_voice_activity:
290
+ self.silence_counters[participant_id] += 1
291
+ else:
292
+ self.silence_counters[participant_id] = 0
293
+
294
+ # Check if we should finalize sentence due to prolonged silence
295
+ should_finalize = (self.silence_counters[participant_id] >= self.silence_threshold and
296
+ len(combined_array) > 0 and
297
+ not self.sentence_finalized[participant_id])
298
+
299
+ if should_finalize:
300
+ return await self._finalize_candidate_sentence(
301
+ language_code, participant_id, sentence_callback
302
+ )
303
+
304
+ # Always run transcription on the accumulated audio
305
+ audio_duration_sec = len(combined_array) / 16000.0 # 16kHz sample rate
306
+
307
+ if audio_duration_sec < 0.1: # Very short minimum
308
+ if progress_callback:
309
+ cached_text = self.candidate_text_cache.get(participant_id, "")
310
+ await progress_callback(cached_text, False)
311
+ return self.candidate_text_cache.get(participant_id, "")
312
+
313
+ # Force finalization if buffer gets too long
314
+ if audio_duration_sec > 15.0 and not self.sentence_finalized[participant_id]:
315
+ return await self._finalize_candidate_sentence(
316
+ language_code, participant_id, sentence_callback
317
+ )
318
+
319
+ # Run voice activity detection on the accumulated audio before transcription
320
+ has_voice_in_buffer = self.has_meaningful_voice_activity(combined_bytes)
321
+
322
+ if not has_voice_in_buffer:
323
+ if progress_callback:
324
+ cached_text = self.candidate_text_cache.get(participant_id, "")
325
+ await progress_callback(cached_text, False)
326
+ return self.candidate_text_cache.get(participant_id, "")
327
+
328
+ # Run transcription
329
+ await self.ensure_model_loaded(language_code)
330
+
331
+ # Double-check voice activity before running expensive ASR
332
+ has_voice_for_asr = self.has_voice_activity(combined_bytes)
333
+ if not has_voice_for_asr:
334
+ print(f"ONNX ASR: No voice activity detected, skipping ASR execution for {participant_id}")
335
+ if progress_callback:
336
+ cached_text = self.candidate_text_cache.get(participant_id, "")
337
+ await progress_callback(cached_text, False)
338
+ return self.candidate_text_cache.get(participant_id, "")
339
+
340
+ if language_code not in self.asr_models:
341
+ raise ValueError(f"ASR model not available for language: {language_code}")
342
+
343
+ print(f"ONNX ASR: Running transcription for {participant_id} with {audio_duration_sec:.2f}s of audio")
344
+
345
+ # Run ONNX inference (this service is ONNX-only)
346
+ model_config = self.asr_config[language_code]
347
+ if not model_config.get("use_onnx", False):
348
+ raise ValueError(f"Language {language_code} is not configured for ONNX. This service only supports ONNX models.")
349
+
350
+ # ONNX inference
351
+ text = await self._run_onnx_inference(combined_array, language_code)
352
+
353
+ # Filter out common ASR artifacts
354
+ artifacts = [
355
+ "thank you", "thanks", "bye", ".", ",", "?", "!",
356
+ "um", "uh", "ah", "hmm", "mm", "mhm",
357
+ "you", "the", "a", "an", "and", "but", "or",
358
+ "music", "laughter", "applause", "[music]", "[laughter]",
359
+ ]
360
+
361
+ # Check if the result is likely an artifact
362
+ is_artifact = (
363
+ len(text) < 3 or
364
+ text.lower() in artifacts or
365
+ (len(text.split()) == 1 and len(text) < 6)
366
+ )
367
+
368
+ if is_artifact:
369
+ text = self.candidate_text_cache.get(participant_id, "")
370
+
371
+ # Cache the current candidate text
372
+ self.candidate_text_cache[participant_id] = text
373
+
374
+ # Force completion if we have reasonable text and some silence
375
+ word_count = len(text.split()) if text else 0
376
+ if (word_count >= 3 and self.silence_counters[participant_id] >= 2 and
377
+ not self.sentence_finalized[participant_id]):
378
+ return await self._finalize_candidate_sentence(
379
+ language_code, participant_id, sentence_callback
380
+ )
381
+
382
+ # Always send progress update
383
+ if progress_callback:
384
+ await progress_callback(text, False)
385
+
386
+ return text
387
+
388
+ except Exception as e:
389
+ print(f"ONNX TranscriptionService: Error processing audio chunk: {e}")
390
+ import traceback
391
+ traceback.print_exc()
392
+ if progress_callback:
393
+ cached_text = self.candidate_text_cache.get(participant_id, "")
394
+ await progress_callback(cached_text, False)
395
+ return self.candidate_text_cache.get(participant_id, "")
396
+
397
+ async def _run_onnx_inference(self, audio_array: np.ndarray, language_code: str) -> str:
398
+ """Run ONNX inference for speech recognition"""
399
+ try:
400
+ model = self.asr_models[language_code]
401
+ processor = self.processors[language_code]
402
+ model_config = self.asr_config[language_code]
403
+
404
+ # Check if this is a Whisper model
405
+ if model_config.get("model_type") == "whisper":
406
+ # Whisper-specific processing using Optimum
407
+ import torch
408
+
409
+ # Process audio input for Whisper
410
+ inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt")
411
+
412
+ # Generate transcription using the ORTModelForSpeechSeq2Seq
413
+ predicted_ids = model.generate(inputs.input_features, max_length=448)
414
+
415
+ # Decode the generated IDs
416
+ transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
417
+
418
+ return transcription[0].strip() if transcription else ""
419
+ else:
420
+ # Original wav2vec2-bert processing
421
+ session = model
422
+
423
+ # Preprocess audio
424
+ inputs = processor(audio_array, sampling_rate=16000, return_tensors="np")
425
+
426
+ # Get input names for ONNX session
427
+ input_names = [inp.name for inp in session.get_inputs()]
428
+
429
+ # Prepare inputs for ONNX
430
+ onnx_inputs = {}
431
+ for name in input_names:
432
+ if name in inputs:
433
+ onnx_inputs[name] = inputs[name]
434
+ elif name == "input_values" and "input_features" in inputs:
435
+ onnx_inputs[name] = inputs["input_features"]
436
+ elif name == "attention_mask" and "attention_mask" in inputs:
437
+ onnx_inputs[name] = inputs["attention_mask"]
438
+
439
+ # Run ONNX inference
440
+ outputs = session.run(None, onnx_inputs)
441
+
442
+ # Post-process outputs (assuming CTC decoding)
443
+ logits = outputs[0] # First output should be logits
444
+
445
+ # Simple greedy CTC decoding
446
+ predicted_ids = np.argmax(logits, axis=-1)
447
+
448
+ # Decode using processor
449
+ text = processor.batch_decode(predicted_ids)[0]
450
+
451
+ return text.strip()
452
+
453
+ except Exception as e:
454
+ print(f"ONNX ASR: Inference error: {e}")
455
+ import traceback
456
+ traceback.print_exc()
457
+ return ""
458
+
459
+ async def _finalize_candidate_sentence(self, language_code: str, participant_id: str,
460
+ sentence_callback: Optional[Callable] = None) -> str:
461
+ """Finalize the current candidate sentence and clear buffers"""
462
+ try:
463
+ if self.sentence_finalized.get(participant_id, False):
464
+ print(f"Sentence for participant {participant_id} already finalized, skipping duplicate")
465
+ return self.candidate_text_cache.get(participant_id, "")
466
+
467
+ final_text = self.candidate_text_cache.get(participant_id, "")
468
+ final_audio_bytes = self.candidate_audio_buffers.get(participant_id, b'')
469
+
470
+ if final_text and len(final_text.strip()) > 0:
471
+ self.sentence_finalized[participant_id] = True
472
+
473
+ if sentence_callback and len(final_audio_bytes) > 0:
474
+ print(f"Finalizing sentence for participant {participant_id}: '{final_text}'")
475
+ await sentence_callback(final_text, final_audio_bytes)
476
+
477
+ # Clear buffers for next sentence
478
+ self.candidate_audio_buffers[participant_id] = b''
479
+ self.candidate_text_cache[participant_id] = ""
480
+ self.silence_counters[participant_id] = 0
481
+ self.sentence_finalized[participant_id] = False
482
+
483
+ return final_text
484
+
485
+ except Exception as e:
486
+ print(f"Error finalizing sentence: {e}")
487
+ import traceback
488
+ traceback.print_exc()
489
+ self.sentence_finalized[participant_id] = False
490
+ return ""
491
+
492
+ def has_voice_activity(self, audio_data: bytes, threshold: float = 0.0005) -> bool:
493
+ """Enhanced VAD based on audio analysis"""
494
+ try:
495
+ audio_array = self._bytes_to_audio_array(audio_data)
496
+ if len(audio_array) == 0:
497
+ return False
498
+
499
+ # Normalize audio
500
+ audio_array = audio_array.astype(np.float32)
501
+ if np.max(np.abs(audio_array)) > 0:
502
+ audio_array /= np.max(np.abs(audio_array))
503
+
504
+ # Calculate multiple features for better VAD
505
+ rms = np.sqrt(np.mean(audio_array ** 2))
506
+ peak = np.max(np.abs(audio_array))
507
+ audio_std = np.std(audio_array)
508
+ zero_crossing_rate = np.sum(np.diff(np.sign(audio_array)) != 0) / len(audio_array)
509
+
510
+ # Voice activity detection
511
+ has_voice_rms = rms > threshold
512
+ has_voice_peak = peak > threshold * 3
513
+ has_voice_variation = audio_std > threshold * 0.8
514
+ has_voice_zcr = zero_crossing_rate > 0.008
515
+
516
+ has_voice = has_voice_rms or (has_voice_peak and has_voice_variation) or has_voice_zcr
517
+
518
+ return has_voice
519
+
520
+ except Exception as e:
521
+ print(f"Error in VAD: {e}")
522
+ return True
523
+
524
+ def has_meaningful_voice_activity(self, audio_data: bytes, threshold: float = 0.002) -> bool:
525
+ """Stricter VAD check for pre-transcription filtering"""
526
+ try:
527
+ audio_array = self._bytes_to_audio_array(audio_data)
528
+ if len(audio_array) == 0:
529
+ return False
530
+
531
+ # Normalize audio
532
+ audio_array = audio_array.astype(np.float32)
533
+ if np.max(np.abs(audio_array)) > 0:
534
+ audio_array /= np.max(np.abs(audio_array))
535
+
536
+ # Calculate features with higher thresholds
537
+ rms = np.sqrt(np.mean(audio_array ** 2))
538
+ peak = np.max(np.abs(audio_array))
539
+ audio_std = np.std(audio_array)
540
+ zero_crossing_rate = np.sum(np.diff(np.sign(audio_array)) != 0) / len(audio_array)
541
+
542
+ # Higher thresholds for meaningful speech detection
543
+ has_meaningful_voice = (
544
+ rms > threshold and
545
+ peak > threshold * 2 and
546
+ audio_std > threshold * 0.5 and
547
+ zero_crossing_rate > 0.015
548
+ )
549
+
550
+ return has_meaningful_voice
551
+
552
+ except Exception as e:
553
+ print(f"Error in meaningful VAD: {e}")
554
+ return False
555
+
556
+ async def force_complete_sentence(self, participant_id: str, language_code: str, sentence_callback: Optional[Callable] = None) -> str:
557
+ """Force complete any pending sentence for a participant"""
558
+ try:
559
+ if self.sentence_finalized.get(participant_id, False):
560
+ print(f"Force completion: Sentence for participant {participant_id} already finalized")
561
+ return ""
562
+
563
+ if participant_id in self.candidate_text_cache:
564
+ cached_text = self.candidate_text_cache[participant_id]
565
+
566
+ if cached_text and len(cached_text.strip()) > 0:
567
+ result = await self._finalize_candidate_sentence(language_code, participant_id, sentence_callback)
568
+ return result
569
+
570
+ return ""
571
+
572
+ except Exception as e:
573
+ print(f"Error in force_complete_sentence: {e}")
574
+ import traceback
575
+ traceback.print_exc()
576
+ return ""
577
+
578
+ async def transcribe_audio(self, audio_data: bytes, language_code: str, callback: Optional[Callable] = None) -> str:
579
+ """Transcribe audio data to text using ONNX models"""
580
+ try:
581
+ # Check for voice activity before running ASR
582
+ has_voice = self.has_voice_activity(audio_data)
583
+ if not has_voice:
584
+ print(f"ONNX ASR: No voice activity detected, skipping transcription")
585
+ return ""
586
+
587
+ await self.ensure_model_loaded(language_code)
588
+
589
+ if language_code not in self.asr_models:
590
+ raise ValueError(f"ASR model not available for language: {language_code}")
591
+
592
+ # Convert audio bytes to numpy array
593
+ audio_array = self._bytes_to_audio_array(audio_data)
594
+
595
+ print(f"ONNX ASR: Running transcription with {len(audio_array)/16000:.2f}s of audio")
596
+
597
+ # Run ONNX inference (this service is ONNX-only)
598
+ model_config = self.asr_config[language_code]
599
+ if not model_config.get("use_onnx", False):
600
+ raise ValueError(f"Language {language_code} is not configured for ONNX. This service only supports ONNX models.")
601
+
602
+ # ONNX inference
603
+ text = await self._run_onnx_inference(audio_array, language_code)
604
+
605
+ if callback:
606
+ await callback(text)
607
+
608
+ return text
609
+
610
+ except Exception as e:
611
+ print(f"ONNX TranscriptionService: Transcription error: {e}")
612
+ import traceback
613
+ traceback.print_exc()
614
+ return ""
615
+
616
+ def _bytes_to_audio_array(self, audio_data: bytes) -> np.ndarray:
617
+ """Convert audio bytes to numpy array"""
618
+ try:
619
+ # Try to decode as WAV
620
+ try:
621
+ audio_io = io.BytesIO(audio_data)
622
+ with wave.open(audio_io, 'rb') as wav_file:
623
+ frames = wav_file.readframes(-1)
624
+ audio_array = np.frombuffer(frames, dtype=np.int16)
625
+ # Convert to float32 and normalize
626
+ audio_array = audio_array.astype(np.float32) / 32768.0
627
+ return audio_array
628
+ except Exception:
629
+ pass
630
+
631
+ # Fallback: assume raw float32 audio data
632
+ try:
633
+ audio_array = np.frombuffer(audio_data, dtype=np.float32)
634
+ return audio_array
635
+ except Exception:
636
+ pass
637
+
638
+ return np.array([], dtype=np.float32)
639
+
640
+ except Exception as e:
641
+ print(f"ONNX TranscriptionService: Audio conversion error: {e}")
642
+ return np.array([], dtype=np.float32)
643
+
644
+ def _audio_array_to_bytes(self, audio_array: np.ndarray) -> bytes:
645
+ """Convert numpy audio array back to WAV bytes for storage"""
646
+ try:
647
+ if audio_array.dtype != np.float32:
648
+ audio_array = audio_array.astype(np.float32)
649
+
650
+ # Convert to 16-bit PCM for WAV storage
651
+ audio_int16 = (audio_array * 32767).astype(np.int16)
652
+
653
+ # Create WAV bytes
654
+ wav_buffer = io.BytesIO()
655
+ with wave.open(wav_buffer, 'wb') as wav_file:
656
+ wav_file.setnchannels(1) # Mono
657
+ wav_file.setsampwidth(2) # 16-bit
658
+ wav_file.setframerate(16000) # 16kHz
659
+ wav_file.writeframes(audio_int16.tobytes())
660
+
661
+ return wav_buffer.getvalue()
662
+
663
+ except Exception as e:
664
+ print(f"Error converting audio array to bytes: {e}")
665
+ return b''
666
+
667
+ def clear_participant_buffers(self, participant_id: str):
668
+ """Clear all buffers for a participant"""
669
+ if participant_id in self.candidate_audio_buffers:
670
+ del self.candidate_audio_buffers[participant_id]
671
+ if participant_id in self.candidate_text_cache:
672
+ del self.candidate_text_cache[participant_id]
673
+ if participant_id in self.silence_counters:
674
+ del self.silence_counters[participant_id]
675
+ if participant_id in self.sentence_finalized:
676
+ del self.sentence_finalized[participant_id]
677
+
678
+ async def cleanup(self):
679
+ """Cleanup resources"""
680
+ self.asr_models.clear()
681
+ self.processors.clear()
682
+ self.model_cache.clear()
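A short sketch of how the English model tier can be selected for this service; `WHISPER_MODEL_SIZE` and `set_performance_mode` come from the code above, while the import path is an assumption.

```python
# Hedged sketch: pick the Whisper tier before and after construction.
import os

os.environ["WHISPER_MODEL_SIZE"] = "base"   # read in __init__: small | base | medium

from app.services.transcription_service_onnx import ONNXTranscriptionService

service = ONNXTranscriptionService()
service.set_performance_mode("small")       # runtime switch; English model reloads on next use
```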
app/services/transcription_service_onnx_optimized.py ADDED
@@ -0,0 +1,251 @@
1
+ import asyncio
2
+ import io
3
+ import wave
4
+ import numpy as np
5
+ from typing import Dict, Optional, Callable
6
+ from collections import OrderedDict
7
+ import onnxruntime as ort
8
+ from transformers import AutoProcessor, WhisperProcessor
9
+ from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
10
+ import os
11
+ from app.models import LanguageCode
12
+
13
+ class OptimizedONNXTranscriptionService:
14
+ """
15
+ Optimized ONNX Transcription Service that uses pre-converted ONNX models
16
+ instead of performing runtime conversion from PyTorch models.
17
+
18
+ Benefits:
19
+ - Faster container startup (no conversion time)
20
+ - Reduced memory usage during initialization
21
+ - More predictable deployment times
22
+ - Better resource utilization in production
23
+ """
24
+
25
+ def __init__(self):
26
+ self.asr_models: Dict[str, any] = {}
27
+ self.processors: Dict[str, any] = {}
28
+ self.max_asr_models = 2 # Memory management - keep max 2 models loaded
29
+ self.model_cache = OrderedDict() # LRU cache for models
30
+
31
+ # GPU optimization
32
+ self.providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if 'CUDAExecutionProvider' in ort.get_available_providers() else ['CPUExecutionProvider']
33
+
34
+ # OPTIMIZED ONNX Model configurations - using pre-converted models
35
+ self.asr_config = {
36
+ # English: Use pre-converted ONNX model (no runtime conversion!)
37
+ "eng": {
38
+ "model_repo": "mutisya/whisper-medium-en-onnx", # Pre-converted ONNX model
39
+ "model_type": "whisper",
40
+ "use_onnx": True,
41
+ "export": False # ⭐ KEY CHANGE: No runtime export needed!
42
+ },
43
+
44
+ # African languages: Already using ONNX models
45
+ "swa": {"model_repo": "mutisya/w2v-bert-2.0-asr-onnx-swh-v25-37-1", "model_type": "wav2vec2-bert", "use_onnx": True},
46
+ "kik": {"model_repo": "mutisya/w2v-bert-2.0-asr-onnx-kik-v25-37-1", "model_type": "wav2vec2-bert", "use_onnx": True},
47
+ "kam": {"model_repo": "mutisya/w2v-bert-2.0-asr-onnx-kam-v25-37-1", "model_type": "wav2vec2-bert", "use_onnx": True},
48
+ "mer": {"model_repo": "mutisya/w2v-bert-2.0-asr-onnx-mer-v25-37-1", "model_type": "wav2vec2-bert", "use_onnx": True},
49
+ "luo": {"model_repo": "mutisya/w2v-bert-2.0-asr-onnx-luo-v25-37-1", "model_type": "wav2vec2-bert", "use_onnx": True},
50
+ "som": {"model_repo": "mutisya/w2v-bert-2.0-asr-onnx-som-v25-37-1", "model_type": "wav2vec2-bert", "use_onnx": True}
51
+ }
52
+
53
+ self.preload_languages = ["eng"]
54
+
55
+ # Enhanced audio buffering for VAD-based sentence detection
56
+ self.candidate_audio_buffers: Dict[str, bytes] = {}
57
+ self.candidate_text_cache: Dict[str, str] = {}
58
+ self.silence_counters: Dict[str, int] = {}
59
+ self.sentence_finalized: Dict[str, bool] = {}
60
+
61
+ # VAD parameters
62
+ self.silence_threshold = 2
63
+ self.min_sentence_length = 0.03
64
+
65
+ async def initialize(self):
66
+ """Initialize ASR models for preloaded languages"""
67
+ print(f"🚀 Optimized ONNX ASR: Initializing with providers: {self.providers}")
68
+ print(f"📈 Performance Improvement: Using pre-converted ONNX models (no runtime conversion)")
69
+
70
+ for lang_code in self.preload_languages:
71
+ if lang_code in self.asr_config:
72
+ try:
73
+ start_time = asyncio.get_event_loop().time()
74
+ await self.ensure_model_loaded(lang_code)
75
+ end_time = asyncio.get_event_loop().time()
76
+ print(f"⚡ Model loading time for {lang_code}: {end_time - start_time:.2f}s")
77
+ except Exception as e:
78
+ print(f"❌ Failed to load ASR model for {lang_code}: {e}")
79
+
80
+ async def ensure_model_loaded(self, language_code: str):
81
+ """Load ASR model for language if not already loaded with LRU cache"""
82
+ if language_code in self.model_cache:
83
+ # Move to end (most recently used)
84
+ self.model_cache.move_to_end(language_code)
85
+ return
86
+
87
+ if language_code not in self.asr_config:
88
+ raise ValueError(f"Language {language_code} not supported")
89
+
90
+ model_config = self.asr_config[language_code]
91
+
92
+ # Check if we need to evict old models
93
+ while len(self.model_cache) >= self.max_asr_models:
94
+ # Remove least recently used model
95
+ old_lang, _ = self.model_cache.popitem(last=False)
96
+ if old_lang in self.asr_models:
97
+ del self.asr_models[old_lang]
98
+ if old_lang in self.processors:
99
+ del self.processors[old_lang]
100
+ print(f"🗑️ ONNX ASR: Evicted model for {old_lang} (LRU cache)")
101
+
102
+ try:
103
+ if model_config.get("use_onnx", False):
104
+ # Load ONNX model
105
+ print(f"📥 ONNX ASR: Loading ONNX model for {language_code}")
106
+
107
+ # Special handling for Whisper models
108
+ if model_config.get("model_type") == "whisper":
109
+ print(f"🎙️ ONNX ASR: Loading pre-converted Whisper ONNX model from {model_config['model_repo']}")
110
+
111
+ # Load pre-converted Whisper ONNX model using Optimum
112
+ load_kwargs = {
113
+ # Note: No 'export' parameter needed since model is already in ONNX format
114
+ # This is the key optimization - no runtime conversion!
115
+ }
116
+
117
+ # Add subfolder if specified (for models that store ONNX in subfolders)
118
+ if "subfolder" in model_config:
119
+ load_kwargs["subfolder"] = model_config["subfolder"]
120
+
121
+ # ⭐ KEY OPTIMIZATION: No export flag needed for pre-converted models
122
+ # The old code had: if model_config.get("export", False): load_kwargs["export"] = True
123
+ # Now we skip this entirely since the model is already in ONNX format
124
+
125
+ model = ORTModelForSpeechSeq2Seq.from_pretrained(
126
+ model_config["model_repo"],
127
+ **load_kwargs
128
+ )
129
+
130
+ # Load Whisper processor
131
+ processor = WhisperProcessor.from_pretrained(model_config["model_repo"])
132
+
133
+ # Configure for English transcription
134
+ model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
135
+ language="en",
136
+ task="transcribe"
137
+ )
138
+
139
+ self.asr_models[language_code] = model
140
+ self.processors[language_code] = processor
141
+
142
+ print(f"✅ ONNX ASR: Successfully loaded pre-converted Whisper ONNX model for {language_code}")
143
+
144
+ else:
145
+ # Original wav2vec2-bert model loading logic (unchanged)
146
+ # Create ONNX session with optimizations
147
+ session_options = ort.SessionOptions()
148
+ session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
149
+
150
+ # Enable parallel execution
151
+ session_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
152
+
153
+ model_path = model_config["model_repo"]
154
+
155
+ try:
156
+ # Try to load from HuggingFace directly
157
+ from huggingface_hub import hf_hub_download
158
+ model_file = hf_hub_download(repo_id=model_path, filename="model.onnx")
159
+
160
+ # Create ONNX Runtime session
161
+ session = ort.InferenceSession(
162
+ model_file,
163
+ session_options,
164
+ providers=self.providers
165
+ )
166
+
167
+ # Load processor/tokenizer
168
+ processor = AutoProcessor.from_pretrained(model_path)
169
+
170
+ self.asr_models[language_code] = session
171
+ self.processors[language_code] = processor
172
+
173
+ print(f"✅ ONNX ASR: Successfully loaded {model_config['model_type']} ONNX model for {language_code}")
174
+
175
+ except Exception as e:
176
+ print(f"❌ Error loading ONNX model {model_path}: {e}")
177
+ raise
178
+
179
+ else:
180
+ raise ValueError(f"Non-ONNX models not supported in optimized service")
181
+
182
+ # Add to cache
183
+ self.model_cache[language_code] = True
184
+
185
+ except Exception as e:
186
+ print(f"❌ Error loading model for {language_code}: {e}")
187
+ raise
188
+
189
+ # Rest of the methods remain the same as the original transcription service
190
+ # (transcribe_audio, process_audio_chunk, etc.)
191
+ # ... [Include all other methods from the original service]
192
+
193
+ async def transcribe_audio(self, participant_id: str, audio_data: bytes, language_code: str = "eng") -> Optional[str]:
194
+ """Transcribe audio using ONNX models"""
195
+ try:
196
+ await self.ensure_model_loaded(language_code)
197
+
198
+ if language_code not in self.asr_models or language_code not in self.processors:
199
+ raise ValueError(f"Model not loaded for language: {language_code}")
200
+
201
+ model = self.asr_models[language_code]
202
+ processor = self.processors[language_code]
203
+
204
+ # Convert audio bytes to numpy array
205
+ audio_io = io.BytesIO(audio_data)
206
+ with wave.open(audio_io, 'rb') as wav_file:
207
+ frames = wav_file.readframes(-1)
208
+ sample_rate = wav_file.getframerate()
209
+ audio_np = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
210
+
211
+ # Get model configuration
212
+ model_config = self.asr_config[language_code]
213
+
214
+ if model_config.get("model_type") == "whisper":
215
+ # Process with Whisper ONNX model
+ import torch  # needed for no_grad() below; torch is not imported at module level in this file
216
+ inputs = processor(audio_np, sampling_rate=sample_rate, return_tensors="pt")
217
+
218
+ with torch.no_grad():
219
+ predicted_ids = model.generate(**inputs)
220
+ transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
221
+
222
+ return transcription.strip()
223
+
224
+ else:
225
+ # Process with wav2vec2-bert ONNX model
226
+ inputs = processor(audio_np, sampling_rate=sample_rate, return_tensors="np")
227
+
228
+ # Run ONNX inference
229
+ # w2v-bert processors emit "input_features"; older wav2vec2 processors emit "input_values"
+ input_array = inputs.get("input_features", inputs.get("input_values"))
+ ort_inputs = {model.get_inputs()[0].name: input_array}
230
+ ort_outputs = model.run(None, ort_inputs)
231
+
232
+ # Decode results
233
+ predicted_ids = np.argmax(ort_outputs[0], axis=-1)
234
+ transcription = processor.decode(predicted_ids[0])
235
+
236
+ return transcription.strip()
237
+
238
+ except Exception as e:
239
+ print(f"❌ Transcription error for {participant_id}: {e}")
240
+ return None
241
+
242
+ def get_performance_stats(self) -> Dict[str, any]:
243
+ """Get performance statistics for monitoring"""
244
+ return {
245
+ "loaded_models": list(self.model_cache.keys()),
246
+ "cache_size": len(self.model_cache),
247
+ "max_cache_size": self.max_asr_models,
248
+ "providers": self.providers,
249
+ "optimization_enabled": True,
250
+ "runtime_conversion": False # Key metric: no runtime conversion
251
+ }
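For monitoring, the stats dictionary above can be surfaced directly; wiring it into the `/health` endpoint is an assumption, not something this diff does.

```python
# Hedged sketch: read the optimized service's cache/provider state.
from app.services.transcription_service_onnx_optimized import OptimizedONNXTranscriptionService

service = OptimizedONNXTranscriptionService()
stats = service.get_performance_stats()
print(f"loaded: {stats['loaded_models']} ({stats['cache_size']}/{stats['max_cache_size']})")
print(f"providers: {stats['providers']}, runtime conversion: {stats['runtime_conversion']}")
```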
app/services/translation_service.py ADDED
@@ -0,0 +1,151 @@
1
+ import asyncio
2
+ from typing import Dict, Optional
3
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
4
+ import torch
5
+ import nltk
6
+ from app.models import LanguageCode
7
+ from app.services.quantization_utils import apply_dynamic_int8_quantization, get_quantization_stats
8
+
9
+ # FLORES-200 language codes mapping
10
+ FLORES_CODES = {
11
+ "English": "eng_Latn",
12
+ "eng": "eng_Latn",
13
+ "Swahili": "swh_Latn",
14
+ "swa": "swh_Latn",
15
+ "Kikuyu": "kik_Latn",
16
+ "kik": "kik_Latn",
17
+ "Kamba": "kam_Latn",
18
+ "kam": "kam_Latn",
19
+ "Kimeru": "mer_Latn",
20
+ "mer": "mer_Latn",
21
+ "Luo": "luo_Latn",
22
+ "luo": "luo_Latn",
23
+ "Somali": "som_Latn",
24
+ "som": "som_Latn",
25
+ }
26
+
27
+ class TranslationService:
28
+ def __init__(self, enable_quantization: bool = True):
29
+ self.translation_pipeline = None
30
+ self.device = 0 if torch.cuda.is_available() else -1
31
+ self.model_path = "mutisya/nllb_600m-en-kik-kam-luo-mer-som-swh-drL-24_5-filtered-v24_28_4"
32
+ self.enable_quantization = enable_quantization
33
+
34
+ async def initialize(self):
35
+ """Initialize translation model"""
36
+ try:
37
+ # Download NLTK data with better error handling
38
+ try:
39
+ nltk.download("punkt", quiet=True)
40
+ nltk.download('punkt_tab', quiet=True)
41
+ except Exception as nltk_error:
42
+ print(f"Warning: NLTK data download failed: {nltk_error}")
43
+ # Continue anyway, sentence tokenization might still work
44
+
45
+ # Load translation model with explicit model kwargs for newer transformers
46
+ print(f"Loading translation model: {self.model_path}")
47
+ model = AutoModelForSeq2SeqLM.from_pretrained(
48
+ self.model_path,
49
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
50
+ )
51
+ tokenizer = AutoTokenizer.from_pretrained(self.model_path)
52
+
53
+ # Apply quantization if enabled
54
+ if self.enable_quantization:
55
+ try:
56
+ print("Applying INT8 quantization to translation model...")
57
+ model = apply_dynamic_int8_quantization(model, "translation")
58
+ stats = get_quantization_stats(model)
59
+ print(f"✓ Translation model quantized: {stats['quantized_layers']}/{stats['total_layers']} layers, {stats['size_mb']:.2f} MB")
60
+ except Exception as e:
61
+ print(f"Warning: Could not quantize translation model: {e}")
62
+ print(f"Continuing with unquantized model")
63
+
64
+ self.translation_pipeline = pipeline(
65
+ 'translation',
66
+ model=model,
67
+ tokenizer=tokenizer,
68
+ device=self.device,
69
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
70
+ )
71
+
72
+ except Exception as e:
73
+ print(f"Failed to initialize translation service: {e}")
74
+ raise
75
+
76
+ async def translate_text(self, text: str, source_lang: str, target_lang: str) -> str:
77
+ """Translate text from source language to target language"""
78
+ print(f"=== TRANSLATION REQUEST ===")
79
+ print(f"Text: '{text}'")
80
+ print(f"Source: {source_lang}")
81
+ print(f"Target: {target_lang}")
82
+
83
+ if not self.translation_pipeline:
84
+ print("TRANSLATION ERROR: Translation service not initialized")
85
+ raise RuntimeError("Translation service not initialized")
86
+
87
+ if not text or not text.strip():
88
+ print("TRANSLATION ERROR: Empty text provided")
89
+ return ""
90
+
91
+ try:
92
+ # Get FLORES codes
93
+ src_code = FLORES_CODES.get(source_lang, "eng_Latn")
94
+ tgt_code = FLORES_CODES.get(target_lang, "eng_Latn")
95
+
96
+ print(f"FLORES codes: {source_lang} -> {src_code}, {target_lang} -> {tgt_code}")
97
+
98
+ # Skip translation if same language
99
+ if src_code == tgt_code:
100
+ print("TRANSLATION SKIPPED: Same source and target language")
101
+ return text
102
+
103
+ # Tokenize into sentences for better translation
104
+ sentences = nltk.sent_tokenize(text)
105
+ translated_sentences = []
106
+
107
+ print(f"Translating {len(sentences)} sentences...")
108
+
109
+ for i, sentence in enumerate(sentences):
110
+ if sentence.strip():
111
+ print(f"Translating sentence {i+1}: '{sentence}'")
112
+
113
+ result = self.translation_pipeline(
114
+ sentence,
115
+ src_lang=src_code,
116
+ tgt_lang=tgt_code
117
+ )
118
+
119
+ translated = result[0]['translation_text']
120
+ print(f"Translation result: '{translated}'")
121
+
122
+ # Preserve punctuation and capitalization
123
+ if sentence.strip().endswith(".") and not translated.strip().endswith("."):
124
+ translated += "."
125
+
126
+ if sentence.strip()[0].isupper() and translated.strip():
127
+ translated = translated[0].upper() + translated[1:]
128
+
129
+ translated_sentences.append(translated)
130
+
131
+ final_translation = " ".join(translated_sentences)
132
+
133
+ # Preserve paragraph breaks
134
+ if text.endswith(".\n\n"):
135
+ final_translation += ".\n\n"
136
+
137
+ print(f"FINAL TRANSLATION: '{final_translation}'")
138
+ print(f"=== TRANSLATION COMPLETE ===")
139
+
140
+ return final_translation
141
+
142
+ except Exception as e:
143
+ print(f"TRANSLATION ERROR: {e}")
144
+ import traceback
145
+ traceback.print_exc()
146
+ return text # Return original text if translation fails
147
+
148
+ async def cleanup(self):
149
+ """Cleanup resources"""
150
+ self.translation_pipeline = None
151
+
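A minimal sketch of the translation flow above, assuming the model weights are reachable (the NLLB checkpoint may require an HF token) and using the short language codes mapped in `FLORES_CODES`.

```python
# Hedged sketch: translate one English sentence to Swahili.
import asyncio
from app.services.translation_service import TranslationService

async def main():
    service = TranslationService(enable_quantization=True)
    await service.initialize()   # downloads NLTK data and the NLLB model
    text = await service.translate_text("Good morning, everyone.", "eng", "swa")
    print(text)                  # a Swahili rendering of the sentence
    await service.cleanup()

asyncio.run(main())
```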
app/services/translation_service_onnx.py ADDED
@@ -0,0 +1,268 @@
1
+ import asyncio
2
+ from typing import Dict, Optional
3
+ from transformers import AutoTokenizer, pipeline
4
+ from optimum.onnxruntime import ORTModelForSeq2SeqLM
5
+ import nltk
6
+ from app.models import LanguageCode
7
+
8
+ # FLORES-200 language codes mapping
9
+ FLORES_CODES = {
10
+ "English": "eng_Latn",
11
+ "eng": "eng_Latn",
12
+ "Swahili": "swh_Latn",
13
+ "swa": "swh_Latn",
14
+ "Kikuyu": "kik_Latn",
15
+ "kik": "kik_Latn",
16
+ "Kamba": "kam_Latn",
17
+ "kam": "kam_Latn",
18
+ "Kimeru": "mer_Latn",
19
+ "mer": "mer_Latn",
20
+ "Luo": "luo_Latn",
21
+ "luo": "luo_Latn",
22
+ "Somali": "som_Latn",
23
+ "som": "som_Latn",
24
+ }
25
+
26
+ class ONNXTranslationService:
27
+ def __init__(self):
28
+ self.model = None
29
+ self.tokenizer = None
30
+ self.translation_pipeline = None
31
+
32
+ # Use ONNX optimized NLLB model (FP32 format with separate encoder/decoder)
33
+ self.model_repo = "mutisya/nllb-translation-onnx-v25-37-1"
34
+
35
+ async def initialize(self):
36
+ """Initialize ONNX translation model using optimum.onnxruntime"""
37
+ try:
38
+ print("ONNX Translation: Initializing translation service with ONNX Runtime...")
39
+ print(f"ONNX Translation: Loading model from {self.model_repo}")
40
+
41
+ # Check available providers for GPU detection
42
+ import onnxruntime as ort
43
+ available_providers = ort.get_available_providers()
44
+ print(f"ONNX Translation: Available providers: {available_providers}")
45
+
46
+ # Download NLTK data with better error handling
47
+ try:
48
+ nltk.download("punkt", quiet=True)
49
+ nltk.download('punkt_tab', quiet=True)
50
+ except Exception as nltk_error:
51
+ print(f"Warning: NLTK data download failed: {nltk_error}")
52
+
53
+ # Get authentication token for private repo
54
+ import os
55
+ auth_token = os.getenv('HUGGING_FACE_HUB_TOKEN') or os.getenv('HF_TOKEN')
56
+
57
+ # Configure providers list for optimal performance
58
+ print("ONNX Translation: Configuring execution providers...")
59
+ if 'CUDAExecutionProvider' in available_providers:
60
+ # Use both CUDA and CPU providers to eliminate assignment warnings
61
+ providers_list = ['CUDAExecutionProvider', 'CPUExecutionProvider']
62
+ primary_provider = 'CUDAExecutionProvider'
63
+ print(f"ONNX Translation: Using providers: {providers_list} (primary: {primary_provider})")
64
+ else:
65
+ providers_list = ['CPUExecutionProvider']
66
+ primary_provider = 'CPUExecutionProvider'
67
+ print(f"ONNX Translation: Using CPU-only providers: {providers_list}")
68
+
69
+ # Load ONNX model using optimum (handles separate encoder/decoder files)
70
+ # Configure session options for optimal CUDA performance
72
+ session_options = ort.SessionOptions()
73
+ session_options.log_severity_level = 1 # INFO level (0=verbose, 1=info, 2=warning) for detailed logs
74
+ session_options.logid = "ONNX_Translation"
75
+
76
+ # Enable all graph optimizations to reduce memcpy operations
77
+ session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
78
+
79
+ # Optimize threading for better GPU utilization
80
+ session_options.inter_op_num_threads = 1 # Reduce CPU thread contention
81
+ session_options.intra_op_num_threads = 1 # Focus on GPU execution
82
+
83
+ # Note: enable_cuda_graph not available in this ONNX Runtime version
84
+
85
+ # Configure provider options with performance optimizations for CUDA
86
+ provider_options = []
87
+ if primary_provider == 'CUDAExecutionProvider':
88
+ cuda_options = {
89
+ 'device_id': 0,
90
+ 'arena_extend_strategy': 'kNextPowerOfTwo',
91
+ 'gpu_mem_limit': int(0.6 * 1024 * 1024 * 1024), # cap the CUDA arena at ~0.6 GB for translation
92
+ 'cudnn_conv_algo_search': 'EXHAUSTIVE',
93
+ 'cudnn_conv_use_max_workspace': '1', # Enable max workspace for fp16 tensor cores
94
+ 'do_copy_in_default_stream': True,
95
+ 'enable_skip_layer_norm_strict_mode': False, # Better performance for transformers
96
+ 'prefer_nhwc': True, # Optimize data layout for GPU
97
+ }
98
+ # Configure providers with options
99
+ provider_options = [
100
+ ('CUDAExecutionProvider', cuda_options),
101
+ ('CPUExecutionProvider', {})
102
+ ]
103
+
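The `(provider_name, options_dict)` tuples built here are the same structure plain `onnxruntime` accepts, so the configuration can be smoke-tested against any ONNX file before handing it to optimum. A minimal sketch, where `model.onnx` is a placeholder path:

```python
# Sketch: the same (name, options) provider tuples work on a raw
# onnxruntime session; "model.onnx" is a placeholder path.
import onnxruntime as ort

sess_opts = ort.SessionOptions()
sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

if "CUDAExecutionProvider" in ort.get_available_providers():
    providers = [("CUDAExecutionProvider", {"device_id": 0}), ("CPUExecutionProvider", {})]
else:
    providers = ["CPUExecutionProvider"]

session = ort.InferenceSession("model.onnx", sess_options=sess_opts, providers=providers)
print(session.get_providers())  # the providers actually selected after fallback
```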
104
+ # Try with optimized provider configuration and session options
105
+ try:
106
+ print("ONNX Translation: Attempting optimized provider configuration...")
107
+ self.model = ORTModelForSeq2SeqLM.from_pretrained(
108
+ self.model_repo,
109
+ token=auth_token,
110
+ providers=provider_options if provider_options else providers_list, # Use provider options or list
111
+ session_options=session_options, # Add session options
112
+ )
113
+ print(f"ONNX Translation: Model loaded successfully with providers: {providers_list}")
114
+
115
+ # Check what providers the model is actually using
116
+ if hasattr(self.model, 'providers'):
117
+ print(f"ONNX Translation: Model is using providers: {self.model.providers}")
118
+ if hasattr(self.model, 'device'):
119
+ print(f"ONNX Translation: Model device: {self.model.device}")
120
+
121
+ except Exception as e1:
122
+ print(f"ONNX Translation: Optimized provider approach failed: {e1}")
123
+ print("ONNX Translation: Falling back to simple provider list...")
124
+
125
+ # Fallback: Try with simple provider list (no options)
126
+ try:
127
+ self.model = ORTModelForSeq2SeqLM.from_pretrained(
128
+ self.model_repo,
129
+ token=auth_token,
130
+ providers=providers_list, # Simple provider list
131
+ session_options=session_options,
132
+ )
133
+ print(f"ONNX Translation: Model loaded successfully with simple providers: {providers_list}")
134
+
135
+ # Check what the model is actually using
136
+ if hasattr(self.model, 'providers'):
137
+ print(f"ONNX Translation: Model is using providers: {self.model.providers}")
138
+ if hasattr(self.model, 'device'):
139
+ print(f"ONNX Translation: Model device: {self.model.device}")
140
+
141
+ except Exception as e2:
142
+ print(f"ONNX Translation: Simple provider approach failed: {e2}")
143
+ print("ONNX Translation: Falling back to auto-detect...")
144
+
145
+ # Final fallback: Let model auto-detect
146
+ self.model = ORTModelForSeq2SeqLM.from_pretrained(
147
+ self.model_repo,
148
+ token=auth_token
149
+ # Not passing provider, letting it auto-detect based on device
150
+ )
151
+ print(f"ONNX Translation: Model loaded successfully with auto-detection")
152
+
153
+ # Check what the model is actually using
154
+ if hasattr(self.model, 'providers'):
155
+ print(f"ONNX Translation: Model auto-selected providers: {self.model.providers}")
156
+ if hasattr(self.model, 'device'):
157
+ print(f"ONNX Translation: Model device: {self.model.device}")
158
+
159
+ # Load tokenizer
160
+ self.tokenizer = AutoTokenizer.from_pretrained(
161
+ self.model_repo,
162
+ token=auth_token
163
+ )
164
+
165
+ # Create translation pipeline
166
+ # For ONNX models, we should specify device to ensure pipeline uses GPU
167
+ # Use the same provider detection as the model to ensure consistency
168
+ device = 0 if primary_provider == 'CUDAExecutionProvider' else -1
169
+ print(f"ONNX Translation: Setting pipeline device to: {device} ({'GPU' if device >= 0 else 'CPU'})")
170
+ print(f"ONNX Translation: Pipeline will use device based on primary provider: {primary_provider}")
171
+
172
+ self.translation_pipeline = pipeline(
173
+ "translation",
174
+ model=self.model,
175
+ tokenizer=self.tokenizer,
176
+ device=device
177
+ )
178
+
179
+ print("ONNX Translation: Successfully initialized ONNX translation model")
180
+
181
+ except Exception as e:
182
+ print(f"Failed to initialize ONNX translation service: {e}")
183
+ print("ONNX translation model is not available. Please ensure the model repository exists and contains the required ONNX files.")
184
+ import traceback
185
+ traceback.print_exc()
186
+ raise RuntimeError(f"ONNX translation model unavailable at {self.model_repo}: {e}")
187
+
188
+ async def translate_text(self, text: str, source_lang: str, target_lang: str) -> str:
189
+ """Translate text from source language to target language using ONNX"""
190
+ print(f"=== ONNX TRANSLATION REQUEST ===")
191
+ print(f"Text: '{text}'")
192
+ print(f"Source: {source_lang}")
193
+ print(f"Target: {target_lang}")
194
+
195
+ if not self.translation_pipeline:
196
+ print("ONNX TRANSLATION ERROR: Translation service not initialized")
197
+ raise RuntimeError("ONNX Translation service not initialized")
198
+
199
+ if not text or not text.strip():
200
+ print("ONNX TRANSLATION ERROR: Empty text provided")
201
+ return ""
202
+
203
+ try:
204
+ # Get FLORES codes
205
+ src_code = FLORES_CODES.get(source_lang, "eng_Latn")
206
+ tgt_code = FLORES_CODES.get(target_lang, "eng_Latn")
207
+
208
+ print(f"FLORES codes: {source_lang} -> {src_code}, {target_lang} -> {tgt_code}")
209
+
210
+ # Skip translation if same language
211
+ if src_code == tgt_code:
212
+ print("ONNX TRANSLATION SKIPPED: Same source and target language")
213
+ return text
214
+
215
+ # Tokenize into sentences for better translation
216
+ sentences = nltk.sent_tokenize(text)
217
+ translated_sentences = []
218
+
219
+ print(f"Translating {len(sentences)} sentences with ONNX...")
220
+
221
+ for i, sentence in enumerate(sentences):
222
+ if sentence.strip():
223
+ print(f"Translating sentence {i+1}: '{sentence}'")
224
+
225
+ # Use the pipeline for translation
226
+ result = self.translation_pipeline(
227
+ sentence.strip(),
228
+ src_lang=src_code,
229
+ tgt_lang=tgt_code,
230
+ max_length=512
231
+ )
232
+
233
+ translated = result[0]['translation_text']
234
+ print(f"ONNX Translation result: '{translated}'")
235
+
236
+ # Preserve punctuation and capitalization
237
+ if sentence.strip().endswith(".") and not translated.strip().endswith("."):
238
+ translated += "."
239
+
240
+ if sentence.strip() and sentence.strip()[0].isupper() and translated.strip():
241
+ translated = translated[0].upper() + translated[1:]
242
+
243
+ translated_sentences.append(translated)
244
+
245
+ final_translation = " ".join(translated_sentences)
246
+
247
+ # Preserve paragraph breaks
248
+ if text.endswith(".\n\n"):
249
+ final_translation += ".\n\n"
250
+
251
+ print(f"ONNX FINAL TRANSLATION: '{final_translation}'")
252
+ print(f"=== ONNX TRANSLATION COMPLETE ===")
253
+
254
+ return final_translation
255
+
256
+ except Exception as e:
257
+ print(f"ONNX TRANSLATION ERROR: {e}")
258
+ import traceback
259
+ traceback.print_exc()
260
+ raise RuntimeError(f"Translation failed: {e}")
261
+
262
+
263
+ async def cleanup(self):
264
+ """Cleanup resources"""
265
+ self.model = None
266
+ self.tokenizer = None
267
+ self.translation_pipeline = None
268
+ print("ONNX Translation: Translation service cleaned up")
app/services/tts_service.py ADDED
@@ -0,0 +1,541 @@
1
+ import asyncio
2
+ import io
3
+ import wave
4
+ import numpy as np
5
+ import subprocess
6
+ from typing import Any, Dict, Optional
7
+ from transformers import pipeline
8
+ import torch
9
+ import os
10
+ from app.services.quantization_utils import apply_dynamic_int8_quantization, get_quantization_stats
11
+
12
+ class TTSService:
13
+ def __init__(self, enable_quantization: bool = True):
14
+ self.tts_pipelines: Dict[str, Any] = {}
15
+ self.device = 0 if torch.cuda.is_available() else -1
16
+ self.enable_quantization = enable_quantization
17
+
18
+ # Check if espeak is available
19
+ self.espeak_available = self._check_espeak_availability()
20
+
21
+ # TTS model configurations from your original code
22
+ self.tts_config = {
23
+ "kik": {"model_repo": "mutisya/vits_kik_drL_24_5-v24_27_1_f", "model_type": "vits"},
24
+ "luo": {"model_repo": "mutisya/vits_luo_drL_24_5-v24_27_1_f", "model_type": "vits"},
25
+ "kam": {"model_repo": "mutisya/vits_kam_drL_24_5-v24_27_1_f", "model_type": "vits"},
26
+ "mer": {"model_repo": "mutisya/vits_mer_drL_24_5-v24_27_1_f", "model_type": "vits"},
27
+ "som": {"model_repo": "mutisya/vits_som_drL_24_5-v24_27_1_m", "model_type": "vits"},
28
+ "swa": {"model_repo": "mutisya/vits_swh_biblica-v24_27_1_m", "model_type": "vits"},
29
+ "eng": {"model_repo": "kakao-enterprise/vits-ljs", "model_type": "vits"},
30
+ }
31
+
32
+ # Alternative TTS models that don't require espeak (fallback)
33
+ self.fallback_tts_config = {
34
+ "eng": {"model_repo": "microsoft/speecht5_tts", "model_type": "speecht5"},
35
+ "swa": {"model_repo": "facebook/mms-tts-swh", "model_type": "mms"},
36
+ "som": {"model_repo": "facebook/mms-tts-som", "model_type": "mms"},
37
+ }
38
+
39
+ self.preload_languages = ["kik", "swa"]
40
+ self.background_loading_task = None
41
+ self.models_loading_status = {}
42
+
43
+ def _check_espeak_availability(self) -> bool:
44
+ """Check if espeak is available on the system"""
45
+ try:
46
+ result = subprocess.run(['espeak', '--version'],
47
+ capture_output=True, text=True, timeout=5)
48
+ if result.returncode == 0:
49
+ print("TTS: espeak is available")
50
+ return True
51
+ else:
52
+ print("TTS: espeak command failed")
53
+ return False
54
+ except (subprocess.TimeoutExpired, FileNotFoundError, Exception) as e:
55
+ print(f"TTS: espeak not available: {e}")
56
+ return False
57
+
58
+ async def initialize(self):
59
+ """Initialize TTS models for preloaded languages"""
60
+ print("TTS: Initializing TTS service...")
61
+ print(f"TTS: espeak available: {self.espeak_available}")
62
+
63
+ for lang_code in self.preload_languages:
64
+ await self.ensure_model_loaded(lang_code)
65
+
66
+ def _load_and_quantize_tts_pipeline(self, lang_code: str, model_repo: str, model_type: str = "vits"):
67
+ """Load TTS pipeline and optionally apply INT8 quantization"""
68
+ print(f"TTS: Loading model for {lang_code}: {model_repo}")
69
+
70
+ pipeline_obj = pipeline(
71
+ "text-to-speech",
72
+ model=model_repo,
73
+ device=self.device
74
+ )
75
+
76
+ # Apply quantization if enabled
77
+ if self.enable_quantization:
78
+ try:
79
+ # Get the underlying model from the pipeline
80
+ model = pipeline_obj.model
81
+
82
+ print(f"TTS: Applying INT8 quantization to {lang_code} model...")
83
+ quantized_model = apply_dynamic_int8_quantization(model, model_type)
84
+
85
+ # Replace the model in the pipeline
86
+ pipeline_obj.model = quantized_model
87
+
88
+ # Print quantization stats
89
+ stats = get_quantization_stats(quantized_model)
90
+ print(f"✓ TTS {lang_code} model quantized: {stats['quantized_layers']}/{stats['total_layers']} layers, {stats['size_mb']:.2f} MB")
91
+
92
+ except Exception as e:
93
+ print(f"TTS: Warning - Could not quantize {lang_code} model: {e}")
94
+ print(f"TTS: Continuing with unquantized model")
95
+
96
+ return pipeline_obj
97
+
98
+
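`apply_dynamic_int8_quantization` is defined in `app/services/quantization_utils.py`, which is not part of this hunk. For orientation, PyTorch's stock dynamic-quantization recipe, which a helper like that presumably wraps (an assumption; the helper itself is not shown here), looks roughly like this sketch:

```python
# Sketch: dynamic INT8 quantization of Linear layers - the standard PyTorch
# recipe a helper like apply_dynamic_int8_quantization presumably wraps.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 64))
quantized = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

# Linear weights are now stored as INT8 and dequantized on the fly at matmul time
print(quantized[0])  # DynamicQuantizedLinear(in_features=256, out_features=256, ...)
```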
99
+ async def ensure_model_loaded(self, language_code: str):
100
+ """Load TTS model for language if not already loaded"""
101
+ if language_code in self.tts_pipelines:
102
+ return
103
+
104
+ # First try to load primary model if espeak is available
105
+ if self.espeak_available and language_code in self.tts_config:
106
+ try:
107
+ model_config = self.tts_config[language_code]
108
+ pipeline_obj = self._load_and_quantize_tts_pipeline(
109
+ language_code,
110
+ model_config["model_repo"],
111
+ model_config.get("model_type", "vits")
112
+ )
113
+ self.tts_pipelines[language_code] = pipeline_obj
114
+ print(f"TTS: Loaded primary TTS model for {language_code}")
115
+ return
116
+ except Exception as e:
117
+ print(f"TTS: Failed to load primary TTS model for {language_code}: {e}")
118
+ # Continue to try fallback models
119
+
120
+ # Try fallback models if primary failed or espeak not available
121
+ if language_code in self.fallback_tts_config:
122
+ try:
123
+ model_config = self.fallback_tts_config[language_code]
124
+
125
+ if model_config["model_type"] == "speecht5":
126
+ # Special handling for SpeechT5
127
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
128
+ import torch
129
+
130
+ processor = SpeechT5Processor.from_pretrained(model_config["model_repo"])
131
+ model = SpeechT5ForTextToSpeech.from_pretrained(model_config["model_repo"])
132
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
133
+
134
+ # Create a custom pipeline-like object
135
+ class SpeechT5Pipeline:
136
+ def __init__(self, processor, model, vocoder):
137
+ self.processor = processor
138
+ self.model = model
139
+ self.vocoder = vocoder
140
+ # Load the default speaker embedding once at construction time
141
+ # instead of re-downloading the dataset on every synthesis call
142
+ import datasets
143
+ embeddings_dataset = datasets.load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
144
+ self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
145
+
146
+ def __call__(self, text):
147
+ inputs = self.processor(text=text, return_tensors="pt")
148
+ speech = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder)
149
+
150
+ return {
151
+ "audio": speech.numpy(),
152
+ "sampling_rate": 16000
153
+ }
154
+
155
+ pipeline_obj = SpeechT5Pipeline(processor, model, vocoder)
156
+ else:
157
+ # Standard pipeline for MMS models
158
+ pipeline_obj = pipeline(
159
+ "text-to-speech",
160
+ model=model_config["model_repo"],
161
+ device=self.device
162
+ )
163
+
164
+ self.tts_pipelines[language_code] = pipeline_obj
165
+ print(f"TTS: Loaded fallback TTS model for {language_code}")
166
+ return
167
+
168
+ except Exception as e:
169
+ print(f"TTS: Failed to load fallback TTS model for {language_code}: {e}")
170
+
171
+ print(f"TTS: No TTS model available for language: {language_code}")
172
+
173
+ async def generate_speech(self, text: str, language_code: str, output_format: str = "webm") -> Optional[bytes]:
174
+ """Generate speech audio from text
175
+
176
+ Args:
177
+ text: Text to convert to speech
178
+ language_code: Language code for TTS model
179
+ output_format: Output format - "webm" (default, web-compatible) or "wav" (Android-compatible)
180
+
181
+ Returns:
182
+ Audio bytes in the requested format, or None if generation fails
183
+ """
184
+ try:
185
+ print(f"=== TTS GENERATION REQUEST ===")
186
+ print(f"Text: '{text}'")
187
+ print(f"Language: {language_code}")
188
+ print(f"Output format: {output_format}")
189
+
190
+ # Input validation: Check for invalid or problematic text
191
+ if not text or not text.strip():
192
+ print("TTS: Empty or whitespace-only text, skipping TTS generation")
193
+ return None
194
+
195
+ # Check for very short text that might cause issues
196
+ clean_text = text.strip()
197
+ if len(clean_text) <= 2 and clean_text in [".", ",", "!", "?", ":", ";", "-"]:
198
+ print(f"TTS: Text '{clean_text}' is too short or punctuation-only, skipping TTS generation")
199
+ return None
200
+
201
+ # Check for minimum meaningful length
202
+ if len(clean_text.replace(" ", "").replace(".", "").replace(",", "")) < 2:
203
+ print(f"TTS: Text '{clean_text}' has insufficient content for TTS, skipping")
204
+ return None
205
+
206
+ print(f"TTS pipelines available: {list(self.tts_pipelines.keys())}")
207
+ print(f"TTS config available: {list(self.tts_config.keys())}")
208
+ print(f"Fallback config available: {list(self.fallback_tts_config.keys())}")
209
+
210
+ # Check if the language is supported
211
+ if language_code not in self.tts_config and language_code not in self.fallback_tts_config:
212
+ print(f"TTS: Language {language_code} not configured for TTS")
213
+ return None
214
+
215
+ await self.ensure_model_loaded(language_code)
216
+
217
+ if language_code not in self.tts_pipelines:
218
+ print(f"TTS: TTS model not available for language: {language_code}")
219
+ return None
220
+
221
+ if not text or not text.strip():
222
+ print("TTS: Empty text provided")
223
+ return None
224
+
225
+ print(f"TTS: Generating speech for '{text}' in {language_code}")
226
+
227
+ # Generate speech
228
+ pipeline_obj = self.tts_pipelines[language_code]
229
+ result = pipeline_obj(text)
230
+
231
+ audio_array = result["audio"]
232
+ sample_rate = result.get("sampling_rate", 22050)
233
+
234
+ print(f"TTS: Generated audio array of length {len(audio_array)} at {sample_rate}Hz")
235
+
236
+ # Validate audio array
237
+ if len(audio_array) == 0:
238
+ print("TTS: Warning - Generated audio array is empty")
239
+ return None
240
+
241
+ # Check for potential issues with audio data
242
+ audio_min = np.min(audio_array)
243
+ audio_max = np.max(audio_array)
244
+ audio_rms = np.sqrt(np.mean(audio_array**2))
245
+ print(f"TTS: Audio statistics - Min: {audio_min:.4f}, Max: {audio_max:.4f}, RMS: {audio_rms:.4f}")
246
+
247
+ # Check if audio might be silent or corrupted
248
+ if audio_rms < 0.001:
249
+ print("TTS: Warning - Audio appears to be very quiet or silent")
250
+ if audio_max > 1.0 or audio_min < -1.0:
251
+ print("TTS: Warning - Audio values outside expected range [-1, 1]")
252
+ # Clip to valid range
253
+ audio_array = np.clip(audio_array, -1.0, 1.0)
254
+ print("TTS: Clipped audio to valid range")
255
+
256
+ # Convert to WAV bytes with appropriate sample rate
257
+ if output_format == "wav":
258
+ # For Android: use 16kHz sample rate
259
+ target_sample_rate = 16000
260
+ wav_bytes = self._convert_to_wav_bytes(audio_array, sample_rate)
261
+ print(f"TTS: Converted to WAV: {len(wav_bytes)} bytes")
262
+
263
+ # Convert sample rate to 16kHz if needed for Android compatibility
264
+ if sample_rate != target_sample_rate:
265
+ print(f"TTS: Converting sample rate from {sample_rate}Hz to {target_sample_rate}Hz for Android compatibility")
266
+ wav_bytes = await self._resample_wav_to_16khz(wav_bytes, sample_rate)
267
+ print(f"TTS: Resampled WAV: {len(wav_bytes)} bytes")
268
+
269
+ print(f"TTS: Generated {len(wav_bytes)} bytes of WAV audio for '{text}'")
270
+ print(f"=== TTS GENERATION COMPLETE ===")
271
+
272
+ return wav_bytes
273
+ else:
274
+ # For web: use original sample rate and convert to WebM
275
+ wav_bytes = self._convert_to_wav_bytes(audio_array, sample_rate)
276
+ print(f"TTS: Converted to WAV: {len(wav_bytes)} bytes")
277
+
278
+ # Convert to WebM format for web compatibility
279
+ webm_bytes = await self._convert_to_webm(wav_bytes)
280
+
281
+ print(f"TTS: Generated {len(webm_bytes)} bytes of WebM audio for '{text}'")
282
+ print(f"=== TTS GENERATION COMPLETE ===")
283
+
284
+ return webm_bytes
285
+
286
+ except Exception as e:
287
+ print(f"TTS: TTS generation error: {e}")
288
+ import traceback
289
+ traceback.print_exc()
290
+ return None
291
+
292
+ async def generate_speech_dual_format(self, text: str, language_code: str) -> tuple[Optional[bytes], Optional[bytes]]:
293
+ """Generate speech audio in both WebM and WAV formats
294
+
295
+ Args:
296
+ text: Text to convert to speech
297
+ language_code: Language code for TTS model
298
+
299
+ Returns:
300
+ Tuple of (webm_bytes, wav_bytes), either can be None if generation fails
301
+ """
302
+ try:
303
+ print(f"=== TTS DUAL FORMAT GENERATION REQUEST ===")
304
+ print(f"Text: '{text}'")
305
+ print(f"Language: {language_code}")
306
+
307
+ # Input validation: Check for invalid or problematic text
308
+ if not text or not text.strip():
309
+ print("TTS: Empty or whitespace-only text, skipping TTS generation")
310
+ return None, None
311
+
312
+ # Check for very short text that might cause issues
313
+ clean_text = text.strip()
314
+ if len(clean_text) <= 2 and clean_text in [".", ",", "!", "?", ":", ";", "-"]:
315
+ print(f"TTS: Text '{clean_text}' is too short or punctuation-only, skipping TTS generation")
316
+ return None, None
317
+
318
+ # Check for minimum meaningful length
319
+ if len(clean_text.replace(" ", "").replace(".", "").replace(",", "")) < 2:
320
+ print(f"TTS: Text '{clean_text}' has insufficient content for TTS, skipping")
321
+ return None, None
322
+
323
+ # Check if the language is supported
324
+ if language_code not in self.tts_config and language_code not in self.fallback_tts_config:
325
+ print(f"TTS: Language {language_code} not configured for TTS")
326
+ return None, None
327
+
328
+ await self.ensure_model_loaded(language_code)
329
+
330
+ if language_code not in self.tts_pipelines:
331
+ print(f"TTS: TTS model not available for language: {language_code}")
332
+ return None, None
333
+
334
+ print(f"TTS: Generating speech for '{text}' in {language_code}")
335
+
336
+ # Generate speech once
337
+ pipeline_obj = self.tts_pipelines[language_code]
338
+ result = pipeline_obj(text)
339
+
340
+ audio_array = result["audio"]
341
+ sample_rate = result.get("sampling_rate", 22050)
342
+
343
+ print(f"TTS: Generated audio array of length {len(audio_array)} at {sample_rate}Hz")
344
+
345
+ # Validate audio array
346
+ if len(audio_array) == 0:
347
+ print("TTS: Warning - Generated audio array is empty")
348
+ return None, None
349
+
350
+ # Check for potential issues with audio data
351
+ audio_min = np.min(audio_array)
352
+ audio_max = np.max(audio_array)
353
+ audio_rms = np.sqrt(np.mean(audio_array**2))
354
+ print(f"TTS: Audio statistics - Min: {audio_min:.4f}, Max: {audio_max:.4f}, RMS: {audio_rms:.4f}")
355
+
356
+ # Check if audio might be silent or corrupted
357
+ if audio_rms < 0.001:
358
+ print("TTS: Warning - Audio appears to be very quiet or silent")
359
+ if audio_max > 1.0 or audio_min < -1.0:
360
+ print("TTS: Warning - Audio values outside expected range [-1, 1]")
361
+ # Clip to valid range
362
+ audio_array = np.clip(audio_array, -1.0, 1.0)
363
+ print("TTS: Clipped audio to valid range")
364
+
365
+ # Generate WAV at original sample rate first
366
+ wav_bytes_original = self._convert_to_wav_bytes(audio_array, sample_rate)
367
+ print(f"TTS: Converted to WAV: {len(wav_bytes_original)} bytes")
368
+
369
+ # Generate WebM from original WAV
370
+ webm_bytes = await self._convert_to_webm(wav_bytes_original)
371
+ print(f"TTS: Converted to WebM: {len(webm_bytes)} bytes")
372
+
373
+ # Generate 16kHz WAV for Android
374
+ wav_bytes_16k = await self._resample_wav_to_16khz(wav_bytes_original, sample_rate)
375
+ print(f"TTS: Resampled to 16kHz WAV: {len(wav_bytes_16k)} bytes")
376
+
377
+ print(f"TTS: Generated dual format audio for '{text}'")
378
+ print(f"=== TTS DUAL FORMAT GENERATION COMPLETE ===")
379
+
380
+ return webm_bytes, wav_bytes_16k
381
+
382
+ except Exception as e:
383
+ print(f"TTS: Dual format TTS generation error: {e}")
384
+ import traceback
385
+ traceback.print_exc()
386
+ return None, None
387
+
388
+ def _convert_to_wav_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
389
+ """Convert numpy audio array to WAV bytes"""
390
+ buffer = io.BytesIO()
391
+ with wave.open(buffer, 'wb') as wav_file:
392
+ wav_file.setnchannels(1) # Mono
393
+ wav_file.setsampwidth(2) # 16-bit
394
+ wav_file.setframerate(sample_rate)
395
+
396
+ # Ensure audio is in valid range [-1, 1]
397
+ audio_array = np.clip(audio_array, -1.0, 1.0)
398
+
399
+ # Convert to int16 with proper scaling
400
+ int16_audio = (audio_array * 32767).astype(np.int16)
401
+
402
+ # Validate the converted audio
403
+ print(f"TTS: Converting {len(audio_array)} samples to WAV at {sample_rate}Hz")
404
+ print(f"TTS: Int16 audio range: {np.min(int16_audio)} to {np.max(int16_audio)}")
405
+
406
+ wav_file.writeframes(int16_audio.tobytes())
407
+
408
+ wav_data = buffer.getvalue()
409
+ print(f"TTS: WAV file created: {len(wav_data)} bytes (expected header: 44 bytes + {len(int16_audio) * 2} data bytes)")
410
+
411
+ return wav_data
412
+
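The size printed above follows directly from the PCM layout: a 44-byte canonical WAV header plus two bytes per 16-bit mono sample. For example:

```python
# One second of 22,050 Hz mono 16-bit audio:
samples = 22_050
expected_bytes = 44 + samples * 2  # header + int16 payload
print(expected_bytes)  # 44144
```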
413
+ async def _resample_wav_to_16khz(self, wav_bytes: bytes, original_sample_rate: int) -> bytes:
414
+ """Resample WAV audio to 16kHz using FFmpeg"""
415
+ try:
416
+ process = subprocess.Popen([
417
+ "ffmpeg", "-f", "wav", "-i", "pipe:0",
418
+ "-ar", "16000", # Set output sample rate to 16kHz
419
+ "-ac", "1", # Ensure mono output
420
+ "-f", "wav", "pipe:1"
421
+ ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
422
+
423
+ resampled_data, stderr = process.communicate(input=wav_bytes)
424
+
425
+ if process.returncode != 0:
426
+ print(f"TTS: FFmpeg resampling error: {stderr.decode()}")
427
+ return wav_bytes # Return original if resampling fails
428
+
429
+ return resampled_data
430
+
431
+ except Exception as e:
432
+ print(f"TTS: Resampling error: {e}")
433
+ return wav_bytes # Return original if resampling fails
434
+
435
+ async def _convert_to_webm(self, wav_bytes: bytes) -> bytes:
436
+ """Convert WAV bytes to WebM format using FFmpeg"""
437
+ try:
438
+ process = subprocess.Popen([
439
+ "ffmpeg", "-f", "wav", "-i", "pipe:0",
440
+ "-c:a", "libopus", "-b:a", "64k",
441
+ "-f", "webm", "pipe:1"
442
+ ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
443
+
444
+ webm_data, stderr = process.communicate(input=wav_bytes)
445
+
446
+ if process.returncode != 0:
447
+ print(f"TTS: FFmpeg error: {stderr.decode()}")
448
+ return wav_bytes # Return original WAV if conversion fails
449
+
450
+ return webm_data
451
+
452
+ except Exception as e:
453
+ print(f"TTS: WebM conversion error: {e}")
454
+ return wav_bytes # Return original WAV if conversion fails
455
+
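One caveat with both FFmpeg helpers above: `subprocess.Popen(...).communicate()` blocks the event loop even though the enclosing methods are `async`. A non-blocking sketch using `asyncio.create_subprocess_exec`, with the same flags as `_convert_to_webm`:

```python
# Sketch: non-blocking FFmpeg invocation; same flags as _convert_to_webm,
# but the event loop stays free while FFmpeg runs.
import asyncio

async def convert_to_webm_async(wav_bytes: bytes) -> bytes:
    proc = await asyncio.create_subprocess_exec(
        "ffmpeg", "-f", "wav", "-i", "pipe:0",
        "-c:a", "libopus", "-b:a", "64k",
        "-f", "webm", "pipe:1",
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    webm_data, stderr = await proc.communicate(input=wav_bytes)
    if proc.returncode != 0:
        print(f"FFmpeg error: {stderr.decode()}")
        return wav_bytes  # fall back to the original WAV
    return webm_data
```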
456
+ async def load_remaining_models_in_background(self):
457
+ """Load all remaining TTS models in the background after startup"""
458
+ try:
459
+ print("TTS: Starting background loading of additional voice models...")
460
+
461
+ # Load primary models first
462
+ for lang_code in self.tts_config.keys():
463
+ if lang_code not in self.preload_languages and lang_code not in self.tts_pipelines:
464
+ if self.espeak_available:
465
+ try:
466
+ print(f"TTS: Background loading primary model for {lang_code}...")
467
+ self.models_loading_status[lang_code] = "loading"
468
+
469
+ model_config = self.tts_config[lang_code]
470
+ pipeline_obj = pipeline(
471
+ "text-to-speech",
472
+ model=model_config["model_repo"],
473
+ device=self.device
474
+ )
475
+ self.tts_pipelines[lang_code] = pipeline_obj
476
+ self.models_loading_status[lang_code] = "loaded"
477
+ print(f"TTS: Successfully loaded primary model for {lang_code} in background")
478
+
479
+ # Add a small delay between loading models
480
+ await asyncio.sleep(2)
481
+ except Exception as e:
482
+ print(f"TTS: Failed to load primary model for {lang_code} in background: {e}")
483
+ self.models_loading_status[lang_code] = "failed"
484
+
485
+ # Load fallback models for languages not yet loaded
486
+ for lang_code in self.fallback_tts_config.keys():
487
+ if lang_code not in self.tts_pipelines:
488
+ try:
489
+ print(f"TTS: Background loading fallback model for {lang_code}...")
490
+ model_config = self.fallback_tts_config[lang_code]
491
+
492
+ if model_config["model_type"] == "speecht5":
493
+ # Reuse the SpeechT5 path in ensure_model_loaded so the cached
494
+ # object is callable like a pipeline; storing a raw dict here
495
+ # would break generate_speech, which invokes pipeline_obj(text)
496
+ await self.ensure_model_loaded(lang_code)
506
+ else:
507
+ pipeline_obj = pipeline(
508
+ "text-to-speech",
509
+ model=model_config["model_repo"],
510
+ device=self.device
511
+ )
512
+ self.tts_pipelines[lang_code] = pipeline_obj
513
+
514
+ print(f"TTS: Successfully loaded fallback model for {lang_code} in background")
515
+ await asyncio.sleep(2)
516
+ except Exception as e:
517
+ print(f"TTS: Failed to load fallback model for {lang_code}: {e}")
518
+
519
+ print("TTS: Background loading of all voice models complete")
520
+ print(f"TTS: Loaded models: {list(self.tts_pipelines.keys())}")
521
+ except Exception as e:
522
+ print(f"TTS: Error in background model loading: {e}")
523
+
524
+ def start_background_loading(self):
525
+ """Start background loading of models as a non-blocking task"""
526
+ if self.background_loading_task is None:
527
+ self.background_loading_task = asyncio.create_task(self.load_remaining_models_in_background())
528
+ print("TTS: Background model loading task started")
529
+
530
+ async def cleanup(self):
531
+ """Cleanup resources"""
532
+ # Cancel background loading if still running
533
+ if self.background_loading_task and not self.background_loading_task.done():
534
+ self.background_loading_task.cancel()
535
+ try:
536
+ await self.background_loading_task
537
+ except asyncio.CancelledError:
538
+ pass
539
+
540
+ self.tts_pipelines.clear()
541
+ print("TTS: TTS service cleaned up")
app/services/tts_service_onnx.py ADDED
@@ -0,0 +1,587 @@
1
+ import asyncio
2
+ import io
3
+ import wave
4
+ import numpy as np
5
+ import subprocess
6
+ from typing import Any, Dict, Optional
7
+ import onnxruntime as ort
8
+ from transformers import AutoProcessor
9
+ from collections import OrderedDict
10
+ import os
11
+
12
+ class ONNXTTSService:
13
+ def __init__(self):
14
+ self.tts_models: Dict[str, Any] = {}
15
+ self.processors: Dict[str, Any] = {}
16
+ self.max_tts_models = 3 # Keep up to 3 TTS models in memory
17
+ self.model_cache = OrderedDict() # LRU cache
18
+
19
+ # GPU optimization - detect and configure providers
20
+ available_providers = ort.get_available_providers()
21
+ print(f"ONNX TTS: Available providers: {available_providers}")
22
+
+ # Remember CUDA availability as a plain flag: self.providers below may
+ # hold (name, options) tuples, so naive string membership tests against it fail
+ self.use_cuda = 'CUDAExecutionProvider' in available_providers
+
23
+ if 'CUDAExecutionProvider' in available_providers:
24
+ # Configure CUDA provider with optimizations
25
+ cuda_provider_options = {
26
+ 'device_id': 0,
27
+ 'arena_extend_strategy': 'kNextPowerOfTwo',
28
+ 'gpu_mem_limit': int(0.7 * 1024 * 1024 * 1024), # cap the CUDA arena at ~0.7 GB (TTS needs less than ASR)
29
+ 'cudnn_conv_algo_search': 'EXHAUSTIVE',
30
+ 'do_copy_in_default_stream': True,
31
+ }
32
+ self.providers = [('CUDAExecutionProvider', cuda_provider_options), 'CPUExecutionProvider']
33
+ print(f"ONNX TTS: Using CUDA acceleration with GPU memory limit: {cuda_provider_options['gpu_mem_limit'] // (1024**3)}GB")
34
+ else:
35
+ self.providers = ['CPUExecutionProvider']
36
+ print("ONNX TTS: CUDA not available, using CPU execution")
37
+
38
+ print(f"ONNX TTS: Configured providers: {[p[0] if isinstance(p, tuple) else p for p in self.providers]}")
39
+
40
+ # Check if espeak is available
41
+ self.espeak_available = self._check_espeak_availability()
42
+
43
+ # ONNX TTS model configurations - using FP32 optimized models (16kHz corrected)
44
+ self.tts_config = {
45
+ "kik": {"model_repo": "mutisya/vits-tts-onnx-fp32-kikuyu-v25-37-1", "model_type": "vits", "use_onnx": True},
46
+ "luo": {"model_repo": "mutisya/vits-tts-onnx-fp32-luo-v25-37-1", "model_type": "vits", "use_onnx": True},
47
+ "kam": {"model_repo": "mutisya/vits-tts-onnx-fp32-kamba-v25-37-1", "model_type": "vits", "use_onnx": True},
48
+ "mer": {"model_repo": "mutisya/vits-tts-onnx-fp32-kimeru-v25-37-1", "model_type": "vits", "use_onnx": True},
49
+ "som": {"model_repo": "mutisya/vits-tts-onnx-fp32-somali-v25-37-1", "model_type": "vits", "use_onnx": True},
50
+ "swa": {"model_repo": "mutisya/vits-tts-onnx-fp32-swahili-v25-37-1", "model_type": "vits", "use_onnx": True},
51
+ "eng": {"model_repo": "kakao-enterprise/vits-ljs", "model_type": "vits", "use_onnx": False}, # Fallback to PyTorch
52
+ }
53
+
54
+ # Alternative TTS models that don't require espeak (fallback)
55
+ self.fallback_tts_config = {
56
+ "eng": {"model_repo": "microsoft/speecht5_tts", "model_type": "speecht5"},
57
+ "swa": {"model_repo": "facebook/mms-tts-swh", "model_type": "mms"},
58
+ "som": {"model_repo": "facebook/mms-tts-som", "model_type": "mms"},
59
+ }
60
+
61
+ self.preload_languages = ["kik", "swa"]
62
+
63
+ def _check_espeak_availability(self) -> bool:
64
+ """Check if espeak is available on the system"""
65
+ try:
66
+ result = subprocess.run(['espeak', '--version'],
67
+ capture_output=True, text=True, timeout=5)
68
+ if result.returncode == 0:
69
+ print("ONNX TTS: espeak is available")
70
+ return True
71
+ else:
72
+ print("ONNX TTS: espeak command failed")
73
+ return False
74
+ except (subprocess.TimeoutExpired, FileNotFoundError, Exception) as e:
75
+ print(f"ONNX TTS: espeak not available: {e}")
76
+ return False
77
+
78
+ async def initialize(self):
79
+ """Initialize TTS models for preloaded languages"""
80
+ print("ONNX TTS: Initializing TTS service with ONNX Runtime...")
81
+ print(f"ONNX TTS: espeak available: {self.espeak_available}")
82
+ print(f"ONNX TTS: Using providers: {self.providers}")
83
+
84
+ for lang_code in self.preload_languages:
85
+ await self.ensure_model_loaded(lang_code)
86
+
87
+ async def ensure_model_loaded(self, language_code: str):
88
+ """Load TTS model for language if not already loaded with LRU cache"""
89
+ if language_code in self.model_cache:
90
+ # Move to end (most recently used)
91
+ self.model_cache.move_to_end(language_code)
92
+ return
93
+
94
+ # Check if we need to evict old models
95
+ while len(self.model_cache) >= self.max_tts_models:
96
+ # Remove least recently used model
97
+ old_lang, _ = self.model_cache.popitem(last=False)
98
+ if old_lang in self.tts_models:
99
+ del self.tts_models[old_lang]
100
+ if old_lang in self.processors:
101
+ del self.processors[old_lang]
102
+ print(f"ONNX TTS: Evicted model for {old_lang} (LRU cache)")
103
+
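The eviction logic above is a plain `OrderedDict` LRU: hits call `move_to_end`, and `popitem(last=False)` drops the least recently used entry. In isolation:

```python
# Sketch: the OrderedDict LRU pattern used by ensure_model_loaded.
from collections import OrderedDict

cache, max_size = OrderedDict(), 3
for lang in ["kik", "swa", "luo", "kik", "mer"]:
    if lang in cache:
        cache.move_to_end(lang)                 # mark as most recently used
        continue
    while len(cache) >= max_size:
        evicted, _ = cache.popitem(last=False)  # least recently used
        print(f"evicted {evicted}")             # prints: evicted swa
    cache[lang] = True

print(list(cache))  # ['luo', 'kik', 'mer']
```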
104
+ # First try to load ONNX model
105
+ if language_code in self.tts_config:
106
+ model_config = self.tts_config[language_code]
107
+
108
+ if model_config.get("use_onnx", False):
109
+ try:
110
+ print(f"ONNX TTS: Loading ONNX model for {language_code}")
111
+
112
+ # Create ONNX session with optimizations and verbose logging
113
+ session_options = ort.SessionOptions()
114
+ session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
115
+
116
+ # Enable verbose logging to diagnose operator assignments
117
+ session_options.log_severity_level = 1 # INFO level (0=verbose, 1=info, 2=warning) for detailed logs
118
+ session_options.logid = "ONNX_TTS" # Prefix for log identification
119
+
120
+ # GPU memory optimization for T4 with diagnostic tracing
121
+ if self.use_cuda: # self.providers holds (name, options) tuples, so test the flag
122
+ provider_options = [{
123
+ 'device_id': 0,
124
+ 'arena_extend_strategy': 'kSameAsRequested',
125
+ 'gpu_mem_limit': int(0.3 * 1024 * 1024 * 1024), # cap the CUDA arena at ~0.3 GB for TTS
126
+ 'cudnn_conv_algo_search': 'EXHAUSTIVE',
127
+ 'do_copy_in_default_stream': True,
128
+ 'enable_tracing': True, # Enable tracing for better diagnostics
129
+ }]
130
+ providers = [('CUDAExecutionProvider', provider_options[0]), 'CPUExecutionProvider']
131
+ else:
132
+ providers = self.providers
133
+
134
+ # Get authentication token for private repos
135
+ import os
136
+ auth_token = os.getenv('HUGGING_FACE_HUB_TOKEN') or os.getenv('HF_TOKEN')
137
+
138
+ # Download ONNX model from HuggingFace Hub with authentication
139
+ from huggingface_hub import hf_hub_download
140
+ onnx_path = hf_hub_download(
141
+ repo_id=model_config["model_repo"],
142
+ filename="model.onnx",
143
+ token=auth_token
144
+ )
145
+
146
+ session = ort.InferenceSession(onnx_path, providers=providers, sess_options=session_options)
147
+
148
+ # Load processor for preprocessing with authentication
149
+ processor = AutoProcessor.from_pretrained(
150
+ model_config["model_repo"],
151
+ token=auth_token
152
+ )
153
+
154
+ self.tts_models[language_code] = session
155
+ self.processors[language_code] = processor
156
+ self.model_cache[language_code] = True
157
+
158
+ print(f"ONNX TTS: Successfully loaded ONNX model for {language_code}")
159
+ return
160
+
161
+ except Exception as e:
162
+ print(f"ONNX TTS: Failed to load ONNX model for {language_code}: {e}")
163
+ # Continue to try fallback models
164
+ else:
165
+ # Try PyTorch model if ONNX not available
166
+ try:
167
+ print(f"ONNX TTS: Loading PyTorch model for {language_code} (fallback)")
168
+ from transformers import pipeline
169
+
170
+ pipeline_obj = pipeline(
171
+ "text-to-speech",
172
+ model=model_config["model_repo"],
173
+ device=0 if self.use_cuda else -1 # comparing self.providers[0] to a string fails when it is a tuple
174
+ )
175
+ self.tts_models[language_code] = pipeline_obj
176
+ self.processors[language_code] = None # Not needed for pipeline
177
+ self.model_cache[language_code] = True
178
+
179
+ print(f"ONNX TTS: Successfully loaded PyTorch model for {language_code}")
180
+ return
181
+
182
+ except Exception as e:
183
+ print(f"ONNX TTS: Failed to load PyTorch model for {language_code}: {e}")
184
+
185
+ # Try fallback models if primary failed
186
+ if language_code in self.fallback_tts_config:
187
+ try:
188
+ model_config = self.fallback_tts_config[language_code]
189
+
190
+ if model_config["model_type"] == "speecht5":
191
+ # Special handling for SpeechT5
192
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
193
+ import torch
194
+
195
+ # Get authentication token for private repos
196
+ import os
197
+ auth_token = os.getenv('HUGGING_FACE_HUB_TOKEN') or os.getenv('HF_TOKEN')
198
+
199
+ processor = SpeechT5Processor.from_pretrained(
200
+ model_config["model_repo"],
201
+ token=auth_token
202
+ )
203
+ model = SpeechT5ForTextToSpeech.from_pretrained(
204
+ model_config["model_repo"],
205
+ token=auth_token
206
+ )
207
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
208
+
209
+ # Create a custom pipeline-like object
210
+ class SpeechT5Pipeline:
211
+ def __init__(self, processor, model, vocoder):
212
+ self.processor = processor
213
+ self.model = model
214
+ self.vocoder = vocoder
215
+ # Load the default speaker embedding once at construction time
216
+ # instead of re-downloading the dataset on every synthesis call
217
+ import datasets
218
+ embeddings_dataset = datasets.load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
219
+ self.speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
220
+
221
+ def __call__(self, text):
222
+ inputs = self.processor(text=text, return_tensors="pt")
223
+ speech = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder)
224
+
225
+ return {
226
+ "audio": speech.numpy(),
227
+ "sampling_rate": 16000
228
+ }
229
+
230
+ pipeline_obj = SpeechT5Pipeline(processor, model, vocoder)
231
+ else:
232
+ # Standard pipeline for MMS models
233
+ from transformers import pipeline
234
+ pipeline_obj = pipeline(
235
+ "text-to-speech",
236
+ model=model_config["model_repo"],
237
+ device=0 if self.use_cuda else -1 # comparing self.providers[0] to a string fails when it is a tuple
238
+ )
239
+
240
+ self.tts_models[language_code] = pipeline_obj
241
+ self.processors[language_code] = None
242
+ self.model_cache[language_code] = True
243
+
244
+ print(f"ONNX TTS: Successfully loaded fallback model for {language_code}")
245
+ return
246
+
247
+ except Exception as e:
248
+ print(f"ONNX TTS: Failed to load fallback TTS model for {language_code}: {e}")
249
+
250
+ print(f"ONNX TTS: No TTS model available for language: {language_code}")
251
+
252
+ async def generate_speech(self, text: str, language_code: str, output_format: str = "webm") -> Optional[bytes]:
253
+ """Generate speech audio from text using ONNX models
254
+
255
+ Args:
256
+ text: Text to convert to speech
257
+ language_code: Language code for TTS model
258
+ output_format: Output format - "webm" (default, web-compatible) or "wav" (Android-compatible)
259
+
260
+ Returns:
261
+ Audio bytes in the requested format, or None if generation fails
262
+ """
263
+ try:
264
+ print(f"=== ONNX TTS GENERATION REQUEST ===")
265
+ print(f"Text: '{text}'")
266
+ print(f"Language: {language_code}")
267
+ print(f"Output format: {output_format}")
268
+
269
+ # Input validation
270
+ if not text or not text.strip():
271
+ print("ONNX TTS: Empty or whitespace-only text, skipping TTS generation")
272
+ return None
273
+
274
+ # Check for very short text that might cause issues
275
+ clean_text = text.strip()
276
+ if len(clean_text) <= 2 and clean_text in [".", ",", "!", "?", ":", ";", "-"]:
277
+ print(f"ONNX TTS: Text '{clean_text}' is too short or punctuation-only, skipping TTS generation")
278
+ return None
279
+
280
+ # Check for minimum meaningful length
281
+ if len(clean_text.replace(" ", "").replace(".", "").replace(",", "")) < 2:
282
+ print(f"ONNX TTS: Text '{clean_text}' has insufficient content for TTS, skipping")
283
+ return None
284
+
285
+ # Check if the language is supported
286
+ if language_code not in self.tts_config and language_code not in self.fallback_tts_config:
287
+ print(f"ONNX TTS: Language {language_code} not configured for TTS")
288
+ return None
289
+
290
+ await self.ensure_model_loaded(language_code)
291
+
292
+ if language_code not in self.tts_models:
293
+ print(f"ONNX TTS: TTS model not available for language: {language_code}")
294
+ return None
295
+
296
+ print(f"ONNX TTS: Generating speech for '{text}' in {language_code}")
297
+
298
+ # Generate speech based on model type
299
+ model_config = self.tts_config.get(language_code, {})
300
+ if model_config.get("use_onnx", False):
301
+ # ONNX inference
302
+ audio_array, sample_rate = await self._run_onnx_tts_inference(text, language_code)
303
+ else:
304
+ # PyTorch pipeline inference
305
+ pipeline_obj = self.tts_models[language_code]
306
+ result = pipeline_obj(text)
307
+
308
+ audio_array = result["audio"]
309
+ sample_rate = result.get("sampling_rate", 16000) # Default to 16kHz (corrected)
310
+
311
+ print(f"ONNX TTS: Generated audio array of length {len(audio_array)} at {sample_rate}Hz")
312
+
313
+ # Validate audio array
314
+ if len(audio_array) == 0:
315
+ print("ONNX TTS: Warning - Generated audio array is empty")
316
+ return None
317
+
318
+ # Check audio statistics
319
+ audio_min = np.min(audio_array)
320
+ audio_max = np.max(audio_array)
321
+ audio_rms = np.sqrt(np.mean(audio_array**2))
322
+ print(f"ONNX TTS: Audio statistics - Min: {audio_min:.4f}, Max: {audio_max:.4f}, RMS: {audio_rms:.4f}")
323
+
324
+ # Check if audio might be silent or corrupted
325
+ if audio_rms < 0.001:
326
+ print("ONNX TTS: Warning - Audio appears to be very quiet or silent")
327
+ if audio_max > 1.0 or audio_min < -1.0:
328
+ print("ONNX TTS: Warning - Audio values outside expected range [-1, 1]")
329
+ # Clip to valid range
330
+ audio_array = np.clip(audio_array, -1.0, 1.0)
331
+ print("ONNX TTS: Clipped audio to valid range")
332
+
333
+ # Convert to requested format
334
+ if output_format == "wav":
335
+ # For Android: use 16kHz sample rate
336
+ target_sample_rate = 16000
337
+ wav_bytes = self._convert_to_wav_bytes(audio_array, sample_rate)
338
+ print(f"ONNX TTS: Converted to WAV: {len(wav_bytes)} bytes")
339
+
340
+ # Convert sample rate to 16kHz if needed for Android compatibility
341
+ if sample_rate != target_sample_rate:
342
+ print(f"ONNX TTS: Converting sample rate from {sample_rate}Hz to {target_sample_rate}Hz")
343
+ wav_bytes = await self._resample_wav_to_16khz(wav_bytes, sample_rate)
344
+ print(f"ONNX TTS: Resampled WAV: {len(wav_bytes)} bytes")
345
+
346
+ print(f"ONNX TTS: Generated {len(wav_bytes)} bytes of WAV audio for '{text}'")
347
+ print(f"=== ONNX TTS GENERATION COMPLETE ===")
348
+
349
+ return wav_bytes
350
+ else:
351
+ # For web: use original sample rate and convert to WebM
352
+ wav_bytes = self._convert_to_wav_bytes(audio_array, sample_rate)
353
+ print(f"ONNX TTS: Converted to WAV: {len(wav_bytes)} bytes")
354
+
355
+ # Convert to WebM format for web compatibility
356
+ webm_bytes = await self._convert_to_webm(wav_bytes)
357
+
358
+ print(f"ONNX TTS: Generated {len(webm_bytes)} bytes of WebM audio for '{text}'")
359
+ print(f"=== ONNX TTS GENERATION COMPLETE ===")
360
+
361
+ return webm_bytes
362
+
363
+ except Exception as e:
364
+ print(f"ONNX TTS: TTS generation error: {e}")
365
+ import traceback
366
+ traceback.print_exc()
367
+ return None
368
+
369
+ async def _run_onnx_tts_inference(self, text: str, language_code: str) -> tuple[np.ndarray, int]:
370
+ """Run ONNX inference for text-to-speech"""
371
+ try:
372
+ session = self.tts_models[language_code]
373
+ processor = self.processors[language_code]
374
+
375
+ # Preprocess text
376
+ inputs = processor(text=text, return_tensors="np")
377
+
378
+ # Get input names for ONNX session
379
+ input_names = [inp.name for inp in session.get_inputs()]
380
+
381
+ # Prepare inputs for ONNX
382
+ onnx_inputs = {}
383
+ for name in input_names:
384
+ if name in inputs:
385
+ onnx_inputs[name] = inputs[name]
386
+ elif name == "input_ids" and "input_ids" in inputs:
387
+ onnx_inputs[name] = inputs["input_ids"].astype(np.int64)
388
+ elif name == "attention_mask" and "attention_mask" in inputs:
389
+ onnx_inputs[name] = inputs["attention_mask"].astype(np.int64)
390
+
391
+ # Run ONNX inference
392
+ outputs = session.run(None, onnx_inputs)
393
+
394
+ # Extract audio from outputs (assuming first output is audio)
395
+ audio_array = outputs[0]
396
+
397
+ # Ensure audio is 1D
398
+ if audio_array.ndim > 1:
399
+ audio_array = audio_array.flatten()
400
+
401
+ # Convert to float32 if needed
402
+ if audio_array.dtype != np.float32:
403
+ audio_array = audio_array.astype(np.float32)
404
+
405
+ # Sample rate is 16kHz for our corrected models
406
+ sample_rate = 16000
407
+
408
+ return audio_array, sample_rate
409
+
410
+ except Exception as e:
411
+ print(f"ONNX TTS: Inference error: {e}")
412
+ import traceback
413
+ traceback.print_exc()
414
+ return np.array([], dtype=np.float32), 16000
415
+
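The input-matching loop works because ONNX Runtime exposes the graph's declared inputs and outputs; when wiring up a new VITS export, it helps to print them first. A quick inspection sketch (the `model.onnx` path is a placeholder):

```python
# Sketch: inspect what an ONNX graph expects and returns - useful when
# mapping processor outputs onto session inputs as done above.
import onnxruntime as ort

session = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
for inp in session.get_inputs():
    print("in :", inp.name, inp.shape, inp.type)
for out in session.get_outputs():
    print("out:", out.name, out.shape, out.type)
```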
416
+ async def generate_speech_dual_format(self, text: str, language_code: str) -> tuple[Optional[bytes], Optional[bytes]]:
417
+ """Generate speech audio in both WebM and WAV formats using ONNX
418
+
419
+ Args:
420
+ text: Text to convert to speech
421
+ language_code: Language code for TTS model
422
+
423
+ Returns:
424
+ Tuple of (webm_bytes, wav_bytes), either can be None if generation fails
425
+ """
426
+ try:
427
+ print(f"=== ONNX TTS DUAL FORMAT GENERATION REQUEST ===")
428
+ print(f"Text: '{text}'")
429
+ print(f"Language: {language_code}")
430
+
431
+ # Input validation
432
+ if not text or not text.strip():
433
+ print("ONNX TTS: Empty or whitespace-only text, skipping TTS generation")
434
+ return None, None
435
+
436
+ clean_text = text.strip()
437
+ if len(clean_text) <= 2 and clean_text in [".", ",", "!", "?", ":", ";", "-"]:
438
+ print(f"ONNX TTS: Text '{clean_text}' is too short or punctuation-only, skipping TTS generation")
439
+ return None, None
440
+
441
+ if len(clean_text.replace(" ", "").replace(".", "").replace(",", "")) < 2:
442
+ print(f"ONNX TTS: Text '{clean_text}' has insufficient content for TTS, skipping")
443
+ return None, None
444
+
445
+ # Check if the language is supported
446
+ if language_code not in self.tts_config and language_code not in self.fallback_tts_config:
447
+ print(f"ONNX TTS: Language {language_code} not configured for TTS")
448
+ return None, None
449
+
450
+ await self.ensure_model_loaded(language_code)
451
+
452
+ if language_code not in self.tts_models:
453
+ print(f"ONNX TTS: TTS model not available for language: {language_code}")
454
+ return None, None
455
+
456
+ print(f"ONNX TTS: Generating speech for '{text}' in {language_code}")
457
+
458
+ # Generate speech once
459
+ model_config = self.tts_config.get(language_code, {})
460
+ if model_config.get("use_onnx", False):
461
+ # ONNX inference
462
+ audio_array, sample_rate = await self._run_onnx_tts_inference(text, language_code)
463
+ else:
464
+ # PyTorch pipeline inference
465
+ pipeline_obj = self.tts_models[language_code]
466
+ result = pipeline_obj(text)
467
+
468
+ audio_array = result["audio"]
469
+ sample_rate = result.get("sampling_rate", 16000)
470
+
471
+ print(f"ONNX TTS: Generated audio array of length {len(audio_array)} at {sample_rate}Hz")
472
+
473
+ # Validate audio array
474
+ if len(audio_array) == 0:
475
+ print("ONNX TTS: Warning - Generated audio array is empty")
476
+ return None, None
477
+
478
+ # Check for potential issues with audio data
479
+ audio_min = np.min(audio_array)
480
+ audio_max = np.max(audio_array)
481
+ audio_rms = np.sqrt(np.mean(audio_array**2))
482
+ print(f"ONNX TTS: Audio statistics - Min: {audio_min:.4f}, Max: {audio_max:.4f}, RMS: {audio_rms:.4f}")
483
+
484
+ if audio_rms < 0.001:
485
+ print("ONNX TTS: Warning - Audio appears to be very quiet or silent")
486
+ if audio_max > 1.0 or audio_min < -1.0:
487
+ print("ONNX TTS: Warning - Audio values outside expected range [-1, 1]")
488
+ audio_array = np.clip(audio_array, -1.0, 1.0)
489
+ print("ONNX TTS: Clipped audio to valid range")
490
+
491
+             # Generate WAV at original sample rate first
+             wav_bytes_original = self._convert_to_wav_bytes(audio_array, sample_rate)
+             print(f"ONNX TTS: Converted to WAV: {len(wav_bytes_original)} bytes")
+
+             # Generate WebM from original WAV
+             webm_bytes = await self._convert_to_webm(wav_bytes_original)
+             print(f"ONNX TTS: Converted to WebM: {len(webm_bytes)} bytes")
+
+             # Generate 16kHz WAV for Android
+             wav_bytes_16k = await self._resample_wav_to_16khz(wav_bytes_original, sample_rate)
+             print(f"ONNX TTS: Resampled to 16kHz WAV: {len(wav_bytes_16k)} bytes")
+
+             print(f"ONNX TTS: Generated dual format audio for '{text}'")
+             print(f"=== ONNX TTS DUAL FORMAT GENERATION COMPLETE ===")
+
+             return webm_bytes, wav_bytes_16k
+
+         except Exception as e:
+             print(f"ONNX TTS: Dual format TTS generation error: {e}")
+             import traceback
+             traceback.print_exc()
+             return None, None
+
+     def _convert_to_wav_bytes(self, audio_array: np.ndarray, sample_rate: int) -> bytes:
+         """Convert numpy audio array to WAV bytes"""
+         buffer = io.BytesIO()
+         with wave.open(buffer, 'wb') as wav_file:
+             wav_file.setnchannels(1)  # Mono
+             wav_file.setsampwidth(2)  # 16-bit
+             wav_file.setframerate(sample_rate)
+
+             # Ensure audio is in valid range [-1, 1]
+             audio_array = np.clip(audio_array, -1.0, 1.0)
+
+             # Convert to int16 with proper scaling
+             int16_audio = (audio_array * 32767).astype(np.int16)
+
+             # Validate the converted audio
+             print(f"ONNX TTS: Converting {len(audio_array)} samples to WAV at {sample_rate}Hz")
+             print(f"ONNX TTS: Int16 audio range: {np.min(int16_audio)} to {np.max(int16_audio)}")
+
+             wav_file.writeframes(int16_audio.tobytes())
+
+         wav_data = buffer.getvalue()
+         print(f"ONNX TTS: WAV file created: {len(wav_data)} bytes")
+
+         return wav_data
+
+     async def _resample_wav_to_16khz(self, wav_bytes: bytes, original_sample_rate: int) -> bytes:
+         """Resample WAV audio to 16kHz using FFmpeg"""
+         try:
+             process = subprocess.Popen([
+                 "ffmpeg", "-f", "wav", "-i", "pipe:0",
+                 "-ar", "16000",  # Set output sample rate to 16kHz
+                 "-ac", "1",      # Ensure mono output
+                 "-f", "wav", "pipe:1"
+             ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+             resampled_data, stderr = process.communicate(input=wav_bytes)
+
+             if process.returncode != 0:
+                 print(f"ONNX TTS: FFmpeg resampling error: {stderr.decode()}")
+                 return wav_bytes  # Return original if resampling fails
+
+             return resampled_data
+
+         except Exception as e:
+             print(f"ONNX TTS: Resampling error: {e}")
+             return wav_bytes  # Return original if resampling fails
+
+     async def _convert_to_webm(self, wav_bytes: bytes) -> bytes:
+         """Convert WAV bytes to WebM format using FFmpeg"""
+         try:
+             process = subprocess.Popen([
+                 "ffmpeg", "-f", "wav", "-i", "pipe:0",
+                 "-c:a", "libopus", "-b:a", "64k",
+                 "-f", "webm", "pipe:1"
+             ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+             webm_data, stderr = process.communicate(input=wav_bytes)
+
+             if process.returncode != 0:
+                 print(f"ONNX TTS: FFmpeg error: {stderr.decode()}")
+                 return wav_bytes  # Return original WAV if conversion fails
+
+             return webm_data
+
+         except Exception as e:
+             print(f"ONNX TTS: WebM conversion error: {e}")
+             return wav_bytes  # Return original WAV if conversion fails
+
+     async def cleanup(self):
+         """Cleanup resources"""
+         self.tts_models.clear()
+         self.processors.clear()
+         self.model_cache.clear()
+         print("ONNX TTS: TTS service cleaned up")
app/services/websocket_manager.py ADDED
@@ -0,0 +1,909 @@
+ import asyncio
+ import uuid
+ from typing import Dict, Set, Optional
+ import socketio
+ import numpy as np
+
+ from app.models import Message, LanguageCode
+ from app.services.session_manager import SessionManager, LANGUAGE_MAP
+ from app.services.transcription_service import TranscriptionService
+ from app.services.translation_service import TranslationService
+ from app.services.tts_service import TTSService
+
+ def truncate_array_for_log(arr, max_items=10):
+     """Helper function to truncate arrays in log messages for readability"""
+     # Explicit None check instead of `not arr`, which raises on numpy arrays
+     if arr is None or len(arr) <= max_items:
+         return arr
+     return arr[:max_items] + [f"... {len(arr) - max_items} more items"]
+
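The helper above keeps raw-sample dumps out of the logs; a quick usage sketch with hypothetical values:

```python
# Shorten a long PCM sample list before printing it.
samples = list(range(100))
print(truncate_array_for_log(samples, max_items=3))
# -> [0, 1, 2, '... 97 more items']
```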
+ class WebSocketManager:
+     def __init__(self, session_manager: SessionManager, transcription_service: TranscriptionService,
+                  translation_service: TranslationService, tts_service: TTSService):
+         self.session_manager = session_manager
+         self.transcription_service = transcription_service
+         self.translation_service = translation_service
+         self.tts_service = tts_service
+         self.sio = None  # Will be set by main.py
+
+         self.client_sessions: Dict[str, str] = {}  # sid -> session_id
+         self.client_participants: Dict[str, str] = {}  # sid -> participant_id
+         self.session_clients: Dict[str, Set[str]] = {}  # session_id -> set of sids
+
+         self.messages: Dict[str, Message] = {}  # message_id -> message
+         self.participant_current_message: Dict[str, str] = {}  # participant_id -> current_message_id
+         self.processed_messages: Set[str] = set()  # Track processed message IDs to prevent duplicates
+
+     def set_socketio(self, sio):
+         """Set the Socket.IO server instance"""
+         self.sio = sio
+
+     async def handle_join_session(self, sid: str, data: dict):
+         """Handle participant joining a session"""
+         try:
+             session_id = data.get('sessionId')
+             participant_name = data.get('participantName')
+             language_code = data.get('language')
+
+             print(f"=== JOIN SESSION REQUEST ===")
+             print(f"Session ID: {session_id}")
+             print(f"Participant: {participant_name}")
+             print(f"Language: {language_code}")
+
+             if not all([session_id, participant_name, language_code]):
+                 await self._emit_error(sid, "Missing required fields")
+                 return
+
+             # Validate language code
+             try:
+                 lang_enum = LanguageCode(language_code)
+                 print(f"Language code validated: {lang_enum}")
+             except ValueError:
+                 await self._emit_error(sid, f"Invalid language code: {language_code}")
+                 return
+
+             # Resolve session ID (in case it's a short code)
+             session = await self.session_manager.get_session(session_id)
+             if not session:
+                 await self._emit_error(sid, "Session not found")
+                 return
+
+             # Use the full UUID for all subsequent operations
+             session_id = session.id
+             print(f"Resolved session ID: {session_id}")
+
+             # Add participant to session
+             participant = await self.session_manager.add_participant(
+                 session_id, participant_name, lang_enum
+             )
+
+             print(f"Participant created: {participant}")
+
+             if not participant:
+                 await self._emit_error(sid, "Session not found or unable to join")
+                 return
+
+             # Get updated session info
+             session = await self.session_manager.get_session(session_id)
+             if session:
+                 print(f"Session {session_id} now has {len(session.languages)} languages: {[f'{lang.name}({lang.code.value})' for lang in session.languages]}")
+                 print(f"Session participants: {[f'{p.name}({p.language.name})' for p in session.participants]}")
+
+             # Track client connections
+             self.client_sessions[sid] = session_id
+             self.client_participants[sid] = participant.id
+
+             if session_id not in self.session_clients:
+                 self.session_clients[session_id] = set()
+             self.session_clients[session_id].add(sid)
+
+             # Send success response
+             await self.sio.emit('participant_joined', participant.dict(), room=sid)
+
+             # Notify other participants
+             await self._broadcast_to_session(session_id, 'participant_update', participant.dict(), exclude_sid=sid)
+
+             print(f"=== JOIN SESSION COMPLETE ===")
+
+         except Exception as e:
+             print(f"Error in handle_join_session: {e}")
+             import traceback
+             traceback.print_exc()
+             await self._emit_error(sid, "Failed to join session")
+
+     async def handle_join_hub(self, sid: str, data: dict):
+         """Handle hub joining a session for observation"""
+         try:
+             session_id = data.get('sessionId')
+             if not session_id:
+                 await self._emit_error(sid, "Missing sessionId for hub")
+                 return
+
+             # Verify session exists
+             session = await self.session_manager.get_session(session_id)
+             if not session:
+                 await self._emit_error(sid, "Session not found")
+                 return
+
+             # Track hub connection
+             self.client_sessions[sid] = session_id
+
+             if session_id not in self.session_clients:
+                 self.session_clients[session_id] = set()
+             self.session_clients[session_id].add(sid)
+
+             # Send success response
+             await self.sio.emit('hub_joined', {'sessionId': session_id}, room=sid)
+
+             print(f"Hub joined session {session_id} with sid {sid}")
+
+         except Exception as e:
+             print(f"Error in handle_join_hub: {e}")
+             await self._emit_error(sid, "Failed to join as hub")
+
+     async def handle_audio_chunk(self, sid: str, data: dict):
+         """Handle incoming audio chunk from participant"""
+         try:
+             participant_id = self.client_participants.get(sid)
+             if not participant_id:
+                 return
+
+             audio_data = data.get('audioData', [])
+             is_pause_boundary = data.get('isPauseBoundary', False)
+
+             if not audio_data:
+                 return
+
+             # Convert array to bytes
+             audio_bytes = bytes(audio_data)
+
+             # Process audio chunk using VAD-based approach
+             if audio_bytes:
+                 # Check for voice activity in this chunk
+                 has_voice = self.transcription_service.has_voice_activity(audio_bytes)
+
+                 # Process the chunk (even if no voice, to handle silence detection)
+                 # If isPauseBoundary is True, force finalization by treating as silence
+                 await self._process_audio_chunk_vad(participant_id, audio_bytes, has_voice and not is_pause_boundary, is_pause_boundary)
+
+         except Exception as e:
+             print(f"Error in handle_audio_chunk: {e}")
+             import traceback
+             traceback.print_exc()
+
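For reference, a client feeds this handler by emitting raw PCM bytes as a plain integer list. A minimal sketch with python-socketio, assuming the event is registered as `audio_chunk` in main.py (that registration is not part of this diff):

```python
import socketio

async def send_chunk(client: socketio.AsyncClient, pcm_bytes: bytes, is_pause: bool = False):
    # Mirrors the server side, which rebuilds the payload with bytes(audio_data).
    await client.emit('audio_chunk', {
        'audioData': list(pcm_bytes),  # 16-bit PCM bytes as a list of ints
        'isPauseBoundary': is_pause,   # True forces sentence finalization
    })
```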
+     async def handle_speaking_status(self, sid: str, data: dict):
+         """Handle speaking status updates"""
+         try:
+             participant_id = self.client_participants.get(sid)
+             if not participant_id:
+                 return
+
+             is_speaking = data.get('isSpeaking', False)
+             await self.session_manager.update_participant_speaking_status(participant_id, is_speaking)
+
+             # If participant stopped speaking, force complete any pending sentence
+             if not is_speaking:
+                 # Get session and participant info for force completion
+                 session_id = await self.session_manager.get_participant_session_id(participant_id)
+                 if session_id:
+                     session = await self.session_manager.get_session(session_id)
+                     participant = next((p for p in session.participants if p.id == participant_id), None)
+
+                     if participant:
+                         # Define the sentence callback for force completion
+                         async def force_sentence_callback(final_text: str, final_audio: bytes):
+                             # Create or get existing message
+                             current_message_id = self.participant_current_message.get(participant_id)
+                             if not current_message_id:
+                                 current_message_id = str(uuid.uuid4())
+
+                             # Check if this message was already processed
+                             if current_message_id in self.processed_messages:
+                                 print(f"Force completion: Message {current_message_id} already processed, skipping duplicate")
+                                 return
+
+                             # Mark as processed to prevent duplicates
+                             self.processed_messages.add(current_message_id)
+
+                             from app.models import Message
+                             message = Message(
+                                 id=current_message_id,
+                                 session_id=session_id,
+                                 speaker_id=participant_id,
+                                 speaker_name=participant.name,
+                                 original_text=final_text,
+                                 original_language=participant.language,
+                                 translations={},
+                                 is_transcribing=False
+                             )
+                             self.messages[current_message_id] = message
+
+                             # Broadcast the completed message
+                             print(f"Force completion: Broadcasting message_complete for {current_message_id}: '{final_text}'")
+                             await self._broadcast_to_session(session_id, 'message_complete', {
+                                 'messageId': current_message_id,
+                                 'sessionId': session_id,
+                                 'text': final_text,
+                                 'speakerId': participant_id,
+                                 'speakerName': participant.name,
+                                 'language': participant.language.code.value
+                             })
+
+                             # Clear current message tracking
+                             if participant_id in self.participant_current_message:
+                                 del self.participant_current_message[participant_id]
+
+                             # Start translation processing (non-blocking to allow continued audio processing)
+                             print("Starting TRANSLATION and TTS (background task)")
+                             asyncio.create_task(self._process_translations_and_tts(message, session))
+
+                         # Force complete any pending sentence
+                         await self.transcription_service.force_complete_sentence(
+                             participant_id,
+                             participant.language.code.value,
+                             force_sentence_callback
+                         )
+
+                         # Clear transcription service buffers after force completion
+                         self.transcription_service.clear_participant_buffers(participant_id)
+
+             # Broadcast speaking status to session
+             session_id = self.client_sessions.get(sid)
+             if session_id:
+                 await self._broadcast_to_session(session_id, 'speaking_status', {
+                     'participantId': participant_id,
+                     'isSpeaking': is_speaking
+                 })
+
+         except Exception as e:
+             print(f"Error in handle_speaking_status: {e}")
+             import traceback
+             traceback.print_exc()
+
+     async def handle_leave_session(self, sid: str, data: dict):
+         """Handle participant leaving a session"""
+         await self._cleanup_client(sid)
+
+     async def handle_disconnect(self, sid: str):
+         """Handle client disconnection"""
+         await self._cleanup_client(sid)
+
+     async def _process_audio_chunk_vad(self, participant_id: str, audio_data: bytes, has_voice_activity: bool, is_pause_boundary: bool = False):
+         """Process audio chunk using VAD-based sentence detection
+
+         Args:
+             participant_id: ID of the participant
+             audio_data: Raw audio data bytes
+             has_voice_activity: Whether voice activity was detected in this chunk
+             is_pause_boundary: If True, forces sentence finalization (from stop button or explicit pause)
+         """
+         try:
+             session_id = await self.session_manager.get_participant_session_id(participant_id)
+             if not session_id:
+                 return
+
+             session = await self.session_manager.get_session(session_id)
+             if not session:
+                 return
+
+             participant = next((p for p in session.participants if p.id == participant_id), None)
+             if not participant:
+                 return
+
+             # Get or create current message for this participant
+             current_message_id = self.participant_current_message.get(participant_id)
+             if not current_message_id:
+                 current_message_id = str(uuid.uuid4())
+                 message = Message(
+                     id=current_message_id,
+                     session_id=session_id,
+                     speaker_id=participant_id,
+                     speaker_name=participant.name,
+                     original_text="",
+                     original_language=participant.language,
+                     translations={},
+                     is_transcribing=True
+                 )
+                 self.messages[current_message_id] = message
+                 self.participant_current_message[participant_id] = current_message_id
+
+                 # Start typing indicator
+                 await self._broadcast_to_session(session_id, 'typing_start', {
+                     'speakerId': participant_id,
+                     'speakerName': participant.name,
+                     'languageCode': participant.language.code.value
+                 })
+
+             message = self.messages[current_message_id]
+
+             # Define callbacks
+             async def on_progress(text: str, is_complete: bool):
+                 """Called with in-progress transcription updates"""
+                 # Update the message text even for progress updates
+                 message.original_text = text
+
+                 await self._broadcast_to_session(session_id, 'transcription_progress', {
+                     'messageId': current_message_id,
+                     'text': text,
+                     'isTranscribing': not is_complete,
+                     'speakerId': participant_id,
+                     'speakerName': participant.name
+                 })
+
+             async def on_debug(debug_info: dict):
+                 """Called with debug information from ASR (wav2vec2 models only)"""
+                 # Prepare debug data for transmission
+                 debug_data = {
+                     'messageId': current_message_id,
+                     'text': debug_info['text'],
+                     'timestamps': debug_info['timestamps'],
+                     'audioData': list(debug_info['audio_data']),
+                     'audioDuration': debug_info['audio_duration'],
+                     'modelType': debug_info['model_type'],
+                     'language': participant.language.code.value
+                 }
+
+                 await self._broadcast_to_session(session_id, 'transcription_debug', debug_data)
+
+             async def on_sentence_complete(final_text: str, final_audio: bytes):
+                 """Called when a complete sentence is detected"""
+
+                 # Check if this message was already processed
+                 if current_message_id in self.processed_messages:
+                     print(f"Message {current_message_id} already processed, skipping duplicate")
+                     return
+
+                 # Mark as processed to prevent duplicates
+                 self.processed_messages.add(current_message_id)
+
+                 message.original_text = final_text
+                 message.is_transcribing = False
+
+                 # Broadcast complete sentence with session ID
+                 message_data = {
+                     'messageId': current_message_id,
+                     'sessionId': session_id,
+                     'text': final_text,
+                     'speakerId': participant_id,
+                     'speakerName': participant.name,
+                     'language': participant.language.code.value,
+                     'audioData': list(final_audio)
+                 }
+
+                 print(f"Broadcasting message_complete for {current_message_id}: '{final_text}'")
+                 await self._broadcast_to_session(session_id, 'message_complete', message_data)
+
+                 # Stop typing indicator
+                 await self._broadcast_to_session(session_id, 'typing_stop', {
+                     'speakerId': participant_id
+                 })
+
+                 # Clear current message tracking
+                 if participant_id in self.participant_current_message:
+                     del self.participant_current_message[participant_id]
+
+                 # Start translation and TTS processing (non-blocking to allow continued audio processing)
+                 print("Starting TRANSLATION and TTS (background task)")
+                 asyncio.create_task(self._process_translations_and_tts(message, session))
+
+             # Process the audio chunk
+             result_text = await self.transcription_service.process_audio_chunk(
+                 audio_data,
+                 participant.language.code.value,
+                 participant_id,
+                 has_voice_activity,
+                 progress_callback=on_progress,
+                 sentence_callback=on_sentence_complete,
+                 debug_callback=on_debug
+             )
+
+             # If this is a pause boundary (stop button clicked), force immediate finalization
+             if is_pause_boundary and participant_id in self.participant_current_message:
+                 print(f"Pause boundary detected - forcing sentence finalization for participant {participant_id}")
+                 # Get the current accumulated text from transcription service
+                 if hasattr(self.transcription_service, 'candidate_text_cache') and participant_id in self.transcription_service.candidate_text_cache:
+                     final_text = self.transcription_service.candidate_text_cache.get(participant_id, "").strip()
+                     if final_text:  # Only finalize if there's actual text
+                         # Get accumulated audio
+                         final_audio = b""
+                         if hasattr(self.transcription_service, 'candidate_audio_buffers') and participant_id in self.transcription_service.candidate_audio_buffers:
+                             audio_array = self.transcription_service.candidate_audio_buffers.get(participant_id, np.array([]))
+                             if len(audio_array) > 0:
+                                 # Convert float array to int16 bytes
+                                 audio_int16 = (audio_array * 32767).astype(np.int16)
+                                 final_audio = audio_int16.tobytes()
+
+                         # Trigger sentence completion
+                         await on_sentence_complete(final_text, final_audio)
+
+                         # Clear the buffers manually since we're forcing finalization
+                         if participant_id in self.transcription_service.candidate_text_cache:
+                             self.transcription_service.candidate_text_cache[participant_id] = ""
+                         if participant_id in self.transcription_service.candidate_audio_buffers:
+                             self.transcription_service.candidate_audio_buffers[participant_id] = np.array([], dtype=np.float32)
+                         if participant_id in self.transcription_service.silence_counters:
+                             self.transcription_service.silence_counters[participant_id] = 0
+                         if participant_id in self.transcription_service.sentence_finalized:
+                             self.transcription_service.sentence_finalized[participant_id] = False
+
+         except Exception as e:
+             print(f"Error in _process_audio_chunk_vad: {e}")
+             import traceback
+             traceback.print_exc()
+
+     async def _process_translations_and_tts(self, message: Message, session):
+         """Process translations and TTS for all session languages"""
+         try:
+             source_lang = message.original_language.name
+
+             print(f"=== TRANSLATION/TTS PROCESSING START ===")
+             print(f"Message ID: {message.id}")
+             print(f"Original message: '{message.original_text}'")
+             print(f"Original language: {message.original_language.name} ({message.original_language.code.value})")
+             print(f"Session languages: {[f'{lang.name} ({lang.code.value})' for lang in session.languages]}")
+             print(f"Session ID for verification: {session.id}")
+
+             # Create a mapping to track which audio belongs to which message and language
+             audio_tasks = []
+
+             # Check if TTS is enabled for this session
+             if session.enable_tts:
+                 # First, generate TTS for the original message
+                 print(f"TTS: Generating TTS for original message in {message.original_language.code.value}: '{message.original_text}'")
+                 print(f"TTS Model: VITS ONNX (mutisya/vits-tts-onnx-fp32-{message.original_language.name.lower()}) - File: tts_service_onnx.py")
+                 original_audio_task = asyncio.create_task(
+                     self.tts_service.generate_speech_dual_format(message.original_text, message.original_language.code.value)
+                 )
+                 audio_tasks.append((
+                     message.original_language.code.value,
+                     message.original_text,
+                     original_audio_task,
+                     True  # is_original
+                 ))
+             else:
+                 print(f"TTS: Skipping TTS generation (disabled for this session)")
+
+             # Process translations for each language in the session
+             print(f"Processing translations for {len(session.languages)} session languages...")
+             print(f"Session languages: {[f'{lang.name}({lang.code.value})' for lang in session.languages]}")
+             translation_tasks = []
+
+             for language in session.languages:
+                 print(f"Checking language: {language.name} ({language.code.value})")
+                 if language.code != message.original_language.code:
+                     print(f"TRANSLATING: '{message.original_text}' from {source_lang} to {language.name}")
+                     print(f"Translation Model: mutisya/nllb_600m (NLLB-600M) - File: translation_service.py")
+
+                     # Create translation task
+                     translation_task = asyncio.create_task(
+                         self.translation_service.translate_text(
+                             message.original_text, source_lang, language.name
+                         )
+                     )
+                     translation_tasks.append((language, translation_task))
+                 else:
+                     print(f"SKIPPING translation for {language.name} (same as original language)")
+
+             print(f"Created {len(translation_tasks)} translation tasks for non-original languages")
+
+             # Wait for all translations to complete
+             for language, translation_task in translation_tasks:
+                 try:
+                     translated_text = await translation_task
+
+                     if translated_text:
+                         print(f"TRANSLATION SUCCESS: '{translated_text}' for {language.name}")
+
+                         message.translations[language.code.value] = translated_text
+
+                         # Broadcast translation update to all clients
+                         await self._broadcast_to_session(message.session_id, 'translation_update', {
+                             'messageId': message.id,
+                             'targetLanguage': language.code.value,
+                             'translatedText': translated_text
+                         })
+
+                         # Check if TTS is enabled for this session
+                         if session.enable_tts:
+                             # Generate TTS for the translated text
+                             print(f"TTS: Generating TTS for translation in {language.code.value}: '{translated_text}'")
+                             print(f"TTS Model: VITS ONNX (mutisya/vits-tts-onnx-fp32-{language.name.lower()}) - File: tts_service_onnx.py")
+                             tts_task = asyncio.create_task(
+                                 self.tts_service.generate_speech_dual_format(translated_text, language.code.value)
+                             )
+                             audio_tasks.append((
+                                 language.code.value,
+                                 translated_text,
+                                 tts_task,
+                                 False  # is_original
+                             ))
+                         else:
+                             print(f"TTS: Skipping TTS generation for translation (disabled for this session)")
+                     else:
+                         print(f"TRANSLATION FAILED: No translated text returned for {language.name}")
+                 except Exception as e:
+                     print(f"Translation error for {language.name}: {e}")
+
+             # Wait for all TTS generation to complete and broadcast with proper alignment
+             for language_code, text, audio_task, is_original in audio_tasks:
+                 try:
+                     audio_result = await audio_task
+
+                     if audio_result and (audio_result[0] or audio_result[1]):
+                         webm_data, wav_data = audio_result
+                         print(f"TTS: Audio generated successfully for {language_code}")
+                         if webm_data:
+                             print(f"TTS: WebM audio: {len(webm_data)} bytes")
+                         if wav_data:
+                             print(f"TTS: WAV audio: {len(wav_data)} bytes")
+                         print(f"TTS: Text for {language_code}: '{text}'")
+
+                         # Broadcast TTS audio with explicit message-text-audio alignment (dual format)
+                         await self._broadcast_tts_audio_aligned_dual_format(
+                             message.session_id,
+                             message.id,
+                             language_code,
+                             text,
+                             webm_data,
+                             wav_data,
+                             is_original
+                         )
+                     else:
+                         print(f"TTS: Failed to generate audio for {language_code}")
+                 except Exception as e:
+                     print(f"TTS generation error for {language_code}: {e}")
+
+             print(f"=== TRANSLATION/TTS PROCESSING END ===")
+
+         except Exception as e:
+             print(f"Error in _process_translations_and_tts: {e}")
+             import traceback
+             traceback.print_exc()
+
+     async def _broadcast_to_session(self, session_id: str, event: str, data: dict, exclude_sid: str = None):
+         """Broadcast message to all clients in a session"""
+         if session_id not in self.session_clients:
+             return
+
+         # Create a copy of the set to avoid concurrent modification
+         client_sids = list(self.session_clients[session_id])
+
+         for sid in client_sids:
+             if sid != exclude_sid:
+                 try:
+                     await self.sio.emit(event, data, room=sid)
+                 except Exception as e:
+                     print(f"Error broadcasting {event} to client {sid}: {e}")
+
+     async def _broadcast_tts_audio_aligned(self, session_id: str, message_id: str,
+                                            language_code: str, text: str, audio_data: bytes,
+                                            is_original: bool = False):
+         """Broadcast TTS audio with explicit message-text-audio alignment"""
+         try:
+             if session_id not in self.session_clients:
+                 return
+
+             print(f"TTS ALIGNED: Broadcasting audio for message {message_id}")
+             print(f"TTS ALIGNED: Language: {language_code}")
+             print(f"TTS ALIGNED: Text: '{text}'")
+             print(f"TTS ALIGNED: Audio size: {len(audio_data)} bytes")
+             print(f"TTS ALIGNED: Is original: {is_original}")
+
+             # Create a copy of the set to avoid concurrent modification
+             client_sids = list(self.session_clients[session_id])
+
+             # Send audio data in chunks to all participants with explicit alignment data
+             for sid in client_sids:
+                 try:
+                     chunk_size = 4096
+                     for i in range(0, len(audio_data), chunk_size):
+                         chunk = audio_data[i:i + chunk_size]
+                         is_last_chunk = i + chunk_size >= len(audio_data)
+
+                         chunk_data = {
+                             'messageId': message_id,        # Explicit message ID
+                             'languageCode': language_code,  # Language of THIS audio
+                             'text': text,                   # Text that THIS audio represents
+                             'isOriginal': is_original,      # Whether this is original or translation
+                             'chunk': list(chunk),
+                             'isLast': is_last_chunk,
+                             'chunkIndex': i // chunk_size,  # Chunk ordering
+                             'totalChunks': (len(audio_data) + chunk_size - 1) // chunk_size
+                         }
+
+                         await self.sio.emit('tts_audio_chunk', chunk_data, room=sid)
+
+                         # Small delay to prevent overwhelming
+                         await asyncio.sleep(0.01)
+
+                     print(f"TTS ALIGNED: Successfully sent aligned audio to participant {sid}")
+                 except Exception as e:
+                     print(f"TTS ALIGNED: Error sending audio to participant {sid}: {e}")
+
+         except Exception as e:
+             print(f"TTS ALIGNED: Error broadcasting aligned audio: {e}")
+
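Since each utterance arrives as ordered `tts_audio_chunk` events, a receiver buffers until `isLast`. A hypothetical client-side reassembly sketch, keyed the way the payload above suggests:

```python
from collections import defaultdict

buffers = defaultdict(list)  # (messageId, languageCode) -> [(chunkIndex, bytes), ...]

def on_tts_audio_chunk(data: dict):
    key = (data['messageId'], data['languageCode'])
    buffers[key].append((data['chunkIndex'], bytes(data['chunk'])))
    if data['isLast']:
        parts = sorted(buffers.pop(key))  # restore chunk order
        return b''.join(chunk for _, chunk in parts)  # complete audio payload
    return None  # still accumulating
```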
+     async def _broadcast_tts_audio_aligned_dual_format(self, session_id: str, message_id: str,
+                                                        language_code: str, text: str, webm_data: bytes,
+                                                        wav_data: bytes, is_original: bool = False):
+         """Broadcast TTS audio with both WebM and WAV formats for cross-platform compatibility"""
+         try:
+             if session_id not in self.session_clients:
+                 return
+
+             print(f"TTS DUAL FORMAT: Broadcasting audio for message {message_id}")
+             print(f"TTS DUAL FORMAT: Language: {language_code}")
+             print(f"TTS DUAL FORMAT: Text: '{text}'")
+             if webm_data:
+                 print(f"TTS DUAL FORMAT: WebM size: {len(webm_data)} bytes")
+             if wav_data:
+                 print(f"TTS DUAL FORMAT: WAV size: {len(wav_data)} bytes")
+             print(f"TTS DUAL FORMAT: Is original: {is_original}")
+
+             # Create a copy of the set to avoid concurrent modification
+             client_sids = list(self.session_clients[session_id])
+
+             # Use WebM data for chunking (primary format for web clients)
+             primary_audio_data = webm_data if webm_data else wav_data
+             if not primary_audio_data:
+                 print("TTS DUAL FORMAT: No audio data available")
+                 return
+
+             # Send audio data in chunks to all participants with dual format support
+             chunk_size = 4096
+             for sid in client_sids:
+                 try:
+                     for i in range(0, len(primary_audio_data), chunk_size):
+                         chunk = primary_audio_data[i:i + chunk_size]
+                         is_last_chunk = i + chunk_size >= len(primary_audio_data)
+
+                         # Prepare WAV chunk if available
+                         wav_chunk = None
+                         if wav_data and i < len(wav_data):
+                             wav_end = min(i + chunk_size, len(wav_data))
+                             wav_chunk = wav_data[i:wav_end]
+
+                         chunk_data = {
+                             'messageId': message_id,        # Explicit message ID
+                             'languageCode': language_code,  # Language of THIS audio
+                             'text': text,                   # Text that THIS audio represents
+                             'isOriginal': is_original,      # Whether this is original or translation
+                             'chunk': list(chunk),           # WebM audio chunk (for web clients)
+                             'wavChunk': list(wav_chunk) if wav_chunk else None,  # WAV audio chunk (for Android clients)
+                             'isLast': is_last_chunk,
+                             'chunkIndex': i // chunk_size,  # Chunk ordering
+                             'totalChunks': (len(primary_audio_data) + chunk_size - 1) // chunk_size,
+                             'format': 'webm',               # Primary format
+                             'wavFormat': 'wav' if wav_chunk else None  # Secondary format available
+                         }
+
+                         await self.sio.emit('tts_audio_chunk', chunk_data, room=sid)
+
+                         # Small delay to prevent overwhelming
+                         await asyncio.sleep(0.01)
+
+                     print(f"TTS DUAL FORMAT: Successfully sent dual format audio to participant {sid}")
+                 except Exception as e:
+                     print(f"TTS DUAL FORMAT: Error sending audio to participant {sid}: {e}")
+
+         except Exception as e:
+             print(f"TTS DUAL FORMAT: Error broadcasting dual format audio: {e}")
+
+     async def _broadcast_tts_audio_to_all_participants(self, session_id: str, language_code: str,
+                                                        audio_data: bytes, message_id: str, text: str):
+         """Legacy method - now calls the aligned version"""
+         await self._broadcast_tts_audio_aligned(
+             session_id, message_id, language_code, text, audio_data, False
+         )
+
+     async def _broadcast_audio_to_language_participants(self, session_id: str, language_code: str,
+                                                         audio_data: bytes, message_id: str):
+         """Broadcast audio to participants listening in specific language (legacy method)"""
+         try:
+             session = await self.session_manager.get_session(session_id)
+             if not session:
+                 return
+
+             # Find participants with matching language
+             target_participants = [p for p in session.participants if p.language.code.value == language_code]
+
+             for participant in target_participants:
+                 # Find client SID for this participant
+                 participant_sid = None
+                 for sid, pid in self.client_participants.items():
+                     if pid == participant.id:
+                         participant_sid = sid
+                         break
+
+                 if participant_sid:
+                     print(f"TTS: Broadcasting audio to participant {participant.name} in {language_code}")
+                     # Send audio data in chunks
+                     chunk_size = 4096
+                     for i in range(0, len(audio_data), chunk_size):
+                         chunk = audio_data[i:i + chunk_size]
+                         await self.sio.emit('tts_audio_chunk', {
+                             'messageId': message_id,
+                             'chunk': list(chunk),
+                             'isLast': i + chunk_size >= len(audio_data)
+                         }, room=participant_sid)
+
+                         # Small delay to prevent overwhelming
+                         await asyncio.sleep(0.01)
+
+         except Exception as e:
+             print(f"TTS: Error broadcasting audio: {e}")
+
+     async def _cleanup_client(self, sid: str):
+         """Clean up client data on disconnect"""
+         try:
+             participant_id = self.client_participants.get(sid)
+             session_id = self.client_sessions.get(sid)
+
+             if participant_id:
+                 # Remove participant from session
+                 await self.session_manager.remove_participant(participant_id)
+
+                 # Clear participant buffers
+                 self.transcription_service.clear_participant_buffers(participant_id)
+
+                 # Clear current message tracking
+                 if participant_id in self.participant_current_message:
+                     del self.participant_current_message[participant_id]
+
+                 del self.client_participants[sid]
+
+             if session_id:
+                 # Remove from session clients
+                 if session_id in self.session_clients:
+                     self.session_clients[session_id].discard(sid)
+                     if not self.session_clients[session_id]:
+                         del self.session_clients[session_id]
+                         # If session is empty, clear processed messages for this session
+                         self._cleanup_session_processed_messages(session_id)
+
+                 del self.client_sessions[sid]
+
+         except Exception as e:
+             print(f"Error cleaning up client {sid}: {e}")
+
+     def _cleanup_session_processed_messages(self, session_id: str):
+         """Clean up processed messages for empty sessions to prevent memory leaks"""
+         try:
+             # Remove processed messages that belong to this session
+             messages_to_remove = []
+             for message_id in list(self.processed_messages):
+                 if message_id in self.messages and self.messages[message_id].session_id == session_id:
+                     messages_to_remove.append(message_id)
+
+             for message_id in messages_to_remove:
+                 self.processed_messages.discard(message_id)
+                 if message_id in self.messages:
+                     del self.messages[message_id]
+
+             print(f"Cleaned up {len(messages_to_remove)} processed messages for session {session_id}")
+         except Exception as e:
+             print(f"Error cleaning up session processed messages: {e}")
+
+     async def _emit_error(self, sid: str, message: str):
+         """Emit error message to specific client"""
+         try:
+             await self.sio.emit('join_error', message, room=sid)
+         except Exception as e:
+             print(f"Error emitting error to {sid}: {e}")
+
+     async def handle_update_participant_language(self, sid: str, data: dict):
+         """Handle participant language update (affects speech recognition)"""
+         try:
+             session_id = data.get('sessionId')
+             participant_id = data.get('participantId')
+             language_code = data.get('language')
+
+             print(f"=== UPDATE PARTICIPANT LANGUAGE ===")
+             print(f"Session ID: {session_id}")
+             print(f"Participant ID: {participant_id}")
+             print(f"New Language: {language_code}")
+
+             if not all([session_id, participant_id, language_code]):
+                 await self._emit_error(sid, "Missing required fields")
+                 return
+
+             # Validate language code
+             try:
+                 from app.models import LanguageCode
+                 lang_enum = LanguageCode(language_code)
+                 print(f"Language code validated: {lang_enum}")
+             except ValueError:
+                 await self._emit_error(sid, f"Invalid language code: {language_code}")
+                 return
+
+             # Update participant's language in session
+             session = await self.session_manager.get_session(session_id)
+             if session:
+                 for participant in session.participants:
+                     if participant.id == participant_id:
+                         # Update participant's language using LANGUAGE_MAP for complete Language object
+                         if lang_enum in LANGUAGE_MAP:
+                             participant.language = LANGUAGE_MAP[lang_enum]
+                             print(f"Updated participant {participant.name} language to {lang_enum.value} ({participant.language.display_name})")
+                         else:
+                             print(f"Warning: Language {lang_enum.value} not found in LANGUAGE_MAP, using fallback")
+                             from app.models import Language
+                             participant.language = Language(code=lang_enum, name=lang_enum.value, display_name=lang_enum.value)
+
+                         # Notify all clients in session
+                         await self._broadcast_to_session(session_id, 'participant_language_updated', {
+                             'participantId': participant_id,
+                             'language': language_code
+                         })
+                         break
+
+             print(f"=== UPDATE PARTICIPANT LANGUAGE COMPLETE ===")
+
+         except Exception as e:
+             print(f"Error in handle_update_participant_language: {e}")
+             import traceback
+             traceback.print_exc()
+             await self._emit_error(sid, "Failed to update participant language")
+
+     async def handle_update_session_languages(self, sid: str, data: dict):
+         """Handle session languages update (affects translation targets)"""
+         try:
+             session_id = data.get('sessionId')
+             languages = data.get('languages', [])
+
+             print(f"=== UPDATE SESSION LANGUAGES (REPLACE MODE) ===")
+             print(f"Session ID: {session_id}")
+             print(f"New Languages: {languages}")
+
+             if not session_id or not languages:
+                 await self._emit_error(sid, "Missing required fields")
+                 return
+
+             # Get current session for comparison
+             session = await self.session_manager.get_session(session_id)
+             if not session:
+                 await self._emit_error(sid, "Session not found")
+                 return
+
+             current_languages = [lang.code.value for lang in session.languages]
+             print(f"Before update - Session languages: {current_languages}")
+
+             # Validate all language codes and create Language objects
+             validated_languages = []
+             try:
+                 from app.models import Language, LanguageCode
+                 from app.services.session_manager import LANGUAGE_MAP
+
+                 for lang_code in languages:
+                     lang_enum = LanguageCode(lang_code)
+                     language = LANGUAGE_MAP[lang_enum]
+                     validated_languages.append(language)
+                     print(f"Validated language: {lang_code} -> {language.name}")
+
+             except ValueError as e:
+                 await self._emit_error(sid, f"Invalid language code: {e}")
+                 return
+
+             # REPLACE session languages (not add to them)
+             session.languages = validated_languages
+             new_languages = [lang.code.value for lang in session.languages]
+             print(f"After update - Session languages: {new_languages}")
+
+             # Verify the session manager has the updated languages
+             verification_session = await self.session_manager.get_session(session_id)
+             if verification_session:
+                 verification_languages = [lang.code.value for lang in verification_session.languages]
+                 print(f"Verification - Session manager languages: {verification_languages}")
+
+             # Notify all clients in session about the update
+             await self._broadcast_to_session(session_id, 'session_languages_updated', {
+                 'sessionId': session_id,
+                 'languages': new_languages,
+                 'previous': current_languages
+             })
+
+             print(f"=== UPDATE SESSION LANGUAGES COMPLETE ===")
+
+         except Exception as e:
+             print(f"Error in handle_update_session_languages: {e}")
+             import traceback
+             traceback.print_exc()
+             await self._emit_error(sid, "Failed to update session languages")
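The manager only defines handlers; the event registration happens elsewhere. A hypothetical wiring sketch (event names and the pre-built service instances are assumptions, since app/main.py is not part of this diff):

```python
import socketio

from app.services.websocket_manager import WebSocketManager

def build_socket_app(session_manager, transcription_service,
                     translation_service, tts_service):
    sio = socketio.AsyncServer(async_mode='asgi', cors_allowed_origins='*')
    manager = WebSocketManager(session_manager, transcription_service,
                               translation_service, tts_service)
    manager.set_socketio(sio)

    # Assumed event names; only the handler methods are defined in this file.
    sio.on('join_session', manager.handle_join_session)
    sio.on('join_hub', manager.handle_join_hub)
    sio.on('audio_chunk', manager.handle_audio_chunk)
    sio.on('speaking_status', manager.handle_speaking_status)
    sio.on('update_participant_language', manager.handle_update_participant_language)
    sio.on('update_session_languages', manager.handle_update_session_languages)
    sio.on('leave_session', manager.handle_leave_session)
    sio.on('disconnect', manager.handle_disconnect)

    # ASGI wrapper that uvicorn can serve directly.
    return socketio.ASGIApp(sio), manager
```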
preload_models.py ADDED
@@ -0,0 +1,23 @@
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ import io
+ import nltk
+ import json
+ import torch
+ import os
+ import sys
+
+ import gc
+
+
+ if len(sys.argv) > 1:
+     os.environ["HUGGING_FACE_HUB_TOKEN"] = sys.argv[1]
+
+ nltk.download("punkt")
+ nltk.download("punkt_tab")
+
+ device = 0 if torch.cuda.is_available() else -1
+
+ def cleanup_model_resource(model):
+     del model
+     gc.collect()
+     torch.cuda.empty_cache()
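As committed, this script only warms the NLTK tokenizer data and accepts the HF token. A hedged sketch of the model warm-up step it appears to be building toward, using the NLLB repo id referenced in the websocket_manager.py logs (whether this exact repo id is correct is an assumption):

```python
# Hypothetical warm-up: downloading once at build time fills the HF cache
# (/app/.cache) so the first request does not pay the download cost.
MODEL_ID = "mutisya/nllb_600m"  # assumption: repo id used by translation_service.py

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
cleanup_model_resource(model)  # free RAM; the cached files on disk are what matter
```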
requirements.txt ADDED
@@ -0,0 +1,58 @@
+ # Web framework and server
+ fastapi==0.115.5
+ uvicorn[standard]==0.32.1
+ websockets==13.1
+ python-socketio==5.11.4
+ python-multipart==0.0.17
+ pydantic==2.10.3
+
+ # PyTorch ecosystem - latest stable versions
+ torch==2.5.1
+ torchaudio==2.5.1
+ transformers==4.45.2
+ datasets==3.1.0
+ tokenizers==0.20.4
+ accelerate==1.2.1
+
+ # ONNX Runtime for optimized inference - GPU enabled
+ onnxruntime-gpu==1.19.2
+ onnx==1.17.0
+ optimum[onnxruntime-gpu]==1.23.0
+ huggingface-hub==0.26.2
+
+ # Audio processing
+ soundfile==0.12.1
+ librosa==0.10.2
+ phonemizer==3.3.0
+ pydub==0.25.1
+
+ # Scientific computing
+ scipy==1.14.1
+ numpy==2.1.3
+
+ # Natural language processing
+ nltk==3.9.1
+ sentencepiece==0.2.0
+
+ # Computer vision and image processing
+ pillow==11.0.0
+ qrcode[pil]==8.0
+
+ # Authentication and security
+ python-jose[cryptography]==3.3.0
+ passlib[bcrypt]==1.7.4
+
+ # File handling
+ aiofiles==24.1.0
+
+ # Model optimization
+ bitsandbytes==0.45.0
+
+ # Protocol buffers - compatible version
+ protobuf==5.28.3
+
+ # Speech processing
+ speechbrain==1.0.2
+
+ # Voice Activity Detection
+ silero-vad>=5.1