lordofgaming committed on
Commit
673435a
·
1 Parent(s): fbdfd83

Initial VoiceForge deployment (clean)

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. Dockerfile +39 -0
  2. README.md +20 -6
  3. backend/.env +16 -0
  4. backend/.flake8 +4 -0
  5. backend/Dockerfile +50 -0
  6. backend/app/__init__.py +3 -0
  7. backend/app/api/__init__.py +3 -0
  8. backend/app/api/routes/__init__.py +35 -0
  9. backend/app/api/routes/analysis.py +60 -0
  10. backend/app/api/routes/audio.py +100 -0
  11. backend/app/api/routes/auth.py +116 -0
  12. backend/app/api/routes/batch.py +204 -0
  13. backend/app/api/routes/cloning.py +81 -0
  14. backend/app/api/routes/health.py +93 -0
  15. backend/app/api/routes/s2s.py +45 -0
  16. backend/app/api/routes/sign.py +164 -0
  17. backend/app/api/routes/sign_bridge.py +63 -0
  18. backend/app/api/routes/stt.py +489 -0
  19. backend/app/api/routes/transcripts.py +200 -0
  20. backend/app/api/routes/translation.py +261 -0
  21. backend/app/api/routes/tts.py +245 -0
  22. backend/app/api/routes/ws.py +208 -0
  23. backend/app/core/__init__.py +7 -0
  24. backend/app/core/config.py +108 -0
  25. backend/app/core/limiter.py +27 -0
  26. backend/app/core/middleware.py +70 -0
  27. backend/app/core/request_size_middleware.py +91 -0
  28. backend/app/core/security.py +113 -0
  29. backend/app/core/security_encryption.py +107 -0
  30. backend/app/core/security_headers.py +37 -0
  31. backend/app/core/ws_security.py +181 -0
  32. backend/app/main.py +273 -0
  33. backend/app/models/__init__.py +19 -0
  34. backend/app/models/audio_file.py +47 -0
  35. backend/app/models/auth.py +36 -0
  36. backend/app/models/base.py +44 -0
  37. backend/app/models/sign_lstm.py +63 -0
  38. backend/app/models/transcript.py +67 -0
  39. backend/app/schemas/__init__.py +39 -0
  40. backend/app/schemas/stt.py +98 -0
  41. backend/app/schemas/transcript.py +69 -0
  42. backend/app/schemas/tts.py +67 -0
  43. backend/app/services/__init__.py +13 -0
  44. backend/app/services/audio_service.py +101 -0
  45. backend/app/services/batch_service.py +348 -0
  46. backend/app/services/cache_service.py +71 -0
  47. backend/app/services/clone_service.py +104 -0
  48. backend/app/services/diarization_service.py +338 -0
  49. backend/app/services/edge_tts_service.py +357 -0
  50. backend/app/services/emotion_service.py +132 -0
Dockerfile ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CPU-only base for free-tier HuggingFace Spaces
2
+ FROM python:3.10-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONUNBUFFERED=1 \
6
+ PYTHONDONTWRITEBYTECODE=1 \
7
+ DEBIAN_FRONTEND=noninteractive
8
+
9
+ # Install system dependencies
10
+ RUN apt-get update && apt-get install -y \
11
+ python3-pip \
12
+ python3-dev \
13
+ ffmpeg \
14
+ libsndfile1 \
15
+ git \
16
+ supervisor \
17
+ && rm -rf /var/lib/apt/lists/*
18
+
19
+ # Set working directory
20
+ WORKDIR /app
21
+
22
+ # Copy requirements and install
23
+ COPY deploy/huggingface/requirements.txt .
24
+ RUN pip3 install --no-cache-dir -r requirements.txt
25
+
26
+ # Copy the rest of the application
27
+ COPY . .
28
+
29
+ # Create directory for weights and logs
30
+ RUN mkdir -p backend/app/models/weights /var/log/supervisor
31
+
32
+ # Copy supervisor config
33
+ COPY deploy/huggingface/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
34
+
35
+ # Expose ports (HF expects the app on 7860)
36
+ EXPOSE 7860
37
+
38
+ # Run supervisor to start both backend and frontend
39
+ CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
README.md CHANGED
@@ -1,10 +1,24 @@
1
  ---
2
- title: Voiceforge
3
- emoji: 🏢
4
- colorFrom: red
5
- colorTo: indigo
6
  sdk: docker
7
- pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: VoiceForge Universal
3
+ emoji: 🎙️
4
+ colorFrom: indigo
5
+ colorTo: pink
6
  sdk: docker
7
+ pinned: true
8
+ license: mit
9
  ---
10
 
11
+ # VoiceForge: Universal Communication Platform
12
+
13
+ **Instant Speech-to-Speech | Signed Communication | Meeting Intelligence**
14
+
15
+ Powered by:
16
+ - **Whisper** (STT)
17
+ - **SeamlessM4T** (S2S)
18
+ - **MediaPipe** (Sign Recognition)
19
+ - **Edge TTS** (Synthesis)
20
+
21
+ ## 🚀 Usage
22
+ 1. Click the "Speech-to-Speech" tab to translate voice instantly.
23
+ 2. Use "Signed Communication" to visualize ASL.
24
+ 3. Upload meetings for instant minutes.
backend/.env ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VoiceForge Backend Environment Variables
2
+
3
+ # Database
4
+ DATABASE_URL=sqlite:///./voiceforge.db
5
+
6
+ # Hugging Face Token (for Speaker Diarization)
7
+ # Get your token at: https://huggingface.co/settings/tokens
8
+ # See docs/HF_TOKEN_ROTATION.md for setup instructions
9
+ HF_TOKEN=<REDACTED — SECURITY: a live Hugging Face access token was committed here; revoke/rotate it at https://huggingface.co/settings/tokens and scrub it from git history, since redacting this view does not remove it from the commit>
10
+
11
+ # Encryption Key (auto-generated in dev, REQUIRED in production)
12
+ # Generate with: python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())"
13
+ # ENCRYPTION_KEY=
14
+
15
+ # Environment (development | production)
16
+ ENVIRONMENT=development
backend/.flake8 ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ [flake8]
2
+ max-line-length = 120
3
+ extend-ignore = E203
4
+ exclude = .git,__pycache__,docs/source/conf.py,old,build,dist,venv
backend/Dockerfile ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Build Stage
2
+ FROM python:3.10-slim as builder
3
+
4
+ WORKDIR /app
5
+
6
+ # Set environment variables
7
+ ENV PYTHONDONTWRITEBYTECODE 1
8
+ ENV PYTHONUNBUFFERED 1
9
+
10
+ # Install system dependencies required for building python packages
11
+ # ffmpeg is needed for audio processing
12
+ RUN apt-get update && apt-get install -y --no-install-recommends \
13
+ gcc \
14
+ ffmpeg \
15
+ && rm -rf /var/lib/apt/lists/*
16
+
17
+ # Install python dependencies
18
+ COPY requirements.txt .
19
+ RUN pip wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt
20
+
21
+
22
+ # Final Stage
23
+ FROM python:3.10-slim
24
+
25
+ WORKDIR /app
26
+
27
+ # Install runtime dependencies (ffmpeg)
28
+ RUN apt-get update && apt-get install -y --no-install-recommends \
29
+ ffmpeg \
30
+ && rm -rf /var/lib/apt/lists/*
31
+
32
+ # Copy wheels from builder
33
+ COPY --from=builder /app/wheels /wheels
34
+ COPY --from=builder /app/requirements.txt .
35
+
36
+ # Install dependencies from wheels
37
+ RUN pip install --no-cache /wheels/*
38
+
39
+ # Copy application code
40
+ COPY . .
41
+
42
+ # Create a non-root user
43
+ RUN addgroup --system app && adduser --system --group app
44
+ USER app
45
+
46
+ # Expose port
47
+ EXPOSE 8000
48
+
49
+ # Run commands
50
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
backend/app/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ VoiceForge Backend Package
3
+ """
backend/app/api/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ VoiceForge API Package
3
+ """
backend/app/api/routes/__init__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge API Routes Package
3
+ """
4
+
5
+ from .stt import router as stt_router
6
+ from .tts import router as tts_router
7
+ from .health import router as health_router
8
+ from .transcripts import router as transcripts_router
9
+ from .ws import router as ws_router
10
+ from .translation import router as translation_router
11
+ from .batch import router as batch_router
12
+ from .analysis import router as analysis_router
13
+ from .audio import router as audio_router
14
+ from .cloning import router as cloning_router
15
+ from .sign import router as sign_router
16
+ from .auth import router as auth_router
17
+ from .s2s import router as s2s_router
18
+ from .sign_bridge import router as sign_bridge_router
19
+
20
+ __all__ = [
21
+ "stt_router",
22
+ "tts_router",
23
+ "health_router",
24
+ "transcripts_router",
25
+ "ws_router",
26
+ "translation_router",
27
+ "batch_router",
28
+ "analysis_router",
29
+ "audio_router",
30
+ "cloning_router",
31
+ "sign_router",
32
+ "auth_router",
33
+ "s2s_router",
34
+ "sign_bridge_router",
35
+ ]
backend/app/api/routes/analysis.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Analysis API Routes
3
+ Endpoints for Emotion and Sentiment Analysis
4
+ """
5
+
6
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
7
+ from typing import Dict, Any
8
+ import logging
9
+ import os
10
+ import shutil
11
+ import tempfile
12
+
13
+ from app.services.emotion_service import get_emotion_service
14
+ from app.services.nlp_service import get_nlp_service
15
+
16
+ logger = logging.getLogger(__name__)
17
+ router = APIRouter(prefix="/analysis", tags=["Analysis"])
18
+
19
+
20
+ @router.post("/emotion/audio")
21
+ async def analyze_audio_emotion(
22
+ file: UploadFile = File(..., description="Audio file to analyze"),
23
+ ):
24
+ """
25
+ Analyze emotions in an audio file using Wav2Vec2.
26
+ Returns dominant emotion and probability distribution.
27
+ """
28
+ service = get_emotion_service()
29
+
30
+ # Save to temp file
31
+ suffix = os.path.splitext(file.filename)[1] or ".wav"
32
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
33
+ shutil.copyfileobj(file.file, tmp)
34
+ tmp_path = tmp.name
35
+
36
+ try:
37
+ result = service.analyze_audio(tmp_path)
38
+ return result
39
+ except Exception as e:
40
+ logger.error(f"Emotion analysis failed: {e}")
41
+ raise HTTPException(status_code=500, detail=str(e))
42
+ finally:
43
+ try:
44
+ os.unlink(tmp_path)
45
+ except:
46
+ pass
47
+
48
+
49
+ @router.post("/sentiment/text")
50
+ async def analyze_text_sentiment(
51
+ text: str = Form(..., description="Text to analyze"),
52
+ ):
53
+ """
54
+ Analyze text sentiment (polarity and subjectivity).
55
+ """
56
+ nlp_service = get_nlp_service()
57
+ try:
58
+ return nlp_service.analyze_sentiment(text)
59
+ except Exception as e:
60
+ raise HTTPException(status_code=500, detail=str(e))
backend/app/api/routes/audio.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio Editing API Routes
3
+ """
4
+
5
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
6
+ from fastapi.responses import FileResponse
7
+ from typing import List, Optional
8
+ import os
9
+ import shutil
10
+ import tempfile
11
+ import uuid
12
+
13
+ from app.services.audio_service import get_audio_service, AudioService
14
+
15
+ router = APIRouter(prefix="/audio", tags=["Audio Studio"])
16
+
17
+ @router.post("/trim")
18
+ async def trim_audio(
19
+ file: UploadFile = File(..., description="Audio file"),
20
+ start_sec: float = Form(..., description="Start time in seconds"),
21
+ end_sec: float = Form(..., description="End time in seconds"),
22
+ service: AudioService = Depends(get_audio_service)
23
+ ):
24
+ """Trim an audio file"""
25
+ suffix = os.path.splitext(file.filename)[1] or ".mp3"
26
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
27
+ shutil.copyfileobj(file.file, tmp)
28
+ tmp_path = tmp.name
29
+
30
+ try:
31
+ output_path = tmp_path.replace(suffix, f"_trimmed{suffix}")
32
+ service.trim_audio(tmp_path, int(start_sec * 1000), int(end_sec * 1000), output_path)
33
+
34
+ return FileResponse(
35
+ output_path,
36
+ filename=f"trimmed_{file.filename}",
37
+ background=None # Let FastAPI handle cleanup? No, we need custom cleanup or use background task
38
+ )
39
+ except Exception as e:
40
+ raise HTTPException(status_code=500, detail=str(e))
41
+ # Note: Temp files might persist. In prod, use a cleanup task.
42
+
43
+ @router.post("/merge")
44
+ async def merge_audio(
45
+ files: List[UploadFile] = File(..., description="Files to merge"),
46
+ format: str = Form("mp3", description="Output format"),
47
+ service: AudioService = Depends(get_audio_service)
48
+ ):
49
+ """Merge multiple audio files"""
50
+ temp_files = []
51
+ try:
52
+ for file in files:
53
+ suffix = os.path.splitext(file.filename)[1] or ".mp3"
54
+ tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
55
+ content = await file.read()
56
+ tmp.write(content)
57
+ tmp.close()
58
+ temp_files.append(tmp.name)
59
+
60
+ output_filename = f"merged_{uuid.uuid4()}.{format}"
61
+ output_path = os.path.join(tempfile.gettempdir(), output_filename)
62
+
63
+ service.merge_audio(temp_files, output_path)
64
+
65
+ return FileResponse(output_path, filename=output_filename)
66
+
67
+ except Exception as e:
68
+ raise HTTPException(status_code=500, detail=str(e))
69
+ finally:
70
+ for p in temp_files:
71
+ try:
72
+ os.unlink(p)
73
+ except:
74
+ pass
75
+
76
+ @router.post("/convert")
77
+ async def convert_audio(
78
+ file: UploadFile = File(..., description="Audio file"),
79
+ target_format: str = Form(..., description="Target format (mp3, wav, flac, ogg)"),
80
+ service: AudioService = Depends(get_audio_service)
81
+ ):
82
+ """Convert audio format"""
83
+ suffix = os.path.splitext(file.filename)[1] or ".wav"
84
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
85
+ shutil.copyfileobj(file.file, tmp)
86
+ tmp_path = tmp.name
87
+
88
+ try:
89
+ output_path = service.convert_format(tmp_path, target_format)
90
+ return FileResponse(
91
+ output_path,
92
+ filename=f"{os.path.splitext(file.filename)[0]}.{target_format}"
93
+ )
94
+ except Exception as e:
95
+ raise HTTPException(status_code=500, detail=str(e))
96
+ finally:
97
+ try:
98
+ os.unlink(tmp_path)
99
+ except:
100
+ pass
backend/app/api/routes/auth.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime, timedelta
2
+ from typing import List
3
+ from pydantic import BaseModel
4
+ import secrets
5
+ from fastapi import APIRouter, Depends, HTTPException, status
6
+ from fastapi.security import OAuth2PasswordRequestForm
7
+ from sqlalchemy.orm import Session
8
+
9
+ from ...core.security import (
10
+ create_access_token,
11
+ get_password_hash,
12
+ verify_password,
13
+ get_current_active_user,
14
+ ACCESS_TOKEN_EXPIRE_MINUTES
15
+ )
16
+ from ...models import get_db, User, ApiKey
17
+ from ...core.limiter import limiter
18
+ from fastapi import APIRouter, Depends, HTTPException, status, Request
19
+
20
+ router = APIRouter(prefix="/auth", tags=["Authentication"])
21
+
22
+ # --- Schemas ---
23
+ class Token(BaseModel):
24
+ access_token: str
25
+ token_type: str
26
+
27
+ class UserCreate(BaseModel):
28
+ email: str
29
+ password: str
30
+ full_name: str = None
31
+
32
+ class UserOut(BaseModel):
33
+ id: int
34
+ email: str
35
+ full_name: str = None
36
+ is_active: bool
37
+
38
+ class Config:
39
+ orm_mode = True
40
+
41
+ class ApiKeyCreate(BaseModel):
42
+ name: str
43
+
44
+ class ApiKeyOut(BaseModel):
45
+ key: str
46
+ name: str
47
+ created_at: datetime
48
+
49
+ class Config:
50
+ orm_mode = True
51
+
52
+
53
+ # --- Endpoints ---
54
+
55
+ @router.post("/register", response_model=UserOut)
56
+ @limiter.limit("5/minute")
57
+ async def register(request: Request, user_in: UserCreate, db: Session = Depends(get_db)):
58
+ """Register a new user"""
59
+ existing_user = db.query(User).filter(User.email == user_in.email).first()
60
+ if existing_user:
61
+ raise HTTPException(status_code=400, detail="Email already registered")
62
+
63
+ hashed_password = get_password_hash(user_in.password)
64
+ new_user = User(
65
+ email=user_in.email,
66
+ hashed_password=hashed_password,
67
+ full_name=user_in.full_name
68
+ )
69
+ db.add(new_user)
70
+ db.commit()
71
+ db.refresh(new_user)
72
+ return new_user
73
+
74
+ @router.post("/login", response_model=Token)
75
+ @limiter.limit("5/minute")
76
+ async def login(request: Request, form_data: OAuth2PasswordRequestForm = Depends(), db: Session = Depends(get_db)):
77
+ """Login to get access token"""
78
+ user = db.query(User).filter(User.email == form_data.username).first()
79
+ if not user or not verify_password(form_data.password, user.hashed_password):
80
+ raise HTTPException(
81
+ status_code=status.HTTP_401_UNAUTHORIZED,
82
+ detail="Incorrect email or password",
83
+ headers={"WWW-Authenticate": "Bearer"},
84
+ )
85
+
86
+ access_token_expires = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
87
+ access_token = create_access_token(
88
+ subject=user.id, expires_delta=access_token_expires
89
+ )
90
+ return {"access_token": access_token, "token_type": "bearer"}
91
+
92
+ @router.post("/api-keys", response_model=ApiKeyOut)
93
+ async def create_api_key(
94
+ key_in: ApiKeyCreate,
95
+ current_user: User = Depends(get_current_active_user),
96
+ db: Session = Depends(get_db)
97
+ ):
98
+ """Generate a new API key for the current user"""
99
+ # Generate secure 32-char key
100
+ raw_key = secrets.token_urlsafe(32)
101
+ api_key_str = f"vf_{raw_key}" # Prefix for identification
102
+
103
+ new_key = ApiKey(
104
+ key=api_key_str,
105
+ name=key_in.name,
106
+ user_id=current_user.id
107
+ )
108
+ db.add(new_key)
109
+ db.commit()
110
+ db.refresh(new_key)
111
+ return new_key
112
+
113
+ @router.get("/me", response_model=UserOut)
114
+ async def read_users_me(current_user: User = Depends(get_current_active_user)):
115
+ """Get current user details"""
116
+ return current_user
backend/app/api/routes/batch.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Batch Processing API Routes
3
+ Endpoints for submitting and managing batch transcription jobs
4
+ """
5
+
6
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends, BackgroundTasks
7
+ from fastapi.responses import FileResponse
8
+ from pydantic import BaseModel, Field
9
+ from typing import List, Optional, Dict, Any
10
+ import logging
11
+ import shutil
12
+ import os
13
+ import tempfile
14
+ from pathlib import Path
15
+
16
+ from app.services.batch_service import get_batch_service
17
+
18
+ logger = logging.getLogger(__name__)
19
+ router = APIRouter(prefix="/batch", tags=["batch"])
20
+
21
+
22
+ # Request/Response Models
23
+ class BatchJobResponse(BaseModel):
24
+ """Batch job response model."""
25
+ job_id: str
26
+ status: str
27
+ progress: float
28
+ created_at: str
29
+ total_files: int
30
+ completed_files: int
31
+ failed_files: int
32
+ has_zip: bool
33
+ files: Optional[Dict[str, Any]] = None
34
+
35
+
36
+ # Endpoints
37
+ @router.post("/transcribe", response_model=BatchJobResponse)
38
+ async def create_batch_job(
39
+ background_tasks: BackgroundTasks,
40
+ files: List[UploadFile] = File(..., description="Audio files to transcribe"),
41
+ language: Optional[str] = Form(None, description="Language code (e.g., 'en', 'hi')"),
42
+ output_format: str = Form("txt", description="Output format (txt, srt)"),
43
+ ):
44
+ """
45
+ Submit a batch of audio files for transcription.
46
+
47
+ 1. Uploads multiple files
48
+ 2. Creates a batch job
49
+ 3. Starts processing in background
50
+
51
+ Args:
52
+ files: List of audio files
53
+ language: Optional language code
54
+ output_format: Output format (txt or srt)
55
+
56
+ Returns:
57
+ Created job details
58
+ """
59
+ if not files:
60
+ raise HTTPException(status_code=400, detail="No files provided")
61
+
62
+ if len(files) > 50:
63
+ raise HTTPException(status_code=400, detail="Maximum 50 files per batch")
64
+
65
+ try:
66
+ service = get_batch_service()
67
+
68
+ # Create temp files for processing
69
+ file_paths = {}
70
+ original_names = []
71
+
72
+ for file in files:
73
+ suffix = Path(file.filename).suffix or ".wav"
74
+ # Create a named temp file that persists until manually deleted
75
+ tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
76
+ content = await file.read()
77
+ tmp.write(content)
78
+ tmp.close()
79
+
80
+ file_paths[file.filename] = tmp.name
81
+ original_names.append(file.filename)
82
+
83
+ # Create job
84
+ job = service.create_job(
85
+ filenames=original_names,
86
+ options={
87
+ "language": language,
88
+ "output_format": output_format,
89
+ }
90
+ )
91
+
92
+ # Connect to Celery worker for processing
93
+ from app.workers.tasks import process_audio_file
94
+
95
+ # NOTE: For MVP batch service, we are currently keeping the simplified background_tasks approach
96
+ # because the 'process_audio_file' task defined in tasks.py is for individual files,
97
+ # whereas 'process_job' handles the whole batch logic (zipping etc).
98
+ # To fully migrate, we would need to refactor batch_service to span multiple tasks.
99
+ #
100
+ # For now, let's keep the background_task for the orchestrator, and have the orchestrator
101
+ # call the celery tasks for individual files?
102
+ # Actually, `service.process_job` currently runs synchronously in a background thread.
103
+ # We will leave as is for 3.1 step 1, but we CAN use Celery for the individual transcriptions.
104
+
105
+ # Start processing in background (Orchestrator runs in thread, calls expensive operations)
106
+ background_tasks.add_task(
107
+ service.process_job,
108
+ job_id=job.job_id,
109
+ file_paths=file_paths,
110
+ )
111
+
112
+ return job.to_dict()
113
+
114
+ except Exception as e:
115
+ # Cleanup any created temp files on error
116
+ for path in file_paths.values():
117
+ try:
118
+ os.unlink(path)
119
+ except:
120
+ pass
121
+ logger.error(f"Batch job creation failed: {e}")
122
+ raise HTTPException(status_code=500, detail=str(e))
123
+
124
+
125
+ @router.get("/jobs", response_model=List[BatchJobResponse])
126
+ async def list_jobs(limit: int = 10):
127
+ """
128
+ List recent batch jobs.
129
+
130
+ Args:
131
+ limit: Max number of jobs to return
132
+
133
+ Returns:
134
+ List of jobs
135
+ """
136
+ service = get_batch_service()
137
+ jobs = service.list_jobs(limit)
138
+ return [job.to_dict() for job in jobs]
139
+
140
+
141
+ @router.get("/{job_id}", response_model=BatchJobResponse)
142
+ async def get_job_status(job_id: str):
143
+ """
144
+ Get status of a specific batch job.
145
+
146
+ Args:
147
+ job_id: Job ID
148
+
149
+ Returns:
150
+ Job details and progress
151
+ """
152
+ service = get_batch_service()
153
+ job = service.get_job(job_id)
154
+
155
+ if not job:
156
+ raise HTTPException(status_code=404, detail="Job not found")
157
+
158
+ return job.to_dict()
159
+
160
+
161
+ @router.get("/{job_id}/download")
162
+ async def download_results(job_id: str):
163
+ """
164
+ Download batch job results as ZIP.
165
+
166
+ Args:
167
+ job_id: Job ID
168
+
169
+ Returns:
170
+ ZIP file download
171
+ """
172
+ service = get_batch_service()
173
+ zip_path = service.get_zip_path(job_id)
174
+
175
+ if not zip_path:
176
+ raise HTTPException(status_code=404, detail="Results not available (job may be processing or failed)")
177
+
178
+ return FileResponse(
179
+ path=zip_path,
180
+ filename=f"batch_{job_id}_results.zip",
181
+ media_type="application/zip",
182
+ )
183
+
184
+
185
+ @router.delete("/{job_id}")
186
+ async def delete_job(job_id: str):
187
+ """
188
+ Delete a batch job and cleanup files.
189
+
190
+ Args:
191
+ job_id: Job ID
192
+ """
193
+ service = get_batch_service()
194
+
195
+ # Try to cancel first if running
196
+ service.cancel_job(job_id)
197
+
198
+ # Delete data
199
+ success = service.delete_job(job_id)
200
+
201
+ if not success:
202
+ raise HTTPException(status_code=404, detail="Job not found")
203
+
204
+ return {"status": "deleted", "job_id": job_id}
backend/app/api/routes/cloning.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Voice Cloning API Routes
3
+ """
4
+
5
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
6
+ from fastapi.responses import FileResponse
7
+ from typing import List, Optional
8
+ import os
9
+ import shutil
10
+ import tempfile
11
+ import uuid
12
+
13
+ from app.services.clone_service import get_clone_service, CloneService
14
+
15
+ router = APIRouter(prefix="/clone", tags=["Voice Cloning"])
16
+
17
+ @router.post("/synthesize")
18
+ async def clone_synthesize(
19
+ text: str = Form(..., description="Text to speak"),
20
+ language: str = Form("en", description="Language code (en, es, fr, de, etc.)"),
21
+ files: List[UploadFile] = File(..., description="Reference audio samples (1-3 files, 3-10s each recommended)"),
22
+ service: CloneService = Depends(get_clone_service)
23
+ ):
24
+ """
25
+ Clone a voice from reference audio samples.
26
+
27
+ Uses Coqui XTTS v2.
28
+ WARNING: Heavy operation. May take 5-20 seconds depending on GPU.
29
+ """
30
+
31
+ # Validation
32
+ if not files:
33
+ raise HTTPException(status_code=400, detail="At least one reference audio file is required")
34
+
35
+ temp_files = []
36
+
37
+ try:
38
+ # Save reference files
39
+ for file in files:
40
+ suffix = os.path.splitext(file.filename)[1] or ".wav"
41
+ tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
42
+ content = await file.read()
43
+ tmp.write(content)
44
+ tmp.close()
45
+ temp_files.append(tmp.name)
46
+
47
+ # Generate output path
48
+ output_filename = f"cloned_{uuid.uuid4()}.wav"
49
+ output_path = os.path.join(tempfile.gettempdir(), output_filename)
50
+
51
+ # Synthesize
52
+ service.clone_voice(
53
+ text=text,
54
+ speaker_wav_paths=temp_files,
55
+ language=language,
56
+ output_path=output_path
57
+ )
58
+
59
+ return FileResponse(
60
+ output_path,
61
+ filename="cloned_speech.wav",
62
+ media_type="audio/wav"
63
+ )
64
+
65
+ except ImportError:
66
+ raise HTTPException(status_code=503, detail="Voice Cloning service not available (TTS library missing)")
67
+ except Exception as e:
68
+ raise HTTPException(status_code=500, detail=str(e))
69
+
70
+ finally:
71
+ # Cleanup input files
72
+ for p in temp_files:
73
+ try:
74
+ os.unlink(p)
75
+ except:
76
+ pass
77
+ # Note: Output file cleanup needs management in prod (background task or stream)
78
+
79
+ @router.get("/languages")
80
+ def get_languages(service: CloneService = Depends(get_clone_service)):
81
+ return {"languages": service.get_supported_languages()}
backend/app/api/routes/health.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Health Check Router
3
+ """
4
+
5
+ from fastapi import APIRouter
6
+
7
+ router = APIRouter(prefix="/health", tags=["Health"])
8
+
9
+
10
+ @router.get("")
11
+ @router.get("/")
12
+ async def health_check():
13
+ """Basic health check endpoint"""
14
+ return {
15
+ "status": "healthy",
16
+ "service": "voiceforge-api",
17
+ "version": "1.0.0",
18
+ }
19
+
20
+
21
+ @router.get("/ready")
22
+ async def readiness_check():
23
+ """Readiness check - verifies all dependencies are available"""
24
+ # TODO: Check database, Redis, Google Cloud connectivity
25
+ return {
26
+ "status": "ready",
27
+ "checks": {
28
+ "database": "ok",
29
+ "redis": "ok",
30
+ "google_cloud": "ok",
31
+ }
32
+ }
33
+
34
+
35
+ @router.get("/memory")
36
+ async def memory_status():
37
+ """Get current memory usage and loaded models"""
38
+ from ...services.whisper_stt_service import (
39
+ _whisper_models,
40
+ _model_last_used,
41
+ get_memory_usage_mb
42
+ )
43
+ import time
44
+
45
+ current_time = time.time()
46
+ models_info = {}
47
+
48
+ for name in _whisper_models.keys():
49
+ last_used = _model_last_used.get(name, 0)
50
+ idle_seconds = current_time - last_used if last_used else 0
51
+ models_info[name] = {
52
+ "loaded": True,
53
+ "idle_seconds": round(idle_seconds, 1)
54
+ }
55
+
56
+ return {
57
+ "memory_mb": round(get_memory_usage_mb(), 1),
58
+ "loaded_models": list(_whisper_models.keys()),
59
+ "models_detail": models_info
60
+ }
61
+
62
+
63
+ @router.post("/memory/cleanup")
64
+ async def cleanup_memory():
65
+ """Unload idle models to free memory"""
66
+ from ...services.whisper_stt_service import cleanup_idle_models, get_memory_usage_mb
67
+
68
+ before = get_memory_usage_mb()
69
+ cleanup_idle_models()
70
+ after = get_memory_usage_mb()
71
+
72
+ return {
73
+ "memory_before_mb": round(before, 1),
74
+ "memory_after_mb": round(after, 1),
75
+ "freed_mb": round(before - after, 1)
76
+ }
77
+
78
+
79
+ @router.post("/memory/unload-all")
80
+ async def unload_all():
81
+ """Unload ALL models to free maximum memory"""
82
+ from ...services.whisper_stt_service import unload_all_models, get_memory_usage_mb
83
+
84
+ before = get_memory_usage_mb()
85
+ unloaded = unload_all_models()
86
+ after = get_memory_usage_mb()
87
+
88
+ return {
89
+ "unloaded_models": unloaded,
90
+ "memory_before_mb": round(before, 1),
91
+ "memory_after_mb": round(after, 1),
92
+ "freed_mb": round(before - after, 1)
93
+ }
backend/app/api/routes/s2s.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Speech-to-Speech API Router
3
+ """
4
+
5
+ import logging
6
+ from typing import Optional
7
+ from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException
8
+
9
+ from app.services.speech_bridge_service import get_bridge_service, SpeechBridgeService
10
+ from app.core.config import get_settings
11
+
12
+ logger = logging.getLogger(__name__)
13
+ router = APIRouter(prefix="/s2s", tags=["Speech-to-Speech"])
14
+ settings = get_settings()
15
+
16
+ @router.post("/process")
17
+ async def process_speech_to_speech(
18
+ file: UploadFile = File(..., description="Audio file to process"),
19
+ source_lang: str = Form("en", description="Source language code (e.g. 'en', 'hi')"),
20
+ target_lang: str = Form("es", description="Target language code (e.g. 'es', 'fr')"),
21
+ voice_id: Optional[str] = Form(None, description="Target TTS Voice ID"),
22
+ bridge_service: SpeechBridgeService = Depends(get_bridge_service)
23
+ ):
24
+ """
25
+ Process audio: Speech -> Text -> Translation -> Speech
26
+ """
27
+ try:
28
+ # Read audio file
29
+ audio_bytes = await file.read()
30
+
31
+ result = await bridge_service.process_speech_to_speech(
32
+ audio_bytes=audio_bytes,
33
+ source_lang=source_lang,
34
+ target_lang=target_lang,
35
+ voice_id=voice_id
36
+ )
37
+
38
+ if "error" in result:
39
+ raise HTTPException(status_code=400, detail=result["error"])
40
+
41
+ return result
42
+
43
+ except Exception as e:
44
+ logger.error(f"S2S API Error: {e}")
45
+ raise HTTPException(status_code=500, detail=str(e))
backend/app/api/routes/sign.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sign Language API Routes
3
+ Provides WebSocket and REST endpoints for ASL recognition.
4
+ """
5
+
6
+ from fastapi import APIRouter, WebSocket, WebSocketDisconnect, UploadFile, File, HTTPException
7
+ from fastapi.responses import JSONResponse
8
+ import numpy as np
9
+ import base64
10
+ import cv2
11
+ import logging
12
+ from typing import List
13
+
14
+ from ...services.sign_recognition_service import get_sign_service, SignPrediction
15
+ from ...services.sign_avatar_service import get_avatar_service
16
+ from pydantic import BaseModel
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ router = APIRouter(prefix="/sign", tags=["Sign Language"])
21
+
22
class TextToSignRequest(BaseModel):
    """Request body for /sign/animate: plain text to finger-spell."""
    text: str  # text to convert into a sign-animation (gloss) sequence
24
+
25
+
26
@router.get("/health")
async def sign_health():
    """Check if sign recognition service is available"""
    try:
        # Resolving the service is the availability check itself.
        get_sign_service()
    except Exception as exc:
        return {"status": "error", "message": str(exc)}
    return {"status": "ready", "service": "SignRecognitionService"}
34
+
35
+
36
@router.post("/recognize")
async def recognize_sign(file: UploadFile = File(..., description="Image of hand sign")):
    """
    Recognize ASL letter from a single image.

    Upload an image containing a hand sign to get the predicted letter.

    Raises:
        HTTPException 400: the upload could not be decoded as an image.
        HTTPException 500: unexpected recognition failure.
    """
    try:
        # Read and decode the upload into a BGR image
        contents = await file.read()
        nparr = np.frombuffer(contents, np.uint8)
        image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

        # cv2.imdecode returns None (it does not raise) on undecodable input
        if image is None:
            raise HTTPException(status_code=400, detail="Invalid image file")

        # Get predictions
        service = get_sign_service()
        predictions = service.process_frame(image)

        if not predictions:
            return JSONResponse({
                "success": True,
                "predictions": [],
                "message": "No hands detected in image"
            })

        return JSONResponse({
            "success": True,
            "predictions": [
                {
                    "letter": p.letter,
                    "confidence": p.confidence
                }
                for p in predictions
            ]
        })

    except HTTPException:
        # Bug fix: re-raise client errors; previously the 400 above was
        # converted into a 500 by the generic handler below.
        raise
    except Exception as e:
        logger.error(f"Sign recognition error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
77
+
78
+
79
@router.websocket("/live")
async def sign_websocket(websocket: WebSocket):
    """
    WebSocket endpoint for real-time sign language recognition.

    Client sends base64-encoded JPEG frames, server responds with predictions.

    Protocol:
    - Client sends: {"frame": "<base64 jpeg>"}
    - Server sends: {"predictions": [{"letter": "A", "confidence": 0.8}]}

    Malformed messages get an {"error": ...} reply and the loop continues;
    only an unexpected server error closes the socket (code 1011).
    """
    await websocket.accept()
    # Resolved once per connection and reused for every frame.
    service = get_sign_service()

    logger.info("Sign language WebSocket connected")

    try:
        while True:
            # Receive frame from client (raises WebSocketDisconnect on close)
            data = await websocket.receive_json()

            if "frame" not in data:
                await websocket.send_json({"error": "Missing 'frame' field"})
                continue

            # Decode base64 image
            try:
                frame_data = base64.b64decode(data["frame"])
                nparr = np.frombuffer(frame_data, np.uint8)
                frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

                # cv2.imdecode returns None (no exception) for bad data
                if frame is None:
                    await websocket.send_json({"error": "Invalid frame data"})
                    continue

            except Exception as e:
                await websocket.send_json({"error": f"Frame decode error: {e}"})
                continue

            # Process frame
            # NOTE(review): process_frame looks synchronous; a slow model would
            # block the event loop here — confirm, or offload to a thread.
            predictions = service.process_frame(frame)

            # Send results
            await websocket.send_json({
                "predictions": [
                    {
                        "letter": p.letter,
                        "confidence": round(p.confidence, 2)
                    }
                    for p in predictions
                ]
            })

    except WebSocketDisconnect:
        logger.info("Sign language WebSocket disconnected")
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
        await websocket.close(code=1011, reason=str(e))
137
+
138
+
139
@router.get("/alphabet")
async def get_alphabet():
    """Get list of supported ASL letters"""
    implemented = "ABCDILUVWY5"  # Currently implemented
    return {
        "supported_letters": [letter for letter in implemented],
        "note": "J and Z require motion tracking (coming soon)",
    }
146
+
147
+
148
@router.post("/animate")
async def animate_text(request: TextToSignRequest):
    """
    Convert text to sign language animation sequence (Finger Spelling).
    """
    try:
        glosses = get_avatar_service().text_to_glosses(request.text)
        return {
            "success": True,
            "sequence": glosses,
            "count": len(glosses),
        }
    except Exception as exc:
        logger.error(f"Animation error: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
backend/app/api/routes/sign_bridge.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sign-to-Speech API Router
3
+ """
4
+
5
+ import logging
6
+ from typing import Optional, Dict, Any
7
+ from fastapi import APIRouter, UploadFile, File, Form, Depends, HTTPException, Body
8
+ from pydantic import BaseModel
9
+
10
+ from app.services.sign_bridge_service import get_sign_bridge_service, SignBridgeService
11
+ from app.core.config import get_settings
12
+
13
+ logger = logging.getLogger(__name__)
14
+ router = APIRouter(prefix="/sign-bridge", tags=["Sign-to-Speech"])
15
+ settings = get_settings()
16
+
17
class SignTextRequest(BaseModel):
    """Request body for /sign-bridge/speak."""
    text: str  # text recognised from sign language, to be spoken aloud
    voice_id: Optional[str] = "en-US-AriaNeural"  # TTS voice identifier
20
+
21
@router.post("/speak")
async def speak_sign_text(
    request: SignTextRequest,
    bridge_service: SignBridgeService = Depends(get_sign_bridge_service)
):
    """
    Speak text derived from Sign Language.

    Raises:
        HTTPException 400: the bridge service reported an error.
        HTTPException 500: unexpected synthesis failure.
    """
    try:
        result = await bridge_service.speak_text(
            text=request.text,
            voice_id=request.voice_id
        )
        if "error" in result:
            raise HTTPException(status_code=400, detail=result["error"])
        return result
    except HTTPException:
        # Bug fix: without this clause the 400 above was swallowed by the
        # generic handler below and re-raised as a 500.
        raise
    except Exception as e:
        logger.error(f"Sign-Bridge Speak Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
40
+
41
@router.post("/process-frame")
async def process_frame(
    file: UploadFile = File(...),
    bridge_service: SignBridgeService = Depends(get_sign_bridge_service)
):
    """
    Process a video frame for sign recognition (Backend-side).
    Note: For real-time, client-side MediaPipe is preferred.

    Raises:
        HTTPException 400: the upload could not be decoded as an image.
        HTTPException 500: unexpected processing failure.
    """
    try:
        # Read image
        import numpy as np
        import cv2

        contents = await file.read()
        nparr = np.frombuffer(contents, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

        # Bug fix: cv2.imdecode returns None for undecodable input; without
        # this check the bridge service received None and failed obscurely
        # (the sibling /sign/recognize route already performs this check).
        if img is None:
            raise HTTPException(status_code=400, detail="Invalid image file")

        result = await bridge_service.process_sign_frame(img)
        return result
    except HTTPException:
        # Re-raise client errors instead of converting them to 500s.
        raise
    except Exception as e:
        logger.error(f"Sign-Bridge Frame Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
backend/app/api/routes/stt.py ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Speech-to-Text API Router
3
+ """
4
+
5
+ import logging
6
+ from datetime import datetime
7
+ from typing import Optional, List
8
+
9
+ from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends, Request
10
+ from fastapi.responses import JSONResponse
11
+
12
+ from ...core.limiter import limiter
13
+
14
+ from ...services.stt_service import get_stt_service, STTService
15
+ from ...services.file_service import get_file_service, FileService
16
+ from ...schemas.stt import (
17
+ TranscriptionResponse,
18
+ TranscriptionRequest,
19
+ LanguageInfo,
20
+ LanguageListResponse,
21
+ )
22
+ from ...core.config import get_settings
23
+ from sqlalchemy.orm import Session
24
+ from app.models import get_db, AudioFile, Transcript
25
+ from ...workers.tasks import process_audio_file
26
+ from celery.result import AsyncResult
27
+ from ...schemas.stt import (
28
+ TranscriptionResponse,
29
+ TranscriptionRequest,
30
+ LanguageInfo,
31
+ LanguageListResponse,
32
+ AsyncTranscriptionResponse,
33
+ TaskStatusResponse,
34
+ )
35
+
36
+
37
+ logger = logging.getLogger(__name__)
38
+ router = APIRouter(prefix="/stt", tags=["Speech-to-Text"])
39
+ settings = get_settings()
40
+
41
+
42
@router.get("/languages", response_model=LanguageListResponse)
async def get_supported_languages(
    stt_service: STTService = Depends(get_stt_service),
):
    """
    Get list of supported languages for speech-to-text
    """
    supported = stt_service.get_supported_languages()
    return LanguageListResponse(languages=supported, total=len(supported))
54
+
55
+
56
@router.post("/upload", response_model=TranscriptionResponse)
@limiter.limit("10/minute")
async def transcribe_upload(
    request: Request,
    file: UploadFile = File(..., description="Audio file to transcribe"),
    language: str = Form(default="en-US", description="Language code"),
    enable_punctuation: bool = Form(default=True, description="Enable automatic punctuation"),
    enable_word_timestamps: bool = Form(default=True, description="Include word-level timestamps"),
    enable_diarization: bool = Form(default=False, description="Enable speaker diarization"),
    speaker_count: Optional[int] = Form(default=None, description="Expected number of speakers"),
    prompt: Optional[str] = Form(None, description="Custom vocabulary/keywords (e.g. 'VoiceForge, PyTorch')"),
    stt_service: STTService = Depends(get_stt_service),
    file_service: FileService = Depends(get_file_service),
    db: Session = Depends(get_db),

):
    """
    Transcribe an uploaded audio file

    Supports: WAV, MP3, M4A, FLAC, OGG, WebM

    For files longer than 1 minute, consider using the async endpoint.

    Flow: validate format/language -> persist upload -> synchronous STT ->
    persist AudioFile + Transcript rows -> return the transcription with
    the new transcript id attached.

    Raises:
        HTTPException 400: unsupported format/language or invalid input.
        HTTPException 404: stored file not found by the STT service.
        HTTPException 500: transcription failure.
    """
    # Validate file type
    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")

    ext = file.filename.split(".")[-1].lower()
    if ext not in settings.supported_audio_formats_list:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported format: {ext}. Supported: {', '.join(settings.supported_audio_formats_list)}"
        )

    # Validate language
    if language not in settings.supported_languages_list:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language: {language}. Supported: {', '.join(settings.supported_languages_list)}"
        )

    try:
        # Read file content
        content = await file.read()

        # Save to storage
        storage_path, metadata = file_service.save_upload(
            file_content=content,
            original_filename=file.filename,
        )

        logger.info(f"Processing transcription for {file.filename} ({len(content)} bytes)")

        # Perform transcription (synchronous; blocks this request until done)
        result = stt_service.transcribe_file(
            audio_path=storage_path,
            language=language,
            enable_automatic_punctuation=enable_punctuation,
            enable_word_time_offsets=enable_word_timestamps,
            enable_speaker_diarization=enable_diarization,
            diarization_speaker_count=speaker_count,
            sample_rate=metadata.get("sample_rate"),
            prompt=prompt,  # Custom vocabulary
        )

        # Clean up temp file (optional - could keep for history)
        # file_service.delete_file(storage_path)

        # Save to database

        try:
            # 1. Create AudioFile record
            audio_file = AudioFile(
                storage_path=str(storage_path),
                original_filename=file.filename,
                duration=result.duration,
                format=ext,
                sample_rate=metadata.get("sample_rate"),
                language=language,
                detected_language=result.language,
                status="done"
            )
            db.add(audio_file)
            db.flush()  # get ID

            # 2. Create Transcript record
            transcript = Transcript(
                audio_file_id=audio_file.id,
                raw_text=result.text,
                processed_text=result.text,  # initially same
                segments=[s.model_dump() for s in result.segments] if result.segments else [],
                language=result.language,
                created_at=datetime.utcnow(),
            )
            db.add(transcript)
            db.commit()
            db.refresh(transcript)

            # Return result with ID
            response_data = result.model_dump()
            response_data["id"] = transcript.id

            # Explicitly validate to catch errors early
            try:
                return TranscriptionResponse(**response_data)
            except Exception as e:
                logger.error(f"Validation error for response: {e}")
                logger.error(f"Response data: {response_data}")
                raise HTTPException(status_code=500, detail=f"Response validation failed: {str(e)}")
            # return response - removed undefined variable

        except Exception as e:
            # NOTE(review): this also catches the HTTPException raised just
            # above, so a response-validation failure is answered with the
            # raw `result` instead of a 500 — confirm this is intended.
            logger.error(f"Failed to save to DB: {e}")
            # Don't fail the request if DB save fails, just return result
            # But in production we might want to ensure persistence
            return result

    except FileNotFoundError as e:
        logger.error(f"File error: {e}")
        raise HTTPException(status_code=404, detail=str(e))
    except ValueError as e:
        logger.error(f"Validation error: {e}")
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.exception(f"Transcription failed: {e}")
        raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
182
+
183
+
184
@router.post("/upload/quality")
async def transcribe_quality(
    file: UploadFile = File(..., description="Audio file to transcribe"),
    language: str = Form(default="en-US", description="Language code"),
    preprocess: bool = Form(default=False, description="Apply noise reduction (5-15% WER improvement)"),
    prompt: Optional[str] = Form(None, description="Custom vocabulary/keywords"),
):
    """
    High-quality transcription mode (optimized for accuracy).

    Features:
    - beam_size=5 for more accurate decoding (~40% fewer errors)
    - condition_on_previous_text=False to reduce hallucinations
    - Optional audio preprocessing for noisy environments

    Trade-off: ~2x slower than standard mode
    Best for: Important recordings, noisy audio, reduced error tolerance

    Raises:
        HTTPException 400: no filename provided.
        HTTPException 500: transcription failure.
    """
    from app.services.whisper_stt_service import get_whisper_stt_service
    import tempfile
    import os

    # Validate file
    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")

    try:
        content = await file.read()

        # Spool the upload to disk: the whisper service reads from a path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(content)
            temp_path = f.name

        try:
            stt_service = get_whisper_stt_service()
            result = stt_service.transcribe_quality(
                temp_path,
                language=language,
                preprocess=preprocess,
                prompt=prompt,
            )
            return result
        finally:
            # Best-effort cleanup; fix: OSError instead of a bare except so
            # KeyboardInterrupt/SystemExit are never swallowed here.
            try:
                os.unlink(temp_path)
            except OSError:
                pass

    except Exception as e:
        logger.exception(f"Quality transcription failed: {e}")
        raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
236
+
237
+
238
@router.post("/upload/batch")
async def transcribe_batch(
    files: List[UploadFile] = File(..., description="Multiple audio files to transcribe"),
    language: str = Form(default="en-US", description="Language code"),
    batch_size: int = Form(default=8, description="Batch size (8 optimal for CPU)"),
):
    """
    Batch transcription for high throughput.

    Uses BatchedInferencePipeline for 2-3x speedup on concurrent files.

    Best for: Processing multiple files, API with high concurrency

    Per-file failures are reported inline in `results` rather than failing
    the whole batch.

    Raises:
        HTTPException 400: no files provided.
    """
    from app.services.whisper_stt_service import get_whisper_stt_service
    import tempfile
    import os

    if not files:
        raise HTTPException(status_code=400, detail="No files provided")

    results = []
    stt_service = get_whisper_stt_service()

    for file in files:
        # Skip nameless multipart sections rather than failing the batch.
        if not file.filename:
            continue

        try:
            content = await file.read()

            # Spool each upload to disk: the whisper service reads from a path.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(content)
                temp_path = f.name

            try:
                result = stt_service.transcribe_batched(
                    temp_path,
                    language=language,
                    batch_size=batch_size,
                )
                result["filename"] = file.filename
                results.append(result)
            finally:
                # Best-effort cleanup; fix: OSError instead of a bare except
                # so KeyboardInterrupt/SystemExit are never swallowed here.
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass

        except Exception as e:
            logger.error(f"Failed to transcribe {file.filename}: {e}")
            results.append({
                "filename": file.filename,
                "error": str(e),
            })

    return {
        "count": len(results),
        "results": results,
        "mode": "batched",
        "batch_size": batch_size,
    }
299
+
300
+
301
@router.post("/async-upload", response_model=AsyncTranscriptionResponse)
async def transcribe_async_upload(
    file: UploadFile = File(..., description="Audio file to transcribe"),
    language: str = Form(default="en-US", description="Language code"),
    file_service: FileService = Depends(get_file_service),
    db: Session = Depends(get_db),
):
    """
    Asynchronously transcribe an uploaded audio file (Celery)

    Persists the upload, creates a 'queued' AudioFile row, then dispatches
    the `process_audio_file` Celery task. Poll /stt/tasks/{task_id} for
    progress.

    Raises:
        HTTPException 400: missing filename or unsupported format.
        HTTPException 500: storage/DB/queueing failure.
    """
    # Validate file type
    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")

    ext = file.filename.split(".")[-1].lower()
    if ext not in settings.supported_audio_formats_list:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported format: {ext}"
        )

    try:
        content = await file.read()
        storage_path, metadata = file_service.save_upload(
            file_content=content,
            original_filename=file.filename,
        )

        # Create AudioFile record with 'queued' status
        audio_file = AudioFile(
            storage_path=str(storage_path),
            original_filename=file.filename,
            duration=0.0,  # Will be updated by worker
            format=ext,
            sample_rate=metadata.get("sample_rate"),
            language=language,
            status="queued"
        )
        db.add(audio_file)
        # Commit before dispatching so the worker can see the row.
        db.commit()
        db.refresh(audio_file)

        # Trigger Celery Task
        task = process_audio_file.delay(audio_file.id)

        return AsyncTranscriptionResponse(
            task_id=task.id,
            audio_file_id=audio_file.id,
            status="queued",
            message="File uploaded and queued for processing"
        )

    except Exception as e:
        logger.exception(f"Async upload failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
356
+
357
+
358
@router.get("/tasks/{task_id}", response_model=TaskStatusResponse)
async def get_task_status(task_id: str, db: Session = Depends(get_db)):
    """
    Check status of an async transcription task
    """
    celery_result = AsyncResult(task_id)

    status_report = TaskStatusResponse(
        task_id=task_id,
        status=celery_result.status.lower(),
        created_at=datetime.utcnow(),  # Approximate or fetch from DB tracked tasks
        updated_at=datetime.utcnow()
    )

    if celery_result.successful():
        # The worker persists its output to the DB and returns None, so we
        # only report completion here; linking would require storing the
        # task_id on AudioFile/Transcript.
        status_report.status = "completed"
        status_report.progress = 100.0
    elif celery_result.failed():
        status_report.status = "failed"
        status_report.error = str(celery_result.result)
    elif celery_result.state == 'PROGRESS':
        # Progress payloads aren't emitted by the task yet; just flag it.
        status_report.status = "processing"

    return status_report
388
+
389
+
390
@router.post("/transcribe-bytes", response_model=TranscriptionResponse)
async def transcribe_bytes(
    audio_content: bytes,
    language: str = "en-US",
    encoding: str = "LINEAR16",
    sample_rate: int = 16000,
    stt_service: STTService = Depends(get_stt_service),
):
    """
    Transcribe raw audio bytes (for streaming/real-time use)

    This endpoint is primarily for internal use or advanced clients
    that send pre-processed audio data.
    """
    try:
        return stt_service.transcribe_bytes(
            audio_content=audio_content,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
        )
    except Exception as exc:
        logger.exception(f"Transcription failed: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
415
+
416
+
417
+ # TODO: WebSocket endpoint for real-time streaming
418
+ # @router.websocket("/stream")
419
+ # async def stream_transcription(websocket: WebSocket):
420
+ # """Real-time streaming transcription via WebSocket"""
421
+ # pass
422
+
423
@router.post("/upload/diarize")
async def diarize_audio(
    file: UploadFile = File(..., description="Audio file to diarize"),
    num_speakers: Optional[int] = Form(None, description="Exact number of speakers (optional)"),
    min_speakers: Optional[int] = Form(None, description="Minimum number of speakers (optional)"),
    max_speakers: Optional[int] = Form(None, description="Maximum number of speakers (optional)"),
    language: Optional[str] = Form(None, description="Language code (e.g., 'en'). Auto-detected if not provided."),
    preprocess: bool = Form(False, description="Apply noise reduction before processing (improves accuracy for noisy audio)"),
):
    """
    Perform Speaker Diarization ("Who said what").

    Uses faster-whisper for transcription + pyannote.audio for speaker identification.

    Requires:
    - HF_TOKEN in .env for Pyannote model access

    Returns:
    - segments: List of segments with timestamps, text, and speaker labels
    - speaker_stats: Speaking time per speaker
    - language: Detected/specified language

    Raises:
        HTTPException 400: missing filename or missing HF token.
        HTTPException 503: diarization dependencies not installed.
        HTTPException 500: any other processing failure.
    """
    from app.services.diarization_service import get_diarization_service
    import tempfile
    import os

    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")

    try:
        # Save temp file (service reads from a path)
        content = await file.read()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(content)
            temp_path = f.name

        try:
            service = get_diarization_service()
            result = service.process_audio(
                temp_path,
                num_speakers=num_speakers,
                min_speakers=min_speakers,
                max_speakers=max_speakers,
                language=language,
                preprocess=preprocess,
            )
            return result

        except ValueError as e:
            # Token missing
            raise HTTPException(status_code=400, detail=str(e))
        except ImportError as e:
            # Not installed
            raise HTTPException(status_code=503, detail=str(e))
        except Exception as e:
            logger.exception("Diarization error")
            raise HTTPException(status_code=500, detail=f"Diarization failed: {str(e)}")

        finally:
            # Best-effort cleanup; fix: OSError instead of a bare except so
            # KeyboardInterrupt/SystemExit are never swallowed here.
            try:
                os.unlink(temp_path)
            except OSError:
                pass

    except HTTPException:
        # Bug fix: previously the 400/503 mapped above fell through to the
        # generic handler below and were re-raised as 500s.
        raise
    except Exception as e:
        logger.error(f"Diarization request failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
backend/app/api/routes/transcripts.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Transcript Management Routes
3
+ CRUD operations and Export
4
+ """
5
+
6
+ from typing import List, Optional
7
+ from fastapi import APIRouter, Depends, HTTPException, Response, Query, UploadFile, File, Form
8
+ from sqlalchemy.orm import Session
9
+ from datetime import datetime
10
+
11
+ from ...models import get_db, Transcript, AudioFile
12
+ from ...schemas.transcript import TranscriptResponse, TranscriptUpdate
13
+ from ...services.nlp_service import get_nlp_service, NLPService
14
+ from ...services.export_service import ExportService
15
+
16
+
17
+ router = APIRouter(prefix="/transcripts", tags=["Transcripts"])
18
+
19
+
20
@router.get("", response_model=List[TranscriptResponse])
async def list_transcripts(
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db),
):
    """List all transcripts"""
    # Newest first, with simple offset/limit pagination.
    query = db.query(Transcript).order_by(Transcript.created_at.desc())
    return query.offset(skip).limit(limit).all()
29
+
30
+
31
@router.get("/{transcript_id}", response_model=TranscriptResponse)
async def get_transcript(
    transcript_id: int,
    db: Session = Depends(get_db),
):
    """Get specific transcript details"""
    record = db.query(Transcript).filter(Transcript.id == transcript_id).first()
    if record is None:
        raise HTTPException(status_code=404, detail="Transcript not found")
    return record
41
+
42
+
43
@router.post("/{transcript_id}/analyze")
async def analyze_transcript(
    transcript_id: int,
    db: Session = Depends(get_db),
    nlp_service: NLPService = Depends(get_nlp_service),
):
    """Run NLP analysis on a transcript.

    Computes sentiment, keywords and a summary from the transcript's
    processed text, stores them on the row, and returns the analysis.

    Raises:
        HTTPException 404: no transcript with this id.
        HTTPException 400: transcript has no text to analyze.
    """
    transcript = db.query(Transcript).filter(Transcript.id == transcript_id).first()
    if not transcript:
        raise HTTPException(status_code=404, detail="Transcript not found")

    if not transcript.processed_text:
        raise HTTPException(status_code=400, detail="Transcript has no text content")

    # Run analysis (synchronous; duration depends on text length)
    analysis = nlp_service.process_transcript(transcript.processed_text)

    # Update DB with the derived fields
    transcript.sentiment = analysis["sentiment"]
    transcript.topics = {"keywords": analysis["keywords"]}
    transcript.summary = analysis["summary"]
    transcript.updated_at = datetime.utcnow()

    db.commit()
    db.refresh(transcript)

    return {
        "status": "success",
        "analysis": analysis
    }
73
+
74
+
75
@router.get("/{transcript_id}/export")
async def export_transcript(
    transcript_id: int,
    format: str = Query(..., regex="^(txt|srt|vtt|pdf)$"),
    db: Session = Depends(get_db),
):
    """
    Export transcript to specific format
    """
    transcript = db.query(Transcript).filter(Transcript.id == transcript_id).first()
    if transcript is None:
        raise HTTPException(status_code=404, detail="Transcript not found")

    # Flatten the ORM row into the plain dict the export service expects.
    payload = {
        "id": transcript.id,
        "text": transcript.processed_text,
        "created_at": str(transcript.created_at),
        "duration": 0,
        "segments": transcript.segments,
        "words": [],
        "sentiment": transcript.sentiment,
    }

    # Dispatch table: exporter callable + response media type per format.
    exporters = {
        "txt": (ExportService.to_txt, "text/plain"),
        "srt": (ExportService.to_srt, "text/plain"),
        "vtt": (ExportService.to_vtt, "text/vtt"),
        "pdf": (ExportService.to_pdf, "application/pdf"),
    }
    if format not in exporters:
        raise HTTPException(status_code=400, detail="Unsupported format")
    exporter, media_type = exporters[format]

    return Response(
        content=exporter(payload),
        media_type=media_type,
        headers={
            "Content-Disposition": f'attachment; filename="transcript_{transcript_id}.{format}"'
        },
    )
121
@router.post("/meeting")
async def process_meeting(
    file: UploadFile = File(..., description="Audio recording of meeting"),
    num_speakers: Optional[int] = Form(None, description="Number of speakers (hint)"),
    language: Optional[str] = Form(None, description="Language code"),
    db: Session = Depends(get_db),
):
    """
    Process a meeting recording:
    1. Diarization (Who spoke when)
    2. Transcription (What was said)
    3. NLP Analysis (Summary, Action Items, Sentiment)
    4. Save to DB

    Runs synchronously in the request; long recordings can take minutes.

    Raises:
        HTTPException 500: any pipeline or persistence failure.
    """
    import shutil
    import os
    import tempfile
    from ...services.meeting_service import get_meeting_service

    # Save upload to temp file (streamed copy, so large files don't sit in memory)
    suffix = os.path.splitext(file.filename)[1] or ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name

    try:
        meeting_service = get_meeting_service()

        # Run full pipeline
        # This can be slow (minutes) so strictly speaking should be a background task
        # But for this MVP level we'll do it synchronously with a long timeout
        result = meeting_service.process_meeting(
            audio_path=tmp_path,
            num_speakers=num_speakers,
            language=language
        )

        # Save to DB
        # Create AudioFile record first
        # NOTE(review): these keyword args (filename/filepath/file_size) differ
        # from the storage_path/original_filename fields used by the STT
        # routes' AudioFile rows — confirm against the AudioFile model.
        audio_file = AudioFile(
            filename=file.filename,
            filepath="processed_in_memory",  # We delete temp file, so no perm path
            duration=result["metadata"]["duration_seconds"],
            file_size=0,
            format=suffix.replace(".", "")
        )
        db.add(audio_file)
        db.commit()
        db.refresh(audio_file)

        # Create Transcript record
        transcript = Transcript(
            audio_file_id=audio_file.id,
            raw_text=result["raw_text"],
            processed_text=result["raw_text"],
            segments=result["transcript_segments"],
            sentiment=result["sentiment"],
            topics={"keywords": result["topics"]},
            action_items=result["action_items"],
            attendees=result["metadata"]["attendees"],
            summary=result["summary"],
            language=result["metadata"]["language"],
            confidence=0.95,  # Estimated
            duration=result["metadata"]["duration_seconds"],
            created_at=datetime.utcnow()
        )
        db.add(transcript)
        db.commit()
        db.refresh(transcript)

        return result

    except Exception as e:
        # NOTE(review): a broad catch — any HTTPException raised inside would
        # also be re-wrapped as a 500 here.
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Cleanup
        try:
            os.unlink(tmp_path)
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt;
            # consider narrowing to OSError.
            pass
backend/app/api/routes/translation.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Translation API Routes
3
+ Endpoints for text and audio translation services
4
+ """
5
+
6
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form
7
+ from pydantic import BaseModel, Field
8
+ from typing import Optional, List
9
+ import logging
10
+
11
+ from app.services.translation_service import get_translation_service
12
+
13
+ logger = logging.getLogger(__name__)
14
+ router = APIRouter(prefix="/translation", tags=["translation"])
15
+
16
+
17
+ # Request/Response Models
18
class TranslateTextRequest(BaseModel):
    """Request model for text translation."""
    # Bounded to 5000 chars to keep one MarianMT inference affordable.
    text: str = Field(..., min_length=1, max_length=5000, description="Text to translate")
    source_lang: str = Field(..., description="Source language code (e.g., 'hi', 'en-US')")
    target_lang: str = Field(..., description="Target language code (e.g., 'en', 'es')")
    # When True, language pairs without a direct model are routed source -> en -> target.
    use_pivot: bool = Field(default=True, description="Use English as pivot for unsupported pairs")
24
+
25
+
26
class TranslateTextResponse(BaseModel):
    """Response model for text translation."""
    translated_text: str
    source_lang: str
    target_lang: str
    source_text: str          # echo of the input text
    processing_time: float    # presumably seconds — TODO confirm against translation_service
    word_count: int
    # Pivot metadata: only meaningful when English pivoting was actually used.
    pivot_used: Optional[bool] = False
    intermediate_text: Optional[str] = None   # English intermediate text when pivoting
    model_used: Optional[str] = None          # identifier of the MarianMT model applied
37
+
38
+
39
class LanguageInfo(BaseModel):
    """Language information model (UI-facing metadata; same shape as
    LANGUAGE_METADATA entries in core.config)."""
    code: str     # language/locale code, e.g. "en-US"
    name: str     # English display name
    flag: str     # flag emoji
    native: str   # the language's name for itself, e.g. "Español"
45
+
46
+
47
class TranslationPair(BaseModel):
    """Translation pair model: one supported source -> target direction."""
    # Pair identifier — presumably "<source>-<target>"; confirm against translation_service
    code: str
    source: LanguageInfo
    target: LanguageInfo
52
+
53
+
54
class DetectLanguageResponse(BaseModel):
    """Response model for language detection."""
    detected_language: str    # language code reported by the detector
    confidence: float         # detector confidence — assumed 0.0-1.0; TODO confirm
    language_info: Optional[dict] = None             # display metadata when known
    all_probabilities: Optional[List[dict]] = None   # per-language candidate scores
60
+
61
+
62
+ # Endpoints
63
@router.get("/languages", response_model=List[LanguageInfo])
async def get_supported_languages():
    """Return every language the translation service supports.

    Returns:
        List of supported languages with display metadata.
    """
    return get_translation_service().get_supported_languages()
73
+
74
+
75
@router.get("/pairs")
async def get_supported_pairs():
    """
    Get list of all supported translation pairs.

    Returns:
        Dict with the list of supported source->target pairs and their count.
    """
    service = get_translation_service()
    # Fetch the pair list once instead of calling the service twice
    # (once for the payload and again just to compute the length).
    pairs = service.get_supported_pairs()
    return {
        "pairs": pairs,
        "total": len(pairs),
    }
88
+
89
+
90
@router.post("/text", response_model=TranslateTextResponse)
async def translate_text(request: TranslateTextRequest):
    """
    Translate text from source to target language.

    - Uses Helsinki-NLP MarianMT models (~300MB per language pair)
    - Supports pivot translation through English for unsupported pairs
    - First request for a language pair may take longer (model loading)

    Args:
        request: Translation request with text and language codes

    Returns:
        Translated text with metadata
    """
    service = get_translation_service()

    # Both service entry points share the same keyword signature, so pick
    # the pivot-capable or direct path up front.
    translate = service.translate_with_pivot if request.use_pivot else service.translate_text

    try:
        result = translate(
            text=request.text,
            source_lang=request.source_lang,
            target_lang=request.target_lang,
        )
        return TranslateTextResponse(**result)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"Translation error: {e}")
        raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
128
+
129
+
130
@router.post("/detect", response_model=DetectLanguageResponse)
async def detect_language(text: str = Form(..., min_length=10, description="Text to analyze")):
    """
    Detect the language of input text.

    Args:
        text: Text to analyze (minimum 10 characters for accuracy)

    Returns:
        Detected language with confidence score
    """
    detection = get_translation_service().detect_language(text)

    # The service reports failures in-band via an "error" key rather than raising.
    error = detection.get("error")
    if error:
        raise HTTPException(status_code=400, detail=error)

    return DetectLanguageResponse(**detection)
148
+
149
+
150
@router.get("/model-info")
async def get_model_info():
    """
    Get information about loaded translation models.

    Returns:
        Model loading status and supported pairs
    """
    return get_translation_service().get_model_info()
160
+
161
+
162
@router.post("/audio")
async def translate_audio(
    file: UploadFile = File(..., description="Audio file to translate"),
    source_lang: str = Form(..., description="Source language code"),
    target_lang: str = Form(..., description="Target language code"),
    generate_audio: bool = Form(default=True, description="Generate TTS output"),
):
    """
    Full audio translation pipeline: STT → Translate → TTS

    1. Transcribe audio using Whisper
    2. Translate text using MarianMT
    3. Optionally generate speech in target language

    Args:
        file: Audio file (WAV, MP3, etc.)
        source_lang: Source language code
        target_lang: Target language code
        generate_audio: Whether to generate TTS output

    Returns:
        Transcription, translation, and optional base64 MP3 audio response

    Raises:
        HTTPException: 400 when no speech is detected, 500 on pipeline failure.
    """
    import base64
    import os
    import tempfile
    from app.services.whisper_stt_service import get_whisper_stt_service
    from app.services.edge_tts_service import get_edge_tts_service

    translation_service = get_translation_service()
    stt_service = get_whisper_stt_service()
    tts_service = get_edge_tts_service()

    # Persist the upload to a temp file so Whisper can read it from disk.
    suffix = os.path.splitext(file.filename)[1] or ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name

    try:
        # Step 1: Transcribe
        transcription = stt_service.transcribe_file(tmp_path, language=source_lang)
        source_text = transcription["text"]

        if not source_text.strip():
            raise HTTPException(status_code=400, detail="No speech detected in audio")

        # Step 2: Translate (pivots through English when no direct model exists)
        translation = translation_service.translate_with_pivot(
            text=source_text,
            source_lang=source_lang,
            target_lang=target_lang,
        )
        translated_text = translation["translated_text"]

        # Step 3: Generate TTS (optional)
        audio_base64 = None
        if generate_audio:
            # Map base language code to a default Edge TTS neural voice.
            voice_map = {
                "en": "en-US-AriaNeural",
                "hi": "hi-IN-SwaraNeural",
                "es": "es-ES-ElviraNeural",
                "fr": "fr-FR-DeniseNeural",
                "de": "de-DE-KatjaNeural",
                "zh": "zh-CN-XiaoxiaoNeural",
                "ja": "ja-JP-NanamiNeural",
                "ko": "ko-KR-SunHiNeural",
                "ar": "ar-SA-ZariyahNeural",
                "ru": "ru-RU-SvetlanaNeural",
            }
            # "en-US" -> "en": only the base language selects the voice.
            target_code = target_lang.split("-")[0].lower()
            voice = voice_map.get(target_code, "en-US-AriaNeural")

            audio_bytes = tts_service.synthesize_sync(translated_text, voice=voice)
            audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")

        return {
            "source_text": source_text,
            "translated_text": translated_text,
            "source_lang": source_lang,
            "target_lang": target_lang,
            "transcription_time": transcription["processing_time"],
            "translation_time": translation["processing_time"],
            "audio_base64": audio_base64,
            "audio_format": "mp3" if audio_base64 else None,
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Audio translation failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Best-effort temp-file cleanup.
        # BUG FIX: was a bare `except:` which also swallows KeyboardInterrupt
        # and SystemExit; only filesystem errors should be ignored here.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
backend/app/api/routes/tts.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text-to-Speech API Router
3
+ """
4
+
5
+ import base64
6
+ import logging
7
+ from typing import Optional
8
+ from fastapi import APIRouter, HTTPException, Depends, Response, Request
9
+ from fastapi.responses import StreamingResponse
10
+ from io import BytesIO
11
+
12
+ from ...core.limiter import limiter
13
+
14
+ from ...services.tts_service import get_tts_service, TTSService
15
+ from ...schemas.tts import (
16
+ SynthesisRequest,
17
+ SynthesisResponse,
18
+ VoiceInfo,
19
+ VoiceListResponse,
20
+ VoicePreviewRequest,
21
+ )
22
+ from ...core.config import get_settings
23
+
24
+ logger = logging.getLogger(__name__)
25
+ router = APIRouter(prefix="/tts", tags=["Text-to-Speech"])
26
+ settings = get_settings()
27
+
28
+
29
@router.get("/voices", response_model=VoiceListResponse)
async def get_voices(
    language: Optional[str] = None,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Get list of available TTS voices

    Optionally filter by language code (e.g., "en-US", "es", "fr")
    """
    voice_list = await tts_service.get_voices(language_code=language)
    return voice_list
40
+
41
+
42
@router.get("/voices/{language}", response_model=VoiceListResponse)
async def get_voices_by_language(
    language: str,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Get voices for a specific language

    Accepts an exact locale code ("en-US") or a bare prefix ("en") that
    matches at least one supported locale.
    """
    supported = settings.supported_languages_list
    is_exact = language in supported
    # A bare prefix like "en" is accepted when any locale (en-US, en-GB, ...)
    # starts with it.
    has_prefix_match = any(code.startswith(language) for code in supported)
    if not is_exact and not has_prefix_match:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language: {language}"
        )

    return await tts_service.get_voices(language_code=language)
60
+
61
+
62
@router.post("/synthesize", response_model=SynthesisResponse)
@limiter.limit("10/minute")
async def synthesize_speech(
    request: Request,
    request_body: SynthesisRequest,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Synthesize text to speech

    Returns base64-encoded audio content along with metadata.
    Decode the audio_content field to get the audio bytes.
    """
    # Reject oversized payloads before touching the TTS engine.
    if len(request_body.text) > 5000:
        raise HTTPException(
            status_code=400,
            detail="Text too long. Maximum 5000 characters."
        )

    # Validate only the base language ("en" from "en-US") against the
    # base languages of the configured locale list.
    lang_base = request_body.language.split("-")[0]
    supported_bases = {code.split("-")[0] for code in settings.supported_languages_list}
    if lang_base not in supported_bases:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language: {request_body.language}"
        )

    try:
        return await tts_service.synthesize(request_body)
    except ValueError as e:
        logger.error(f"Synthesis validation error: {e}")
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.exception(f"Synthesis failed: {e}")
        raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
100
+
101
+
102
@router.post("/stream")
async def stream_speech(
    request: SynthesisRequest,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Stream text-to-speech audio

    Returns a chunked audio stream (audio/mpeg) for immediate playback.
    Best for long text to reduce latency (TTFB).
    """
    try:
        audio_iterator = tts_service.synthesize_stream(request)
        return StreamingResponse(audio_iterator, media_type="audio/mpeg")
    except Exception as e:
        logger.exception(f"Streaming synthesis failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
121
+
122
+
123
@router.post("/ssml")
async def synthesize_ssml(
    text: str,
    voice: str = "en-US-AriaNeural",
    rate: str = "medium",
    pitch: str = "medium",
    emphasis: Optional[str] = None,
    auto_breaks: bool = True,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Synthesize speech with SSML prosody control

    Supports advanced speech customization:
    - rate: 'x-slow', 'slow', 'medium', 'fast', 'x-fast'
    - pitch: 'x-low', 'low', 'medium', 'high', 'x-high'
    - emphasis: 'reduced', 'moderate', 'strong'
    - auto_breaks: Add natural pauses at punctuation

    Returns audio/mpeg stream.
    """
    try:
        from ...services.edge_tts_service import get_edge_tts_service

        edge_service = get_edge_tts_service()

        # Wrap the plain text in an SSML document carrying the prosody options.
        markup = edge_service.build_ssml(
            text=text,
            voice=voice,
            rate=rate,
            pitch=pitch,
            emphasis=emphasis,
            breaks=auto_breaks
        )

        # Hand the finished SSML to Edge TTS for synthesis.
        mp3_bytes = await edge_service.synthesize_ssml(markup, voice)

        return Response(
            content=mp3_bytes,
            media_type="audio/mpeg",
            headers={"Content-Disposition": "inline; filename=speech.mp3"}
        )
    except Exception as e:
        logger.exception(f"SSML synthesis failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
169
+
170
+
171
@router.post("/synthesize/audio")
async def synthesize_audio_file(
    request: SynthesisRequest,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Synthesize text and return audio file directly

    Returns the audio file as a downloadable stream.
    """
    # Maps the service's encoding labels onto response MIME types.
    encoding_to_mime = {
        "MP3": "audio/mpeg",
        "LINEAR16": "audio/wav",
        "OGG_OPUS": "audio/ogg",
    }

    try:
        result = await tts_service.synthesize(request)

        # The service returns base64 text; decode back to raw audio bytes.
        raw_audio = base64.b64decode(result.audio_content)
        mime_type = encoding_to_mime.get(result.encoding, "audio/mpeg")

        return StreamingResponse(
            BytesIO(raw_audio),
            media_type=mime_type,
            headers={
                "Content-Disposition": f'attachment; filename="speech.{result.encoding.lower()}"',
                "Content-Length": str(result.audio_size),
            }
        )
    except Exception as e:
        logger.exception(f"Audio synthesis failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
207
+
208
+
209
@router.post("/preview")
async def preview_voice(
    request: VoicePreviewRequest,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Generate a short preview of a voice

    Returns a small audio sample (audio/mpeg) for voice selection UI.

    Raises:
        HTTPException: 404 when the voice name is unknown, 500 on synthesis failure.
    """
    # Find the voice to get its language.
    # BUG FIX: get_voices() is a coroutine (it is awaited everywhere else in
    # this router); without `await` this raised AttributeError on `.voices`.
    voice_list = await tts_service.get_voices()
    voice_info = next((v for v in voice_list.voices if v.name == request.voice), None)

    if not voice_info:
        raise HTTPException(status_code=404, detail=f"Voice not found: {request.voice}")

    # Create synthesis request with preview text
    synth_request = SynthesisRequest(
        text=request.text or "Hello! This is a preview of my voice.",
        language=voice_info.language_code,
        voice=request.voice,
        audio_encoding="MP3",
    )

    try:
        # BUG FIX: synthesize() is async as well and must be awaited.
        result = await tts_service.synthesize(synth_request)

        # Return audio directly
        audio_bytes = base64.b64decode(result.audio_content)
        return StreamingResponse(
            BytesIO(audio_bytes),
            media_type="audio/mpeg",
        )
    except Exception as e:
        logger.exception(f"Preview failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
backend/app/api/routes/ws.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ WebSocket Router for Real-Time Transcription
3
+ """
4
+
5
+ import logging
6
+ import json
7
+ from typing import Dict, Optional
8
+ from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Query
9
+
10
+ from app.core.ws_security import (
11
+ validate_ws_origin,
12
+ authenticate_websocket,
13
+ ws_rate_limiter
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+ router = APIRouter(prefix="/ws", tags=["WebSocket"])
18
+
19
+
20
class ConnectionManager:
    """Tracks live WebSocket connections and the user each one belongs to."""

    def __init__(self):
        # client_id -> accepted socket
        self.active_connections: Dict[str, WebSocket] = {}
        self.user_ids: Dict[str, Optional[int]] = {}  # Track authenticated users

    async def connect(
        self,
        client_id: str,
        websocket: WebSocket,
        user_id: Optional[int] = None
    ) -> bool:
        """
        Connect a client after validation.
        Returns True if connection accepted, False if rejected.
        """
        # Reject connections from disallowed origins before accepting.
        if not validate_ws_origin(websocket):
            logger.warning(f"WebSocket rejected for {client_id}: invalid origin")
            await websocket.close(code=1008)  # Policy Violation
            return False

        await websocket.accept()
        self.active_connections[client_id] = websocket
        self.user_ids[client_id] = user_id
        logger.info(f"Client {client_id} connected (user_id={user_id})")
        return True

    def disconnect(self, client_id: str):
        """Forget a client and release its rate-limiter state."""
        self.active_connections.pop(client_id, None)
        self.user_ids.pop(client_id, None)
        ws_rate_limiter.cleanup(client_id)
        logger.info(f"Client {client_id} disconnected")

    async def send_json(self, client_id: str, data: dict):
        """Send JSON to a client if it is still connected; no-op otherwise."""
        connection = self.active_connections.get(client_id)
        if connection is not None:
            await connection.send_json(data)
59
+
60
+
61
# Module-level singleton: one connection registry shared by both WebSocket
# endpoints in this router (per-process only; not shared across workers).
manager = ConnectionManager()
62
+
63
+
64
@router.websocket("/transcription/{client_id}")
async def websocket_transcription(
    websocket: WebSocket,
    client_id: str,
    token: Optional[str] = Query(None)
):
    """
    Real-time streaming transcription via WebSocket with VAD.

    Optional auth via query param: ws://host/ws/transcription/{id}?token=jwt_token
    """
    # Authenticate (optional for demo, but logged)
    user_id = await authenticate_websocket(websocket, token)

    if not await manager.connect(client_id, websocket, user_id):
        return  # Connection rejected

    from app.services.ws_stt_service import StreamManager, transcribe_buffer

    stream_manager = StreamManager(websocket)

    async def handle_transcription(audio_bytes: bytes):
        """Callback for processing speech segments."""
        # Check rate limit
        if not ws_rate_limiter.check_rate(client_id):
            await manager.send_json(client_id, {"error": "Rate limit exceeded"})
            return

        # Check message size
        if not ws_rate_limiter.check_size(audio_bytes):
            await manager.send_json(client_id, {"error": "Message too large"})
            return

        try:
            # Send processing status
            await manager.send_json(client_id, {"status": "processing"})

            # Transcribe
            result = await transcribe_buffer(audio_bytes)
            text = result.get("text", "").strip()

            if text:
                # Send result
                await manager.send_json(client_id, {
                    "text": text,
                    "is_final": True,
                    "status": "complete"
                })
                logger.info(f"Transcribed: {text}")
        except Exception as e:
            logger.error(f"Transcription callback error: {e}")
            await manager.send_json(client_id, {"error": str(e)})

    try:
        # Start processing loop
        await stream_manager.process_stream(handle_transcription)

    except WebSocketDisconnect:
        manager.disconnect(client_id)
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
        try:
            await manager.send_json(client_id, {"error": str(e)})
        # BUG FIX: was a bare `except:`, which also swallows
        # asyncio.CancelledError and KeyboardInterrupt during shutdown; only
        # send failures on an already-broken socket should be ignored.
        except Exception:
            pass
        manager.disconnect(client_id)
130
+
131
+
132
@router.websocket("/tts/{client_id}")
async def websocket_tts(
    websocket: WebSocket,
    client_id: str,
    token: Optional[str] = Query(None)
):
    """
    Real-time Text-to-Speech via WebSocket

    Protocol:
    - Client sends: JSON {"text": "...", "voice": "...", "rate": "...", "pitch": "..."}
    - Server sends: Binary audio chunks (MP3) followed by JSON {"status": "complete"}

    Optional auth via query param: ws://host/ws/tts/{id}?token=jwt_token
    This achieves <500ms TTFB by streaming as chunks are generated.
    """
    # Authenticate (optional for demo, but logged)
    user_id = await authenticate_websocket(websocket, token)

    if not await manager.connect(client_id, websocket, user_id):
        return  # Connection rejected

    try:
        # Local imports keep heavy/optional deps out of module import time;
        # `time` hoisted out of the per-message loop (was re-imported each message).
        import edge_tts
        import time

        while True:
            # Receive synthesis request
            data = await websocket.receive_json()

            text = data.get("text", "")
            voice = data.get("voice", "en-US-AriaNeural")
            rate = data.get("rate", "+0%")
            pitch = data.get("pitch", "+0Hz")

            if not text:
                await websocket.send_json({"error": "No text provided"})
                continue

            logger.info(f"WebSocket TTS: Synthesizing '{text[:50]}...' with {voice}")

            # Stream audio chunks to the client as soon as edge-tts yields them.
            start_time = time.time()
            first_chunk_sent = False
            ttfb = None  # time-to-first-byte (ms), measured at the first audio chunk
            total_bytes = 0

            communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)

            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    await websocket.send_bytes(chunk["data"])
                    total_bytes += len(chunk["data"])

                    if not first_chunk_sent:
                        ttfb = (time.time() - start_time) * 1000
                        logger.info(f"WebSocket TTS TTFB: {ttfb:.0f}ms")
                        first_chunk_sent = True

            # Send completion marker
            total_time = time.time() - start_time
            await websocket.send_json({
                "status": "complete",
                "total_bytes": total_bytes,
                "total_time_ms": round(total_time * 1000),
                "ttfb_ms": round(ttfb) if first_chunk_sent else None
            })

    except WebSocketDisconnect:
        manager.disconnect(client_id)
    except Exception as e:
        logger.error(f"WebSocket TTS error: {e}")
        try:
            await websocket.send_json({"error": str(e)})
        # BUG FIX: was a bare `except:`, which also swallows
        # asyncio.CancelledError and KeyboardInterrupt; only send failures on
        # a dead socket should be ignored here.
        except Exception:
            pass
        manager.disconnect(client_id)
208
+
backend/app/core/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge Core Package
3
+ """
4
+
5
+ from .config import get_settings, Settings, LANGUAGE_METADATA
6
+
7
+ __all__ = ["get_settings", "Settings", "LANGUAGE_METADATA"]
backend/app/core/config.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge Configuration
3
+ Pydantic Settings for application configuration
4
+ """
5
+
6
+ from functools import lru_cache
7
+ from typing import List
8
+ from pydantic_settings import BaseSettings, SettingsConfigDict
9
+ from pydantic import Field
10
+
11
+
12
class Settings(BaseSettings):
    """Application settings loaded from environment variables.

    Defaults are development-friendly (SQLite, local Redis); production
    deployments override them via environment variables or the .env file.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="allow",  # Allow extra env vars without error
    )

    # Application
    app_name: str = "VoiceForge"
    app_version: str = "1.0.0"
    debug: bool = False

    # API Server
    api_host: str = "0.0.0.0"
    api_port: int = 8000

    # Database
    database_url: str = Field(
        default="sqlite:///./voiceforge.db",
        description="Database connection URL (SQLite for dev, PostgreSQL for prod)"
    )

    # Redis
    redis_url: str = Field(
        default="redis://localhost:6379/0",
        description="Redis connection URL for caching and Celery"
    )

    # Google Cloud
    google_application_credentials: str = Field(
        default="./credentials/google-cloud-key.json",
        description="Path to Google Cloud service account JSON key"
    )

    # AI Services Configuration
    use_local_services: bool = Field(
        default=True,
        description="Use local free services (Whisper + EdgeTTS) instead of Google Cloud"
    )
    whisper_model: str = Field(
        default="small",
        description="Whisper model size (tiny, base, small, medium, large-v3)"
    )

    # Security
    # NOTE(review): insecure placeholder default — must be overridden via env in production.
    secret_key: str = Field(
        default="your-super-secret-key-change-in-production",
        description="Secret key for JWT encoding"
    )
    access_token_expire_minutes: int = 30
    algorithm: str = "HS256"  # JWT signing algorithm
    hf_token: str | None = Field(default=None, description="Hugging Face Token for Diarization")

    # File Storage
    upload_dir: str = "./uploads"
    max_audio_duration_seconds: int = 600  # 10 minutes
    max_upload_size_mb: int = 50

    # Supported Languages (comma-separated; parsed by supported_languages_list)
    supported_languages: str = "en-US,en-GB,es-ES,es-MX,fr-FR,de-DE,ja-JP,ko-KR,zh-CN,hi-IN"

    # Audio Formats (comma-separated; parsed by supported_audio_formats_list)
    supported_audio_formats: str = "wav,mp3,m4a,flac,ogg,webm"

    @property
    def supported_languages_list(self) -> List[str]:
        """Get supported languages as a list"""
        return [lang.strip() for lang in self.supported_languages.split(",")]

    @property
    def supported_audio_formats_list(self) -> List[str]:
        """Get supported audio formats as a list"""
        return [fmt.strip() for fmt in self.supported_audio_formats.split(",")]
88
+
89
+
90
# Language metadata for UI display.
# Keys match the locale codes in Settings.supported_languages; each value
# carries the English display name, a flag emoji, and the language's own name.
LANGUAGE_METADATA = {
    "en-US": {"name": "English (US)", "flag": "🇺🇸", "native": "English"},
    "en-GB": {"name": "English (UK)", "flag": "🇬🇧", "native": "English"},
    "es-ES": {"name": "Spanish (Spain)", "flag": "🇪🇸", "native": "Español"},
    "es-MX": {"name": "Spanish (Mexico)", "flag": "🇲🇽", "native": "Español"},
    "fr-FR": {"name": "French", "flag": "🇫🇷", "native": "Français"},
    "de-DE": {"name": "German", "flag": "🇩🇪", "native": "Deutsch"},
    "ja-JP": {"name": "Japanese", "flag": "🇯🇵", "native": "日本語"},
    "ko-KR": {"name": "Korean", "flag": "🇰🇷", "native": "한국어"},
    "zh-CN": {"name": "Chinese (Mandarin)", "flag": "🇨🇳", "native": "中文"},
    "hi-IN": {"name": "Hindi", "flag": "🇮🇳", "native": "हिन्दी"},
}
103
+
104
+
105
@lru_cache
def get_settings() -> Settings:
    """Get cached settings instance.

    lru_cache ensures Settings() (and its .env parsing) runs only once per
    process; all callers share the same object.
    """
    return Settings()
backend/app/core/limiter.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from slowapi import Limiter
from slowapi.util import get_remote_address
# NOTE(review): RateLimitExceeded is unused in this module — presumably
# imported so other modules can import it from here; confirm before removing.
from slowapi.errors import RateLimitExceeded

# Initialize Limiter
# Use in-memory storage for local dev (Redis for production)
redis_url = os.getenv("REDIS_URL")

# For local testing without Redis, use memory storage.
# The probe below connects and pings once at import time; on any failure
# (missing package, unreachable server) we silently fall back to memory.
if redis_url and redis_url.strip():
    try:
        import redis
        r = redis.from_url(redis_url)
        r.ping()  # Test connection
        storage_uri = redis_url
    except Exception:
        # Redis not available, fall back to memory
        storage_uri = "memory://"
else:
    storage_uri = "memory://"

# Shared limiter instance; routers apply per-endpoint limits via @limiter.limit(...).
limiter = Limiter(
    key_func=get_remote_address,
    storage_uri=storage_uri,
    default_limits=["60/minute"]  # Global limit: 60 req/min per IP
)
backend/app/core/middleware.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Rate Limiting Middleware
3
+ Uses Redis to track and limit request rates per IP address.
4
+ Pure ASGI implementation to avoid BaseHTTPMiddleware issues.
5
+ """
6
+
7
+ import time
8
+ import redis
9
+ from starlette.responses import JSONResponse
10
+ from starlette.types import ASGIApp, Scope, Receive, Send
11
+ from ..core.config import get_settings
12
+
13
+ settings = get_settings()
14
+
15
class RateLimitMiddleware:
    """Pure-ASGI fixed-window rate limiter backed by Redis.

    Limits /api/* HTTP requests to 60 per minute per client IP and fails
    open (no limiting) whenever Redis is unavailable.
    """

    def __init__(self, app: ASGIApp):
        self.app = app
        # Hardcoded or from settings (bypassing constructor arg issue)
        self.requests_per_minute = 60
        self.window_size = 60  # seconds

        # Connect to Redis
        try:
            self.redis_client = redis.from_url(settings.redis_url)
        except Exception as e:
            print(f"⚠️ Rate limiter disabled: Could not connect to Redis ({e})")
            self.redis_client = None

    def _applies_to(self, scope: Scope) -> bool:
        """True only for HTTP requests under /api/ while Redis is available."""
        if scope["type"] != "http":
            return False
        if self.redis_client is None:
            return False
        return scope.get("path", "").startswith("/api/")

    def _limit_response(self, scope: Scope):
        """Return a 429 JSONResponse when the client is over its window, else None."""
        client = scope.get("client")
        client_ip = client[0] if client else "unknown"
        key = f"rate_limit:{client_ip}"

        try:
            # Simple fixed window counter
            current_count = self.redis_client.incr(key)

            # Set expiry on first request
            if current_count == 1:
                self.redis_client.expire(key, self.window_size)

            if current_count > self.requests_per_minute:
                return JSONResponse(
                    status_code=429,
                    content={
                        "detail": "Too many requests",
                        "retry_after": self.window_size
                    },
                    headers={"Retry-After": str(self.window_size)}
                )
        except redis.RedisError:
            # Fail open if Redis has issues during request
            pass
        return None

    async def __call__(self, scope: Scope, receive: Receive, send: Send):
        if not self._applies_to(scope):
            await self.app(scope, receive, send)
            return

        rejection = self._limit_response(scope)
        if rejection is not None:
            await rejection(scope, receive, send)
            return

        await self.app(scope, receive, send)
backend/app/core/request_size_middleware.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Request Size Limiting Middleware
3
+ Prevents large request bodies from consuming excessive memory.
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ from starlette.middleware.base import BaseHTTPMiddleware
9
+ from starlette.requests import Request
10
+ from starlette.responses import Response
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Default max body size: 50MB (configurable via env)
15
+ DEFAULT_MAX_BODY_SIZE = 50 * 1024 * 1024 # 50 MB
16
+
17
+
18
class RequestSizeLimitMiddleware(BaseHTTPMiddleware):
    """
    ASGI Middleware to limit request body size.

    Checks the Content-Length header and rejects requests that exceed the
    limit with a 413 Payload Too Large response, before the body is read.

    Note: requests using chunked transfer encoding carry no Content-Length,
    so nothing is enforced here for them — pair this with
    StreamingSizeValidator when reading uploads.
    """

    def __init__(self, app, max_body_size: "int | None" = None):
        super().__init__(app)
        # Explicit argument wins; otherwise MAX_REQUEST_BODY_SIZE env var,
        # then the 50 MB module default.
        self.max_body_size = max_body_size or int(
            os.getenv("MAX_REQUEST_BODY_SIZE", DEFAULT_MAX_BODY_SIZE)
        )
        logger.info(f"Request size limit: {self.max_body_size / 1024 / 1024:.1f} MB")

    async def dispatch(self, request: Request, call_next):
        """Reject requests whose declared Content-Length exceeds the limit."""
        # Skip for WebSocket upgrades
        if request.headers.get("upgrade", "").lower() == "websocket":
            return await call_next(request)

        # Check Content-Length header (absent for chunked bodies)
        content_length = request.headers.get("content-length")

        if content_length:
            try:
                size = int(content_length)
                if size > self.max_body_size:
                    logger.warning(
                        f"Request too large: {size / 1024 / 1024:.1f} MB "
                        f"(limit: {self.max_body_size / 1024 / 1024:.1f} MB) "
                        f"from {request.client.host if request.client else 'unknown'}"
                    )
                    return Response(
                        content="Request body too large",
                        status_code=413,
                        media_type="text/plain"
                    )
            except ValueError:
                pass  # Invalid Content-Length, let the server handle it

        return await call_next(request)
62
+
63
+
64
class StreamingSizeValidator:
    """
    Running byte counter for streamed uploads with a hard cap.

    Feed it the length of each chunk as you read; it raises ValueError the
    moment the running total passes ``max_size``. Intended companion to
    SpooledTemporaryFile for memory-efficient large uploads:

        validator = StreamingSizeValidator(max_size=100 * 1024 * 1024)
        with SpooledTemporaryFile(max_size=5*1024*1024) as tmp:
            async for chunk in file.stream():
                validator.add(len(chunk))  # raises if exceeded
                tmp.write(chunk)
    """

    def __init__(self, max_size: int):
        self.max_size = max_size
        self.current_size = 0

    def add(self, chunk_size: int):
        """Account for one chunk; raise ValueError once the cap is passed."""
        running_total = self.current_size + chunk_size
        self.current_size = running_total
        if running_total > self.max_size:
            raise ValueError(
                f"Upload exceeds size limit: {self.current_size} > {self.max_size}"
            )

    @property
    def size(self) -> int:
        """Total number of bytes accounted for so far."""
        return self.current_size
backend/app/core/security.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Security Utilities
3
+ Handles password hashing, JWT generation, and API key verification.
4
+ """
5
+
6
+ from datetime import datetime, timedelta
7
+ from typing import Optional, Union, Any
8
+ from jose import jwt
9
+ from passlib.context import CryptContext
10
+ from fastapi.security import OAuth2PasswordBearer, APIKeyHeader
11
+ from fastapi import Depends, HTTPException, status
12
+ from sqlalchemy.orm import Session
13
+
14
+ from ..core.config import get_settings
15
+ from ..models import get_db, User, ApiKey
16
+
17
+ settings = get_settings()
18
+
19
+ # Password hashing (PBKDF2 is safer/easier on Windows than bcrypt sometimes)
20
+ pwd_context = CryptContext(schemes=["pbkdf2_sha256"], deprecated="auto")
21
+
22
+ # JWT configuration
23
+ SECRET_KEY = settings.secret_key
24
+ ALGORITHM = settings.algorithm
25
+ ACCESS_TOKEN_EXPIRE_MINUTES = settings.access_token_expire_minutes
26
+
27
+ # OAuth2 scheme - auto_error=False allows API key fallback
28
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="api/v1/auth/login", auto_error=False)
29
+ api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
30
+
31
+
32
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Check a plaintext password against its stored PBKDF2-SHA256 hash."""
    return pwd_context.verify(plain_password, hashed_password)
34
+
35
def get_password_hash(password: str) -> str:
    """Hash a plaintext password with the module's PBKDF2-SHA256 context."""
    return pwd_context.hash(password)
37
+
38
def create_access_token(subject: Union[str, Any], expires_delta: Optional[timedelta] = None) -> str:
    """
    Create a signed JWT access token.

    Args:
        subject: Value stored in the "sub" claim (coerced to str), typically a user id.
        expires_delta: Optional custom lifetime; defaults to
            ACCESS_TOKEN_EXPIRE_MINUTES from settings.

    Returns:
        The encoded JWT string, signed with SECRET_KEY using ALGORITHM.
    """
    # Local import: the module only imports datetime/timedelta at the top.
    from datetime import timezone

    # Timezone-aware "now" — datetime.utcnow() is deprecated (Python 3.12+),
    # and jose converts aware datetimes to the correct UTC timestamp.
    now = datetime.now(timezone.utc)
    if expires_delta:
        expire = now + expires_delta
    else:
        expire = now + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)

    to_encode = {"exp": expire, "sub": str(subject)}
    return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
47
+
48
async def get_current_user(
    token: str = Depends(oauth2_scheme),
    db: Session = Depends(get_db)
) -> Optional[User]:
    """
    Validate a bearer JWT and return the matching user.

    Returns None (instead of raising) when the token is missing, expired,
    malformed, or carries an absent/non-numeric "sub" claim, so that callers
    can fall back to API-key authentication.
    """
    if not token:
        return None  # Allow API key fallback

    try:
        payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
        user_id = payload.get("sub")
        if user_id is None:
            return None
        # BUGFIX: int() used to run outside this try block, so a token whose
        # "sub" claim was not numeric raised ValueError and surfaced as a 500
        # instead of being treated as an invalid credential.
        user_pk = int(user_id)
    except Exception:
        # jose raises JWTError subclasses on bad signature/expiry; anything
        # else here also means "not authenticated via JWT".
        return None

    return db.query(User).filter(User.id == user_pk).first()
66
+
67
async def get_current_active_user(current_user: Optional[User] = Depends(get_current_user)) -> User:
    """Resolve the authenticated user, rejecting anonymous or disabled accounts."""
    unauthenticated = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Not authenticated",
        headers={"WWW-Authenticate": "Bearer"},
    )
    if current_user is None:
        raise unauthenticated
    if not current_user.is_active:
        raise HTTPException(status_code=400, detail="Inactive user")
    return current_user
78
+
79
async def verify_api_key(
    api_key: str = Depends(api_key_header),
    db: Session = Depends(get_db)
) -> Optional[User]:
    """
    Validate an API key taken from the X-API-Key header.

    Returns the key's owning user when the key exists and is active, else
    None (callers decide whether missing credentials are an error).

    Side effect: bumps the key's last_used_at and commits on every
    successful lookup — one extra DB write per authenticated request.
    """
    if not api_key:
        return None  # Or raise if strict

    # `== True` is intentional here: it builds a SQLAlchemy boolean
    # expression on the column, not a Python identity check.
    key_record = db.query(ApiKey).filter(ApiKey.key == api_key, ApiKey.is_active == True).first()

    if key_record:
        # Update usage stats
        key_record.last_used_at = datetime.utcnow()
        db.commit()
        return key_record.user

    return None  # Invalid key
99
+
100
def get_api_user_or_jwt_user(
    api_key_user: Optional[User] = Depends(verify_api_key),
    jwt_user: Optional[User] = Depends(get_current_user)
) -> User:
    """Accept either credential type; a valid API key wins over a JWT."""
    user = api_key_user or jwt_user
    if user is not None:
        return user
    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Not authenticated"
    )
backend/app/core/security_encryption.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Field-level Encryption for SQLAlchemy Models.
3
+
4
+ Uses Fernet symmetric encryption from the `cryptography` library.
5
+ The ENCRYPTION_KEY should be a 32-byte base64-encoded key.
6
+ Generate one with: from cryptography.fernet import Fernet; print(Fernet.generate_key())
7
+ """
8
+
9
+ import os
10
+ import base64
11
+ import logging
12
+ from typing import Optional
13
+
14
+ from cryptography.fernet import Fernet, InvalidToken
15
+ from sqlalchemy import TypeDecorator, String
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # --- Configuration ---
20
+ # IMPORTANT: Store this securely! In production, use secrets manager or env vars.
21
+ # Default key is for development ONLY - regenerate for production!
22
+ _DEFAULT_DEV_KEY = "VOICEFORGE_DEV_KEY_REPLACE_ME_NOW=" # Placeholder - NOT a valid key
23
+
24
def _get_encryption_key() -> bytes:
    """
    Return the Fernet key from the environment, failing closed in production.

    Resolution order:
      1. ENCRYPTION_KEY env var (must already be a url-safe base64 Fernet key).
      2. If ENVIRONMENT=production and no key is set: raise RuntimeError.
      3. Otherwise: a fixed, INSECURE development-only key.

    Raises:
        RuntimeError: when running in production without ENCRYPTION_KEY.
    """
    key_str = os.getenv("ENCRYPTION_KEY")

    if key_str:
        return key_str.encode()

    # Check if running in production
    is_production = os.getenv("ENVIRONMENT", "development").lower() == "production"
    if is_production:
        raise RuntimeError(
            "ENCRYPTION_KEY environment variable must be set in production! "
            "Generate one with: python -c \"from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())\""
        )

    # Development fallback - deterministic but INSECURE
    logger.warning("⚠️ ENCRYPTION_KEY not set! Using DEV-ONLY fixed key. DO NOT USE IN PRODUCTION.")
    # BUGFIX: the previous literal was only 30 bytes long, so Fernet(key)
    # rejected it ("Fernet key must be 32 url-safe base64-encoded bytes")
    # and dev encryption never worked. Pad the material to exactly 32 bytes.
    dev_material = b"voiceforge_dev_key_32bytes_ok!".ljust(32, b"!")
    return base64.urlsafe_b64encode(dev_material)
43
+
44
+ # Cache the Fernet instance
45
+ _fernet: Optional[Fernet] = None
46
+
47
def get_fernet() -> Fernet:
    """
    Get or create the module-wide Fernet instance (lazy singleton).

    The key is resolved once via _get_encryption_key() and cached in the
    module global `_fernet`; a changed ENCRYPTION_KEY needs a process
    restart. There is no lock here, so concurrent first calls may each
    build an instance — harmless, since both use the same key.
    """
    global _fernet
    if _fernet is None:
        key = _get_encryption_key()
        _fernet = Fernet(key)
    return _fernet
54
+
55
+
56
+ # --- SQLAlchemy TypeDecorator ---
57
+
58
class EncryptedString(TypeDecorator):
    """
    SQLAlchemy type that encrypts/decrypts string values transparently.

    Usage:
        class User(Base):
            full_name = Column(EncryptedString(255), nullable=True)

    The encrypted data is stored as a base64-encoded Fernet token string.

    SECURITY NOTE(review): process_bind_param fails OPEN — if encryption
    errors out, the plaintext is stored silently. Acceptable for dev only;
    in production this should raise instead.
    """
    impl = String
    cache_ok = True

    def __init__(self, length: int = 512, *args, **kwargs):
        # Encrypted strings are longer than plaintext, so pad the length.
        # NOTE(review): doubling assumes plaintext is long enough; a Fernet
        # token has a fixed overhead (~100+ chars), so very small `length`
        # values could still truncate on strict databases — confirm.
        super().__init__(length * 2, *args, **kwargs)

    def process_bind_param(self, value, dialect):
        """Encrypt the value before storing in DB (None passes through)."""
        if value is None:
            return None

        try:
            fernet = get_fernet()
            # Encode string to bytes, encrypt, then decode to string for storage
            encrypted = fernet.encrypt(value.encode('utf-8'))
            return encrypted.decode('utf-8')
        except Exception as e:
            logger.error(f"Encryption failed: {e}")
            # In case of encryption failure, store plaintext (fail-open for dev)
            # In production, you might want to raise instead
            return value

    def process_result_value(self, value, dialect):
        """Decrypt the value when reading from DB (None passes through)."""
        if value is None:
            return None

        try:
            fernet = get_fernet()
            # Decode from storage string, decrypt, then decode to string
            decrypted = fernet.decrypt(value.encode('utf-8'))
            return decrypted.decode('utf-8')
        except InvalidToken:
            # Value might be plaintext (legacy data or encryption disabled)
            logger.warning("Decryption failed - returning raw value (possible legacy data)")
            return value
        except Exception as e:
            logger.error(f"Decryption failed: {e}")
            return value
backend/app/core/security_headers.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from starlette.middleware.base import BaseHTTPMiddleware
2
+ from starlette.types import ASGIApp, Receive, Scope, Send
3
+
4
class SecurityHeadersMiddleware(BaseHTTPMiddleware):
    """
    Attach defense-in-depth security headers to every HTTP response.

    Headers set: X-Frame-Options, X-Content-Type-Options, X-XSS-Protection,
    Strict-Transport-Security, Content-Security-Policy, Referrer-Policy.

    (The previous no-op __init__ override was removed — it only forwarded to
    super(), which BaseHTTPMiddleware already provides.)
    """

    async def dispatch(self, request, call_next):
        response = await call_next(request)

        # Prevent Clickjacking
        response.headers["X-Frame-Options"] = "DENY"

        # Prevent MIME type sniffing
        response.headers["X-Content-Type-Options"] = "nosniff"

        # Enable XSS filtering in browser (legacy but good for depth)
        response.headers["X-XSS-Protection"] = "1; mode=block"

        # Strict Transport Security (HSTS): enforce HTTPS for 1 year,
        # including subdomains. Only effective when served over HTTPS.
        response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"

        # Content Security Policy: default to same-origin; inline styles and
        # scripts remain allowed for Swagger UI compatibility.
        response.headers["Content-Security-Policy"] = "default-src 'self'; img-src 'self' data: https:; style-src 'self' 'unsafe-inline'; script-src 'self' 'unsafe-inline';"

        # Referrer Policy
        response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"

        return response
backend/app/core/ws_security.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ WebSocket Security Utilities
3
+ Authentication and validation for WebSocket connections.
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ from typing import Optional, Set
9
+ from urllib.parse import urlparse
10
+
11
+ from fastapi import WebSocket, WebSocketException, status
12
+ from jose import jwt, JWTError
13
+
14
+ from .config import get_settings
15
+
16
+ logger = logging.getLogger(__name__)
17
+ settings = get_settings()
18
+
19
+ # Allowed origins for WebSocket connections
20
+ _allowed_ws_origins: Optional[Set[str]] = None
21
+
22
+
23
def get_allowed_origins() -> Set[str]:
    """
    Get the set of allowed WebSocket origins.

    Computed once from the CORS_ORIGINS env var (comma-separated, trailing
    slashes stripped) and cached in the module global `_allowed_ws_origins`;
    env changes after the first call require a restart. When SPACE_ID is set
    (HuggingFace Spaces), the HF origins are added automatically.
    """
    global _allowed_ws_origins
    if _allowed_ws_origins is None:
        origins_str = os.getenv(
            "CORS_ORIGINS",
            "http://localhost:8501,http://localhost:3000,http://localhost:8000"
        )
        _allowed_ws_origins = {o.strip().rstrip('/') for o in origins_str.split(",")}

        # Add HuggingFace origins if deploying there
        if os.getenv("SPACE_ID"):
            _allowed_ws_origins.add("https://huggingface.co")
            _allowed_ws_origins.add(f"https://{os.getenv('SPACE_ID')}.hf.space")

        logger.info(f"WebSocket allowed origins: {_allowed_ws_origins}")

    return _allowed_ws_origins
41
+
42
+
43
def validate_ws_origin(websocket: WebSocket) -> bool:
    """
    Validate the WebSocket Origin header against allowed origins.
    Returns True if valid, False otherwise.

    Security notes (review):
    - A missing Origin header is ACCEPTED (fail-open) to support non-browser
      and same-origin clients; tighten for production.
    - Any origin whose host ends in ".hf.space" is accepted regardless of
      scheme — presumably for HuggingFace Spaces; confirm this is intended.
    """
    origin = websocket.headers.get("origin")

    if not origin:
        # No origin header - could be same-origin or non-browser client
        # In production, you might want to reject these
        logger.warning("WebSocket connection without Origin header")
        return True  # Allow for dev/non-browser clients

    # Normalize origin (allowed set is stored without trailing slashes)
    origin = origin.rstrip('/')
    allowed = get_allowed_origins()

    if origin in allowed:
        return True

    # Check for wildcard subdomain match (e.g., *.hf.space)
    parsed = urlparse(origin)
    if parsed.netloc.endswith('.hf.space'):
        return True

    logger.warning(f"WebSocket rejected: origin '{origin}' not in allowed list")
    return False
70
+
71
+
72
async def authenticate_websocket(
    websocket: WebSocket,
    token: Optional[str] = None
) -> Optional[int]:
    """
    Authenticate a WebSocket connection using a JWT.

    Token resolution order:
    - the explicit `token` argument (e.g. extracted by the caller from the
      first message after connect);
    - otherwise the `token` query parameter (ws://host/ws/endpoint?token=xxx).

    Returns the user id from the "sub" claim when the token verifies, else
    None. Note: "sub" is assumed to be numeric — a non-numeric value makes
    int() raise into the generic except below, which also yields None.
    """
    # Try query param first
    if not token:
        token = websocket.query_params.get("token")

    if not token:
        return None

    try:
        payload = jwt.decode(
            token,
            settings.secret_key,
            algorithms=[settings.algorithm]
        )
        user_id = payload.get("sub")
        if user_id:
            return int(user_id)
    except JWTError as e:
        logger.warning(f"WebSocket auth failed: {e}")
    except Exception as e:
        logger.error(f"WebSocket auth error: {e}")

    return None
107
+
108
+
109
async def require_ws_auth(websocket: WebSocket) -> int:
    """
    Gatekeeper for authenticated WebSocket endpoints.

    Checks the Origin header first, then the JWT. On either failure the
    socket is closed with policy-violation (1008) and WebSocketException is
    raised. Returns the authenticated user's id.

    Usage:
        @router.websocket("/secure/{client_id}")
        async def secure_ws(websocket: WebSocket, client_id: str):
            user_id = await require_ws_auth(websocket)
            await websocket.accept()
            # ... handle connection
    """
    async def _reject():
        await websocket.close(code=status.WS_1008_POLICY_VIOLATION)
        raise WebSocketException(code=status.WS_1008_POLICY_VIOLATION)

    if not validate_ws_origin(websocket):
        await _reject()

    user_id = await authenticate_websocket(websocket)
    if not user_id:
        await _reject()

    return user_id
131
+
132
+
133
class WebSocketRateLimiter:
    """
    Per-connection throttle for WebSocket traffic.

    Applies a fixed one-second window per client id plus a hard cap on
    individual message size. State is an in-process dict, so the limits are
    per worker, not global.
    """

    def __init__(self, max_messages_per_second: int = 10, max_message_size: int = 1024 * 1024):
        self.max_rate = max_messages_per_second
        self.max_size = max_message_size
        self._counts: dict = {}  # client_id -> (count, window_start_time)

    def check_rate(self, client_id: str) -> bool:
        """Record one message for client_id; False once the per-second cap is hit."""
        import time
        now = time.time()

        entry = self._counts.get(client_id)
        if entry is None or now - entry[1] >= 1.0:
            # Unknown client, or the one-second window elapsed: start fresh.
            self._counts[client_id] = (1, now)
            return True

        count, window_start = entry
        if count >= self.max_rate:
            logger.warning(f"WebSocket rate limit exceeded for {client_id}")
            return False

        self._counts[client_id] = (count + 1, window_start)
        return True

    def check_size(self, data: bytes) -> bool:
        """True when the payload fits within the configured byte limit."""
        within_limit = len(data) <= self.max_size
        if not within_limit:
            logger.warning(f"WebSocket message too large: {len(data)} > {self.max_size}")
        return within_limit

    def cleanup(self, client_id: str):
        """Drop all throttle state for a disconnected client."""
        self._counts.pop(client_id, None)
178
+
179
+
180
+ # Global rate limiter instance
181
+ ws_rate_limiter = WebSocketRateLimiter(max_messages_per_second=20, max_message_size=5 * 1024 * 1024)
backend/app/main.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge - FastAPI Main Application
3
+ Production-grade Speech-to-Text & Text-to-Speech API
4
+ """
5
+
6
+ import logging
7
# WARN: PyTorch 2.6+ security workaround for Pyannote.
# TORCH_FORCE_WEIGHTS_ONLY_LOAD must be set before any other torch import so
# torch.load() keeps accepting full pickled checkpoints.
import os
os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
import torch.serialization
try:
    # Allowlist `dict` for weights-only loading (no-op on torch versions
    # without add_safe_globals).
    torch.serialization.add_safe_globals([dict])
except Exception:
    # BUGFIX: previously a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit during startup.
    pass
16
+
17
+ from contextlib import asynccontextmanager
18
+ from fastapi import FastAPI, Request
19
+ from fastapi.middleware.cors import CORSMiddleware
20
+ from fastapi.responses import JSONResponse
21
+ from fastapi.openapi.utils import get_openapi
22
+
23
+ from prometheus_fastapi_instrumentator import Instrumentator
24
+ from .core.config import get_settings
25
+ from .api.routes import (
26
+ stt_router,
27
+ tts_router,
28
+ health_router,
29
+ transcripts_router,
30
+ ws_router,
31
+ translation_router,
32
+ batch_router,
33
+ analysis_router,
34
+ audio_router,
35
+ cloning_router,
36
+ sign_router,
37
+ auth_router,
38
+ s2s_router,
39
+ sign_bridge # Import the module
40
+ )
41
+ from .models import Base, engine
42
+
43
+
44
+
45
+ # Configure logging
46
+ logging.basicConfig(
47
+ level=logging.INFO,
48
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
49
+ )
50
+ logger = logging.getLogger(__name__)
51
+
52
+ settings = get_settings()
53
+
54
+
55
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan handler.

    Everything before `yield` runs at startup; everything after runs at
    shutdown. Model/voice pre-warming is best-effort: failures are logged
    as warnings and the app still starts (models load lazily on first use).
    """
    # Startup
    logger.info(f"Starting {settings.app_name} v{settings.app_version}")

    # Create database tables (create_all only adds missing tables; it does
    # not migrate existing schemas)
    logger.info("Creating database tables...")
    Base.metadata.create_all(bind=engine)

    # Pre-warm Whisper models for faster first request
    logger.info("Pre-warming AI models...")
    try:
        from .services.whisper_stt_service import get_whisper_model
        # Pre-load English Distil model (most common)
        get_whisper_model("distil-small.en")
        logger.info("✅ Distil-Whisper model loaded")
        # Pre-load multilingual model
        get_whisper_model("small")
        logger.info("✅ Whisper-small model loaded")
    except Exception as e:
        logger.warning(f"Model pre-warming failed: {e}")

    # Pre-cache TTS voice list (network call; best-effort)
    try:
        from .services.tts_service import get_tts_service
        tts_service = get_tts_service()
        await tts_service.get_voices()
        logger.info("✅ TTS voice list cached")
    except Exception as e:
        logger.warning(f"Voice list caching failed: {e}")

    logger.info("🚀 Startup complete - All models warmed up!")

    yield

    # Shutdown
    logger.info("Shutting down...")
    # TODO: Close database connections
    # TODO: Close Redis connections
    logger.info("Shutdown complete")
99
+
100
+
101
+ # Create FastAPI application
102
+ app = FastAPI(
103
+ title=settings.app_name,
104
+ description="""
105
+ ## VoiceForge API
106
+
107
+ Production-grade Speech-to-Text and Text-to-Speech API.
108
+
109
+ ### Features
110
+
111
+ - 🎤 **Speech-to-Text**: Transcribe audio files with word-level timestamps
112
+ - 🔊 **Text-to-Speech**: Synthesize speech with 300+ neural voices
113
+ - 🌍 **Multi-language**: Support for 10+ languages
114
+ - 🧠 **AI Analysis**: Sentiment, keywords, and summarization
115
+ - 🌐 **Translation**: Translate text/audio between 20+ languages
116
+ - ⚡ **Free & Fast**: Local Whisper + Edge TTS - no API costs
117
+ """,
118
+ version=settings.app_version,
119
+ docs_url="/docs",
120
+ redoc_url="/redoc",
121
+ lifespan=lifespan,
122
+ )
123
+
124
+
125
+ from slowapi import _rate_limit_exceeded_handler
126
+ from slowapi.errors import RateLimitExceeded
127
+ from slowapi.middleware import SlowAPIMiddleware
128
+ from .core.limiter import limiter
129
+ from .core.security_headers import SecurityHeadersMiddleware
130
+ from .core.request_size_middleware import RequestSizeLimitMiddleware
131
+
132
+ # Request body size limit (must be first to reject large requests early)
133
+ app.add_middleware(RequestSizeLimitMiddleware)
134
+
135
+ # Add Rate Limiting (default: 60 requests/min per IP)
136
+ app.state.limiter = limiter
137
+ app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
138
+ app.add_middleware(SlowAPIMiddleware)
139
+
140
+ # Security Headers (Must be before CORS to ensure headers are present even on errors/CORS blocks)
141
+ app.add_middleware(SecurityHeadersMiddleware)
142
+
143
+ # CORS middleware - Use CORS_ORIGINS env var (comma-separated) or defaults
144
+ _cors_origins = os.getenv(
145
+ "CORS_ORIGINS",
146
+ "http://localhost:8501,http://localhost:3000,http://localhost:8000"
147
+ ).split(",")
148
+ # Add HuggingFace origins if deploying there
149
+ if os.getenv("SPACE_ID"): # HF Spaces sets this
150
+ _cors_origins.extend(["https://huggingface.co", f"https://{os.getenv('SPACE_ID')}.hf.space"])
151
+
152
+ app.add_middleware(
153
+ CORSMiddleware,
154
+ allow_origins=[o.strip() for o in _cors_origins],
155
+ allow_credentials=True,
156
+ allow_methods=["*"],
157
+ allow_headers=["*"],
158
+ )
159
+
160
+ # Prometheus Metrics
161
+ Instrumentator().instrument(app).expose(app)
162
+
163
+
164
+ # Include routers
165
+ app.include_router(health_router)
166
+ app.include_router(auth_router, prefix="/api/v1")
167
+ app.include_router(stt_router, prefix="/api/v1")
168
+ app.include_router(tts_router, prefix="/api/v1")
169
+ app.include_router(transcripts_router, prefix="/api/v1")
170
+ app.include_router(ws_router, prefix="/api/v1")
171
+ app.include_router(translation_router, prefix="/api/v1")
172
+ app.include_router(batch_router, prefix="/api/v1")
173
+ app.include_router(analysis_router, prefix="/api/v1")
174
+ app.include_router(audio_router, prefix="/api/v1")
175
+ app.include_router(cloning_router, prefix="/api/v1")
176
+ app.include_router(sign_router, prefix="/api/v1")
177
+ app.include_router(s2s_router, prefix="/api/v1") # Added s2s_router
178
+ app.include_router(sign_bridge.router, prefix="/api/v1") # Added sign_bridge_router
179
+
180
+
181
+
182
+
183
+
184
+ # Exception handlers
185
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Catch-all for unhandled errors: log the traceback, hide details unless debug."""
    logger.exception(f"Unhandled error: {exc}")
    body = {
        "error": "internal_server_error",
        "message": "An unexpected error occurred",
        "detail": str(exc) if settings.debug else None,
    }
    return JSONResponse(status_code=500, content=body)
197
+
198
+
199
@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    """Map ValueError raised during request handling to a 400 response."""
    payload = {
        "error": "validation_error",
        "message": str(exc),
    }
    return JSONResponse(status_code=400, content=payload)
209
+
210
+
211
+ # Root endpoint
212
@app.get("/", tags=["Root"])
async def root():
    """Service banner: name, version, and where to find the docs and health check."""
    info = {
        "name": settings.app_name,
        "version": settings.app_version,
        "status": "running",
        "docs": "/docs",
        "health": "/health",
    }
    return info
222
+
223
+
224
+ # Custom OpenAPI schema
225
def custom_openapi():
    """
    Generate a custom OpenAPI schema with enhanced documentation.

    The result is memoized on app.openapi_schema, so routes registered after
    the first render are not reflected unless that attribute is cleared.
    """
    if app.openapi_schema:
        return app.openapi_schema

    openapi_schema = get_openapi(
        title=settings.app_name,
        version=settings.app_version,
        description=app.description,
        routes=app.routes,
    )

    # Add custom logo
    # NOTE(review): placeholder URL — replace with a real hosted logo.
    openapi_schema["info"]["x-logo"] = {
        "url": "https://example.com/logo.png"
    }

    # Add tags with descriptions (shown as section headers in /docs)
    openapi_schema["tags"] = [
        {
            "name": "Health",
            "description": "Health check endpoints for monitoring",
        },
        {
            "name": "Speech-to-Text",
            "description": "Convert audio to text with timestamps and speaker detection",
        },
        {
            "name": "Text-to-Speech",
            "description": "Convert text to natural-sounding speech",
        },
    ]

    app.openapi_schema = openapi_schema
    return app.openapi_schema
260
+
261
+
262
+ app.openapi = custom_openapi
263
+
264
+
265
+ if __name__ == "__main__":
266
+ import uvicorn
267
+
268
+ uvicorn.run(
269
+ "app.main:app",
270
+ host=settings.api_host,
271
+ port=settings.api_port,
272
+ reload=settings.debug,
273
+ )
backend/app/models/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge Database Models Package
3
+ """
4
+
5
+ from .base import Base, engine, SessionLocal, get_db
6
+ from .audio_file import AudioFile
7
+ from .transcript import Transcript
8
+ from .auth import User, ApiKey
9
+
10
+ __all__ = [
11
+ "Base",
12
+ "engine",
13
+ "SessionLocal",
14
+ "get_db",
15
+ "AudioFile",
16
+ "Transcript",
17
+ "User",
18
+ "ApiKey",
19
+ ]
backend/app/models/audio_file.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio File Model
3
+ """
4
+
5
+ from datetime import datetime
6
+ from sqlalchemy import Column, Integer, String, Float, DateTime, ForeignKey, Enum
7
+ from sqlalchemy.orm import relationship
8
+ import enum
9
+
10
+ from .base import Base
11
+
12
+
13
class AudioFileStatus(str, enum.Enum):
    """
    Audio file processing status.

    Inherits from str so members compare and serialize as their plain string
    values; the AudioFile.status column stores the .value, not the enum.
    """
    UPLOADED = "uploaded"      # stored on disk, not yet processed
    PROCESSING = "processing"  # transcription underway
    DONE = "done"              # processed successfully
    FAILED = "failed"          # processing aborted; see AudioFile.error_message
19
+
20
+
21
class AudioFile(Base):
    """Uploaded audio file and its processing metadata (one row per upload)."""

    __tablename__ = "audio_files"

    id = Column(Integer, primary_key=True, index=True)
    # user_id removed
    storage_path = Column(String(500), nullable=False)  # where the file lives on disk
    original_filename = Column(String(255), nullable=True)
    duration = Column(Float, nullable=True)  # Duration in seconds
    format = Column(String(20), nullable=True)  # wav, mp3, etc.
    sample_rate = Column(Integer, nullable=True)  # presumably Hz — confirm in upload pipeline
    channels = Column(Integer, nullable=True)  # channel count (1 = mono, 2 = stereo)
    file_size = Column(Integer, nullable=True)  # Size in bytes
    language = Column(String(10), nullable=True)  # User-specified language
    detected_language = Column(String(10), nullable=True)  # Auto-detected language
    status = Column(String(20), default=AudioFileStatus.UPLOADED.value, index=True)  # AudioFileStatus value
    error_message = Column(String(500), nullable=True)  # presumably set on failure — confirm in STT pipeline
    created_at = Column(DateTime, default=datetime.utcnow, index=True)
    processed_at = Column(DateTime, nullable=True)

    # Relationships
    # user relationship removed
    transcripts = relationship("Transcript", back_populates="audio_file")

    def __repr__(self):
        return f"<AudioFile(id={self.id}, filename={self.original_filename}, status={self.status})>"
backend/app/models/auth.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ User and API Key Models
3
+ """
4
+
5
+ from sqlalchemy import Column, Integer, String, Boolean, ForeignKey, DateTime
6
+ from sqlalchemy.orm import relationship
7
+ from datetime import datetime
8
+ from .base import Base
9
+ from ..core.security_encryption import EncryptedString
10
+
11
class User(Base):
    """Account record; full_name is encrypted at rest via EncryptedString."""

    __tablename__ = "users"

    id = Column(Integer, primary_key=True, index=True)
    email = Column(String, unique=True, index=True, nullable=False)  # Not encrypted (needed for lookup)
    hashed_password = Column(String, nullable=False)  # password hash only, never plaintext
    full_name = Column(EncryptedString(255), nullable=True)  # ENCRYPTED
    is_active = Column(Boolean, default=True)  # inactive accounts are rejected by get_current_active_user
    is_superuser = Column(Boolean, default=False)

    # Relationships
    api_keys = relationship("ApiKey", back_populates="user", cascade="all, delete-orphan")
23
+
24
+
25
class ApiKey(Base):
    """Long-lived API credential tied to a user (sent via the X-API-Key header)."""

    __tablename__ = "api_keys"

    id = Column(Integer, primary_key=True, index=True)
    # NOTE(review): the key is stored in plaintext; hashing it (as passwords
    # are) would limit damage from a database leak.
    key = Column(String, unique=True, index=True, nullable=False)
    name = Column(String, nullable=True)  # e.g. "Production App"
    is_active = Column(Boolean, default=True)  # only active keys authenticate (see verify_api_key)
    created_at = Column(DateTime, default=datetime.utcnow)
    last_used_at = Column(DateTime, nullable=True)  # updated on each successful lookup

    user_id = Column(Integer, ForeignKey("users.id"))
    user = relationship("User", back_populates="api_keys")
backend/app/models/base.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SQLAlchemy Base and Database Session
3
+ """
4
+
5
+ from sqlalchemy import create_engine
6
+ from sqlalchemy.ext.declarative import declarative_base
7
+ from sqlalchemy.orm import sessionmaker
8
+
9
+ from ..core.config import get_settings
10
+
11
settings = get_settings()

# Create SQLAlchemy engine.
# SQLite needs check_same_thread=False so the connection can be shared
# across FastAPI's threadpool workers (safe with session-per-request).
if "sqlite" in settings.database_url:
    engine = create_engine(
        settings.database_url,
        connect_args={"check_same_thread": False},
    )
else:
    # Server databases: pre-ping drops stale pooled connections, and the
    # pool is sized for moderate concurrency (10 + 20 overflow).
    engine = create_engine(
        settings.database_url,
        pool_pre_ping=True,
        pool_size=10,
        max_overflow=20,
    )

# Create session factory
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Create declarative base
Base = declarative_base()
33
+
34
+
35
def get_db():
    """FastAPI dependency that yields a database session.

    Opens one session per request and guarantees it is closed afterwards,
    whether the request handler succeeds or raises.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
backend/app/models/sign_lstm.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+
5
class SignLSTM(nn.Module):
    """LSTM classifier for sign-language gesture sequences.

    Input is a sequence of MediaPipe hand landmarks (21 points * 3 coords
    = 63 features per frame).  The stacked LSTM layers capture the temporal
    dynamics of a sign — its motion and context, not just a static hand
    shape — and a linear head maps the final hidden state to class logits
    (ASL alphabet or vocabulary).
    """

    def __init__(self, input_size=63, hidden_size=128, num_layers=2, num_classes=26):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # batch_first=True -> inputs are (batch, seq_len, features).
        # dropout applies between stacked LSTM layers during training.
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=0.2,
        )

        # Classification head over the last time step's hidden state
        self.fc = nn.Linear(hidden_size, num_classes)

        # Used only at inference time to turn logits into probabilities
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class logits.

        Args:
            x: tensor of shape (batch_size, sequence_length, input_size).
        """
        # Hidden and cell states default to zeros when not supplied.
        sequence_out, _ = self.lstm(x)
        # Keep only the representation of the final time step.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

    def predict(self, x):
        """Inference helper: return (predicted_class, confidence) for one sample."""
        with torch.no_grad():
            probabilities = self.softmax(self.forward(x))
            confidence, predicted_class = torch.max(probabilities, 1)
            return predicted_class.item(), confidence.item()
backend/app/models/transcript.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Transcript Model
3
+ """
4
+
5
+ from datetime import datetime
6
+ from sqlalchemy import Column, Integer, String, Text, DateTime, ForeignKey, JSON, Float
7
+ from sqlalchemy.orm import relationship
8
+
9
+ from .base import Base
10
+ from ..core.security_encryption import EncryptedString
11
+
12
+
13
class Transcript(Base):
    """Transcript database model.

    Stores the raw/processed transcript text (encrypted at rest), the
    segment- and word-level timing data as JSON, optional NLP analysis
    results, and bookkeeping metadata for one transcription run.
    """

    __tablename__ = "transcripts"

    id = Column(Integer, primary_key=True, index=True)
    # Nullable: ad-hoc transcriptions may not reference a stored audio file.
    # (Fix: this column was accidentally declared twice.)
    audio_file_id = Column(Integer, ForeignKey("audio_files.id"), nullable=True, index=True)
    # user_id removed (Auth disabled for portfolio)

    # Transcript content - ENCRYPTED
    raw_text = Column(EncryptedString(10000), nullable=True)  # Original transcription
    processed_text = Column(EncryptedString(10000), nullable=True)  # After NLP processing

    # Segments with timestamps and speaker info (JSON array)
    # Format: [{"start": 0.0, "end": 1.5, "text": "Hello", "speaker": "SPEAKER_1", "confidence": 0.95}]
    segments = Column(JSON, nullable=True)

    # Word-level timestamps (JSON array)
    # Format: [{"word": "hello", "start": 0.0, "end": 0.5, "confidence": 0.98}]
    words = Column(JSON, nullable=True)

    # Language info
    language = Column(String(10), nullable=True)  # Transcription language
    translation_language = Column(String(10), nullable=True)  # If translated
    translated_text = Column(Text, nullable=True)

    # NLP Analysis (Phase 2)
    sentiment = Column(JSON, nullable=True)  # {"overall": "positive", "score": 0.8, "segments": [...]}
    topics = Column(JSON, nullable=True)  # ["technology", "business"]
    keywords = Column(JSON, nullable=True)  # [{"word": "AI", "score": 0.9}]
    action_items = Column(JSON, nullable=True)  # [{"text": "Email John", "assignee": "Speaker 1"}]
    attendees = Column(JSON, nullable=True)  # ["Speaker 1", "Speaker 2"]
    summary = Column(EncryptedString(5000), nullable=True)  # ENCRYPTED

    # Metadata
    confidence = Column(Float, nullable=True)  # Overall confidence score
    duration = Column(Float, nullable=True)  # Audio duration in seconds
    word_count = Column(Integer, nullable=True)

    # Timestamps
    created_at = Column(DateTime, default=datetime.utcnow, index=True)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Relationships (single definition; duplicate removed)
    audio_file = relationship("AudioFile", back_populates="transcripts")
    # user relationship removed

    def __repr__(self):
        preview = self.raw_text[:50] + "..." if self.raw_text and len(self.raw_text) > 50 else self.raw_text
        return f"<Transcript(id={self.id}, preview='{preview}')>"
backend/app/schemas/__init__.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge Schemas Package
3
+ """
4
+
5
+ from .stt import (
6
+ TranscriptionRequest,
7
+ TranscriptionResponse,
8
+ TranscriptionSegment,
9
+ TranscriptionWord,
10
+ LanguageInfo,
11
+ )
12
+ from .tts import (
13
+ SynthesisRequest,
14
+ SynthesisResponse,
15
+ VoiceInfo,
16
+ VoiceListResponse,
17
+ )
18
+ from .transcript import (
19
+ TranscriptCreate,
20
+ TranscriptUpdate,
21
+ TranscriptResponse,
22
+ TranscriptListResponse,
23
+ )
24
+
25
+ __all__ = [
26
+ "TranscriptionRequest",
27
+ "TranscriptionResponse",
28
+ "TranscriptionSegment",
29
+ "TranscriptionWord",
30
+ "LanguageInfo",
31
+ "SynthesisRequest",
32
+ "SynthesisResponse",
33
+ "VoiceInfo",
34
+ "VoiceListResponse",
35
+ "TranscriptCreate",
36
+ "TranscriptUpdate",
37
+ "TranscriptResponse",
38
+ "TranscriptListResponse",
39
+ ]
backend/app/schemas/stt.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Speech-to-Text Schemas
3
+ """
4
+
5
+ from datetime import datetime
6
+ from typing import List, Optional, Dict, Any
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
class TranscriptionWord(BaseModel):
    """Individual word with timing information (offsets from audio start)."""
    word: str
    start_time: float = Field(..., description="Start time in seconds")
    end_time: float = Field(..., description="End time in seconds")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence score")
16
+
17
+
18
class TranscriptionSegment(BaseModel):
    """Transcript segment with speaker and timing.

    Optionally carries the word-level breakdown for this segment.
    """
    text: str
    start_time: float = Field(..., description="Start time in seconds")
    end_time: float = Field(..., description="End time in seconds")
    speaker: Optional[str] = Field(None, description="Speaker label (e.g., SPEAKER_1)")
    confidence: float = Field(..., ge=0.0, le=1.0)
    words: Optional[List[TranscriptionWord]] = None  # present when word timestamps were requested
26
+
27
+
28
class TranscriptionRequest(BaseModel):
    """Request parameters for transcription."""
    language: str = Field(default="en-US", description="Language code (e.g., en-US)")
    enable_automatic_punctuation: bool = True
    enable_word_time_offsets: bool = True  # word-level timestamps in the response
    enable_speaker_diarization: bool = False
    # Only meaningful when diarization is enabled; bounded to 2-10 speakers
    diarization_speaker_count: Optional[int] = Field(None, ge=2, le=10)
    model: str = Field(default="default", description="STT model to use")
36
+
37
+
38
class TranscriptionResponse(BaseModel):
    """Response from transcription."""
    id: Optional[int] = None  # DB row id, when the transcript was persisted
    audio_file_id: Optional[int] = None
    text: str = Field(..., description="Full transcription text")
    segments: List[TranscriptionSegment] = Field(default_factory=list)
    words: Optional[List[TranscriptionWord]] = None
    language: str  # language that was requested
    detected_language: Optional[str] = None  # auto-detected language, if any
    confidence: float = Field(..., ge=0.0, le=1.0)
    duration: float = Field(..., description="Audio duration in seconds")
    word_count: int
    processing_time: float = Field(..., description="Processing time in seconds")

    # Pydantic v2: allow construction directly from ORM objects
    model_config = {
        "from_attributes": True
    }
55
+
56
+
57
class StreamingTranscriptionResponse(BaseModel):
    """Response for streaming transcription updates.

    Interim hypotheses arrive with is_final=False; the final result for a
    stretch of audio has is_final=True.
    """
    is_final: bool = False
    text: str
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    stability: float = Field(default=0.0, ge=0.0, le=1.0)  # how unlikely the interim text is to change
63
+
64
+
65
class LanguageInfo(BaseModel):
    """Language information for UI display."""
    code: str = Field(..., description="Language code (e.g., en-US)")
    name: str = Field(..., description="Display name (e.g., English (US))")
    native_name: str = Field(..., description="Native name (e.g., English)")
    flag: str = Field(..., description="Flag emoji")
    stt_supported: bool = True  # speech-to-text available for this language
    tts_supported: bool = True  # text-to-speech available for this language
73
+
74
+
75
class LanguageListResponse(BaseModel):
    """Response with list of supported languages."""
    languages: List[LanguageInfo]
    total: int  # number of entries in `languages`
79
+
80
+
81
+
82
class TaskStatusResponse(BaseModel):
    """Status of an async transcription task.

    `result` is populated only once `status` is "completed"; `error` only
    when it is "failed".
    """
    task_id: str
    status: str = Field(..., description="pending, processing, completed, failed")
    progress: float = Field(default=0.0, ge=0.0, le=100.0, description="Progress percentage")
    result: Optional[TranscriptionResponse] = None
    error: Optional[str] = None
    created_at: datetime
    updated_at: datetime
91
+
92
+
93
class AsyncTranscriptionResponse(BaseModel):
    """Response for async transcription submission (202-style acknowledgement)."""
    task_id: str  # poll this id via the task-status endpoint
    audio_file_id: int
    status: str = "queued"
    message: str = "File uploaded and queued for processing"
backend/app/schemas/transcript.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Transcript Schemas
3
+ """
4
+
5
+ from datetime import datetime
6
+ from typing import List, Optional, Dict, Any
7
+ from pydantic import BaseModel, Field
8
+
9
+ from .stt import TranscriptionSegment, TranscriptionWord
10
+
11
+
12
class TranscriptCreate(BaseModel):
    """Schema for creating a transcript."""
    raw_text: str
    processed_text: Optional[str] = None  # post-NLP text, if already computed
    segments: Optional[List[Dict[str, Any]]] = None  # raw segment dicts (start/end/text/...)
    words: Optional[List[Dict[str, Any]]] = None  # raw word-timestamp dicts
    language: str = "en-US"
    confidence: Optional[float] = None
    duration: Optional[float] = None  # audio duration in seconds
21
+
22
+
23
class TranscriptUpdate(BaseModel):
    """Schema for updating a transcript (all fields optional / partial update)."""
    processed_text: Optional[str] = None
    language: Optional[str] = None
27
+
28
+
29
class TranscriptResponse(BaseModel):
    """Schema for transcript response (mirrors the Transcript ORM model)."""
    id: int
    audio_file_id: Optional[int] = None
    user_id: Optional[int] = None  # kept for API compatibility; auth is disabled
    raw_text: Optional[str] = None
    processed_text: Optional[str] = None
    segments: Optional[List[Dict[str, Any]]] = None
    words: Optional[List[Dict[str, Any]]] = None
    language: Optional[str] = None
    translation_language: Optional[str] = None
    translated_text: Optional[str] = None
    sentiment: Optional[Dict[str, Any]] = None
    topics: Optional[List[str]] = None
    keywords: Optional[List[Dict[str, Any]]] = None
    summary: Optional[str] = None
    confidence: Optional[float] = None
    duration: Optional[float] = None  # audio duration in seconds
    word_count: Optional[int] = None
    created_at: datetime
    updated_at: Optional[datetime] = None

    # Pydantic v2: allow construction directly from ORM objects
    model_config = {
        "from_attributes": True
    }
54
+
55
+
56
class TranscriptListResponse(BaseModel):
    """Schema for paginated transcript list."""
    transcripts: List[TranscriptResponse]
    total: int  # total matching rows, not just this page
    page: int
    page_size: int
    has_more: bool  # True when further pages exist
63
+
64
+
65
class ExportRequest(BaseModel):
    """Schema for transcript export request."""
    # Only these export formats are accepted (validated via regex)
    format: str = Field(..., pattern="^(txt|srt|vtt|pdf|json)$")
    include_timestamps: bool = True
    include_speakers: bool = True
backend/app/schemas/tts.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text-to-Speech Schemas
3
+ """
4
+
5
+ from typing import List, Optional
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
class SynthesisRequest(BaseModel):
    """Request for text-to-speech synthesis."""
    text: str = Field(..., min_length=1, max_length=5000, description="Text to synthesize")
    language: str = Field(default="en-US", description="Language code")
    voice: Optional[str] = Field(None, description="Voice name (e.g., en-US-Wavenet-D)")

    # Audio configuration
    audio_encoding: str = Field(default="MP3", description="Output format: MP3, LINEAR16, OGG_OPUS")
    sample_rate: int = Field(default=24000, description="Sample rate in Hz")

    # Voice tuning
    speaking_rate: float = Field(default=1.0, ge=0.25, le=4.0, description="Speaking rate")
    pitch: float = Field(default=0.0, ge=-20.0, le=20.0, description="Voice pitch in semitones")
    volume_gain_db: float = Field(default=0.0, ge=-96.0, le=16.0, description="Volume gain in dB")

    # SSML support
    use_ssml: bool = Field(default=False, description="Treat text as SSML")
26
+
27
+
28
class SynthesisResponse(BaseModel):
    """Response from text-to-speech synthesis."""
    audio_content: str = Field(..., description="Base64 encoded audio")
    audio_size: int = Field(..., description="Audio size in bytes")
    duration_estimate: float = Field(..., description="Estimated duration in seconds")
    voice_used: str  # actual voice selected (may differ from the requested one)
    language: str
    encoding: str
    sample_rate: int
    processing_time: float = Field(..., description="Processing time in seconds")
38
+
39
+
40
class VoiceInfo(BaseModel):
    """Information about a TTS voice."""
    name: str = Field(..., description="Voice name (e.g., en-US-Wavenet-D)")
    language_code: str = Field(..., description="Language code")
    language_name: str = Field(..., description="Language display name")
    ssml_gender: str = Field(..., description="MALE, FEMALE, or NEUTRAL")
    natural_sample_rate: int = Field(..., description="Native sample rate in Hz")
    voice_type: str = Field(..., description="Standard, WaveNet, or Neural2")

    # Display helpers (optional; filled in for UI rendering)
    display_name: Optional[str] = None
    flag: Optional[str] = None
52
+
53
+
54
class VoiceListResponse(BaseModel):
    """Response with list of available voices."""
    voices: List[VoiceInfo]
    total: int  # number of entries in `voices`
    language_filter: Optional[str] = None  # language code used to filter, if any
59
+
60
+
61
class VoicePreviewRequest(BaseModel):
    """Request for voice preview (short sample clip of a given voice)."""
    voice: str = Field(..., description="Voice name to preview")
    text: Optional[str] = Field(
        default="Hello! This is a preview of my voice.",
        max_length=200
    )
backend/app/services/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge Services Package
3
+ """
4
+
5
+ from .stt_service import STTService
6
+ from .tts_service import TTSService
7
+ from .file_service import FileService
8
+
9
+ __all__ = [
10
+ "STTService",
11
+ "TTSService",
12
+ "FileService",
13
+ ]
backend/app/services/audio_service.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio Editing Service
3
+ Handles audio manipulation: Trimming, Merging, and Conversion using Pydub/FFmpeg
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ from typing import List, Optional
9
+ from pydub import AudioSegment
10
+ import tempfile
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
class AudioService:
    """
    Service for audio manipulation tasks: trimming, merging, conversion.
    Requires ffmpeg to be installed/available in path (used by pydub).
    """

    def __init__(self):
        pass

    def load_audio(self, file_path: str) -> AudioSegment:
        """Load audio file into Pydub AudioSegment.

        Raises:
            ValueError: if the file is missing or cannot be decoded.
        """
        try:
            return AudioSegment.from_file(file_path)
        except Exception as e:
            logger.error(f"Failed to load audio {file_path}: {e}")
            raise ValueError(f"Could not load audio file: {str(e)}")

    def trim_audio(self, input_path: str, start_ms: int, end_ms: int, output_path: Optional[str] = None) -> str:
        """
        Trim audio from start_ms to end_ms (milliseconds).

        Returns the path of the exported clip. When output_path is omitted,
        writes "<input>_trimmed.<ext>" next to the input.
        """
        if start_ms < 0 or end_ms <= start_ms:
            raise ValueError("Invalid start/end timestamps")

        audio = self.load_audio(input_path)

        # Check duration (pydub clamps an end_ms past the audio end)
        if start_ms >= len(audio):
            raise ValueError("Start time exceeds audio duration")

        # Slice
        trimmed = audio[start_ms:end_ms]

        if not output_path:
            base, ext = os.path.splitext(input_path)
            output_path = f"{base}_trimmed{ext}"

        # Fix: fall back to mp3 when the output path has no extension;
        # previously export() received an empty format string and failed.
        fmt = os.path.splitext(output_path)[1][1:] or "mp3"
        trimmed.export(output_path, format=fmt)
        logger.info(f"Trimmed audio saved to {output_path}")
        return output_path

    def merge_audio(self, file_paths: List[str], output_path: str, crossfade_ms: int = 0) -> str:
        """
        Merge multiple audio files into one, optionally crossfading
        between consecutive clips.
        """
        if not file_paths:
            raise ValueError("No files to merge")

        combined = AudioSegment.empty()

        for path in file_paths:
            segment = self.load_audio(path)
            if crossfade_ms > 0 and len(combined) > 0:
                combined = combined.append(segment, crossfade=crossfade_ms)
            else:
                combined += segment

        # Create dir if needed. Guard against a bare filename: dirname("")
        # would make os.makedirs raise.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Export (default to mp3 when the path has no extension)
        fmt = os.path.splitext(output_path)[1][1:] or "mp3"
        combined.export(output_path, format=fmt)
        logger.info(f"Merged {len(file_paths)} files to {output_path}")
        return output_path

    def convert_format(self, input_path: str, target_format: str) -> str:
        """
        Convert audio format (e.g. wav -> mp3).

        Writes "<input base>.<target_format>" and returns its path.
        """
        audio = self.load_audio(input_path)

        base = os.path.splitext(input_path)[0]
        output_path = f"{base}.{target_format}"

        audio.export(output_path, format=target_format)
        logger.info(f"Converted to {target_format}: {output_path}")
        return output_path
92
+
93
+
94
# Lazily-created module singleton
_audio_service = None


def get_audio_service() -> AudioService:
    """Return the shared AudioService, constructing it on first use."""
    global _audio_service
    if _audio_service is not None:
        return _audio_service
    _audio_service = AudioService()
    return _audio_service
backend/app/services/batch_service.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Batch Processing Service
3
+ Handles multi-file transcription with job tracking and parallel processing
4
+ """
5
+
6
+ import asyncio
7
+ import logging
8
+ import os
9
+ import tempfile
10
+ import uuid
11
+ import zipfile
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Dict, List, Optional, Any
15
+ from dataclasses import dataclass, field
16
+ from enum import Enum
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class JobStatus(str, Enum):
    """Batch job status enum (str-valued so it serializes directly to JSON)."""
    PENDING = "pending"        # created, not yet started
    PROCESSING = "processing"  # worker loop is running
    COMPLETED = "completed"    # finished with zero failed files
    FAILED = "failed"          # finished with at least one failed file
    CANCELLED = "cancelled"    # cancelled before completion
28
+
29
+
30
class FileStatus(str, Enum):
    """Individual file status within a batch job."""
    QUEUED = "queued"          # waiting its turn in the job
    PROCESSING = "processing"  # currently being transcribed
    COMPLETED = "completed"
    FAILED = "failed"
36
+
37
+
38
@dataclass
class FileResult:
    """Result for a single file in batch."""
    filename: str
    status: FileStatus = FileStatus.QUEUED
    progress: float = 0.0  # 0-100 for this file
    transcript: Optional[str] = None  # full transcript text once completed
    language: Optional[str] = None  # detected/used language code
    duration: Optional[float] = None  # audio duration in seconds
    word_count: Optional[int] = None
    processing_time: Optional[float] = None  # wall-clock seconds for this file
    error: Optional[str] = None  # set only when status == FAILED
    output_path: Optional[str] = None  # path of the written txt/srt output
51
+
52
+
53
@dataclass
class BatchJob:
    """Batch processing job: aggregate status plus per-file results."""
    job_id: str
    status: JobStatus = JobStatus.PENDING
    created_at: datetime = field(default_factory=datetime.now)
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    files: Dict[str, FileResult] = field(default_factory=dict)  # keyed by filename
    total_files: int = 0
    completed_files: int = 0
    failed_files: int = 0
    options: Dict[str, Any] = field(default_factory=dict)  # e.g. language, output_format
    output_zip_path: Optional[str] = None  # set once all outputs are zipped

    @property
    def progress(self) -> float:
        """Overall job progress percentage (finished files / total, 0-100)."""
        if self.total_files == 0:
            # guard against division by zero for empty jobs
            return 0.0
        return (self.completed_files + self.failed_files) / self.total_files * 100

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for API response.

        Transcripts are truncated to 500 chars to keep the payload small;
        clients download full results via the ZIP endpoint.
        """
        return {
            "job_id": self.job_id,
            "status": self.status.value,
            "progress": round(self.progress, 1),
            "created_at": self.created_at.isoformat(),
            "started_at": self.started_at.isoformat() if self.started_at else None,
            "completed_at": self.completed_at.isoformat() if self.completed_at else None,
            "total_files": self.total_files,
            "completed_files": self.completed_files,
            "failed_files": self.failed_files,
            "files": {
                name: {
                    "filename": f.filename,
                    "status": f.status.value,
                    "progress": f.progress,
                    # truncate long transcripts for the status payload
                    "transcript": f.transcript[:500] + "..." if f.transcript and len(f.transcript) > 500 else f.transcript,
                    "language": f.language,
                    "duration": f.duration,
                    "word_count": f.word_count,
                    "processing_time": f.processing_time,
                    "error": f.error,
                }
                for name, f in self.files.items()
            },
            "options": self.options,
            "has_zip": self.output_zip_path is not None,
        }
104
+
105
+
106
+ # In-memory job store (use Redis in production)
107
+ _batch_jobs: Dict[str, BatchJob] = {}
108
+
109
+
110
class BatchProcessingService:
    """
    Service for batch audio transcription.

    Processes multiple files with per-file progress tracking, writes one
    output file per input (txt or srt), and bundles all outputs into a ZIP.
    Job state lives in the module-level in-memory store (use Redis in
    production).
    """

    def __init__(self, output_dir: Optional[str] = None):
        """Initialize batch service.

        Args:
            output_dir: Directory for per-job outputs and result ZIPs.
                Defaults to the system temp directory.
        """
        self.output_dir = output_dir or tempfile.gettempdir()
        self._processing_lock = asyncio.Lock()

    def create_job(
        self,
        filenames: List[str],
        options: Optional[Dict[str, Any]] = None,
    ) -> BatchJob:
        """
        Create a new batch job.

        Args:
            filenames: List of filenames to process
            options: Processing options (language, output_format, etc.)

        Returns:
            Created BatchJob
        """
        # A short id is sufficient for this in-memory, single-process store
        job_id = str(uuid.uuid4())[:8]

        files = {
            name: FileResult(filename=name)
            for name in filenames
        }

        job = BatchJob(
            job_id=job_id,
            files=files,
            total_files=len(filenames),
            options=options or {},
        )

        _batch_jobs[job_id] = job
        logger.info(f"Created batch job {job_id} with {len(filenames)} files")

        return job

    def get_job(self, job_id: str) -> Optional[BatchJob]:
        """Get job by ID, or None if unknown."""
        return _batch_jobs.get(job_id)

    def list_jobs(self, limit: int = 20) -> List[BatchJob]:
        """List the most recent jobs, newest first."""
        jobs = list(_batch_jobs.values())
        jobs.sort(key=lambda j: j.created_at, reverse=True)
        return jobs[:limit]

    async def process_job(
        self,
        job_id: str,
        file_paths: Dict[str, str],
    ) -> BatchJob:
        """
        Process all files in a batch job.

        Args:
            job_id: Job ID
            file_paths: Mapping of filename -> temp file path

        Returns:
            Completed BatchJob

        Raises:
            ValueError: if the job ID is unknown.
        """
        import time  # hoisted out of the per-file loop

        job = self.get_job(job_id)
        if not job:
            raise ValueError(f"Job not found: {job_id}")

        job.status = JobStatus.PROCESSING
        job.started_at = datetime.now()

        # Get options
        language = job.options.get("language")
        output_format = job.options.get("output_format", "txt")

        # Process each file sequentially; the heavy compute runs in Celery workers
        output_files: List[str] = []

        for filename, file_path in file_paths.items():
            file_result = job.files.get(filename)
            if not file_result:
                continue

            file_result.status = FileStatus.PROCESSING
            file_result.progress = 0.0

            try:
                start_time = time.time()

                # Transcribe via Celery worker (local import: celery app is
                # only needed here, and importing lazily avoids cycles)
                from app.workers.tasks import transcribe_file_path

                # Dispatch task
                task = transcribe_file_path.delay(
                    file_path=file_path,
                    language=language,
                    output_format=output_format
                )

                # Wait for result (this service already runs in a background
                # thread). In a fully async architecture we would return the
                # job id and poll, but a synchronous wait keeps the batch
                # logic simple while the compute still scales out.
                task_result = task.get(timeout=600)  # 10 min timeout per file

                processing_time = time.time() - start_time

                # Update file result
                file_result.transcript = task_result.get("text", "")
                file_result.language = task_result.get("language", "unknown")
                file_result.duration = task_result.get("duration")
                file_result.word_count = len(file_result.transcript.split())
                file_result.processing_time = round(processing_time, 2)
                file_result.status = FileStatus.COMPLETED
                file_result.progress = 100.0

                # Save output file under a per-job directory
                output_filename = Path(filename).stem + f".{output_format}"
                output_path = os.path.join(self.output_dir, job_id, output_filename)
                os.makedirs(os.path.dirname(output_path), exist_ok=True)

                with open(output_path, "w", encoding="utf-8") as f:
                    if output_format == "srt":
                        # Write SRT format from the raw segment dicts
                        for i, seg in enumerate(task_result.get("segments", []), 1):
                            start = self._format_srt_time(seg.get("start", 0))
                            end = self._format_srt_time(seg.get("end", 0))
                            text = seg.get("text", "").strip()
                            f.write(f"{i}\n{start} --> {end}\n{text}\n\n")
                    else:
                        f.write(file_result.transcript)

                file_result.output_path = output_path
                output_files.append(output_path)

                job.completed_files += 1
                # Fix: log the actual filename (was hard-coded "(unknown)")
                logger.info(f"[{job_id}] Completed {filename} ({job.completed_files}/{job.total_files})")

            except Exception as e:
                file_result.status = FileStatus.FAILED
                file_result.error = str(e)
                file_result.progress = 0.0
                job.failed_files += 1
                logger.error(f"[{job_id}] Failed {filename}: {e}")

            finally:
                # Best-effort temp file cleanup; never fail the job over it
                try:
                    if os.path.exists(file_path):
                        os.unlink(file_path)
                except OSError:
                    pass

        # Create ZIP of all outputs
        if output_files:
            zip_path = os.path.join(self.output_dir, f"{job_id}_results.zip")
            with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
                for file_path in output_files:
                    zf.write(file_path, os.path.basename(file_path))

            job.output_zip_path = zip_path
            logger.info(f"[{job_id}] Created ZIP: {zip_path}")

        # Update job status: FAILED if any file failed, else COMPLETED
        job.status = JobStatus.COMPLETED if job.failed_files == 0 else JobStatus.FAILED
        job.completed_at = datetime.now()

        return job

    def _format_srt_time(self, seconds: float) -> str:
        """Format seconds to SRT time format (HH:MM:SS,mmm)."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        millis = int((seconds % 1) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    def cancel_job(self, job_id: str) -> bool:
        """Cancel a pending/processing job. Returns True if the state changed."""
        job = self.get_job(job_id)
        if job and job.status in [JobStatus.PENDING, JobStatus.PROCESSING]:
            job.status = JobStatus.CANCELLED
            return True
        return False

    def delete_job(self, job_id: str) -> bool:
        """Delete a job and its output files. Returns True if the job existed."""
        job = _batch_jobs.pop(job_id, None)
        if job:
            # Clean up files (best effort; ignore filesystem races)
            if job.output_zip_path and os.path.exists(job.output_zip_path):
                try:
                    os.unlink(job.output_zip_path)
                except OSError:
                    pass

            job_dir = os.path.join(self.output_dir, job_id)
            if os.path.exists(job_dir):
                try:
                    import shutil
                    shutil.rmtree(job_dir)
                except OSError:
                    pass

            return True
        return False

    def get_zip_path(self, job_id: str) -> Optional[str]:
        """Get path to job's output ZIP file, or None if not available."""
        job = self.get_job(job_id)
        if job and job.output_zip_path and os.path.exists(job.output_zip_path):
            return job.output_zip_path
        return None
337
+
338
+
339
# Lazily-created module singleton
_batch_service: Optional[BatchProcessingService] = None


def get_batch_service() -> BatchProcessingService:
    """Return the shared BatchProcessingService, constructing it on first use."""
    global _batch_service
    if _batch_service is not None:
        return _batch_service
    _batch_service = BatchProcessingService()
    return _batch_service
backend/app/services/cache_service.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import redis
2
+ import json
3
+ import hashlib
4
+ import logging
5
+ from typing import Optional, Any
6
+ from functools import lru_cache
7
+
8
+ from ..core.config import get_settings
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
class CacheService:
    """Two-tier byte cache: Redis when reachable, DiskCache as fallback.

    Values are raw bytes; callers handle serialization. Every backend
    failure is logged and degrades to a cache miss instead of raising.
    """

    def __init__(self):
        settings = get_settings()
        self.default_ttl = 3600  # seconds (1 hour)
        self.redis = None
        self.disk_cache = None

        # Try Redis first; ping() makes an unreachable server fail fast here
        # instead of on the first get/set.
        try:
            self.redis = redis.from_url(settings.redis_url, decode_responses=False)
            self.redis.ping()
            logger.info("✅ Redis Cache connected")
        except Exception as e:
            logger.warning(f"⚠️ Redis unavailable, falling back to DiskCache: {e}")
            self.redis = None

        # DiskCache is always initialized so it is ready as a fallback.
        try:
            import diskcache
            cache_dir = "./cache_data"
            self.disk_cache = diskcache.Cache(cache_dir)
            logger.info(f"💾 DiskCache initialized at {cache_dir}")
        except Exception as e:
            logger.error(f"❌ DiskCache init failed: {e}")

    def get(self, key: str) -> Optional[bytes]:
        """Get raw bytes from cache; returns None on miss or backend error."""
        try:
            if self.redis:
                return self.redis.get(key)
            elif self.disk_cache:
                return self.disk_cache.get(key)
        except Exception as e:
            logger.error(f"Cache get failed: {e}")
        return None

    def set(self, key: str, value: bytes, ttl: Optional[int] = None):
        """Store raw bytes under `key`.

        Args:
            key: Cache key (see generate_key for a stable scheme).
            value: Raw bytes to store.
            ttl: Expiry in seconds; falls back to self.default_ttl when
                None or 0 (fixed annotation: was typed plain `int` with a
                None default).
        """
        try:
            ttl_val = ttl or self.default_ttl

            if self.redis:
                self.redis.setex(key, ttl_val, value)
            elif self.disk_cache:
                self.disk_cache.set(key, value, expire=ttl_val)
        except Exception as e:
            logger.error(f"Cache set failed: {e}")

    def generate_key(self, prefix: str, **kwargs) -> str:
        """Generate a stable cache key from keyword arguments.

        Values are stringified so mixed types hash consistently, and
        `sort_keys=True` makes the key independent of call-site argument
        order (the previous explicit pre-sort was redundant with it).
        MD5 is used for key derivation only, not for security.
        """
        safe_kwargs = {k: str(v) for k, v in kwargs.items()}
        key_str = json.dumps(safe_kwargs, sort_keys=True)
        hash_str = hashlib.md5(key_str.encode()).hexdigest()
        return f"{prefix}:{hash_str}"
68
+
69
@lru_cache(maxsize=None)
def get_cache_service() -> CacheService:
    """Return the shared CacheService instance (constructed on first call)."""
    return CacheService()
backend/app/services/clone_service.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Voice Cloning Service (Coqui XTTS)
3
+ High-quality multi-lingual text-to-speech with voice cloning capabilities.
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ import torch
9
+ import gc
10
+ from typing import List, Optional, Dict, Any
11
+ from pathlib import Path
12
+ import tempfile
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
class CloneService:
    """
    Service for Voice Cloning using Coqui XTTS v2.

    The heavy model is lazy-loaded on first use and can be unloaded to
    reclaim RAM/VRAM between jobs.
    """

    def __init__(self):
        # Prefer GPU when available; XTTS works on CPU but is much slower.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = None
        self.model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
        self.loaded = False

    def load_model(self):
        """Lazy load the heavy XTTS model (no-op when already loaded).

        Raises:
            ImportError: if the optional 'TTS' package is not installed
                (chained to the original import error).
        """
        if self.loaded:
            return

        try:
            logger.info(f"Loading XTTS model ({self.device})... This may take a while.")
            from TTS.api import TTS

            # Load model
            self.tts = TTS(self.model_name).to(self.device)
            self.loaded = True
            logger.info("✅ XTTS Model loaded successfully")

        except ImportError as e:
            logger.error("TTS library not installed. Please install 'TTS'.")
            # Chain the cause so the original failure is preserved.
            raise ImportError("Voice Cloning requires 'TTS' library.") from e
        except Exception as e:
            logger.error(f"Failed to load XTTS model: {e}")
            raise

    def unload_model(self):
        """Unload the model and free memory (safe to call repeatedly)."""
        if self.tts:
            del self.tts
            self.tts = None
            self.loaded = False
            gc.collect()
            if torch.cuda.is_available():
                # Only meaningful — and only safe to call — with CUDA present.
                torch.cuda.empty_cache()
            logger.info("🗑️ XTTS Model unloaded")

    def clone_voice(
        self,
        text: str,
        speaker_wav_paths: List[str],
        language: str = "en",
        output_path: Optional[str] = None
    ) -> str:
        """
        Synthesize speech in the style of the reference audio.

        Args:
            text: Text to speak.
            speaker_wav_paths: Reference WAV file(s); several samples
                usually improve cloning quality.
            language: XTTS language code (see get_supported_languages()).
            output_path: Destination WAV; a temp file is created when
                omitted. The caller owns (and must eventually delete) it.

        Returns:
            Path to the generated WAV file.
        """
        if not self.loaded:
            self.load_model()

        if not output_path:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                output_path = f.name

        try:
            # XTTS synthesis
            # Note: speaker_wav can be a list of files for better cloning
            self.tts.tts_to_file(
                text=text,
                speaker_wav=speaker_wav_paths,
                language=language,
                file_path=output_path,
                split_sentences=True
            )

            logger.info(f"Cloned speech generated: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Cloning failed: {e}")
            raise

    def get_supported_languages(self) -> List[str]:
        """Return the language codes supported by XTTS v2."""
        return ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"]
96
+
97
# Lazily-created module singleton
_clone_service = None


def get_clone_service():
    """Return the shared CloneService instance, creating it on first use."""
    global _clone_service
    if _clone_service is None:
        _clone_service = CloneService()
    return _clone_service
backend/app/services/diarization_service.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Speaker Diarization Service - Clean Implementation
3
+ Uses faster-whisper + pyannote.audio directly (no whisperx)
4
+
5
+ This avoids the KeyError bugs in whisperx alignment while providing
6
+ the same functionality.
7
+ """
8
+
9
+ import os
10
+ import gc
11
+ import logging
12
+ import torch
13
+ from typing import Optional, Dict, Any, List
14
+ from dotenv import load_dotenv
15
+
16
+ from app.core.config import get_settings
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # Load environment variables from .env file
21
+ load_dotenv()
22
+
23
+ # Workaround for PyTorch 2.6+ weights_only security restriction
24
+ os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
25
+
26
+
27
class DiarizationService:
    """
    Speaker Diarization Service using faster-whisper + pyannote.audio.

    This implementation avoids whisperx entirely to prevent alignment bugs.

    Flow:
    1. Transcribe with faster-whisper (word-level timestamps)
    2. Diarize with pyannote.audio (speaker segments)
    3. Merge speakers with transcript segments

    Requires:
    - faster-whisper (already installed)
    - pyannote.audio
    - Valid Hugging Face Token (HF_TOKEN) in .env
    """

    def __init__(self):
        self.settings = get_settings()

        # Auto-detect GPU (prefer CUDA for speed)
        if torch.cuda.is_available():
            self.device = "cuda"
            self.compute_type = "float16"
            logger.info(f"🚀 Diarization using GPU: {torch.cuda.get_device_name(0)}")
        else:
            self.device = "cpu"
            self.compute_type = "int8"
            logger.info("⚠️ Diarization using CPU (slower)")

        # Load HF token (pyannote's diarization models are gated)
        self.hf_token = os.getenv("HF_TOKEN")
        if not self.hf_token:
            logger.warning("⚠️ HF_TOKEN not found. Speaker diarization will fail.")

        # FFmpeg Setup for Windows
        self._setup_ffmpeg()

    def _setup_ffmpeg(self):
        """Auto-configure FFmpeg from imageio-ffmpeg if not in PATH (best effort)."""
        try:
            import imageio_ffmpeg
            import shutil

            ffmpeg_src = imageio_ffmpeg.get_ffmpeg_exe()
            backend_dir = os.getcwd()
            ffmpeg_dest = os.path.join(backend_dir, "ffmpeg.exe")

            if not os.path.exists(ffmpeg_dest):
                shutil.copy(ffmpeg_src, ffmpeg_dest)
                logger.info(f"🔧 Configured FFmpeg: {ffmpeg_dest}")

            if backend_dir not in os.environ.get("PATH", ""):
                os.environ["PATH"] = backend_dir + os.pathsep + os.environ.get("PATH", "")

        except Exception as e:
            logger.warning(f"⚠️ Could not auto-configure FFmpeg: {e}")

    def check_requirements(self):
        """Validate requirements before processing.

        Raises:
            ValueError: when no Hugging Face token is configured.
        """
        if not self.hf_token:
            raise ValueError(
                "HF_TOKEN is missing. Add HF_TOKEN=your_token to .env file. "
                "Get one at: https://huggingface.co/settings/tokens"
            )

    def _get_diarization_pipeline(self):
        """Load pyannote diarization pipeline with PyTorch 2.6+ fix."""
        from pyannote.audio import Pipeline

        # Monkey-patch torch.load for PyTorch 2.6+ compatibility.
        # NOTE(review): patching a global is not thread-safe; this assumes a
        # single diarization job runs at a time — confirm against callers.
        original_load = torch.load

        def safe_load(*args, **kwargs):
            kwargs.pop('weights_only', None)
            return original_load(*args, **kwargs, weights_only=False)

        torch.load = safe_load
        try:
            pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=self.hf_token
            )
            if self.device == "cuda":
                pipeline.to(torch.device("cuda"))
            return pipeline
        finally:
            # Always restore the original loader, even on failure.
            torch.load = original_load

    def _transcribe_with_timestamps(self, audio_path: str, language: Optional[str] = None) -> Dict:
        """Transcribe audio using faster-whisper with word timestamps."""
        from faster_whisper import WhisperModel

        # CTranslate2 (faster-whisper) doesn't support float16 on all GPUs,
        # so Whisper always runs int8 regardless of device; pyannote still
        # benefits from CUDA. (Replaces a former always-"int8" conditional.)
        model = WhisperModel(
            "small",
            device=self.device,
            compute_type="int8"
        )

        segments_raw, info = model.transcribe(
            audio_path,
            language=language,
            word_timestamps=True,
            vad_filter=True
        )

        segments = []
        for segment in segments_raw:
            segments.append({
                "start": segment.start,
                "end": segment.end,
                "text": segment.text.strip(),
                "words": [
                    {"start": w.start, "end": w.end, "word": w.word}
                    for w in (segment.words or [])
                ]
            })

        # Cleanup
        del model
        gc.collect()

        return {
            "segments": segments,
            "language": info.language
        }

    def _preprocess_audio(self, audio_path: str) -> str:
        """
        Apply noise reduction to audio file.
        Returns path to cleaned audio file (a new temp WAV), or the original
        path unchanged when the optional dependencies are missing or the
        reduction fails.
        """
        try:
            import noisereduce as nr
            import librosa
            import soundfile as sf
            import tempfile

            logger.info("🔧 Preprocessing audio (noise reduction)...")

            # Load audio
            audio, sr = librosa.load(audio_path, sr=16000, mono=True)

            # Apply spectral gating noise reduction
            reduced_noise = nr.reduce_noise(
                y=audio,
                sr=sr,
                stationary=True,
                prop_decrease=0.75
            )

            # Save to temp file
            temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            sf.write(temp_file.name, reduced_noise, sr)

            logger.info(f" → Noise reduction complete, saved to {temp_file.name}")
            return temp_file.name

        except ImportError as e:
            logger.warning(f"⚠️ Audio preprocessing unavailable (install noisereduce, librosa, soundfile): {e}")
            return audio_path
        except Exception as e:
            logger.warning(f"⚠️ Audio preprocessing failed: {e}")
            return audio_path

    def _merge_speakers(self, transcript: Dict, diarization) -> List[Dict]:
        """
        Merge speaker labels from diarization with transcript segments.

        Uses midpoint matching with nearest-speaker fallback to minimize
        UNKNOWN labels.
        """
        segments = transcript["segments"]
        result = []

        # Build list of speaker turns for efficient lookup
        speaker_turns = [
            (turn.start, turn.end, spk)
            for turn, _, spk in diarization.itertracks(yield_label=True)
        ]

        for seg in segments:
            mid_time = (seg["start"] + seg["end"]) / 2
            speaker = None

            # Step 1: Try exact midpoint match
            for start, end, spk in speaker_turns:
                if start <= mid_time <= end:
                    speaker = spk
                    break

            # Step 2: If no match, find nearest speaker (fallback)
            if speaker is None and speaker_turns:
                min_distance = float('inf')
                for start, end, spk in speaker_turns:
                    # Distance to nearest edge of speaker segment
                    if mid_time < start:
                        dist = start - mid_time
                    elif mid_time > end:
                        dist = mid_time - end
                    else:
                        dist = 0  # Should have been caught above

                    if dist < min_distance:
                        min_distance = dist
                        speaker = spk

            # Final fallback (only when diarization produced no turns at all)
            if speaker is None:
                speaker = "UNKNOWN"

            result.append({
                "start": seg["start"],
                "end": seg["end"],
                "text": seg["text"],
                "speaker": speaker
            })

        return result

    def process_audio(
        self,
        audio_path: str,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
        language: Optional[str] = None,
        preprocess: bool = False,
    ) -> Dict[str, Any]:
        """
        Full diarization pipeline: [Preprocess] → Transcribe → Diarize → Merge

        Args:
            audio_path: Path to audio file
            num_speakers: Exact number of speakers (optional)
            min_speakers: Minimum speakers (optional)
            max_speakers: Maximum speakers (optional)
            language: Force language code (optional, auto-detected if None)
            preprocess: Apply noise reduction before processing (default: False)

        Returns:
            Dict with segments, speaker_stats, language, status
        """
        self.check_requirements()

        logger.info(f"🎤 Starting diarization on {self.device}...")

        # Optional preprocessing for noise reduction
        processed_path = audio_path
        if preprocess:
            processed_path = self._preprocess_audio(audio_path)

        try:
            # Step 1: Transcribe with faster-whisper
            logger.info("Step 1/3: Transcribing audio...")
            transcript = self._transcribe_with_timestamps(processed_path, language)
            detected_lang = transcript["language"]
            logger.info(f" → Language: {detected_lang}, Segments: {len(transcript['segments'])}")

            # Step 2: Diarize with pyannote
            logger.info("Step 2/3: Identifying speakers...")
            pipeline = self._get_diarization_pipeline()

            diarization = pipeline(
                processed_path,
                num_speakers=num_speakers,
                min_speakers=min_speakers,
                max_speakers=max_speakers
            )

            # Cleanup pipeline
            del pipeline
            gc.collect()

            # Step 3: Merge results
            logger.info("Step 3/3: Merging speakers with transcript...")
            segments = self._merge_speakers(transcript, diarization)

            # Calculate per-speaker speaking time
            speaker_stats = {}
            for seg in segments:
                spk = seg["speaker"]
                dur = seg["end"] - seg["start"]
                speaker_stats[spk] = speaker_stats.get(spk, 0) + dur

            logger.info(f"✅ Diarization complete: {len(segments)} segments, {len(speaker_stats)} speakers")

            return {
                "segments": segments,
                "speaker_stats": speaker_stats,
                "language": detected_lang,
                "status": "success"
            }

        except Exception:
            logger.exception("Diarization failed")
            raise
        finally:
            # Fix: delete the temporary denoised WAV created by
            # _preprocess_audio (previously leaked on every call).
            if processed_path != audio_path:
                try:
                    os.unlink(processed_path)
                except OSError:
                    pass
            gc.collect()
            if self.device == "cuda":
                torch.cuda.empty_cache()
329
+
330
+
331
# Lazily-created module singleton
_diarization_service = None


def get_diarization_service():
    """Return the shared DiarizationService, instantiating it on first use."""
    global _diarization_service
    if _diarization_service is None:
        _diarization_service = DiarizationService()
    return _diarization_service
backend/app/services/edge_tts_service.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Edge-TTS Text-to-Speech Service
3
+ Free, high-quality neural TTS using Microsoft Edge's speech synthesis
4
+ """
5
+
6
+ import asyncio
7
+ import io
8
+ import logging
9
+ import edge_tts
10
+ from typing import Optional, List, Dict, Any
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
# Static fallback voice catalog keyed by locale. Consumed by
# EdgeTTSService.get_voices() when the live edge-tts voice list cannot be
# fetched; entry names follow the edge-tts "ShortName" scheme.
VOICE_CATALOG = {
    "en-US": [
        {"name": "en-US-AriaNeural", "gender": "Female", "style": "professional"},
        {"name": "en-US-GuyNeural", "gender": "Male", "style": "casual"},
        {"name": "en-US-JennyNeural", "gender": "Female", "style": "friendly"},
        {"name": "en-US-ChristopherNeural", "gender": "Male", "style": "newscast"},
    ],
    "en-GB": [
        {"name": "en-GB-SoniaNeural", "gender": "Female", "style": "professional"},
        {"name": "en-GB-RyanNeural", "gender": "Male", "style": "casual"},
    ],
    "en-IN": [
        {"name": "en-IN-NeerjaNeural", "gender": "Female", "style": "professional"},
        {"name": "en-IN-PrabhatNeural", "gender": "Male", "style": "casual"},
    ],
    "hi-IN": [
        {"name": "hi-IN-SwaraNeural", "gender": "Female", "style": "professional"},
        {"name": "hi-IN-MadhurNeural", "gender": "Male", "style": "casual"},
    ],
    "es-ES": [
        {"name": "es-ES-ElviraNeural", "gender": "Female", "style": "professional"},
        {"name": "es-ES-AlvaroNeural", "gender": "Male", "style": "casual"},
    ],
    "es-MX": [
        {"name": "es-MX-DaliaNeural", "gender": "Female", "style": "professional"},
        {"name": "es-MX-JorgeNeural", "gender": "Male", "style": "casual"},
    ],
    "fr-FR": [
        {"name": "fr-FR-DeniseNeural", "gender": "Female", "style": "professional"},
        {"name": "fr-FR-HenriNeural", "gender": "Male", "style": "casual"},
    ],
    "de-DE": [
        {"name": "de-DE-KatjaNeural", "gender": "Female", "style": "professional"},
        {"name": "de-DE-ConradNeural", "gender": "Male", "style": "casual"},
    ],
    "ja-JP": [
        {"name": "ja-JP-NanamiNeural", "gender": "Female", "style": "professional"},
        {"name": "ja-JP-KeitaNeural", "gender": "Male", "style": "casual"},
    ],
    "ko-KR": [
        {"name": "ko-KR-SunHiNeural", "gender": "Female", "style": "professional"},
        {"name": "ko-KR-InJoonNeural", "gender": "Male", "style": "casual"},
    ],
    "zh-CN": [
        {"name": "zh-CN-XiaoxiaoNeural", "gender": "Female", "style": "professional"},
        {"name": "zh-CN-YunxiNeural", "gender": "Male", "style": "casual"},
    ],
}
64
+
65
+
66
class EdgeTTSService:
    """
    Text-to-Speech service using Microsoft Edge TTS (free, neural voices)
    """

    def __init__(self):
        """Initialize the Edge TTS service"""
        self._all_voices = None

    # Class-level cache: the remote voice list is fetched once per process.
    _voices_cache = None

    @staticmethod
    def _run_coro_blocking(coro):
        """Run a coroutine to completion from synchronous code.

        Handles all three situations:
        - no event loop yet: create one and run the coroutine on it;
        - an idle loop exists: run the coroutine on it;
        - the current thread's loop is already running: execute the
          coroutine via asyncio.run on a worker thread. (The previous
          implementation submitted the coroutine back to the running loop
          with run_coroutine_threadsafe and then blocked on .result(),
          which deadlocks when called from that loop's own thread.)
        """
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        if loop.is_running():
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                return pool.submit(asyncio.run, coro).result()

        return loop.run_until_complete(coro)

    async def get_voices(self, language: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Get available voices, optionally filtered by language-code prefix.

        The live edge-tts list is memoized at class level; on fetch failure
        the static VOICE_CATALOG is used instead.
        """
        if EdgeTTSService._voices_cache is None:
            try:
                voices = await edge_tts.list_voices()

                # Transform the edge-tts records into our API shape.
                formatted_voices = []
                for v in voices:
                    formatted_voices.append({
                        "name": v["ShortName"],
                        "display_name": v["ShortName"].replace("-", " ").split("Neural")[0].strip(),
                        "language_code": v["Locale"],
                        "gender": v["Gender"],
                        "voice_type": "Neural",
                    })

                EdgeTTSService._voices_cache = formatted_voices
            except Exception as e:
                logger.error(f"Failed to fetch voices from Edge TTS: {e}. Falling back to catalog.")
                # Fallback to the static catalog defined above.
                voices = []
                for lang, lang_voices in VOICE_CATALOG.items():
                    for v in lang_voices:
                        voices.append({
                            "name": v["name"],
                            "display_name": v["name"].replace("-", " ").replace("Neural", "").strip(),
                            "language_code": lang,
                            "gender": v["gender"],
                            "voice_type": "Neural",
                        })
                EdgeTTSService._voices_cache = voices

        voices = EdgeTTSService._voices_cache

        # Filter by language if specified
        if language:
            voices = [v for v in voices if v["language_code"].startswith(language)]

        return voices

    def get_voices_sync(self, language: Optional[str] = None) -> List[Dict[str, Any]]:
        """Synchronous wrapper for get_voices (safe from a running loop's thread)."""
        return self._run_coro_blocking(self.get_voices(language))

    def build_ssml(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        rate: str = "medium",
        pitch: str = "medium",
        emphasis: Optional[str] = None,
        breaks: bool = True
    ) -> str:
        """
        Build SSML markup for advanced prosody control.

        Args:
            text: Plain text to convert
            voice: Voice name
            rate: Speed - 'x-slow', 'slow', 'medium', 'fast', 'x-fast' or percentage
            pitch: Pitch - 'x-low', 'low', 'medium', 'high', 'x-high' or Hz offset
            emphasis: Optional emphasis level - 'reduced', 'moderate', 'strong'
            breaks: Auto-insert breaks at punctuation

        Returns:
            SSML-formatted string
        """
        # Named values and raw percentage/Hz offsets are both passed through
        # verbatim (the former rate_value/pitch_value branches were no-ops:
        # each returned the input either way).
        ssml_parts = ['<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">']
        ssml_parts.append(f'<voice name="{voice}">')
        ssml_parts.append(f'<prosody rate="{rate}" pitch="{pitch}">')

        if emphasis:
            ssml_parts.append(f'<emphasis level="{emphasis}">')

        # Auto-insert breaks for natural speech
        if breaks:
            import re
            # Short pause after clause punctuation, longer after sentences.
            processed_text = re.sub(r'([,;:])\s*', r'\1<break time="200ms"/>', text)
            processed_text = re.sub(r'([.!?])\s+', r'\1<break time="500ms"/>', processed_text)
            ssml_parts.append(processed_text)
        else:
            ssml_parts.append(text)

        if emphasis:
            ssml_parts.append('</emphasis>')

        ssml_parts.append('</prosody>')
        ssml_parts.append('</voice>')
        ssml_parts.append('</speak>')

        return ''.join(ssml_parts)

    async def synthesize_ssml(
        self,
        ssml_text: str,
        voice: str = "en-US-AriaNeural",
    ) -> bytes:
        """
        Synthesize speech from SSML markup.

        Args:
            ssml_text: SSML-formatted text
            voice: Voice name (for edge-tts communication)

        Returns:
            Audio bytes (MP3)
        """
        logger.info(f"Synthesizing SSML with voice: {voice}")

        # Edge TTS handles SSML natively
        communicate = edge_tts.Communicate(ssml_text, voice)

        audio_buffer = io.BytesIO()
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_buffer.write(chunk["data"])

        audio_buffer.seek(0)
        return audio_buffer.read()

    async def synthesize_stream(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        rate: str = "+0%",
        pitch: str = "+0Hz",
    ):
        """
        Stream speech synthesis chunks.

        Yields MP3 byte chunks sentence-by-sentence to reduce TTFB (Time To
        First Byte) instead of buffering the full text.
        """
        import re

        # Split text into sentences ending with . ! ? (or end of string),
        # keeping the punctuation, so each sentence synthesizes eagerly.
        sentences = re.findall(r'[^.!?]+(?:[.!?]+|$)', text)
        if not sentences:
            sentences = [text]

        logger.info(f"Streaming {len(sentences)} sentences for low latency...")

        for sentence in sentences:
            if not sentence.strip():
                continue

            communicate = edge_tts.Communicate(sentence, voice, rate=rate, pitch=pitch)

            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    yield chunk["data"]

    async def synthesize(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        rate: str = "+0%",
        pitch: str = "+0Hz",
    ) -> bytes:
        """
        Synthesize speech from text

        Args:
            text: Text to synthesize
            voice: Voice name (e.g., 'en-US-AriaNeural')
            rate: Speaking rate adjustment (e.g., '+20%', '-10%')
            pitch: Pitch adjustment (e.g., '+5Hz', '-10Hz')

        Returns:
            Audio content as bytes (MP3 format)
        """
        # Reuse the streaming path to avoid duplicating synthesis logic.
        audio_buffer = io.BytesIO()
        async for chunk in self.synthesize_stream(text, voice, rate, pitch):
            audio_buffer.write(chunk)

        audio_buffer.seek(0)
        return audio_buffer.read()

    def synthesize_sync(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        rate: str = "+0%",
        pitch: str = "+0Hz",
    ) -> bytes:
        """Synchronous wrapper for synthesize (safe from a running loop's thread)."""
        return self._run_coro_blocking(self.synthesize(text, voice, rate, pitch))

    async def synthesize_to_response(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        speaking_rate: float = 1.0,
        pitch: float = 0.0,
    ) -> Dict[str, Any]:
        """
        Synthesize speech and return API-compatible response

        Args:
            text: Text to synthesize
            voice: Voice name
            speaking_rate: Rate multiplier (1.0 = normal, 1.5 = 50% faster)
            pitch: Pitch adjustment in semitones (-20 to +20)

        Returns:
            Dictionary with base64 audio content and metadata
        """
        import base64
        import time

        start_time = time.time()

        # Convert the multiplier/semitone API to Edge TTS percent/Hz strings.
        rate_percent = int((speaking_rate - 1.0) * 100)
        rate_str = f"+{rate_percent}%" if rate_percent >= 0 else f"{rate_percent}%"
        pitch_str = f"+{int(pitch)}Hz" if pitch >= 0 else f"{int(pitch)}Hz"

        # Synthesize
        audio_bytes = await self.synthesize(text, voice, rate_str, pitch_str)

        processing_time = time.time() - start_time

        # Rough duration estimate (~150 chars per second at normal speed).
        estimated_duration = len(text) / 150 / speaking_rate

        return {
            "audio_content": base64.b64encode(audio_bytes).decode("utf-8"),
            "encoding": "MP3",
            "audio_size": len(audio_bytes),
            "duration_estimate": estimated_duration,
            "voice_used": voice,
            "processing_time": processing_time,
            "cached": False,
        }
346
+
347
+
348
# Singleton instance (created lazily)
_edge_tts_service: Optional[EdgeTTSService] = None


def get_edge_tts_service() -> EdgeTTSService:
    """Return the module-wide EdgeTTSService, constructing it on first call."""
    global _edge_tts_service
    if _edge_tts_service is None:
        _edge_tts_service = EdgeTTSService()
    return _edge_tts_service
backend/app/services/emotion_service.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Emotion Analysis Service
3
+ Detects emotion from audio using Wav2Vec2 and text using NLP
4
+ """
5
+
6
+ import logging
7
+ import os
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from typing import Dict, List, Any, Optional
12
+
13
+ from app.core.config import get_settings
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class EmotionService:
    """
    Speech Emotion Recognition (SER) service.

    Wraps 'ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition', a
    Wav2Vec2 sequence classifier over 8 emotion classes. The model and
    processor are lazily loaded on first use to keep startup RAM low.
    """

    def __init__(self):
        # HuggingFace model id for the Wav2Vec2 emotion classifier.
        self.model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
        # Model/processor stay None until _load_model() is first called.
        self._model = None
        self._processor = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Class labels in the order the model emits its logits.
        self.emotions = [
            "angry", "calm", "disgust", "fearful",
            "happy", "neutral", "sad", "surprised"
        ]

    def _load_model(self):
        """Load the processor and model on first call (lazy, saves RAM)."""
        if self._model is None:
            try:
                from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

                logger.info(f"🎭 Loading Emotion Model ({self.device})...")
                self._processor = Wav2Vec2Processor.from_pretrained(self.model_name)
                self._model = Wav2Vec2ForSequenceClassification.from_pretrained(self.model_name)
                self._model.to(self.device)
                logger.info("✅ Emotion Model loaded")
            except Exception as e:
                logger.error(f"Failed to load emotion model: {e}")
                raise

    def analyze_audio(self, audio_path: str) -> Dict[str, Any]:
        """
        Analyze the emotion of an audio file.

        Args:
            audio_path: Path to the audio file on disk

        Returns:
            Dict with keys:
              - dominant_emotion: label with the highest probability
              - confidence: probability of that label
              - distribution: full label -> probability mapping

        Raises:
            Exception: propagated from model loading, audio decoding,
                or inference failures.
        """
        import librosa  # lazy import: heavy dependency, only needed here

        self._load_model()

        try:
            # Wav2Vec2 expects 16 kHz input. Only the first 60 s are
            # decoded to bound memory; longer files would need chunking.
            y, sr = librosa.load(audio_path, sr=16000, duration=60)

            inputs = self._processor(y, sampling_rate=16000, return_tensors="pt", padding=True)
            # Move every tensor to the model's device before inference.
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = self._model(**inputs).logits

            # Softmax over the class dimension -> per-emotion probabilities.
            probs = F.softmax(logits, dim=-1)[0].cpu().numpy()

            scores = {
                self.emotions[i]: float(probs[i])
                for i in range(len(self.emotions))
            }

            dominant = max(scores, key=scores.get)

            return {
                "dominant_emotion": dominant,
                "confidence": scores[dominant],
                "distribution": scores
            }

        except Exception as e:
            logger.error(f"Audio emotion analysis failed: {e}")
            # Bare re-raise preserves the original traceback (was `raise e`,
            # which resets the exception's context chain).
            raise

    def analyze_audio_segment(self, audio_data: np.ndarray, sr: int = 16000) -> Dict[str, Any]:
        """
        Analyze a raw in-memory audio segment.

        Args:
            audio_data: 1-D waveform samples (assumed mono — TODO confirm
                against streaming callers)
            sr: Sample rate of the segment (model was trained at 16 kHz)

        Returns:
            Dict with 'emotion' (label) and 'score' (probability). Falls
            back to {'emotion': 'neutral', 'score': 0.0} on any failure.
        """
        self._load_model()

        try:
            inputs = self._processor(audio_data, sampling_rate=sr, return_tensors="pt", padding=True)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = self._model(**inputs).logits

            probs = F.softmax(logits, dim=-1)[0].cpu().numpy()
            scores = {self.emotions[i]: float(probs[i]) for i in range(len(self.emotions))}
            dominant = max(scores, key=scores.get)

            return {
                "emotion": dominant,
                "score": scores[dominant]
            }
        except Exception as e:
            # Best-effort path: streaming callers prefer a neutral fallback
            # over a crash mid-stream.
            logger.error(f"Segment analysis failed: {e}")
            return {"emotion": "neutral", "score": 0.0}
123
+
124
+
125
# Singleton
# Annotated holder, matching the edge_tts_service singleton pattern.
_emotion_service: Optional["EmotionService"] = None


def get_emotion_service() -> EmotionService:
    """Get or create the EmotionService singleton."""
    global _emotion_service
    if _emotion_service is None:
        _emotion_service = EmotionService()
    return _emotion_service