vyluong committed
Commit 5ab6c6e · verified · 1 Parent(s): f37e958

Upload folder using huggingface_hub

.dockerignore ADDED
@@ -0,0 +1,22 @@
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ .venv
+ venv
+ ENV
+ .git
+ .github
+ .vscode
+ .idea
+ *.log
+ .cache
+ .pytest_cache
+ data/uploads/*
+ data/processed/*
+ Dockerfile
+ docker-compose.yml
+ README.md
+ implementation_plan.md
+ walkthrough.md
+ task.md
.env.example ADDED
@@ -0,0 +1,57 @@
+ # Environment Configuration for PrecisionVoice
+
+ # HuggingFace token (required for pyannote.audio)
+ # Get your token at: https://huggingface.co/settings/tokens
+ # Accept the terms at: https://huggingface.co/pyannote/speaker-diarization-3.1
+ HF_TOKEN=your_huggingface_token_here
+
+ # Model settings
+ WHISPER_MODEL=kiendt/PhoWhisper-large-ct2
+ DIARIZATION_MODEL=pyannote/speaker-diarization-3.1
+
+ # Device settings (cuda, cpu, or auto)
+ DEVICE=auto
+
+ # --- Denoising (Speech Enhancement) ---
+ # Enable speech enhancement (removes background noise, hum, etc.)
+ ENABLE_DENOISER=True
+ # Denoiser model: dns64 (standard), dns48, or master64
+ DENOISER_MODEL=dns64
+
+ # --- MDX-Net Vocal Separation ---
+ # Enable vocal separation before transcription (isolates voice from music/noise).
+ # More effective than a basic Demucs implementation.
+ ENABLE_VOCAL_SEPARATION=True
+ # MDX-Net model: Kim_Vocal_2.onnx (recommended for vocals)
+ MDX_MODEL=Kim_Vocal_2.onnx
+
+ # Upload settings
+ MAX_UPLOAD_SIZE_MB=100
+
+ # --- Optimization Settings ---
+
+ # Enable a subtle highpass filter (removes low-frequency rumble below 80 Hz)
+ ENABLE_NOISE_REDUCTION=True
+
+ # Enable/disable loudness normalization (EBU R128)
+ ENABLE_LOUDNORM=True
+
+ # --- VAD (Voice Activity Detection) Settings ---
+
+ # Threshold for detecting speech (0.0 to 1.0). Higher = stricter
+ VAD_THRESHOLD=0.5
+ # Ignore speech segments shorter than this (milliseconds)
+ VAD_MIN_SPEECH_DURATION_MS=250
+ # Minimum silence duration to split segments (milliseconds)
+ VAD_MIN_SILENCE_DURATION_MS=500
+
+ # --- Post-processing (Clustering) Settings ---
+
+ # Merge segments from the same speaker if the gap is less than this (seconds)
+ MERGE_THRESHOLD_S=0.5
+ # Filter out segments shorter than this (seconds) - removes blips/noise
+ MIN_SEGMENT_DURATION_S=0.3
+
+ # Server settings
+ HOST=0.0.0.0
+ PORT=8000
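
These variables are consumed by the Pydantic settings class in `app/core/config.py` (added later in this commit). A minimal sketch for checking which values actually take effect after editing `.env`, run from the repo root:

```python
# Sketch: print the effective configuration once .env has been loaded.
from app.core.config import get_settings

settings = get_settings()
print(settings.whisper_model)    # kiendt/PhoWhisper-large-ct2 unless overridden
print(settings.resolved_device)  # "cuda" if torch sees a GPU, otherwise "cpu"
```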
.github/workflows/sync-to-huggingface.yml ADDED
@@ -0,0 +1,24 @@
+ name: Sync to Hugging Face Hub
+
+ on:
+   push:
+     branches:
+       - main
+
+ jobs:
+   sync:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+           lfs: true
+
+       - name: Push to Hugging Face Hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: |
+           git remote add huggingface https://huggingface.co/spaces/ThiThanhChuong/precision-voice || true
+           git remote set-url huggingface https://huggingface.co/spaces/ThiThanhChuong/precision-voice
+           git push https://user:$HF_TOKEN@huggingface.co/spaces/ThiThanhChuong/precision-voice main --force
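
The workflow mirrors `main` to the Space with a force push. The same sync could also be done with the `huggingface_hub` API (the library this commit itself was uploaded with); a hedged sketch, assuming `HF_TOKEN` is exported:

```python
# Sketch: sync the working tree to the Space without git.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
api.upload_folder(
    folder_path=".",
    repo_id="ThiThanhChuong/precision-voice",
    repo_type="space",
)
```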
.gitignore ADDED
@@ -0,0 +1,46 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # Virtual environment
+ venv/
+ .venv/
+ ENV/
+
+ # Environment files
+ .env
+ !.env.example
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # Data directories (keep structure, ignore content)
+ data/uploads/*
+ data/processed/*
+ !data/uploads/.gitkeep
+ !data/processed/.gitkeep
+
+ # Docker
+ .docker/
+
+ # Logs
+ *.log
+ logs/
+
+ # Cache
+ .cache/
+ *.cache
+ .pytest_cache/
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Model files (will be downloaded at runtime)
+ *.pt
+ *.bin
+ *.safetensors
Dockerfile ADDED
@@ -0,0 +1,68 @@
+ # ================================
+ # PrecisionVoice Dockerfile
+ # Optimized for performance and size
+ # ================================
+
+ # Stage 1: Builder
+ FROM python:3.10-slim-bullseye AS builder
+
+ WORKDIR /app
+
+ # Install build dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     git \
+     ffmpeg \
+     libsndfile1-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install dependencies
+ # Using --user to keep packages in /root/.local
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir --user -r requirements.txt
+
+ # ================================
+ # Stage 2: Runtime
+ # ================================
+ FROM python:3.10-slim-bullseye
+
+ WORKDIR /app
+
+ # Install runtime dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     ffmpeg \
+     libsndfile1 \
+     && rm -rf /var/lib/apt/lists/* \
+     && apt-get clean
+
+ # Copy Python packages from builder
+ COPY --from=builder /root/.local /root/.local
+
+ # Ensure scripts in .local are available
+ ENV PATH=/root/.local/bin:$PATH
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONDONTWRITEBYTECODE=1
+
+ # Model cache directories
+ ENV HF_HOME=/root/.cache/huggingface
+ ENV TORCH_HOME=/root/.cache/torch
+ ENV TRANSFORMERS_CACHE=/root/.cache/huggingface
+
+ # Copy application code
+ COPY app/ ./app/
+ COPY data/ ./data/
+
+ # Create necessary directories
+ RUN mkdir -p /app/data/uploads /app/data/processed
+
+ # Port configuration
+ ARG PORT=7860
+ ENV PORT=${PORT}
+ EXPOSE ${PORT}
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+     CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:${PORT}/api/health')" || exit 1
+
+ # Run the application
+ CMD ["sh", "-c", "uvicorn app.main:app --host 0.0.0.0 --port ${PORT}"]
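
The shell-form HEALTHCHECK expands `${PORT}` before Python runs. For reference, an unrolled sketch of the same probe:

```python
# Sketch: the health probe from the HEALTHCHECK line, expanded.
import os
import urllib.request

port = os.environ.get("PORT", "7860")
# Raises (and thus fails the health check) if the app is not serving.
urllib.request.urlopen(f"http://localhost:{port}/api/health")
```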
README.md CHANGED
@@ -1,10 +1,100 @@
 ---
- title: PoC PrecisisionVoice V1
- emoji: 🔥
- colorFrom: yellow
- colorTo: red
+ title: PrecisionVoice
+ emoji: 🎙️
+ colorFrom: blue
+ colorTo: purple
 sdk: docker
+ app_file: app/main.py
 pinned: false
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # PrecisionVoice - STT & Speaker Diarization
+
+ A production-ready Speech-to-Text and Speaker Diarization web application using FastAPI, faster-whisper, and pyannote.audio.
+
+ ## Features
+
+ - 🎙️ Speech-to-Text using `kiendt/PhoWhisper-large-ct2` (optimized for Vietnamese)
+ - 👥 Speaker Diarization using `pyannote/speaker-diarization-3.1`
+ - 🧼 Advanced Denoising using Facebook's `Denoiser` (dns64)
+ - 🎤 Vocal Isolation using `MDX-Net` (`Kim_Vocal_2`)
+ - 🔄 Automatic speaker-transcript alignment
+ - 📥 Download results in TXT or SRT format
+ - 🐳 Docker-ready with persistent model caching and GPU support
+
+ ## Quick Start
+
+ ### Prerequisites
+
+ 1. Docker and Docker Compose
+ 2. (Optional) NVIDIA GPU with CUDA support
+ 3. HuggingFace account with access to the pyannote models
+
+ ### Setup
+
+ 1. Clone and configure:
+ ```bash
+ cp .env.example .env
+ # Edit .env and add your HuggingFace token
+ ```
+
+ 2. Build and run:
+ ```bash
+ docker compose up --build
+ ```
+
+ 3. Open http://localhost:8000
+
+ ## Audio Processing Pipeline
+
+ The system uses a multi-stage pipeline to maximize accuracy:
+
+ 1. **Speech Enhancement**: Background noise, hum, and interference are removed using Facebook's `Denoiser` (a deep-learning Wave-U-Net).
+ 2. **Vocal Isolation**: Vocals are stripped from any remaining background music or non-speech sounds using `MDX-Net`.
+ 3. **Refinement**: Subtle highpass filtering and EBU R128 loudness normalization give consistent volume.
+ 4. **Transcription**: High-precision Vietnamese transcription using `PhoWhisper`.
+ 5. **Diarization**: The audio is segmented by speaker.
+ 6. **Alignment**: Transcripts are merged with speaker segments.
+
+ ## Configuration
+
+ | Variable | Default | Description |
+ |----------|---------|-------------|
+ | `HF_TOKEN` | - | Required for the pyannote models |
+ | `ENABLE_DENOISER` | `True` | Toggle Facebook speech enhancement |
+ | `DENOISER_MODEL` | `dns64` | Model for denoising |
+ | `ENABLE_VOCAL_SEPARATION` | `True` | Toggle MDX-Net vocal isolation |
+ | `MDX_MODEL` | `Kim_Vocal_2.onnx` | Model for vocal separation |
+ | `DEVICE` | `auto` | `cuda`, `cpu`, or `auto` |
+
+ ## Development
+
+ ### Local Setup (without Docker)
+
+ ```bash
+ python -m venv venv
+ source venv/bin/activate
+ pip install -r requirements.txt
+ uvicorn app.main:app --reload
+ ```
+
+ ### API Endpoints
+
+ | Endpoint | Method | Description |
+ |----------|--------|-------------|
+ | `/` | GET | Web UI |
+ | `/api/transcribe` | POST | Upload and transcribe audio |
+ | `/api/download/{filename}` | GET | Download result files |
+
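+ A minimal client sketch in Python (uses `requests`, which is not a dependency of this project):
+
+ ```python
+ import requests
+
+ # "meeting.mp3" is a placeholder; any supported format works.
+ with open("meeting.mp3", "rb") as f:
+     resp = requests.post(
+         "http://localhost:8000/api/transcribe",
+         files={"file": ("meeting.mp3", f, "audio/mpeg")},
+     )
+ resp.raise_for_status()
+ for seg in resp.json()["segments"]:
+     print(f"[{seg['start']:.1f}-{seg['end']:.1f}] {seg['speaker']}: {seg['text']}")
+ ```
+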
+ ## Supported Audio Formats
+
+ - MP3
+ - WAV
+ - M4A
+ - OGG
+ - FLAC
+ - WEBM
+
+ ## License
+
+ MIT
app/__init__.py ADDED
@@ -0,0 +1 @@
+ # App package
app/api/__init__.py ADDED
@@ -0,0 +1 @@
+ # API package
app/api/routes.py ADDED
@@ -0,0 +1,122 @@
+ """
+ API routes for the transcription service.
+ """
+ import json
+ import time
+ import logging
+ from pathlib import Path
+
+ from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks
+ from fastapi.responses import FileResponse, StreamingResponse
+
+ from app.core.config import get_settings
+ from app.schemas.models import TranscriptionResponse, ErrorResponse, HealthResponse
+ from app.services.audio_processor import AudioProcessor, AudioProcessingError
+ from app.services.transcription import TranscriptionService
+ from app.services.diarization import DiarizationService
+ from app.services.alignment import AlignmentService
+ from app.services.orchestrator import PipelineOrchestrator
+
+ logger = logging.getLogger(__name__)
+ settings = get_settings()
+
+ router = APIRouter()
+
+
+ @router.get("/api/health", response_model=HealthResponse)
+ async def health_check():
+     """Health check endpoint."""
+     return HealthResponse(
+         status="healthy",
+         models_loaded=TranscriptionService.is_loaded() and DiarizationService.is_loaded(),
+         device=settings.resolved_device
+     )
+
+
+ @router.post("/api/transcribe", response_model=TranscriptionResponse)
+ async def transcribe_audio(
+     background_tasks: BackgroundTasks,
+     file: UploadFile = File(..., description="Audio file to transcribe")
+ ):
+     """
+     Upload and transcribe an audio file.
+     Status updates are logged on the server.
+     """
+     wav_path = None
+
+     try:
+         # Read file content
+         file_content = await file.read()
+
+         # Validate and process audio
+         try:
+             AudioProcessor.validate_file(file.filename or "audio.wav", len(file_content))
+         except AudioProcessingError as e:
+             raise HTTPException(status_code=400, detail=str(e))
+
+         # Save and convert to WAV (noise reduction happens here)
+         wav_path, duration = await AudioProcessor.process_upload(
+             file_content,
+             file.filename or "audio.wav"
+         )
+
+         # Run orchestrated pipeline (Whisper + Pyannote in parallel -> alignment)
+         logger.info("Executing orchestrated pipeline...")
+         response = await PipelineOrchestrator.process_audio(wav_path, duration)
+
+         # Schedule cleanup in background
+         background_tasks.add_task(cleanup_files, wav_path)
+
+         return response
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.exception("Processing failed")
+         if wav_path and wav_path.exists():
+             background_tasks.add_task(cleanup_files, wav_path)
+         raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
+
+
+ @router.get("/api/download/{filename}")
+ async def download_file(filename: str, background_tasks: BackgroundTasks):
+     """
+     Download a generated transcript file.
+
+     Supports: .txt, .srt files
+     """
+     # Security: only allow specific extensions and no path traversal
+     if not filename.endswith(('.txt', '.srt')) or '/' in filename or '..' in filename:
+         raise HTTPException(status_code=400, detail="Invalid filename")
+
+     filepath = settings.processed_dir / filename
+
+     if not filepath.exists():
+         raise HTTPException(status_code=404, detail="File not found")
+
+     # Determine media type
+     media_type = "text/plain" if filename.endswith('.txt') else "application/x-subrip"
+
+     # Cleanup after download is not scheduled here (give some time for the download
+     # to complete); in production, you might want a separate cleanup job.
+
+     return FileResponse(
+         path=filepath,
+         filename=filename,
+         media_type=media_type
+     )
+
+
+ async def cleanup_files(*paths: Path):
+     """Background task to clean up temporary files."""
+     import asyncio
+
+     # Wait a bit before cleanup to ensure files are not in use
+     await asyncio.sleep(5)
+
+     await AudioProcessor.cleanup_files(*paths)
app/core/__init__.py ADDED
@@ -0,0 +1 @@
+ # Core package
app/core/config.py ADDED
@@ -0,0 +1,104 @@
+ """
+ Application configuration using Pydantic Settings.
+ """
+ import os
+ from pathlib import Path
+ from functools import lru_cache
+ from typing import Literal
+
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ class Settings(BaseSettings):
+     """Application settings loaded from environment variables."""
+
+     model_config = SettingsConfigDict(
+         env_file=".env",
+         env_file_encoding="utf-8",
+         extra="ignore"
+     )
+
+     # HuggingFace
+     hf_token: str = ""
+     enable_noise_reduction: bool = True
+
+     # Denoising (Speech Enhancement)
+     enable_denoiser: bool = True
+     denoiser_model: str = "dns64"
+
+     # MDX-Net Vocal Separation
+     enable_vocal_separation: bool = True
+     mdx_model: str = "Kim_Vocal_2.onnx"  # High-quality vocal isolation
+
+     # Model settings
+     whisper_model: str = "kiendt/PhoWhisper-large-ct2"
+     diarization_model: str = "pyannote/speaker-diarization-3.1"
+
+     # Device settings
+     device: Literal["cuda", "cpu", "auto"] = "auto"
+     compute_type: str = "float16"  # float16 for GPU, int8 for CPU
+
+     # Upload settings
+     max_upload_size_mb: int = 100
+     allowed_extensions: list[str] = ["mp3", "wav", "m4a", "ogg", "flac", "webm"]
+
+     # Audio processing settings
+     sample_rate: int = 16000
+     channels: int = 1  # Mono
+
+     # Optimization parameters
+     noise_reduction_level: float = 12.0  # Used by anlmdn
+     enable_loudnorm: bool = True
+
+     # VAD parameters
+     vad_threshold: float = 0.5
+     vad_min_speech_duration_ms: int = 250
+     vad_min_silence_duration_ms: int = 500
+
+     # Post-processing
+     merge_threshold_s: float = 0.5  # Merge segments from same speaker if gap < this
+     min_segment_duration_s: float = 0.3  # Remove segments shorter than this
+
+     # Server settings
+     host: str = "0.0.0.0"
+     port: int = 7860
+
+     # Paths
+     base_dir: Path = Path(__file__).parent.parent.parent
+     data_dir: Path = base_dir / "data"
+     upload_dir: Path = data_dir / "uploads"
+     processed_dir: Path = data_dir / "processed"
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         # Ensure directories exist
+         self.upload_dir.mkdir(parents=True, exist_ok=True)
+         self.processed_dir.mkdir(parents=True, exist_ok=True)
+
+     @property
+     def max_upload_size_bytes(self) -> int:
+         return self.max_upload_size_mb * 1024 * 1024
+
+     @property
+     def resolved_device(self) -> str:
+         """Resolve 'auto' to actual device."""
+         if self.device == "auto":
+             try:
+                 import torch
+                 return "cuda" if torch.cuda.is_available() else "cpu"
+             except ImportError:
+                 return "cpu"
+         return self.device
+
+     @property
+     def resolved_compute_type(self) -> str:
+         """Get appropriate compute type for device."""
+         if self.resolved_device == "cuda":
+             return "float16"
+         return "int8"
+
+
+ @lru_cache
+ def get_settings() -> Settings:
+     """Get cached settings instance."""
+     return Settings()
app/main.py ADDED
@@ -0,0 +1,115 @@
+ """
+ PrecisionVoice - Speech-to-Text & Speaker Diarization Application
+
+ Main FastAPI application entry point.
+ """
+ import logging
+ from contextlib import asynccontextmanager
+
+ from fastapi import FastAPI, Request
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.templating import Jinja2Templates
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import HTMLResponse
+
+ from app.core.config import get_settings
+ from app.api.routes import router
+ from app.services.transcription import TranscriptionService
+ from app.services.diarization import DiarizationService
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ )
+ logger = logging.getLogger(__name__)
+
+ settings = get_settings()
+
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     """
+     Application lifespan handler.
+     Preloads models on startup for a faster first request.
+     """
+     logger.info("Starting PrecisionVoice application...")
+     logger.info(f"Device: {settings.resolved_device}")
+     logger.info(f"Whisper model: {settings.whisper_model}")
+     logger.info(f"Diarization model: {settings.diarization_model}")
+
+     # Preload models (optional - can be disabled for faster startup)
+     try:
+         logger.info("Preloading Whisper model...")
+         TranscriptionService.preload_model()
+     except Exception as e:
+         logger.error(f"Failed to preload Whisper model: {e}")
+
+     try:
+         if settings.hf_token:
+             logger.info("Preloading diarization pipeline...")
+             DiarizationService.preload_pipeline()
+         else:
+             logger.warning("HF_TOKEN not set, diarization will not be available")
+     except Exception as e:
+         logger.warning(f"Diarization preload failed (will try again on first use): {e}")
+
+     logger.info("Application startup complete")
+
+     yield
+
+     logger.info("Shutting down PrecisionVoice application...")
+
+
+ # Create FastAPI app
+ app = FastAPI(
+     title="PrecisionVoice",
+     description="Speech-to-Text and Speaker Diarization API",
+     version="1.0.0",
+     lifespan=lifespan
+ )
+
+ # CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # Configure appropriately for production
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Mount static files
+ app.mount(
+     "/static",
+     StaticFiles(directory="app/static"),
+     name="static"
+ )
+
+ # Templates
+ templates = Jinja2Templates(directory="app/templates")
+
+ # Include API routes
+ app.include_router(router)
+
+
+ @app.get("/", response_class=HTMLResponse)
+ async def index(request: Request):
+     """Serve the main web interface."""
+     return templates.TemplateResponse(
+         "index.html",
+         {
+             "request": request,
+             "max_upload_mb": settings.max_upload_size_mb,
+             "allowed_formats": ", ".join(settings.allowed_extensions)
+         }
+     )
+
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(
+         "app.main:app",
+         host=settings.host,
+         port=settings.port,
+         reload=True
+     )
app/schemas/__init__.py ADDED
@@ -0,0 +1 @@
+ # Schemas package
app/schemas/models.py ADDED
@@ -0,0 +1,73 @@
+ """
+ Pydantic models for API requests and responses.
+ """
+ from pydantic import BaseModel, Field
+ from typing import Optional
+ from enum import Enum
+
+
+ class ProcessingStatus(str, Enum):
+     """Status of the transcription process."""
+     PENDING = "pending"
+     PROCESSING = "processing"
+     COMPLETED = "completed"
+     FAILED = "failed"
+
+
+ class TranscriptSegment(BaseModel):
+     """A single segment of the transcript with speaker and timing."""
+     start: float = Field(..., description="Start time in seconds")
+     end: float = Field(..., description="End time in seconds")
+     speaker: str = Field(..., description="Speaker identifier")
+     text: str = Field(..., description="Transcribed text")
+
+     @property
+     def start_formatted(self) -> str:
+         """Format start time as HH:MM:SS."""
+         return self._format_time(self.start)
+
+     @property
+     def end_formatted(self) -> str:
+         """Format end time as HH:MM:SS."""
+         return self._format_time(self.end)
+
+     @staticmethod
+     def _format_time(seconds: float) -> str:
+         """Convert seconds to HH:MM:SS format."""
+         hours = int(seconds // 3600)
+         minutes = int((seconds % 3600) // 60)
+         secs = int(seconds % 60)
+         return f"{hours:02d}:{minutes:02d}:{secs:02d}"
+
+
+ class TranscriptionRequest(BaseModel):
+     """Request model for transcription settings."""
+     language: str = Field(default="vi", description="Language code for transcription")
+     num_speakers: Optional[int] = Field(default=None, description="Expected number of speakers (None for auto-detect)")
+     output_format: str = Field(default="json", description="Output format: json, txt, srt")
+
+
+ class TranscriptionResponse(BaseModel):
+     """Response containing the transcription results."""
+     success: bool = Field(..., description="Whether transcription succeeded")
+     message: str = Field(default="", description="Status message")
+     segments: list[TranscriptSegment] = Field(default_factory=list, description="Transcript segments with speakers")
+     duration: float = Field(default=0.0, description="Audio duration in seconds")
+     num_speakers: int = Field(default=0, description="Number of detected speakers")
+     processing_time: float = Field(default=0.0, description="Processing time in seconds")
+     download_txt: Optional[str] = Field(default=None, description="Download URL for TXT file")
+     download_srt: Optional[str] = Field(default=None, description="Download URL for SRT file")
+
+
+ class ErrorResponse(BaseModel):
+     """Error response model."""
+     success: bool = False
+     error: str = Field(..., description="Error message")
+     detail: Optional[str] = Field(default=None, description="Detailed error information")
+
+
+ class HealthResponse(BaseModel):
+     """Health check response."""
+     status: str = "healthy"
+     models_loaded: bool = False
+     device: str = "cpu"
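
A quick check of the `TranscriptSegment` timestamp helper (a sketch, run from the repo root):

```python
# Sketch: the HH:MM:SS formatting on TranscriptSegment.
from app.schemas.models import TranscriptSegment

seg = TranscriptSegment(start=75.4, end=80.0, speaker="Speaker 1", text="xin chào")
print(seg.start_formatted)  # -> "00:01:15"
```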
app/services/__init__.py ADDED
@@ -0,0 +1 @@
+ # Services package
app/services/alignment.py ADDED
@@ -0,0 +1,353 @@
+ """
+ Precision alignment service - word-center-based speaker assignment.
+ Merges word-level transcription with speaker diarization using precise timestamps.
+ """
+ import logging
+ from pathlib import Path
+ from typing import List, Tuple, Optional
+ from dataclasses import dataclass
+
+ from app.core.config import get_settings
+ from app.schemas.models import TranscriptSegment
+ from app.services.transcription import WordTimestamp
+ from app.services.diarization import SpeakerSegment
+
+ logger = logging.getLogger(__name__)
+ settings = get_settings()
+
+
+ @dataclass
+ class WordWithSpeaker:
+     """A word with an assigned speaker."""
+     word: str
+     start: float
+     end: float
+     speaker: str
+
+
+ class AlignmentService:
+     """
+     Precision alignment service.
+     Uses a word-center-based algorithm for accurate speaker-to-text mapping.
+     """
+
+     # Pause threshold for splitting segments (seconds)
+     PAUSE_THRESHOLD = 1.0
+
+     @staticmethod
+     def get_word_center(word: WordTimestamp) -> float:
+         """Calculate the center time of a word."""
+         return (word.start + word.end) / 2
+
+     @classmethod
+     def find_speaker_at_time(
+         cls,
+         time: float,
+         speaker_segments: List[SpeakerSegment]
+     ) -> Optional[str]:
+         """
+         Find which speaker is speaking at a given time.
+
+         Args:
+             time: Time point in seconds
+             speaker_segments: List of speaker segments from diarization
+
+         Returns:
+             Speaker label or None if no speaker found
+         """
+         for seg in speaker_segments:
+             if seg.start <= time <= seg.end:
+                 return seg.speaker
+         return None
+
+     @classmethod
+     def find_closest_speaker(
+         cls,
+         time: float,
+         speaker_segments: List[SpeakerSegment]
+     ) -> str:
+         """
+         Find the closest speaker to a given time (for gaps/silence).
+
+         Args:
+             time: Time point in seconds
+             speaker_segments: List of speaker segments
+
+         Returns:
+             Closest speaker label or "Unknown"
+         """
+         if not speaker_segments:
+             return "Unknown"
+
+         min_distance = float('inf')
+         closest_speaker = "Unknown"
+
+         for seg in speaker_segments:
+             # Distance to segment start or end
+             dist_to_start = abs(time - seg.start)
+             dist_to_end = abs(time - seg.end)
+             min_seg_dist = min(dist_to_start, dist_to_end)
+
+             if min_seg_dist < min_distance:
+                 min_distance = min_seg_dist
+                 closest_speaker = seg.speaker
+
+         return closest_speaker
+
+     @classmethod
+     def assign_speakers_to_words(
+         cls,
+         words: List[WordTimestamp],
+         speaker_segments: List[SpeakerSegment]
+     ) -> List[WordWithSpeaker]:
+         """
+         Step 3c: Assign speakers to each word based on word center time.
+
+         Args:
+             words: List of words with timestamps from transcription
+             speaker_segments: List of speaker segments from diarization
+
+         Returns:
+             List of words with speaker assignments
+         """
+         if not speaker_segments:
+             # No diarization available, assign all to "Speaker 1"
+             logger.warning("No speaker segments available, using single speaker")
+             return [
+                 WordWithSpeaker(
+                     word=w.word,
+                     start=w.start,
+                     end=w.end,
+                     speaker="Speaker 1"
+                 )
+                 for w in words
+             ]
+
+         words_with_speakers = []
+
+         for word in words:
+             # Calculate word center time
+             center_time = cls.get_word_center(word)
+
+             # Find speaker at this time
+             speaker = cls.find_speaker_at_time(center_time, speaker_segments)
+
+             # If no direct match, find closest speaker
+             if speaker is None:
+                 speaker = cls.find_closest_speaker(center_time, speaker_segments)
+
+             words_with_speakers.append(WordWithSpeaker(
+                 word=word.word,
+                 start=word.start,
+                 end=word.end,
+                 speaker=speaker
+             ))
+
+         logger.debug(f"Assigned speakers to {len(words_with_speakers)} words")
+         return words_with_speakers
+
+     @classmethod
+     def reconstruct_segments(
+         cls,
+         words_with_speakers: List[WordWithSpeaker]
+     ) -> List[TranscriptSegment]:
+         """
+         Step 3d: Reconstruct sentence segments from words.
+
+         Groups consecutive words of the same speaker into segments.
+         Creates a new segment when:
+         - the speaker changes, or
+         - the pause between words exceeds PAUSE_THRESHOLD.
+
+         Args:
+             words_with_speakers: List of words with speaker assignments
+
+         Returns:
+             List of TranscriptSegment with complete sentences
+         """
+         if not words_with_speakers:
+             return []
+
+         segments = []
+
+         # Start first segment
+         current_speaker = words_with_speakers[0].speaker
+         current_start = words_with_speakers[0].start
+         current_end = words_with_speakers[0].end
+         current_words = [words_with_speakers[0].word]
+
+         for i in range(1, len(words_with_speakers)):
+             word = words_with_speakers[i]
+             prev_word = words_with_speakers[i - 1]
+
+             # Calculate pause between words
+             pause = word.start - prev_word.end
+
+             # Check if we need to start a new segment
+             speaker_changed = word.speaker != current_speaker
+             significant_pause = pause > cls.PAUSE_THRESHOLD
+
+             if speaker_changed or significant_pause:
+                 # Save current segment
+                 segments.append(TranscriptSegment(
+                     start=current_start,
+                     end=current_end,
+                     speaker=current_speaker,
+                     text=" ".join(current_words)
+                 ))
+
+                 # Start new segment
+                 current_speaker = word.speaker
+                 current_start = word.start
+                 current_end = word.end
+                 current_words = [word.word]
+             else:
+                 # Continue current segment
+                 current_end = word.end
+                 current_words.append(word.word)
+
+         # Don't forget the last segment
+         if current_words:
+             segments.append(TranscriptSegment(
+                 start=current_start,
+                 end=current_end,
+                 speaker=current_speaker,
+                 text=" ".join(current_words)
+             ))
+
+         logger.debug(f"Reconstructed {len(segments)} segments from {len(words_with_speakers)} words")
+         return segments
+
+     @classmethod
+     def resize_and_merge_segments(
+         cls,
+         segments: List[TranscriptSegment]
+     ) -> List[TranscriptSegment]:
+         """
+         Merge consecutive segments of the same speaker if the gap is small.
+         Also filters out extremely short segments.
+         """
+         if not segments:
+             return []
+
+         # Filter 1: Remove extremely short blips (noise)
+         segments = [s for s in segments if (s.end - s.start) >= settings.min_segment_duration_s]
+
+         if not segments:
+             return []
+
+         merged = []
+         curr = segments[0]
+
+         for i in range(1, len(segments)):
+             next_seg = segments[i]
+
+             # If same speaker and gap is small, merge
+             gap = next_seg.start - curr.end
+             if next_seg.speaker == curr.speaker and gap < settings.merge_threshold_s:
+                 curr.end = next_seg.end
+                 curr.text += " " + next_seg.text
+             else:
+                 merged.append(curr)
+                 curr = next_seg
+
+         merged.append(curr)
+
+         logger.debug(f"Merged segments: {len(segments)} -> {len(merged)}")
+         return merged
+
+     @classmethod
+     def align_precision(
+         cls,
+         words: List[WordTimestamp],
+         speaker_segments: List[SpeakerSegment]
+     ) -> List[TranscriptSegment]:
+         """
+         Full precision alignment pipeline.
+
+         Args:
+             words: Word-level timestamps from transcription
+             speaker_segments: Speaker segments from diarization
+
+         Returns:
+             List of TranscriptSegment with proper speaker assignments
+         """
+         # Step 3c: Assign speakers to words
+         words_with_speakers = cls.assign_speakers_to_words(words, speaker_segments)
+
+         # Step 3d: Reconstruct segments
+         segments = cls.reconstruct_segments(words_with_speakers)
+
+         # Step 3e: Clustering/merging (optimization)
+         segments = cls.resize_and_merge_segments(segments)
+
+         return segments
+
+     @staticmethod
+     def format_timestamp_txt(seconds: float) -> str:
+         """Format timestamp for TXT output: HH:MM:SS"""
+         hours = int(seconds // 3600)
+         minutes = int((seconds % 3600) // 60)
+         secs = int(seconds % 60)
+         return f"{hours:02d}:{minutes:02d}:{secs:02d}"
+
+     @staticmethod
+     def format_timestamp_srt(seconds: float) -> str:
+         """Format timestamp for SRT output: HH:MM:SS,mmm"""
+         hours = int(seconds // 3600)
+         minutes = int((seconds % 3600) // 60)
+         secs = int(seconds % 60)
+         millis = int((seconds % 1) * 1000)
+         return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
+
+     @classmethod
+     def generate_txt(cls, segments: List[TranscriptSegment], output_path: Path) -> Path:
+         """
+         Generate TXT transcript file.
+
+         Format: [HH:MM:SS - HH:MM:SS] Speaker: Text
+         """
+         lines = []
+         for seg in segments:
+             start = cls.format_timestamp_txt(seg.start)
+             end = cls.format_timestamp_txt(seg.end)
+             lines.append(f"[{start} - {end}] {seg.speaker}: {seg.text}")
+
+         output_path.write_text("\n".join(lines), encoding="utf-8")
+         logger.info(f"Generated TXT: {output_path}")
+
+         return output_path
+
+     @classmethod
+     def generate_srt(cls, segments: List[TranscriptSegment], output_path: Path) -> Path:
+         """
+         Generate SRT subtitle file.
+         """
+         lines = []
+         for i, seg in enumerate(segments, 1):
+             start = cls.format_timestamp_srt(seg.start)
+             end = cls.format_timestamp_srt(seg.end)
+             lines.append(str(i))
+             lines.append(f"{start} --> {end}")
+             lines.append(f"[{seg.speaker}] {seg.text}")
+             lines.append("")  # Empty line between entries
+
+         output_path.write_text("\n".join(lines), encoding="utf-8")
+         logger.info(f"Generated SRT: {output_path}")
+
+         return output_path
+
+     @classmethod
+     def generate_outputs(
+         cls,
+         segments: List[TranscriptSegment],
+         base_filename: str
+     ) -> Tuple[Path, Path]:
+         """Generate both TXT and SRT output files."""
+         txt_path = settings.processed_dir / f"{base_filename}.txt"
+         srt_path = settings.processed_dir / f"{base_filename}.srt"
+
+         cls.generate_txt(segments, txt_path)
+         cls.generate_srt(segments, srt_path)
+
+         return txt_path, srt_path
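
A worked sketch of the word-center rule: each word is attributed to whichever diarization turn contains its midpoint, then consecutive same-speaker words are stitched back into segments. The timings below are invented for illustration:

```python
# Sketch: two words whose centers (0.2s and 0.7s) fall inside one speaker turn.
from app.services.alignment import AlignmentService
from app.services.diarization import SpeakerSegment
from app.services.transcription import WordTimestamp

words = [WordTimestamp("xin", 0.0, 0.4), WordTimestamp("chào", 0.5, 0.9)]
turns = [SpeakerSegment(0.0, 1.0, "Speaker 1")]

segments = AlignmentService.align_precision(words, turns)
# -> one TranscriptSegment: 0.0-0.9, "Speaker 1", "xin chào"
# (the 0.1s pause is below PAUSE_THRESHOLD, so the words stay in one segment)
```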
app/services/audio_processor.py ADDED
@@ -0,0 +1,244 @@
+ """
+ Audio processing service using FFmpeg.
+ Handles file validation, conversion to 16kHz mono WAV, and cleanup.
+ """
+ import os
+ import uuid
+ import asyncio
+ import logging
+ from pathlib import Path
+ from typing import Optional, Tuple
+
+ import ffmpeg
+
+ from app.core.config import get_settings
+ from app.services.vocal_separator import VocalSeparator
+ from app.services.denoiser import DenoiserService
+
+ logger = logging.getLogger(__name__)
+ settings = get_settings()
+
+
+ class AudioProcessingError(Exception):
+     """Custom exception for audio processing errors."""
+     pass
+
+
+ class AudioProcessor:
+     """Service for processing audio files."""
+
+     ALLOWED_EXTENSIONS = settings.allowed_extensions
+     TARGET_SAMPLE_RATE = settings.sample_rate
+     TARGET_CHANNELS = settings.channels
+
+     @classmethod
+     def validate_file(cls, filename: str, file_size: int) -> bool:
+         """
+         Validate uploaded file.
+
+         Args:
+             filename: Original filename
+             file_size: File size in bytes
+
+         Returns:
+             True if valid
+
+         Raises:
+             AudioProcessingError: If validation fails
+         """
+         # Check extension
+         ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
+         if ext not in cls.ALLOWED_EXTENSIONS:
+             raise AudioProcessingError(
+                 f"Invalid file type: .{ext}. Allowed: {', '.join(cls.ALLOWED_EXTENSIONS)}"
+             )
+
+         # Check size
+         if file_size > settings.max_upload_size_bytes:
+             raise AudioProcessingError(
+                 f"File too large: {file_size / (1024*1024):.1f}MB. "
+                 f"Maximum: {settings.max_upload_size_mb}MB"
+             )
+
+         return True
+
+     @classmethod
+     async def save_upload(cls, file_content: bytes, original_filename: str) -> Path:
+         """
+         Save uploaded file to a temporary location.
+
+         Args:
+             file_content: File bytes
+             original_filename: Original filename for extension
+
+         Returns:
+             Path to saved file
+         """
+         ext = original_filename.rsplit('.', 1)[-1].lower() if '.' in original_filename else 'wav'
+         unique_id = str(uuid.uuid4())[:8]
+         filename = f"{unique_id}.{ext}"
+         filepath = settings.upload_dir / filename
+
+         # Write file asynchronously
+         loop = asyncio.get_event_loop()
+         await loop.run_in_executor(None, lambda: filepath.write_bytes(file_content))
+
+         logger.debug(f"Saved upload: {filepath}")
+         return filepath
+
+     @classmethod
+     async def convert_to_wav(cls, input_path: Path) -> Path:
+         """
+         Convert audio to 16kHz mono WAV using FFmpeg.
+
+         Args:
+             input_path: Path to input audio file
+
+         Returns:
+             Path to converted WAV file
+         """
+         output_filename = f"{input_path.stem}_processed.wav"
+         output_path = settings.processed_dir / output_filename
+
+         try:
+             # Run ffmpeg conversion in an executor so it doesn't block
+             loop = asyncio.get_event_loop()
+             await loop.run_in_executor(None, lambda: cls._run_ffmpeg_conversion(input_path, output_path))
+
+             logger.info(f"Converted to WAV: {output_path}")
+             return output_path
+
+         except ffmpeg.Error as e:
+             error_msg = e.stderr.decode() if e.stderr else str(e)
+             logger.error(f"FFmpeg error: {error_msg}")
+             raise AudioProcessingError(f"Audio conversion failed: {error_msg}")
+
+     @staticmethod
+     def _run_ffmpeg_conversion(input_path: Path, output_path: Path) -> None:
+         """Run the actual FFmpeg conversion (blocking)."""
+         stream = ffmpeg.input(str(input_path))
+
+         # Apply normalization if enabled (loudnorm is best for speech consistency)
+         if settings.enable_loudnorm:
+             logger.debug("Applying loudnorm normalization...")
+             stream = stream.filter('loudnorm', I=-16, TP=-1.5, LRA=11)
+
+         # Apply noise reduction if enabled (basic filters are kept as minor cleanup)
+         if settings.enable_noise_reduction:
+             logger.debug("Applying subtle highpass filter...")
+             stream = stream.filter('highpass', f=80)
+
+         (
+             stream
+             .output(
+                 str(output_path),
+                 acodec='pcm_s16le',
+                 ar=16000,
+                 ac=1
+             )
+             .overwrite_output()
+             .run(quiet=True, capture_stderr=True)
+         )
+
+     @classmethod
+     async def get_audio_duration(cls, filepath: Path) -> float:
+         """
+         Get audio file duration in seconds.
+
+         Args:
+             filepath: Path to audio file
+
+         Returns:
+             Duration in seconds
+         """
+         try:
+             loop = asyncio.get_event_loop()
+             probe = await loop.run_in_executor(
+                 None,
+                 lambda: ffmpeg.probe(str(filepath))
+             )
+
+             duration = float(probe['format'].get('duration', 0))
+             return duration
+
+         except ffmpeg.Error as e:
+             logger.warning(f"Could not probe audio duration: {e}")
+             return 0.0
+
+     @classmethod
+     async def cleanup_files(cls, *filepaths: Path) -> None:
+         """
+         Delete temporary files.
+
+         Args:
+             filepaths: Paths to files to delete
+         """
+         for filepath in filepaths:
+             try:
+                 if filepath and filepath.exists():
+                     filepath.unlink()
+                     logger.debug(f"Cleaned up: {filepath}")
+             except Exception as e:
+                 logger.warning(f"Failed to clean up {filepath}: {e}")
+
+     @classmethod
+     async def process_upload(cls, file_content: bytes, filename: str) -> Tuple[Path, float]:
+         """
+         Full upload processing pipeline: validate, save, convert.
+
+         Args:
+             file_content: Uploaded file bytes
+             filename: Original filename
+
+         Returns:
+             Tuple of (processed WAV path, duration in seconds)
+         """
+         # Validate
+         cls.validate_file(filename, len(file_content))
+
+         # Save original
+         original_path = await cls.save_upload(file_content, filename)
+         vocals_path = None
+
+         try:
+             # Step 1: Denoising (speech enhancement)
+             if settings.enable_denoiser:
+                 denoised_path = await DenoiserService.enhance_audio(original_path)
+                 source_for_separation = denoised_path
+             else:
+                 source_for_separation = original_path
+                 denoised_path = None
+
+             # Step 2: Vocal separation using MDX-Net
+             if settings.enable_vocal_separation:
+                 vocals_path = await VocalSeparator.separate_vocals(source_for_separation)
+                 source_for_conversion = vocals_path
+             else:
+                 source_for_conversion = source_for_separation
+                 vocals_path = None
+
+             # Step 3: Convert to 16kHz mono WAV (includes normalization)
+             wav_path = await cls.convert_to_wav(source_for_conversion)
+
+             # Get duration
+             duration = await cls.get_audio_duration(wav_path)
+
+             # Cleanup intermediate files
+             to_cleanup = [original_path]
+             if denoised_path and denoised_path != original_path:
+                 to_cleanup.append(denoised_path)
+             if vocals_path and vocals_path not in [original_path, denoised_path]:
+                 to_cleanup.append(vocals_path)
+
+             await cls.cleanup_files(*to_cleanup)
+
+             return wav_path, duration
+
+         except Exception:
+             # Cleanup on error
+             await cls.cleanup_files(original_path)
+             if 'denoised_path' in locals() and denoised_path and denoised_path != original_path:
+                 await cls.cleanup_files(denoised_path)
+             if 'vocals_path' in locals() and vocals_path and vocals_path not in [original_path, denoised_path]:
+                 await cls.cleanup_files(vocals_path)
+             raise
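
For debugging outside the app, the `_run_ffmpeg_conversion` filter chain corresponds roughly to the following command line; a sketch with placeholder filenames:

```python
# Sketch: approximate CLI equivalent of _run_ffmpeg_conversion.
import subprocess

subprocess.run(
    [
        "ffmpeg", "-y", "-i", "input.mp3",  # placeholder input
        "-af", "loudnorm=I=-16:TP=-1.5:LRA=11,highpass=f=80",
        "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
        "output.wav",                       # placeholder output
    ],
    check=True,
)
```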
app/services/denoiser.py ADDED
@@ -0,0 +1,142 @@
+ """
+ Speech Enhancement Service using Facebook's Denoiser.
+ Removes background noise and enhances speech quality.
+ """
+ import os
+ import asyncio
+ import logging
+ from pathlib import Path
+ from typing import Optional
+
+ import torch
+ import torchaudio
+
+ from app.core.config import get_settings
+
+ logger = logging.getLogger(__name__)
+ settings = get_settings()
+
+
+ class DenoiserError(Exception):
+     """Custom exception for denoiser errors."""
+     pass
+
+
+ class DenoiserService:
+     """
+     Service for enhancing speech using Facebook's Denoiser models.
+     Supports dns48, dns64, and master64.
+     """
+
+     _model = None
+     _model_name: Optional[str] = None
+
+     @classmethod
+     def _get_model(cls):
+         """Lazily load the Denoiser model."""
+         if cls._model is None or cls._model_name != settings.denoiser_model:
+             from denoiser.pretrained import dns48, dns64, master64
+
+             model_map = {
+                 "dns48": dns48,
+                 "dns64": dns64,
+                 "master64": master64
+             }
+
+             model_func = model_map.get(settings.denoiser_model, dns64)
+             logger.debug(f"Loading Denoiser model: {settings.denoiser_model}")
+
+             model = model_func()
+             device = settings.resolved_device
+             model.to(device)
+             model.eval()
+
+             cls._model = model
+             cls._model_name = settings.denoiser_model
+             logger.debug(f"Denoiser model loaded on {device}")
+
+         return cls._model
+
+     @classmethod
+     async def enhance_audio(cls, input_path: Path) -> Path:
+         """
+         Enhance audio by removing noise.
+
+         Args:
+             input_path: Path to input audio file
+
+         Returns:
+             Path to enhanced WAV file
+         """
+         if not settings.enable_denoiser:
+             logger.debug("Denoiser disabled, skipping...")
+             return input_path
+
+         logger.debug(f"Starting speech enhancement for: {input_path.name}")
+
+         try:
+             # Run enhancement in an executor so it doesn't block
+             loop = asyncio.get_event_loop()
+             enhanced_path = await loop.run_in_executor(
+                 None,
+                 lambda: cls._run_enhancement(input_path)
+             )
+
+             logger.info(f"Speech enhancement complete: {enhanced_path.name}")
+             return enhanced_path
+
+         except Exception as e:
+             logger.error(f"Speech enhancement failed: {e}")
+             # Fall back to the original on failure rather than failing the whole pipeline
+             logger.warning("Falling back to original audio.")
+             return input_path
+
+     @classmethod
+     def _run_enhancement(cls, input_path: Path) -> Path:
+         """Run the actual denoiser enhancement (blocking)."""
+         from denoiser.enhance import enhance
+
+         model = cls._get_model()
+         device = settings.resolved_device
+
+         # Load audio
+         wav, sr = torchaudio.load(str(input_path))
+         wav = wav.to(device)
+
+         # Ensure correct sample rate for the model
+         if sr != model.sample_rate:
+             resampler = torchaudio.transforms.Resample(sr, model.sample_rate).to(device)
+             wav = resampler(wav)
+             sr = model.sample_rate
+
+         # Enhance; wav shape: [channels, time]
+         from types import SimpleNamespace
+
+         args = SimpleNamespace(
+             streaming=False,
+             dry=0.0,
+             sample_rate=sr
+         )
+
+         with torch.no_grad():
+             # Ensure a batch dimension: [batch, channels, time]
+             if wav.dim() == 1:
+                 wav = wav.unsqueeze(0).unsqueeze(0)
+             elif wav.dim() == 2:
+                 wav = wav.unsqueeze(0)
+
+             enhanced = enhance(args, model, wav)
+             # Remove the batch dimension
+             enhanced = enhanced.squeeze(0)
+
+         # Save enhanced audio
+         output_filename = f"{input_path.stem}_denoised.wav"
+         output_path = settings.processed_dir / output_filename
+
+         torchaudio.save(
+             str(output_path),
+             enhanced.cpu(),
+             sr
+         )
+
+         return output_path
app/services/diarization.py ADDED
@@ -0,0 +1,180 @@
+ """
+ Speaker diarization service using pyannote.audio.
+ Identifies speaker turns in audio files.
+ """
+ import os
+ import logging
+ from pathlib import Path
+ from typing import List, Optional
+ from dataclasses import dataclass
+
+ import torch
+
+ from app.core.config import get_settings
+
+ logger = logging.getLogger(__name__)
+ settings = get_settings()
+
+
+ @dataclass
+ class SpeakerSegment:
+     """A segment of audio attributed to a specific speaker."""
+     start: float
+     end: float
+     speaker: str
+
+
+ class DiarizationService:
+     """
+     Service for speaker diarization using pyannote.audio.
+     Implements lazy loading to avoid memory overhead at startup.
+     """
+
+     _instance: Optional["DiarizationService"] = None
+     _pipeline = None
+
+     def __new__(cls):
+         if cls._instance is None:
+             cls._instance = super().__new__(cls)
+         return cls._instance
+
+     @classmethod
+     def get_pipeline(cls):
+         """
+         Get or load the diarization pipeline (lazy loading with caching).
+
+         Returns:
+             Loaded pyannote Pipeline
+         """
+         if cls._pipeline is None:
+             # Import here to avoid loading if not used
+             from pyannote.audio import Pipeline
+
+             hf_token = settings.hf_token
+             if not hf_token:
+                 raise ValueError(
+                     "HuggingFace token required for pyannote.audio. "
+                     "Set HF_TOKEN in your environment or .env file."
+                 )
+
+             logger.debug(f"Loading diarization pipeline: {settings.diarization_model}")
+
+             # Use 'token' parameter (use_auth_token is deprecated)
+             cls._pipeline = Pipeline.from_pretrained(
+                 settings.diarization_model,
+                 token=hf_token
+             )
+
+             # Move to GPU if available
+             device = torch.device(settings.resolved_device)
+             if device.type == "cuda":
+                 cls._pipeline = cls._pipeline.to(device)
+                 logger.debug("Diarization pipeline moved to GPU")
+
+             logger.debug("Diarization pipeline loaded successfully")
+
+         return cls._pipeline
+
+     @classmethod
+     def is_loaded(cls) -> bool:
+         """Check if pipeline is loaded."""
+         return cls._pipeline is not None
+
+     @classmethod
+     def diarize(
+         cls,
+         audio_path: Path,
+         num_speakers: Optional[int] = None,
+         min_speakers: int = 1,
+         max_speakers: int = 10
+     ) -> List[SpeakerSegment]:
+         """
+         Perform speaker diarization on an audio file.
+
+         Args:
+             audio_path: Path to WAV audio file
+             num_speakers: Exact number of speakers (None for auto-detect)
+             min_speakers: Minimum number of speakers to detect
+             max_speakers: Maximum number of speakers to detect
+
+         Returns:
+             List of SpeakerSegment with speaker labels
+         """
+         pipeline = cls.get_pipeline()
+
+         logger.debug(f"Diarizing: {audio_path}")
+
+         # Build parameters
+         params = {}
+         if num_speakers is not None:
+             params["num_speakers"] = num_speakers
+         else:
+             params["min_speakers"] = min_speakers
+             params["max_speakers"] = max_speakers
+
+         # Run diarization
+         diarization = pipeline(str(audio_path), **params)
+
+         # Handle the pyannote.audio 4.x breaking change:
+         # in 4.x, the pipeline returns a DiarizeOutput object wrapping the Annotation;
+         # in 3.x, it returns the Annotation directly.
+         annotation = diarization
+         if hasattr(diarization, "speaker_diarization"):
+             annotation = diarization.speaker_diarization
+             logger.debug("Detected pyannote.audio 4.x DiarizeOutput structure")
+
+         # Convert to segments
+         segments = []
+         speaker_map = {}  # Map SPEAKER_XX to Speaker 1, 2, etc.
+
+         for turn, _, speaker in annotation.itertracks(yield_label=True):
+             # Create readable speaker label
+             if speaker not in speaker_map:
+                 speaker_map[speaker] = f"Speaker {len(speaker_map) + 1}"
+
+             segments.append(SpeakerSegment(
+                 start=turn.start,
+                 end=turn.end,
+                 speaker=speaker_map[speaker]
+             ))
+
+         logger.info(f"Diarization complete: {len(segments)} turns, {len(speaker_map)} speakers")
+
+         return segments
+
+     @classmethod
+     async def diarize_async(
+         cls,
+         audio_path: Path,
+         num_speakers: Optional[int] = None,
+         min_speakers: int = 1,
+         max_speakers: int = 10
+     ) -> List[SpeakerSegment]:
+         """
+         Async wrapper for diarization (runs in a thread pool).
+
+         Args:
+             audio_path: Path to WAV audio file
+             num_speakers: Exact number of speakers
+             min_speakers: Minimum speakers
+             max_speakers: Maximum speakers
+
+         Returns:
+             List of SpeakerSegment
+         """
+         import asyncio
+
+         loop = asyncio.get_event_loop()
+         return await loop.run_in_executor(
+             None,
+             lambda: cls.diarize(audio_path, num_speakers, min_speakers, max_speakers)
+         )
+
+     @classmethod
+     def preload_pipeline(cls) -> None:
+         """Preload the pipeline during startup."""
+         try:
+             cls.get_pipeline()
+         except Exception as e:
+             logger.warning(f"Failed to preload diarization pipeline: {e}")
+             # Don't raise - diarization is optional, app can work without it
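
A hedged usage sketch (requires `HF_TOKEN` and accepted terms for the pyannote model; the file path is a placeholder):

```python
# Sketch: run diarization directly on a prepared 16kHz mono WAV.
from pathlib import Path
from app.services.diarization import DiarizationService

turns = DiarizationService.diarize(Path("data/processed/example.wav"), num_speakers=2)
for t in turns:
    print(f"{t.start:6.2f}-{t.end:6.2f}  {t.speaker}")
```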
app/services/orchestrator.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pipeline Orchestrator for PrecisionVoice.
3
+ Coordinates transcription and diarization in parallel.
4
+ """
5
+ import time
6
+ import asyncio
7
+ import logging
8
+ from pathlib import Path
9
+
10
+ from app.core.config import get_settings
11
+ from app.schemas.models import TranscriptionResponse
12
+ from app.services.transcription import TranscriptionService
13
+ from app.services.diarization import DiarizationService
14
+ from app.services.alignment import AlignmentService
15
+
16
+ logger = logging.getLogger(__name__)
17
+ settings = get_settings()
18
+
19
+ class PipelineOrchestrator:
20
+ """
21
+ Coordinates the AI pipeline with detailed server-side logging:
22
+ 1. Audio -> Vocal Separation (MDX-Net) -> 16kHz WAV
23
+ 2. Whisper (Transcribe) + Pyannote (Diarize) in parallel
24
+ 3. Alignment (Matching Algorithm)
25
+ 4. Generate outputs (TXT, SRT)
26
+ """
27
+
28
+ @classmethod
29
+ async def process_audio(
30
+ cls,
31
+ wav_path: Path,
32
+ duration: float
33
+ ) -> TranscriptionResponse:
34
+ """
35
+ Run the full processing pipeline and return the final response.
36
+ Each step is logged for server-side monitoring.
37
+ """
38
+ start_time = time.time()
39
+
40
+ # Step 1: Pre-processing (Vocal Separation + Noise Reduction)
41
+ logger.info(f"[Step 1/4] Audio pre-processing completed (MDX-Net: {settings.enable_vocal_separation}, Denoise: {settings.enable_noise_reduction})")
42
+
43
+ # Step 2: AI Processing (Transcription & Diarization)
44
+ logger.info(f"[Step 2/4] Starting AI models (Whisper + Pyannote) for: {wav_path.name}")
45
+
46
+ transcription_task = TranscriptionService.transcribe_async(wav_path)
47
+ diarization_task = DiarizationService.diarize_async(wav_path)
48
+
49
+ try:
50
+ word_timestamps, speaker_segments = await asyncio.gather(
51
+ transcription_task,
52
+ diarization_task,
53
+ return_exceptions=False
54
+ )
55
+ logger.info(f"AI models processing completed: {len(word_timestamps)} words, {len(speaker_segments)} segments")
56
+ except Exception as e:
57
+ logger.exception("Parallel task failed")
58
+ raise
59
+
60
+ # Step 3: Precision Alignment
61
+ logger.info("[Step 3/4] Aligning words with speaker turns...")
62
+ aligned_segments = AlignmentService.align_precision(word_timestamps, speaker_segments)
63
+
64
+ # Count unique speakers
65
+ speakers = set(seg.speaker for seg in aligned_segments)
66
+
67
+ # Step 4: Export Generation
68
+ logger.info("[Step 4/4] Generating export files (TXT, SRT)...")
69
+ base_filename = wav_path.stem.replace("_processed", "")
70
+ txt_path, srt_path = AlignmentService.generate_outputs(aligned_segments, base_filename)
71
+
72
+ processing_time = time.time() - start_time
73
+ logger.info(f"Pipeline complete for {wav_path.name} in {processing_time:.2f}s")
74
+
75
+ return TranscriptionResponse(
76
+ success=True,
77
+ message="Transcription completed successfully",
78
+ segments=aligned_segments,
79
+ duration=duration,
80
+ num_speakers=len(speakers),
81
+ processing_time=round(processing_time, 2),
82
+ download_txt=f"/api/download/{txt_path.name}",
83
+ download_srt=f"/api/download/{srt_path.name}"
84
+ )
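For context, a hedged sketch of how an upload endpoint might hand off to this orchestrator. The route path matches the frontend's fetch('/api/transcribe'); save_and_prepare_wav is a hypothetical stand-in for the real upload and pre-processing step, not an API from this repository:

from pathlib import Path
from typing import Tuple

from fastapi import APIRouter, HTTPException, UploadFile

from app.schemas.models import TranscriptionResponse
from app.services.orchestrator import PipelineOrchestrator

router = APIRouter(prefix="/api")

async def save_and_prepare_wav(file: UploadFile) -> Tuple[Path, float]:
    # Hypothetical helper: validate the upload, run vocal separation /
    # denoising, and return a 16 kHz WAV path plus its duration.
    raise NotImplementedError

@router.post("/transcribe", response_model=TranscriptionResponse)
async def transcribe(file: UploadFile) -> TranscriptionResponse:
    wav_path, duration = await save_and_prepare_wav(file)
    try:
        return await PipelineOrchestrator.process_audio(wav_path, duration)
    except Exception:
        raise HTTPException(status_code=500, detail="Processing failed")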
app/services/transcription.py ADDED
@@ -0,0 +1,168 @@
1
+ """
2
+ Transcription service using faster-whisper.
3
+ Loads the Whisper model configured via WHISPER_MODEL (default: kiendt/PhoWhisper-large-ct2) for Vietnamese STT.
4
+ Returns word-level timestamps for precision alignment.
5
+ """
6
+ import logging
7
+ from pathlib import Path
8
+ from typing import List, Optional
9
+ from dataclasses import dataclass
10
+
11
+ from faster_whisper import WhisperModel
12
+
13
+ from app.core.config import get_settings
14
+
15
+ logger = logging.getLogger(__name__)
16
+ settings = get_settings()
17
+
18
+
19
+ @dataclass
20
+ class WordTimestamp:
21
+ """A single word with precise timestamp."""
22
+ word: str
23
+ start: float
24
+ end: float
25
+
26
+
27
+ @dataclass
28
+ class TranscriptSegmentRaw:
29
+ """Raw segment from Whisper transcription with word-level data."""
30
+ start: float
31
+ end: float
32
+ text: str
33
+ words: List[WordTimestamp]
34
+
35
+
36
+ class TranscriptionService:
37
+ """
38
+ Service for speech-to-text transcription using faster-whisper.
39
+ Implements singleton pattern for model caching.
40
+ Returns word-level timestamps for precision speaker alignment.
41
+ """
42
+
43
+ _instance: Optional["TranscriptionService"] = None
44
+ _model: Optional[WhisperModel] = None
45
+
46
+ def __new__(cls):
47
+ if cls._instance is None:
48
+ cls._instance = super().__new__(cls)
49
+ return cls._instance
50
+
51
+ @classmethod
52
+ def get_model(cls) -> WhisperModel:
53
+ """
54
+ Get or load the Whisper model (lazy loading with caching).
55
+
56
+ Returns:
57
+ Loaded WhisperModel instance
58
+ """
59
+ if cls._model is None:
60
+ logger.debug(f"Loading Whisper model: {settings.whisper_model}")
61
+ logger.debug(f"Device: {settings.resolved_device}, Compute type: {settings.resolved_compute_type}")
62
+
63
+ cls._model = WhisperModel(
64
+ settings.whisper_model,
65
+ device=settings.resolved_device,
66
+ compute_type=settings.resolved_compute_type,
67
+ download_root=None, # Use default HF cache
68
+ )
69
+
70
+ logger.debug("Whisper model loaded successfully")
71
+
72
+ return cls._model
73
+
74
+ @classmethod
75
+ def is_loaded(cls) -> bool:
76
+ """Check if model is loaded."""
77
+ return cls._model is not None
78
+
79
+ @classmethod
80
+ def transcribe(
81
+ cls,
82
+ audio_path: Path,
83
+ language: str = "vi",
84
+ initial_prompt: Optional[str] = None
85
+ ) -> List[WordTimestamp]:
86
+ """
87
+ Transcribe audio file with word-level timestamps.
88
+
89
+ Args:
90
+ audio_path: Path to WAV audio file
91
+ language: Language code (default: Vietnamese)
92
+ initial_prompt: Optional prompt for context
93
+
94
+ Returns:
95
+ List of WordTimestamp with precise timing for each word
96
+ """
97
+ model = cls.get_model()
98
+
99
+ logger.debug(f"Transcribing: {audio_path}")
100
+
101
+ # Run transcription with word timestamps - CRITICAL for precision alignment
102
+ segments_generator, info = model.transcribe(
103
+ str(audio_path),
104
+ language=language,
105
+ initial_prompt=initial_prompt,
106
+ word_timestamps=True, # CRITICAL: Enable word-level timestamps
107
+ vad_filter=True, # Re-enabled for optimization
108
+ vad_parameters=dict(
109
+ threshold=settings.vad_threshold,
110
+ min_speech_duration_ms=settings.vad_min_speech_duration_ms,
111
+ min_silence_duration_ms=settings.vad_min_silence_duration_ms,
112
+ ),
113
+ beam_size=5,
114
+ best_of=5,
115
+ )
116
+
117
+ # Extract all words with timestamps
118
+ all_words = []
119
+ segment_count = 0
120
+
121
+ for segment in segments_generator:
122
+ segment_count += 1
123
+ if segment.words:
124
+ for word in segment.words:
125
+ all_words.append(WordTimestamp(
126
+ word=word.word.strip(),
127
+ start=word.start,
128
+ end=word.end
129
+ ))
130
+
131
+ logger.info(f"Transcription complete: {segment_count} segments, {len(all_words)} words, detected language: {info.language}")
132
+
133
+ return all_words
134
+
135
+ @classmethod
136
+ async def transcribe_async(
137
+ cls,
138
+ audio_path: Path,
139
+ language: str = "vi",
140
+ initial_prompt: Optional[str] = None
141
+ ) -> List[WordTimestamp]:
142
+ """
143
+ Async wrapper for transcription (runs in thread pool).
144
+
145
+ Args:
146
+ audio_path: Path to WAV audio file
147
+ language: Language code
148
+ initial_prompt: Optional prompt
149
+
150
+ Returns:
151
+ List of WordTimestamp
152
+ """
153
+ import asyncio
154
+
155
+ loop = asyncio.get_running_loop()
156
+ return await loop.run_in_executor(
157
+ None,
158
+ lambda: cls.transcribe(audio_path, language, initial_prompt)
159
+ )
160
+
161
+ @classmethod
162
+ def preload_model(cls) -> None:
163
+ """Preload the model during startup."""
164
+ try:
165
+ cls.get_model()
166
+ except Exception as e:
167
+ logger.error(f"Failed to preload Whisper model: {e}")
168
+ raise
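A minimal usage sketch for the service above, assuming model weights are already cached and a 16 kHz WAV exists on disk (the path is illustrative):

from pathlib import Path

from app.services.transcription import TranscriptionService

# word_timestamps=True inside transcribe() is what makes this per-word
# timing available for the precision alignment step.
words = TranscriptionService.transcribe(Path("data/processed/example.wav"))
for w in words[:5]:
    print(f"{w.start:6.2f}s - {w.end:6.2f}s  {w.word}")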
app/services/vocal_separator.py ADDED
@@ -0,0 +1,118 @@
1
+ """
2
+ Vocal Separation Service using MDX-Net (via audio-separator).
3
+ Isolates vocals from audio files using state-of-the-art MDX-Net models.
4
+ """
5
+ import os
6
+ import asyncio
7
+ import logging
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ from app.core.config import get_settings
12
+
13
+ logger = logging.getLogger(__name__)
14
+ settings = get_settings()
15
+
16
+
17
+ class VocalSeparationError(Exception):
18
+ """Custom exception for vocal separation errors."""
19
+ pass
20
+
21
+
22
+ class VocalSeparator:
23
+ """
24
+ Service for separating vocals from audio using MDX-Net.
25
+ Uses the audio-separator library which supports UVR models.
26
+ """
27
+
28
+ _separator = None
29
+ _model_name: Optional[str] = None
30
+
31
+ @classmethod
32
+ def _get_separator(cls):
33
+ """Lazy load the Audio Separator."""
34
+ if cls._separator is None or cls._model_name != settings.mdx_model:
35
+ from audio_separator.separator import Separator
36
+
37
+ logger.debug(f"Initializing MDX-Net separator with model: {settings.mdx_model}")
38
+
39
+ # Initialize separator
40
+ # Note: audio-separator expects output_dir to exist
41
+ settings.processed_dir.mkdir(parents=True, exist_ok=True)
42
+
43
+ separator = Separator(
44
+ output_dir=str(settings.processed_dir),
45
+ output_format="WAV",
46
+ normalization_threshold=0.9
47
+ )
48
+
49
+ # Load model
50
+ separator.load_model(settings.mdx_model)
51
+
52
+ cls._separator = separator
53
+ cls._model_name = settings.mdx_model
54
+ logger.debug(f"MDX-Net model loaded on {settings.resolved_device}")
55
+
56
+ return cls._separator
57
+
58
+ @classmethod
59
+ async def separate_vocals(cls, input_path: Path) -> Path:
60
+ """
61
+ Separate vocals from audio file using MDX-Net.
62
+
63
+ Args:
64
+ input_path: Path to input audio file
65
+
66
+ Returns:
67
+ Path to separated vocals WAV file
68
+ """
69
+ if not settings.enable_vocal_separation:
70
+ logger.debug("Vocal separation disabled, skipping...")
71
+ return input_path
72
+
73
+ logger.debug(f"Starting vocal separation for: {input_path.name}")
74
+
75
+ try:
76
+ # Run separation in executor to not block
77
+ loop = asyncio.get_running_loop()
78
+ vocals_path = await loop.run_in_executor(
79
+ None,
80
+ lambda: cls._run_separation(input_path)
81
+ )
82
+
83
+ logger.info(f"Vocal separation complete: {vocals_path.name}")
84
+ return vocals_path
85
+
86
+ except Exception as e:
87
+ logger.error(f"Vocal separation failed: {e}")
88
+ # Fallback to original
89
+ logger.warning("Falling back to original audio.")
90
+ return input_path
91
+
92
+ @classmethod
93
+ def _run_separation(cls, input_path: Path) -> Path:
94
+ """Run the actual separation (blocking)."""
95
+ separator = cls._get_separator()
96
+
97
+ # separate() returns a list of output filenames
98
+ output_files = separator.separate(str(input_path))
99
+
100
+ # audio-separator usually produces multiple files (Vocals, Instrumental)
101
+ # We need to find the vocals one.
102
+ # It typically names them like {input_stem}_(Vocals)_{model}.wav
103
+
104
+ vocals_file = None
105
+ for file in output_files:
106
+ if "Vocals" in file:
107
+ vocals_file = settings.processed_dir / file
108
+ break
109
+
110
+ if not vocals_file:
111
+ # If we can't find the vocals file specifically, just take the first one or fail
112
+ logger.warning("Could not identify vocals stem in output files.")
113
+ if output_files:
114
+ vocals_file = settings.processed_dir / output_files[0]
115
+ else:
116
+ raise VocalSeparationError("No output files generated by separator.")
117
+
118
+ return vocals_file
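Since separate_vocals is a coroutine, a one-off invocation looks like the sketch below (the input path is illustrative; with ENABLE_VOCAL_SEPARATION=False the call simply returns the input path unchanged):

import asyncio
from pathlib import Path

from app.services.vocal_separator import VocalSeparator

async def main() -> None:
    # Falls back to the original file if separation fails or is disabled.
    vocals = await VocalSeparator.separate_vocals(Path("data/uploads/example.mp3"))
    print(f"Vocals stem: {vocals}")

asyncio.run(main())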
app/static/css/style.css ADDED
@@ -0,0 +1,673 @@
1
+ /* ================================
2
+ PrecisionVoice - Modern Dark Theme
3
+ ================================ */
4
+
5
+ :root {
6
+ /* Color Palette */
7
+ --bg-primary: #0a0a0f;
8
+ --bg-secondary: #12121a;
9
+ --bg-card: rgba(255, 255, 255, 0.03);
10
+ --bg-card-hover: rgba(255, 255, 255, 0.05);
11
+
12
+ --text-primary: #ffffff;
13
+ --text-secondary: #a0a0b0;
14
+ --text-muted: #606070;
15
+
16
+ --accent-primary: #6366f1;
17
+ --accent-secondary: #8b5cf6;
18
+ --accent-gradient: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%);
19
+
20
+ --success: #10b981;
21
+ --error: #ef4444;
22
+ --warning: #f59e0b;
23
+
24
+ --border-color: rgba(255, 255, 255, 0.08);
25
+ --border-glow: rgba(99, 102, 241, 0.3);
26
+
27
+ /* Spacing */
28
+ --spacing-xs: 0.25rem;
29
+ --spacing-sm: 0.5rem;
30
+ --spacing-md: 1rem;
31
+ --spacing-lg: 1.5rem;
32
+ --spacing-xl: 2rem;
33
+ --spacing-2xl: 3rem;
34
+
35
+ /* Border Radius */
36
+ --radius-sm: 0.375rem;
37
+ --radius-md: 0.75rem;
38
+ --radius-lg: 1rem;
39
+ --radius-xl: 1.5rem;
40
+
41
+ /* Shadows */
42
+ --shadow-sm: 0 2px 8px rgba(0, 0, 0, 0.3);
43
+ --shadow-md: 0 4px 16px rgba(0, 0, 0, 0.4);
44
+ --shadow-lg: 0 8px 32px rgba(0, 0, 0, 0.5);
45
+ --shadow-glow: 0 0 40px rgba(99, 102, 241, 0.15);
46
+
47
+ /* Transitions */
48
+ --transition-fast: 0.15s ease;
49
+ --transition-normal: 0.3s ease;
50
+ --transition-slow: 0.5s ease;
51
+ }
52
+
53
+ /* ================================
54
+ Base Styles
55
+ ================================ */
56
+
57
+ *,
58
+ *::before,
59
+ *::after {
60
+ box-sizing: border-box;
61
+ margin: 0;
62
+ padding: 0;
63
+ }
64
+
65
+ html {
66
+ font-size: 16px;
67
+ scroll-behavior: smooth;
68
+ }
69
+
70
+ body {
71
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
72
+ background: var(--bg-primary);
73
+ color: var(--text-primary);
74
+ line-height: 1.6;
75
+ min-height: 100vh;
76
+ -webkit-font-smoothing: antialiased;
77
+ -moz-osx-font-smoothing: grayscale;
78
+ }
79
+
80
+ /* Animated background gradient */
81
+ body::before {
82
+ content: '';
83
+ position: fixed;
84
+ top: 0;
85
+ left: 0;
86
+ right: 0;
87
+ bottom: 0;
88
+ background:
89
+ radial-gradient(ellipse at 20% 20%, rgba(99, 102, 241, 0.08) 0%, transparent 50%),
90
+ radial-gradient(ellipse at 80% 80%, rgba(139, 92, 246, 0.06) 0%, transparent 50%),
91
+ radial-gradient(ellipse at 50% 50%, rgba(168, 85, 247, 0.04) 0%, transparent 70%);
92
+ pointer-events: none;
93
+ z-index: -1;
94
+ }
95
+
96
+ /* ================================
97
+ Layout
98
+ ================================ */
99
+
100
+ .app-container {
101
+ max-width: 800px;
102
+ margin: 0 auto;
103
+ padding: var(--spacing-lg);
104
+ min-height: 100vh;
105
+ display: flex;
106
+ flex-direction: column;
107
+ }
108
+
109
+ /* ================================
110
+ Header
111
+ ================================ */
112
+
113
+ .header {
114
+ text-align: center;
115
+ padding: var(--spacing-2xl) 0;
116
+ }
117
+
118
+ .logo {
119
+ display: flex;
120
+ align-items: center;
121
+ justify-content: center;
122
+ gap: var(--spacing-md);
123
+ margin-bottom: var(--spacing-sm);
124
+ }
125
+
126
+ .logo-icon {
127
+ width: 48px;
128
+ height: 48px;
129
+ background: var(--accent-gradient);
130
+ border-radius: var(--radius-lg);
131
+ display: flex;
132
+ align-items: center;
133
+ justify-content: center;
134
+ box-shadow: var(--shadow-glow);
135
+ }
136
+
137
+ .logo-icon svg {
138
+ width: 28px;
139
+ height: 28px;
140
+ color: white;
141
+ }
142
+
143
+ .logo h1 {
144
+ font-size: 2rem;
145
+ font-weight: 700;
146
+ background: var(--accent-gradient);
147
+ -webkit-background-clip: text;
148
+ -webkit-text-fill-color: transparent;
149
+ background-clip: text;
150
+ }
151
+
152
+ .tagline {
153
+ color: var(--text-secondary);
154
+ font-size: 1rem;
155
+ font-weight: 400;
156
+ }
157
+
158
+ /* ================================
159
+ Cards
160
+ ================================ */
161
+
162
+ .card {
163
+ background: var(--bg-card);
164
+ backdrop-filter: blur(20px);
165
+ border: 1px solid var(--border-color);
166
+ border-radius: var(--radius-xl);
167
+ padding: var(--spacing-xl);
168
+ margin-bottom: var(--spacing-lg);
169
+ transition: var(--transition-normal);
170
+ }
171
+
172
+ .card:hover {
173
+ border-color: var(--border-glow);
174
+ box-shadow: var(--shadow-glow);
175
+ }
176
+
177
+ .card-header {
178
+ display: flex;
179
+ align-items: center;
180
+ justify-content: space-between;
181
+ margin-bottom: var(--spacing-lg);
182
+ flex-wrap: wrap;
183
+ gap: var(--spacing-sm);
184
+ }
185
+
186
+ .card-header h2 {
187
+ font-size: 1.25rem;
188
+ font-weight: 600;
189
+ }
190
+
191
+ /* ================================
192
+ Badge
193
+ ================================ */
194
+
195
+ .badge {
196
+ display: inline-block;
197
+ padding: var(--spacing-xs) var(--spacing-sm);
198
+ background: rgba(99, 102, 241, 0.15);
199
+ color: var(--accent-primary);
200
+ border-radius: var(--radius-sm);
201
+ font-size: 0.75rem;
202
+ font-weight: 500;
203
+ text-transform: uppercase;
204
+ letter-spacing: 0.5px;
205
+ }
206
+
207
+ /* ================================
208
+ Upload Zone
209
+ ================================ */
210
+
211
+ .upload-zone {
212
+ border: 2px dashed var(--border-color);
213
+ border-radius: var(--radius-lg);
214
+ padding: var(--spacing-2xl);
215
+ text-align: center;
216
+ cursor: pointer;
217
+ transition: var(--transition-normal);
218
+ margin-bottom: var(--spacing-lg);
219
+ }
220
+
221
+ .upload-zone:hover,
222
+ .upload-zone.dragover {
223
+ border-color: var(--accent-primary);
224
+ background: rgba(99, 102, 241, 0.05);
225
+ }
226
+
227
+ .upload-zone.dragover {
228
+ transform: scale(1.02);
229
+ }
230
+
231
+ .upload-icon {
232
+ width: 64px;
233
+ height: 64px;
234
+ margin: 0 auto var(--spacing-md);
235
+ background: var(--accent-gradient);
236
+ border-radius: 50%;
237
+ display: flex;
238
+ align-items: center;
239
+ justify-content: center;
240
+ opacity: 0.8;
241
+ }
242
+
243
+ .upload-icon svg {
244
+ width: 32px;
245
+ height: 32px;
246
+ color: white;
247
+ }
248
+
249
+ .upload-text {
250
+ font-size: 1.125rem;
251
+ font-weight: 500;
252
+ color: var(--text-primary);
253
+ margin-bottom: var(--spacing-xs);
254
+ }
255
+
256
+ .upload-subtext {
257
+ color: var(--text-muted);
258
+ font-size: 0.875rem;
259
+ }
260
+
261
+ /* ================================
262
+ File Info
263
+ ================================ */
264
+
265
+ .file-info {
266
+ display: flex;
267
+ align-items: center;
268
+ justify-content: space-between;
269
+ padding: var(--spacing-md);
270
+ background: rgba(99, 102, 241, 0.1);
271
+ border-radius: var(--radius-md);
272
+ margin-bottom: var(--spacing-lg);
273
+ }
274
+
275
+ .file-details {
276
+ display: flex;
277
+ flex-direction: column;
278
+ gap: var(--spacing-xs);
279
+ }
280
+
281
+ .file-name {
282
+ font-weight: 500;
283
+ color: var(--text-primary);
284
+ }
285
+
286
+ .file-size {
287
+ font-size: 0.875rem;
288
+ color: var(--text-secondary);
289
+ }
290
+
291
+ /* ================================
292
+ Buttons
293
+ ================================ */
294
+
295
+ .btn {
296
+ display: inline-flex;
297
+ align-items: center;
298
+ justify-content: center;
299
+ gap: var(--spacing-sm);
300
+ padding: var(--spacing-md) var(--spacing-xl);
301
+ border: none;
302
+ border-radius: var(--radius-md);
303
+ font-family: inherit;
304
+ font-size: 1rem;
305
+ font-weight: 500;
306
+ cursor: pointer;
307
+ transition: var(--transition-fast);
308
+ text-decoration: none;
309
+ }
310
+
311
+ .btn:disabled {
312
+ opacity: 0.5;
313
+ cursor: not-allowed;
314
+ }
315
+
316
+ .btn svg {
317
+ width: 20px;
318
+ height: 20px;
319
+ }
320
+
321
+ .btn-primary {
322
+ width: 100%;
323
+ background: var(--accent-gradient);
324
+ color: white;
325
+ box-shadow: var(--shadow-md);
326
+ }
327
+
328
+ .btn-primary:hover:not(:disabled) {
329
+ transform: translateY(-2px);
330
+ box-shadow: var(--shadow-lg), var(--shadow-glow);
331
+ }
332
+
333
+ .btn-primary:active:not(:disabled) {
334
+ transform: translateY(0);
335
+ }
336
+
337
+ .btn-secondary {
338
+ background: var(--bg-card);
339
+ color: var(--text-primary);
340
+ border: 1px solid var(--border-color);
341
+ }
342
+
343
+ .btn-secondary:hover:not(:disabled) {
344
+ background: var(--bg-card-hover);
345
+ border-color: var(--accent-primary);
346
+ }
347
+
348
+ .btn-outline {
349
+ background: transparent;
350
+ color: var(--text-primary);
351
+ border: 1px solid var(--border-color);
352
+ padding: var(--spacing-sm) var(--spacing-md);
353
+ }
354
+
355
+ .btn-outline:hover {
356
+ background: var(--bg-card);
357
+ border-color: var(--accent-primary);
358
+ }
359
+
360
+ .btn-clear {
361
+ width: 36px;
362
+ height: 36px;
363
+ padding: 0;
364
+ background: transparent;
365
+ color: var(--text-muted);
366
+ }
367
+
368
+ .btn-clear:hover {
369
+ color: var(--error);
370
+ }
371
+
372
+ /* ================================
373
+ Processing Section
374
+ ================================ */
375
+
376
+ .processing-content {
377
+ text-align: center;
378
+ padding: var(--spacing-xl) 0;
379
+ }
380
+
381
+ .spinner {
382
+ width: 56px;
383
+ height: 56px;
384
+ margin: 0 auto var(--spacing-lg);
385
+ border: 3px solid var(--border-color);
386
+ border-top-color: var(--accent-primary);
387
+ border-radius: 50%;
388
+ animation: spin 1s linear infinite;
389
+ }
390
+
391
+ @keyframes spin {
392
+ to {
393
+ transform: rotate(360deg);
394
+ }
395
+ }
396
+
397
+ .processing-content h3 {
398
+ font-size: 1.25rem;
399
+ margin-bottom: var(--spacing-sm);
400
+ }
401
+
402
+ .processing-content p {
403
+ color: var(--text-secondary);
404
+ margin-bottom: var(--spacing-lg);
405
+ }
406
+
407
+ .progress-bar {
408
+ height: 6px;
409
+ background: var(--bg-secondary);
410
+ border-radius: var(--radius-sm);
411
+ overflow: hidden;
412
+ margin-bottom: var(--spacing-md);
413
+ }
414
+
415
+ .progress-fill {
416
+ height: 100%;
417
+ width: 0%;
418
+ background: var(--accent-gradient);
419
+ border-radius: var(--radius-sm);
420
+ transition: width 0.3s ease;
421
+ animation: pulse 2s ease-in-out infinite;
422
+ }
423
+
424
+ @keyframes pulse {
425
+
426
+ 0%,
427
+ 100% {
428
+ opacity: 1;
429
+ }
430
+
431
+ 50% {
432
+ opacity: 0.7;
433
+ }
434
+ }
435
+
436
+ .processing-hint {
437
+ font-size: 0.875rem;
438
+ color: var(--text-muted);
439
+ }
440
+
441
+ .timer-display {
442
+ font-size: 2rem;
443
+ font-weight: 700;
444
+ color: var(--accent-primary);
445
+ margin: var(--spacing-md) 0;
446
+ font-family: monospace;
447
+ text-shadow: 0 0 10px rgba(99, 102, 241, 0.3);
448
+ }
449
+
450
+ /* ================================
451
+ Results Section
452
+ ================================ */
453
+
454
+ .result-meta {
455
+ display: flex;
456
+ gap: var(--spacing-sm);
457
+ flex-wrap: wrap;
458
+ }
459
+
460
+ .download-buttons {
461
+ display: flex;
462
+ gap: var(--spacing-md);
463
+ margin-bottom: var(--spacing-lg);
464
+ flex-wrap: wrap;
465
+ }
466
+
467
+ .transcript-container {
468
+ max-height: 400px;
469
+ overflow-y: auto;
470
+ padding-right: var(--spacing-sm);
471
+ margin-bottom: var(--spacing-lg);
472
+ }
473
+
474
+ .transcript-container::-webkit-scrollbar {
475
+ width: 6px;
476
+ }
477
+
478
+ .transcript-container::-webkit-scrollbar-track {
479
+ background: var(--bg-secondary);
480
+ border-radius: var(--radius-sm);
481
+ }
482
+
483
+ .transcript-container::-webkit-scrollbar-thumb {
484
+ background: var(--border-color);
485
+ border-radius: var(--radius-sm);
486
+ }
487
+
488
+ .transcript-container::-webkit-scrollbar-thumb:hover {
489
+ background: var(--text-muted);
490
+ }
491
+
492
+ /* Transcript Segment */
493
+ .segment {
494
+ padding: var(--spacing-md);
495
+ border-radius: var(--radius-md);
496
+ margin-bottom: var(--spacing-sm);
497
+ background: var(--bg-secondary);
498
+ border-left: 3px solid var(--accent-primary);
499
+ transition: var(--transition-fast);
500
+ }
501
+
502
+ .segment:hover {
503
+ background: var(--bg-card-hover);
504
+ }
505
+
506
+ .segment-header {
507
+ display: flex;
508
+ align-items: center;
509
+ gap: var(--spacing-md);
510
+ margin-bottom: var(--spacing-xs);
511
+ flex-wrap: wrap;
512
+ }
513
+
514
+ .segment-speaker {
515
+ font-weight: 600;
516
+ color: var(--accent-primary);
517
+ }
518
+
519
+ .segment-time {
520
+ font-size: 0.75rem;
521
+ color: var(--text-muted);
522
+ font-family: monospace;
523
+ }
524
+
525
+ .segment-text {
526
+ color: var(--text-primary);
527
+ line-height: 1.7;
528
+ }
529
+
530
+ /* Speaker Colors */
531
+ .speaker-1 {
532
+ border-left-color: #6366f1;
533
+ }
534
+
535
+ .speaker-1 .segment-speaker {
536
+ color: #6366f1;
537
+ }
538
+
539
+ .speaker-2 {
540
+ border-left-color: #10b981;
541
+ }
542
+
543
+ .speaker-2 .segment-speaker {
544
+ color: #10b981;
545
+ }
546
+
547
+ .speaker-3 {
548
+ border-left-color: #f59e0b;
549
+ }
550
+
551
+ .speaker-3 .segment-speaker {
552
+ color: #f59e0b;
553
+ }
554
+
555
+ .speaker-4 {
556
+ border-left-color: #ec4899;
557
+ }
558
+
559
+ .speaker-4 .segment-speaker {
560
+ color: #ec4899;
561
+ }
562
+
563
+ .speaker-5 {
564
+ border-left-color: #8b5cf6;
565
+ }
566
+
567
+ .speaker-5 .segment-speaker {
568
+ color: #8b5cf6;
569
+ }
570
+
571
+ /* ================================
572
+ Error Section
573
+ ================================ */
574
+
575
+ .error-content {
576
+ text-align: center;
577
+ padding: var(--spacing-xl) 0;
578
+ }
579
+
580
+ .error-icon {
581
+ width: 64px;
582
+ height: 64px;
583
+ margin: 0 auto var(--spacing-lg);
584
+ background: rgba(239, 68, 68, 0.15);
585
+ border-radius: 50%;
586
+ display: flex;
587
+ align-items: center;
588
+ justify-content: center;
589
+ }
590
+
591
+ .error-icon svg {
592
+ width: 32px;
593
+ height: 32px;
594
+ color: var(--error);
595
+ }
596
+
597
+ .error-content h3 {
598
+ color: var(--error);
599
+ margin-bottom: var(--spacing-sm);
600
+ }
601
+
602
+ .error-content p {
603
+ color: var(--text-secondary);
604
+ margin-bottom: var(--spacing-lg);
605
+ }
606
+
607
+ /* ================================
608
+ Footer
609
+ ================================ */
610
+
611
+ .footer {
612
+ margin-top: auto;
613
+ padding: var(--spacing-xl) 0;
614
+ text-align: center;
615
+ color: var(--text-muted);
616
+ font-size: 0.875rem;
617
+ }
618
+
619
+ .footer strong {
620
+ color: var(--text-secondary);
621
+ }
622
+
623
+ .footer-note {
624
+ margin-top: var(--spacing-xs);
625
+ font-size: 0.75rem;
626
+ }
627
+
628
+ /* ================================
629
+ Utility Classes
630
+ ================================ */
631
+
632
+ .hidden {
633
+ display: none !important;
634
+ }
635
+
636
+ /* ================================
637
+ Responsive
638
+ ================================ */
639
+
640
+ @media (max-width: 640px) {
641
+ :root {
642
+ font-size: 14px;
643
+ }
644
+
645
+ .app-container {
646
+ padding: var(--spacing-md);
647
+ }
648
+
649
+ .card {
650
+ padding: var(--spacing-lg);
651
+ }
652
+
653
+ .upload-zone {
654
+ padding: var(--spacing-xl);
655
+ }
656
+
657
+ .card-header {
658
+ flex-direction: column;
659
+ align-items: flex-start;
660
+ }
661
+
662
+ .result-meta {
663
+ width: 100%;
664
+ }
665
+
666
+ .download-buttons {
667
+ flex-direction: column;
668
+ }
669
+
670
+ .download-buttons .btn {
671
+ width: 100%;
672
+ }
673
+ }
app/static/js/app.js ADDED
@@ -0,0 +1,312 @@
1
+ /**
2
+ * PrecisionVoice - Frontend Application Logic
3
+ * Handles file upload, transcription requests, and result display.
4
+ */
5
+
6
+ document.addEventListener('DOMContentLoaded', () => {
7
+ // DOM Elements
8
+ const elements = {
9
+ // Upload
10
+ dropZone: document.getElementById('drop-zone'),
11
+ fileInput: document.getElementById('file-input'),
12
+ fileInfo: document.getElementById('file-info'),
13
+ fileName: document.getElementById('file-name'),
14
+ fileSize: document.getElementById('file-size'),
15
+ clearBtn: document.getElementById('clear-btn'),
16
+ transcribeBtn: document.getElementById('transcribe-btn'),
17
+
18
+ // Sections
19
+ uploadSection: document.getElementById('upload-section'),
20
+ processingSection: document.getElementById('processing-section'),
21
+ resultsSection: document.getElementById('results-section'),
22
+ errorSection: document.getElementById('error-section'),
23
+
24
+ // Processing
25
+ processingStatus: document.getElementById('processing-status'),
26
+ progressFill: document.getElementById('progress-fill'),
27
+ processingTimer: document.getElementById('processing-timer'),
28
+
29
+ // Results
30
+ speakerCount: document.getElementById('speaker-count'),
31
+ durationInfo: document.getElementById('duration-info'),
32
+ processingTime: document.getElementById('processing-time'),
33
+ transcriptContainer: document.getElementById('transcript-container'),
34
+ downloadTxt: document.getElementById('download-txt'),
35
+ downloadSrt: document.getElementById('download-srt'),
36
+ newUploadBtn: document.getElementById('new-upload-btn'),
37
+
38
+ // Error
39
+ errorMessage: document.getElementById('error-message'),
40
+ retryBtn: document.getElementById('retry-btn')
41
+ };
42
+
43
+ let selectedFile = null;
44
+
45
+ // =====================
46
+ // Event Listeners
47
+ // =====================
48
+
49
+ // Click to upload
50
+ elements.dropZone.addEventListener('click', () => {
51
+ elements.fileInput.click();
52
+ });
53
+
54
+ // File input change
55
+ elements.fileInput.addEventListener('change', (e) => {
56
+ if (e.target.files.length > 0) {
57
+ handleFileSelection(e.target.files[0]);
58
+ }
59
+ });
60
+
61
+ // Drag and drop
62
+ elements.dropZone.addEventListener('dragover', (e) => {
63
+ e.preventDefault();
64
+ elements.dropZone.classList.add('dragover');
65
+ });
66
+
67
+ elements.dropZone.addEventListener('dragleave', () => {
68
+ elements.dropZone.classList.remove('dragover');
69
+ });
70
+
71
+ elements.dropZone.addEventListener('drop', (e) => {
72
+ e.preventDefault();
73
+ elements.dropZone.classList.remove('dragover');
74
+
75
+ if (e.dataTransfer.files.length > 0) {
76
+ handleFileSelection(e.dataTransfer.files[0]);
77
+ }
78
+ });
79
+
80
+ // Clear file
81
+ elements.clearBtn.addEventListener('click', (e) => {
82
+ e.stopPropagation();
83
+ clearFileSelection();
84
+ });
85
+
86
+ // Transcribe button
87
+ elements.transcribeBtn.addEventListener('click', () => {
88
+ if (selectedFile) {
89
+ startTranscription();
90
+ }
91
+ });
92
+
93
+ // New upload button
94
+ elements.newUploadBtn.addEventListener('click', resetToUpload);
95
+
96
+ // Retry button
97
+ elements.retryBtn.addEventListener('click', resetToUpload);
98
+
99
+ // =====================
100
+ // File Handling
101
+ // =====================
102
+
103
+ function handleFileSelection(file) {
104
+ const allowedTypes = ['audio/mpeg', 'audio/wav', 'audio/x-wav', 'audio/mp4', 'audio/x-m4a',
105
+ 'audio/ogg', 'audio/flac', 'audio/webm', 'video/webm'];
106
+ const allowedExtensions = ['mp3', 'wav', 'm4a', 'ogg', 'flac', 'webm'];
107
+
108
+ // Check file extension
109
+ const ext = file.name.split('.').pop().toLowerCase();
110
+ if (!allowedExtensions.includes(ext)) {
111
+ showError(`Unsupported file type: .${ext}. Supported: ${allowedExtensions.join(', ')}`);
112
+ return;
113
+ }
114
+
115
+ // Check file size (100MB limit)
116
+ const maxSize = 100 * 1024 * 1024;
117
+ if (file.size > maxSize) {
118
+ showError(`File too large. Maximum size: 100MB`);
119
+ return;
120
+ }
121
+
122
+ selectedFile = file;
123
+
124
+ // Update UI
125
+ elements.fileName.textContent = file.name;
126
+ elements.fileSize.textContent = formatFileSize(file.size);
127
+ elements.fileInfo.classList.remove('hidden');
128
+ elements.transcribeBtn.disabled = false;
129
+
130
+ // Hide the drop zone while a file is selected
131
+ elements.dropZone.style.display = 'none';
132
+ }
133
+
134
+ function clearFileSelection() {
135
+ selectedFile = null;
136
+ elements.fileInput.value = '';
137
+ elements.fileInfo.classList.add('hidden');
138
+ elements.transcribeBtn.disabled = true;
139
+ elements.dropZone.style.display = 'block';
140
+ }
141
+
142
+ function formatFileSize(bytes) {
143
+ if (bytes === 0) return '0 Bytes';
144
+ const k = 1024;
145
+ const sizes = ['Bytes', 'KB', 'MB', 'GB'];
146
+ const i = Math.floor(Math.log(bytes) / Math.log(k));
147
+ return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
148
+ }
149
+
150
+ // =====================
151
+ // Transcription
152
+ // =====================
153
+
154
+ async function startTranscription() {
155
+ if (!selectedFile) return;
156
+
157
+ // Show processing UI
158
+ showSection('processing');
159
+ updateProgress(100, 'Processing audio... (Check server logs for details)');
160
+
161
+ // Reset and start timer
162
+ let seconds = 0;
163
+ elements.processingTimer.textContent = '00:00';
164
+ const timerInterval = setInterval(() => {
165
+ seconds++;
166
+ const m = Math.floor(seconds / 60);
167
+ const s = seconds % 60;
168
+ elements.processingTimer.textContent = `${m.toString().padStart(2, '0')}:${s.toString().padStart(2, '0')}`;
169
+ }, 1000);
170
+
171
+ try {
172
+ const formData = new FormData();
173
+ formData.append('file', selectedFile);
174
+
175
+ const response = await fetch('/api/transcribe', {
176
+ method: 'POST',
177
+ body: formData
178
+ });
179
+
180
+ clearInterval(timerInterval);
181
+
182
+ if (!response.ok) {
183
+ const errorData = await response.json();
184
+ throw new Error(errorData.detail || 'Processing failed');
185
+ }
186
+
187
+ const result = await response.json();
188
+ displayResults(result);
189
+
190
+ } catch (error) {
191
+ clearInterval(timerInterval);
192
+ console.error('Processing error:', error);
193
+ showError(error.message || 'An error occurred during processing');
194
+ }
195
+ }
196
+
197
+ function updateProgress(percent, status) {
198
+ elements.progressFill.style.width = `${percent}%`;
199
+ if (status) {
200
+ elements.processingStatus.textContent = status;
201
+ }
202
+ }
203
+
204
+ // =====================
205
+ // Results Display
206
+ // =====================
207
+
208
+ function displayResults(result) {
209
+ // Update metadata
210
+ elements.speakerCount.textContent = `${result.num_speakers} speaker${result.num_speakers !== 1 ? 's' : ''}`;
211
+ elements.durationInfo.textContent = formatDuration(result.duration);
212
+ elements.processingTime.textContent = `${result.processing_time}s`;
213
+
214
+ // Set download links
215
+ elements.downloadTxt.href = result.download_txt;
216
+ elements.downloadSrt.href = result.download_srt;
217
+
218
+ // Render transcript segments
219
+ renderTranscript(result.segments);
220
+
221
+ // Show results section
222
+ showSection('results');
223
+ }
224
+
225
+ function renderTranscript(segments) {
226
+ elements.transcriptContainer.innerHTML = '';
227
+
228
+ const speakerColors = {};
229
+ let colorIndex = 0;
230
+
231
+ segments.forEach((segment) => {
232
+ // Assign color to speaker
233
+ if (!(segment.speaker in speakerColors)) {
234
+ colorIndex++;
235
+ speakerColors[segment.speaker] = `speaker-${Math.min(colorIndex, 5)}`;
236
+ }
237
+
238
+ const segmentEl = document.createElement('div');
239
+ segmentEl.className = `segment ${speakerColors[segment.speaker]}`;
240
+
241
+ segmentEl.innerHTML = `
242
+ <div class="segment-header">
243
+ <span class="segment-speaker">${escapeHtml(segment.speaker)}</span>
244
+ <span class="segment-time">${formatTime(segment.start)} - ${formatTime(segment.end)}</span>
245
+ </div>
246
+ <p class="segment-text">${escapeHtml(segment.text)}</p>
247
+ `;
248
+
249
+ elements.transcriptContainer.appendChild(segmentEl);
250
+ });
251
+ }
252
+
253
+ function formatTime(seconds) {
254
+ const h = Math.floor(seconds / 3600);
255
+ const m = Math.floor((seconds % 3600) / 60);
256
+ const s = Math.floor(seconds % 60);
257
+
258
+ if (h > 0) {
259
+ return `${h}:${m.toString().padStart(2, '0')}:${s.toString().padStart(2, '0')}`;
260
+ }
261
+ return `${m}:${s.toString().padStart(2, '0')}`;
262
+ }
263
+
264
+ function formatDuration(seconds) {
265
+ const m = Math.floor(seconds / 60);
266
+ const s = Math.floor(seconds % 60);
267
+ return `${m}:${s.toString().padStart(2, '0')}`;
268
+ }
269
+
270
+ function escapeHtml(text) {
271
+ const div = document.createElement('div');
272
+ div.textContent = text;
273
+ return div.innerHTML;
274
+ }
275
+
276
+ // =====================
277
+ // UI State Management
278
+ // =====================
279
+
280
+ function showSection(section) {
281
+ elements.uploadSection.classList.add('hidden');
282
+ elements.processingSection.classList.add('hidden');
283
+ elements.resultsSection.classList.add('hidden');
284
+ elements.errorSection.classList.add('hidden');
285
+
286
+ switch (section) {
287
+ case 'upload':
288
+ elements.uploadSection.classList.remove('hidden');
289
+ break;
290
+ case 'processing':
291
+ elements.processingSection.classList.remove('hidden');
292
+ break;
293
+ case 'results':
294
+ elements.resultsSection.classList.remove('hidden');
295
+ break;
296
+ case 'error':
297
+ elements.errorSection.classList.remove('hidden');
298
+ break;
299
+ }
300
+ }
301
+
302
+ function showError(message) {
303
+ elements.errorMessage.textContent = message;
304
+ showSection('error');
305
+ }
306
+
307
+ function resetToUpload() {
308
+ clearFileSelection();
309
+ showSection('upload');
310
+ updateProgress(0, 'Uploading file...');
311
+ }
312
+ });
app/templates/index.html ADDED
@@ -0,0 +1,162 @@
1
+ <!DOCTYPE html>
2
+ <html lang="vi">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <meta name="description" content="PrecisionVoice - Speech-to-Text and Speaker Diarization powered by AI">
8
+ <title>PrecisionVoice | AI Speech Transcription</title>
9
+ <link rel="preconnect" href="https://fonts.googleapis.com">
10
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
11
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
12
+ <link rel="stylesheet" href="/static/css/style.css">
13
+ </head>
14
+
15
+ <body>
16
+ <div class="app-container">
17
+ <!-- Header -->
18
+ <header class="header">
19
+ <div class="logo">
20
+ <div class="logo-icon">
21
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
22
+ <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z" />
23
+ <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
24
+ <line x1="12" y1="19" x2="12" y2="23" />
25
+ <line x1="8" y1="23" x2="16" y2="23" />
26
+ </svg>
27
+ </div>
28
+ <h1>PrecisionVoice</h1>
29
+ </div>
30
+ <p class="tagline">AI-Powered Speech Transcription with Speaker Detection</p>
31
+ </header>
32
+
33
+ <!-- Main Content -->
34
+ <main class="main-content">
35
+ <!-- Upload Section -->
36
+ <section id="upload-section" class="card upload-card">
37
+ <div class="card-header">
38
+ <h2>Upload Audio</h2>
39
+ <span class="badge">Supported: {{ allowed_formats }}</span>
40
+ </div>
41
+
42
+ <div class="upload-zone" id="drop-zone">
43
+ <div class="upload-icon">
44
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
45
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
46
+ <polyline points="17 8 12 3 7 8" />
47
+ <line x1="12" y1="3" x2="12" y2="15" />
48
+ </svg>
49
+ </div>
50
+ <p class="upload-text">Drag & drop audio file here</p>
51
+ <p class="upload-subtext">or click to browse</p>
52
+ <input type="file" id="file-input" accept=".mp3,.wav,.m4a,.ogg,.flac,.webm" hidden>
53
+ </div>
54
+
55
+ <div id="file-info" class="file-info hidden">
56
+ <div class="file-details">
57
+ <span class="file-name" id="file-name">audio.mp3</span>
58
+ <span class="file-size" id="file-size">0 MB</span>
59
+ </div>
60
+ <button class="btn btn-clear" id="clear-btn" title="Remove file">
61
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
62
+ <line x1="18" y1="6" x2="6" y2="18" />
63
+ <line x1="6" y1="6" x2="18" y2="18" />
64
+ </svg>
65
+ </button>
66
+ </div>
67
+
68
+ <button class="btn btn-primary" id="transcribe-btn" disabled>
69
+ <span class="btn-text">Transcribe</span>
70
+ <span class="btn-icon">
71
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
72
+ <polygon points="5 3 19 12 5 21 5 3" />
73
+ </svg>
74
+ </span>
75
+ </button>
76
+ </section>
77
+
78
+ <!-- Processing Section -->
79
+ <section id="processing-section" class="card processing-card hidden">
80
+ <div class="processing-content">
81
+ <div class="spinner"></div>
82
+ <h3>Processing Audio</h3>
83
+ <p id="processing-status">Uploading file...</p>
84
+ <div class="progress-bar">
85
+ <div class="progress-fill" id="progress-fill"></div>
86
+ </div>
87
+ <div class="timer-display" id="processing-timer">00:00</div>
88
+ <p class="processing-hint">This may take a few minutes depending on audio length</p>
89
+ </div>
90
+ </section>
91
+
92
+ <!-- Results Section -->
93
+ <section id="results-section" class="card results-card hidden">
94
+ <div class="card-header">
95
+ <h2>Transcription Results</h2>
96
+ <div class="result-meta">
97
+ <span id="speaker-count" class="badge">0 speakers</span>
98
+ <span id="duration-info" class="badge">0:00</span>
99
+ <span id="processing-time" class="badge">0.0s</span>
100
+ </div>
101
+ </div>
102
+
103
+ <div class="download-buttons">
104
+ <a href="#" id="download-txt" class="btn btn-outline" download>
105
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
106
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
107
+ <polyline points="7 10 12 15 17 10" />
108
+ <line x1="12" y1="15" x2="12" y2="3" />
109
+ </svg>
110
+ Download TXT
111
+ </a>
112
+ <a href="#" id="download-srt" class="btn btn-outline" download>
113
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
114
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
115
+ <polyline points="7 10 12 15 17 10" />
116
+ <line x1="12" y1="15" x2="12" y2="3" />
117
+ </svg>
118
+ Download SRT
119
+ </a>
120
+ </div>
121
+
122
+ <div class="transcript-container" id="transcript-container">
123
+ <!-- Transcript segments will be rendered here -->
124
+ </div>
125
+
126
+ <button class="btn btn-secondary" id="new-upload-btn">
127
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
128
+ <polyline points="1 4 1 10 7 10" />
129
+ <path d="M3.51 15a9 9 0 1 0 2.13-9.36L1 10" />
130
+ </svg>
131
+ New Transcription
132
+ </button>
133
+ </section>
134
+
135
+ <!-- Error Section -->
136
+ <section id="error-section" class="card error-card hidden">
137
+ <div class="error-content">
138
+ <div class="error-icon">
139
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
140
+ <circle cx="12" cy="12" r="10" />
141
+ <line x1="15" y1="9" x2="9" y2="15" />
142
+ <line x1="9" y1="9" x2="15" y2="15" />
143
+ </svg>
144
+ </div>
145
+ <h3>Error</h3>
146
+ <p id="error-message">An error occurred during processing.</p>
147
+ <button class="btn btn-secondary" id="retry-btn">Try Again</button>
148
+ </div>
149
+ </section>
150
+ </main>
151
+
152
+ <!-- Footer -->
153
+ <footer class="footer">
154
+ <p>Powered by <strong>faster-whisper</strong> & <strong>pyannote.audio</strong></p>
155
+ <p class="footer-note">Max file size: {{ max_upload_mb }}MB</p>
156
+ </footer>
157
+ </div>
158
+
159
+ <script src="/static/js/app.js"></script>
160
+ </body>
161
+
162
+ </html>
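The template reads allowed_formats and max_upload_mb from its render context; a hedged sketch of an index route that could supply them (everything beyond those two placeholder names is an assumption about the actual route):

from fastapi import FastAPI, Request
from fastapi.templating import Jinja2Templates

app = FastAPI()
templates = Jinja2Templates(directory="app/templates")

@app.get("/")
async def index(request: Request):
    # The two context keys match the {{ ... }} placeholders in index.html.
    return templates.TemplateResponse(
        "index.html",
        {
            "request": request,
            "allowed_formats": "mp3, wav, m4a, ogg, flac, webm",
            "max_upload_mb": 100,
        },
    )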
data/processed/.gitkeep ADDED
File without changes
data/uploads/.gitkeep ADDED
File without changes
docker-compose.yml ADDED
@@ -0,0 +1,60 @@
1
+ services:
2
+ app:
3
+ build:
4
+ context: .
5
+ dockerfile: Dockerfile
6
+ args:
7
+ - PORT=${PORT:-7860}
8
+ container_name: precisionvoice
9
+ ports:
10
+ - "${PORT:-7860}:${PORT:-7860}"
11
+ volumes:
12
+ # Persist uploaded/processed files
13
+ - ./data:/app/data
14
+ # Cache models to avoid re-downloading
15
+ - model_cache_hf:/root/.cache/huggingface
16
+ - model_cache_torch:/root/.cache/torch
17
+ - model_cache_mdx:/root/.audio-separator-models
18
+ environment:
19
+ # HuggingFace token (required for pyannote.audio)
20
+ - HF_TOKEN=${HF_TOKEN:-}
21
+ # Model settings
22
+ - WHISPER_MODEL=${WHISPER_MODEL:-kiendt/PhoWhisper-large-ct2}
23
+ - DIARIZATION_MODEL=${DIARIZATION_MODEL:-pyannote/speaker-diarization-3.1}
24
+ # Device (auto, cuda, cpu)
25
+ - DEVICE=${DEVICE:-auto}
26
+ # Denoising (Speech Enhancement)
27
+ - ENABLE_DENOISER=${ENABLE_DENOISER:-True}
28
+ - DENOISER_MODEL=${DENOISER_MODEL:-dns64}
29
+ # MDX-Net Vocal Separation
30
+ - ENABLE_VOCAL_SEPARATION=${ENABLE_VOCAL_SEPARATION:-True}
31
+ - MDX_MODEL=${MDX_MODEL:-UVR-MDX-NET-Voc_FT}
32
+ # Upload settings
33
+ - MAX_UPLOAD_SIZE_MB=${MAX_UPLOAD_SIZE_MB:-100}
34
+ # Optimization settings
35
+ - ENABLE_LOUDNORM=${ENABLE_LOUDNORM:-True}
36
+ - ENABLE_NOISE_REDUCTION=${ENABLE_NOISE_REDUCTION:-True}
37
+ # VAD settings
38
+ - VAD_THRESHOLD=${VAD_THRESHOLD:-0.5}
39
+ - VAD_MIN_SPEECH_DURATION_MS=${VAD_MIN_SPEECH_DURATION_MS:-250}
40
+ - VAD_MIN_SILENCE_DURATION_MS=${VAD_MIN_SILENCE_DURATION_MS:-500}
41
+ # Clustering settings
42
+ - MERGE_THRESHOLD_S=${MERGE_THRESHOLD_S:-0.5}
43
+ - MIN_SEGMENT_DURATION_S=${MIN_SEGMENT_DURATION_S:-0.3}
44
+ restart: unless-stopped
45
+ # GPU support (uncomment for NVIDIA GPU)
46
+ # deploy:
47
+ # resources:
48
+ # reservations:
49
+ # devices:
50
+ # - driver: nvidia
51
+ # count: all
52
+ # capabilities: [gpu]
53
+
54
+ volumes:
55
+ model_cache_hf:
56
+ name: precisionvoice_hf_cache
57
+ model_cache_torch:
58
+ name: precisionvoice_torch_cache
59
+ model_cache_mdx:
60
+ name: precisionvoice_mdx_cache
docker/.gitkeep ADDED
File without changes
precision_voice_colab.ipynb ADDED
@@ -0,0 +1,413 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# PrecisionVoice - Google Colab Runner\n",
8
+ "\n",
9
+ "This notebook allows you to run the [PrecisionVoice](https://github.com/thichuong/PrecisionVoice) application directly in Google Colab.\n",
10
+ "\n",
11
+ "### Instructions\n",
12
+ "1. **Runtime Change**: Go to `Runtime` -> `Change runtime type` and make sure **T4 GPU** (or better) is selected.\n",
13
+ "2. **Run All**: You can select `Runtime` -> `Run all` or run each cell step-by-step.\n",
14
+ "3. **Public URL**: Look for the `ngrok` public URL in the final cell output to access the web interface."
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "id": "5efa55f1",
21
+ "metadata": {},
22
+ "outputs": [
23
+ {
24
+ "name": "stdout",
25
+ "output_type": "stream",
26
+ "text": [
27
+ "GPU Detected: Tesla T4\n"
28
+ ]
29
+ }
30
+ ],
31
+ "source": [
32
+ "# @title 1. Check GPU Availability\n",
33
+ "import torch\n",
34
+ "\n",
35
+ "if torch.cuda.is_available():\n",
36
+ " print(f\"GPU Detected: {torch.cuda.get_device_name(0)}\")\n",
37
+ "else:\n",
38
+ " print(\"WARNING: No GPU detected. This application requires a GPU to run efficiently.\")\n",
39
+ " print(\"Please go to Runtime -> Change runtime type -> Hardware accelerator -> T4 GPU\")"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "id": "b068e8ac",
46
+ "metadata": {},
47
+ "outputs": [
48
+ {
49
+ "name": "stdout",
50
+ "output_type": "stream",
51
+ "text": [
52
+ "Cloning into 'PrecisionVoice'...\n",
53
+ "remote: Enumerating objects: 94, done.\u001b[K\n",
54
+ "remote: Counting objects: 100% (94/94), done.\u001b[K\n",
55
+ "remote: Compressing objects: 100% (51/51), done.\u001b[K\n",
56
+ "remote: Total 94 (delta 34), reused 88 (delta 28), pack-reused 0 (from 0)\u001b[K\n",
57
+ "Receiving objects: 100% (94/94), 35.72 KiB | 5.10 MiB/s, done.\n",
58
+ "Resolving deltas: 100% (34/34), done.\n",
59
+ "/content/PrecisionVoice/PrecisionVoice/PrecisionVoice\n",
60
+ "Repository cloned successfully.\n"
61
+ ]
62
+ }
63
+ ],
64
+ "source": [
65
+ "# @title 2. Clone Repository\n",
66
+ "import os\n",
67
+ "\n",
68
+ "# Clean up previous run if exists\n",
69
+ "if os.path.exists(\"PrecisionVoice\"):\n",
70
+ " %cd /content\n",
71
+ " !rm -rf PrecisionVoice\n",
72
+ "\n",
73
+ "!git clone https://github.com/thichuong/PrecisionVoice.git\n",
74
+ "%cd PrecisionVoice\n",
75
+ "print(\"Repository cloned successfully.\")"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "id": "42afe30f",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "name": "stdout",
86
+ "output_type": "stream",
87
+ "text": [
88
+ "Installing system dependencies... (This may take a moment)\n",
89
+ "✅ System dependencies (ffmpeg, libsndfile1) installed successfully.\n"
90
+ ]
91
+ }
92
+ ],
93
+ "source": [
94
+ "# @title 3. Install System Dependencies\n",
95
+ "import subprocess\n",
96
+ "\n",
97
+ "# Installing dependencies defined in Dockerfile (ffmpeg, libsndfile)\n",
98
+ "print(\"Installing system dependencies... (This may take a moment)\")\n",
99
+ "\n",
100
+ "# Update and install (suppressing harmless R-repo warnings common in Colab)\n",
101
+ "!apt-get update -y > /dev/null 2>&1\n",
102
+ "!apt-get install -y ffmpeg libsndfile1 > /dev/null 2>&1\n",
103
+ "\n",
104
+ "# Verify installation\n",
105
+ "try:\n",
106
+ " subprocess.run([\"ffmpeg\", \"-version\"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)\n",
107
+ " print(\"✅ System dependencies (ffmpeg, libsndfile1) installed successfully.\")\n",
108
+ "except Exception as e:\n",
109
+ " print(\"❌ Warning: Potential installation issue. If the app fails, try running '!apt-get install -y ffmpeg' manually.\")"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": null,
115
+ "id": "4ec3974f",
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "# @title 4. Install Python Dependencies\n",
120
+ "# Force upgrade torch, torchvision, torchaudio to ensure compatibility\n",
121
+ "!pip install -U torch torchvision torchaudio\n",
122
+ "\n",
123
+ "!pip install -r requirements.txt\n",
124
+ "# Install pyngrok to expose the local server to the internet\n",
125
+ "!pip install pyngrok\n",
126
+ "print(\"Python dependencies installed.\")"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": null,
132
+ "id": "1d5b721b",
133
+ "metadata": {},
134
+ "outputs": [
135
+ {
136
+ "name": "stdout",
137
+ "output_type": "stream",
138
+ "text": [
139
+ ".env file created with default settings.\n"
140
+ ]
141
+ }
142
+ ],
143
+ "source": [
144
+ "# @title 5. Setup Environment (.env)\n",
145
+ "# Creating a default .env file. You can modify this cell to add your specific keys.\n",
146
+ "\n",
147
+ "env_content = \"\"\"\n",
148
+ "PORT=7860\n",
149
+ "LOG_LEVEL=INFO\n",
150
+ "\n",
151
+ "# Audio Processing\n",
152
+ "NOISE_REDUCTION_LEVEL=5.0\n",
153
+ "VAD_THRESHOLD=0.5\n",
154
+ "VAD_MIN_SPEECH_DURATION_MS=250\n",
155
+ "VAD_MIN_SILENCE_DURATION_MS=500\n",
156
+ "MERGE_THRESHOLD_S=1.5\n",
157
+ "\"\"\"\n",
158
+ "\n",
159
+ "with open(\".env\", \"w\") as f:\n",
160
+ " f.write(env_content)\n",
161
+ "\n",
162
+ "print(\".env file created with default settings.\")"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": null,
168
+ "id": "9afa4d11",
169
+ "metadata": {},
170
+ "outputs": [
171
+ {
172
+ "name": "stdout",
173
+ "output_type": "stream",
174
+ "text": [
175
+ "Cleaning up previous sessions...\n"
176
+ ]
177
+ },
178
+ {
179
+ "data": {
180
+ "application/javascript": "(async (port, path, width, height, cache, element) => {\n if (!google.colab.kernel.accessAllowed && !cache) {\n return;\n }\n element.appendChild(document.createTextNode(''));\n const url = await google.colab.kernel.proxyPort(port, {cache});\n const iframe = document.createElement('iframe');\n iframe.src = new URL(path, url).toString();\n iframe.height = height;\n iframe.width = width;\n iframe.style.border = 0;\n iframe.allow = [\n 'accelerometer',\n 'autoplay',\n 'camera',\n 'clipboard-read',\n 'clipboard-write',\n 'gyroscope',\n 'magnetometer',\n 'microphone',\n 'serial',\n 'usb',\n 'xr-spatial-tracking',\n ].join('; ');\n element.appendChild(iframe);\n })(8000, \"/\", \"100%\", 900, false, window.element)",
181
+ "text/plain": [
182
+ "<IPython.core.display.Javascript object>"
183
+ ]
184
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[31mWarning: This function may stop working due to changes in browser security.\n",
+ "Try `serve_kernel_port_as_iframe` instead. \u001b[0m\n"
+ ]
+ },
+ {
+ "data": {
+ "application/javascript": "(async (port, path, text, element) => {\n if (!google.colab.kernel.accessAllowed) {\n return;\n }\n element.appendChild(document.createTextNode(''));\n const url = await google.colab.kernel.proxyPort(port);\n const anchor = document.createElement('a');\n anchor.href = new URL(path, url).toString();\n anchor.target = '_blank';\n anchor.setAttribute('data-href', url + path);\n anchor.textContent = text;\n element.appendChild(anchor);\n })(8000, \"/\", \"https://localhost:8000/\", window.element)",
+ "text/plain": [
+ "<IPython.core.display.Javascript object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting server on port 8000...\n",
+ "\u001b[32mINFO\u001b[0m: Started server process [\u001b[36m25608\u001b[0m]\n",
+ "\u001b[32mINFO\u001b[0m: Waiting for application startup.\n",
+ "2026-01-04 03:10:59,288 - app.main - INFO - Starting PrecisionVoice application...\n",
+ "2026-01-04 03:10:59,292 - app.main - INFO - Device: cuda\n",
+ "2026-01-04 03:10:59,292 - app.main - INFO - Whisper model: kiendt/PhoWhisper-large-ct2\n",
+ "2026-01-04 03:10:59,293 - app.main - INFO - Diarization model: pyannote/speaker-diarization-3.1\n",
+ "2026-01-04 03:10:59,293 - app.main - INFO - Preloading Whisper model...\n",
+ "2026-01-04 03:10:59,293 - app.services.transcription - INFO - Loading Whisper model: kiendt/PhoWhisper-large-ct2\n",
+ "2026-01-04 03:10:59,293 - app.services.transcription - INFO - Device: cuda, Compute type: float16\n",
+ "\n",
+ "🚀 Ngrok Public URL: https://tandy-pileous-biologically.ngrok-free.dev\n",
+ "\n",
+ "2026-01-04 03:11:02,736 - app.services.transcription - INFO - Whisper model loaded successfully\n",
+ "2026-01-04 03:11:02,737 - app.main - WARNING - HF_TOKEN not set, diarization will not be available\n",
+ "2026-01-04 03:11:02,737 - app.main - INFO - Application startup complete\n",
+ "\u001b[32mINFO\u001b[0m: Application startup complete.\n",
+ "\u001b[32mINFO\u001b[0m: Uvicorn running on \u001b[1mhttp://0.0.0.0:8000\u001b[0m (Press CTRL+C to quit)\n",
+ "\u001b[32mINFO\u001b[0m: 2a09:bac1:7aa0:10::17:37e:0 - \"\u001b[1mGET / HTTP/1.1\u001b[0m\" \u001b[32m200 OK\u001b[0m\n",
+ "\u001b[32mINFO\u001b[0m: 2a09:bac1:7aa0:10::17:37e:0 - \"\u001b[1mGET /static/css/style.css HTTP/1.1\u001b[0m\" \u001b[32m200 OK\u001b[0m\n",
+ "\u001b[32mINFO\u001b[0m: 2a09:bac1:7aa0:10::17:37e:0 - \"\u001b[1mGET /static/js/app.js HTTP/1.1\u001b[0m\" \u001b[32m200 OK\u001b[0m\n",
+ "\u001b[32mINFO\u001b[0m: 2a09:bac1:7aa0:10::17:37e:0 - \"\u001b[1mGET /favicon.ico HTTP/1.1\u001b[0m\" \u001b[31m404 Not Found\u001b[0m\n",
+ "2026-01-04 03:11:17,130 - app.services.audio_processor - INFO - Saved upload: /content/PrecisionVoice/PrecisionVoice/data/uploads/4bf9c6ad.wav\n",
+ "2026-01-04 03:11:17,131 - app.services.audio_processor - INFO - Applying loudnorm normalization...\n",
+ "2026-01-04 03:11:17,131 - app.services.audio_processor - INFO - Applying advanced noise reduction (anlmdn, level=5.0)...\n",
+ "\u001b[32mINFO\u001b[0m: Shutting down\n",
+ "\u001b[32mINFO\u001b[0m: Finished server process [\u001b[36m25608\u001b[0m]\n",
+ "\u001b[31mERROR\u001b[0m: Traceback (most recent call last):\n",
+ " File \"/usr/lib/python3.12/asyncio/runners.py\", line 195, in run\n",
+ " return runner.run(main)\n",
+ " ^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/lib/python3.12/asyncio/runners.py\", line 118, in run\n",
+ " return self._loop.run_until_complete(task)\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"uvloop/loop.pyx\", line 1512, in uvloop.loop.Loop.run_until_complete\n",
+ " File \"uvloop/loop.pyx\", line 1505, in uvloop.loop.Loop.run_until_complete\n",
+ " File \"uvloop/loop.pyx\", line 1379, in uvloop.loop.Loop.run_forever\n",
+ " File \"uvloop/loop.pyx\", line 557, in uvloop.loop.Loop._run\n",
+ " File \"uvloop/loop.pyx\", line 476, in uvloop.loop.Loop._on_idle\n",
+ " File \"uvloop/cbhandles.pyx\", line 83, in uvloop.loop.Handle._run\n",
+ " File \"uvloop/cbhandles.pyx\", line 63, in uvloop.loop.Handle._run\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/server.py\", line 70, in serve\n",
+ " with self.capture_signals():\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/lib/python3.12/contextlib.py\", line 144, in __exit__\n",
+ " next(self.gen)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/server.py\", line 331, in capture_signals\n",
+ " signal.raise_signal(captured_signal)\n",
+ " File \"/usr/lib/python3.12/asyncio/runners.py\", line 157, in _on_sigint\n",
+ " raise KeyboardInterrupt()\n",
+ "KeyboardInterrupt\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/routing.py\", line 701, in lifespan\n",
+ " await receive()\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/lifespan/on.py\", line 137, in receive\n",
+ " return await self.receive_queue.get()\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/lib/python3.12/asyncio/queues.py\", line 158, in get\n",
+ " await getter\n",
+ "asyncio.exceptions.CancelledError\n",
+ "\n",
+ "\u001b[31mERROR\u001b[0m: Exception in ASGI application\n",
+ "Traceback (most recent call last):\n",
+ " File \"/usr/lib/python3.12/asyncio/runners.py\", line 195, in run\n",
+ " return runner.run(main)\n",
+ " ^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/lib/python3.12/asyncio/runners.py\", line 118, in run\n",
+ " return self._loop.run_until_complete(task)\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"uvloop/loop.pyx\", line 1512, in uvloop.loop.Loop.run_until_complete\n",
+ " File \"uvloop/loop.pyx\", line 1505, in uvloop.loop.Loop.run_until_complete\n",
+ " File \"uvloop/loop.pyx\", line 1379, in uvloop.loop.Loop.run_forever\n",
+ " File \"uvloop/loop.pyx\", line 557, in uvloop.loop.Loop._run\n",
+ " File \"uvloop/loop.pyx\", line 476, in uvloop.loop.Loop._on_idle\n",
+ " File \"uvloop/cbhandles.pyx\", line 83, in uvloop.loop.Handle._run\n",
+ " File \"uvloop/cbhandles.pyx\", line 63, in uvloop.loop.Handle._run\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/server.py\", line 70, in serve\n",
+ " with self.capture_signals():\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/lib/python3.12/contextlib.py\", line 144, in __exit__\n",
+ " next(self.gen)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/server.py\", line 331, in capture_signals\n",
+ " signal.raise_signal(captured_signal)\n",
+ " File \"/usr/lib/python3.12/asyncio/runners.py\", line 157, in _on_sigint\n",
+ " raise KeyboardInterrupt()\n",
+ "KeyboardInterrupt\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/protocols/http/httptools_impl.py\", line 409, in run_asgi\n",
+ " result = await app( # type: ignore[func-returns-value]\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/middleware/proxy_headers.py\", line 60, in __call__\n",
+ " return await self.app(scope, receive, send)\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/fastapi/applications.py\", line 1139, in __call__\n",
+ " await super().__call__(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/applications.py\", line 107, in __call__\n",
+ " await self.middleware_stack(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/middleware/errors.py\", line 164, in __call__\n",
+ " await self.app(scope, receive, _send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/middleware/cors.py\", line 93, in __call__\n",
+ " await self.simple_response(scope, receive, send, request_headers=headers)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/middleware/cors.py\", line 144, in simple_response\n",
+ " await self.app(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/middleware/exceptions.py\", line 63, in __call__\n",
+ " await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/_exception_handler.py\", line 42, in wrapped_app\n",
+ " await app(scope, receive, sender)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/fastapi/middleware/asyncexitstack.py\", line 18, in __call__\n",
+ " await self.app(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/routing.py\", line 716, in __call__\n",
+ " await self.middleware_stack(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/routing.py\", line 736, in app\n",
+ " await route.handle(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/routing.py\", line 290, in handle\n",
+ " await self.app(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/fastapi/routing.py\", line 119, in app\n",
+ " await wrap_app_handling_exceptions(app, request)(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/_exception_handler.py\", line 42, in wrapped_app\n",
+ " await app(scope, receive, sender)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/fastapi/routing.py\", line 105, in app\n",
+ " response = await f(request)\n",
+ " ^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/fastapi/routing.py\", line 385, in app\n",
+ " raw_response = await run_endpoint_function(\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/fastapi/routing.py\", line 284, in run_endpoint_function\n",
+ " return await dependant.call(**values)\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/content/PrecisionVoice/PrecisionVoice/app/api/routes.py\", line 62, in transcribe_audio\n",
+ " wav_path, duration = await AudioProcessor.process_upload(\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/content/PrecisionVoice/PrecisionVoice/app/services/audio_processor.py\", line 205, in process_upload\n",
+ " wav_path = await cls.convert_to_wav(original_path)\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/content/PrecisionVoice/PrecisionVoice/app/services/audio_processor.py\", line 104, in convert_to_wav\n",
+ " await loop.run_in_executor(None, lambda: cls._run_ffmpeg_conversion(input_path, output_path))\n",
+ "asyncio.exceptions.CancelledError\n"
+ ]
+ }
+ ],
+ "source": [
+ "# @title 6. Run Application\n",
+ "import threading\n",
+ "import time\n",
+ "import os\n",
+ "from google.colab.output import serve_kernel_port_as_iframe, serve_kernel_port_as_window\n",
+ "from pyngrok import ngrok\n",
+ "\n",
+ "# FORCE KILL any existing ngrok processes to free up the auth token session\n",
+ "print(\"Cleaning up previous sessions...\")\n",
+ "!killall ngrok 2>/dev/null\n",
+ "ngrok.kill()\n",
+ "\n",
+ "# Set your authtoken (Ensure this matches the one in your Ngrok dashboard)\n",
+ "ngrok.set_auth_token(\"NGROK_TOKEN\")\n",
+ "\n",
+ "port = 8000\n",
+ "\n",
+ "def start_ngrok():\n",
+ "    # Wait a bit for the server to start\n",
+ "    time.sleep(5)\n",
+ "    try:\n",
+ "        # Connect to the port\n",
+ "        public_url = ngrok.connect(port).public_url\n",
+ "        print(f\"\\n🚀 Ngrok Public URL: {public_url}\\n\")\n",
+ "    except Exception as e:\n",
+ "        print(f\"Ngrok error: {e}\")\n",
+ "\n",
+ "# Start ngrok in a background thread\n",
+ "threading.Thread(target=start_ngrok, daemon=True).start()\n",
+ "\n",
+ "# Serve the application directly in the notebook cell\n",
+ "serve_kernel_port_as_iframe(port, height=900)\n",
+ "\n",
+ "# Also provide a link to open in a new tab via proxy\n",
+ "serve_kernel_port_as_window(port, path=\"/\")\n",
+ "\n",
+ "# Run the Uvicorn server\n",
+ "print(f\"Starting server on port {port}...\")\n",
+ "!uvicorn app.main:app --host 0.0.0.0 --port {port}"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
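
Note on the `asyncio.exceptions.CancelledError` captured in the cell output above: interrupting the Colab cell makes uvicorn re-raise SIGINT while `convert_to_wav` is still awaiting its blocking ffmpeg job in the default executor, so the in-flight request is cancelled mid-conversion. Below is a minimal sketch of one way to make that call shutdown-safe; `run_ffmpeg_conversion`, the ffmpeg flags, and the cleanup policy are hypothetical stand-ins, not the actual `AudioProcessor` implementation:

    import asyncio
    import contextlib
    import os
    import subprocess

    def run_ffmpeg_conversion(input_path: str, output_path: str) -> None:
        # Blocking ffmpeg call (hypothetical stand-in for AudioProcessor._run_ffmpeg_conversion).
        subprocess.run(
            ["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", output_path],
            check=True, capture_output=True,
        )

    async def convert_to_wav_safe(input_path: str, output_path: str) -> str:
        loop = asyncio.get_running_loop()
        try:
            await loop.run_in_executor(None, run_ffmpeg_conversion, input_path, output_path)
        except asyncio.CancelledError:
            # Shutdown or client disconnect: remove the partial output instead of
            # leaking it, then re-raise so the caller still observes the cancellation.
            with contextlib.suppress(FileNotFoundError):
                os.remove(output_path)
            raise
        return output_path

The executor thread itself keeps running until ffmpeg exits; the `except` branch only tidies the event-loop side, which is usually enough when outputs land in a temporary upload directory like `data/uploads/`.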
requirements.txt ADDED
@@ -0,0 +1,31 @@
+ # Core framework
+ fastapi>=0.109.0
+ uvicorn[standard]>=0.27.0
+ python-multipart>=0.0.6
+ jinja2>=3.1.2
+ aiofiles>=23.2.1
+
+ # AI/ML - Speech-to-Text
+ faster-whisper>=1.0.0
+ ctranslate2>=4.0.0
+
+ # AI/ML - Speaker Diarization
+ pyannote.audio>=3.1.0
+ torch>=2.1.0
+ torchaudio>=2.1.0
+
+ # AI/ML - Vocal Separation
+ audio-separator[cpu]>=0.17.0
+ denoiser>=0.1.4
+
+ # Audio processing
+ ffmpeg-python>=0.2.0
+ pydub>=0.25.1
+
+ # Configuration
+ pydantic-settings>=2.1.0
+ python-dotenv>=1.0.0
+
+ # Utilities
+ aiohttp>=3.9.0
+ numpy>=1.24.0
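
Every pin above is a floor (`>=`) rather than an exact version, so two environments built from this file can resolve differently. A small standard-library sketch for spot-checking what actually got installed; the tuple of distribution names is copied from the list above and `uvicorn[standard]` installs the distribution `uvicorn`, so the extra is dropped for the lookup:

    from importlib.metadata import PackageNotFoundError, version

    # Spot-check the core pins from requirements.txt.
    for pkg in ("fastapi", "uvicorn", "faster-whisper", "ctranslate2",
                "pyannote.audio", "torch", "audio-separator"):
        try:
            print(f"{pkg}=={version(pkg)}")
        except PackageNotFoundError:
            print(f"{pkg}: not installed")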
scripts/verify_model_config.py ADDED
@@ -0,0 +1,16 @@
+ from app.core.config import get_settings
+
+ def verify_stt_model():
+     settings = get_settings()
+     print(f"Current Whisper Model: {settings.whisper_model}")
+     print(f"Device: {settings.resolved_device}")
+     print(f"Compute Type: {settings.resolved_compute_type}")
+
+     expected_model = "kiendt/PhoWhisper-large-ct2"
+     if settings.whisper_model == expected_model:
+         print("✅ SUCCESS: Model configuration updated correctly.")
+     else:
+         print(f"❌ FAILURE: Expected {expected_model}, got {settings.whisper_model}")
+
+ if __name__ == "__main__":
+     verify_stt_model()
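
The startup log in the notebook output above also warns when `HF_TOKEN` is missing, so a companion check for the diarization settings may be worth adding next to this script. A hedged sketch, assuming the settings object exposes `diarization_model` and `hf_token` fields analogous to `whisper_model` (not verified against the actual config class):

    from app.core.config import get_settings

    def verify_diarization_config():
        # Mirrors verify_stt_model() above, but for the pyannote pipeline settings.
        settings = get_settings()
        print(f"Current Diarization Model: {settings.diarization_model}")
        if not getattr(settings, "hf_token", None):
            print("⚠️ HF_TOKEN not set - diarization will not be available (see startup warning)")
        expected_model = "pyannote/speaker-diarization-3.1"
        if settings.diarization_model == expected_model:
            print("✅ SUCCESS: Diarization model configured correctly.")
        else:
            print(f"❌ FAILURE: Expected {expected_model}, got {settings.diarization_model}")

    if __name__ == "__main__":
        verify_diarization_config()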