vyluong committed
Commit 5ab6c6e · verified · 1 Parent(s): f37e958

Upload folder using huggingface_hub

.dockerignore ADDED
@@ -0,0 +1,22 @@
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ .venv
+ venv
+ ENV
+ .git
+ .github
+ .vscode
+ .idea
+ *.log
+ .cache
+ .pytest_cache
+ data/uploads/*
+ data/processed/*
+ Dockerfile
+ docker-compose.yml
+ README.md
+ implementation_plan.md
+ walkthrough.md
+ task.md
.env.example ADDED
@@ -0,0 +1,57 @@
+ # Environment Configuration for PrecisionVoice
+
+ # HuggingFace token (required for pyannote.audio)
+ # Get your token at: https://huggingface.co/settings/tokens
+ # Accept the terms at: https://huggingface.co/pyannote/speaker-diarization-3.1
+ HF_TOKEN=your_huggingface_token_here
+
+ # Model settings
+ WHISPER_MODEL=kiendt/PhoWhisper-large-ct2
+ DIARIZATION_MODEL=pyannote/speaker-diarization-3.1
+
+ # Device settings (cuda, cpu, or auto)
+ DEVICE=auto
+
+ # --- Denoising (Speech Enhancement) ---
+ # Enable speech enhancement (removes background noise, hum, etc.)
+ ENABLE_DENOISER=True
+ # Denoiser model: dns64 (standard), dns48, or master64
+ DENOISER_MODEL=dns64
+
+ # --- MDX-Net Vocal Separation ---
+ # Enable vocal separation before transcription (isolates voice from music/noise).
+ # More effective than a basic Demucs implementation.
+ ENABLE_VOCAL_SEPARATION=True
+ # MDX-Net model: Kim_Vocal_2.onnx (recommended for vocals)
+ MDX_MODEL=Kim_Vocal_2.onnx
+
+ # Upload settings
+ MAX_UPLOAD_SIZE_MB=100
+
+ # --- Optimization Settings ---
+
+ # Enable a subtle highpass filter (removes low-frequency rumble below 80 Hz)
+ ENABLE_NOISE_REDUCTION=True
+
+ # Enable/disable loudness normalization (EBU R128)
+ ENABLE_LOUDNORM=True
+
+ # --- VAD (Voice Activity Detection) Settings ---
+
+ # Threshold for detecting speech (0.0 to 1.0). Higher = stricter
+ VAD_THRESHOLD=0.5
+ # Ignore speech segments shorter than this (milliseconds)
+ VAD_MIN_SPEECH_DURATION_MS=250
+ # Minimum silence duration to split segments (milliseconds)
+ VAD_MIN_SILENCE_DURATION_MS=500
+
+ # --- Post-processing (Clustering) Settings ---
+
+ # Merge segments from the same speaker if the gap is less than this (seconds)
+ MERGE_THRESHOLD_S=0.5
+ # Filter out segments shorter than this (seconds) - removes blips/noise
+ MIN_SEGMENT_DURATION_S=0.3
+
+ # Server settings
+ HOST=0.0.0.0
+ PORT=8000
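
These variables are consumed by the Pydantic settings class in `app/core/config.py` (added later in this commit). A minimal sketch for checking which values actually take effect after editing `.env`, run from the repo root:

```python
# Sketch: print the effective configuration once .env has been loaded.
from app.core.config import get_settings

settings = get_settings()
print(settings.whisper_model)    # kiendt/PhoWhisper-large-ct2 unless overridden
print(settings.resolved_device)  # "cuda" if torch sees a GPU, otherwise "cpu"
```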
.github/workflows/sync-to-huggingface.yml ADDED
@@ -0,0 +1,24 @@
+ name: Sync to Hugging Face Hub
+
+ on:
+   push:
+     branches:
+       - main
+
+ jobs:
+   sync:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+           lfs: true
+
+       - name: Push to Hugging Face Hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: |
+           git remote add huggingface https://huggingface.co/spaces/ThiThanhChuong/precision-voice || true
+           git remote set-url huggingface https://huggingface.co/spaces/ThiThanhChuong/precision-voice
+           git push https://user:$HF_TOKEN@huggingface.co/spaces/ThiThanhChuong/precision-voice main --force
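
The workflow mirrors `main` to the Space with a force push. The same sync could also be done with the `huggingface_hub` API (the library this commit itself was uploaded with); a hedged sketch, assuming `HF_TOKEN` is exported:

```python
# Sketch: sync the working tree to the Space without git.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
api.upload_folder(
    folder_path=".",
    repo_id="ThiThanhChuong/precision-voice",
    repo_type="space",
)
```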
.gitignore ADDED
@@ -0,0 +1,46 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # Virtual environment
+ venv/
+ .venv/
+ ENV/
+
+ # Environment files
+ .env
+ !.env.example
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # Data directories (keep structure, ignore content)
+ data/uploads/*
+ data/processed/*
+ !data/uploads/.gitkeep
+ !data/processed/.gitkeep
+
+ # Docker
+ .docker/
+
+ # Logs
+ *.log
+ logs/
+
+ # Cache
+ .cache/
+ *.cache
+ .pytest_cache/
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Model files (will be downloaded at runtime)
+ *.pt
+ *.bin
+ *.safetensors
Dockerfile ADDED
@@ -0,0 +1,68 @@
+ # ================================
+ # PrecisionVoice Dockerfile
+ # Optimized for performance and size
+ # ================================
+
+ # Stage 1: Builder
+ FROM python:3.10-slim-bullseye AS builder
+
+ WORKDIR /app
+
+ # Install build dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     git \
+     ffmpeg \
+     libsndfile1-dev \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install dependencies
+ # Using --user to keep packages in /root/.local
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir --user -r requirements.txt
+
+ # ================================
+ # Stage 2: Runtime
+ # ================================
+ FROM python:3.10-slim-bullseye
+
+ WORKDIR /app
+
+ # Install runtime dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     ffmpeg \
+     libsndfile1 \
+     && rm -rf /var/lib/apt/lists/* \
+     && apt-get clean
+
+ # Copy Python packages from builder
+ COPY --from=builder /root/.local /root/.local
+
+ # Ensure scripts in .local are available
+ ENV PATH=/root/.local/bin:$PATH
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONDONTWRITEBYTECODE=1
+
+ # Model cache directories
+ ENV HF_HOME=/root/.cache/huggingface
+ ENV TORCH_HOME=/root/.cache/torch
+ ENV TRANSFORMERS_CACHE=/root/.cache/huggingface
+
+ # Copy application code
+ COPY app/ ./app/
+ COPY data/ ./data/
+
+ # Create necessary directories
+ RUN mkdir -p /app/data/uploads /app/data/processed
+
+ # Port configuration
+ ARG PORT=7860
+ ENV PORT=${PORT}
+ EXPOSE ${PORT}
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+     CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:${PORT}/api/health')" || exit 1
+
+ # Run the application
+ CMD ["sh", "-c", "uvicorn app.main:app --host 0.0.0.0 --port ${PORT}"]
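
The shell-form HEALTHCHECK expands `${PORT}` before Python runs. For reference, an unrolled sketch of the same probe:

```python
# Sketch: the health probe from the HEALTHCHECK line, expanded.
import os
import urllib.request

port = os.environ.get("PORT", "7860")
# Raises (and thus fails the health check) if the app is not serving.
urllib.request.urlopen(f"http://localhost:{port}/api/health")
```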
README.md CHANGED
@@ -1,10 +1,100 @@
 ---
- title: PoC PrecisisionVoice V1
- emoji: 🔥
- colorFrom: yellow
- colorTo: red
+ title: PrecisionVoice
+ emoji: 🎙️
+ colorFrom: blue
+ colorTo: purple
 sdk: docker
+ app_file: app/main.py
 pinned: false
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # PrecisionVoice - STT & Speaker Diarization
+
+ A production-ready Speech-to-Text and Speaker Diarization web application using FastAPI, faster-whisper, and pyannote.audio.
+
+ ## Features
+
+ - 🎙️ Speech-to-Text using `kiendt/PhoWhisper-large-ct2` (optimized for Vietnamese)
+ - 👥 Speaker Diarization using `pyannote/speaker-diarization-3.1`
+ - 🧼 Advanced Denoising using Facebook's `Denoiser` (dns64)
+ - 🎤 Vocal Isolation using `MDX-Net` (`Kim_Vocal_2`)
+ - 🔄 Automatic speaker-transcript alignment
+ - 📥 Download results in TXT or SRT format
+ - 🐳 Docker-ready with persistent model caching and GPU support
+
+ ## Quick Start
+
+ ### Prerequisites
+
+ 1. Docker and Docker Compose
+ 2. (Optional) NVIDIA GPU with CUDA support
+ 3. HuggingFace account with access to the pyannote models
+
+ ### Setup
+
+ 1. Clone and configure:
+ ```bash
+ cp .env.example .env
+ # Edit .env and add your HuggingFace token
+ ```
+
+ 2. Build and run:
+ ```bash
+ docker compose up --build
+ ```
+
+ 3. Open http://localhost:8000
+
+ ## Audio Processing Pipeline
+
+ The system uses a multi-stage pipeline to maximize accuracy:
+
+ 1. **Speech Enhancement**: Background noise, hum, and interference are removed using Facebook's `Denoiser` (a deep-learning Wave-U-Net).
+ 2. **Vocal Isolation**: Vocals are stripped from any remaining background music or non-speech sounds using `MDX-Net`.
+ 3. **Refinement**: Subtle highpass filtering and EBU R128 loudness normalization give consistent volume.
+ 4. **Transcription**: High-precision Vietnamese transcription using `PhoWhisper`.
+ 5. **Diarization**: The audio is segmented by speaker.
+ 6. **Alignment**: Transcripts are merged with speaker segments.
+
+ ## Configuration
+
+ | Variable | Default | Description |
+ |----------|---------|-------------|
+ | `HF_TOKEN` | - | Required for the pyannote models |
+ | `ENABLE_DENOISER` | `True` | Toggle Facebook speech enhancement |
+ | `DENOISER_MODEL` | `dns64` | Model for denoising |
+ | `ENABLE_VOCAL_SEPARATION` | `True` | Toggle MDX-Net vocal isolation |
+ | `MDX_MODEL` | `Kim_Vocal_2.onnx` | Model for vocal separation |
+ | `DEVICE` | `auto` | `cuda`, `cpu`, or `auto` |
+
+ ## Development
+
+ ### Local Setup (without Docker)
+
+ ```bash
+ python -m venv venv
+ source venv/bin/activate
+ pip install -r requirements.txt
+ uvicorn app.main:app --reload
+ ```
+
+ ### API Endpoints
+
+ | Endpoint | Method | Description |
+ |----------|--------|-------------|
+ | `/` | GET | Web UI |
+ | `/api/transcribe` | POST | Upload and transcribe audio |
+ | `/api/download/{filename}` | GET | Download result files |
+
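+ A minimal client sketch in Python (uses `requests`, which is not a dependency of this project):
+
+ ```python
+ import requests
+
+ # "meeting.mp3" is a placeholder; any supported format works.
+ with open("meeting.mp3", "rb") as f:
+     resp = requests.post(
+         "http://localhost:8000/api/transcribe",
+         files={"file": ("meeting.mp3", f, "audio/mpeg")},
+     )
+ resp.raise_for_status()
+ for seg in resp.json()["segments"]:
+     print(f"[{seg['start']:.1f}-{seg['end']:.1f}] {seg['speaker']}: {seg['text']}")
+ ```
+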
+ ## Supported Audio Formats
+
+ - MP3
+ - WAV
+ - M4A
+ - OGG
+ - FLAC
+ - WEBM
+
+ ## License
+
+ MIT
app/__init__.py ADDED
@@ -0,0 +1 @@
+ # App package
app/api/__init__.py ADDED
@@ -0,0 +1 @@
+ # API package
app/api/routes.py ADDED
@@ -0,0 +1,122 @@
+ """
+ API routes for the transcription service.
+ """
+ import json
+ import time
+ import logging
+ from pathlib import Path
+
+ from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks
+ from fastapi.responses import FileResponse, StreamingResponse
+
+ from app.core.config import get_settings
+ from app.schemas.models import TranscriptionResponse, ErrorResponse, HealthResponse
+ from app.services.audio_processor import AudioProcessor, AudioProcessingError
+ from app.services.transcription import TranscriptionService
+ from app.services.diarization import DiarizationService
+ from app.services.alignment import AlignmentService
+ from app.services.orchestrator import PipelineOrchestrator
+
+ logger = logging.getLogger(__name__)
+ settings = get_settings()
+
+ router = APIRouter()
+
+
+ @router.get("/api/health", response_model=HealthResponse)
+ async def health_check():
+     """Health check endpoint."""
+     return HealthResponse(
+         status="healthy",
+         models_loaded=TranscriptionService.is_loaded() and DiarizationService.is_loaded(),
+         device=settings.resolved_device
+     )
+
+
+ @router.post("/api/transcribe", response_model=TranscriptionResponse)
+ async def transcribe_audio(
+     background_tasks: BackgroundTasks,
+     file: UploadFile = File(..., description="Audio file to transcribe")
+ ):
+     """
+     Upload and transcribe an audio file.
+     Status updates are logged on the server.
+     """
+     wav_path = None
+
+     try:
+         # Read file content
+         file_content = await file.read()
+
+         # Validate and process audio
+         try:
+             AudioProcessor.validate_file(file.filename or "audio.wav", len(file_content))
+         except AudioProcessingError as e:
+             raise HTTPException(status_code=400, detail=str(e))
+
+         # Save and convert to WAV (noise reduction happens here)
+         wav_path, duration = await AudioProcessor.process_upload(
+             file_content,
+             file.filename or "audio.wav"
+         )
+
+         # Run orchestrated pipeline (Whisper + Pyannote in parallel -> alignment)
+         logger.info("Executing orchestrated pipeline...")
+         response = await PipelineOrchestrator.process_audio(wav_path, duration)
+
+         # Schedule cleanup in background
+         background_tasks.add_task(cleanup_files, wav_path)
+
+         return response
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.exception("Processing failed")
+         if wav_path and wav_path.exists():
+             background_tasks.add_task(cleanup_files, wav_path)
+         raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
+
+
+ @router.get("/api/download/{filename}")
+ async def download_file(filename: str, background_tasks: BackgroundTasks):
+     """
+     Download a generated transcript file.
+
+     Supports: .txt, .srt files
+     """
+     # Security: only allow specific extensions and no path traversal
+     if not filename.endswith(('.txt', '.srt')) or '/' in filename or '..' in filename:
+         raise HTTPException(status_code=400, detail="Invalid filename")
+
+     filepath = settings.processed_dir / filename
+
+     if not filepath.exists():
+         raise HTTPException(status_code=404, detail="File not found")
+
+     # Determine media type
+     media_type = "text/plain" if filename.endswith('.txt') else "application/x-subrip"
+
+     # Cleanup after download is not scheduled here (give some time for the download
+     # to complete); in production, you might want a separate cleanup job.
+
+     return FileResponse(
+         path=filepath,
+         filename=filename,
+         media_type=media_type
+     )
+
+
+ async def cleanup_files(*paths: Path):
+     """Background task to clean up temporary files."""
+     import asyncio
+
+     # Wait a bit before cleanup to ensure files are not in use
+     await asyncio.sleep(5)
+
+     await AudioProcessor.cleanup_files(*paths)
app/core/__init__.py ADDED
@@ -0,0 +1 @@
+ # Core package
app/core/config.py ADDED
@@ -0,0 +1,104 @@
+ """
+ Application configuration using Pydantic Settings.
+ """
+ import os
+ from pathlib import Path
+ from functools import lru_cache
+ from typing import Literal
+
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ class Settings(BaseSettings):
+     """Application settings loaded from environment variables."""
+
+     model_config = SettingsConfigDict(
+         env_file=".env",
+         env_file_encoding="utf-8",
+         extra="ignore"
+     )
+
+     # HuggingFace
+     hf_token: str = ""
+     enable_noise_reduction: bool = True
+
+     # Denoising (Speech Enhancement)
+     enable_denoiser: bool = True
+     denoiser_model: str = "dns64"
+
+     # MDX-Net Vocal Separation
+     enable_vocal_separation: bool = True
+     mdx_model: str = "Kim_Vocal_2.onnx"  # High-quality vocal isolation
+
+     # Model settings
+     whisper_model: str = "kiendt/PhoWhisper-large-ct2"
+     diarization_model: str = "pyannote/speaker-diarization-3.1"
+
+     # Device settings
+     device: Literal["cuda", "cpu", "auto"] = "auto"
+     compute_type: str = "float16"  # float16 for GPU, int8 for CPU
+
+     # Upload settings
+     max_upload_size_mb: int = 100
+     allowed_extensions: list[str] = ["mp3", "wav", "m4a", "ogg", "flac", "webm"]
+
+     # Audio processing settings
+     sample_rate: int = 16000
+     channels: int = 1  # Mono
+
+     # Optimization parameters
+     noise_reduction_level: float = 12.0  # Used by anlmdn
+     enable_loudnorm: bool = True
+
+     # VAD parameters
+     vad_threshold: float = 0.5
+     vad_min_speech_duration_ms: int = 250
+     vad_min_silence_duration_ms: int = 500
+
+     # Post-processing
+     merge_threshold_s: float = 0.5  # Merge segments from same speaker if gap < this
+     min_segment_duration_s: float = 0.3  # Remove segments shorter than this
+
+     # Server settings
+     host: str = "0.0.0.0"
+     port: int = 7860
+
+     # Paths
+     base_dir: Path = Path(__file__).parent.parent.parent
+     data_dir: Path = base_dir / "data"
+     upload_dir: Path = data_dir / "uploads"
+     processed_dir: Path = data_dir / "processed"
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         # Ensure directories exist
+         self.upload_dir.mkdir(parents=True, exist_ok=True)
+         self.processed_dir.mkdir(parents=True, exist_ok=True)
+
+     @property
+     def max_upload_size_bytes(self) -> int:
+         return self.max_upload_size_mb * 1024 * 1024
+
+     @property
+     def resolved_device(self) -> str:
+         """Resolve 'auto' to actual device."""
+         if self.device == "auto":
+             try:
+                 import torch
+                 return "cuda" if torch.cuda.is_available() else "cpu"
+             except ImportError:
+                 return "cpu"
+         return self.device
+
+     @property
+     def resolved_compute_type(self) -> str:
+         """Get appropriate compute type for device."""
+         if self.resolved_device == "cuda":
+             return "float16"
+         return "int8"
+
+
+ @lru_cache
+ def get_settings() -> Settings:
+     """Get cached settings instance."""
+     return Settings()
app/main.py ADDED
@@ -0,0 +1,115 @@
+ """
+ PrecisionVoice - Speech-to-Text & Speaker Diarization Application
+
+ Main FastAPI application entry point.
+ """
+ import logging
+ from contextlib import asynccontextmanager
+
+ from fastapi import FastAPI, Request
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.templating import Jinja2Templates
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import HTMLResponse
+
+ from app.core.config import get_settings
+ from app.api.routes import router
+ from app.services.transcription import TranscriptionService
+ from app.services.diarization import DiarizationService
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ )
+ logger = logging.getLogger(__name__)
+
+ settings = get_settings()
+
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     """
+     Application lifespan handler.
+     Preloads models on startup for a faster first request.
+     """
+     logger.info("Starting PrecisionVoice application...")
+     logger.info(f"Device: {settings.resolved_device}")
+     logger.info(f"Whisper model: {settings.whisper_model}")
+     logger.info(f"Diarization model: {settings.diarization_model}")
+
+     # Preload models (optional - can be disabled for faster startup)
+     try:
+         logger.info("Preloading Whisper model...")
+         TranscriptionService.preload_model()
+     except Exception as e:
+         logger.error(f"Failed to preload Whisper model: {e}")
+
+     try:
+         if settings.hf_token:
+             logger.info("Preloading diarization pipeline...")
+             DiarizationService.preload_pipeline()
+         else:
+             logger.warning("HF_TOKEN not set, diarization will not be available")
+     except Exception as e:
+         logger.warning(f"Diarization preload failed (will try again on first use): {e}")
+
+     logger.info("Application startup complete")
+
+     yield
+
+     logger.info("Shutting down PrecisionVoice application...")
+
+
+ # Create FastAPI app
+ app = FastAPI(
+     title="PrecisionVoice",
+     description="Speech-to-Text and Speaker Diarization API",
+     version="1.0.0",
+     lifespan=lifespan
+ )
+
+ # CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # Configure appropriately for production
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Mount static files
+ app.mount(
+     "/static",
+     StaticFiles(directory="app/static"),
+     name="static"
+ )
+
+ # Templates
+ templates = Jinja2Templates(directory="app/templates")
+
+ # Include API routes
+ app.include_router(router)
+
+
+ @app.get("/", response_class=HTMLResponse)
+ async def index(request: Request):
+     """Serve the main web interface."""
+     return templates.TemplateResponse(
+         "index.html",
+         {
+             "request": request,
+             "max_upload_mb": settings.max_upload_size_mb,
+             "allowed_formats": ", ".join(settings.allowed_extensions)
+         }
+     )
+
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(
+         "app.main:app",
+         host=settings.host,
+         port=settings.port,
+         reload=True
+     )
app/schemas/__init__.py ADDED
@@ -0,0 +1 @@
+ # Schemas package
app/schemas/models.py ADDED
@@ -0,0 +1,73 @@
+ """
+ Pydantic models for API requests and responses.
+ """
+ from pydantic import BaseModel, Field
+ from typing import Optional
+ from enum import Enum
+
+
+ class ProcessingStatus(str, Enum):
+     """Status of the transcription process."""
+     PENDING = "pending"
+     PROCESSING = "processing"
+     COMPLETED = "completed"
+     FAILED = "failed"
+
+
+ class TranscriptSegment(BaseModel):
+     """A single segment of the transcript with speaker and timing."""
+     start: float = Field(..., description="Start time in seconds")
+     end: float = Field(..., description="End time in seconds")
+     speaker: str = Field(..., description="Speaker identifier")
+     text: str = Field(..., description="Transcribed text")
+
+     @property
+     def start_formatted(self) -> str:
+         """Format start time as HH:MM:SS."""
+         return self._format_time(self.start)
+
+     @property
+     def end_formatted(self) -> str:
+         """Format end time as HH:MM:SS."""
+         return self._format_time(self.end)
+
+     @staticmethod
+     def _format_time(seconds: float) -> str:
+         """Convert seconds to HH:MM:SS format."""
+         hours = int(seconds // 3600)
+         minutes = int((seconds % 3600) // 60)
+         secs = int(seconds % 60)
+         return f"{hours:02d}:{minutes:02d}:{secs:02d}"
+
+
+ class TranscriptionRequest(BaseModel):
+     """Request model for transcription settings."""
+     language: str = Field(default="vi", description="Language code for transcription")
+     num_speakers: Optional[int] = Field(default=None, description="Expected number of speakers (None for auto-detect)")
+     output_format: str = Field(default="json", description="Output format: json, txt, srt")
+
+
+ class TranscriptionResponse(BaseModel):
+     """Response containing the transcription results."""
+     success: bool = Field(..., description="Whether transcription succeeded")
+     message: str = Field(default="", description="Status message")
+     segments: list[TranscriptSegment] = Field(default_factory=list, description="Transcript segments with speakers")
+     duration: float = Field(default=0.0, description="Audio duration in seconds")
+     num_speakers: int = Field(default=0, description="Number of detected speakers")
+     processing_time: float = Field(default=0.0, description="Processing time in seconds")
+     download_txt: Optional[str] = Field(default=None, description="Download URL for TXT file")
+     download_srt: Optional[str] = Field(default=None, description="Download URL for SRT file")
+
+
+ class ErrorResponse(BaseModel):
+     """Error response model."""
+     success: bool = False
+     error: str = Field(..., description="Error message")
+     detail: Optional[str] = Field(default=None, description="Detailed error information")
+
+
+ class HealthResponse(BaseModel):
+     """Health check response."""
+     status: str = "healthy"
+     models_loaded: bool = False
+     device: str = "cpu"
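
A quick check of the `TranscriptSegment` timestamp helper (a sketch, run from the repo root):

```python
# Sketch: the HH:MM:SS formatting on TranscriptSegment.
from app.schemas.models import TranscriptSegment

seg = TranscriptSegment(start=75.4, end=80.0, speaker="Speaker 1", text="xin chào")
print(seg.start_formatted)  # -> "00:01:15"
```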
app/services/__init__.py ADDED
@@ -0,0 +1 @@
+ # Services package
app/services/alignment.py ADDED
@@ -0,0 +1,353 @@
+ """
+ Precision alignment service - word-center-based speaker assignment.
+ Merges word-level transcription with speaker diarization using precise timestamps.
+ """
+ import logging
+ from pathlib import Path
+ from typing import List, Tuple, Optional
+ from dataclasses import dataclass
+
+ from app.core.config import get_settings
+ from app.schemas.models import TranscriptSegment
+ from app.services.transcription import WordTimestamp
+ from app.services.diarization import SpeakerSegment
+
+ logger = logging.getLogger(__name__)
+ settings = get_settings()
+
+
+ @dataclass
+ class WordWithSpeaker:
+     """A word with an assigned speaker."""
+     word: str
+     start: float
+     end: float
+     speaker: str
+
+
+ class AlignmentService:
+     """
+     Precision alignment service.
+     Uses a word-center-based algorithm for accurate speaker-to-text mapping.
+     """
+
+     # Pause threshold for splitting segments (seconds)
+     PAUSE_THRESHOLD = 1.0
+
+     @staticmethod
+     def get_word_center(word: WordTimestamp) -> float:
+         """Calculate the center time of a word."""
+         return (word.start + word.end) / 2
+
+     @classmethod
+     def find_speaker_at_time(
+         cls,
+         time: float,
+         speaker_segments: List[SpeakerSegment]
+     ) -> Optional[str]:
+         """
+         Find which speaker is speaking at a given time.
+
+         Args:
+             time: Time point in seconds
+             speaker_segments: List of speaker segments from diarization
+
+         Returns:
+             Speaker label or None if no speaker found
+         """
+         for seg in speaker_segments:
+             if seg.start <= time <= seg.end:
+                 return seg.speaker
+         return None
+
+     @classmethod
+     def find_closest_speaker(
+         cls,
+         time: float,
+         speaker_segments: List[SpeakerSegment]
+     ) -> str:
+         """
+         Find the closest speaker to a given time (for gaps/silence).
+
+         Args:
+             time: Time point in seconds
+             speaker_segments: List of speaker segments
+
+         Returns:
+             Closest speaker label or "Unknown"
+         """
+         if not speaker_segments:
+             return "Unknown"
+
+         min_distance = float('inf')
+         closest_speaker = "Unknown"
+
+         for seg in speaker_segments:
+             # Distance to segment start or end
+             dist_to_start = abs(time - seg.start)
+             dist_to_end = abs(time - seg.end)
+             min_seg_dist = min(dist_to_start, dist_to_end)
+
+             if min_seg_dist < min_distance:
+                 min_distance = min_seg_dist
+                 closest_speaker = seg.speaker
+
+         return closest_speaker
+
+     @classmethod
+     def assign_speakers_to_words(
+         cls,
+         words: List[WordTimestamp],
+         speaker_segments: List[SpeakerSegment]
+     ) -> List[WordWithSpeaker]:
+         """
+         Step 3c: Assign speakers to each word based on word center time.
+
+         Args:
+             words: List of words with timestamps from transcription
+             speaker_segments: List of speaker segments from diarization
+
+         Returns:
+             List of words with speaker assignments
+         """
+         if not speaker_segments:
+             # No diarization available, assign all to "Speaker 1"
+             logger.warning("No speaker segments available, using single speaker")
+             return [
+                 WordWithSpeaker(
+                     word=w.word,
+                     start=w.start,
+                     end=w.end,
+                     speaker="Speaker 1"
+                 )
+                 for w in words
+             ]
+
+         words_with_speakers = []
+
+         for word in words:
+             # Calculate word center time
+             center_time = cls.get_word_center(word)
+
+             # Find speaker at this time
+             speaker = cls.find_speaker_at_time(center_time, speaker_segments)
+
+             # If no direct match, find closest speaker
+             if speaker is None:
+                 speaker = cls.find_closest_speaker(center_time, speaker_segments)
+
+             words_with_speakers.append(WordWithSpeaker(
+                 word=word.word,
+                 start=word.start,
+                 end=word.end,
+                 speaker=speaker
+             ))
+
+         logger.debug(f"Assigned speakers to {len(words_with_speakers)} words")
+         return words_with_speakers
+
+     @classmethod
+     def reconstruct_segments(
+         cls,
+         words_with_speakers: List[WordWithSpeaker]
+     ) -> List[TranscriptSegment]:
+         """
+         Step 3d: Reconstruct sentence segments from words.
+
+         Groups consecutive words of the same speaker into segments.
+         Creates a new segment when:
+         - the speaker changes, or
+         - the pause between words exceeds PAUSE_THRESHOLD.
+
+         Args:
+             words_with_speakers: List of words with speaker assignments
+
+         Returns:
+             List of TranscriptSegment with complete sentences
+         """
+         if not words_with_speakers:
+             return []
+
+         segments = []
+
+         # Start first segment
+         current_speaker = words_with_speakers[0].speaker
+         current_start = words_with_speakers[0].start
+         current_end = words_with_speakers[0].end
+         current_words = [words_with_speakers[0].word]
+
+         for i in range(1, len(words_with_speakers)):
+             word = words_with_speakers[i]
+             prev_word = words_with_speakers[i - 1]
+
+             # Calculate pause between words
+             pause = word.start - prev_word.end
+
+             # Check if we need to start a new segment
+             speaker_changed = word.speaker != current_speaker
+             significant_pause = pause > cls.PAUSE_THRESHOLD
+
+             if speaker_changed or significant_pause:
+                 # Save current segment
+                 segments.append(TranscriptSegment(
+                     start=current_start,
+                     end=current_end,
+                     speaker=current_speaker,
+                     text=" ".join(current_words)
+                 ))
+
+                 # Start new segment
+                 current_speaker = word.speaker
+                 current_start = word.start
+                 current_end = word.end
+                 current_words = [word.word]
+             else:
+                 # Continue current segment
+                 current_end = word.end
+                 current_words.append(word.word)
+
+         # Don't forget the last segment
+         if current_words:
+             segments.append(TranscriptSegment(
+                 start=current_start,
+                 end=current_end,
+                 speaker=current_speaker,
+                 text=" ".join(current_words)
+             ))
+
+         logger.debug(f"Reconstructed {len(segments)} segments from {len(words_with_speakers)} words")
+         return segments
+
+     @classmethod
+     def resize_and_merge_segments(
+         cls,
+         segments: List[TranscriptSegment]
+     ) -> List[TranscriptSegment]:
+         """
+         Merge consecutive segments of the same speaker if the gap is small.
+         Also filters out extremely short segments.
+         """
+         if not segments:
+             return []
+
+         # Filter 1: Remove extremely short blips (noise)
+         segments = [s for s in segments if (s.end - s.start) >= settings.min_segment_duration_s]
+
+         if not segments:
+             return []
+
+         merged = []
+         curr = segments[0]
+
+         for i in range(1, len(segments)):
+             next_seg = segments[i]
+
+             # If same speaker and gap is small, merge
+             gap = next_seg.start - curr.end
+             if next_seg.speaker == curr.speaker and gap < settings.merge_threshold_s:
+                 curr.end = next_seg.end
+                 curr.text += " " + next_seg.text
+             else:
+                 merged.append(curr)
+                 curr = next_seg
+
+         merged.append(curr)
+
+         logger.debug(f"Merged segments: {len(segments)} -> {len(merged)}")
+         return merged
+
+     @classmethod
+     def align_precision(
+         cls,
+         words: List[WordTimestamp],
+         speaker_segments: List[SpeakerSegment]
+     ) -> List[TranscriptSegment]:
+         """
+         Full precision alignment pipeline.
+
+         Args:
+             words: Word-level timestamps from transcription
+             speaker_segments: Speaker segments from diarization
+
+         Returns:
+             List of TranscriptSegment with proper speaker assignments
+         """
+         # Step 3c: Assign speakers to words
+         words_with_speakers = cls.assign_speakers_to_words(words, speaker_segments)
+
+         # Step 3d: Reconstruct segments
+         segments = cls.reconstruct_segments(words_with_speakers)
+
+         # Step 3e: Clustering/merging (optimization)
+         segments = cls.resize_and_merge_segments(segments)
+
+         return segments
+
+     @staticmethod
+     def format_timestamp_txt(seconds: float) -> str:
+         """Format timestamp for TXT output: HH:MM:SS"""
+         hours = int(seconds // 3600)
+         minutes = int((seconds % 3600) // 60)
+         secs = int(seconds % 60)
+         return f"{hours:02d}:{minutes:02d}:{secs:02d}"
+
+     @staticmethod
+     def format_timestamp_srt(seconds: float) -> str:
+         """Format timestamp for SRT output: HH:MM:SS,mmm"""
+         hours = int(seconds // 3600)
+         minutes = int((seconds % 3600) // 60)
+         secs = int(seconds % 60)
+         millis = int((seconds % 1) * 1000)
+         return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
+
+     @classmethod
+     def generate_txt(cls, segments: List[TranscriptSegment], output_path: Path) -> Path:
+         """
+         Generate TXT transcript file.
+
+         Format: [HH:MM:SS - HH:MM:SS] Speaker: Text
+         """
+         lines = []
+         for seg in segments:
+             start = cls.format_timestamp_txt(seg.start)
+             end = cls.format_timestamp_txt(seg.end)
+             lines.append(f"[{start} - {end}] {seg.speaker}: {seg.text}")
+
+         output_path.write_text("\n".join(lines), encoding="utf-8")
+         logger.info(f"Generated TXT: {output_path}")
+
+         return output_path
+
+     @classmethod
+     def generate_srt(cls, segments: List[TranscriptSegment], output_path: Path) -> Path:
+         """
+         Generate SRT subtitle file.
+         """
+         lines = []
+         for i, seg in enumerate(segments, 1):
+             start = cls.format_timestamp_srt(seg.start)
+             end = cls.format_timestamp_srt(seg.end)
+             lines.append(str(i))
+             lines.append(f"{start} --> {end}")
+             lines.append(f"[{seg.speaker}] {seg.text}")
+             lines.append("")  # Empty line between entries
+
+         output_path.write_text("\n".join(lines), encoding="utf-8")
+         logger.info(f"Generated SRT: {output_path}")
+
+         return output_path
+
+     @classmethod
+     def generate_outputs(
+         cls,
+         segments: List[TranscriptSegment],
+         base_filename: str
+     ) -> Tuple[Path, Path]:
+         """Generate both TXT and SRT output files."""
+         txt_path = settings.processed_dir / f"{base_filename}.txt"
+         srt_path = settings.processed_dir / f"{base_filename}.srt"
+
+         cls.generate_txt(segments, txt_path)
+         cls.generate_srt(segments, srt_path)
+
+         return txt_path, srt_path
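
A worked sketch of the word-center rule: each word is attributed to whichever diarization turn contains its midpoint, then consecutive same-speaker words are stitched back into segments. The timings below are invented for illustration:

```python
# Sketch: two words whose centers (0.2s and 0.7s) fall inside one speaker turn.
from app.services.alignment import AlignmentService
from app.services.diarization import SpeakerSegment
from app.services.transcription import WordTimestamp

words = [WordTimestamp("xin", 0.0, 0.4), WordTimestamp("chào", 0.5, 0.9)]
turns = [SpeakerSegment(0.0, 1.0, "Speaker 1")]

segments = AlignmentService.align_precision(words, turns)
# -> one TranscriptSegment: 0.0-0.9, "Speaker 1", "xin chào"
# (the 0.1s pause is below PAUSE_THRESHOLD, so the words stay in one segment)
```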
app/services/audio_processor.py ADDED
@@ -0,0 +1,244 @@
+ """
+ Audio processing service using FFmpeg.
+ Handles file validation, conversion to 16kHz mono WAV, and cleanup.
+ """
+ import os
+ import uuid
+ import asyncio
+ import logging
+ from pathlib import Path
+ from typing import Optional, Tuple
+
+ import ffmpeg
+
+ from app.core.config import get_settings
+ from app.services.vocal_separator import VocalSeparator
+ from app.services.denoiser import DenoiserService
+
+ logger = logging.getLogger(__name__)
+ settings = get_settings()
+
+
+ class AudioProcessingError(Exception):
+     """Custom exception for audio processing errors."""
+     pass
+
+
+ class AudioProcessor:
+     """Service for processing audio files."""
+
+     ALLOWED_EXTENSIONS = settings.allowed_extensions
+     TARGET_SAMPLE_RATE = settings.sample_rate
+     TARGET_CHANNELS = settings.channels
+
+     @classmethod
+     def validate_file(cls, filename: str, file_size: int) -> bool:
+         """
+         Validate uploaded file.
+
+         Args:
+             filename: Original filename
+             file_size: File size in bytes
+
+         Returns:
+             True if valid
+
+         Raises:
+             AudioProcessingError: If validation fails
+         """
+         # Check extension
+         ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
+         if ext not in cls.ALLOWED_EXTENSIONS:
+             raise AudioProcessingError(
+                 f"Invalid file type: .{ext}. Allowed: {', '.join(cls.ALLOWED_EXTENSIONS)}"
+             )
+
+         # Check size
+         if file_size > settings.max_upload_size_bytes:
+             raise AudioProcessingError(
+                 f"File too large: {file_size / (1024*1024):.1f}MB. "
+                 f"Maximum: {settings.max_upload_size_mb}MB"
+             )
+
+         return True
+
+     @classmethod
+     async def save_upload(cls, file_content: bytes, original_filename: str) -> Path:
+         """
+         Save uploaded file to a temporary location.
+
+         Args:
+             file_content: File bytes
+             original_filename: Original filename for extension
+
+         Returns:
+             Path to saved file
+         """
+         ext = original_filename.rsplit('.', 1)[-1].lower() if '.' in original_filename else 'wav'
+         unique_id = str(uuid.uuid4())[:8]
+         filename = f"{unique_id}.{ext}"
+         filepath = settings.upload_dir / filename
+
+         # Write file asynchronously
+         loop = asyncio.get_event_loop()
+         await loop.run_in_executor(None, lambda: filepath.write_bytes(file_content))
+
+         logger.debug(f"Saved upload: {filepath}")
+         return filepath
+
+     @classmethod
+     async def convert_to_wav(cls, input_path: Path) -> Path:
+         """
+         Convert audio to 16kHz mono WAV using FFmpeg.
+
+         Args:
+             input_path: Path to input audio file
+
+         Returns:
+             Path to converted WAV file
+         """
+         output_filename = f"{input_path.stem}_processed.wav"
+         output_path = settings.processed_dir / output_filename
+
+         try:
+             # Run ffmpeg conversion in an executor so it doesn't block
+             loop = asyncio.get_event_loop()
+             await loop.run_in_executor(None, lambda: cls._run_ffmpeg_conversion(input_path, output_path))
+
+             logger.info(f"Converted to WAV: {output_path}")
+             return output_path
+
+         except ffmpeg.Error as e:
+             error_msg = e.stderr.decode() if e.stderr else str(e)
+             logger.error(f"FFmpeg error: {error_msg}")
+             raise AudioProcessingError(f"Audio conversion failed: {error_msg}")
+
+     @staticmethod
+     def _run_ffmpeg_conversion(input_path: Path, output_path: Path) -> None:
+         """Run the actual FFmpeg conversion (blocking)."""
+         stream = ffmpeg.input(str(input_path))
+
+         # Apply normalization if enabled (loudnorm is best for speech consistency)
+         if settings.enable_loudnorm:
+             logger.debug("Applying loudnorm normalization...")
+             stream = stream.filter('loudnorm', I=-16, TP=-1.5, LRA=11)
+
+         # Apply noise reduction if enabled (basic filters are kept as minor cleanup)
+         if settings.enable_noise_reduction:
+             logger.debug("Applying subtle highpass filter...")
+             stream = stream.filter('highpass', f=80)
+
+         (
+             stream
+             .output(
+                 str(output_path),
+                 acodec='pcm_s16le',
+                 ar=16000,
+                 ac=1
+             )
+             .overwrite_output()
+             .run(quiet=True, capture_stderr=True)
+         )
+
+     @classmethod
+     async def get_audio_duration(cls, filepath: Path) -> float:
+         """
+         Get audio file duration in seconds.
+
+         Args:
+             filepath: Path to audio file
+
+         Returns:
+             Duration in seconds
+         """
+         try:
+             loop = asyncio.get_event_loop()
+             probe = await loop.run_in_executor(
+                 None,
+                 lambda: ffmpeg.probe(str(filepath))
+             )
+
+             duration = float(probe['format'].get('duration', 0))
+             return duration
+
+         except ffmpeg.Error as e:
+             logger.warning(f"Could not probe audio duration: {e}")
+             return 0.0
+
+     @classmethod
+     async def cleanup_files(cls, *filepaths: Path) -> None:
+         """
+         Delete temporary files.
+
+         Args:
+             filepaths: Paths to files to delete
+         """
+         for filepath in filepaths:
+             try:
+                 if filepath and filepath.exists():
+                     filepath.unlink()
+                     logger.debug(f"Cleaned up: {filepath}")
+             except Exception as e:
+                 logger.warning(f"Failed to clean up {filepath}: {e}")
+
+     @classmethod
+     async def process_upload(cls, file_content: bytes, filename: str) -> Tuple[Path, float]:
+         """
+         Full upload processing pipeline: validate, save, convert.
+
+         Args:
+             file_content: Uploaded file bytes
+             filename: Original filename
+
+         Returns:
+             Tuple of (processed WAV path, duration in seconds)
+         """
+         # Validate
+         cls.validate_file(filename, len(file_content))
+
+         # Save original
+         original_path = await cls.save_upload(file_content, filename)
+         vocals_path = None
+
+         try:
+             # Step 1: Denoising (speech enhancement)
+             if settings.enable_denoiser:
+                 denoised_path = await DenoiserService.enhance_audio(original_path)
+                 source_for_separation = denoised_path
+             else:
+                 source_for_separation = original_path
+                 denoised_path = None
+
+             # Step 2: Vocal separation using MDX-Net
+             if settings.enable_vocal_separation:
+                 vocals_path = await VocalSeparator.separate_vocals(source_for_separation)
+                 source_for_conversion = vocals_path
+             else:
+                 source_for_conversion = source_for_separation
+                 vocals_path = None
+
+             # Step 3: Convert to 16kHz mono WAV (includes normalization)
+             wav_path = await cls.convert_to_wav(source_for_conversion)
+
+             # Get duration
+             duration = await cls.get_audio_duration(wav_path)
+
+             # Cleanup intermediate files
+             to_cleanup = [original_path]
+             if denoised_path and denoised_path != original_path:
+                 to_cleanup.append(denoised_path)
+             if vocals_path and vocals_path not in [original_path, denoised_path]:
+                 to_cleanup.append(vocals_path)
+
+             await cls.cleanup_files(*to_cleanup)
+
+             return wav_path, duration
+
+         except Exception:
+             # Cleanup on error
+             await cls.cleanup_files(original_path)
+             if 'denoised_path' in locals() and denoised_path and denoised_path != original_path:
+                 await cls.cleanup_files(denoised_path)
+             if 'vocals_path' in locals() and vocals_path and vocals_path not in [original_path, denoised_path]:
+                 await cls.cleanup_files(vocals_path)
+             raise
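
For debugging outside the app, the `_run_ffmpeg_conversion` filter chain corresponds roughly to the following command line; a sketch with placeholder filenames:

```python
# Sketch: approximate CLI equivalent of _run_ffmpeg_conversion.
import subprocess

subprocess.run(
    [
        "ffmpeg", "-y", "-i", "input.mp3",  # placeholder input
        "-af", "loudnorm=I=-16:TP=-1.5:LRA=11,highpass=f=80",
        "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
        "output.wav",                       # placeholder output
    ],
    check=True,
)
```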
app/services/denoiser.py ADDED
@@ -0,0 +1,142 @@
+ """
+ Speech Enhancement Service using Facebook's Denoiser.
+ Removes background noise and enhances speech quality.
+ """
+ import os
+ import asyncio
+ import logging
+ from pathlib import Path
+ from typing import Optional
+
+ import torch
+ import torchaudio
+
+ from app.core.config import get_settings
+
+ logger = logging.getLogger(__name__)
+ settings = get_settings()
+
+
+ class DenoiserError(Exception):
+     """Custom exception for denoiser errors."""
+     pass
+
+
+ class DenoiserService:
+     """
+     Service for enhancing speech using Facebook's Denoiser models.
+     Supports dns48, dns64, and master64.
+     """
+
+     _model = None
+     _model_name: Optional[str] = None
+
+     @classmethod
+     def _get_model(cls):
+         """Lazily load the Denoiser model."""
+         if cls._model is None or cls._model_name != settings.denoiser_model:
+             from denoiser.pretrained import dns48, dns64, master64
+
+             model_map = {
+                 "dns48": dns48,
+                 "dns64": dns64,
+                 "master64": master64
+             }
+
+             model_func = model_map.get(settings.denoiser_model, dns64)
+             logger.debug(f"Loading Denoiser model: {settings.denoiser_model}")
+
+             model = model_func()
+             device = settings.resolved_device
+             model.to(device)
+             model.eval()
+
+             cls._model = model
+             cls._model_name = settings.denoiser_model
+             logger.debug(f"Denoiser model loaded on {device}")
+
+         return cls._model
+
+     @classmethod
+     async def enhance_audio(cls, input_path: Path) -> Path:
+         """
+         Enhance audio by removing noise.
+
+         Args:
+             input_path: Path to input audio file
+
+         Returns:
+             Path to enhanced WAV file
+         """
+         if not settings.enable_denoiser:
+             logger.debug("Denoiser disabled, skipping...")
+             return input_path
+
+         logger.debug(f"Starting speech enhancement for: {input_path.name}")
+
+         try:
+             # Run enhancement in an executor so it doesn't block
+             loop = asyncio.get_event_loop()
+             enhanced_path = await loop.run_in_executor(
+                 None,
+                 lambda: cls._run_enhancement(input_path)
+             )
+
+             logger.info(f"Speech enhancement complete: {enhanced_path.name}")
+             return enhanced_path
+
+         except Exception as e:
+             logger.error(f"Speech enhancement failed: {e}")
+             # Fall back to the original on failure rather than failing the whole pipeline
+             logger.warning("Falling back to original audio.")
+             return input_path
+
+     @classmethod
+     def _run_enhancement(cls, input_path: Path) -> Path:
+         """Run the actual denoiser enhancement (blocking)."""
+         from denoiser.enhance import enhance
+
+         model = cls._get_model()
+         device = settings.resolved_device
+
+         # Load audio
+         wav, sr = torchaudio.load(str(input_path))
+         wav = wav.to(device)
+
+         # Ensure correct sample rate for the model
+         if sr != model.sample_rate:
+             resampler = torchaudio.transforms.Resample(sr, model.sample_rate).to(device)
+             wav = resampler(wav)
+             sr = model.sample_rate
+
+         # Enhance; wav shape: [channels, time]
+         from types import SimpleNamespace
+
+         args = SimpleNamespace(
+             streaming=False,
+             dry=0.0,
+             sample_rate=sr
+         )
+
+         with torch.no_grad():
+             # Ensure a batch dimension: [batch, channels, time]
+             if wav.dim() == 1:
+                 wav = wav.unsqueeze(0).unsqueeze(0)
+             elif wav.dim() == 2:
+                 wav = wav.unsqueeze(0)
+
+             enhanced = enhance(args, model, wav)
+             # Remove the batch dimension
+             enhanced = enhanced.squeeze(0)
+
+         # Save enhanced audio
+         output_filename = f"{input_path.stem}_denoised.wav"
+         output_path = settings.processed_dir / output_filename
+
+         torchaudio.save(
+             str(output_path),
+             enhanced.cpu(),
+             sr
+         )
+
+         return output_path
app/services/diarization.py ADDED
@@ -0,0 +1,180 @@
+ """
+ Speaker diarization service using pyannote.audio.
+ Identifies speaker turns in audio files.
+ """
+ import os
+ import logging
+ from pathlib import Path
+ from typing import List, Optional
+ from dataclasses import dataclass
+
+ import torch
+
+ from app.core.config import get_settings
+
+ logger = logging.getLogger(__name__)
+ settings = get_settings()
+
+
+ @dataclass
+ class SpeakerSegment:
+     """A segment of audio attributed to a specific speaker."""
+     start: float
+     end: float
+     speaker: str
+
+
+ class DiarizationService:
+     """
+     Service for speaker diarization using pyannote.audio.
+     Implements lazy loading to avoid memory overhead at startup.
+     """
+
+     _instance: Optional["DiarizationService"] = None
+     _pipeline = None
+
+     def __new__(cls):
+         if cls._instance is None:
+             cls._instance = super().__new__(cls)
+         return cls._instance
+
+     @classmethod
+     def get_pipeline(cls):
+         """
+         Get or load the diarization pipeline (lazy loading with caching).
+
+         Returns:
+             Loaded pyannote Pipeline
+         """
+         if cls._pipeline is None:
+             # Import here to avoid loading if not used
+             from pyannote.audio import Pipeline
+
+             hf_token = settings.hf_token
+             if not hf_token:
+                 raise ValueError(
+                     "HuggingFace token required for pyannote.audio. "
+                     "Set HF_TOKEN in your environment or .env file."
+                 )
+
+             logger.debug(f"Loading diarization pipeline: {settings.diarization_model}")
+
+             # Use 'token' parameter (use_auth_token is deprecated)
+             cls._pipeline = Pipeline.from_pretrained(
+                 settings.diarization_model,
+                 token=hf_token
+             )
+
+             # Move to GPU if available
+             device = torch.device(settings.resolved_device)
+             if device.type == "cuda":
+                 cls._pipeline = cls._pipeline.to(device)
+                 logger.debug("Diarization pipeline moved to GPU")
+
+             logger.debug("Diarization pipeline loaded successfully")
+
+         return cls._pipeline
+
+     @classmethod
+     def is_loaded(cls) -> bool:
+         """Check if pipeline is loaded."""
+         return cls._pipeline is not None
+
+     @classmethod
+     def diarize(
+         cls,
+         audio_path: Path,
+         num_speakers: Optional[int] = None,
+         min_speakers: int = 1,
+         max_speakers: int = 10
+     ) -> List[SpeakerSegment]:
+         """
+         Perform speaker diarization on an audio file.
+
+         Args:
+             audio_path: Path to WAV audio file
+             num_speakers: Exact number of speakers (None for auto-detect)
+             min_speakers: Minimum number of speakers to detect
+             max_speakers: Maximum number of speakers to detect
+
+         Returns:
+             List of SpeakerSegment with speaker labels
+         """
+         pipeline = cls.get_pipeline()
+
+         logger.debug(f"Diarizing: {audio_path}")
+
+         # Build parameters
+         params = {}
+         if num_speakers is not None:
+             params["num_speakers"] = num_speakers
+         else:
+             params["min_speakers"] = min_speakers
+             params["max_speakers"] = max_speakers
+
+         # Run diarization
+         diarization = pipeline(str(audio_path), **params)
+
+         # Handle the pyannote.audio 4.x breaking change:
+         # in 4.x, the pipeline returns a DiarizeOutput object wrapping the Annotation;
+         # in 3.x, it returns the Annotation directly.
+         annotation = diarization
+         if hasattr(diarization, "speaker_diarization"):
+             annotation = diarization.speaker_diarization
+             logger.debug("Detected pyannote.audio 4.x DiarizeOutput structure")
+
+         # Convert to segments
+         segments = []
+         speaker_map = {}  # Map SPEAKER_XX to Speaker 1, 2, etc.
+
+         for turn, _, speaker in annotation.itertracks(yield_label=True):
+             # Create readable speaker label
+             if speaker not in speaker_map:
+                 speaker_map[speaker] = f"Speaker {len(speaker_map) + 1}"
+
+             segments.append(SpeakerSegment(
+                 start=turn.start,
+                 end=turn.end,
+                 speaker=speaker_map[speaker]
+             ))
+
+         logger.info(f"Diarization complete: {len(segments)} turns, {len(speaker_map)} speakers")
+
+         return segments
+
+     @classmethod
+     async def diarize_async(
+         cls,
+         audio_path: Path,
+         num_speakers: Optional[int] = None,
+         min_speakers: int = 1,
+         max_speakers: int = 10
+     ) -> List[SpeakerSegment]:
+         """
+         Async wrapper for diarization (runs in a thread pool).
+
+         Args:
+             audio_path: Path to WAV audio file
+             num_speakers: Exact number of speakers
+             min_speakers: Minimum speakers
+             max_speakers: Maximum speakers
+
+         Returns:
+             List of SpeakerSegment
+         """
+         import asyncio
+
+         loop = asyncio.get_event_loop()
+         return await loop.run_in_executor(
+             None,
+             lambda: cls.diarize(audio_path, num_speakers, min_speakers, max_speakers)
+         )
+
+     @classmethod
+     def preload_pipeline(cls) -> None:
+         """Preload the pipeline during startup."""
+         try:
+             cls.get_pipeline()
+         except Exception as e:
+             logger.warning(f"Failed to preload diarization pipeline: {e}")
+             # Don't raise - diarization is optional, app can work without it
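
A hedged usage sketch (requires `HF_TOKEN` and accepted terms for the pyannote model; the file path is a placeholder):

```python
# Sketch: run diarization directly on a prepared 16kHz mono WAV.
from pathlib import Path
from app.services.diarization import DiarizationService

turns = DiarizationService.diarize(Path("data/processed/example.wav"), num_speakers=2)
for t in turns:
    print(f"{t.start:6.2f}-{t.end:6.2f}  {t.speaker}")
```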
app/services/orchestrator.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pipeline Orchestrator for PrecisionVoice.
3
+ Coordinates transcription and diarization in parallel.
4
+ """
5
+ import time
6
+ import asyncio
7
+ import logging
8
+ from pathlib import Path
9
+
10
+ from app.core.config import get_settings
11
+ from app.schemas.models import TranscriptionResponse
12
+ from app.services.transcription import TranscriptionService
13
+ from app.services.diarization import DiarizationService
14
+ from app.services.alignment import AlignmentService
15
+
16
+ logger = logging.getLogger(__name__)
17
+ settings = get_settings()
18
+
19
+ class PipelineOrchestrator:
20
+ """
21
+ Coordinates the AI pipeline with detailed server-side logging:
22
+ 1. Audio -> Vocal Separation (MDX-Net) -> 16kHz WAV
23
+ 2. Whisper (Transcribe) + Pyannote (Diarize) in parallel
24
+ 3. Alignment (Matching Algorithm)
25
+ 4. Generate outputs (TXT, SRT)
26
+ """
27
+
28
+ @classmethod
29
+ async def process_audio(
30
+ cls,
31
+ wav_path: Path,
32
+ duration: float
33
+ ) -> TranscriptionResponse:
34
+ """
35
+ Run the full processing pipeline and return the final response.
36
+ Each step is logged for server-side monitoring.
37
+ """
38
+ start_time = time.time()
39
+
40
+ # Step 1: Pre-processing (Vocal Separation + Noise Reduction)
41
+ logger.info(f"[Step 1/4] Audio pre-processing completed (MDX-Net: {settings.enable_vocal_separation}, Denoise: {settings.enable_noise_reduction})")
42
+
43
+ # Step 2: AI Processing (Transcription & Diarization)
44
+ logger.info(f"[Step 2/4] Starting AI models (Whisper + Pyannote) for: {wav_path.name}")
45
+
46
+ transcription_task = TranscriptionService.transcribe_async(wav_path)
47
+ diarization_task = DiarizationService.diarize_async(wav_path)
48
+
49
+ try:
50
+ word_timestamps, speaker_segments = await asyncio.gather(
51
+ transcription_task,
52
+ diarization_task,
53
+ return_exceptions=False
54
+ )
55
+ logger.info(f"AI models processing completed: {len(word_timestamps)} words, {len(speaker_segments)} segments")
56
+ except Exception as e:
57
+ logger.exception("Parallel task failed")
58
+ raise
59
+
60
+ # Step 3: Precision Alignment
61
+ logger.info("[Step 3/4] Aligning words with speaker turns...")
62
+ aligned_segments = AlignmentService.align_precision(word_timestamps, speaker_segments)
63
+
64
+ # Count unique speakers
65
+ speakers = set(seg.speaker for seg in aligned_segments)
66
+
67
+ # Step 4: Export Generation
68
+ logger.info("[Step 4/4] Generating export files (TXT, SRT)...")
69
+ base_filename = wav_path.stem.replace("_processed", "")
70
+ txt_path, srt_path = AlignmentService.generate_outputs(aligned_segments, base_filename)
71
+
72
+ processing_time = time.time() - start_time
73
+ logger.info(f"Pipeline complete for {wav_path.name} in {processing_time:.2f}s")
74
+
75
+ return TranscriptionResponse(
76
+ success=True,
77
+ message="Transcription completed successfully",
78
+ segments=aligned_segments,
79
+ duration=duration,
80
+ num_speakers=len(speakers),
81
+ processing_time=round(processing_time, 2),
82
+ download_txt=f"/api/download/{txt_path.name}",
83
+ download_srt=f"/api/download/{srt_path.name}"
84
+ )
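For context, a hedged sketch of how an upload endpoint might hand off to this orchestrator. The route path matches the frontend's fetch('/api/transcribe'); save_and_prepare_wav is a hypothetical stand-in for the real upload and pre-processing step, not an API from this repository:

from pathlib import Path
from typing import Tuple

from fastapi import APIRouter, HTTPException, UploadFile

from app.schemas.models import TranscriptionResponse
from app.services.orchestrator import PipelineOrchestrator

router = APIRouter(prefix="/api")

async def save_and_prepare_wav(file: UploadFile) -> Tuple[Path, float]:
    # Hypothetical helper: validate the upload, run vocal separation /
    # denoising, and return a 16 kHz WAV path plus its duration.
    raise NotImplementedError

@router.post("/transcribe", response_model=TranscriptionResponse)
async def transcribe(file: UploadFile) -> TranscriptionResponse:
    wav_path, duration = await save_and_prepare_wav(file)
    try:
        return await PipelineOrchestrator.process_audio(wav_path, duration)
    except Exception:
        raise HTTPException(status_code=500, detail="Processing failed")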
app/services/transcription.py ADDED
@@ -0,0 +1,168 @@
1
+ """
2
+ Transcription service using faster-whisper.
3
+ Loads the Whisper model configured via WHISPER_MODEL (default: kiendt/PhoWhisper-large-ct2) for Vietnamese STT.
4
+ Returns word-level timestamps for precision alignment.
5
+ """
6
+ import logging
7
+ from pathlib import Path
8
+ from typing import List, Optional
9
+ from dataclasses import dataclass
10
+
11
+ from faster_whisper import WhisperModel
12
+
13
+ from app.core.config import get_settings
14
+
15
+ logger = logging.getLogger(__name__)
16
+ settings = get_settings()
17
+
18
+
19
+ @dataclass
20
+ class WordTimestamp:
21
+ """A single word with precise timestamp."""
22
+ word: str
23
+ start: float
24
+ end: float
25
+
26
+
27
+ @dataclass
28
+ class TranscriptSegmentRaw:
29
+ """Raw segment from Whisper transcription with word-level data."""
30
+ start: float
31
+ end: float
32
+ text: str
33
+ words: List[WordTimestamp]
34
+
35
+
36
+ class TranscriptionService:
37
+ """
38
+ Service for speech-to-text transcription using faster-whisper.
39
+ Implements singleton pattern for model caching.
40
+ Returns word-level timestamps for precision speaker alignment.
41
+ """
42
+
43
+ _instance: Optional["TranscriptionService"] = None
44
+ _model: Optional[WhisperModel] = None
45
+
46
+ def __new__(cls):
47
+ if cls._instance is None:
48
+ cls._instance = super().__new__(cls)
49
+ return cls._instance
50
+
51
+ @classmethod
52
+ def get_model(cls) -> WhisperModel:
53
+ """
54
+ Get or load the Whisper model (lazy loading with caching).
55
+
56
+ Returns:
57
+ Loaded WhisperModel instance
58
+ """
59
+ if cls._model is None:
60
+ logger.debug(f"Loading Whisper model: {settings.whisper_model}")
61
+ logger.debug(f"Device: {settings.resolved_device}, Compute type: {settings.resolved_compute_type}")
62
+
63
+ cls._model = WhisperModel(
64
+ settings.whisper_model,
65
+ device=settings.resolved_device,
66
+ compute_type=settings.resolved_compute_type,
67
+ download_root=None, # Use default HF cache
68
+ )
69
+
70
+ logger.debug("Whisper model loaded successfully")
71
+
72
+ return cls._model
73
+
74
+ @classmethod
75
+ def is_loaded(cls) -> bool:
76
+ """Check if model is loaded."""
77
+ return cls._model is not None
78
+
79
+ @classmethod
80
+ def transcribe(
81
+ cls,
82
+ audio_path: Path,
83
+ language: str = "vi",
84
+ initial_prompt: Optional[str] = None
85
+ ) -> List[WordTimestamp]:
86
+ """
87
+ Transcribe audio file with word-level timestamps.
88
+
89
+ Args:
90
+ audio_path: Path to WAV audio file
91
+ language: Language code (default: Vietnamese)
92
+ initial_prompt: Optional prompt for context
93
+
94
+ Returns:
95
+ List of WordTimestamp with precise timing for each word
96
+ """
97
+ model = cls.get_model()
98
+
99
+ logger.debug(f"Transcribing: {audio_path}")
100
+
101
+ # Run transcription with word timestamps - CRITICAL for precision alignment
102
+ segments_generator, info = model.transcribe(
103
+ str(audio_path),
104
+ language=language,
105
+ initial_prompt=initial_prompt,
106
+ word_timestamps=True, # CRITICAL: Enable word-level timestamps
107
+ vad_filter=True, # Re-enabled for optimization
108
+ vad_parameters=dict(
109
+ threshold=settings.vad_threshold,
110
+ min_speech_duration_ms=settings.vad_min_speech_duration_ms,
111
+ min_silence_duration_ms=settings.vad_min_silence_duration_ms,
112
+ ),
113
+ beam_size=5,
114
+ best_of=5,
115
+ )
116
+
117
+ # Extract all words with timestamps
118
+ all_words = []
119
+ segment_count = 0
120
+
121
+ for segment in segments_generator:
122
+ segment_count += 1
123
+ if segment.words:
124
+ for word in segment.words:
125
+ all_words.append(WordTimestamp(
126
+ word=word.word.strip(),
127
+ start=word.start,
128
+ end=word.end
129
+ ))
130
+
131
+ logger.info(f"Transcription complete: {segment_count} segments, {len(all_words)} words, detected language: {info.language}")
132
+
133
+ return all_words
134
+
135
+ @classmethod
136
+ async def transcribe_async(
137
+ cls,
138
+ audio_path: Path,
139
+ language: str = "vi",
140
+ initial_prompt: Optional[str] = None
141
+ ) -> List[WordTimestamp]:
142
+ """
143
+ Async wrapper for transcription (runs in thread pool).
144
+
145
+ Args:
146
+ audio_path: Path to WAV audio file
147
+ language: Language code
148
+ initial_prompt: Optional prompt
149
+
150
+ Returns:
151
+ List of WordTimestamp
152
+ """
153
+ import asyncio
154
+
155
+ loop = asyncio.get_running_loop()
156
+ return await loop.run_in_executor(
157
+ None,
158
+ lambda: cls.transcribe(audio_path, language, initial_prompt)
159
+ )
160
+
161
+ @classmethod
162
+ def preload_model(cls) -> None:
163
+ """Preload the model during startup."""
164
+ try:
165
+ cls.get_model()
166
+ except Exception as e:
167
+ logger.error(f"Failed to preload Whisper model: {e}")
168
+ raise
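A minimal usage sketch for the service above, assuming model weights are already cached and a 16 kHz WAV exists on disk (the path is illustrative):

from pathlib import Path

from app.services.transcription import TranscriptionService

# word_timestamps=True inside transcribe() is what makes this per-word
# timing available for the precision alignment step.
words = TranscriptionService.transcribe(Path("data/processed/example.wav"))
for w in words[:5]:
    print(f"{w.start:6.2f}s - {w.end:6.2f}s  {w.word}")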
app/services/vocal_separator.py ADDED
@@ -0,0 +1,118 @@
1
+ """
2
+ Vocal Separation Service using MDX-Net (via audio-separator).
3
+ Isolates vocals from audio files using state-of-the-art MDX-Net models.
4
+ """
5
+ import os
6
+ import asyncio
7
+ import logging
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ from app.core.config import get_settings
12
+
13
+ logger = logging.getLogger(__name__)
14
+ settings = get_settings()
15
+
16
+
17
+ class VocalSeparationError(Exception):
18
+ """Custom exception for vocal separation errors."""
19
+ pass
20
+
21
+
22
+ class VocalSeparator:
23
+ """
24
+ Service for separating vocals from audio using MDX-Net.
25
+ Uses the audio-separator library which supports UVR models.
26
+ """
27
+
28
+ _separator = None
29
+ _model_name: Optional[str] = None
30
+
31
+ @classmethod
32
+ def _get_separator(cls):
33
+ """Lazy load the Audio Separator."""
34
+ if cls._separator is None or cls._model_name != settings.mdx_model:
35
+ from audio_separator.separator import Separator
36
+
37
+ logger.debug(f"Initializing MDX-Net separator with model: {settings.mdx_model}")
38
+
39
+ # Initialize separator
40
+ # Note: audio-separator expects output_dir to exist
41
+ settings.processed_dir.mkdir(parents=True, exist_ok=True)
42
+
43
+ separator = Separator(
44
+ output_dir=str(settings.processed_dir),
45
+ output_format="WAV",
46
+ normalization_threshold=0.9
47
+ )
48
+
49
+ # Load model
50
+ separator.load_model(settings.mdx_model)
51
+
52
+ cls._separator = separator
53
+ cls._model_name = settings.mdx_model
54
+ logger.debug(f"MDX-Net model loaded on {settings.resolved_device}")
55
+
56
+ return cls._separator
57
+
58
+ @classmethod
59
+ async def separate_vocals(cls, input_path: Path) -> Path:
60
+ """
61
+ Separate vocals from audio file using MDX-Net.
62
+
63
+ Args:
64
+ input_path: Path to input audio file
65
+
66
+ Returns:
67
+ Path to separated vocals WAV file
68
+ """
69
+ if not settings.enable_vocal_separation:
70
+ logger.debug("Vocal separation disabled, skipping...")
71
+ return input_path
72
+
73
+ logger.debug(f"Starting vocal separation for: {input_path.name}")
74
+
75
+ try:
76
+ # Run separation in executor to not block
77
+ loop = asyncio.get_running_loop()
78
+ vocals_path = await loop.run_in_executor(
79
+ None,
80
+ lambda: cls._run_separation(input_path)
81
+ )
82
+
83
+ logger.info(f"Vocal separation complete: {vocals_path.name}")
84
+ return vocals_path
85
+
86
+ except Exception as e:
87
+ logger.error(f"Vocal separation failed: {e}")
88
+ # Fallback to original
89
+ logger.warning("Falling back to original audio.")
90
+ return input_path
91
+
92
+ @classmethod
93
+ def _run_separation(cls, input_path: Path) -> Path:
94
+ """Run the actual separation (blocking)."""
95
+ separator = cls._get_separator()
96
+
97
+ # separate() returns a list of output filenames
98
+ output_files = separator.separate(str(input_path))
99
+
100
+ # audio-separator usually produces multiple files (Vocals, Instrumental)
101
+ # We need to find the vocals one.
102
+ # It typically names them like {input_stem}_(Vocals)_{model}.wav
103
+
104
+ vocals_file = None
105
+ for file in output_files:
106
+ if "Vocals" in file:
107
+ vocals_file = settings.processed_dir / file
108
+ break
109
+
110
+ if not vocals_file:
111
+ # If we can't find the vocals file specifically, just take the first one or fail
112
+ logger.warning("Could not identify vocals stem in output files.")
113
+ if output_files:
114
+ vocals_file = settings.processed_dir / output_files[0]
115
+ else:
116
+ raise VocalSeparationError("No output files generated by separator.")
117
+
118
+ return vocals_file
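Since separate_vocals is a coroutine, a one-off invocation looks like the sketch below (the input path is illustrative; with ENABLE_VOCAL_SEPARATION=False the call simply returns the input path unchanged):

import asyncio
from pathlib import Path

from app.services.vocal_separator import VocalSeparator

async def main() -> None:
    # Falls back to the original file if separation fails or is disabled.
    vocals = await VocalSeparator.separate_vocals(Path("data/uploads/example.mp3"))
    print(f"Vocals stem: {vocals}")

asyncio.run(main())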
app/static/css/style.css ADDED
@@ -0,0 +1,673 @@
1
+ /* ================================
2
+ PrecisionVoice - Modern Dark Theme
3
+ ================================ */
4
+
5
+ :root {
6
+ /* Color Palette */
7
+ --bg-primary: #0a0a0f;
8
+ --bg-secondary: #12121a;
9
+ --bg-card: rgba(255, 255, 255, 0.03);
10
+ --bg-card-hover: rgba(255, 255, 255, 0.05);
11
+
12
+ --text-primary: #ffffff;
13
+ --text-secondary: #a0a0b0;
14
+ --text-muted: #606070;
15
+
16
+ --accent-primary: #6366f1;
17
+ --accent-secondary: #8b5cf6;
18
+ --accent-gradient: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%);
19
+
20
+ --success: #10b981;
21
+ --error: #ef4444;
22
+ --warning: #f59e0b;
23
+
24
+ --border-color: rgba(255, 255, 255, 0.08);
25
+ --border-glow: rgba(99, 102, 241, 0.3);
26
+
27
+ /* Spacing */
28
+ --spacing-xs: 0.25rem;
29
+ --spacing-sm: 0.5rem;
30
+ --spacing-md: 1rem;
31
+ --spacing-lg: 1.5rem;
32
+ --spacing-xl: 2rem;
33
+ --spacing-2xl: 3rem;
34
+
35
+ /* Border Radius */
36
+ --radius-sm: 0.375rem;
37
+ --radius-md: 0.75rem;
38
+ --radius-lg: 1rem;
39
+ --radius-xl: 1.5rem;
40
+
41
+ /* Shadows */
42
+ --shadow-sm: 0 2px 8px rgba(0, 0, 0, 0.3);
43
+ --shadow-md: 0 4px 16px rgba(0, 0, 0, 0.4);
44
+ --shadow-lg: 0 8px 32px rgba(0, 0, 0, 0.5);
45
+ --shadow-glow: 0 0 40px rgba(99, 102, 241, 0.15);
46
+
47
+ /* Transitions */
48
+ --transition-fast: 0.15s ease;
49
+ --transition-normal: 0.3s ease;
50
+ --transition-slow: 0.5s ease;
51
+ }
52
+
53
+ /* ================================
54
+ Base Styles
55
+ ================================ */
56
+
57
+ *,
58
+ *::before,
59
+ *::after {
60
+ box-sizing: border-box;
61
+ margin: 0;
62
+ padding: 0;
63
+ }
64
+
65
+ html {
66
+ font-size: 16px;
67
+ scroll-behavior: smooth;
68
+ }
69
+
70
+ body {
71
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
72
+ background: var(--bg-primary);
73
+ color: var(--text-primary);
74
+ line-height: 1.6;
75
+ min-height: 100vh;
76
+ -webkit-font-smoothing: antialiased;
77
+ -moz-osx-font-smoothing: grayscale;
78
+ }
79
+
80
+ /* Animated background gradient */
81
+ body::before {
82
+ content: '';
83
+ position: fixed;
84
+ top: 0;
85
+ left: 0;
86
+ right: 0;
87
+ bottom: 0;
88
+ background:
89
+ radial-gradient(ellipse at 20% 20%, rgba(99, 102, 241, 0.08) 0%, transparent 50%),
90
+ radial-gradient(ellipse at 80% 80%, rgba(139, 92, 246, 0.06) 0%, transparent 50%),
91
+ radial-gradient(ellipse at 50% 50%, rgba(168, 85, 247, 0.04) 0%, transparent 70%);
92
+ pointer-events: none;
93
+ z-index: -1;
94
+ }
95
+
96
+ /* ================================
97
+ Layout
98
+ ================================ */
99
+
100
+ .app-container {
101
+ max-width: 800px;
102
+ margin: 0 auto;
103
+ padding: var(--spacing-lg);
104
+ min-height: 100vh;
105
+ display: flex;
106
+ flex-direction: column;
107
+ }
108
+
109
+ /* ================================
110
+ Header
111
+ ================================ */
112
+
113
+ .header {
114
+ text-align: center;
115
+ padding: var(--spacing-2xl) 0;
116
+ }
117
+
118
+ .logo {
119
+ display: flex;
120
+ align-items: center;
121
+ justify-content: center;
122
+ gap: var(--spacing-md);
123
+ margin-bottom: var(--spacing-sm);
124
+ }
125
+
126
+ .logo-icon {
127
+ width: 48px;
128
+ height: 48px;
129
+ background: var(--accent-gradient);
130
+ border-radius: var(--radius-lg);
131
+ display: flex;
132
+ align-items: center;
133
+ justify-content: center;
134
+ box-shadow: var(--shadow-glow);
135
+ }
136
+
137
+ .logo-icon svg {
138
+ width: 28px;
139
+ height: 28px;
140
+ color: white;
141
+ }
142
+
143
+ .logo h1 {
144
+ font-size: 2rem;
145
+ font-weight: 700;
146
+ background: var(--accent-gradient);
147
+ -webkit-background-clip: text;
148
+ -webkit-text-fill-color: transparent;
149
+ background-clip: text;
150
+ }
151
+
152
+ .tagline {
153
+ color: var(--text-secondary);
154
+ font-size: 1rem;
155
+ font-weight: 400;
156
+ }
157
+
158
+ /* ================================
159
+ Cards
160
+ ================================ */
161
+
162
+ .card {
163
+ background: var(--bg-card);
164
+ backdrop-filter: blur(20px);
165
+ border: 1px solid var(--border-color);
166
+ border-radius: var(--radius-xl);
167
+ padding: var(--spacing-xl);
168
+ margin-bottom: var(--spacing-lg);
169
+ transition: var(--transition-normal);
170
+ }
171
+
172
+ .card:hover {
173
+ border-color: var(--border-glow);
174
+ box-shadow: var(--shadow-glow);
175
+ }
176
+
177
+ .card-header {
178
+ display: flex;
179
+ align-items: center;
180
+ justify-content: space-between;
181
+ margin-bottom: var(--spacing-lg);
182
+ flex-wrap: wrap;
183
+ gap: var(--spacing-sm);
184
+ }
185
+
186
+ .card-header h2 {
187
+ font-size: 1.25rem;
188
+ font-weight: 600;
189
+ }
190
+
191
+ /* ================================
192
+ Badge
193
+ ================================ */
194
+
195
+ .badge {
196
+ display: inline-block;
197
+ padding: var(--spacing-xs) var(--spacing-sm);
198
+ background: rgba(99, 102, 241, 0.15);
199
+ color: var(--accent-primary);
200
+ border-radius: var(--radius-sm);
201
+ font-size: 0.75rem;
202
+ font-weight: 500;
203
+ text-transform: uppercase;
204
+ letter-spacing: 0.5px;
205
+ }
206
+
207
+ /* ================================
208
+ Upload Zone
209
+ ================================ */
210
+
211
+ .upload-zone {
212
+ border: 2px dashed var(--border-color);
213
+ border-radius: var(--radius-lg);
214
+ padding: var(--spacing-2xl);
215
+ text-align: center;
216
+ cursor: pointer;
217
+ transition: var(--transition-normal);
218
+ margin-bottom: var(--spacing-lg);
219
+ }
220
+
221
+ .upload-zone:hover,
222
+ .upload-zone.dragover {
223
+ border-color: var(--accent-primary);
224
+ background: rgba(99, 102, 241, 0.05);
225
+ }
226
+
227
+ .upload-zone.dragover {
228
+ transform: scale(1.02);
229
+ }
230
+
231
+ .upload-icon {
232
+ width: 64px;
233
+ height: 64px;
234
+ margin: 0 auto var(--spacing-md);
235
+ background: var(--accent-gradient);
236
+ border-radius: 50%;
237
+ display: flex;
238
+ align-items: center;
239
+ justify-content: center;
240
+ opacity: 0.8;
241
+ }
242
+
243
+ .upload-icon svg {
244
+ width: 32px;
245
+ height: 32px;
246
+ color: white;
247
+ }
248
+
249
+ .upload-text {
250
+ font-size: 1.125rem;
251
+ font-weight: 500;
252
+ color: var(--text-primary);
253
+ margin-bottom: var(--spacing-xs);
254
+ }
255
+
256
+ .upload-subtext {
257
+ color: var(--text-muted);
258
+ font-size: 0.875rem;
259
+ }
260
+
261
+ /* ================================
262
+ File Info
263
+ ================================ */
264
+
265
+ .file-info {
266
+ display: flex;
267
+ align-items: center;
268
+ justify-content: space-between;
269
+ padding: var(--spacing-md);
270
+ background: rgba(99, 102, 241, 0.1);
271
+ border-radius: var(--radius-md);
272
+ margin-bottom: var(--spacing-lg);
273
+ }
274
+
275
+ .file-details {
276
+ display: flex;
277
+ flex-direction: column;
278
+ gap: var(--spacing-xs);
279
+ }
280
+
281
+ .file-name {
282
+ font-weight: 500;
283
+ color: var(--text-primary);
284
+ }
285
+
286
+ .file-size {
287
+ font-size: 0.875rem;
288
+ color: var(--text-secondary);
289
+ }
290
+
291
+ /* ================================
292
+ Buttons
293
+ ================================ */
294
+
295
+ .btn {
296
+ display: inline-flex;
297
+ align-items: center;
298
+ justify-content: center;
299
+ gap: var(--spacing-sm);
300
+ padding: var(--spacing-md) var(--spacing-xl);
301
+ border: none;
302
+ border-radius: var(--radius-md);
303
+ font-family: inherit;
304
+ font-size: 1rem;
305
+ font-weight: 500;
306
+ cursor: pointer;
307
+ transition: var(--transition-fast);
308
+ text-decoration: none;
309
+ }
310
+
311
+ .btn:disabled {
312
+ opacity: 0.5;
313
+ cursor: not-allowed;
314
+ }
315
+
316
+ .btn svg {
317
+ width: 20px;
318
+ height: 20px;
319
+ }
320
+
321
+ .btn-primary {
322
+ width: 100%;
323
+ background: var(--accent-gradient);
324
+ color: white;
325
+ box-shadow: var(--shadow-md);
326
+ }
327
+
328
+ .btn-primary:hover:not(:disabled) {
329
+ transform: translateY(-2px);
330
+ box-shadow: var(--shadow-lg), var(--shadow-glow);
331
+ }
332
+
333
+ .btn-primary:active:not(:disabled) {
334
+ transform: translateY(0);
335
+ }
336
+
337
+ .btn-secondary {
338
+ background: var(--bg-card);
339
+ color: var(--text-primary);
340
+ border: 1px solid var(--border-color);
341
+ }
342
+
343
+ .btn-secondary:hover:not(:disabled) {
344
+ background: var(--bg-card-hover);
345
+ border-color: var(--accent-primary);
346
+ }
347
+
348
+ .btn-outline {
349
+ background: transparent;
350
+ color: var(--text-primary);
351
+ border: 1px solid var(--border-color);
352
+ padding: var(--spacing-sm) var(--spacing-md);
353
+ }
354
+
355
+ .btn-outline:hover {
356
+ background: var(--bg-card);
357
+ border-color: var(--accent-primary);
358
+ }
359
+
360
+ .btn-clear {
361
+ width: 36px;
362
+ height: 36px;
363
+ padding: 0;
364
+ background: transparent;
365
+ color: var(--text-muted);
366
+ }
367
+
368
+ .btn-clear:hover {
369
+ color: var(--error);
370
+ }
371
+
372
+ /* ================================
373
+ Processing Section
374
+ ================================ */
375
+
376
+ .processing-content {
377
+ text-align: center;
378
+ padding: var(--spacing-xl) 0;
379
+ }
380
+
381
+ .spinner {
382
+ width: 56px;
383
+ height: 56px;
384
+ margin: 0 auto var(--spacing-lg);
385
+ border: 3px solid var(--border-color);
386
+ border-top-color: var(--accent-primary);
387
+ border-radius: 50%;
388
+ animation: spin 1s linear infinite;
389
+ }
390
+
391
+ @keyframes spin {
392
+ to {
393
+ transform: rotate(360deg);
394
+ }
395
+ }
396
+
397
+ .processing-content h3 {
398
+ font-size: 1.25rem;
399
+ margin-bottom: var(--spacing-sm);
400
+ }
401
+
402
+ .processing-content p {
403
+ color: var(--text-secondary);
404
+ margin-bottom: var(--spacing-lg);
405
+ }
406
+
407
+ .progress-bar {
408
+ height: 6px;
409
+ background: var(--bg-secondary);
410
+ border-radius: var(--radius-sm);
411
+ overflow: hidden;
412
+ margin-bottom: var(--spacing-md);
413
+ }
414
+
415
+ .progress-fill {
416
+ height: 100%;
417
+ width: 0%;
418
+ background: var(--accent-gradient);
419
+ border-radius: var(--radius-sm);
420
+ transition: width 0.3s ease;
421
+ animation: pulse 2s ease-in-out infinite;
422
+ }
423
+
424
+ @keyframes pulse {
425
+
426
+ 0%,
427
+ 100% {
428
+ opacity: 1;
429
+ }
430
+
431
+ 50% {
432
+ opacity: 0.7;
433
+ }
434
+ }
435
+
436
+ .processing-hint {
437
+ font-size: 0.875rem;
438
+ color: var(--text-muted);
439
+ }
440
+
441
+ .timer-display {
442
+ font-size: 2rem;
443
+ font-weight: 700;
444
+ color: var(--accent-primary);
445
+ margin: var(--spacing-md) 0;
446
+ font-family: monospace;
447
+ text-shadow: 0 0 10px rgba(99, 102, 241, 0.3);
448
+ }
449
+
450
+ /* ================================
451
+ Results Section
452
+ ================================ */
453
+
454
+ .result-meta {
455
+ display: flex;
456
+ gap: var(--spacing-sm);
457
+ flex-wrap: wrap;
458
+ }
459
+
460
+ .download-buttons {
461
+ display: flex;
462
+ gap: var(--spacing-md);
463
+ margin-bottom: var(--spacing-lg);
464
+ flex-wrap: wrap;
465
+ }
466
+
467
+ .transcript-container {
468
+ max-height: 400px;
469
+ overflow-y: auto;
470
+ padding-right: var(--spacing-sm);
471
+ margin-bottom: var(--spacing-lg);
472
+ }
473
+
474
+ .transcript-container::-webkit-scrollbar {
475
+ width: 6px;
476
+ }
477
+
478
+ .transcript-container::-webkit-scrollbar-track {
479
+ background: var(--bg-secondary);
480
+ border-radius: var(--radius-sm);
481
+ }
482
+
483
+ .transcript-container::-webkit-scrollbar-thumb {
484
+ background: var(--border-color);
485
+ border-radius: var(--radius-sm);
486
+ }
487
+
488
+ .transcript-container::-webkit-scrollbar-thumb:hover {
489
+ background: var(--text-muted);
490
+ }
491
+
492
+ /* Transcript Segment */
493
+ .segment {
494
+ padding: var(--spacing-md);
495
+ border-radius: var(--radius-md);
496
+ margin-bottom: var(--spacing-sm);
497
+ background: var(--bg-secondary);
498
+ border-left: 3px solid var(--accent-primary);
499
+ transition: var(--transition-fast);
500
+ }
501
+
502
+ .segment:hover {
503
+ background: var(--bg-card-hover);
504
+ }
505
+
506
+ .segment-header {
507
+ display: flex;
508
+ align-items: center;
509
+ gap: var(--spacing-md);
510
+ margin-bottom: var(--spacing-xs);
511
+ flex-wrap: wrap;
512
+ }
513
+
514
+ .segment-speaker {
515
+ font-weight: 600;
516
+ color: var(--accent-primary);
517
+ }
518
+
519
+ .segment-time {
520
+ font-size: 0.75rem;
521
+ color: var(--text-muted);
522
+ font-family: monospace;
523
+ }
524
+
525
+ .segment-text {
526
+ color: var(--text-primary);
527
+ line-height: 1.7;
528
+ }
529
+
530
+ /* Speaker Colors */
531
+ .speaker-1 {
532
+ border-left-color: #6366f1;
533
+ }
534
+
535
+ .speaker-1 .segment-speaker {
536
+ color: #6366f1;
537
+ }
538
+
539
+ .speaker-2 {
540
+ border-left-color: #10b981;
541
+ }
542
+
543
+ .speaker-2 .segment-speaker {
544
+ color: #10b981;
545
+ }
546
+
547
+ .speaker-3 {
548
+ border-left-color: #f59e0b;
549
+ }
550
+
551
+ .speaker-3 .segment-speaker {
552
+ color: #f59e0b;
553
+ }
554
+
555
+ .speaker-4 {
556
+ border-left-color: #ec4899;
557
+ }
558
+
559
+ .speaker-4 .segment-speaker {
560
+ color: #ec4899;
561
+ }
562
+
563
+ .speaker-5 {
564
+ border-left-color: #8b5cf6;
565
+ }
566
+
567
+ .speaker-5 .segment-speaker {
568
+ color: #8b5cf6;
569
+ }
570
+
571
+ /* ================================
572
+ Error Section
573
+ ================================ */
574
+
575
+ .error-content {
576
+ text-align: center;
577
+ padding: var(--spacing-xl) 0;
578
+ }
579
+
580
+ .error-icon {
581
+ width: 64px;
582
+ height: 64px;
583
+ margin: 0 auto var(--spacing-lg);
584
+ background: rgba(239, 68, 68, 0.15);
585
+ border-radius: 50%;
586
+ display: flex;
587
+ align-items: center;
588
+ justify-content: center;
589
+ }
590
+
591
+ .error-icon svg {
592
+ width: 32px;
593
+ height: 32px;
594
+ color: var(--error);
595
+ }
596
+
597
+ .error-content h3 {
598
+ color: var(--error);
599
+ margin-bottom: var(--spacing-sm);
600
+ }
601
+
602
+ .error-content p {
603
+ color: var(--text-secondary);
604
+ margin-bottom: var(--spacing-lg);
605
+ }
606
+
607
+ /* ================================
608
+ Footer
609
+ ================================ */
610
+
611
+ .footer {
612
+ margin-top: auto;
613
+ padding: var(--spacing-xl) 0;
614
+ text-align: center;
615
+ color: var(--text-muted);
616
+ font-size: 0.875rem;
617
+ }
618
+
619
+ .footer strong {
620
+ color: var(--text-secondary);
621
+ }
622
+
623
+ .footer-note {
624
+ margin-top: var(--spacing-xs);
625
+ font-size: 0.75rem;
626
+ }
627
+
628
+ /* ================================
629
+ Utility Classes
630
+ ================================ */
631
+
632
+ .hidden {
633
+ display: none !important;
634
+ }
635
+
636
+ /* ================================
637
+ Responsive
638
+ ================================ */
639
+
640
+ @media (max-width: 640px) {
641
+ :root {
642
+ font-size: 14px;
643
+ }
644
+
645
+ .app-container {
646
+ padding: var(--spacing-md);
647
+ }
648
+
649
+ .card {
650
+ padding: var(--spacing-lg);
651
+ }
652
+
653
+ .upload-zone {
654
+ padding: var(--spacing-xl);
655
+ }
656
+
657
+ .card-header {
658
+ flex-direction: column;
659
+ align-items: flex-start;
660
+ }
661
+
662
+ .result-meta {
663
+ width: 100%;
664
+ }
665
+
666
+ .download-buttons {
667
+ flex-direction: column;
668
+ }
669
+
670
+ .download-buttons .btn {
671
+ width: 100%;
672
+ }
673
+ }
app/static/js/app.js ADDED
@@ -0,0 +1,312 @@
1
+ /**
2
+ * PrecisionVoice - Frontend Application Logic
3
+ * Handles file upload, transcription requests, and result display.
4
+ */
5
+
6
+ document.addEventListener('DOMContentLoaded', () => {
7
+ // DOM Elements
8
+ const elements = {
9
+ // Upload
10
+ dropZone: document.getElementById('drop-zone'),
11
+ fileInput: document.getElementById('file-input'),
12
+ fileInfo: document.getElementById('file-info'),
13
+ fileName: document.getElementById('file-name'),
14
+ fileSize: document.getElementById('file-size'),
15
+ clearBtn: document.getElementById('clear-btn'),
16
+ transcribeBtn: document.getElementById('transcribe-btn'),
17
+
18
+ // Sections
19
+ uploadSection: document.getElementById('upload-section'),
20
+ processingSection: document.getElementById('processing-section'),
21
+ resultsSection: document.getElementById('results-section'),
22
+ errorSection: document.getElementById('error-section'),
23
+
24
+ // Processing
25
+ processingStatus: document.getElementById('processing-status'),
26
+ progressFill: document.getElementById('progress-fill'),
27
+ processingTimer: document.getElementById('processing-timer'),
28
+
29
+ // Results
30
+ speakerCount: document.getElementById('speaker-count'),
31
+ durationInfo: document.getElementById('duration-info'),
32
+ processingTime: document.getElementById('processing-time'),
33
+ transcriptContainer: document.getElementById('transcript-container'),
34
+ downloadTxt: document.getElementById('download-txt'),
35
+ downloadSrt: document.getElementById('download-srt'),
36
+ newUploadBtn: document.getElementById('new-upload-btn'),
37
+
38
+ // Error
39
+ errorMessage: document.getElementById('error-message'),
40
+ retryBtn: document.getElementById('retry-btn')
41
+ };
42
+
43
+ let selectedFile = null;
44
+
45
+ // =====================
46
+ // Event Listeners
47
+ // =====================
48
+
49
+ // Click to upload
50
+ elements.dropZone.addEventListener('click', () => {
51
+ elements.fileInput.click();
52
+ });
53
+
54
+ // File input change
55
+ elements.fileInput.addEventListener('change', (e) => {
56
+ if (e.target.files.length > 0) {
57
+ handleFileSelection(e.target.files[0]);
58
+ }
59
+ });
60
+
61
+ // Drag and drop
62
+ elements.dropZone.addEventListener('dragover', (e) => {
63
+ e.preventDefault();
64
+ elements.dropZone.classList.add('dragover');
65
+ });
66
+
67
+ elements.dropZone.addEventListener('dragleave', () => {
68
+ elements.dropZone.classList.remove('dragover');
69
+ });
70
+
71
+ elements.dropZone.addEventListener('drop', (e) => {
72
+ e.preventDefault();
73
+ elements.dropZone.classList.remove('dragover');
74
+
75
+ if (e.dataTransfer.files.length > 0) {
76
+ handleFileSelection(e.dataTransfer.files[0]);
77
+ }
78
+ });
79
+
80
+ // Clear file
81
+ elements.clearBtn.addEventListener('click', (e) => {
82
+ e.stopPropagation();
83
+ clearFileSelection();
84
+ });
85
+
86
+ // Transcribe button
87
+ elements.transcribeBtn.addEventListener('click', () => {
88
+ if (selectedFile) {
89
+ startTranscription();
90
+ }
91
+ });
92
+
93
+ // New upload button
94
+ elements.newUploadBtn.addEventListener('click', resetToUpload);
95
+
96
+ // Retry button
97
+ elements.retryBtn.addEventListener('click', resetToUpload);
98
+
99
+ // =====================
100
+ // File Handling
101
+ // =====================
102
+
103
+ function handleFileSelection(file) {
104
+ const allowedTypes = ['audio/mpeg', 'audio/wav', 'audio/x-wav', 'audio/mp4', 'audio/x-m4a',
105
+ 'audio/ogg', 'audio/flac', 'audio/webm', 'video/webm'];
106
+ const allowedExtensions = ['mp3', 'wav', 'm4a', 'ogg', 'flac', 'webm'];
107
+
108
+ // Check file extension
109
+ const ext = file.name.split('.').pop().toLowerCase();
110
+ if (!allowedExtensions.includes(ext)) {
111
+ showError(`Unsupported file type: .${ext}. Supported: ${allowedExtensions.join(', ')}`);
112
+ return;
113
+ }
114
+
115
+ // Check file size (100MB limit)
116
+ const maxSize = 100 * 1024 * 1024;
117
+ if (file.size > maxSize) {
118
+ showError(`File too large. Maximum size: 100MB`);
119
+ return;
120
+ }
121
+
122
+ selectedFile = file;
123
+
124
+ // Update UI
125
+ elements.fileName.textContent = file.name;
126
+ elements.fileSize.textContent = formatFileSize(file.size);
127
+ elements.fileInfo.classList.remove('hidden');
128
+ elements.transcribeBtn.disabled = false;
129
+
130
+ // Hide the drop zone while a file is selected
131
+ elements.dropZone.style.display = 'none';
132
+ }
133
+
134
+ function clearFileSelection() {
135
+ selectedFile = null;
136
+ elements.fileInput.value = '';
137
+ elements.fileInfo.classList.add('hidden');
138
+ elements.transcribeBtn.disabled = true;
139
+ elements.dropZone.style.display = 'block';
140
+ }
141
+
142
+ function formatFileSize(bytes) {
143
+ if (bytes === 0) return '0 Bytes';
144
+ const k = 1024;
145
+ const sizes = ['Bytes', 'KB', 'MB', 'GB'];
146
+ const i = Math.floor(Math.log(bytes) / Math.log(k));
147
+ return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
148
+ }
149
+
150
+ // =====================
151
+ // Transcription
152
+ // =====================
153
+
154
+ async function startTranscription() {
155
+ if (!selectedFile) return;
156
+
157
+ // Show processing UI
158
+ showSection('processing');
159
+ updateProgress(100, 'Processing audio... (Check server logs for details)');
160
+
161
+ // Reset and start timer
162
+ let seconds = 0;
163
+ elements.processingTimer.textContent = '00:00';
164
+ const timerInterval = setInterval(() => {
165
+ seconds++;
166
+ const m = Math.floor(seconds / 60);
167
+ const s = seconds % 60;
168
+ elements.processingTimer.textContent = `${m.toString().padStart(2, '0')}:${s.toString().padStart(2, '0')}`;
169
+ }, 1000);
170
+
171
+ try {
172
+ const formData = new FormData();
173
+ formData.append('file', selectedFile);
174
+
175
+ const response = await fetch('/api/transcribe', {
176
+ method: 'POST',
177
+ body: formData
178
+ });
179
+
180
+ clearInterval(timerInterval);
181
+
182
+ if (!response.ok) {
183
+ const errorData = await response.json();
184
+ throw new Error(errorData.detail || 'Processing failed');
185
+ }
186
+
187
+ const result = await response.json();
188
+ displayResults(result);
189
+
190
+ } catch (error) {
191
+ clearInterval(timerInterval);
192
+ console.error('Processing error:', error);
193
+ showError(error.message || 'An error occurred during processing');
194
+ }
195
+ }
196
+
197
+ function updateProgress(percent, status) {
198
+ elements.progressFill.style.width = `${percent}%`;
199
+ if (status) {
200
+ elements.processingStatus.textContent = status;
201
+ }
202
+ }
203
+
204
+ // =====================
205
+ // Results Display
206
+ // =====================
207
+
208
+ function displayResults(result) {
209
+ // Update metadata
210
+ elements.speakerCount.textContent = `${result.num_speakers} speaker${result.num_speakers !== 1 ? 's' : ''}`;
211
+ elements.durationInfo.textContent = formatDuration(result.duration);
212
+ elements.processingTime.textContent = `${result.processing_time}s`;
213
+
214
+ // Set download links
215
+ elements.downloadTxt.href = result.download_txt;
216
+ elements.downloadSrt.href = result.download_srt;
217
+
218
+ // Render transcript segments
219
+ renderTranscript(result.segments);
220
+
221
+ // Show results section
222
+ showSection('results');
223
+ }
224
+
225
+ function renderTranscript(segments) {
226
+ elements.transcriptContainer.innerHTML = '';
227
+
228
+ const speakerColors = {};
229
+ let colorIndex = 0;
230
+
231
+ segments.forEach((segment) => {
232
+ // Assign color to speaker
233
+ if (!(segment.speaker in speakerColors)) {
234
+ colorIndex++;
235
+ speakerColors[segment.speaker] = `speaker-${Math.min(colorIndex, 5)}`;
236
+ }
237
+
238
+ const segmentEl = document.createElement('div');
239
+ segmentEl.className = `segment ${speakerColors[segment.speaker]}`;
240
+
241
+ segmentEl.innerHTML = `
242
+ <div class="segment-header">
243
+ <span class="segment-speaker">${escapeHtml(segment.speaker)}</span>
244
+ <span class="segment-time">${formatTime(segment.start)} - ${formatTime(segment.end)}</span>
245
+ </div>
246
+ <p class="segment-text">${escapeHtml(segment.text)}</p>
247
+ `;
248
+
249
+ elements.transcriptContainer.appendChild(segmentEl);
250
+ });
251
+ }
252
+
253
+ function formatTime(seconds) {
254
+ const h = Math.floor(seconds / 3600);
255
+ const m = Math.floor((seconds % 3600) / 60);
256
+ const s = Math.floor(seconds % 60);
257
+
258
+ if (h > 0) {
259
+ return `${h}:${m.toString().padStart(2, '0')}:${s.toString().padStart(2, '0')}`;
260
+ }
261
+ return `${m}:${s.toString().padStart(2, '0')}`;
262
+ }
263
+
264
+ function formatDuration(seconds) {
265
+ const m = Math.floor(seconds / 60);
266
+ const s = Math.floor(seconds % 60);
267
+ return `${m}:${s.toString().padStart(2, '0')}`;
268
+ }
269
+
270
+ function escapeHtml(text) {
271
+ const div = document.createElement('div');
272
+ div.textContent = text;
273
+ return div.innerHTML;
274
+ }
275
+
276
+ // =====================
277
+ // UI State Management
278
+ // =====================
279
+
280
+ function showSection(section) {
281
+ elements.uploadSection.classList.add('hidden');
282
+ elements.processingSection.classList.add('hidden');
283
+ elements.resultsSection.classList.add('hidden');
284
+ elements.errorSection.classList.add('hidden');
285
+
286
+ switch (section) {
287
+ case 'upload':
288
+ elements.uploadSection.classList.remove('hidden');
289
+ break;
290
+ case 'processing':
291
+ elements.processingSection.classList.remove('hidden');
292
+ break;
293
+ case 'results':
294
+ elements.resultsSection.classList.remove('hidden');
295
+ break;
296
+ case 'error':
297
+ elements.errorSection.classList.remove('hidden');
298
+ break;
299
+ }
300
+ }
301
+
302
+ function showError(message) {
303
+ elements.errorMessage.textContent = message;
304
+ showSection('error');
305
+ }
306
+
307
+ function resetToUpload() {
308
+ clearFileSelection();
309
+ showSection('upload');
310
+ updateProgress(0, 'Uploading file...');
311
+ }
312
+ });
app/templates/index.html ADDED
@@ -0,0 +1,162 @@
1
+ <!DOCTYPE html>
2
+ <html lang="vi">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <meta name="description" content="PrecisionVoice - Speech-to-Text and Speaker Diarization powered by AI">
8
+ <title>PrecisionVoice | AI Speech Transcription</title>
9
+ <link rel="preconnect" href="https://fonts.googleapis.com">
10
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
11
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
12
+ <link rel="stylesheet" href="/static/css/style.css">
13
+ </head>
14
+
15
+ <body>
16
+ <div class="app-container">
17
+ <!-- Header -->
18
+ <header class="header">
19
+ <div class="logo">
20
+ <div class="logo-icon">
21
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
22
+ <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z" />
23
+ <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
24
+ <line x1="12" y1="19" x2="12" y2="23" />
25
+ <line x1="8" y1="23" x2="16" y2="23" />
26
+ </svg>
27
+ </div>
28
+ <h1>PrecisionVoice</h1>
29
+ </div>
30
+ <p class="tagline">AI-Powered Speech Transcription with Speaker Detection</p>
31
+ </header>
32
+
33
+ <!-- Main Content -->
34
+ <main class="main-content">
35
+ <!-- Upload Section -->
36
+ <section id="upload-section" class="card upload-card">
37
+ <div class="card-header">
38
+ <h2>Upload Audio</h2>
39
+ <span class="badge">Supported: {{ allowed_formats }}</span>
40
+ </div>
41
+
42
+ <div class="upload-zone" id="drop-zone">
43
+ <div class="upload-icon">
44
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
45
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
46
+ <polyline points="17 8 12 3 7 8" />
47
+ <line x1="12" y1="3" x2="12" y2="15" />
48
+ </svg>
49
+ </div>
50
+ <p class="upload-text">Drag & drop audio file here</p>
51
+ <p class="upload-subtext">or click to browse</p>
52
+ <input type="file" id="file-input" accept=".mp3,.wav,.m4a,.ogg,.flac,.webm" hidden>
53
+ </div>
54
+
55
+ <div id="file-info" class="file-info hidden">
56
+ <div class="file-details">
57
+ <span class="file-name" id="file-name">audio.mp3</span>
58
+ <span class="file-size" id="file-size">0 MB</span>
59
+ </div>
60
+ <button class="btn btn-clear" id="clear-btn" title="Remove file">
61
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
62
+ <line x1="18" y1="6" x2="6" y2="18" />
63
+ <line x1="6" y1="6" x2="18" y2="18" />
64
+ </svg>
65
+ </button>
66
+ </div>
67
+
68
+ <button class="btn btn-primary" id="transcribe-btn" disabled>
69
+ <span class="btn-text">Transcribe</span>
70
+ <span class="btn-icon">
71
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
72
+ <polygon points="5 3 19 12 5 21 5 3" />
73
+ </svg>
74
+ </span>
75
+ </button>
76
+ </section>
77
+
78
+ <!-- Processing Section -->
79
+ <section id="processing-section" class="card processing-card hidden">
80
+ <div class="processing-content">
81
+ <div class="spinner"></div>
82
+ <h3>Processing Audio</h3>
83
+ <p id="processing-status">Uploading file...</p>
84
+ <div class="progress-bar">
85
+ <div class="progress-fill" id="progress-fill"></div>
86
+ </div>
87
+ <div class="timer-display" id="processing-timer">00:00</div>
88
+ <p class="processing-hint">This may take a few minutes depending on audio length</p>
89
+ </div>
90
+ </section>
91
+
92
+ <!-- Results Section -->
93
+ <section id="results-section" class="card results-card hidden">
94
+ <div class="card-header">
95
+ <h2>Transcription Results</h2>
96
+ <div class="result-meta">
97
+ <span id="speaker-count" class="badge">0 speakers</span>
98
+ <span id="duration-info" class="badge">0:00</span>
99
+ <span id="processing-time" class="badge">0.0s</span>
100
+ </div>
101
+ </div>
102
+
103
+ <div class="download-buttons">
104
+ <a href="#" id="download-txt" class="btn btn-outline" download>
105
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
106
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
107
+ <polyline points="7 10 12 15 17 10" />
108
+ <line x1="12" y1="15" x2="12" y2="3" />
109
+ </svg>
110
+ Download TXT
111
+ </a>
112
+ <a href="#" id="download-srt" class="btn btn-outline" download>
113
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
114
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
115
+ <polyline points="7 10 12 15 17 10" />
116
+ <line x1="12" y1="15" x2="12" y2="3" />
117
+ </svg>
118
+ Download SRT
119
+ </a>
120
+ </div>
121
+
122
+ <div class="transcript-container" id="transcript-container">
123
+ <!-- Transcript segments will be rendered here -->
124
+ </div>
125
+
126
+ <button class="btn btn-secondary" id="new-upload-btn">
127
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
128
+ <polyline points="1 4 1 10 7 10" />
129
+ <path d="M3.51 15a9 9 0 1 0 2.13-9.36L1 10" />
130
+ </svg>
131
+ New Transcription
132
+ </button>
133
+ </section>
134
+
135
+ <!-- Error Section -->
136
+ <section id="error-section" class="card error-card hidden">
137
+ <div class="error-content">
138
+ <div class="error-icon">
139
+ <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
140
+ <circle cx="12" cy="12" r="10" />
141
+ <line x1="15" y1="9" x2="9" y2="15" />
142
+ <line x1="9" y1="9" x2="15" y2="15" />
143
+ </svg>
144
+ </div>
145
+ <h3>Error</h3>
146
+ <p id="error-message">An error occurred during processing.</p>
147
+ <button class="btn btn-secondary" id="retry-btn">Try Again</button>
148
+ </div>
149
+ </section>
150
+ </main>
151
+
152
+ <!-- Footer -->
153
+ <footer class="footer">
154
+ <p>Powered by <strong>faster-whisper</strong> & <strong>pyannote.audio</strong></p>
155
+ <p class="footer-note">Max file size: {{ max_upload_mb }}MB</p>
156
+ </footer>
157
+ </div>
158
+
159
+ <script src="/static/js/app.js"></script>
160
+ </body>
161
+
162
+ </html>
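The template reads allowed_formats and max_upload_mb from its render context; a hedged sketch of an index route that could supply them (everything beyond those two placeholder names is an assumption about the actual route):

from fastapi import FastAPI, Request
from fastapi.templating import Jinja2Templates

app = FastAPI()
templates = Jinja2Templates(directory="app/templates")

@app.get("/")
async def index(request: Request):
    # The two context keys match the {{ ... }} placeholders in index.html.
    return templates.TemplateResponse(
        "index.html",
        {
            "request": request,
            "allowed_formats": "mp3, wav, m4a, ogg, flac, webm",
            "max_upload_mb": 100,
        },
    )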
data/processed/.gitkeep ADDED
File without changes
data/uploads/.gitkeep ADDED
File without changes
docker-compose.yml ADDED
@@ -0,0 +1,60 @@
1
+ services:
2
+ app:
3
+ build:
4
+ context: .
5
+ dockerfile: Dockerfile
6
+ args:
7
+ - PORT=${PORT:-7860}
8
+ container_name: precisionvoice
9
+ ports:
10
+ - "${PORT:-7860}:${PORT:-7860}"
11
+ volumes:
12
+ # Persist uploaded/processed files
13
+ - ./data:/app/data
14
+ # Cache models to avoid re-downloading
15
+ - model_cache_hf:/root/.cache/huggingface
16
+ - model_cache_torch:/root/.cache/torch
17
+ - model_cache_mdx:/root/.audio-separator-models
18
+ environment:
19
+ # HuggingFace token (required for pyannote.audio)
20
+ - HF_TOKEN=${HF_TOKEN:-}
21
+ # Model settings
22
+ - WHISPER_MODEL=${WHISPER_MODEL:-kiendt/PhoWhisper-large-ct2}
23
+ - DIARIZATION_MODEL=${DIARIZATION_MODEL:-pyannote/speaker-diarization-3.1}
24
+ # Device (auto, cuda, cpu)
25
+ - DEVICE=${DEVICE:-auto}
26
+ # Denoising (Speech Enhancement)
27
+ - ENABLE_DENOISER=${ENABLE_DENOISER:-True}
28
+ - DENOISER_MODEL=${DENOISER_MODEL:-dns64}
29
+ # MDX-Net Vocal Separation
30
+ - ENABLE_VOCAL_SEPARATION=${ENABLE_VOCAL_SEPARATION:-True}
31
+ - MDX_MODEL=${MDX_MODEL:-UVR-MDX-NET-Voc_FT}
32
+ # Upload settings
33
+ - MAX_UPLOAD_SIZE_MB=${MAX_UPLOAD_SIZE_MB:-100}
34
+ # Optimization settings
35
+ - ENABLE_LOUDNORM=${ENABLE_LOUDNORM:-True}
36
+ - ENABLE_NOISE_REDUCTION=${ENABLE_NOISE_REDUCTION:-True}
37
+ # VAD settings
38
+ - VAD_THRESHOLD=${VAD_THRESHOLD:-0.5}
39
+ - VAD_MIN_SPEECH_DURATION_MS=${VAD_MIN_SPEECH_DURATION_MS:-250}
40
+ - VAD_MIN_SILENCE_DURATION_MS=${VAD_MIN_SILENCE_DURATION_MS:-500}
41
+ # Clustering settings
42
+ - MERGE_THRESHOLD_S=${MERGE_THRESHOLD_S:-0.5}
43
+ - MIN_SEGMENT_DURATION_S=${MIN_SEGMENT_DURATION_S:-0.3}
44
+ restart: unless-stopped
45
+ # GPU support (uncomment for NVIDIA GPU)
46
+ # deploy:
47
+ # resources:
48
+ # reservations:
49
+ # devices:
50
+ # - driver: nvidia
51
+ # count: all
52
+ # capabilities: [gpu]
53
+
54
+ volumes:
55
+ model_cache_hf:
56
+ name: precisionvoice_hf_cache
57
+ model_cache_torch:
58
+ name: precisionvoice_torch_cache
59
+ model_cache_mdx:
60
+ name: precisionvoice_mdx_cache
docker/.gitkeep ADDED
File without changes
precision_voice_colab.ipynb ADDED
@@ -0,0 +1,413 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# PrecisionVoice - Google Colab Runner\n",
8
+ "\n",
9
+ "This notebook allows you to run the [PrecisionVoice](https://github.com/thichuong/PrecisionVoice) application directly in Google Colab.\n",
10
+ "\n",
11
+ "### Instructions\n",
12
+ "1. **Runtime Change**: Go to `Runtime` -> `Change runtime type` and make sure **T4 GPU** (or better) is selected.\n",
13
+ "2. **Run All**: You can select `Runtime` -> `Run all` or run each cell step-by-step.\n",
14
+ "3. **Public URL**: Look for the `ngrok` public URL in the final cell output to access the web interface."
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "id": "5efa55f1",
21
+ "metadata": {},
22
+ "outputs": [
23
+ {
24
+ "name": "stdout",
25
+ "output_type": "stream",
26
+ "text": [
27
+ "GPU Detected: Tesla T4\n"
28
+ ]
29
+ }
30
+ ],
31
+ "source": [
32
+ "# @title 1. Check GPU Availability\n",
33
+ "import torch\n",
34
+ "\n",
35
+ "if torch.cuda.is_available():\n",
36
+ " print(f\"GPU Detected: {torch.cuda.get_device_name(0)}\")\n",
37
+ "else:\n",
38
+ " print(\"WARNING: No GPU detected. This application requires a GPU to run efficiently.\")\n",
39
+ " print(\"Please go to Runtime -> Change runtime type -> Hardware accelerator -> T4 GPU\")"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "id": "b068e8ac",
46
+ "metadata": {},
47
+ "outputs": [
48
+ {
49
+ "name": "stdout",
50
+ "output_type": "stream",
51
+ "text": [
52
+ "Cloning into 'PrecisionVoice'...\n",
53
+ "remote: Enumerating objects: 94, done.\u001b[K\n",
54
+ "remote: Counting objects: 100% (94/94), done.\u001b[K\n",
55
+ "remote: Compressing objects: 100% (51/51), done.\u001b[K\n",
56
+ "remote: Total 94 (delta 34), reused 88 (delta 28), pack-reused 0 (from 0)\u001b[K\n",
57
+ "Receiving objects: 100% (94/94), 35.72 KiB | 5.10 MiB/s, done.\n",
58
+ "Resolving deltas: 100% (34/34), done.\n",
59
+ "/content/PrecisionVoice/PrecisionVoice/PrecisionVoice\n",
60
+ "Repository cloned successfully.\n"
61
+ ]
62
+ }
63
+ ],
64
+ "source": [
65
+ "# @title 2. Clone Repository\n",
66
+ "import os\n",
67
+ "\n",
68
+ "# Clean up previous run if exists\n",
69
+ "if os.path.exists(\"PrecisionVoice\"):\n",
70
+ " %cd /content\n",
71
+ " !rm -rf PrecisionVoice\n",
72
+ "\n",
73
+ "!git clone https://github.com/thichuong/PrecisionVoice.git\n",
74
+ "%cd PrecisionVoice\n",
75
+ "print(\"Repository cloned successfully.\")"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "id": "42afe30f",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "name": "stdout",
86
+ "output_type": "stream",
87
+ "text": [
88
+ "Installing system dependencies... (This may take a moment)\n",
89
+ "✅ System dependencies (ffmpeg, libsndfile1) installed successfully.\n"
90
+ ]
91
+ }
92
+ ],
93
+ "source": [
94
+ "# @title 3. Install System Dependencies\n",
95
+ "import subprocess\n",
96
+ "\n",
97
+ "# Installing dependencies defined in Dockerfile (ffmpeg, libsndfile)\n",
98
+ "print(\"Installing system dependencies... (This may take a moment)\")\n",
99
+ "\n",
100
+ "# Update and install (suppressing harmless R-repo warnings common in Colab)\n",
101
+ "!apt-get update -y > /dev/null 2>&1\n",
102
+ "!apt-get install -y ffmpeg libsndfile1 > /dev/null 2>&1\n",
103
+ "\n",
104
+ "# Verify installation\n",
105
+ "try:\n",
106
+ " subprocess.run([\"ffmpeg\", \"-version\"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)\n",
107
+ " print(\"✅ System dependencies (ffmpeg, libsndfile1) installed successfully.\")\n",
108
+ "except Exception as e:\n",
109
+ " print(\"❌ Warning: Potential installation issue. If the app fails, try running '!apt-get install -y ffmpeg' manually.\")"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": null,
115
+ "id": "4ec3974f",
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "# @title 4. Install Python Dependencies\n",
120
+ "# Force upgrade torch, torchvision, torchaudio to ensure compatibility\n",
121
+ "!pip install -U torch torchvision torchaudio\n",
122
+ "\n",
123
+ "!pip install -r requirements.txt\n",
124
+ "# Install pyngrok to expose the local server to the internet\n",
125
+ "!pip install pyngrok\n",
126
+ "print(\"Python dependencies installed.\")"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": null,
132
+ "id": "1d5b721b",
133
+ "metadata": {},
134
+ "outputs": [
135
+ {
136
+ "name": "stdout",
137
+ "output_type": "stream",
138
+ "text": [
139
+ ".env file created with default settings.\n"
140
+ ]
141
+ }
142
+ ],
143
+ "source": [
144
+ "# @title 5. Setup Environment (.env)\n",
145
+ "# Creating a default .env file. You can modify this cell to add your specific keys.\n",
146
+ "\n",
147
+ "env_content = \"\"\"\n",
148
+ "PORT=7860\n",
149
+ "LOG_LEVEL=INFO\n",
150
+ "\n",
151
+ "# Audio Processing\n",
152
+ "NOISE_REDUCTION_LEVEL=5.0\n",
153
+ "VAD_THRESHOLD=0.5\n",
154
+ "VAD_MIN_SPEECH_DURATION_MS=250\n",
155
+ "VAD_MIN_SILENCE_DURATION_MS=500\n",
156
+ "MERGE_THRESHOLD_S=1.5\n",
157
+ "\"\"\"\n",
158
+ "\n",
159
+ "with open(\".env\", \"w\") as f:\n",
160
+ " f.write(env_content)\n",
161
+ "\n",
162
+ "print(\".env file created with default settings.\")"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": null,
168
+ "id": "9afa4d11",
169
+ "metadata": {},
170
+ "outputs": [
171
+ {
172
+ "name": "stdout",
173
+ "output_type": "stream",
174
+ "text": [
175
+ "Cleaning up previous sessions...\n"
176
+ ]
177
+ },
178
+ {
179
+ "data": {
180
+ "application/javascript": "(async (port, path, width, height, cache, element) => {\n if (!google.colab.kernel.accessAllowed && !cache) {\n return;\n }\n element.appendChild(document.createTextNode(''));\n const url = await google.colab.kernel.proxyPort(port, {cache});\n const iframe = document.createElement('iframe');\n iframe.src = new URL(path, url).toString();\n iframe.height = height;\n iframe.width = width;\n iframe.style.border = 0;\n iframe.allow = [\n 'accelerometer',\n 'autoplay',\n 'camera',\n 'clipboard-read',\n 'clipboard-write',\n 'gyroscope',\n 'magnetometer',\n 'microphone',\n 'serial',\n 'usb',\n 'xr-spatial-tracking',\n ].join('; ');\n element.appendChild(iframe);\n })(8000, \"/\", \"100%\", 900, false, window.element)",
181
+ "text/plain": [
182
+ "<IPython.core.display.Javascript object>"
183
+ ]
184
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[31mWarning: This function may stop working due to changes in browser security.\n",
+ "Try `serve_kernel_port_as_iframe` instead. \u001b[0m\n"
+ ]
+ },
+ {
+ "data": {
+ "application/javascript": "(async (port, path, text, element) => {\n if (!google.colab.kernel.accessAllowed) {\n return;\n }\n element.appendChild(document.createTextNode(''));\n const url = await google.colab.kernel.proxyPort(port);\n const anchor = document.createElement('a');\n anchor.href = new URL(path, url).toString();\n anchor.target = '_blank';\n anchor.setAttribute('data-href', url + path);\n anchor.textContent = text;\n element.appendChild(anchor);\n })(8000, \"/\", \"https://localhost:8000/\", window.element)",
+ "text/plain": [
+ "<IPython.core.display.Javascript object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Starting server on port 8000...\n",
+ "\u001b[32mINFO\u001b[0m: Started server process [\u001b[36m25608\u001b[0m]\n",
+ "\u001b[32mINFO\u001b[0m: Waiting for application startup.\n",
+ "2026-01-04 03:10:59,288 - app.main - INFO - Starting PrecisionVoice application...\n",
+ "2026-01-04 03:10:59,292 - app.main - INFO - Device: cuda\n",
+ "2026-01-04 03:10:59,292 - app.main - INFO - Whisper model: kiendt/PhoWhisper-large-ct2\n",
+ "2026-01-04 03:10:59,293 - app.main - INFO - Diarization model: pyannote/speaker-diarization-3.1\n",
+ "2026-01-04 03:10:59,293 - app.main - INFO - Preloading Whisper model...\n",
+ "2026-01-04 03:10:59,293 - app.services.transcription - INFO - Loading Whisper model: kiendt/PhoWhisper-large-ct2\n",
+ "2026-01-04 03:10:59,293 - app.services.transcription - INFO - Device: cuda, Compute type: float16\n",
+ "\n",
+ "🚀 Ngrok Public URL: https://tandy-pileous-biologically.ngrok-free.dev\n",
+ "\n",
+ "2026-01-04 03:11:02,736 - app.services.transcription - INFO - Whisper model loaded successfully\n",
+ "2026-01-04 03:11:02,737 - app.main - WARNING - HF_TOKEN not set, diarization will not be available\n",
+ "2026-01-04 03:11:02,737 - app.main - INFO - Application startup complete\n",
+ "\u001b[32mINFO\u001b[0m: Application startup complete.\n",
+ "\u001b[32mINFO\u001b[0m: Uvicorn running on \u001b[1mhttp://0.0.0.0:8000\u001b[0m (Press CTRL+C to quit)\n",
+ "\u001b[32mINFO\u001b[0m: 2a09:bac1:7aa0:10::17:37e:0 - \"\u001b[1mGET / HTTP/1.1\u001b[0m\" \u001b[32m200 OK\u001b[0m\n",
+ "\u001b[32mINFO\u001b[0m: 2a09:bac1:7aa0:10::17:37e:0 - \"\u001b[1mGET /static/css/style.css HTTP/1.1\u001b[0m\" \u001b[32m200 OK\u001b[0m\n",
+ "\u001b[32mINFO\u001b[0m: 2a09:bac1:7aa0:10::17:37e:0 - \"\u001b[1mGET /static/js/app.js HTTP/1.1\u001b[0m\" \u001b[32m200 OK\u001b[0m\n",
+ "\u001b[32mINFO\u001b[0m: 2a09:bac1:7aa0:10::17:37e:0 - \"\u001b[1mGET /favicon.ico HTTP/1.1\u001b[0m\" \u001b[31m404 Not Found\u001b[0m\n",
+ "2026-01-04 03:11:17,130 - app.services.audio_processor - INFO - Saved upload: /content/PrecisionVoice/PrecisionVoice/data/uploads/4bf9c6ad.wav\n",
+ "2026-01-04 03:11:17,131 - app.services.audio_processor - INFO - Applying loudnorm normalization...\n",
+ "2026-01-04 03:11:17,131 - app.services.audio_processor - INFO - Applying advanced noise reduction (anlmdn, level=5.0)...\n",
+ "\u001b[32mINFO\u001b[0m: Shutting down\n",
+ "\u001b[32mINFO\u001b[0m: Finished server process [\u001b[36m25608\u001b[0m]\n",
+ "\u001b[31mERROR\u001b[0m: Traceback (most recent call last):\n",
+ " File \"/usr/lib/python3.12/asyncio/runners.py\", line 195, in run\n",
+ " return runner.run(main)\n",
+ " ^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/lib/python3.12/asyncio/runners.py\", line 118, in run\n",
+ " return self._loop.run_until_complete(task)\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"uvloop/loop.pyx\", line 1512, in uvloop.loop.Loop.run_until_complete\n",
+ " File \"uvloop/loop.pyx\", line 1505, in uvloop.loop.Loop.run_until_complete\n",
+ " File \"uvloop/loop.pyx\", line 1379, in uvloop.loop.Loop.run_forever\n",
+ " File \"uvloop/loop.pyx\", line 557, in uvloop.loop.Loop._run\n",
+ " File \"uvloop/loop.pyx\", line 476, in uvloop.loop.Loop._on_idle\n",
+ " File \"uvloop/cbhandles.pyx\", line 83, in uvloop.loop.Handle._run\n",
+ " File \"uvloop/cbhandles.pyx\", line 63, in uvloop.loop.Handle._run\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/server.py\", line 70, in serve\n",
+ " with self.capture_signals():\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/lib/python3.12/contextlib.py\", line 144, in __exit__\n",
+ " next(self.gen)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/server.py\", line 331, in capture_signals\n",
+ " signal.raise_signal(captured_signal)\n",
+ " File \"/usr/lib/python3.12/asyncio/runners.py\", line 157, in _on_sigint\n",
+ " raise KeyboardInterrupt()\n",
+ "KeyboardInterrupt\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/routing.py\", line 701, in lifespan\n",
+ " await receive()\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/lifespan/on.py\", line 137, in receive\n",
+ " return await self.receive_queue.get()\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/lib/python3.12/asyncio/queues.py\", line 158, in get\n",
+ " await getter\n",
+ "asyncio.exceptions.CancelledError\n",
+ "\n",
+ "\u001b[31mERROR\u001b[0m: Exception in ASGI application\n",
+ "Traceback (most recent call last):\n",
+ " File \"/usr/lib/python3.12/asyncio/runners.py\", line 195, in run\n",
+ " return runner.run(main)\n",
+ " ^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/lib/python3.12/asyncio/runners.py\", line 118, in run\n",
+ " return self._loop.run_until_complete(task)\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"uvloop/loop.pyx\", line 1512, in uvloop.loop.Loop.run_until_complete\n",
+ " File \"uvloop/loop.pyx\", line 1505, in uvloop.loop.Loop.run_until_complete\n",
+ " File \"uvloop/loop.pyx\", line 1379, in uvloop.loop.Loop.run_forever\n",
+ " File \"uvloop/loop.pyx\", line 557, in uvloop.loop.Loop._run\n",
+ " File \"uvloop/loop.pyx\", line 476, in uvloop.loop.Loop._on_idle\n",
+ " File \"uvloop/cbhandles.pyx\", line 83, in uvloop.loop.Handle._run\n",
+ " File \"uvloop/cbhandles.pyx\", line 63, in uvloop.loop.Handle._run\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/server.py\", line 70, in serve\n",
+ " with self.capture_signals():\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/lib/python3.12/contextlib.py\", line 144, in __exit__\n",
+ " next(self.gen)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/server.py\", line 331, in capture_signals\n",
+ " signal.raise_signal(captured_signal)\n",
+ " File \"/usr/lib/python3.12/asyncio/runners.py\", line 157, in _on_sigint\n",
+ " raise KeyboardInterrupt()\n",
+ "KeyboardInterrupt\n",
+ "\n",
+ "During handling of the above exception, another exception occurred:\n",
+ "\n",
+ "Traceback (most recent call last):\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/protocols/http/httptools_impl.py\", line 409, in run_asgi\n",
+ " result = await app( # type: ignore[func-returns-value]\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/uvicorn/middleware/proxy_headers.py\", line 60, in __call__\n",
+ " return await self.app(scope, receive, send)\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/fastapi/applications.py\", line 1139, in __call__\n",
+ " await super().__call__(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/applications.py\", line 107, in __call__\n",
+ " await self.middleware_stack(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/middleware/errors.py\", line 164, in __call__\n",
+ " await self.app(scope, receive, _send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/middleware/cors.py\", line 93, in __call__\n",
+ " await self.simple_response(scope, receive, send, request_headers=headers)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/middleware/cors.py\", line 144, in simple_response\n",
+ " await self.app(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/middleware/exceptions.py\", line 63, in __call__\n",
+ " await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/_exception_handler.py\", line 42, in wrapped_app\n",
+ " await app(scope, receive, sender)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/fastapi/middleware/asyncexitstack.py\", line 18, in __call__\n",
+ " await self.app(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/routing.py\", line 716, in __call__\n",
+ " await self.middleware_stack(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/routing.py\", line 736, in app\n",
+ " await route.handle(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/routing.py\", line 290, in handle\n",
+ " await self.app(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/fastapi/routing.py\", line 119, in app\n",
+ " await wrap_app_handling_exceptions(app, request)(scope, receive, send)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/starlette/_exception_handler.py\", line 42, in wrapped_app\n",
+ " await app(scope, receive, sender)\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/fastapi/routing.py\", line 105, in app\n",
+ " response = await f(request)\n",
+ " ^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/fastapi/routing.py\", line 385, in app\n",
+ " raw_response = await run_endpoint_function(\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/usr/local/lib/python3.12/dist-packages/fastapi/routing.py\", line 284, in run_endpoint_function\n",
+ " return await dependant.call(**values)\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/content/PrecisionVoice/PrecisionVoice/app/api/routes.py\", line 62, in transcribe_audio\n",
+ " wav_path, duration = await AudioProcessor.process_upload(\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/content/PrecisionVoice/PrecisionVoice/app/services/audio_processor.py\", line 205, in process_upload\n",
+ " wav_path = await cls.convert_to_wav(original_path)\n",
+ " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+ " File \"/content/PrecisionVoice/PrecisionVoice/app/services/audio_processor.py\", line 104, in convert_to_wav\n",
+ " await loop.run_in_executor(None, lambda: cls._run_ffmpeg_conversion(input_path, output_path))\n",
+ "asyncio.exceptions.CancelledError\n"
+ ]
+ }
+ ],
+ "source": [
+ "# @title 6. Run Application\n",
+ "import threading\n",
+ "import time\n",
+ "import os\n",
+ "from google.colab.output import serve_kernel_port_as_iframe, serve_kernel_port_as_window\n",
+ "from pyngrok import ngrok\n",
+ "\n",
+ "# FORCE KILL any existing ngrok processes to free up the auth token session\n",
+ "print(\"Cleaning up previous sessions...\")\n",
+ "!killall ngrok 2>/dev/null\n",
+ "ngrok.kill()\n",
+ "\n",
+ "# Set your authtoken (Ensure this matches the one in your Ngrok dashboard)\n",
+ "ngrok.set_auth_token(\"NGROK_TOKEN\")\n",
+ "\n",
+ "port = 8000\n",
+ "\n",
+ "def start_ngrok():\n",
+ "    # Wait a bit for the server to start\n",
+ "    time.sleep(5)\n",
+ "    try:\n",
+ "        # Connect to the port\n",
+ "        public_url = ngrok.connect(port).public_url\n",
+ "        print(f\"\\n🚀 Ngrok Public URL: {public_url}\\n\")\n",
+ "    except Exception as e:\n",
+ "        print(f\"Ngrok error: {e}\")\n",
+ "\n",
+ "# Start ngrok in a background thread\n",
+ "threading.Thread(target=start_ngrok, daemon=True).start()\n",
+ "\n",
+ "# Serve the application directly in the notebook cell\n",
+ "serve_kernel_port_as_iframe(port, height=900)\n",
+ "\n",
+ "# Also provide a link to open in a new tab via proxy\n",
+ "serve_kernel_port_as_window(port, path=\"/\")\n",
+ "\n",
+ "# Run the Uvicorn server\n",
+ "print(f\"Starting server on port {port}...\")\n",
+ "!uvicorn app.main:app --host 0.0.0.0 --port {port}"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
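
Note on the `asyncio.exceptions.CancelledError` captured in the cell output above: interrupting the Colab cell makes uvicorn re-raise SIGINT while `convert_to_wav` is still awaiting its blocking ffmpeg job in the default executor, so the in-flight request is cancelled mid-conversion. Below is a minimal sketch of one way to make that call shutdown-safe; `run_ffmpeg_conversion`, the ffmpeg flags, and the cleanup policy are hypothetical stand-ins, not the actual `AudioProcessor` implementation:

    import asyncio
    import contextlib
    import os
    import subprocess

    def run_ffmpeg_conversion(input_path: str, output_path: str) -> None:
        # Blocking ffmpeg call (hypothetical stand-in for AudioProcessor._run_ffmpeg_conversion).
        subprocess.run(
            ["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", output_path],
            check=True, capture_output=True,
        )

    async def convert_to_wav_safe(input_path: str, output_path: str) -> str:
        loop = asyncio.get_running_loop()
        try:
            await loop.run_in_executor(None, run_ffmpeg_conversion, input_path, output_path)
        except asyncio.CancelledError:
            # Shutdown or client disconnect: remove the partial output instead of
            # leaking it, then re-raise so the caller still observes the cancellation.
            with contextlib.suppress(FileNotFoundError):
                os.remove(output_path)
            raise
        return output_path

The executor thread itself keeps running until ffmpeg exits; the `except` branch only tidies the event-loop side, which is usually enough when outputs land in a temporary upload directory like `data/uploads/`.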
requirements.txt ADDED
@@ -0,0 +1,31 @@
+ # Core framework
+ fastapi>=0.109.0
+ uvicorn[standard]>=0.27.0
+ python-multipart>=0.0.6
+ jinja2>=3.1.2
+ aiofiles>=23.2.1
+
+ # AI/ML - Speech-to-Text
+ faster-whisper>=1.0.0
+ ctranslate2>=4.0.0
+
+ # AI/ML - Speaker Diarization
+ pyannote.audio>=3.1.0
+ torch>=2.1.0
+ torchaudio>=2.1.0
+
+ # AI/ML - Vocal Separation
+ audio-separator[cpu]>=0.17.0
+ denoiser>=0.1.4
+
+ # Audio processing
+ ffmpeg-python>=0.2.0
+ pydub>=0.25.1
+
+ # Configuration
+ pydantic-settings>=2.1.0
+ python-dotenv>=1.0.0
+
+ # Utilities
+ aiohttp>=3.9.0
+ numpy>=1.24.0
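
Every pin above is a floor (`>=`) rather than an exact version, so two environments built from this file can resolve differently. A small standard-library sketch for spot-checking what actually got installed; the tuple of distribution names is copied from the list above and `uvicorn[standard]` installs the distribution `uvicorn`, so the extra is dropped for the lookup:

    from importlib.metadata import PackageNotFoundError, version

    # Spot-check the core pins from requirements.txt.
    for pkg in ("fastapi", "uvicorn", "faster-whisper", "ctranslate2",
                "pyannote.audio", "torch", "audio-separator"):
        try:
            print(f"{pkg}=={version(pkg)}")
        except PackageNotFoundError:
            print(f"{pkg}: not installed")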
scripts/verify_model_config.py ADDED
@@ -0,0 +1,16 @@
+ from app.core.config import get_settings
+
+ def verify_stt_model():
+     settings = get_settings()
+     print(f"Current Whisper Model: {settings.whisper_model}")
+     print(f"Device: {settings.resolved_device}")
+     print(f"Compute Type: {settings.resolved_compute_type}")
+
+     expected_model = "kiendt/PhoWhisper-large-ct2"
+     if settings.whisper_model == expected_model:
+         print("✅ SUCCESS: Model configuration updated correctly.")
+     else:
+         print(f"❌ FAILURE: Expected {expected_model}, got {settings.whisper_model}")
+
+ if __name__ == "__main__":
+     verify_stt_model()
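
The startup log in the notebook output above also warns when `HF_TOKEN` is missing, so a companion check for the diarization settings may be worth adding next to this script. A hedged sketch, assuming the settings object exposes `diarization_model` and `hf_token` fields analogous to `whisper_model` (not verified against the actual config class):

    from app.core.config import get_settings

    def verify_diarization_config():
        # Mirrors verify_stt_model() above, but for the pyannote pipeline settings.
        settings = get_settings()
        print(f"Current Diarization Model: {settings.diarization_model}")
        if not getattr(settings, "hf_token", None):
            print("⚠️ HF_TOKEN not set - diarization will not be available (see startup warning)")
        expected_model = "pyannote/speaker-diarization-3.1"
        if settings.diarization_model == expected_model:
            print("✅ SUCCESS: Diarization model configured correctly.")
        else:
            print(f"❌ FAILURE: Expected {expected_model}, got {settings.diarization_model}")

    if __name__ == "__main__":
        verify_diarization_config()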