Upload folder using huggingface_hub
Files changed:
- .dockerignore +22 -0
- .env.example +57 -0
- .github/workflows/sync-to-huggingface.yml +24 -0
- .gitignore +46 -0
- Dockerfile +68 -0
- README.md +94 -4
- app/__init__.py +1 -0
- app/api/__init__.py +1 -0
- app/api/routes.py +122 -0
- app/core/__init__.py +1 -0
- app/core/config.py +104 -0
- app/main.py +115 -0
- app/schemas/__init__.py +1 -0
- app/schemas/models.py +73 -0
- app/services/__init__.py +1 -0
- app/services/alignment.py +353 -0
- app/services/audio_processor.py +244 -0
- app/services/denoiser.py +142 -0
- app/services/diarization.py +180 -0
- app/services/orchestrator.py +84 -0
- app/services/transcription.py +168 -0
- app/services/vocal_separator.py +118 -0
- app/static/css/style.css +673 -0
- app/static/js/app.js +312 -0
- app/templates/index.html +162 -0
- data/processed/.gitkeep +0 -0
- data/uploads/.gitkeep +0 -0
- docker-compose.yml +60 -0
- docker/.gitkeep +0 -0
- precision_voice_colab.ipynb +413 -0
- requirements.txt +31 -0
- scripts/verify_model_config.py +18 -0
.dockerignore ADDED
@@ -0,0 +1,22 @@
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.venv
+venv
+ENV
+.git
+.github
+.vscode
+.idea
+*.log
+.cache
+.pytest_cache
+data/uploads/*
+data/processed/*
+Dockerfile
+docker-compose.yml
+README.md
+implementation_plan.md
+walkthrough.md
+task.md

.env.example ADDED
@@ -0,0 +1,57 @@
+# Environment Configuration for PrecisionVoice
+
+# HuggingFace token (required for pyannote.audio)
+# Get your token at: https://huggingface.co/settings/tokens
+# Accept terms at: https://huggingface.co/pyannote/speaker-diarization-3.1
+HF_TOKEN=your_huggingface_token_here
+
+# Model settings
+WHISPER_MODEL=kiendt/PhoWhisper-large-ct2
+DIARIZATION_MODEL=pyannote/speaker-diarization-3.1
+
+# Device settings (cuda, cpu, or auto)
+DEVICE=auto
+
+# --- Denoising (Speech Enhancement) ---
+# Enable speech enhancement (removes background noise, hum, etc.)
+ENABLE_DENOISER=True
+# Denoiser model: dns64 (standard), dns48, or master64
+DENOISER_MODEL=dns64
+
+# --- MDX-Net Vocal Separation ---
+# Enable vocal separation before transcription (isolates voice from music/noise)
+# More effective than the basic Demucs implementation.
+ENABLE_VOCAL_SEPARATION=True
+# MDX-Net model: Kim_Vocal_2.onnx (recommended for vocals)
+MDX_MODEL=Kim_Vocal_2.onnx
+
+# Upload settings
+MAX_UPLOAD_SIZE_MB=100
+
+# --- Optimization Settings ---
+
+# Enable subtle highpass filter (removes low-frequency rumble < 80Hz)
+ENABLE_NOISE_REDUCTION=True
+
+# Enable/Disable Loudness Normalization (EBU R128)
+ENABLE_LOUDNORM=True
+
+# --- VAD (Voice Activity Detection) Settings ---
+
+# Threshold for detecting speech (0.0 to 1.0). Higher = stricter
+VAD_THRESHOLD=0.5
+# Ignore speech segments shorter than this (milliseconds)
+VAD_MIN_SPEECH_DURATION_MS=250
+# Minimum silence duration to split segments (milliseconds)
+VAD_MIN_SILENCE_DURATION_MS=500
+
+# --- Post-processing (Clustering) Settings ---
+
+# Merge segments from same speaker if gap is less than this (seconds)
+MERGE_THRESHOLD_S=0.5
+# Filter out segments shorter than this (seconds) - removes blips/noise
+MIN_SEGMENT_DURATION_S=0.3
+
+# Server settings
+HOST=0.0.0.0
+PORT=8000

.github/workflows/sync-to-huggingface.yml ADDED
@@ -0,0 +1,24 @@
+name: Sync to Hugging Face Hub
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          lfs: true
+
+      - name: Push to Hugging Face Hub
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          git remote add huggingface https://huggingface.co/spaces/ThiThanhChuong/precision-voice || true
+          git remote set-url huggingface https://huggingface.co/spaces/ThiThanhChuong/precision-voice
+          git push https://user:$HF_TOKEN@huggingface.co/spaces/ThiThanhChuong/precision-voice main --force

.gitignore ADDED
@@ -0,0 +1,46 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Virtual environment
+venv/
+.venv/
+ENV/
+
+# Environment files
+.env
+!.env.example
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Data directories (keep structure, ignore content)
+data/uploads/*
+data/processed/*
+!data/uploads/.gitkeep
+!data/processed/.gitkeep
+
+# Docker
+.docker/
+
+# Logs
+*.log
+logs/
+
+# Cache
+.cache/
+*.cache
+.pytest_cache/
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Model files (will be downloaded at runtime)
+*.pt
+*.bin
+*.safetensors

Dockerfile ADDED
@@ -0,0 +1,68 @@
+# ================================
+# PrecisionVoice Dockerfile
+# Optimized for performance and size
+# ================================
+
+# Stage 1: Builder
+FROM python:3.10-slim-bullseye AS builder
+
+WORKDIR /app
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    git \
+    ffmpeg \
+    libsndfile1-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements and install dependencies
+# Using --user to keep packages in /root/.local
+COPY requirements.txt .
+RUN pip install --no-cache-dir --user -r requirements.txt
+
+# ================================
+# Stage 2: Runtime
+# ================================
+FROM python:3.10-slim-bullseye
+
+WORKDIR /app
+
+# Install runtime dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ffmpeg \
+    libsndfile1 \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+# Copy Python packages from builder
+COPY --from=builder /root/.local /root/.local
+
+# Ensure scripts in .local are available
+ENV PATH=/root/.local/bin:$PATH
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Model cache directories
+ENV HF_HOME=/root/.cache/huggingface
+ENV TORCH_HOME=/root/.cache/torch
+ENV TRANSFORMERS_CACHE=/root/.cache/huggingface
+
+# Copy application code
+COPY app/ ./app/
+COPY data/ ./data/
+
+# Create necessary directories
+RUN mkdir -p /app/data/uploads /app/data/processed
+
+# Port configuration
+ARG PORT=7860
+ENV PORT=${PORT}
+EXPOSE ${PORT}
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:${PORT}/api/health')" || exit 1
+
+# Run the application
+CMD ["sh", "-c", "uvicorn app.main:app --host 0.0.0.0 --port ${PORT}"]

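The HEALTHCHECK above is just a urllib probe against the API's health route. The same check can be reproduced from the host like this (a minimal sketch, assuming the image's default port 7860; adjust if `PORT` was overridden):

```python
import json
import urllib.request

# Same probe the container's HEALTHCHECK runs every 30 s; urlopen raises on
# a non-2xx status, which Docker counts as a failed check (3 retries allowed).
with urllib.request.urlopen("http://localhost:7860/api/health") as resp:
    print(json.load(resp))  # e.g. {'status': 'healthy', 'models_loaded': True, 'device': 'cpu'}
```
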
README.md CHANGED
@@ -1,10 +1,100 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: PrecisionVoice
+emoji: 🎙️
+colorFrom: blue
+colorTo: purple
 sdk: docker
+app_file: app/main.py
 pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# PrecisionVoice - STT & Speaker Diarization
+
+A production-ready Speech-to-Text and Speaker Diarization web application using FastAPI, faster-whisper, and pyannote.audio.
+
+## Features
+
+- 🎙️ Speech-to-Text using `kiendt/PhoWhisper-large-ct2` (optimized for Vietnamese)
+- 👥 Speaker Diarization using `pyannote/speaker-diarization-3.1`
+- 🧼 Advanced Denoising using Facebook's `Denoiser` (dns64)
+- 🎤 Vocal Isolation using `MDX-Net` (Kim_Vocal_2)
+- 🔄 Automatic speaker-transcript alignment
+- 📥 Download results in TXT or SRT format
+- 🐳 Docker-ready with persistent model caching and GPU support
+
+## Quick Start
+
+### Prerequisites
+
+1. Docker and Docker Compose
+2. (Optional) NVIDIA GPU with CUDA support
+3. HuggingFace account with access to pyannote models
+
+### Setup
+
+1. Clone and configure:
+```bash
+cp .env.example .env
+# Edit .env and add your HuggingFace token
+```
+
+2. Build and run:
+```bash
+docker compose up --build
+```
+
+3. Open http://localhost:8000
+
+## Audio Processing Pipeline
+
+The system uses a multi-stage pipeline to maximize accuracy:
+
+1. **Speech Enhancement**: Background noise, hum, and interference are removed using Facebook's `Denoiser` (a deep-learning Wave-U-Net model).
+2. **Vocal Isolation**: Vocals are separated from any remaining background music or non-speech sounds using `MDX-Net`.
+3. **Refinement**: A subtle highpass filter and EBU R128 loudness normalization give consistent volume.
+4. **Transcription**: High-precision Vietnamese transcription using `PhoWhisper`.
+5. **Diarization**: Segmenting the audio by speaker.
+6. **Alignment**: Merging transcripts with speaker segments.
+
+## Configuration
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `HF_TOKEN` | - | Required for pyannote models |
+| `ENABLE_DENOISER` | `True` | Toggle Facebook speech enhancement |
+| `DENOISER_MODEL` | `dns64` | Model for denoising |
+| `ENABLE_VOCAL_SEPARATION` | `True` | Toggle MDX-Net vocal isolation |
+| `MDX_MODEL` | `Kim_Vocal_2.onnx` | Model for vocal separation |
+| `DEVICE` | `auto` | `cuda`, `cpu`, or `auto` |
+
+## Development
+
+### Local Setup (without Docker)
+
+```bash
+python -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+uvicorn app.main:app --reload
+```
+
+### API Endpoints
+
+| Endpoint | Method | Description |
+|----------|--------|-------------|
+| `/` | GET | Web UI |
+| `/api/transcribe` | POST | Upload and transcribe audio |
+| `/api/download/{filename}` | GET | Download result files |
+
+## Supported Audio Formats
+
+- MP3
+- WAV
+- M4A
+- OGG
+
+## License
+
+MIT

app/__init__.py ADDED
@@ -0,0 +1 @@
+# App package

app/api/__init__.py ADDED
@@ -0,0 +1 @@
+# API package

app/api/routes.py ADDED
@@ -0,0 +1,122 @@
+"""
+API routes for the transcription service.
+"""
+import json
+import time
+import logging
+from pathlib import Path
+
+from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks
+from fastapi.responses import FileResponse, StreamingResponse
+
+from app.core.config import get_settings
+from app.schemas.models import TranscriptionResponse, ErrorResponse, HealthResponse
+from app.services.audio_processor import AudioProcessor, AudioProcessingError
+from app.services.transcription import TranscriptionService
+from app.services.diarization import DiarizationService
+from app.services.alignment import AlignmentService
+from app.services.orchestrator import PipelineOrchestrator
+
+logger = logging.getLogger(__name__)
+settings = get_settings()
+
+router = APIRouter()
+
+
+@router.get("/api/health", response_model=HealthResponse)
+async def health_check():
+    """Health check endpoint."""
+    return HealthResponse(
+        status="healthy",
+        models_loaded=TranscriptionService.is_loaded() and DiarizationService.is_loaded(),
+        device=settings.resolved_device
+    )
+
+
+from fastapi.responses import FileResponse, StreamingResponse
+
+# ... (rest of imports)
+
+@router.post("/api/transcribe", response_model=TranscriptionResponse)
+async def transcribe_audio(
+    background_tasks: BackgroundTasks,
+    file: UploadFile = File(..., description="Audio file to transcribe")
+):
+    """
+    Upload and transcribe an audio file.
+    Status updates are logged on the server.
+    """
+    wav_path = None
+
+    try:
+        # Read file content
+        file_content = await file.read()
+
+        # Validate and process audio
+        try:
+            AudioProcessor.validate_file(file.filename or "audio.wav", len(file_content))
+        except AudioProcessingError as e:
+            raise HTTPException(status_code=400, detail=str(e))
+
+        # Save and convert to WAV (Noise reduction happens here)
+        wav_path, duration = await AudioProcessor.process_upload(
+            file_content,
+            file.filename or "audio.wav"
+        )
+
+        # Run orchestrated pipeline (Whisper + Pyannote in parallel -> Alignment)
+        logger.info("Executing orchestrated pipeline...")
+        response = await PipelineOrchestrator.process_audio(wav_path, duration)
+
+        # Schedule cleanup in background
+        background_tasks.add_task(cleanup_files, wav_path)
+
+        return response
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.exception("Processing failed")
+        if wav_path and wav_path.exists():
+            background_tasks.add_task(cleanup_files, wav_path)
+        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
+
+
+
+@router.get("/api/download/{filename}")
+async def download_file(filename: str, background_tasks: BackgroundTasks):
+    """
+    Download a generated transcript file.
+
+    Supports: .txt, .srt files
+    """
+    # Security: only allow specific extensions and no path traversal
+    if not filename.endswith(('.txt', '.srt')) or '/' in filename or '..' in filename:
+        raise HTTPException(status_code=400, detail="Invalid filename")
+
+    filepath = settings.processed_dir / filename
+
+    if not filepath.exists():
+        raise HTTPException(status_code=404, detail="File not found")
+
+    # Determine media type
+    media_type = "text/plain" if filename.endswith('.txt') else "application/x-subrip"
+
+    # Schedule cleanup after download (give some time for download to complete)
+    # Note: In production, you might want a separate cleanup job
+
+    return FileResponse(
+        path=filepath,
+        filename=filename,
+        media_type=media_type
+    )
+
+
+async def cleanup_files(*paths: Path):
+    """Background task to cleanup temporary files."""
+    import asyncio
+
+    # Wait a bit before cleanup to ensure files are not in use
+    await asyncio.sleep(5)
+
+    await AudioProcessor.cleanup_files(*paths)

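As a rough illustration of these endpoints, a minimal client sketch. Assumptions: the server is reachable at localhost:8000, the `requests` package is installed, `meeting.mp3` is a placeholder input file, and `download_srt` is treated as a server-relative path per the response model:

```python
import requests

BASE = "http://localhost:8000"

# Upload an audio file; the first request may be slow while models load.
with open("meeting.mp3", "rb") as f:
    resp = requests.post(
        f"{BASE}/api/transcribe",
        files={"file": ("meeting.mp3", f, "audio/mpeg")},
        timeout=600,
    )
resp.raise_for_status()
result = resp.json()

for seg in result["segments"]:
    print(f"[{seg['start']:7.2f}-{seg['end']:7.2f}] {seg['speaker']}: {seg['text']}")

# Fetch the generated SRT via the download route, if a link was returned.
if result.get("download_srt"):
    srt = requests.get(BASE + result["download_srt"], timeout=60)
    srt.raise_for_status()
    with open("meeting.srt", "wb") as out:
        out.write(srt.content)
```
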
app/core/__init__.py ADDED
@@ -0,0 +1 @@
+# Core package

app/core/config.py ADDED
@@ -0,0 +1,104 @@
+"""
+Application configuration using Pydantic Settings.
+"""
+import os
+from pathlib import Path
+from functools import lru_cache
+from typing import Literal
+
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+    """Application settings loaded from environment variables."""
+
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        extra="ignore"
+    )
+
+    # HuggingFace
+    hf_token: str = ""
+    enable_noise_reduction: bool = True
+
+    # Denoising (Speech Enhancement)
+    enable_denoiser: bool = True
+    denoiser_model: str = "dns64"
+
+    # MDX-Net Vocal Separation
+    enable_vocal_separation: bool = True
+    mdx_model: str = "Kim_Vocal_2.onnx"  # High quality vocal isolation
+
+    # Model settings
+    whisper_model: str = "kiendt/PhoWhisper-large-ct2"
+    diarization_model: str = "pyannote/speaker-diarization-3.1"
+
+    # Device settings
+    device: Literal["cuda", "cpu", "auto"] = "auto"
+    compute_type: str = "float16"  # float16 for GPU, int8 for CPU
+
+    # Upload settings
+    max_upload_size_mb: int = 100
+    allowed_extensions: list[str] = ["mp3", "wav", "m4a", "ogg", "flac", "webm"]
+
+    # Audio processing settings
+    sample_rate: int = 16000
+    channels: int = 1  # Mono
+
+    # Optimization parameters
+    noise_reduction_level: float = 12.0  # Used by anlmdn
+    enable_loudnorm: bool = True
+
+    # VAD parameters
+    vad_threshold: float = 0.5
+    vad_min_speech_duration_ms: int = 250
+    vad_min_silence_duration_ms: int = 500
+
+    # Post-processing
+    merge_threshold_s: float = 0.5  # Merge segments from same speaker if gap < this
+    min_segment_duration_s: float = 0.3  # Remove segments shorter than this
+
+    # Server settings
+    host: str = "0.0.0.0"
+    port: int = 7860
+
+    # Paths
+    base_dir: Path = Path(__file__).parent.parent.parent
+    data_dir: Path = base_dir / "data"
+    upload_dir: Path = data_dir / "uploads"
+    processed_dir: Path = data_dir / "processed"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Ensure directories exist
+        self.upload_dir.mkdir(parents=True, exist_ok=True)
+        self.processed_dir.mkdir(parents=True, exist_ok=True)
+
+    @property
+    def max_upload_size_bytes(self) -> int:
+        return self.max_upload_size_mb * 1024 * 1024
+
+    @property
+    def resolved_device(self) -> str:
+        """Resolve 'auto' to actual device."""
+        if self.device == "auto":
+            try:
+                import torch
+                return "cuda" if torch.cuda.is_available() else "cpu"
+            except ImportError:
+                return "cpu"
+        return self.device
+
+    @property
+    def resolved_compute_type(self) -> str:
+        """Get appropriate compute type for device."""
+        if self.resolved_device == "cuda":
+            return "float16"
+        return "int8"
+
+
+@lru_cache
+def get_settings() -> Settings:
+    """Get cached settings instance."""
+    return Settings()

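Because `get_settings()` is memoized with `@lru_cache`, every module shares one `Settings` instance, and environment variables (matched to field names case-insensitively, the pydantic-settings default) or `.env` values are read once at first construction. A small sketch of that behavior, assuming the `app` package is importable:

```python
import os

# Overrides must be set before the first get_settings() call,
# because the instance is cached after that.
os.environ["DEVICE"] = "cpu"
os.environ["MAX_UPLOAD_SIZE_MB"] = "50"

from app.core.config import get_settings

s1 = get_settings()
s2 = get_settings()
assert s1 is s2                    # lru_cache: one shared instance
print(s1.resolved_device)          # "cpu" (no torch probe when set explicitly)
print(s1.resolved_compute_type)    # "int8" on CPU, "float16" on CUDA
print(s1.max_upload_size_bytes)    # 50 * 1024 * 1024
```
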
app/main.py ADDED
@@ -0,0 +1,115 @@
+"""
+PrecisionVoice - Speech-to-Text & Speaker Diarization Application
+
+Main FastAPI application entry point.
+"""
+import logging
+from contextlib import asynccontextmanager
+
+from fastapi import FastAPI, Request
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import HTMLResponse
+
+from app.core.config import get_settings
+from app.api.routes import router
+from app.services.transcription import TranscriptionService
+from app.services.diarization import DiarizationService
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+settings = get_settings()
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """
+    Application lifespan handler.
+    Preloads models on startup for faster first request.
+    """
+    logger.info("Starting PrecisionVoice application...")
+    logger.info(f"Device: {settings.resolved_device}")
+    logger.info(f"Whisper model: {settings.whisper_model}")
+    logger.info(f"Diarization model: {settings.diarization_model}")
+
+    # Preload models (optional - can be disabled for faster startup)
+    try:
+        logger.info("Preloading Whisper model...")
+        TranscriptionService.preload_model()
+    except Exception as e:
+        logger.error(f"Failed to preload Whisper model: {e}")
+
+    try:
+        if settings.hf_token:
+            logger.info("Preloading diarization pipeline...")
+            DiarizationService.preload_pipeline()
+        else:
+            logger.warning("HF_TOKEN not set, diarization will not be available")
+    except Exception as e:
+        logger.warning(f"Diarization preload failed (will try again on first use): {e}")
+
+    logger.info("Application startup complete")
+
+    yield
+
+    logger.info("Shutting down PrecisionVoice application...")
+
+
+# Create FastAPI app
+app = FastAPI(
+    title="PrecisionVoice",
+    description="Speech-to-Text and Speaker Diarization API",
+    version="1.0.0",
+    lifespan=lifespan
+)
+
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Configure appropriately for production
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Mount static files
+app.mount(
+    "/static",
+    StaticFiles(directory="app/static"),
+    name="static"
+)
+
+# Templates
+templates = Jinja2Templates(directory="app/templates")
+
+# Include API routes
+app.include_router(router)
+
+
+@app.get("/", response_class=HTMLResponse)
+async def index(request: Request):
+    """Serve the main web interface."""
+    return templates.TemplateResponse(
+        "index.html",
+        {
+            "request": request,
+            "max_upload_mb": settings.max_upload_size_mb,
+            "allowed_formats": ", ".join(settings.allowed_extensions)
+        }
+    )
+
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(
+        "app.main:app",
+        host=settings.host,
+        port=settings.port,
+        reload=True
+    )

app/schemas/__init__.py ADDED
@@ -0,0 +1 @@
+# Schemas package

app/schemas/models.py ADDED
@@ -0,0 +1,73 @@
+"""
+Pydantic models for API requests and responses.
+"""
+from pydantic import BaseModel, Field
+from typing import Optional
+from enum import Enum
+
+
+class ProcessingStatus(str, Enum):
+    """Status of the transcription process."""
+    PENDING = "pending"
+    PROCESSING = "processing"
+    COMPLETED = "completed"
+    FAILED = "failed"
+
+
+class TranscriptSegment(BaseModel):
+    """A single segment of the transcript with speaker and timing."""
+    start: float = Field(..., description="Start time in seconds")
+    end: float = Field(..., description="End time in seconds")
+    speaker: str = Field(..., description="Speaker identifier")
+    text: str = Field(..., description="Transcribed text")
+
+    @property
+    def start_formatted(self) -> str:
+        """Format start time as HH:MM:SS."""
+        return self._format_time(self.start)
+
+    @property
+    def end_formatted(self) -> str:
+        """Format end time as HH:MM:SS."""
+        return self._format_time(self.end)
+
+    @staticmethod
+    def _format_time(seconds: float) -> str:
+        """Convert seconds to HH:MM:SS format."""
+        hours = int(seconds // 3600)
+        minutes = int((seconds % 3600) // 60)
+        secs = int(seconds % 60)
+        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
+
+
+class TranscriptionRequest(BaseModel):
+    """Request model for transcription settings."""
+    language: str = Field(default="vi", description="Language code for transcription")
+    num_speakers: Optional[int] = Field(default=None, description="Expected number of speakers (None for auto-detect)")
+    output_format: str = Field(default="json", description="Output format: json, txt, srt")
+
+
+class TranscriptionResponse(BaseModel):
+    """Response containing the transcription results."""
+    success: bool = Field(..., description="Whether transcription succeeded")
+    message: str = Field(default="", description="Status message")
+    segments: list[TranscriptSegment] = Field(default_factory=list, description="Transcript segments with speakers")
+    duration: float = Field(default=0.0, description="Audio duration in seconds")
+    num_speakers: int = Field(default=0, description="Number of detected speakers")
+    processing_time: float = Field(default=0.0, description="Processing time in seconds")
+    download_txt: Optional[str] = Field(default=None, description="Download URL for TXT file")
+    download_srt: Optional[str] = Field(default=None, description="Download URL for SRT file")
+
+
+class ErrorResponse(BaseModel):
+    """Error response model."""
+    success: bool = False
+    error: str = Field(..., description="Error message")
+    detail: Optional[str] = Field(default=None, description="Detailed error information")
+
+
+class HealthResponse(BaseModel):
+    """Health check response."""
+    status: str = "healthy"
+    models_loaded: bool = False
+    device: str = "cpu"

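The formatted-time properties on `TranscriptSegment` keep the raw float fields intact while producing display-ready strings, for example:

```python
from app.schemas.models import TranscriptSegment

# 65.4 s = 1 min 5 s; 3725.0 s = 1 h 2 min 5 s
seg = TranscriptSegment(start=65.4, end=3725.0, speaker="SPEAKER_00", text="Xin chào")
print(seg.start_formatted)  # 00:01:05
print(seg.end_formatted)    # 01:02:05
```
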
app/services/__init__.py ADDED
@@ -0,0 +1 @@
+# Services package

app/services/alignment.py ADDED
@@ -0,0 +1,353 @@
+"""
+Precision alignment service - Word-center-based speaker assignment.
+Merges word-level transcription with speaker diarization using precise timestamps.
+"""
+import logging
+from pathlib import Path
+from typing import List, Tuple, Optional
+from dataclasses import dataclass
+
+from app.core.config import get_settings
+from app.schemas.models import TranscriptSegment
+from app.services.transcription import WordTimestamp
+from app.services.diarization import SpeakerSegment
+
+logger = logging.getLogger(__name__)
+settings = get_settings()
+
+
+@dataclass
+class WordWithSpeaker:
+    """A word with assigned speaker."""
+    word: str
+    start: float
+    end: float
+    speaker: str
+
+
+class AlignmentService:
+    """
+    Precision alignment service.
+    Uses word-center-based algorithm for accurate speaker-to-text mapping.
+    """
+
+    # Pause threshold for splitting segments (seconds)
+    PAUSE_THRESHOLD = 1.0
+
+    @staticmethod
+    def get_word_center(word: WordTimestamp) -> float:
+        """Calculate the center time of a word."""
+        return (word.start + word.end) / 2
+
+    @classmethod
+    def find_speaker_at_time(
+        cls,
+        time: float,
+        speaker_segments: List[SpeakerSegment]
+    ) -> Optional[str]:
+        """
+        Find which speaker is speaking at a given time.
+
+        Args:
+            time: Time point in seconds
+            speaker_segments: List of speaker segments from diarization
+
+        Returns:
+            Speaker label or None if no speaker found
+        """
+        for seg in speaker_segments:
+            if seg.start <= time <= seg.end:
+                return seg.speaker
+        return None
+
+    @classmethod
+    def find_closest_speaker(
+        cls,
+        time: float,
+        speaker_segments: List[SpeakerSegment]
+    ) -> str:
+        """
+        Find the closest speaker to a given time (for gaps/silence).
+
+        Args:
+            time: Time point in seconds
+            speaker_segments: List of speaker segments
+
+        Returns:
+            Closest speaker label or "Unknown"
+        """
+        if not speaker_segments:
+            return "Unknown"
+
+        min_distance = float('inf')
+        closest_speaker = "Unknown"
+
+        for seg in speaker_segments:
+            # Distance to segment start or end
+            dist_to_start = abs(time - seg.start)
+            dist_to_end = abs(time - seg.end)
+            min_seg_dist = min(dist_to_start, dist_to_end)
+
+            if min_seg_dist < min_distance:
+                min_distance = min_seg_dist
+                closest_speaker = seg.speaker
+
+        return closest_speaker
+
+    @classmethod
+    def assign_speakers_to_words(
+        cls,
+        words: List[WordTimestamp],
+        speaker_segments: List[SpeakerSegment]
+    ) -> List[WordWithSpeaker]:
+        """
+        Step 3c: Assign speakers to each word based on word center time.
+
+        Args:
+            words: List of words with timestamps from transcription
+            speaker_segments: List of speaker segments from diarization
+
+        Returns:
+            List of words with speaker assignments
+        """
+        if not speaker_segments:
+            # No diarization available, assign all to "Speaker 1"
+            logger.warning("No speaker segments available, using single speaker")
+            return [
+                WordWithSpeaker(
+                    word=w.word,
+                    start=w.start,
+                    end=w.end,
+                    speaker="Speaker 1"
+                )
+                for w in words
+            ]
+
+        words_with_speakers = []
+
+        for word in words:
+            # Calculate word center time
+            center_time = cls.get_word_center(word)
+
+            # Find speaker at this time
+            speaker = cls.find_speaker_at_time(center_time, speaker_segments)
+
+            # If no direct match, find closest speaker
+            if speaker is None:
+                speaker = cls.find_closest_speaker(center_time, speaker_segments)
+
+            words_with_speakers.append(WordWithSpeaker(
+                word=word.word,
+                start=word.start,
+                end=word.end,
+                speaker=speaker
+            ))
+
+        logger.debug(f"Assigned speakers to {len(words_with_speakers)} words")
+        return words_with_speakers
+
+    @classmethod
+    def reconstruct_segments(
+        cls,
+        words_with_speakers: List[WordWithSpeaker]
+    ) -> List[TranscriptSegment]:
+        """
+        Step 3d: Reconstruct sentence segments from words.
+
+        Groups consecutive words of the same speaker into segments.
+        Creates new segment when:
+        - Speaker changes
+        - Pause > PAUSE_THRESHOLD between words
+
+        Args:
+            words_with_speakers: List of words with speaker assignments
+
+        Returns:
+            List of TranscriptSegment with complete sentences
+        """
+        if not words_with_speakers:
+            return []
+
+        segments = []
+
+        # Start first segment
+        current_speaker = words_with_speakers[0].speaker
+        current_start = words_with_speakers[0].start
+        current_end = words_with_speakers[0].end
+        current_words = [words_with_speakers[0].word]
+
+        for i in range(1, len(words_with_speakers)):
+            word = words_with_speakers[i]
+            prev_word = words_with_speakers[i - 1]
+
+            # Calculate pause between words
+            pause = word.start - prev_word.end
+
+            # Check if we need to start a new segment
+            speaker_changed = word.speaker != current_speaker
+            significant_pause = pause > cls.PAUSE_THRESHOLD
+
+            if speaker_changed or significant_pause:
+                # Save current segment
+                segments.append(TranscriptSegment(
+                    start=current_start,
+                    end=current_end,
+                    speaker=current_speaker,
+                    text=" ".join(current_words)
+                ))
+
+                # Start new segment
+                current_speaker = word.speaker
+                current_start = word.start
+                current_end = word.end
+                current_words = [word.word]
+            else:
+                # Continue current segment
+                current_end = word.end
+                current_words.append(word.word)
+
+        # Don't forget the last segment
+        if current_words:
+            segments.append(TranscriptSegment(
+                start=current_start,
+                end=current_end,
+                speaker=current_speaker,
+                text=" ".join(current_words)
+            ))
+
+        logger.debug(f"Reconstructed {len(segments)} segments from {len(words_with_speakers)} words")
+        return segments
+
+    @classmethod
+    def resize_and_merge_segments(
+        cls,
+        segments: List[TranscriptSegment]
+    ) -> List[TranscriptSegment]:
+        """
+        Merge consecutive segments of the same speaker if the gap is small.
+        Also filters out extremely short segments.
+        """
+        if not segments:
+            return []
+
+        # Filter 1: Remove extremely short blips (noise)
+        segments = [s for s in segments if (s.end - s.start) >= settings.min_segment_duration_s]
+
+        if not segments:
+            return []
+
+        merged = []
+        curr = segments[0]
+
+        for i in range(1, len(segments)):
+            next_seg = segments[i]
+
+            # If same speaker and gap is small, merge
+            gap = next_seg.start - curr.end
+            if next_seg.speaker == curr.speaker and gap < settings.merge_threshold_s:
+                curr.end = next_seg.end
+                curr.text += " " + next_seg.text
+            else:
+                merged.append(curr)
+                curr = next_seg
+
+        merged.append(curr)
+
+        logger.debug(f"Merged segments: {len(segments)} -> {len(merged)}")
+        return merged
+
+    @classmethod
+    def align_precision(
+        cls,
+        words: List[WordTimestamp],
+        speaker_segments: List[SpeakerSegment]
+    ) -> List[TranscriptSegment]:
+        """
+        Full precision alignment pipeline.
+
+        Args:
+            words: Word-level timestamps from transcription
+            speaker_segments: Speaker segments from diarization
+
+        Returns:
+            List of TranscriptSegment with proper speaker assignments
+        """
+        # Step 3c: Assign speakers to words
+        words_with_speakers = cls.assign_speakers_to_words(words, speaker_segments)
+
+        # Step 3d: Reconstruct segments
+        segments = cls.reconstruct_segments(words_with_speakers)
+
+        # Step 3e: Clustering/Merging (Optimization)
+        segments = cls.resize_and_merge_segments(segments)
+
+        return segments
+
+    @staticmethod
+    def format_timestamp_txt(seconds: float) -> str:
+        """Format timestamp for TXT output: HH:MM:SS"""
+        hours = int(seconds // 3600)
+        minutes = int((seconds % 3600) // 60)
+        secs = int(seconds % 60)
+        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
+
+    @staticmethod
+    def format_timestamp_srt(seconds: float) -> str:
+        """Format timestamp for SRT output: HH:MM:SS,mmm"""
+        hours = int(seconds // 3600)
+        minutes = int((seconds % 3600) // 60)
+        secs = int(seconds % 60)
+        millis = int((seconds % 1) * 1000)
+        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
+
+    @classmethod
+    def generate_txt(cls, segments: List[TranscriptSegment], output_path: Path) -> Path:
+        """
+        Generate TXT transcript file.
+
+        Format: [HH:MM:SS - HH:MM:SS] Speaker: Text
+        """
+        lines = []
+        for seg in segments:
+            start = cls.format_timestamp_txt(seg.start)
+            end = cls.format_timestamp_txt(seg.end)
+            lines.append(f"[{start} - {end}] {seg.speaker}: {seg.text}")
+
+        output_path.write_text("\n".join(lines), encoding="utf-8")
+        logger.info(f"Generated TXT: {output_path}")
+
+        return output_path
+
+    @classmethod
+    def generate_srt(cls, segments: List[TranscriptSegment], output_path: Path) -> Path:
+        """
+        Generate SRT subtitle file.
+        """
+        lines = []
+        for i, seg in enumerate(segments, 1):
+            start = cls.format_timestamp_srt(seg.start)
+            end = cls.format_timestamp_srt(seg.end)
+            lines.append(str(i))
+            lines.append(f"{start} --> {end}")
+            lines.append(f"[{seg.speaker}] {seg.text}")
+            lines.append("")  # Empty line between entries
+
+        output_path.write_text("\n".join(lines), encoding="utf-8")
+        logger.info(f"Generated SRT: {output_path}")
+
+        return output_path
+
+    @classmethod
+    def generate_outputs(
+        cls,
+        segments: List[TranscriptSegment],
+        base_filename: str
+    ) -> Tuple[Path, Path]:
+        """Generate both TXT and SRT output files."""
+        txt_path = settings.processed_dir / f"{base_filename}.txt"
+        srt_path = settings.processed_dir / f"{base_filename}.srt"
+
+        cls.generate_txt(segments, txt_path)
+        cls.generate_srt(segments, srt_path)
+
+        return txt_path, srt_path

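To make the word-center algorithm concrete, here is a minimal sketch. `WordTimestamp` and `SpeakerSegment` live in `transcription.py` and `diarization.py` (not shown in this commit), so duck-typed stand-ins with the same fields are used; the `app` package must be importable:

```python
from dataclasses import dataclass

from app.services.alignment import AlignmentService

@dataclass
class Word:   # stand-in for WordTimestamp (same fields, duck-typed)
    word: str
    start: float
    end: float

@dataclass
class Turn:   # stand-in for SpeakerSegment (same fields, duck-typed)
    start: float
    end: float
    speaker: str

words = [
    Word("xin", 0.00, 0.30), Word("chào", 0.35, 0.60),
    Word("vâng", 2.00, 2.20), Word("ạ", 2.25, 2.40),
]
turns = [Turn(0.0, 0.7, "SPEAKER_00"), Turn(1.9, 2.5, "SPEAKER_01")]

# Word centers (0.15, 0.475, 2.1, 2.325) each fall inside exactly one turn,
# and the 1.4 s gap exceeds PAUSE_THRESHOLD, so two segments come out.
for seg in AlignmentService.align_precision(words, turns):
    print(seg.speaker, f"{seg.start:.2f}-{seg.end:.2f}", seg.text)
# SPEAKER_00 0.00-0.60 xin chào
# SPEAKER_01 2.00-2.40 vâng ạ
```
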
app/services/audio_processor.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio processing service using FFmpeg.
|
| 3 |
+
Handles file validation, conversion to 16kHz mono WAV, and cleanup.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import uuid
|
| 7 |
+
import asyncio
|
| 8 |
+
import logging
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Optional, Tuple
|
| 11 |
+
|
| 12 |
+
import ffmpeg
|
| 13 |
+
|
| 14 |
+
from app.core.config import get_settings
|
| 15 |
+
from app.services.vocal_separator import VocalSeparator
|
| 16 |
+
from app.services.denoiser import DenoiserService
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
settings = get_settings()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class AudioProcessingError(Exception):
|
| 23 |
+
"""Custom exception for audio processing errors."""
|
| 24 |
+
pass
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class AudioProcessor:
|
| 28 |
+
"""Service for processing audio files."""
|
| 29 |
+
|
| 30 |
+
ALLOWED_EXTENSIONS = settings.allowed_extensions
|
| 31 |
+
TARGET_SAMPLE_RATE = settings.sample_rate
|
| 32 |
+
TARGET_CHANNELS = settings.channels
|
| 33 |
+
|
| 34 |
+
@classmethod
|
| 35 |
+
def validate_file(cls, filename: str, file_size: int) -> bool:
|
| 36 |
+
"""
|
| 37 |
+
Validate uploaded file.
|
| 38 |
+
|
| 39 |
+
Args:
|
| 40 |
+
filename: Original filename
|
| 41 |
+
file_size: File size in bytes
|
| 42 |
+
|
| 43 |
+
Returns:
|
| 44 |
+
True if valid
|
| 45 |
+
|
| 46 |
+
Raises:
|
| 47 |
+
AudioProcessingError: If validation fails
|
| 48 |
+
"""
|
| 49 |
+
# Check extension
|
| 50 |
+
ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
|
| 51 |
+
if ext not in cls.ALLOWED_EXTENSIONS:
|
| 52 |
+
raise AudioProcessingError(
|
| 53 |
+
f"Invalid file type: .{ext}. Allowed: {', '.join(cls.ALLOWED_EXTENSIONS)}"
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
# Check size
|
| 57 |
+
if file_size > settings.max_upload_size_bytes:
|
| 58 |
+
raise AudioProcessingError(
|
| 59 |
+
f"File too large: {file_size / (1024*1024):.1f}MB. "
|
| 60 |
+
f"Maximum: {settings.max_upload_size_mb}MB"
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
return True
|
| 64 |
+
|
| 65 |
+
@classmethod
|
| 66 |
+
async def save_upload(cls, file_content: bytes, original_filename: str) -> Path:
|
| 67 |
+
"""
|
| 68 |
+
Save uploaded file to temporary location.
|
| 69 |
+
|
| 70 |
+
Args:
|
| 71 |
+
file_content: File bytes
|
| 72 |
+
original_filename: Original filename for extension
|
| 73 |
+
|
| 74 |
+
Returns:
|
| 75 |
+
Path to saved file
|
| 76 |
+
"""
|
| 77 |
+
ext = original_filename.rsplit('.', 1)[-1].lower() if '.' in original_filename else 'wav'
|
| 78 |
+
unique_id = str(uuid.uuid4())[:8]
|
| 79 |
+
filename = f"{unique_id}.{ext}"
|
| 80 |
+
filepath = settings.upload_dir / filename
|
| 81 |
+
|
| 82 |
+
# Write file asynchronously
|
| 83 |
+
loop = asyncio.get_event_loop()
|
| 84 |
+
await loop.run_in_executor(None, lambda: filepath.write_bytes(file_content))
|
| 85 |
+
|
| 86 |
+
logger.debug(f"Saved upload: {filepath}")
|
| 87 |
+
return filepath
|
| 88 |
+
|
| 89 |
+
@classmethod
|
| 90 |
+
async def convert_to_wav(cls, input_path: Path) -> Path:
|
| 91 |
+
"""
|
| 92 |
+
Convert audio to 16kHz mono WAV using FFmpeg.
|
| 93 |
+
|
| 94 |
+
Args:
|
| 95 |
+
input_path: Path to input audio file
|
| 96 |
+
|
| 97 |
+
Returns:
|
| 98 |
+
Path to converted WAV file
|
| 99 |
+
"""
|
| 100 |
+
output_filename = f"{input_path.stem}_processed.wav"
|
| 101 |
+
output_path = settings.processed_dir / output_filename
|
| 102 |
+
|
| 103 |
+
try:
|
| 104 |
+
# Run ffmpeg conversion in executor to not block
|
            loop = asyncio.get_event_loop()
            await loop.run_in_executor(None, lambda: cls._run_ffmpeg_conversion(input_path, output_path))

            logger.info(f"Converted to WAV: {output_path}")
            return output_path

        except ffmpeg.Error as e:
            error_msg = e.stderr.decode() if e.stderr else str(e)
            logger.error(f"FFmpeg error: {error_msg}")
            raise AudioProcessingError(f"Audio conversion failed: {error_msg}")

    @staticmethod
    def _run_ffmpeg_conversion(input_path: Path, output_path: Path) -> None:
        """Run the actual FFmpeg conversion (blocking)."""
        stream = ffmpeg.input(str(input_path))

        # Apply normalization if enabled (loudnorm is best for speech consistency)
        if settings.enable_loudnorm:
            logger.debug("Applying loudnorm normalization...")
            stream = stream.filter('loudnorm', I=-16, TP=-1.5, LRA=11)

        # Apply noise reduction if enabled (Note: basic filters are kept as minor cleanup)
        if settings.enable_noise_reduction:
            logger.debug("Applying subtle highpass filter...")
            stream = stream.filter('highpass', f=80)

        (
            stream
            .output(
                str(output_path),
                acodec='pcm_s16le',
                ar=16000,
                ac=1
            )
            .overwrite_output()
            .run(quiet=True, capture_stderr=True)
        )

    @classmethod
    async def get_audio_duration(cls, filepath: Path) -> float:
        """
        Get audio file duration in seconds.

        Args:
            filepath: Path to audio file

        Returns:
            Duration in seconds
        """
        try:
            loop = asyncio.get_event_loop()
            probe = await loop.run_in_executor(
                None,
                lambda: ffmpeg.probe(str(filepath))
            )

            duration = float(probe['format'].get('duration', 0))
            return duration

        except ffmpeg.Error as e:
            logger.warning(f"Could not probe audio duration: {e}")
            return 0.0

    @classmethod
    async def cleanup_files(cls, *filepaths: Path) -> None:
        """
        Delete temporary files.

        Args:
            filepaths: Paths to files to delete
        """
        for filepath in filepaths:
            try:
                if filepath and filepath.exists():
                    filepath.unlink()
                    logger.debug(f"Cleaned up: {filepath}")
            except Exception as e:
                logger.warning(f"Failed to clean up {filepath}: {e}")

    @classmethod
    async def process_upload(cls, file_content: bytes, filename: str) -> Tuple[Path, float]:
        """
        Full upload processing pipeline: validate, save, convert.

        Args:
            file_content: Uploaded file bytes
            filename: Original filename

        Returns:
            Tuple of (processed WAV path, duration in seconds)
        """
        # Validate
        cls.validate_file(filename, len(file_content))

        # Save original
        original_path = await cls.save_upload(file_content, filename)
        vocals_path = None

        try:
            # Step 1: Denoising (Speech Enhancement)
            if settings.enable_denoiser:
                denoised_path = await DenoiserService.enhance_audio(original_path)
                source_for_separation = denoised_path
            else:
                source_for_separation = original_path
                denoised_path = None

            # Step 2: Vocal separation using MDX-Net
            if settings.enable_vocal_separation:
                vocals_path = await VocalSeparator.separate_vocals(source_for_separation)
                source_for_conversion = vocals_path
            else:
                source_for_conversion = source_for_separation
                vocals_path = None

            # Step 3: Convert to 16kHz mono WAV (includes normalization)
            wav_path = await cls.convert_to_wav(source_for_conversion)

            # Get duration
            duration = await cls.get_audio_duration(wav_path)

            # Cleanup intermediate files
            to_cleanup = [original_path]
            if denoised_path and denoised_path != original_path:
                to_cleanup.append(denoised_path)
            if vocals_path and vocals_path not in [original_path, denoised_path]:
                to_cleanup.append(vocals_path)

            await cls.cleanup_files(*to_cleanup)

            return wav_path, duration

        except Exception:
            # Cleanup on error
            await cls.cleanup_files(original_path)
            if 'denoised_path' in locals() and denoised_path and denoised_path != original_path:
                await cls.cleanup_files(denoised_path)
            if 'vocals_path' in locals() and vocals_path and vocals_path not in [original_path, denoised_path]:
                await cls.cleanup_files(vocals_path)
            raise
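Taken together, `process_upload` chains the two optional enhancement stages ahead of the mandatory WAV conversion. A minimal caller sketch, assuming a local test file `sample.mp3` (the real entry point is the upload route in app/api/routes.py; the function and file names here are illustrative only):

import asyncio
from pathlib import Path

from app.services.audio_processor import AudioProcessor

async def handle_upload(file_content: bytes, filename: str) -> None:
    # Validates, saves, optionally denoises/separates, then converts to 16kHz mono WAV
    wav_path, duration = await AudioProcessor.process_upload(file_content, filename)
    print(f"Ready for transcription: {wav_path} ({duration:.1f}s)")

if __name__ == "__main__":
    audio_bytes = Path("sample.mp3").read_bytes()  # assumed local test file
    asyncio.run(handle_upload(audio_bytes, "sample.mp3"))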
app/services/denoiser.py
ADDED
@@ -0,0 +1,142 @@
"""
Speech Enhancement Service using Facebook's Denoiser.
Removes background noise and enhances speech quality.
"""
import os
import asyncio
import logging
from pathlib import Path

import torch
import torchaudio

from app.core.config import get_settings

logger = logging.getLogger(__name__)
settings = get_settings()


class DenoiserError(Exception):
    """Custom exception for denoiser errors."""
    pass


class DenoiserService:
    """
    Service for enhancing speech using Facebook's Denoiser models.
    Supports dns48, dns64 and master64.
    """

    _model = None
    _model_name: str = None

    @classmethod
    def _get_model(cls):
        """Lazy load the Denoiser model."""
        if cls._model is None or cls._model_name != settings.denoiser_model:
            from denoiser.pretrained import dns48, dns64, master64

            model_map = {
                "dns48": dns48,
                "dns64": dns64,
                "master64": master64
            }

            model_func = model_map.get(settings.denoiser_model, dns64)
            logger.debug(f"Loading Denoiser model: {settings.denoiser_model}")

            model = model_func()
            device = settings.resolved_device
            model.to(device)
            model.eval()

            cls._model = model
            cls._model_name = settings.denoiser_model
            logger.debug(f"Denoiser model loaded on {device}")

        return cls._model

    @classmethod
    async def enhance_audio(cls, input_path: Path) -> Path:
        """
        Enhance audio by removing noise.

        Args:
            input_path: Path to input audio file

        Returns:
            Path to enhanced WAV file
        """
        if not settings.enable_denoiser:
            logger.debug("Denoiser disabled, skipping...")
            return input_path

        logger.debug(f"Starting speech enhancement for: {input_path.name}")

        try:
            # Run enhancement in executor to not block
            loop = asyncio.get_event_loop()
            enhanced_path = await loop.run_in_executor(
                None,
                lambda: cls._run_enhancement(input_path)
            )

            logger.info(f"Speech enhancement complete: {enhanced_path.name}")
            return enhanced_path

        except Exception as e:
            logger.error(f"Speech enhancement failed: {e}")
            # Fall back to the original audio on failure rather than failing the whole pipeline
            logger.warning("Falling back to original audio.")
            return input_path

    @classmethod
    def _run_enhancement(cls, input_path: Path) -> Path:
        """Run the actual denoiser enhancement (blocking)."""
        model = cls._get_model()
        device = settings.resolved_device

        # Load audio
        wav, sr = torchaudio.load(str(input_path))
        wav = wav.to(device)

        # Ensure correct sample rate for the model
        if sr != model.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, model.sample_rate).to(device)
            wav = resampler(wav)
            sr = model.sample_rate

        # The pretrained Demucs denoisers are single-channel; downmix if needed
        if wav.dim() == 2 and wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)

        # Note: denoiser.enhance.enhance() operates on file datasets described by
        # its args, not on raw tensors, so we run the model's forward pass
        # directly. dry=0.0 means the output is the fully denoised estimate.
        dry = 0.0

        with torch.no_grad():
            # Model expects [batch, channels, time]
            estimate = model(wav.unsqueeze(0)).squeeze(0)
            enhanced = dry * wav + (1.0 - dry) * estimate

        # Save enhanced audio
        output_filename = f"{input_path.stem}_denoised.wav"
        output_path = settings.processed_dir / output_filename

        torchaudio.save(
            str(output_path),
            enhanced.cpu(),
            sr
        )

        return output_path
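A minimal usage sketch of the async entry point, assuming `noisy.wav` exists locally and the dns64 weights can be downloaded on first use (both assumptions for illustration):

import asyncio
from pathlib import Path

from app.services.denoiser import DenoiserService

async def main() -> None:
    enhanced = await DenoiserService.enhance_audio(Path("noisy.wav"))
    # On failure the service falls back to the input path instead of raising
    print(f"Enhanced file: {enhanced}")

asyncio.run(main())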
app/services/diarization.py
ADDED
@@ -0,0 +1,180 @@
"""
Speaker diarization service using pyannote.audio.
Identifies speaker turns in audio files.
"""
import os
import logging
from pathlib import Path
from typing import List, Optional
from dataclasses import dataclass

import torch

from app.core.config import get_settings

logger = logging.getLogger(__name__)
settings = get_settings()


@dataclass
class SpeakerSegment:
    """A segment of audio attributed to a specific speaker."""
    start: float
    end: float
    speaker: str


class DiarizationService:
    """
    Service for speaker diarization using pyannote.audio.
    Implements lazy loading to avoid memory overhead at startup.
    """

    _instance: Optional["DiarizationService"] = None
    _pipeline = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    @classmethod
    def get_pipeline(cls):
        """
        Get or load the diarization pipeline (lazy loading with caching).

        Returns:
            Loaded pyannote Pipeline
        """
        if cls._pipeline is None:
            # Import here to avoid loading if not used
            from pyannote.audio import Pipeline

            hf_token = settings.hf_token
            if not hf_token:
                raise ValueError(
                    "HuggingFace token required for pyannote.audio. "
                    "Set HF_TOKEN in your environment or .env file."
                )

            logger.debug(f"Loading diarization pipeline: {settings.diarization_model}")

            # Use 'token' parameter (use_auth_token is deprecated)
            cls._pipeline = Pipeline.from_pretrained(
                settings.diarization_model,
                token=hf_token
            )

            # Move to GPU if available
            device = torch.device(settings.resolved_device)
            if device.type == "cuda":
                cls._pipeline = cls._pipeline.to(device)
                logger.debug("Diarization pipeline moved to GPU")

            logger.debug("Diarization pipeline loaded successfully")

        return cls._pipeline

    @classmethod
    def is_loaded(cls) -> bool:
        """Check if pipeline is loaded."""
        return cls._pipeline is not None

    @classmethod
    def diarize(
        cls,
        audio_path: Path,
        num_speakers: Optional[int] = None,
        min_speakers: int = 1,
        max_speakers: int = 10
    ) -> List[SpeakerSegment]:
        """
        Perform speaker diarization on audio file.

        Args:
            audio_path: Path to WAV audio file
            num_speakers: Exact number of speakers (None for auto-detect)
            min_speakers: Minimum number of speakers to detect
            max_speakers: Maximum number of speakers to detect

        Returns:
            List of SpeakerSegment with speaker labels
        """
        pipeline = cls.get_pipeline()

        logger.debug(f"Diarizing: {audio_path}")

        # Build parameters
        params = {}
        if num_speakers is not None:
            params["num_speakers"] = num_speakers
        else:
            params["min_speakers"] = min_speakers
            params["max_speakers"] = max_speakers

        # Run diarization
        diarization = pipeline(str(audio_path), **params)

        # Handle pyannote.audio 4.x breaking change
        # In 4.x, pipeline returns a DiarizeOutput object wrapping the Annotation
        # In 3.x, it returns the Annotation directly
        annotation = diarization
        if hasattr(diarization, "speaker_diarization"):
            annotation = diarization.speaker_diarization
            logger.debug("Detected pyannote.audio 4.x DiarizeOutput structure")

        # Convert to segments
        segments = []
        speaker_map = {}  # Map SPEAKER_XX to Speaker 1, 2, etc.

        for turn, _, speaker in annotation.itertracks(yield_label=True):
            # Create readable speaker label
            if speaker not in speaker_map:
                speaker_map[speaker] = f"Speaker {len(speaker_map) + 1}"

            segments.append(SpeakerSegment(
                start=turn.start,
                end=turn.end,
                speaker=speaker_map[speaker]
            ))

        logger.info(f"Diarization complete: {len(segments)} turns, {len(speaker_map)} speakers")

        return segments

    @classmethod
    async def diarize_async(
        cls,
        audio_path: Path,
        num_speakers: Optional[int] = None,
        min_speakers: int = 1,
        max_speakers: int = 10
    ) -> List[SpeakerSegment]:
        """
        Async wrapper for diarization (runs in thread pool).

        Args:
            audio_path: Path to WAV audio file
            num_speakers: Exact number of speakers
            min_speakers: Minimum speakers
            max_speakers: Maximum speakers

        Returns:
            List of SpeakerSegment
        """
        import asyncio

        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(
            None,
            lambda: cls.diarize(audio_path, num_speakers, min_speakers, max_speakers)
        )

    @classmethod
    def preload_pipeline(cls) -> None:
        """Preload the pipeline during startup."""
        try:
            cls.get_pipeline()
        except Exception as e:
            logger.warning(f"Failed to preload diarization pipeline: {e}")
            # Don't raise - diarization is optional, app can work without it
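A minimal synchronous usage sketch, assuming HF_TOKEN is set (the pipeline raises a ValueError otherwise, as above) and that `meeting.wav` exists locally; both are assumptions for illustration:

from pathlib import Path

from app.services.diarization import DiarizationService

# Auto-detect between 1 and 10 speakers (the defaults above)
segments = DiarizationService.diarize(Path("meeting.wav"))

for seg in segments:
    print(f"{seg.start:7.2f}s - {seg.end:7.2f}s  {seg.speaker}")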
app/services/orchestrator.py
ADDED
@@ -0,0 +1,84 @@
"""
Pipeline Orchestrator for PrecisionVoice.
Coordinates transcription and diarization in parallel.
"""
import time
import asyncio
import logging
from pathlib import Path

from app.core.config import get_settings
from app.schemas.models import TranscriptionResponse
from app.services.transcription import TranscriptionService
from app.services.diarization import DiarizationService
from app.services.alignment import AlignmentService

logger = logging.getLogger(__name__)
settings = get_settings()


class PipelineOrchestrator:
    """
    Coordinates the AI pipeline with detailed server-side logging:
    1. Audio -> Vocal Separation (MDX-Net) -> 16kHz WAV
    2. Whisper (Transcribe) + Pyannote (Diarize) in parallel
    3. Alignment (Matching Algorithm)
    4. Generate outputs (TXT, SRT)
    """

    @classmethod
    async def process_audio(
        cls,
        wav_path: Path,
        duration: float
    ) -> TranscriptionResponse:
        """
        Run the full processing pipeline and return the final response.
        Each step is logged for server-side monitoring.
        """
        start_time = time.time()

        # Step 1: Pre-processing (Vocal Separation + Noise Reduction)
        logger.info(f"[Step 1/4] Audio pre-processing completed (MDX-Net: {settings.enable_vocal_separation}, Denoise: {settings.enable_noise_reduction})")

        # Step 2: AI Processing (Transcription & Diarization)
        logger.info(f"[Step 2/4] Starting AI models (Whisper + Pyannote) for: {wav_path.name}")

        transcription_task = TranscriptionService.transcribe_async(wav_path)
        diarization_task = DiarizationService.diarize_async(wav_path)

        try:
            word_timestamps, speaker_segments = await asyncio.gather(
                transcription_task,
                diarization_task,
                return_exceptions=False
            )
            logger.info(f"AI models processing completed: {len(word_timestamps)} words, {len(speaker_segments)} segments")
        except Exception:
            logger.exception("Parallel task failed")
            raise

        # Step 3: Precision Alignment
        logger.info("[Step 3/4] Aligning words with speaker turns...")
        aligned_segments = AlignmentService.align_precision(word_timestamps, speaker_segments)

        # Count unique speakers
        speakers = set(seg.speaker for seg in aligned_segments)

        # Step 4: Export Generation
        logger.info("[Step 4/4] Generating export files (TXT, SRT)...")
        base_filename = wav_path.stem.replace("_processed", "")
        txt_path, srt_path = AlignmentService.generate_outputs(aligned_segments, base_filename)

        processing_time = time.time() - start_time
        logger.info(f"Pipeline complete for {wav_path.name} in {processing_time:.2f}s")

        return TranscriptionResponse(
            success=True,
            message="Transcription completed successfully",
            segments=aligned_segments,
            duration=duration,
            num_speakers=len(speakers),
            processing_time=round(processing_time, 2),
            download_txt=f"/api/download/{txt_path.name}",
            download_srt=f"/api/download/{srt_path.name}"
        )
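The concurrency in Step 2 comes from the pattern both services share: blocking inference is pushed onto the default thread pool via run_in_executor, which releases the event loop so asyncio.gather can overlap the two awaits. A self-contained toy of that pattern, with sleeps standing in for model inference (illustrative only, not part of the app):

import asyncio
import time

def blocking_work(name: str, seconds: float) -> str:
    time.sleep(seconds)  # stands in for Whisper / pyannote inference
    return name

async def main() -> None:
    loop = asyncio.get_event_loop()
    start = time.time()
    a, b = await asyncio.gather(
        loop.run_in_executor(None, lambda: blocking_work("transcribe", 1.0)),
        loop.run_in_executor(None, lambda: blocking_work("diarize", 1.0)),
    )
    # The two tasks overlap, so this prints roughly 1s rather than 2s
    print(a, b, f"{time.time() - start:.1f}s")

asyncio.run(main())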
app/services/transcription.py
ADDED
@@ -0,0 +1,168 @@
"""
Transcription service using faster-whisper.
Loads the CTranslate2 Whisper model configured via WHISPER_MODEL (settings.whisper_model) for Vietnamese STT.
Returns word-level timestamps for precision alignment.
"""
import logging
from pathlib import Path
from typing import List, Optional
from dataclasses import dataclass

from faster_whisper import WhisperModel

from app.core.config import get_settings

logger = logging.getLogger(__name__)
settings = get_settings()


@dataclass
class WordTimestamp:
    """A single word with precise timestamp."""
    word: str
    start: float
    end: float


@dataclass
class TranscriptSegmentRaw:
    """Raw segment from Whisper transcription with word-level data."""
    start: float
    end: float
    text: str
    words: List[WordTimestamp]


class TranscriptionService:
    """
    Service for speech-to-text transcription using faster-whisper.
    Implements singleton pattern for model caching.
    Returns word-level timestamps for precision speaker alignment.
    """

    _instance: Optional["TranscriptionService"] = None
    _model: Optional[WhisperModel] = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    @classmethod
    def get_model(cls) -> WhisperModel:
        """
        Get or load the Whisper model (lazy loading with caching).

        Returns:
            Loaded WhisperModel instance
        """
        if cls._model is None:
            logger.debug(f"Loading Whisper model: {settings.whisper_model}")
            logger.debug(f"Device: {settings.resolved_device}, Compute type: {settings.resolved_compute_type}")

            cls._model = WhisperModel(
                settings.whisper_model,
                device=settings.resolved_device,
                compute_type=settings.resolved_compute_type,
                download_root=None,  # Use default HF cache
            )

            logger.debug("Whisper model loaded successfully")

        return cls._model

    @classmethod
    def is_loaded(cls) -> bool:
        """Check if model is loaded."""
        return cls._model is not None

    @classmethod
    def transcribe(
        cls,
        audio_path: Path,
        language: str = "vi",
        initial_prompt: Optional[str] = None
    ) -> List[WordTimestamp]:
        """
        Transcribe audio file with word-level timestamps.

        Args:
            audio_path: Path to WAV audio file
            language: Language code (default: Vietnamese)
            initial_prompt: Optional prompt for context

        Returns:
            List of WordTimestamp with precise timing for each word
        """
        model = cls.get_model()

        logger.debug(f"Transcribing: {audio_path}")

        # Run transcription with word timestamps - CRITICAL for precision alignment
        segments_generator, info = model.transcribe(
            str(audio_path),
            language=language,
            initial_prompt=initial_prompt,
            word_timestamps=True,  # CRITICAL: Enable word-level timestamps
            vad_filter=True,  # Re-enabled for optimization
            vad_parameters=dict(
                threshold=settings.vad_threshold,
                min_speech_duration_ms=settings.vad_min_speech_duration_ms,
                min_silence_duration_ms=settings.vad_min_silence_duration_ms,
            ),
            beam_size=5,
            best_of=5,
        )

        # Extract all words with timestamps
        all_words = []
        segment_count = 0

        for segment in segments_generator:
            segment_count += 1
            if segment.words:
                for word in segment.words:
                    all_words.append(WordTimestamp(
                        word=word.word.strip(),
                        start=word.start,
                        end=word.end
                    ))

        logger.info(f"Transcription complete: {segment_count} segments, {len(all_words)} words, detected language: {info.language}")

        return all_words

    @classmethod
    async def transcribe_async(
        cls,
        audio_path: Path,
        language: str = "vi",
        initial_prompt: Optional[str] = None
    ) -> List[WordTimestamp]:
        """
        Async wrapper for transcription (runs in thread pool).

        Args:
            audio_path: Path to WAV audio file
            language: Language code
            initial_prompt: Optional prompt

        Returns:
            List of WordTimestamp
        """
        import asyncio

        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(
            None,
            lambda: cls.transcribe(audio_path, language, initial_prompt)
        )

    @classmethod
    def preload_model(cls) -> None:
        """Preload the model during startup."""
        try:
            cls.get_model()
        except Exception as e:
            logger.error(f"Failed to preload Whisper model: {e}")
            raise
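A minimal usage sketch, assuming the configured model can be downloaded and that `speech.wav` exists locally (both assumptions for illustration):

from pathlib import Path

from app.services.transcription import TranscriptionService

words = TranscriptionService.transcribe(Path("speech.wav"), language="vi")

# Each entry carries its own start/end, which is what the alignment step consumes
for w in words[:10]:
    print(f"{w.start:6.2f}-{w.end:6.2f}  {w.word}")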
app/services/vocal_separator.py
ADDED
@@ -0,0 +1,118 @@
"""
Vocal Separation Service using MDX-Net (via audio-separator).
Isolates vocals from audio files using state-of-the-art MDX-Net models.
"""
import os
import asyncio
import logging
from pathlib import Path
from typing import Optional

from app.core.config import get_settings

logger = logging.getLogger(__name__)
settings = get_settings()


class VocalSeparationError(Exception):
    """Custom exception for vocal separation errors."""
    pass


class VocalSeparator:
    """
    Service for separating vocals from audio using MDX-Net.
    Uses the audio-separator library which supports UVR models.
    """

    _separator = None
    _model_name: str = None

    @classmethod
    def _get_separator(cls):
        """Lazy load the Audio Separator."""
        if cls._separator is None or cls._model_name != settings.mdx_model:
            from audio_separator.separator import Separator

            logger.debug(f"Initializing MDX-Net separator with model: {settings.mdx_model}")

            # Initialize separator
            # Note: audio-separator expects output_dir to exist
            settings.processed_dir.mkdir(parents=True, exist_ok=True)

            separator = Separator(
                output_dir=str(settings.processed_dir),
                output_format="WAV",
                normalization_threshold=0.9
            )

            # Load model
            separator.load_model(settings.mdx_model)

            cls._separator = separator
            cls._model_name = settings.mdx_model
            logger.debug(f"MDX-Net model loaded on {settings.resolved_device}")

        return cls._separator

    @classmethod
    async def separate_vocals(cls, input_path: Path) -> Path:
        """
        Separate vocals from audio file using MDX-Net.

        Args:
            input_path: Path to input audio file

        Returns:
            Path to separated vocals WAV file
        """
        if not settings.enable_vocal_separation:
            logger.debug("Vocal separation disabled, skipping...")
            return input_path

        logger.debug(f"Starting vocal separation for: {input_path.name}")

        try:
            # Run separation in executor to not block
            loop = asyncio.get_event_loop()
            vocals_path = await loop.run_in_executor(
                None,
                lambda: cls._run_separation(input_path)
            )

            logger.info(f"Vocal separation complete: {vocals_path.name}")
            return vocals_path

        except Exception as e:
            logger.error(f"Vocal separation failed: {e}")
            # Fall back to the original audio
            logger.warning("Falling back to original audio.")
            return input_path

    @classmethod
    def _run_separation(cls, input_path: Path) -> Path:
        """Run the actual separation (blocking)."""
        separator = cls._get_separator()

        # separate() returns a list of output filenames
        output_files = separator.separate(str(input_path))

        # audio-separator usually produces multiple files (Vocals, Instrumental).
        # We need to find the vocals one; it typically names them like
        # {input_stem}_(Vocals)_{model}.wav
        vocals_file = None
        for file in output_files:
            if "Vocals" in file:
                vocals_file = settings.processed_dir / file
                break

        if not vocals_file:
            # If we can't identify the vocals file specifically, take the first one or fail
            logger.warning("Could not identify vocals stem in output files.")
            if output_files:
                vocals_file = settings.processed_dir / output_files[0]
            else:
                raise VocalSeparationError("No output files generated by separator.")

        return vocals_file
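A minimal usage sketch of the async entry point, assuming the MDX-Net weights can be fetched on first use and that `song.mp3` exists locally (both assumptions for illustration):

import asyncio
from pathlib import Path

from app.services.vocal_separator import VocalSeparator

async def main() -> None:
    vocals = await VocalSeparator.separate_vocals(Path("song.mp3"))
    # On failure this returns the input path, so the pipeline never stalls here
    print(f"Vocals stem: {vocals}")

asyncio.run(main())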
app/static/css/style.css
ADDED
@@ -0,0 +1,673 @@
/* ================================
   PrecisionVoice - Modern Dark Theme
   ================================ */

:root {
    /* Color Palette */
    --bg-primary: #0a0a0f;
    --bg-secondary: #12121a;
    --bg-card: rgba(255, 255, 255, 0.03);
    --bg-card-hover: rgba(255, 255, 255, 0.05);

    --text-primary: #ffffff;
    --text-secondary: #a0a0b0;
    --text-muted: #606070;

    --accent-primary: #6366f1;
    --accent-secondary: #8b5cf6;
    --accent-gradient: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%);

    --success: #10b981;
    --error: #ef4444;
    --warning: #f59e0b;

    --border-color: rgba(255, 255, 255, 0.08);
    --border-glow: rgba(99, 102, 241, 0.3);

    /* Spacing */
    --spacing-xs: 0.25rem;
    --spacing-sm: 0.5rem;
    --spacing-md: 1rem;
    --spacing-lg: 1.5rem;
    --spacing-xl: 2rem;
    --spacing-2xl: 3rem;

    /* Border Radius */
    --radius-sm: 0.375rem;
    --radius-md: 0.75rem;
    --radius-lg: 1rem;
    --radius-xl: 1.5rem;

    /* Shadows */
    --shadow-sm: 0 2px 8px rgba(0, 0, 0, 0.3);
    --shadow-md: 0 4px 16px rgba(0, 0, 0, 0.4);
    --shadow-lg: 0 8px 32px rgba(0, 0, 0, 0.5);
    --shadow-glow: 0 0 40px rgba(99, 102, 241, 0.15);

    /* Transitions */
    --transition-fast: 0.15s ease;
    --transition-normal: 0.3s ease;
    --transition-slow: 0.5s ease;
}

/* ================================
   Base Styles
   ================================ */

*,
*::before,
*::after {
    box-sizing: border-box;
    margin: 0;
    padding: 0;
}

html {
    font-size: 16px;
    scroll-behavior: smooth;
}

body {
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
    background: var(--bg-primary);
    color: var(--text-primary);
    line-height: 1.6;
    min-height: 100vh;
    -webkit-font-smoothing: antialiased;
    -moz-osx-font-smoothing: grayscale;
}

/* Animated background gradient */
body::before {
    content: '';
    position: fixed;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    background:
        radial-gradient(ellipse at 20% 20%, rgba(99, 102, 241, 0.08) 0%, transparent 50%),
        radial-gradient(ellipse at 80% 80%, rgba(139, 92, 246, 0.06) 0%, transparent 50%),
        radial-gradient(ellipse at 50% 50%, rgba(168, 85, 247, 0.04) 0%, transparent 70%);
    pointer-events: none;
    z-index: -1;
}

/* ================================
   Layout
   ================================ */

.app-container {
    max-width: 800px;
    margin: 0 auto;
    padding: var(--spacing-lg);
    min-height: 100vh;
    display: flex;
    flex-direction: column;
}

/* ================================
   Header
   ================================ */

.header {
    text-align: center;
    padding: var(--spacing-2xl) 0;
}

.logo {
    display: flex;
    align-items: center;
    justify-content: center;
    gap: var(--spacing-md);
    margin-bottom: var(--spacing-sm);
}

.logo-icon {
    width: 48px;
    height: 48px;
    background: var(--accent-gradient);
    border-radius: var(--radius-lg);
    display: flex;
    align-items: center;
    justify-content: center;
    box-shadow: var(--shadow-glow);
}

.logo-icon svg {
    width: 28px;
    height: 28px;
    color: white;
}

.logo h1 {
    font-size: 2rem;
    font-weight: 700;
    background: var(--accent-gradient);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
}

.tagline {
    color: var(--text-secondary);
    font-size: 1rem;
    font-weight: 400;
}

/* ================================
   Cards
   ================================ */

.card {
    background: var(--bg-card);
    backdrop-filter: blur(20px);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-xl);
    padding: var(--spacing-xl);
    margin-bottom: var(--spacing-lg);
    transition: var(--transition-normal);
}

.card:hover {
    border-color: var(--border-glow);
    box-shadow: var(--shadow-glow);
}

.card-header {
    display: flex;
    align-items: center;
    justify-content: space-between;
    margin-bottom: var(--spacing-lg);
    flex-wrap: wrap;
    gap: var(--spacing-sm);
}

.card-header h2 {
    font-size: 1.25rem;
    font-weight: 600;
}

/* ================================
   Badge
   ================================ */

.badge {
    display: inline-block;
    padding: var(--spacing-xs) var(--spacing-sm);
    background: rgba(99, 102, 241, 0.15);
    color: var(--accent-primary);
    border-radius: var(--radius-sm);
    font-size: 0.75rem;
    font-weight: 500;
    text-transform: uppercase;
    letter-spacing: 0.5px;
}

/* ================================
   Upload Zone
   ================================ */

.upload-zone {
    border: 2px dashed var(--border-color);
    border-radius: var(--radius-lg);
    padding: var(--spacing-2xl);
    text-align: center;
    cursor: pointer;
    transition: var(--transition-normal);
    margin-bottom: var(--spacing-lg);
}

.upload-zone:hover,
.upload-zone.dragover {
    border-color: var(--accent-primary);
    background: rgba(99, 102, 241, 0.05);
}

.upload-zone.dragover {
    transform: scale(1.02);
}

.upload-icon {
    width: 64px;
    height: 64px;
    margin: 0 auto var(--spacing-md);
    background: var(--accent-gradient);
    border-radius: 50%;
    display: flex;
    align-items: center;
    justify-content: center;
    opacity: 0.8;
}

.upload-icon svg {
    width: 32px;
    height: 32px;
    color: white;
}

.upload-text {
    font-size: 1.125rem;
    font-weight: 500;
    color: var(--text-primary);
    margin-bottom: var(--spacing-xs);
}

.upload-subtext {
    color: var(--text-muted);
    font-size: 0.875rem;
}

/* ================================
   File Info
   ================================ */

.file-info {
    display: flex;
    align-items: center;
    justify-content: space-between;
    padding: var(--spacing-md);
    background: rgba(99, 102, 241, 0.1);
    border-radius: var(--radius-md);
    margin-bottom: var(--spacing-lg);
}

.file-details {
    display: flex;
    flex-direction: column;
    gap: var(--spacing-xs);
}

.file-name {
    font-weight: 500;
    color: var(--text-primary);
}

.file-size {
    font-size: 0.875rem;
    color: var(--text-secondary);
}

/* ================================
   Buttons
   ================================ */

.btn {
    display: inline-flex;
    align-items: center;
    justify-content: center;
    gap: var(--spacing-sm);
    padding: var(--spacing-md) var(--spacing-xl);
    border: none;
    border-radius: var(--radius-md);
    font-family: inherit;
    font-size: 1rem;
    font-weight: 500;
    cursor: pointer;
    transition: var(--transition-fast);
    text-decoration: none;
}

.btn:disabled {
    opacity: 0.5;
    cursor: not-allowed;
}

.btn svg {
    width: 20px;
    height: 20px;
}

.btn-primary {
    width: 100%;
    background: var(--accent-gradient);
    color: white;
    box-shadow: var(--shadow-md);
}

.btn-primary:hover:not(:disabled) {
    transform: translateY(-2px);
    box-shadow: var(--shadow-lg), var(--shadow-glow);
}

.btn-primary:active:not(:disabled) {
    transform: translateY(0);
}

.btn-secondary {
    background: var(--bg-card);
    color: var(--text-primary);
    border: 1px solid var(--border-color);
}

.btn-secondary:hover:not(:disabled) {
    background: var(--bg-card-hover);
    border-color: var(--accent-primary);
}

.btn-outline {
    background: transparent;
    color: var(--text-primary);
    border: 1px solid var(--border-color);
    padding: var(--spacing-sm) var(--spacing-md);
}

.btn-outline:hover {
    background: var(--bg-card);
    border-color: var(--accent-primary);
}

.btn-clear {
    width: 36px;
    height: 36px;
    padding: 0;
    background: transparent;
    color: var(--text-muted);
}

.btn-clear:hover {
    color: var(--error);
}

/* ================================
   Processing Section
   ================================ */

.processing-content {
    text-align: center;
    padding: var(--spacing-xl) 0;
}

.spinner {
    width: 56px;
    height: 56px;
    margin: 0 auto var(--spacing-lg);
    border: 3px solid var(--border-color);
    border-top-color: var(--accent-primary);
    border-radius: 50%;
    animation: spin 1s linear infinite;
}

@keyframes spin {
    to {
        transform: rotate(360deg);
    }
}

.processing-content h3 {
    font-size: 1.25rem;
    margin-bottom: var(--spacing-sm);
}

.processing-content p {
    color: var(--text-secondary);
    margin-bottom: var(--spacing-lg);
}

.progress-bar {
    height: 6px;
    background: var(--bg-secondary);
    border-radius: var(--radius-sm);
    overflow: hidden;
    margin-bottom: var(--spacing-md);
}

.progress-fill {
    height: 100%;
    width: 0%;
    background: var(--accent-gradient);
    border-radius: var(--radius-sm);
    transition: width 0.3s ease;
    animation: pulse 2s ease-in-out infinite;
}

@keyframes pulse {
    0%,
    100% {
        opacity: 1;
    }

    50% {
        opacity: 0.7;
    }
}

.processing-hint {
    font-size: 0.875rem;
    color: var(--text-muted);
}

.timer-display {
    font-size: 2rem;
    font-weight: 700;
    color: var(--accent-primary);
    margin: var(--spacing-md) 0;
    font-family: monospace;
    text-shadow: 0 0 10px rgba(99, 102, 241, 0.3);
}

/* ================================
   Results Section
   ================================ */

.result-meta {
    display: flex;
    gap: var(--spacing-sm);
    flex-wrap: wrap;
}

.download-buttons {
    display: flex;
    gap: var(--spacing-md);
    margin-bottom: var(--spacing-lg);
    flex-wrap: wrap;
}

.transcript-container {
    max-height: 400px;
    overflow-y: auto;
    padding-right: var(--spacing-sm);
    margin-bottom: var(--spacing-lg);
}

.transcript-container::-webkit-scrollbar {
    width: 6px;
}

.transcript-container::-webkit-scrollbar-track {
    background: var(--bg-secondary);
    border-radius: var(--radius-sm);
}

.transcript-container::-webkit-scrollbar-thumb {
    background: var(--border-color);
    border-radius: var(--radius-sm);
}

.transcript-container::-webkit-scrollbar-thumb:hover {
    background: var(--text-muted);
}

/* Transcript Segment */
.segment {
    padding: var(--spacing-md);
    border-radius: var(--radius-md);
    margin-bottom: var(--spacing-sm);
    background: var(--bg-secondary);
    border-left: 3px solid var(--accent-primary);
    transition: var(--transition-fast);
}

.segment:hover {
    background: var(--bg-card-hover);
}

.segment-header {
    display: flex;
    align-items: center;
    gap: var(--spacing-md);
    margin-bottom: var(--spacing-xs);
    flex-wrap: wrap;
}

.segment-speaker {
    font-weight: 600;
    color: var(--accent-primary);
}

.segment-time {
    font-size: 0.75rem;
    color: var(--text-muted);
    font-family: monospace;
}

.segment-text {
    color: var(--text-primary);
    line-height: 1.7;
}

/* Speaker Colors */
.speaker-1 {
    border-left-color: #6366f1;
}

.speaker-1 .segment-speaker {
    color: #6366f1;
}

.speaker-2 {
    border-left-color: #10b981;
}

.speaker-2 .segment-speaker {
    color: #10b981;
}

.speaker-3 {
    border-left-color: #f59e0b;
}

.speaker-3 .segment-speaker {
    color: #f59e0b;
}

.speaker-4 {
    border-left-color: #ec4899;
}

.speaker-4 .segment-speaker {
    color: #ec4899;
}

.speaker-5 {
    border-left-color: #8b5cf6;
}

.speaker-5 .segment-speaker {
    color: #8b5cf6;
}

/* ================================
   Error Section
   ================================ */

.error-content {
    text-align: center;
    padding: var(--spacing-xl) 0;
}

.error-icon {
    width: 64px;
    height: 64px;
    margin: 0 auto var(--spacing-lg);
    background: rgba(239, 68, 68, 0.15);
    border-radius: 50%;
    display: flex;
    align-items: center;
    justify-content: center;
}

.error-icon svg {
    width: 32px;
    height: 32px;
    color: var(--error);
}

.error-content h3 {
    color: var(--error);
    margin-bottom: var(--spacing-sm);
}

.error-content p {
    color: var(--text-secondary);
    margin-bottom: var(--spacing-lg);
}

/* ================================
   Footer
   ================================ */

.footer {
    margin-top: auto;
    padding: var(--spacing-xl) 0;
    text-align: center;
    color: var(--text-muted);
    font-size: 0.875rem;
}

.footer strong {
    color: var(--text-secondary);
}

.footer-note {
    margin-top: var(--spacing-xs);
    font-size: 0.75rem;
}

/* ================================
   Utility Classes
   ================================ */

.hidden {
    display: none !important;
}

/* ================================
   Responsive
   ================================ */

@media (max-width: 640px) {
    :root {
        font-size: 14px;
    }

    .app-container {
        padding: var(--spacing-md);
    }

    .card {
        padding: var(--spacing-lg);
    }

    .upload-zone {
        padding: var(--spacing-xl);
    }

    .card-header {
        flex-direction: column;
        align-items: flex-start;
    }

    .result-meta {
        width: 100%;
    }

    .download-buttons {
        flex-direction: column;
    }

    .download-buttons .btn {
        width: 100%;
    }
}
app/static/js/app.js
ADDED
@@ -0,0 +1,312 @@
/**
 * PrecisionVoice - Frontend Application Logic
 * Handles file upload, transcription requests, and result display.
 */

document.addEventListener('DOMContentLoaded', () => {
    // DOM Elements
    const elements = {
        // Upload
        dropZone: document.getElementById('drop-zone'),
        fileInput: document.getElementById('file-input'),
        fileInfo: document.getElementById('file-info'),
        fileName: document.getElementById('file-name'),
        fileSize: document.getElementById('file-size'),
        clearBtn: document.getElementById('clear-btn'),
        transcribeBtn: document.getElementById('transcribe-btn'),

        // Sections
        uploadSection: document.getElementById('upload-section'),
        processingSection: document.getElementById('processing-section'),
        resultsSection: document.getElementById('results-section'),
        errorSection: document.getElementById('error-section'),

        // Processing
        processingStatus: document.getElementById('processing-status'),
        progressFill: document.getElementById('progress-fill'),
        processingTimer: document.getElementById('processing-timer'),

        // Results
        speakerCount: document.getElementById('speaker-count'),
        durationInfo: document.getElementById('duration-info'),
        processingTime: document.getElementById('processing-time'),
        transcriptContainer: document.getElementById('transcript-container'),
        downloadTxt: document.getElementById('download-txt'),
        downloadSrt: document.getElementById('download-srt'),
        newUploadBtn: document.getElementById('new-upload-btn'),

        // Error
        errorMessage: document.getElementById('error-message'),
        retryBtn: document.getElementById('retry-btn')
    };

    let selectedFile = null;

    // =====================
    // Event Listeners
    // =====================

    // Click to upload
    elements.dropZone.addEventListener('click', () => {
        elements.fileInput.click();
    });

    // File input change
    elements.fileInput.addEventListener('change', (e) => {
        if (e.target.files.length > 0) {
            handleFileSelection(e.target.files[0]);
        }
    });

    // Drag and drop
    elements.dropZone.addEventListener('dragover', (e) => {
        e.preventDefault();
        elements.dropZone.classList.add('dragover');
    });

    elements.dropZone.addEventListener('dragleave', () => {
        elements.dropZone.classList.remove('dragover');
    });

    elements.dropZone.addEventListener('drop', (e) => {
        e.preventDefault();
        elements.dropZone.classList.remove('dragover');

        if (e.dataTransfer.files.length > 0) {
            handleFileSelection(e.dataTransfer.files[0]);
        }
    });

    // Clear file
    elements.clearBtn.addEventListener('click', (e) => {
        e.stopPropagation();
        clearFileSelection();
    });

    // Transcribe button
    elements.transcribeBtn.addEventListener('click', () => {
        if (selectedFile) {
            startTranscription();
        }
    });

    // New upload button
    elements.newUploadBtn.addEventListener('click', resetToUpload);

    // Retry button
    elements.retryBtn.addEventListener('click', resetToUpload);

    // =====================
    // File Handling
    // =====================

    function handleFileSelection(file) {
        const allowedTypes = ['audio/mpeg', 'audio/wav', 'audio/x-wav', 'audio/mp4', 'audio/x-m4a',
            'audio/ogg', 'audio/flac', 'audio/webm', 'video/webm'];
        const allowedExtensions = ['mp3', 'wav', 'm4a', 'ogg', 'flac', 'webm'];

        // Check file extension
        const ext = file.name.split('.').pop().toLowerCase();
        if (!allowedExtensions.includes(ext)) {
            showError(`Unsupported file type: .${ext}. Supported: ${allowedExtensions.join(', ')}`);
            return;
        }

        // Check file size (100MB limit)
        const maxSize = 100 * 1024 * 1024;
        if (file.size > maxSize) {
            showError(`File too large. Maximum size: 100MB`);
            return;
        }

        selectedFile = file;

        // Update UI
        elements.fileName.textContent = file.name;
        elements.fileSize.textContent = formatFileSize(file.size);
        elements.fileInfo.classList.remove('hidden');
        elements.transcribeBtn.disabled = false;

        // Hide drop zone text
        elements.dropZone.style.display = 'none';
    }

    function clearFileSelection() {
        selectedFile = null;
        elements.fileInput.value = '';
        elements.fileInfo.classList.add('hidden');
        elements.transcribeBtn.disabled = true;
        elements.dropZone.style.display = 'block';
    }

    function formatFileSize(bytes) {
        if (bytes === 0) return '0 Bytes';
        const k = 1024;
        const sizes = ['Bytes', 'KB', 'MB', 'GB'];
        const i = Math.floor(Math.log(bytes) / Math.log(k));
        return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
    }

    // =====================
    // Transcription
    // =====================

    async function startTranscription() {
        if (!selectedFile) return;

        // Show processing UI
        showSection('processing');
        updateProgress(100, 'Processing audio... (Check server logs for details)');

        // Reset and start timer
        let seconds = 0;
        elements.processingTimer.textContent = '00:00';
        const timerInterval = setInterval(() => {
            seconds++;
            const m = Math.floor(seconds / 60);
            const s = seconds % 60;
            elements.processingTimer.textContent = `${m.toString().padStart(2, '0')}:${s.toString().padStart(2, '0')}`;
        }, 1000);

        try {
            const formData = new FormData();
            formData.append('file', selectedFile);

            const response = await fetch('/api/transcribe', {
                method: 'POST',
                body: formData
            });

            clearInterval(timerInterval);

            if (!response.ok) {
                const errorData = await response.json();
                throw new Error(errorData.detail || 'Processing failed');
            }

            const result = await response.json();
            displayResults(result);

        } catch (error) {
            clearInterval(timerInterval);
            console.error('Processing error:', error);
            showError(error.message || 'An error occurred during processing');
        }
    }

    function updateProgress(percent, status) {
        elements.progressFill.style.width = `${percent}%`;
        if (status) {
            elements.processingStatus.textContent = status;
        }
    }

    // =====================
    // Results Display
    // =====================

    function displayResults(result) {
        // Update metadata
        elements.speakerCount.textContent = `${result.num_speakers} speaker${result.num_speakers !== 1 ? 's' : ''}`;
        elements.durationInfo.textContent = formatDuration(result.duration);
        elements.processingTime.textContent = `${result.processing_time}s`;

        // Set download links
        elements.downloadTxt.href = result.download_txt;
        elements.downloadSrt.href = result.download_srt;

        // Render transcript segments
        renderTranscript(result.segments);

        // Show results section
        showSection('results');
    }

    function renderTranscript(segments) {
        elements.transcriptContainer.innerHTML = '';

        const speakerColors = {};
        let colorIndex = 0;

        segments.forEach((segment) => {
            // Assign color to speaker
            if (!(segment.speaker in speakerColors)) {
                colorIndex++;
                speakerColors[segment.speaker] = `speaker-${Math.min(colorIndex, 5)}`;
            }

            const segmentEl = document.createElement('div');
            segmentEl.className = `segment ${speakerColors[segment.speaker]}`;

            segmentEl.innerHTML = `
                <div class="segment-header">
                    <span class="segment-speaker">${escapeHtml(segment.speaker)}</span>
                    <span class="segment-time">${formatTime(segment.start)} - ${formatTime(segment.end)}</span>
                </div>
                <p class="segment-text">${escapeHtml(segment.text)}</p>
            `;

            elements.transcriptContainer.appendChild(segmentEl);
        });
    }

    function formatTime(seconds) {
        const h = Math.floor(seconds / 3600);
        const m = Math.floor((seconds % 3600) / 60);
        const s = Math.floor(seconds % 60);

        if (h > 0) {
            return `${h}:${m.toString().padStart(2, '0')}:${s.toString().padStart(2, '0')}`;
        }
        return `${m}:${s.toString().padStart(2, '0')}`;
    }

    function formatDuration(seconds) {
        const m = Math.floor(seconds / 60);
        const s = Math.floor(seconds % 60);
        return `${m}:${s.toString().padStart(2, '0')}`;
    }

    function escapeHtml(text) {
        const div = document.createElement('div');
        div.textContent = text;
        return div.innerHTML;
    }

    // =====================
    // UI State Management
    // =====================

    function showSection(section) {
        elements.uploadSection.classList.add('hidden');
        elements.processingSection.classList.add('hidden');
        elements.resultsSection.classList.add('hidden');
        elements.errorSection.classList.add('hidden');

        switch (section) {
            case 'upload':
                elements.uploadSection.classList.remove('hidden');
                break;
            case 'processing':
                elements.processingSection.classList.remove('hidden');
                break;
            case 'results':
                elements.resultsSection.classList.remove('hidden');
                break;
            case 'error':
                elements.errorSection.classList.remove('hidden');
                break;
        }
    }

    function showError(message) {
        elements.errorMessage.textContent = message;
        showSection('error');
    }

    function resetToUpload() {
        clearFileSelection();
        showSection('upload');
        updateProgress(0, 'Uploading file...');
    }
});
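
Note: the script above posts the selected file as multipart form data to /api/transcribe and reads a JSON response carrying num_speakers, duration, processing_time, segments, and the download_txt / download_srt links. A minimal sketch of driving the same endpoint without the browser UI follows; the host/port and the use of the requests library (not pinned in requirements.txt) are assumptions, and the field name "file" plus the response keys are taken from app.js above.

import requests  # assumption: install separately, not part of requirements.txt

# Hypothetical standalone client for POST /api/transcribe.
with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/api/transcribe",  # host/port are an assumption
        files={"file": ("sample.wav", f, "audio/wav")},
        timeout=600,  # processing can take minutes on long audio
    )
resp.raise_for_status()
result = resp.json()
print(f"{result['num_speakers']} speakers, {result['processing_time']}s")
for seg in result["segments"]:
    print(f"[{seg['speaker']}] {seg['start']:.1f}-{seg['end']:.1f}: {seg['text']}")
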
app/templates/index.html
ADDED
@@ -0,0 +1,162 @@
<!DOCTYPE html>
<html lang="vi">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="description" content="PrecisionVoice - Speech-to-Text and Speaker Diarization powered by AI">
    <title>PrecisionVoice | AI Speech Transcription</title>
    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
    <link rel="stylesheet" href="/static/css/style.css">
</head>

<body>
    <div class="app-container">
        <!-- Header -->
        <header class="header">
            <div class="logo">
                <div class="logo-icon">
                    <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                        <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z" />
                        <path d="M19 10v2a7 7 0 0 1-14 0v-2" />
                        <line x1="12" y1="19" x2="12" y2="23" />
                        <line x1="8" y1="23" x2="16" y2="23" />
                    </svg>
                </div>
                <h1>PrecisionVoice</h1>
            </div>
            <p class="tagline">AI-Powered Speech Transcription with Speaker Detection</p>
        </header>

        <!-- Main Content -->
        <main class="main-content">
            <!-- Upload Section -->
            <section id="upload-section" class="card upload-card">
                <div class="card-header">
                    <h2>Upload Audio</h2>
                    <span class="badge">Supported: {{ allowed_formats }}</span>
                </div>

                <div class="upload-zone" id="drop-zone">
                    <div class="upload-icon">
                        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                            <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
                            <polyline points="17 8 12 3 7 8" />
                            <line x1="12" y1="3" x2="12" y2="15" />
                        </svg>
                    </div>
                    <p class="upload-text">Drag & drop audio file here</p>
                    <p class="upload-subtext">or click to browse</p>
                    <input type="file" id="file-input" accept=".mp3,.wav,.m4a,.ogg,.flac,.webm" hidden>
                </div>

                <div id="file-info" class="file-info hidden">
                    <div class="file-details">
                        <span class="file-name" id="file-name">audio.mp3</span>
                        <span class="file-size" id="file-size">0 MB</span>
                    </div>
                    <button class="btn btn-clear" id="clear-btn" title="Remove file">
                        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                            <line x1="18" y1="6" x2="6" y2="18" />
                            <line x1="6" y1="6" x2="18" y2="18" />
                        </svg>
                    </button>
                </div>

                <button class="btn btn-primary" id="transcribe-btn" disabled>
                    <span class="btn-text">Transcribe</span>
                    <span class="btn-icon">
                        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                            <polygon points="5 3 19 12 5 21 5 3" />
                        </svg>
                    </span>
                </button>
            </section>

            <!-- Processing Section -->
            <section id="processing-section" class="card processing-card hidden">
                <div class="processing-content">
                    <div class="spinner"></div>
                    <h3>Processing Audio</h3>
                    <p id="processing-status">Uploading file...</p>
                    <div class="progress-bar">
                        <div class="progress-fill" id="progress-fill"></div>
                    </div>
                    <div class="timer-display" id="processing-timer">00:00</div>
                    <p class="processing-hint">This may take a few minutes depending on audio length</p>
                </div>
            </section>

            <!-- Results Section -->
            <section id="results-section" class="card results-card hidden">
                <div class="card-header">
                    <h2>Transcription Results</h2>
                    <div class="result-meta">
                        <span id="speaker-count" class="badge">0 speakers</span>
                        <span id="duration-info" class="badge">0:00</span>
                        <span id="processing-time" class="badge">0.0s</span>
                    </div>
                </div>

                <div class="download-buttons">
                    <a href="#" id="download-txt" class="btn btn-outline" download>
                        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                            <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
                            <polyline points="7 10 12 15 17 10" />
                            <line x1="12" y1="15" x2="12" y2="3" />
                        </svg>
                        Download TXT
                    </a>
                    <a href="#" id="download-srt" class="btn btn-outline" download>
                        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                            <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
                            <polyline points="7 10 12 15 17 10" />
                            <line x1="12" y1="15" x2="12" y2="3" />
                        </svg>
                        Download SRT
                    </a>
                </div>

                <div class="transcript-container" id="transcript-container">
                    <!-- Transcript segments will be rendered here -->
                </div>

                <button class="btn btn-secondary" id="new-upload-btn">
                    <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                        <polyline points="1 4 1 10 7 10" />
                        <path d="M3.51 15a9 9 0 1 0 2.13-9.36L1 10" />
                    </svg>
                    New Transcription
                </button>
            </section>

            <!-- Error Section -->
            <section id="error-section" class="card error-card hidden">
                <div class="error-content">
                    <div class="error-icon">
                        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                            <circle cx="12" cy="12" r="10" />
                            <line x1="15" y1="9" x2="9" y2="15" />
                            <line x1="9" y1="9" x2="15" y2="15" />
                        </svg>
                    </div>
                    <h3>Error</h3>
                    <p id="error-message">An error occurred during processing.</p>
                    <button class="btn btn-secondary" id="retry-btn">Try Again</button>
                </div>
            </section>
        </main>

        <!-- Footer -->
        <footer class="footer">
            <p>Powered by <strong>faster-whisper</strong> & <strong>pyannote.audio</strong></p>
            <p class="footer-note">Max file size: {{ max_upload_mb }}MB</p>
        </footer>
    </div>

    <script src="/static/js/app.js"></script>
</body>

</html>
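
Note: the template uses two Jinja placeholders, {{ allowed_formats }} and {{ max_upload_mb }}, which the FastAPI app must supply at render time. The actual rendering code lives in app/main.py (not shown in this part of the diff); a minimal sketch of how such a route typically looks with FastAPI's Jinja2Templates, where the route body and the example values are assumptions:

from fastapi import FastAPI, Request
from fastapi.templating import Jinja2Templates

app = FastAPI()
templates = Jinja2Templates(directory="app/templates")

@app.get("/")
async def index(request: Request):
    # Context keys must match the placeholders in index.html;
    # the concrete values below are illustrative assumptions.
    return templates.TemplateResponse(
        "index.html",
        {
            "request": request,
            "allowed_formats": "mp3, wav, m4a, ogg, flac, webm",
            "max_upload_mb": 100,
        },
    )
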
data/processed/.gitkeep
ADDED
File without changes
data/uploads/.gitkeep
ADDED
File without changes
docker-compose.yml
ADDED
@@ -0,0 +1,60 @@
services:
  app:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        - PORT=${PORT:-7860}
    container_name: precisionvoice
    ports:
      - "${PORT:-7860}:${PORT:-7860}"
    volumes:
      # Persist uploaded/processed files
      - ./data:/app/data
      # Cache models to avoid re-downloading
      - model_cache_hf:/root/.cache/huggingface
      - model_cache_torch:/root/.cache/torch
      - model_cache_mdx:/root/.audio-separator-models
    environment:
      # HuggingFace token (required for pyannote.audio)
      - HF_TOKEN=${HF_TOKEN:-}
      # Model settings
      - WHISPER_MODEL=${WHISPER_MODEL:-kiendt/PhoWhisper-large-ct2}
      - DIARIZATION_MODEL=${DIARIZATION_MODEL:-pyannote/speaker-diarization-3.1}
      # Device (auto, cuda, cpu)
      - DEVICE=${DEVICE:-auto}
      # Denoising (Speech Enhancement)
      - ENABLE_DENOISER=${ENABLE_DENOISER:-True}
      - DENOISER_MODEL=${DENOISER_MODEL:-dns64}
      # MDX-Net Vocal Separation
      - ENABLE_VOCAL_SEPARATION=${ENABLE_VOCAL_SEPARATION:-True}
      - MDX_MODEL=${MDX_MODEL:-UVR-MDX-NET-Voc_FT}
      # Upload settings
      - MAX_UPLOAD_SIZE_MB=${MAX_UPLOAD_SIZE_MB:-100}
      # Optimization settings
      - ENABLE_LOUDNORM=${ENABLE_LOUDNORM:-True}
      - ENABLE_NOISE_REDUCTION=${ENABLE_NOISE_REDUCTION:-True}
      # VAD settings
      - VAD_THRESHOLD=${VAD_THRESHOLD:-0.5}
      - VAD_MIN_SPEECH_DURATION_MS=${VAD_MIN_SPEECH_DURATION_MS:-250}
      - VAD_MIN_SILENCE_DURATION_MS=${VAD_MIN_SILENCE_DURATION_MS:-500}
      # Clustering settings
      - MERGE_THRESHOLD_S=${MERGE_THRESHOLD_S:-0.5}
      - MIN_SEGMENT_DURATION_S=${MIN_SEGMENT_DURATION_S:-0.3}
    restart: unless-stopped
    # GPU support (uncomment for NVIDIA GPU)
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: all
    #           capabilities: [gpu]

volumes:
  model_cache_hf:
    name: precisionvoice_hf_cache
  model_cache_torch:
    name: precisionvoice_torch_cache
  model_cache_mdx:
    name: precisionvoice_mdx_cache
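
Note: each of these environment variables is consumed by app/core/config.py (not shown in this part of the listing). A minimal sketch of how such settings are typically bound with pydantic-settings, which requirements.txt pins; the field names below mirror the compose keys, but the real Settings class may differ:

from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    # pydantic-settings matches environment variables case-insensitively,
    # so HF_TOKEN populates hf_token, WHISPER_MODEL populates whisper_model, etc.
    hf_token: str = ""
    whisper_model: str = "kiendt/PhoWhisper-large-ct2"
    diarization_model: str = "pyannote/speaker-diarization-3.1"
    device: str = "auto"
    enable_denoiser: bool = True
    denoiser_model: str = "dns64"
    enable_vocal_separation: bool = True
    mdx_model: str = "UVR-MDX-NET-Voc_FT"
    max_upload_size_mb: int = 100
    vad_threshold: float = 0.5

settings = Settings()  # reads the variables exported by docker-compose above
print(settings.whisper_model)
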
docker/.gitkeep
ADDED
File without changes
precision_voice_colab.ipynb
ADDED
@@ -0,0 +1,413 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PrecisionVoice - Google Colab Runner\n",
    "\n",
    "This notebook allows you to run the [PrecisionVoice](https://github.com/thichuong/PrecisionVoice) application directly in Google Colab.\n",
    "\n",
    "### Instructions\n",
    "1. **Runtime Change**: Go to `Runtime` -> `Change runtime type` and make sure **T4 GPU** (or better) is selected.\n",
    "2. **Run All**: You can select `Runtime` -> `Run all` or run each cell step-by-step.\n",
    "3. **Public URL**: Look for the `ngrok` public URL in the final cell output to access the web interface."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5efa55f1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GPU Detected: Tesla T4\n"
     ]
    }
   ],
   "source": [
    "# @title 1. Check GPU Availability\n",
    "import torch\n",
    "\n",
    "if torch.cuda.is_available():\n",
    "    print(f\"GPU Detected: {torch.cuda.get_device_name(0)}\")\n",
    "else:\n",
    "    print(\"WARNING: No GPU detected. This application requires a GPU to run efficiently.\")\n",
    "    print(\"Please go to Runtime -> Change runtime type -> Hardware accelerator -> T4 GPU\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b068e8ac",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cloning into 'PrecisionVoice'...\n",
      "remote: Enumerating objects: 94, done.\u001b[K\n",
      "remote: Counting objects: 100% (94/94), done.\u001b[K\n",
      "remote: Compressing objects: 100% (51/51), done.\u001b[K\n",
      "remote: Total 94 (delta 34), reused 88 (delta 28), pack-reused 0 (from 0)\u001b[K\n",
      "Receiving objects: 100% (94/94), 35.72 KiB | 5.10 MiB/s, done.\n",
      "Resolving deltas: 100% (34/34), done.\n",
      "/content/PrecisionVoice/PrecisionVoice/PrecisionVoice\n",
      "Repository cloned successfully.\n"
     ]
    }
   ],
   "source": [
    "# @title 2. Clone Repository\n",
    "import os\n",
    "\n",
    "# Clean up previous run if exists\n",
    "if os.path.exists(\"PrecisionVoice\"):\n",
    "    %cd /content\n",
    "    !rm -rf PrecisionVoice\n",
    "\n",
    "!git clone https://github.com/thichuong/PrecisionVoice.git\n",
    "%cd PrecisionVoice\n",
    "print(\"Repository cloned successfully.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42afe30f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Installing system dependencies... (This may take a moment)\n",
      "✅ System dependencies (ffmpeg, libsndfile1) installed successfully.\n"
     ]
    }
   ],
   "source": [
    "# @title 3. Install System Dependencies\n",
    "import subprocess\n",
    "\n",
    "# Installing dependencies defined in Dockerfile (ffmpeg, libsndfile)\n",
    "print(\"Installing system dependencies... (This may take a moment)\")\n",
    "\n",
    "# Update and install (suppressing harmless R-repo warnings common in Colab)\n",
    "!apt-get update -y > /dev/null 2>&1\n",
    "!apt-get install -y ffmpeg libsndfile1 > /dev/null 2>&1\n",
    "\n",
    "# Verify installation\n",
    "try:\n",
    "    subprocess.run([\"ffmpeg\", \"-version\"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)\n",
    "    print(\"✅ System dependencies (ffmpeg, libsndfile1) installed successfully.\")\n",
    "except Exception as e:\n",
    "    print(\"❌ Warning: Potential installation issue. If the app fails, try running '!apt-get install -y ffmpeg' manually.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ec3974f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# @title 4. Install Python Dependencies\n",
    "# Force upgrade torch, torchvision, torchaudio to ensure compatibility\n",
    "!pip install -U torch torchvision torchaudio\n",
    "\n",
    "!pip install -r requirements.txt\n",
    "# Install pyngrok to expose the local server to the internet\n",
    "!pip install pyngrok\n",
    "print(\"Python dependencies installed.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d5b721b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      ".env file created with default settings.\n"
     ]
    }
   ],
   "source": [
    "# @title 5. Setup Environment (.env)\n",
    "# Creating a default .env file. You can modify this cell to add your specific keys.\n",
    "\n",
    "env_content = \"\"\"\n",
    "PORT=7860\n",
    "LOG_LEVEL=INFO\n",
    "\n",
    "# Audio Processing\n",
    "NOISE_REDUCTION_LEVEL=5.0\n",
    "VAD_THRESHOLD=0.5\n",
    "VAD_MIN_SPEECH_DURATION_MS=250\n",
    "VAD_MIN_SILENCE_DURATION_MS=500\n",
    "MERGE_THRESHOLD_S=1.5\n",
    "\"\"\"\n",
    "\n",
    "with open(\".env\", \"w\") as f:\n",
    "    f.write(env_content)\n",
    "\n",
    "print(\".env file created with default settings.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9afa4d11",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cleaning up previous sessions...\n"
     ]
    },
    {
     "data": {
      "application/javascript": "(async (port, path, width, height, cache, element) => {\n if (!google.colab.kernel.accessAllowed && !cache) {\n return;\n }\n element.appendChild(document.createTextNode(''));\n const url = await google.colab.kernel.proxyPort(port, {cache});\n const iframe = document.createElement('iframe');\n iframe.src = new URL(path, url).toString();\n iframe.height = height;\n iframe.width = width;\n iframe.style.border = 0;\n iframe.allow = [\n 'accelerometer',\n 'autoplay',\n 'camera',\n 'clipboard-read',\n 'clipboard-write',\n 'gyroscope',\n 'magnetometer',\n 'microphone',\n 'serial',\n 'usb',\n 'xr-spatial-tracking',\n ].join('; ');\n element.appendChild(iframe);\n })(8000, \"/\", \"100%\", 900, false, window.element)",
      "text/plain": [
       "<IPython.core.display.Javascript object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[31mWarning: This function may stop working due to changes in browser security.\n",
      "Try `serve_kernel_port_as_iframe` instead. \u001b[0m\n"
     ]
    },
    {
     "data": {
      "application/javascript": "(async (port, path, text, element) => {\n if (!google.colab.kernel.accessAllowed) {\n return;\n }\n element.appendChild(document.createTextNode(''));\n const url = await google.colab.kernel.proxyPort(port);\n const anchor = document.createElement('a');\n anchor.href = new URL(path, url).toString();\n anchor.target = '_blank';\n anchor.setAttribute('data-href', url + path);\n anchor.textContent = text;\n element.appendChild(anchor);\n })(8000, \"/\", \"https://localhost:8000/\", window.element)",
      "text/plain": [
       "<IPython.core.display.Javascript object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting server on port 8000...\n",
      "\u001b[32mINFO\u001b[0m:     Started server process [\u001b[36m25608\u001b[0m]\n",
      "\u001b[32mINFO\u001b[0m:     Waiting for application startup.\n",
      "2026-01-04 03:10:59,288 - app.main - INFO - Starting PrecisionVoice application...\n",
      "2026-01-04 03:10:59,292 - app.main - INFO - Device: cuda\n",
      "2026-01-04 03:10:59,292 - app.main - INFO - Whisper model: kiendt/PhoWhisper-large-ct2\n",
      "2026-01-04 03:10:59,293 - app.main - INFO - Diarization model: pyannote/speaker-diarization-3.1\n",
      "2026-01-04 03:10:59,293 - app.main - INFO - Preloading Whisper model...\n",
      "2026-01-04 03:10:59,293 - app.services.transcription - INFO - Loading Whisper model: kiendt/PhoWhisper-large-ct2\n",
      "2026-01-04 03:10:59,293 - app.services.transcription - INFO - Device: cuda, Compute type: float16\n",
      "\n",
      "🚀 Ngrok Public URL: https://tandy-pileous-biologically.ngrok-free.dev\n",
      "\n",
      "2026-01-04 03:11:02,736 - app.services.transcription - INFO - Whisper model loaded successfully\n",
      "2026-01-04 03:11:02,737 - app.main - WARNING - HF_TOKEN not set, diarization will not be available\n",
      "2026-01-04 03:11:02,737 - app.main - INFO - Application startup complete\n",
      "\u001b[32mINFO\u001b[0m:     Application startup complete.\n",
      "\u001b[32mINFO\u001b[0m:     Uvicorn running on \u001b[1mhttp://0.0.0.0:8000\u001b[0m (Press CTRL+C to quit)\n",
      "\u001b[32mINFO\u001b[0m:     2a09:bac1:7aa0:10::17:37e:0 - \"\u001b[1mGET / HTTP/1.1\u001b[0m\" \u001b[32m200 OK\u001b[0m\n",
      "\u001b[32mINFO\u001b[0m:     2a09:bac1:7aa0:10::17:37e:0 - \"\u001b[1mGET /static/css/style.css HTTP/1.1\u001b[0m\" \u001b[32m200 OK\u001b[0m\n",
      "\u001b[32mINFO\u001b[0m:     2a09:bac1:7aa0:10::17:37e:0 - \"\u001b[1mGET /static/js/app.js HTTP/1.1\u001b[0m\" \u001b[32m200 OK\u001b[0m\n",
      "\u001b[32mINFO\u001b[0m:     2a09:bac1:7aa0:10::17:37e:0 - \"\u001b[1mGET /favicon.ico HTTP/1.1\u001b[0m\" \u001b[31m404 Not Found\u001b[0m\n",
      "2026-01-04 03:11:17,130 - app.services.audio_processor - INFO - Saved upload: /content/PrecisionVoice/PrecisionVoice/data/uploads/4bf9c6ad.wav\n",
      "2026-01-04 03:11:17,131 - app.services.audio_processor - INFO - Applying loudnorm normalization...\n",
      "2026-01-04 03:11:17,131 - app.services.audio_processor - INFO - Applying advanced noise reduction (anlmdn, level=5.0)...\n",
      "\u001b[32mINFO\u001b[0m:     Shutting down\n",
      "\u001b[32mINFO\u001b[0m:     Finished server process [\u001b[36m25608\u001b[0m]\n",
      "\u001b[31mERROR\u001b[0m:    Traceback (most recent call last):\n",
      "  File \"/usr/lib/python3.12/asyncio/runners.py\", line 195, in run\n",
      "    return runner.run(main)\n",
      "           ^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/lib/python3.12/asyncio/runners.py\", line 118, in run\n",
      "    return self._loop.run_until_complete(task)\n",
      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"uvloop/loop.pyx\", line 1512, in uvloop.loop.Loop.run_until_complete\n",
      "  File \"uvloop/loop.pyx\", line 1505, in uvloop.loop.Loop.run_until_complete\n",
      "  File \"uvloop/loop.pyx\", line 1379, in uvloop.loop.Loop.run_forever\n",
      "  File \"uvloop/loop.pyx\", line 557, in uvloop.loop.Loop._run\n",
      "  File \"uvloop/loop.pyx\", line 476, in uvloop.loop.Loop._on_idle\n",
      "  File \"uvloop/cbhandles.pyx\", line 83, in uvloop.loop.Handle._run\n",
      "  File \"uvloop/cbhandles.pyx\", line 63, in uvloop.loop.Handle._run\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/uvicorn/server.py\", line 70, in serve\n",
      "    with self.capture_signals():\n",
      "         ^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/lib/python3.12/contextlib.py\", line 144, in __exit__\n",
      "    next(self.gen)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/uvicorn/server.py\", line 331, in capture_signals\n",
      "    signal.raise_signal(captured_signal)\n",
      "  File \"/usr/lib/python3.12/asyncio/runners.py\", line 157, in _on_sigint\n",
      "    raise KeyboardInterrupt()\n",
      "KeyboardInterrupt\n",
      "\n",
      "During handling of the above exception, another exception occurred:\n",
      "\n",
      "Traceback (most recent call last):\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/starlette/routing.py\", line 701, in lifespan\n",
      "    await receive()\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/uvicorn/lifespan/on.py\", line 137, in receive\n",
      "    return await self.receive_queue.get()\n",
      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/lib/python3.12/asyncio/queues.py\", line 158, in get\n",
      "    await getter\n",
      "asyncio.exceptions.CancelledError\n",
      "\n",
      "\u001b[31mERROR\u001b[0m:    Exception in ASGI application\n",
      "Traceback (most recent call last):\n",
      "  File \"/usr/lib/python3.12/asyncio/runners.py\", line 195, in run\n",
      "    return runner.run(main)\n",
      "           ^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/lib/python3.12/asyncio/runners.py\", line 118, in run\n",
      "    return self._loop.run_until_complete(task)\n",
      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"uvloop/loop.pyx\", line 1512, in uvloop.loop.Loop.run_until_complete\n",
      "  File \"uvloop/loop.pyx\", line 1505, in uvloop.loop.Loop.run_until_complete\n",
      "  File \"uvloop/loop.pyx\", line 1379, in uvloop.loop.Loop.run_forever\n",
      "  File \"uvloop/loop.pyx\", line 557, in uvloop.loop.Loop._run\n",
      "  File \"uvloop/loop.pyx\", line 476, in uvloop.loop.Loop._on_idle\n",
      "  File \"uvloop/cbhandles.pyx\", line 83, in uvloop.loop.Handle._run\n",
      "  File \"uvloop/cbhandles.pyx\", line 63, in uvloop.loop.Handle._run\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/uvicorn/server.py\", line 70, in serve\n",
      "    with self.capture_signals():\n",
      "         ^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/lib/python3.12/contextlib.py\", line 144, in __exit__\n",
      "    next(self.gen)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/uvicorn/server.py\", line 331, in capture_signals\n",
      "    signal.raise_signal(captured_signal)\n",
      "  File \"/usr/lib/python3.12/asyncio/runners.py\", line 157, in _on_sigint\n",
      "    raise KeyboardInterrupt()\n",
      "KeyboardInterrupt\n",
      "\n",
      "During handling of the above exception, another exception occurred:\n",
      "\n",
      "Traceback (most recent call last):\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/uvicorn/protocols/http/httptools_impl.py\", line 409, in run_asgi\n",
      "    result = await app(  # type: ignore[func-returns-value]\n",
      "             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/uvicorn/middleware/proxy_headers.py\", line 60, in __call__\n",
      "    return await self.app(scope, receive, send)\n",
      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/fastapi/applications.py\", line 1139, in __call__\n",
      "    await super().__call__(scope, receive, send)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/starlette/applications.py\", line 107, in __call__\n",
      "    await self.middleware_stack(scope, receive, send)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/starlette/middleware/errors.py\", line 164, in __call__\n",
      "    await self.app(scope, receive, _send)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/starlette/middleware/cors.py\", line 93, in __call__\n",
      "    await self.simple_response(scope, receive, send, request_headers=headers)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/starlette/middleware/cors.py\", line 144, in simple_response\n",
      "    await self.app(scope, receive, send)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/starlette/middleware/exceptions.py\", line 63, in __call__\n",
      "    await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/starlette/_exception_handler.py\", line 42, in wrapped_app\n",
      "    await app(scope, receive, sender)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/fastapi/middleware/asyncexitstack.py\", line 18, in __call__\n",
      "    await self.app(scope, receive, send)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/starlette/routing.py\", line 716, in __call__\n",
      "    await self.middleware_stack(scope, receive, send)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/starlette/routing.py\", line 736, in app\n",
      "    await route.handle(scope, receive, send)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/starlette/routing.py\", line 290, in handle\n",
      "    await self.app(scope, receive, send)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/fastapi/routing.py\", line 119, in app\n",
      "    await wrap_app_handling_exceptions(app, request)(scope, receive, send)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/starlette/_exception_handler.py\", line 42, in wrapped_app\n",
      "    await app(scope, receive, sender)\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/fastapi/routing.py\", line 105, in app\n",
      "    response = await f(request)\n",
      "               ^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/fastapi/routing.py\", line 385, in app\n",
      "    raw_response = await run_endpoint_function(\n",
      "                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.12/dist-packages/fastapi/routing.py\", line 284, in run_endpoint_function\n",
      "    return await dependant.call(**values)\n",
      "           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/content/PrecisionVoice/PrecisionVoice/app/api/routes.py\", line 62, in transcribe_audio\n",
      "    wav_path, duration = await AudioProcessor.process_upload(\n",
      "                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/content/PrecisionVoice/PrecisionVoice/app/services/audio_processor.py\", line 205, in process_upload\n",
      "    wav_path = await cls.convert_to_wav(original_path)\n",
      "               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/content/PrecisionVoice/PrecisionVoice/app/services/audio_processor.py\", line 104, in convert_to_wav\n",
      "    await loop.run_in_executor(None, lambda: cls._run_ffmpeg_conversion(input_path, output_path))\n",
      "asyncio.exceptions.CancelledError\n"
     ]
    }
   ],
   "source": [
    "# @title 6. Run Application\n",
    "import threading\n",
    "import time\n",
    "import os\n",
    "from google.colab.output import serve_kernel_port_as_iframe, serve_kernel_port_as_window\n",
    "from pyngrok import ngrok\n",
    "\n",
    "# FORCE KILL any existing ngrok processes to free up the auth token session\n",
    "print(\"Cleaning up previous sessions...\")\n",
    "!killall ngrok 2>/dev/null\n",
    "ngrok.kill()\n",
    "\n",
    "# Set your authtoken (Ensure this matches the one in your Ngrok dashboard)\n",
    "ngrok.set_auth_token(\"NGROK_TOKEN\")\n",
    "\n",
    "port = 8000\n",
    "\n",
    "def start_ngrok():\n",
    "    # Wait a bit for the server to start\n",
    "    time.sleep(5)\n",
    "    try:\n",
    "        # Connect to the port\n",
    "        public_url = ngrok.connect(port).public_url\n",
    "        print(f\"\\n🚀 Ngrok Public URL: {public_url}\\n\")\n",
    "    except Exception as e:\n",
    "        print(f\"Ngrok error: {e}\")\n",
    "\n",
    "# Start ngrok in a background thread\n",
    "threading.Thread(target=start_ngrok, daemon=True).start()\n",
    "\n",
    "# Serve the application directly in the notebook cell\n",
    "serve_kernel_port_as_iframe(port, height=900)\n",
    "\n",
    "# Also provide a link to open in a new tab via proxy\n",
    "serve_kernel_port_as_window(port, path=\"/\")\n",
    "\n",
    "# Run the Uvicorn server\n",
    "print(f\"Starting server on port {port}...\")\n",
    "!uvicorn app.main:app --host 0.0.0.0 --port {port}"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
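
Note: cell 6 of the notebook above hardcodes the placeholder ngrok.set_auth_token("NGROK_TOKEN"), which must be replaced with a real token before the tunnel will authenticate. A minimal sketch of reading the token from the environment instead of pasting it into the notebook source; exporting it as NGROK_AUTHTOKEN is an assumption, set it however your Colab session manages secrets:

import os
from pyngrok import ngrok

# Assumption: the token is exported as NGROK_AUTHTOKEN before this cell runs,
# rather than being hardcoded in the notebook.
token = os.environ.get("NGROK_AUTHTOKEN")
if token:
    ngrok.set_auth_token(token)
else:
    print("NGROK_AUTHTOKEN is not set; the tunnel will fail to authenticate.")
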
requirements.txt
ADDED
@@ -0,0 +1,31 @@
# Core framework
fastapi>=0.109.0
uvicorn[standard]>=0.27.0
python-multipart>=0.0.6
jinja2>=3.1.2
aiofiles>=23.2.1

# AI/ML - Speech-to-Text
faster-whisper>=1.0.0
ctranslate2>=4.0.0

# AI/ML - Speaker Diarization
pyannote.audio>=3.1.0
torch>=2.1.0
torchaudio>=2.1.0

# AI/ML - Vocal Separation
audio-separator[cpu]>=0.17.0
denoiser>=0.1.4

# Audio processing
ffmpeg-python>=0.2.0
pydub>=0.25.1

# Configuration
pydantic-settings>=2.1.0
python-dotenv>=1.0.0

# Utilities
aiohttp>=3.9.0
numpy>=1.24.0
scripts/verify_model_config.py
ADDED
@@ -0,0 +1,18 @@
import os
from app.core.config import get_settings
from app.services.transcription import TranscriptionService

def verify_stt_model():
    settings = get_settings()
    print(f"Current Whisper Model: {settings.whisper_model}")
    print(f"Device: {settings.resolved_device}")
    print(f"Compute Type: {settings.resolved_compute_type}")

    expected_model = "kiendt/PhoWhisper-large-ct2"
    if settings.whisper_model == expected_model:
        print("✅ SUCCESS: Model configuration updated correctly.")
    else:
        print(f"❌ FAILURE: Expected {expected_model}, got {settings.whisper_model}")

if __name__ == "__main__":
    verify_stt_model()
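
Note: because verify_stt_model() imports from the app package, the script only resolves when run from the repository root, e.g. as python -m scripts.verify_model_config. A minimal sketch of calling it programmatically under that same working-directory assumption:

# Assumes the current working directory is the repository root so that
# both the `scripts` and `app` packages are importable.
from scripts.verify_model_config import verify_stt_model

verify_stt_model()
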