Spaces:
Running
Running
PoC deployment
Browse files- Dockerfile +68 -0
- README.md +99 -7
- app/__init__.py +1 -0
- app/api/__init__.py +1 -0
- app/api/routes.py +209 -0
- app/core/__init__.py +1 -0
- app/core/config.py +122 -0
- app/main.py +131 -0
- app/schemas/__init__.py +1 -0
- app/schemas/models.py +158 -0
- app/services/__init__.py +5 -0
- app/services/alignment.py +294 -0
- app/services/audio_processor.py +247 -0
- app/services/denoiser.py +142 -0
- app/services/diarization.py +223 -0
- app/services/emo.py +169 -0
- app/services/processor.py +623 -0
- app/services/silero_vad_service.py +72 -0
- app/services/transcription.py +283 -0
- app/services/vocal_separator.py +118 -0
- app/static/css/style.css +673 -0
- app/static/js/app.js +338 -0
- app/templates/index.html +162 -0
- data/processed/.gitkeep +0 -0
- data/uploads/.gitkeep +0 -0
- docker-compose.yml +60 -0
- docker/.gitkeep +0 -0
- precision_voice_eval_ASR.ipynb +0 -0
- precision_voice_simple.ipynb +672 -0
- requirements.txt +48 -0
- scripts/verify_model_config.py +18 -0
Dockerfile
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ================================
# PrecisionVoice Dockerfile
# Optimized for performance and size
# ================================

# Stage 1: Builder
FROM python:3.10-slim-bullseye AS builder

WORKDIR /app

# Install build dependencies.
# FIX: ffmpeg is a runtime-only dependency (audio decoding); it is not
# needed to build wheels, so it is installed in the runtime stage only.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    git \
    libsndfile1-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install dependencies.
# --user keeps packages in /root/.local so the runtime stage can copy
# them over as a single layer.
COPY requirements.txt .
RUN pip install --no-cache-dir --user -r requirements.txt

# ================================
# Stage 2: Runtime
# ================================
FROM python:3.10-slim-bullseye

WORKDIR /app

# Install runtime dependencies (audio decoding / sound-file I/O).
# FIX: dropped the trailing `apt-get clean` — it ran after the lists were
# already removed and had no effect.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Copy Python packages from builder
COPY --from=builder /root/.local /root/.local

# Ensure scripts in .local are available
ENV PATH=/root/.local/bin:$PATH
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1

# Model cache directories.
# NOTE: TRANSFORMERS_CACHE is deprecated in recent transformers releases
# in favor of HF_HOME; both are kept for compatibility.
ENV HF_HOME=/root/.cache/huggingface
ENV TORCH_HOME=/root/.cache/torch
ENV TRANSFORMERS_CACHE=/root/.cache/huggingface

# Copy application code
COPY app/ ./app/
COPY data/ ./data/

# Create necessary directories
RUN mkdir -p /app/data/uploads /app/data/processed

# Port configuration
ARG PORT=7860
ENV PORT=${PORT}
EXPOSE ${PORT}

# Health check (shell form so ${PORT} is expanded at runtime)
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:${PORT}/api/health')" || exit 1

# Run the application
CMD ["sh", "-c", "uvicorn app.main:app --host 0.0.0.0 --port ${PORT}"]
|
README.md
CHANGED
|
@@ -1,12 +1,104 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: PrecisionVoice
|
| 3 |
+
emoji: 🎙️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_file: app/main.py
|
|
|
|
| 8 |
pinned: false
|
| 9 |
---
|
| 10 |
|
| 11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
| 12 |
+
|
| 13 |
+
# PrecisionVoice - STT & Speaker Diarization
|
| 14 |
+
|
| 15 |
+
A production-ready Speech-to-Text and Speaker Diarization web application using FastAPI, faster-whisper, and pyannote.audio.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- 🎙️ Speech-to-Text using `erax-ai/EraX-WoW-Turbo-V1.1-CT2` (8x faster, 8 Vietnamese dialects)
|
| 20 |
+
- 👥 Speaker Diarization using `pyannote/speaker-diarization-3.1`
|
| 21 |
+
- 🧼 Speech Enhancement using `SpeechBrain SepFormer DNS4` (noise + reverb removal)
|
| 22 |
+
- 🔇 Voice Activity Detection using `Silero VAD v5` (prevents hallucination)
|
| 23 |
+
- 🎤 Vocal Isolation using `MDX-Net` (UVR-MDX-NET-Voc_FT)
|
| 24 |
+
- 🔄 Automatic speaker-transcript alignment
|
| 25 |
+
- 📥 Download results in TXT or SRT format
|
| 26 |
+
- 🐳 Docker-ready with persistent model caching and GPU support
|
| 28 |
+
|
| 29 |
+
## Quick Start
|
| 30 |
+
|
| 31 |
+
### Prerequisites
|
| 32 |
+
|
| 33 |
+
1. Docker and Docker Compose
|
| 34 |
+
2. (Optional) NVIDIA GPU with CUDA support
|
| 35 |
+
3. HuggingFace account with access to pyannote models
|
| 36 |
+
|
| 37 |
+
### Setup
|
| 38 |
+
|
| 39 |
+
1. Clone and configure:
|
| 40 |
+
```bash
|
| 41 |
+
cp .env.example .env
|
| 42 |
+
# Edit .env and add your HuggingFace token
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
2. Build and run:
|
| 46 |
+
```bash
|
| 47 |
+
docker compose up --build
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
3. Open http://localhost:8000
|
| 51 |
+
|
| 52 |
+
## Audio Processing Pipeline
|
| 53 |
+
|
| 54 |
+
The system uses a state-of-the-art multi-stage pipeline to ensure maximum accuracy:
|
| 55 |
+
|
| 56 |
+
1. **Speech Enhancement**: Background noise and reverb are removed using `SpeechBrain SepFormer` (DNS4 Challenge winner).
|
| 57 |
+
2. **Vocal Isolation**: Vocals are separated from background music using `MDX-Net`.
|
| 58 |
+
3. **VAD Filtering**: Silence is removed using `Silero VAD v5` to prevent ASR hallucination.
|
| 59 |
+
4. **Refinement**: Highpass filtering and EBU R128 loudness normalization.
|
| 60 |
+
5. **Transcription**: High-precision Vietnamese transcription using `PhoWhisper`.
|
| 61 |
+
6. **Diarization**: Segmenting audio by speaker using `Pyannote 3.1`.
|
| 62 |
+
7. **Alignment**: Merging transcripts with speaker segments + timestamp reconstruction.
|
| 63 |
+
|
| 64 |
+
## Configuration
|
| 65 |
+
|
| 66 |
+
| Variable | Default | Description |
|
| 67 |
+
|----------|---------|-------------|
|
| 68 |
+
| `HF_TOKEN` | - | Required for Pyannote models |
|
| 69 |
+
| `ENABLE_SPEECH_ENHANCEMENT` | `True` | Toggle SpeechBrain speech enhancement |
|
| 70 |
+
| `ENHANCEMENT_MODEL` | `speechbrain/sepformer-dns4-16k-enhancement` | Model for speech enhancement |
|
| 71 |
+
| `ENABLE_SILERO_VAD` | `True` | Toggle Silero VAD for hallucination prevention |
|
| 72 |
+
| `ENABLE_VOCAL_SEPARATION` | `True` | Toggle MDX-Net vocal isolation |
|
| 73 |
+
| `MDX_MODEL` | `UVR-MDX-NET-Voc_FT` | Model for vocal separation |
|
| 74 |
+
| `DEVICE` | `auto` | `cuda`, `cpu`, or `auto` |
|
| 75 |
+
|
| 76 |
+
## Development
|
| 77 |
+
|
| 78 |
+
### Local Setup (without Docker)
|
| 79 |
+
|
| 80 |
+
```bash
|
| 81 |
+
python -m venv venv
|
| 82 |
+
source venv/bin/activate
|
| 83 |
+
pip install -r requirements.txt
|
| 84 |
+
uvicorn app.main:app --reload
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
### API Endpoints
|
| 88 |
+
|
| 89 |
+
| Endpoint | Method | Description |
|
| 90 |
+
|----------|--------|-------------|
|
| 91 |
+
| `/` | GET | Web UI |
|
| 92 |
+
| `/api/transcribe` | POST | Upload and transcribe audio |
|
| 93 |
+
| `/api/download/{filename}` | GET | Download result files |
|
| 94 |
+
|
| 95 |
+
## Supported Audio Formats
|
| 96 |
+
|
| 97 |
+
- MP3
|
| 98 |
+
- WAV
|
| 99 |
+
- M4A
|
| 100 |
+
- OGG
|
| 101 |
+
|
| 102 |
+
## License
|
| 103 |
+
|
| 104 |
+
MIT
|
app/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# App package
|
app/api/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# API package
|
app/api/routes.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
API routes for the transcription service.
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
import time
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import csv
|
| 8 |
+
|
| 9 |
+
from fastapi import APIRouter, UploadFile, File, HTTPException, BackgroundTasks, Form
|
| 10 |
+
from fastapi.responses import FileResponse
|
| 11 |
+
|
| 12 |
+
from app.core.config import get_settings
|
| 13 |
+
from app.schemas.models import TranscriptionResponse, HealthResponse
|
| 14 |
+
from app.services.audio_processor import AudioProcessor, AudioProcessingError
|
| 15 |
+
from app.services.transcription import TranscriptionService, AVAILABLE_MODELS
|
| 16 |
+
from app.services.diarization import DiarizationService
|
| 17 |
+
from app.services.processor import Processor
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
settings = get_settings()
|
| 21 |
+
|
| 22 |
+
router = APIRouter()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@router.get("/api/health", response_model=HealthResponse)
async def health_check():
    """Report service health, model load state, and the active device."""
    models_ready = TranscriptionService.is_loaded() and DiarizationService.is_loaded()
    return HealthResponse(
        status="healthy",
        models_loaded=models_ready,
        device=settings.resolved_device,
    )
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@router.get("/api/models")
async def get_models():
    """List the Whisper model names that can be requested, plus the default."""
    model_names = list(AVAILABLE_MODELS.keys())
    return {"models": model_names, "default": settings.default_whisper_model}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@router.post("/api/transcribe", response_model=TranscriptionResponse)
async def transcribe_audio(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(..., description="Audio file to transcribe"),
    model: str = Form(default="PhoWhisper Large", description="Whisper model to use"),
    language: str = Form(default="vi", description="Language code")
):
    """
    Upload and transcribe an audio file.

    Diarize-first workflow:
    1. Diarization identifies the speakers.
    2. Each speaker segment is transcribed.
    3. Emotion is predicted per segment.
    4. The combined result is returned, with TXT/CSV download links.
    """
    upload_path = None

    try:
        payload = await file.read()
        original_name = file.filename or "audio.wav"

        # Reject oversized / unsupported uploads before touching disk.
        try:
            AudioProcessor.validate_file(original_name, len(payload))
        except AudioProcessingError as e:
            raise HTTPException(status_code=400, detail=str(e))

        upload_path = await AudioProcessor.save_upload(payload, original_name)

        # NOTE(review): `model` is accepted but never forwarded to the
        # processor, so the default pipeline model is always used — confirm
        # whether per-request model selection should be wired through.
        logger.info(f"Processing audio with model={model}, language={language}")
        result = await Processor.process_audio(
            audio_path=upload_path,
            language=language,
        )

        # Derive output file names from the uploaded file's stem.
        base_name = Path(file.filename or "audio").stem
        txt_filename = f"{base_name}_output.txt"
        csv_filename = f"{base_name}_output.csv"
        txt_path = settings.processed_dir / txt_filename
        csv_path = settings.processed_dir / csv_filename

        # Plain-text transcript.
        txt_path.write_text(result.txt_content, encoding="utf-8")

        # CSV transcript (utf-8-sig so spreadsheet apps detect the encoding).
        speaker_roles = result.roles or {}
        with csv_path.open("w", newline="", encoding="utf-8-sig") as f:
            writer = csv.DictWriter(
                f,
                fieldnames=["start", "end", "speaker", "text"],
            )
            writer.writeheader()
            writer.writerows(
                {
                    "start": round(seg.start, 2),
                    "end": round(seg.end, 2),
                    "speaker": speaker_roles.get(seg.speaker, seg.speaker),
                    "text": seg.text,
                }
                for seg in result.segments
            )

        # Remove the raw upload once the response has been sent.
        background_tasks.add_task(cleanup_files, upload_path)

        return TranscriptionResponse(
            success=True,
            segments=[
                {
                    "start": seg.start,
                    "end": seg.end,
                    "speaker": seg.speaker,
                    "role": seg.role,
                    "text": seg.text,
                    "emotion": seg.emotion,
                }
                for seg in result.segments
            ],
            speaker_count=result.speaker_count,
            speakers=result.speakers,
            duration=result.duration,
            processing_time=result.processing_time,
            roles=result.roles,
            emotion_timeline=[
                {"time": p.time, "emotion": p.emotion}
                for p in (result.emotion_timeline or [])
            ],
            emotion_changes=[
                {
                    "time": c.time,
                    "from": c.emotion_from,
                    "to": c.emotion_to,
                    "icon_from": c.icon_from,
                    "icon_to": c.icon_to,
                }
                for c in (result.emotion_changes or [])
            ],
            download_txt=f"/api/download/{txt_filename}",
            download_csv=f"/api/download/{csv_filename}",
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Processing failed")
        if upload_path and upload_path.exists():
            background_tasks.add_task(cleanup_files, upload_path)
        raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
@router.get("/api/download/{filename}")
async def download_file(filename: str):
    """
    Download a generated transcript file.

    Supports the formats produced by /api/transcribe: .txt and .csv.

    Raises:
        HTTPException 400: disallowed extension or path traversal attempt.
        HTTPException 404: file does not exist in the processed directory.
    """
    # BUGFIX: the route path and Content-Disposition header contained the
    # literal text "(unknown)" instead of the {filename} path parameter, so
    # the endpoint could never match a real file.
    # Security: only allow the generated extensions, and reject any path
    # separator (forward or back slash) or parent-directory component.
    if (not filename.endswith(('.txt', '.csv'))
            or '/' in filename or '\\' in filename or '..' in filename):
        raise HTTPException(status_code=400, detail="Invalid filename")

    filepath = settings.processed_dir / filename

    if not filepath.exists():
        raise HTTPException(status_code=404, detail="File not found")

    # Media type by extension. Only .txt/.csv can reach this point, so the
    # previously-dead .srt branch has been removed.
    if filename.endswith(".txt"):
        media_type = "text/plain; charset=utf-8"
    elif filename.endswith(".csv"):
        media_type = "text/csv; charset=utf-8"
    else:
        media_type = "application/octet-stream"

    return FileResponse(
        path=filepath,
        filename=filename,
        media_type=media_type,
        headers={
            "Content-Disposition": f'attachment; filename="{filename}"'},
    )
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
async def cleanup_files(*paths: Path):
    """Background task: delete temporary files after a short grace period."""
    import asyncio

    # Give any in-flight response handling a moment to finish first.
    await asyncio.sleep(5)
    await AudioProcessor.cleanup_files(*paths)
|
app/core/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Core package
|
app/core/config.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Application configuration using Pydantic Settings.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
from typing import Literal, Dict
|
| 8 |
+
|
| 9 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Settings(BaseSettings):
    """Application settings, loaded from environment variables / .env."""

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore"
    )

    # HuggingFace access token (needed for gated pyannote models).
    hf_token: str = ""
    enable_noise_reduction: bool = True

    # Denoising (speech enhancement)
    enable_denoiser: bool = True
    denoiser_model: str = "dns64"

    # MDX-Net vocal separation
    enable_vocal_separation: bool = True
    mdx_model: str = "Kim_Vocal_2.onnx"  # High quality vocal isolation

    # Display name -> CTranslate2 model id.
    available_whisper_models: Dict[str, str] = {
        "EraX-WoW-Turbo": "erax-ai/EraX-WoW-Turbo-V1.1-CT2",
        "PhoWhisper Large": "kiendt/PhoWhisper-large-ct2",
        "PhoWhisper Lora Finetuned": "vyluong/pho-whisper-vi-ct2"
    }

    # S2T model
    default_whisper_model: str = "vyluong/pho-whisper-vi-ct2"

    # voice emotion detection model
    default_dual_emotion_model: str = "vyluong/emo_dual_classi"

    # Diarization model
    diarization_model: str = "pyannote/speaker-diarization-community-1"

    # Device settings
    device: Literal["cuda", "cpu", "auto"] = "auto"
    # NOTE(review): this field is not consulted by resolved_compute_type,
    # which always picks float16 (GPU) / int8 (CPU) — confirm intent.
    compute_type: str = "float16"  # float16 for GPU, int8 for CPU

    # Upload settings
    max_upload_size_mb: int = 100
    allowed_extensions: list[str] = ["mp3", "wav", "m4a", "ogg", "flac", "webm"]

    # Audio processing settings
    sample_rate: int = 16000
    channels: int = 1  # Mono

    enable_loudnorm: bool = True

    # VAD parameters
    vad_threshold: float = 0.55
    vad_min_speech_duration_ms: int = 200
    vad_min_silence_duration_ms: int = 450
    vad_speech_pad_ms: int = 250

    # Post-processing
    merge_threshold_s: float = 0.35  # Merge segments from same speaker if gap < this
    min_segment_duration_s: float = 0.85  # Remove segments shorter than this

    # Server settings
    host: str = "0.0.0.0"
    port: int = 7860

    # Paths (resolved relative to the repository root).
    base_dir: Path = Path(__file__).parent.parent.parent
    data_dir: Path = base_dir / "data"
    upload_dir: Path = data_dir / "uploads"
    processed_dir: Path = data_dir / "processed"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Create the working directories up front so later writes never fail.
        self.upload_dir.mkdir(parents=True, exist_ok=True)
        self.processed_dir.mkdir(parents=True, exist_ok=True)

    @property
    def max_upload_size_bytes(self) -> int:
        """Upload limit expressed in bytes."""
        return self.max_upload_size_mb * 1024 * 1024

    @property
    def resolved_device(self) -> str:
        """Resolve 'auto' to a concrete device ('cuda' when available)."""
        if self.device != "auto":
            return self.device
        try:
            import torch
        except ImportError:
            return "cpu"
        return "cuda" if torch.cuda.is_available() else "cpu"

    @property
    def resolved_compute_type(self) -> str:
        """Compute type matched to the resolved device."""
        return "float16" if self.resolved_device == "cuda" else "int8"
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
@lru_cache
def get_settings() -> Settings:
    """Return the process-wide, lazily constructed Settings singleton."""
    return Settings()
|
app/main.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PrecisionVoice - Speech-to-Text & Speaker Diarization Application
|
| 3 |
+
|
| 4 |
+
Main FastAPI application entry point.
|
| 5 |
+
"""
|
| 6 |
+
import logging
|
| 7 |
+
from contextlib import asynccontextmanager
|
| 8 |
+
|
| 9 |
+
from fastapi import FastAPI, Request
|
| 10 |
+
from fastapi.staticfiles import StaticFiles
|
| 11 |
+
from fastapi.templating import Jinja2Templates
|
| 12 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 13 |
+
from fastapi.responses import HTMLResponse
|
| 14 |
+
|
| 15 |
+
from app.core.config import get_settings
|
| 16 |
+
from app.api.routes import router
|
| 17 |
+
from app.services.transcription import TranscriptionService
|
| 18 |
+
from app.services.diarization import DiarizationService
|
| 19 |
+
from app.services.emo import EmotionService
|
| 20 |
+
|
| 21 |
+
# Configure logging
|
| 22 |
+
logging.basicConfig(
|
| 23 |
+
level=logging.INFO,
|
| 24 |
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
| 25 |
+
)
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
settings = get_settings()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan handler.

    Preloads all models on startup (Whisper, diarization, emotion) so the
    first request does not pay the load cost, then yields exactly once.

    BUGFIX: the original body contained two `yield` statements and a
    duplicated startup/shutdown sequence, so the emotion-model preload ran
    during shutdown and the second yield raised RuntimeError when the
    application stopped. All preloading now happens before the single yield.
    """
    logger.info("Starting PrecisionVoice application...")
    logger.info(f"Device: {settings.resolved_device}")
    logger.info(f"Default Whisper model: {settings.default_whisper_model}")
    logger.info(f"Diarization model: {settings.diarization_model}")
    logger.info(f"Emotion voice model: {settings.default_dual_emotion_model}")

    # Preload default Whisper model (failures are logged, not fatal).
    try:
        logger.info("Preloading Whisper model...")
        TranscriptionService.preload_model()
    except Exception as e:
        logger.error(f"Failed to preload Whisper model: {e}")

    # Preload diarization pipeline (requires a HuggingFace token).
    try:
        if settings.hf_token:
            logger.info("Preloading diarization pipeline...")
            DiarizationService.preload_pipeline()
        else:
            logger.warning("HF_TOKEN not set, diarization will not be available")
    except Exception as e:
        logger.warning(f"Diarization preload failed: {e}")

    # Preload emotion model.
    try:
        logger.info("Preloading emotion model...")
        EmotionService.preload_model()
        logger.info("Emotion model loaded")
    except Exception as e:
        logger.warning(f"Emotion model preload failed: {e}")

    logger.info("Application startup complete")

    yield

    logger.info("Shutting down PrecisionVoice application...")
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# Application object with lifespan-managed model preloading.
app = FastAPI(
    title="PrecisionVoice",
    description="QA Voice MultipleModel API",
    version="2.0.0",
    lifespan=lifespan
)

# CORS: wide open for the PoC deployment.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers for credentialed requests — confirm whether
# credentials are actually required here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Static assets and HTML templates.
app.mount("/static", StaticFiles(directory="app/static"), name="static")
templates = Jinja2Templates(directory="app/templates")

# REST API routes.
app.include_router(router)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
@app.get("/", response_class=HTMLResponse)
async def index(request: Request):
    """Serve the main web interface."""
    context = {
        "request": request,
        "max_upload_mb": settings.max_upload_size_mb,
        "allowed_formats": ", ".join(settings.allowed_extensions),
    }
    return templates.TemplateResponse("index.html", context)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
if __name__ == "__main__":
    # Dev entry point: run with auto-reload (production uses the Docker CMD).
    import uvicorn

    uvicorn.run(
        "app.main:app",
        host=settings.host,
        port=settings.port,
        reload=True,
    )
|
app/schemas/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Schemas package
|
app/schemas/models.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pydantic models for API requests and responses.
|
| 3 |
+
"""
|
| 4 |
+
from pydantic import BaseModel, Field
|
| 5 |
+
from typing import Optional
|
| 6 |
+
from enum import Enum
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ProcessingStatus(str, Enum):
    """Lifecycle states for a transcription job."""

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class TranscriptSegment(BaseModel):
    """One aligned transcript segment: time span, speaker, text, emotion."""

    # Segment boundaries in seconds.
    start: float
    end: float

    speaker: Optional[str] = Field(
        default=None,
        description="Internal speaker id (debug only)"
    )

    role: str = Field(
        ...,
        description="Conversation role (NV = agent, KH = customer)"
    )

    text: str = Field(
        ...,
        description="Transcribed text"
    )

    emotion: Optional[str] = Field(
        default=None,
        description="Predicted emotion label"
    )

    # BUGFIX: the original annotation was typing.List, which this module
    # never imports (NameError at import time). The file already uses
    # builtin generics (e.g. list[TranscriptSegment]), so use list here.
    emotion_scores: Optional[list[float]] = Field(
        default=None,
        description="Emotion probability scores"
    )

    @property
    def start_formatted(self) -> str:
        """Format start time as HH:MM:SS."""
        return self._format_time(self.start)

    @property
    def end_formatted(self) -> str:
        """Format end time as HH:MM:SS."""
        return self._format_time(self.end)

    @staticmethod
    def _format_time(seconds: float) -> str:
        """Convert seconds to HH:MM:SS format (truncates sub-second part)."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class EmotionPoint(BaseModel):
    """A single sample on the emotion timeline."""

    time: float = Field(..., description="Time in seconds")
    emotion: str = Field(..., description="Emotion label")
    icon: Optional[str] = Field(default=None, description="Emotion icon (emoji)")
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class EmotionChange(BaseModel):
    """An emotion transition event: when it happened, plus the labels and
    icons before and after the change."""

    time: float = Field(..., description="Time of emotion change")
    emotion_from: str = Field(..., description="Previous emotion")
    emotion_to: str = Field(..., description="New emotion")
    icon_from: Optional[str] = Field(default=None, description="Previous emotion icon")
    icon_to: Optional[str] = Field(default=None, description="New emotion icon")
|
| 89 |
+
|
| 90 |
+
class TranscriptionRequest(BaseModel):
    """Request model for transcription settings."""

    language: str = Field(default="vi", description="Language code for transcription")
    num_speakers: Optional[int] = Field(
        default=None,
        description="Expected number of speakers (None for auto-detect)"
    )
    output_format: str = Field(default="json", description="Output format: json, txt, csv")
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class TranscriptionResponse(BaseModel):
    """Response containing the transcription results.

    Only `success` is required; every other field has a default so partial
    results (e.g. a failure before diarization or emotion analysis completes)
    can still be serialized and returned to the client.
    """
    success: bool = Field(..., description="Whether transcription succeeded")
    message: str = Field(default="", description="Status message")
    segments: list[TranscriptSegment] = Field(
        default_factory=list,
        description="Transcript segments with speaker and role")

    duration: float = Field(default=0.0, description="Audio duration in seconds")
    speaker_count: int = Field(default=0, description="Number of detected speakers")
    processing_time: float = Field(default=0.0, description="Processing time in seconds")
    speakers: Optional[list[str]] = None

    roles: Optional[dict[str, str]] = Field(
        default=None,
        description="Internal mapping speaker_id → role (debug / audit only)"
    )

    # Emotion Analysis
    # Consistency fix: this model mixed typing.List with the builtin generics
    # used above; unified to builtin list (equivalent annotations on 3.9+).
    emotion_timeline: Optional[list[EmotionPoint]] = Field(
        default=None,
        description="Emotion timeline across conversation"
    )

    emotion_changes: Optional[list[EmotionChange]] = Field(
        default=None,
        description="Detected emotion change events"
    )

    customer_emotion_score: Optional[float] = Field(
        default=None,
        description="Overall customer emotion score"
    )

    download_txt: Optional[str] = Field(default=None, description="Download URL for TXT file")
    download_csv: Optional[str] = Field(default=None, description="Download URL for CSV file")
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class ErrorResponse(BaseModel):
    """Error response model."""

    success: bool = False  # always False for error payloads
    error: str = Field(..., description="Error message")
    detail: Optional[str] = Field(default=None, description="Detailed error information")
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
class HealthResponse(BaseModel):
    """Health check response."""

    status: str = "healthy"      # service liveness indicator
    models_loaded: bool = False  # whether ML models have been loaded into memory
    device: str = "cpu"          # inference device reported to clients
|
app/services/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Services package."""
|
| 2 |
+
from app.services.transcription import TranscriptionService
|
| 3 |
+
from app.services.diarization import DiarizationService
|
| 4 |
+
from app.services.processor import Processor
|
| 5 |
+
from app.services.audio_processor import AudioProcessor
|
app/services/alignment.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Precision alignment service - Word-center-based speaker assignment.
|
| 3 |
+
Merges word-level transcription with speaker diarization using precise timestamps.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import List, Tuple, Optional
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
|
| 10 |
+
from app.core.config import get_settings
|
| 11 |
+
from app.services.transcription import WordTimestamp
|
| 12 |
+
from app.services.diarization import SpeakerSegment
|
| 13 |
+
from app.schemas.models import TranscriptSegment
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
settings = get_settings()
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class WordWithSpeaker:
    """A single transcribed word together with its attributed speaker.

    Field order is significant: callers build instances positionally as
    (word, start, end, speaker).
    """
    word: str      # transcribed token text
    start: float   # word start time, seconds
    end: float     # word end time, seconds
    speaker: str   # speaker label assigned by the alignment step
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class AlignmentService:
    """
    Precision alignment service.
    Uses word-center-based algorithm for accurate speaker-to-text mapping.

    Pipeline (see align_precision): attribute each ASR word to a diarization
    speaker, regroup the words into sentence-like segments, then merge and
    filter the result. All methods are class/static methods; the class holds
    no instance state.
    """

    # Pause (s) between consecutive words that forces a new segment.
    PAUSE_THRESHOLD = 0.45
    # Tolerance when testing whether a word center lies inside a segment.
    CENTER_TOL = 0.15  # s (150 ms)
    # Minimum word/segment overlap ratio accepted by the overlap fallback.
    OVERLAP_TH = 0.12  # > x% segments
    # Max gap (s) between same-speaker diarization segments to merge them.
    DIA_MERGE_GAP = 0.25
    # Soft cap on segment length; only split when there is also a small pause.
    MAX_SEGMENT_DURATION = 7.5

    @staticmethod
    def get_word_center(word: WordTimestamp) -> float:
        """Calculate the center time of a word."""
        return (word.start + word.end) / 2

    @staticmethod
    def overlap_ratio(w_start, w_end, s_start, s_end):
        """Return the fraction of the word [w_start, w_end] that is covered by
        the segment [s_start, s_end]; the word duration is clamped to >= 1e-6
        to avoid division by zero on degenerate timestamps."""
        overlap = max(0.0, min(w_end, s_end) - max(w_start, s_start))
        dur = max(1e-6, w_end - w_start)
        return overlap / dur

    # Diarization merge
    @classmethod
    def merge_dia_segments(cls, segments: List[SpeakerSegment]) -> List[SpeakerSegment]:
        """Merge same-speaker diarization segments separated by at most
        DIA_MERGE_GAP seconds.

        NOTE: mutates the surviving segment objects in place (extends
        ``p.end``) — callers should not rely on the input list afterwards.
        """
        if not segments:
            return []

        segments = sorted(segments, key=lambda s: s.start)
        merged = [segments[0]]

        for s in segments[1:]:
            p = merged[-1]
            if s.speaker == p.speaker and (s.start - p.end) <= cls.DIA_MERGE_GAP:
                p.end = s.end
            else:
                merged.append(s)

        return merged

    @classmethod
    def find_speaker_center(
        cls,
        time: float,
        speaker_segments: List[SpeakerSegment],
    ) -> Optional[str]:
        """Return the speaker whose segment contains `time` (with CENTER_TOL
        slack on both edges), or None if no segment matches. First match wins,
        so earlier segments take precedence on overlap."""
        for seg in speaker_segments:
            if seg.start - cls.CENTER_TOL <= time <= seg.end + cls.CENTER_TOL:
                return seg.speaker
        return None

    @staticmethod
    def find_closest_speaker(time: float, speaker_segments: List[SpeakerSegment]) -> str:
        """Return the speaker of the segment whose nearest boundary is closest
        to `time`; "Unknown" when there are no segments at all."""
        if not speaker_segments:
            return "Unknown"

        min_dist = float("inf")
        closest = "Unknown"

        for seg in speaker_segments:
            # Distance to the nearer of the two segment boundaries.
            d = min(abs(time - seg.start), abs(time - seg.end))
            if d < min_dist:
                min_dist = d
                closest = seg.speaker

        return closest

    @classmethod
    def assign_speakers_to_words(
        cls,
        words: List[WordTimestamp],
        speaker_segments: List[SpeakerSegment],
    ) -> List[WordWithSpeaker]:
        """Attribute each word to a speaker using, in priority order:
        1) CENTER  — the segment containing the word midpoint,
        2) OVERLAP — the best word/segment overlap if it reaches OVERLAP_TH,
        3) CLOSEST — the segment with the nearest boundary.

        Empty/whitespace-only words are dropped. With no diarization output,
        every word falls back to a single "Speaker 1".
        """
        words = [w for w in words if w.word and w.word.strip()]

        if not speaker_segments:
            logger.warning("No diarization, fallback single speaker")
            return [
                WordWithSpeaker(w.word, w.start, w.end, "Speaker 1")
                for w in words
            ]

        speaker_segments = cls.merge_dia_segments(speaker_segments)

        results = []

        for word in words:
            center = cls.get_word_center(word)

            # 1. CENTER
            speaker = cls.find_speaker_center(center, speaker_segments)

            if speaker is None:
                # 2. OVERLAP
                best_ratio = 0
                best_spk = None

                for seg in speaker_segments:
                    r = cls.overlap_ratio(word.start, word.end, seg.start, seg.end)
                    if r > best_ratio:
                        best_ratio = r
                        best_spk = seg.speaker

                if best_ratio >= cls.OVERLAP_TH:
                    speaker = best_spk
                else:
                    # 3. CLOSEST
                    speaker = cls.find_closest_speaker(center, speaker_segments)

            results.append(
                WordWithSpeaker(word.word, word.start, word.end, speaker)
            )

        return results

    @classmethod
    def reconstruct_segments(
        cls,
        words_with_speakers: List[WordWithSpeaker]
    ) -> List[TranscriptSegment]:
        """
        Step 3d: Reconstruct sentence segments from words.

        Groups consecutive words of the same speaker into segments.
        Creates new segment when:
        - Speaker changes
        - Pause > PAUSE_THRESHOLD between words
        - Segment exceeds MAX_SEGMENT_DURATION and there is a pause > 0.15 s

        Args:
            words_with_speakers: List of words with speaker assignments

        Returns:
            List of TranscriptSegment with complete sentences (role is set
            to the "UNKNOWN" placeholder; role inference happens later)
        """
        if not words_with_speakers:
            return []

        segments = []

        # Start first segment
        current_speaker = words_with_speakers[0].speaker
        current_start = words_with_speakers[0].start
        current_end = words_with_speakers[0].end
        current_words = [words_with_speakers[0].word]

        for i in range(1, len(words_with_speakers)):
            word = words_with_speakers[i]
            prev_word = words_with_speakers[i - 1]

            # Calculate pause between words
            pause = word.start - prev_word.end

            # Check if we need to start a new segment
            speaker_changed = word.speaker != current_speaker
            significant_pause = pause > cls.PAUSE_THRESHOLD

            # Split overlong segments only at a (small) natural pause so a
            # word is never cut mid-flow.
            segment_duration = current_end - current_start
            too_long = segment_duration > cls.MAX_SEGMENT_DURATION and pause > 0.15

            if speaker_changed or significant_pause or too_long:
                # Save current segment
                segments.append(TranscriptSegment(
                    start=current_start,
                    end=current_end,
                    speaker=current_speaker,
                    role="UNKNOWN",
                    text=" ".join(current_words)
                ))

                # Start new segment
                current_speaker = word.speaker
                current_start = word.start
                current_end = word.end
                current_words = [word.word]
            else:
                # Continue current segment
                current_end = word.end
                current_words.append(word.word)

        # Flush the trailing segment.
        if current_words:
            segments.append(TranscriptSegment(
                start=current_start,
                end=current_end,
                speaker=current_speaker,
                role="UNKNOWN",
                text=" ".join(current_words)
            ))

        logger.debug(f"Reconstructed {len(segments)} segments from {len(words_with_speakers)} words")
        return segments

    @classmethod
    def resize_and_merge_segments(
        cls,
        segments: List[TranscriptSegment]
    ) -> List[TranscriptSegment]:
        """
        Merge consecutive segments of the same speaker if the gap is small.
        Also filters out extremely short segments.

        Thresholds come from settings (min_segment_duration_s,
        merge_threshold_s). NOTE: mutates the surviving segment objects
        in place (extends end / concatenates text).
        """
        if not segments:
            return []

        # Filter 1: Remove extremely short blips (noise)
        segments = [s for s in segments if (s.end - s.start) >= settings.min_segment_duration_s]

        if not segments:
            return []

        merged = []
        curr = segments[0]

        for i in range(1, len(segments)):
            next_seg = segments[i]

            # If same speaker and gap is small, merge
            gap = next_seg.start - curr.end
            if next_seg.speaker == curr.speaker and gap < settings.merge_threshold_s:
                curr.end = next_seg.end
                curr.text += " " + next_seg.text
            else:
                merged.append(curr)
                curr = next_seg

        merged.append(curr)

        logger.debug(f"Merged segments: {len(segments)} -> {len(merged)}")
        return merged

    @classmethod
    def align_precision(
        cls,
        words: List[WordTimestamp],
        speaker_segments: List[SpeakerSegment]
    ) -> List[TranscriptSegment]:
        """
        Full precision alignment pipeline.

        Args:
            words: Word-level timestamps from transcription
            speaker_segments: Speaker segments from diarization

        Returns:
            List of TranscriptSegment with proper speaker assignments
        """
        # Step 3c: Assign speakers to words
        words_with_speakers = cls.assign_speakers_to_words(words, speaker_segments)

        # Step 3d: Reconstruct segments
        segments = cls.reconstruct_segments(words_with_speakers)

        # Step 3e: Clustering/Merging (Optimization)
        segments = cls.resize_and_merge_segments(segments)

        return segments
|
| 293 |
+
|
| 294 |
+
|
app/services/audio_processor.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio processing utilities.
|
| 3 |
+
Simple validation and file handling.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
import uuid
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Optional, Tuple
|
| 9 |
+
|
| 10 |
+
from app.core.config import get_settings
|
| 11 |
+
import ffmpeg
|
| 12 |
+
import asyncio
|
| 13 |
+
|
| 14 |
+
from app.services.vocal_separator import VocalSeparator
|
| 15 |
+
from app.services.denoiser import DenoiserService
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
settings = get_settings()
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class AudioProcessingError(Exception):
    """Raised when upload validation or audio conversion/processing fails."""
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class AudioProcessor:
    """Upload validation and audio preprocessing pipeline.

    Validates uploaded files, persists them, optionally denoises and
    separates vocals, then converts the result to 16 kHz mono WAV with
    FFmpeg. Blocking FFmpeg work is pushed into an executor so the asyncio
    event loop is never blocked.
    """

    # Class-level mirrors of settings values.
    # NOTE(review): validate_file below reads `settings` directly rather than
    # these attributes, so overriding them has no effect — confirm intent.
    ALLOWED_EXTENSIONS = settings.allowed_extensions
    TARGET_SAMPLE_RATE = settings.sample_rate
    TARGET_CHANNELS = settings.channels

    @classmethod
    def validate_file(cls, filename: str, file_size: int) -> None:
        """
        Validate uploaded file.

        Args:
            filename: Original filename
            file_size: File size in bytes

        Raises:
            AudioProcessingError: If validation fails
        """
        # Check extension (extension-less filenames yield '' and are rejected)
        ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''
        if ext not in settings.allowed_extensions:
            raise AudioProcessingError(
                f"File type '.{ext}' not supported. "
                f"Allowed: {', '.join(settings.allowed_extensions)}"
            )

        # Check size
        if file_size > settings.max_upload_size_bytes:
            raise AudioProcessingError(
                f"File too large ({file_size / 1024 / 1024:.1f}MB). "
                f"Maximum size: {settings.max_upload_size_mb}MB"
            )

    @classmethod
    async def save_upload(cls, file_content: bytes, original_filename: str) -> Path:
        """
        Save uploaded file to disk.

        Args:
            file_content: Raw file bytes
            original_filename: Original filename for extension

        Returns:
            Path to saved file
        """
        import aiofiles

        # Generate unique filename — UUID avoids collisions between uploads
        ext = original_filename.rsplit('.', 1)[-1].lower() if '.' in original_filename else 'wav'
        unique_filename = f"{uuid.uuid4()}.{ext}"
        file_path = settings.upload_dir / unique_filename

        # Save file asynchronously
        async with aiofiles.open(file_path, 'wb') as f:
            await f.write(file_content)

        logger.info(f"Saved upload: {file_path} ({len(file_content) / 1024:.1f}KB)")
        return file_path

    @classmethod
    async def convert_to_wav(cls, input_path: Path) -> Path:
        """
        Convert audio to 16kHz mono WAV using FFmpeg.

        Args:
            input_path: Path to input audio file

        Returns:
            Path to converted WAV file

        Raises:
            AudioProcessingError: If the FFmpeg conversion fails
        """
        output_filename = f"{input_path.stem}_processed.wav"
        output_path = settings.processed_dir / output_filename

        try:
            # Run ffmpeg conversion in executor to not block the event loop
            loop = asyncio.get_event_loop()
            await loop.run_in_executor(None, lambda: cls._run_ffmpeg_conversion(input_path, output_path))

            logger.info(f"Converted to WAV: {output_path}")
            return output_path

        except ffmpeg.Error as e:
            error_msg = e.stderr.decode() if e.stderr else str(e)
            logger.error(f"FFmpeg error: {error_msg}")
            raise AudioProcessingError(f"Audio conversion failed: {error_msg}")

    @staticmethod
    def _run_ffmpeg_conversion(input_path: Path, output_path: Path) -> None:
        """Run the actual FFmpeg conversion (blocking).

        Builds a filter chain (optional loudnorm, optional band-pass +
        silence trimming) and outputs 16 kHz mono signed 16-bit PCM.
        """
        stream = ffmpeg.input(str(input_path))

        # Apply normalization if enabled (loudnorm is best for speech consistency)
        if settings.enable_loudnorm:
            logger.debug("Applying loudnorm normalization...")
            stream = stream.filter('loudnorm', I=-20, TP=-2, LRA=7)

        # Apply noise reduction if enabled (Note: basic filters are kept as minor cleanup)
        if settings.enable_noise_reduction:
            logger.debug("Applying subtle highpass filter...")
            stream = (
                stream
                .filter('highpass', f=60)
                .filter('lowpass', f=7500)
                .filter(
                    # Silence trimming: drop pauses longer than 0.4 s below -45 dB
                    'silenceremove',
                    stop_periods=-1,
                    stop_duration=0.4,
                    stop_threshold='-45dB'
                )
            )

        (
            stream.output(
                str(output_path),
                acodec='pcm_s16le',
                ar=16000,
                ac=1
            )
            .overwrite_output()
            .run(quiet=True, capture_stderr=True)
        )

    @classmethod
    async def get_audio_duration(cls, filepath: Path) -> float:
        """
        Get audio file duration in seconds.

        Args:
            filepath: Path to audio file

        Returns:
            Duration in seconds (0.0 when probing fails)
        """
        try:
            loop = asyncio.get_event_loop()
            probe = await loop.run_in_executor(
                None,
                lambda: ffmpeg.probe(str(filepath))
            )

            duration = float(probe['format'].get('duration', 0))
            return duration

        except ffmpeg.Error as e:
            logger.warning(f"Could not probe audio duration: {e}")
            return 0.0
    @classmethod
    async def cleanup_files(cls, *paths: Path) -> None:
        """Remove temporary files; failures are logged, never raised."""
        import asyncio  # NOTE(review): redundant — asyncio is already imported at module level

        for path in paths:
            try:
                if path and path.exists():
                    path.unlink()
                    logger.debug(f"Cleaned up: {path}")
            except Exception as e:
                logger.warning(f"Failed to cleanup {path}: {e}")



    @classmethod
    async def process_upload(cls, file_content: bytes, filename: str) -> Tuple[Path, float]:
        """
        Full upload processing pipeline: validate, save, convert.

        Steps: validate → save original → optional denoise → optional vocal
        separation → WAV conversion → probe duration → remove intermediates.

        Args:
            file_content: Uploaded file bytes
            filename: Original filename

        Returns:
            Tuple of (processed WAV path, duration in seconds)

        Raises:
            AudioProcessingError: On validation or conversion failure; any
                other pipeline error is re-raised after best-effort cleanup.
        """
        # Validate
        cls.validate_file(filename, len(file_content))

        # Save original
        original_path = await cls.save_upload(file_content, filename)
        vocals_path = None

        try:
            # Step 1: Denoising (Speech Enhancement)
            if settings.enable_denoiser:
                denoised_path = await DenoiserService.enhance_audio(original_path)
                source_for_separation = denoised_path
            else:
                source_for_separation = original_path
                denoised_path = None

            # Step 2: Vocal separation using MDX-Net
            if settings.enable_vocal_separation:
                vocals_path = await VocalSeparator.separate_vocals(source_for_separation)
                source_for_conversion = vocals_path
            else:
                source_for_conversion = source_for_separation
                vocals_path = None

            # Step 3: Convert to 16kHz mono WAV (includes normalization)
            wav_path = await cls.convert_to_wav(source_for_conversion)

            # Get duration
            duration = await cls.get_audio_duration(wav_path)

            # Cleanup intermediate files (skip paths that alias earlier ones:
            # the denoiser/separator may return their input unchanged)
            to_cleanup = [original_path]
            if denoised_path and denoised_path != original_path:
                to_cleanup.append(denoised_path)
            if vocals_path and vocals_path not in [original_path, denoised_path]:
                to_cleanup.append(vocals_path)

            await cls.cleanup_files(*to_cleanup)

            return wav_path, duration

        except Exception as e:
            # Cleanup on error; locals() guards cover failures before
            # denoised_path was ever bound
            await cls.cleanup_files(original_path)
            if 'denoised_path' in locals() and denoised_path and denoised_path != original_path:
                await cls.cleanup_files(denoised_path)
            if 'vocals_path' in locals() and vocals_path and vocals_path not in [original_path, denoised_path]:
                await cls.cleanup_files(vocals_path)
            raise
|
app/services/denoiser.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Speech Enhancement Service using Facebook's Denoiser.
|
| 3 |
+
Removes background noise and enhances speech quality.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import asyncio
|
| 7 |
+
import logging
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
import torchaudio
|
| 12 |
+
|
| 13 |
+
from app.core.config import get_settings
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
settings = get_settings()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class DenoiserError(Exception):
    """Raised when speech-enhancement (denoiser) processing fails."""
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class DenoiserService:
    """
    Service for enhancing speech using Facebook's Denoiser models.
    Supports dns48, dns64, master64, etc.

    The model is loaded lazily and cached at class level; it is reloaded
    only when the configured model name changes.
    """

    # Cached model instance and the settings name it was loaded for.
    _model = None
    _model_name: str = None

    @classmethod
    def _get_model(cls):
        """Lazy load the Denoiser model (cached until settings.denoiser_model changes)."""
        if cls._model is None or cls._model_name != settings.denoiser_model:
            from denoiser.pretrained import dns48, dns64, master64

            model_map = {
                "dns48": dns48,
                "dns64": dns64,
                "master64": master64
            }

            # Unknown names fall back to dns64.
            model_func = model_map.get(settings.denoiser_model, dns64)
            logger.debug(f"Loading Denoiser model: {settings.denoiser_model}")

            model = model_func()
            device = settings.resolved_device
            model.to(device)
            model.eval()

            cls._model = model
            cls._model_name = settings.denoiser_model
            logger.debug(f"Denoiser model loaded on {device}")

        return cls._model

    @classmethod
    async def enhance_audio(cls, input_path: Path) -> Path:
        """
        Enhance audio by removing noise.

        Args:
            input_path: Path to input audio file

        Returns:
            Path to enhanced WAV file; the original path is returned
            unchanged when the denoiser is disabled or enhancement fails.
        """
        if not settings.enable_denoiser:
            logger.debug("Denoiser disabled, skipping...")
            return input_path

        logger.debug(f"Starting speech enhancement for: {input_path.name}")

        try:
            # Run enhancement in executor to not block the event loop
            loop = asyncio.get_event_loop()
            enhanced_path = await loop.run_in_executor(
                None,
                lambda: cls._run_enhancement(input_path)
            )

            logger.info(f"Speech enhancement complete: {enhanced_path.name}")
            return enhanced_path

        except Exception as e:
            logger.error(f"Speech enhancement failed: {e}")
            # Fallback to original on failure rather than failing the whole pipeline
            logger.warning("Falling back to original audio.")
            return input_path

    @classmethod
    def _run_enhancement(cls, input_path: Path) -> Path:
        """Run the actual denoiser enhancement (blocking).

        Loads the audio, resamples to the model's sample rate if needed,
        runs the model, and writes ``<stem>_denoised.wav`` to the processed
        directory.
        """
        from denoiser.enhance import enhance

        model = cls._get_model()
        device = settings.resolved_device

        # Load audio
        wav, sr = torchaudio.load(str(input_path))
        wav = wav.to(device)

        # Ensure correct sample rate for the model
        if sr != model.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, model.sample_rate).to(device)
            wav = resampler(wav)
            sr = model.sample_rate

        # Enhance
        # wav shape: [channels, time]
        from types import SimpleNamespace

        # dry=0.0 keeps none of the original (noisy) signal in the mix.
        args = SimpleNamespace(
            streaming=False,
            dry=0.0,
            sample_rate=sr
        )

        with torch.no_grad():
            # denoiser.enhance.enhance(args, model, wav)
            # NOTE(review): upstream denoiser.enhance.enhance is documented as
            # (args, model, local_out_dir) and operates on files — confirm this
            # call actually accepts a tensor and returns the enhanced batch as
            # assumed below.
            if wav.dim() == 1:
                wav = wav.unsqueeze(0).unsqueeze(0)
            elif wav.dim() == 2:
                wav = wav.unsqueeze(0)

            enhanced = enhance(args, model, wav)
            # remove batch dim
            enhanced = enhanced.squeeze(0)

        # Save enhanced audio
        output_filename = f"{input_path.stem}_denoised.wav"
        output_path = settings.processed_dir / output_filename

        torchaudio.save(
            str(output_path),
            enhanced.cpu(),
            sr
        )

        return output_path
|
app/services/diarization.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# The `DiarizationService` class provides a production-grade speaker diarization service for call
|
| 2 |
+
# centers, including role inference based on speaking duration and asynchronous diarization
|
| 3 |
+
# capabilities.
|
| 4 |
+
"""
|
| 5 |
+
Speaker diarization service using pyannote.audio.
|
| 6 |
+
QA / Production optimized diarization for call center.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import List, Optional, Dict
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
|
| 14 |
+
import torch
|
| 15 |
+
from app.core.config import get_settings
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
settings = get_settings()
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# =========================
|
| 22 |
+
# Data model
|
| 23 |
+
# =========================
|
| 24 |
+
@dataclass
class SpeakerSegment:
    """A single diarized speech turn attributed to one speaker."""

    start: float   # turn start time, in seconds
    end: float     # turn end time, in seconds
    speaker: str   # speaker label, e.g. "Speaker 1"

    @property
    def duration(self) -> float:
        """Length of this turn in seconds (end - start)."""
        length = self.end - self.start
        return length
|
| 33 |
+
|
| 34 |
+
@dataclass
class DiarizationResult:
    """Aggregate output of a diarization run."""

    # Speech turns detected in the audio (sorted by start time).
    segments: List[SpeakerSegment]
    # Number of distinct speakers found.
    speaker_count: int
    # Speaker labels in order of first appearance.
    speakers: List[str]
    # Mapping speaker label -> role code ("NV" = agent, "KH" = customer).
    roles: Dict[str, str]
|
| 40 |
+
|
| 41 |
+
# =========================
|
| 42 |
+
# Diarization Service
|
| 43 |
+
# =========================
|
| 44 |
+
class DiarizationService:
    """
    Production-grade speaker diarization service.

    Singleton wrapper around a pyannote.audio diarization pipeline with
    lazy loading, call-center role inference, and async execution.
    """

    _instance: Optional["DiarizationService"] = None
    _pipeline = None  # cached pyannote Pipeline, loaded on first use

    def __new__(cls):
        # Classic singleton: every instantiation returns the same object.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    # -------------------------
    # Pipeline loading
    # -------------------------
    @classmethod
    def get_pipeline(cls):
        """Load (once) and return the pyannote diarization pipeline.

        Raises:
            ValueError: if no HuggingFace token is configured.
        """
        if cls._pipeline is None:
            from pyannote.audio import Pipeline

            if not settings.hf_token:
                raise ValueError("HF_TOKEN is required for diarization")

            logger.info(
                f"Loading diarization model: {settings.diarization_model}"
            )

            pipeline = Pipeline.from_pretrained(
                settings.diarization_model,
                token=settings.hf_token
            )

            # Hyper-parameters tuned for call-center audio.
            pipeline.instantiate({
                "clustering": {
                    "threshold": 0.65
                },
                "segmentation": {
                    "min_duration_off": 0.4  # avoid fragment explosion
                }
            })

            device = torch.device(settings.resolved_device)
            if device.type == "cuda":
                pipeline = pipeline.to(device)
                logger.info("Diarization pipeline moved to GPU")

            cls._pipeline = pipeline

        return cls._pipeline

    # -------------------------
    # Role inference (CALL CENTER)
    # -------------------------
    @staticmethod
    def infer_roles(segments: List["SpeakerSegment"]) -> Dict[str, str]:
        """
        Infer Agent / Customer roles based on total speaking duration.
        Agent usually speaks the most.

        Returns a mapping speaker -> "NV" (agent) or "KH" (customer);
        empty dict when there are no segments.
        """
        duration_map: Dict[str, float] = {}

        for seg in segments:
            duration_map[seg.speaker] = (
                duration_map.get(seg.speaker, 0.0) + seg.duration
            )

        if not duration_map:
            return {}

        # Agent = speaker with max total speaking duration
        agent = max(duration_map, key=duration_map.get)

        return {
            speaker: "NV" if speaker == agent else "KH"
            for speaker in duration_map
        }

    # -------------------------
    # Main diarization
    # -------------------------
    @classmethod
    def diarize(
        cls,
        audio_path: Path,
        num_speakers: Optional[int] = None,
        min_speakers: int = 1,
        max_speakers: int = 10
    ) -> "DiarizationResult":
        """Run diarization on ``audio_path`` and return turns, speakers, roles."""
        pipeline = cls.get_pipeline()
        logger.debug(f"Diarizing file: {audio_path}")

        # Either a fixed speaker count, or a [min, max] search range.
        params = {}
        if num_speakers is not None:
            params["num_speakers"] = num_speakers
        else:
            params["min_speakers"] = min_speakers
            params["max_speakers"] = max_speakers

        diarization = pipeline(str(audio_path), **params)

        # Some pyannote versions wrap the annotation in a result object.
        annotation = (
            diarization.speaker_diarization
            if hasattr(diarization, "speaker_diarization")
            else diarization
        )

        # Collect turns, renaming raw labels to "Speaker N" in order of
        # first appearance.
        raw_segments: List["SpeakerSegment"] = []
        speaker_map = {}
        speaker_idx = 1

        for turn, _, speaker in annotation.itertracks(yield_label=True):
            if speaker not in speaker_map:
                speaker_map[speaker] = f"Speaker {speaker_idx}"
                speaker_idx += 1

            raw_segments.append(
                SpeakerSegment(
                    start=float(turn.start),
                    end=float(turn.end),
                    speaker=speaker_map[speaker]
                )
            )

        raw_segments.sort(key=lambda s: s.start)
        unique_speakers = []
        for seg in raw_segments:
            if seg.speaker not in unique_speakers:
                unique_speakers.append(seg.speaker)

        roles = cls.infer_roles(raw_segments)

        logger.info(
            f"Diarization done | "
            f"Segments: {len(raw_segments)} | "
            f"Speakers: {len(unique_speakers)} | "
            f"Roles: {roles}"
        )

        return DiarizationResult(
            segments=raw_segments,
            speaker_count=len(unique_speakers),
            speakers=unique_speakers,
            roles=roles
        )

    # -------------------------
    # Async
    # -------------------------
    @classmethod
    async def diarize_async(
        cls,
        audio_path: Path,
        num_speakers: Optional[int] = None,
        min_speakers: int = 1,
        max_speakers: int = 10
    ) -> "DiarizationResult":
        """Run :meth:`diarize` in the default executor without blocking the loop."""
        import asyncio
        # get_running_loop() is the supported way to obtain the loop from
        # inside a coroutine (get_event_loop() is deprecated for this use).
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            lambda: cls.diarize(
                audio_path,
                num_speakers,
                min_speakers,
                max_speakers
            )
        )

    @classmethod
    def preload_pipeline(cls) -> None:
        """Best-effort warm-up: load the pipeline, log (not raise) on failure."""
        try:
            cls.get_pipeline()
        except Exception as e:
            logger.warning(
                f"Failed to preload diarization pipeline: {e}"
            )
|
app/services/emo.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import time
|
| 4 |
+
import logging
|
| 5 |
+
import librosa
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
from torch.nn import functional as F
|
| 10 |
+
from huggingface_hub import hf_hub_download
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
# Model registry: logical model name -> HuggingFace Hub repo id.
AVAILABLE_MODELS = {
    "dual_emotion": "vyluong/emo_dual_classi"
}

# Class labels in the order the model's output logits are indexed
# (predict_from_mfcc maps argmax(probs) through this list).
emotion_labels = ['Angry', 'Anxiety', 'Happy', 'Sad', 'Neutral']

# Presentation metadata (emoji + hex color) per emotion label —
# presumably consumed by the frontend; verify against the templates.
EMOTION_META = {
    "Angry": {"emoji": "😡", "color": "#ff4d4f"},
    "Anxiety": {"emoji": "😰", "color": "#faad14"},
    "Happy": {"emoji": "😊", "color": "#52c41a"},
    "Sad": {"emoji": "😢", "color": "#1890ff"},
    "Neutral": {"emoji": "😐", "color": "#d9d9d9"},
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class EmotionService:
    """Speech-emotion classifier over MFCC features.

    Lazily downloads a "Dual" model from HuggingFace Hub, caches it per
    model name, and predicts one of ``emotion_labels`` per audio segment.
    """

    # model_name -> loaded torch model (per-process cache)
    _models = {}

    emotion_labels = emotion_labels
    meta = EMOTION_META

    @classmethod
    def load_dual_model(cls, repo_id, device):
        """Download weights and model code from ``repo_id`` and build the model.

        NOTE(review): torch.load unpickles arbitrary objects and the model
        code is imported and executed — the HF repo must be trusted.
        """
        logger.info(f"Downloading model from HF: {repo_id}")

        model_file = hf_hub_download(
            repo_id=repo_id,
            filename="pytorch_model.bin"
        )

        model_code = hf_hub_download(
            repo_id=repo_id,
            filename="model.py"
        )

        # Make the downloaded model.py importable; guard against duplicate
        # sys.path entries when this is called more than once.
        code_dir = os.path.dirname(model_code)
        if code_dir not in sys.path:
            sys.path.append(code_dir)

        from model import Dual

        model = Dual()

        state_dict = torch.load(model_file, map_location=device)
        model.load_state_dict(state_dict)

        model.to(device)
        model.eval()

        logger.info("Emotion model loaded successfully")

        return model

    @classmethod
    def get_model(cls, model_name="dual_emotion"):
        """Return the cached model for ``model_name``, loading it on first use."""
        # Fast path: cache hit needs no device resolution.
        if model_name in cls._models:
            return cls._models[model_name]

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        repo_id = AVAILABLE_MODELS[model_name]

        model = cls.load_dual_model(repo_id, device)
        cls._models[model_name] = model
        return model

    @classmethod
    def preload_model(cls):
        """Eagerly load the default model (e.g. at application startup)."""
        logger.info("Preloading emotion model...")
        cls.get_model()
        logger.info("Emotion model ready")

    # extract mfcc from segments
    @staticmethod
    def extract_mfcc_segment(
        audio: np.ndarray,
        sr: int,
        start: float,
        end: float,
        duration: float = 5.0,
        n_mfcc: int = 128,
        n_fft: int = 2048,
        hop_length: int = 512
    ):
        """Cut ``[start, end]`` seconds out of ``audio`` and return its MFCCs.

        The slice is symmetric-padded or truncated to exactly ``duration``
        seconds. Returns None for an empty slice.
        """
        start_sample = int(start * sr)
        end_sample = int(end * sr)

        segment = audio[start_sample:end_sample]

        if len(segment) == 0:
            return None

        # Fixed-length window (duration seconds) in samples.
        target_len = int(sr * duration)

        if len(segment) < target_len:
            segment = np.pad(segment, (0, target_len - len(segment)), mode="symmetric")
        else:
            segment = segment[:target_len]

        return librosa.feature.mfcc(
            y=segment,
            sr=sr,
            n_mfcc=n_mfcc,
            n_fft=n_fft,
            hop_length=hop_length
        )

    @classmethod
    def predict_from_mfcc(cls, mfcc):
        """Run the model on one MFCC matrix and return the predicted label."""
        model = cls.get_model()

        # [n_mfcc, frames] -> [1, 1, n_mfcc, frames] (batch + channel dims)
        tensor = torch.from_numpy(mfcc).unsqueeze(0).unsqueeze(0).float()

        device = next(model.parameters()).device
        tensor = tensor.to(device)

        with torch.no_grad():
            output = model(tensor)
            probs = F.softmax(output.squeeze(), dim=0).cpu().numpy()
            label = cls.emotion_labels[np.argmax(probs)]

        return label

    # predict from segments
    @classmethod
    def predict_segment(cls, audio, sr, start, end):
        """Predict the emotion of ``audio[start:end]``; "Neutral" when empty."""
        mfcc = cls.extract_mfcc_segment(audio, sr, start, end)

        if mfcc is None:
            return "Neutral"

        return cls.predict_from_mfcc(mfcc)
|
app/services/processor.py
ADDED
|
@@ -0,0 +1,623 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import logging
|
| 3 |
+
import subprocess
|
| 4 |
+
import time
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import List, Dict, Optional, Tuple
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from collections import defaultdict, Counter
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
import librosa
|
| 12 |
+
import torch
|
| 13 |
+
|
| 14 |
+
from app.core.config import get_settings
|
| 15 |
+
from app.services.transcription import TranscriptionService
|
| 16 |
+
from app.services.alignment import AlignmentService
|
| 17 |
+
from app.services.transcription import WordTimestamp
|
| 18 |
+
from app.services.emo import EmotionService
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
from app.services.diarization import DiarizationService, SpeakerSegment, DiarizationResult
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
settings = get_settings()
|
| 25 |
+
|
| 26 |
+
@dataclass
class TranscriptSegment:
    """One transcript line: a time span, its speaker/role, text, and emotion."""

    start: float                   # segment start, seconds
    end: float                     # segment end, seconds
    speaker: str                   # normalized label, e.g. "Speaker 1"
    role: Optional[str]            # "NV" (agent) / "KH" (customer), or None
    text: str                      # transcribed text for this span
    emotion: Optional[str] = None  # predicted emotion (set for "KH" turns)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@dataclass
class EmotionPoint:
    """A sampled emotion at one moment of the call."""

    time: float   # seconds from the start of the audio
    emotion: str  # one of the emotion labels


@dataclass
class EmotionChange:
    """A transition between two consecutive emotions on the timeline."""

    time: float                      # moment of the change, seconds
    emotion_from: str                # previous emotion label
    emotion_to: str                  # new emotion label
    icon_from: Optional[str] = None  # optional icon for the previous emotion
    icon_to: Optional[str] = None    # optional icon for the new emotion
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass
class ProcessingResult:
    """Result of audio processing."""

    # Final transcript lines with speaker, role and (optional) emotion.
    segments: List[TranscriptSegment]
    # Number of distinct speakers detected.
    speaker_count: int
    # Audio duration in seconds.
    duration: float
    # Wall-clock processing time in seconds.
    processing_time: float
    # Normalized speaker labels ("Speaker 1", ...).
    speakers: List[str]
    # Speaker label -> role code ("NV" agent / "KH" customer).
    roles: Dict[str, str]

    # Ready-to-download exports of the transcript.
    txt_content: str = ""
    csv_content: str = ""

    # None until emotion analysis has run; lists afterwards.
    emotion_timeline: Optional[List[EmotionPoint]] = None
    emotion_changes: Optional[List[EmotionChange]] = None
|
| 68 |
+
|
| 69 |
+
def pad_and_refine_tensor(
    waveform: torch.Tensor,
    sr: int,
    start_s: float,
    end_s: float,
    pad_ms: int = 250,
) -> Tuple[float, float]:
    """Widen [start_s, end_s] by ``pad_ms`` on each side, clamped to the audio.

    Args:
        waveform: audio tensor shaped [channels, samples].
        sr: sample rate in Hz.
        start_s / end_s: original boundaries, seconds.
        pad_ms: padding added before and after, milliseconds.

    Returns:
        (new_start_s, new_end_s); the original pair is returned unchanged
        when the padded window collapses (end <= start).
    """
    n_samples = waveform.shape[1]
    pad_s = pad_ms / 1000

    lo = max(int((start_s - pad_s) * sr), 0)
    hi = min(int((end_s + pad_s) * sr), n_samples)

    if hi <= lo:
        # Degenerate window — keep the caller's boundaries.
        return start_s, end_s

    return lo / sr, hi / sr
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def normalize_asr_result(result: dict):
    """Flatten an ASR result dict into ``(text, word_dicts)``.

    Each word dict carries the stripped token, float start/end times, and
    the (possibly None) speaker tag; empty/whitespace-only words are dropped.
    """
    normalized = []

    for entry in result.get("words", []):
        token = entry.get("word", "").strip()
        if not token:
            # Skip empty tokens sometimes emitted by the ASR.
            continue

        normalized.append(
            {
                "word": token,
                "start": float(entry["start"]),
                "end": float(entry["end"]),
                "speaker": entry.get("speaker"),
            }
        )

    return result.get("text", "").strip(), normalized
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def guess_speaker_by_overlap(start, end, diar_segments):
    """Return the diarized speaker whose turn overlaps [start, end] the most.

    Falls back to the first segment's speaker when nothing overlaps, and
    returns None when ``diar_segments`` is empty (the previous version
    raised IndexError on an empty list).
    """
    # Guard: no diarization info available at all.
    if not diar_segments:
        return None

    best_spk = None
    best_overlap = 0.0

    for seg in diar_segments:
        # Length of the intersection of [start, end] and [seg.start, seg.end].
        overlap = max(0.0, min(end, seg.end) - max(start, seg.start))

        if overlap > best_overlap:
            best_overlap = overlap
            best_spk = seg.speaker

    return best_spk or diar_segments[0].speaker
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def convert_audio_to_wav(audio_path: Path) -> Path:
    """Convert any audio to WAV 16kHz Mono using ffmpeg.

    Writes ``<stem>_processed.wav`` next to the input. On ffmpeg failure
    the ORIGINAL path is returned so the pipeline can continue with the
    unconverted file (deliberate best-effort behavior).
    """
    output_path = audio_path.parent / f"{audio_path.stem}_processed.wav"
    # Remove a stale output from a previous run before re-encoding.
    if output_path.exists():
        output_path.unlink()
    # -ar 16000: resample to 16 kHz; -ac 1: downmix to mono; -y: overwrite.
    command = ["ffmpeg", "-i", str(audio_path), "-ar", "16000", "-ac", "1", "-y", str(output_path)]
    try:
        # ffmpeg output is discarded; only the exit code matters here.
        subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        logger.info(f"Converted audio to WAV: {output_path}")
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"FFmpeg conversion failed: {e}")
        return audio_path
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def format_timestamp(seconds: float) -> str:
    """Format seconds as ``MM:SS.mmm`` (minutes are not capped at 60)."""
    minutes, remainder = divmod(seconds, 60)
    return f"{int(minutes):02d}:{remainder:06.3f}"
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def extract_mfcc_segment(
    audio: np.ndarray,
    sr: int,
    start: float,
    end: float,
    duration=5,
):
    """Extract a fixed-length MFCC matrix for ``audio[start:end]`` (seconds).

    The slice is symmetric-padded or truncated to exactly ``duration``
    seconds, then converted to 128 MFCC coefficients. Returns None when
    the slice is empty.

    NOTE(review): duplicates EmotionService.extract_mfcc_segment
    (app/services/emo.py) with hard-coded parameters — consider delegating
    to a single implementation.
    """
    start_sample = int(start * sr)
    end_sample = int(end * sr)

    segment = audio[start_sample:end_sample]

    if len(segment) == 0:
        return None

    # Target window length (duration seconds) in samples.
    target_len = int(sr * duration)

    if len(segment) < target_len:
        segment = np.pad(segment,(0,target_len-len(segment)),mode="symmetric")
    else:
        segment = segment[:target_len]

    mfcc = librosa.feature.mfcc(
        y=segment,
        sr=sr,
        n_mfcc=128,
        n_fft=2048,
        hop_length=512
    )

    return mfcc
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def merge_consecutive_segments(
    segments: List[SpeakerSegment],
    max_gap: float = 0.8,
    min_duration: float = 0.15,
) -> List[SpeakerSegment]:
    """Merge consecutive segments from same speaker.

    A segment is folded into the running one when either
    (a) it has the same speaker and starts within ``max_gap`` seconds of
    the running segment's end, or (b) it is shorter than ``min_duration``
    (tiny fragments are absorbed regardless of speaker).
    """
    if not segments:
        return []

    merged: List[SpeakerSegment] = []
    active = SpeakerSegment(
        start=segments[0].start,
        end=segments[0].end,
        speaker=segments[0].speaker,
    )

    for candidate in segments[1:]:
        candidate_len = candidate.end - candidate.start
        same_speaker_close = (
            candidate.speaker == active.speaker
            and (candidate.start - active.end) <= max_gap
        )

        if same_speaker_close or candidate_len < min_duration:
            # Absorb into the running segment by extending its end.
            active.end = candidate.end
        else:
            # Speaker switch (or a real gap): close out the running segment.
            merged.append(active)
            active = SpeakerSegment(
                start=candidate.start,
                end=candidate.end,
                speaker=candidate.speaker,
            )

    merged.append(active)
    return merged
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def overlap_prefix(a: str, b: str, n: int = 12) -> bool:
    """True when the first ``n`` chars of either string occur in the other.

    Comparison is case-insensitive and ignores surrounding whitespace;
    an empty input on either side never counts as an overlap.
    """
    if not a or not b:
        return False

    left = a.strip().lower()
    right = b.strip().lower()

    return left[:n] in right or right[:n] in left
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
class Processor:
|
| 228 |
+
    @classmethod
    async def process_audio(
        cls,
        audio_path: Path,
        model_name: str = "PhoWhisper Lora Finetuned",
        language: str = "vi",
        merge_segments: bool = True,
    ) -> ProcessingResult:
        """Run the full call-analysis pipeline on one audio file.

        Steps: convert to 16 kHz mono WAV -> load -> speaker diarization
        (+ boundary padding and turn merging) -> speaker/role normalization
        -> ASR with word timestamps -> word/turn alignment -> adjacent
        segment merging -> per-segment emotion prediction -> emotion
        timeline / change detection -> TXT & CSV export.

        Args:
            audio_path: input audio file (any ffmpeg-readable format).
            model_name: ASR model identifier passed to TranscriptionService.
            language: ASR language code (default Vietnamese).
            merge_segments: merge consecutive same-speaker diarization turns.

        Returns:
            ProcessingResult with transcript segments, roles, emotions and
            export payloads.

        Raises:
            ValueError: if the decoded audio is empty.
        """
        import asyncio

        t0 = time.time()
        # Warm the emotion model up-front so per-segment predictions later
        # do not pay the download/load cost.
        EmotionService.preload_model()

        # 1: Convert to WAV (blocking ffmpeg call, run off the event loop)
        logger.info("Step 1: Converting audio to WAV 16kHz...")
        wav_path = await asyncio.get_event_loop().run_in_executor(None, convert_audio_to_wav, audio_path)

        # 2: Load audio at 16 kHz mono; keep a [1, samples] tensor for the
        # boundary-padding helper.
        y, sr = librosa.load(wav_path, sr=16000, mono=True)
        waveform = torch.from_numpy(y).unsqueeze(0)
        if y.size == 0:
            raise ValueError("Empty audio")
        duration = len(y) / sr

        # 3: Diarization
        logger.info("Step 3: Running diarization...")

        diarization: DiarizationResult = await DiarizationService.diarize_async(wav_path)

        diarization_segments = diarization.segments or []
        speakers = diarization.speakers or []
        roles = diarization.roles or {}

        # Fallback: treat the whole file as a single speaker when
        # diarization found nothing.
        if not diarization_segments:
            diarization_segments = [SpeakerSegment(0.0, duration, "SPEAKER_0")]
            speakers = ["SPEAKER_0"]
            roles = {"SPEAKER_0": "KH"}

        diarization_segments.sort(key=lambda x: x.start)

        # Pad each turn by ~250 ms on both sides (clamped to the audio) so
        # ASR does not clip word boundaries.
        diarization_segments = [
            SpeakerSegment(
                *pad_and_refine_tensor(waveform, sr, s.start, s.end),
                speaker=s.speaker,
            )
            for s in diarization_segments
        ]

        diarization_segments.sort(key=lambda x: x.start)

        if merge_segments and diarization_segments:
            logger.info("Step 4: Merging consecutive segments...")
            diarization_segments = merge_consecutive_segments(diarization_segments)

        # 4. Normalize speakers: raw diarization labels -> "Speaker N"
        raw_speakers = sorted({seg.speaker for seg in diarization_segments})

        speaker_map = {
            spk: f"Speaker {i+1}"
            for i, spk in enumerate(raw_speakers)
        }

        speakers = list(speaker_map.values())

        # 5. NORMALIZE ROLES: the speaker with the largest total speaking
        # time is assumed to be the agent ("NV"); everyone else is "KH".
        speaker_duration = defaultdict(float)
        for seg in diarization_segments:
            speaker_duration[seg.speaker] += seg.end - seg.start

        logger.info(f"speaker_duration(raw) = {speaker_duration}")

        if speaker_duration:
            agent_raw = max(speaker_duration, key=speaker_duration.get)

            roles = {
                speaker_map[spk]: ("NV" if spk == agent_raw else "KH")
                for spk in speaker_duration
            }
        else:
            roles = {}

        # Default fallback: any unmapped label is treated as a customer.
        for label in speakers:
            roles.setdefault(label, "KH")

        logger.info(f"roles(mapped) = {roles}")

        # 7: Transcribe segments after diarization
        logger.info("Step 7: Running ASR with external VAD batch...")

        asr_result = await TranscriptionService.transcribe_with_words_async(
            audio_array=y,
            model_name=model_name,
            language=language,
            vad_options=True
        )

        text, raw_words = normalize_asr_result(asr_result)

        processed_segments: List[TranscriptSegment] = []

        if not raw_words:
            # ASR produced nothing: emit a single placeholder segment.
            processed_segments = [
                TranscriptSegment(
                    start=0.0,
                    end=duration,
                    speaker=speakers[0],
                    role=roles[speakers[0]],
                    text="(No speech detected)"
                )
            ]

        else:

            # ===== CONVERT TO WordTimestamp =====
            word_objs: List[WordTimestamp] = []

            for w in raw_words:

                spk = w.get("speaker")

                # Words without a speaker tag inherit the diarization turn
                # they overlap the most.
                if spk is None:
                    spk = guess_speaker_by_overlap(
                        w["start"], w["end"], diarization_segments
                    )

                word_objs.append(
                    WordTimestamp(
                        word=w["word"],
                        start=w["start"],
                        end=w["end"],
                        speaker=spk,
                    )
                )

            word_objs.sort(key=lambda x: x.start)

            # ===== ALIGNMENT: words -> speaker-attributed segments =====
            aligned_segments = AlignmentService.align_precision(
                word_objs,
                diarization_segments
            )

            processed_segments = []

            if not aligned_segments:

                # Alignment failed: attribute the full transcript to the
                # majority-vote speaker (or the first turn's speaker).
                vote = [w.speaker for w in word_objs if w.speaker]

                if vote:
                    raw_spk = Counter(vote).most_common(1)[0][0]
                else:
                    raw_spk = diarization_segments[0].speaker

                label = speaker_map.get(raw_spk, "Speaker 1")

                processed_segments.append(
                    TranscriptSegment(0, duration, label, roles[label], text)
                )

            else:

                for seg in aligned_segments:

                    raw_spk = seg.speaker
                    label = speaker_map.get(raw_spk, "Speaker 1")
                    role = roles.get(label, "KH")

                    processed_segments.append(
                        TranscriptSegment(
                            start=seg.start,
                            end=seg.end,
                            speaker=label,
                            role=role,
                            text=seg.text,
                        )
                    )

        # Merge adjacent same-speaker transcript lines.
        processed_segments = cls._merge_adjacent_segments(
            processed_segments
        )
        processed_segments.sort(key=lambda x: x.start)

        # 8 : Predict emotion segments (customer turns only)
        logger.info("Step 8: Predicting emo per segment ")
        processed_segments = cls._predict_emotion_segments(
            processed_segments,
            y,
            sr
        )

        # Build the customer emotion timeline...
        emotion_timeline = cls.build_emotion_timeline(processed_segments)

        # ...and detect the points where the emotion changes.
        emotion_changes = cls.detect_emotion_changes(emotion_timeline)

        processing_time = time.time() - t0

        # Export payloads for download endpoints.
        txt_content = cls._generate_txt(
            processed_segments,
            len(speakers),
            processing_time,
            duration,
            roles
        )

        csv_content = cls._generate_csv(processed_segments)

        return ProcessingResult(
            segments=processed_segments,
            speaker_count=len(speakers),
            duration=duration,
            processing_time=processing_time,
            speakers=speakers,
            roles=roles,
            txt_content=txt_content,
            csv_content=csv_content,
            emotion_timeline=emotion_timeline,
            emotion_changes=emotion_changes
        )
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
@staticmethod
def _merge_adjacent_segments(
    segments: List[TranscriptSegment],
    max_gap_s: float = 0.8,
    max_segment_duration: float = 9.0
) -> List[TranscriptSegment]:
    """
    Collapse consecutive segments that belong to the same speaking turn.

    Two neighbouring segments are merged when they share the same speaker
    and role, the silence between them is at most ``max_gap_s`` seconds,
    the combined span stays within ``max_segment_duration`` seconds, and
    the later text is not a prefix-overlap repeat of the earlier one.

    Note: merging mutates the earlier segment object in place.
    """
    if not segments:
        return segments

    ordered = sorted(segments, key=lambda s: s.start)
    result = [ordered[0]]

    for current in ordered[1:]:
        last = result[-1]

        same_turn = current.speaker == last.speaker and current.role == last.role
        close_enough = (current.start - last.end) <= max_gap_s
        short_enough = (current.end - last.start) <= max_segment_duration
        is_repeat = overlap_prefix(current.text, last.text)

        if same_turn and close_enough and short_enough and not is_repeat:
            # Extend the previous segment instead of keeping two entries.
            last.text = f"{last.text} {current.text}".strip()
            last.end = max(last.end, current.end)
        else:
            result.append(current)

    return result
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
@staticmethod
def _predict_emotion_segments(
    segments: List[TranscriptSegment],
    audio: np.ndarray,
    sr: int
):
    """
    Attach an emotion label to each segment in place.

    Emotion is predicted only for customer ("KH") turns; all other
    segments get ``emotion = None``. Returns the same list for chaining.
    """
    for segment in segments:
        if segment.role == "KH":
            segment.emotion = EmotionService.predict_segment(
                audio,
                sr,
                segment.start,
                segment.end
            )
        else:
            segment.emotion = None

    return segments
|
| 516 |
+
|
| 517 |
+
@staticmethod
def build_emotion_timeline(segments):
    """
    Build a chronological list of emotion points from processed segments.

    Only customer ("KH") segments that carry a non-empty emotion label
    contribute a point. The emoji icon is looked up from
    ``EmotionService.meta`` with a neutral fallback.
    """
    points = []

    for segment in segments:
        if segment.role != "KH" or not segment.emotion:
            continue

        emoji = EmotionService.meta.get(segment.emotion, {}).get("emoji", "🙂")
        points.append(
            EmotionPoint(
                time=segment.start,
                emotion=segment.emotion,
                icon=emoji
            )
        )

    return points
|
| 541 |
+
|
| 542 |
+
|
| 543 |
+
@staticmethod
def detect_emotion_changes(timeline):
    """
    Detect transitions between consecutive emotion points.

    Walks the timeline in order and records an ``EmotionChange`` whenever
    the label differs from the previous point, stamped with the time of
    the later point and the emojis of both labels.
    """
    changes = []
    previous = None

    for point in timeline:
        if previous is None or previous.emotion == point.emotion:
            previous = point
            continue

        meta = EmotionService.meta
        changes.append(
            EmotionChange(
                time=point.time,
                emotion_from=previous.emotion,
                emotion_to=point.emotion,
                icon_from=meta.get(previous.emotion, {}).get("emoji", "🙂"),
                icon_to=meta.get(point.emotion, {}).get("emoji", "🙂"),
            )
        )
        previous = point

    return changes
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
@classmethod
def _generate_txt(
    cls,
    segments: List[TranscriptSegment],
    speaker_count: int,
    processing_time: float,
    duration: float,
    roles: Dict[str, str],
) -> str:
    """
    Render segments as a plain-text transcript with a metadata header.

    Each transcript line carries a timestamp range, a per-speaker colour
    emoji (assigned in order of first appearance, white circle for
    unknown speakers), the speaker label with its role, and the text.
    """
    ordered = sorted(segments, key=lambda s: s.start)

    # Unique speaker labels, in order of first appearance (falsy skipped).
    seen = []
    for seg in ordered:
        if seg.speaker and seg.speaker not in seen:
            seen.append(seg.speaker)

    icon_pool = ["🔵", "🟢", "🟡", "🟠", "🔴", "🟣"]
    icon_for = {
        name: icon_pool[idx % len(icon_pool)]
        for idx, name in enumerate(seen)
    }

    header = [
        "# Transcription Result",
        f"# Duration: {format_timestamp(duration)}",
        f"# Speakers: {speaker_count}",
        f"# Roles: {roles}",
        f"# Processing time: {processing_time:.1f}s",
        "",
    ]

    body = []
    for seg in ordered:
        span = f"[{format_timestamp(seg.start)} → {format_timestamp(seg.end)}]"
        icon = icon_for.get(seg.speaker, "⚪")
        body.append(f"{span} {icon} [{seg.speaker}|{seg.role or 'UNKNOWN'}] {seg.text}")

    return "\n".join(header + body)
|
| 612 |
+
|
| 613 |
+
@classmethod
def _generate_csv(cls, segments: List[TranscriptSegment]) -> str:
    """
    Serialize segments to CSV text with columns: start, end, speaker, text.

    Times are rounded to millisecond precision (3 decimal places).
    """
    import csv
    from io import StringIO

    buffer = StringIO()
    csv_writer = csv.writer(buffer)
    csv_writer.writerow(["start", "end", "speaker", "text"])
    csv_writer.writerows(
        [round(seg.start, 3), round(seg.end, 3), seg.speaker, seg.text]
        for seg in segments
    )
    return buffer.getvalue()
|
app/services/silero_vad_service.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import librosa
|
| 3 |
+
from typing import List, Tuple
|
| 4 |
+
from silero_vad import load_silero_vad, get_speech_timestamps
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class SileroVADService:
    """
    Thin wrapper around Silero VAD that yields speech spans in seconds.

    The underlying model is loaded lazily and cached at class level.
    """

    # Cached Silero VAD model instance (lazy-loaded by load_model).
    _model = None

    @classmethod
    def load_model(cls):
        """Load the Silero VAD model once and return the cached instance."""
        if cls._model is None:
            cls._model = load_silero_vad()

        return cls._model

    @classmethod
    def get_speech_timestamps(
        cls,
        audio: np.ndarray,
        sr: int
    ) -> List[Tuple[float, float]]:
        """
        Detect speech regions in ``audio`` and return (start, end) pairs in seconds.

        The audio is peak-normalized and resampled to 16 kHz before VAD.
        Spans shorter than 0.25 s are dropped, and spans separated by less
        than 0.15 s of silence are merged.

        Args:
            audio: Mono waveform samples (any dtype; converted to float32).
            sr: Sample rate of ``audio`` in Hz.

        Returns:
            List of (start_sec, end_sec) speech spans in chronological order.
        """
        model = cls.load_model()

        audio = audio.astype(np.float32)

        # Peak-normalize. Fix: compute the peak once (was computed twice)
        # and guard against an empty array, which would make np.max raise.
        peak = float(np.max(np.abs(audio))) if audio.size else 0.0
        if peak > 0:
            audio = audio / peak

        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
            sr = 16000

        # NOTE: this resolves to the module-level silero_vad helper, not
        # this classmethod — method bodies skip class scope during lookup.
        speech = get_speech_timestamps(
            audio,
            model,
            sampling_rate=sr
        )

        # Convert sample indices to seconds, dropping degenerate spans.
        segments = [
            (seg["start"] / sr, seg["end"] / sr)
            for seg in speech
            if seg["end"] > seg["start"]
        ]

        # Drop very short blips unlikely to be real speech.
        MIN_SPEECH_SEC = 0.25
        segments = [
            (s, e) for s, e in segments
            if (e - s) >= MIN_SPEECH_SEC
        ]

        # Merge spans separated by less than MERGE_GAP seconds of silence.
        MERGE_GAP = 0.15
        merged = []

        for s, e in segments:
            if merged and s - merged[-1][1] < MERGE_GAP:
                # Fix: take the max so an overlapping/contained span can
                # never shrink the end of the merged region.
                merged[-1][1] = max(merged[-1][1], e)
            else:
                merged.append([s, e])

        return [(s, e) for s, e in merged]
|
app/services/transcription.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Transcription service using faster-whisper.
|
| 3 |
+
Supports multiple Vietnamese Whisper models with caching.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
from typing import Dict, Optional, List
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from typing import Tuple
|
| 9 |
+
import re
|
| 10 |
+
import librosa
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
from faster_whisper import WhisperModel
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
from app.core.config import get_settings
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
settings = get_settings()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Available Whisper models for Vietnamese.
# Maps a human-readable display name to a Hugging Face CTranslate2 repo id;
# the path is passed straight to faster_whisper.WhisperModel by get_model().
AVAILABLE_MODELS = {
    "EraX-WoW-Turbo": "erax-ai/EraX-WoW-Turbo-V1.1-CT2",
    "PhoWhisper Large": "kiendt/PhoWhisper-large-ct2",
    "PhoWhisper Lora Finetuned": "vyluong/pho-whisper-vi-ct2"

}
|
| 29 |
+
|
| 30 |
+
@dataclass
class WordTimestamp:
    """A single word with a precise timestamp.

    Attributes:
        word: The word text.
        start: Start time in seconds.
        end: End time in seconds.
        speaker: Speaker label, if one has been assigned (None otherwise).
    """
    word: str
    start: float
    end: float
    speaker: Optional[str] = None
|
| 37 |
+
|
| 38 |
+
class TranscriptionService:
    """
    Service for speech-to-text transcription using faster-whisper.

    Models are loaded lazily and cached per (model name, compute type)
    pair in the class-level ``_models`` dict, so repeated calls reuse the
    same WhisperModel instance.
    """

    # Cache of loaded models, keyed by f"{model_name}_{compute_type}".
    _models: Dict[str, WhisperModel] = {}

    @classmethod
    def get_model(cls, model_name: Optional[str] = None) -> WhisperModel:
        """
        Get or load a Whisper model (lazy loading with caching).

        Args:
            model_name: Name of the model from AVAILABLE_MODELS. Defaults to
                ``settings.default_whisper_model``; unknown names fall back
                to the first entry in AVAILABLE_MODELS.

        Returns:
            Loaded WhisperModel instance
        """

        if model_name is None:
            model_name = settings.default_whisper_model

        # Cache key includes the compute type so a settings change loads fresh.
        cache_key = f"{model_name}_{settings.resolved_compute_type}"

        if cache_key in cls._models:
            return cls._models[cache_key]

        # Get model path
        if model_name in AVAILABLE_MODELS:
            model_path = AVAILABLE_MODELS[model_name]
        else:
            # Fallback to first available model
            model_name = list(AVAILABLE_MODELS.keys())[0]
            model_path = AVAILABLE_MODELS[model_name]

        logger.info(f"Loading Whisper model: {model_name} ({model_path})")
        logger.debug(f"Device: {settings.resolved_device}, Compute type: {settings.resolved_compute_type}")

        model = WhisperModel(
            model_path,
            device=settings.resolved_device,
            compute_type=settings.resolved_compute_type,
        )

        cls._models[cache_key] = model
        logger.info(f"Whisper model loaded: {model_name}")

        return model

    @classmethod
    def is_loaded(cls, model_name: Optional[str] = None) -> bool:
        """Check if a model (defaulting to the configured one) is cached."""
        if model_name is None:
            model_name = settings.default_whisper_model

        cache_key = f"{model_name}_{settings.resolved_compute_type}"
        return cache_key in cls._models

    @classmethod
    def preload_model(cls, model_name: Optional[str] = None) -> None:
        """Preload a model during startup; re-raises on load failure."""
        if model_name is None:
            model_name = settings.default_whisper_model
        try:
            cls.get_model(model_name)
        except Exception as e:
            logger.error(f"Failed to preload Whisper model: {e}")
            raise


    @classmethod
    def transcribe_with_words(
        cls,
        audio_array: np.ndarray,
        model_name: Optional[str] = None,
        language: str = "vi",
        vad_options: Optional[dict | bool] = None,
        beam_size: int = 3,
        temperature: float = 0.0,
        best_of: int = 5,
        patience: float = 1.0,
        length_penalty: float = 1.0,
        no_repeat_ngram_size: int = 3,

        # Prompting
        initial_prompt: str = "Hội thoại tổng đài. Chỉ ghi lại đúng lời nói trong audio.",

        prefix_text: Optional[str] = None,

        # Stability / filtering
        condition_on_previous_text: bool = False,
        no_speech_threshold: float = 0.70,
        log_prob_threshold: float = -1.0,
        compression_ratio_threshold: float = 2.4

    ) -> Dict:
        """
        Transcribe audio and return word-level timestamps.

        Args:
            audio_array: Mono waveform samples for faster-whisper.
            model_name: Key from AVAILABLE_MODELS; None uses the default.
            language: ISO language code; "auto" enables language detection.
            vad_options: None/False disables VAD; True uses thresholds from
                settings; a dict is passed through as vad_parameters.
            initial_prompt: Decoder prompt (stripped; empty string -> None).
            prefix_text: Forced decoding prefix (stripped; empty -> None).

        Returns:
            Dict with keys "text" (joined transcript), "words" (list of
            {"word", "start", "end"} dicts), and "info" (faster-whisper
            TranscriptionInfo).
        """
        model = cls.get_model(model_name)

        # Normalize vad_options into (vad_filter, vad_parameters) for
        # faster-whisper. Order matters: the None/False check must come
        # before the `is True` check.
        if vad_options is None or vad_options is False:
            use_vad = False
            vad_parameters = None

        elif vad_options is True:
            use_vad = True
            vad_parameters = {
                "threshold": settings.vad_threshold,
                "min_speech_duration_ms": settings.vad_min_speech_duration_ms,
                "min_silence_duration_ms": settings.vad_min_silence_duration_ms,
            }

        elif isinstance(vad_options, dict):
            use_vad = True
            vad_parameters = vad_options

        else:
            use_vad = False
            vad_parameters = None


        # Blank/whitespace-only prompts are mapped to None so faster-whisper
        # does not receive an empty prompt.
        prompt = (
            initial_prompt.strip()
            if isinstance(initial_prompt, str) and initial_prompt.strip()
            else None
        )

        prefix = (
            prefix_text.strip()
            if isinstance(prefix_text, str) and prefix_text.strip()
            else None
        )

        segments_gen, info = model.transcribe(
            audio_array,
            language=language if language != "auto" else None,

            # decoding
            beam_size=beam_size,
            temperature=temperature,
            best_of=best_of,
            patience=patience,
            length_penalty=length_penalty,
            no_repeat_ngram_size=no_repeat_ngram_size,

            # prompting
            prefix=prefix,

            # QA / Stability
            condition_on_previous_text=condition_on_previous_text,
            no_speech_threshold=no_speech_threshold,
            log_prob_threshold=log_prob_threshold,
            compression_ratio_threshold=compression_ratio_threshold,

            word_timestamps=True,

            # VAD
            vad_filter=use_vad,
            vad_parameters=vad_parameters,
            initial_prompt=prompt,
        )

        # segments_gen is a generator; iterating it performs the decoding.
        words = []
        full_text = []

        for seg in segments_gen:
            if seg.text:
                full_text.append(seg.text.strip())

            if hasattr(seg, "words") and seg.words:
                for w in seg.words:
                    # Skip whitespace-only tokens.
                    if not w.word.strip():
                        continue
                    words.append({
                        "word": w.word.strip(),
                        "start": float(w.start),
                        "end": float(w.end),
                    })

        return {
            "text": " ".join(full_text).strip(),
            "words": words,
            "info": info,
        }


    @classmethod
    async def transcribe_with_words_async(
        cls,
        audio_array: np.ndarray,
        model_name: Optional[str] = None,
        language: str = "vi",
        vad_options: Optional[dict | bool] = None,
        # NOTE(review): async default beam_size=5 differs from the sync
        # method's default of 3, and this wrapper passes it explicitly —
        # confirm the divergence is intentional.
        beam_size: int = 5,
        temperature: float = 0.0,
        best_of: int = 5,
        patience: float = 1.0,
        length_penalty: float = 1.0,
        no_repeat_ngram_size: int = 3,
        initial_prompt: Optional[str] = None,
        prefix_text: Optional[str] = None,
        condition_on_previous_text: bool = False,
        no_speech_threshold: float = 0.70,
        log_prob_threshold: float = -1.0,
        # text repetitive / nonsense
        compression_ratio_threshold: float = 2.4
    ) -> Dict:
        """
        Async wrapper for transcription (runs in thread pool).

        Delegates every argument to ``transcribe_with_words`` via the
        default executor so the event loop is not blocked.
        """
        import asyncio

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            lambda: cls.transcribe_with_words(
                audio_array=audio_array,
                model_name=model_name,
                language=language,
                vad_options=vad_options,
                beam_size=beam_size,
                temperature=temperature,
                best_of=best_of,
                patience=patience,
                length_penalty=length_penalty,
                no_repeat_ngram_size=no_repeat_ngram_size,
                initial_prompt=initial_prompt,
                prefix_text=prefix_text,
                condition_on_previous_text=condition_on_previous_text,
                no_speech_threshold=no_speech_threshold,
                log_prob_threshold=log_prob_threshold,
                compression_ratio_threshold=compression_ratio_threshold

            )
        )

    @classmethod
    def get_available_models(cls) -> Dict[str, str]:


        """Return a copy of the display-name -> repo-id model mapping."""
        return AVAILABLE_MODELS.copy()
|
| 282 |
+
|
| 283 |
+
|
app/services/vocal_separator.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Vocal Separation Service using MDX-Net (via audio-separator).
|
| 3 |
+
Isolates vocals from audio files using state-of-the-art MDX-Net models.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import asyncio
|
| 7 |
+
import logging
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
from app.core.config import get_settings
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
settings = get_settings()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class VocalSeparationError(Exception):
    """Raised when vocal separation yields no usable output files."""
    pass
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class VocalSeparator:
    """
    Service for separating vocals from audio using MDX-Net.
    Uses the audio-separator library which supports UVR models.

    The Separator instance is cached at class level and rebuilt only when
    the configured model name changes.
    """

    # Cached Separator instance and the model name it was built with.
    _separator = None
    _model_name: Optional[str] = None  # fix: was annotated `str` with None default

    @classmethod
    def _get_separator(cls):
        """Lazily build (or rebuild on model change) the audio-separator Separator."""
        if cls._separator is None or cls._model_name != settings.mdx_model:
            from audio_separator.separator import Separator

            logger.debug(f"Initializing MDX-Net separator with model: {settings.mdx_model}")

            # audio-separator expects the output directory to exist.
            settings.processed_dir.mkdir(parents=True, exist_ok=True)

            separator = Separator(
                output_dir=str(settings.processed_dir),
                output_format="WAV",
                normalization_threshold=0.9
            )

            separator.load_model(settings.mdx_model)

            cls._separator = separator
            cls._model_name = settings.mdx_model
            logger.debug(f"MDX-Net model loaded on {settings.resolved_device}")

        return cls._separator

    @classmethod
    async def separate_vocals(cls, input_path: Path) -> Path:
        """
        Separate vocals from an audio file using MDX-Net.

        Falls back to the original file when separation is disabled via
        settings or when separation fails for any reason.

        Args:
            input_path: Path to input audio file

        Returns:
            Path to the separated vocals WAV file (or ``input_path`` on fallback)
        """
        if not settings.enable_vocal_separation:
            logger.debug("Vocal separation disabled, skipping...")
            return input_path

        logger.debug(f"Starting vocal separation for: {input_path.name}")

        try:
            # Run the blocking separation in the default executor.
            # Fix: use get_running_loop() instead of the deprecated
            # get_event_loop() inside a coroutine (consistent with
            # TranscriptionService.transcribe_with_words_async).
            loop = asyncio.get_running_loop()
            vocals_path = await loop.run_in_executor(
                None,
                lambda: cls._run_separation(input_path)
            )

            logger.info(f"Vocal separation complete: {vocals_path.name}")
            return vocals_path

        except Exception as e:
            # Deliberate best-effort: any failure degrades to the original audio.
            logger.error(f"Vocal separation failed: {e}")
            logger.warning("Falling back to original audio.")
            return input_path

    @classmethod
    def _run_separation(cls, input_path: Path) -> Path:
        """Run the actual separation (blocking) and locate the vocals stem."""
        separator = cls._get_separator()

        # separate() returns a list of output filenames — typically one per
        # stem, named like {input_stem}_(Vocals)_{model}.wav.
        output_files = separator.separate(str(input_path))

        vocals_file = None
        for file in output_files:
            if "Vocals" in file:
                vocals_file = settings.processed_dir / file
                break

        if vocals_file is None:
            # Couldn't identify the vocals stem; fall back to the first output
            # or fail loudly when nothing was produced at all.
            logger.warning("Could not identify vocals stem in output files.")
            if output_files:
                vocals_file = settings.processed_dir / output_files[0]
            else:
                raise VocalSeparationError("No output files generated by separator.")

        return vocals_file
|
app/static/css/style.css
ADDED
|
@@ -0,0 +1,673 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* ================================
|
| 2 |
+
PrecisionVoice - Modern Dark Theme
|
| 3 |
+
================================ */
|
| 4 |
+
|
| 5 |
+
:root {
|
| 6 |
+
/* Color Palette */
|
| 7 |
+
--bg-primary: #0a0a0f;
|
| 8 |
+
--bg-secondary: #12121a;
|
| 9 |
+
--bg-card: rgba(255, 255, 255, 0.03);
|
| 10 |
+
--bg-card-hover: rgba(255, 255, 255, 0.05);
|
| 11 |
+
|
| 12 |
+
--text-primary: #ffffff;
|
| 13 |
+
--text-secondary: #a0a0b0;
|
| 14 |
+
--text-muted: #606070;
|
| 15 |
+
|
| 16 |
+
--accent-primary: #6366f1;
|
| 17 |
+
--accent-secondary: #8b5cf6;
|
| 18 |
+
--accent-gradient: linear-gradient(135deg, #6366f1 0%, #8b5cf6 50%, #a855f7 100%);
|
| 19 |
+
|
| 20 |
+
--success: #10b981;
|
| 21 |
+
--error: #ef4444;
|
| 22 |
+
--warning: #f59e0b;
|
| 23 |
+
|
| 24 |
+
--border-color: rgba(255, 255, 255, 0.08);
|
| 25 |
+
--border-glow: rgba(99, 102, 241, 0.3);
|
| 26 |
+
|
| 27 |
+
/* Spacing */
|
| 28 |
+
--spacing-xs: 0.25rem;
|
| 29 |
+
--spacing-sm: 0.5rem;
|
| 30 |
+
--spacing-md: 1rem;
|
| 31 |
+
--spacing-lg: 1.5rem;
|
| 32 |
+
--spacing-xl: 2rem;
|
| 33 |
+
--spacing-2xl: 3rem;
|
| 34 |
+
|
| 35 |
+
/* Border Radius */
|
| 36 |
+
--radius-sm: 0.375rem;
|
| 37 |
+
--radius-md: 0.75rem;
|
| 38 |
+
--radius-lg: 1rem;
|
| 39 |
+
--radius-xl: 1.5rem;
|
| 40 |
+
|
| 41 |
+
/* Shadows */
|
| 42 |
+
--shadow-sm: 0 2px 8px rgba(0, 0, 0, 0.3);
|
| 43 |
+
--shadow-md: 0 4px 16px rgba(0, 0, 0, 0.4);
|
| 44 |
+
--shadow-lg: 0 8px 32px rgba(0, 0, 0, 0.5);
|
| 45 |
+
--shadow-glow: 0 0 40px rgba(99, 102, 241, 0.15);
|
| 46 |
+
|
| 47 |
+
/* Transitions */
|
| 48 |
+
--transition-fast: 0.15s ease;
|
| 49 |
+
--transition-normal: 0.3s ease;
|
| 50 |
+
--transition-slow: 0.5s ease;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
/* ================================
|
| 54 |
+
Base Styles
|
| 55 |
+
================================ */
|
| 56 |
+
|
| 57 |
+
*,
|
| 58 |
+
*::before,
|
| 59 |
+
*::after {
|
| 60 |
+
box-sizing: border-box;
|
| 61 |
+
margin: 0;
|
| 62 |
+
padding: 0;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
html {
|
| 66 |
+
font-size: 16px;
|
| 67 |
+
scroll-behavior: smooth;
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
body {
|
| 71 |
+
font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
| 72 |
+
background: var(--bg-primary);
|
| 73 |
+
color: var(--text-primary);
|
| 74 |
+
line-height: 1.6;
|
| 75 |
+
min-height: 100vh;
|
| 76 |
+
-webkit-font-smoothing: antialiased;
|
| 77 |
+
-moz-osx-font-smoothing: grayscale;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
/* Animated background gradient */
|
| 81 |
+
body::before {
|
| 82 |
+
content: '';
|
| 83 |
+
position: fixed;
|
| 84 |
+
top: 0;
|
| 85 |
+
left: 0;
|
| 86 |
+
right: 0;
|
| 87 |
+
bottom: 0;
|
| 88 |
+
background:
|
| 89 |
+
radial-gradient(ellipse at 20% 20%, rgba(99, 102, 241, 0.08) 0%, transparent 50%),
|
| 90 |
+
radial-gradient(ellipse at 80% 80%, rgba(139, 92, 246, 0.06) 0%, transparent 50%),
|
| 91 |
+
radial-gradient(ellipse at 50% 50%, rgba(168, 85, 247, 0.04) 0%, transparent 70%);
|
| 92 |
+
pointer-events: none;
|
| 93 |
+
z-index: -1;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
/* ================================
|
| 97 |
+
Layout
|
| 98 |
+
================================ */
|
| 99 |
+
|
| 100 |
+
.app-container {
|
| 101 |
+
max-width: 800px;
|
| 102 |
+
margin: 0 auto;
|
| 103 |
+
padding: var(--spacing-lg);
|
| 104 |
+
min-height: 100vh;
|
| 105 |
+
display: flex;
|
| 106 |
+
flex-direction: column;
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
/* ================================
|
| 110 |
+
Header
|
| 111 |
+
================================ */
|
| 112 |
+
|
| 113 |
+
.header {
|
| 114 |
+
text-align: center;
|
| 115 |
+
padding: var(--spacing-2xl) 0;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
.logo {
|
| 119 |
+
display: flex;
|
| 120 |
+
align-items: center;
|
| 121 |
+
justify-content: center;
|
| 122 |
+
gap: var(--spacing-md);
|
| 123 |
+
margin-bottom: var(--spacing-sm);
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
.logo-icon {
|
| 127 |
+
width: 48px;
|
| 128 |
+
height: 48px;
|
| 129 |
+
background: var(--accent-gradient);
|
| 130 |
+
border-radius: var(--radius-lg);
|
| 131 |
+
display: flex;
|
| 132 |
+
align-items: center;
|
| 133 |
+
justify-content: center;
|
| 134 |
+
box-shadow: var(--shadow-glow);
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
.logo-icon svg {
|
| 138 |
+
width: 28px;
|
| 139 |
+
height: 28px;
|
| 140 |
+
color: white;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
.logo h1 {
|
| 144 |
+
font-size: 2rem;
|
| 145 |
+
font-weight: 700;
|
| 146 |
+
background: var(--accent-gradient);
|
| 147 |
+
-webkit-background-clip: text;
|
| 148 |
+
-webkit-text-fill-color: transparent;
|
| 149 |
+
background-clip: text;
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
.tagline {
|
| 153 |
+
color: var(--text-secondary);
|
| 154 |
+
font-size: 1rem;
|
| 155 |
+
font-weight: 400;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
/* ================================
|
| 159 |
+
Cards
|
| 160 |
+
================================ */
|
| 161 |
+
|
| 162 |
+
.card {
|
| 163 |
+
background: var(--bg-card);
|
| 164 |
+
backdrop-filter: blur(20px);
|
| 165 |
+
border: 1px solid var(--border-color);
|
| 166 |
+
border-radius: var(--radius-xl);
|
| 167 |
+
padding: var(--spacing-xl);
|
| 168 |
+
margin-bottom: var(--spacing-lg);
|
| 169 |
+
transition: var(--transition-normal);
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
.card:hover {
|
| 173 |
+
border-color: var(--border-glow);
|
| 174 |
+
box-shadow: var(--shadow-glow);
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
.card-header {
|
| 178 |
+
display: flex;
|
| 179 |
+
align-items: center;
|
| 180 |
+
justify-content: space-between;
|
| 181 |
+
margin-bottom: var(--spacing-lg);
|
| 182 |
+
flex-wrap: wrap;
|
| 183 |
+
gap: var(--spacing-sm);
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
.card-header h2 {
|
| 187 |
+
font-size: 1.25rem;
|
| 188 |
+
font-weight: 600;
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
/* ================================
|
| 192 |
+
Badge
|
| 193 |
+
================================ */
|
| 194 |
+
|
| 195 |
+
.badge {
|
| 196 |
+
display: inline-block;
|
| 197 |
+
padding: var(--spacing-xs) var(--spacing-sm);
|
| 198 |
+
background: rgba(99, 102, 241, 0.15);
|
| 199 |
+
color: var(--accent-primary);
|
| 200 |
+
border-radius: var(--radius-sm);
|
| 201 |
+
font-size: 0.75rem;
|
| 202 |
+
font-weight: 500;
|
| 203 |
+
text-transform: uppercase;
|
| 204 |
+
letter-spacing: 0.5px;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
/* ================================
|
| 208 |
+
Upload Zone
|
| 209 |
+
================================ */
|
| 210 |
+
|
| 211 |
+
.upload-zone {
|
| 212 |
+
border: 2px dashed var(--border-color);
|
| 213 |
+
border-radius: var(--radius-lg);
|
| 214 |
+
padding: var(--spacing-2xl);
|
| 215 |
+
text-align: center;
|
| 216 |
+
cursor: pointer;
|
| 217 |
+
transition: var(--transition-normal);
|
| 218 |
+
margin-bottom: var(--spacing-lg);
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
.upload-zone:hover,
|
| 222 |
+
.upload-zone.dragover {
|
| 223 |
+
border-color: var(--accent-primary);
|
| 224 |
+
background: rgba(99, 102, 241, 0.05);
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
.upload-zone.dragover {
|
| 228 |
+
transform: scale(1.02);
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
.upload-icon {
|
| 232 |
+
width: 64px;
|
| 233 |
+
height: 64px;
|
| 234 |
+
margin: 0 auto var(--spacing-md);
|
| 235 |
+
background: var(--accent-gradient);
|
| 236 |
+
border-radius: 50%;
|
| 237 |
+
display: flex;
|
| 238 |
+
align-items: center;
|
| 239 |
+
justify-content: center;
|
| 240 |
+
opacity: 0.8;
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
.upload-icon svg {
|
| 244 |
+
width: 32px;
|
| 245 |
+
height: 32px;
|
| 246 |
+
color: white;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
.upload-text {
|
| 250 |
+
font-size: 1.125rem;
|
| 251 |
+
font-weight: 500;
|
| 252 |
+
color: var(--text-primary);
|
| 253 |
+
margin-bottom: var(--spacing-xs);
|
| 254 |
+
}
|
| 255 |
+
|
| 256 |
+
.upload-subtext {
|
| 257 |
+
color: var(--text-muted);
|
| 258 |
+
font-size: 0.875rem;
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
/* ================================
|
| 262 |
+
File Info
|
| 263 |
+
================================ */
|
| 264 |
+
|
| 265 |
+
.file-info {
|
| 266 |
+
display: flex;
|
| 267 |
+
align-items: center;
|
| 268 |
+
justify-content: space-between;
|
| 269 |
+
padding: var(--spacing-md);
|
| 270 |
+
background: rgba(99, 102, 241, 0.1);
|
| 271 |
+
border-radius: var(--radius-md);
|
| 272 |
+
margin-bottom: var(--spacing-lg);
|
| 273 |
+
}
|
| 274 |
+
|
| 275 |
+
.file-details {
|
| 276 |
+
display: flex;
|
| 277 |
+
flex-direction: column;
|
| 278 |
+
gap: var(--spacing-xs);
|
| 279 |
+
}
|
| 280 |
+
|
| 281 |
+
.file-name {
|
| 282 |
+
font-weight: 500;
|
| 283 |
+
color: var(--text-primary);
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
.file-size {
|
| 287 |
+
font-size: 0.875rem;
|
| 288 |
+
color: var(--text-secondary);
|
| 289 |
+
}
|
| 290 |
+
|
| 291 |
+
/* ================================
|
| 292 |
+
Buttons
|
| 293 |
+
================================ */
|
| 294 |
+
|
| 295 |
+
.btn {
|
| 296 |
+
display: inline-flex;
|
| 297 |
+
align-items: center;
|
| 298 |
+
justify-content: center;
|
| 299 |
+
gap: var(--spacing-sm);
|
| 300 |
+
padding: var(--spacing-md) var(--spacing-xl);
|
| 301 |
+
border: none;
|
| 302 |
+
border-radius: var(--radius-md);
|
| 303 |
+
font-family: inherit;
|
| 304 |
+
font-size: 1rem;
|
| 305 |
+
font-weight: 500;
|
| 306 |
+
cursor: pointer;
|
| 307 |
+
transition: var(--transition-fast);
|
| 308 |
+
text-decoration: none;
|
| 309 |
+
}
|
| 310 |
+
|
| 311 |
+
.btn:disabled {
|
| 312 |
+
opacity: 0.5;
|
| 313 |
+
cursor: not-allowed;
|
| 314 |
+
}
|
| 315 |
+
|
| 316 |
+
.btn svg {
|
| 317 |
+
width: 20px;
|
| 318 |
+
height: 20px;
|
| 319 |
+
}
|
| 320 |
+
|
| 321 |
+
.btn-primary {
|
| 322 |
+
width: 100%;
|
| 323 |
+
background: var(--accent-gradient);
|
| 324 |
+
color: white;
|
| 325 |
+
box-shadow: var(--shadow-md);
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
.btn-primary:hover:not(:disabled) {
|
| 329 |
+
transform: translateY(-2px);
|
| 330 |
+
box-shadow: var(--shadow-lg), var(--shadow-glow);
|
| 331 |
+
}
|
| 332 |
+
|
| 333 |
+
.btn-primary:active:not(:disabled) {
|
| 334 |
+
transform: translateY(0);
|
| 335 |
+
}
|
| 336 |
+
|
| 337 |
+
.btn-secondary {
|
| 338 |
+
background: var(--bg-card);
|
| 339 |
+
color: var(--text-primary);
|
| 340 |
+
border: 1px solid var(--border-color);
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
.btn-secondary:hover:not(:disabled) {
|
| 344 |
+
background: var(--bg-card-hover);
|
| 345 |
+
border-color: var(--accent-primary);
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
.btn-outline {
    background: transparent;
    color: var(--text-primary);
    border: 1px solid var(--border-color);
    padding: var(--spacing-sm) var(--spacing-md);
}

/* Consistency fix: .btn-primary and .btn-secondary suppress hover styling
   on disabled buttons via :not(:disabled); these two now match. */
.btn-outline:hover:not(:disabled) {
    background: var(--bg-card);
    border-color: var(--accent-primary);
}

/* Small square icon-only button (used to clear the selected file). */
.btn-clear {
    width: 36px;
    height: 36px;
    padding: 0;
    background: transparent;
    color: var(--text-muted);
}

.btn-clear:hover:not(:disabled) {
    color: var(--error);
}
|
| 371 |
+
|
| 372 |
+
/* ================================
|
| 373 |
+
Processing Section
|
| 374 |
+
================================ */
|
| 375 |
+
|
| 376 |
+
.processing-content {
|
| 377 |
+
text-align: center;
|
| 378 |
+
padding: var(--spacing-xl) 0;
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
.spinner {
|
| 382 |
+
width: 56px;
|
| 383 |
+
height: 56px;
|
| 384 |
+
margin: 0 auto var(--spacing-lg);
|
| 385 |
+
border: 3px solid var(--border-color);
|
| 386 |
+
border-top-color: var(--accent-primary);
|
| 387 |
+
border-radius: 50%;
|
| 388 |
+
animation: spin 1s linear infinite;
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
@keyframes spin {
|
| 392 |
+
to {
|
| 393 |
+
transform: rotate(360deg);
|
| 394 |
+
}
|
| 395 |
+
}
|
| 396 |
+
|
| 397 |
+
.processing-content h3 {
|
| 398 |
+
font-size: 1.25rem;
|
| 399 |
+
margin-bottom: var(--spacing-sm);
|
| 400 |
+
}
|
| 401 |
+
|
| 402 |
+
.processing-content p {
|
| 403 |
+
color: var(--text-secondary);
|
| 404 |
+
margin-bottom: var(--spacing-lg);
|
| 405 |
+
}
|
| 406 |
+
|
| 407 |
+
.progress-bar {
|
| 408 |
+
height: 6px;
|
| 409 |
+
background: var(--bg-secondary);
|
| 410 |
+
border-radius: var(--radius-sm);
|
| 411 |
+
overflow: hidden;
|
| 412 |
+
margin-bottom: var(--spacing-md);
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
.progress-fill {
|
| 416 |
+
height: 100%;
|
| 417 |
+
width: 0%;
|
| 418 |
+
background: var(--accent-gradient);
|
| 419 |
+
border-radius: var(--radius-sm);
|
| 420 |
+
transition: width 0.3s ease;
|
| 421 |
+
animation: pulse 2s ease-in-out infinite;
|
| 422 |
+
}
|
| 423 |
+
|
| 424 |
+
@keyframes pulse {
|
| 425 |
+
|
| 426 |
+
0%,
|
| 427 |
+
100% {
|
| 428 |
+
opacity: 1;
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
50% {
|
| 432 |
+
opacity: 0.7;
|
| 433 |
+
}
|
| 434 |
+
}
|
| 435 |
+
|
| 436 |
+
.processing-hint {
|
| 437 |
+
font-size: 0.875rem;
|
| 438 |
+
color: var(--text-muted);
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
.timer-display {
|
| 442 |
+
font-size: 2rem;
|
| 443 |
+
font-weight: 700;
|
| 444 |
+
color: var(--accent-primary);
|
| 445 |
+
margin: var(--spacing-md) 0;
|
| 446 |
+
font-family: monospace;
|
| 447 |
+
text-shadow: 0 0 10px rgba(99, 102, 241, 0.3);
|
| 448 |
+
}
|
| 449 |
+
|
| 450 |
+
/* ================================
|
| 451 |
+
Results Section
|
| 452 |
+
================================ */
|
| 453 |
+
|
| 454 |
+
.result-meta {
|
| 455 |
+
display: flex;
|
| 456 |
+
gap: var(--spacing-sm);
|
| 457 |
+
flex-wrap: wrap;
|
| 458 |
+
}
|
| 459 |
+
|
| 460 |
+
.download-buttons {
|
| 461 |
+
display: flex;
|
| 462 |
+
gap: var(--spacing-md);
|
| 463 |
+
margin-bottom: var(--spacing-lg);
|
| 464 |
+
flex-wrap: wrap;
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
.transcript-container {
|
| 468 |
+
max-height: 400px;
|
| 469 |
+
overflow-y: auto;
|
| 470 |
+
padding-right: var(--spacing-sm);
|
| 471 |
+
margin-bottom: var(--spacing-lg);
|
| 472 |
+
}
|
| 473 |
+
|
| 474 |
+
.transcript-container::-webkit-scrollbar {
|
| 475 |
+
width: 6px;
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
.transcript-container::-webkit-scrollbar-track {
|
| 479 |
+
background: var(--bg-secondary);
|
| 480 |
+
border-radius: var(--radius-sm);
|
| 481 |
+
}
|
| 482 |
+
|
| 483 |
+
.transcript-container::-webkit-scrollbar-thumb {
|
| 484 |
+
background: var(--border-color);
|
| 485 |
+
border-radius: var(--radius-sm);
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
.transcript-container::-webkit-scrollbar-thumb:hover {
|
| 489 |
+
background: var(--text-muted);
|
| 490 |
+
}
|
| 491 |
+
|
| 492 |
+
/* Transcript Segment */
|
| 493 |
+
.segment {
|
| 494 |
+
padding: var(--spacing-md);
|
| 495 |
+
border-radius: var(--radius-md);
|
| 496 |
+
margin-bottom: var(--spacing-sm);
|
| 497 |
+
background: var(--bg-secondary);
|
| 498 |
+
border-left: 3px solid var(--accent-primary);
|
| 499 |
+
transition: var(--transition-fast);
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
.segment:hover {
|
| 503 |
+
background: var(--bg-card-hover);
|
| 504 |
+
}
|
| 505 |
+
|
| 506 |
+
.segment-header {
|
| 507 |
+
display: flex;
|
| 508 |
+
align-items: center;
|
| 509 |
+
gap: var(--spacing-md);
|
| 510 |
+
margin-bottom: var(--spacing-xs);
|
| 511 |
+
flex-wrap: wrap;
|
| 512 |
+
}
|
| 513 |
+
|
| 514 |
+
.segment-speaker {
|
| 515 |
+
font-weight: 600;
|
| 516 |
+
color: var(--accent-primary);
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
.segment-time {
|
| 520 |
+
font-size: 0.75rem;
|
| 521 |
+
color: var(--text-muted);
|
| 522 |
+
font-family: monospace;
|
| 523 |
+
}
|
| 524 |
+
|
| 525 |
+
.segment-text {
|
| 526 |
+
color: var(--text-primary);
|
| 527 |
+
line-height: 1.7;
|
| 528 |
+
}
|
| 529 |
+
|
| 530 |
+
/* Speaker Colors — per-role accent applied to a segment's left border and
   its speaker label. Classes speaker-1..speaker-5 are assigned by app.js in
   order of first appearance; roles beyond the fifth all reuse speaker-5. */
.speaker-1 {
    border-left-color: #6366f1; /* indigo */
}

.speaker-1 .segment-speaker {
    color: #6366f1;
}

.speaker-2 {
    border-left-color: #10b981; /* emerald */
}

.speaker-2 .segment-speaker {
    color: #10b981;
}

.speaker-3 {
    border-left-color: #f59e0b; /* amber */
}

.speaker-3 .segment-speaker {
    color: #f59e0b;
}

.speaker-4 {
    border-left-color: #ec4899; /* pink */
}

.speaker-4 .segment-speaker {
    color: #ec4899;
}

.speaker-5 {
    border-left-color: #8b5cf6; /* violet */
}

.speaker-5 .segment-speaker {
    color: #8b5cf6;
}
|
| 570 |
+
|
| 571 |
+
/* ================================
|
| 572 |
+
Error Section
|
| 573 |
+
================================ */
|
| 574 |
+
|
| 575 |
+
.error-content {
|
| 576 |
+
text-align: center;
|
| 577 |
+
padding: var(--spacing-xl) 0;
|
| 578 |
+
}
|
| 579 |
+
|
| 580 |
+
.error-icon {
|
| 581 |
+
width: 64px;
|
| 582 |
+
height: 64px;
|
| 583 |
+
margin: 0 auto var(--spacing-lg);
|
| 584 |
+
background: rgba(239, 68, 68, 0.15);
|
| 585 |
+
border-radius: 50%;
|
| 586 |
+
display: flex;
|
| 587 |
+
align-items: center;
|
| 588 |
+
justify-content: center;
|
| 589 |
+
}
|
| 590 |
+
|
| 591 |
+
.error-icon svg {
|
| 592 |
+
width: 32px;
|
| 593 |
+
height: 32px;
|
| 594 |
+
color: var(--error);
|
| 595 |
+
}
|
| 596 |
+
|
| 597 |
+
.error-content h3 {
|
| 598 |
+
color: var(--error);
|
| 599 |
+
margin-bottom: var(--spacing-sm);
|
| 600 |
+
}
|
| 601 |
+
|
| 602 |
+
.error-content p {
|
| 603 |
+
color: var(--text-secondary);
|
| 604 |
+
margin-bottom: var(--spacing-lg);
|
| 605 |
+
}
|
| 606 |
+
|
| 607 |
+
/* ================================
|
| 608 |
+
Footer
|
| 609 |
+
================================ */
|
| 610 |
+
|
| 611 |
+
.footer {
|
| 612 |
+
margin-top: auto;
|
| 613 |
+
padding: var(--spacing-xl) 0;
|
| 614 |
+
text-align: center;
|
| 615 |
+
color: var(--text-muted);
|
| 616 |
+
font-size: 0.875rem;
|
| 617 |
+
}
|
| 618 |
+
|
| 619 |
+
.footer strong {
|
| 620 |
+
color: var(--text-secondary);
|
| 621 |
+
}
|
| 622 |
+
|
| 623 |
+
.footer-note {
|
| 624 |
+
margin-top: var(--spacing-xs);
|
| 625 |
+
font-size: 0.75rem;
|
| 626 |
+
}
|
| 627 |
+
|
| 628 |
+
/* ================================
|
| 629 |
+
Utility Classes
|
| 630 |
+
================================ */
|
| 631 |
+
|
| 632 |
+
.hidden {
|
| 633 |
+
display: none !important;
|
| 634 |
+
}
|
| 635 |
+
|
| 636 |
+
/* ================================
|
| 637 |
+
Responsive
|
| 638 |
+
================================ */
|
| 639 |
+
|
| 640 |
+
@media (max-width: 640px) {
|
| 641 |
+
:root {
|
| 642 |
+
font-size: 14px;
|
| 643 |
+
}
|
| 644 |
+
|
| 645 |
+
.app-container {
|
| 646 |
+
padding: var(--spacing-md);
|
| 647 |
+
}
|
| 648 |
+
|
| 649 |
+
.card {
|
| 650 |
+
padding: var(--spacing-lg);
|
| 651 |
+
}
|
| 652 |
+
|
| 653 |
+
.upload-zone {
|
| 654 |
+
padding: var(--spacing-xl);
|
| 655 |
+
}
|
| 656 |
+
|
| 657 |
+
.card-header {
|
| 658 |
+
flex-direction: column;
|
| 659 |
+
align-items: flex-start;
|
| 660 |
+
}
|
| 661 |
+
|
| 662 |
+
.result-meta {
|
| 663 |
+
width: 100%;
|
| 664 |
+
}
|
| 665 |
+
|
| 666 |
+
.download-buttons {
|
| 667 |
+
flex-direction: column;
|
| 668 |
+
}
|
| 669 |
+
|
| 670 |
+
.download-buttons .btn {
|
| 671 |
+
width: 100%;
|
| 672 |
+
}
|
| 673 |
+
}
|
app/static/js/app.js
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * PrecisionVoice - Frontend Application Logic
 * Handles file upload, transcription requests, and result display.
 */

document.addEventListener('DOMContentLoaded', () => {
    // DOM Elements
    // NOTE(review): assumes every ID below exists in index.html; a missing
    // element would surface as a TypeError when its listener is attached.
    const elements = {
        // Upload
        dropZone: document.getElementById('drop-zone'),
        fileInput: document.getElementById('file-input'),
        fileInfo: document.getElementById('file-info'),
        fileName: document.getElementById('file-name'),
        fileSize: document.getElementById('file-size'),
        clearBtn: document.getElementById('clear-btn'),
        transcribeBtn: document.getElementById('transcribe-btn'),

        // Sections (exactly one is visible at a time; see showSection)
        uploadSection: document.getElementById('upload-section'),
        processingSection: document.getElementById('processing-section'),
        resultsSection: document.getElementById('results-section'),
        errorSection: document.getElementById('error-section'),

        // Processing
        processingStatus: document.getElementById('processing-status'),
        progressFill: document.getElementById('progress-fill'),
        processingTimer: document.getElementById('processing-timer'),

        // Results
        speakerCount: document.getElementById('speaker-count'),
        durationInfo: document.getElementById('duration-info'),
        processingTime: document.getElementById('processing-time'),
        transcriptContainer: document.getElementById('transcript-container'),
        downloadTxt: document.getElementById('download-txt'),
        downloadCsv: document.getElementById("download-csv"),
        newUploadBtn: document.getElementById('new-upload-btn'),

        // Error
        errorMessage: document.getElementById('error-message'),
        retryBtn: document.getElementById('retry-btn')
    };

    // Currently chosen File object; null when nothing is selected.
    let selectedFile = null;

    // =====================
    // Event Listeners
    // =====================

    // Click to upload (the drop zone proxies to the hidden file input)
    elements.dropZone.addEventListener('click', () => {
        elements.fileInput.click();
    });

    // File input change
    elements.fileInput.addEventListener('change', (e) => {
        if (e.target.files.length > 0) {
            handleFileSelection(e.target.files[0]);
        }
    });

    // Drag and drop: preventDefault on dragover is required for drop to fire
    elements.dropZone.addEventListener('dragover', (e) => {
        e.preventDefault();
        elements.dropZone.classList.add('dragover');
    });

    elements.dropZone.addEventListener('dragleave', () => {
        elements.dropZone.classList.remove('dragover');
    });

    elements.dropZone.addEventListener('drop', (e) => {
        e.preventDefault();
        elements.dropZone.classList.remove('dragover');

        // Only the first dropped file is used; extras are ignored.
        if (e.dataTransfer.files.length > 0) {
            handleFileSelection(e.dataTransfer.files[0]);
        }
    });

    // Clear file (stopPropagation so the drop zone's click handler
    // does not immediately reopen the file picker)
    elements.clearBtn.addEventListener('click', (e) => {
        e.stopPropagation();
        clearFileSelection();
    });

    // Transcribe button
    elements.transcribeBtn.addEventListener('click', () => {
        if (selectedFile) {
            startTranscription();
        }
    });

    // New upload button
    elements.newUploadBtn.addEventListener('click', resetToUpload);

    // Retry button
    elements.retryBtn.addEventListener('click', resetToUpload);
|
| 98 |
+
|
| 99 |
+
// =====================
|
| 100 |
+
// File Handling
|
| 101 |
+
// =====================
|
| 102 |
+
|
| 103 |
+
function handleFileSelection(file) {
|
| 104 |
+
const allowedTypes = ['audio/mpeg', 'audio/wav', 'audio/x-wav', 'audio/mp4', 'audio/x-m4a',
|
| 105 |
+
'audio/ogg', 'audio/flac', 'audio/webm', 'video/webm'];
|
| 106 |
+
const allowedExtensions = ['mp3', 'wav', 'm4a', 'ogg', 'flac', 'webm'];
|
| 107 |
+
|
| 108 |
+
// Check file extension
|
| 109 |
+
const ext = file.name.split('.').pop().toLowerCase();
|
| 110 |
+
if (!allowedExtensions.includes(ext)) {
|
| 111 |
+
showError(`Unsupported file type: .${ext}. Supported: ${allowedExtensions.join(', ')}`);
|
| 112 |
+
return;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
// Check file size (100MB limit)
|
| 116 |
+
const maxSize = 100 * 1024 * 1024;
|
| 117 |
+
if (file.size > maxSize) {
|
| 118 |
+
showError(`File too large. Maximum size: 100MB`);
|
| 119 |
+
return;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
selectedFile = file;
|
| 123 |
+
|
| 124 |
+
// Update UI
|
| 125 |
+
elements.fileName.textContent = file.name;
|
| 126 |
+
elements.fileSize.textContent = formatFileSize(file.size);
|
| 127 |
+
elements.fileInfo.classList.remove('hidden');
|
| 128 |
+
elements.transcribeBtn.disabled = false;
|
| 129 |
+
|
| 130 |
+
// Hide drop zone text
|
| 131 |
+
elements.dropZone.style.display = 'none';
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
function clearFileSelection() {
|
| 135 |
+
selectedFile = null;
|
| 136 |
+
elements.fileInput.value = '';
|
| 137 |
+
elements.fileInfo.classList.add('hidden');
|
| 138 |
+
elements.transcribeBtn.disabled = true;
|
| 139 |
+
elements.dropZone.style.display = 'block';
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
function formatFileSize(bytes) {
|
| 143 |
+
if (bytes === 0) return '0 Bytes';
|
| 144 |
+
const k = 1024;
|
| 145 |
+
const sizes = ['Bytes', 'KB', 'MB', 'GB'];
|
| 146 |
+
const i = Math.floor(Math.log(bytes) / Math.log(k));
|
| 147 |
+
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
// =====================
|
| 151 |
+
// Transcription
|
| 152 |
+
// =====================
|
| 153 |
+
|
| 154 |
+
async function startTranscription() {
|
| 155 |
+
if (!selectedFile) return;
|
| 156 |
+
|
| 157 |
+
// Show processing UI
|
| 158 |
+
showSection('processing');
|
| 159 |
+
updateProgress(100, 'Processing audio... (Check server logs for details)');
|
| 160 |
+
|
| 161 |
+
// Reset and start timer
|
| 162 |
+
let seconds = 0;
|
| 163 |
+
elements.processingTimer.textContent = '00:00';
|
| 164 |
+
const timerInterval = setInterval(() => {
|
| 165 |
+
seconds++;
|
| 166 |
+
const m = Math.floor(seconds / 60);
|
| 167 |
+
const s = seconds % 60;
|
| 168 |
+
elements.processingTimer.textContent = `${m.toString().padStart(2, '0')}:${s.toString().padStart(2, '0')}`;
|
| 169 |
+
}, 1000);
|
| 170 |
+
|
| 171 |
+
try {
|
| 172 |
+
const formData = new FormData();
|
| 173 |
+
formData.append('file', selectedFile);
|
| 174 |
+
|
| 175 |
+
const response = await fetch('/api/transcribe', {
|
| 176 |
+
method: 'POST',
|
| 177 |
+
body: formData
|
| 178 |
+
});
|
| 179 |
+
|
| 180 |
+
clearInterval(timerInterval);
|
| 181 |
+
|
| 182 |
+
if (!response.ok) {
|
| 183 |
+
const errorData = await response.json();
|
| 184 |
+
throw new Error(errorData.detail || 'Processing failed');
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
const result = await response.json();
|
| 188 |
+
displayResults(result);
|
| 189 |
+
|
| 190 |
+
} catch (error) {
|
| 191 |
+
clearInterval(timerInterval);
|
| 192 |
+
console.error('Processing error:', error);
|
| 193 |
+
showError(error.message || 'An error occurred during processing');
|
| 194 |
+
}
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
function updateProgress(percent, status) {
|
| 198 |
+
elements.progressFill.style.width = `${percent}%`;
|
| 199 |
+
if (status) {
|
| 200 |
+
elements.processingStatus.textContent = status;
|
| 201 |
+
}
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
// =====================
|
| 205 |
+
// Results Display
|
| 206 |
+
// =====================
|
| 207 |
+
|
| 208 |
+
    /**
     * Populate the results section from a /api/transcribe response and show it.
     * Fields read from `result` (all treated as optional): roles (object),
     * duration (seconds), processing_time (seconds), download_txt /
     * download_csv (URLs), segments (array) — schema assumed from usage;
     * TODO confirm against the API's response model.
     * @param {object} result - parsed JSON body of a successful request
     */
    function displayResults(result) {
        // ===== Metadata =====

        // When only ROLE labels are used, a raw speaker_count should not be
        // displayed — show the number of distinct roles instead.
        if (result.roles) {
            const roleCount = Object.keys(result.roles).length;
            elements.speakerCount.textContent = `${roleCount} role${roleCount !== 1 ? 's' : ''}`;
        } else {
            elements.speakerCount.textContent = '';
        }

        elements.durationInfo.textContent = formatDuration(result.duration || 0);
        elements.processingTime.textContent = `${result.processing_time || 0}s`;

        // ===== Download links (only revealed when the server provided a URL) =====
        if (result.download_txt) {
            elements.downloadTxt.href = result.download_txt;
            elements.downloadTxt.style.display = 'inline-block';
        }

        if (result.download_csv) {
            elements.downloadCsv.href = result.download_csv;
            elements.downloadCsv.style.display = 'inline-block';
        }

        // ===== Render transcript =====
        renderTranscript(result.segments || []);

        // ===== Show results =====
        showSection('results');
    }
|
| 239 |
+
|
| 240 |
+
    /**
     * Rebuild the transcript list from segment objects.
     * Each segment may carry: role (string), start/end (seconds), text.
     * Roles get CSS color classes speaker-1..speaker-5 in order of first
     * appearance; every role beyond the fifth reuses speaker-5.
     * @param {Array<object>} segments - transcript segments from the API
     */
    function renderTranscript(segments) {
        elements.transcriptContainer.innerHTML = '';

        // role -> "speaker-N" class, assigned on first sighting of the role.
        const roleColors = {};
        let colorIndex = 0;

        segments.forEach((segment) => {
            const role = segment.role || 'UNKNOWN';

            if (!(role in roleColors)) {
                colorIndex++;
                roleColors[role] = `speaker-${Math.min(colorIndex, 5)}`;
            }

            const segmentEl = document.createElement('div');
            segmentEl.className = `segment ${roleColors[role]}`;

            // Defensive defaults: missing/non-numeric timestamps render as 0.
            const start = typeof segment.start === 'number' ? segment.start : 0;
            const end = typeof segment.end === 'number' ? segment.end : 0;
            const text = segment.text ? escapeHtml(segment.text) : '';

            // innerHTML is safe here: role and text are HTML-escaped above.
            segmentEl.innerHTML = `
            <div class="segment-header">
                <span class="segment-speaker">
                    ${escapeHtml(role)}
                </span>
                <span class="segment-time">
                    ${formatTime(start)} - ${formatTime(end)}
                </span>
            </div>
            <p class="segment-text">${text}</p>
        `;

            elements.transcriptContainer.appendChild(segmentEl);
        });
    }
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
function formatTime(seconds) {
|
| 280 |
+
const h = Math.floor(seconds / 3600);
|
| 281 |
+
const m = Math.floor((seconds % 3600) / 60);
|
| 282 |
+
const s = Math.floor(seconds % 60);
|
| 283 |
+
|
| 284 |
+
if (h > 0) {
|
| 285 |
+
return `${h}:${m.toString().padStart(2, '0')}:${s.toString().padStart(2, '0')}`;
|
| 286 |
+
}
|
| 287 |
+
return `${m}:${s.toString().padStart(2, '0')}`;
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
function formatDuration(seconds) {
|
| 291 |
+
const m = Math.floor(seconds / 60);
|
| 292 |
+
const s = Math.floor(seconds % 60);
|
| 293 |
+
return `${m}:${s.toString().padStart(2, '0')}`;
|
| 294 |
+
}
|
| 295 |
+
|
| 296 |
+
function escapeHtml(text) {
|
| 297 |
+
const div = document.createElement('div');
|
| 298 |
+
div.textContent = text;
|
| 299 |
+
return div.innerHTML;
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
// =====================
|
| 303 |
+
// UI State Management
|
| 304 |
+
// =====================
|
| 305 |
+
|
| 306 |
+
function showSection(section) {
|
| 307 |
+
elements.uploadSection.classList.add('hidden');
|
| 308 |
+
elements.processingSection.classList.add('hidden');
|
| 309 |
+
elements.resultsSection.classList.add('hidden');
|
| 310 |
+
elements.errorSection.classList.add('hidden');
|
| 311 |
+
|
| 312 |
+
switch (section) {
|
| 313 |
+
case 'upload':
|
| 314 |
+
elements.uploadSection.classList.remove('hidden');
|
| 315 |
+
break;
|
| 316 |
+
case 'processing':
|
| 317 |
+
elements.processingSection.classList.remove('hidden');
|
| 318 |
+
break;
|
| 319 |
+
case 'results':
|
| 320 |
+
elements.resultsSection.classList.remove('hidden');
|
| 321 |
+
break;
|
| 322 |
+
case 'error':
|
| 323 |
+
elements.errorSection.classList.remove('hidden');
|
| 324 |
+
break;
|
| 325 |
+
}
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
    /**
     * Put a message into the error panel and switch the UI to it.
     * @param {string} message - user-facing error text
     */
    function showError(message) {
        elements.errorMessage.textContent = message;
        showSection('error');
    }
|
| 332 |
+
|
| 333 |
+
function resetToUpload() {
|
| 334 |
+
clearFileSelection();
|
| 335 |
+
showSection('upload');
|
| 336 |
+
updateProgress(0, 'Uploading file...');
|
| 337 |
+
}
|
| 338 |
+
});
|
app/templates/index.html
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="vi">
|
| 3 |
+
|
| 4 |
+
<head>
|
| 5 |
+
<meta charset="UTF-8">
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 7 |
+
<meta name="description" content="PrecisionVoice - Speech-to-Text and Speaker Diarization powered by AI">
|
| 8 |
+
<title>PrecisionVoice | AI Speech Transcription</title>
|
| 9 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 10 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 11 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
|
| 12 |
+
<link rel="stylesheet" href="/static/css/style.css">
|
| 13 |
+
</head>
|
| 14 |
+
|
| 15 |
+
<body>
|
| 16 |
+
<div class="app-container">
|
| 17 |
+
<!-- Header -->
|
| 18 |
+
<header class="header">
|
| 19 |
+
<div class="logo">
|
| 20 |
+
<div class="logo-icon">
|
| 21 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
| 22 |
+
<path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z" />
|
| 23 |
+
<path d="M19 10v2a7 7 0 0 1-14 0v-2" />
|
| 24 |
+
<line x1="12" y1="19" x2="12" y2="23" />
|
| 25 |
+
<line x1="8" y1="23" x2="16" y2="23" />
|
| 26 |
+
</svg>
|
| 27 |
+
</div>
|
| 28 |
+
<h1>PrecisionVoice</h1>
|
| 29 |
+
</div>
|
| 30 |
+
<p class="tagline">AI-Powered Speech Transcription with Speaker Detection</p>
|
| 31 |
+
</header>
|
| 32 |
+
|
| 33 |
+
<!-- Main Content -->
|
| 34 |
+
<main class="main-content">
|
| 35 |
+
<!-- Upload Section -->
|
| 36 |
+
<section id="upload-section" class="card upload-card">
|
| 37 |
+
<div class="card-header">
|
| 38 |
+
<h2>Upload Audio</h2>
|
| 39 |
+
<span class="badge">Supported: {{ allowed_formats }}</span>
|
| 40 |
+
</div>
|
| 41 |
+
|
| 42 |
+
<div class="upload-zone" id="drop-zone">
|
| 43 |
+
<div class="upload-icon">
|
| 44 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
| 45 |
+
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
|
| 46 |
+
<polyline points="17 8 12 3 7 8" />
|
| 47 |
+
<line x1="12" y1="3" x2="12" y2="15" />
|
| 48 |
+
</svg>
|
| 49 |
+
</div>
|
| 50 |
+
<p class="upload-text">Drag & drop audio file here</p>
|
| 51 |
+
<p class="upload-subtext">or click to browse</p>
|
| 52 |
+
<input type="file" id="file-input" accept=".mp3,.wav,.m4a,.ogg,.flac,.webm" hidden>
|
| 53 |
+
</div>
|
| 54 |
+
|
| 55 |
+
<div id="file-info" class="file-info hidden">
|
| 56 |
+
<div class="file-details">
|
| 57 |
+
<span class="file-name" id="file-name">audio.mp3</span>
|
| 58 |
+
<span class="file-size" id="file-size">0 MB</span>
|
| 59 |
+
</div>
|
| 60 |
+
<button class="btn btn-clear" id="clear-btn" title="Remove file">
|
| 61 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
| 62 |
+
<line x1="18" y1="6" x2="6" y2="18" />
|
| 63 |
+
<line x1="6" y1="6" x2="18" y2="18" />
|
| 64 |
+
</svg>
|
| 65 |
+
</button>
|
| 66 |
+
</div>
|
| 67 |
+
|
| 68 |
+
<button class="btn btn-primary" id="transcribe-btn" disabled>
|
| 69 |
+
<span class="btn-text">Transcribe</span>
|
| 70 |
+
<span class="btn-icon">
|
| 71 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
| 72 |
+
<polygon points="5 3 19 12 5 21 5 3" />
|
| 73 |
+
</svg>
|
| 74 |
+
</span>
|
| 75 |
+
</button>
|
| 76 |
+
</section>
|
| 77 |
+
|
| 78 |
+
<!-- Processing Section -->
|
| 79 |
+
<section id="processing-section" class="card processing-card hidden">
|
| 80 |
+
<div class="processing-content">
|
| 81 |
+
<div class="spinner"></div>
|
| 82 |
+
<h3>Processing Audio</h3>
|
| 83 |
+
<p id="processing-status">Uploading file...</p>
|
| 84 |
+
<div class="progress-bar">
|
| 85 |
+
<div class="progress-fill" id="progress-fill"></div>
|
| 86 |
+
</div>
|
| 87 |
+
<div class="timer-display" id="processing-timer">00:00</div>
|
| 88 |
+
<p class="processing-hint">This may take a few minutes depending on audio length</p>
|
| 89 |
+
</div>
|
| 90 |
+
</section>
|
| 91 |
+
|
| 92 |
+
<!-- Results Section -->
|
| 93 |
+
<section id="results-section" class="card results-card hidden">
|
| 94 |
+
<div class="card-header">
|
| 95 |
+
<h2>Transcription Results</h2>
|
| 96 |
+
<div class="result-meta">
|
| 97 |
+
<span id="speaker-count" class="badge">0 speakers</span>
|
| 98 |
+
<span id="duration-info" class="badge">0:00</span>
|
| 99 |
+
<span id="processing-time" class="badge">0.0s</span>
|
| 100 |
+
</div>
|
| 101 |
+
</div>
|
| 102 |
+
|
| 103 |
+
<div class="download-buttons">
|
| 104 |
+
<a href="#" id="download-txt" class="btn btn-outline" download>
|
| 105 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
| 106 |
+
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
|
| 107 |
+
<polyline points="7 10 12 15 17 10" />
|
| 108 |
+
<line x1="12" y1="15" x2="12" y2="3" />
|
| 109 |
+
</svg>
|
| 110 |
+
Download TXT
|
| 111 |
+
</a>
|
| 112 |
+
<a href="#" id="download-csv" class="btn btn-outline" download>
|
| 113 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
| 114 |
+
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4" />
|
| 115 |
+
<polyline points="7 10 12 15 17 10" />
|
| 116 |
+
<line x1="12" y1="15" x2="12" y2="3" />
|
| 117 |
+
</svg>
|
| 118 |
+
Download CSV
|
| 119 |
+
</a>
|
| 120 |
+
</div>
|
| 121 |
+
|
| 122 |
+
<div class="transcript-container" id="transcript-container">
|
| 123 |
+
<!-- Transcript segments will be rendered here -->
|
| 124 |
+
</div>
|
| 125 |
+
|
| 126 |
+
<button class="btn btn-secondary" id="new-upload-btn">
|
| 127 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
| 128 |
+
<polyline points="1 4 1 10 7 10" />
|
| 129 |
+
<path d="M3.51 15a9 9 0 1 0 2.13-9.36L1 10" />
|
| 130 |
+
</svg>
|
| 131 |
+
New Transcription
|
| 132 |
+
</button>
|
| 133 |
+
</section>
|
| 134 |
+
|
| 135 |
+
<!-- Error Section -->
|
| 136 |
+
<section id="error-section" class="card error-card hidden">
|
| 137 |
+
<div class="error-content">
|
| 138 |
+
<div class="error-icon">
|
| 139 |
+
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
| 140 |
+
<circle cx="12" cy="12" r="10" />
|
| 141 |
+
<line x1="15" y1="9" x2="9" y2="15" />
|
| 142 |
+
<line x1="9" y1="9" x2="15" y2="15" />
|
| 143 |
+
</svg>
|
| 144 |
+
</div>
|
| 145 |
+
<h3>Error</h3>
|
| 146 |
+
<p id="error-message">An error occurred during processing.</p>
|
| 147 |
+
<button class="btn btn-secondary" id="retry-btn">Try Again</button>
|
| 148 |
+
</div>
|
| 149 |
+
</section>
|
| 150 |
+
</main>
|
| 151 |
+
|
| 152 |
+
<!-- Footer -->
|
| 153 |
+
<footer class="footer">
|
| 154 |
+
<p>Powered by <strong>faster-whisper</strong> & <strong>pyannote.audio</strong></p>
|
| 155 |
+
<p class="footer-note">Max file size: {{ max_upload_mb }}MB</p>
|
| 156 |
+
</footer>
|
| 157 |
+
</div>
|
| 158 |
+
|
| 159 |
+
<script src="/static/js/app.js"></script>
|
| 160 |
+
</body>
|
| 161 |
+
|
| 162 |
+
</html>
|
data/processed/.gitkeep
ADDED
|
File without changes
|
data/uploads/.gitkeep
ADDED
|
File without changes
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
app:
|
| 3 |
+
build:
|
| 4 |
+
context: .
|
| 5 |
+
dockerfile: Dockerfile
|
| 6 |
+
args:
|
| 7 |
+
- PORT=${PORT:-7860}
|
| 8 |
+
container_name: precisionvoice
|
| 9 |
+
ports:
|
| 10 |
+
- "${PORT:-7860}:${PORT:-7860}"
|
| 11 |
+
volumes:
|
| 12 |
+
# Persist uploaded/processed files
|
| 13 |
+
- ./data:/app/data
|
| 14 |
+
# Cache models to avoid re-downloading
|
| 15 |
+
- model_cache_hf:/root/.cache/huggingface
|
| 16 |
+
- model_cache_torch:/root/.cache/torch
|
| 17 |
+
- model_cache_mdx:/root/.audio-separator-models
|
| 18 |
+
environment:
|
| 19 |
+
# HuggingFace token (required for pyannote.audio)
|
| 20 |
+
- HF_TOKEN=${HF_TOKEN:-}
|
| 21 |
+
# Model settings
|
| 22 |
+
- WHISPER_MODEL=${WHISPER_MODEL:-erax-ai/EraX-WoW-Turbo-V1.1-CT2}
|
| 23 |
+
- DIARIZATION_MODEL=${DIARIZATION_MODEL:-pyannote/speaker-diarization-3.1}
|
| 24 |
+
# Device (auto, cuda, cpu)
|
| 25 |
+
- DEVICE=${DEVICE:-auto}
|
| 26 |
+
# Speech Enhancement (SpeechBrain SepFormer)
|
| 27 |
+
- ENABLE_SPEECH_ENHANCEMENT=${ENABLE_SPEECH_ENHANCEMENT:-True}
|
| 28 |
+
- ENHANCEMENT_MODEL=${ENHANCEMENT_MODEL:-speechbrain/sepformer-dns4-16k-enhancement}
|
| 29 |
+
# MDX-Net Vocal Separation
|
| 30 |
+
- ENABLE_VOCAL_SEPARATION=${ENABLE_VOCAL_SEPARATION:-True}
|
| 31 |
+
- MDX_MODEL=${MDX_MODEL:-UVR-MDX-NET-Voc_FT}
|
| 32 |
+
# Upload settings
|
| 33 |
+
- MAX_UPLOAD_SIZE_MB=${MAX_UPLOAD_SIZE_MB:-100}
|
| 34 |
+
# Optimization settings
|
| 35 |
+
- ENABLE_LOUDNORM=${ENABLE_LOUDNORM:-True}
|
| 36 |
+
- ENABLE_NOISE_REDUCTION=${ENABLE_NOISE_REDUCTION:-True}
|
| 37 |
+
# VAD settings
|
| 38 |
+
- VAD_THRESHOLD=${VAD_THRESHOLD:-0.5}
|
| 39 |
+
- VAD_MIN_SPEECH_DURATION_MS=${VAD_MIN_SPEECH_DURATION_MS:-250}
|
| 40 |
+
- VAD_MIN_SILENCE_DURATION_MS=${VAD_MIN_SILENCE_DURATION_MS:-500}
|
| 41 |
+
# Clustering settings
|
| 42 |
+
- MERGE_THRESHOLD_S=${MERGE_THRESHOLD_S:-0.5}
|
| 43 |
+
- MIN_SEGMENT_DURATION_S=${MIN_SEGMENT_DURATION_S:-0.3}
|
| 44 |
+
restart: unless-stopped
|
| 45 |
+
# GPU support (uncomment for NVIDIA GPU)
|
| 46 |
+
# deploy:
|
| 47 |
+
# resources:
|
| 48 |
+
# reservations:
|
| 49 |
+
# devices:
|
| 50 |
+
# - driver: nvidia
|
| 51 |
+
# count: all
|
| 52 |
+
# capabilities: [gpu]
|
| 53 |
+
|
| 54 |
+
volumes:
|
| 55 |
+
model_cache_hf:
|
| 56 |
+
name: precisionvoice_hf_cache
|
| 57 |
+
model_cache_torch:
|
| 58 |
+
name: precisionvoice_torch_cache
|
| 59 |
+
model_cache_mdx:
|
| 60 |
+
name: precisionvoice_mdx_cache
|
docker/.gitkeep
ADDED
|
File without changes
|
precision_voice_eval_ASR.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
precision_voice_simple.ipynb
ADDED
|
@@ -0,0 +1,672 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# 🎙️ PrecisionVoice - Vietnamese Speech-to-Text\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"Notebook đơn giản để transcribe audio tiếng Việt sử dụng **faster-whisper** và **pyannote** (diarization).\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"### Hướng dẫn\n",
|
| 12 |
+
"1. **Chọn GPU**: `Runtime` → `Change runtime type` → **T4 GPU**\n",
|
| 13 |
+
"2. **Cài đặt Secrets**: Thêm `HF_TOKEN` vào Colab Secrets (Key icon bên trái) để dùng Pyannote.\n",
|
| 14 |
+
"3. **Chạy từng cell** theo thứ tự từ trên xuống\n",
|
| 15 |
+
"4. **Sử dụng Gradio link** ở cell cuối để truy cập UI"
|
| 16 |
+
]
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
"cell_type": "code",
|
| 20 |
+
"execution_count": null,
|
| 21 |
+
"metadata": {},
|
| 22 |
+
"outputs": [],
|
| 23 |
+
"source": [
|
| 24 |
+
"# @title 1. 🔍 Kiểm tra GPU\n",
|
| 25 |
+
"import torch\n",
|
| 26 |
+
"\n",
|
| 27 |
+
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
| 28 |
+
"if device == \"cuda\":\n",
|
| 29 |
+
" gpu_name = torch.cuda.get_device_name(0)\n",
|
| 30 |
+
" gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9\n",
|
| 31 |
+
" print(f\"✅ GPU Detected: {gpu_name}\")\n",
|
| 32 |
+
" print(f\" VRAM: {gpu_mem:.1f} GB\")\n",
|
| 33 |
+
"else:\n",
|
| 34 |
+
" print(\"⚠️ KHÔNG TÌM THẤY GPU!\")\n",
|
| 35 |
+
" print(\"👉 Vào Runtime → Change runtime type → T4 GPU\")"
|
| 36 |
+
]
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"cell_type": "code",
|
| 40 |
+
"execution_count": null,
|
| 41 |
+
"metadata": {},
|
| 42 |
+
"outputs": [],
|
| 43 |
+
"source": [
|
| 44 |
+
"# @title 2. 📦 Cài đặt Dependencies\n",
|
| 45 |
+
"print(\"Installing dependencies...\")\n",
|
| 46 |
+
"!pip install --upgrade torch torchvision torchaudio \"pyannote.audio>=3.3.1\" faster-whisper gradio librosa nest_asyncio lightning torchmetrics\n",
|
| 47 |
+
"!apt-get install -y -qq ffmpeg > /dev/null 2>&1\n",
|
| 48 |
+
"print(\"✅ Dependencies installed successfully!\")"
|
| 49 |
+
]
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"cell_type": "code",
|
| 53 |
+
"execution_count": null,
|
| 54 |
+
"metadata": {},
|
| 55 |
+
"outputs": [],
|
| 56 |
+
"source": [
|
| 57 |
+
"# @title 3. 🤖 Load Models (Whisper & Pyannote)\n",
|
| 58 |
+
"import torch\n",
|
| 59 |
+
"import time\n",
|
| 60 |
+
"import os\n",
|
| 61 |
+
"import librosa\n",
|
| 62 |
+
"import numpy as np\n",
|
| 63 |
+
"from google.colab import userdata\n",
|
| 64 |
+
"from faster_whisper import WhisperModel\n",
|
| 65 |
+
"from pyannote.audio import Pipeline\n",
|
| 66 |
+
"\n",
|
| 67 |
+
"try:\n",
|
| 68 |
+
" from pyannote.audio.core.task import Specifications, Problem, Resolution\n",
|
| 69 |
+
" torch.serialization.add_safe_globals([Specifications, Problem, Resolution])\n",
|
| 70 |
+
"except Exception as e:\n",
|
| 71 |
+
" print(f\"Could not add custom globals: {e}\")\n",
|
| 72 |
+
"\n",
|
| 73 |
+
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
| 74 |
+
"compute_type = \"float16\" if device == \"cuda\" else \"int8\"\n",
|
| 75 |
+
"\n",
|
| 76 |
+
"# Danh sách các model Whisper hỗ trợ\n",
|
| 77 |
+
"AVAILABLE_MODELS = {\n",
|
| 78 |
+
" \"EraX-WoW-Turbo (Whisper Large V3 Turbo - Tiếng Việt)\": \"erax-ai/EraX-WoW-Turbo-V1.1-CT2\",\n",
|
| 79 |
+
" \"PhoWhisper Large (Tiếng Việt)\": \"kiendt/PhoWhisper-large-ct2\"\n",
|
| 80 |
+
"}\n",
|
| 81 |
+
"\n",
|
| 82 |
+
"# Cache models\n",
|
| 83 |
+
"loaded_whisper_models = {}\n",
|
| 84 |
+
"diarization_pipeline = None\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"# Lấy HF_TOKEN\n",
|
| 87 |
+
"try:\n",
|
| 88 |
+
" hf_token = userdata.get('HF_TOKEN')\n",
|
| 89 |
+
"except:\n",
|
| 90 |
+
" hf_token = os.environ.get('HF_TOKEN')\n",
|
| 91 |
+
"\n",
|
| 92 |
+
"# ==================== LOAD ALL WHISPER MODELS ====================\n",
|
| 93 |
+
"print(\"=\"*50)\n",
|
| 94 |
+
"print(\"🔄 Pre-downloading ALL Whisper Models...\")\n",
|
| 95 |
+
"print(\"=\"*50)\n",
|
| 96 |
+
"\n",
|
| 97 |
+
"total_start = time.time()\n",
|
| 98 |
+
"for model_name, model_path in AVAILABLE_MODELS.items():\n",
|
| 99 |
+
" print(f\"\\n📥 Loading: {model_name}\")\n",
|
| 100 |
+
" start = time.time()\n",
|
| 101 |
+
" try:\n",
|
| 102 |
+
" model = WhisperModel(\n",
|
| 103 |
+
" model_path,\n",
|
| 104 |
+
" device=device,\n",
|
| 105 |
+
" compute_type=compute_type\n",
|
| 106 |
+
" )\n",
|
| 107 |
+
" loaded_whisper_models[f\"{model_name}_{compute_type}\"] = model\n",
|
| 108 |
+
" print(f\" ✅ Loaded in {time.time() - start:.1f}s\")\n",
|
| 109 |
+
" except Exception as e:\n",
|
| 110 |
+
" print(f\" ❌ Failed to load: {e}\")\n",
|
| 111 |
+
"\n",
|
| 112 |
+
"print(f\"\\n✅ All models loaded in {time.time() - total_start:.1f}s\")\n",
|
| 113 |
+
"print(f\" Total models: {len(loaded_whisper_models)}\")\n",
|
| 114 |
+
"print(f\" Device: {device}, Compute: {compute_type}\")\n",
|
| 115 |
+
"\n",
|
| 116 |
+
"# ==================== LOAD PYANNOTE ====================\n",
|
| 117 |
+
"print(\"\\n\" + \"=\"*50)\n",
|
| 118 |
+
"print(\"🔄 Loading Pyannote Diarization...\")\n",
|
| 119 |
+
"print(\"=\"*50)\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"if not hf_token:\n",
|
| 122 |
+
" print(\"⚠️ WARNING: HF_TOKEN not found!\")\n",
|
| 123 |
+
" print(\" Diarization will be disabled.\")\n",
|
| 124 |
+
" print(\" Please set HF_TOKEN in Colab Secrets.\")\n",
|
| 125 |
+
"else:\n",
|
| 126 |
+
" start = time.time()\n",
|
| 127 |
+
" try:\n",
|
| 128 |
+
" diarization_pipeline = Pipeline.from_pretrained(\n",
|
| 129 |
+
" \"pyannote/speaker-diarization-community-1\",\n",
|
| 130 |
+
" token=hf_token\n",
|
| 131 |
+
" )\n",
|
| 132 |
+
" diarization_pipeline.to(torch.device(device))\n",
|
| 133 |
+
" print(f\"✅ Pyannote loaded in {time.time() - start:.1f}s\")\n",
|
| 134 |
+
" except Exception as e:\n",
|
| 135 |
+
" print(f\"❌ Failed to load Pyannote: {e}\")\n",
|
| 136 |
+
"\n",
|
| 137 |
+
"print(\"\\n\" + \"=\"*50)\n",
|
| 138 |
+
"print(\"🎉 All models loaded successfully!\")\n",
|
| 139 |
+
"print(\"=\"*50)\n"
|
| 140 |
+
]
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"cell_type": "code",
|
| 144 |
+
"execution_count": null,
|
| 145 |
+
"metadata": {},
|
| 146 |
+
"outputs": [],
|
| 147 |
+
"source": [
|
| 148 |
+
"# @title 4. 🛠️ Utilities & Helpers\n",
|
| 149 |
+
"import gradio as gr\n",
|
| 150 |
+
"import time\n",
|
| 151 |
+
"import nest_asyncio\n",
|
| 152 |
+
"import subprocess\n",
|
| 153 |
+
"import os\n",
|
| 154 |
+
"\n",
|
| 155 |
+
"nest_asyncio.apply()\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"def convert_audio_to_wav(audio_path):\n",
|
| 158 |
+
" \"\"\"Chuẩn hóa audio về định dạng WAV 16kHz Mono.\"\"\"\n",
|
| 159 |
+
" try:\n",
|
| 160 |
+
" # Tạo file tạm\n",
|
| 161 |
+
" output_path = \"temp_processed_audio.wav\"\n",
|
| 162 |
+
" \n",
|
| 163 |
+
" # Xóa file cũ nếu tồn tại\n",
|
| 164 |
+
" if os.path.exists(output_path):\n",
|
| 165 |
+
" os.remove(output_path)\n",
|
| 166 |
+
" \n",
|
| 167 |
+
" # Command line ffmpeg\n",
|
| 168 |
+
" # -i input: file đầu vào\n",
|
| 169 |
+
" # -ar 16000: Sample rate 16k\n",
|
| 170 |
+
" # -ac 1: Mono channel (Pyannote tốt nhất với mono)\n",
|
| 171 |
+
" # -y: Overwrite output\n",
|
| 172 |
+
" command = [\n",
|
| 173 |
+
" \"ffmpeg\", \n",
|
| 174 |
+
" \"-i\", audio_path,\n",
|
| 175 |
+
" \"-ar\", \"16000\",\n",
|
| 176 |
+
" \"-ac\", \"1\",\n",
|
| 177 |
+
" \"-y\",\n",
|
| 178 |
+
" output_path\n",
|
| 179 |
+
" ]\n",
|
| 180 |
+
" \n",
|
| 181 |
+
" subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)\n",
|
| 182 |
+
" return output_path\n",
|
| 183 |
+
" except Exception as e:\n",
|
| 184 |
+
" print(f\"Error converting audio: {e}\")\n",
|
| 185 |
+
" # Fallback: Trả về file gốc nếu convert lỗi (dù rủi ro)\n",
|
| 186 |
+
" return audio_path\n",
|
| 187 |
+
"\n",
|
| 188 |
+
"def load_whisper_model(model_name, comp_type):\n",
|
| 189 |
+
" \"\"\"Dynamic load Whisper model với cache\"\"\"\n",
|
| 190 |
+
" global loaded_whisper_models\n",
|
| 191 |
+
" cache_key = f\"{model_name}_{comp_type}\"\n",
|
| 192 |
+
" \n",
|
| 193 |
+
" if cache_key in loaded_whisper_models:\n",
|
| 194 |
+
" return loaded_whisper_models[cache_key]\n",
|
| 195 |
+
" \n",
|
| 196 |
+
" model_path = AVAILABLE_MODELS[model_name]\n",
|
| 197 |
+
" print(f\"Loading {model_name}...\")\n",
|
| 198 |
+
" start = time.time()\n",
|
| 199 |
+
" \n",
|
| 200 |
+
" model = WhisperModel(\n",
|
| 201 |
+
" model_path,\n",
|
| 202 |
+
" device=device,\n",
|
| 203 |
+
" compute_type=comp_type\n",
|
| 204 |
+
" )\n",
|
| 205 |
+
" \n",
|
| 206 |
+
" loaded_whisper_models[cache_key] = model\n",
|
| 207 |
+
" print(f\"✅ Loaded in {time.time() - start:.1f}s\")\n",
|
| 208 |
+
" return model\n",
|
| 209 |
+
"\n",
|
| 210 |
+
"def format_timestamp(seconds):\n",
|
| 211 |
+
" \"\"\"Format seconds to MM:SS.ms\"\"\"\n",
|
| 212 |
+
" hours = int(seconds // 3600)\n",
|
| 213 |
+
" minutes = int((seconds % 3600) // 60)\n",
|
| 214 |
+
" secs = seconds % 60\n",
|
| 215 |
+
" if hours > 0:\n",
|
| 216 |
+
" return f\"{hours:02d}:{minutes:02d}:{secs:05.2f}\"\n",
|
| 217 |
+
" return f\"{minutes:02d}:{secs:05.2f}\"\n",
|
| 218 |
+
"\n",
|
| 219 |
+
"def assign_speaker_to_segment(seg_start, seg_end, diarization_result):\n",
|
| 220 |
+
" \"\"\"Gán speaker cho segment dựa trên tỷ lệ overlap >= 30%.\"\"\"\n",
|
| 221 |
+
" if diarization_result is None:\n",
|
| 222 |
+
" return \"SPEAKER_00\"\n",
|
| 223 |
+
" \n",
|
| 224 |
+
" seg_duration = seg_end - seg_start\n",
|
| 225 |
+
" if seg_duration <= 0:\n",
|
| 226 |
+
" return \"SPEAKER_00\"\n",
|
| 227 |
+
" \n",
|
| 228 |
+
" speaker_overlaps = {}\n",
|
| 229 |
+
" \n",
|
| 230 |
+
" for turn, _, speaker in diarization_result.speaker_diarization.itertracks(yield_label=True):\n",
|
| 231 |
+
" overlap_start = max(seg_start, turn.start)\n",
|
| 232 |
+
" overlap_end = min(seg_end, turn.end)\n",
|
| 233 |
+
" overlap = max(0, overlap_end - overlap_start)\n",
|
| 234 |
+
" \n",
|
| 235 |
+
" if overlap > 0:\n",
|
| 236 |
+
" if speaker not in speaker_overlaps:\n",
|
| 237 |
+
" speaker_overlaps[speaker] = 0\n",
|
| 238 |
+
" speaker_overlaps[speaker] += overlap\n",
|
| 239 |
+
" \n",
|
| 240 |
+
" if not speaker_overlaps:\n",
|
| 241 |
+
" return \"SPEAKER_00\"\n",
|
| 242 |
+
" \n",
|
| 243 |
+
" best_speaker = max(speaker_overlaps, key=speaker_overlaps.get)\n",
|
| 244 |
+
" best_overlap = speaker_overlaps[best_speaker]\n",
|
| 245 |
+
" \n",
|
| 246 |
+
" if best_overlap / seg_duration >= 0.3:\n",
|
| 247 |
+
" return best_speaker\n",
|
| 248 |
+
" \n",
|
| 249 |
+
" return \"SPEAKER_00\"\n",
|
| 250 |
+
"\n",
|
| 251 |
+
"def merge_consecutive_segments(segments, max_gap=0.5):\n",
|
| 252 |
+
" \"\"\"Gộp các segment liên tiếp của cùng một speaker.\"\"\"\n",
|
| 253 |
+
" if not segments:\n",
|
| 254 |
+
" return []\n",
|
| 255 |
+
" \n",
|
| 256 |
+
" merged = []\n",
|
| 257 |
+
" current = segments[0].copy()\n",
|
| 258 |
+
" \n",
|
| 259 |
+
" for seg in segments[1:]:\n",
|
| 260 |
+
" if seg['speaker'] == current['speaker'] and (seg['start'] - current['end']) <= max_gap:\n",
|
| 261 |
+
" current['end'] = seg['end']\n",
|
| 262 |
+
" current['text'] += ' ' + seg['text']\n",
|
| 263 |
+
" else:\n",
|
| 264 |
+
" merged.append(current)\n",
|
| 265 |
+
" current = seg.copy()\n",
|
| 266 |
+
" \n",
|
| 267 |
+
" merged.append(current)\n",
|
| 268 |
+
" return merged"
|
| 269 |
+
]
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"cell_type": "code",
|
| 273 |
+
"execution_count": null,
|
| 274 |
+
"metadata": {},
|
| 275 |
+
"outputs": [],
|
| 276 |
+
"source": [
|
| 277 |
+
"# @title 5. ⚙️ Processing Logic\n",
|
| 278 |
+
"def process_audio(audio_path, model_name, language, beam_size, vad_filter, vad_min_silence, vad_speech_pad, vad_min_speech, vad_threshold, temperature, best_of, patience, length_penalty, initial_prompt, prefix, condition_on_previous_text, no_speech_threshold, log_prob_threshold, compression_ratio_threshold, comp_type, merge_segs, p=gr.Progress()):\n",
|
| 279 |
+
" \"\"\"\n",
|
| 280 |
+
" Quy trình mới:\n",
|
| 281 |
+
" 0. Chuẩn hóa audio (convert mp3 -> wav 16k).\n",
|
| 282 |
+
" 1. Diarization để tách các đoạn của từng người nói.\n",
|
| 283 |
+
" 2. Cắt audio theo các đoạn này.\n",
|
| 284 |
+
" 3. Transcribe từng đoạn audio.\n",
|
| 285 |
+
" 4. Gộp kết quả.\n",
|
| 286 |
+
" \"\"\"\n",
|
| 287 |
+
" if audio_path is None:\n",
|
| 288 |
+
" msg = \"⚠️ Vui lòng upload hoặc ghi âm audio!\"\n",
|
| 289 |
+
" return msg, msg\n",
|
| 290 |
+
" \n",
|
| 291 |
+
" total_start_time = time.time()\n",
|
| 292 |
+
" \n",
|
| 293 |
+
" # Check Pyannote\n",
|
| 294 |
+
" if diarization_pipeline is None:\n",
|
| 295 |
+
" return \"❌ Lỗi: Chưa load được Pyannote (kiểm tra HF_TOKEN).\", \"❌ Lỗi: Chưa load được Pyannote.\"\n",
|
| 296 |
+
"\n",
|
| 297 |
+
" # 0. Preprocessing Audio (Standardize)\n",
|
| 298 |
+
" p(0.05, desc=\"Đang chuẩn hóa audio (16kHz WAV)...\")\n",
|
| 299 |
+
" try:\n",
|
| 300 |
+
" # Luôn convert về wav 16k mono để tránh lỗi sample rate mismatch của Pyannote\n",
|
| 301 |
+
" clean_audio_path = convert_audio_to_wav(audio_path)\n",
|
| 302 |
+
" except Exception as e:\n",
|
| 303 |
+
" msg = f\"❌ Lỗi convert audio: {e}\"\n",
|
| 304 |
+
" return msg, msg\n",
|
| 305 |
+
" \n",
|
| 306 |
+
" # 1. Load Standardized Audio for slicing later\n",
|
| 307 |
+
" p(0.08, desc=\"Đang đọc file audio...\")\n",
|
| 308 |
+
" try:\n",
|
| 309 |
+
" y, sr = librosa.load(clean_audio_path, sr=16000)\n",
|
| 310 |
+
" # sr should be 16000 now exactly\n",
|
| 311 |
+
" except Exception as e:\n",
|
| 312 |
+
" return f\"❌ Lỗi đọc audio: {e}\", f\"❌ Lỗi đọc audio: {e}\"\n",
|
| 313 |
+
"\n",
|
| 314 |
+
" # 2. DIARIZATION\n",
|
| 315 |
+
" p(0.1, desc=\"Đang phân tách người nói (Diarization)...\")\n",
|
| 316 |
+
" \n",
|
| 317 |
+
" try:\n",
|
| 318 |
+
" # Sử dụng file đã chuẩn hóa\n",
|
| 319 |
+
" diarization = diarization_pipeline(clean_audio_path)\n",
|
| 320 |
+
" except Exception as e:\n",
|
| 321 |
+
" return f\"❌ Lỗi Diarization: {e}\", f\"❌ Lỗi Diarization: {e}\"\n",
|
| 322 |
+
" \n",
|
| 323 |
+
" diarization_segments = []\n",
|
| 324 |
+
" # Dùng cách user đã fix trước đó (nếu model trả về object khác)\n",
|
| 325 |
+
" # Mặc định pipeline community trả về Annotation trực tiếp, nhưng user fix thành diarization.speaker_diarization\n",
|
| 326 |
+
" # Mình sẽ try/except để support cả 2 structure cho an toàn\n",
|
| 327 |
+
" try:\n",
|
| 328 |
+
" # Trường hợp 1: Standard Annotation\n",
|
| 329 |
+
" iterator = diarization.itertracks(yield_label=True)\n",
|
| 330 |
+
" # Test thử xem có chạy ko, nếu không phải Annotation nó sẽ lỗi attribute\n",
|
| 331 |
+
" _ = list(iterator)\n",
|
| 332 |
+
" # Reset iterate\n",
|
| 333 |
+
" iterator = diarization.itertracks(yield_label=True)\n",
|
| 334 |
+
" except:\n",
|
| 335 |
+
" # Trường hợp 2: User report structure (maybe wrapper)\n",
|
| 336 |
+
" try:\n",
|
| 337 |
+
" iterator = diarization.speaker_diarization.itertracks(yield_label=True)\n",
|
| 338 |
+
" except:\n",
|
| 339 |
+
" return \"❌ Lỗi format result Diarization\", \"❌ Lỗi format result Diarization\"\n",
|
| 340 |
+
"\n",
|
| 341 |
+
" for turn, _, speaker in iterator:\n",
|
| 342 |
+
" diarization_segments.append({\n",
|
| 343 |
+
" \"start\": turn.start,\n",
|
| 344 |
+
" \"end\": turn.end,\n",
|
| 345 |
+
" \"speaker\": speaker\n",
|
| 346 |
+
" })\n",
|
| 347 |
+
" \n",
|
| 348 |
+
" # Sort segments by start time\n",
|
| 349 |
+
" diarization_segments.sort(key=lambda x: x['start'])\n",
|
| 350 |
+
" \n",
|
| 351 |
+
" # Merge consecutive segments if requested\n",
|
| 352 |
+
" if merge_segs and diarization_segments:\n",
|
| 353 |
+
" p(0.3, desc=\"Đang gộp segment liên tiếp...\")\n",
|
| 354 |
+
" merged = []\n",
|
| 355 |
+
" current = diarization_segments[0].copy()\n",
|
| 356 |
+
" for seg in diarization_segments[1:]:\n",
|
| 357 |
+
" if seg['speaker'] == current['speaker'] and (seg['start'] - current['end']) <= 0.5:\n",
|
| 358 |
+
" current['end'] = seg['end']\n",
|
| 359 |
+
" else:\n",
|
| 360 |
+
" merged.append(current)\n",
|
| 361 |
+
" current = seg.copy()\n",
|
| 362 |
+
" merged.append(current)\n",
|
| 363 |
+
" diarization_segments = merged\n",
|
| 364 |
+
" \n",
|
| 365 |
+
" # 3. TRANSCRIPTION LOOP\n",
|
| 366 |
+
" p(0.4, desc=\"Đang tải model Whisper...\")\n",
|
| 367 |
+
" model = load_whisper_model(model_name, comp_type)\n",
|
| 368 |
+
" \n",
|
| 369 |
+
" processed_segments = []\n",
|
| 370 |
+
" \n",
|
| 371 |
+
" total_segs = len(diarization_segments)\n",
|
| 372 |
+
" \n",
|
| 373 |
+
" # Prepare VAD options\n",
|
| 374 |
+
" if vad_filter:\n",
|
| 375 |
+
" vad_options = dict(\n",
|
| 376 |
+
" min_silence_duration_ms=vad_min_silence,\n",
|
| 377 |
+
" speech_pad_ms=vad_speech_pad,\n",
|
| 378 |
+
" min_speech_duration_ms=vad_min_speech,\n",
|
| 379 |
+
" threshold=vad_threshold\n",
|
| 380 |
+
" )\n",
|
| 381 |
+
" else:\n",
|
| 382 |
+
" vad_options = False\n",
|
| 383 |
+
" \n",
|
| 384 |
+
" prompt = initial_prompt.strip() if (initial_prompt and initial_prompt.strip()) else None\n",
|
| 385 |
+
" prefix_text = prefix.strip() if (prefix and prefix.strip()) else None\n",
|
| 386 |
+
"\n",
|
| 387 |
+
" print(f\"Processing {total_segs} segments...\")\n",
|
| 388 |
+
" \n",
|
| 389 |
+
" for idx, seg in enumerate(diarization_segments):\n",
|
| 390 |
+
" start_sec = seg['start']\n",
|
| 391 |
+
" end_sec = seg['end']\n",
|
| 392 |
+
" speaker = seg['speaker']\n",
|
| 393 |
+
" \n",
|
| 394 |
+
" # UI Progress\n",
|
| 395 |
+
" progress_val = 0.4 + (0.5 * (idx / total_segs))\n",
|
| 396 |
+
" p(progress_val, desc=f\"Transcribing {idx+1}/{total_segs} ({speaker})...\")\n",
|
| 397 |
+
" \n",
|
| 398 |
+
" # Audio slicing\n",
|
| 399 |
+
" start_sample = int(start_sec * sr)\n",
|
| 400 |
+
" end_sample = int(end_sec * sr)\n",
|
| 401 |
+
" \n",
|
| 402 |
+
" # Avoid empty slice\n",
|
| 403 |
+
" if end_sample <= start_sample:\n",
|
| 404 |
+
" continue\n",
|
| 405 |
+
" \n",
|
| 406 |
+
" y_seg = y[start_sample:end_sample]\n",
|
| 407 |
+
" \n",
|
| 408 |
+
" # Whisper Transcribe for this chunk\n",
|
| 409 |
+
" try:\n",
|
| 410 |
+
" # Note: We pass the numpy array 'y_seg' directly\n",
|
| 411 |
+
" segments_gen, _ = model.transcribe(\n",
|
| 412 |
+
" y_seg, \n",
|
| 413 |
+
" language=language if language != \"auto\" else None,\n",
|
| 414 |
+
" beam_size=beam_size, \n",
|
| 415 |
+
" vad_filter=vad_options,\n",
|
| 416 |
+
" temperature=temperature,\n",
|
| 417 |
+
" best_of=best_of,\n",
|
| 418 |
+
" patience=patience,\n",
|
| 419 |
+
" length_penalty=length_penalty,\n",
|
| 420 |
+
" initial_prompt=prompt,\n",
|
| 421 |
+
" prefix=prefix_text,\n",
|
| 422 |
+
" condition_on_previous_text=condition_on_previous_text,\n",
|
| 423 |
+
" no_speech_threshold=no_speech_threshold,\n",
|
| 424 |
+
" log_prob_threshold=log_prob_threshold,\n",
|
| 425 |
+
" compression_ratio_threshold=compression_ratio_threshold,\n",
|
| 426 |
+
" word_timestamps=False \n",
|
| 427 |
+
" )\n",
|
| 428 |
+
" \n",
|
| 429 |
+
" # Collect text\n",
|
| 430 |
+
" seg_text_parts = []\n",
|
| 431 |
+
" for s in segments_gen:\n",
|
| 432 |
+
" seg_text_parts.append(s.text.strip())\n",
|
| 433 |
+
" \n",
|
| 434 |
+
" final_text = \" \".join(seg_text_parts).strip()\n",
|
| 435 |
+
" \n",
|
| 436 |
+
" if final_text:\n",
|
| 437 |
+
" # Store Result\n",
|
| 438 |
+
" processed_segments.append({\n",
|
| 439 |
+
" \"start\": start_sec,\n",
|
| 440 |
+
" \"end\": end_sec,\n",
|
| 441 |
+
" \"speaker\": speaker,\n",
|
| 442 |
+
" \"text\": final_text\n",
|
| 443 |
+
" })\n",
|
| 444 |
+
" \n",
|
| 445 |
+
" except Exception as e:\n",
|
| 446 |
+
" print(f\"Error transcribing segment {idx}: {e}\")\n",
|
| 447 |
+
" continue\n",
|
| 448 |
+
"\n",
|
| 449 |
+
" total_elapsed = time.time() - total_start_time\n",
|
| 450 |
+
" \n",
|
| 451 |
+
" p(0.95, desc=\"Đang xuất kết quả...\")\n",
|
| 452 |
+
" \n",
|
| 453 |
+
" # ========== OUTPUT GENERATION ==========\n",
|
| 454 |
+
" \n",
|
| 455 |
+
" # Speaker colors\n",
|
| 456 |
+
" speaker_colors = {\n",
|
| 457 |
+
" 'SPEAKER_00': '🔵',\n",
|
| 458 |
+
" 'SPEAKER_01': '🟢', \n",
|
| 459 |
+
" 'SPEAKER_02': '🟡',\n",
|
| 460 |
+
" 'SPEAKER_03': '🟠',\n",
|
| 461 |
+
" 'SPEAKER_04': '🔴',\n",
|
| 462 |
+
" 'SPEAKER_05': '🟣',\n",
|
| 463 |
+
" }\n",
|
| 464 |
+
" \n",
|
| 465 |
+
" # 1. Plain Transcription Output\n",
|
| 466 |
+
" transcribe_lines = []\n",
|
| 467 |
+
" for item in processed_segments:\n",
|
| 468 |
+
" ts = f\"[{format_timestamp(item['start'])} → {format_timestamp(item['end'])}]\"\n",
|
| 469 |
+
" transcribe_lines.append(f\"{ts} {item['text']}\")\n",
|
| 470 |
+
" \n",
|
| 471 |
+
" transcribe_header = f\"\"\"## 📝 Kết quả Transcription\n",
|
| 472 |
+
"\n",
|
| 473 |
+
"| Thông tin | Giá trị |\n",
|
| 474 |
+
"|-----------|----------|\n",
|
| 475 |
+
"| ⏱️ Tổng thời gian xử lý | {total_elapsed:.1f}s |\n",
|
| 476 |
+
"| 📊 Tổng số Segment | {len(processed_segments)} |\n",
|
| 477 |
+
"\n",
|
| 478 |
+
"---\n",
|
| 479 |
+
"\n",
|
| 480 |
+
"\"\"\"\n",
|
| 481 |
+
" transcribe_output = transcribe_header + \"\\n\".join(transcribe_lines)\n",
|
| 482 |
+
" \n",
|
| 483 |
+
" # 2. Diarization + Transcription Output\n",
|
| 484 |
+
" diarize_lines = []\n",
|
| 485 |
+
" unique_speakers = set()\n",
|
| 486 |
+
" \n",
|
| 487 |
+
" for item in processed_segments:\n",
|
| 488 |
+
" unique_speakers.add(item['speaker'])\n",
|
| 489 |
+
" ts = f\"[{format_timestamp(item['start'])} → {format_timestamp(item['end'])}]\"\n",
|
| 490 |
+
" icon = speaker_colors.get(item['speaker'], '⚪')\n",
|
| 491 |
+
" diarize_lines.append(f\"{ts} {icon} **{item['speaker']}**: {item['text']}\")\n",
|
| 492 |
+
" \n",
|
| 493 |
+
" diarize_header = f\"\"\"## 🎭 Kết quả Transcription + Diarization\n",
|
| 494 |
+
"\n",
|
| 495 |
+
"| Thông tin | Giá trị |\n",
|
| 496 |
+
"|-----------|----------|\n",
|
| 497 |
+
"| 👥 Số người nói | {len(unique_speakers)} |\n",
|
| 498 |
+
"| ⏱️ Tổng thời gian xử lý | {total_elapsed:.1f}s |\n",
|
| 499 |
+
"| 📊 Tổng số Segment | {len(processed_segments)} |\n",
|
| 500 |
+
"\n",
|
| 501 |
+
"---\n",
|
| 502 |
+
"\n",
|
| 503 |
+
"\"\"\"\n",
|
| 504 |
+
" diarize_output = diarize_header + \"\\n\".join(diarize_lines)\n",
|
| 505 |
+
" \n",
|
| 506 |
+
" return transcribe_output, diarize_output"
|
| 507 |
+
]
|
| 508 |
+
},
|
| 509 |
+
{
|
| 510 |
+
"cell_type": "code",
|
| 511 |
+
"execution_count": null,
|
| 512 |
+
"metadata": {},
|
| 513 |
+
"outputs": [],
|
| 514 |
+
"source": [
|
| 515 |
+
"# @title 6. 🚀 Gradio UI\n",
|
| 516 |
+
"css = \"\"\"\n",
|
| 517 |
+
".gradio-container { max-width: 1200px !important; }\n",
|
| 518 |
+
".output-markdown { font-family: 'JetBrains Mono', monospace !important; }\n",
|
| 519 |
+
"\"\"\"\n",
|
| 520 |
+
"\n",
|
| 521 |
+
"with gr.Blocks(title=\"PrecisionVoice\", theme=gr.themes.Soft(), css=css) as demo:\n",
|
| 522 |
+
" gr.Markdown(\"\"\"# 🎙️ PrecisionVoice - Vietnamese Speech-to-Text\n",
|
| 523 |
+
" \n",
|
| 524 |
+
"Sử dụng **Whisper** để nhận dạng văn bản và **Pyannote** để phân biệt người nói.\n",
|
| 525 |
+
"\"\"\")\n",
|
| 526 |
+
" \n",
|
| 527 |
+
" with gr.Row():\n",
|
| 528 |
+
" with gr.Column(scale=1):\n",
|
| 529 |
+
" audio_input = gr.Audio(\n",
|
| 530 |
+
" sources=[\"upload\", \"microphone\"], \n",
|
| 531 |
+
" type=\"filepath\", \n",
|
| 532 |
+
" label=\"🔊 Audio Input\"\n",
|
| 533 |
+
" )\n",
|
| 534 |
+
" \n",
|
| 535 |
+
" gr.Markdown(\"### ⚙️ Cài đặt Model\")\n",
|
| 536 |
+
" model_select = gr.Dropdown(\n",
|
| 537 |
+
" choices=list(AVAILABLE_MODELS.keys()),\n",
|
| 538 |
+
" value=list(AVAILABLE_MODELS.keys())[0],\n",
|
| 539 |
+
" label=\"🤖 Whisper Model\"\n",
|
| 540 |
+
" )\n",
|
| 541 |
+
" \n",
|
| 542 |
+
" language = gr.Dropdown(\n",
|
| 543 |
+
" choices=[\"auto\", \"vi\", \"en\", \"zh\", \"ja\", \"ko\"],\n",
|
| 544 |
+
" value=\"vi\",\n",
|
| 545 |
+
" label=\"🌐 Ngôn ngữ\"\n",
|
| 546 |
+
" )\n",
|
| 547 |
+
" \n",
|
| 548 |
+
" comp_type_select = gr.Dropdown(\n",
|
| 549 |
+
" choices=[\"float16\", \"float32\", \"int8\", \"int8_float16\"],\n",
|
| 550 |
+
" value=compute_type,\n",
|
| 551 |
+
" label=\"⚡ Compute Type\"\n",
|
| 552 |
+
" )\n",
|
| 553 |
+
" \n",
|
| 554 |
+
" with gr.Accordion(\"🔧 Tùy chọn nâng cao\", open=False):\n",
|
| 555 |
+
" beam_size = gr.Slider(\n",
|
| 556 |
+
" minimum=1, maximum=10, value=5, step=1,\n",
|
| 557 |
+
" label=\"Beam Size\",\n",
|
| 558 |
+
" info=\"Cao hơn = chính xác hơn nhưng chậm hơn\"\n",
|
| 559 |
+
" )\n",
|
| 560 |
+
" vad_filter = gr.Checkbox(\n",
|
| 561 |
+
" value=True, \n",
|
| 562 |
+
" label=\"VAD Filter\",\n",
|
| 563 |
+
" info=\"Lọc khoảng lặng tự động\"\n",
|
| 564 |
+
" )\n",
|
| 565 |
+
" with gr.Row():\n",
|
| 566 |
+
" vad_min_silence = gr.Number(value=1000, label=\"Min Silence (ms)\", info=\"min_silence_duration_ms\")\n",
|
| 567 |
+
" vad_speech_pad = gr.Number(value=400, label=\"Speech Pad (ms)\", info=\"speech_pad_ms\")\n",
|
| 568 |
+
" with gr.Row():\n",
|
| 569 |
+
" vad_min_speech = gr.Number(value=250, label=\"Min Speech (ms)\", info=\"min_speech_duration_ms\")\n",
|
| 570 |
+
" vad_threshold = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.05, label=\"VAD Threshold\")\n",
|
| 571 |
+
" \n",
|
| 572 |
+
" with gr.Accordion(\"🧠 Tham số Generation (Whisper)\", open=False):\n",
|
| 573 |
+
" with gr.Row():\n",
|
| 574 |
+
" temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label=\"Temperature\")\n",
|
| 575 |
+
" best_of = gr.Number(value=5, label=\"Best Of\")\n",
|
| 576 |
+
" with gr.Row():\n",
|
| 577 |
+
" patience = gr.Number(value=1.0, label=\"Patience\", step=0.1)\n",
|
| 578 |
+
" length_penalty = gr.Number(value=1.0, label=\"Length Penalty\", step=0.1)\n",
|
| 579 |
+
" initial_prompt = gr.Textbox(label=\"Initial Prompt\", placeholder=\"Ngữ cảnh hoặc từ vựng...\")\n",
|
| 580 |
+
" prefix = gr.Textbox(label=\"Prefix\", placeholder=\"Bắt đầu câu với...\")\n",
|
| 581 |
+
" condition_on_previous_text = gr.Checkbox(value=True, label=\"Condition on previous text\")\n",
|
| 582 |
+
" \n",
|
| 583 |
+
" gr.Markdown(\"**Filter Thresholds**\")\n",
|
| 584 |
+
" with gr.Row():\n",
|
| 585 |
+
" no_speech_threshold = gr.Slider(0.0, 1.0, value=0.6, step=0.05, label=\"No Speech Threshold\")\n",
|
| 586 |
+
" log_prob_threshold = gr.Slider(-5.0, 0.0, value=-1.0, step=0.1, label=\"Log Prob Threshold\")\n",
|
| 587 |
+
" compression_ratio_threshold = gr.Number(value=2.4, label=\"Compression Ratio Threshold\")\n",
|
| 588 |
+
" \n",
|
| 589 |
+
" merge_segments = gr.Checkbox(\n",
|
| 590 |
+
" value=True,\n",
|
| 591 |
+
" label=\"Gộp Segment cùng Speaker\",\n",
|
| 592 |
+
" info=\"Gộp các câu liên tiếp của cùng người nói\"\n",
|
| 593 |
+
" )\n",
|
| 594 |
+
" \n",
|
| 595 |
+
" btn_process = gr.Button(\"🚀 Xử lý Audio\", variant=\"primary\", size=\"lg\")\n",
|
| 596 |
+
" \n",
|
| 597 |
+
" with gr.Column(scale=2):\n",
|
| 598 |
+
" with gr.Tabs():\n",
|
| 599 |
+
" with gr.Tab(\"📝 Transcription\"):\n",
|
| 600 |
+
" output_transcribe = gr.Markdown(\n",
|
| 601 |
+
" value=\"*Kết quả transcription sẽ hiển thị ở đây...*\",\n",
|
| 602 |
+
" elem_classes=[\"output-markdown\"]\n",
|
| 603 |
+
" )\n",
|
| 604 |
+
" with gr.Tab(\"🎭 Transcription + Diarization\"):\n",
|
| 605 |
+
" output_diarize = gr.Markdown(\n",
|
| 606 |
+
" value=\"*Kết quả transcription + diarization sẽ hiển thị ở đây...*\",\n",
|
| 607 |
+
" elem_classes=[\"output-markdown\"]\n",
|
| 608 |
+
" )\n",
|
| 609 |
+
" \n",
|
| 610 |
+
" btn_process.click(\n",
|
| 611 |
+
" process_audio,\n",
|
| 612 |
+
" inputs=[\n",
|
| 613 |
+
" audio_input, model_select, language, beam_size, vad_filter, \n",
|
| 614 |
+
" vad_min_silence, vad_speech_pad, vad_min_speech, vad_threshold,\n",
|
| 615 |
+
" temperature, best_of, patience, length_penalty, \n",
|
| 616 |
+
" initial_prompt, prefix, condition_on_previous_text,\n",
|
| 617 |
+
" no_speech_threshold, log_prob_threshold, compression_ratio_threshold,\n",
|
| 618 |
+
" comp_type_select, merge_segments\n",
|
| 619 |
+
" ],\n",
|
| 620 |
+
" outputs=[output_transcribe, output_diarize]\n",
|
| 621 |
+
" )\n",
|
| 622 |
+
" \n",
|
| 623 |
+
" gr.Markdown(\"\"\"---\n",
|
| 624 |
+
" \n",
|
| 625 |
+
"### 📖 Hướng dẫn sử dụng\n",
|
| 626 |
+
"\n",
|
| 627 |
+
"1. **Upload audio** hoặc ghi âm trực tiếp\n",
|
| 628 |
+
"2. **Chọn Model**:\n",
|
| 629 |
+
" - `EraX-WoW-Turbo`: Whisper Large V3 Turbo, tối ưu cho tiếng Việt\n",
|
| 630 |
+
" - `PhoWhisper Large`: Model được huấn luyện riêng cho tiếng Việt\n",
|
| 631 |
+
"3. **Setting nâng cao**:\n",
|
| 632 |
+
" - Chỉnh `temperature` nếu muốn model sáng tạo hơn.\n",
|
| 633 |
+
" - Thêm `Initial Prompt` để gợi ý từ vựng chuyên ngành.\n",
|
| 634 |
+
"4. **Nhấn \"🚀 Xử lý Audio\"** để nhận kết quả ở cả 2 tab\n",
|
| 635 |
+
"\"\"\")\n",
|
| 636 |
+
"\n",
|
| 637 |
+
"# Launch\n",
|
| 638 |
+
"import os\n",
|
| 639 |
+
"if \"COLAB_GPU\" in os.environ or \"google.colab\" in str(get_ipython()):\n",
|
| 640 |
+
" demo.queue().launch(share=True, debug=True)\n",
|
| 641 |
+
"else:\n",
|
| 642 |
+
" demo.launch(share=False)"
|
| 643 |
+
]
|
| 644 |
+
}
|
| 645 |
+
],
|
| 646 |
+
"metadata": {
|
| 647 |
+
"accelerator": "GPU",
|
| 648 |
+
"colab": {
|
| 649 |
+
"gpuType": "T4",
|
| 650 |
+
"provenance": []
|
| 651 |
+
},
|
| 652 |
+
"kernelspec": {
|
| 653 |
+
"display_name": "Python 3 (ipykernel)",
|
| 654 |
+
"language": "python",
|
| 655 |
+
"name": "python3"
|
| 656 |
+
},
|
| 657 |
+
"language_info": {
|
| 658 |
+
"codemirror_mode": {
|
| 659 |
+
"name": "ipython",
|
| 660 |
+
"version": 3
|
| 661 |
+
},
|
| 662 |
+
"file_extension": ".py",
|
| 663 |
+
"mimetype": "text/x-python",
|
| 664 |
+
"name": "python",
|
| 665 |
+
"nbconvert_exporter": "python",
|
| 666 |
+
"pygments_lexer": "ipython3",
|
| 667 |
+
"version": "3.12.12"
|
| 668 |
+
}
|
| 669 |
+
},
|
| 670 |
+
"nbformat": 4,
|
| 671 |
+
"nbformat_minor": 4
|
| 672 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core framework
|
| 2 |
+
fastapi>=0.109.0
|
| 3 |
+
uvicorn[standard]>=0.27.0
|
| 4 |
+
python-multipart>=0.0.6
|
| 5 |
+
jinja2>=3.1.2
|
| 6 |
+
aiofiles>=23.2.1
|
| 7 |
+
|
| 8 |
+
# AI/ML - Speech-to-Text
|
| 9 |
+
faster-whisper>=1.0.0
|
| 10 |
+
ctranslate2>=4.0.0
|
| 11 |
+
|
| 12 |
+
# AI/ML - Speaker Diarization (from notebook cell #2)
|
| 13 |
+
pyannote.audio>=3.3.1
|
| 14 |
+
torch>=2.1.0
|
| 15 |
+
torchaudio>=2.1.0
|
| 16 |
+
torchvision
|
| 17 |
+
lightning
|
| 18 |
+
torchmetrics
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Transformers Whisper + LoRA
|
| 22 |
+
transformers>=4.39.0,<5
|
| 23 |
+
accelerate>=0.26.0
|
| 24 |
+
peft>=0.8.0
|
| 25 |
+
huggingface-hub>=0.20.0
|
| 26 |
+
safetensors>=0.4.0
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# AI/ML - Vocal Separation
|
| 30 |
+
audio-separator[cpu]>=0.17.0
|
| 31 |
+
denoiser>=0.1.4
|
| 32 |
+
|
| 33 |
+
# Audio processing
|
| 34 |
+
librosa>=0.10.0
|
| 35 |
+
ffmpeg-python>=0.2.0
|
| 36 |
+
pydub>=0.25.1
|
| 37 |
+
|
| 38 |
+
# Configuration
|
| 39 |
+
pydantic-settings>=2.1.0
|
| 40 |
+
python-dotenv>=1.0.0
|
| 41 |
+
|
| 42 |
+
# Utilities
|
| 43 |
+
numpy>=1.24.0
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
scripts/verify_model_config.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Verify that the configured Whisper STT model matches the expected checkpoint."""
import sys

from app.core.config import get_settings
# Imported for side-effect/sanity check that the service module loads; not called here.
from app.services.transcription import TranscriptionService  # noqa: F401


def verify_stt_model() -> bool:
    """Print the active STT configuration and check the configured model name.

    Returns:
        True when ``settings.whisper_model`` equals the expected
        ``kiendt/PhoWhisper-large-ct2`` checkpoint, False otherwise.
    """
    settings = get_settings()
    print(f"Current Whisper Model: {settings.whisper_model}")
    print(f"Device: {settings.resolved_device}")
    print(f"Compute Type: {settings.resolved_compute_type}")

    expected_model = "kiendt/PhoWhisper-large-ct2"
    if settings.whisper_model == expected_model:
        print("✅ SUCCESS: Model configuration updated correctly.")
        return True
    print(f"❌ FAILURE: Expected {expected_model}, got {settings.whisper_model}")
    return False


if __name__ == "__main__":
    # Exit non-zero on mismatch so CI / deployment scripts can fail fast;
    # the original script always exited 0 even on FAILURE.
    sys.exit(0 if verify_stt_model() else 1)
|