Spaces:
Sleeping
Sleeping
creator-o1
commited on
Commit
·
d00203b
0
Parent(s):
Initial commit: Complete VoiceForge Enterprise Speech AI Platform
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .env.example +28 -0
- .github/workflows/backend-ci.yml +62 -0
- .github/workflows/ci.yml +36 -0
- .gitignore +178 -0
- .lighthouseci/lhr-1769848038113.html +0 -0
- CHANGELOG.md +128 -0
- CONTRIBUTING.md +279 -0
- README.md +360 -0
- backend/.flake8 +4 -0
- backend/Dockerfile +50 -0
- backend/app/__init__.py +3 -0
- backend/app/api/__init__.py +3 -0
- backend/app/api/routes/__init__.py +31 -0
- backend/app/api/routes/analysis.py +60 -0
- backend/app/api/routes/audio.py +100 -0
- backend/app/api/routes/auth.py +116 -0
- backend/app/api/routes/batch.py +204 -0
- backend/app/api/routes/cloning.py +81 -0
- backend/app/api/routes/health.py +93 -0
- backend/app/api/routes/sign.py +164 -0
- backend/app/api/routes/stt.py +489 -0
- backend/app/api/routes/transcripts.py +200 -0
- backend/app/api/routes/translation.py +261 -0
- backend/app/api/routes/tts.py +245 -0
- backend/app/api/routes/ws.py +153 -0
- backend/app/core/__init__.py +7 -0
- backend/app/core/config.py +108 -0
- backend/app/core/limiter.py +27 -0
- backend/app/core/middleware.py +70 -0
- backend/app/core/security.py +107 -0
- backend/app/core/security_encryption.py +101 -0
- backend/app/core/security_headers.py +37 -0
- backend/app/main.py +257 -0
- backend/app/schemas/__init__.py +39 -0
- backend/app/schemas/stt.py +98 -0
- backend/app/schemas/transcript.py +69 -0
- backend/app/schemas/tts.py +67 -0
- backend/app/services/__init__.py +13 -0
- backend/app/services/audio_service.py +101 -0
- backend/app/services/batch_service.py +348 -0
- backend/app/services/cache_service.py +71 -0
- backend/app/services/clone_service.py +104 -0
- backend/app/services/diarization_service.py +338 -0
- backend/app/services/edge_tts_service.py +357 -0
- backend/app/services/emotion_service.py +132 -0
- backend/app/services/export_service.py +99 -0
- backend/app/services/file_service.py +230 -0
- backend/app/services/meeting_service.py +121 -0
- backend/app/services/nlp_service.py +180 -0
- backend/app/services/sign_avatar_service.py +82 -0
.env.example
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VoiceForge Environment Configuration
|
| 2 |
+
# Copy this file to .env and fill in your values
|
| 3 |
+
|
| 4 |
+
# Database
|
| 5 |
+
DATABASE_URL=postgresql://postgres:postgres@localhost:5432/voiceforge
|
| 6 |
+
|
| 7 |
+
# Redis
|
| 8 |
+
REDIS_URL=redis://localhost:6379/0
|
| 9 |
+
|
| 10 |
+
# Google Cloud
|
| 11 |
+
GOOGLE_APPLICATION_CREDENTIALS=./credentials/google-cloud-key.json
|
| 12 |
+
|
| 13 |
+
# API Settings
|
| 14 |
+
API_HOST=0.0.0.0
|
| 15 |
+
API_PORT=8000
|
| 16 |
+
DEBUG=true
|
| 17 |
+
|
| 18 |
+
# Security
|
| 19 |
+
SECRET_KEY=your-super-secret-key-change-in-production
|
| 20 |
+
ACCESS_TOKEN_EXPIRE_MINUTES=30
|
| 21 |
+
|
| 22 |
+
# File Storage
|
| 23 |
+
UPLOAD_DIR=./uploads
|
| 24 |
+
MAX_AUDIO_DURATION_SECONDS=600
|
| 25 |
+
MAX_UPLOAD_SIZE_MB=50
|
| 26 |
+
|
| 27 |
+
# Supported Languages (comma-separated)
|
| 28 |
+
SUPPORTED_LANGUAGES=en-US,en-GB,es-ES,es-MX,fr-FR,de-DE,ja-JP,ko-KR,cmn-CN,hi-IN
|
.github/workflows/backend-ci.yml
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Backend CI
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [ main ]
|
| 6 |
+
paths:
|
| 7 |
+
- 'backend/**'
|
| 8 |
+
pull_request:
|
| 9 |
+
branches: [ main ]
|
| 10 |
+
paths:
|
| 11 |
+
- 'backend/**'
|
| 12 |
+
|
| 13 |
+
jobs:
|
| 14 |
+
test:
|
| 15 |
+
runs-on: ubuntu-latest
|
| 16 |
+
defaults:
|
| 17 |
+
run:
|
| 18 |
+
working-directory: ./backend
|
| 19 |
+
|
| 20 |
+
services:
|
| 21 |
+
redis:
|
| 22 |
+
image: redis
|
| 23 |
+
ports:
|
| 24 |
+
- 6379:6379
|
| 25 |
+
options: >-
|
| 26 |
+
--health-cmd "redis-cli ping"
|
| 27 |
+
--health-interval 10s
|
| 28 |
+
--health-timeout 5s
|
| 29 |
+
--health-retries 5
|
| 30 |
+
|
| 31 |
+
steps:
|
| 32 |
+
- uses: actions/checkout@v3
|
| 33 |
+
|
| 34 |
+
- name: Set up Python 3.10
|
| 35 |
+
uses: actions/setup-python@v4
|
| 36 |
+
with:
|
| 37 |
+
python-version: "3.10"
|
| 38 |
+
cache: 'pip'
|
| 39 |
+
|
| 40 |
+
- name: Install dependencies
|
| 41 |
+
run: |
|
| 42 |
+
python -m pip install --upgrade pip
|
| 43 |
+
pip install flake8 pytest pytest-asyncio httpx
|
| 44 |
+
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
| 45 |
+
|
| 46 |
+
- name: Lint with flake8
|
| 47 |
+
run: |
|
| 48 |
+
# stop the build if there are Python syntax errors or undefined names
|
| 49 |
+
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
|
| 50 |
+
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
|
| 51 |
+
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
|
| 52 |
+
|
| 53 |
+
- name: Test with pytest
|
| 54 |
+
env:
|
| 55 |
+
ENCRYPTION_KEY: ${{ secrets.ENCRYPTION_KEY }} # Mock or secret
|
| 56 |
+
REDIS_URL: "redis://localhost:6379/0"
|
| 57 |
+
HF_TOKEN: "mock_token" # Mock for CI
|
| 58 |
+
run: |
|
| 59 |
+
# We Mock heavy dependencies (torch, etc) in tests/conftest.py usually,
|
| 60 |
+
# or we install them. Installing them takes time.
|
| 61 |
+
# For this demo, we assume they are installed or tests mock them.
|
| 62 |
+
pytest
|
.github/workflows/ci.yml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CI
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [ "main" ]
|
| 6 |
+
pull_request:
|
| 7 |
+
branches: [ "main" ]
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
test:
|
| 11 |
+
runs-on: ubuntu-latest
|
| 12 |
+
|
| 13 |
+
steps:
|
| 14 |
+
- uses: actions/checkout@v3
|
| 15 |
+
|
| 16 |
+
- name: Set up Python 3.10
|
| 17 |
+
uses: actions/setup-python@v4
|
| 18 |
+
with:
|
| 19 |
+
python-version: "3.10"
|
| 20 |
+
|
| 21 |
+
- name: Install System Dependencies
|
| 22 |
+
run: |
|
| 23 |
+
sudo apt-get update
|
| 24 |
+
sudo apt-get install -y ffmpeg libsndfile1
|
| 25 |
+
|
| 26 |
+
- name: Install Python Dependencies
|
| 27 |
+
run: |
|
| 28 |
+
python -m pip install --upgrade pip
|
| 29 |
+
pip install pytest pytest-asyncio httpx
|
| 30 |
+
if [ -f backend/requirements.txt ]; then pip install -r backend/requirements.txt; fi
|
| 31 |
+
|
| 32 |
+
- name: Run Tests
|
| 33 |
+
# We skip slow tests or those requiring GPU/Redis if not available
|
| 34 |
+
run: |
|
| 35 |
+
cd backend
|
| 36 |
+
pytest tests/ -v -m "not integration"
|
.gitignore
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
*.manifest
|
| 31 |
+
*.spec
|
| 32 |
+
|
| 33 |
+
# Installer logs
|
| 34 |
+
pip-log.txt
|
| 35 |
+
pip-delete-this-directory.txt
|
| 36 |
+
|
| 37 |
+
# Unit test / coverage reports
|
| 38 |
+
htmlcov/
|
| 39 |
+
.tox/
|
| 40 |
+
.nox/
|
| 41 |
+
.coverage
|
| 42 |
+
.coverage.*
|
| 43 |
+
.cache
|
| 44 |
+
nosetests.xml
|
| 45 |
+
coverage.xml
|
| 46 |
+
*.cover
|
| 47 |
+
*.py,cover
|
| 48 |
+
.hypothesis/
|
| 49 |
+
.pytest_cache/
|
| 50 |
+
cover/
|
| 51 |
+
|
| 52 |
+
# Translations
|
| 53 |
+
*.mo
|
| 54 |
+
*.pot
|
| 55 |
+
|
| 56 |
+
# Django stuff:
|
| 57 |
+
*.log
|
| 58 |
+
local_settings.py
|
| 59 |
+
db.sqlite3
|
| 60 |
+
db.sqlite3-journal
|
| 61 |
+
|
| 62 |
+
# Flask stuff:
|
| 63 |
+
instance/
|
| 64 |
+
.webassets-cache
|
| 65 |
+
|
| 66 |
+
# Scrapy stuff:
|
| 67 |
+
.scrapy
|
| 68 |
+
|
| 69 |
+
# Sphinx documentation
|
| 70 |
+
docs/_build/
|
| 71 |
+
|
| 72 |
+
# PyBuilder
|
| 73 |
+
.pybuilder/
|
| 74 |
+
target/
|
| 75 |
+
|
| 76 |
+
# Jupyter Notebook
|
| 77 |
+
.ipynb_checkpoints
|
| 78 |
+
|
| 79 |
+
# IPython
|
| 80 |
+
profile_default/
|
| 81 |
+
ipython_config.py
|
| 82 |
+
|
| 83 |
+
# pyenv
|
| 84 |
+
# For a library or package, you might want to ignore these files since the Python version is actually
|
| 85 |
+
# determined by the app developer rather than the library.
|
| 86 |
+
# .python-version
|
| 87 |
+
|
| 88 |
+
# pipenv
|
| 89 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 90 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 91 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or even
|
| 92 |
+
# fail to install them.
|
| 93 |
+
# Pipfile.lock
|
| 94 |
+
|
| 95 |
+
# poetry
|
| 96 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 97 |
+
# poetry.lock
|
| 98 |
+
|
| 99 |
+
# pdm
|
| 100 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 101 |
+
# .pdm-python
|
| 102 |
+
# .pdm-build/
|
| 103 |
+
|
| 104 |
+
# PEP 582; used by e.g. github.com/frenzymadness/venvpdm
|
| 105 |
+
__pypackages__/
|
| 106 |
+
|
| 107 |
+
# Celery stuff
|
| 108 |
+
celerybeat-schedule
|
| 109 |
+
celerybeat.pid
|
| 110 |
+
|
| 111 |
+
# SageMath parsed files
|
| 112 |
+
*.sage.py
|
| 113 |
+
|
| 114 |
+
# Environments
|
| 115 |
+
.env
|
| 116 |
+
.venv
|
| 117 |
+
env/
|
| 118 |
+
venv/
|
| 119 |
+
ENV/
|
| 120 |
+
env.bak/
|
| 121 |
+
venv.bak/
|
| 122 |
+
|
| 123 |
+
# Spyder project settings
|
| 124 |
+
.spyderproject
|
| 125 |
+
.spyproject
|
| 126 |
+
|
| 127 |
+
# Rope project settings
|
| 128 |
+
.ropeproject
|
| 129 |
+
|
| 130 |
+
# mkdocs documentation
|
| 131 |
+
/site
|
| 132 |
+
|
| 133 |
+
# mypy
|
| 134 |
+
.mypy_cache/
|
| 135 |
+
.dmypy.json
|
| 136 |
+
dmypy.json
|
| 137 |
+
|
| 138 |
+
# Pyre type checker
|
| 139 |
+
.pyre/
|
| 140 |
+
|
| 141 |
+
# pytype static type analyzer
|
| 142 |
+
.pytype/
|
| 143 |
+
|
| 144 |
+
# Cython debug symbols
|
| 145 |
+
cython_debug/
|
| 146 |
+
|
| 147 |
+
# OS
|
| 148 |
+
.DS_Store
|
| 149 |
+
Thumbs.db
|
| 150 |
+
|
| 151 |
+
# Database
|
| 152 |
+
*.db
|
| 153 |
+
*.sqlite
|
| 154 |
+
|
| 155 |
+
# Local models
|
| 156 |
+
models/
|
| 157 |
+
*.bin
|
| 158 |
+
*.pth
|
| 159 |
+
*.onnx
|
| 160 |
+
|
| 161 |
+
# Credentials
|
| 162 |
+
credentials/
|
| 163 |
+
*.json
|
| 164 |
+
!deploy/monitoring/*.json
|
| 165 |
+
|
| 166 |
+
# Uploads
|
| 167 |
+
uploads/
|
| 168 |
+
|
| 169 |
+
# Diagnostic files
|
| 170 |
+
diagnostic_app.py
|
| 171 |
+
diag_traceback.txt
|
| 172 |
+
diag_log.txt
|
| 173 |
+
live_verify.py
|
| 174 |
+
test_prompt.wav
|
| 175 |
+
test_output.mp3
|
| 176 |
+
debug_app.py
|
| 177 |
+
debug_out.txt
|
| 178 |
+
diag_traceback.txt
|
.lighthouseci/lhr-1769848038113.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
CHANGELOG.md
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Changelog
|
| 2 |
+
|
| 3 |
+
All notable changes to VoiceForge will be documented in this file.
|
| 4 |
+
|
| 5 |
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
| 6 |
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
| 7 |
+
|
| 8 |
+
## [3.0.0] - 2026-01-31
|
| 9 |
+
|
| 10 |
+
### Major Architecture Updates
|
| 11 |
+
- **Hybrid STT Engine**:
|
| 12 |
+
- Integrated `large-v3-turbo` for 8x faster multilingual transcription.
|
| 13 |
+
- Implemented smart routing between Distil-Whisper (English) and Turbo (Multilingual).
|
| 14 |
+
- **Unified TTS Service**:
|
| 15 |
+
- Added `MeloTTS` integration for local, low-latency speech synthesis.
|
| 16 |
+
- Implemented automatic fallback to EdgeTTS for reliability.
|
| 17 |
+
- **Poetry Migration**:
|
| 18 |
+
- Replaced `requirements.txt` with `pyproject.toml` and `poetry.lock`.
|
| 19 |
+
- Optimized Docker build workflow (multi-stage build ready).
|
| 20 |
+
|
| 21 |
+
### Fixed
|
| 22 |
+
- **Critical Build Fix**: Resolved `numpy`/`torch` version conflicts that caused 30+ min Docker builds.
|
| 23 |
+
|
| 24 |
+
## [2.0.1] - 2026-01-31
|
| 25 |
+
|
| 26 |
+
### Fixed
|
| 27 |
+
- **CRITICAL**: Resolved numpy/torch dependency conflict causing 30+ minute Docker builds
|
| 28 |
+
- Pinned `numpy==1.26.4` (last stable 1.x version)
|
| 29 |
+
- Pinned `torch==2.3.1` and `torchaudio==2.3.1` for compatibility
|
| 30 |
+
- Docker builds now complete in <10 minutes instead of 30+
|
| 31 |
+
- Added version ranges to core dependencies (fastapi, uvicorn, httpx) for stability
|
| 32 |
+
- Added missing `locust` dependency for performance testing
|
| 33 |
+
|
| 34 |
+
### Added
|
| 35 |
+
- `DEPENDENCY_NOTES.md` documenting version constraints and update strategy
|
| 36 |
+
|
| 37 |
+
## [2.0.0] - 2026-01-31
|
| 38 |
+
|
| 39 |
+
### Added
|
| 40 |
+
- **Advanced Test Suite** (Phase 14)
|
| 41 |
+
- 74+ tests across unit, integration, performance, and security categories
|
| 42 |
+
- Master test runner (`tests/run_all_tests.py`) for one-command execution
|
| 43 |
+
- 100% module coverage across all backend services
|
| 44 |
+
- **Quality Automation Tools**
|
| 45 |
+
- `analyze_codebase.py`: Code complexity and maintainability metrics
|
| 46 |
+
- `check_syntax.py`: Python syntax and circular import detection
|
| 47 |
+
- `check_dependencies.py`: Dependency health and security vulnerability scanning
|
| 48 |
+
- `check_pipeline.py`: CI/CD pipeline validation (GitHub Actions, Docker)
|
| 49 |
+
- `coverage_tracker.py`: Module coverage matrix and untested function identification
|
| 50 |
+
- `lighthouse_audit.py`: Frontend performance auditing
|
| 51 |
+
- `project_audit.py`: Overall project coverage reporting
|
| 52 |
+
- **Mobile App Foundation** (Phase 13 - In Progress)
|
| 53 |
+
- Flutter mobile app directory structure
|
| 54 |
+
- Architecture documentation for mobile companion app
|
| 55 |
+
- WebSocket integration design for real-time transcription
|
| 56 |
+
- **Documentation**
|
| 57 |
+
- `docs/TESTING.md`: Comprehensive testing guide
|
| 58 |
+
- Updated `README.md` with testing instructions
|
| 59 |
+
- Mobile app setup guides
|
| 60 |
+
|
| 61 |
+
### Changed
|
| 62 |
+
- Updated `httpx.AsyncClient` usage to use `ASGITransport` for compatibility with modern httpx
|
| 63 |
+
- Improved test fixtures with proper async handling (`pytest-asyncio`)
|
| 64 |
+
- Enhanced `PROJECT_SUMMARY.md` with Phase 14 achievements
|
| 65 |
+
|
| 66 |
+
### Fixed
|
| 67 |
+
- Resolved `httpx` deprecation warnings in integration tests
|
| 68 |
+
- Fixed mock setup in `test_translation_service.py` for `langdetect`
|
| 69 |
+
- Corrected streaming synthesis mock signatures in `test_tts_service.py`
|
| 70 |
+
|
| 71 |
+
## [1.5.0] - 2026-01-17
|
| 72 |
+
|
| 73 |
+
### Added
|
| 74 |
+
- Memory management with dynamic model unloading (1.5GB → 500MB)
|
| 75 |
+
- WebSocket TTS streaming (<500ms TTFB)
|
| 76 |
+
- SSML prosody control for advanced voice customization
|
| 77 |
+
|
| 78 |
+
### Changed
|
| 79 |
+
- Performance improvements across STT and TTS services
|
| 80 |
+
|
| 81 |
+
## [1.4.0] - 2026-01-15
|
| 82 |
+
|
| 83 |
+
### Added
|
| 84 |
+
- Batched inference for 2-4x throughput improvement
|
| 85 |
+
- Audio preprocessing with noise reduction
|
| 86 |
+
- Speaker diarization (pyannote.audio integration)
|
| 87 |
+
- Voice cloning with Coqui XTTS v2
|
| 88 |
+
|
| 89 |
+
## [1.3.0] - 2026-01-10
|
| 90 |
+
|
| 91 |
+
### Added
|
| 92 |
+
- Phase 11: Optimization implementation
|
| 93 |
+
- DNS loopback fix (210x cold start improvement)
|
| 94 |
+
- Int8 quantization + greedy decoding (3x STT speedup)
|
| 95 |
+
- Distil-Whisper hybrid routing (10x cumulative STT speedup)
|
| 96 |
+
- Sentence streaming TTS (8x TTFB speedup)
|
| 97 |
+
- Real-Time Factor: 0.28x (super-realtime performance)
|
| 98 |
+
|
| 99 |
+
### Changed
|
| 100 |
+
- STT latency reduced from 38.5s to 3.7s (10x improvement)
|
| 101 |
+
- TTS TTFB reduced from 8.8s to 1.1s (8x improvement)
|
| 102 |
+
|
| 103 |
+
## [1.2.0] - 2026-01-05
|
| 104 |
+
|
| 105 |
+
### Added
|
| 106 |
+
- Phase 10: Performance research
|
| 107 |
+
- Comprehensive benchmarking suite
|
| 108 |
+
- 11 optimization dimensions identified
|
| 109 |
+
- Priority matrix documentation
|
| 110 |
+
|
| 111 |
+
## [1.0.0] - 2026-01-01
|
| 112 |
+
|
| 113 |
+
### Added
|
| 114 |
+
- Initial release
|
| 115 |
+
- FastAPI backend with REST API
|
| 116 |
+
- Streamlit frontend with glassmorphism UI
|
| 117 |
+
- Local AI integration (Whisper STT + Edge TTS)
|
| 118 |
+
- WebSocket live recording
|
| 119 |
+
- NLP analysis (sentiment, keywords, summary)
|
| 120 |
+
- Docker containerization
|
| 121 |
+
- Basic documentation
|
| 122 |
+
|
| 123 |
+
[2.0.0]: https://github.com/yourusername/voiceforge/compare/v1.5.0...v2.0.0
|
| 124 |
+
[1.5.0]: https://github.com/yourusername/voiceforge/compare/v1.4.0...v1.5.0
|
| 125 |
+
[1.4.0]: https://github.com/yourusername/voiceforge/compare/v1.3.0...v1.4.0
|
| 126 |
+
[1.3.0]: https://github.com/yourusername/voiceforge/compare/v1.2.0...v1.3.0
|
| 127 |
+
[1.2.0]: https://github.com/yourusername/voiceforge/compare/v1.0.0...v1.2.0
|
| 128 |
+
[1.0.0]: https://github.com/yourusername/voiceforge/releases/tag/v1.0.0
|
CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contributing to VoiceForge
|
| 2 |
+
|
| 3 |
+
Thank you for considering contributing to VoiceForge! This document provides guidelines for contributing to the project.
|
| 4 |
+
|
| 5 |
+
## 🚀 Getting Started
|
| 6 |
+
|
| 7 |
+
### Prerequisites
|
| 8 |
+
- Python 3.10+
|
| 9 |
+
- Docker & Docker Compose
|
| 10 |
+
- Git
|
| 11 |
+
|
| 12 |
+
### Development Setup
|
| 13 |
+
|
| 14 |
+
1. **Clone the repository**
|
| 15 |
+
```bash
|
| 16 |
+
git clone https://github.com/yourusername/voiceforge.git
|
| 17 |
+
cd voiceforge
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
2. **Install dependencies**
|
| 21 |
+
```bash
|
| 22 |
+
# Backend
|
| 23 |
+
cd backend
|
| 24 |
+
pip install -r requirements.txt
|
| 25 |
+
|
| 26 |
+
# Frontend
|
| 27 |
+
cd ../frontend
|
| 28 |
+
pip install -r requirements.txt
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
3. **Set up environment variables**
|
| 32 |
+
```bash
|
| 33 |
+
cp backend/.env.example backend/.env
|
| 34 |
+
# Edit .env with your configuration
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
4. **Run the application**
|
| 38 |
+
```bash
|
| 39 |
+
# Using Docker (recommended)
|
| 40 |
+
docker-compose up
|
| 41 |
+
|
| 42 |
+
# OR manually
|
| 43 |
+
# Terminal 1: Backend
|
| 44 |
+
cd backend
|
| 45 |
+
uvicorn app.main:app --reload
|
| 46 |
+
|
| 47 |
+
# Terminal 2: Frontend
|
| 48 |
+
cd frontend
|
| 49 |
+
streamlit run streamlit_app.py
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
## 🧪 Testing
|
| 53 |
+
|
| 54 |
+
### Running Tests
|
| 55 |
+
```bash
|
| 56 |
+
cd backend
|
| 57 |
+
|
| 58 |
+
# Run all tests
|
| 59 |
+
python tests/run_all_tests.py
|
| 60 |
+
|
| 61 |
+
# Run specific category
|
| 62 |
+
pytest tests/unit -v
|
| 63 |
+
pytest tests/integration -v
|
| 64 |
+
|
| 65 |
+
# Run with coverage
|
| 66 |
+
pytest --cov=app tests/
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
### Writing Tests
|
| 70 |
+
- **Unit tests**: Test individual functions in `tests/unit/`
|
| 71 |
+
- **Integration tests**: Test API endpoints in `tests/integration/`
|
| 72 |
+
- **Follow existing patterns**: Check similar tests for examples
|
| 73 |
+
|
| 74 |
+
### Quality Checks
|
| 75 |
+
```bash
|
| 76 |
+
# Code quality analysis
|
| 77 |
+
python tests/quality/analyze_codebase.py --path app
|
| 78 |
+
|
| 79 |
+
# Dependency health
|
| 80 |
+
python tests/quality/check_dependencies.py
|
| 81 |
+
|
| 82 |
+
# Syntax check
|
| 83 |
+
python tests/quality/check_syntax.py --path app
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
## 📝 Code Style
|
| 87 |
+
|
| 88 |
+
### Python
|
| 89 |
+
- Follow [PEP 8](https://pep8.org/)
|
| 90 |
+
- Use type hints where possible
|
| 91 |
+
- Maximum line length: 100 characters
|
| 92 |
+
- Use descriptive variable names
|
| 93 |
+
|
| 94 |
+
### Example
|
| 95 |
+
```python
|
| 96 |
+
from typing import List, Optional
|
| 97 |
+
|
| 98 |
+
async def transcribe_audio(
|
| 99 |
+
file_path: str,
|
| 100 |
+
language: Optional[str] = None,
|
| 101 |
+
quality_mode: bool = False
|
| 102 |
+
) -> dict:
|
| 103 |
+
"""
|
| 104 |
+
Transcribe audio file to text.
|
| 105 |
+
|
| 106 |
+
Args:
|
| 107 |
+
file_path: Path to audio file
|
| 108 |
+
language: Language code (auto-detect if None)
|
| 109 |
+
quality_mode: Use high-quality mode with beam search
|
| 110 |
+
|
| 111 |
+
Returns:
|
| 112 |
+
dict: Transcription result with segments
|
| 113 |
+
"""
|
| 114 |
+
# Implementation
|
| 115 |
+
pass
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
### Formatting
|
| 119 |
+
We recommend using:
|
| 120 |
+
- `black` for code formatting
|
| 121 |
+
- `isort` for import sorting
|
| 122 |
+
- `mypy` for type checking
|
| 123 |
+
|
| 124 |
+
```bash
|
| 125 |
+
# Format code
|
| 126 |
+
black app/
|
| 127 |
+
isort app/
|
| 128 |
+
|
| 129 |
+
# Type check
|
| 130 |
+
mypy app/
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
## 🌿 Branch Strategy
|
| 134 |
+
|
| 135 |
+
### Branch Naming
|
| 136 |
+
- `feature/description` - New features
|
| 137 |
+
- `fix/description` - Bug fixes
|
| 138 |
+
- `docs/description` - Documentation updates
|
| 139 |
+
- `test/description` - Test additions/improvements
|
| 140 |
+
|
| 141 |
+
### Example
|
| 142 |
+
```bash
|
| 143 |
+
git checkout -b feature/add-voice-cloning
|
| 144 |
+
git checkout -b fix/tts-streaming-bug
|
| 145 |
+
git checkout -b docs/update-api-guide
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
## 📤 Pull Request Process
|
| 149 |
+
|
| 150 |
+
1. **Create a feature branch**
|
| 151 |
+
```bash
|
| 152 |
+
git checkout -b feature/my-new-feature
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
2. **Make your changes**
|
| 156 |
+
- Write clean, well-documented code
|
| 157 |
+
- Add tests for new functionality
|
| 158 |
+
- Update documentation as needed
|
| 159 |
+
|
| 160 |
+
3. **Test your changes**
|
| 161 |
+
```bash
|
| 162 |
+
python tests/run_all_tests.py
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
4. **Commit with clear messages**
|
| 166 |
+
```bash
|
| 167 |
+
git commit -m "feat: add real-time noise cancellation
|
| 168 |
+
|
| 169 |
+
- Implement RNNoise integration
|
| 170 |
+
- Add preprocessing pipeline
|
| 171 |
+
- Add unit tests for audio processing
|
| 172 |
+
- Update API documentation"
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
5. **Push and create PR**
|
| 176 |
+
```bash
|
| 177 |
+
git push origin feature/my-new-feature
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
6. **PR Description Template**
|
| 181 |
+
```markdown
|
| 182 |
+
## Description
|
| 183 |
+
Brief description of changes
|
| 184 |
+
|
| 185 |
+
## Type of Change
|
| 186 |
+
- [ ] Bug fix
|
| 187 |
+
- [ ] New feature
|
| 188 |
+
- [ ] Documentation update
|
| 189 |
+
- [ ] Performance improvement
|
| 190 |
+
|
| 191 |
+
## Testing
|
| 192 |
+
- [ ] Unit tests added/updated
|
| 193 |
+
- [ ] Integration tests added/updated
|
| 194 |
+
- [ ] Manual testing performed
|
| 195 |
+
|
| 196 |
+
## Checklist
|
| 197 |
+
- [ ] Code follows project style guidelines
|
| 198 |
+
- [ ] Tests pass locally
|
| 199 |
+
- [ ] Documentation updated
|
| 200 |
+
- [ ] No new warnings introduced
|
| 201 |
+
```
|
| 202 |
+
|
| 203 |
+
## 🐛 Reporting Bugs
|
| 204 |
+
|
| 205 |
+
### Bug Report Template
|
| 206 |
+
```markdown
|
| 207 |
+
**Describe the bug**
|
| 208 |
+
A clear description of what the bug is.
|
| 209 |
+
|
| 210 |
+
**To Reproduce**
|
| 211 |
+
Steps to reproduce:
|
| 212 |
+
1. Go to '...'
|
| 213 |
+
2. Click on '....'
|
| 214 |
+
3. See error
|
| 215 |
+
|
| 216 |
+
**Expected behavior**
|
| 217 |
+
What you expected to happen.
|
| 218 |
+
|
| 219 |
+
**Environment:**
|
| 220 |
+
- OS: [e.g. Windows 11]
|
| 221 |
+
- Python version: [e.g. 3.10.5]
|
| 222 |
+
- VoiceForge version: [e.g. 2.0.0]
|
| 223 |
+
|
| 224 |
+
**Additional context**
|
| 225 |
+
Add any other context, logs, or screenshots.
|
| 226 |
+
```
|
| 227 |
+
|
| 228 |
+
## 💡 Feature Requests
|
| 229 |
+
|
| 230 |
+
### Feature Request Template
|
| 231 |
+
```markdown
|
| 232 |
+
**Problem Statement**
|
| 233 |
+
Describe the problem this feature would solve.
|
| 234 |
+
|
| 235 |
+
**Proposed Solution**
|
| 236 |
+
Describe your proposed solution.
|
| 237 |
+
|
| 238 |
+
**Alternatives Considered**
|
| 239 |
+
What alternatives have you considered?
|
| 240 |
+
|
| 241 |
+
**Additional Context**
|
| 242 |
+
Any other context, mockups, or examples.
|
| 243 |
+
```
|
| 244 |
+
|
| 245 |
+
## 📚 Documentation
|
| 246 |
+
|
| 247 |
+
### Documentation Standards
|
| 248 |
+
- Use clear, concise language
|
| 249 |
+
- Include code examples
|
| 250 |
+
- Update relevant docs when changing functionality
|
| 251 |
+
- Add inline comments for complex logic
|
| 252 |
+
|
| 253 |
+
### Documentation Locations
|
| 254 |
+
- `README.md` - Project overview
|
| 255 |
+
- `docs/API.md` - API reference
|
| 256 |
+
- `docs/TESTING.md` - Testing guide
|
| 257 |
+
- `docs/ARCHITECTURE.md` - System architecture
|
| 258 |
+
- Inline docstrings - Function/class documentation
|
| 259 |
+
|
| 260 |
+
## 🏆 Recognition
|
| 261 |
+
|
| 262 |
+
Contributors will be:
|
| 263 |
+
- Listed in `CONTRIBUTORS.md`
|
| 264 |
+
- Mentioned in release notes
|
| 265 |
+
- Credited in the README
|
| 266 |
+
|
| 267 |
+
## 📜 License
|
| 268 |
+
|
| 269 |
+
By contributing, you agree that your contributions will be licensed under the MIT License.
|
| 270 |
+
|
| 271 |
+
## ❓ Questions?
|
| 272 |
+
|
| 273 |
+
- Open an issue for questions
|
| 274 |
+
- Join our discussions
|
| 275 |
+
- Email: your@email.com
|
| 276 |
+
|
| 277 |
+
---
|
| 278 |
+
|
| 279 |
+
**Thank you for making VoiceForge better!** 🎉
|
README.md
ADDED
|
@@ -0,0 +1,360 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🎙️ VoiceForge - Enterprise Speech AI Platform
|
| 2 |
+
|
| 3 |
+

|
| 4 |
+
|
| 5 |
+
[](CHANGELOG.md)
|
| 6 |
+
[](docs/PROJECT_SUMMARY.md)
|
| 7 |
+
[](https://www.python.org/)
|
| 8 |
+
[](https://fastapi.tiangolo.com/)
|
| 9 |
+
[](deploy/k8s/)
|
| 10 |
+
[](deploy/terraform/)
|
| 11 |
+
|
| 12 |
+
**VoiceForge V4.0** is an **enterprise-grade, cloud-native** Speech AI platform with complete infrastructure automation, security hardening, and observability. Features local-first Whisper STT, Edge TTS, voice cloning, sign language recognition, and a Flutter mobile companion app.
|
| 13 |
+
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
## 🚀 V4.0 - Enterprise Edition
|
| 17 |
+
|
| 18 |
+
### 🆕 What's New
|
| 19 |
+
- ☸️ **Kubernetes Native**: Production-ready K8s manifests + Helm charts
|
| 20 |
+
- 🏗️ **Infrastructure as Code**: Full Terraform setup for AWS (VPC, EKS, Redis)
|
| 21 |
+
- 📊 **Observability Stack**: Grafana dashboards + Prometheus monitoring with alerts
|
| 22 |
+
- 🔒 **Security Hardening**: Rate limiting, data encryption (Fernet), security headers, penetration tests
|
| 23 |
+
- 📱 **Mobile App**: Flutter companion with offline support, localization (en/es), accessibility
|
| 24 |
+
- 🤖 **Sign Language**: Real-time ASL recognition + avatar generation
|
| 25 |
+
- 🚦 **CI/CD**: GitHub Actions for automated testing
|
| 26 |
+
|
| 27 |
+
---
|
| 28 |
+
|
| 29 |
+
## 📦 Complete Feature Set
|
| 30 |
+
|
| 31 |
+
### 🎧 Speech-to-Text (STT)
|
| 32 |
+
- ✅ Hybrid Local/Cloud (Whisper + Google Cloud)
|
| 33 |
+
- ✅ Real-time WebSocket streaming
|
| 34 |
+
- ✅ Speaker diarization (pyannote)
|
| 35 |
+
- ✅ Word-level timestamps
|
| 36 |
+
- ✅ 50+ languages
|
| 37 |
+
|
| 38 |
+
### 🗣️ Text-to-Speech (TTS)
|
| 39 |
+
- ✅ 300+ neural voices (Edge TTS)
|
| 40 |
+
- ✅ Voice cloning (Coqui XTTS v2)
|
| 41 |
+
- ✅ Speed/pitch customization
|
| 42 |
+
- ✅ Streaming playback
|
| 43 |
+
|
| 44 |
+
### 🤖 AI Features
|
| 45 |
+
- ✅ Emotion & sentiment analysis
|
| 46 |
+
- ✅ Meeting minutes generation
|
| 47 |
+
- ✅ Keyword extraction & summarization
|
| 48 |
+
- ✅ Audio translation (100+ languages)
|
| 49 |
+
- ✅ Sign language recognition + generation
|
| 50 |
+
|
| 51 |
+
### 🎨 Audio Studio
|
| 52 |
+
- ✅ Trim, merge, convert audio
|
| 53 |
+
- ✅ Batch processing
|
| 54 |
+
- ✅ Export: PDF, SRT, VTT, TXT
|
| 55 |
+
|
| 56 |
+
### 📱 Mobile App (Flutter)
|
| 57 |
+
- ✅ Cross-platform (Android/iOS)
|
| 58 |
+
- ✅ Offline transcription caching (Hive)
|
| 59 |
+
- ✅ Real-time recording & synthesis
|
| 60 |
+
- ✅ i18n (English/Spanish)
|
| 61 |
+
- ✅ High contrast accessibility mode
|
| 62 |
+
|
| 63 |
+
---
|
| 64 |
+
|
| 65 |
+
## 🏗️ Enterprise Infrastructure
|
| 66 |
+
|
| 67 |
+
### ☸️ Kubernetes Deployment
|
| 68 |
+
```bash
|
| 69 |
+
# Deploy to cluster
|
| 70 |
+
kubectl apply -f deploy/k8s/namespace.yaml
|
| 71 |
+
kubectl apply -f deploy/k8s/backend.yaml
|
| 72 |
+
kubectl apply -f deploy/k8s/ingress.yaml
|
| 73 |
+
|
| 74 |
+
# Or use Helm
|
| 75 |
+
helm install voiceforge deploy/helm/voiceforge -f values.yaml
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
### 🔧 Terraform Provisioning
|
| 79 |
+
```bash
|
| 80 |
+
cd deploy/terraform
|
| 81 |
+
terraform init
|
| 82 |
+
terraform plan
|
| 83 |
+
terraform apply # Creates: VPC, EKS, ElastiCache Redis
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
**Provisions:**
|
| 87 |
+
- VPC with public/private subnets + NAT
|
| 88 |
+
- EKS cluster with auto-scaling node groups
|
| 89 |
+
- ElastiCache Redis cluster
|
| 90 |
+
- Security groups + IAM roles
|
| 91 |
+
|
| 92 |
+
### 📊 Monitoring & Alerting
|
| 93 |
+
```bash
|
| 94 |
+
# Import Grafana dashboard
|
| 95 |
+
kubectl apply -f deploy/monitoring/prometheus-rules.yaml
|
| 96 |
+
# Dashboard JSON: deploy/monitoring/grafana-dashboard.json
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
**Metrics tracked:**
|
| 100 |
+
- Request rate, latency (p95/p99)
|
| 101 |
+
- Error rates (5xx)
|
| 102 |
+
- CPU/Memory usage
|
| 103 |
+
- Pod health & restarts
|
| 104 |
+
|
| 105 |
+
**Alerts:**
|
| 106 |
+
- High error rate (>5%)
|
| 107 |
+
- High latency (>2s p95)
|
| 108 |
+
- Resource exhaustion
|
| 109 |
+
|
| 110 |
+
---
|
| 111 |
+
|
| 112 |
+
## 🔒 Security Features
|
| 113 |
+
|
| 114 |
+
| Feature | Implementation | Status |
|
| 115 |
+
|---------|----------------|--------|
|
| 116 |
+
| **Rate Limiting** | slowapi + Redis | ✅ 5/min (auth), 10/min (AI) |
|
| 117 |
+
| **Data Encryption** | Fernet (AES) at-rest | ✅ User PII + transcripts |
|
| 118 |
+
| **Security Headers** | HSTS, CSP, X-Frame-Options | ✅ All responses |
|
| 119 |
+
| **Authentication** | JWT + API keys | ✅ Token refresh |
|
| 120 |
+
| **Penetration Tests** | OWASP Top 10 scanner | ✅ Automated |
|
| 121 |
+
|
| 122 |
+
Run security tests:
|
| 123 |
+
```bash
|
| 124 |
+
python backend/tests/security/security_tests.py --base-url http://localhost:8000
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
---
|
| 128 |
+
|
| 129 |
+
## 🚀 Quick Start
|
| 130 |
+
|
| 131 |
+
### 1. Docker Compose (Fastest)
|
| 132 |
+
```bash
|
| 133 |
+
git clone https://github.com/yourusername/voiceforge
|
| 134 |
+
cd voiceforge
|
| 135 |
+
docker-compose up -d
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
### 2. Local Development
|
| 139 |
+
```bash
|
| 140 |
+
# Backend
|
| 141 |
+
cd backend
|
| 142 |
+
pip install -r requirements.txt
|
| 143 |
+
uvicorn app.main:app --reload
|
| 144 |
+
|
| 145 |
+
# Frontend
|
| 146 |
+
cd frontend
|
| 147 |
+
pip install -r requirements.txt
|
| 148 |
+
streamlit run streamlit_app.py
|
| 149 |
+
|
| 150 |
+
# Mobile
|
| 151 |
+
cd mobile
|
| 152 |
+
flutter pub get
|
| 153 |
+
flutter run
|
| 154 |
+
```
|
| 155 |
+
|
| 156 |
+
### 3. Kubernetes
|
| 157 |
+
```bash
|
| 158 |
+
helm install voiceforge ./deploy/helm/voiceforge \
|
| 159 |
+
--set redis.enabled=true \
|
| 160 |
+
--set ingress.hosts[0].host=api.yourdomain.com
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
**Access:**
|
| 164 |
+
- Frontend: http://localhost:8501
|
| 165 |
+
- API Docs: http://localhost:8000/docs
|
| 166 |
+
- Metrics: http://localhost:8000/metrics
|
| 167 |
+
|
| 168 |
+
---
|
| 169 |
+
|
| 170 |
+
## 🛠️ Tech Stack
|
| 171 |
+
|
| 172 |
+
### Backend
|
| 173 |
+
- **FastAPI**: Async REST API
|
| 174 |
+
- **SQLAlchemy**: ORM + migrations
|
| 175 |
+
- **Celery**: Background tasks
|
| 176 |
+
- **Redis**: Cache + rate limiting
|
| 177 |
+
- **Prometheus**: Metrics
|
| 178 |
+
|
| 179 |
+
### AI/ML
|
| 180 |
+
- **faster-whisper**: Local STT
|
| 181 |
+
- **edge-tts**: Neural TTS (free)
|
| 182 |
+
- **Coqui TTS**: Voice cloning
|
| 183 |
+
- **MediaPipe**: Sign language recognition
|
| 184 |
+
- **pyannote**: Speaker diarization
|
| 185 |
+
|
| 186 |
+
### Frontend
|
| 187 |
+
- **Streamlit**: Web UI
|
| 188 |
+
- **Flutter**: Mobile app (Riverpod state)
|
| 189 |
+
|
| 190 |
+
### DevOps
|
| 191 |
+
- **Docker**: Multi-stage builds
|
| 192 |
+
- **Kubernetes**: Helm charts + HPA
|
| 193 |
+
- **Terraform**: AWS infrastructure
|
| 194 |
+
- **GitHub Actions**: CI/CD pipeline
|
| 195 |
+
- **Grafana**: Dashboards
|
| 196 |
+
|
| 197 |
+
---
|
| 198 |
+
|
| 199 |
+
## 📁 Project Structure
|
| 200 |
+
|
| 201 |
+
```
|
| 202 |
+
voiceforge/
|
| 203 |
+
├── backend/ # FastAPI microservices
|
| 204 |
+
│ ├── app/
|
| 205 |
+
│ │ ├── api/routes/ # REST endpoints
|
| 206 |
+
│ │ ├── core/ # Config, security, limiter
|
| 207 |
+
│ │ ├── models/ # SQLAlchemy models
|
| 208 |
+
│ │ ├── services/ # Business logic (STT, TTS, NLP, etc.)
|
| 209 |
+
│ │ └── workers/ # Celery tasks
|
| 210 |
+
│ ├── tests/ # Unit, integration, security tests
|
| 211 |
+
│ │ ├── unit/ # Service tests
|
| 212 |
+
│ │ ├── integration/ # API tests
|
| 213 |
+
│ │ ├── quality/ # Code analyzers
|
| 214 |
+
│ │ └── security/ # OWASP scanners
|
| 215 |
+
│ └── requirements.txt
|
| 216 |
+
├── frontend/ # Streamlit web app
|
| 217 |
+
│ ├── pages/ # Multi-page UI
|
| 218 |
+
│ └── components/ # Reusable widgets
|
| 219 |
+
├── mobile/ # Flutter companion app
|
| 220 |
+
│ ├── lib/
|
| 221 |
+
│ │ ├── features/ # Auth, Transcription, Synthesis, Settings
|
| 222 |
+
│ │ ├── core/ # Theme, providers
|
| 223 |
+
│ │ └── l10n/ # Localization (en, es)
|
| 224 |
+
│ └── pubspec.yaml
|
| 225 |
+
├── deploy/ # Infrastructure
|
| 226 |
+
│ ├── k8s/ # Kubernetes manifests
|
| 227 |
+
│ ├── helm/ # Helm charts
|
| 228 |
+
│ ├── terraform/ # AWS IaC (VPC, EKS, Redis)
|
| 229 |
+
│ ├── monitoring/ # Grafana + Prometheus
|
| 230 |
+
│ └── docker/ # Compose files
|
| 231 |
+
├── docs/ # Documentation
|
| 232 |
+
│ ├── ARCHITECTURE.md # System design
|
| 233 |
+
│ ├── DEPLOYMENT_GUIDE.md
|
| 234 |
+
│ ├── WALKTHROUGH.md # Feature tour
|
| 235 |
+
│ └── adr/ # Architecture decisions
|
| 236 |
+
└── .github/workflows/ # CI/CD pipelines
|
| 237 |
+
```
|
| 238 |
+
|
| 239 |
+
---
|
| 240 |
+
|
| 241 |
+
## 🧪 Testing
|
| 242 |
+
|
| 243 |
+
```bash
|
| 244 |
+
# Run all tests (unit, integration, quality, security)
|
| 245 |
+
cd backend
|
| 246 |
+
python tests/run_all_tests.py
|
| 247 |
+
|
| 248 |
+
# Individual test suites
|
| 249 |
+
pytest tests/unit/ # Unit tests
|
| 250 |
+
pytest tests/integration/ # API tests
|
| 251 |
+
python tests/security/security_tests.py # Penetration tests
|
| 252 |
+
|
| 253 |
+
# Mobile tests
|
| 254 |
+
cd mobile
|
| 255 |
+
flutter test
|
| 256 |
+
```
|
| 257 |
+
|
| 258 |
+
**Coverage Goal: >80%**
|
| 259 |
+
|
| 260 |
+
---
|
| 261 |
+
|
| 262 |
+
## 🌍 Supported Languages
|
| 263 |
+
|
| 264 |
+
**STT + TTS**: English, Spanish, French, German, Japanese, Korean, Chinese, Hindi, Arabic, Portuguese, Italian, Russian, Dutch, Turkish, Polish, and 35+ more.
|
| 265 |
+
|
| 266 |
+
**Voice Cloning**: 16 languages, including the major languages listed above.
|
| 267 |
+
|
| 268 |
+
---
|
| 269 |
+
|
| 270 |
+
## 📊 Performance Benchmarks
|
| 271 |
+
|
| 272 |
+
| Operation | Time | Metric |
|
| 273 |
+
|-----------|------|--------|
|
| 274 |
+
| STT (30s audio) | 3.7s | 0.12x RTF |
|
| 275 |
+
| TTS (80 words) | 1.1s | TTFB |
|
| 276 |
+
| Voice Clone | 2.3s | 3s sample |
|
| 277 |
+
| Sign Recognition | 60 FPS | Real-time |
|
| 278 |
+
|
| 279 |
+
**Cost Savings**: 100% (local mode vs cloud APIs)
|
| 280 |
+
|
| 281 |
+
---
|
| 282 |
+
|
| 283 |
+
## 🚢 Deployment Scenarios
|
| 284 |
+
|
| 285 |
+
### Development
|
| 286 |
+
```bash
|
| 287 |
+
docker-compose up
|
| 288 |
+
```
|
| 289 |
+
|
| 290 |
+
### Staging (Cloud VM)
|
| 291 |
+
```bash
|
| 292 |
+
docker-compose -f docker-compose.prod.yml up -d
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
### Production (Kubernetes)
|
| 296 |
+
```bash
|
| 297 |
+
# Option 1: Direct manifests
|
| 298 |
+
kubectl apply -f deploy/k8s/
|
| 299 |
+
|
| 300 |
+
# Option 2: Helm chart
|
| 301 |
+
helm upgrade --install voiceforge deploy/helm/voiceforge \
|
| 302 |
+
--set replicaCount=3 \
|
| 303 |
+
--set autoscaling.enabled=true \
|
| 304 |
+
--set redis.enabled=true
|
| 305 |
+
```
|
| 306 |
+
|
| 307 |
+
### Cloud Provisioning
|
| 308 |
+
```bash
|
| 309 |
+
# AWS with Terraform
|
| 310 |
+
cd deploy/terraform
|
| 311 |
+
terraform apply -var="environment=production"
|
| 312 |
+
|
| 313 |
+
# GCP or Azure: Adapt Terraform modules
|
| 314 |
+
```
|
| 315 |
+
|
| 316 |
+
---
|
| 317 |
+
|
| 318 |
+
## 📚 Documentation
|
| 319 |
+
|
| 320 |
+
- [📖 Architecture](docs/ARCHITECTURE.md)
|
| 321 |
+
- [🚀 Deployment Guide](docs/DEPLOYMENT_GUIDE.md)
|
| 322 |
+
- [🔍 API Reference](http://localhost:8000/docs)
|
| 323 |
+
- [📱 Mobile Guide](mobile/README.md)
|
| 324 |
+
- [🔐 Security Policy](docs/SECURITY.md)
|
| 325 |
+
- [🎓 Interview Prep](docs/INTERVIEW_PREP.md)
|
| 326 |
+
|
| 327 |
+
---
|
| 328 |
+
|
| 329 |
+
## 🤝 Contributing
|
| 330 |
+
|
| 331 |
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
| 332 |
+
|
| 333 |
+
---
|
| 334 |
+
|
| 335 |
+
## 📝 License
|
| 336 |
+
|
| 337 |
+
MIT License - see [LICENSE](LICENSE) for details.
|
| 338 |
+
|
| 339 |
+
---
|
| 340 |
+
|
| 341 |
+
## 💡 Highlights for Portfolio/Interviews
|
| 342 |
+
|
| 343 |
+
This project demonstrates:
|
| 344 |
+
1. **Full-Stack Development**: Backend (FastAPI), Frontend (Streamlit), Mobile (Flutter)
|
| 345 |
+
2. **AI/ML Integration**: Local model deployment, hybrid cloud architecture
|
| 346 |
+
3. **DevOps Excellence**: Docker, K8s, Helm, Terraform, CI/CD
|
| 347 |
+
4. **Security**: Encryption, rate limiting, OWASP testing
|
| 348 |
+
5. **Observability**: Prometheus metrics, Grafana dashboards, alerting
|
| 349 |
+
6. **Scalability**: HPA, async workers, Redis caching
|
| 350 |
+
7. **Accessibility**: i18n, high contrast, screen readers
|
| 351 |
+
|
| 352 |
+
---
|
| 353 |
+
|
| 354 |
+
<div align="center">
|
| 355 |
+
|
| 356 |
+
**Built with ❤️ to showcase enterprise-level AI engineering**
|
| 357 |
+
|
| 358 |
+
[⭐ Star this repo](https://github.com/yourusername/voiceforge) • [📧 Contact](mailto:your@email.com)
|
| 359 |
+
|
| 360 |
+
</div>
|
backend/.flake8
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[flake8]
|
| 2 |
+
max-line-length = 120
|
| 3 |
+
extend-ignore = E203
|
| 4 |
+
exclude = .git,__pycache__,docs/source/conf.py,old,build,dist,venv
|
backend/Dockerfile
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------- Build stage ----------
FROM python:3.10-slim AS builder

WORKDIR /app

# Don't write .pyc files; flush stdout/stderr immediately (container logging).
# (key=value form: the legacy "ENV KEY value" syntax is deprecated.)
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Build-time system dependencies:
#   gcc    - compile any source-only Python packages
#   ffmpeg - required by the audio-processing libraries
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Pre-build wheels so the final image needs no compiler toolchain.
COPY requirements.txt .
RUN pip wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt


# ---------- Final stage ----------
FROM python:3.10-slim

WORKDIR /app

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Runtime-only system dependency (audio decoding/encoding).
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Copy pre-built wheels from the builder.
COPY --from=builder /app/wheels /wheels
COPY --from=builder /app/requirements.txt .

# Install from wheels, then drop them to keep the image small.
# Fix: pip's flag is --no-cache-dir; the original --no-cache is not a valid
# pip option and would make the build fail.
RUN pip install --no-cache-dir /wheels/* && rm -rf /wheels

# Application code.
COPY . .

# Run as an unprivileged user.
RUN addgroup --system app && adduser --system --group app
USER app

EXPOSE 8000

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
backend/app/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceForge Backend Package
|
| 3 |
+
"""
|
backend/app/api/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceForge API Package
|
| 3 |
+
"""
|
backend/app/api/routes/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceForge API Routes Package
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .stt import router as stt_router
|
| 6 |
+
from .tts import router as tts_router
|
| 7 |
+
from .health import router as health_router
|
| 8 |
+
from .transcripts import router as transcripts_router
|
| 9 |
+
from .ws import router as ws_router
|
| 10 |
+
from .translation import router as translation_router
|
| 11 |
+
from .batch import router as batch_router
|
| 12 |
+
from .analysis import router as analysis_router
|
| 13 |
+
from .audio import router as audio_router
|
| 14 |
+
from .cloning import router as cloning_router
|
| 15 |
+
from .sign import router as sign_router
|
| 16 |
+
from .auth import router as auth_router
|
| 17 |
+
|
| 18 |
+
__all__ = [
|
| 19 |
+
"stt_router",
|
| 20 |
+
"tts_router",
|
| 21 |
+
"health_router",
|
| 22 |
+
"transcripts_router",
|
| 23 |
+
"ws_router",
|
| 24 |
+
"translation_router",
|
| 25 |
+
"batch_router",
|
| 26 |
+
"analysis_router",
|
| 27 |
+
"audio_router",
|
| 28 |
+
"cloning_router",
|
| 29 |
+
"sign_router",
|
| 30 |
+
"auth_router",
|
| 31 |
+
]
|
backend/app/api/routes/analysis.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Analysis API Routes
|
| 3 |
+
Endpoints for Emotion and Sentiment Analysis
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
|
| 7 |
+
from typing import Dict, Any
|
| 8 |
+
import logging
|
| 9 |
+
import os
|
| 10 |
+
import shutil
|
| 11 |
+
import tempfile
|
| 12 |
+
|
| 13 |
+
from app.services.emotion_service import get_emotion_service
|
| 14 |
+
from app.services.nlp_service import get_nlp_service
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
router = APIRouter(prefix="/analysis", tags=["Analysis"])
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@router.post("/emotion/audio")
|
| 21 |
+
async def analyze_audio_emotion(
|
| 22 |
+
file: UploadFile = File(..., description="Audio file to analyze"),
|
| 23 |
+
):
|
| 24 |
+
"""
|
| 25 |
+
Analyze emotions in an audio file using Wav2Vec2.
|
| 26 |
+
Returns dominant emotion and probability distribution.
|
| 27 |
+
"""
|
| 28 |
+
service = get_emotion_service()
|
| 29 |
+
|
| 30 |
+
# Save to temp file
|
| 31 |
+
suffix = os.path.splitext(file.filename)[1] or ".wav"
|
| 32 |
+
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
| 33 |
+
shutil.copyfileobj(file.file, tmp)
|
| 34 |
+
tmp_path = tmp.name
|
| 35 |
+
|
| 36 |
+
try:
|
| 37 |
+
result = service.analyze_audio(tmp_path)
|
| 38 |
+
return result
|
| 39 |
+
except Exception as e:
|
| 40 |
+
logger.error(f"Emotion analysis failed: {e}")
|
| 41 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 42 |
+
finally:
|
| 43 |
+
try:
|
| 44 |
+
os.unlink(tmp_path)
|
| 45 |
+
except:
|
| 46 |
+
pass
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@router.post("/sentiment/text")
|
| 50 |
+
async def analyze_text_sentiment(
|
| 51 |
+
text: str = Form(..., description="Text to analyze"),
|
| 52 |
+
):
|
| 53 |
+
"""
|
| 54 |
+
Analyze text sentiment (polarity and subjectivity).
|
| 55 |
+
"""
|
| 56 |
+
nlp_service = get_nlp_service()
|
| 57 |
+
try:
|
| 58 |
+
return nlp_service.analyze_sentiment(text)
|
| 59 |
+
except Exception as e:
|
| 60 |
+
raise HTTPException(status_code=500, detail=str(e))
|
backend/app/api/routes/audio.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio Editing API Routes
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
|
| 6 |
+
from fastapi.responses import FileResponse
|
| 7 |
+
from typing import List, Optional
|
| 8 |
+
import os
|
| 9 |
+
import shutil
|
| 10 |
+
import tempfile
|
| 11 |
+
import uuid
|
| 12 |
+
|
| 13 |
+
from app.services.audio_service import get_audio_service, AudioService
|
| 14 |
+
|
| 15 |
+
router = APIRouter(prefix="/audio", tags=["Audio Studio"])
|
| 16 |
+
|
| 17 |
+
@router.post("/trim")
|
| 18 |
+
async def trim_audio(
|
| 19 |
+
file: UploadFile = File(..., description="Audio file"),
|
| 20 |
+
start_sec: float = Form(..., description="Start time in seconds"),
|
| 21 |
+
end_sec: float = Form(..., description="End time in seconds"),
|
| 22 |
+
service: AudioService = Depends(get_audio_service)
|
| 23 |
+
):
|
| 24 |
+
"""Trim an audio file"""
|
| 25 |
+
suffix = os.path.splitext(file.filename)[1] or ".mp3"
|
| 26 |
+
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
| 27 |
+
shutil.copyfileobj(file.file, tmp)
|
| 28 |
+
tmp_path = tmp.name
|
| 29 |
+
|
| 30 |
+
try:
|
| 31 |
+
output_path = tmp_path.replace(suffix, f"_trimmed{suffix}")
|
| 32 |
+
service.trim_audio(tmp_path, int(start_sec * 1000), int(end_sec * 1000), output_path)
|
| 33 |
+
|
| 34 |
+
return FileResponse(
|
| 35 |
+
output_path,
|
| 36 |
+
filename=f"trimmed_{file.filename}",
|
| 37 |
+
background=None # Let FastAPI handle cleanup? No, we need custom cleanup or use background task
|
| 38 |
+
)
|
| 39 |
+
except Exception as e:
|
| 40 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 41 |
+
# Note: Temp files might persist. In prod, use a cleanup task.
|
| 42 |
+
|
| 43 |
+
@router.post("/merge")
|
| 44 |
+
async def merge_audio(
|
| 45 |
+
files: List[UploadFile] = File(..., description="Files to merge"),
|
| 46 |
+
format: str = Form("mp3", description="Output format"),
|
| 47 |
+
service: AudioService = Depends(get_audio_service)
|
| 48 |
+
):
|
| 49 |
+
"""Merge multiple audio files"""
|
| 50 |
+
temp_files = []
|
| 51 |
+
try:
|
| 52 |
+
for file in files:
|
| 53 |
+
suffix = os.path.splitext(file.filename)[1] or ".mp3"
|
| 54 |
+
tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
|
| 55 |
+
content = await file.read()
|
| 56 |
+
tmp.write(content)
|
| 57 |
+
tmp.close()
|
| 58 |
+
temp_files.append(tmp.name)
|
| 59 |
+
|
| 60 |
+
output_filename = f"merged_{uuid.uuid4()}.{format}"
|
| 61 |
+
output_path = os.path.join(tempfile.gettempdir(), output_filename)
|
| 62 |
+
|
| 63 |
+
service.merge_audio(temp_files, output_path)
|
| 64 |
+
|
| 65 |
+
return FileResponse(output_path, filename=output_filename)
|
| 66 |
+
|
| 67 |
+
except Exception as e:
|
| 68 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 69 |
+
finally:
|
| 70 |
+
for p in temp_files:
|
| 71 |
+
try:
|
| 72 |
+
os.unlink(p)
|
| 73 |
+
except:
|
| 74 |
+
pass
|
| 75 |
+
|
| 76 |
+
@router.post("/convert")
|
| 77 |
+
async def convert_audio(
|
| 78 |
+
file: UploadFile = File(..., description="Audio file"),
|
| 79 |
+
target_format: str = Form(..., description="Target format (mp3, wav, flac, ogg)"),
|
| 80 |
+
service: AudioService = Depends(get_audio_service)
|
| 81 |
+
):
|
| 82 |
+
"""Convert audio format"""
|
| 83 |
+
suffix = os.path.splitext(file.filename)[1] or ".wav"
|
| 84 |
+
with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
|
| 85 |
+
shutil.copyfileobj(file.file, tmp)
|
| 86 |
+
tmp_path = tmp.name
|
| 87 |
+
|
| 88 |
+
try:
|
| 89 |
+
output_path = service.convert_format(tmp_path, target_format)
|
| 90 |
+
return FileResponse(
|
| 91 |
+
output_path,
|
| 92 |
+
filename=f"{os.path.splitext(file.filename)[0]}.{target_format}"
|
| 93 |
+
)
|
| 94 |
+
except Exception as e:
|
| 95 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 96 |
+
finally:
|
| 97 |
+
try:
|
| 98 |
+
os.unlink(tmp_path)
|
| 99 |
+
except:
|
| 100 |
+
pass
|
backend/app/api/routes/auth.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime, timedelta
|
| 2 |
+
from typing import List
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
import secrets
|
| 5 |
+
from fastapi import APIRouter, Depends, HTTPException, status
|
| 6 |
+
from fastapi.security import OAuth2PasswordRequestForm
|
| 7 |
+
from sqlalchemy.orm import Session
|
| 8 |
+
|
| 9 |
+
from ...core.security import (
|
| 10 |
+
create_access_token,
|
| 11 |
+
get_password_hash,
|
| 12 |
+
verify_password,
|
| 13 |
+
get_current_active_user,
|
| 14 |
+
ACCESS_TOKEN_EXPIRE_MINUTES
|
| 15 |
+
)
|
| 16 |
+
from ...models import get_db, User, ApiKey
|
| 17 |
+
from ...core.limiter import limiter
|
| 18 |
+
from fastapi import APIRouter, Depends, HTTPException, status, Request
|
| 19 |
+
|
| 20 |
+
router = APIRouter(prefix="/auth", tags=["Authentication"])
|
| 21 |
+
|
| 22 |
+
# --- Schemas ---
|
| 23 |
+
class Token(BaseModel):
    # JWT bearer token returned by POST /auth/login.
    access_token: str  # signed JWT
    token_type: str    # always "bearer" (see login endpoint)

class UserCreate(BaseModel):
    # Registration payload; the password is hashed before storage.
    email: str
    password: str
    # NOTE(review): annotation should be Optional[str] — `str = None` is not a
    # valid optional field under strict Pydantic validation; confirm the
    # Pydantic version in use before relying on this.
    full_name: str = None

class UserOut(BaseModel):
    # Public view of a user; deliberately excludes the password hash.
    id: int
    email: str
    full_name: str = None  # NOTE(review): same Optional[str] concern as UserCreate
    is_active: bool

    class Config:
        # Allows constructing the model from SQLAlchemy ORM objects.
        # NOTE(review): `orm_mode` is Pydantic v1 naming; v2 renamed it to
        # `from_attributes` — confirm against the installed Pydantic.
        orm_mode = True

class ApiKeyCreate(BaseModel):
    # Payload for creating a named API key.
    name: str

class ApiKeyOut(BaseModel):
    # API key returned to the caller; includes the raw key string.
    key: str
    name: str
    created_at: datetime

    class Config:
        orm_mode = True  # ORM construction; same Pydantic-version note as UserOut
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# --- Endpoints ---
|
| 54 |
+
|
| 55 |
+
@router.post("/register", response_model=UserOut)
|
| 56 |
+
@limiter.limit("5/minute")
|
| 57 |
+
async def register(request: Request, user_in: UserCreate, db: Session = Depends(get_db)):
|
| 58 |
+
"""Register a new user"""
|
| 59 |
+
existing_user = db.query(User).filter(User.email == user_in.email).first()
|
| 60 |
+
if existing_user:
|
| 61 |
+
raise HTTPException(status_code=400, detail="Email already registered")
|
| 62 |
+
|
| 63 |
+
hashed_password = get_password_hash(user_in.password)
|
| 64 |
+
new_user = User(
|
| 65 |
+
email=user_in.email,
|
| 66 |
+
hashed_password=hashed_password,
|
| 67 |
+
full_name=user_in.full_name
|
| 68 |
+
)
|
| 69 |
+
db.add(new_user)
|
| 70 |
+
db.commit()
|
| 71 |
+
db.refresh(new_user)
|
| 72 |
+
return new_user
|
| 73 |
+
|
| 74 |
+
@router.post("/login", response_model=Token)
|
| 75 |
+
@limiter.limit("5/minute")
|
| 76 |
+
async def login(request: Request, form_data: OAuth2PasswordRequestForm = Depends(), db: Session = Depends(get_db)):
|
| 77 |
+
"""Login to get access token"""
|
| 78 |
+
user = db.query(User).filter(User.email == form_data.username).first()
|
| 79 |
+
if not user or not verify_password(form_data.password, user.hashed_password):
|
| 80 |
+
raise HTTPException(
|
| 81 |
+
status_code=status.HTTP_401_UNAUTHORIZED,
|
| 82 |
+
detail="Incorrect email or password",
|
| 83 |
+
headers={"WWW-Authenticate": "Bearer"},
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
access_token_expires = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
|
| 87 |
+
access_token = create_access_token(
|
| 88 |
+
subject=user.id, expires_delta=access_token_expires
|
| 89 |
+
)
|
| 90 |
+
return {"access_token": access_token, "token_type": "bearer"}
|
| 91 |
+
|
| 92 |
+
@router.post("/api-keys", response_model=ApiKeyOut)
|
| 93 |
+
async def create_api_key(
|
| 94 |
+
key_in: ApiKeyCreate,
|
| 95 |
+
current_user: User = Depends(get_current_active_user),
|
| 96 |
+
db: Session = Depends(get_db)
|
| 97 |
+
):
|
| 98 |
+
"""Generate a new API key for the current user"""
|
| 99 |
+
# Generate secure 32-char key
|
| 100 |
+
raw_key = secrets.token_urlsafe(32)
|
| 101 |
+
api_key_str = f"vf_{raw_key}" # Prefix for identification
|
| 102 |
+
|
| 103 |
+
new_key = ApiKey(
|
| 104 |
+
key=api_key_str,
|
| 105 |
+
name=key_in.name,
|
| 106 |
+
user_id=current_user.id
|
| 107 |
+
)
|
| 108 |
+
db.add(new_key)
|
| 109 |
+
db.commit()
|
| 110 |
+
db.refresh(new_key)
|
| 111 |
+
return new_key
|
| 112 |
+
|
| 113 |
+
@router.get("/me", response_model=UserOut)
|
| 114 |
+
async def read_users_me(current_user: User = Depends(get_current_active_user)):
|
| 115 |
+
"""Get current user details"""
|
| 116 |
+
return current_user
|
backend/app/api/routes/batch.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Batch Processing API Routes
|
| 3 |
+
Endpoints for submitting and managing batch transcription jobs
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends, BackgroundTasks
|
| 7 |
+
from fastapi.responses import FileResponse
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
from typing import List, Optional, Dict, Any
|
| 10 |
+
import logging
|
| 11 |
+
import shutil
|
| 12 |
+
import os
|
| 13 |
+
import tempfile
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
from app.services.batch_service import get_batch_service
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
router = APIRouter(prefix="/batch", tags=["batch"])
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Request/Response Models
|
| 23 |
+
class BatchJobResponse(BaseModel):
    """Batch job response model.

    Mirrors the dict produced by BatchJob.to_dict() in batch_service.
    """
    job_id: str            # unique identifier of the batch job
    status: str            # current job state as reported by the batch service
    progress: float        # overall progress value reported by the batch service
    created_at: str        # creation timestamp (string-formatted by the service)
    total_files: int       # number of files submitted with the job
    completed_files: int   # files processed successfully so far
    failed_files: int      # files that failed processing
    has_zip: bool          # True once a downloadable results ZIP exists
    files: Optional[Dict[str, Any]] = None  # per-file details, when available
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Endpoints
|
| 37 |
+
@router.post("/transcribe", response_model=BatchJobResponse)
async def create_batch_job(
    background_tasks: BackgroundTasks,
    files: List[UploadFile] = File(..., description="Audio files to transcribe"),
    language: Optional[str] = Form(None, description="Language code (e.g., 'en', 'hi')"),
    output_format: str = Form("txt", description="Output format (txt, srt)"),
):
    """
    Submit a batch of audio files for transcription.

    1. Uploads multiple files
    2. Creates a batch job
    3. Starts processing in background

    Args:
        files: List of audio files
        language: Optional language code
        output_format: Output format (txt or srt)

    Returns:
        Created job details

    Raises:
        HTTPException: 400 for empty/oversized batches, 500 on failure.
    """
    if not files:
        raise HTTPException(status_code=400, detail="No files provided")

    if len(files) > 50:
        raise HTTPException(status_code=400, detail="Maximum 50 files per batch")

    # Initialized BEFORE the try block so the cleanup loop in the except
    # handler can never hit a NameError when an early step (e.g. the
    # service lookup) fails before any file was written.
    file_paths = {}

    try:
        service = get_batch_service()

        original_names = []
        for file in files:
            suffix = Path(file.filename).suffix or ".wav"
            # Named temp file that persists until we delete it explicitly.
            tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
            content = await file.read()
            tmp.write(content)
            tmp.close()

            file_paths[file.filename] = tmp.name
            original_names.append(file.filename)

        # Create job
        job = service.create_job(
            filenames=original_names,
            options={
                "language": language,
                "output_format": output_format,
            }
        )

        # NOTE: Batch orchestration intentionally stays on BackgroundTasks for
        # now: the Celery task in app.workers.tasks handles individual files,
        # whereas service.process_job owns whole-batch logic (zipping etc.).
        # Migrating fully to Celery would require refactoring batch_service
        # into per-file tasks.
        background_tasks.add_task(
            service.process_job,
            job_id=job.job_id,
            file_paths=file_paths,
        )

        return job.to_dict()

    except Exception as e:
        # Best-effort cleanup of any temp files created before the failure;
        # only OS-level errors are expected (file may already be gone).
        for path in file_paths.values():
            try:
                os.unlink(path)
            except OSError:
                pass
        logger.error(f"Batch job creation failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
@router.get("/jobs", response_model=List[BatchJobResponse])
async def list_jobs(limit: int = 10):
    """
    List recent batch jobs.

    Args:
        limit: Max number of jobs to return

    Returns:
        List of jobs
    """
    batch_service = get_batch_service()
    serialized = []
    for entry in batch_service.list_jobs(limit):
        serialized.append(entry.to_dict())
    return serialized
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
@router.get("/{job_id}", response_model=BatchJobResponse)
async def get_job_status(job_id: str):
    """
    Get status of a specific batch job.

    Args:
        job_id: Job ID

    Returns:
        Job details and progress

    Raises:
        HTTPException: 404 when the job is unknown.
    """
    found = get_batch_service().get_job(job_id)
    if not found:
        raise HTTPException(status_code=404, detail="Job not found")
    return found.to_dict()
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
@router.get("/{job_id}/download")
async def download_results(job_id: str):
    """
    Download batch job results as ZIP.

    Args:
        job_id: Job ID

    Returns:
        ZIP file download

    Raises:
        HTTPException: 404 when no results ZIP exists yet.
    """
    archive = get_batch_service().get_zip_path(job_id)

    if not archive:
        raise HTTPException(status_code=404, detail="Results not available (job may be processing or failed)")

    response = FileResponse(
        path=archive,
        filename=f"batch_{job_id}_results.zip",
        media_type="application/zip",
    )
    return response
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
@router.delete("/{job_id}")
async def delete_job(job_id: str):
    """
    Delete a batch job and cleanup files.

    Cancels the job first if it is still running, then removes its data.

    Args:
        job_id: Job ID

    Raises:
        HTTPException: 404 when the job is unknown.
    """
    batch_service = get_batch_service()

    # Stop in-flight processing before removing anything.
    batch_service.cancel_job(job_id)

    if not batch_service.delete_job(job_id):
        raise HTTPException(status_code=404, detail="Job not found")

    return {"status": "deleted", "job_id": job_id}
|
backend/app/api/routes/cloning.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Voice Cloning API Routes
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
|
| 6 |
+
from fastapi.responses import FileResponse
|
| 7 |
+
from typing import List, Optional
|
| 8 |
+
import os
|
| 9 |
+
import shutil
|
| 10 |
+
import tempfile
|
| 11 |
+
import uuid
|
| 12 |
+
|
| 13 |
+
from app.services.clone_service import get_clone_service, CloneService
|
| 14 |
+
|
| 15 |
+
router = APIRouter(prefix="/clone", tags=["Voice Cloning"])
|
| 16 |
+
|
| 17 |
+
@router.post("/synthesize")
async def clone_synthesize(
    text: str = Form(..., description="Text to speak"),
    language: str = Form("en", description="Language code (en, es, fr, de, etc.)"),
    files: List[UploadFile] = File(..., description="Reference audio samples (1-3 files, 3-10s each recommended)"),
    service: CloneService = Depends(get_clone_service)
):
    """
    Clone a voice from reference audio samples.

    Uses Coqui XTTS v2.
    WARNING: Heavy operation. May take 5-20 seconds depending on GPU.

    Raises:
        HTTPException: 400 without reference audio, 503 when the TTS
        library is missing, 500 on synthesis failure.
    """
    # Validation
    if not files:
        raise HTTPException(status_code=400, detail="At least one reference audio file is required")

    temp_files = []

    try:
        # Persist uploaded reference clips to disk for the cloning engine.
        for file in files:
            suffix = os.path.splitext(file.filename)[1] or ".wav"
            tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
            content = await file.read()
            tmp.write(content)
            tmp.close()
            temp_files.append(tmp.name)

        # Unique output path in the system temp dir.
        output_filename = f"cloned_{uuid.uuid4()}.wav"
        output_path = os.path.join(tempfile.gettempdir(), output_filename)

        # Synthesize
        service.clone_voice(
            text=text,
            speaker_wav_paths=temp_files,
            language=language,
            output_path=output_path
        )

        return FileResponse(
            output_path,
            filename="cloned_speech.wav",
            media_type="audio/wav"
        )

    except ImportError:
        raise HTTPException(status_code=503, detail="Voice Cloning service not available (TTS library missing)")
    except HTTPException:
        # Preserve deliberate HTTP errors instead of rewrapping them as 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    finally:
        # Cleanup input files; only OS errors are expected (file may be gone).
        for p in temp_files:
            try:
                os.unlink(p)
            except OSError:
                pass
        # Note: Output file cleanup needs management in prod (background task or stream)
|
| 78 |
+
|
| 79 |
+
@router.get("/languages")
def get_languages(service: CloneService = Depends(get_clone_service)):
    """Return the language codes supported by the voice-cloning engine."""
    return {"languages": service.get_supported_languages()}
|
backend/app/api/routes/health.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Health Check Router
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from fastapi import APIRouter
|
| 6 |
+
|
| 7 |
+
router = APIRouter(prefix="/health", tags=["Health"])
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@router.get("")
@router.get("/")
async def health_check():
    """Basic health check endpoint.

    Registered on both "" and "/" so it answers with or without a
    trailing slash.
    """
    payload = dict(
        status="healthy",
        service="voiceforge-api",
        version="1.0.0",
    )
    return payload
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@router.get("/ready")
async def readiness_check():
    """Readiness check - verifies all dependencies are available"""
    # TODO: Check database, Redis, Google Cloud connectivity
    # NOTE(review): checks are currently hard-coded to "ok" placeholders.
    dependency_checks = {name: "ok" for name in ("database", "redis", "google_cloud")}
    return {"status": "ready", "checks": dependency_checks}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@router.get("/memory")
async def memory_status():
    """Get current memory usage and loaded models.

    Reports process memory plus, per loaded Whisper model, how long it
    has sat idle since its last use.
    """
    from ...services.whisper_stt_service import (
        _whisper_models,
        _model_last_used,
        get_memory_usage_mb
    )
    import time

    now = time.time()
    detail = {}

    for model_name in _whisper_models.keys():
        stamp = _model_last_used.get(model_name, 0)
        idle = now - stamp if stamp else 0
        detail[model_name] = {
            "loaded": True,
            "idle_seconds": round(idle, 1),
        }

    return {
        "memory_mb": round(get_memory_usage_mb(), 1),
        "loaded_models": list(_whisper_models.keys()),
        "models_detail": detail,
    }
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@router.post("/memory/cleanup")
async def cleanup_memory():
    """Unload idle models to free memory.

    Returns memory usage before/after so callers can see the effect.
    """
    from ...services.whisper_stt_service import cleanup_idle_models, get_memory_usage_mb

    mem_start = get_memory_usage_mb()
    cleanup_idle_models()
    mem_end = get_memory_usage_mb()

    return {
        "memory_before_mb": round(mem_start, 1),
        "memory_after_mb": round(mem_end, 1),
        "freed_mb": round(mem_start - mem_end, 1),
    }
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@router.post("/memory/unload-all")
async def unload_all():
    """Unload ALL models to free maximum memory.

    Returns which models were unloaded plus before/after memory figures.
    """
    from ...services.whisper_stt_service import unload_all_models, get_memory_usage_mb

    mem_start = get_memory_usage_mb()
    released = unload_all_models()
    mem_end = get_memory_usage_mb()

    return {
        "unloaded_models": released,
        "memory_before_mb": round(mem_start, 1),
        "memory_after_mb": round(mem_end, 1),
        "freed_mb": round(mem_start - mem_end, 1),
    }
|
backend/app/api/routes/sign.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sign Language API Routes
|
| 3 |
+
Provides WebSocket and REST endpoints for ASL recognition.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, UploadFile, File, HTTPException
|
| 7 |
+
from fastapi.responses import JSONResponse
|
| 8 |
+
import numpy as np
|
| 9 |
+
import base64
|
| 10 |
+
import cv2
|
| 11 |
+
import logging
|
| 12 |
+
from typing import List
|
| 13 |
+
|
| 14 |
+
from ...services.sign_recognition_service import get_sign_service, SignPrediction
|
| 15 |
+
from ...services.sign_avatar_service import get_avatar_service
|
| 16 |
+
from pydantic import BaseModel
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
router = APIRouter(prefix="/sign", tags=["Sign Language"])
|
| 21 |
+
|
| 22 |
+
class TextToSignRequest(BaseModel):
    """Request body for the text-to-sign animation endpoint."""
    text: str  # text to convert into a finger-spelling gloss sequence
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@router.get("/health")
async def sign_health():
    """Check if sign recognition service is available.

    Instantiating the service is the availability probe; any failure is
    reported as an error payload rather than an HTTP error.
    """
    try:
        get_sign_service()
    except Exception as e:
        return {"status": "error", "message": str(e)}
    return {"status": "ready", "service": "SignRecognitionService"}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@router.post("/recognize")
async def recognize_sign(file: UploadFile = File(..., description="Image of hand sign")):
    """
    Recognize ASL letter from a single image.

    Upload an image containing a hand sign to get the predicted letter.

    Raises:
        HTTPException: 400 for undecodable images, 500 on recognition failure.
    """
    try:
        # Read image
        contents = await file.read()
        nparr = np.frombuffer(contents, np.uint8)
        image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

        if image is None:
            raise HTTPException(status_code=400, detail="Invalid image file")

        # Get predictions
        service = get_sign_service()
        predictions = service.process_frame(image)

        if not predictions:
            return JSONResponse({
                "success": True,
                "predictions": [],
                "message": "No hands detected in image"
            })

        return JSONResponse({
            "success": True,
            "predictions": [
                {
                    "letter": p.letter,
                    "confidence": p.confidence
                }
                for p in predictions
            ]
        })

    except HTTPException:
        # Without this, the deliberate 400 above would be caught by the
        # generic handler below and rewrapped as a 500.
        raise
    except Exception as e:
        logger.error(f"Sign recognition error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@router.websocket("/live")
async def sign_websocket(websocket: WebSocket):
    """
    WebSocket endpoint for real-time sign language recognition.

    Client sends base64-encoded JPEG frames, server responds with predictions.

    Protocol:
    - Client sends: {"frame": "<base64 jpeg>"}
    - Server sends: {"predictions": [{"letter": "A", "confidence": 0.8}]}

    Malformed messages (missing field, bad base64, undecodable image) get an
    {"error": ...} reply and the loop continues; only unexpected server-side
    failures close the connection.
    """
    await websocket.accept()
    # Recognizer is resolved once per connection and reused for every frame.
    service = get_sign_service()

    logger.info("Sign language WebSocket connected")

    try:
        while True:
            # Receive frame from client
            data = await websocket.receive_json()

            if "frame" not in data:
                await websocket.send_json({"error": "Missing 'frame' field"})
                continue

            # Decode base64 image; decode failures are reported to the client
            # instead of killing the connection.
            try:
                frame_data = base64.b64decode(data["frame"])
                nparr = np.frombuffer(frame_data, np.uint8)
                frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

                if frame is None:
                    await websocket.send_json({"error": "Invalid frame data"})
                    continue

            except Exception as e:
                await websocket.send_json({"error": f"Frame decode error: {e}"})
                continue

            # Process frame
            predictions = service.process_frame(frame)

            # Send results (confidence rounded for compact payloads)
            await websocket.send_json({
                "predictions": [
                    {
                        "letter": p.letter,
                        "confidence": round(p.confidence, 2)
                    }
                    for p in predictions
                ]
            })

    except WebSocketDisconnect:
        logger.info("Sign language WebSocket disconnected")
    except Exception as e:
        # 1011 = internal error per the WebSocket close-code registry.
        logger.error(f"WebSocket error: {e}")
        await websocket.close(code=1011, reason=str(e))
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
@router.get("/alphabet")
async def get_alphabet():
    """Get list of supported ASL letters."""
    supported = list("ABCDILUVWY5")  # static (non-motion) signs implemented so far
    return {
        "supported_letters": supported,
        "note": "J and Z require motion tracking (coming soon)"
    }
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
@router.post("/animate")
async def animate_text(request: TextToSignRequest):
    """
    Convert text to sign language animation sequence (Finger Spelling).

    Raises:
        HTTPException: 500 when gloss generation fails.
    """
    try:
        avatar = get_avatar_service()
        glosses = avatar.text_to_glosses(request.text)
        return {
            "success": True,
            "sequence": glosses,
            "count": len(glosses),
        }
    except Exception as e:
        logger.error(f"Animation error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
backend/app/api/routes/stt.py
ADDED
|
@@ -0,0 +1,489 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Speech-to-Text API Router
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from typing import Optional, List
|
| 8 |
+
|
| 9 |
+
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends, Request
|
| 10 |
+
from fastapi.responses import JSONResponse
|
| 11 |
+
|
| 12 |
+
from ...core.limiter import limiter
|
| 13 |
+
|
| 14 |
+
from ...services.stt_service import get_stt_service, STTService
|
| 15 |
+
from ...services.file_service import get_file_service, FileService
|
| 16 |
+
from ...schemas.stt import (
|
| 17 |
+
TranscriptionResponse,
|
| 18 |
+
TranscriptionRequest,
|
| 19 |
+
LanguageInfo,
|
| 20 |
+
LanguageListResponse,
|
| 21 |
+
)
|
| 22 |
+
from ...core.config import get_settings
|
| 23 |
+
from sqlalchemy.orm import Session
|
| 24 |
+
from ...models import get_db, AudioFile, Transcript
|
| 25 |
+
from ...workers.tasks import process_audio_file
|
| 26 |
+
from celery.result import AsyncResult
|
| 27 |
+
from ...schemas.stt import (
|
| 28 |
+
TranscriptionResponse,
|
| 29 |
+
TranscriptionRequest,
|
| 30 |
+
LanguageInfo,
|
| 31 |
+
LanguageListResponse,
|
| 32 |
+
AsyncTranscriptionResponse,
|
| 33 |
+
TaskStatusResponse,
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
logger = logging.getLogger(__name__)
|
| 38 |
+
router = APIRouter(prefix="/stt", tags=["Speech-to-Text"])
|
| 39 |
+
settings = get_settings()
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@router.get("/languages", response_model=LanguageListResponse)
async def get_supported_languages(
    stt_service: STTService = Depends(get_stt_service),
):
    """
    Get list of supported languages for speech-to-text
    """
    supported = stt_service.get_supported_languages()
    return LanguageListResponse(languages=supported, total=len(supported))
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@router.post("/upload", response_model=TranscriptionResponse)
@limiter.limit("10/minute")
async def transcribe_upload(
    request: Request,
    file: UploadFile = File(..., description="Audio file to transcribe"),
    language: str = Form(default="en-US", description="Language code"),
    enable_punctuation: bool = Form(default=True, description="Enable automatic punctuation"),
    enable_word_timestamps: bool = Form(default=True, description="Include word-level timestamps"),
    enable_diarization: bool = Form(default=False, description="Enable speaker diarization"),
    speaker_count: Optional[int] = Form(default=None, description="Expected number of speakers"),
    prompt: Optional[str] = Form(None, description="Custom vocabulary/keywords (e.g. 'VoiceForge, PyTorch')"),
    stt_service: STTService = Depends(get_stt_service),
    file_service: FileService = Depends(get_file_service),
    db: Session = Depends(get_db),
):
    """
    Transcribe an uploaded audio file

    Supports: WAV, MP3, M4A, FLAC, OGG, WebM

    For files longer than 1 minute, consider using the async endpoint.

    Raises:
        HTTPException: 400 for bad format/language, 404 for missing files,
        500 for transcription or response-validation failures.
    """
    # Validate file type
    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")

    ext = file.filename.split(".")[-1].lower()
    if ext not in settings.supported_audio_formats_list:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported format: {ext}. Supported: {', '.join(settings.supported_audio_formats_list)}"
        )

    # Validate language
    if language not in settings.supported_languages_list:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language: {language}. Supported: {', '.join(settings.supported_languages_list)}"
        )

    try:
        # Read file content and persist to storage
        content = await file.read()
        storage_path, metadata = file_service.save_upload(
            file_content=content,
            original_filename=file.filename,
        )

        logger.info(f"Processing transcription for {file.filename} ({len(content)} bytes)")

        # Perform transcription
        result = stt_service.transcribe_file(
            audio_path=storage_path,
            language=language,
            enable_automatic_punctuation=enable_punctuation,
            enable_word_time_offsets=enable_word_timestamps,
            enable_speaker_diarization=enable_diarization,
            diarization_speaker_count=speaker_count,
            sample_rate=metadata.get("sample_rate"),
            prompt=prompt,  # Custom vocabulary
        )

        # Persist to database; a DB failure must not lose the transcription.
        try:
            # 1. Create AudioFile record
            audio_file = AudioFile(
                storage_path=str(storage_path),
                original_filename=file.filename,
                duration=result.duration,
                format=ext,
                sample_rate=metadata.get("sample_rate"),
                language=language,
                detected_language=result.language,
                status="done"
            )
            db.add(audio_file)
            db.flush()  # assigns audio_file.id without committing yet

            # 2. Create Transcript record
            transcript = Transcript(
                audio_file_id=audio_file.id,
                raw_text=result.text,
                processed_text=result.text,  # initially same
                segments=[s.model_dump() for s in result.segments] if result.segments else [],
                language=result.language,
                created_at=datetime.utcnow(),
            )
            db.add(transcript)
            db.commit()
            db.refresh(transcript)

            # Return result with the persisted transcript ID attached
            response_data = result.model_dump()
            response_data["id"] = transcript.id

            # Explicitly validate to catch errors early
            try:
                return TranscriptionResponse(**response_data)
            except Exception as e:
                logger.error(f"Validation error for response: {e}")
                logger.error(f"Response data: {response_data}")
                raise HTTPException(status_code=500, detail=f"Response validation failed: {str(e)}")
            # (a dead `return response` referencing an undefined name was
            # removed here — every path above returns or raises)

        except HTTPException:
            # Response-validation failures must reach the client as-is
            # instead of being swallowed by the DB fallback below.
            raise
        except Exception as e:
            logger.error(f"Failed to save to DB: {e}")
            # Don't fail the request if DB save fails, just return result
            # But in production we might want to ensure persistence
            return result

    except FileNotFoundError as e:
        logger.error(f"File error: {e}")
        raise HTTPException(status_code=404, detail=str(e))
    except ValueError as e:
        logger.error(f"Validation error: {e}")
        raise HTTPException(status_code=400, detail=str(e))
    except HTTPException:
        # Preserve deliberate HTTP errors instead of rewrapping them as 500s.
        raise
    except Exception as e:
        logger.exception(f"Transcription failed: {e}")
        raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
@router.post("/upload/quality")
async def transcribe_quality(
    file: UploadFile = File(..., description="Audio file to transcribe"),
    language: str = Form(default="en-US", description="Language code"),
    preprocess: bool = Form(default=False, description="Apply noise reduction (5-15% WER improvement)"),
    prompt: Optional[str] = Form(None, description="Custom vocabulary/keywords"),
):
    """
    High-quality transcription mode (optimized for accuracy).

    Features:
    - beam_size=5 for more accurate decoding (~40% fewer errors)
    - condition_on_previous_text=False to reduce hallucinations
    - Optional audio preprocessing for noisy environments

    Trade-off: ~2x slower than standard mode
    Best for: Important recordings, noisy audio, reduced error tolerance

    Raises:
        HTTPException: 400 when no filename is given, 500 on failure.
    """
    from app.services.whisper_stt_service import get_whisper_stt_service
    import tempfile
    import os

    # Validate file
    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")

    try:
        content = await file.read()

        # Save to a temp file so the Whisper service can read from disk.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(content)
            temp_path = f.name

        try:
            stt_service = get_whisper_stt_service()
            result = stt_service.transcribe_quality(
                temp_path,
                language=language,
                preprocess=preprocess,
                prompt=prompt,
            )
            return result
        finally:
            # Best-effort temp cleanup; only OS errors are expected here
            # (previously a bare `except:` that also swallowed e.g. KeyboardInterrupt).
            try:
                os.unlink(temp_path)
            except OSError:
                pass

    except Exception as e:
        logger.exception(f"Quality transcription failed: {e}")
        raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
@router.post("/upload/batch")
async def transcribe_batch(
    files: List[UploadFile] = File(..., description="Multiple audio files to transcribe"),
    language: str = Form(default="en-US", description="Language code"),
    batch_size: int = Form(default=8, description="Batch size (8 optimal for CPU)"),
):
    """
    Batch transcription for high throughput.

    Uses BatchedInferencePipeline for 2-3x speedup on concurrent files.

    Best for: Processing multiple files, API with high concurrency

    Files without a filename are skipped; a per-file failure is reported as
    an ``{"filename", "error"}`` entry instead of failing the whole request.
    """
    from app.services.whisper_stt_service import get_whisper_stt_service
    import tempfile
    import os

    if not files:
        raise HTTPException(status_code=400, detail="No files provided")

    results = []
    stt_service = get_whisper_stt_service()

    for file in files:
        if not file.filename:
            continue

        try:
            content = await file.read()

            # Each upload is staged on disk because the service reads paths.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(content)
                temp_path = f.name

            try:
                result = stt_service.transcribe_batched(
                    temp_path,
                    language=language,
                    batch_size=batch_size,
                )
                result["filename"] = file.filename
                results.append(result)
            finally:
                # Narrowed from a bare `except:`: only swallow filesystem
                # errors during cleanup, never cancellation/system-exit.
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass

        except Exception as e:
            logger.error(f"Failed to transcribe {file.filename}: {e}")
            results.append({
                "filename": file.filename,
                "error": str(e),
            })

    return {
        "count": len(results),
        "results": results,
        "mode": "batched",
        "batch_size": batch_size,
    }
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
@router.post("/async-upload", response_model=AsyncTranscriptionResponse)
async def transcribe_async_upload(
    file: UploadFile = File(..., description="Audio file to transcribe"),
    language: str = Form(default="en-US", description="Language code"),
    file_service: FileService = Depends(get_file_service),
    db: Session = Depends(get_db),
):
    """
    Asynchronously transcribe an uploaded audio file (Celery)

    Flow: validate extension -> persist upload via FileService -> insert an
    AudioFile row with status 'queued' -> enqueue process_audio_file and
    return its task id so the client can poll /tasks/{task_id}.
    """
    # Validate file type
    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")

    ext = file.filename.split(".")[-1].lower()
    if ext not in settings.supported_audio_formats_list:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported format: {ext}"
        )

    try:
        content = await file.read()
        storage_path, metadata = file_service.save_upload(
            file_content=content,
            original_filename=file.filename,
        )

        # Create AudioFile record with 'queued' status
        audio_file = AudioFile(
            storage_path=str(storage_path),
            original_filename=file.filename,
            duration=0.0,  # Will be updated by worker
            format=ext,
            sample_rate=metadata.get("sample_rate"),
            language=language,
            status="queued"
        )
        db.add(audio_file)
        # Commit before enqueueing so the worker can see the row by id.
        db.commit()
        db.refresh(audio_file)

        # Trigger Celery Task
        task = process_audio_file.delay(audio_file.id)

        return AsyncTranscriptionResponse(
            task_id=task.id,
            audio_file_id=audio_file.id,
            status="queued",
            message="File uploaded and queued for processing"
        )

    except Exception as e:
        # Both validation raises happen before this try block, so their
        # 400 status codes are not masked by this 500 handler.
        logger.exception(f"Async upload failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
@router.get("/tasks/{task_id}", response_model=TaskStatusResponse)
async def get_task_status(task_id: str, db: Session = Depends(get_db)):
    """
    Check status of an async transcription task.

    Maps the Celery AsyncResult state onto TaskStatusResponse. The `db`
    dependency is currently unused — presumably reserved for resolving the
    linked Transcript once task ids are persisted; confirm before removing.
    """
    task_result = AsyncResult(task_id)

    # NOTE(review): Celery cannot report when a task was created, so both
    # timestamps are approximated with "now".
    response = TaskStatusResponse(
        task_id=task_id,
        status=task_result.status.lower(),
        created_at=datetime.utcnow(),  # Approximate or fetch from DB tracked tasks
        updated_at=datetime.utcnow()
    )

    if task_result.successful():
        # If successful, the result of the task function isn't returned directly
        # because process_audio_file returns None (it saves to DB).
        # We need to find the Transcript associated with this task if possible.
        # Ideally, we should store task_id in AudioFile or Transcript to link them.
        # For now, we just report completion.
        response.status = "completed"
        response.progress = 100.0
    elif task_result.failed():
        response.status = "failed"
        response.error = str(task_result.result)
    elif task_result.state == 'PROGRESS':
        response.status = "processing"
        # If we had progress updating in the task, we could read it here

    return response
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
@router.post("/transcribe-bytes", response_model=TranscriptionResponse)
async def transcribe_bytes(
    audio_content: bytes,
    language: str = "en-US",
    encoding: str = "LINEAR16",
    sample_rate: int = 16000,
    stt_service: STTService = Depends(get_stt_service),
):
    """
    Transcribe raw audio bytes (for streaming/real-time use)

    This endpoint is primarily for internal use or advanced clients
    that send pre-processed audio data.
    """
    try:
        # Delegate straight to the service; any failure becomes a 500.
        return stt_service.transcribe_bytes(
            audio_content=audio_content,
            language=language,
            encoding=encoding,
            sample_rate=sample_rate,
        )
    except Exception as e:
        logger.exception(f"Transcription failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
# TODO: WebSocket endpoint for real-time streaming
|
| 418 |
+
# @router.websocket("/stream")
|
| 419 |
+
# async def stream_transcription(websocket: WebSocket):
|
| 420 |
+
# """Real-time streaming transcription via WebSocket"""
|
| 421 |
+
# pass
|
| 422 |
+
|
| 423 |
+
@router.post("/upload/diarize")
async def diarize_audio(
    file: UploadFile = File(..., description="Audio file to diarize"),
    num_speakers: Optional[int] = Form(None, description="Exact number of speakers (optional)"),
    min_speakers: Optional[int] = Form(None, description="Minimum number of speakers (optional)"),
    max_speakers: Optional[int] = Form(None, description="Maximum number of speakers (optional)"),
    language: Optional[str] = Form(None, description="Language code (e.g., 'en'). Auto-detected if not provided."),
    preprocess: bool = Form(False, description="Apply noise reduction before processing (improves accuracy for noisy audio)"),
):
    """
    Perform Speaker Diarization ("Who said what").

    Uses faster-whisper for transcription + pyannote.audio for speaker identification.

    Requires:
    - HF_TOKEN in .env for Pyannote model access

    Returns:
    - segments: List of segments with timestamps, text, and speaker labels
    - speaker_stats: Speaking time per speaker
    - language: Detected/specified language

    Raises:
        HTTPException 400: no filename, or service configuration error (e.g. missing token).
        HTTPException 503: diarization dependencies not installed.
        HTTPException 500: any other processing failure.
    """
    from app.services.diarization_service import get_diarization_service
    import tempfile
    import os

    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")

    temp_path = None
    try:
        # Save temp file
        content = await file.read()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(content)
            temp_path = f.name

        try:
            service = get_diarization_service()
            result = service.process_audio(
                temp_path,
                num_speakers=num_speakers,
                min_speakers=min_speakers,
                max_speakers=max_speakers,
                language=language,
                preprocess=preprocess,
            )
            return result

        except ValueError as e:
            # Token missing / bad configuration
            raise HTTPException(status_code=400, detail=str(e))
        except ImportError as e:
            # pyannote.audio not installed
            raise HTTPException(status_code=503, detail=str(e))
        except Exception as e:
            logger.exception("Diarization error")
            raise HTTPException(status_code=500, detail=f"Diarization failed: {str(e)}")
        finally:
            # Guard against temp_path never being assigned, and only swallow
            # filesystem errors (was a bare `except:` before).
            if temp_path:
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass

    except HTTPException:
        # BUG FIX: the previous outer `except Exception` also caught the
        # HTTPException(400/503) raises above and re-wrapped them as 500s,
        # destroying the intended status codes. Re-raise them unchanged.
        raise
    except Exception as e:
        logger.error(f"Diarization request failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
backend/app/api/routes/transcripts.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Transcript Management Routes
|
| 3 |
+
CRUD operations and Export
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import List, Optional
|
| 7 |
+
from fastapi import APIRouter, Depends, HTTPException, Response, Query, UploadFile, File, Form
|
| 8 |
+
from sqlalchemy.orm import Session
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
from ...models import get_db, Transcript, AudioFile
|
| 12 |
+
from ...schemas.transcript import TranscriptResponse, TranscriptUpdate
|
| 13 |
+
from ...services.nlp_service import get_nlp_service, NLPService
|
| 14 |
+
from ...services.export_service import ExportService
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
router = APIRouter(prefix="/transcripts", tags=["Transcripts"])
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@router.get("", response_model=List[TranscriptResponse])
async def list_transcripts(
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db),
):
    """Return transcripts, newest first, paginated via offset/limit."""
    query = db.query(Transcript).order_by(Transcript.created_at.desc())
    return query.offset(skip).limit(limit).all()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@router.get("/{transcript_id}", response_model=TranscriptResponse)
async def get_transcript(
    transcript_id: int,
    db: Session = Depends(get_db),
):
    """Fetch a single transcript by id; 404 if it does not exist."""
    row = db.query(Transcript).filter(Transcript.id == transcript_id).first()
    if row is None:
        raise HTTPException(status_code=404, detail="Transcript not found")
    return row
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@router.post("/{transcript_id}/analyze")
async def analyze_transcript(
    transcript_id: int,
    db: Session = Depends(get_db),
    nlp_service: NLPService = Depends(get_nlp_service),
):
    """
    Run NLP analysis on a transcript.

    Computes sentiment, keywords and a summary via the NLP service and
    persists them onto the Transcript row.

    Raises:
        HTTPException 404: transcript id not found.
        HTTPException 400: transcript has no processed text to analyze.
    """
    transcript = db.query(Transcript).filter(Transcript.id == transcript_id).first()
    if not transcript:
        raise HTTPException(status_code=404, detail="Transcript not found")

    if not transcript.processed_text:
        raise HTTPException(status_code=400, detail="Transcript has no text content")

    # Run analysis
    analysis = nlp_service.process_transcript(transcript.processed_text)

    # Update DB — assumes the service result carries "sentiment", "keywords"
    # and "summary" keys; a missing key here surfaces as a 500. TODO confirm.
    transcript.sentiment = analysis["sentiment"]
    transcript.topics = {"keywords": analysis["keywords"]}
    transcript.summary = analysis["summary"]
    transcript.updated_at = datetime.utcnow()

    db.commit()
    db.refresh(transcript)

    return {
        "status": "success",
        "analysis": analysis
    }
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@router.get("/{transcript_id}/export")
async def export_transcript(
    transcript_id: int,
    format: str = Query(..., regex="^(txt|srt|vtt|pdf)$"),
    db: Session = Depends(get_db),
):
    """
    Export transcript to specific format
    """
    transcript = db.query(Transcript).filter(Transcript.id == transcript_id).first()
    if not transcript:
        raise HTTPException(status_code=404, detail="Transcript not found")

    # Shape the ORM row into the plain dict the export service expects.
    data = {
        "id": transcript.id,
        "text": transcript.processed_text,
        "created_at": str(transcript.created_at),
        "duration": 0,
        "segments": transcript.segments,
        "words": [],
        "sentiment": transcript.sentiment,
    }

    # Exporter callable + MIME type per format; the Query regex above
    # already restricts which values can reach this point.
    exporters = {
        "txt": (ExportService.to_txt, "text/plain"),
        "srt": (ExportService.to_srt, "text/plain"),
        "vtt": (ExportService.to_vtt, "text/vtt"),
        "pdf": (ExportService.to_pdf, "application/pdf"),
    }
    entry = exporters.get(format)
    if entry is None:
        raise HTTPException(status_code=400, detail="Unsupported format")

    exporter, media_type = entry
    content = exporter(data)

    return Response(
        content=content,
        media_type=media_type,
        headers={
            "Content-Disposition": f'attachment; filename="transcript_{transcript_id}.{format}"'
        }
    )
|
| 121 |
+
@router.post("/meeting")
async def process_meeting(
    file: UploadFile = File(..., description="Audio recording of meeting"),
    num_speakers: Optional[int] = Form(None, description="Number of speakers (hint)"),
    language: Optional[str] = Form(None, description="Language code"),
    db: Session = Depends(get_db),
):
    """
    Process a meeting recording:
    1. Diarization (Who spoke when)
    2. Transcription (What was said)
    3. NLP Analysis (Summary, Action Items, Sentiment)
    4. Save to DB
    """
    import shutil
    import os
    import tempfile
    from ...services.meeting_service import get_meeting_service

    # Save upload to temp file
    suffix = os.path.splitext(file.filename)[1] or ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name

    try:
        meeting_service = get_meeting_service()

        # Run full pipeline
        # This can be slow (minutes) so strictly speaking should be a background task
        # But for this MVP level we'll do it synchronously with a long timeout
        result = meeting_service.process_meeting(
            audio_path=tmp_path,
            num_speakers=num_speakers,
            language=language
        )

        # Save to DB
        # Create AudioFile record first
        # NOTE(review): this uses filename=/filepath= kwargs while the async
        # upload route constructs AudioFile with storage_path=/original_filename=
        # — one of the two likely doesn't match the model; confirm the schema.
        audio_file = AudioFile(
            filename=file.filename,
            filepath="processed_in_memory", # We delete temp file, so no perm path
            duration=result["metadata"]["duration_seconds"],
            file_size=0,
            format=suffix.replace(".", "")
        )
        db.add(audio_file)
        db.commit()
        db.refresh(audio_file)

        # Create Transcript record
        transcript = Transcript(
            audio_file_id=audio_file.id,
            raw_text=result["raw_text"],
            processed_text=result["raw_text"],
            segments=result["transcript_segments"],
            sentiment=result["sentiment"],
            topics={"keywords": result["topics"]},
            action_items=result["action_items"],
            attendees=result["metadata"]["attendees"],
            summary=result["summary"],
            language=result["metadata"]["language"],
            confidence=0.95, # Estimated
            duration=result["metadata"]["duration_seconds"],
            created_at=datetime.utcnow()
        )
        db.add(transcript)
        db.commit()
        db.refresh(transcript)

        return result

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Cleanup
        try:
            os.unlink(tmp_path)
        except:
            pass
|
backend/app/api/routes/translation.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Translation API Routes
|
| 3 |
+
Endpoints for text and audio translation services
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
from typing import Optional, List
|
| 9 |
+
import logging
|
| 10 |
+
|
| 11 |
+
from app.services.translation_service import get_translation_service
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
router = APIRouter(prefix="/translation", tags=["translation"])
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Request/Response Models
|
| 18 |
+
class TranslateTextRequest(BaseModel):
    """Request model for text translation."""
    # Length bounds (1..5000 chars) are enforced by pydantic validation.
    text: str = Field(..., min_length=1, max_length=5000, description="Text to translate")
    source_lang: str = Field(..., description="Source language code (e.g., 'hi', 'en-US')")
    target_lang: str = Field(..., description="Target language code (e.g., 'en', 'es')")
    # When True, the endpoint routes through translate_with_pivot (English pivot).
    use_pivot: bool = Field(default=True, description="Use English as pivot for unsupported pairs")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class TranslateTextResponse(BaseModel):
    """Response model for text translation."""
    translated_text: str
    source_lang: str
    target_lang: str
    source_text: str
    processing_time: float
    word_count: int
    # pivot_used / intermediate_text are populated only when the pivot
    # translation path was taken; model_used identifies the MT model.
    pivot_used: Optional[bool] = False
    intermediate_text: Optional[str] = None
    model_used: Optional[str] = None
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class LanguageInfo(BaseModel):
    """Language information model (code, display name, flag emoji, native name)."""
    code: str
    name: str
    flag: str
    native: str
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class TranslationPair(BaseModel):
    """Translation pair model: a supported source->target language combination."""
    code: str
    source: LanguageInfo
    target: LanguageInfo
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class DetectLanguageResponse(BaseModel):
    """Response model for language detection."""
    detected_language: str
    confidence: float
    # Optional extras the detection service may include alongside the top hit.
    language_info: Optional[dict] = None
    all_probabilities: Optional[List[dict]] = None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Endpoints
|
| 63 |
+
@router.get("/languages", response_model=List[LanguageInfo])
async def get_supported_languages():
    """Return every language the translation service supports, with metadata."""
    return get_translation_service().get_supported_languages()
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@router.get("/pairs")
async def get_supported_pairs():
    """
    Get list of all supported translation pairs.

    Returns:
        Dict with the supported source->target pairs and their count.
    """
    service = get_translation_service()
    # Fetch once instead of calling the service twice (once for the list,
    # once for its length) as the original did.
    pairs = service.get_supported_pairs()
    return {
        "pairs": pairs,
        "total": len(pairs),
    }
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
@router.post("/text", response_model=TranslateTextResponse)
async def translate_text(request: TranslateTextRequest):
    """
    Translate text from source to target language.

    - Uses Helsinki-NLP MarianMT models (~300MB per language pair)
    - Supports pivot translation through English for unsupported pairs
    - First request for a language pair may take longer (model loading)

    Args:
        request: Translation request with text and language codes

    Returns:
        Translated text with metadata
    """
    service = get_translation_service()

    # Pick the translation strategy up front; both share the same signature.
    translate = service.translate_with_pivot if request.use_pivot else service.translate_text

    try:
        result = translate(
            text=request.text,
            source_lang=request.source_lang,
            target_lang=request.target_lang,
        )
        return TranslateTextResponse(**result)

    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"Translation error: {e}")
        raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
@router.post("/detect", response_model=DetectLanguageResponse)
async def detect_language(text: str = Form(..., min_length=10, description="Text to analyze")):
    """
    Detect the language of input text.

    Args:
        text: Text to analyze (minimum 10 characters for accuracy)

    Returns:
        Detected language with confidence score
    """
    result = get_translation_service().detect_language(text)

    # The service signals problems via an "error" key rather than raising.
    error = result.get("error")
    if error:
        raise HTTPException(status_code=400, detail=error)

    return DetectLanguageResponse(**result)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
@router.get("/model-info")
async def get_model_info():
    """Report translation model loading status and the pairs they serve."""
    return get_translation_service().get_model_info()
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
@router.post("/audio")
async def translate_audio(
    file: UploadFile = File(..., description="Audio file to translate"),
    source_lang: str = Form(..., description="Source language code"),
    target_lang: str = Form(..., description="Target language code"),
    generate_audio: bool = Form(default=True, description="Generate TTS output"),
):
    """
    Full audio translation pipeline: STT → Translate → TTS

    1. Transcribe audio using Whisper
    2. Translate text using MarianMT
    3. Optionally generate speech in target language

    Args:
        file: Audio file (WAV, MP3, etc.)
        source_lang: Source language code
        target_lang: Target language code
        generate_audio: Whether to generate TTS output

    Returns:
        Transcription, translation, and optional audio response
    """
    import base64
    import tempfile
    import os
    from app.services.whisper_stt_service import get_whisper_stt_service
    from app.services.edge_tts_service import get_edge_tts_service

    translation_service = get_translation_service()
    stt_service = get_whisper_stt_service()
    tts_service = get_edge_tts_service()

    # Save uploaded file — the STT service reads audio from disk.
    suffix = os.path.splitext(file.filename)[1] or ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name

    try:
        # Step 1: Transcribe
        transcription = stt_service.transcribe_file(tmp_path, language=source_lang)
        source_text = transcription["text"]

        if not source_text.strip():
            raise HTTPException(status_code=400, detail="No speech detected in audio")

        # Step 2: Translate (pivots through English when the direct pair
        # is unsupported)
        translation = translation_service.translate_with_pivot(
            text=source_text,
            source_lang=source_lang,
            target_lang=target_lang,
        )
        translated_text = translation["translated_text"]

        # Step 3: Generate TTS (optional)
        audio_base64 = None
        if generate_audio:
            # Map the bare language code to an Edge TTS neural voice,
            # falling back to English when there is no mapping.
            voice_map = {
                "en": "en-US-AriaNeural",
                "hi": "hi-IN-SwaraNeural",
                "es": "es-ES-ElviraNeural",
                "fr": "fr-FR-DeniseNeural",
                "de": "de-DE-KatjaNeural",
                "zh": "zh-CN-XiaoxiaoNeural",
                "ja": "ja-JP-NanamiNeural",
                "ko": "ko-KR-SunHiNeural",
                "ar": "ar-SA-ZariyahNeural",
                "ru": "ru-RU-SvetlanaNeural",
            }
            target_code = target_lang.split("-")[0].lower()
            voice = voice_map.get(target_code, "en-US-AriaNeural")

            audio_bytes = tts_service.synthesize_sync(translated_text, voice=voice)
            audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")

        return {
            "source_text": source_text,
            "translated_text": translated_text,
            "source_lang": source_lang,
            "target_lang": target_lang,
            "transcription_time": transcription["processing_time"],
            "translation_time": translation["processing_time"],
            "audio_base64": audio_base64,
            "audio_format": "mp3" if audio_base64 else None,
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Audio translation failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Narrowed from a bare `except:`: only swallow filesystem errors.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
|
backend/app/api/routes/tts.py
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text-to-Speech API Router
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import base64
|
| 6 |
+
import logging
|
| 7 |
+
from typing import Optional
|
| 8 |
+
from fastapi import APIRouter, HTTPException, Depends, Response, Request
|
| 9 |
+
from fastapi.responses import StreamingResponse
|
| 10 |
+
from io import BytesIO
|
| 11 |
+
|
| 12 |
+
from ...core.limiter import limiter
|
| 13 |
+
|
| 14 |
+
from ...services.tts_service import get_tts_service, TTSService
|
| 15 |
+
from ...schemas.tts import (
|
| 16 |
+
SynthesisRequest,
|
| 17 |
+
SynthesisResponse,
|
| 18 |
+
VoiceInfo,
|
| 19 |
+
VoiceListResponse,
|
| 20 |
+
VoicePreviewRequest,
|
| 21 |
+
)
|
| 22 |
+
from ...core.config import get_settings
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
router = APIRouter(prefix="/tts", tags=["Text-to-Speech"])
|
| 26 |
+
settings = get_settings()
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@router.get("/voices", response_model=VoiceListResponse)
|
| 30 |
+
async def get_voices(
|
| 31 |
+
language: Optional[str] = None,
|
| 32 |
+
tts_service: TTSService = Depends(get_tts_service),
|
| 33 |
+
):
|
| 34 |
+
"""
|
| 35 |
+
Get list of available TTS voices
|
| 36 |
+
|
| 37 |
+
Optionally filter by language code (e.g., "en-US", "es", "fr")
|
| 38 |
+
"""
|
| 39 |
+
return await tts_service.get_voices(language_code=language)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@router.get("/voices/{language}", response_model=VoiceListResponse)
|
| 43 |
+
async def get_voices_by_language(
|
| 44 |
+
language: str,
|
| 45 |
+
tts_service: TTSService = Depends(get_tts_service),
|
| 46 |
+
):
|
| 47 |
+
"""
|
| 48 |
+
Get voices for a specific language
|
| 49 |
+
"""
|
| 50 |
+
if language not in settings.supported_languages_list:
|
| 51 |
+
# Try partial match (e.g., "en" matches "en-US", "en-GB")
|
| 52 |
+
partial_matches = [l for l in settings.supported_languages_list if l.startswith(language)]
|
| 53 |
+
if not partial_matches:
|
| 54 |
+
raise HTTPException(
|
| 55 |
+
status_code=400,
|
| 56 |
+
detail=f"Unsupported language: {language}"
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
return await tts_service.get_voices(language_code=language)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@router.post("/synthesize", response_model=SynthesisResponse)
|
| 63 |
+
@limiter.limit("10/minute")
|
| 64 |
+
async def synthesize_speech(
|
| 65 |
+
request: Request,
|
| 66 |
+
request_body: SynthesisRequest,
|
| 67 |
+
tts_service: TTSService = Depends(get_tts_service),
|
| 68 |
+
):
|
| 69 |
+
"""
|
| 70 |
+
Synthesize text to speech
|
| 71 |
+
|
| 72 |
+
Returns base64-encoded audio content along with metadata.
|
| 73 |
+
Decode the audio_content field to get the audio bytes.
|
| 74 |
+
"""
|
| 75 |
+
# Validate text length
|
| 76 |
+
if len(request_body.text) > 5000:
|
| 77 |
+
raise HTTPException(
|
| 78 |
+
status_code=400,
|
| 79 |
+
detail="Text too long. Maximum 5000 characters."
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
# Validate language
|
| 83 |
+
lang_base = request_body.language.split("-")[0] if "-" in request_body.language else request_body.language
|
| 84 |
+
supported_bases = [l.split("-")[0] for l in settings.supported_languages_list]
|
| 85 |
+
if lang_base not in supported_bases:
|
| 86 |
+
raise HTTPException(
|
| 87 |
+
status_code=400,
|
| 88 |
+
detail=f"Unsupported language: {request_body.language}"
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
try:
|
| 92 |
+
result = await tts_service.synthesize(request_body)
|
| 93 |
+
return result
|
| 94 |
+
except ValueError as e:
|
| 95 |
+
logger.error(f"Synthesis validation error: {e}")
|
| 96 |
+
raise HTTPException(status_code=400, detail=str(e))
|
| 97 |
+
except Exception as e:
|
| 98 |
+
logger.exception(f"Synthesis failed: {e}")
|
| 99 |
+
raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
@router.post("/stream")
|
| 103 |
+
async def stream_speech(
|
| 104 |
+
request: SynthesisRequest,
|
| 105 |
+
tts_service: TTSService = Depends(get_tts_service),
|
| 106 |
+
):
|
| 107 |
+
"""
|
| 108 |
+
Stream text-to-speech audio
|
| 109 |
+
|
| 110 |
+
Returns a chunked audio stream (audio/mpeg) for immediate playback.
|
| 111 |
+
Best for long text to reduce latency (TTFB).
|
| 112 |
+
"""
|
| 113 |
+
try:
|
| 114 |
+
return StreamingResponse(
|
| 115 |
+
tts_service.synthesize_stream(request),
|
| 116 |
+
media_type="audio/mpeg"
|
| 117 |
+
)
|
| 118 |
+
except Exception as e:
|
| 119 |
+
logger.exception(f"Streaming synthesis failed: {e}")
|
| 120 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
@router.post("/ssml")
|
| 124 |
+
async def synthesize_ssml(
|
| 125 |
+
text: str,
|
| 126 |
+
voice: str = "en-US-AriaNeural",
|
| 127 |
+
rate: str = "medium",
|
| 128 |
+
pitch: str = "medium",
|
| 129 |
+
emphasis: Optional[str] = None,
|
| 130 |
+
auto_breaks: bool = True,
|
| 131 |
+
tts_service: TTSService = Depends(get_tts_service),
|
| 132 |
+
):
|
| 133 |
+
"""
|
| 134 |
+
Synthesize speech with SSML prosody control
|
| 135 |
+
|
| 136 |
+
Supports advanced speech customization:
|
| 137 |
+
- rate: 'x-slow', 'slow', 'medium', 'fast', 'x-fast'
|
| 138 |
+
- pitch: 'x-low', 'low', 'medium', 'high', 'x-high'
|
| 139 |
+
- emphasis: 'reduced', 'moderate', 'strong'
|
| 140 |
+
- auto_breaks: Add natural pauses at punctuation
|
| 141 |
+
|
| 142 |
+
Returns audio/mpeg stream.
|
| 143 |
+
"""
|
| 144 |
+
try:
|
| 145 |
+
from ...services.edge_tts_service import get_edge_tts_service
|
| 146 |
+
edge_service = get_edge_tts_service()
|
| 147 |
+
|
| 148 |
+
# Build SSML
|
| 149 |
+
ssml = edge_service.build_ssml(
|
| 150 |
+
text=text,
|
| 151 |
+
voice=voice,
|
| 152 |
+
rate=rate,
|
| 153 |
+
pitch=pitch,
|
| 154 |
+
emphasis=emphasis,
|
| 155 |
+
breaks=auto_breaks
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
# Synthesize
|
| 159 |
+
audio_bytes = await edge_service.synthesize_ssml(ssml, voice)
|
| 160 |
+
|
| 161 |
+
return Response(
|
| 162 |
+
content=audio_bytes,
|
| 163 |
+
media_type="audio/mpeg",
|
| 164 |
+
headers={"Content-Disposition": "inline; filename=speech.mp3"}
|
| 165 |
+
)
|
| 166 |
+
except Exception as e:
|
| 167 |
+
logger.exception(f"SSML synthesis failed: {e}")
|
| 168 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
@router.post("/synthesize/audio")
|
| 172 |
+
async def synthesize_audio_file(
|
| 173 |
+
request: SynthesisRequest,
|
| 174 |
+
tts_service: TTSService = Depends(get_tts_service),
|
| 175 |
+
):
|
| 176 |
+
"""
|
| 177 |
+
Synthesize text and return audio file directly
|
| 178 |
+
|
| 179 |
+
Returns the audio file as a downloadable stream.
|
| 180 |
+
"""
|
| 181 |
+
try:
|
| 182 |
+
result = await tts_service.synthesize(request)
|
| 183 |
+
|
| 184 |
+
# Decode base64 audio
|
| 185 |
+
audio_bytes = base64.b64decode(result.audio_content)
|
| 186 |
+
|
| 187 |
+
# Determine content type
|
| 188 |
+
content_types = {
|
| 189 |
+
"MP3": "audio/mpeg",
|
| 190 |
+
"LINEAR16": "audio/wav",
|
| 191 |
+
"OGG_OPUS": "audio/ogg",
|
| 192 |
+
}
|
| 193 |
+
content_type = content_types.get(result.encoding, "audio/mpeg")
|
| 194 |
+
|
| 195 |
+
# Return as streaming response
|
| 196 |
+
return StreamingResponse(
|
| 197 |
+
BytesIO(audio_bytes),
|
| 198 |
+
media_type=content_type,
|
| 199 |
+
headers={
|
| 200 |
+
"Content-Disposition": f'attachment; filename="speech.{result.encoding.lower()}"',
|
| 201 |
+
"Content-Length": str(result.audio_size),
|
| 202 |
+
}
|
| 203 |
+
)
|
| 204 |
+
except Exception as e:
|
| 205 |
+
logger.exception(f"Audio synthesis failed: {e}")
|
| 206 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
@router.post("/preview")
|
| 210 |
+
async def preview_voice(
|
| 211 |
+
request: VoicePreviewRequest,
|
| 212 |
+
tts_service: TTSService = Depends(get_tts_service),
|
| 213 |
+
):
|
| 214 |
+
"""
|
| 215 |
+
Generate a short preview of a voice
|
| 216 |
+
|
| 217 |
+
Returns a small audio sample for voice selection UI.
|
| 218 |
+
"""
|
| 219 |
+
# Find the voice to get its language
|
| 220 |
+
voices = tts_service.get_voices().voices
|
| 221 |
+
voice_info = next((v for v in voices if v.name == request.voice), None)
|
| 222 |
+
|
| 223 |
+
if not voice_info:
|
| 224 |
+
raise HTTPException(status_code=404, detail=f"Voice not found: {request.voice}")
|
| 225 |
+
|
| 226 |
+
# Create synthesis request with preview text
|
| 227 |
+
synth_request = SynthesisRequest(
|
| 228 |
+
text=request.text or "Hello! This is a preview of my voice.",
|
| 229 |
+
language=voice_info.language_code,
|
| 230 |
+
voice=request.voice,
|
| 231 |
+
audio_encoding="MP3",
|
| 232 |
+
)
|
| 233 |
+
|
| 234 |
+
try:
|
| 235 |
+
result = tts_service.synthesize(synth_request)
|
| 236 |
+
|
| 237 |
+
# Return audio directly
|
| 238 |
+
audio_bytes = base64.b64decode(result.audio_content)
|
| 239 |
+
return StreamingResponse(
|
| 240 |
+
BytesIO(audio_bytes),
|
| 241 |
+
media_type="audio/mpeg",
|
| 242 |
+
)
|
| 243 |
+
except Exception as e:
|
| 244 |
+
logger.exception(f"Preview failed: {e}")
|
| 245 |
+
raise HTTPException(status_code=500, detail=str(e))
|
backend/app/api/routes/ws.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
WebSocket Router for Real-Time Transcription
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import logging
|
| 6 |
+
import json
|
| 7 |
+
from typing import Dict
|
| 8 |
+
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
router = APIRouter(prefix="/ws", tags=["WebSocket"])
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ConnectionManager:
    """Registry of live WebSocket connections, keyed by client id."""

    def __init__(self):
        # client_id -> accepted WebSocket
        self.active_connections: Dict[str, WebSocket] = {}

    async def connect(self, client_id: str, websocket: WebSocket):
        """Accept the socket handshake and start tracking the client."""
        await websocket.accept()
        self.active_connections[client_id] = websocket
        logger.info(f"Client {client_id} connected")

    def disconnect(self, client_id: str):
        """Forget a client; silently ignores ids that are not tracked."""
        if client_id in self.active_connections:
            self.active_connections.pop(client_id)
            logger.info(f"Client {client_id} disconnected")

    async def send_json(self, client_id: str, data: dict):
        """Push a JSON payload to one client, skipping unknown ids."""
        if client_id in self.active_connections:
            connection = self.active_connections[client_id]
            await connection.send_json(data)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
manager = ConnectionManager()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@router.websocket("/transcription/{client_id}")
|
| 39 |
+
async def websocket_transcription(websocket: WebSocket, client_id: str):
|
| 40 |
+
"""
|
| 41 |
+
Real-time streaming transcription via WebSocket with VAD
|
| 42 |
+
"""
|
| 43 |
+
await manager.connect(client_id, websocket)
|
| 44 |
+
|
| 45 |
+
from app.services.ws_stt_service import StreamManager, transcribe_buffer
|
| 46 |
+
|
| 47 |
+
stream_manager = StreamManager(websocket)
|
| 48 |
+
|
| 49 |
+
async def handle_transcription(audio_bytes: bytes):
|
| 50 |
+
"""Callback for processing speech segments."""
|
| 51 |
+
try:
|
| 52 |
+
# Send processing status
|
| 53 |
+
await manager.send_json(client_id, {"status": "processing"})
|
| 54 |
+
|
| 55 |
+
# Transcribe
|
| 56 |
+
result = await transcribe_buffer(audio_bytes)
|
| 57 |
+
text = result.get("text", "").strip()
|
| 58 |
+
|
| 59 |
+
if text:
|
| 60 |
+
# Send result
|
| 61 |
+
await manager.send_json(client_id, {
|
| 62 |
+
"text": text,
|
| 63 |
+
"is_final": True,
|
| 64 |
+
"status": "complete"
|
| 65 |
+
})
|
| 66 |
+
logger.info(f"Transcribed: {text}")
|
| 67 |
+
except Exception as e:
|
| 68 |
+
logger.error(f"Transcription callback error: {e}")
|
| 69 |
+
await manager.send_json(client_id, {"error": str(e)})
|
| 70 |
+
|
| 71 |
+
try:
|
| 72 |
+
# Start processing loop
|
| 73 |
+
await stream_manager.process_stream(handle_transcription)
|
| 74 |
+
|
| 75 |
+
except WebSocketDisconnect:
|
| 76 |
+
manager.disconnect(client_id)
|
| 77 |
+
except Exception as e:
|
| 78 |
+
logger.error(f"WebSocket error: {e}")
|
| 79 |
+
try:
|
| 80 |
+
await manager.send_json(client_id, {"error": str(e)})
|
| 81 |
+
except:
|
| 82 |
+
pass
|
| 83 |
+
manager.disconnect(client_id)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
@router.websocket("/tts/{client_id}")
|
| 87 |
+
async def websocket_tts(websocket: WebSocket, client_id: str):
|
| 88 |
+
"""
|
| 89 |
+
Real-time Text-to-Speech via WebSocket
|
| 90 |
+
|
| 91 |
+
Protocol:
|
| 92 |
+
- Client sends: JSON {"text": "...", "voice": "...", "rate": "...", "pitch": "..."}
|
| 93 |
+
- Server sends: Binary audio chunks (MP3) followed by JSON {"status": "complete"}
|
| 94 |
+
|
| 95 |
+
This achieves <500ms TTFB by streaming as chunks are generated.
|
| 96 |
+
"""
|
| 97 |
+
await manager.connect(client_id, websocket)
|
| 98 |
+
|
| 99 |
+
try:
|
| 100 |
+
import edge_tts
|
| 101 |
+
|
| 102 |
+
while True:
|
| 103 |
+
# Receive synthesis request
|
| 104 |
+
data = await websocket.receive_json()
|
| 105 |
+
|
| 106 |
+
text = data.get("text", "")
|
| 107 |
+
voice = data.get("voice", "en-US-AriaNeural")
|
| 108 |
+
rate = data.get("rate", "+0%")
|
| 109 |
+
pitch = data.get("pitch", "+0Hz")
|
| 110 |
+
|
| 111 |
+
if not text:
|
| 112 |
+
await websocket.send_json({"error": "No text provided"})
|
| 113 |
+
continue
|
| 114 |
+
|
| 115 |
+
logger.info(f"WebSocket TTS: Synthesizing '{text[:50]}...' with {voice}")
|
| 116 |
+
|
| 117 |
+
# Stream audio chunks directly
|
| 118 |
+
import time
|
| 119 |
+
start_time = time.time()
|
| 120 |
+
first_chunk_sent = False
|
| 121 |
+
total_bytes = 0
|
| 122 |
+
|
| 123 |
+
communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
|
| 124 |
+
|
| 125 |
+
async for chunk in communicate.stream():
|
| 126 |
+
if chunk["type"] == "audio":
|
| 127 |
+
await websocket.send_bytes(chunk["data"])
|
| 128 |
+
total_bytes += len(chunk["data"])
|
| 129 |
+
|
| 130 |
+
if not first_chunk_sent:
|
| 131 |
+
ttfb = (time.time() - start_time) * 1000
|
| 132 |
+
logger.info(f"WebSocket TTS TTFB: {ttfb:.0f}ms")
|
| 133 |
+
first_chunk_sent = True
|
| 134 |
+
|
| 135 |
+
# Send completion marker
|
| 136 |
+
total_time = time.time() - start_time
|
| 137 |
+
await websocket.send_json({
|
| 138 |
+
"status": "complete",
|
| 139 |
+
"total_bytes": total_bytes,
|
| 140 |
+
"total_time_ms": round(total_time * 1000),
|
| 141 |
+
"ttfb_ms": round(ttfb) if first_chunk_sent else None
|
| 142 |
+
})
|
| 143 |
+
|
| 144 |
+
except WebSocketDisconnect:
|
| 145 |
+
manager.disconnect(client_id)
|
| 146 |
+
except Exception as e:
|
| 147 |
+
logger.error(f"WebSocket TTS error: {e}")
|
| 148 |
+
try:
|
| 149 |
+
await websocket.send_json({"error": str(e)})
|
| 150 |
+
except:
|
| 151 |
+
pass
|
| 152 |
+
manager.disconnect(client_id)
|
| 153 |
+
|
backend/app/core/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceForge Core Package
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .config import get_settings, Settings, LANGUAGE_METADATA
|
| 6 |
+
|
| 7 |
+
__all__ = ["get_settings", "Settings", "LANGUAGE_METADATA"]
|
backend/app/core/config.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceForge Configuration
|
| 3 |
+
Pydantic Settings for application configuration
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
from typing import List
|
| 8 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 9 |
+
from pydantic import Field
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Settings(BaseSettings):
    """Application settings loaded from environment variables.

    Values come from the process environment and an optional ``.env`` file;
    field names are matched case-insensitively. Unknown environment
    variables are tolerated (``extra="allow"``).
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="allow",  # Allow extra env vars without error
    )

    # Application
    app_name: str = "VoiceForge"
    app_version: str = "1.0.0"
    debug: bool = False

    # API Server
    api_host: str = "0.0.0.0"
    api_port: int = 8000

    # Database
    database_url: str = Field(
        default="sqlite:///./voiceforge.db",
        description="Database connection URL (SQLite for dev, PostgreSQL for prod)"
    )

    # Redis
    redis_url: str = Field(
        default="redis://localhost:6379/0",
        description="Redis connection URL for caching and Celery"
    )

    # Google Cloud
    google_application_credentials: str = Field(
        default="./credentials/google-cloud-key.json",
        description="Path to Google Cloud service account JSON key"
    )

    # AI Services Configuration
    use_local_services: bool = Field(
        default=True,
        description="Use local free services (Whisper + EdgeTTS) instead of Google Cloud"
    )
    whisper_model: str = Field(
        default="small",
        description="Whisper model size (tiny, base, small, medium, large-v3)"
    )

    # Security
    # SECURITY: this default is a placeholder only — deployments must set the
    # SECRET_KEY environment variable; JWTs signed with the default are forgeable.
    secret_key: str = Field(
        default="your-super-secret-key-change-in-production",
        description="Secret key for JWT encoding"
    )
    access_token_expire_minutes: int = 30
    algorithm: str = "HS256"
    hf_token: str | None = Field(default=None, description="Hugging Face Token for Diarization")

    # File Storage
    upload_dir: str = "./uploads"
    max_audio_duration_seconds: int = 600  # 10 minutes
    max_upload_size_mb: int = 50

    # Supported Languages
    # Comma-separated locale codes; parsed by supported_languages_list below.
    # Keep in sync with LANGUAGE_METADATA at module level.
    supported_languages: str = "en-US,en-GB,es-ES,es-MX,fr-FR,de-DE,ja-JP,ko-KR,zh-CN,hi-IN"

    # Audio Formats
    # Comma-separated extensions; parsed by supported_audio_formats_list below.
    supported_audio_formats: str = "wav,mp3,m4a,flac,ogg,webm"

    @property
    def supported_languages_list(self) -> List[str]:
        """Get supported languages as a list (whitespace-trimmed)."""
        return [lang.strip() for lang in self.supported_languages.split(",")]

    @property
    def supported_audio_formats_list(self) -> List[str]:
        """Get supported audio formats as a list (whitespace-trimmed)."""
        return [fmt.strip() for fmt in self.supported_audio_formats.split(",")]
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# Language metadata for UI display.
# NOTE: keys should stay in sync with Settings.supported_languages above;
# a locale present there but missing here will have no display name/flag.
LANGUAGE_METADATA = {
    "en-US": {"name": "English (US)", "flag": "🇺🇸", "native": "English"},
    "en-GB": {"name": "English (UK)", "flag": "🇬🇧", "native": "English"},
    "es-ES": {"name": "Spanish (Spain)", "flag": "🇪🇸", "native": "Español"},
    "es-MX": {"name": "Spanish (Mexico)", "flag": "🇲🇽", "native": "Español"},
    "fr-FR": {"name": "French", "flag": "🇫🇷", "native": "Français"},
    "de-DE": {"name": "German", "flag": "🇩🇪", "native": "Deutsch"},
    "ja-JP": {"name": "Japanese", "flag": "🇯🇵", "native": "日本語"},
    "ko-KR": {"name": "Korean", "flag": "🇰🇷", "native": "한국어"},
    "zh-CN": {"name": "Chinese (Mandarin)", "flag": "🇨🇳", "native": "中文"},
    "hi-IN": {"name": "Hindi", "flag": "🇮🇳", "native": "हिन्दी"},
}
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@lru_cache
def get_settings() -> Settings:
    """Return the application Settings, constructed once and memoized."""
    loaded = Settings()
    return loaded
|
backend/app/core/limiter.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded

# Rate-limiter backing store selection:
# prefer Redis (shared across workers) when REDIS_URL points at a live
# server, otherwise fall back to per-process in-memory storage for dev.
redis_url = os.getenv("REDIS_URL")


def _pick_storage_uri(candidate) -> str:
    """Return ``candidate`` if a Redis ping succeeds there, else "memory://"."""
    if not (candidate and candidate.strip()):
        return "memory://"
    try:
        import redis
        redis.from_url(candidate).ping()  # verify the server is reachable
        return candidate
    except Exception:
        # Redis unreachable (or client missing) — degrade gracefully.
        return "memory://"


storage_uri = _pick_storage_uri(redis_url)

# Shared limiter instance: 60 requests/minute per client IP by default.
limiter = Limiter(
    key_func=get_remote_address,
    storage_uri=storage_uri,
    default_limits=["60/minute"]
)
|
backend/app/core/middleware.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Rate Limiting Middleware
|
| 3 |
+
Uses Redis to track and limit request rates per IP address.
|
| 4 |
+
Pure ASGI implementation to avoid BaseHTTPMiddleware issues.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import time
|
| 8 |
+
import redis
|
| 9 |
+
from starlette.responses import JSONResponse
|
| 10 |
+
from starlette.types import ASGIApp, Scope, Receive, Send
|
| 11 |
+
from ..core.config import get_settings
|
| 12 |
+
|
| 13 |
+
settings = get_settings()
|
| 14 |
+
|
| 15 |
+
class RateLimitMiddleware:
    """Pure-ASGI fixed-window rate limiter backed by Redis.

    Caps requests to ``/api/`` routes at 60 per minute per client IP.
    If Redis cannot be reached (at startup or mid-request) the middleware
    fails open and simply forwards traffic.
    """

    def __init__(self, app: ASGIApp):
        self.app = app
        # Hardcoded or from settings (bypassing constructor arg issue)
        self.requests_per_minute = 60
        self.window_size = 60  # seconds

        # Connect to Redis; on failure the limiter is disabled entirely.
        try:
            self.redis_client = redis.from_url(settings.redis_url)
        except Exception as e:
            print(f"⚠️ Rate limiter disabled: Could not connect to Redis ({e})")
            self.redis_client = None

    async def __call__(self, scope: Scope, receive: Receive, send: Send):
        # Non-HTTP traffic (websocket, lifespan) passes straight through.
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return

        # Only limit API routes, and only while Redis is available.
        request_path = scope.get("path", "")
        if not request_path.startswith("/api/") or self.redis_client is None:
            await self.app(scope, receive, send)
            return

        # One counter per client IP.
        client = scope.get("client")
        client_ip = client[0] if client else "unknown"
        bucket_key = f"rate_limit:{client_ip}"

        try:
            # Fixed-window counting: INCR, then start the window on first hit.
            hit_count = self.redis_client.incr(bucket_key)
            if hit_count == 1:
                self.redis_client.expire(bucket_key, self.window_size)

            if hit_count > self.requests_per_minute:
                rejection = JSONResponse(
                    status_code=429,
                    content={
                        "detail": "Too many requests",
                        "retry_after": self.window_size
                    },
                    headers={"Retry-After": str(self.window_size)}
                )
                await rejection(scope, receive, send)
                return

        except redis.RedisError:
            # Fail open if Redis has issues during request
            pass

        await self.app(scope, receive, send)
|
backend/app/core/security.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Security Utilities
|
| 3 |
+
Handles password hashing, JWT generation, and API key verification.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from datetime import datetime, timedelta, timezone
from typing import Optional, Union, Any

from jose import jwt
from passlib.context import CryptContext
from fastapi.security import OAuth2PasswordBearer, APIKeyHeader
from fastapi import Depends, HTTPException, status
from sqlalchemy.orm import Session

from ..core.config import get_settings
from ..models import get_db, User, ApiKey
|
| 16 |
+
|
| 17 |
+
settings = get_settings()
|
| 18 |
+
|
| 19 |
+
# Password hashing (PBKDF2 is safer/easier on Windows than bcrypt sometimes)
|
| 20 |
+
pwd_context = CryptContext(schemes=["pbkdf2_sha256"], deprecated="auto")
|
| 21 |
+
|
| 22 |
+
# JWT configuration
|
| 23 |
+
SECRET_KEY = settings.secret_key
|
| 24 |
+
ALGORITHM = settings.algorithm
|
| 25 |
+
ACCESS_TOKEN_EXPIRE_MINUTES = settings.access_token_expire_minutes
|
| 26 |
+
|
| 27 |
+
# OAuth2 scheme
|
| 28 |
+
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="api/v1/auth/login")
|
| 29 |
+
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Check a plaintext password against its stored hash."""
    is_match = pwd_context.verify(plain_password, hashed_password)
    return is_match
|
| 34 |
+
|
| 35 |
+
def get_password_hash(password: str) -> str:
    """Hash a plaintext password with the configured scheme (pbkdf2_sha256)."""
    hashed = pwd_context.hash(password)
    return hashed
|
| 37 |
+
|
| 38 |
+
def create_access_token(subject: Union[str, Any], expires_delta: Optional[timedelta] = None) -> str:
    """
    Create a signed JWT access token.

    Args:
        subject: Value for the token's "sub" claim (coerced to str),
            typically the user id.
        expires_delta: Optional custom lifetime. When None, the configured
            ACCESS_TOKEN_EXPIRE_MINUTES is used.

    Returns:
        The encoded JWT string.
    """
    # Timezone-aware UTC; datetime.utcnow() is deprecated and produced a
    # naive timestamp. jose converts aware datetimes to epoch seconds.
    now = datetime.now(timezone.utc)

    # `is not None` (not truthiness): a zero timedelta is a valid, explicit
    # request for an already-expired token and must not fall back to the default.
    if expires_delta is not None:
        expire = now + expires_delta
    else:
        expire = now + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)

    to_encode = {"exp": expire, "sub": str(subject)}
    return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
|
| 47 |
+
|
| 48 |
+
async def get_current_user(token: str = Depends(oauth2_scheme), db: Session = Depends(get_db)) -> User:
    """Decode the bearer JWT and load the matching user, or raise 401."""
    unauthorized = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Could not validate credentials",
        headers={"WWW-Authenticate": "Bearer"},
    )

    try:
        claims = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
        subject = claims.get("sub")
    except Exception:
        # Any decode problem (bad signature, expiry, malformed token) -> 401.
        raise unauthorized

    if subject is None:
        raise unauthorized

    matched_user = db.query(User).filter(User.id == int(subject)).first()
    if matched_user is None:
        raise unauthorized
    return matched_user
|
| 67 |
+
|
| 68 |
+
async def get_current_active_user(current_user: User = Depends(get_current_user)) -> User:
    """Pass through the authenticated user, rejecting deactivated accounts."""
    if current_user.is_active:
        return current_user
    raise HTTPException(status_code=400, detail="Inactive user")
|
| 72 |
+
|
| 73 |
+
async def verify_api_key(
    api_key: str = Depends(api_key_header),
    db: Session = Depends(get_db)
) -> Optional[User]:
    """
    Resolve the X-API-Key header to a user.

    Returns the key's owner when the key exists and is active; returns
    None when the header is absent or the key is unknown/revoked.
    """
    if not api_key:
        return None  # Or raise if strict

    # `== True` is intentional: SQLAlchemy column comparison, not identity.
    record = (
        db.query(ApiKey)
        .filter(ApiKey.key == api_key, ApiKey.is_active == True)
        .first()
    )
    if record is None:
        return None  # Invalid key

    # Record when this key was last exercised.
    record.last_used_at = datetime.utcnow()
    db.commit()
    return record.user
|
| 93 |
+
|
| 94 |
+
def get_api_user_or_jwt_user(
    api_key_user: Optional[User] = Depends(verify_api_key),
    jwt_user: Optional[User] = Depends(get_current_user)
) -> User:
    """Allow access via either API Key or JWT.

    NOTE(review): oauth2_scheme above is declared with the default
    auto_error=True, so a request carrying only an X-API-Key header (no
    Authorization bearer token) makes the get_current_user dependency raise
    401 before this body ever runs — the API-key-only branch looks
    unreachable as written. Verify, and consider
    OAuth2PasswordBearer(..., auto_error=False) plus a tolerant JWT
    dependency if key-only access is intended.
    """
    if api_key_user:
        return api_key_user
    if jwt_user:
        return jwt_user

    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Not authenticated"
    )
|
backend/app/core/security_encryption.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Field-level Encryption for SQLAlchemy Models.
|
| 3 |
+
|
| 4 |
+
Uses Fernet symmetric encryption from the `cryptography` library.
|
| 5 |
+
The ENCRYPTION_KEY should be a 32-byte base64-encoded key.
|
| 6 |
+
Generate one with: from cryptography.fernet import Fernet; print(Fernet.generate_key())
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import base64
|
| 11 |
+
import logging
|
| 12 |
+
from typing import Optional
|
| 13 |
+
|
| 14 |
+
from cryptography.fernet import Fernet, InvalidToken
|
| 15 |
+
from sqlalchemy import TypeDecorator, String
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
# --- Configuration ---
|
| 20 |
+
# IMPORTANT: Store this securely! In production, use secrets manager or env vars.
|
| 21 |
+
# Default key is for development ONLY - regenerate for production!
|
| 22 |
+
_DEFAULT_DEV_KEY = "VOICEFORGE_DEV_KEY_REPLACE_ME_NOW=" # Placeholder - NOT a valid key
|
| 23 |
+
|
| 24 |
+
def _get_encryption_key() -> bytes:
    """
    Return the Fernet key, from the ENCRYPTION_KEY env var when set.

    Falls back to a fixed, INSECURE development key when the variable is
    unset, so that values encrypted in one dev run remain decryptable in
    the next. (The previous implementation returned a fresh random key on
    every process start, which orphaned previously stored ciphertext, and
    left the intended deterministic fallback as unreachable dead code.)
    """
    key_str = os.getenv("ENCRYPTION_KEY")

    if key_str:
        return key_str.encode()

    logger.warning("⚠️ ENCRYPTION_KEY not set! Using insecure dev key. DO NOT USE IN PRODUCTION.")
    # Deterministic dev-only key: stable across restarts, NOT secret.
    return base64.urlsafe_b64encode(b"32_byte_dev_key_for_testing_1234")
|
| 37 |
+
|
| 38 |
+
# Cache the Fernet instance
|
| 39 |
+
_fernet: Optional[Fernet] = None
|
| 40 |
+
|
| 41 |
+
def get_fernet() -> Fernet:
    """Return the process-wide Fernet instance, creating it lazily on first use."""
    global _fernet
    if _fernet is None:
        _fernet = Fernet(_get_encryption_key())
    return _fernet
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# --- SQLAlchemy TypeDecorator ---
|
| 51 |
+
|
| 52 |
+
class EncryptedString(TypeDecorator):
    """
    SQLAlchemy type that encrypts/decrypts string values transparently.

    Usage:
        class User(Base):
            full_name = Column(EncryptedString(255), nullable=True)

    The encrypted data is stored as a base64-encoded Fernet token in the
    database.
    """
    impl = String
    cache_ok = True

    def __init__(self, length: int = 512, *args, **kwargs):
        # A Fernet token is ~4/3 the plaintext size plus a fixed ~100-char
        # overhead (version byte, timestamp, IV, HMAC, base64 padding), so
        # the previous `length * 2` sizing underallocated short columns
        # (e.g. a 10-char column needs ~115 chars of storage). Use an
        # explicit bound, never smaller than the old sizing so existing
        # column definitions are unaffected.
        storage_length = max(length * 2, (length * 4) // 3 + 128)
        super().__init__(storage_length, *args, **kwargs)

    def process_bind_param(self, value, dialect):
        """Encrypt the value before storing in DB. None passes through."""
        if value is None:
            return None

        try:
            fernet = get_fernet()
            # Encode string to bytes, encrypt, then decode for storage.
            encrypted = fernet.encrypt(value.encode('utf-8'))
            return encrypted.decode('utf-8')
        except Exception as e:
            logger.error(f"Encryption failed: {e}")
            # Fail-open for dev: store plaintext rather than lose the write.
            # In production, you might want to raise instead.
            return value

    def process_result_value(self, value, dialect):
        """Decrypt the value when reading from DB. None passes through."""
        if value is None:
            return None

        try:
            fernet = get_fernet()
            decrypted = fernet.decrypt(value.encode('utf-8'))
            return decrypted.decode('utf-8')
        except InvalidToken:
            # Value might be plaintext (legacy rows or encryption disabled).
            logger.warning("Decryption failed - returning raw value (possible legacy data)")
            return value
        except Exception as e:
            logger.error(f"Decryption failed: {e}")
            return value
|
backend/app/core/security_headers.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from starlette.middleware.base import BaseHTTPMiddleware
|
| 2 |
+
from starlette.types import ASGIApp, Receive, Scope, Send
|
| 3 |
+
|
| 4 |
+
class SecurityHeadersMiddleware(BaseHTTPMiddleware):
    """Attach a fixed set of defensive HTTP headers to every response."""

    # Header table applied verbatim to each response:
    # - X-Frame-Options: prevent clickjacking.
    # - X-Content-Type-Options: prevent MIME type sniffing.
    # - X-XSS-Protection: legacy browser XSS filter, kept for depth.
    # - Strict-Transport-Security: enforce HTTPS for 1 year incl. subdomains
    #   (only effective when the app is actually served over HTTPS).
    # - Content-Security-Policy: self-only by default; 'unsafe-inline' is
    #   allowed for Swagger UI compatibility.
    # - Referrer-Policy: limit referrer leakage cross-origin.
    _SECURITY_HEADERS = {
        "X-Frame-Options": "DENY",
        "X-Content-Type-Options": "nosniff",
        "X-XSS-Protection": "1; mode=block",
        "Strict-Transport-Security": "max-age=31536000; includeSubDomains",
        "Content-Security-Policy": "default-src 'self'; img-src 'self' data: https:; style-src 'self' 'unsafe-inline'; script-src 'self' 'unsafe-inline';",
        "Referrer-Policy": "strict-origin-when-cross-origin",
    }

    def __init__(self, app: ASGIApp):
        super().__init__(app)

    async def dispatch(self, request, call_next):
        response = await call_next(request)
        for header_name, header_value in self._SECURITY_HEADERS.items():
            response.headers[header_name] = header_value
        return response
|
backend/app/main.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceForge - FastAPI Main Application
|
| 3 |
+
Production-grade Speech-to-Text & Text-to-Speech API
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
# WARN: PyTorch 2.6+ security workaround for Pyannote
|
| 8 |
+
# Must be before any other torch imports
|
| 9 |
+
import os
|
| 10 |
+
os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
|
| 11 |
+
import torch.serialization
|
| 12 |
+
try:
    torch.serialization.add_safe_globals([dict])
except Exception:
    # add_safe_globals only exists on newer PyTorch builds; its absence is
    # safe to ignore. A bare `except:` here previously also swallowed
    # KeyboardInterrupt/SystemExit, which must be allowed to propagate.
    pass
|
| 16 |
+
|
| 17 |
+
from contextlib import asynccontextmanager
|
| 18 |
+
from fastapi import FastAPI, Request
|
| 19 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 20 |
+
from fastapi.responses import JSONResponse
|
| 21 |
+
from fastapi.openapi.utils import get_openapi
|
| 22 |
+
|
| 23 |
+
from prometheus_fastapi_instrumentator import Instrumentator
|
| 24 |
+
from .core.config import get_settings
|
| 25 |
+
from .api.routes import (
|
| 26 |
+
stt_router,
|
| 27 |
+
tts_router,
|
| 28 |
+
health_router,
|
| 29 |
+
transcripts_router,
|
| 30 |
+
ws_router,
|
| 31 |
+
translation_router,
|
| 32 |
+
batch_router,
|
| 33 |
+
analysis_router,
|
| 34 |
+
audio_router,
|
| 35 |
+
cloning_router,
|
| 36 |
+
sign_router,
|
| 37 |
+
auth_router
|
| 38 |
+
)
|
| 39 |
+
from .models import Base, engine
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Configure logging
|
| 44 |
+
logging.basicConfig(
|
| 45 |
+
level=logging.INFO,
|
| 46 |
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
| 47 |
+
)
|
| 48 |
+
logger = logging.getLogger(__name__)
|
| 49 |
+
|
| 50 |
+
settings = get_settings()
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    FastAPI lifespan context.

    Startup: creates database tables, then best-effort pre-warms the
    Whisper STT models and the TTS voice cache (failures are logged as
    warnings, never fatal). Shutdown: currently log-only; connection
    cleanup remains a TODO.
    """
    # Startup
    logger.info(f"Starting {settings.app_name} v{settings.app_version}")

    # Create database tables (no-op for tables that already exist).
    logger.info("Creating database tables...")
    Base.metadata.create_all(bind=engine)

    # Pre-warm Whisper models so the first transcription request is fast.
    # Imported lazily: the heavy model dependencies load only at startup.
    logger.info("Pre-warming AI models...")
    try:
        from .services.whisper_stt_service import get_whisper_model
        # Pre-load English Distil model (most common)
        get_whisper_model("distil-small.en")
        logger.info("✅ Distil-Whisper model loaded")
        # Pre-load multilingual model
        get_whisper_model("small")
        logger.info("✅ Whisper-small model loaded")
    except Exception as e:
        logger.warning(f"Model pre-warming failed: {e}")

    # Pre-cache TTS voice list (presumably a network fetch - failure here
    # just means the first /voices request pays the cost instead).
    try:
        from .services.tts_service import get_tts_service
        tts_service = get_tts_service()
        await tts_service.get_voices()
        logger.info("✅ TTS voice list cached")
    except Exception as e:
        logger.warning(f"Voice list caching failed: {e}")

    logger.info("🚀 Startup complete - All models warmed up!")

    # Application serves requests while suspended here.
    yield

    # Shutdown
    logger.info("Shutting down...")
    # TODO: Close database connections
    # TODO: Close Redis connections
    logger.info("Shutdown complete")
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Create FastAPI application
|
| 100 |
+
app = FastAPI(
|
| 101 |
+
title=settings.app_name,
|
| 102 |
+
description="""
|
| 103 |
+
## VoiceForge API
|
| 104 |
+
|
| 105 |
+
Production-grade Speech-to-Text and Text-to-Speech API.
|
| 106 |
+
|
| 107 |
+
### Features
|
| 108 |
+
|
| 109 |
+
- 🎤 **Speech-to-Text**: Transcribe audio files with word-level timestamps
|
| 110 |
+
- 🔊 **Text-to-Speech**: Synthesize speech with 300+ neural voices
|
| 111 |
+
- 🌍 **Multi-language**: Support for 10+ languages
|
| 112 |
+
- 🧠 **AI Analysis**: Sentiment, keywords, and summarization
|
| 113 |
+
- 🌐 **Translation**: Translate text/audio between 20+ languages
|
| 114 |
+
- ⚡ **Free & Fast**: Local Whisper + Edge TTS - no API costs
|
| 115 |
+
""",
|
| 116 |
+
version=settings.app_version,
|
| 117 |
+
docs_url="/docs",
|
| 118 |
+
redoc_url="/redoc",
|
| 119 |
+
lifespan=lifespan,
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
from slowapi import _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.middleware import SlowAPIMiddleware
from .core.limiter import limiter
from .core.security_headers import SecurityHeadersMiddleware

# Add Rate Limiting (default: 60 requests/min per IP)
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
app.add_middleware(SlowAPIMiddleware)

# Security Headers (Must be before CORS to ensure headers are present even on errors/CORS blocks)
app.add_middleware(SecurityHeadersMiddleware)

# CORS middleware
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# contradictory - the CORS spec forbids credentialed requests against a
# wildcard origin, and Starlette will not send Access-Control-Allow-Origin
# for credentialed requests in this configuration. Pin explicit origins in
# production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Prometheus Metrics (exposes GET /metrics)
Instrumentator().instrument(app).expose(app)


# Include routers - health is unprefixed; everything else lives under /api/v1.
app.include_router(health_router)
app.include_router(auth_router, prefix="/api/v1")
app.include_router(stt_router, prefix="/api/v1")
app.include_router(tts_router, prefix="/api/v1")
app.include_router(transcripts_router, prefix="/api/v1")
app.include_router(ws_router, prefix="/api/v1")
app.include_router(translation_router, prefix="/api/v1")
app.include_router(batch_router, prefix="/api/v1")
app.include_router(analysis_router, prefix="/api/v1")
app.include_router(audio_router, prefix="/api/v1")
app.include_router(cloning_router, prefix="/api/v1")
app.include_router(sign_router, prefix="/api/v1")
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
# Exception handlers
|
| 169 |
+
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Catch-all handler: log the full traceback, return a generic 500 body.

    The exception detail is only exposed to clients in debug mode.
    """
    logger.exception(f"Unhandled error: {exc}")
    detail = str(exc) if settings.debug else None
    body = {
        "error": "internal_server_error",
        "message": "An unexpected error occurred",
        "detail": detail,
    }
    return JSONResponse(status_code=500, content=body)
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    """Map ValueError raised by services into a 400 validation response."""
    payload = {
        "error": "validation_error",
        "message": str(exc),
    }
    return JSONResponse(status_code=400, content=payload)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
# Root endpoint
|
| 196 |
+
@app.get("/", tags=["Root"])
|
| 197 |
+
async def root():
|
| 198 |
+
"""API root - returns basic info"""
|
| 199 |
+
return {
|
| 200 |
+
"name": settings.app_name,
|
| 201 |
+
"version": settings.app_version,
|
| 202 |
+
"status": "running",
|
| 203 |
+
"docs": "/docs",
|
| 204 |
+
"health": "/health",
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
# Custom OpenAPI schema
|
| 209 |
+
def custom_openapi():
    """Generate the OpenAPI schema once and cache it on the app.

    Adds an x-logo extension (used by ReDoc) and replaces the tag list
    with curated descriptions.
    """
    # Cached after the first call; subsequent calls return the same dict.
    if app.openapi_schema:
        return app.openapi_schema

    openapi_schema = get_openapi(
        title=settings.app_name,
        version=settings.app_version,
        description=app.description,
        routes=app.routes,
    )

    # Add custom logo (ReDoc extension).
    # NOTE(review): placeholder URL - replace with a real asset.
    openapi_schema["info"]["x-logo"] = {
        "url": "https://example.com/logo.png"
    }

    # Add tags with descriptions.
    # NOTE(review): this OVERWRITES the auto-generated tag list, so router
    # tags not listed here lose any descriptions - confirm that is intended.
    openapi_schema["tags"] = [
        {
            "name": "Health",
            "description": "Health check endpoints for monitoring",
        },
        {
            "name": "Speech-to-Text",
            "description": "Convert audio to text with timestamps and speaker detection",
        },
        {
            "name": "Text-to-Speech",
            "description": "Convert text to natural-sounding speech",
        },
    ]

    app.openapi_schema = openapi_schema
    return app.openapi_schema
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
app.openapi = custom_openapi
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
if __name__ == "__main__":
|
| 250 |
+
import uvicorn
|
| 251 |
+
|
| 252 |
+
uvicorn.run(
|
| 253 |
+
"app.main:app",
|
| 254 |
+
host=settings.api_host,
|
| 255 |
+
port=settings.api_port,
|
| 256 |
+
reload=settings.debug,
|
| 257 |
+
)
|
backend/app/schemas/__init__.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceForge Schemas Package
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .stt import (
|
| 6 |
+
TranscriptionRequest,
|
| 7 |
+
TranscriptionResponse,
|
| 8 |
+
TranscriptionSegment,
|
| 9 |
+
TranscriptionWord,
|
| 10 |
+
LanguageInfo,
|
| 11 |
+
)
|
| 12 |
+
from .tts import (
|
| 13 |
+
SynthesisRequest,
|
| 14 |
+
SynthesisResponse,
|
| 15 |
+
VoiceInfo,
|
| 16 |
+
VoiceListResponse,
|
| 17 |
+
)
|
| 18 |
+
from .transcript import (
|
| 19 |
+
TranscriptCreate,
|
| 20 |
+
TranscriptUpdate,
|
| 21 |
+
TranscriptResponse,
|
| 22 |
+
TranscriptListResponse,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
__all__ = [
|
| 26 |
+
"TranscriptionRequest",
|
| 27 |
+
"TranscriptionResponse",
|
| 28 |
+
"TranscriptionSegment",
|
| 29 |
+
"TranscriptionWord",
|
| 30 |
+
"LanguageInfo",
|
| 31 |
+
"SynthesisRequest",
|
| 32 |
+
"SynthesisResponse",
|
| 33 |
+
"VoiceInfo",
|
| 34 |
+
"VoiceListResponse",
|
| 35 |
+
"TranscriptCreate",
|
| 36 |
+
"TranscriptUpdate",
|
| 37 |
+
"TranscriptResponse",
|
| 38 |
+
"TranscriptListResponse",
|
| 39 |
+
]
|
backend/app/schemas/stt.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Speech-to-Text Schemas
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from typing import List, Optional, Dict, Any
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class TranscriptionWord(BaseModel):
    """Individual word with timing information"""
    word: str
    # Offsets are absolute, from the start of the audio.
    start_time: float = Field(..., description="Start time in seconds")
    end_time: float = Field(..., description="End time in seconds")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence score")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class TranscriptionSegment(BaseModel):
    """Transcript segment with speaker and timing"""
    text: str
    start_time: float = Field(..., description="Start time in seconds")
    end_time: float = Field(..., description="End time in seconds")
    # Only populated when speaker diarization was requested.
    speaker: Optional[str] = Field(None, description="Speaker label (e.g., SPEAKER_1)")
    confidence: float = Field(..., ge=0.0, le=1.0)
    # Word-level detail; None when word timestamps were not requested.
    words: Optional[List[TranscriptionWord]] = None
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class TranscriptionRequest(BaseModel):
    """Request parameters for transcription"""
    language: str = Field(default="en-US", description="Language code (e.g., en-US)")
    enable_automatic_punctuation: bool = True
    enable_word_time_offsets: bool = True
    enable_speaker_diarization: bool = False
    # Expected speaker count; only meaningful when diarization is enabled.
    diarization_speaker_count: Optional[int] = Field(None, ge=2, le=10)
    model: str = Field(default="default", description="STT model to use")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class TranscriptionResponse(BaseModel):
    """Response from transcription"""
    # DB identifiers; None for transcriptions that were not persisted.
    id: Optional[int] = None
    audio_file_id: Optional[int] = None
    text: str = Field(..., description="Full transcription text")
    segments: List[TranscriptionSegment] = Field(default_factory=list)
    words: Optional[List[TranscriptionWord]] = None
    language: str
    # Filled when the backend auto-detected a language differing from the request.
    detected_language: Optional[str] = None
    confidence: float = Field(..., ge=0.0, le=1.0)
    duration: float = Field(..., description="Audio duration in seconds")
    word_count: int
    processing_time: float = Field(..., description="Processing time in seconds")

    # Allow construction directly from ORM objects (pydantic v2).
    model_config = {
        "from_attributes": True
    }
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class StreamingTranscriptionResponse(BaseModel):
    """Response for streaming transcription updates"""
    # False while the hypothesis may still be revised by later audio.
    is_final: bool = False
    text: str
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    stability: float = Field(default=0.0, ge=0.0, le=1.0)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class LanguageInfo(BaseModel):
    """Language information for UI display"""
    code: str = Field(..., description="Language code (e.g., en-US)")
    name: str = Field(..., description="Display name (e.g., English (US))")
    native_name: str = Field(..., description="Native name (e.g., English)")
    flag: str = Field(..., description="Flag emoji")
    # Per-direction capability flags for this language.
    stt_supported: bool = True
    tts_supported: bool = True
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class LanguageListResponse(BaseModel):
    """Response with list of supported languages"""
    languages: List[LanguageInfo]
    # Total number of entries in `languages`.
    total: int
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class TaskStatusResponse(BaseModel):
    """Status of an async transcription task"""
    task_id: str
    status: str = Field(..., description="pending, processing, completed, failed")
    progress: float = Field(default=0.0, ge=0.0, le=100.0, description="Progress percentage")
    # Populated only once status == "completed".
    result: Optional[TranscriptionResponse] = None
    # Populated only once status == "failed".
    error: Optional[str] = None
    created_at: datetime
    updated_at: datetime
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class AsyncTranscriptionResponse(BaseModel):
    """Response for async transcription submission"""
    # Token for polling the task status endpoint.
    task_id: str
    audio_file_id: int
    status: str = "queued"
    message: str = "File uploaded and queued for processing"
|
backend/app/schemas/transcript.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Transcript Schemas
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from typing import List, Optional, Dict, Any
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
from .stt import TranscriptionSegment, TranscriptionWord
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class TranscriptCreate(BaseModel):
    """Schema for creating a transcript"""
    raw_text: str
    processed_text: Optional[str] = None
    # Free-form dicts mirroring TranscriptionSegment/TranscriptionWord shapes.
    segments: Optional[List[Dict[str, Any]]] = None
    words: Optional[List[Dict[str, Any]]] = None
    language: str = "en-US"
    confidence: Optional[float] = None
    duration: Optional[float] = None
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class TranscriptUpdate(BaseModel):
    """Schema for updating a transcript (all fields optional, partial update)"""
    processed_text: Optional[str] = None
    language: Optional[str] = None
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class TranscriptResponse(BaseModel):
    """Schema for transcript response"""
    id: int
    audio_file_id: Optional[int] = None
    user_id: Optional[int] = None
    raw_text: Optional[str] = None
    processed_text: Optional[str] = None
    segments: Optional[List[Dict[str, Any]]] = None
    words: Optional[List[Dict[str, Any]]] = None
    language: Optional[str] = None
    # Translation output, when one was requested for this transcript.
    translation_language: Optional[str] = None
    translated_text: Optional[str] = None
    # AI analysis results; populated on demand by the analysis routes.
    sentiment: Optional[Dict[str, Any]] = None
    topics: Optional[List[str]] = None
    keywords: Optional[List[Dict[str, Any]]] = None
    summary: Optional[str] = None
    confidence: Optional[float] = None
    duration: Optional[float] = None
    word_count: Optional[int] = None
    created_at: datetime
    updated_at: Optional[datetime] = None

    # Allow construction directly from ORM objects (pydantic v2).
    model_config = {
        "from_attributes": True
    }
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class TranscriptListResponse(BaseModel):
    """Schema for paginated transcript list"""
    transcripts: List[TranscriptResponse]
    # Total row count across all pages, not just this page.
    total: int
    page: int
    page_size: int
    has_more: bool
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class ExportRequest(BaseModel):
    """Schema for transcript export request"""
    # Output format is restricted by regex to the supported exporters.
    format: str = Field(..., pattern="^(txt|srt|vtt|pdf|json)$")
    include_timestamps: bool = True
    include_speakers: bool = True
|
backend/app/schemas/tts.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text-to-Speech Schemas
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
from pydantic import BaseModel, Field
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class SynthesisRequest(BaseModel):
    """Request for text-to-speech synthesis"""
    text: str = Field(..., min_length=1, max_length=5000, description="Text to synthesize")
    language: str = Field(default="en-US", description="Language code")
    # When None, the service picks a default voice for the language.
    voice: Optional[str] = Field(None, description="Voice name (e.g., en-US-Wavenet-D)")

    # Audio configuration
    audio_encoding: str = Field(default="MP3", description="Output format: MP3, LINEAR16, OGG_OPUS")
    sample_rate: int = Field(default=24000, description="Sample rate in Hz")

    # Voice tuning (1.0 rate / 0.0 pitch / 0.0 gain = neutral)
    speaking_rate: float = Field(default=1.0, ge=0.25, le=4.0, description="Speaking rate")
    pitch: float = Field(default=0.0, ge=-20.0, le=20.0, description="Voice pitch in semitones")
    volume_gain_db: float = Field(default=0.0, ge=-96.0, le=16.0, description="Volume gain in dB")

    # SSML support
    use_ssml: bool = Field(default=False, description="Treat text as SSML")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class SynthesisResponse(BaseModel):
    """Response from text-to-speech synthesis"""
    audio_content: str = Field(..., description="Base64 encoded audio")
    # Size of the DECODED audio in bytes, not the base64 string length.
    # TODO confirm against the synthesizing service.
    audio_size: int = Field(..., description="Audio size in bytes")
    duration_estimate: float = Field(..., description="Estimated duration in seconds")
    voice_used: str
    language: str
    encoding: str
    sample_rate: int
    processing_time: float = Field(..., description="Processing time in seconds")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class VoiceInfo(BaseModel):
    """Information about a TTS voice"""
    name: str = Field(..., description="Voice name (e.g., en-US-Wavenet-D)")
    language_code: str = Field(..., description="Language code")
    language_name: str = Field(..., description="Language display name")
    ssml_gender: str = Field(..., description="MALE, FEMALE, or NEUTRAL")
    natural_sample_rate: int = Field(..., description="Native sample rate in Hz")
    voice_type: str = Field(..., description="Standard, WaveNet, or Neural2")

    # Display helpers for UI; optional, filled when available.
    display_name: Optional[str] = None
    flag: Optional[str] = None
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class VoiceListResponse(BaseModel):
    """Response with list of available voices"""
    voices: List[VoiceInfo]
    total: int
    # Echo of the language filter the caller applied, if any.
    language_filter: Optional[str] = None
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class VoicePreviewRequest(BaseModel):
    """Request for voice preview"""
    voice: str = Field(..., description="Voice name to preview")
    # Short sample phrase; capped so previews stay cheap to synthesize.
    text: Optional[str] = Field(
        default="Hello! This is a preview of my voice.",
        max_length=200
    )
|
backend/app/services/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VoiceForge Services Package
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .stt_service import STTService
|
| 6 |
+
from .tts_service import TTSService
|
| 7 |
+
from .file_service import FileService
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"STTService",
|
| 11 |
+
"TTSService",
|
| 12 |
+
"FileService",
|
| 13 |
+
]
|
backend/app/services/audio_service.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio Editing Service
|
| 3 |
+
Handles audio manipulation: Trimming, Merging, and Conversion using Pydub/FFmpeg
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import logging
|
| 8 |
+
from typing import List, Optional
|
| 9 |
+
from pydub import AudioSegment
|
| 10 |
+
import tempfile
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
class AudioService:
    """
    Service for audio manipulation tasks (trim, merge, format conversion).

    Built on pydub, which shells out to ffmpeg — ffmpeg must be
    installed/available in PATH for any non-wav formats.
    """

    def __init__(self):
        pass

    def load_audio(self, file_path: str) -> "AudioSegment":
        """Load an audio file into a pydub AudioSegment.

        Raises:
            ValueError: if the file cannot be read or decoded.
        """
        try:
            return AudioSegment.from_file(file_path)
        except Exception as e:
            logger.error(f"Failed to load audio {file_path}: {e}")
            # Chain the cause so the original decode error stays visible.
            raise ValueError(f"Could not load audio file: {str(e)}") from e

    def trim_audio(self, input_path: str, start_ms: int, end_ms: int, output_path: Optional[str] = None) -> str:
        """
        Trim audio from start_ms to end_ms (milliseconds).

        Args:
            input_path: Source audio file.
            start_ms: Start of the kept region; must be >= 0.
            end_ms: End of the kept region; must be > start_ms.
            output_path: Destination path; defaults to "<input>_trimmed<ext>".

        Returns:
            Path of the trimmed file.

        Raises:
            ValueError: on invalid timestamps or unreadable input.
        """
        if start_ms < 0 or end_ms <= start_ms:
            raise ValueError("Invalid start/end timestamps")

        audio = self.load_audio(input_path)

        # Check duration
        if start_ms >= len(audio):
            raise ValueError("Start time exceeds audio duration")

        # Slice (pydub clamps end_ms past the end of the audio)
        trimmed = audio[start_ms:end_ms]

        if not output_path:
            base, ext = os.path.splitext(input_path)
            output_path = f"{base}_trimmed{ext}"

        # Fall back to mp3 when the output path has no extension;
        # an empty format string would make pydub/ffmpeg fail.
        fmt = os.path.splitext(output_path)[1][1:] or "mp3"
        trimmed.export(output_path, format=fmt)
        logger.info(f"Trimmed audio saved to {output_path}")
        return output_path

    def merge_audio(self, file_paths: List[str], output_path: str, crossfade_ms: int = 0) -> str:
        """
        Merge multiple audio files into one, in the given order.

        Args:
            file_paths: Files to concatenate (must be non-empty).
            output_path: Destination path; its extension selects the format
                (mp3 when no extension is present).
            crossfade_ms: Crossfade duration between consecutive clips.

        Returns:
            Path of the merged file.

        Raises:
            ValueError: if file_paths is empty or a file cannot be loaded.
        """
        if not file_paths:
            raise ValueError("No files to merge")

        combined = AudioSegment.empty()

        for path in file_paths:
            segment = self.load_audio(path)
            # Crossfade needs existing audio to overlap with.
            if crossfade_ms > 0 and len(combined) > 0:
                combined = combined.append(segment, crossfade=crossfade_ms)
            else:
                combined += segment

        # Create dir if needed. dirname is "" for bare filenames,
        # and os.makedirs("") raises FileNotFoundError, so guard it.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Export
        fmt = os.path.splitext(output_path)[1][1:] or "mp3"
        combined.export(output_path, format=fmt)
        logger.info(f"Merged {len(file_paths)} files to {output_path}")
        return output_path

    def convert_format(self, input_path: str, target_format: str) -> str:
        """
        Convert audio to another container/codec (e.g. wav -> mp3).

        Returns:
            Path of the converted file (same basename, new extension).
        """
        audio = self.load_audio(input_path)

        base = os.path.splitext(input_path)[0]
        output_path = f"{base}.{target_format}"

        audio.export(output_path, format=target_format)
        logger.info(f"Converted to {target_format}: {output_path}")
        return output_path
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# Singleton
|
| 95 |
+
_audio_service = None
|
| 96 |
+
|
| 97 |
+
def get_audio_service() -> AudioService:
    """Return the process-wide AudioService, creating it on first use."""
    global _audio_service
    if _audio_service is not None:
        return _audio_service
    _audio_service = AudioService()
    return _audio_service
|
backend/app/services/batch_service.py
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Batch Processing Service
|
| 3 |
+
Handles multi-file transcription with job tracking and parallel processing
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import logging
|
| 8 |
+
import os
|
| 9 |
+
import tempfile
|
| 10 |
+
import uuid
|
| 11 |
+
import zipfile
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Dict, List, Optional, Any
|
| 15 |
+
from dataclasses import dataclass, field
|
| 16 |
+
from enum import Enum
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class JobStatus(str, Enum):
    """Batch job status enum."""
    PENDING = "pending"        # created, not yet started
    PROCESSING = "processing"  # files are being transcribed
    COMPLETED = "completed"    # all files finished without errors
    FAILED = "failed"          # at least one file failed
    CANCELLED = "cancelled"    # cancelled before completion
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class FileStatus(str, Enum):
    """Individual file status."""
    QUEUED = "queued"          # waiting its turn in the batch
    PROCESSING = "processing"  # currently being transcribed
    COMPLETED = "completed"    # transcript written successfully
    FAILED = "failed"          # transcription raised an error
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@dataclass
class FileResult:
    """Result for a single file in batch."""
    filename: str                            # original upload filename (also the dict key)
    status: FileStatus = FileStatus.QUEUED
    progress: float = 0.0                    # 0-100 for this file
    transcript: Optional[str] = None         # full transcript text once completed
    language: Optional[str] = None           # detected/forced language code
    duration: Optional[float] = None         # audio duration — presumably seconds; TODO confirm against worker
    word_count: Optional[int] = None         # whitespace-split word count of the transcript
    processing_time: Optional[float] = None  # wall-clock seconds spent transcribing
    error: Optional[str] = None              # failure reason when status == FAILED
    output_path: Optional[str] = None        # path of the written transcript file
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass
class BatchJob:
    """Batch processing job."""
    job_id: str
    status: JobStatus = JobStatus.PENDING
    created_at: datetime = field(default_factory=datetime.now)
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    files: Dict[str, FileResult] = field(default_factory=dict)
    total_files: int = 0
    completed_files: int = 0
    failed_files: int = 0
    options: Dict[str, Any] = field(default_factory=dict)
    output_zip_path: Optional[str] = None

    @property
    def progress(self) -> float:
        """Overall job progress percentage (completed + failed over total)."""
        if self.total_files == 0:
            return 0.0
        finished = self.completed_files + self.failed_files
        return finished / self.total_files * 100

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for API response."""

        def iso(dt: Optional[datetime]) -> Optional[str]:
            return dt.isoformat() if dt else None

        def file_payload(f: FileResult) -> Dict[str, Any]:
            text = f.transcript
            # Truncate long transcripts so the response stays small.
            if text and len(text) > 500:
                text = text[:500] + "..."
            return {
                "filename": f.filename,
                "status": f.status.value,
                "progress": f.progress,
                "transcript": text,
                "language": f.language,
                "duration": f.duration,
                "word_count": f.word_count,
                "processing_time": f.processing_time,
                "error": f.error,
            }

        return {
            "job_id": self.job_id,
            "status": self.status.value,
            "progress": round(self.progress, 1),
            "created_at": self.created_at.isoformat(),
            "started_at": iso(self.started_at),
            "completed_at": iso(self.completed_at),
            "total_files": self.total_files,
            "completed_files": self.completed_files,
            "failed_files": self.failed_files,
            "files": {name: file_payload(f) for name, f in self.files.items()},
            "options": self.options,
            "has_zip": self.output_zip_path is not None,
        }
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# In-memory job store (use Redis in production)
|
| 107 |
+
_batch_jobs: Dict[str, BatchJob] = {}
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class BatchProcessingService:
    """
    Service for batch audio transcription.

    Files are processed sequentially in this service (heavy compute is
    dispatched to Celery workers per file); per-file progress is tracked
    on the job, transcripts are written to disk, and all outputs are
    bundled into a ZIP.
    """

    def __init__(self, output_dir: Optional[str] = None):
        """Initialize batch service.

        Args:
            output_dir: Directory for transcript files and result ZIPs;
                defaults to the system temp directory.
        """
        self.output_dir = output_dir or tempfile.gettempdir()
        self._processing_lock = asyncio.Lock()

    def create_job(
        self,
        filenames: List[str],
        options: Optional[Dict[str, Any]] = None,
    ) -> "BatchJob":
        """
        Create a new batch job.

        Args:
            filenames: List of filenames to process
            options: Processing options (language, output_format, etc.)

        Returns:
            Created BatchJob
        """
        # Short ID is enough for the in-memory store.
        job_id = str(uuid.uuid4())[:8]

        files = {
            name: FileResult(filename=name)
            for name in filenames
        }

        job = BatchJob(
            job_id=job_id,
            files=files,
            total_files=len(filenames),
            options=options or {},
        )

        _batch_jobs[job_id] = job
        logger.info(f"Created batch job {job_id} with {len(filenames)} files")

        return job

    def get_job(self, job_id: str) -> Optional["BatchJob"]:
        """Get job by ID, or None if unknown."""
        return _batch_jobs.get(job_id)

    def list_jobs(self, limit: int = 20) -> List["BatchJob"]:
        """List the most recently created jobs, newest first."""
        jobs = sorted(_batch_jobs.values(), key=lambda j: j.created_at, reverse=True)
        return jobs[:limit]

    async def process_job(
        self,
        job_id: str,
        file_paths: Dict[str, str],
    ) -> "BatchJob":
        """
        Process all files in a batch job.

        Args:
            job_id: Job ID
            file_paths: Mapping of filename -> temp file path

        Returns:
            Completed BatchJob

        Raises:
            ValueError: if the job ID is unknown.
        """
        import time

        job = self.get_job(job_id)
        if not job:
            raise ValueError(f"Job not found: {job_id}")

        job.status = JobStatus.PROCESSING
        job.started_at = datetime.now()

        # Get options
        language = job.options.get("language")
        output_format = job.options.get("output_format", "txt")

        # Process each file
        output_files: List[str] = []

        for filename, file_path in file_paths.items():
            file_result = job.files.get(filename)
            if not file_result:
                # Path supplied for a file the job doesn't know about.
                continue

            file_result.status = FileStatus.PROCESSING
            file_result.progress = 0.0

            try:
                start_time = time.time()

                # Transcribe via Celery worker so heavy compute is off-process.
                from app.workers.tasks import transcribe_file_path

                task = transcribe_file_path.delay(
                    file_path=file_path,
                    language=language,
                    output_format=output_format
                )

                # Block on the worker result. In a fully async architecture
                # we would return the job_id and poll, but this keeps the
                # batch logic simple while still scaling the compute.
                task_result = task.get(timeout=600)  # 10 min timeout per file

                processing_time = time.time() - start_time

                # Update file result
                file_result.transcript = task_result.get("text", "")
                file_result.language = task_result.get("language", "unknown")
                file_result.duration = task_result.get("duration")
                file_result.word_count = len(file_result.transcript.split())
                file_result.processing_time = round(processing_time, 2)
                file_result.status = FileStatus.COMPLETED
                file_result.progress = 100.0

                # Save output file
                output_filename = Path(filename).stem + f".{output_format}"
                output_path = os.path.join(self.output_dir, job_id, output_filename)
                os.makedirs(os.path.dirname(output_path), exist_ok=True)

                with open(output_path, "w", encoding="utf-8") as f:
                    if output_format == "srt":
                        # Write SRT format from the worker's raw segment dicts
                        for i, seg in enumerate(task_result.get("segments", []), 1):
                            start = self._format_srt_time(seg.get("start", 0))
                            end = self._format_srt_time(seg.get("end", 0))
                            text = seg.get("text", "").strip()
                            f.write(f"{i}\n{start} --> {end}\n{text}\n\n")
                    else:
                        f.write(file_result.transcript)

                file_result.output_path = output_path
                output_files.append(output_path)

                job.completed_files += 1
                logger.info(f"[{job_id}] Completed {filename} ({job.completed_files}/{job.total_files})")

            except Exception as e:
                file_result.status = FileStatus.FAILED
                file_result.error = str(e)
                file_result.progress = 0.0
                job.failed_files += 1
                logger.error(f"[{job_id}] Failed {filename}: {e}")

            finally:
                # Best-effort temp-file cleanup; never fail the batch over it.
                try:
                    if os.path.exists(file_path):
                        os.unlink(file_path)
                except OSError as cleanup_err:
                    logger.warning(f"[{job_id}] Could not remove temp file {file_path}: {cleanup_err}")

        # Create ZIP of all outputs
        if output_files:
            zip_path = os.path.join(self.output_dir, f"{job_id}_results.zip")
            with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
                for file_path in output_files:
                    zf.write(file_path, os.path.basename(file_path))

            job.output_zip_path = zip_path
            logger.info(f"[{job_id}] Created ZIP: {zip_path}")

        # Update job status
        job.status = JobStatus.COMPLETED if job.failed_files == 0 else JobStatus.FAILED
        job.completed_at = datetime.now()

        return job

    def _format_srt_time(self, seconds: float) -> str:
        """Format seconds to SRT time format (HH:MM:SS,mmm)."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        millis = int((seconds % 1) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    def cancel_job(self, job_id: str) -> bool:
        """Cancel a pending/processing job. Returns True if cancelled."""
        job = self.get_job(job_id)
        if job and job.status in [JobStatus.PENDING, JobStatus.PROCESSING]:
            job.status = JobStatus.CANCELLED
            return True
        return False

    def delete_job(self, job_id: str) -> bool:
        """Delete a job and its output files. Returns True if the job existed."""
        job = _batch_jobs.pop(job_id, None)
        if not job:
            return False

        # Clean up files (best-effort)
        if job.output_zip_path and os.path.exists(job.output_zip_path):
            try:
                os.unlink(job.output_zip_path)
            except OSError as e:
                logger.warning(f"Could not delete ZIP for job {job_id}: {e}")

        job_dir = os.path.join(self.output_dir, job_id)
        if os.path.exists(job_dir):
            try:
                import shutil
                shutil.rmtree(job_dir)
            except OSError as e:
                logger.warning(f"Could not delete output dir for job {job_id}: {e}")

        return True

    def get_zip_path(self, job_id: str) -> Optional[str]:
        """Get path to job's output ZIP file, or None if not available."""
        job = self.get_job(job_id)
        if job and job.output_zip_path and os.path.exists(job.output_zip_path):
            return job.output_zip_path
        return None
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
# Singleton instance
|
| 340 |
+
_batch_service: Optional[BatchProcessingService] = None
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
def get_batch_service() -> BatchProcessingService:
    """Get or create BatchProcessingService singleton."""
    global _batch_service
    if _batch_service is not None:
        return _batch_service
    _batch_service = BatchProcessingService()
    return _batch_service
|
backend/app/services/cache_service.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import redis
|
| 2 |
+
import json
|
| 3 |
+
import hashlib
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Optional, Any
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
|
| 8 |
+
from ..core.config import get_settings
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
class CacheService:
    """
    Byte-oriented cache: Redis backend with a DiskCache fallback.

    Redis is tried at construction; when it is unreachable, a local
    diskcache directory is used instead. get/set swallow and log all
    backend errors so caching stays strictly best-effort.
    """

    def __init__(self):
        settings = get_settings()
        self.default_ttl = 3600  # 1 hour
        self.redis = None
        self.disk_cache = None

        # Try Redis first
        try:
            self.redis = redis.from_url(settings.redis_url, decode_responses=False)
            self.redis.ping()
            logger.info("✅ Redis Cache connected")
        except Exception as e:
            logger.warning(f"⚠️ Redis unavailable, falling back to DiskCache: {e}")
            self.redis = None

            # Fallback to DiskCache
            try:
                import diskcache
                cache_dir = "./cache_data"
                self.disk_cache = diskcache.Cache(cache_dir)
                logger.info(f"💾 DiskCache initialized at {cache_dir}")
            except Exception as e:
                logger.error(f"❌ DiskCache init failed: {e}")

    def get(self, key: str) -> Optional[bytes]:
        """Get raw bytes from cache.

        Returns None on a miss, on any backend error, or when no backend
        is available.
        """
        try:
            if self.redis:
                return self.redis.get(key)
            elif self.disk_cache:
                return self.disk_cache.get(key)
        except Exception as e:
            logger.error(f"Cache get failed: {e}")
        return None

    def set(self, key: str, value: bytes, ttl: Optional[int] = None):
        """Store raw bytes with a TTL in seconds (default_ttl when omitted).

        NOTE: a ttl of 0 falls back to default_ttl — zero/infinite TTLs
        are not supported by this API.
        """
        try:
            ttl_val = ttl or self.default_ttl

            if self.redis:
                self.redis.setex(key, ttl_val, value)
            elif self.disk_cache:
                self.disk_cache.set(key, value, expire=ttl_val)
        except Exception as e:
            logger.error(f"Cache set failed: {e}")

    def generate_key(self, prefix: str, **kwargs) -> str:
        """Generate a stable cache key: '<prefix>:<md5 of sorted kwargs>'.

        Keyword argument order does not affect the result.
        """
        # Convert all values to string so type differences can't change
        # the serialized form; json's sort_keys gives a canonical order.
        safe_kwargs = {k: str(v) for k, v in kwargs.items()}
        key_str = json.dumps(safe_kwargs, sort_keys=True)
        # md5 is fine here: the key is a cache lookup handle, not a secret.
        hash_str = hashlib.md5(key_str.encode()).hexdigest()
        return f"{prefix}:{hash_str}"
|
| 68 |
+
|
| 69 |
+
@lru_cache()
def get_cache_service() -> CacheService:
    """Return the shared CacheService instance (memoized via lru_cache)."""
    return CacheService()
|
backend/app/services/clone_service.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Voice Cloning Service (Coqui XTTS)
|
| 3 |
+
High-quality multi-lingual text-to-speech with voice cloning capabilities.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import logging
|
| 8 |
+
import torch
|
| 9 |
+
import gc
|
| 10 |
+
from typing import List, Optional, Dict, Any
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import tempfile
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
class CloneService:
    """
    Service for Voice Cloning using Coqui XTTS v2.

    The model is heavy, so it is lazy-loaded on first use and can be
    unloaded again to free VRAM/RAM.
    """

    def __init__(self):
        # Prefer GPU when available; XTTS on CPU is very slow.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = None
        self.model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
        self.loaded = False

    def load_model(self):
        """Lazy load the heavy XTTS model.

        Raises:
            ImportError: if the Coqui 'TTS' package is not installed.
        """
        if self.loaded:
            return

        try:
            logger.info(f"Loading XTTS model ({self.device})... This may take a while.")
            from TTS.api import TTS

            # Load model
            self.tts = TTS(self.model_name).to(self.device)
            self.loaded = True
            logger.info("✅ XTTS Model loaded successfully")

        except ImportError as e:
            logger.error("TTS library not installed. Please install 'TTS'.")
            # Chain the original error so the real cause stays visible.
            raise ImportError("Voice Cloning requires 'TTS' library.") from e
        except Exception as e:
            logger.error(f"Failed to load XTTS model: {e}")
            raise

    def unload_model(self):
        """Unload model to free VRAM."""
        if self.tts:
            del self.tts
            self.tts = None
            self.loaded = False
            gc.collect()
            # Only touch the CUDA allocator when CUDA actually exists.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            logger.info("🗑️ XTTS Model unloaded")

    def clone_voice(
        self,
        text: str,
        speaker_wav_paths: List[str],
        language: str = "en",
        output_path: Optional[str] = None
    ) -> str:
        """
        Synthesize speech in the style of the reference audio.

        Args:
            text: Text to speak.
            speaker_wav_paths: One or more reference WAV files; multiple
                samples can improve cloning quality.
            language: Target language code (see get_supported_languages()).
            output_path: Destination WAV path; a temp file is created
                when omitted.

        Returns:
            Path to the generated WAV file.
        """
        if not self.loaded:
            self.load_model()

        if not output_path:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                output_path = f.name

        try:
            # XTTS synthesis; split_sentences keeps long inputs stable.
            # Note: speaker_wav can be a list of files for better cloning.
            self.tts.tts_to_file(
                text=text,
                speaker_wav=speaker_wav_paths,
                language=language,
                file_path=output_path,
                split_sentences=True
            )

            logger.info(f"Cloned speech generated: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Cloning failed: {e}")
            raise

    def get_supported_languages(self) -> List[str]:
        """Return the language codes supported by XTTS v2."""
        return ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"]
|
| 96 |
+
|
| 97 |
+
# Singleton
|
| 98 |
+
_clone_service = None
|
| 99 |
+
|
| 100 |
+
def get_clone_service():
    """Return the process-wide CloneService, creating it lazily."""
    global _clone_service
    if _clone_service is not None:
        return _clone_service
    _clone_service = CloneService()
    return _clone_service
|
backend/app/services/diarization_service.py
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Speaker Diarization Service - Clean Implementation
|
| 3 |
+
Uses faster-whisper + pyannote.audio directly (no whisperx)
|
| 4 |
+
|
| 5 |
+
This avoids the KeyError bugs in whisperx alignment while providing
|
| 6 |
+
the same functionality.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import gc
|
| 11 |
+
import logging
|
| 12 |
+
import torch
|
| 13 |
+
from typing import Optional, Dict, Any, List
|
| 14 |
+
from dotenv import load_dotenv
|
| 15 |
+
|
| 16 |
+
from app.core.config import get_settings
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
# Load environment variables from .env file
|
| 21 |
+
load_dotenv()
|
| 22 |
+
|
| 23 |
+
# Workaround for PyTorch 2.6+ weights_only security restriction
|
| 24 |
+
os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class DiarizationService:
|
| 28 |
+
"""
|
| 29 |
+
Speaker Diarization Service using faster-whisper + pyannote.audio.
|
| 30 |
+
|
| 31 |
+
This implementation avoids whisperx entirely to prevent alignment bugs.
|
| 32 |
+
|
| 33 |
+
Flow:
|
| 34 |
+
1. Transcribe with faster-whisper (word-level timestamps)
|
| 35 |
+
2. Diarize with pyannote.audio (speaker segments)
|
| 36 |
+
3. Merge speakers with transcript segments
|
| 37 |
+
|
| 38 |
+
Requires:
|
| 39 |
+
- faster-whisper (already installed)
|
| 40 |
+
- pyannote.audio
|
| 41 |
+
- Valid Hugging Face Token (HF_TOKEN) in .env
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
    def __init__(self):
        """Configure device, compute type, HF token, and FFmpeg."""
        self.settings = get_settings()

        # Auto-detect GPU (prefer CUDA for speed)
        if torch.cuda.is_available():
            self.device = "cuda"
            self.compute_type = "float16"
            logger.info(f"🚀 Diarization using GPU: {torch.cuda.get_device_name(0)}")
        else:
            self.device = "cpu"
            self.compute_type = "int8"
            logger.info("⚠️ Diarization using CPU (slower)")

        # Load HF token — required by pyannote's gated models.
        self.hf_token = os.getenv("HF_TOKEN")
        if not self.hf_token:
            logger.warning("⚠️ HF_TOKEN not found. Speaker diarization will fail.")

        # FFmpeg Setup for Windows
        self._setup_ffmpeg()
|
| 64 |
+
|
| 65 |
+
    def _setup_ffmpeg(self):
        """Auto-configure FFmpeg from imageio-ffmpeg if not in PATH.

        Copies the bundled ffmpeg binary into the current working
        directory and prepends that directory to PATH so subprocesses
        can find it. Best-effort: failures are logged, never raised.
        NOTE(review): the hard-coded "ffmpeg.exe" name assumes Windows —
        confirm behavior on other platforms.
        """
        try:
            import imageio_ffmpeg
            import shutil

            ffmpeg_src = imageio_ffmpeg.get_ffmpeg_exe()
            backend_dir = os.getcwd()
            ffmpeg_dest = os.path.join(backend_dir, "ffmpeg.exe")

            # Copy only once; later runs reuse the existing binary.
            if not os.path.exists(ffmpeg_dest):
                shutil.copy(ffmpeg_src, ffmpeg_dest)
                logger.info(f"🔧 Configured FFmpeg: {ffmpeg_dest}")

            if backend_dir not in os.environ.get("PATH", ""):
                os.environ["PATH"] = backend_dir + os.pathsep + os.environ.get("PATH", "")

        except Exception as e:
            logger.warning(f"⚠️ Could not auto-configure FFmpeg: {e}")
|
| 84 |
+
|
| 85 |
+
def check_requirements(self):
|
| 86 |
+
"""Validate requirements before processing"""
|
| 87 |
+
if not self.hf_token:
|
| 88 |
+
raise ValueError(
|
| 89 |
+
"HF_TOKEN is missing. Add HF_TOKEN=your_token to .env file. "
|
| 90 |
+
"Get one at: https://huggingface.co/settings/tokens"
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
    def _get_diarization_pipeline(self):
        """Load the pyannote speaker-diarization pipeline.

        PyTorch 2.6 flipped torch.load's default to weights_only=True,
        which breaks pyannote checkpoint loading; torch.load is therefore
        temporarily monkey-patched to force weights_only=False and is
        restored in the finally block regardless of outcome.
        """
        from pyannote.audio import Pipeline

        # Monkey-patch torch.load for PyTorch 2.6+ compatibility
        original_load = torch.load
        def safe_load(*args, **kwargs):
            # Drop any caller-supplied weights_only so we can force False
            # without passing the keyword twice.
            kwargs.pop('weights_only', None)
            return original_load(*args, **kwargs, weights_only=False)

        torch.load = safe_load
        try:
            pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=self.hf_token
            )
            if self.device == "cuda":
                pipeline.to(torch.device("cuda"))
            return pipeline
        finally:
            # Always restore the original loader, even on failure.
            torch.load = original_load
|
| 114 |
+
|
| 115 |
+
def _transcribe_with_timestamps(self, audio_path: str, language: Optional[str] = None) -> Dict:
|
| 116 |
+
"""Transcribe audio using faster-whisper with word timestamps"""
|
| 117 |
+
from faster_whisper import WhisperModel
|
| 118 |
+
|
| 119 |
+
# CTranslate2 (faster-whisper) doesn't support float16 on all GPUs
|
| 120 |
+
# Use int8 for whisper, but pyannote still benefits from CUDA
|
| 121 |
+
whisper_compute = "int8" if self.device == "cuda" else "int8"
|
| 122 |
+
model = WhisperModel(
|
| 123 |
+
"small",
|
| 124 |
+
device=self.device,
|
| 125 |
+
compute_type=whisper_compute
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
segments_raw, info = model.transcribe(
|
| 129 |
+
audio_path,
|
| 130 |
+
language=language,
|
| 131 |
+
word_timestamps=True,
|
| 132 |
+
vad_filter=True
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
segments = []
|
| 136 |
+
for segment in segments_raw:
|
| 137 |
+
segments.append({
|
| 138 |
+
"start": segment.start,
|
| 139 |
+
"end": segment.end,
|
| 140 |
+
"text": segment.text.strip(),
|
| 141 |
+
"words": [
|
| 142 |
+
{"start": w.start, "end": w.end, "word": w.word}
|
| 143 |
+
for w in (segment.words or [])
|
| 144 |
+
]
|
| 145 |
+
})
|
| 146 |
+
|
| 147 |
+
# Cleanup
|
| 148 |
+
del model
|
| 149 |
+
gc.collect()
|
| 150 |
+
|
| 151 |
+
return {
|
| 152 |
+
"segments": segments,
|
| 153 |
+
"language": info.language
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
def _preprocess_audio(self, audio_path: str) -> str:
|
| 157 |
+
"""
|
| 158 |
+
Apply noise reduction to audio file.
|
| 159 |
+
Returns path to cleaned audio file.
|
| 160 |
+
"""
|
| 161 |
+
try:
|
| 162 |
+
import noisereduce as nr
|
| 163 |
+
import librosa
|
| 164 |
+
import soundfile as sf
|
| 165 |
+
import tempfile
|
| 166 |
+
|
| 167 |
+
logger.info("🔧 Preprocessing audio (noise reduction)...")
|
| 168 |
+
|
| 169 |
+
# Load audio
|
| 170 |
+
audio, sr = librosa.load(audio_path, sr=16000, mono=True)
|
| 171 |
+
|
| 172 |
+
# Apply spectral gating noise reduction
|
| 173 |
+
reduced_noise = nr.reduce_noise(
|
| 174 |
+
y=audio,
|
| 175 |
+
sr=sr,
|
| 176 |
+
stationary=True,
|
| 177 |
+
prop_decrease=0.75
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
# Save to temp file
|
| 181 |
+
temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
| 182 |
+
sf.write(temp_file.name, reduced_noise, sr)
|
| 183 |
+
|
| 184 |
+
logger.info(f" → Noise reduction complete, saved to {temp_file.name}")
|
| 185 |
+
return temp_file.name
|
| 186 |
+
|
| 187 |
+
except ImportError as e:
|
| 188 |
+
logger.warning(f"⚠️ Audio preprocessing unavailable (install noisereduce, librosa, soundfile): {e}")
|
| 189 |
+
return audio_path
|
| 190 |
+
except Exception as e:
|
| 191 |
+
logger.warning(f"⚠️ Audio preprocessing failed: {e}")
|
| 192 |
+
return audio_path
|
| 193 |
+
|
| 194 |
+
def _merge_speakers(self, transcript: Dict, diarization) -> List[Dict]:
|
| 195 |
+
"""
|
| 196 |
+
Merge speaker labels from diarization with transcript segments.
|
| 197 |
+
|
| 198 |
+
Uses midpoint matching with nearest-speaker fallback to minimize UNKNOWN labels.
|
| 199 |
+
"""
|
| 200 |
+
segments = transcript["segments"]
|
| 201 |
+
result = []
|
| 202 |
+
|
| 203 |
+
# Build list of speaker turns for efficient lookup
|
| 204 |
+
speaker_turns = [
|
| 205 |
+
(turn.start, turn.end, spk)
|
| 206 |
+
for turn, _, spk in diarization.itertracks(yield_label=True)
|
| 207 |
+
]
|
| 208 |
+
|
| 209 |
+
for seg in segments:
|
| 210 |
+
mid_time = (seg["start"] + seg["end"]) / 2
|
| 211 |
+
speaker = None
|
| 212 |
+
|
| 213 |
+
# Step 1: Try exact midpoint match
|
| 214 |
+
for start, end, spk in speaker_turns:
|
| 215 |
+
if start <= mid_time <= end:
|
| 216 |
+
speaker = spk
|
| 217 |
+
break
|
| 218 |
+
|
| 219 |
+
# Step 2: If no match, find nearest speaker (fallback)
|
| 220 |
+
if speaker is None and speaker_turns:
|
| 221 |
+
min_distance = float('inf')
|
| 222 |
+
for start, end, spk in speaker_turns:
|
| 223 |
+
# Distance to nearest edge of speaker segment
|
| 224 |
+
if mid_time < start:
|
| 225 |
+
dist = start - mid_time
|
| 226 |
+
elif mid_time > end:
|
| 227 |
+
dist = mid_time - end
|
| 228 |
+
else:
|
| 229 |
+
dist = 0 # Should have been caught above
|
| 230 |
+
|
| 231 |
+
if dist < min_distance:
|
| 232 |
+
min_distance = dist
|
| 233 |
+
speaker = spk
|
| 234 |
+
|
| 235 |
+
# Final fallback (shouldn't happen)
|
| 236 |
+
if speaker is None:
|
| 237 |
+
speaker = "UNKNOWN"
|
| 238 |
+
|
| 239 |
+
result.append({
|
| 240 |
+
"start": seg["start"],
|
| 241 |
+
"end": seg["end"],
|
| 242 |
+
"text": seg["text"],
|
| 243 |
+
"speaker": speaker
|
| 244 |
+
})
|
| 245 |
+
|
| 246 |
+
return result
|
| 247 |
+
|
| 248 |
+
    def process_audio(
        self,
        audio_path: str,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
        language: Optional[str] = None,
        preprocess: bool = False,
    ) -> Dict[str, Any]:
        """
        Full diarization pipeline: [Preprocess] → Transcribe → Diarize → Merge

        Args:
            audio_path: Path to audio file
            num_speakers: Exact number of speakers (optional)
            min_speakers: Minimum speakers (optional)
            max_speakers: Maximum speakers (optional)
            language: Force language code (optional, auto-detected if None)
            preprocess: Apply noise reduction before processing (default: False)

        Returns:
            Dict with segments, speaker_stats, language, status

        Raises:
            ValueError: when HF_TOKEN is not configured (via check_requirements).
        """
        self.check_requirements()

        logger.info(f"🎤 Starting diarization on {self.device}...")

        # Optional preprocessing for noise reduction; falls back to the
        # original path on failure.
        # NOTE(review): when preprocessing succeeds it writes a temp file
        # that is never deleted here — consider cleaning it up.
        processed_path = audio_path
        if preprocess:
            processed_path = self._preprocess_audio(audio_path)

        try:
            # Step 1: Transcribe with faster-whisper
            logger.info("Step 1/3: Transcribing audio...")
            transcript = self._transcribe_with_timestamps(processed_path, language)
            detected_lang = transcript["language"]
            logger.info(f" → Language: {detected_lang}, Segments: {len(transcript['segments'])}")

            # Step 2: Diarize with pyannote
            logger.info("Step 2/3: Identifying speakers...")
            pipeline = self._get_diarization_pipeline()

            diarization = pipeline(
                processed_path,
                num_speakers=num_speakers,
                min_speakers=min_speakers,
                max_speakers=max_speakers
            )

            # Cleanup pipeline: drop the large model before merging.
            del pipeline
            gc.collect()

            # Step 3: Merge results
            logger.info("Step 3/3: Merging speakers with transcript...")
            segments = self._merge_speakers(transcript, diarization)

            # Calculate speaker stats: label -> total speaking time (seconds).
            speaker_stats = {}
            for seg in segments:
                spk = seg["speaker"]
                dur = seg["end"] - seg["start"]
                speaker_stats[spk] = speaker_stats.get(spk, 0) + dur

            logger.info(f"✅ Diarization complete: {len(segments)} segments, {len(speaker_stats)} speakers")

            return {
                "segments": segments,
                "speaker_stats": speaker_stats,
                "language": detected_lang,
                "status": "success"
            }

        except Exception as e:
            logger.exception("Diarization failed")
            raise e
        finally:
            # Always reclaim memory, including the GPU cache when applicable.
            gc.collect()
            if self.device == "cuda":
                torch.cuda.empty_cache()
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
# Singleton
_diarization_service = None

def get_diarization_service():
    """Return the process-wide DiarizationService, creating it lazily.

    Uses an explicit ``is None`` check (the original used truthiness,
    which would re-create the service on every call if the instance were
    ever falsy).
    """
    global _diarization_service
    if _diarization_service is None:
        _diarization_service = DiarizationService()
    return _diarization_service
|
backend/app/services/edge_tts_service.py
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Edge-TTS Text-to-Speech Service
|
| 3 |
+
Free, high-quality neural TTS using Microsoft Edge's speech synthesis
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import io
|
| 8 |
+
import logging
|
| 9 |
+
import edge_tts
|
| 10 |
+
from typing import Optional, List, Dict, Any
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Available voice samples by language, keyed by BCP-47 locale.
# Each spec tuple is (voice short name, gender, style tag); the public
# VOICE_CATALOG below expands them into the dict shape callers expect.
_VOICE_SPECS = {
    "en-US": [
        ("en-US-AriaNeural", "Female", "professional"),
        ("en-US-GuyNeural", "Male", "casual"),
        ("en-US-JennyNeural", "Female", "friendly"),
        ("en-US-ChristopherNeural", "Male", "newscast"),
    ],
    "en-GB": [
        ("en-GB-SoniaNeural", "Female", "professional"),
        ("en-GB-RyanNeural", "Male", "casual"),
    ],
    "en-IN": [
        ("en-IN-NeerjaNeural", "Female", "professional"),
        ("en-IN-PrabhatNeural", "Male", "casual"),
    ],
    "hi-IN": [
        ("hi-IN-SwaraNeural", "Female", "professional"),
        ("hi-IN-MadhurNeural", "Male", "casual"),
    ],
    "es-ES": [
        ("es-ES-ElviraNeural", "Female", "professional"),
        ("es-ES-AlvaroNeural", "Male", "casual"),
    ],
    "es-MX": [
        ("es-MX-DaliaNeural", "Female", "professional"),
        ("es-MX-JorgeNeural", "Male", "casual"),
    ],
    "fr-FR": [
        ("fr-FR-DeniseNeural", "Female", "professional"),
        ("fr-FR-HenriNeural", "Male", "casual"),
    ],
    "de-DE": [
        ("de-DE-KatjaNeural", "Female", "professional"),
        ("de-DE-ConradNeural", "Male", "casual"),
    ],
    "ja-JP": [
        ("ja-JP-NanamiNeural", "Female", "professional"),
        ("ja-JP-KeitaNeural", "Male", "casual"),
    ],
    "ko-KR": [
        ("ko-KR-SunHiNeural", "Female", "professional"),
        ("ko-KR-InJoonNeural", "Male", "casual"),
    ],
    "zh-CN": [
        ("zh-CN-XiaoxiaoNeural", "Female", "professional"),
        ("zh-CN-YunxiNeural", "Male", "casual"),
    ],
}

VOICE_CATALOG = {
    locale: [{"name": n, "gender": g, "style": s} for n, g, s in specs]
    for locale, specs in _VOICE_SPECS.items()
}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class EdgeTTSService:
    """
    Text-to-Speech service using Microsoft Edge TTS (free, neural voices)
    """

    def __init__(self):
        """Initialize the Edge TTS service"""
        self._all_voices = None

    # Class-level cache shared by all instances (the voice list rarely changes).
    _voices_cache = None

    async def get_voices(self, language: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Get available voices.

        Args:
            language: Optional locale prefix to filter by (e.g. "en", "en-US").

        Returns:
            List of dicts with name/display_name/language_code/gender/voice_type.
        """
        # Populate the shared cache on first use; fall back to the static
        # VOICE_CATALOG when the live listing cannot be fetched.
        if EdgeTTSService._voices_cache is None:
            try:
                voices = await edge_tts.list_voices()

                # Transform to our format
                formatted_voices = []
                for v in voices:
                    formatted_voices.append({
                        "name": v["ShortName"],
                        "display_name": v["ShortName"].replace("-", " ").split("Neural")[0].strip(),
                        "language_code": v["Locale"],
                        "gender": v["Gender"],
                        "voice_type": "Neural",
                    })

                EdgeTTSService._voices_cache = formatted_voices
            except Exception as e:
                logger.error(f"Failed to fetch voices from Edge TTS: {e}. Falling back to catalog.")
                # Fallback to catalog
                voices = []
                for lang, lang_voices in VOICE_CATALOG.items():
                    for v in lang_voices:
                        voices.append({
                            "name": v["name"],
                            "display_name": v["name"].replace("-", " ").replace("Neural", "").strip(),
                            "language_code": lang,
                            "gender": v["gender"],
                            "voice_type": "Neural",
                        })
                EdgeTTSService._voices_cache = voices

        voices = EdgeTTSService._voices_cache

        # Filter by language if specified
        if language:
            voices = [v for v in voices if v["language_code"].startswith(language)]

        return voices

    def get_voices_sync(self, language: Optional[str] = None) -> List[Dict[str, Any]]:
        """Synchronous wrapper for get_voices"""
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop running in this thread: safe to drive one ourselves.
            return asyncio.run(self.get_voices(language))

        # A loop IS running in this thread. The original called
        # run_coroutine_threadsafe(...).result() on that same loop, which
        # deadlocks: the loop cannot make progress while this call blocks it.
        # Run the coroutine on a private loop in a worker thread instead.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, self.get_voices(language)).result()

    def build_ssml(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        rate: str = "medium",
        pitch: str = "medium",
        emphasis: str = None,
        breaks: bool = True
    ) -> str:
        """
        Build SSML markup for advanced prosody control.

        Args:
            text: Plain text to convert
            voice: Voice name
            rate: Speed - 'x-slow', 'slow', 'medium', 'fast', 'x-fast' or percentage
            pitch: Pitch - 'x-low', 'low', 'medium', 'high', 'x-high' or Hz offset
            emphasis: Optional emphasis level - 'reduced', 'moderate', 'strong'
            breaks: Auto-insert breaks at punctuation

        Returns:
            SSML-formatted string
        """
        # Rate/pitch are passed straight through: SSML prosody accepts both
        # named levels and numeric offsets. (The original "normalization"
        # ternaries returned their input in both branches — no-ops, removed.)

        # Build SSML
        ssml_parts = ['<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">']
        ssml_parts.append(f'<voice name="{voice}">')
        ssml_parts.append(f'<prosody rate="{rate}" pitch="{pitch}">')

        if emphasis:
            ssml_parts.append(f'<emphasis level="{emphasis}">')

        # Auto-insert breaks for natural speech
        if breaks:
            import re
            # Add short breaks after commas, longer after periods
            processed_text = re.sub(r'([,;:])\s*', r'\1<break time="200ms"/>', text)
            processed_text = re.sub(r'([.!?])\s+', r'\1<break time="500ms"/>', processed_text)
            ssml_parts.append(processed_text)
        else:
            ssml_parts.append(text)

        if emphasis:
            ssml_parts.append('</emphasis>')

        ssml_parts.append('</prosody>')
        ssml_parts.append('</voice>')
        ssml_parts.append('</speak>')

        return ''.join(ssml_parts)

    async def synthesize_ssml(
        self,
        ssml_text: str,
        voice: str = "en-US-AriaNeural",
    ) -> bytes:
        """
        Synthesize speech from SSML markup.

        Args:
            ssml_text: SSML-formatted text
            voice: Voice name (for edge-tts communication)

        Returns:
            Audio bytes (MP3)
        """
        logger.info(f"Synthesizing SSML with voice: {voice}")

        # Edge TTS handles SSML natively
        communicate = edge_tts.Communicate(ssml_text, voice)

        audio_buffer = io.BytesIO()
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_buffer.write(chunk["data"])

        audio_buffer.seek(0)
        return audio_buffer.read()

    async def synthesize_stream(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        rate: str = "+0%",
        pitch: str = "+0Hz",
    ):
        """
        Stream speech synthesis chunks.

        Optimized to stream sentence-by-sentence to reduce TTFB (Time To
        First Byte), avoiding full-text buffering issues.
        """
        import re

        # Split text into sentences to force incremental processing.
        # The regex matches runs ending with . ! ? (or end of string),
        # keeping the punctuation attached.
        sentences = re.findall(r'[^.!?]+(?:[.!?]+|$)', text)
        if not sentences:
            sentences = [text]

        logger.info(f"Streaming {len(sentences)} sentences for low latency...")

        for sentence in sentences:
            if not sentence.strip():
                continue

            communicate = edge_tts.Communicate(sentence, voice, rate=rate, pitch=pitch)

            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    yield chunk["data"]

    async def synthesize(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        rate: str = "+0%",
        pitch: str = "+0Hz",
    ) -> bytes:
        """
        Synthesize speech from text

        Args:
            text: Text to synthesize
            voice: Voice name (e.g., 'en-US-AriaNeural')
            rate: Speaking rate adjustment (e.g., '+20%', '-10%')
            pitch: Pitch adjustment (e.g., '+5Hz', '-10Hz')

        Returns:
            Audio content as bytes (MP3 format)
        """
        # Reuse stream method to avoid duplication
        audio_buffer = io.BytesIO()
        async for chunk in self.synthesize_stream(text, voice, rate, pitch):
            audio_buffer.write(chunk)

        audio_buffer.seek(0)
        return audio_buffer.read()

    def synthesize_sync(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        rate: str = "+0%",
        pitch: str = "+0Hz",
    ) -> bytes:
        """Synchronous wrapper for synthesize"""
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running loop: asyncio.run creates and tears one down cleanly
            # (the original's get_event_loop() pattern is deprecated and
            # would raise/fail inside a running loop anyway).
            return asyncio.run(self.synthesize(text, voice, rate, pitch))

        # Called from within a running loop: offload to a worker thread.
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, self.synthesize(text, voice, rate, pitch)).result()

    async def synthesize_to_response(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        speaking_rate: float = 1.0,
        pitch: float = 0.0,
    ) -> Dict[str, Any]:
        """
        Synthesize speech and return API-compatible response

        Args:
            text: Text to synthesize
            voice: Voice name
            speaking_rate: Rate multiplier (1.0 = normal, 1.5 = 50% faster)
            pitch: Pitch adjustment in semitones (-20 to +20)

        Returns:
            Dictionary with audio content and metadata
        """
        import base64
        import time

        start_time = time.time()

        # Convert rate/pitch to Edge TTS format (signed percent / Hz strings).
        rate_percent = int((speaking_rate - 1.0) * 100)
        rate_str = f"+{rate_percent}%" if rate_percent >= 0 else f"{rate_percent}%"
        pitch_str = f"+{int(pitch)}Hz" if pitch >= 0 else f"{int(pitch)}Hz"

        # Synthesize
        audio_bytes = await self.synthesize(text, voice, rate_str, pitch_str)

        processing_time = time.time() - start_time

        # Estimate duration (~150 chars per second at normal speed)
        estimated_duration = len(text) / 150 / speaking_rate

        return {
            "audio_content": base64.b64encode(audio_bytes).decode("utf-8"),
            "encoding": "MP3",
            "audio_size": len(audio_bytes),
            "duration_estimate": estimated_duration,
            "voice_used": voice,
            "processing_time": processing_time,
            "cached": False,
        }
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
# Singleton instance
_edge_tts_service: Optional[EdgeTTSService] = None


def get_edge_tts_service() -> EdgeTTSService:
    """Return the shared EdgeTTSService, instantiating it lazily on first use."""
    global _edge_tts_service
    if _edge_tts_service is not None:
        return _edge_tts_service
    _edge_tts_service = EdgeTTSService()
    return _edge_tts_service
|
backend/app/services/emotion_service.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Emotion Analysis Service
|
| 3 |
+
Detects emotion from audio using Wav2Vec2 and text using NLP
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
import os
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
from typing import Dict, List, Any, Optional
|
| 12 |
+
|
| 13 |
+
from app.core.config import get_settings
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class EmotionService:
    """
    Service for Speech Emotion Recognition (SER).
    Uses 'ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition'
    """

    def __init__(self):
        # Model/processor are lazy-loaded on first use (see _load_model)
        # to keep import-time RAM usage low.
        self.model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
        self._model = None
        self._processor = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Supported emotions, in the model's output-logit order.
        self.emotions = [
            "angry", "calm", "disgust", "fearful",
            "happy", "neutral", "sad", "surprised"
        ]

    def _load_model(self):
        """Lazy load model to save RAM"""
        if self._model is None:
            try:
                from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

                logger.info(f"🎭 Loading Emotion Model ({self.device})...")
                self._processor = Wav2Vec2Processor.from_pretrained(self.model_name)
                self._model = Wav2Vec2ForSequenceClassification.from_pretrained(self.model_name)
                self._model.to(self.device)
                logger.info("✅ Emotion Model loaded")
            except Exception as e:
                logger.error(f"Failed to load emotion model: {e}")
                raise

    def _scores_from_logits(self, logits) -> Dict[str, float]:
        """Softmax the model logits and map them onto the emotion labels.

        Factored out of analyze_audio / analyze_audio_segment, which
        previously duplicated this mapping.
        """
        probs = F.softmax(logits, dim=-1)[0].cpu().numpy()
        return {label: float(p) for label, p in zip(self.emotions, probs)}

    def analyze_audio(self, audio_path: str) -> Dict[str, Any]:
        """
        Analyze emotion of an entire audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            Dict with dominant emotion and probability distribution

        Raises:
            Exception: re-raised from loading or inference failures.
        """
        import librosa

        self._load_model()

        try:
            # Load audio using librosa (16kHz required for Wav2Vec2).
            # Only the first 60 seconds are analyzed to bound memory usage
            # (the original comment said 30s but the code loads 60s);
            # full-file support would require chunking.
            y, sr = librosa.load(audio_path, sr=16000, duration=60)

            inputs = self._processor(y, sampling_rate=16000, return_tensors="pt", padding=True)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = self._model(**inputs).logits

            scores = self._scores_from_logits(logits)
            dominant = max(scores, key=scores.get)

            return {
                "dominant_emotion": dominant,
                "confidence": scores[dominant],
                "distribution": scores
            }

        except Exception as e:
            logger.error(f"Audio emotion analysis failed: {e}")
            # Bare raise preserves the original traceback (the original's
            # `raise e` re-anchored it here).
            raise

    def analyze_audio_segment(self, audio_data: np.ndarray, sr: int = 16000) -> Dict[str, Any]:
        """
        Analyze a raw numpy audio segment.

        Returns {"emotion": label, "score": prob}; falls back to a neutral
        zero-confidence result on failure instead of raising.
        """
        self._load_model()

        try:
            inputs = self._processor(audio_data, sampling_rate=sr, return_tensors="pt", padding=True)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = self._model(**inputs).logits

            scores = self._scores_from_logits(logits)
            dominant = max(scores, key=scores.get)

            return {
                "emotion": dominant,
                "score": scores[dominant]
            }
        except Exception as e:
            logger.error(f"Segment analysis failed: {e}")
            return {"emotion": "neutral", "score": 0.0}
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# Singleton
_emotion_service = None

def get_emotion_service() -> EmotionService:
    """Return the shared EmotionService, creating it on first request."""
    global _emotion_service
    if _emotion_service is not None:
        return _emotion_service
    _emotion_service = EmotionService()
    return _emotion_service
|
backend/app/services/export_service.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Export Service
|
| 3 |
+
Helper functions for generating transcript exports (PDF, SRT, VTT, TXT)
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from fpdf import FPDF
|
| 7 |
+
from typing import List, Dict, Any
|
| 8 |
+
import io
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class ExportService:
    """Generates transcript exports in TXT, SRT, VTT, and PDF formats.

    All methods are static and operate on plain transcript dicts with
    optional keys: "text", "segments", "words", "id", "created_at",
    "duration", "sentiment".
    """

    @staticmethod
    def to_txt(transcript: Dict[str, Any]) -> str:
        """Export as plain text with a small metadata header."""
        text = transcript.get("text", "")

        output = []
        output.append(f"Transcript ID: {transcript.get('id', 'N/A')}")
        output.append(f"Date: {transcript.get('created_at', 'Unknown')}")
        output.append("-" * 40)
        output.append(text)

        return "\n".join(output)

    @staticmethod
    def to_srt(transcript: Dict[str, Any]) -> str:
        """Export as SRT (SubRip Subtitle).

        Returns an empty string when no timed segments are available,
        since SRT cannot be produced without timing information.
        """
        segments = transcript.get("segments") or []
        if not segments:
            # Fallback to word timestamps if segments missing
            words = transcript.get("words", [])
            if words:
                pass  # TODO: Construct segments from words
            return ""  # Cannot generate SRT without timing

        srt_lines = []
        for i, segment in enumerate(segments, 1):
            start = ExportService._format_timestamp(segment.get("start_time", 0))
            end = ExportService._format_timestamp(segment.get("end_time", 0))
            text = segment.get("text", "").strip()

            srt_lines.append(str(i))
            srt_lines.append(f"{start} --> {end}")
            srt_lines.append(text)
            srt_lines.append("")

        return "\n".join(srt_lines)

    @staticmethod
    def to_vtt(transcript: Dict[str, Any]) -> str:
        """Export as WebVTT.

        Derived from the SRT output, but the comma->dot millisecond
        separator swap is applied ONLY to timing-cue lines. A blanket
        replace would also corrupt commas inside the subtitle text.
        """
        srt = ExportService.to_srt(transcript)
        vtt_lines = []
        for line in srt.split("\n"):
            if " --> " in line:
                # Timing cue: WebVTT requires '.' before milliseconds
                line = line.replace(",", ".")
            vtt_lines.append(line)
        return "WEBVTT\n\n" + "\n".join(vtt_lines)

    @staticmethod
    def to_pdf(transcript: Dict[str, Any]) -> bytes:
        """Export as PDF using fpdf2; returns the rendered document bytes."""
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("helvetica", size=12)

        # Header
        pdf.set_font("helvetica", "B", 16)
        pdf.cell(0, 10, "Transcript Report", new_x="LMARGIN", new_y="NEXT", align='C')
        pdf.ln(10)

        # Metadata
        pdf.set_font("helvetica", "B", 10)
        pdf.cell(40, 10, f"Date: {transcript.get('created_at', 'Unknown')}")
        pdf.ln(5)
        pdf.cell(40, 10, f"Duration: {transcript.get('duration', 0)}s")
        pdf.ln(10)

        # Content
        pdf.set_font("helvetica", size=11)
        text = transcript.get("text", "")
        # fpdf2 handles utf-8 much better now
        pdf.multi_cell(0, 8, text)

        # NLP Analysis if available
        sentiment = transcript.get("sentiment")
        if sentiment:
            pdf.ln(10)
            pdf.set_font("helvetica", "B", 12)
            pdf.cell(0, 10, "Analysis", new_x="LMARGIN", new_y="NEXT")
            pdf.set_font("helvetica", size=10)
            pdf.cell(0, 8, f"Sentiment: Polarity {sentiment.get('polarity')}, Subjectivity {sentiment.get('subjectivity')}", new_x="LMARGIN", new_y="NEXT")

        return bytes(pdf.output())

    @staticmethod
    def _format_timestamp(seconds: float) -> str:
        """Format seconds as an SRT timestamp: HH:MM:SS,mmm."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        millis = int((seconds % 1) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
|
backend/app/services/file_service.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File Service
|
| 3 |
+
Audio file management and processing
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import uuid
|
| 8 |
+
import shutil
|
| 9 |
+
import logging
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Optional, Tuple, Dict, Any
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
|
| 14 |
+
from ..core.config import get_settings
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
settings = get_settings()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class FileService:
    """
    Service for managing audio file uploads and storage.

    Files are written under ``settings.upload_dir``, grouped by owner
    ("user_<id>" or "anonymous") and by upload date (YYYY/MM/DD).
    """

    def __init__(self):
        """Create the service and make sure the upload root exists."""
        self.upload_dir = Path(settings.upload_dir)
        self.upload_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"File service initialized with upload dir: {self.upload_dir}")

    def save_upload(
        self,
        file_content: bytes,
        original_filename: str,
        user_id: Optional[int] = None,
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Persist an uploaded audio file.

        Args:
            file_content: Raw file bytes.
            original_filename: Filename as supplied by the client.
            user_id: Optional user ID used to pick the storage subfolder.

        Returns:
            Tuple of (storage_path, file_metadata).

        Raises:
            ValueError: On unsupported extension or oversized payload.
        """
        suffix = Path(original_filename).suffix.lower()
        if suffix.lstrip('.') not in settings.supported_audio_formats_list:
            raise ValueError(f"Unsupported audio format: {suffix}")

        size_bytes = len(file_content)
        limit_bytes = settings.max_upload_size_mb * 1024 * 1024
        if size_bytes > limit_bytes:
            raise ValueError(f"File too large: {size_bytes / 1024 / 1024:.1f}MB (max {settings.max_upload_size_mb}MB)")

        # Storage layout: <root>/<owner>/<YYYY/MM/DD>/<uuid><ext>
        owner_folder = f"user_{user_id}" if user_id else "anonymous"
        date_folder = datetime.now().strftime("%Y/%m/%d")
        target_dir = self.upload_dir / owner_folder / date_folder
        target_dir.mkdir(parents=True, exist_ok=True)

        storage_path = target_dir / f"{uuid.uuid4()}{suffix}"
        storage_path.write_bytes(file_content)

        logger.info(f"Saved upload: {original_filename} -> {storage_path}")

        metadata = self._get_file_metadata(storage_path)
        metadata["original_filename"] = original_filename
        metadata["file_size"] = size_bytes

        return str(storage_path), metadata

    def get_file(self, storage_path: str) -> Optional[bytes]:
        """
        Read a stored file's content.

        Args:
            storage_path: Path to the stored file.

        Returns:
            File bytes, or None when the file does not exist.
        """
        path = Path(storage_path)
        if not path.exists():
            logger.warning(f"File not found: {storage_path}")
            return None
        return path.read_bytes()

    def delete_file(self, storage_path: str) -> bool:
        """
        Delete a stored file.

        Args:
            storage_path: Path to the stored file.

        Returns:
            True if deleted, False if missing or removal failed.
        """
        path = Path(storage_path)
        if not path.exists():
            return False

        try:
            path.unlink()
            logger.info(f"Deleted file: {storage_path}")
            return True
        except Exception as exc:
            logger.error(f"Failed to delete file: {exc}")
            return False

    def _get_file_metadata(self, file_path: Path) -> Dict[str, Any]:
        """
        Collect metadata for an audio file.

        Uses ffprobe when it is installed; otherwise only the format
        and storage path are reported.
        """
        metadata: Dict[str, Any] = {
            "format": file_path.suffix.lower().lstrip('.'),
            "storage_path": str(file_path),
        }

        try:
            import subprocess
            import json

            probe = subprocess.run(
                [
                    "ffprobe",
                    "-v", "quiet",
                    "-print_format", "json",
                    "-show_format",
                    "-show_streams",
                    str(file_path)
                ],
                capture_output=True,
                text=True,
                timeout=10,
            )

            if probe.returncode == 0:
                info = json.loads(probe.stdout)

                # Container-level info (duration, bitrate)
                if "format" in info:
                    container = info["format"]
                    metadata["duration"] = float(container.get("duration", 0))
                    metadata["bit_rate"] = int(container.get("bit_rate", 0))

                # First audio stream wins
                for stream in info.get("streams", []):
                    if stream.get("codec_type") == "audio":
                        metadata["sample_rate"] = int(stream.get("sample_rate", 0))
                        metadata["channels"] = int(stream.get("channels", 0))
                        metadata["codec"] = stream.get("codec_name", "")
                        break

                logger.debug(f"Extracted metadata via ffprobe: {metadata}")
        except FileNotFoundError:
            logger.debug("ffprobe not available, using basic metadata")
        except Exception as e:
            logger.warning(f"Failed to extract metadata: {e}")

        return metadata

    def cleanup_temp_files(self, max_age_hours: int = 24) -> int:
        """
        Remove anonymous uploads older than ``max_age_hours``.

        Args:
            max_age_hours: Delete files whose mtime is older than this.

        Returns:
            Number of files deleted.
        """
        anonymous_dir = self.upload_dir / "anonymous"
        if not anonymous_dir.exists():
            return 0

        cutoff = datetime.now().timestamp() - (max_age_hours * 3600)
        removed = 0

        for candidate in anonymous_dir.rglob("*"):
            if not candidate.is_file() or candidate.stat().st_mtime >= cutoff:
                continue
            try:
                candidate.unlink()
                removed += 1
            except Exception as e:
                logger.error(f"Failed to delete {candidate}: {e}")

        if removed:
            logger.info(f"Cleaned up {removed} old temporary files")

        return removed
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
# Singleton instance
_file_service: Optional[FileService] = None


def get_file_service() -> FileService:
    """Return the process-wide FileService, constructing it on first call."""
    global _file_service
    if _file_service is not None:
        return _file_service
    _file_service = FileService()
    return _file_service
|
backend/app/services/meeting_service.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Meeting Minutes Service
|
| 3 |
+
Orchestrates Speaker Diarization, STT, and NLP to generate meeting reports
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
import os
|
| 8 |
+
import shutil
|
| 9 |
+
from typing import Dict, Any, List, Optional
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
|
| 12 |
+
from app.services.diarization_service import get_diarization_service
|
| 13 |
+
from app.services.nlp_service import get_nlp_service
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class MeetingService:
    """
    Orchestrates the creation of intelligent meeting minutes.

    Pipeline: speaker diarization + STT, then NLP analysis (summary,
    action items, topics, sentiment), then report assembly.
    """

    def __init__(self):
        # Project-level singletons; constructed lazily by their getters.
        self.diarization_service = get_diarization_service()
        self.nlp_service = get_nlp_service()

    def process_meeting(
        self,
        audio_path: str,
        num_speakers: Optional[int] = None,
        language: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Process a meeting recording to generate full minutes.

        Pipeline:
        1. Diarization + STT (who said what)
        2. NLP analysis (summary, action items, topics, sentiment)
        3. Report data assembly

        Args:
            audio_path: Path to audio file
            num_speakers: Optional hint for diarization
            language: Optional language code

        Returns:
            Dict containing full meeting data.

        Raises:
            Exception: Re-raises any failure from the underlying services
                after logging it with its traceback.
        """
        try:
            logger.info(f"📅 Starting meeting processing for {os.path.basename(audio_path)}")

            # Step 1: Diarization & Transcription.
            # This is the heavy lifting - getting segments with speakers.
            diarization_result = self.diarization_service.process_audio(
                audio_path,
                num_speakers=num_speakers,
                language=language,
                preprocess=True  # Always preprocess meetings for better quality
            )

            segments = diarization_result["segments"]
            full_text = " ".join(seg["text"] for seg in segments)
            speaker_stats = diarization_result["speaker_stats"]
            detected_language = diarization_result["language"]

            # Step 2: NLP Analysis
            logger.info("🧠 Running NLP analysis on meeting transcript...")
            summary = self.nlp_service.generate_summary(full_text, sentence_count=5)
            action_items = self.nlp_service.extract_action_items(full_text)
            keywords = self.nlp_service.extract_keywords(full_text, max_keywords=15)
            sentiment = self.nlp_service.analyze_sentiment(full_text)

            # Step 3: Organize Output
            attendees = list(speaker_stats.keys())

            result = {
                "metadata": {
                    "filename": os.path.basename(audio_path),
                    "processed_at": datetime.now().isoformat(),
                    "language": detected_language,
                    # NOTE(review): this sums per-speaker talk time, which
                    # undercounts silences/overlap vs. the true audio
                    # duration — confirm this is the intended semantics.
                    "duration_seconds": sum(speaker_stats.values()),
                    "attendee_count": len(attendees),
                    "attendees": attendees,
                },
                "summary": summary,
                "action_items": action_items,
                "topics": keywords,
                "sentiment": sentiment,
                "speaker_stats": speaker_stats,
                "transcript_segments": segments,
                "raw_text": full_text,
            }

            logger.info("✅ Meeting processing complete!")
            return result

        except Exception:
            # logger.exception records the traceback; bare `raise`
            # re-raises without adding a spurious frame (unlike `raise e`).
            logger.exception("Meeting processing failed")
            raise
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
# Singleton instance
_meeting_service = None


def get_meeting_service() -> MeetingService:
    """Return the shared MeetingService, creating it lazily."""
    global _meeting_service
    if _meeting_service is not None:
        return _meeting_service
    _meeting_service = MeetingService()
    return _meeting_service
|
backend/app/services/nlp_service.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
NLP Service
|
| 3 |
+
Handles text analysis, sentiment, keywords, and summarization
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
from typing import List, Dict, Any, Optional
|
| 8 |
+
import nltk
|
| 9 |
+
from textblob import TextBlob
|
| 10 |
+
from sumy.parsers.plaintext import PlaintextParser
|
| 11 |
+
from sumy.nlp.tokenizers import Tokenizer
|
| 12 |
+
from sumy.summarizers.lsa import LsaSummarizer
|
| 13 |
+
from sumy.nlp.stemmers import Stemmer
|
| 14 |
+
from sumy.utils import get_stop_words
|
| 15 |
+
from collections import Counter
|
| 16 |
+
import re
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class NLPService:
    """
    Service for Natural Language Processing tasks.

    Uses local libraries (TextBlob, Sumy, NLTK) to avoid API costs.
    """

    def __init__(self):
        self._ensure_nltk_resources()

    def _ensure_nltk_resources(self):
        """Download necessary NLTK data if missing.

        Each resource is looked up under its actual NLTK data category
        (tokenizers/taggers/corpora) instead of probing every category
        with nested try/except blocks.
        """
        required = {
            "punkt": "tokenizers/punkt",
            "punkt_tab": "tokenizers/punkt_tab",  # needed by sumy's tokenizer
            "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
            "brown": "corpora/brown",
        }
        for name, lookup_path in required.items():
            try:
                nltk.data.find(lookup_path)
            except LookupError:
                logger.info(f"Downloading NLTK resource: {name}")
                nltk.download(name, quiet=True)

    def analyze_sentiment(self, text: str) -> Dict[str, float]:
        """
        Analyze sentiment of text.

        Returns: {polarity: -1.0 to 1.0, subjectivity: 0.0 to 1.0}
        """
        if not text:
            return {"polarity": 0.0, "subjectivity": 0.0}

        blob = TextBlob(text)
        return {
            "polarity": round(blob.sentiment.polarity, 2),
            "subjectivity": round(blob.sentiment.subjectivity, 2)
        }

    def extract_keywords(self, text: str, max_keywords: int = 10) -> List[Dict[str, Any]]:
        """
        Extract keywords/keyphrases from text.

        Returns list of {"text": str, "count": int}, most frequent first.
        """
        if not text:
            return []

        blob = TextBlob(text)

        # Prefer noun phrases; they make better topic labels than words.
        noun_phrases = blob.noun_phrases

        if noun_phrases:
            counts = Counter(noun_phrases)
            return [{"text": phrase, "count": count} for phrase, count in counts.most_common(max_keywords)]

        # Fallback to simple word frequency if no noun phrases
        stop_words = set(["the", "a", "an", "in", "on", "at", "to", "for", "of", "and", "or", "is", "are", "was", "were", "it", "that", "this"])
        words = [w.lower() for w in re.findall(r'\w+', text) if len(w) > 3 and w.lower() not in stop_words]
        counts = Counter(words)
        return [{"text": word, "count": count} for word, count in counts.most_common(max_keywords)]

    def extract_action_items(self, text: str) -> List[str]:
        """
        Extract potential action items using regex patterns.

        Looks for phrases like "I will", "we need to", "todo", etc.
        Output order follows first occurrence in the text.
        """
        if not text:
            return []

        action_patterns = [
            r"(?i)(?:I|we|you|he|she|they) (?:will|shall|must|should|need to|have to|going to) (.*?)[\.,]",
            r"(?i)(?:let's|lets) (.*?)[\.,]",
            r"(?i)(?:action item|todo|to-do)[:\s](.*?)[\.,]",
            r"(?i)(?:please|plz) (.*?)[\.,]",
            r"(?i)(?:make sure|ensure) (?:to|that)? (.*?)[\.,]",
            r"(?i)(?:don't forget|remember) to (.*?)[\.,]",
        ]

        action_items = []

        # Split into sentences first for better context
        sentences = nltk.sent_tokenize(text)

        for sentence in sentences:
            for pattern in action_patterns:
                matches = re.findall(pattern, sentence)
                for match in matches:
                    # Clean up the match
                    item = match.strip()
                    if len(item) > 5:  # Filter out short noise
                        # Try to capture full sentence context if match is short
                        if len(item.split()) < 3:
                            action_items.append(sentence.strip())
                        else:
                            # Reconstruct "I will [match]" context if reasonable.
                            # NOTE: this compares against the first pattern's
                            # literal prefix, so keep the list order stable.
                            if pattern.startswith(r"(?i)(?:I|we"):
                                # Find usage of the trigger word
                                trigger = re.search(r"(will|shall|must|should|need to|have to|going to)", sentence, re.IGNORECASE)
                                if trigger:
                                    start = trigger.start()
                                    action_items.append(sentence[start:].strip())
                                else:
                                    action_items.append(item)
                            else:
                                action_items.append(item)
                    break  # One action item per sentence is usually enough

        # De-dup while preserving first-seen order; list(set(...)) would
        # make the output order nondeterministic across runs.
        return list(dict.fromkeys(action_items))

    def generate_summary(self, text: str, sentence_count: int = 3) -> str:
        """
        Generate extractive summary using LSA (Latent Semantic Analysis).

        Falls back to the first ``sentence_count`` sentences when the
        summarizer fails.
        """
        if not text:
            return ""

        try:
            language = "english"  # Default to english for now
            parser = PlaintextParser.from_string(text, Tokenizer(language))
            stemmer = Stemmer(language)
            summarizer = LsaSummarizer(stemmer)
            summarizer.stop_words = get_stop_words(language)

            summary_sentences = summarizer(parser.document, sentence_count)
            return " ".join([str(s) for s in summary_sentences])
        except Exception as e:
            logger.warning(f"Summarization failed: {e}")
            # Fallback: simple first N sentences
            sentences = text.split('.')
            return ".".join(sentences[:sentence_count]) + "."

    def process_transcript(self, text: str) -> Dict[str, Any]:
        """
        Run the full NLP pipeline on transcript text.

        Returns a dict with sentiment, keywords, summary, and action items.
        """
        return {
            "sentiment": self.analyze_sentiment(text),
            "keywords": self.extract_keywords(text),
            "summary": self.generate_summary(text),
            "action_items": self.extract_action_items(text),
        }
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
# Singleton instance
_nlp_service = None


def get_nlp_service() -> NLPService:
    """Return the shared NLPService, creating it lazily."""
    global _nlp_service
    if _nlp_service is not None:
        return _nlp_service
    _nlp_service = NLPService()
    return _nlp_service
|
backend/app/services/sign_avatar_service.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Sign Language Avatar Service
|
| 3 |
+
Converts text input into a sequence of sign language images/animations.
|
| 4 |
+
Current implementation: ASL Finger Spelling using static images.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import logging
|
| 9 |
+
from typing import List, Dict, Optional
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
class SignAvatarService:
    """
    Generates sign language visualizations from text.

    Current approach: ASL fingerspelling — each letter becomes a static
    image frame and each space becomes a short pause.
    """

    # Placeholder URLs for ASL hand signs (Public CDN or local assets).
    # Maps A-Z to numbered images on a public sign dictionary site.
    # NOTE(review): this map is not consumed by text_to_glosses yet.
    ASL_IMAGE_MAP = {
        letter: f"https://www.signingsavvy.com/images/asl/start/Sgn{i}.jpg"
        for i, letter in enumerate(list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"), start=1)
    }

    # For this portfolio prototype the frontend interprets the frame
    # sequence below and fetches images from a public GitHub asset set.

    def __init__(self):
        pass

    def text_to_glosses(self, text: str) -> List[Dict]:
        """
        Convert text into a fingerspelling frame sequence.

        Args:
            text: Input text (e.g. "Hello World")

        Returns:
            List of frames. Letters produce
            {"type": "letter", "value": "H", "image_url": "...", "duration": 1.0};
            spaces produce {"type": "space", "value": " ", "duration": 0.5}.
            Characters that are neither letters nor spaces are dropped.
        """
        normalized = text.upper().strip()
        frames: List[Dict] = []

        for ch in normalized:
            if ch == " ":
                frames.append({
                    "type": "space",
                    "value": " ",
                    "duration": 0.5
                })
            elif ch.isalpha():
                frames.append({
                    "type": "letter",
                    "value": ch,
                    "image_url": f"https://raw.githubusercontent.com/redcode-br/ASL-Finger-Spelling/master/assets/{ch}.png",
                    "duration": 1.0  # seconds to display
                })

        return frames
|
| 74 |
+
|
| 75 |
+
# Singleton
_avatar_service = None


def get_avatar_service():
    """Return the shared SignAvatarService, creating it lazily."""
    global _avatar_service
    if _avatar_service is not None:
        return _avatar_service
    _avatar_service = SignAvatarService()
    return _avatar_service
|