creator-o1 committed on
Commit
d00203b
·
0 Parent(s):

Initial commit: Complete VoiceForge Enterprise Speech AI Platform

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .env.example +28 -0
  2. .github/workflows/backend-ci.yml +62 -0
  3. .github/workflows/ci.yml +36 -0
  4. .gitignore +178 -0
  5. .lighthouseci/lhr-1769848038113.html +0 -0
  6. CHANGELOG.md +128 -0
  7. CONTRIBUTING.md +279 -0
  8. README.md +360 -0
  9. backend/.flake8 +4 -0
  10. backend/Dockerfile +50 -0
  11. backend/app/__init__.py +3 -0
  12. backend/app/api/__init__.py +3 -0
  13. backend/app/api/routes/__init__.py +31 -0
  14. backend/app/api/routes/analysis.py +60 -0
  15. backend/app/api/routes/audio.py +100 -0
  16. backend/app/api/routes/auth.py +116 -0
  17. backend/app/api/routes/batch.py +204 -0
  18. backend/app/api/routes/cloning.py +81 -0
  19. backend/app/api/routes/health.py +93 -0
  20. backend/app/api/routes/sign.py +164 -0
  21. backend/app/api/routes/stt.py +489 -0
  22. backend/app/api/routes/transcripts.py +200 -0
  23. backend/app/api/routes/translation.py +261 -0
  24. backend/app/api/routes/tts.py +245 -0
  25. backend/app/api/routes/ws.py +153 -0
  26. backend/app/core/__init__.py +7 -0
  27. backend/app/core/config.py +108 -0
  28. backend/app/core/limiter.py +27 -0
  29. backend/app/core/middleware.py +70 -0
  30. backend/app/core/security.py +107 -0
  31. backend/app/core/security_encryption.py +101 -0
  32. backend/app/core/security_headers.py +37 -0
  33. backend/app/main.py +257 -0
  34. backend/app/schemas/__init__.py +39 -0
  35. backend/app/schemas/stt.py +98 -0
  36. backend/app/schemas/transcript.py +69 -0
  37. backend/app/schemas/tts.py +67 -0
  38. backend/app/services/__init__.py +13 -0
  39. backend/app/services/audio_service.py +101 -0
  40. backend/app/services/batch_service.py +348 -0
  41. backend/app/services/cache_service.py +71 -0
  42. backend/app/services/clone_service.py +104 -0
  43. backend/app/services/diarization_service.py +338 -0
  44. backend/app/services/edge_tts_service.py +357 -0
  45. backend/app/services/emotion_service.py +132 -0
  46. backend/app/services/export_service.py +99 -0
  47. backend/app/services/file_service.py +230 -0
  48. backend/app/services/meeting_service.py +121 -0
  49. backend/app/services/nlp_service.py +180 -0
  50. backend/app/services/sign_avatar_service.py +82 -0
.env.example ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VoiceForge Environment Configuration
2
+ # Copy this file to .env and fill in your values
3
+
4
+ # Database
5
+ DATABASE_URL=postgresql://postgres:postgres@localhost:5432/voiceforge
6
+
7
+ # Redis
8
+ REDIS_URL=redis://localhost:6379/0
9
+
10
+ # Google Cloud
11
+ GOOGLE_APPLICATION_CREDENTIALS=./credentials/google-cloud-key.json
12
+
13
+ # API Settings
14
+ API_HOST=0.0.0.0
15
+ API_PORT=8000
16
+ DEBUG=true
17
+
18
+ # Security
19
+ SECRET_KEY=your-super-secret-key-change-in-production
20
+ ACCESS_TOKEN_EXPIRE_MINUTES=30
21
+
22
+ # File Storage
23
+ UPLOAD_DIR=./uploads
24
+ MAX_AUDIO_DURATION_SECONDS=600
25
+ MAX_UPLOAD_SIZE_MB=50
26
+
27
+ # Supported Languages (comma-separated)
28
+ SUPPORTED_LANGUAGES=en-US,en-GB,es-ES,es-MX,fr-FR,de-DE,ja-JP,ko-KR,cmn-CN,hi-IN
.github/workflows/backend-ci.yml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Backend CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ paths:
7
+ - 'backend/**'
8
+ pull_request:
9
+ branches: [ main ]
10
+ paths:
11
+ - 'backend/**'
12
+
13
+ jobs:
14
+ test:
15
+ runs-on: ubuntu-latest
16
+ defaults:
17
+ run:
18
+ working-directory: ./backend
19
+
20
+ services:
21
+ redis:
22
+ image: redis
23
+ ports:
24
+ - 6379:6379
25
+ options: >-
26
+ --health-cmd "redis-cli ping"
27
+ --health-interval 10s
28
+ --health-timeout 5s
29
+ --health-retries 5
30
+
31
+ steps:
32
+ - uses: actions/checkout@v3
33
+
34
+ - name: Set up Python 3.10
35
+ uses: actions/setup-python@v4
36
+ with:
37
+ python-version: "3.10"
38
+ cache: 'pip'
39
+
40
+ - name: Install dependencies
41
+ run: |
42
+ python -m pip install --upgrade pip
43
+ pip install flake8 pytest pytest-asyncio httpx
44
+ if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
45
+
46
+ - name: Lint with flake8
47
+ run: |
48
+ # stop the build if there are Python syntax errors or undefined names
49
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
50
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
51
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
52
+
53
+ - name: Test with pytest
54
+ env:
55
+ ENCRYPTION_KEY: ${{ secrets.ENCRYPTION_KEY }} # Mock or secret
56
+ REDIS_URL: "redis://localhost:6379/0"
57
+ HF_TOKEN: "mock_token" # Mock for CI
58
+ run: |
59
+ # We mock heavy dependencies (torch, etc.) in tests/conftest.py usually,
60
+ # or we install them. Installing them takes time.
61
+ # For this demo, we assume they are installed or tests mock them.
62
+ pytest
.github/workflows/ci.yml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ "main" ]
6
+ pull_request:
7
+ branches: [ "main" ]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - uses: actions/checkout@v3
15
+
16
+ - name: Set up Python 3.10
17
+ uses: actions/setup-python@v4
18
+ with:
19
+ python-version: "3.10"
20
+
21
+ - name: Install System Dependencies
22
+ run: |
23
+ sudo apt-get update
24
+ sudo apt-get install -y ffmpeg libsndfile1
25
+
26
+ - name: Install Python Dependencies
27
+ run: |
28
+ python -m pip install --upgrade pip
29
+ pip install pytest pytest-asyncio httpx
30
+ if [ -f backend/requirements.txt ]; then pip install -r backend/requirements.txt; fi
31
+
32
+ - name: Run Tests
33
+ # We skip slow tests or those requiring GPU/Redis if not available
34
+ run: |
35
+ cd backend
36
+ pytest tests/ -v -m "not integration"
.gitignore ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .nox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ *.py,cover
48
+ .hypothesis/
49
+ .pytest_cache/
50
+ cover/
51
+
52
+ # Translations
53
+ *.mo
54
+ *.pot
55
+
56
+ # Django stuff:
57
+ *.log
58
+ local_settings.py
59
+ db.sqlite3
60
+ db.sqlite3-journal
61
+
62
+ # Flask stuff:
63
+ instance/
64
+ .webassets-cache
65
+
66
+ # Scrapy stuff:
67
+ .scrapy
68
+
69
+ # Sphinx documentation
70
+ docs/_build/
71
+
72
+ # PyBuilder
73
+ .pybuilder/
74
+ target/
75
+
76
+ # Jupyter Notebook
77
+ .ipynb_checkpoints
78
+
79
+ # IPython
80
+ profile_default/
81
+ ipython_config.py
82
+
83
+ # pyenv
84
+ # For a library or package, you might want to ignore these files since the Python version is actually
85
+ # determined by the app developer rather than the library.
86
+ # .python-version
87
+
88
+ # pipenv
89
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
91
+ # having no cross-platform support, pipenv may install dependencies that don't work, or even
92
+ # fail to install them.
93
+ # Pipfile.lock
94
+
95
+ # poetry
96
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
97
+ # poetry.lock
98
+
99
+ # pdm
100
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
101
+ # .pdm-python
102
+ # .pdm-build/
103
+
104
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
105
+ __pypackages__/
106
+
107
+ # Celery stuff
108
+ celerybeat-schedule
109
+ celerybeat.pid
110
+
111
+ # SageMath parsed files
112
+ *.sage.py
113
+
114
+ # Environments
115
+ .env
116
+ .venv
117
+ env/
118
+ venv/
119
+ ENV/
120
+ env.bak/
121
+ venv.bak/
122
+
123
+ # Spyder project settings
124
+ .spyderproject
125
+ .spyproject
126
+
127
+ # Rope project settings
128
+ .ropeproject
129
+
130
+ # mkdocs documentation
131
+ /site
132
+
133
+ # mypy
134
+ .mypy_cache/
135
+ .dmypy.json
136
+ dmypy.json
137
+
138
+ # Pyre type checker
139
+ .pyre/
140
+
141
+ # pytype static type analyzer
142
+ .pytype/
143
+
144
+ # Cython debug symbols
145
+ cython_debug/
146
+
147
+ # OS
148
+ .DS_Store
149
+ Thumbs.db
150
+
151
+ # Database
152
+ *.db
153
+ *.sqlite
154
+
155
+ # Local models
156
+ models/
157
+ *.bin
158
+ *.pth
159
+ *.onnx
160
+
161
+ # Credentials
162
+ credentials/
163
+ *.json
164
+ !deploy/monitoring/*.json
165
+
166
+ # Uploads
167
+ uploads/
168
+
169
+ # Diagnostic files
170
+ diagnostic_app.py
171
+ diag_traceback.txt
172
+ diag_log.txt
173
+ live_verify.py
174
+ test_prompt.wav
175
+ test_output.mp3
176
+ debug_app.py
177
+ debug_out.txt
178
+ diag_traceback.txt
.lighthouseci/lhr-1769848038113.html ADDED
The diff for this file is too large to render. See raw diff
 
CHANGELOG.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Changelog
2
+
3
+ All notable changes to VoiceForge will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [3.0.0] - 2026-01-31
9
+
10
+ ### Major Architecture Updates
11
+ - **Hybrid STT Engine**:
12
+ - Integrated `large-v3-turbo` for 8x faster multilingual transcription.
13
+ - Implemented smart routing between Distil-Whisper (English) and Turbo (Multilingual).
14
+ - **Unified TTS Service**:
15
+ - Added `MeloTTS` integration for local, low-latency speech synthesis.
16
+ - Implemented automatic fallback to EdgeTTS for reliability.
17
+ - **Poetry Migration**:
18
+ - Replaced `requirements.txt` with `pyproject.toml` and `poetry.lock`.
19
+ - Optimized Docker build workflow (multi-stage build ready).
20
+
21
+ ### Fixed
22
+ - **Critical Build Fix**: Resolved `numpy`/`torch` version conflicts that caused 30+ min Docker builds.
23
+
24
+ ## [2.0.1] - 2026-01-31
25
+
26
+ ### Fixed
27
+ - **CRITICAL**: Resolved numpy/torch dependency conflict causing 30+ minute Docker builds
28
+ - Pinned `numpy==1.26.4` (last stable 1.x version)
29
+ - Pinned `torch==2.3.1` and `torchaudio==2.3.1` for compatibility
30
+ - Docker builds now complete in <10 minutes instead of 30+
31
+ - Added version ranges to core dependencies (fastapi, uvicorn, httpx) for stability
32
+ - Added missing `locust` dependency for performance testing
33
+
34
+ ### Added
35
+ - `DEPENDENCY_NOTES.md` documenting version constraints and update strategy
36
+
37
+ ## [2.0.0] - 2026-01-31
38
+
39
+ ### Added
40
+ - **Advanced Test Suite** (Phase 14)
41
+ - 74+ tests across unit, integration, performance, and security categories
42
+ - Master test runner (`tests/run_all_tests.py`) for one-command execution
43
+ - 100% module coverage across all backend services
44
+ - **Quality Automation Tools**
45
+ - `analyze_codebase.py`: Code complexity and maintainability metrics
46
+ - `check_syntax.py`: Python syntax and circular import detection
47
+ - `check_dependencies.py`: Dependency health and security vulnerability scanning
48
+ - `check_pipeline.py`: CI/CD pipeline validation (GitHub Actions, Docker)
49
+ - `coverage_tracker.py`: Module coverage matrix and untested function identification
50
+ - `lighthouse_audit.py`: Frontend performance auditing
51
+ - `project_audit.py`: Overall project coverage reporting
52
+ - **Mobile App Foundation** (Phase 13 - In Progress)
53
+ - Flutter mobile app directory structure
54
+ - Architecture documentation for mobile companion app
55
+ - WebSocket integration design for real-time transcription
56
+ - **Documentation**
57
+ - `docs/TESTING.md`: Comprehensive testing guide
58
+ - Updated `README.md` with testing instructions
59
+ - Mobile app setup guides
60
+
61
+ ### Changed
62
+ - Updated `httpx.AsyncClient` usage to use `ASGITransport` for compatibility with modern httpx
63
+ - Improved test fixtures with proper async handling (`pytest-asyncio`)
64
+ - Enhanced `PROJECT_SUMMARY.md` with Phase 14 achievements
65
+
66
+ ### Fixed
67
+ - Resolved `httpx` deprecation warnings in integration tests
68
+ - Fixed mock setup in `test_translation_service.py` for `langdetect`
69
+ - Corrected streaming synthesis mock signatures in `test_tts_service.py`
70
+
71
+ ## [1.5.0] - 2026-01-17
72
+
73
+ ### Added
74
+ - Memory management with dynamic model unloading (1.5GB → 500MB)
75
+ - WebSocket TTS streaming (<500ms TTFB)
76
+ - SSML prosody control for advanced voice customization
77
+
78
+ ### Changed
79
+ - Performance improvements across STT and TTS services
80
+
81
+ ## [1.4.0] - 2026-01-15
82
+
83
+ ### Added
84
+ - Batched inference for 2-4x throughput improvement
85
+ - Audio preprocessing with noise reduction
86
+ - Speaker diarization (pyannote.audio integration)
87
+ - Voice cloning with Coqui XTTS v2
88
+
89
+ ## [1.3.0] - 2026-01-10
90
+
91
+ ### Added
92
+ - Phase 11: Optimization implementation
93
+ - DNS loopback fix (210x cold start improvement)
94
+ - Int8 quantization + greedy decoding (3x STT speedup)
95
+ - Distil-Whisper hybrid routing (10x cumulative STT speedup)
96
+ - Sentence streaming TTS (8x TTFB speedup)
97
+ - Real-Time Factor: 0.28x (super-realtime performance)
98
+
99
+ ### Changed
100
+ - STT latency reduced from 38.5s to 3.7s (10x improvement)
101
+ - TTS TTFB reduced from 8.8s to 1.1s (8x improvement)
102
+
103
+ ## [1.2.0] - 2026-01-05
104
+
105
+ ### Added
106
+ - Phase 10: Performance research
107
+ - Comprehensive benchmarking suite
108
+ - 11 optimization dimensions identified
109
+ - Priority matrix documentation
110
+
111
+ ## [1.0.0] - 2026-01-01
112
+
113
+ ### Added
114
+ - Initial release
115
+ - FastAPI backend with REST API
116
+ - Streamlit frontend with glassmorphism UI
117
+ - Local AI integration (Whisper STT + Edge TTS)
118
+ - WebSocket live recording
119
+ - NLP analysis (sentiment, keywords, summary)
120
+ - Docker containerization
121
+ - Basic documentation
122
+
123
+ [2.0.0]: https://github.com/yourusername/voiceforge/compare/v1.5.0...v2.0.0
124
+ [1.5.0]: https://github.com/yourusername/voiceforge/compare/v1.4.0...v1.5.0
125
+ [1.4.0]: https://github.com/yourusername/voiceforge/compare/v1.3.0...v1.4.0
126
+ [1.3.0]: https://github.com/yourusername/voiceforge/compare/v1.2.0...v1.3.0
127
+ [1.2.0]: https://github.com/yourusername/voiceforge/compare/v1.0.0...v1.2.0
128
+ [1.0.0]: https://github.com/yourusername/voiceforge/releases/tag/v1.0.0
CONTRIBUTING.md ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to VoiceForge
2
+
3
+ Thank you for considering contributing to VoiceForge! This document provides guidelines for contributing to the project.
4
+
5
+ ## 🚀 Getting Started
6
+
7
+ ### Prerequisites
8
+ - Python 3.10+
9
+ - Docker & Docker Compose
10
+ - Git
11
+
12
+ ### Development Setup
13
+
14
+ 1. **Clone the repository**
15
+ ```bash
16
+ git clone https://github.com/yourusername/voiceforge.git
17
+ cd voiceforge
18
+ ```
19
+
20
+ 2. **Install dependencies**
21
+ ```bash
22
+ # Backend
23
+ cd backend
24
+ pip install -r requirements.txt
25
+
26
+ # Frontend
27
+ cd ../frontend
28
+ pip install -r requirements.txt
29
+ ```
30
+
31
+ 3. **Set up environment variables**
32
+ ```bash
33
+ cp backend/.env.example backend/.env
34
+ # Edit .env with your configuration
35
+ ```
36
+
37
+ 4. **Run the application**
38
+ ```bash
39
+ # Using Docker (recommended)
40
+ docker-compose up
41
+
42
+ # OR manually
43
+ # Terminal 1: Backend
44
+ cd backend
45
+ uvicorn app.main:app --reload
46
+
47
+ # Terminal 2: Frontend
48
+ cd frontend
49
+ streamlit run streamlit_app.py
50
+ ```
51
+
52
+ ## 🧪 Testing
53
+
54
+ ### Running Tests
55
+ ```bash
56
+ cd backend
57
+
58
+ # Run all tests
59
+ python tests/run_all_tests.py
60
+
61
+ # Run specific category
62
+ pytest tests/unit -v
63
+ pytest tests/integration -v
64
+
65
+ # Run with coverage
66
+ pytest --cov=app tests/
67
+ ```
68
+
69
+ ### Writing Tests
70
+ - **Unit tests**: Test individual functions in `tests/unit/`
71
+ - **Integration tests**: Test API endpoints in `tests/integration/`
72
+ - **Follow existing patterns**: Check similar tests for examples
73
+
74
+ ### Quality Checks
75
+ ```bash
76
+ # Code quality analysis
77
+ python tests/quality/analyze_codebase.py --path app
78
+
79
+ # Dependency health
80
+ python tests/quality/check_dependencies.py
81
+
82
+ # Syntax check
83
+ python tests/quality/check_syntax.py --path app
84
+ ```
85
+
86
+ ## 📝 Code Style
87
+
88
+ ### Python
89
+ - Follow [PEP 8](https://pep8.org/)
90
+ - Use type hints where possible
91
+ - Maximum line length: 100 characters
92
+ - Use descriptive variable names
93
+
94
+ ### Example
95
+ ```python
96
+ from typing import List, Optional
97
+
98
+ async def transcribe_audio(
99
+ file_path: str,
100
+ language: Optional[str] = None,
101
+ quality_mode: bool = False
102
+ ) -> dict:
103
+ """
104
+ Transcribe audio file to text.
105
+
106
+ Args:
107
+ file_path: Path to audio file
108
+ language: Language code (auto-detect if None)
109
+ quality_mode: Use high-quality mode with beam search
110
+
111
+ Returns:
112
+ dict: Transcription result with segments
113
+ """
114
+ # Implementation
115
+ pass
116
+ ```
117
+
118
+ ### Formatting
119
+ We recommend using:
120
+ - `black` for code formatting
121
+ - `isort` for import sorting
122
+ - `mypy` for type checking
123
+
124
+ ```bash
125
+ # Format code
126
+ black app/
127
+ isort app/
128
+
129
+ # Type check
130
+ mypy app/
131
+ ```
132
+
133
+ ## 🌿 Branch Strategy
134
+
135
+ ### Branch Naming
136
+ - `feature/description` - New features
137
+ - `fix/description` - Bug fixes
138
+ - `docs/description` - Documentation updates
139
+ - `test/description` - Test additions/improvements
140
+
141
+ ### Example
142
+ ```bash
143
+ git checkout -b feature/add-voice-cloning
144
+ git checkout -b fix/tts-streaming-bug
145
+ git checkout -b docs/update-api-guide
146
+ ```
147
+
148
+ ## 📤 Pull Request Process
149
+
150
+ 1. **Create a feature branch**
151
+ ```bash
152
+ git checkout -b feature/my-new-feature
153
+ ```
154
+
155
+ 2. **Make your changes**
156
+ - Write clean, well-documented code
157
+ - Add tests for new functionality
158
+ - Update documentation as needed
159
+
160
+ 3. **Test your changes**
161
+ ```bash
162
+ python tests/run_all_tests.py
163
+ ```
164
+
165
+ 4. **Commit with clear messages**
166
+ ```bash
167
+ git commit -m "feat: add real-time noise cancellation
168
+
169
+ - Implement RNNoise integration
170
+ - Add preprocessing pipeline
171
+ - Add unit tests for audio processing
172
+ - Update API documentation"
173
+ ```
174
+
175
+ 5. **Push and create PR**
176
+ ```bash
177
+ git push origin feature/my-new-feature
178
+ ```
179
+
180
+ 6. **PR Description Template**
181
+ ```markdown
182
+ ## Description
183
+ Brief description of changes
184
+
185
+ ## Type of Change
186
+ - [ ] Bug fix
187
+ - [ ] New feature
188
+ - [ ] Documentation update
189
+ - [ ] Performance improvement
190
+
191
+ ## Testing
192
+ - [ ] Unit tests added/updated
193
+ - [ ] Integration tests added/updated
194
+ - [ ] Manual testing performed
195
+
196
+ ## Checklist
197
+ - [ ] Code follows project style guidelines
198
+ - [ ] Tests pass locally
199
+ - [ ] Documentation updated
200
+ - [ ] No new warnings introduced
201
+ ```
202
+
203
+ ## 🐛 Reporting Bugs
204
+
205
+ ### Bug Report Template
206
+ ```markdown
207
+ **Describe the bug**
208
+ A clear description of what the bug is.
209
+
210
+ **To Reproduce**
211
+ Steps to reproduce:
212
+ 1. Go to '...'
213
+ 2. Click on '....'
214
+ 3. See error
215
+
216
+ **Expected behavior**
217
+ What you expected to happen.
218
+
219
+ **Environment:**
220
+ - OS: [e.g. Windows 11]
221
+ - Python version: [e.g. 3.10.5]
222
+ - VoiceForge version: [e.g. 2.0.0]
223
+
224
+ **Additional context**
225
+ Add any other context, logs, or screenshots.
226
+ ```
227
+
228
+ ## 💡 Feature Requests
229
+
230
+ ### Feature Request Template
231
+ ```markdown
232
+ **Problem Statement**
233
+ Describe the problem this feature would solve.
234
+
235
+ **Proposed Solution**
236
+ Describe your proposed solution.
237
+
238
+ **Alternatives Considered**
239
+ What alternatives have you considered?
240
+
241
+ **Additional Context**
242
+ Any other context, mockups, or examples.
243
+ ```
244
+
245
+ ## 📚 Documentation
246
+
247
+ ### Documentation Standards
248
+ - Use clear, concise language
249
+ - Include code examples
250
+ - Update relevant docs when changing functionality
251
+ - Add inline comments for complex logic
252
+
253
+ ### Documentation Locations
254
+ - `README.md` - Project overview
255
+ - `docs/API.md` - API reference
256
+ - `docs/TESTING.md` - Testing guide
257
+ - `docs/ARCHITECTURE.md` - System architecture
258
+ - Inline docstrings - Function/class documentation
259
+
260
+ ## 🏆 Recognition
261
+
262
+ Contributors will be:
263
+ - Listed in `CONTRIBUTORS.md`
264
+ - Mentioned in release notes
265
+ - Credited in the README
266
+
267
+ ## 📜 License
268
+
269
+ By contributing, you agree that your contributions will be licensed under the MIT License.
270
+
271
+ ## ❓ Questions?
272
+
273
+ - Open an issue for questions
274
+ - Join our discussions
275
+ - Email: your@email.com
276
+
277
+ ---
278
+
279
+ **Thank you for making VoiceForge better!** 🎉
README.md ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🎙️ VoiceForge - Enterprise Speech AI Platform
2
+
3
+ ![VoiceForge Banner](https://via.placeholder.com/1200x300/2563eb/ffffff?text=VoiceForge+V4.0+-+Production+Ready)
4
+
5
+ [![Version](https://img.shields.io/badge/version-4.0.0-blue.svg)](CHANGELOG.md)
6
+ [![Status](https://img.shields.io/badge/status-production--ready-green.svg)](docs/PROJECT_SUMMARY.md)
7
+ [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/)
8
+ [![FastAPI](https://img.shields.io/badge/fastapi-0.109+-teal.svg)](https://fastapi.tiangolo.com/)
9
+ [![K8s Ready](https://img.shields.io/badge/k8s-ready-326CE5.svg)](deploy/k8s/)
10
+ [![Terraform](https://img.shields.io/badge/terraform-1.0+-844FBA.svg)](deploy/terraform/)
11
+
12
+ **VoiceForge V4.0** is an **enterprise-grade, cloud-native** Speech AI platform with complete infrastructure automation, security hardening, and observability. Features local-first Whisper STT, Edge TTS, voice cloning, sign language recognition, and a Flutter mobile companion app.
13
+
14
+ ---
15
+
16
+ ## 🚀 V4.0 - Enterprise Edition
17
+
18
+ ### 🆕 What's New
19
+ - ☸️ **Kubernetes Native**: Production-ready K8s manifests + Helm charts
20
+ - 🏗️ **Infrastructure as Code**: Full Terraform setup for AWS (VPC, EKS, Redis)
21
+ - 📊 **Observability Stack**: Grafana dashboards + Prometheus monitoring with alerts
22
+ - 🔒 **Security Hardening**: Rate limiting, data encryption (Fernet), security headers, penetration tests
23
+ - 📱 **Mobile App**: Flutter companion with offline support, localization (en/es), accessibility
24
+ - 🤖 **Sign Language**: Real-time ASL recognition + avatar generation
25
+ - 🚦 **CI/CD**: GitHub Actions for automated testing
26
+
27
+ ---
28
+
29
+ ## 📦 Complete Feature Set
30
+
31
+ ### 🎧 Speech-to-Text (STT)
32
+ - ✅ Hybrid Local/Cloud (Whisper + Google Cloud)
33
+ - ✅ Real-time WebSocket streaming
34
+ - ✅ Speaker diarization (pyannote)
35
+ - ✅ Word-level timestamps
36
+ - ✅ 50+ languages
37
+
38
+ ### 🗣️ Text-to-Speech (TTS)
39
+ - ✅ 300+ neural voices (Edge TTS)
40
+ - ✅ Voice cloning (Coqui XTTS v2)
41
+ - ✅ Speed/pitch customization
42
+ - ✅ Streaming playback
43
+
44
+ ### 🤖 AI Features
45
+ - ✅ Emotion & sentiment analysis
46
+ - ✅ Meeting minutes generation
47
+ - ✅ Keyword extraction & summarization
48
+ - ✅ Audio translation (100+ languages)
49
+ - ✅ Sign language recognition + generation
50
+
51
+ ### 🎨 Audio Studio
52
+ - ✅ Trim, merge, convert audio
53
+ - ✅ Batch processing
54
+ - ✅ Export: PDF, SRT, VTT, TXT
55
+
56
+ ### 📱 Mobile App (Flutter)
57
+ - ✅ Cross-platform (Android/iOS)
58
+ - ✅ Offline transcription caching (Hive)
59
+ - ✅ Real-time recording & synthesis
60
+ - ✅ i18n (English/Spanish)
61
+ - ✅ High contrast accessibility mode
62
+
63
+ ---
64
+
65
+ ## 🏗️ Enterprise Infrastructure
66
+
67
+ ### ☸️ Kubernetes Deployment
68
+ ```bash
69
+ # Deploy to cluster
70
+ kubectl apply -f deploy/k8s/namespace.yaml
71
+ kubectl apply -f deploy/k8s/backend.yaml
72
+ kubectl apply -f deploy/k8s/ingress.yaml
73
+
74
+ # Or use Helm
75
+ helm install voiceforge deploy/helm/voiceforge -f values.yaml
76
+ ```
77
+
78
+ ### 🔧 Terraform Provisioning
79
+ ```bash
80
+ cd deploy/terraform
81
+ terraform init
82
+ terraform plan
83
+ terraform apply # Creates: VPC, EKS, ElastiCache Redis
84
+ ```
85
+
86
+ **Provisions:**
87
+ - VPC with public/private subnets + NAT
88
+ - EKS cluster with auto-scaling node groups
89
+ - ElastiCache Redis cluster
90
+ - Security groups + IAM roles
91
+
92
+ ### 📊 Monitoring & Alerting
93
+ ```bash
94
+ # Import Grafana dashboard
95
+ kubectl apply -f deploy/monitoring/prometheus-rules.yaml
96
+ # Dashboard JSON: deploy/monitoring/grafana-dashboard.json
97
+ ```
98
+
99
+ **Metrics tracked:**
100
+ - Request rate, latency (p95/p99)
101
+ - Error rates (5xx)
102
+ - CPU/Memory usage
103
+ - Pod health & restarts
104
+
105
+ **Alerts:**
106
+ - High error rate (>5%)
107
+ - High latency (>2s p95)
108
+ - Resource exhaustion
109
+
110
+ ---
111
+
112
+ ## 🔒 Security Features
113
+
114
+ | Feature | Implementation | Status |
115
+ |---------|----------------|--------|
116
+ | **Rate Limiting** | slowapi + Redis | ✅ 5/min (auth), 10/min (AI) |
117
+ | **Data Encryption** | Fernet (AES) at-rest | ✅ User PII + transcripts |
118
+ | **Security Headers** | HSTS, CSP, X-Frame-Options | ✅ All responses |
119
+ | **Authentication** | JWT + API keys | ✅ Token refresh |
120
+ | **Penetration Tests** | OWASP Top 10 scanner | ✅ Automated |
121
+
122
+ Run security tests:
123
+ ```bash
124
+ python backend/tests/security/security_tests.py --base-url http://localhost:8000
125
+ ```
126
+
127
+ ---
128
+
129
+ ## 🚀 Quick Start
130
+
131
+ ### 1. Docker Compose (Fastest)
132
+ ```bash
133
+ git clone https://github.com/yourusername/voiceforge
134
+ cd voiceforge
135
+ docker-compose up -d
136
+ ```
137
+
138
+ ### 2. Local Development
139
+ ```bash
140
+ # Backend
141
+ cd backend
142
+ pip install -r requirements.txt
143
+ uvicorn app.main:app --reload
144
+
145
+ # Frontend
146
+ cd frontend
147
+ pip install -r requirements.txt
148
+ streamlit run streamlit_app.py
149
+
150
+ # Mobile
151
+ cd mobile
152
+ flutter pub get
153
+ flutter run
154
+ ```
155
+
156
+ ### 3. Kubernetes
157
+ ```bash
158
+ helm install voiceforge ./deploy/helm/voiceforge \
159
+ --set redis.enabled=true \
160
+ --set ingress.hosts[0].host=api.yourdomain.com
161
+ ```
162
+
163
+ **Access:**
164
+ - Frontend: http://localhost:8501
165
+ - API Docs: http://localhost:8000/docs
166
+ - Metrics: http://localhost:8000/metrics
167
+
168
+ ---
169
+
170
+ ## 🛠️ Tech Stack
171
+
172
+ ### Backend
173
+ - **FastAPI**: Async REST API
174
+ - **SQLAlchemy**: ORM + migrations
175
+ - **Celery**: Background tasks
176
+ - **Redis**: Cache + rate limiting
177
+ - **Prometheus**: Metrics
178
+
179
+ ### AI/ML
180
+ - **faster-whisper**: Local STT
181
+ - **edge-tts**: Neural TTS (free)
182
+ - **Coqui TTS**: Voice cloning
183
+ - **MediaPipe**: Sign language recognition
184
+ - **pyannote**: Speaker diarization
185
+
186
+ ### Frontend
187
+ - **Streamlit**: Web UI
188
+ - **Flutter**: Mobile app (Riverpod state)
189
+
190
+ ### DevOps
191
+ - **Docker**: Multi-stage builds
192
+ - **Kubernetes**: Helm charts + HPA
193
+ - **Terraform**: AWS infrastructure
194
+ - **GitHub Actions**: CI/CD pipeline
195
+ - **Grafana**: Dashboards
196
+
197
+ ---
198
+
199
+ ## 📁 Project Structure
200
+
201
+ ```
202
+ voiceforge/
203
+ ├── backend/ # FastAPI microservices
204
+ │ ├── app/
205
+ │ │ ├── api/routes/ # REST endpoints
206
+ │ │ ├── core/ # Config, security, limiter
207
+ │ │ ├── models/ # SQLAlchemy models
208
+ │ │ ├── services/ # Business logic (STT, TTS, NLP, etc.)
209
+ │ │ └── workers/ # Celery tasks
210
+ │ ├── tests/ # Unit, integration, security tests
211
+ │ │ ├── unit/ # Service tests
212
+ │ │ ├── integration/ # API tests
213
+ │ │ ├── quality/ # Code analyzers
214
+ │ │ └── security/ # OWASP scanners
215
+ │ └── requirements.txt
216
+ ├── frontend/ # Streamlit web app
217
+ │ ├── pages/ # Multi-page UI
218
+ │ └── components/ # Reusable widgets
219
+ ├── mobile/ # Flutter companion app
220
+ │ ├── lib/
221
+ │ │ ├── features/ # Auth, Transcription, Synthesis, Settings
222
+ │ │ ├── core/ # Theme, providers
223
+ │ │ └── l10n/ # Localization (en, es)
224
+ │ └── pubspec.yaml
225
+ ├── deploy/ # Infrastructure
226
+ │ ├── k8s/ # Kubernetes manifests
227
+ │ ├── helm/ # Helm charts
228
+ │ ├── terraform/ # AWS IaC (VPC, EKS, Redis)
229
+ │ ├── monitoring/ # Grafana + Prometheus
230
+ │ └── docker/ # Compose files
231
+ ├── docs/ # Documentation
232
+ │ ├── ARCHITECTURE.md # System design
233
+ │ ├── DEPLOYMENT_GUIDE.md
234
+ │ ├── WALKTHROUGH.md # Feature tour
235
+ │ └── adr/ # Architecture decisions
236
+ └── .github/workflows/ # CI/CD pipelines
237
+ ```
238
+
239
+ ---
240
+
241
+ ## 🧪 Testing
242
+
243
+ ```bash
244
+ # Run all tests (unit, integration, quality, security)
245
+ cd backend
246
+ python tests/run_all_tests.py
247
+
248
+ # Individual test suites
249
+ pytest tests/unit/ # Unit tests
250
+ pytest tests/integration/ # API tests
251
+ python tests/security/security_tests.py # Penetration tests
252
+
253
+ # Mobile tests
254
+ cd mobile
255
+ flutter test
256
+ ```
257
+
258
+ **Coverage Goal: >80%**
259
+
260
+ ---
261
+
262
+ ## 🌍 Supported Languages
263
+
264
+ **STT + TTS**: English, Spanish, French, German, Japanese, Korean, Chinese, Hindi, Arabic, Portuguese, Italian, Russian, Dutch, Turkish, Polish, and 35+ more.
265
+
266
+ **Voice Cloning**: 16 languages including all above.
267
+
268
+ ---
269
+
270
+ ## 📊 Performance Benchmarks
271
+
272
+ | Operation | Time | Metric |
273
+ |-----------|------|--------|
274
+ | STT (30s audio) | 3.7s | 0.12x RTF |
275
+ | TTS (80 words) | 1.1s | TTFB |
276
+ | Voice Clone | 2.3s | 3s sample |
277
+ | Sign Recognition | 60 FPS | Real-time |
278
+
279
+ **Cost Savings**: 100% (local mode vs cloud APIs)
280
+
281
+ ---
282
+
283
+ ## 🚢 Deployment Scenarios
284
+
285
+ ### Development
286
+ ```bash
287
+ docker-compose up
288
+ ```
289
+
290
+ ### Staging (Cloud VM)
291
+ ```bash
292
+ docker-compose -f docker-compose.prod.yml up -d
293
+ ```
294
+
295
+ ### Production (Kubernetes)
296
+ ```bash
297
+ # Option 1: Direct manifests
298
+ kubectl apply -f deploy/k8s/
299
+
300
+ # Option 2: Helm chart
301
+ helm upgrade --install voiceforge deploy/helm/voiceforge \
302
+ --set replicaCount=3 \
303
+ --set autoscaling.enabled=true \
304
+ --set redis.enabled=true
305
+ ```
306
+
307
+ ### Cloud Provisioning
308
+ ```bash
309
+ # AWS with Terraform
310
+ cd deploy/terraform
311
+ terraform apply -var="environment=production"
312
+
313
+ # GCP or Azure: Adapt Terraform modules
314
+ ```
315
+
316
+ ---
317
+
318
+ ## 📚 Documentation
319
+
320
+ - [📖 Architecture](docs/ARCHITECTURE.md)
321
+ - [🚀 Deployment Guide](docs/DEPLOYMENT_GUIDE.md)
322
+ - [🔍 API Reference](http://localhost:8000/docs)
323
+ - [📱 Mobile Guide](mobile/README.md)
324
+ - [🔐 Security Policy](docs/SECURITY.md)
325
+ - [🎓 Interview Prep](docs/INTERVIEW_PREP.md)
326
+
327
+ ---
328
+
329
+ ## 🤝 Contributing
330
+
331
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
332
+
333
+ ---
334
+
335
+ ## 📝 License
336
+
337
+ MIT License - see [LICENSE](LICENSE) for details.
338
+
339
+ ---
340
+
341
+ ## 💡 Highlights for Portfolio/Interviews
342
+
343
+ This project demonstrates:
344
+ 1. **Full-Stack Development**: Backend (FastAPI), Frontend (Streamlit), Mobile (Flutter)
345
+ 2. **AI/ML Integration**: Local model deployment, hybrid cloud architecture
346
+ 3. **DevOps Excellence**: Docker, K8s, Helm, Terraform, CI/CD
347
+ 4. **Security**: Encryption, rate limiting, OWASP testing
348
+ 5. **Observability**: Prometheus metrics, Grafana dashboards, alerting
349
+ 6. **Scalability**: HPA, async workers, Redis caching
350
+ 7. **Accessibility**: i18n, high contrast, screen readers
351
+
352
+ ---
353
+
354
+ <div align="center">
355
+
356
+ **Built with ❤️ to showcase enterprise-level AI engineering**
357
+
358
+ [⭐ Star this repo](https://github.com/yourusername/voiceforge) • [📧 Contact](mailto:your@email.com)
359
+
360
+ </div>
backend/.flake8 ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ [flake8]
2
+ max-line-length = 120
3
+ extend-ignore = E203
4
+ exclude = .git,__pycache__,docs/source/conf.py,old,build,dist,venv
backend/Dockerfile ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# --- Build stage: compile wheels so the runtime image needs no compilers ---
FROM python:3.10-slim AS builder

WORKDIR /app

# Use the ENV key=value form; the space-separated legacy form is deprecated.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Build tools plus ffmpeg (required by the audio-processing packages).
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Pre-build wheels for every Python dependency.
COPY requirements.txt .
RUN pip wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt


# --- Final stage: slim runtime image ---
FROM python:3.10-slim

WORKDIR /app

# ffmpeg is the only system-level runtime dependency.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Install from the pre-built wheels (no gcc needed at this stage).
COPY --from=builder /app/wheels /wheels
COPY --from=builder /app/requirements.txt .
RUN pip install --no-cache /wheels/*

# Application code.
COPY . .

# Run as a non-root user for defense in depth.
RUN addgroup --system app && adduser --system --group app
USER app

# Expose API port
EXPOSE 8000

# Start the ASGI server.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
backend/app/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ VoiceForge Backend Package
3
+ """
backend/app/api/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ VoiceForge API Package
3
+ """
backend/app/api/routes/__init__.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge API Routes Package
3
+ """
4
+
5
+ from .stt import router as stt_router
6
+ from .tts import router as tts_router
7
+ from .health import router as health_router
8
+ from .transcripts import router as transcripts_router
9
+ from .ws import router as ws_router
10
+ from .translation import router as translation_router
11
+ from .batch import router as batch_router
12
+ from .analysis import router as analysis_router
13
+ from .audio import router as audio_router
14
+ from .cloning import router as cloning_router
15
+ from .sign import router as sign_router
16
+ from .auth import router as auth_router
17
+
18
+ __all__ = [
19
+ "stt_router",
20
+ "tts_router",
21
+ "health_router",
22
+ "transcripts_router",
23
+ "ws_router",
24
+ "translation_router",
25
+ "batch_router",
26
+ "analysis_router",
27
+ "audio_router",
28
+ "cloning_router",
29
+ "sign_router",
30
+ "auth_router",
31
+ ]
backend/app/api/routes/analysis.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Analysis API Routes
3
+ Endpoints for Emotion and Sentiment Analysis
4
+ """
5
+
6
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
7
+ from typing import Dict, Any
8
+ import logging
9
+ import os
10
+ import shutil
11
+ import tempfile
12
+
13
+ from app.services.emotion_service import get_emotion_service
14
+ from app.services.nlp_service import get_nlp_service
15
+
16
+ logger = logging.getLogger(__name__)
17
+ router = APIRouter(prefix="/analysis", tags=["Analysis"])
18
+
19
+
20
@router.post("/emotion/audio")
async def analyze_audio_emotion(
    file: UploadFile = File(..., description="Audio file to analyze"),
):
    """
    Analyze emotions in an audio file using Wav2Vec2.
    Returns dominant emotion and probability distribution.

    The upload is spooled to a temp file (removed afterwards) because the
    emotion service operates on file paths, not streams.

    Raises:
        HTTPException: 500 if the analysis fails.
    """
    service = get_emotion_service()

    # Keep the original extension so the decoder can sniff the format.
    suffix = os.path.splitext(file.filename)[1] or ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name

    try:
        result = service.analyze_audio(tmp_path)
        return result
    except Exception as e:
        logger.error(f"Emotion analysis failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Best-effort cleanup; catch only OSError instead of a bare `except:`
        # so unrelated errors are not silently swallowed.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
47
+
48
+
49
@router.post("/sentiment/text")
async def analyze_text_sentiment(
    text: str = Form(..., description="Text to analyze"),
):
    """Run sentiment analysis (polarity and subjectivity) over raw text."""
    service = get_nlp_service()
    try:
        result = service.analyze_sentiment(text)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
    return result
backend/app/api/routes/audio.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio Editing API Routes
3
+ """
4
+
5
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
6
+ from fastapi.responses import FileResponse
7
+ from typing import List, Optional
8
+ import os
9
+ import shutil
10
+ import tempfile
11
+ import uuid
12
+
13
+ from app.services.audio_service import get_audio_service, AudioService
14
+
15
+ router = APIRouter(prefix="/audio", tags=["Audio Studio"])
16
+
17
@router.post("/trim")
async def trim_audio(
    file: UploadFile = File(..., description="Audio file"),
    start_sec: float = Form(..., description="Start time in seconds"),
    end_sec: float = Form(..., description="End time in seconds"),
    service: AudioService = Depends(get_audio_service)
):
    """
    Trim an uploaded audio file to [start_sec, end_sec] and return the result.

    The upload is spooled to a temp file which is removed once trimming is
    done. The trimmed output file is served via FileResponse and is NOT
    cleaned up here (in prod this needs a background cleanup task).

    Raises:
        HTTPException: 500 if trimming fails.
    """
    suffix = os.path.splitext(file.filename)[1] or ".mp3"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name

    try:
        output_path = tmp_path.replace(suffix, f"_trimmed{suffix}")
        # Service API works in milliseconds.
        service.trim_audio(tmp_path, int(start_sec * 1000), int(end_sec * 1000), output_path)

        return FileResponse(
            output_path,
            filename=f"trimmed_{file.filename}",
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # The input temp file was previously leaked; remove it now that
        # trim_audio has run (assumes the service reads the input eagerly —
        # TODO confirm against AudioService).
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
42
+
43
@router.post("/merge")
async def merge_audio(
    files: List[UploadFile] = File(..., description="Files to merge"),
    format: str = Form("mp3", description="Output format"),
    service: AudioService = Depends(get_audio_service)
):
    """
    Concatenate several uploaded audio files into a single output file.

    Input uploads are spooled to temp files and always removed afterwards.
    The merged output is served via FileResponse (its cleanup needs a
    background task in prod).

    Raises:
        HTTPException: 500 if the merge fails.
    """
    temp_files = []
    try:
        for file in files:
            suffix = os.path.splitext(file.filename)[1] or ".mp3"
            tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
            content = await file.read()
            tmp.write(content)
            tmp.close()
            temp_files.append(tmp.name)

        output_filename = f"merged_{uuid.uuid4()}.{format}"
        output_path = os.path.join(tempfile.gettempdir(), output_filename)

        service.merge_audio(temp_files, output_path)

        return FileResponse(output_path, filename=output_filename)

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Best-effort cleanup of the input temp files; only swallow OS-level
        # errors rather than a bare `except:`.
        for p in temp_files:
            try:
                os.unlink(p)
            except OSError:
                pass
75
+
76
@router.post("/convert")
async def convert_audio(
    file: UploadFile = File(..., description="Audio file"),
    target_format: str = Form(..., description="Target format (mp3, wav, flac, ogg)"),
    service: AudioService = Depends(get_audio_service)
):
    """
    Transcode an uploaded audio file to the requested format.

    The input upload is spooled to a temp file which is always removed; the
    converted output is served via FileResponse.

    Raises:
        HTTPException: 500 if conversion fails.
    """
    suffix = os.path.splitext(file.filename)[1] or ".wav"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        shutil.copyfileobj(file.file, tmp)
        tmp_path = tmp.name

    try:
        output_path = service.convert_format(tmp_path, target_format)
        return FileResponse(
            output_path,
            filename=f"{os.path.splitext(file.filename)[0]}.{target_format}"
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Narrowed from a bare `except:` so real errors are not hidden.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
backend/app/api/routes/auth.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import secrets
from datetime import datetime, timedelta
from typing import List, Optional

from fastapi import APIRouter, Depends, HTTPException, Request, status
from fastapi.security import OAuth2PasswordRequestForm
from pydantic import BaseModel
from sqlalchemy.orm import Session

from ...core.limiter import limiter
from ...core.security import (
    create_access_token,
    get_password_hash,
    verify_password,
    get_current_active_user,
    ACCESS_TOKEN_EXPIRE_MINUTES,
)
from ...models import get_db, User, ApiKey
19
+
20
+ router = APIRouter(prefix="/auth", tags=["Authentication"])
21
+
22
+ # --- Schemas ---
23
class Token(BaseModel):
    """OAuth2 bearer-token response returned by /auth/login."""
    # Signed JWT access token.
    access_token: str
    # Always "bearer" for this API (see the login endpoint).
    token_type: str
26
+
27
class UserCreate(BaseModel):
    """Payload for user registration (/auth/register)."""
    email: str
    # Plaintext password; hashed by the register endpoint before storage.
    password: str
    # Optional display name. Pydantic v1 treated `str = None` as implicitly
    # optional; make the annotation explicit.
    full_name: Optional[str] = None
31
+
32
class UserOut(BaseModel):
    """Public user representation (no password hash)."""
    id: int
    email: str
    # Explicit Optional instead of pydantic v1's implicit `str = None`.
    full_name: Optional[str] = None
    is_active: bool

    class Config:
        # Allow building instances from SQLAlchemy ORM objects (pydantic v1).
        orm_mode = True
40
+
41
class ApiKeyCreate(BaseModel):
    """Payload for minting a new API key."""
    # Human-readable label for the key.
    name: str
43
+
44
class ApiKeyOut(BaseModel):
    """API-key representation returned to its owner (includes the raw key)."""
    key: str
    name: str
    created_at: datetime

    class Config:
        # Allow building instances from SQLAlchemy ORM objects (pydantic v1).
        orm_mode = True
51
+
52
+
53
+ # --- Endpoints ---
54
+
55
@router.post("/register", response_model=UserOut)
@limiter.limit("5/minute")
async def register(request: Request, user_in: UserCreate, db: Session = Depends(get_db)):
    """Create a new user account (rate-limited to 5 requests/minute)."""
    # Reject duplicate email addresses up front.
    if db.query(User).filter(User.email == user_in.email).first():
        raise HTTPException(status_code=400, detail="Email already registered")

    account = User(
        email=user_in.email,
        hashed_password=get_password_hash(user_in.password),
        full_name=user_in.full_name,
    )
    db.add(account)
    db.commit()
    db.refresh(account)
    return account
73
+
74
@router.post("/login", response_model=Token)
@limiter.limit("5/minute")
async def login(request: Request, form_data: OAuth2PasswordRequestForm = Depends(), db: Session = Depends(get_db)):
    """Exchange email/password credentials for a JWT access token."""
    account = db.query(User).filter(User.email == form_data.username).first()
    # Same error message for unknown email and wrong password.
    invalid = account is None or not verify_password(form_data.password, account.hashed_password)
    if invalid:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Incorrect email or password",
            headers={"WWW-Authenticate": "Bearer"},
        )

    token = create_access_token(
        subject=account.id,
        expires_delta=timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES),
    )
    return {"access_token": token, "token_type": "bearer"}
91
+
92
@router.post("/api-keys", response_model=ApiKeyOut)
async def create_api_key(
    key_in: ApiKeyCreate,
    current_user: User = Depends(get_current_active_user),
    db: Session = Depends(get_db)
):
    """Mint a new API key owned by the authenticated user."""
    # Cryptographically secure token; the "vf_" prefix makes VoiceForge keys
    # easy to identify in logs and configs.
    api_key_value = f"vf_{secrets.token_urlsafe(32)}"

    record = ApiKey(
        key=api_key_value,
        name=key_in.name,
        user_id=current_user.id,
    )
    db.add(record)
    db.commit()
    db.refresh(record)
    return record
112
+
113
@router.get("/me", response_model=UserOut)
async def read_users_me(current_user: User = Depends(get_current_active_user)):
    """Get current user details"""
    # The dependency already resolved the user from the bearer token;
    # nothing else to do here.
    return current_user
backend/app/api/routes/batch.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Batch Processing API Routes
3
+ Endpoints for submitting and managing batch transcription jobs
4
+ """
5
+
6
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends, BackgroundTasks
7
+ from fastapi.responses import FileResponse
8
+ from pydantic import BaseModel, Field
9
+ from typing import List, Optional, Dict, Any
10
+ import logging
11
+ import shutil
12
+ import os
13
+ import tempfile
14
+ from pathlib import Path
15
+
16
+ from app.services.batch_service import get_batch_service
17
+
18
+ logger = logging.getLogger(__name__)
19
+ router = APIRouter(prefix="/batch", tags=["batch"])
20
+
21
+
22
+ # Request/Response Models
23
class BatchJobResponse(BaseModel):
    """Batch job response model (serialized form of a batch-service job)."""
    job_id: str
    status: str
    # Numeric progress indicator (scale defined by the batch service —
    # TODO confirm whether 0-1 or 0-100).
    progress: float
    created_at: str
    total_files: int
    completed_files: int
    failed_files: int
    # True once a downloadable results ZIP exists.
    has_zip: bool
    # Optional per-file details (shape defined by BatchJob.to_dict()).
    files: Optional[Dict[str, Any]] = None
34
+
35
+
36
+ # Endpoints
37
@router.post("/transcribe", response_model=BatchJobResponse)
async def create_batch_job(
    background_tasks: BackgroundTasks,
    files: List[UploadFile] = File(..., description="Audio files to transcribe"),
    language: Optional[str] = Form(None, description="Language code (e.g., 'en', 'hi')"),
    output_format: str = Form("txt", description="Output format (txt, srt)"),
):
    """
    Submit a batch of audio files for transcription.

    1. Uploads multiple files
    2. Creates a batch job
    3. Starts processing in background

    Args:
        files: List of audio files
        language: Optional language code
        output_format: Output format (txt or srt)

    Returns:
        Created job details

    Raises:
        HTTPException: 400 for an empty or oversized batch, 500 on failure.
    """
    if not files:
        raise HTTPException(status_code=400, detail="No files provided")

    if len(files) > 50:
        raise HTTPException(status_code=400, detail="Maximum 50 files per batch")

    # Initialized BEFORE the try block so the error-cleanup path below can
    # never hit a NameError when an exception fires before any file is saved.
    file_paths = {}

    try:
        service = get_batch_service()

        original_names = []
        for file in files:
            suffix = Path(file.filename).suffix or ".wav"
            # Named temp file that persists until the background job (or the
            # error path below) deletes it.
            tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
            tmp.write(await file.read())
            tmp.close()

            file_paths[file.filename] = tmp.name
            original_names.append(file.filename)

        job = service.create_job(
            filenames=original_names,
            options={
                "language": language,
                "output_format": output_format,
            }
        )

        # NOTE: batch orchestration still runs via BackgroundTasks rather than
        # Celery — workers.tasks.process_audio_file handles single files,
        # while service.process_job owns the whole-batch logic (zipping etc.).
        background_tasks.add_task(
            service.process_job,
            job_id=job.job_id,
            file_paths=file_paths,
        )

        return job.to_dict()

    except Exception as e:
        # Remove any temp files already created for this failed batch;
        # narrow except instead of a bare `except:`.
        for path in file_paths.values():
            try:
                os.unlink(path)
            except OSError:
                pass
        logger.error(f"Batch job creation failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
123
+
124
+
125
@router.get("/jobs", response_model=List[BatchJobResponse])
async def list_jobs(limit: int = 10):
    """
    List the most recent batch jobs.

    Args:
        limit: Maximum number of jobs to return.

    Returns:
        Serialized job summaries.
    """
    service = get_batch_service()
    return [j.to_dict() for j in service.list_jobs(limit)]
139
+
140
+
141
@router.get("/{job_id}", response_model=BatchJobResponse)
async def get_job_status(job_id: str):
    """
    Look up a single batch job.

    Args:
        job_id: Job ID

    Returns:
        Job details and progress.

    Raises:
        HTTPException: 404 when no such job exists.
    """
    job = get_batch_service().get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")
    return job.to_dict()
159
+
160
+
161
@router.get("/{job_id}/download")
async def download_results(job_id: str):
    """
    Download a finished batch job's results as a ZIP archive.

    Args:
        job_id: Job ID

    Raises:
        HTTPException: 404 when the ZIP is not (yet) available.
    """
    zip_path = get_batch_service().get_zip_path(job_id)
    if not zip_path:
        raise HTTPException(status_code=404, detail="Results not available (job may be processing or failed)")

    return FileResponse(
        path=zip_path,
        filename=f"batch_{job_id}_results.zip",
        media_type="application/zip",
    )
183
+
184
+
185
@router.delete("/{job_id}")
async def delete_job(job_id: str):
    """
    Cancel (if running) and delete a batch job, cleaning up its files.

    Args:
        job_id: Job ID

    Raises:
        HTTPException: 404 when the job does not exist.
    """
    service = get_batch_service()

    # Stop any in-flight processing before removing the job's data.
    service.cancel_job(job_id)

    if not service.delete_job(job_id):
        raise HTTPException(status_code=404, detail="Job not found")

    return {"status": "deleted", "job_id": job_id}
backend/app/api/routes/cloning.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Voice Cloning API Routes
3
+ """
4
+
5
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form, Depends
6
+ from fastapi.responses import FileResponse
7
+ from typing import List, Optional
8
+ import os
9
+ import shutil
10
+ import tempfile
11
+ import uuid
12
+
13
+ from app.services.clone_service import get_clone_service, CloneService
14
+
15
+ router = APIRouter(prefix="/clone", tags=["Voice Cloning"])
16
+
17
@router.post("/synthesize")
async def clone_synthesize(
    text: str = Form(..., description="Text to speak"),
    language: str = Form("en", description="Language code (en, es, fr, de, etc.)"),
    files: List[UploadFile] = File(..., description="Reference audio samples (1-3 files, 3-10s each recommended)"),
    service: CloneService = Depends(get_clone_service)
):
    """
    Clone a voice from reference audio samples.

    Uses Coqui XTTS v2.
    WARNING: Heavy operation. May take 5-20 seconds depending on GPU.

    Raises:
        HTTPException: 400 without reference audio, 503 when the TTS library
        is missing, 500 on synthesis failure.
    """
    if not files:
        raise HTTPException(status_code=400, detail="At least one reference audio file is required")

    temp_files = []

    try:
        # Spool each reference sample to its own temp file for the service.
        for file in files:
            suffix = os.path.splitext(file.filename)[1] or ".wav"
            tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
            content = await file.read()
            tmp.write(content)
            tmp.close()
            temp_files.append(tmp.name)

        output_filename = f"cloned_{uuid.uuid4()}.wav"
        output_path = os.path.join(tempfile.gettempdir(), output_filename)

        service.clone_voice(
            text=text,
            speaker_wav_paths=temp_files,
            language=language,
            output_path=output_path
        )

        return FileResponse(
            output_path,
            filename="cloned_speech.wav",
            media_type="audio/wav"
        )

    except ImportError:
        raise HTTPException(status_code=503, detail="Voice Cloning service not available (TTS library missing)")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    finally:
        # Always remove the reference temp files; only ignore OS-level errors
        # instead of a bare `except:` that would swallow everything.
        for p in temp_files:
            try:
                os.unlink(p)
            except OSError:
                pass
        # Note: Output file cleanup needs management in prod (background task or stream)
78
+
79
@router.get("/languages")
def get_languages(service: CloneService = Depends(get_clone_service)):
    """List the language codes supported by the voice-cloning model."""
    supported = service.get_supported_languages()
    return {"languages": supported}
backend/app/api/routes/health.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Health Check Router
3
+ """
4
+
5
+ from fastapi import APIRouter
6
+
7
+ router = APIRouter(prefix="/health", tags=["Health"])
8
+
9
+
10
@router.get("")
@router.get("/")
async def health_check():
    """Liveness probe — static identity payload, no dependency checks."""
    payload = {
        "status": "healthy",
        "service": "voiceforge-api",
        "version": "1.0.0",
    }
    return payload
19
+
20
+
21
@router.get("/ready")
async def readiness_check():
    """Readiness probe.

    Dependency checks are still stubbed out.
    TODO: actually verify database, Redis and Google Cloud connectivity.
    """
    checks = {
        "database": "ok",
        "redis": "ok",
        "google_cloud": "ok",
    }
    return {"status": "ready", "checks": checks}
33
+
34
+
35
@router.get("/memory")
async def memory_status():
    """Report process memory usage and which Whisper models are resident."""
    # Imported lazily to avoid loading the heavy STT module at startup.
    from ...services.whisper_stt_service import (
        _whisper_models,
        _model_last_used,
        get_memory_usage_mb
    )
    import time

    now = time.time()
    models_info = {}

    for model_name in _whisper_models.keys():
        stamp = _model_last_used.get(model_name, 0)
        # A missing/zero timestamp means "never used" -> idle 0.
        idle = (now - stamp) if stamp else 0
        models_info[model_name] = {
            "loaded": True,
            "idle_seconds": round(idle, 1),
        }

    return {
        "memory_mb": round(get_memory_usage_mb(), 1),
        "loaded_models": list(_whisper_models.keys()),
        "models_detail": models_info,
    }
61
+
62
+
63
@router.post("/memory/cleanup")
async def cleanup_memory():
    """Evict idle Whisper models and report how much memory was reclaimed."""
    from ...services.whisper_stt_service import cleanup_idle_models, get_memory_usage_mb

    mem_before = get_memory_usage_mb()
    cleanup_idle_models()
    mem_after = get_memory_usage_mb()

    return {
        "memory_before_mb": round(mem_before, 1),
        "memory_after_mb": round(mem_after, 1),
        "freed_mb": round(mem_before - mem_after, 1),
    }
77
+
78
+
79
@router.post("/memory/unload-all")
async def unload_all():
    """Drop every loaded model to reclaim as much memory as possible."""
    from ...services.whisper_stt_service import unload_all_models, get_memory_usage_mb

    mem_before = get_memory_usage_mb()
    evicted = unload_all_models()
    mem_after = get_memory_usage_mb()

    return {
        "unloaded_models": evicted,
        "memory_before_mb": round(mem_before, 1),
        "memory_after_mb": round(mem_after, 1),
        "freed_mb": round(mem_before - mem_after, 1),
    }
backend/app/api/routes/sign.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sign Language API Routes
3
+ Provides WebSocket and REST endpoints for ASL recognition.
4
+ """
5
+
6
+ from fastapi import APIRouter, WebSocket, WebSocketDisconnect, UploadFile, File, HTTPException
7
+ from fastapi.responses import JSONResponse
8
+ import numpy as np
9
+ import base64
10
+ import cv2
11
+ import logging
12
+ from typing import List
13
+
14
+ from ...services.sign_recognition_service import get_sign_service, SignPrediction
15
+ from ...services.sign_avatar_service import get_avatar_service
16
+ from pydantic import BaseModel
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ router = APIRouter(prefix="/sign", tags=["Sign Language"])
21
+
22
class TextToSignRequest(BaseModel):
    """Request body for /sign/animate: plain text to finger-spell."""
    text: str
24
+
25
+
26
@router.get("/health")
async def sign_health():
    """Check if sign recognition service is available"""
    try:
        # Successful instantiation IS the check; the instance itself is
        # not needed (previously bound to an unused local).
        get_sign_service()
        return {"status": "ready", "service": "SignRecognitionService"}
    except Exception as e:
        return {"status": "error", "message": str(e)}
34
+
35
+
36
@router.post("/recognize")
async def recognize_sign(file: UploadFile = File(..., description="Image of hand sign")):
    """
    Recognize ASL letter from a single image.

    Upload an image containing a hand sign to get the predicted letter.

    Raises:
        HTTPException: 400 for an undecodable image, 500 on recognition failure.
    """
    try:
        # Read image
        contents = await file.read()
        nparr = np.frombuffer(contents, np.uint8)
        image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

        if image is None:
            raise HTTPException(status_code=400, detail="Invalid image file")

        # Get predictions
        service = get_sign_service()
        predictions = service.process_frame(image)

        if not predictions:
            return JSONResponse({
                "success": True,
                "predictions": [],
                "message": "No hands detected in image"
            })

        return JSONResponse({
            "success": True,
            "predictions": [
                {
                    "letter": p.letter,
                    "confidence": p.confidence
                }
                for p in predictions
            ]
        })

    except HTTPException:
        # Let deliberate HTTP errors (the 400 above) propagate unchanged
        # instead of being masked as a 500 by the generic handler below.
        raise
    except Exception as e:
        logger.error(f"Sign recognition error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
77
+
78
+
79
@router.websocket("/live")
async def sign_websocket(websocket: WebSocket):
    """
    WebSocket endpoint for real-time sign language recognition.

    Client sends base64-encoded JPEG frames, server responds with predictions.

    Protocol:
    - Client sends: {"frame": "<base64 jpeg>"}
    - Server sends: {"predictions": [{"letter": "A", "confidence": 0.8}]}

    Malformed messages get a {"error": ...} reply and the loop continues;
    only a disconnect or an unexpected server error ends the session.
    """
    await websocket.accept()
    service = get_sign_service()

    logger.info("Sign language WebSocket connected")

    try:
        while True:
            # Receive frame from client
            data = await websocket.receive_json()

            if "frame" not in data:
                await websocket.send_json({"error": "Missing 'frame' field"})
                continue

            # Decode base64 image; decode failures are reported per-frame
            # rather than terminating the connection.
            try:
                frame_data = base64.b64decode(data["frame"])
                nparr = np.frombuffer(frame_data, np.uint8)
                frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

                if frame is None:
                    await websocket.send_json({"error": "Invalid frame data"})
                    continue

            except Exception as e:
                await websocket.send_json({"error": f"Frame decode error: {e}"})
                continue

            # Process frame
            predictions = service.process_frame(frame)

            # Send results (confidence rounded for compact payloads)
            await websocket.send_json({
                "predictions": [
                    {
                        "letter": p.letter,
                        "confidence": round(p.confidence, 2)
                    }
                    for p in predictions
                ]
            })

    except WebSocketDisconnect:
        logger.info("Sign language WebSocket disconnected")
    except Exception as e:
        # 1011 = internal error per the WebSocket close-code registry.
        logger.error(f"WebSocket error: {e}")
        await websocket.close(code=1011, reason=str(e))
137
+
138
+
139
@router.get("/alphabet")
async def get_alphabet():
    """Enumerate the ASL letters the recognizer currently supports."""
    supported = list("ABCDILUVWY5")  # Currently implemented
    return {
        "supported_letters": supported,
        "note": "J and Z require motion tracking (coming soon)"
    }
146
+
147
+
148
@router.post("/animate")
async def animate_text(request: TextToSignRequest):
    """
    Convert text to sign language animation sequence (Finger Spelling).
    """
    try:
        avatar = get_avatar_service()
        glosses = avatar.text_to_glosses(request.text)
        return {
            "success": True,
            "sequence": glosses,
            "count": len(glosses),
        }
    except Exception as e:
        logger.error(f"Animation error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
backend/app/api/routes/stt.py ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Speech-to-Text API Router
3
+ """
4
+
5
+ import logging
6
+ from datetime import datetime
7
+ from typing import Optional, List
8
+
9
+ from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Depends, Request
10
+ from fastapi.responses import JSONResponse
11
+
12
+ from ...core.limiter import limiter
13
+
14
+ from ...services.stt_service import get_stt_service, STTService
15
+ from ...services.file_service import get_file_service, FileService
16
+ from ...schemas.stt import (
17
+ TranscriptionResponse,
18
+ TranscriptionRequest,
19
+ LanguageInfo,
20
+ LanguageListResponse,
21
+ )
22
+ from ...core.config import get_settings
23
+ from sqlalchemy.orm import Session
24
+ from ...models import get_db, AudioFile, Transcript
25
+ from ...workers.tasks import process_audio_file
26
+ from celery.result import AsyncResult
27
+ from ...schemas.stt import (
28
+ TranscriptionResponse,
29
+ TranscriptionRequest,
30
+ LanguageInfo,
31
+ LanguageListResponse,
32
+ AsyncTranscriptionResponse,
33
+ TaskStatusResponse,
34
+ )
35
+
36
+
37
+ logger = logging.getLogger(__name__)
38
+ router = APIRouter(prefix="/stt", tags=["Speech-to-Text"])
39
+ settings = get_settings()
40
+
41
+
42
@router.get("/languages", response_model=LanguageListResponse)
async def get_supported_languages(
    stt_service: STTService = Depends(get_stt_service),
):
    """Return the set of languages the STT backend can transcribe."""
    supported = stt_service.get_supported_languages()
    return LanguageListResponse(
        languages=supported,
        total=len(supported),
    )
54
+
55
+
56
+ @router.post("/upload", response_model=TranscriptionResponse)
57
+ @limiter.limit("10/minute")
58
+ async def transcribe_upload(
59
+ request: Request,
60
+ file: UploadFile = File(..., description="Audio file to transcribe"),
61
+ language: str = Form(default="en-US", description="Language code"),
62
+ enable_punctuation: bool = Form(default=True, description="Enable automatic punctuation"),
63
+ enable_word_timestamps: bool = Form(default=True, description="Include word-level timestamps"),
64
+ enable_diarization: bool = Form(default=False, description="Enable speaker diarization"),
65
+ speaker_count: Optional[int] = Form(default=None, description="Expected number of speakers"),
66
+ prompt: Optional[str] = Form(None, description="Custom vocabulary/keywords (e.g. 'VoiceForge, PyTorch')"),
67
+ stt_service: STTService = Depends(get_stt_service),
68
+ file_service: FileService = Depends(get_file_service),
69
+ db: Session = Depends(get_db),
70
+
71
+ ):
72
+ """
73
+ Transcribe an uploaded audio file
74
+
75
+ Supports: WAV, MP3, M4A, FLAC, OGG, WebM
76
+
77
+ For files longer than 1 minute, consider using the async endpoint.
78
+ """
79
+ # Validate file type
80
+ if not file.filename:
81
+ raise HTTPException(status_code=400, detail="No filename provided")
82
+
83
+ ext = file.filename.split(".")[-1].lower()
84
+ if ext not in settings.supported_audio_formats_list:
85
+ raise HTTPException(
86
+ status_code=400,
87
+ detail=f"Unsupported format: {ext}. Supported: {', '.join(settings.supported_audio_formats_list)}"
88
+ )
89
+
90
+ # Validate language
91
+ if language not in settings.supported_languages_list:
92
+ raise HTTPException(
93
+ status_code=400,
94
+ detail=f"Unsupported language: {language}. Supported: {', '.join(settings.supported_languages_list)}"
95
+ )
96
+
97
+ try:
98
+ # Read file content
99
+ content = await file.read()
100
+
101
+ # Save to storage
102
+ storage_path, metadata = file_service.save_upload(
103
+ file_content=content,
104
+ original_filename=file.filename,
105
+ )
106
+
107
+ logger.info(f"Processing transcription for {file.filename} ({len(content)} bytes)")
108
+
109
+ # Perform transcription
110
+ result = stt_service.transcribe_file(
111
+ audio_path=storage_path,
112
+ language=language,
113
+ enable_automatic_punctuation=enable_punctuation,
114
+ enable_word_time_offsets=enable_word_timestamps,
115
+ enable_speaker_diarization=enable_diarization,
116
+ diarization_speaker_count=speaker_count,
117
+ sample_rate=metadata.get("sample_rate"),
118
+ prompt=prompt, # Custom vocabulary
119
+ )
120
+
121
+ # Clean up temp file (optional - could keep for history)
122
+ # file_service.delete_file(storage_path)
123
+
124
+ # Save to database
125
+
126
+ try:
127
+ # 1. Create AudioFile record
128
+ audio_file = AudioFile(
129
+ storage_path=str(storage_path),
130
+ original_filename=file.filename,
131
+ duration=result.duration,
132
+ format=ext,
133
+ sample_rate=metadata.get("sample_rate"),
134
+ language=language,
135
+ detected_language=result.language,
136
+ status="done"
137
+ )
138
+ db.add(audio_file)
139
+ db.flush() # get ID
140
+
141
+ # 2. Create Transcript record
142
+ transcript = Transcript(
143
+ audio_file_id=audio_file.id,
144
+ raw_text=result.text,
145
+ processed_text=result.text, # initially same
146
+ segments=[s.model_dump() for s in result.segments] if result.segments else [],
147
+ language=result.language,
148
+ created_at=datetime.utcnow(),
149
+ )
150
+ db.add(transcript)
151
+ db.commit()
152
+ db.refresh(transcript)
153
+
154
+ # Return result with ID
155
+ response_data = result.model_dump()
156
+ response_data["id"] = transcript.id
157
+
158
+ # Explicitly validate to catch errors early
159
+ try:
160
+ return TranscriptionResponse(**response_data)
161
+ except Exception as e:
162
+ logger.error(f"Validation error for response: {e}")
163
+ logger.error(f"Response data: {response_data}")
164
+ raise HTTPException(status_code=500, detail=f"Response validation failed: {str(e)}")
165
+ return response
166
+
167
+ except Exception as e:
168
+ logger.error(f"Failed to save to DB: {e}")
169
+ # Don't fail the request if DB save fails, just return result
170
+ # But in production we might want to ensure persistence
171
+ return result
172
+
173
+ except FileNotFoundError as e:
174
+ logger.error(f"File error: {e}")
175
+ raise HTTPException(status_code=404, detail=str(e))
176
+ except ValueError as e:
177
+ logger.error(f"Validation error: {e}")
178
+ raise HTTPException(status_code=400, detail=str(e))
179
+ except Exception as e:
180
+ logger.exception(f"Transcription failed: {e}")
181
+ raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
182
+
183
+
184
+ @router.post("/upload/quality")
185
+ async def transcribe_quality(
186
+ file: UploadFile = File(..., description="Audio file to transcribe"),
187
+ language: str = Form(default="en-US", description="Language code"),
188
+ preprocess: bool = Form(default=False, description="Apply noise reduction (5-15% WER improvement)"),
189
+ prompt: Optional[str] = Form(None, description="Custom vocabulary/keywords"),
190
+ ):
191
+ """
192
+ High-quality transcription mode (optimized for accuracy).
193
+
194
+ Features:
195
+ - beam_size=5 for more accurate decoding (~40% fewer errors)
196
+ - condition_on_previous_text=False to reduce hallucinations
197
+ - Optional audio preprocessing for noisy environments
198
+
199
+ Trade-off: ~2x slower than standard mode
200
+ Best for: Important recordings, noisy audio, reduced error tolerance
201
+ """
202
+ from app.services.whisper_stt_service import get_whisper_stt_service
203
+ import tempfile
204
+ import os
205
+
206
+ # Validate file
207
+ if not file.filename:
208
+ raise HTTPException(status_code=400, detail="No filename provided")
209
+
210
+ try:
211
+ content = await file.read()
212
+
213
+ # Save to temp file
214
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
215
+ f.write(content)
216
+ temp_path = f.name
217
+
218
+ try:
219
+ stt_service = get_whisper_stt_service()
220
+ result = stt_service.transcribe_quality(
221
+ temp_path,
222
+ language=language,
223
+ preprocess=preprocess,
224
+ prompt=prompt,
225
+ )
226
+ return result
227
+ finally:
228
+ try:
229
+ os.unlink(temp_path)
230
+ except:
231
+ pass
232
+
233
+ except Exception as e:
234
+ logger.exception(f"Quality transcription failed: {e}")
235
+ raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
236
+
237
+
238
+ @router.post("/upload/batch")
239
+ async def transcribe_batch(
240
+ files: List[UploadFile] = File(..., description="Multiple audio files to transcribe"),
241
+ language: str = Form(default="en-US", description="Language code"),
242
+ batch_size: int = Form(default=8, description="Batch size (8 optimal for CPU)"),
243
+ ):
244
+ """
245
+ Batch transcription for high throughput.
246
+
247
+ Uses BatchedInferencePipeline for 2-3x speedup on concurrent files.
248
+
249
+ Best for: Processing multiple files, API with high concurrency
250
+ """
251
+ from app.services.whisper_stt_service import get_whisper_stt_service
252
+ import tempfile
253
+ import os
254
+
255
+ if not files:
256
+ raise HTTPException(status_code=400, detail="No files provided")
257
+
258
+ results = []
259
+ stt_service = get_whisper_stt_service()
260
+
261
+ for file in files:
262
+ if not file.filename:
263
+ continue
264
+
265
+ try:
266
+ content = await file.read()
267
+
268
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
269
+ f.write(content)
270
+ temp_path = f.name
271
+
272
+ try:
273
+ result = stt_service.transcribe_batched(
274
+ temp_path,
275
+ language=language,
276
+ batch_size=batch_size,
277
+ )
278
+ result["filename"] = file.filename
279
+ results.append(result)
280
+ finally:
281
+ try:
282
+ os.unlink(temp_path)
283
+ except:
284
+ pass
285
+
286
+ except Exception as e:
287
+ logger.error(f"Failed to transcribe {file.filename}: {e}")
288
+ results.append({
289
+ "filename": file.filename,
290
+ "error": str(e),
291
+ })
292
+
293
+ return {
294
+ "count": len(results),
295
+ "results": results,
296
+ "mode": "batched",
297
+ "batch_size": batch_size,
298
+ }
299
+
300
+
301
+ @router.post("/async-upload", response_model=AsyncTranscriptionResponse)
302
+ async def transcribe_async_upload(
303
+ file: UploadFile = File(..., description="Audio file to transcribe"),
304
+ language: str = Form(default="en-US", description="Language code"),
305
+ file_service: FileService = Depends(get_file_service),
306
+ db: Session = Depends(get_db),
307
+ ):
308
+ """
309
+ Asynchronously transcribe an uploaded audio file (Celery)
310
+ """
311
+ # Validate file type
312
+ if not file.filename:
313
+ raise HTTPException(status_code=400, detail="No filename provided")
314
+
315
+ ext = file.filename.split(".")[-1].lower()
316
+ if ext not in settings.supported_audio_formats_list:
317
+ raise HTTPException(
318
+ status_code=400,
319
+ detail=f"Unsupported format: {ext}"
320
+ )
321
+
322
+ try:
323
+ content = await file.read()
324
+ storage_path, metadata = file_service.save_upload(
325
+ file_content=content,
326
+ original_filename=file.filename,
327
+ )
328
+
329
+ # Create AudioFile record with 'queued' status
330
+ audio_file = AudioFile(
331
+ storage_path=str(storage_path),
332
+ original_filename=file.filename,
333
+ duration=0.0, # Will be updated by worker
334
+ format=ext,
335
+ sample_rate=metadata.get("sample_rate"),
336
+ language=language,
337
+ status="queued"
338
+ )
339
+ db.add(audio_file)
340
+ db.commit()
341
+ db.refresh(audio_file)
342
+
343
+ # Trigger Celery Task
344
+ task = process_audio_file.delay(audio_file.id)
345
+
346
+ return AsyncTranscriptionResponse(
347
+ task_id=task.id,
348
+ audio_file_id=audio_file.id,
349
+ status="queued",
350
+ message="File uploaded and queued for processing"
351
+ )
352
+
353
+ except Exception as e:
354
+ logger.exception(f"Async upload failed: {e}")
355
+ raise HTTPException(status_code=500, detail=str(e))
356
+
357
+
358
+ @router.get("/tasks/{task_id}", response_model=TaskStatusResponse)
359
+ async def get_task_status(task_id: str, db: Session = Depends(get_db)):
360
+ """
361
+ Check status of an async transcription task
362
+ """
363
+ task_result = AsyncResult(task_id)
364
+
365
+ response = TaskStatusResponse(
366
+ task_id=task_id,
367
+ status=task_result.status.lower(),
368
+ created_at=datetime.utcnow(), # Approximate or fetch from DB tracked tasks
369
+ updated_at=datetime.utcnow()
370
+ )
371
+
372
+ if task_result.successful():
373
+ # If successful, the result of the task function isn't returned directly
374
+ # because process_audio_file returns None (it saves to DB).
375
+ # We need to find the Transcript associated with this task if possible.
376
+ # Ideally, we should store task_id in AudioFile or Transcript to link them.
377
+ # For now, we just report completion.
378
+ response.status = "completed"
379
+ response.progress = 100.0
380
+ elif task_result.failed():
381
+ response.status = "failed"
382
+ response.error = str(task_result.result)
383
+ elif task_result.state == 'PROGRESS':
384
+ response.status = "processing"
385
+ # If we had progress updating in the task, we could read it here
386
+
387
+ return response
388
+
389
+
390
+ @router.post("/transcribe-bytes", response_model=TranscriptionResponse)
391
+ async def transcribe_bytes(
392
+ audio_content: bytes,
393
+ language: str = "en-US",
394
+ encoding: str = "LINEAR16",
395
+ sample_rate: int = 16000,
396
+ stt_service: STTService = Depends(get_stt_service),
397
+ ):
398
+ """
399
+ Transcribe raw audio bytes (for streaming/real-time use)
400
+
401
+ This endpoint is primarily for internal use or advanced clients
402
+ that send pre-processed audio data.
403
+ """
404
+ try:
405
+ result = stt_service.transcribe_bytes(
406
+ audio_content=audio_content,
407
+ language=language,
408
+ encoding=encoding,
409
+ sample_rate=sample_rate,
410
+ )
411
+ return result
412
+ except Exception as e:
413
+ logger.exception(f"Transcription failed: {e}")
414
+ raise HTTPException(status_code=500, detail=str(e))
415
+
416
+
417
+ # TODO: WebSocket endpoint for real-time streaming
418
+ # @router.websocket("/stream")
419
+ # async def stream_transcription(websocket: WebSocket):
420
+ # """Real-time streaming transcription via WebSocket"""
421
+ # pass
422
+
423
+ @router.post("/upload/diarize")
424
+ async def diarize_audio(
425
+ file: UploadFile = File(..., description="Audio file to diarize"),
426
+ num_speakers: Optional[int] = Form(None, description="Exact number of speakers (optional)"),
427
+ min_speakers: Optional[int] = Form(None, description="Minimum number of speakers (optional)"),
428
+ max_speakers: Optional[int] = Form(None, description="Maximum number of speakers (optional)"),
429
+ language: Optional[str] = Form(None, description="Language code (e.g., 'en'). Auto-detected if not provided."),
430
+ preprocess: bool = Form(False, description="Apply noise reduction before processing (improves accuracy for noisy audio)"),
431
+ ):
432
+ """
433
+ Perform Speaker Diarization ("Who said what").
434
+
435
+ Uses faster-whisper for transcription + pyannote.audio for speaker identification.
436
+
437
+ Requires:
438
+ - HF_TOKEN in .env for Pyannote model access
439
+
440
+ Returns:
441
+ - segments: List of segments with timestamps, text, and speaker labels
442
+ - speaker_stats: Speaking time per speaker
443
+ - language: Detected/specified language
444
+ """
445
+ from app.services.diarization_service import get_diarization_service
446
+ import tempfile
447
+ import os
448
+
449
+ if not file.filename:
450
+ raise HTTPException(status_code=400, detail="No filename provided")
451
+
452
+ try:
453
+ # Save temp file
454
+ content = await file.read()
455
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
456
+ f.write(content)
457
+ temp_path = f.name
458
+
459
+ try:
460
+ service = get_diarization_service()
461
+ result = service.process_audio(
462
+ temp_path,
463
+ num_speakers=num_speakers,
464
+ min_speakers=min_speakers,
465
+ max_speakers=max_speakers,
466
+ language=language,
467
+ preprocess=preprocess,
468
+ )
469
+ return result
470
+
471
+ except ValueError as e:
472
+ # Token missing
473
+ raise HTTPException(status_code=400, detail=str(e))
474
+ except ImportError as e:
475
+ # Not installed
476
+ raise HTTPException(status_code=503, detail=str(e))
477
+ except Exception as e:
478
+ logger.exception("Diarization error")
479
+ raise HTTPException(status_code=500, detail=f"Diarization failed: {str(e)}")
480
+
481
+ finally:
482
+ try:
483
+ os.unlink(temp_path)
484
+ except:
485
+ pass
486
+
487
+ except Exception as e:
488
+ logger.error(f"Diarization request failed: {e}")
489
+ raise HTTPException(status_code=500, detail=str(e))
backend/app/api/routes/transcripts.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Transcript Management Routes
3
+ CRUD operations and Export
4
+ """
5
+
6
+ from typing import List, Optional
7
+ from fastapi import APIRouter, Depends, HTTPException, Response, Query, UploadFile, File, Form
8
+ from sqlalchemy.orm import Session
9
+ from datetime import datetime
10
+
11
+ from ...models import get_db, Transcript, AudioFile
12
+ from ...schemas.transcript import TranscriptResponse, TranscriptUpdate
13
+ from ...services.nlp_service import get_nlp_service, NLPService
14
+ from ...services.export_service import ExportService
15
+
16
+
17
+ router = APIRouter(prefix="/transcripts", tags=["Transcripts"])
18
+
19
+
20
+ @router.get("", response_model=List[TranscriptResponse])
21
+ async def list_transcripts(
22
+ skip: int = 0,
23
+ limit: int = 100,
24
+ db: Session = Depends(get_db),
25
+ ):
26
+ """List all transcripts"""
27
+ transcripts = db.query(Transcript).order_by(Transcript.created_at.desc()).offset(skip).limit(limit).all()
28
+ return transcripts
29
+
30
+
31
+ @router.get("/{transcript_id}", response_model=TranscriptResponse)
32
+ async def get_transcript(
33
+ transcript_id: int,
34
+ db: Session = Depends(get_db),
35
+ ):
36
+ """Get specific transcript details"""
37
+ transcript = db.query(Transcript).filter(Transcript.id == transcript_id).first()
38
+ if not transcript:
39
+ raise HTTPException(status_code=404, detail="Transcript not found")
40
+ return transcript
41
+
42
+
43
+ @router.post("/{transcript_id}/analyze")
44
+ async def analyze_transcript(
45
+ transcript_id: int,
46
+ db: Session = Depends(get_db),
47
+ nlp_service: NLPService = Depends(get_nlp_service),
48
+ ):
49
+ """Run NLP analysis on a transcript"""
50
+ transcript = db.query(Transcript).filter(Transcript.id == transcript_id).first()
51
+ if not transcript:
52
+ raise HTTPException(status_code=404, detail="Transcript not found")
53
+
54
+ if not transcript.processed_text:
55
+ raise HTTPException(status_code=400, detail="Transcript has no text content")
56
+
57
+ # Run analysis
58
+ analysis = nlp_service.process_transcript(transcript.processed_text)
59
+
60
+ # Update DB
61
+ transcript.sentiment = analysis["sentiment"]
62
+ transcript.topics = {"keywords": analysis["keywords"]}
63
+ transcript.summary = analysis["summary"]
64
+ transcript.updated_at = datetime.utcnow()
65
+
66
+ db.commit()
67
+ db.refresh(transcript)
68
+
69
+ return {
70
+ "status": "success",
71
+ "analysis": analysis
72
+ }
73
+
74
+
75
+ @router.get("/{transcript_id}/export")
76
+ async def export_transcript(
77
+ transcript_id: int,
78
+ format: str = Query(..., regex="^(txt|srt|vtt|pdf)$"),
79
+ db: Session = Depends(get_db),
80
+ ):
81
+ """
82
+ Export transcript to specific format
83
+ """
84
+ transcript = db.query(Transcript).filter(Transcript.id == transcript_id).first()
85
+ if not transcript:
86
+ raise HTTPException(status_code=404, detail="Transcript not found")
87
+
88
+ # Convert model to dict for service
89
+ data = {
90
+ "id": transcript.id,
91
+ "text": transcript.processed_text,
92
+ "created_at": str(transcript.created_at),
93
+ "duration": 0,
94
+ "segments": transcript.segments,
95
+ "words": [],
96
+ "sentiment": transcript.sentiment,
97
+ }
98
+
99
+ if format == "txt":
100
+ content = ExportService.to_txt(data)
101
+ media_type = "text/plain"
102
+ elif format == "srt":
103
+ content = ExportService.to_srt(data)
104
+ media_type = "text/plain"
105
+ elif format == "vtt":
106
+ content = ExportService.to_vtt(data)
107
+ media_type = "text/vtt"
108
+ elif format == "pdf":
109
+ content = ExportService.to_pdf(data)
110
+ media_type = "application/pdf"
111
+ else:
112
+ raise HTTPException(status_code=400, detail="Unsupported format")
113
+
114
+ return Response(
115
+ content=content,
116
+ media_type=media_type,
117
+ headers={
118
+ "Content-Disposition": f'attachment; filename="transcript_{transcript_id}.{format}"'
119
+ }
120
+ )
121
+ @router.post("/meeting")
122
+ async def process_meeting(
123
+ file: UploadFile = File(..., description="Audio recording of meeting"),
124
+ num_speakers: Optional[int] = Form(None, description="Number of speakers (hint)"),
125
+ language: Optional[str] = Form(None, description="Language code"),
126
+ db: Session = Depends(get_db),
127
+ ):
128
+ """
129
+ Process a meeting recording:
130
+ 1. Diarization (Who spoke when)
131
+ 2. Transcription (What was said)
132
+ 3. NLP Analysis (Summary, Action Items, Sentiment)
133
+ 4. Save to DB
134
+ """
135
+ import shutil
136
+ import os
137
+ import tempfile
138
+ from ...services.meeting_service import get_meeting_service
139
+
140
+ # Save upload to temp file
141
+ suffix = os.path.splitext(file.filename)[1] or ".wav"
142
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
143
+ shutil.copyfileobj(file.file, tmp)
144
+ tmp_path = tmp.name
145
+
146
+ try:
147
+ meeting_service = get_meeting_service()
148
+
149
+ # Run full pipeline
150
+ # This can be slow (minutes) so strictly speaking should be a background task
151
+ # But for this MVP level we'll do it synchronously with a long timeout
152
+ result = meeting_service.process_meeting(
153
+ audio_path=tmp_path,
154
+ num_speakers=num_speakers,
155
+ language=language
156
+ )
157
+
158
+ # Save to DB
159
+ # Create AudioFile record first
160
+ audio_file = AudioFile(
161
+ filename=file.filename,
162
+ filepath="processed_in_memory", # We delete temp file, so no perm path
163
+ duration=result["metadata"]["duration_seconds"],
164
+ file_size=0,
165
+ format=suffix.replace(".", "")
166
+ )
167
+ db.add(audio_file)
168
+ db.commit()
169
+ db.refresh(audio_file)
170
+
171
+ # Create Transcript record
172
+ transcript = Transcript(
173
+ audio_file_id=audio_file.id,
174
+ raw_text=result["raw_text"],
175
+ processed_text=result["raw_text"],
176
+ segments=result["transcript_segments"],
177
+ sentiment=result["sentiment"],
178
+ topics={"keywords": result["topics"]},
179
+ action_items=result["action_items"],
180
+ attendees=result["metadata"]["attendees"],
181
+ summary=result["summary"],
182
+ language=result["metadata"]["language"],
183
+ confidence=0.95, # Estimated
184
+ duration=result["metadata"]["duration_seconds"],
185
+ created_at=datetime.utcnow()
186
+ )
187
+ db.add(transcript)
188
+ db.commit()
189
+ db.refresh(transcript)
190
+
191
+ return result
192
+
193
+ except Exception as e:
194
+ raise HTTPException(status_code=500, detail=str(e))
195
+ finally:
196
+ # Cleanup
197
+ try:
198
+ os.unlink(tmp_path)
199
+ except:
200
+ pass
backend/app/api/routes/translation.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Translation API Routes
3
+ Endpoints for text and audio translation services
4
+ """
5
+
6
+ from fastapi import APIRouter, HTTPException, UploadFile, File, Form
7
+ from pydantic import BaseModel, Field
8
+ from typing import Optional, List
9
+ import logging
10
+
11
+ from app.services.translation_service import get_translation_service
12
+
13
+ logger = logging.getLogger(__name__)
14
+ router = APIRouter(prefix="/translation", tags=["translation"])
15
+
16
+
17
+ # Request/Response Models
18
class TranslateTextRequest(BaseModel):
    """Request model for text translation."""
    # Text to translate; length bounded to keep model inference time reasonable
    text: str = Field(..., min_length=1, max_length=5000, description="Text to translate")
    # Source language code (e.g., 'hi', 'en-US')
    source_lang: str = Field(..., description="Source language code (e.g., 'hi', 'en-US')")
    # Target language code (e.g., 'en', 'es')
    target_lang: str = Field(..., description="Target language code (e.g., 'en', 'es')")
    # When True, the service may route through English for pairs without a direct model
    use_pivot: bool = Field(default=True, description="Use English as pivot for unsupported pairs")
24
+
25
+
26
class TranslateTextResponse(BaseModel):
    """Response model for text translation."""
    # Translation output text
    translated_text: str
    # Echo of the request's source language code
    source_lang: str
    # Echo of the request's target language code
    target_lang: str
    # Original input text
    source_text: str
    # Time spent translating (presumably seconds — confirm with service)
    processing_time: float
    # Word count (source or translated text — TODO confirm which against the service)
    word_count: int
    # True when the English pivot path was used
    pivot_used: Optional[bool] = False
    # Intermediate English text when pivoting, else None
    intermediate_text: Optional[str] = None
    # Identifier of the translation model that produced the result
    model_used: Optional[str] = None
37
+
38
+
39
class LanguageInfo(BaseModel):
    """Language information model."""
    # Language code (e.g. "en")
    code: str
    # Display name of the language
    name: str
    # Flag string for UI display (presumably an emoji — confirm with service data)
    flag: str
    # Name of the language in the language itself
    native: str
45
+
46
+
47
class TranslationPair(BaseModel):
    """Translation pair model (one source -> target combination)."""
    # Pair identifier (format defined by the translation service)
    code: str
    # Source-language metadata
    source: LanguageInfo
    # Target-language metadata
    target: LanguageInfo
52
+
53
+
54
class DetectLanguageResponse(BaseModel):
    """Response model for language detection."""
    # Best-guess language code for the input text
    detected_language: str
    # Detector confidence score (range defined by the detection backend)
    confidence: float
    # Optional extra metadata for the detected language
    language_info: Optional[dict] = None
    # Optional per-language probability breakdown
    all_probabilities: Optional[List[dict]] = None
60
+
61
+
62
+ # Endpoints
63
+ @router.get("/languages", response_model=List[LanguageInfo])
64
+ async def get_supported_languages():
65
+ """
66
+ Get list of all supported languages.
67
+
68
+ Returns:
69
+ List of supported languages with metadata
70
+ """
71
+ service = get_translation_service()
72
+ return service.get_supported_languages()
73
+
74
+
75
+ @router.get("/pairs")
76
+ async def get_supported_pairs():
77
+ """
78
+ Get list of all supported translation pairs.
79
+
80
+ Returns:
81
+ List of supported source->target language pairs
82
+ """
83
+ service = get_translation_service()
84
+ return {
85
+ "pairs": service.get_supported_pairs(),
86
+ "total": len(service.get_supported_pairs()),
87
+ }
88
+
89
+
90
+ @router.post("/text", response_model=TranslateTextResponse)
91
+ async def translate_text(request: TranslateTextRequest):
92
+ """
93
+ Translate text from source to target language.
94
+
95
+ - Uses Helsinki-NLP MarianMT models (~300MB per language pair)
96
+ - Supports pivot translation through English for unsupported pairs
97
+ - First request for a language pair may take longer (model loading)
98
+
99
+ Args:
100
+ request: Translation request with text and language codes
101
+
102
+ Returns:
103
+ Translated text with metadata
104
+ """
105
+ service = get_translation_service()
106
+
107
+ try:
108
+ if request.use_pivot:
109
+ result = service.translate_with_pivot(
110
+ text=request.text,
111
+ source_lang=request.source_lang,
112
+ target_lang=request.target_lang,
113
+ )
114
+ else:
115
+ result = service.translate_text(
116
+ text=request.text,
117
+ source_lang=request.source_lang,
118
+ target_lang=request.target_lang,
119
+ )
120
+
121
+ return TranslateTextResponse(**result)
122
+
123
+ except ValueError as e:
124
+ raise HTTPException(status_code=400, detail=str(e))
125
+ except Exception as e:
126
+ logger.error(f"Translation error: {e}")
127
+ raise HTTPException(status_code=500, detail=f"Translation failed: {str(e)}")
128
+
129
+
130
+ @router.post("/detect", response_model=DetectLanguageResponse)
131
+ async def detect_language(text: str = Form(..., min_length=10, description="Text to analyze")):
132
+ """
133
+ Detect the language of input text.
134
+
135
+ Args:
136
+ text: Text to analyze (minimum 10 characters for accuracy)
137
+
138
+ Returns:
139
+ Detected language with confidence score
140
+ """
141
+ service = get_translation_service()
142
+ result = service.detect_language(text)
143
+
144
+ if result.get("error"):
145
+ raise HTTPException(status_code=400, detail=result["error"])
146
+
147
+ return DetectLanguageResponse(**result)
148
+
149
+
150
+ @router.get("/model-info")
151
+ async def get_model_info():
152
+ """
153
+ Get information about loaded translation models.
154
+
155
+ Returns:
156
+ Model loading status and supported pairs
157
+ """
158
+ service = get_translation_service()
159
+ return service.get_model_info()
160
+
161
+
162
+ @router.post("/audio")
163
+ async def translate_audio(
164
+ file: UploadFile = File(..., description="Audio file to translate"),
165
+ source_lang: str = Form(..., description="Source language code"),
166
+ target_lang: str = Form(..., description="Target language code"),
167
+ generate_audio: bool = Form(default=True, description="Generate TTS output"),
168
+ ):
169
+ """
170
+ Full audio translation pipeline: STT → Translate → TTS
171
+
172
+ 1. Transcribe audio using Whisper
173
+ 2. Translate text using MarianMT
174
+ 3. Optionally generate speech in target language
175
+
176
+ Args:
177
+ file: Audio file (WAV, MP3, etc.)
178
+ source_lang: Source language code
179
+ target_lang: Target language code
180
+ generate_audio: Whether to generate TTS output
181
+
182
+ Returns:
183
+ Transcription, translation, and optional audio response
184
+ """
185
+ import tempfile
186
+ import os
187
+ from app.services.whisper_stt_service import get_whisper_stt_service
188
+ from app.services.edge_tts_service import get_edge_tts_service
189
+
190
+ translation_service = get_translation_service()
191
+ stt_service = get_whisper_stt_service()
192
+ tts_service = get_edge_tts_service()
193
+
194
+ # Save uploaded file
195
+ suffix = os.path.splitext(file.filename)[1] or ".wav"
196
+ with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
197
+ content = await file.read()
198
+ tmp.write(content)
199
+ tmp_path = tmp.name
200
+
201
+ try:
202
+ # Step 1: Transcribe
203
+ transcription = stt_service.transcribe_file(tmp_path, language=source_lang)
204
+ source_text = transcription["text"]
205
+
206
+ if not source_text.strip():
207
+ raise HTTPException(status_code=400, detail="No speech detected in audio")
208
+
209
+ # Step 2: Translate
210
+ translation = translation_service.translate_with_pivot(
211
+ text=source_text,
212
+ source_lang=source_lang,
213
+ target_lang=target_lang,
214
+ )
215
+ translated_text = translation["translated_text"]
216
+
217
+ # Step 3: Generate TTS (optional)
218
+ audio_base64 = None
219
+ if generate_audio:
220
+ # Map language code to voice
221
+ voice_map = {
222
+ "en": "en-US-AriaNeural",
223
+ "hi": "hi-IN-SwaraNeural",
224
+ "es": "es-ES-ElviraNeural",
225
+ "fr": "fr-FR-DeniseNeural",
226
+ "de": "de-DE-KatjaNeural",
227
+ "zh": "zh-CN-XiaoxiaoNeural",
228
+ "ja": "ja-JP-NanamiNeural",
229
+ "ko": "ko-KR-SunHiNeural",
230
+ "ar": "ar-SA-ZariyahNeural",
231
+ "ru": "ru-RU-SvetlanaNeural",
232
+ }
233
+ target_code = target_lang.split("-")[0].lower()
234
+ voice = voice_map.get(target_code, "en-US-AriaNeural")
235
+
236
+ audio_bytes = tts_service.synthesize_sync(translated_text, voice=voice)
237
+
238
+ import base64
239
+ audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
240
+
241
+ return {
242
+ "source_text": source_text,
243
+ "translated_text": translated_text,
244
+ "source_lang": source_lang,
245
+ "target_lang": target_lang,
246
+ "transcription_time": transcription["processing_time"],
247
+ "translation_time": translation["processing_time"],
248
+ "audio_base64": audio_base64,
249
+ "audio_format": "mp3" if audio_base64 else None,
250
+ }
251
+
252
+ except HTTPException:
253
+ raise
254
+ except Exception as e:
255
+ logger.error(f"Audio translation failed: {e}")
256
+ raise HTTPException(status_code=500, detail=str(e))
257
+ finally:
258
+ try:
259
+ os.unlink(tmp_path)
260
+ except:
261
+ pass
backend/app/api/routes/tts.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text-to-Speech API Router
3
+ """
4
+
5
+ import base64
6
+ import logging
7
+ from typing import Optional
8
+ from fastapi import APIRouter, HTTPException, Depends, Response, Request
9
+ from fastapi.responses import StreamingResponse
10
+ from io import BytesIO
11
+
12
+ from ...core.limiter import limiter
13
+
14
+ from ...services.tts_service import get_tts_service, TTSService
15
+ from ...schemas.tts import (
16
+ SynthesisRequest,
17
+ SynthesisResponse,
18
+ VoiceInfo,
19
+ VoiceListResponse,
20
+ VoicePreviewRequest,
21
+ )
22
+ from ...core.config import get_settings
23
+
24
+ logger = logging.getLogger(__name__)
25
+ router = APIRouter(prefix="/tts", tags=["Text-to-Speech"])
26
+ settings = get_settings()
27
+
28
+
29
@router.get("/voices", response_model=VoiceListResponse)
async def get_voices(
    language: Optional[str] = None,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    List the TTS voices known to the service.

    When ``language`` is supplied (e.g. "en-US", "es", "fr") the result
    is filtered to voices matching that language code.
    """
    voice_list = await tts_service.get_voices(language_code=language)
    return voice_list
40
+
41
+
42
@router.get("/voices/{language}", response_model=VoiceListResponse)
async def get_voices_by_language(
    language: str,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Return the voices available for one language.

    Accepts a full locale ("en-US") or a bare prefix ("en") that matches
    at least one supported locale; anything else gets a 400.
    """
    supported = settings.supported_languages_list
    if language not in supported:
        # Fall back to prefix matching, e.g. "en" covers "en-US"/"en-GB".
        if not any(code.startswith(language) for code in supported):
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported language: {language}"
            )

    return await tts_service.get_voices(language_code=language)
60
+
61
+
62
@router.post("/synthesize", response_model=SynthesisResponse)
@limiter.limit("10/minute")
async def synthesize_speech(
    request: Request,
    request_body: SynthesisRequest,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Synthesize text to speech.

    The response carries the audio as base64 in ``audio_content`` plus
    synthesis metadata; decode that field to obtain the raw audio bytes.
    Rate limited to 10 requests/minute per client IP.
    """
    # Reject oversized payloads before touching the TTS backend.
    if len(request_body.text) > 5000:
        raise HTTPException(
            status_code=400,
            detail="Text too long. Maximum 5000 characters."
        )

    # Compare on the base language ("en" from "en-US") so any regional
    # variant of a supported language is accepted.
    requested_base = request_body.language.split("-")[0]
    known_bases = {code.split("-")[0] for code in settings.supported_languages_list}
    if requested_base not in known_bases:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported language: {request_body.language}"
        )

    try:
        return await tts_service.synthesize(request_body)
    except ValueError as e:
        # Service-level validation problems map to a client error.
        logger.error(f"Synthesis validation error: {e}")
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.exception(f"Synthesis failed: {e}")
        raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}")
100
+
101
+
102
@router.post("/stream")
async def stream_speech(
    request: SynthesisRequest,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Stream text-to-speech audio

    Returns a chunked audio stream (audio/mpeg) for immediate playback.
    Best for long text to reduce latency (TTFB).
    """
    try:
        # NOTE(review): this except only covers *constructing* the
        # response. synthesize_stream is consumed lazily, so errors raised
        # mid-stream occur after headers are sent and never reach this
        # handler — confirm clients tolerate a truncated stream.
        return StreamingResponse(
            tts_service.synthesize_stream(request),
            media_type="audio/mpeg"
        )
    except Exception as e:
        logger.exception(f"Streaming synthesis failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
121
+
122
+
123
@router.post("/ssml")
async def synthesize_ssml(
    text: str,
    voice: str = "en-US-AriaNeural",
    rate: str = "medium",
    pitch: str = "medium",
    emphasis: Optional[str] = None,
    auto_breaks: bool = True,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Synthesize speech with SSML prosody control.

    Supported knobs:
    - rate: 'x-slow', 'slow', 'medium', 'fast', 'x-fast'
    - pitch: 'x-low', 'low', 'medium', 'high', 'x-high'
    - emphasis: 'reduced', 'moderate', 'strong'
    - auto_breaks: insert natural pauses at punctuation

    Responds with an audio/mpeg body rendered inline.
    """
    try:
        from ...services.edge_tts_service import get_edge_tts_service

        edge = get_edge_tts_service()

        # Wrap the plain text in an SSML document carrying the requested
        # prosody settings.
        ssml_document = edge.build_ssml(
            text=text,
            voice=voice,
            rate=rate,
            pitch=pitch,
            emphasis=emphasis,
            breaks=auto_breaks
        )

        mp3_bytes = await edge.synthesize_ssml(ssml_document, voice)

        return Response(
            content=mp3_bytes,
            media_type="audio/mpeg",
            headers={"Content-Disposition": "inline; filename=speech.mp3"}
        )
    except Exception as e:
        logger.exception(f"SSML synthesis failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
169
+
170
+
171
@router.post("/synthesize/audio")
async def synthesize_audio_file(
    request: SynthesisRequest,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Synthesize text and return the audio file directly.

    Unlike /synthesize (base64 JSON), this endpoint streams the decoded
    audio back as a downloadable attachment.
    """
    try:
        result = await tts_service.synthesize(request)

        # The service returns base64 text; recover the raw audio bytes.
        audio_bytes = base64.b64decode(result.audio_content)

        # Pick the MIME type matching the encoding used upstream.
        mime_by_encoding = {
            "MP3": "audio/mpeg",
            "LINEAR16": "audio/wav",
            "OGG_OPUS": "audio/ogg",
        }
        media_type = mime_by_encoding.get(result.encoding, "audio/mpeg")

        return StreamingResponse(
            BytesIO(audio_bytes),
            media_type=media_type,
            headers={
                "Content-Disposition": f'attachment; filename="speech.{result.encoding.lower()}"',
                "Content-Length": str(result.audio_size),
            }
        )
    except Exception as e:
        logger.exception(f"Audio synthesis failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
207
+
208
+
209
@router.post("/preview")
async def preview_voice(
    request: VoicePreviewRequest,
    tts_service: TTSService = Depends(get_tts_service),
):
    """
    Generate a short preview of a voice.

    Looks up the requested voice, synthesizes a short sample with it, and
    streams the audio (audio/mpeg) back for a voice-selection UI.

    Raises:
        HTTPException 404: the requested voice does not exist.
        HTTPException 500: synthesis failed.
    """
    # BUG FIX: get_voices() and synthesize() are coroutines (they are
    # awaited at every other call site in this module); calling them
    # without `await` returned coroutine objects, so `.voices` raised and
    # this endpoint always failed.
    voices = (await tts_service.get_voices(language_code=None)).voices
    voice_info = next((v for v in voices if v.name == request.voice), None)

    if not voice_info:
        raise HTTPException(status_code=404, detail=f"Voice not found: {request.voice}")

    # Build a normal synthesis request around the preview text.
    synth_request = SynthesisRequest(
        text=request.text or "Hello! This is a preview of my voice.",
        language=voice_info.language_code,
        voice=request.voice,
        audio_encoding="MP3",
    )

    try:
        result = await tts_service.synthesize(synth_request)

        # Decode the base64 payload and stream the raw MP3 back.
        audio_bytes = base64.b64decode(result.audio_content)
        return StreamingResponse(
            BytesIO(audio_bytes),
            media_type="audio/mpeg",
        )
    except Exception as e:
        logger.exception(f"Preview failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))
backend/app/api/routes/ws.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ WebSocket Router for Real-Time Transcription
3
+ """
4
+
5
+ import logging
6
+ import json
7
+ from typing import Dict
8
+ from fastapi import APIRouter, WebSocket, WebSocketDisconnect
9
+
10
+ logger = logging.getLogger(__name__)
11
+ router = APIRouter(prefix="/ws", tags=["WebSocket"])
12
+
13
+
14
class ConnectionManager:
    """Tracks live WebSocket connections keyed by client id."""

    def __init__(self):
        # client_id -> accepted WebSocket
        self.active_connections: Dict[str, WebSocket] = {}

    async def connect(self, client_id: str, websocket: WebSocket):
        """Accept the handshake and register the socket under client_id."""
        await websocket.accept()
        self.active_connections[client_id] = websocket
        logger.info(f"Client {client_id} connected")

    def disconnect(self, client_id: str):
        """Forget a client; a no-op for unknown ids."""
        removed = self.active_connections.pop(client_id, None)
        if removed is not None:
            logger.info(f"Client {client_id} disconnected")

    async def send_json(self, client_id: str, data: dict):
        """Push a JSON payload to a client if it is still registered."""
        ws = self.active_connections.get(client_id)
        if ws is not None:
            await ws.send_json(data)
33
+
34
+
35
+ manager = ConnectionManager()
36
+
37
+
38
@router.websocket("/transcription/{client_id}")
async def websocket_transcription(websocket: WebSocket, client_id: str):
    """
    Real-time streaming transcription via WebSocket with VAD.

    Accepts the connection, then hands the receive loop to StreamManager,
    which invokes ``handle_transcription`` once per detected speech
    segment; each segment's text is pushed back to the client as JSON.
    """
    await manager.connect(client_id, websocket)

    from app.services.ws_stt_service import StreamManager, transcribe_buffer

    stream_manager = StreamManager(websocket)

    async def handle_transcription(audio_bytes: bytes):
        """Callback for processing speech segments."""
        try:
            # Let the client show progress while transcription runs.
            await manager.send_json(client_id, {"status": "processing"})

            result = await transcribe_buffer(audio_bytes)
            text = result.get("text", "").strip()

            if text:
                await manager.send_json(client_id, {
                    "text": text,
                    "is_final": True,
                    "status": "complete"
                })
                logger.info(f"Transcribed: {text}")
        except Exception as e:
            logger.error(f"Transcription callback error: {e}")
            await manager.send_json(client_id, {"error": str(e)})

    try:
        # Blocks until the client disconnects or the stream errors out.
        await stream_manager.process_stream(handle_transcription)

    except WebSocketDisconnect:
        manager.disconnect(client_id)
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
        try:
            await manager.send_json(client_id, {"error": str(e)})
        except Exception:
            # FIX: was a bare `except:` which would also swallow
            # SystemExit/KeyboardInterrupt; narrowed to Exception.
            pass
        manager.disconnect(client_id)
84
+
85
+
86
@router.websocket("/tts/{client_id}")
async def websocket_tts(websocket: WebSocket, client_id: str):
    """
    Real-time Text-to-Speech via WebSocket

    Protocol:
    - Client sends: JSON {"text": "...", "voice": "...", "rate": "...", "pitch": "..."}
    - Server sends: Binary audio chunks (MP3) followed by JSON {"status": "complete"}

    This achieves <500ms TTFB by streaming as chunks are generated.
    """
    await manager.connect(client_id, websocket)

    try:
        import edge_tts
        import time  # FIX: hoisted out of the per-request loop

        while True:
            # Receive synthesis request
            data = await websocket.receive_json()

            text = data.get("text", "")
            voice = data.get("voice", "en-US-AriaNeural")
            rate = data.get("rate", "+0%")
            pitch = data.get("pitch", "+0Hz")

            if not text:
                await websocket.send_json({"error": "No text provided"})
                continue

            logger.info(f"WebSocket TTS: Synthesizing '{text[:50]}...' with {voice}")

            # Stream audio chunks to the client as edge-tts produces them.
            start_time = time.time()
            first_chunk_sent = False
            total_bytes = 0
            ttfb = None  # FIX: defined up front so it always exists

            communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)

            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    await websocket.send_bytes(chunk["data"])
                    total_bytes += len(chunk["data"])

                    if not first_chunk_sent:
                        ttfb = (time.time() - start_time) * 1000
                        logger.info(f"WebSocket TTS TTFB: {ttfb:.0f}ms")
                        first_chunk_sent = True

            # Send completion marker with timing metadata.
            total_time = time.time() - start_time
            await websocket.send_json({
                "status": "complete",
                "total_bytes": total_bytes,
                "total_time_ms": round(total_time * 1000),
                "ttfb_ms": round(ttfb) if first_chunk_sent else None
            })

    except WebSocketDisconnect:
        manager.disconnect(client_id)
    except Exception as e:
        logger.error(f"WebSocket TTS error: {e}")
        try:
            await websocket.send_json({"error": str(e)})
        except Exception:
            # FIX: was a bare `except:`; narrowed so cancellation and
            # interpreter-exit signals are not silently swallowed here.
            pass
        manager.disconnect(client_id)
153
+
backend/app/core/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge Core Package
3
+ """
4
+
5
+ from .config import get_settings, Settings, LANGUAGE_METADATA
6
+
7
+ __all__ = ["get_settings", "Settings", "LANGUAGE_METADATA"]
backend/app/core/config.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge Configuration
3
+ Pydantic Settings for application configuration
4
+ """
5
+
6
+ from functools import lru_cache
7
+ from typing import List
8
+ from pydantic_settings import BaseSettings, SettingsConfigDict
9
+ from pydantic import Field
10
+
11
+
12
class Settings(BaseSettings):
    """Application settings loaded from environment variables.

    Every field below can be overridden by an environment variable of the
    same name (case-insensitive) or an entry in the local ``.env`` file.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="allow",  # Allow extra env vars without error
    )

    # Application
    app_name: str = "VoiceForge"
    app_version: str = "1.0.0"
    debug: bool = False

    # API Server
    api_host: str = "0.0.0.0"
    api_port: int = 8000

    # Database
    database_url: str = Field(
        default="sqlite:///./voiceforge.db",
        description="Database connection URL (SQLite for dev, PostgreSQL for prod)"
    )

    # Redis
    redis_url: str = Field(
        default="redis://localhost:6379/0",
        description="Redis connection URL for caching and Celery"
    )

    # Google Cloud
    google_application_credentials: str = Field(
        default="./credentials/google-cloud-key.json",
        description="Path to Google Cloud service account JSON key"
    )

    # AI Services Configuration
    use_local_services: bool = Field(
        default=True,
        description="Use local free services (Whisper + EdgeTTS) instead of Google Cloud"
    )
    whisper_model: str = Field(
        default="small",
        description="Whisper model size (tiny, base, small, medium, large-v3)"
    )

    # Security
    # NOTE(review): insecure default — must be overridden via env/.env in
    # any real deployment.
    secret_key: str = Field(
        default="your-super-secret-key-change-in-production",
        description="Secret key for JWT encoding"
    )
    access_token_expire_minutes: int = 30
    algorithm: str = "HS256"  # JWT signing algorithm
    hf_token: str | None = Field(default=None, description="Hugging Face Token for Diarization")

    # File Storage
    upload_dir: str = "./uploads"
    max_audio_duration_seconds: int = 600  # 10 minutes
    max_upload_size_mb: int = 50

    # Supported Languages (comma-separated locale codes; parsed form is
    # exposed by the supported_languages_list property below)
    supported_languages: str = "en-US,en-GB,es-ES,es-MX,fr-FR,de-DE,ja-JP,ko-KR,zh-CN,hi-IN"

    # Audio Formats (comma-separated extensions accepted for upload)
    supported_audio_formats: str = "wav,mp3,m4a,flac,ogg,webm"

    @property
    def supported_languages_list(self) -> List[str]:
        """Get supported languages as a list"""
        return [lang.strip() for lang in self.supported_languages.split(",")]

    @property
    def supported_audio_formats_list(self) -> List[str]:
        """Get supported audio formats as a list"""
        return [fmt.strip() for fmt in self.supported_audio_formats.split(",")]
88
+
89
+
90
# Language metadata for UI display.
# Maps each supported locale code to its English display name, flag emoji,
# and native-script name; keys mirror Settings.supported_languages.
LANGUAGE_METADATA = {
    "en-US": {"name": "English (US)", "flag": "🇺🇸", "native": "English"},
    "en-GB": {"name": "English (UK)", "flag": "🇬🇧", "native": "English"},
    "es-ES": {"name": "Spanish (Spain)", "flag": "🇪🇸", "native": "Español"},
    "es-MX": {"name": "Spanish (Mexico)", "flag": "🇲🇽", "native": "Español"},
    "fr-FR": {"name": "French", "flag": "🇫🇷", "native": "Français"},
    "de-DE": {"name": "German", "flag": "🇩🇪", "native": "Deutsch"},
    "ja-JP": {"name": "Japanese", "flag": "🇯🇵", "native": "日本語"},
    "ko-KR": {"name": "Korean", "flag": "🇰🇷", "native": "한국어"},
    "zh-CN": {"name": "Chinese (Mandarin)", "flag": "🇨🇳", "native": "中文"},
    "hi-IN": {"name": "Hindi", "flag": "🇮🇳", "native": "हिन्दी"},
}
103
+
104
+
105
@lru_cache
def get_settings() -> Settings:
    """Return the process-wide cached Settings instance.

    lru_cache ensures env/.env parsing happens once per process; tests can
    call ``get_settings.cache_clear()`` to force a reload.
    """
    return Settings()
backend/app/core/limiter.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded

# Rate-limiter backing store selection: prefer Redis when REDIS_URL is
# set *and* reachable, otherwise fall back to per-process in-memory
# counters (sufficient for local development).
redis_url = os.getenv("REDIS_URL")

storage_uri = "memory://"
if redis_url and redis_url.strip():
    try:
        import redis
        redis.from_url(redis_url).ping()  # probe the connection once
        storage_uri = redis_url
    except Exception:
        # Redis unreachable — degrade silently to in-memory storage.
        storage_uri = "memory://"

limiter = Limiter(
    key_func=get_remote_address,  # rate-limit per client IP
    storage_uri=storage_uri,
    default_limits=["60/minute"]  # Global limit: 60 req/min per IP
)
backend/app/core/middleware.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Rate Limiting Middleware
3
+ Uses Redis to track and limit request rates per IP address.
4
+ Pure ASGI implementation to avoid BaseHTTPMiddleware issues.
5
+ """
6
+
7
+ import time
8
+ import redis
9
+ from starlette.responses import JSONResponse
10
+ from starlette.types import ASGIApp, Scope, Receive, Send
11
+ from ..core.config import get_settings
12
+
13
+ settings = get_settings()
14
+
15
class RateLimitMiddleware:
    """Pure-ASGI fixed-window rate limiter backed by Redis.

    Counts requests per client IP in 60-second windows over /api/ paths;
    requests beyond the limit receive a 429 with a Retry-After header.
    Fails open (no limiting) whenever Redis is unavailable.
    """

    def __init__(self, app: ASGIApp):
        self.app = app
        # Hardcoded or from settings (bypassing constructor arg issue)
        self.requests_per_minute = 60
        self.window_size = 60  # seconds

        # Connect to Redis; a failure here disables limiting entirely.
        try:
            self.redis_client = redis.from_url(settings.redis_url)
        except Exception as e:
            print(f"⚠️ Rate limiter disabled: Could not connect to Redis ({e})")
            self.redis_client = None

    async def __call__(self, scope: Scope, receive: Receive, send: Send):
        # Skip if not HTTP (e.g. websocket/lifespan scopes pass through).
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return

        # Skip rate limiting for non-API routes or if Redis is down
        path = scope.get("path", "")
        if not path.startswith("/api/") or self.redis_client is None:
            await self.app(scope, receive, send)
            return

        # Get client IP (scope["client"] is a (host, port) pair or None).
        client = scope.get("client")
        client_ip = client[0] if client else "unknown"
        key = f"rate_limit:{client_ip}"

        try:
            # Simple fixed window counter
            current_count = self.redis_client.incr(key)

            # Set expiry on first request
            # NOTE(review): INCR + EXPIRE is not atomic — if the process
            # dies between the two calls the key never expires and that IP
            # stays throttled; a pipeline or Lua script would close this.
            if current_count == 1:
                self.redis_client.expire(key, self.window_size)

            if current_count > self.requests_per_minute:
                response = JSONResponse(
                    status_code=429,
                    content={
                        "detail": "Too many requests",
                        "retry_after": self.window_size
                    },
                    headers={"Retry-After": str(self.window_size)}
                )
                await response(scope, receive, send)
                return

        except redis.RedisError:
            # Fail open if Redis has issues during request
            pass

        await self.app(scope, receive, send)
backend/app/core/security.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Security Utilities
3
+ Handles password hashing, JWT generation, and API key verification.
4
+ """
5
+
6
from datetime import datetime, timedelta, timezone
from typing import Optional, Union, Any

from fastapi import Depends, HTTPException, status
from fastapi.security import OAuth2PasswordBearer, APIKeyHeader
from jose import jwt
from passlib.context import CryptContext
from sqlalchemy.orm import Session

from ..core.config import get_settings
from ..models import get_db, User, ApiKey
16
+
17
+ settings = get_settings()
18
+
19
+ # Password hashing (PBKDF2 is safer/easier on Windows than bcrypt sometimes)
20
+ pwd_context = CryptContext(schemes=["pbkdf2_sha256"], deprecated="auto")
21
+
22
+ # JWT configuration
23
+ SECRET_KEY = settings.secret_key
24
+ ALGORITHM = settings.algorithm
25
+ ACCESS_TOKEN_EXPIRE_MINUTES = settings.access_token_expire_minutes
26
+
27
+ # OAuth2 scheme
28
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="api/v1/auth/login")
29
+ api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
30
+
31
+
32
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Check a plaintext password against its stored pbkdf2_sha256 hash."""
    return pwd_context.verify(plain_password, hashed_password)
34
+
35
def get_password_hash(password: str) -> str:
    """Hash a plaintext password with the configured scheme (pbkdf2_sha256)."""
    return pwd_context.hash(password)
37
+
38
def create_access_token(subject: Union[str, Any], expires_delta: Optional[timedelta] = None) -> str:
    """Create a signed JWT access token.

    Args:
        subject: Value for the ``sub`` claim (stringified; typically the
            user id).
        expires_delta: Custom lifetime; falls back to
            ACCESS_TOKEN_EXPIRE_MINUTES when omitted (or zero/falsy, to
            preserve the original truthiness behavior).

    Returns:
        The encoded JWT string.
    """
    # FIX: annotation was `timedelta = None` (None is not a valid value of
    # the declared type) — now Optional. Also switched the deprecated
    # datetime.utcnow() to an explicit timezone-aware UTC timestamp; the
    # numeric "exp" claim is unchanged.
    now = datetime.now(timezone.utc)
    if expires_delta:
        expire = now + expires_delta
    else:
        expire = now + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)

    to_encode = {"exp": expire, "sub": str(subject)}
    return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
47
+
48
async def get_current_user(token: str = Depends(oauth2_scheme), db: Session = Depends(get_db)) -> User:
    """Validate the bearer JWT and return the matching User.

    Raises HTTP 401 if the token is invalid/expired, lacks a ``sub``
    claim, or references a user that no longer exists.
    """
    credentials_exception = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Could not validate credentials",
        headers={"WWW-Authenticate": "Bearer"},
    )
    try:
        payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
        user_id: str = payload.get("sub")
        if user_id is None:
            raise credentials_exception
    except Exception:
        # Any decode failure (expired, malformed, bad signature) collapses
        # into the same 401 so callers cannot probe which check failed.
        raise credentials_exception

    # "sub" is the stringified integer user id set in create_access_token.
    user = db.query(User).filter(User.id == int(user_id)).first()
    if user is None:
        raise credentials_exception
    return user
67
+
68
async def get_current_active_user(current_user: User = Depends(get_current_user)) -> User:
    """Dependency requiring an authenticated *and* active user."""
    if current_user.is_active:
        return current_user
    raise HTTPException(status_code=400, detail="Inactive user")
72
+
73
async def verify_api_key(
    api_key: str = Depends(api_key_header),
    db: Session = Depends(get_db)
) -> Optional[User]:
    """
    Validate API key from X-API-Key header.
    Returns the associated user if valid, else None (or raises if enforcing).
    """
    if not api_key:
        return None  # Or raise if strict

    # `== True` is intentional here: it builds a SQLAlchemy filter
    # expression, not a Python boolean comparison (ignore flake8 E712).
    key_record = db.query(ApiKey).filter(ApiKey.key == api_key, ApiKey.is_active == True).first()

    if key_record:
        # Record last-used timestamp for auditing/usage stats.
        key_record.last_used_at = datetime.utcnow()
        db.commit()
        return key_record.user

    return None  # Invalid key
93
+
94
def get_api_user_or_jwt_user(
    api_key_user: Optional[User] = Depends(verify_api_key),
    jwt_user: Optional[User] = Depends(get_current_user)
) -> User:
    """Allow access via either API Key or JWT.

    NOTE(review): get_current_user depends on oauth2_scheme, which by
    FastAPI default raises 401 when no Bearer token is present — that
    would reject API-key-only requests before the fallback below runs.
    Confirm the scheme uses auto_error=False if key-only auth is intended.
    """
    if api_key_user:
        return api_key_user
    if jwt_user:
        return jwt_user

    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Not authenticated"
    )
backend/app/core/security_encryption.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Field-level Encryption for SQLAlchemy Models.
3
+
4
+ Uses Fernet symmetric encryption from the `cryptography` library.
5
+ The ENCRYPTION_KEY should be a 32-byte base64-encoded key.
6
+ Generate one with: from cryptography.fernet import Fernet; print(Fernet.generate_key())
7
+ """
8
+
9
+ import os
10
+ import base64
11
+ import logging
12
+ from typing import Optional
13
+
14
+ from cryptography.fernet import Fernet, InvalidToken
15
+ from sqlalchemy import TypeDecorator, String
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # --- Configuration ---
20
+ # IMPORTANT: Store this securely! In production, use secrets manager or env vars.
21
+ # Default key is for development ONLY - regenerate for production!
22
# Unused placeholder kept as a reminder; NOT a valid Fernet key.
_DEFAULT_DEV_KEY = "VOICEFORGE_DEV_KEY_REPLACE_ME_NOW="  # Placeholder - NOT a valid key

def _get_encryption_key() -> bytes:
    """Get the encryption key from environment or generate a dev default.

    NOTE(review): the dev fallback generates a *random* key per process,
    so rows encrypted in one run cannot be decrypted after a restart
    (reads then hit EncryptedString's InvalidToken fallback). Always set
    ENCRYPTION_KEY where data must persist.
    """
    key_str = os.getenv("ENCRYPTION_KEY")

    if key_str:
        return key_str.encode()

    # Generate a consistent dev key (NOT SECURE - dev only)
    logger.warning("⚠️ ENCRYPTION_KEY not set! Using insecure dev key. DO NOT USE IN PRODUCTION.")
    # Create a valid Fernet key from a predictable seed for dev
    return Fernet.generate_key()  # This generates a random key each run - bad for dev persistence
    # For dev consistency, use a fixed key (still insecure):
    # return base64.urlsafe_b64encode(b"32_byte_dev_key_for_testing_1234")
37
+
38
# Process-wide cached Fernet instance (built lazily on first use).
_fernet: Optional[Fernet] = None

def get_fernet() -> Fernet:
    """Return the shared Fernet instance, creating it on first call."""
    global _fernet
    if _fernet is None:
        _fernet = Fernet(_get_encryption_key())
    return _fernet
48
+
49
+
50
+ # --- SQLAlchemy TypeDecorator ---
51
+
52
class EncryptedString(TypeDecorator):
    """
    SQLAlchemy type that encrypts/decrypts string values transparently.

    Usage:
        class User(Base):
            full_name = Column(EncryptedString(255), nullable=True)

    The encrypted data is stored as a base64-encoded string in the database.
    Both directions fail open: on encryption error the plaintext is stored,
    and on decryption error the raw stored value is returned.
    """
    impl = String
    cache_ok = True

    def __init__(self, length: int = 512, *args, **kwargs):
        # Encrypted strings are longer than plaintext, so pad the length.
        # NOTE(review): Fernet output is base64 of plaintext plus fixed
        # overhead; doubling may still be tight for very short columns —
        # verify against the longest expected value.
        super().__init__(length * 2, *args, **kwargs)

    def process_bind_param(self, value, dialect):
        """Encrypt the value before storing in DB."""
        if value is None:
            return None

        try:
            fernet = get_fernet()
            # Encode string to bytes, encrypt, then decode to string for storage
            encrypted = fernet.encrypt(value.encode('utf-8'))
            return encrypted.decode('utf-8')
        except Exception as e:
            logger.error(f"Encryption failed: {e}")
            # In case of encryption failure, store plaintext (fail-open for dev)
            # In production, you might want to raise instead
            return value

    def process_result_value(self, value, dialect):
        """Decrypt the value when reading from DB."""
        if value is None:
            return None

        try:
            fernet = get_fernet()
            # Decode from storage string, decrypt, then decode to string
            decrypted = fernet.decrypt(value.encode('utf-8'))
            return decrypted.decode('utf-8')
        except InvalidToken:
            # Value might be plaintext (legacy data or encryption disabled)
            logger.warning("Decryption failed - returning raw value (possible legacy data)")
            return value
        except Exception as e:
            logger.error(f"Decryption failed: {e}")
            return value
backend/app/core/security_headers.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from starlette.middleware.base import BaseHTTPMiddleware
2
+ from starlette.types import ASGIApp, Receive, Scope, Send
3
+
4
class SecurityHeadersMiddleware(BaseHTTPMiddleware):
    """Attach standard security headers to every HTTP response.

    Covers clickjacking (X-Frame-Options), MIME sniffing
    (X-Content-Type-Options), legacy XSS filtering, HSTS, a CSP relaxed
    just enough for Swagger UI, and a conservative referrer policy.

    FIX: removed the redundant no-op ``__init__`` override that only
    forwarded to ``super().__init__(app)``; BaseHTTPMiddleware's own
    constructor is used directly, so callers are unaffected.
    """

    async def dispatch(self, request, call_next):
        response = await call_next(request)

        # Prevent Clickjacking
        response.headers["X-Frame-Options"] = "DENY"

        # Prevent MIME type sniffing
        response.headers["X-Content-Type-Options"] = "nosniff"

        # Enable XSS filtering in browser (legacy but good for depth)
        response.headers["X-XSS-Protection"] = "1; mode=block"

        # Strict Transport Security (HSTS): enforce HTTPS for 1 year on
        # all subdomains. NOTE: only effective when served over HTTPS.
        response.headers["Strict-Transport-Security"] = "max-age=31536000; includeSubDomains"

        # Content Security Policy: same-origin by default, with
        # 'unsafe-inline' allowances kept for Swagger UI compatibility.
        response.headers["Content-Security-Policy"] = "default-src 'self'; img-src 'self' data: https:; style-src 'self' 'unsafe-inline'; script-src 'self' 'unsafe-inline';"

        # Referrer Policy
        response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"

        return response
backend/app/main.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge - FastAPI Main Application
3
+ Production-grade Speech-to-Text & Text-to-Speech API
4
+ """
5
+
6
+ import logging
7
+ # WARN: PyTorch 2.6+ security workaround for Pyannote
8
+ # Must be before any other torch imports
9
+ import os
10
+ os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
11
+ import torch.serialization
12
+ try:
13
+ torch.serialization.add_safe_globals([dict])
14
+ except:
15
+ pass
16
+
17
+ from contextlib import asynccontextmanager
18
+ from fastapi import FastAPI, Request
19
+ from fastapi.middleware.cors import CORSMiddleware
20
+ from fastapi.responses import JSONResponse
21
+ from fastapi.openapi.utils import get_openapi
22
+
23
+ from prometheus_fastapi_instrumentator import Instrumentator
24
+ from .core.config import get_settings
25
+ from .api.routes import (
26
+ stt_router,
27
+ tts_router,
28
+ health_router,
29
+ transcripts_router,
30
+ ws_router,
31
+ translation_router,
32
+ batch_router,
33
+ analysis_router,
34
+ audio_router,
35
+ cloning_router,
36
+ sign_router,
37
+ auth_router
38
+ )
39
+ from .models import Base, engine
40
+
41
+
42
+
43
# Configure logging: timestamped, logger-name-tagged records at INFO level
# for the whole process (applies to all module loggers).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Application settings, loaded once at import time.
settings = get_settings()
51
+
52
+
53
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan handler (startup before ``yield``, shutdown after).

    Startup: creates DB tables, pre-warms the Whisper models and the TTS
    voice list so the first request doesn't pay the model-load cost.
    Pre-warming is best-effort — failures are logged as warnings and the
    models are then loaded lazily on first use instead.
    """
    # Startup
    logger.info(f"Starting {settings.app_name} v{settings.app_version}")

    # Create database tables (create_all is idempotent for existing tables)
    logger.info("Creating database tables...")
    Base.metadata.create_all(bind=engine)

    # Pre-warm Whisper models for faster first request
    logger.info("Pre-warming AI models...")
    try:
        from .services.whisper_stt_service import get_whisper_model
        # Pre-load English Distil model (most common)
        get_whisper_model("distil-small.en")
        logger.info("✅ Distil-Whisper model loaded")
        # Pre-load multilingual model
        get_whisper_model("small")
        logger.info("✅ Whisper-small model loaded")
    except Exception as e:
        # Non-fatal: transcription still works, just slower on first call.
        logger.warning(f"Model pre-warming failed: {e}")

    # Pre-cache TTS voice list (also best-effort)
    try:
        from .services.tts_service import get_tts_service
        tts_service = get_tts_service()
        await tts_service.get_voices()
        logger.info("✅ TTS voice list cached")
    except Exception as e:
        logger.warning(f"Voice list caching failed: {e}")

    logger.info("🚀 Startup complete - All models warmed up!")

    yield  # application serves requests while suspended here

    # Shutdown
    logger.info("Shutting down...")
    # TODO: Close database connections
    # TODO: Close Redis connections
    logger.info("Shutdown complete")
97
+
98
+
99
# ---------------------------------------------------------------------------
# FastAPI application instance
# ---------------------------------------------------------------------------
app = FastAPI(
    title=settings.app_name,
    description="""
    ## VoiceForge API

    Production-grade Speech-to-Text and Text-to-Speech API.

    ### Features

    - 🎤 **Speech-to-Text**: Transcribe audio files with word-level timestamps
    - 🔊 **Text-to-Speech**: Synthesize speech with 300+ neural voices
    - 🌍 **Multi-language**: Support for 10+ languages
    - 🧠 **AI Analysis**: Sentiment, keywords, and summarization
    - 🌐 **Translation**: Translate text/audio between 20+ languages
    - ⚡ **Free & Fast**: Local Whisper + Edge TTS - no API costs
    """,
    version=settings.app_version,
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan,
)


from slowapi import _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.middleware import SlowAPIMiddleware
from .core.limiter import limiter
from .core.security_headers import SecurityHeadersMiddleware

# Rate limiting (per-IP limiter configured in core.limiter)
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
app.add_middleware(SlowAPIMiddleware)

# Security headers are registered before CORS so they are present even on
# error responses and CORS-rejected requests.
app.add_middleware(SecurityHeadersMiddleware)

# CORS.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# disallowed by the CORS spec (browsers reject the credentialed wildcard);
# restrict allow_origins before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Prometheus metrics, exposed on /metrics
Instrumentator().instrument(app).expose(app)


# Route registration: health stays unversioned (for probes); everything
# else is mounted under /api/v1.
app.include_router(health_router)
for _router in (
    auth_router,
    stt_router,
    tts_router,
    transcripts_router,
    ws_router,
    translation_router,
    batch_router,
    analysis_router,
    audio_router,
    cloning_router,
    sign_router,
):
    app.include_router(_router, prefix="/api/v1")
163
+
164
+
165
+
166
+
167
+
168
# ---------------------------------------------------------------------------
# Exception handlers
# ---------------------------------------------------------------------------
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Catch-all handler: log the full traceback, return a generic 500."""
    logger.exception(f"Unhandled error: {exc}")
    payload = {
        "error": "internal_server_error",
        "message": "An unexpected error occurred",
        # Raw exception text is exposed only in debug builds.
        "detail": str(exc) if settings.debug else None,
    }
    return JSONResponse(status_code=500, content=payload)
181
+
182
+
183
@app.exception_handler(ValueError)
async def value_error_handler(request: Request, exc: ValueError):
    """Map ValueError raised by services into a 400 validation response."""
    body = {
        "error": "validation_error",
        "message": str(exc),
    }
    return JSONResponse(status_code=400, content=body)
193
+
194
+
195
# Root endpoint
@app.get("/", tags=["Root"])
async def root():
    """Service identity plus pointers to docs and the health probe."""
    info = {
        "name": settings.app_name,
        "version": settings.app_version,
        "status": "running",
        "docs": "/docs",
        "health": "/health",
    }
    return info
206
+
207
+
208
# Custom OpenAPI schema
def custom_openapi():
    """Build — and memoize on the app — the OpenAPI document with a
    custom logo and tag descriptions."""
    # Return the cached document when it has already been generated.
    if app.openapi_schema:
        return app.openapi_schema

    schema = get_openapi(
        title=settings.app_name,
        version=settings.app_version,
        description=app.description,
        routes=app.routes,
    )

    # Branding: logo rendered by ReDoc.
    schema["info"]["x-logo"] = {"url": "https://example.com/logo.png"}

    # Tag descriptions shown in the docs UI.
    schema["tags"] = [
        {
            "name": "Health",
            "description": "Health check endpoints for monitoring",
        },
        {
            "name": "Speech-to-Text",
            "description": "Convert audio to text with timestamps and speaker detection",
        },
        {
            "name": "Text-to-Speech",
            "description": "Convert text to natural-sounding speech",
        },
    ]

    app.openapi_schema = schema
    return app.openapi_schema
244
+
245
+
246
# Serve the memoized custom schema instead of FastAPI's default generator.
app.openapi = custom_openapi


if __name__ == "__main__":
    # Local development entry point; production should run via an ASGI
    # server (uvicorn/gunicorn) directly.
    import uvicorn

    uvicorn.run(
        "app.main:app",
        host=settings.api_host,
        port=settings.api_port,
        reload=settings.debug,
    )
backend/app/schemas/__init__.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge Schemas Package
3
+ """
4
+
5
+ from .stt import (
6
+ TranscriptionRequest,
7
+ TranscriptionResponse,
8
+ TranscriptionSegment,
9
+ TranscriptionWord,
10
+ LanguageInfo,
11
+ )
12
+ from .tts import (
13
+ SynthesisRequest,
14
+ SynthesisResponse,
15
+ VoiceInfo,
16
+ VoiceListResponse,
17
+ )
18
+ from .transcript import (
19
+ TranscriptCreate,
20
+ TranscriptUpdate,
21
+ TranscriptResponse,
22
+ TranscriptListResponse,
23
+ )
24
+
25
+ __all__ = [
26
+ "TranscriptionRequest",
27
+ "TranscriptionResponse",
28
+ "TranscriptionSegment",
29
+ "TranscriptionWord",
30
+ "LanguageInfo",
31
+ "SynthesisRequest",
32
+ "SynthesisResponse",
33
+ "VoiceInfo",
34
+ "VoiceListResponse",
35
+ "TranscriptCreate",
36
+ "TranscriptUpdate",
37
+ "TranscriptResponse",
38
+ "TranscriptListResponse",
39
+ ]
backend/app/schemas/stt.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Speech-to-Text Schemas
3
+ """
4
+
5
+ from datetime import datetime
6
+ from typing import List, Optional, Dict, Any
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
class TranscriptionWord(BaseModel):
    """Individual word with timing information (offsets relative to audio start)."""
    word: str
    start_time: float = Field(..., description="Start time in seconds")
    end_time: float = Field(..., description="End time in seconds")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence score")
16
+
17
+
18
class TranscriptionSegment(BaseModel):
    """Transcript segment with speaker label and timing."""
    text: str
    start_time: float = Field(..., description="Start time in seconds")
    end_time: float = Field(..., description="End time in seconds")
    speaker: Optional[str] = Field(None, description="Speaker label (e.g., SPEAKER_1)")
    confidence: float = Field(..., ge=0.0, le=1.0)
    # Word-level detail; presumably only populated when word timestamps
    # were requested — TODO confirm against the STT service.
    words: Optional[List[TranscriptionWord]] = None
26
+
27
+
28
class TranscriptionRequest(BaseModel):
    """Request parameters for a transcription job."""
    language: str = Field(default="en-US", description="Language code (e.g., en-US)")
    enable_automatic_punctuation: bool = True
    enable_word_time_offsets: bool = True
    enable_speaker_diarization: bool = False
    # Hint for the diarizer; only meaningful when diarization is enabled.
    diarization_speaker_count: Optional[int] = Field(None, ge=2, le=10)
    model: str = Field(default="default", description="STT model to use")
36
+
37
+
38
class TranscriptionResponse(BaseModel):
    """Full result of a transcription request."""
    # DB identifiers; None when the result was not persisted.
    id: Optional[int] = None
    audio_file_id: Optional[int] = None
    text: str = Field(..., description="Full transcription text")
    segments: List[TranscriptionSegment] = Field(default_factory=list)
    words: Optional[List[TranscriptionWord]] = None
    language: str
    # Auto-detected language, when it differs from / overrides the request.
    detected_language: Optional[str] = None
    confidence: float = Field(..., ge=0.0, le=1.0)
    duration: float = Field(..., description="Audio duration in seconds")
    word_count: int
    processing_time: float = Field(..., description="Processing time in seconds")

    # Allow construction directly from ORM objects (pydantic v2).
    model_config = {
        "from_attributes": True
    }
55
+
56
+
57
class StreamingTranscriptionResponse(BaseModel):
    """Incremental update for streaming transcription (e.g. over WebSocket)."""
    # True once the hypothesis for this utterance will no longer change.
    is_final: bool = False
    text: str
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    # How unlikely the interim text is to be revised (0 = volatile).
    stability: float = Field(default=0.0, ge=0.0, le=1.0)
63
+
64
+
65
class LanguageInfo(BaseModel):
    """Language information for UI display."""
    code: str = Field(..., description="Language code (e.g., en-US)")
    name: str = Field(..., description="Display name (e.g., English (US))")
    native_name: str = Field(..., description="Native name (e.g., English)")
    flag: str = Field(..., description="Flag emoji")
    # Capability flags so the UI can disable unsupported directions.
    stt_supported: bool = True
    tts_supported: bool = True
73
+
74
+
75
class LanguageListResponse(BaseModel):
    """Response with the list of supported languages and its size."""
    languages: List[LanguageInfo]
    total: int
79
+
80
+
81
+
82
class TaskStatusResponse(BaseModel):
    """Status snapshot of an async transcription task."""
    task_id: str
    status: str = Field(..., description="pending, processing, completed, failed")
    progress: float = Field(default=0.0, ge=0.0, le=100.0, description="Progress percentage")
    # Populated only once status == "completed".
    result: Optional[TranscriptionResponse] = None
    # Populated only once status == "failed".
    error: Optional[str] = None
    created_at: datetime
    updated_at: datetime
91
+
92
+
93
class AsyncTranscriptionResponse(BaseModel):
    """Acknowledgement returned when an async transcription is submitted."""
    task_id: str
    audio_file_id: int
    status: str = "queued"
    message: str = "File uploaded and queued for processing"
backend/app/schemas/transcript.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Transcript Schemas
3
+ """
4
+
5
+ from datetime import datetime
6
+ from typing import List, Optional, Dict, Any
7
+ from pydantic import BaseModel, Field
8
+
9
+ from .stt import TranscriptionSegment, TranscriptionWord
10
+
11
+
12
class TranscriptCreate(BaseModel):
    """Schema for creating a transcript record."""
    raw_text: str
    # Post-processed text (punctuation/cleanup); defaults to raw_text-only.
    processed_text: Optional[str] = None
    segments: Optional[List[Dict[str, Any]]] = None
    words: Optional[List[Dict[str, Any]]] = None
    language: str = "en-US"
    confidence: Optional[float] = None
    # Audio duration in seconds — presumably; confirm against STT output.
    duration: Optional[float] = None
21
+
22
+
23
class TranscriptUpdate(BaseModel):
    """Schema for partially updating a transcript (all fields optional)."""
    processed_text: Optional[str] = None
    language: Optional[str] = None
27
+
28
+
29
class TranscriptResponse(BaseModel):
    """Schema for a stored transcript, including optional AI enrichment."""
    id: int
    audio_file_id: Optional[int] = None
    user_id: Optional[int] = None
    raw_text: Optional[str] = None
    processed_text: Optional[str] = None
    segments: Optional[List[Dict[str, Any]]] = None
    words: Optional[List[Dict[str, Any]]] = None
    language: Optional[str] = None
    # Translation results, when a translation was requested.
    translation_language: Optional[str] = None
    translated_text: Optional[str] = None
    # AI analysis results (sentiment/keywords/summary), when computed.
    sentiment: Optional[Dict[str, Any]] = None
    topics: Optional[List[str]] = None
    keywords: Optional[List[Dict[str, Any]]] = None
    summary: Optional[str] = None
    confidence: Optional[float] = None
    duration: Optional[float] = None
    word_count: Optional[int] = None
    created_at: datetime
    updated_at: Optional[datetime] = None

    # Allow construction directly from ORM objects (pydantic v2).
    model_config = {
        "from_attributes": True
    }
54
+
55
+
56
class TranscriptListResponse(BaseModel):
    """Schema for a paginated transcript list."""
    transcripts: List[TranscriptResponse]
    total: int
    page: int
    page_size: int
    # True when further pages exist beyond this one.
    has_more: bool
63
+
64
+
65
class ExportRequest(BaseModel):
    """Schema for a transcript export request."""
    # Only the listed formats are accepted (validated by the regex).
    format: str = Field(..., pattern="^(txt|srt|vtt|pdf|json)$")
    include_timestamps: bool = True
    include_speakers: bool = True
backend/app/schemas/tts.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text-to-Speech Schemas
3
+ """
4
+
5
+ from typing import List, Optional
6
+ from pydantic import BaseModel, Field
7
+
8
+
9
class SynthesisRequest(BaseModel):
    """Request for text-to-speech synthesis."""
    text: str = Field(..., min_length=1, max_length=5000, description="Text to synthesize")
    language: str = Field(default="en-US", description="Language code")
    voice: Optional[str] = Field(None, description="Voice name (e.g., en-US-Wavenet-D)")

    # Audio configuration
    audio_encoding: str = Field(default="MP3", description="Output format: MP3, LINEAR16, OGG_OPUS")
    sample_rate: int = Field(default=24000, description="Sample rate in Hz")

    # Voice tuning (1.0 rate / 0.0 pitch / 0.0 gain = neutral)
    speaking_rate: float = Field(default=1.0, ge=0.25, le=4.0, description="Speaking rate")
    pitch: float = Field(default=0.0, ge=-20.0, le=20.0, description="Voice pitch in semitones")
    volume_gain_db: float = Field(default=0.0, ge=-96.0, le=16.0, description="Volume gain in dB")

    # SSML support: when True, `text` is parsed as SSML markup.
    use_ssml: bool = Field(default=False, description="Treat text as SSML")
26
+
27
+
28
class SynthesisResponse(BaseModel):
    """Response from text-to-speech synthesis."""
    # Audio bytes are base64-encoded to survive JSON transport.
    audio_content: str = Field(..., description="Base64 encoded audio")
    audio_size: int = Field(..., description="Audio size in bytes")
    duration_estimate: float = Field(..., description="Estimated duration in seconds")
    voice_used: str
    language: str
    encoding: str
    sample_rate: int
    processing_time: float = Field(..., description="Processing time in seconds")
38
+
39
+
40
class VoiceInfo(BaseModel):
    """Information about a TTS voice."""
    name: str = Field(..., description="Voice name (e.g., en-US-Wavenet-D)")
    language_code: str = Field(..., description="Language code")
    language_name: str = Field(..., description="Language display name")
    ssml_gender: str = Field(..., description="MALE, FEMALE, or NEUTRAL")
    natural_sample_rate: int = Field(..., description="Native sample rate in Hz")
    voice_type: str = Field(..., description="Standard, WaveNet, or Neural2")

    # Optional UI display helpers (human-friendly name and flag emoji).
    display_name: Optional[str] = None
    flag: Optional[str] = None
52
+
53
+
54
class VoiceListResponse(BaseModel):
    """Response with the list of available voices."""
    voices: List[VoiceInfo]
    total: int
    # Echo of the language filter applied, if any.
    language_filter: Optional[str] = None
59
+
60
+
61
class VoicePreviewRequest(BaseModel):
    """Request for a short voice preview clip."""
    voice: str = Field(..., description="Voice name to preview")
    # Short sample sentence; capped at 200 chars to keep previews cheap.
    text: Optional[str] = Field(
        default="Hello! This is a preview of my voice.",
        max_length=200
    )
backend/app/services/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VoiceForge Services Package
3
+ """
4
+
5
+ from .stt_service import STTService
6
+ from .tts_service import TTSService
7
+ from .file_service import FileService
8
+
9
+ __all__ = [
10
+ "STTService",
11
+ "TTSService",
12
+ "FileService",
13
+ ]
backend/app/services/audio_service.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio Editing Service
3
+ Handles audio manipulation: Trimming, Merging, and Conversion using Pydub/FFmpeg
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ from typing import List, Optional
9
+ from pydub import AudioSegment
10
+ import tempfile
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
class AudioService:
    """
    Service for audio manipulation tasks: trimming, merging and format
    conversion, backed by pydub.

    Requires ffmpeg to be installed/available in PATH.
    """

    def __init__(self):
        pass

    def load_audio(self, file_path: str) -> AudioSegment:
        """Load an audio file into a pydub AudioSegment.

        Raises:
            ValueError: If the file cannot be read/decoded (wraps the
                underlying pydub/ffmpeg error).
        """
        try:
            return AudioSegment.from_file(file_path)
        except Exception as e:
            logger.error(f"Failed to load audio {file_path}: {e}")
            raise ValueError(f"Could not load audio file: {str(e)}")

    def trim_audio(self, input_path: str, start_ms: int, end_ms: int, output_path: Optional[str] = None) -> str:
        """
        Trim audio from start_ms to end_ms (milliseconds).

        Args:
            input_path: Source audio file.
            start_ms: Start offset; must be >= 0 and within the audio.
            end_ms: End offset; must be > start_ms. Pydub slicing clamps an
                overshoot to the audio length.
            output_path: Destination; defaults to "<input>_trimmed.<ext>".

        Returns:
            Path of the written file.

        Raises:
            ValueError: On invalid timestamps or an undecodable input file.
        """
        if start_ms < 0 or end_ms <= start_ms:
            raise ValueError("Invalid start/end timestamps")

        audio = self.load_audio(input_path)

        # Check duration
        if start_ms >= len(audio):
            raise ValueError("Start time exceeds audio duration")

        # Slice (pydub uses millisecond indexing)
        trimmed = audio[start_ms:end_ms]

        if not output_path:
            base, ext = os.path.splitext(input_path)
            output_path = f"{base}_trimmed{ext}"

        # BUGFIX: fall back to mp3 when the output path has no extension —
        # export(format="") fails. Mirrors merge_audio's behavior.
        fmt = os.path.splitext(output_path)[1][1:] or "mp3"
        trimmed.export(output_path, format=fmt)
        logger.info(f"Trimmed audio saved to {output_path}")
        return output_path

    def merge_audio(self, file_paths: List[str], output_path: str, crossfade_ms: int = 0) -> str:
        """
        Merge multiple audio files into one, in the given order.

        Args:
            file_paths: Source files (at least one required).
            output_path: Destination file; format inferred from its
                extension (defaults to mp3 when missing).
            crossfade_ms: Optional crossfade between consecutive clips.

        Raises:
            ValueError: If file_paths is empty or a file cannot be decoded.
        """
        if not file_paths:
            raise ValueError("No files to merge")

        combined = AudioSegment.empty()

        for path in file_paths:
            segment = self.load_audio(path)
            # Crossfade only once there is existing audio to fade against.
            if crossfade_ms > 0 and len(combined) > 0:
                combined = combined.append(segment, crossfade=crossfade_ms)
            else:
                combined += segment

        # BUGFIX: create the parent directory only when there is one;
        # os.makedirs("") raises for bare filenames.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Export, defaulting to mp3 for extension-less paths
        fmt = os.path.splitext(output_path)[1][1:] or "mp3"
        combined.export(output_path, format=fmt)
        logger.info(f"Merged {len(file_paths)} files to {output_path}")
        return output_path

    def convert_format(self, input_path: str, target_format: str) -> str:
        """
        Convert audio to another format (e.g. wav -> mp3).

        The output file is written next to the input, with the new extension.
        """
        audio = self.load_audio(input_path)

        base = os.path.splitext(input_path)[0]
        output_path = f"{base}.{target_format}"

        audio.export(output_path, format=target_format)
        logger.info(f"Converted to {target_format}: {output_path}")
        return output_path
92
+
93
+
94
# Lazily-created module-level singleton
_audio_service = None

def get_audio_service() -> AudioService:
    """Return the shared AudioService, creating it on first use."""
    global _audio_service
    if _audio_service is not None:
        return _audio_service
    _audio_service = AudioService()
    return _audio_service
backend/app/services/batch_service.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Batch Processing Service
3
+ Handles multi-file transcription with job tracking and parallel processing
4
+ """
5
+
6
+ import asyncio
7
+ import logging
8
+ import os
9
+ import tempfile
10
+ import uuid
11
+ import zipfile
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Dict, List, Optional, Any
15
+ from dataclasses import dataclass, field
16
+ from enum import Enum
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class JobStatus(str, Enum):
    """Batch job lifecycle states (str-valued for direct JSON serialization)."""
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
28
+
29
+
30
class FileStatus(str, Enum):
    """Per-file processing states within a batch job."""
    QUEUED = "queued"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
36
+
37
+
38
@dataclass
class FileResult:
    """Result for a single file in a batch."""
    filename: str
    status: FileStatus = FileStatus.QUEUED
    progress: float = 0.0                     # 0-100
    transcript: Optional[str] = None          # full text once completed
    language: Optional[str] = None            # detected/selected language
    duration: Optional[float] = None          # audio duration (seconds)
    word_count: Optional[int] = None
    processing_time: Optional[float] = None   # wall-clock seconds
    error: Optional[str] = None               # set when status == FAILED
    output_path: Optional[str] = None         # on-disk transcript file
51
+
52
+
53
@dataclass
class BatchJob:
    """Batch processing job: aggregate status plus per-file results."""
    job_id: str
    status: JobStatus = JobStatus.PENDING
    # Naive local timestamps (datetime.now()).
    created_at: datetime = field(default_factory=datetime.now)
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    # Per-file results, keyed by original filename.
    files: Dict[str, FileResult] = field(default_factory=dict)
    total_files: int = 0
    completed_files: int = 0
    failed_files: int = 0
    # Free-form processing options (language, output_format, ...).
    options: Dict[str, Any] = field(default_factory=dict)
    output_zip_path: Optional[str] = None

    @property
    def progress(self) -> float:
        """Overall job progress percentage (failed files count as done)."""
        if self.total_files == 0:
            return 0.0
        return (self.completed_files + self.failed_files) / self.total_files * 100

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a JSON-safe dictionary for API responses."""
        return {
            "job_id": self.job_id,
            "status": self.status.value,
            "progress": round(self.progress, 1),
            "created_at": self.created_at.isoformat(),
            "started_at": self.started_at.isoformat() if self.started_at else None,
            "completed_at": self.completed_at.isoformat() if self.completed_at else None,
            "total_files": self.total_files,
            "completed_files": self.completed_files,
            "failed_files": self.failed_files,
            "files": {
                name: {
                    "filename": f.filename,
                    "status": f.status.value,
                    "progress": f.progress,
                    # Truncate long transcripts so status payloads stay small.
                    "transcript": f.transcript[:500] + "..." if f.transcript and len(f.transcript) > 500 else f.transcript,
                    "language": f.language,
                    "duration": f.duration,
                    "word_count": f.word_count,
                    "processing_time": f.processing_time,
                    "error": f.error,
                }
                for name, f in self.files.items()
            },
            "options": self.options,
            "has_zip": self.output_zip_path is not None,
        }
104
+
105
+
106
# In-memory job store keyed by job_id (use Redis in production — this state
# is lost on restart and not shared across worker processes)
_batch_jobs: Dict[str, BatchJob] = {}
108
+
109
+
110
class BatchProcessingService:
    """
    Service for batch audio transcription with progress tracking.

    Transcription itself is dispatched to the Celery worker; this class
    only orchestrates per-file bookkeeping, output files and the result
    ZIP. Job state lives in the module-level in-memory store.
    """

    def __init__(self, output_dir: Optional[str] = None):
        """Initialize batch service.

        Args:
            output_dir: Directory for per-job outputs and result ZIPs.
                Defaults to the system temp directory.
        """
        self.output_dir = output_dir or tempfile.gettempdir()
        self._processing_lock = asyncio.Lock()

    def create_job(
        self,
        filenames: List[str],
        options: Optional[Dict[str, Any]] = None,
    ) -> BatchJob:
        """
        Create a new batch job and register it in the job store.

        Args:
            filenames: List of filenames to process
            options: Processing options (language, output_format, etc.)

        Returns:
            Created BatchJob
        """
        # Short random ID is enough for an in-memory store.
        job_id = str(uuid.uuid4())[:8]

        files = {
            name: FileResult(filename=name)
            for name in filenames
        }

        job = BatchJob(
            job_id=job_id,
            files=files,
            total_files=len(filenames),
            options=options or {},
        )

        _batch_jobs[job_id] = job
        logger.info(f"Created batch job {job_id} with {len(filenames)} files")

        return job

    def get_job(self, job_id: str) -> Optional[BatchJob]:
        """Get job by ID, or None if unknown."""
        return _batch_jobs.get(job_id)

    def list_jobs(self, limit: int = 20) -> List[BatchJob]:
        """List the most recently created jobs, newest first."""
        jobs = list(_batch_jobs.values())
        jobs.sort(key=lambda j: j.created_at, reverse=True)
        return jobs[:limit]

    async def process_job(
        self,
        job_id: str,
        file_paths: Dict[str, str],
    ) -> BatchJob:
        """
        Process all files in a batch job sequentially.

        Args:
            job_id: Job ID returned by create_job()
            file_paths: Mapping of original filename -> temp file path

        Returns:
            The completed BatchJob (also mutated in the job store)

        Raises:
            ValueError: If job_id is unknown.
        """
        import time  # lazy import kept local, but hoisted out of the loop

        job = self.get_job(job_id)
        if not job:
            raise ValueError(f"Job not found: {job_id}")

        job.status = JobStatus.PROCESSING
        job.started_at = datetime.now()

        # Per-job options
        language = job.options.get("language")
        output_format = job.options.get("output_format", "txt")

        output_files: List[str] = []

        for filename, file_path in file_paths.items():
            file_result = job.files.get(filename)
            if not file_result:
                # Path supplied for a file the job doesn't know about.
                continue

            file_result.status = FileStatus.PROCESSING
            file_result.progress = 0.0

            try:
                start_time = time.time()

                # Dispatch to the Celery worker and block for the result.
                # In a fully async architecture we would return a task id
                # and poll, but this keeps batch logic simple while still
                # moving the heavy compute out of this process.
                from app.workers.tasks import transcribe_file_path

                task = transcribe_file_path.delay(
                    file_path=file_path,
                    language=language,
                    output_format=output_format,
                )
                task_result = task.get(timeout=600)  # 10 min timeout per file

                processing_time = time.time() - start_time

                # Update file result from the worker payload
                file_result.transcript = task_result.get("text", "")
                file_result.language = task_result.get("language", "unknown")
                file_result.duration = task_result.get("duration")
                file_result.word_count = len(file_result.transcript.split())
                file_result.processing_time = round(processing_time, 2)
                file_result.status = FileStatus.COMPLETED
                file_result.progress = 100.0

                output_path = self._write_output_file(
                    job_id,
                    filename,
                    output_format,
                    file_result.transcript,
                    task_result.get("segments", []),
                )
                file_result.output_path = output_path
                output_files.append(output_path)

                job.completed_files += 1
                # BUGFIX: previously logged the literal "(unknown)" instead
                # of the filename being processed.
                logger.info(f"[{job_id}] Completed {filename} ({job.completed_files}/{job.total_files})")

            except Exception as e:
                file_result.status = FileStatus.FAILED
                file_result.error = str(e)
                file_result.progress = 0.0
                job.failed_files += 1
                logger.error(f"[{job_id}] Failed {filename}: {e}")

            finally:
                # Best-effort cleanup of the uploaded temp file.
                try:
                    if os.path.exists(file_path):
                        os.unlink(file_path)
                except OSError:
                    pass

        # Bundle all outputs into a single downloadable ZIP
        if output_files:
            zip_path = os.path.join(self.output_dir, f"{job_id}_results.zip")
            with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
                for file_path in output_files:
                    zf.write(file_path, os.path.basename(file_path))

            job.output_zip_path = zip_path
            logger.info(f"[{job_id}] Created ZIP: {zip_path}")

        # Any failed file marks the whole job FAILED (partial results are
        # still available per file and in the ZIP).
        job.status = JobStatus.COMPLETED if job.failed_files == 0 else JobStatus.FAILED
        job.completed_at = datetime.now()

        return job

    def _write_output_file(
        self,
        job_id: str,
        filename: str,
        output_format: str,
        transcript: str,
        segments: List[Dict[str, Any]],
    ) -> str:
        """Write one file's transcript (txt/srt/...) and return its path."""
        output_filename = Path(filename).stem + f".{output_format}"
        output_path = os.path.join(self.output_dir, job_id, output_filename)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            if output_format == "srt":
                # SRT: numbered cues with "start --> end" timing lines.
                for i, seg in enumerate(segments, 1):
                    start = self._format_srt_time(seg.get("start", 0))
                    end = self._format_srt_time(seg.get("end", 0))
                    text = seg.get("text", "").strip()
                    f.write(f"{i}\n{start} --> {end}\n{text}\n\n")
            else:
                f.write(transcript)

        return output_path

    def _format_srt_time(self, seconds: float) -> str:
        """Format seconds to SRT time format (HH:MM:SS,mmm)."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        millis = int((seconds % 1) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    def cancel_job(self, job_id: str) -> bool:
        """Mark a pending/processing job cancelled; returns success flag.

        NOTE: this only flips the status — an in-flight process_job loop is
        not interrupted.
        """
        job = self.get_job(job_id)
        if job and job.status in [JobStatus.PENDING, JobStatus.PROCESSING]:
            job.status = JobStatus.CANCELLED
            return True
        return False

    def delete_job(self, job_id: str) -> bool:
        """Delete a job and its output files (best-effort cleanup)."""
        job = _batch_jobs.pop(job_id, None)
        if job:
            # Remove the result ZIP, if any
            if job.output_zip_path and os.path.exists(job.output_zip_path):
                try:
                    os.unlink(job.output_zip_path)
                except OSError:
                    pass

            # Remove the per-job output directory
            job_dir = os.path.join(self.output_dir, job_id)
            if os.path.exists(job_dir):
                try:
                    import shutil
                    shutil.rmtree(job_dir)
                except OSError:
                    pass

            return True
        return False

    def get_zip_path(self, job_id: str) -> Optional[str]:
        """Return the job's output ZIP path, or None if absent on disk."""
        job = self.get_job(job_id)
        if job and job.output_zip_path and os.path.exists(job.output_zip_path):
            return job.output_zip_path
        return None
337
+
338
+
339
# Lazily-instantiated module-level singleton
_batch_service: Optional[BatchProcessingService] = None


def get_batch_service() -> BatchProcessingService:
    """Return the shared BatchProcessingService, creating it on first use."""
    global _batch_service
    if _batch_service is not None:
        return _batch_service
    _batch_service = BatchProcessingService()
    return _batch_service
backend/app/services/cache_service.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import redis
2
+ import json
3
+ import hashlib
4
+ import logging
5
+ from typing import Optional, Any
6
+ from functools import lru_cache
7
+
8
+ from ..core.config import get_settings
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
class CacheService:
    """
    Two-tier cache for raw bytes: Redis when reachable, DiskCache fallback.

    The backend is chosen once at construction time. ``get``/``set`` never
    raise: backend errors are logged and treated as a miss / no-op.
    """

    def __init__(self):
        settings = get_settings()
        self.default_ttl = 3600  # default entry lifetime in seconds (1 hour)
        self.redis = None
        self.disk_cache = None

        # Try Redis first
        try:
            self.redis = redis.from_url(settings.redis_url, decode_responses=False)
            self.redis.ping()  # fail fast if the server is unreachable
            logger.info("✅ Redis Cache connected")
        except Exception as e:
            logger.warning(f"⚠️ Redis unavailable, falling back to DiskCache: {e}")
            self.redis = None

        # Fallback to DiskCache (local on-disk cache, no server needed)
        try:
            import diskcache
            cache_dir = "./cache_data"
            self.disk_cache = diskcache.Cache(cache_dir)
            logger.info(f"💾 DiskCache initialized at {cache_dir}")
        except Exception as e:
            logger.error(f"❌ DiskCache init failed: {e}")

    def get(self, key: str) -> Optional[bytes]:
        """Return the raw bytes stored under *key*, or None on miss or error."""
        try:
            if self.redis:
                return self.redis.get(key)
            elif self.disk_cache:
                return self.disk_cache.get(key)
        except Exception as e:
            logger.error(f"Cache get failed: {e}")
        return None

    def set(self, key: str, value: bytes, ttl: Optional[int] = None):
        """
        Store raw bytes under *key*.

        Args:
            key: Cache key.
            value: Raw bytes to store.
            ttl: Lifetime in seconds; falls back to ``default_ttl`` when
                 None or 0 (annotation fixed from ``int`` to ``Optional[int]``
                 to match the ``None`` default).
        """
        try:
            ttl_val = ttl or self.default_ttl

            if self.redis:
                self.redis.setex(key, ttl_val, value)
            elif self.disk_cache:
                self.disk_cache.set(key, value, expire=ttl_val)
        except Exception as e:
            logger.error(f"Cache set failed: {e}")

    def generate_key(self, prefix: str, **kwargs) -> str:
        """
        Build a stable cache key of the form '<prefix>:<md5 hexdigest>'.

        All values are stringified so the key is stable across equivalent
        calls; ``sort_keys=True`` makes ordering deterministic (the previous
        manual pre-sort was redundant). MD5 is used purely for key
        derivation, not for security.
        """
        safe_kwargs = {k: str(v) for k, v in kwargs.items()}
        key_str = json.dumps(safe_kwargs, sort_keys=True)
        hash_str = hashlib.md5(key_str.encode()).hexdigest()
        return f"{prefix}:{hash_str}"
68
+
69
@lru_cache(maxsize=None)
def get_cache_service() -> CacheService:
    """Return the process-wide CacheService (memoized singleton)."""
    return CacheService()
backend/app/services/clone_service.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Voice Cloning Service (Coqui XTTS)
3
+ High-quality multi-lingual text-to-speech with voice cloning capabilities.
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ import torch
9
+ import gc
10
+ from typing import List, Optional, Dict, Any
11
+ from pathlib import Path
12
+ import tempfile
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
class CloneService:
    """
    Voice cloning service backed by Coqui XTTS v2.

    The multi-gigabyte model is loaded lazily on first use and can be
    unloaded again to reclaim (V)RAM.
    """

    def __init__(self):
        # Prefer GPU when available; XTTS on CPU is very slow.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = None
        self.model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
        self.loaded = False

    def load_model(self):
        """Lazily load the heavy XTTS model (idempotent)."""
        if self.loaded:
            return

        try:
            logger.info(f"Loading XTTS model ({self.device})... This may take a while.")
            from TTS.api import TTS

            # Load model
            self.tts = TTS(self.model_name).to(self.device)
            self.loaded = True
            logger.info("✅ XTTS Model loaded successfully")

        except ImportError as e:
            logger.error("TTS library not installed. Please install 'TTS'.")
            # Chain the original error so the real missing-module cause
            # stays visible in the traceback.
            raise ImportError("Voice Cloning requires 'TTS' library.") from e
        except Exception as e:
            logger.error(f"Failed to load XTTS model: {e}")
            # Bare raise preserves the original traceback (``raise e`` resets it).
            raise

    def unload_model(self):
        """Unload the model and release memory. No-op if nothing is loaded."""
        if self.tts:
            del self.tts
            self.tts = None
            self.loaded = False
            gc.collect()
            if torch.cuda.is_available():
                # Only meaningful (and guaranteed safe) when CUDA exists.
                torch.cuda.empty_cache()
            logger.info("🗑️ XTTS Model unloaded")

    def clone_voice(
        self,
        text: str,
        speaker_wav_paths: List[str],
        language: str = "en",
        output_path: Optional[str] = None
    ) -> str:
        """
        Synthesize *text* in the style of the reference audio.

        Args:
            text: Text to speak.
            speaker_wav_paths: One or more reference WAVs (more = better cloning).
            language: Target language code (see get_supported_languages()).
            output_path: Destination WAV path; a temp file is created when None.

        Returns:
            Path to the generated WAV file.
        """
        if not self.loaded:
            self.load_model()

        if not output_path:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                output_path = f.name

        try:
            # XTTS synthesis; speaker_wav accepts a list of reference files.
            self.tts.tts_to_file(
                text=text,
                speaker_wav=speaker_wav_paths,
                language=language,
                file_path=output_path,
                split_sentences=True
            )

            logger.info(f"Cloned speech generated: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Cloning failed: {e}")
            raise

    def get_supported_languages(self) -> List[str]:
        """Return the language codes XTTS v2 can synthesize."""
        return ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "hu", "ko"]
96
+
97
# Singleton
_clone_service = None


def get_clone_service():
    """Return the shared CloneService instance, constructing it on demand."""
    global _clone_service
    if _clone_service is not None:
        return _clone_service
    _clone_service = CloneService()
    return _clone_service
backend/app/services/diarization_service.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Speaker Diarization Service - Clean Implementation
3
+ Uses faster-whisper + pyannote.audio directly (no whisperx)
4
+
5
+ This avoids the KeyError bugs in whisperx alignment while providing
6
+ the same functionality.
7
+ """
8
+
9
+ import os
10
+ import gc
11
+ import logging
12
+ import torch
13
+ from typing import Optional, Dict, Any, List
14
+ from dotenv import load_dotenv
15
+
16
+ from app.core.config import get_settings
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # Load environment variables from .env file
21
+ load_dotenv()
22
+
23
+ # Workaround for PyTorch 2.6+ weights_only security restriction
24
+ os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
25
+
26
+
27
+ class DiarizationService:
28
+ """
29
+ Speaker Diarization Service using faster-whisper + pyannote.audio.
30
+
31
+ This implementation avoids whisperx entirely to prevent alignment bugs.
32
+
33
+ Flow:
34
+ 1. Transcribe with faster-whisper (word-level timestamps)
35
+ 2. Diarize with pyannote.audio (speaker segments)
36
+ 3. Merge speakers with transcript segments
37
+
38
+ Requires:
39
+ - faster-whisper (already installed)
40
+ - pyannote.audio
41
+ - Valid Hugging Face Token (HF_TOKEN) in .env
42
+ """
43
+
44
+ def __init__(self):
45
+ self.settings = get_settings()
46
+
47
+ # Auto-detect GPU (prefer CUDA for speed)
48
+ if torch.cuda.is_available():
49
+ self.device = "cuda"
50
+ self.compute_type = "float16"
51
+ logger.info(f"🚀 Diarization using GPU: {torch.cuda.get_device_name(0)}")
52
+ else:
53
+ self.device = "cpu"
54
+ self.compute_type = "int8"
55
+ logger.info("⚠️ Diarization using CPU (slower)")
56
+
57
+ # Load HF token
58
+ self.hf_token = os.getenv("HF_TOKEN")
59
+ if not self.hf_token:
60
+ logger.warning("⚠️ HF_TOKEN not found. Speaker diarization will fail.")
61
+
62
+ # FFmpeg Setup for Windows
63
+ self._setup_ffmpeg()
64
+
65
+ def _setup_ffmpeg(self):
66
+ """Auto-configure FFmpeg from imageio-ffmpeg if not in PATH"""
67
+ try:
68
+ import imageio_ffmpeg
69
+ import shutil
70
+
71
+ ffmpeg_src = imageio_ffmpeg.get_ffmpeg_exe()
72
+ backend_dir = os.getcwd()
73
+ ffmpeg_dest = os.path.join(backend_dir, "ffmpeg.exe")
74
+
75
+ if not os.path.exists(ffmpeg_dest):
76
+ shutil.copy(ffmpeg_src, ffmpeg_dest)
77
+ logger.info(f"🔧 Configured FFmpeg: {ffmpeg_dest}")
78
+
79
+ if backend_dir not in os.environ.get("PATH", ""):
80
+ os.environ["PATH"] = backend_dir + os.pathsep + os.environ.get("PATH", "")
81
+
82
+ except Exception as e:
83
+ logger.warning(f"⚠️ Could not auto-configure FFmpeg: {e}")
84
+
85
+ def check_requirements(self):
86
+ """Validate requirements before processing"""
87
+ if not self.hf_token:
88
+ raise ValueError(
89
+ "HF_TOKEN is missing. Add HF_TOKEN=your_token to .env file. "
90
+ "Get one at: https://huggingface.co/settings/tokens"
91
+ )
92
+
93
+ def _get_diarization_pipeline(self):
94
+ """Load pyannote diarization pipeline with PyTorch 2.6+ fix"""
95
+ from pyannote.audio import Pipeline
96
+
97
+ # Monkey-patch torch.load for PyTorch 2.6+ compatibility
98
+ original_load = torch.load
99
+ def safe_load(*args, **kwargs):
100
+ kwargs.pop('weights_only', None)
101
+ return original_load(*args, **kwargs, weights_only=False)
102
+
103
+ torch.load = safe_load
104
+ try:
105
+ pipeline = Pipeline.from_pretrained(
106
+ "pyannote/speaker-diarization-3.1",
107
+ use_auth_token=self.hf_token
108
+ )
109
+ if self.device == "cuda":
110
+ pipeline.to(torch.device("cuda"))
111
+ return pipeline
112
+ finally:
113
+ torch.load = original_load
114
+
115
+ def _transcribe_with_timestamps(self, audio_path: str, language: Optional[str] = None) -> Dict:
116
+ """Transcribe audio using faster-whisper with word timestamps"""
117
+ from faster_whisper import WhisperModel
118
+
119
+ # CTranslate2 (faster-whisper) doesn't support float16 on all GPUs
120
+ # Use int8 for whisper, but pyannote still benefits from CUDA
121
+ whisper_compute = "int8" if self.device == "cuda" else "int8"
122
+ model = WhisperModel(
123
+ "small",
124
+ device=self.device,
125
+ compute_type=whisper_compute
126
+ )
127
+
128
+ segments_raw, info = model.transcribe(
129
+ audio_path,
130
+ language=language,
131
+ word_timestamps=True,
132
+ vad_filter=True
133
+ )
134
+
135
+ segments = []
136
+ for segment in segments_raw:
137
+ segments.append({
138
+ "start": segment.start,
139
+ "end": segment.end,
140
+ "text": segment.text.strip(),
141
+ "words": [
142
+ {"start": w.start, "end": w.end, "word": w.word}
143
+ for w in (segment.words or [])
144
+ ]
145
+ })
146
+
147
+ # Cleanup
148
+ del model
149
+ gc.collect()
150
+
151
+ return {
152
+ "segments": segments,
153
+ "language": info.language
154
+ }
155
+
156
+ def _preprocess_audio(self, audio_path: str) -> str:
157
+ """
158
+ Apply noise reduction to audio file.
159
+ Returns path to cleaned audio file.
160
+ """
161
+ try:
162
+ import noisereduce as nr
163
+ import librosa
164
+ import soundfile as sf
165
+ import tempfile
166
+
167
+ logger.info("🔧 Preprocessing audio (noise reduction)...")
168
+
169
+ # Load audio
170
+ audio, sr = librosa.load(audio_path, sr=16000, mono=True)
171
+
172
+ # Apply spectral gating noise reduction
173
+ reduced_noise = nr.reduce_noise(
174
+ y=audio,
175
+ sr=sr,
176
+ stationary=True,
177
+ prop_decrease=0.75
178
+ )
179
+
180
+ # Save to temp file
181
+ temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
182
+ sf.write(temp_file.name, reduced_noise, sr)
183
+
184
+ logger.info(f" → Noise reduction complete, saved to {temp_file.name}")
185
+ return temp_file.name
186
+
187
+ except ImportError as e:
188
+ logger.warning(f"⚠️ Audio preprocessing unavailable (install noisereduce, librosa, soundfile): {e}")
189
+ return audio_path
190
+ except Exception as e:
191
+ logger.warning(f"⚠️ Audio preprocessing failed: {e}")
192
+ return audio_path
193
+
194
+ def _merge_speakers(self, transcript: Dict, diarization) -> List[Dict]:
195
+ """
196
+ Merge speaker labels from diarization with transcript segments.
197
+
198
+ Uses midpoint matching with nearest-speaker fallback to minimize UNKNOWN labels.
199
+ """
200
+ segments = transcript["segments"]
201
+ result = []
202
+
203
+ # Build list of speaker turns for efficient lookup
204
+ speaker_turns = [
205
+ (turn.start, turn.end, spk)
206
+ for turn, _, spk in diarization.itertracks(yield_label=True)
207
+ ]
208
+
209
+ for seg in segments:
210
+ mid_time = (seg["start"] + seg["end"]) / 2
211
+ speaker = None
212
+
213
+ # Step 1: Try exact midpoint match
214
+ for start, end, spk in speaker_turns:
215
+ if start <= mid_time <= end:
216
+ speaker = spk
217
+ break
218
+
219
+ # Step 2: If no match, find nearest speaker (fallback)
220
+ if speaker is None and speaker_turns:
221
+ min_distance = float('inf')
222
+ for start, end, spk in speaker_turns:
223
+ # Distance to nearest edge of speaker segment
224
+ if mid_time < start:
225
+ dist = start - mid_time
226
+ elif mid_time > end:
227
+ dist = mid_time - end
228
+ else:
229
+ dist = 0 # Should have been caught above
230
+
231
+ if dist < min_distance:
232
+ min_distance = dist
233
+ speaker = spk
234
+
235
+ # Final fallback (shouldn't happen)
236
+ if speaker is None:
237
+ speaker = "UNKNOWN"
238
+
239
+ result.append({
240
+ "start": seg["start"],
241
+ "end": seg["end"],
242
+ "text": seg["text"],
243
+ "speaker": speaker
244
+ })
245
+
246
+ return result
247
+
248
+ def process_audio(
249
+ self,
250
+ audio_path: str,
251
+ num_speakers: Optional[int] = None,
252
+ min_speakers: Optional[int] = None,
253
+ max_speakers: Optional[int] = None,
254
+ language: Optional[str] = None,
255
+ preprocess: bool = False,
256
+ ) -> Dict[str, Any]:
257
+ """
258
+ Full diarization pipeline: [Preprocess] → Transcribe → Diarize → Merge
259
+
260
+ Args:
261
+ audio_path: Path to audio file
262
+ num_speakers: Exact number of speakers (optional)
263
+ min_speakers: Minimum speakers (optional)
264
+ max_speakers: Maximum speakers (optional)
265
+ language: Force language code (optional, auto-detected if None)
266
+ preprocess: Apply noise reduction before processing (default: False)
267
+
268
+ Returns:
269
+ Dict with segments, speaker_stats, language, status
270
+ """
271
+ self.check_requirements()
272
+
273
+ logger.info(f"🎤 Starting diarization on {self.device}...")
274
+
275
+ # Optional preprocessing for noise reduction
276
+ processed_path = audio_path
277
+ if preprocess:
278
+ processed_path = self._preprocess_audio(audio_path)
279
+
280
+ try:
281
+ # Step 1: Transcribe with faster-whisper
282
+ logger.info("Step 1/3: Transcribing audio...")
283
+ transcript = self._transcribe_with_timestamps(processed_path, language)
284
+ detected_lang = transcript["language"]
285
+ logger.info(f" → Language: {detected_lang}, Segments: {len(transcript['segments'])}")
286
+
287
+ # Step 2: Diarize with pyannote
288
+ logger.info("Step 2/3: Identifying speakers...")
289
+ pipeline = self._get_diarization_pipeline()
290
+
291
+ diarization = pipeline(
292
+ processed_path,
293
+ num_speakers=num_speakers,
294
+ min_speakers=min_speakers,
295
+ max_speakers=max_speakers
296
+ )
297
+
298
+ # Cleanup pipeline
299
+ del pipeline
300
+ gc.collect()
301
+
302
+ # Step 3: Merge results
303
+ logger.info("Step 3/3: Merging speakers with transcript...")
304
+ segments = self._merge_speakers(transcript, diarization)
305
+
306
+ # Calculate speaker stats
307
+ speaker_stats = {}
308
+ for seg in segments:
309
+ spk = seg["speaker"]
310
+ dur = seg["end"] - seg["start"]
311
+ speaker_stats[spk] = speaker_stats.get(spk, 0) + dur
312
+
313
+ logger.info(f"✅ Diarization complete: {len(segments)} segments, {len(speaker_stats)} speakers")
314
+
315
+ return {
316
+ "segments": segments,
317
+ "speaker_stats": speaker_stats,
318
+ "language": detected_lang,
319
+ "status": "success"
320
+ }
321
+
322
+ except Exception as e:
323
+ logger.exception("Diarization failed")
324
+ raise e
325
+ finally:
326
+ gc.collect()
327
+ if self.device == "cuda":
328
+ torch.cuda.empty_cache()
329
+
330
+
331
# Singleton
_diarization_service = None


def get_diarization_service():
    """Return the shared DiarizationService, creating it on first call."""
    global _diarization_service
    if _diarization_service is None:
        _diarization_service = DiarizationService()
    return _diarization_service
backend/app/services/edge_tts_service.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Edge-TTS Text-to-Speech Service
3
+ Free, high-quality neural TTS using Microsoft Edge's speech synthesis
4
+ """
5
+
6
+ import asyncio
7
+ import io
8
+ import logging
9
+ import edge_tts
10
+ from typing import Optional, List, Dict, Any
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
# Available voice samples by language.
# Static fallback catalog used by EdgeTTSService.get_voices when the live
# edge_tts voice listing cannot be fetched. Keys are locale codes; "style"
# is a descriptive tag for the UI, not an Edge TTS API field.
VOICE_CATALOG = {
    "en-US": [
        {"name": "en-US-AriaNeural", "gender": "Female", "style": "professional"},
        {"name": "en-US-GuyNeural", "gender": "Male", "style": "casual"},
        {"name": "en-US-JennyNeural", "gender": "Female", "style": "friendly"},
        {"name": "en-US-ChristopherNeural", "gender": "Male", "style": "newscast"},
    ],
    "en-GB": [
        {"name": "en-GB-SoniaNeural", "gender": "Female", "style": "professional"},
        {"name": "en-GB-RyanNeural", "gender": "Male", "style": "casual"},
    ],
    "en-IN": [
        {"name": "en-IN-NeerjaNeural", "gender": "Female", "style": "professional"},
        {"name": "en-IN-PrabhatNeural", "gender": "Male", "style": "casual"},
    ],
    "hi-IN": [
        {"name": "hi-IN-SwaraNeural", "gender": "Female", "style": "professional"},
        {"name": "hi-IN-MadhurNeural", "gender": "Male", "style": "casual"},
    ],
    "es-ES": [
        {"name": "es-ES-ElviraNeural", "gender": "Female", "style": "professional"},
        {"name": "es-ES-AlvaroNeural", "gender": "Male", "style": "casual"},
    ],
    "es-MX": [
        {"name": "es-MX-DaliaNeural", "gender": "Female", "style": "professional"},
        {"name": "es-MX-JorgeNeural", "gender": "Male", "style": "casual"},
    ],
    "fr-FR": [
        {"name": "fr-FR-DeniseNeural", "gender": "Female", "style": "professional"},
        {"name": "fr-FR-HenriNeural", "gender": "Male", "style": "casual"},
    ],
    "de-DE": [
        {"name": "de-DE-KatjaNeural", "gender": "Female", "style": "professional"},
        {"name": "de-DE-ConradNeural", "gender": "Male", "style": "casual"},
    ],
    "ja-JP": [
        {"name": "ja-JP-NanamiNeural", "gender": "Female", "style": "professional"},
        {"name": "ja-JP-KeitaNeural", "gender": "Male", "style": "casual"},
    ],
    "ko-KR": [
        {"name": "ko-KR-SunHiNeural", "gender": "Female", "style": "professional"},
        {"name": "ko-KR-InJoonNeural", "gender": "Male", "style": "casual"},
    ],
    "zh-CN": [
        {"name": "zh-CN-XiaoxiaoNeural", "gender": "Female", "style": "professional"},
        {"name": "zh-CN-YunxiNeural", "gender": "Male", "style": "casual"},
    ],
}
64
+
65
+
66
class EdgeTTSService:
    """
    Text-to-Speech service using Microsoft Edge TTS (free, neural voices).

    All synthesis APIs are coroutines; ``get_voices_sync`` and
    ``synthesize_sync`` wrap them for synchronous callers.
    """

    def __init__(self):
        """Initialize the Edge TTS service"""
        self._all_voices = None

    # Class-level cache of the formatted voice list (shared by all instances).
    _voices_cache = None

    async def get_voices(self, language: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Get available voices, optionally filtered by language prefix.

        Fetched once from edge_tts and cached at class level; falls back to
        the static VOICE_CATALOG when the live listing fails.
        """
        # Check cache
        if EdgeTTSService._voices_cache is None:
            try:
                voices = await edge_tts.list_voices()

                # Transform to our format
                formatted_voices = []
                for v in voices:
                    formatted_voices.append({
                        "name": v["ShortName"],
                        "display_name": v["ShortName"].replace("-", " ").split("Neural")[0].strip(),
                        "language_code": v["Locale"],
                        "gender": v["Gender"],
                        "voice_type": "Neural",
                    })

                EdgeTTSService._voices_cache = formatted_voices
            except Exception as e:
                logger.error(f"Failed to fetch voices from Edge TTS: {e}. Falling back to catalog.")
                # Fallback to catalog
                voices = []
                for lang, lang_voices in VOICE_CATALOG.items():
                    for v in lang_voices:
                        voices.append({
                            "name": v["name"],
                            "display_name": v["name"].replace("-", " ").replace("Neural", "").strip(),
                            "language_code": lang,
                            "gender": v["gender"],
                            "voice_type": "Neural",
                        })
                EdgeTTSService._voices_cache = voices

        voices = EdgeTTSService._voices_cache

        # Filter by language if specified
        if language:
            voices = [v for v in voices if v["language_code"].startswith(language)]

        return voices

    def _run_coro_blocking(self, coro):
        """
        Run a coroutine to completion from synchronous code.

        Fix over the previous sync wrappers: ``asyncio.get_event_loop()`` is
        deprecated outside a running loop, and blocking on
        ``run_coroutine_threadsafe(..., loop).result()`` from the loop's own
        thread deadlocked (the scheduled coroutine could never run while its
        loop was blocked). When a loop is already running in this thread, the
        coroutine now executes on a private loop in a worker thread instead.
        """
        import concurrent.futures

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop running in this thread: safe to run directly.
            return asyncio.run(coro)

        # A loop is running in this thread; run on a fresh loop elsewhere.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()

    def get_voices_sync(self, language: Optional[str] = None) -> List[Dict[str, Any]]:
        """Synchronous wrapper for get_voices"""
        return self._run_coro_blocking(self.get_voices(language))

    def build_ssml(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        rate: str = "medium",
        pitch: str = "medium",
        emphasis: str = None,
        breaks: bool = True
    ) -> str:
        """
        Build SSML markup for advanced prosody control.

        Args:
            text: Plain text to convert
            voice: Voice name
            rate: Speed - 'x-slow', 'slow', 'medium', 'fast', 'x-fast' or percentage
            pitch: Pitch - 'x-low', 'low', 'medium', 'high', 'x-high' or Hz offset
            emphasis: Optional emphasis level - 'reduced', 'moderate', 'strong'
            breaks: Auto-insert breaks at punctuation

        Returns:
            SSML-formatted string
        """
        # rate/pitch are emitted verbatim; named values and raw offsets are
        # both legal SSML (the old normalization returned the input unchanged
        # on both branches, so it was a no-op).
        ssml_parts = ['<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">']
        ssml_parts.append(f'<voice name="{voice}">')
        ssml_parts.append(f'<prosody rate="{rate}" pitch="{pitch}">')

        if emphasis:
            ssml_parts.append(f'<emphasis level="{emphasis}">')

        # Auto-insert breaks for natural speech
        if breaks:
            import re
            # Short breaks after commas/semicolons/colons, longer after
            # sentence-ending punctuation followed by whitespace.
            processed_text = re.sub(r'([,;:])\s*', r'\1<break time="200ms"/>', text)
            processed_text = re.sub(r'([.!?])\s+', r'\1<break time="500ms"/>', processed_text)
            ssml_parts.append(processed_text)
        else:
            ssml_parts.append(text)

        if emphasis:
            ssml_parts.append('</emphasis>')

        ssml_parts.append('</prosody>')
        ssml_parts.append('</voice>')
        ssml_parts.append('</speak>')

        return ''.join(ssml_parts)

    async def synthesize_ssml(
        self,
        ssml_text: str,
        voice: str = "en-US-AriaNeural",
    ) -> bytes:
        """
        Synthesize speech from SSML markup.

        Args:
            ssml_text: SSML-formatted text
            voice: Voice name (for edge-tts communication)

        Returns:
            Audio bytes (MP3)
        """
        logger.info(f"Synthesizing SSML with voice: {voice}")

        # Edge TTS handles SSML natively
        communicate = edge_tts.Communicate(ssml_text, voice)

        audio_buffer = io.BytesIO()
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_buffer.write(chunk["data"])

        audio_buffer.seek(0)
        return audio_buffer.read()

    async def synthesize_stream(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        rate: str = "+0%",
        pitch: str = "+0Hz",
    ):
        """
        Stream speech synthesis chunks.

        Optimized to stream sentence-by-sentence to reduce TTFB (Time To
        First Byte), avoiding full-text buffering.
        """
        import re

        # Split text into sentences (keeping terminal punctuation) so the
        # first audio chunk arrives before the whole text is synthesized.
        sentences = re.findall(r'[^.!?]+(?:[.!?]+|$)', text)
        if not sentences:
            sentences = [text]

        logger.info(f"Streaming {len(sentences)} sentences for low latency...")

        for sentence in sentences:
            if not sentence.strip():
                continue

            communicate = edge_tts.Communicate(sentence, voice, rate=rate, pitch=pitch)

            async for chunk in communicate.stream():
                if chunk["type"] == "audio":
                    yield chunk["data"]

    async def synthesize(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        rate: str = "+0%",
        pitch: str = "+0Hz",
    ) -> bytes:
        """
        Synthesize speech from text

        Args:
            text: Text to synthesize
            voice: Voice name (e.g., 'en-US-AriaNeural')
            rate: Speaking rate adjustment (e.g., '+20%', '-10%')
            pitch: Pitch adjustment (e.g., '+5Hz', '-10Hz')

        Returns:
            Audio content as bytes (MP3 format)
        """
        # Reuse stream method to avoid duplication
        audio_buffer = io.BytesIO()
        async for chunk in self.synthesize_stream(text, voice, rate, pitch):
            audio_buffer.write(chunk)

        audio_buffer.seek(0)
        return audio_buffer.read()

    def synthesize_sync(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        rate: str = "+0%",
        pitch: str = "+0Hz",
    ) -> bytes:
        """Synchronous wrapper for synthesize"""
        return self._run_coro_blocking(self.synthesize(text, voice, rate, pitch))

    async def synthesize_to_response(
        self,
        text: str,
        voice: str = "en-US-AriaNeural",
        speaking_rate: float = 1.0,
        pitch: float = 0.0,
    ) -> Dict[str, Any]:
        """
        Synthesize speech and return API-compatible response

        Args:
            text: Text to synthesize
            voice: Voice name
            speaking_rate: Rate multiplier (1.0 = normal, 1.5 = 50% faster)
            pitch: Pitch adjustment in semitones (-20 to +20)

        Returns:
            Dictionary with base64 audio content and metadata
        """
        import base64
        import time

        start_time = time.time()

        # Convert multiplier/semitone inputs to Edge TTS's "+N%"/"+NHz" format
        rate_percent = int((speaking_rate - 1.0) * 100)
        rate_str = f"+{rate_percent}%" if rate_percent >= 0 else f"{rate_percent}%"
        pitch_str = f"+{int(pitch)}Hz" if pitch >= 0 else f"{int(pitch)}Hz"

        # Synthesize
        audio_bytes = await self.synthesize(text, voice, rate_str, pitch_str)

        processing_time = time.time() - start_time

        # Rough duration estimate (~150 chars per second at normal speed)
        estimated_duration = len(text) / 150 / speaking_rate

        return {
            "audio_content": base64.b64encode(audio_bytes).decode("utf-8"),
            "encoding": "MP3",
            "audio_size": len(audio_bytes),
            "duration_estimate": estimated_duration,
            "voice_used": voice,
            "processing_time": processing_time,
            "cached": False,
        }
+ }
346
+
347
+
348
+ # Singleton instance
349
+ _edge_tts_service: Optional[EdgeTTSService] = None
350
+
351
+
352
+ def get_edge_tts_service() -> EdgeTTSService:
353
+ """Get or create the EdgeTTSService singleton"""
354
+ global _edge_tts_service
355
+ if _edge_tts_service is None:
356
+ _edge_tts_service = EdgeTTSService()
357
+ return _edge_tts_service
backend/app/services/emotion_service.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Emotion Analysis Service
3
+ Detects emotion from audio using Wav2Vec2 and text using NLP
4
+ """
5
+
6
+ import logging
7
+ import os
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from typing import Dict, List, Any, Optional
12
+
13
+ from app.core.config import get_settings
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class EmotionService:
    """
    Service for Speech Emotion Recognition (SER).
    Uses 'ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition'.

    The model is loaded lazily on first analysis call.
    """

    def __init__(self):
        self.model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
        self._model = None
        self._processor = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Label order must match the model's classification head.
        self.emotions = [
            "angry", "calm", "disgust", "fearful",
            "happy", "neutral", "sad", "surprised"
        ]

    def _load_model(self):
        """Lazy load model to save RAM (idempotent)."""
        if self._model is None:
            try:
                from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

                logger.info(f"🎭 Loading Emotion Model ({self.device})...")
                self._processor = Wav2Vec2Processor.from_pretrained(self.model_name)
                self._model = Wav2Vec2ForSequenceClassification.from_pretrained(self.model_name)
                self._model.to(self.device)
                logger.info("✅ Emotion Model loaded")
            except Exception as e:
                logger.error(f"Failed to load emotion model: {e}")
                raise

    def _classify(self, audio_data, sr: int) -> Dict[str, float]:
        """Run the classifier on a waveform and return {emotion: probability}.

        Shared by analyze_audio and analyze_audio_segment (previously
        duplicated inline in both).
        """
        inputs = self._processor(audio_data, sampling_rate=sr, return_tensors="pt", padding=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = self._model(**inputs).logits

        probs = F.softmax(logits, dim=-1)[0].cpu().numpy()
        return {emo: float(p) for emo, p in zip(self.emotions, probs)}

    def analyze_audio(self, audio_path: str) -> Dict[str, Any]:
        """
        Analyze emotion of an entire audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            Dict with dominant emotion, its confidence, and the full
            probability distribution.
        """
        import librosa

        self._load_model()

        try:
            # Load audio at 16 kHz (required by Wav2Vec2). Only the first
            # 60 seconds are analyzed to bound memory; longer files would
            # need chunking.
            y, sr = librosa.load(audio_path, sr=16000, duration=60)

            scores = self._classify(y, 16000)
            dominant = max(scores, key=scores.get)

            return {
                "dominant_emotion": dominant,
                "confidence": scores[dominant],
                "distribution": scores
            }

        except Exception as e:
            logger.error(f"Audio emotion analysis failed: {e}")
            # Bare raise keeps the original traceback (``raise e`` reset it).
            raise

    def analyze_audio_segment(self, audio_data: np.ndarray, sr: int = 16000) -> Dict[str, Any]:
        """
        Analyze a raw numpy audio segment.

        Never raises: on failure returns a neutral result with score 0.0.
        """
        self._load_model()

        try:
            scores = self._classify(audio_data, sr)
            dominant = max(scores, key=scores.get)

            return {
                "emotion": dominant,
                "score": scores[dominant]
            }
        except Exception as e:
            logger.error(f"Segment analysis failed: {e}")
            return {"emotion": "neutral", "score": 0.0}
123
+
124
+
125
# Singleton
_emotion_service = None


def get_emotion_service() -> EmotionService:
    """Return the shared EmotionService, constructing it on first use."""
    global _emotion_service
    if _emotion_service is not None:
        return _emotion_service
    _emotion_service = EmotionService()
    return _emotion_service
backend/app/services/export_service.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Export Service
3
+ Helper functions for generating transcript exports (PDF, SRT, VTT, TXT)
4
+ """
5
+
6
+ from fpdf import FPDF
7
+ from typing import List, Dict, Any
8
+ import io
9
+
10
+
11
class ExportService:
    """Static helpers that render a transcript dict into export formats.

    Expected transcript keys (all optional): ``text``, ``segments``
    (list of ``{"start_time", "end_time", "text"}``), ``created_at``,
    ``duration``, ``sentiment``.
    """

    @staticmethod
    def to_txt(transcript: Dict[str, Any]) -> str:
        """Export as plain text with a small metadata header."""
        output = [
            f"Transcript ID: {transcript.get('id', 'N/A')}",
            f"Date: {transcript.get('created_at', 'Unknown')}",
            "-" * 40,
            transcript.get("text", ""),
        ]
        return "\n".join(output)

    @staticmethod
    def to_srt(transcript: Dict[str, Any]) -> str:
        """Export as SRT (SubRip Subtitle).

        Returns an empty string when no timed segments are available.
        """
        segments = transcript.get("segments") or []
        if not segments:
            # Fallback to word timestamps if segments missing
            words = transcript.get("words", [])
            if words:
                pass  # TODO: Construct segments from words
            return ""  # Cannot generate SRT without timing

        srt_lines = []
        for i, segment in enumerate(segments, 1):
            start = ExportService._format_timestamp(segment.get("start_time", 0))
            end = ExportService._format_timestamp(segment.get("end_time", 0))
            srt_lines.append(str(i))
            srt_lines.append(f"{start} --> {end}")
            srt_lines.append(segment.get("text", "").strip())
            srt_lines.append("")

        return "\n".join(srt_lines)

    @staticmethod
    def to_vtt(transcript: Dict[str, Any]) -> str:
        """Export as WebVTT.

        Cues are built directly from segments rather than via
        ``str.replace(",", ".")`` on the SRT output, which corrupted
        commas inside the cue text itself.
        """
        segments = transcript.get("segments") or []
        if not segments:
            return "WEBVTT\n\n"

        cue_lines = []
        for i, segment in enumerate(segments, 1):
            # WebVTT uses "." as the millisecond separator.
            start = ExportService._format_timestamp(segment.get("start_time", 0), sep=".")
            end = ExportService._format_timestamp(segment.get("end_time", 0), sep=".")
            cue_lines.append(str(i))
            cue_lines.append(f"{start} --> {end}")
            cue_lines.append(segment.get("text", "").strip())
            cue_lines.append("")

        return "WEBVTT\n\n" + "\n".join(cue_lines)

    @staticmethod
    def to_pdf(transcript: Dict[str, Any]) -> bytes:
        """Export as a simple PDF report (header, metadata, body, analysis)."""
        pdf = FPDF()
        pdf.add_page()

        # Header
        pdf.set_font("helvetica", "B", 16)
        pdf.cell(0, 10, "Transcript Report", new_x="LMARGIN", new_y="NEXT", align='C')
        pdf.ln(10)

        # Metadata
        pdf.set_font("helvetica", "B", 10)
        pdf.cell(40, 10, f"Date: {transcript.get('created_at', 'Unknown')}")
        pdf.ln(5)
        pdf.cell(40, 10, f"Duration: {transcript.get('duration', 0)}s")
        pdf.ln(10)

        # Content (fpdf2 handles UTF-8 natively with built-in fonts)
        pdf.set_font("helvetica", size=11)
        pdf.multi_cell(0, 8, transcript.get("text", ""))

        # Optional NLP analysis section
        sentiment = transcript.get("sentiment")
        if sentiment:
            pdf.ln(10)
            pdf.set_font("helvetica", "B", 12)
            pdf.cell(0, 10, "Analysis", new_x="LMARGIN", new_y="NEXT")
            pdf.set_font("helvetica", size=10)
            pdf.cell(0, 8, f"Sentiment: Polarity {sentiment.get('polarity')}, Subjectivity {sentiment.get('subjectivity')}", new_x="LMARGIN", new_y="NEXT")

        return bytes(pdf.output())

    @staticmethod
    def _format_timestamp(seconds: float, sep: str = ",") -> str:
        """Format seconds as ``HH:MM:SS<sep>mmm``.

        Args:
            seconds: Offset from the start of the audio.
            sep: Millisecond separator — "," for SRT (default), "." for VTT.
        """
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        millis = int((seconds % 1) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d}{sep}{millis:03d}"
backend/app/services/file_service.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File Service
3
+ Audio file management and processing
4
+ """
5
+
6
+ import os
7
+ import uuid
8
+ import shutil
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import Optional, Tuple, Dict, Any
12
+ from datetime import datetime
13
+
14
+ from ..core.config import get_settings
15
+
16
+ logger = logging.getLogger(__name__)
17
+ settings = get_settings()
18
+
19
+
20
class FileService:
    """
    Service for managing audio file uploads and storage
    """

    def __init__(self):
        """Create the service and make sure the upload root exists."""
        self.upload_dir = Path(settings.upload_dir)
        self.upload_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"File service initialized with upload dir: {self.upload_dir}")

    def save_upload(
        self,
        file_content: bytes,
        original_filename: str,
        user_id: Optional[int] = None,
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Persist an uploaded audio file under a unique name.

        Args:
            file_content: Raw bytes of the upload.
            original_filename: Filename as sent by the client.
            user_id: Optional owner id used to pick the subdirectory.

        Returns:
            Tuple of (storage_path, file_metadata).

        Raises:
            ValueError: On an unsupported extension or oversized payload.
        """
        # Reject extensions not in the configured whitelist.
        ext = Path(original_filename).suffix.lower()
        if ext.lstrip('.') not in settings.supported_audio_formats_list:
            raise ValueError(f"Unsupported audio format: {ext}")

        # Enforce the configured size ceiling.
        file_size = len(file_content)
        max_size = settings.max_upload_size_mb * 1024 * 1024
        if file_size > max_size:
            raise ValueError(f"File too large: {file_size / 1024 / 1024:.1f}MB (max {settings.max_upload_size_mb}MB)")

        # Bucket files by owner and date: <root>/<owner>/YYYY/MM/DD/<uuid><ext>
        owner = f"user_{user_id}" if user_id else "anonymous"
        subdir = self.upload_dir / owner / datetime.now().strftime("%Y/%m/%d")
        subdir.mkdir(parents=True, exist_ok=True)

        storage_path = subdir / f"{uuid.uuid4()}{ext}"
        storage_path.write_bytes(file_content)

        logger.info(f"Saved upload: {original_filename} -> {storage_path}")

        # Probe technical metadata and attach upload facts.
        metadata = self._get_file_metadata(storage_path)
        metadata["original_filename"] = original_filename
        metadata["file_size"] = file_size
        return str(storage_path), metadata

    def get_file(self, storage_path: str) -> Optional[bytes]:
        """Return the stored file's bytes, or None when missing."""
        path = Path(storage_path)
        if not path.exists():
            logger.warning(f"File not found: {storage_path}")
            return None
        return path.read_bytes()

    def delete_file(self, storage_path: str) -> bool:
        """Remove a stored file.

        Returns:
            True when deleted; False when the file does not exist or
            deletion fails (failure is logged, not raised).
        """
        path = Path(storage_path)
        if not path.exists():
            return False

        try:
            path.unlink()
        except Exception as e:
            logger.error(f"Failed to delete file: {e}")
            return False

        logger.info(f"Deleted file: {storage_path}")
        return True

    def _get_file_metadata(self, file_path: Path) -> Dict[str, Any]:
        """
        Probe an audio file for technical metadata.

        Uses ffprobe when installed; otherwise only the format and
        storage path are returned.
        """
        metadata: Dict[str, Any] = {
            "format": file_path.suffix.lower().lstrip('.'),
            "storage_path": str(file_path),
        }

        try:
            import subprocess
            import json

            command = [
                "ffprobe",
                "-v", "quiet",
                "-print_format", "json",
                "-show_format",
                "-show_streams",
                str(file_path),
            ]
            result = subprocess.run(command, capture_output=True, text=True, timeout=10)

            if result.returncode == 0:
                probe_data = json.loads(result.stdout)

                # Container-level facts.
                fmt = probe_data.get("format")
                if fmt is not None:
                    metadata["duration"] = float(fmt.get("duration", 0))
                    metadata["bit_rate"] = int(fmt.get("bit_rate", 0))

                # Only the first audio stream is relevant.
                for stream in probe_data.get("streams", []):
                    if stream.get("codec_type") == "audio":
                        metadata["sample_rate"] = int(stream.get("sample_rate", 0))
                        metadata["channels"] = int(stream.get("channels", 0))
                        metadata["codec"] = stream.get("codec_name", "")
                        break

                logger.debug(f"Extracted metadata via ffprobe: {metadata}")
        except FileNotFoundError:
            # ffprobe binary not on PATH — degrade gracefully.
            logger.debug("ffprobe not available, using basic metadata")
        except Exception as e:
            logger.warning(f"Failed to extract metadata: {e}")

        return metadata

    def cleanup_temp_files(self, max_age_hours: int = 24) -> int:
        """
        Delete anonymous uploads older than *max_age_hours*.

        Returns:
            Number of files removed.
        """
        anonymous_dir = self.upload_dir / "anonymous"
        if not anonymous_dir.exists():
            return 0

        cutoff = datetime.now().timestamp() - (max_age_hours * 3600)
        deleted = 0

        for candidate in anonymous_dir.rglob("*"):
            if not candidate.is_file() or candidate.stat().st_mtime >= cutoff:
                continue
            try:
                candidate.unlink()
                deleted += 1
            except Exception as e:
                logger.error(f"Failed to delete {candidate}: {e}")

        if deleted:
            logger.info(f"Cleaned up {deleted} old temporary files")

        return deleted
+
220
+
221
+ # Singleton instance
222
+ _file_service: Optional[FileService] = None
223
+
224
+
225
+ def get_file_service() -> FileService:
226
+ """Get singleton file service instance"""
227
+ global _file_service
228
+ if _file_service is None:
229
+ _file_service = FileService()
230
+ return _file_service
backend/app/services/meeting_service.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Meeting Minutes Service
3
+ Orchestrates Speaker Diarization, STT, and NLP to generate meeting reports
4
+ """
5
+
6
+ import logging
7
+ import os
8
+ import shutil
9
+ from typing import Dict, Any, List, Optional
10
+ from datetime import datetime
11
+
12
+ from app.services.diarization_service import get_diarization_service
13
+ from app.services.nlp_service import get_nlp_service
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class MeetingService:
    """
    Orchestrates the creation of intelligent meeting minutes.

    Combines speaker diarization/transcription with NLP post-processing
    (summary, action items, keywords, sentiment) into a single report.
    """

    def __init__(self):
        # Resolve the collaborating service singletons once.
        self.diarization_service = get_diarization_service()
        self.nlp_service = get_nlp_service()

    def process_meeting(
        self,
        audio_path: str,
        num_speakers: Optional[int] = None,
        language: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Process a meeting recording to generate full minutes.

        Pipeline:
            1. Diarization + STT (who said what)
            2. NLP analysis (summary, action items, topics, sentiment)
            3. Report assembly

        Args:
            audio_path: Path to the meeting audio file.
            num_speakers: Optional speaker-count hint for the diarizer.
            language: Optional language code; auto-detected when None.

        Returns:
            Dict with ``metadata``, ``summary``, ``action_items``,
            ``topics``, ``sentiment``, ``speaker_stats``,
            ``transcript_segments`` and ``raw_text``.

        Raises:
            Exception: Re-raises any failure from the underlying services
                after logging it.
        """
        try:
            logger.info(f"📅 Starting meeting processing for {os.path.basename(audio_path)}")

            # Step 1: diarization + transcription — the heavy lifting that
            # produces speaker-attributed segments.
            diarization_result = self.diarization_service.process_audio(
                audio_path,
                num_speakers=num_speakers,
                language=language,
                preprocess=True  # Always preprocess meetings for better quality
            )

            segments = diarization_result["segments"]
            full_text = " ".join(seg["text"] for seg in segments)
            speaker_stats = diarization_result["speaker_stats"]
            detected_language = diarization_result["language"]

            # Step 2: NLP analysis over the full transcript.
            logger.info("🧠 Running NLP analysis on meeting transcript...")
            summary = self.nlp_service.generate_summary(full_text, sentence_count=5)
            action_items = self.nlp_service.extract_action_items(full_text)
            keywords = self.nlp_service.extract_keywords(full_text, max_keywords=15)
            sentiment = self.nlp_service.analyze_sentiment(full_text)

            # Step 3: assemble the report payload.
            attendees = list(speaker_stats.keys())

            result = {
                "metadata": {
                    "filename": os.path.basename(audio_path),
                    "processed_at": datetime.now().isoformat(),
                    "language": detected_language,
                    # NOTE(review): assumes speaker_stats values are per-speaker
                    # speaking durations in seconds — confirm against the
                    # diarization service's contract.
                    "duration_seconds": sum(speaker_stats.values()),
                    "attendee_count": len(attendees),
                    "attendees": attendees,
                },
                "summary": summary,
                "action_items": action_items,
                "topics": keywords,
                "sentiment": sentiment,
                "speaker_stats": speaker_stats,
                "transcript_segments": segments,
                "raw_text": full_text,
            }

            logger.info("✅ Meeting processing complete!")
            return result

        except Exception as e:
            logger.error(f"Meeting processing failed: {e}")
            # Bare raise preserves the original traceback; `raise e` would
            # append this frame to it.
            raise
+ raise e
111
+
112
+
113
+ # Singleton instance
114
+ _meeting_service = None
115
+
116
+ def get_meeting_service() -> MeetingService:
117
+ """Get or create MeetingService singleton."""
118
+ global _meeting_service
119
+ if _meeting_service is None:
120
+ _meeting_service = MeetingService()
121
+ return _meeting_service
backend/app/services/nlp_service.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ NLP Service
3
+ Handles text analysis, sentiment, keywords, and summarization
4
+ """
5
+
6
+ import logging
7
+ from typing import List, Dict, Any, Optional
8
+ import nltk
9
+ from textblob import TextBlob
10
+ from sumy.parsers.plaintext import PlaintextParser
11
+ from sumy.nlp.tokenizers import Tokenizer
12
+ from sumy.summarizers.lsa import LsaSummarizer
13
+ from sumy.nlp.stemmers import Stemmer
14
+ from sumy.utils import get_stop_words
15
+ from collections import Counter
16
+ import re
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class NLPService:
    """
    Service for Natural Language Processing tasks.

    Uses local libraries (TextBlob, Sumy, NLTK) to avoid API costs.
    """

    def __init__(self):
        self._ensure_nltk_resources()

    def _ensure_nltk_resources(self):
        """Download necessary NLTK data if missing."""
        # Each resource lives under a specific NLTK data category; probe
        # the correct path directly instead of trying every category and
        # swallowing LookupErrors along the way.
        required = {
            "punkt": "tokenizers",
            "averaged_perceptron_tagger": "taggers",
            "brown": "corpora",
        }
        for resource, category in required.items():
            try:
                nltk.data.find(f"{category}/{resource}")
            except LookupError:
                logger.info(f"Downloading NLTK resource: {resource}")
                nltk.download(resource, quiet=True)

        # Newer NLTK releases split punkt into punkt_tab (needed by sumy).
        try:
            nltk.data.find("tokenizers/punkt_tab")
        except LookupError:
            nltk.download("punkt", quiet=True)
            nltk.download("punkt_tab", quiet=True)

    def analyze_sentiment(self, text: str) -> Dict[str, float]:
        """
        Analyze sentiment of text.

        Returns:
            {"polarity": -1.0 to 1.0, "subjectivity": 0.0 to 1.0}
        """
        if not text:
            return {"polarity": 0.0, "subjectivity": 0.0}

        blob = TextBlob(text)
        return {
            "polarity": round(blob.sentiment.polarity, 2),
            "subjectivity": round(blob.sentiment.subjectivity, 2)
        }

    def extract_keywords(self, text: str, max_keywords: int = 10) -> List[Dict[str, Any]]:
        """
        Extract keywords/keyphrases from text.

        Returns:
            List of {"text": str, "count": int}, most frequent first.
        """
        if not text:
            return []

        # Prefer noun phrases — they make better keyphrases than words.
        noun_phrases = TextBlob(text).noun_phrases
        if noun_phrases:
            counts = Counter(noun_phrases)
            return [{"text": phrase, "count": count} for phrase, count in counts.most_common(max_keywords)]

        # Fallback: simple word frequency with a minimal stop list.
        stop_words = set(["the", "a", "an", "in", "on", "at", "to", "for", "of", "and", "or", "is", "are", "was", "were", "it", "that", "this"])
        words = [w.lower() for w in re.findall(r'\w+', text) if len(w) > 3 and w.lower() not in stop_words]
        counts = Counter(words)
        return [{"text": word, "count": count} for word, count in counts.most_common(max_keywords)]

    def extract_action_items(self, text: str) -> List[str]:
        """
        Extract potential action items using regex patterns.

        Looks for phrases like "I will", "we need to", "todo", etc.
        Note: patterns require a trailing "." or "," to terminate the
        captured phrase.
        """
        if not text:
            return []

        action_patterns = [
            r"(?i)(?:I|we|you|he|she|they) (?:will|shall|must|should|need to|have to|going to) (.*?)[\.,]",
            r"(?i)(?:let's|lets) (.*?)[\.,]",
            r"(?i)(?:action item|todo|to-do)[:\s](.*?)[\.,]",
            r"(?i)(?:please|plz) (.*?)[\.,]",
            r"(?i)(?:make sure|ensure) (?:to|that)? (.*?)[\.,]",
            r"(?i)(?:don't forget|remember) to (.*?)[\.,]",
        ]

        action_items = []

        # Split into sentences first for better context.
        sentences = nltk.sent_tokenize(text)

        for sentence in sentences:
            for pattern in action_patterns:
                matches = re.findall(pattern, sentence)
                for match in matches:
                    item = match.strip()
                    if len(item) > 5:  # Filter out short noise
                        # Very short captures carry too little context —
                        # keep the whole sentence instead.
                        if len(item.split()) < 3:
                            action_items.append(sentence.strip())
                        else:
                            # Reconstruct "I will [match]" context if reasonable.
                            if pattern.startswith(r"(?i)(?:I|we"):
                                trigger = re.search(r"(will|shall|must|should|need to|have to|going to)", sentence, re.IGNORECASE)
                                if trigger:
                                    action_items.append(sentence[trigger.start():].strip())
                                else:
                                    action_items.append(item)
                            else:
                                action_items.append(item)
                        break  # One action item per sentence is usually enough

        # Dedupe while preserving discovery order; list(set(...)) made the
        # output order nondeterministic across runs.
        return list(dict.fromkeys(action_items))

    def generate_summary(self, text: str, sentence_count: int = 3) -> str:
        """
        Generate an extractive summary using LSA.

        Falls back to the first *sentence_count* sentences if the LSA
        pipeline fails for any reason.
        """
        if not text:
            return ""

        try:
            language = "english"  # Default to english for now
            parser = PlaintextParser.from_string(text, Tokenizer(language))
            stemmer = Stemmer(language)
            summarizer = LsaSummarizer(stemmer)
            summarizer.stop_words = get_stop_words(language)

            summary_sentences = summarizer(parser.document, sentence_count)
            return " ".join([str(s) for s in summary_sentences])
        except Exception as e:
            logger.warning(f"Summarization failed: {e}")
            # Fallback: simple first N sentences
            sentences = text.split('.')
            return ".".join(sentences[:sentence_count]) + "."

    def process_transcript(self, text: str) -> Dict[str, Any]:
        """
        Run the full NLP pipeline on transcript text.
        """
        return {
            "sentiment": self.analyze_sentiment(text),
            "keywords": self.extract_keywords(text),
            "summary": self.generate_summary(text),
            "action_items": self.extract_action_items(text),
        }
171
+
172
+
173
# Lazily-created module-level singleton.
_nlp_service = None


def get_nlp_service() -> NLPService:
    """Return the shared NLPService, constructing it on first use."""
    global _nlp_service
    if _nlp_service is not None:
        return _nlp_service
    _nlp_service = NLPService()
    return _nlp_service
backend/app/services/sign_avatar_service.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sign Language Avatar Service
3
+ Converts text input into a sequence of sign language images/animations.
4
+ Current implementation: ASL Finger Spelling using static images.
5
+ """
6
+
7
+ import os
8
+ import logging
9
+ from typing import List, Dict, Optional
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
class SignAvatarService:
    """
    Generates sign language visualizations from text.

    Current strategy is ASL fingerspelling: each letter maps to a static
    image; spaces become short pauses; other characters are dropped.
    """

    # Placeholder letter -> image URL mapping (A=Sgn1 ... Z=Sgn26).
    # Kept for reference/testing; text_to_glosses uses a separate asset set.
    ASL_IMAGE_MAP = {
        letter: f"https://www.signingsavvy.com/images/asl/start/Sgn{i}.jpg"
        for i, letter in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ", start=1)
    }

    # NOTE: for this portfolio prototype the frontend interprets the
    # returned sequence to render images; a production build would ship
    # local assets instead of pointing at public URLs.

    def __init__(self):
        # Stateless service — nothing to initialize.
        pass

    def text_to_glosses(self, text: str) -> List[Dict]:
        """
        Convert text to a sequence of sign glosses (fingerspelled letters).

        Args:
            text: Input text (e.g. "Hello World").

        Returns:
            List of dicts:
                letters -> {"type": "letter", "value": "H",
                            "image_url": "...", "duration": 1.0}
                spaces  -> {"type": "space", "value": " ", "duration": 0.5}
            Non-alphabetic, non-space characters are skipped.
        """
        sequence: List[Dict] = []

        for char in text.upper().strip():
            if char.isalpha():
                # MVP: fingerspell letter-by-letter from a public asset set.
                sequence.append({
                    "type": "letter",
                    "value": char,
                    "image_url": (
                        "https://raw.githubusercontent.com/redcode-br/"
                        f"ASL-Finger-Spelling/master/assets/{char}.png"
                    ),
                    "duration": 1.0,  # seconds to display
                })
            elif char == " ":
                sequence.append({
                    "type": "space",
                    "value": " ",
                    "duration": 0.5,
                })

        return sequence
+ return sequence
74
+
75
+ # Singleton
76
+ _avatar_service = None
77
+
78
+ def get_avatar_service():
79
+ global _avatar_service
80
+ if _avatar_service is None:
81
+ _avatar_service = SignAvatarService()
82
+ return _avatar_service