Spaces:
Sleeping
Sleeping
Deploy Global Video Localizer
Browse files- .gitignore +61 -0
- README.md +176 -6
- SUBMISSION_GUIDE.md +244 -0
- TWITTER_POST.md +105 -0
- app.py +558 -0
- localizer_engine.py +1174 -0
- packages.txt +1 -0
- requirements.txt +35 -0
- server.py +54 -0
.gitignore
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
|
| 8 |
+
# Virtual environments
|
| 9 |
+
.env
|
| 10 |
+
.venv
|
| 11 |
+
env/
|
| 12 |
+
venv/
|
| 13 |
+
ENV/
|
| 14 |
+
env.bak/
|
| 15 |
+
venv.bak/
|
| 16 |
+
|
| 17 |
+
# IDEs
|
| 18 |
+
.vscode/
|
| 19 |
+
.idea/
|
| 20 |
+
*.swp
|
| 21 |
+
*.swo
|
| 22 |
+
*~
|
| 23 |
+
|
| 24 |
+
# OS
|
| 25 |
+
.DS_Store
|
| 26 |
+
Thumbs.db
|
| 27 |
+
|
| 28 |
+
# Logs
|
| 29 |
+
*.log
|
| 30 |
+
logs/
|
| 31 |
+
|
| 32 |
+
# Testing
|
| 33 |
+
.pytest_cache/
|
| 34 |
+
.coverage
|
| 35 |
+
htmlcov/
|
| 36 |
+
.tox/
|
| 37 |
+
|
| 38 |
+
# Build
|
| 39 |
+
dist/
|
| 40 |
+
build/
|
| 41 |
+
*.egg-info/
|
| 42 |
+
|
| 43 |
+
# Gradio
|
| 44 |
+
.gradio/
|
| 45 |
+
flagged/
|
| 46 |
+
|
| 47 |
+
# Environment variables
|
| 48 |
+
.env.local
|
| 49 |
+
.env.*.local
|
| 50 |
+
|
| 51 |
+
# Video processing temp files
|
| 52 |
+
*.tmp
|
| 53 |
+
temp/
|
| 54 |
+
tmp/
|
| 55 |
+
*.mp4.tmp
|
| 56 |
+
*.mp3.tmp
|
| 57 |
+
|
| 58 |
+
# Large media files (don't commit sample videos)
|
| 59 |
+
sample_videos/
|
| 60 |
+
output_videos/
|
| 61 |
+
outputs/
|
README.md
CHANGED
|
@@ -1,14 +1,184 @@
|
|
| 1 |
---
|
| 2 |
-
title: Video Localizer
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: blue
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
license: mit
|
| 11 |
-
short_description: 'Automates video dubbing. Upload a video and get it dubbed. '
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Global Video Localizer
|
| 3 |
+
emoji: 🌍
|
| 4 |
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
tags:
|
| 11 |
+
- building-mcp-track-consumer
|
| 12 |
+
- building-mcp-track-enterprise
|
| 13 |
+
- building-mcp-track-developer
|
| 14 |
+
- mcp-in-action-track-consumer
|
| 15 |
+
- mcp-in-action-track-enterprise
|
| 16 |
+
- mcp-in-action-track-developer
|
| 17 |
+
- video
|
| 18 |
+
- translation
|
| 19 |
+
- tts
|
| 20 |
+
- whisper
|
| 21 |
+
- elevenlabs
|
| 22 |
+
- gradio
|
| 23 |
+
- mcp
|
| 24 |
+
- ai-agents
|
| 25 |
+
- multilingual
|
| 26 |
+
- automation
|
| 27 |
license: mit
|
|
|
|
| 28 |
---
|
| 29 |
|
| 30 |
+
# 🌍 Global Video Localizer
|
| 31 |
+
|
| 32 |
+
**Break language barriers. Reach global audiences. One video, infinite possibilities.**
|
| 33 |
+
|
| 34 |
+
## What This Does
|
| 35 |
+
|
| 36 |
+
Global Video Localizer automates video dubbing. Upload a video, select a language, and get a professionally dubbed version in minutes. No studios. No voice actors. No waiting.
|
| 37 |
+
|
| 38 |
+
**It works completely free** using open source AI models. You can use it right now without any API keys. If you want premium voice quality, you can optionally add your ElevenLabs API key in the UI.
|
| 39 |
+
|
| 40 |
+
## The Problem It Solves
|
| 41 |
+
|
| 42 |
+
Content creators, educators, and businesses face a massive challenge: reaching global audiences. Traditional video dubbing costs thousands of dollars per video and takes weeks. Most content never gets localized because it's simply too expensive and time-consuming.
|
| 43 |
+
|
| 44 |
+
This app changes that. It makes professional video localization accessible to everyone, instantly.
|
| 45 |
+
|
| 46 |
+
## Why It's Smart
|
| 47 |
+
|
| 48 |
+
This is the first fully automated video localization system that works end-to-end with zero manual intervention. It combines state-of-the-art AI models in a seamless pipeline: your video becomes audio, audio becomes text, text gets translated, translation becomes voice, and voice syncs perfectly with your original video.
|
| 49 |
+
|
| 50 |
+
The intelligent fallback system ensures it always works. If one service is unavailable, it automatically uses the next best option. You never get stuck with a silent video.
|
| 51 |
+
|
| 52 |
+
## How It Works
|
| 53 |
+
|
| 54 |
+
1. **Extract & Transcribe**: AI listens to your video and understands every word using local Whisper models
|
| 55 |
+
2. **Translate**: Context-aware translation preserves meaning and nuance using Deep Translator and NLLB
|
| 56 |
+
3. **Generate Voice**: High-quality AI voices match the tone and emotion of the original
|
| 57 |
+
- Primary: ElevenLabs (premium, optional)
|
| 58 |
+
- Fallback: EdgeTTS (high quality, free, open source)
|
| 59 |
+
- Fallback: Coqui TTS (local neural TTS)
|
| 60 |
+
- Fallback: gTTS (reliable backup)
|
| 61 |
+
4. **Sync & Merge**: Perfect timing ensures the new audio matches your video frame-by-frame
|
| 62 |
+
|
| 63 |
+
All of this happens automatically. You just upload and wait a few minutes.
|
| 64 |
+
|
| 65 |
+
## Technical Capabilities
|
| 66 |
+
|
| 67 |
+
- **MCP Integration**: Full Model Context Protocol server implementation, allowing Claude and other AI agents to localize videos programmatically
|
| 68 |
+
- **Multi-Modal Pipeline**: Seamlessly processes video → audio → text → translation → voice → video in a single automated workflow
|
| 69 |
+
- **Intelligent Fallback System**: Multiple TTS providers ensure reliability
|
| 70 |
+
- **Audio Processing**: Advanced time-stretching and synchronization ensures perfect lip-sync and timing
|
| 71 |
+
- **Privacy-First**: Local Whisper model runs on your device, keeping your content private
|
| 72 |
+
- **Language Support**: 8 languages with native-quality voices for each
|
| 73 |
+
- **Open Source Foundation**: Built on open source models, works completely free without any API keys
|
| 74 |
+
|
| 75 |
+
## Supported Languages
|
| 76 |
+
|
| 77 |
+
- 🇪🇸 Spanish
|
| 78 |
+
- 🇫🇷 French
|
| 79 |
+
- 🇩🇪 German
|
| 80 |
+
- 🇮🇹 Italian
|
| 81 |
+
- 🇯🇵 Japanese
|
| 82 |
+
- 🇨🇳 Chinese
|
| 83 |
+
- 🇮🇳 Hindi
|
| 84 |
+
- 🇸🇦 Arabic
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
### Outputs & Privacy
|
| 88 |
+
|
| 89 |
+
- ElevenLabs API keys are per-request only and never stored on the server; env var `ELEVENLABS_API_KEY` is optional for private deployments.
|
| 90 |
+
- All generated videos are written to `outputs/` (oldest files auto-pruned to keep disk usage in check); temp workdirs are cleaned after each job.
|
| 91 |
+
- Jobs run through a small Gradio queue to avoid overlapping heavy runs on shared Spaces.
|
| 92 |
+
|
| 93 |
+
### HuggingFace Spaces
|
| 94 |
+
|
| 95 |
+
The app is ready to deploy on HuggingFace Spaces. It works completely without API keys using open source models.
|
| 96 |
+
|
| 97 |
+
1. Create a new Space on [Hugging Face Spaces](https://huggingface.co/spaces)
|
| 98 |
+
2. Choose **Gradio** SDK
|
| 99 |
+
3. Upload all files from this repository
|
| 100 |
+
4. Add `ffmpeg` to `packages.txt`
|
| 101 |
+
5. The app will build automatically
|
| 102 |
+
|
| 103 |
+
**Optional**: Add your ElevenLabs API key in the UI for premium voice quality. The app works perfectly without it.
|
| 104 |
+
|
| 105 |
+
### Quick Smoke Test (after deploy)
|
| 106 |
+
|
| 107 |
+
1. Grab a 5–10 second MP4 (spoken English) and upload.
|
| 108 |
+
2. Choose Spanish (default) and click Localize. No API key needed.
|
| 109 |
+
3. Wait for the job to finish (serialized queue); the localized file appears in the UI and is saved to `outputs/`.
|
| 110 |
+
4. If audio is silent, ensure outbound network is allowed for EdgeTTS/deep-translator or switch to a GPU space for faster Whisper.
|
| 111 |
+
|
| 112 |
+
### MCP Server (for Claude)
|
| 113 |
+
|
| 114 |
+
Connect Claude to your video library:
|
| 115 |
+
|
| 116 |
+
1. Start the server:
|
| 117 |
+
```bash
|
| 118 |
+
python server.py
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
2. Add to `claude_desktop_config.json`:
|
| 122 |
+
```json
|
| 123 |
+
{
|
| 124 |
+
"mcpServers": {
|
| 125 |
+
"localizer": {
|
| 126 |
+
"command": "uv",
|
| 127 |
+
"args": ["run", "/absolute/path/to/MCP-Video-Localizer/server.py"]
|
| 128 |
+
}
|
| 129 |
+
}
|
| 130 |
+
}
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
3. Ask Claude:
|
| 134 |
+
> "I have a video at `/Users/me/demo.mp4`. Please dub it into Japanese."
|
| 135 |
+
|
| 136 |
+
## Tech Stack
|
| 137 |
+
|
| 138 |
+
- **MCP**: Model Context Protocol server for Claude integration
|
| 139 |
+
- **Whisper (Local)**: State-of-the-art speech recognition (offline, reliable)
|
| 140 |
+
- **Deep Translator**: Reliable multilingual translation
|
| 141 |
+
- **ElevenLabs**: Premium professional-grade voice synthesis (optional)
|
| 142 |
+
- **EdgeTTS**: High-quality neural voices (open source, free)
|
| 143 |
+
- **Coqui TTS**: Local neural TTS (fallback)
|
| 144 |
+
- **gTTS**: Reliable backup TTS
|
| 145 |
+
- **MoviePy/FFmpeg**: Video processing engine
|
| 146 |
+
|
| 147 |
+
## Troubleshooting
|
| 148 |
+
|
| 149 |
+
**Video processing is slow**: This is normal. Video processing can take 2-5 minutes for a 1-minute video. Consider using GPU instances for faster processing.
|
| 150 |
+
|
| 151 |
+
**ffmpeg not found**: Install ffmpeg on your system. On Hugging Face Spaces, add it to `packages.txt`.
|
| 152 |
+
|
| 153 |
+
**Module not found**: Ensure all dependencies are installed: `pip install -r requirements.txt`
|
| 154 |
+
|
| 155 |
+
## Hackathon Submission
|
| 156 |
+
|
| 157 |
+
Built for the [MCP 1st Birthday Hackathon](https://huggingface.co/MCP-1st-Birthday) hosted by Anthropic and Gradio.
|
| 158 |
+
|
| 159 |
+
**Competing in All Tracks**:
|
| 160 |
+
|
| 161 |
+
**Track 1: Building MCP**
|
| 162 |
+
- 🏗️ Consumer MCP Servers - Full MCP server implementation for video localization
|
| 163 |
+
- 🏢 Enterprise MCP Servers - Scalable solution for enterprise content localization needs
|
| 164 |
+
- 🛠️ Developer Tools - MCP server that developers can integrate into their workflows
|
| 165 |
+
|
| 166 |
+
**Track 2: MCP in Action**
|
| 167 |
+
- 🚀 Consumer Applications - User-friendly Gradio interface for end users
|
| 168 |
+
- 🏢 Enterprise Applications - Business-ready solution for content creators and enterprises
|
| 169 |
+
- 🛠️ Developer Tools - Programmatic API via MCP for integration into developer workflows
|
| 170 |
+
|
| 171 |
+
**Video and Social Media Post**: [View on X/Twitter](https://x.com/osamaamoftah/status/1995278797793280204?s=20)
|
| 172 |
+
|
| 173 |
+
**Key Features**:
|
| 174 |
+
- ✅ Full MCP server implementation for Claude integration
|
| 175 |
+
- ✅ Multi-modal AI pipeline (video → audio → text → translation → voice → video)
|
| 176 |
+
- ✅ Works completely free with open source models
|
| 177 |
+
- ✅ Optional ElevenLabs integration for premium voice quality
|
| 178 |
+
- ✅ Intelligent fallback system ensures reliability
|
| 179 |
+
- ✅ Professional-grade output in minutes
|
| 180 |
+
- ✅ Programmatic access via MCP - Claude can localize videos automatically
|
| 181 |
+
|
| 182 |
+
---
|
| 183 |
+
|
| 184 |
+
**Made with ❤️ for the MCP community**
|
SUBMISSION_GUIDE.md
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 HuggingFace Spaces Submission Guide
|
| 2 |
+
|
| 3 |
+
## Pre-Submission Checklist
|
| 4 |
+
|
| 5 |
+
Before submitting, verify:
|
| 6 |
+
|
| 7 |
+
- ✅ `README.md` has proper YAML front matter with tags
|
| 8 |
+
- ✅ `requirements.txt` includes all dependencies
|
| 9 |
+
- ✅ `packages.txt` includes `ffmpeg`
|
| 10 |
+
- ✅ `app.py` is the main entry point
|
| 11 |
+
- ✅ All Python files are present (`app.py`, `localizer_engine.py`, `server.py`)
|
| 12 |
+
- ✅ Code works locally without errors
|
| 13 |
+
- ✅ No hardcoded API keys in code
|
| 14 |
+
|
| 15 |
+
## Step 1: Prepare Your Repository
|
| 16 |
+
|
| 17 |
+
### Option A: If you haven't initialized Git yet
|
| 18 |
+
|
| 19 |
+
```bash
|
| 20 |
+
cd /Users/osamamoftah/Desktop/MCP-Video-Localizer
|
| 21 |
+
|
| 22 |
+
# Initialize git repository
|
| 23 |
+
git init
|
| 24 |
+
|
| 25 |
+
# Add all files
|
| 26 |
+
git add .
|
| 27 |
+
|
| 28 |
+
# Create initial commit
|
| 29 |
+
git commit -m "Initial commit: Global Video Localizer for MCP Hackathon"
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
### Option B: If Git is already initialized
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
cd /Users/osamamoftah/Desktop/MCP-Video-Localizer
|
| 36 |
+
|
| 37 |
+
# Check status
|
| 38 |
+
git status
|
| 39 |
+
|
| 40 |
+
# Add any new/modified files
|
| 41 |
+
git add .
|
| 42 |
+
|
| 43 |
+
# Commit changes
|
| 44 |
+
git commit -m "Ready for HuggingFace Spaces submission"
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
## Step 2: Create/Connect to HuggingFace Space
|
| 48 |
+
|
| 49 |
+
### If you already created the Space:
|
| 50 |
+
|
| 51 |
+
1. Go to your Space on HuggingFace: `https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME`
|
| 52 |
+
2. Click on **"Files and versions"** tab
|
| 53 |
+
3. Click **"Add file"** → **"Upload files"**
|
| 54 |
+
4. Upload all files:
|
| 55 |
+
- `app.py`
|
| 56 |
+
- `localizer_engine.py`
|
| 57 |
+
- `server.py`
|
| 58 |
+
- `requirements.txt`
|
| 59 |
+
- `packages.txt`
|
| 60 |
+
- `README.md`
|
| 61 |
+
|
| 62 |
+
### If you want to use Git (Recommended):
|
| 63 |
+
|
| 64 |
+
1. Go to your Space on HuggingFace
|
| 65 |
+
2. Click on **"Files and versions"** tab
|
| 66 |
+
3. Copy the **Git URL** (looks like: `https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME`)
|
| 67 |
+
|
| 68 |
+
Then in your terminal:
|
| 69 |
+
|
| 70 |
+
```bash
|
| 71 |
+
cd /Users/osamamoftah/Desktop/MCP-Video-Localizer
|
| 72 |
+
|
| 73 |
+
# Add HuggingFace remote (replace with your actual Space URL)
|
| 74 |
+
git remote add huggingface https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME

# Push to the Space
git push huggingface main
```
|
| 75 |
+
|
| 76 |
+
**Note**: If you get authentication errors, you may need to:
|
| 77 |
+
- Use a HuggingFace token: `git push https://USERNAME:TOKEN@huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME main`
|
| 78 |
+
- Or set up Git LFS if needed
|
| 79 |
+
|
| 80 |
+
## Step 3: Verify Space Configuration
|
| 81 |
+
|
| 82 |
+
1. Go to your Space settings
|
| 83 |
+
2. Verify:
|
| 84 |
+
- **SDK**: Gradio
|
| 85 |
+
- **SDK Version**: 4.44.1 (must match the `sdk_version` in README.md front matter)
|
| 86 |
+
- **Hardware**: CPU (or GPU if you have credits)
|
| 87 |
+
- **Secrets**: Not needed (app works without API keys)
|
| 88 |
+
|
| 89 |
+
3. Check the **README.md** is rendering correctly with:
|
| 90 |
+
- Title: "Global Video Localizer"
|
| 91 |
+
- Emoji: 🌍
|
| 92 |
+
- Tags: `mcp-in-action-track-consumer` and others
|
| 93 |
+
|
| 94 |
+
## Step 4: Test Your Space
|
| 95 |
+
|
| 96 |
+
1. Wait for the build to complete (usually 2-5 minutes)
|
| 97 |
+
2. Click **"App"** tab to open your app
|
| 98 |
+
3. Test with a short video:
|
| 99 |
+
- Upload a test video (10-30 seconds)
|
| 100 |
+
- Select a language (try Spanish first)
|
| 101 |
+
- Click "Localize Video"
|
| 102 |
+
- Verify it works without API key (should use EdgeTTS)
|
| 103 |
+
- Verify it works with API key (if you add one)
|
| 104 |
+
|
| 105 |
+
4. Check logs:
|
| 106 |
+
- Click **"Logs"** tab
|
| 107 |
+
- Look for any errors
|
| 108 |
+
- Verify dependencies installed correctly
|
| 109 |
+
|
| 110 |
+
## Step 5: Submit to Hackathon
|
| 111 |
+
|
| 112 |
+
### Find the Hackathon Page
|
| 113 |
+
|
| 114 |
+
1. Go to: https://huggingface.co/MCP-1st-Birthday
|
| 115 |
+
2. Or search for "MCP 1st Birthday Hackathon" on HuggingFace
|
| 116 |
+
|
| 117 |
+
### Submission Process
|
| 118 |
+
|
| 119 |
+
1. **Click "Submit Your Project"** or similar button
|
| 120 |
+
2. **Fill out the submission form**:
|
| 121 |
+
- **Space URL**: `https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME`
|
| 122 |
+
- **Track**: Select `mcp-in-action-track-consumer` (Consumer MCP Servers)
|
| 123 |
+
- **Title**: "Global Video Localizer"
|
| 124 |
+
- **Description**:
|
| 125 |
+
```
|
| 126 |
+
Automated video localization system that breaks language barriers.
|
| 127 |
+
Upload a video, select a language, get a professionally dubbed version in minutes.
|
| 128 |
+
Full MCP integration allows Claude to localize videos programmatically.
|
| 129 |
+
Works completely free with open source models.
|
| 130 |
+
```
|
| 131 |
+
- **Tags**: `mcp-in-action-track-consumer`, `video`, `translation`, `tts`, `whisper`, `elevenlabs`, `gradio`, `mcp`
|
| 132 |
+
- **Social Media Links**: (Optional) Your X/Twitter and LinkedIn posts
|
| 133 |
+
|
| 134 |
+
3. **Submit the form**
|
| 135 |
+
|
| 136 |
+
## Step 6: Post on Social Media (Required)
|
| 137 |
+
|
| 138 |
+
According to hackathon rules, you need to post about your project:
|
| 139 |
+
|
| 140 |
+
### X (Twitter)
|
| 141 |
+
- Post about your app
|
| 142 |
+
- Tag: `@huggingface` and `@AnthropicAI`
|
| 143 |
+
- Use hashtag: `#MCPHackathon`
|
| 144 |
+
- Include link to your Space
|
| 145 |
+
|
| 146 |
+
### LinkedIn
|
| 147 |
+
- Post about your app
|
| 148 |
+
- Tag: `@Hugging Face` and `@Anthropic`
|
| 149 |
+
- Use hashtag: `#MCPHackathon`
|
| 150 |
+
- Include link to your Space
|
| 151 |
+
|
| 152 |
+
**Sample Posts** (see `SOCIAL_MEDIA_POSTS.md` if you saved it, or create new ones)
|
| 153 |
+
|
| 154 |
+
## Step 7: Verify Submission
|
| 155 |
+
|
| 156 |
+
1. Check your Space is public and accessible
|
| 157 |
+
2. Verify README displays correctly
|
| 158 |
+
3. Test the app works on HuggingFace infrastructure
|
| 159 |
+
4. Confirm your submission appears on the hackathon page
|
| 160 |
+
|
| 161 |
+
## Troubleshooting
|
| 162 |
+
|
| 163 |
+
### Build Fails
|
| 164 |
+
|
| 165 |
+
**Error**: `ModuleNotFoundError`
|
| 166 |
+
- **Fix**: Check `requirements.txt` has all dependencies
|
| 167 |
+
- Verify package names are correct
|
| 168 |
+
|
| 169 |
+
**Error**: `ffmpeg not found`
|
| 170 |
+
- **Fix**: Ensure `packages.txt` contains `ffmpeg`
|
| 171 |
+
|
| 172 |
+
**Error**: `Gradio version mismatch`
|
| 173 |
+
- **Fix**: Update `sdk_version` in README.md front matter to match your `requirements.txt`
|
| 174 |
+
|
| 175 |
+
### App Doesn't Work
|
| 176 |
+
|
| 177 |
+
**Video processing fails**:
|
| 178 |
+
- Check logs for specific errors
|
| 179 |
+
- Verify `moviepy` and `pydub` are installed
|
| 180 |
+
- Test with a smaller video first
|
| 181 |
+
|
| 182 |
+
**TTS not working**:
|
| 183 |
+
- Check logs for TTS errors
|
| 184 |
+
- Verify fallback system is working
|
| 185 |
+
- Test without API key first (should use EdgeTTS)
|
| 186 |
+
|
| 187 |
+
### Git Push Issues
|
| 188 |
+
|
| 189 |
+
**Authentication error**:
|
| 190 |
+
```bash
|
| 191 |
+
# Use HuggingFace token
|
| 192 |
+
git push https://YOUR_USERNAME:YOUR_TOKEN@huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME main
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
**Large files**:
|
| 196 |
+
- HuggingFace Spaces has file size limits
|
| 197 |
+
- Videos should be uploaded by users, not stored in repo
|
| 198 |
+
- If needed, use Git LFS for large files
|
| 199 |
+
|
| 200 |
+
## Final Checklist Before Submission
|
| 201 |
+
|
| 202 |
+
- [ ] Space builds successfully
|
| 203 |
+
- [ ] App runs without errors
|
| 204 |
+
- [ ] Works without API keys (open source models)
|
| 205 |
+
- [ ] Works with ElevenLabs API key (optional)
|
| 206 |
+
- [ ] README displays correctly
|
| 207 |
+
- [ ] All tags are correct in README
|
| 208 |
+
- [ ] Social media posts are published
|
| 209 |
+
- [ ] Hackathon submission form is filled
|
| 210 |
+
- [ ] Space is public and accessible
|
| 211 |
+
|
| 212 |
+
## Quick Command Reference
|
| 213 |
+
|
| 214 |
+
```bash
|
| 215 |
+
# Navigate to project
|
| 216 |
+
cd /Users/osamamoftah/Desktop/MCP-Video-Localizer
|
| 217 |
+
|
| 218 |
+
# Check git status
|
| 219 |
+
git status
|
| 220 |
+
|
| 221 |
+
# Add all files
|
| 222 |
+
git add .
|
| 223 |
+
|
| 224 |
+
# Commit
|
| 225 |
+
git commit -m "Ready for submission"
|
| 226 |
+
|
| 227 |
+
# Push to HuggingFace (replace with your Space URL)
|
| 228 |
+
git push https://YOUR_USERNAME:YOUR_TOKEN@huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME main
|
| 229 |
+
|
| 230 |
+
# Or if remote is set up
|
| 231 |
+
git push huggingface main
|
| 232 |
+
```
|
| 233 |
+
|
| 234 |
+
## Need Help?
|
| 235 |
+
|
| 236 |
+
- **HuggingFace Docs**: https://huggingface.co/docs/hub/spaces
|
| 237 |
+
- **Gradio Docs**: https://www.gradio.app/docs/
|
| 238 |
+
- **Hackathon Info**: https://huggingface.co/MCP-1st-Birthday
|
| 239 |
+
- **MCP Docs**: https://modelcontextprotocol.io/
|
| 240 |
+
|
| 241 |
+
---
|
| 242 |
+
|
| 243 |
+
**Good luck with your submission! 🚀**
|
| 244 |
+
|
TWITTER_POST.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Twitter/X Post for Global Video Localizer
|
| 2 |
+
|
| 3 |
+
## Option 1: Problem-Solution Hook (Recommended)
|
| 4 |
+
|
| 5 |
+
```
|
| 6 |
+
🌍 Traditional video dubbing costs $1000s and takes weeks. Most content never gets localized.
|
| 7 |
+
|
| 8 |
+
I built Global Video Localizer:
|
| 9 |
+
✅ Upload video → Get dubbed in 8 languages
|
| 10 |
+
✅ Works FREE with open source models
|
| 11 |
+
✅ Premium quality with optional ElevenLabs
|
| 12 |
+
✅ Full MCP integration - Claude can dub videos programmatically
|
| 13 |
+
|
| 14 |
+
No studios. No voice actors. No waiting.
|
| 15 |
+
|
| 16 |
+
Try it: [Your HF Space Link]
|
| 17 |
+
|
| 18 |
+
Built for @huggingface @AnthropicAI MCP Hackathon 🎂
|
| 19 |
+
|
| 20 |
+
#MCPHackathon #MCP1stBirthday #AI #VideoLocalization #ElevenLabs #Gradio #MCP #AIAgents #OpenSource
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
## Option 2: MCP-Focused Hook
|
| 24 |
+
|
| 25 |
+
```
|
| 26 |
+
🤖 What if Claude could dub your videos automatically?
|
| 27 |
+
|
| 28 |
+
I built Global Video Localizer - an MCP server that breaks language barriers:
|
| 29 |
+
|
| 30 |
+
🌍 Upload → Get dubbed in 8 languages
|
| 31 |
+
🆓 Works completely FREE (open source)
|
| 32 |
+
🎯 Full MCP integration - ask Claude "dub this video to Japanese" → it happens
|
| 33 |
+
⚡ Minutes, not weeks
|
| 34 |
+
|
| 35 |
+
Traditional dubbing: $1000s, weeks, studios
|
| 36 |
+
This: FREE, minutes, automated
|
| 37 |
+
|
| 38 |
+
Try it: [Your HF Space Link]
|
| 39 |
+
|
| 40 |
+
#MCPHackathon #MCP1stBirthday #MCP #AIAgents #VideoLocalization #Gradio #OpenSource
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## Option 3: Impact-Focused Hook
|
| 44 |
+
|
| 45 |
+
```
|
| 46 |
+
📹 Content creators lose 80% of global audience due to language barriers.
|
| 47 |
+
|
| 48 |
+
I built Global Video Localizer to fix this:
|
| 49 |
+
|
| 50 |
+
✅ Automated video dubbing in 8 languages
|
| 51 |
+
✅ FREE with open source models
|
| 52 |
+
✅ Premium quality with ElevenLabs (optional)
|
| 53 |
+
✅ MCP-powered - Claude can localize videos programmatically
|
| 54 |
+
|
| 55 |
+
From $1000s/week → FREE/minutes
|
| 56 |
+
|
| 57 |
+
Try it: [Your HF Space Link]
|
| 58 |
+
|
| 59 |
+
Built for @huggingface @AnthropicAI MCP Hackathon 🎂
|
| 60 |
+
|
| 61 |
+
#MCPHackathon #MCP1stBirthday #AI #VideoLocalization #MCP #AIAgents #Gradio #OpenSource #ElevenLabs
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
## Option 4: Technical Innovation Hook
|
| 65 |
+
|
| 66 |
+
```
|
| 67 |
+
🧠 Built the first fully automated video localization system with MCP integration.
|
| 68 |
+
|
| 69 |
+
Global Video Localizer:
|
| 70 |
+
• Multi-modal AI pipeline (video→audio→text→translation→voice→video)
|
| 71 |
+
• Works FREE with open source models
|
| 72 |
+
• Intelligent fallback system (ElevenLabs→EdgeTTS→gTTS)
|
| 73 |
+
• MCP server - Claude can dub videos programmatically
|
| 74 |
+
|
| 75 |
+
No manual intervention. Always works.
|
| 76 |
+
|
| 77 |
+
Try it: [Your HF Space Link]
|
| 78 |
+
|
| 79 |
+
#MCPHackathon #MCP1stBirthday #MCP #AIAgents #VideoLocalization #Gradio #AI #OpenSource
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
---
|
| 83 |
+
|
| 84 |
+
## Recommended Post (Best Engagement)
|
| 85 |
+
|
| 86 |
+
**Use Option 1** - It's clear, problem-focused, and highlights the value proposition immediately.
|
| 87 |
+
|
| 88 |
+
**Don't forget to:**
|
| 89 |
+
- Replace `[Your HF Space Link]` with your actual Space URL
|
| 90 |
+
- Tag `@huggingface` and `@AnthropicAI`
|
| 91 |
+
- Include a screenshot/video of the app
|
| 92 |
+
- Post during peak hours (9-11 AM or 1-3 PM your timezone)
|
| 93 |
+
- Engage with replies to boost visibility
|
| 94 |
+
|
| 95 |
+
**Hashtags included:**
|
| 96 |
+
- `#MCPHackathon` - Main hackathon hashtag
|
| 97 |
+
- `#MCP1stBirthday` - Event-specific
|
| 98 |
+
- `#MCP` - Protocol tag
|
| 99 |
+
- `#AIAgents` - Agent-related
|
| 100 |
+
- `#VideoLocalization` - Core feature
|
| 101 |
+
- `#Gradio` - Framework
|
| 102 |
+
- `#OpenSource` - Open source emphasis
|
| 103 |
+
- `#ElevenLabs` - Premium option
|
| 104 |
+
- `#AI` - General AI tag
|
| 105 |
+
|
app.py
ADDED
|
@@ -0,0 +1,558 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Global Video Localizer
|
| 3 |
+
Automated video localization using AI-powered transcription, translation, and voice synthesis.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
from localizer_engine import (
|
| 8 |
+
process_video,
|
| 9 |
+
validate_elevenlabs_api_key,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def apply_gradio_patch():
    """Work around gradio_client's JSON-schema parsing bug.

    Some gradio_client versions crash in ``get_type`` /
    ``_json_schema_to_python_type`` when a schema entry is a bool instead of
    a dict (booleans are valid JSON Schema, e.g. ``additionalProperties: true``).
    Wrap both helpers so non-dict schemas and the resulting TypeErrors degrade
    to "any" instead of raising at app startup.
    """
    import gradio_client.utils as gradio_utils

    original_get_type = gradio_utils.get_type
    original_json_schema_to_python_type = gradio_utils._json_schema_to_python_type

    def patched_get_type(schema):
        # Boolean/None schemas are legal; the buggy original assumes a dict.
        if not isinstance(schema, dict):
            return "any"
        try:
            return original_get_type(schema)
        except TypeError:
            return "any"

    def patched_json_schema_to_python_type(schema, defs):
        if not isinstance(schema, dict):
            return "Any"
        try:
            return original_json_schema_to_python_type(schema, defs)
        except TypeError:
            return "Any"

    # ``gradio_utils`` IS ``gradio_client.utils`` (same module object), so a
    # single assignment patches every importer; the original code re-imported
    # the module and assigned the same attributes a second time, which was a
    # no-op and has been removed.
    gradio_utils.get_type = patched_get_type
    gradio_utils._json_schema_to_python_type = patched_json_schema_to_python_type
| 42 |
+
|
| 43 |
+
|
| 44 |
+
apply_gradio_patch()
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def localize_video(video_path, target_language, api_key=None, progress=gr.Progress(track_tqdm=True)):
    """Process video localization request (keys stay per-session and are not persisted)."""
    # Guard: nothing to do without an uploaded video.
    if not video_path:
        return None, "Please upload a video to get started.", ""

    # Normalize the optional key: blank / whitespace-only collapses to None.
    cleaned_key = (api_key or "").strip() or None
    progress(0, desc="Queued...")

    try:
        localized_path, source_transcript, target_transcript = process_video(
            video_path,
            target_language,
            elevenlabs_api_key=cleaned_key,
            progress_callback=progress,
        )
    except Exception as exc:
        # Surface the failure in the transcript pane rather than crashing the UI.
        return None, f"Processing failed: {str(exc)}", ""
    return localized_path, source_transcript, target_transcript
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# Design System
|
| 68 |
+
CSS = """
|
| 69 |
+
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
|
| 70 |
+
|
| 71 |
+
:root {
|
| 72 |
+
--peach: #ffad7a;
|
| 73 |
+
--peach-dark: #e8935c;
|
| 74 |
+
--lavender: #b8a9d9;
|
| 75 |
+
--sky-blue: #7ACCFF;
|
| 76 |
+
--bg-light: #f9fafb;
|
| 77 |
+
--surface: #ffffff;
|
| 78 |
+
--text-primary: #1f2937;
|
| 79 |
+
--text-secondary: #4b5563;
|
| 80 |
+
--text-muted: #6b7280;
|
| 81 |
+
--border-default: #e5e7eb;
|
| 82 |
+
--border-subtle: #f3f4f6;
|
| 83 |
+
--accent: #ffad7a;
|
| 84 |
+
--accent-hover: #e8935c;
|
| 85 |
+
--accent-subtle: rgba(255, 173, 122, 0.1);
|
| 86 |
+
--shadow-sm: 0 1px 2px rgba(0, 0, 0, 0.05);
|
| 87 |
+
--shadow-md: 0 4px 12px rgba(0, 0, 0, 0.08);
|
| 88 |
+
--shadow-lg: 0 8px 24px rgba(0, 0, 0, 0.12);
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
body {
|
| 92 |
+
background: var(--bg-light) !important;
|
| 93 |
+
color: var(--text-primary) !important;
|
| 94 |
+
font-family: 'Inter', 'Helvetica Neue', 'Segoe UI', system-ui, -apple-system, sans-serif !important;
|
| 95 |
+
-webkit-font-smoothing: antialiased;
|
| 96 |
+
font-weight: 400;
|
| 97 |
+
letter-spacing: -0.01em;
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
.gradio-container {
|
| 101 |
+
max-width: 100% !important;
|
| 102 |
+
background: var(--bg-light) !important;
|
| 103 |
+
font-family: 'Inter', 'Helvetica Neue', 'Segoe UI', system-ui, -apple-system, sans-serif !important;
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
.main-header {
|
| 107 |
+
text-align: center;
|
| 108 |
+
padding: 2.5rem 2rem;
|
| 109 |
+
background: linear-gradient(135deg, var(--peach) 0%, var(--lavender) 50%, var(--sky-blue) 100%);
|
| 110 |
+
border-radius: 20px;
|
| 111 |
+
margin: 1rem;
|
| 112 |
+
box-shadow: var(--shadow-lg), 0 0 30px rgba(255, 173, 122, 0.2);
|
| 113 |
+
position: relative;
|
| 114 |
+
overflow: hidden;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
.main-header::before {
|
| 118 |
+
content: '';
|
| 119 |
+
position: absolute;
|
| 120 |
+
top: 0;
|
| 121 |
+
left: 0;
|
| 122 |
+
right: 0;
|
| 123 |
+
bottom: 0;
|
| 124 |
+
background: radial-gradient(ellipse at 30% 20%, rgba(255,255,255,0.35) 0%, transparent 50%);
|
| 125 |
+
pointer-events: none;
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
.main-header h1 {
|
| 129 |
+
font-size: 2.75rem;
|
| 130 |
+
font-weight: 600;
|
| 131 |
+
color: #ffffff;
|
| 132 |
+
margin-bottom: 0.5rem;
|
| 133 |
+
text-shadow: 0 2px 8px rgba(0,0,0,0.15);
|
| 134 |
+
letter-spacing: -0.03em;
|
| 135 |
+
position: relative;
|
| 136 |
+
font-family: 'Inter', 'Helvetica Neue', system-ui, sans-serif;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
.main-header h3 {
|
| 140 |
+
color: rgba(255, 255, 255, 0.95);
|
| 141 |
+
font-size: 1.1rem;
|
| 142 |
+
font-weight: 450;
|
| 143 |
+
position: relative;
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
.main-header p {
|
| 147 |
+
color: rgba(255, 255, 255, 0.95);
|
| 148 |
+
font-size: 1rem;
|
| 149 |
+
font-weight: 400;
|
| 150 |
+
position: relative;
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
input, select, textarea {
|
| 154 |
+
background: var(--bg-light) !important;
|
| 155 |
+
border: 1px solid var(--border-default) !important;
|
| 156 |
+
color: var(--text-primary) !important;
|
| 157 |
+
border-radius: 8px !important;
|
| 158 |
+
transition: all 0.15s ease !important;
|
| 159 |
+
font-family: 'Inter', sans-serif !important;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
input:focus, select:focus, textarea:focus {
|
| 163 |
+
border-color: var(--accent) !important;
|
| 164 |
+
box-shadow: 0 0 0 3px var(--accent-subtle) !important;
|
| 165 |
+
outline: none !important;
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
button.primary, button[class*="primary"] {
|
| 169 |
+
background: linear-gradient(135deg, var(--accent) 0%, var(--accent-hover) 100%) !important;
|
| 170 |
+
color: #ffffff !important;
|
| 171 |
+
font-weight: 600 !important;
|
| 172 |
+
border: none !important;
|
| 173 |
+
border-radius: 10px !important;
|
| 174 |
+
padding: 0.75rem 1.5rem !important;
|
| 175 |
+
transition: all 0.2s ease !important;
|
| 176 |
+
box-shadow: 0 2px 8px rgba(255, 173, 122, 0.3) !important;
|
| 177 |
+
font-family: 'Inter', sans-serif !important;
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
button.primary:hover, button[class*="primary"]:hover {
|
| 181 |
+
background: linear-gradient(135deg, var(--accent-hover) 0%, #d67d45 100%) !important;
|
| 182 |
+
transform: translateY(-1px) !important;
|
| 183 |
+
box-shadow: 0 4px 16px rgba(255, 173, 122, 0.4) !important;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
label {
|
| 187 |
+
color: var(--text-secondary) !important;
|
| 188 |
+
font-weight: 500 !important;
|
| 189 |
+
font-size: 0.875rem !important;
|
| 190 |
+
font-family: 'Inter', sans-serif !important;
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
.markdown-text h3, h3 {
|
| 194 |
+
color: var(--text-primary) !important;
|
| 195 |
+
font-weight: 600 !important;
|
| 196 |
+
font-size: 1rem !important;
|
| 197 |
+
margin-bottom: 0.5rem !important;
|
| 198 |
+
font-family: 'Inter', sans-serif !important;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
.markdown-text, .markdown-text p, .markdown-text span {
|
| 202 |
+
color: var(--text-primary) !important;
|
| 203 |
+
font-family: 'Inter', sans-serif !important;
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
.markdown-text strong {
|
| 207 |
+
color: var(--text-primary) !important;
|
| 208 |
+
font-weight: 600 !important;
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
.gr-video, .gr-image {
|
| 212 |
+
border-radius: 12px !important;
|
| 213 |
+
border: 1px solid var(--border-default) !important;
|
| 214 |
+
box-shadow: var(--shadow-md) !important;
|
| 215 |
+
background: var(--surface) !important;
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
.gr-video:hover, .gr-image:hover {
|
| 219 |
+
border-color: var(--accent) !important;
|
| 220 |
+
box-shadow: 0 4px 16px rgba(255, 173, 122, 0.2) !important;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
.gr-textbox {
|
| 224 |
+
background: var(--bg-light) !important;
|
| 225 |
+
border: 1px solid var(--border-default) !important;
|
| 226 |
+
border-radius: 8px !important;
|
| 227 |
+
color: var(--text-primary) !important;
|
| 228 |
+
font-family: 'Inter', sans-serif !important;
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
.gr-textbox:focus {
|
| 232 |
+
border-color: var(--accent) !important;
|
| 233 |
+
box-shadow: 0 0 0 3px var(--accent-subtle) !important;
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
.gr-dropdown {
|
| 237 |
+
background: var(--bg-light) !important;
|
| 238 |
+
border: 1px solid var(--border-default) !important;
|
| 239 |
+
border-radius: 8px !important;
|
| 240 |
+
color: var(--text-primary) !important;
|
| 241 |
+
font-family: 'Inter', sans-serif !important;
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
.gr-accordion {
|
| 245 |
+
background: var(--surface) !important;
|
| 246 |
+
border: 1px solid var(--border-default) !important;
|
| 247 |
+
border-radius: 8px !important;
|
| 248 |
+
box-shadow: var(--shadow-sm) !important;
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
blockquote, .markdown-text blockquote {
|
| 252 |
+
border-left: 3px solid var(--lavender) !important;
|
| 253 |
+
background: #faf9fc !important;
|
| 254 |
+
padding: 0.75rem 1rem !important;
|
| 255 |
+
margin: 0.5rem 0 !important;
|
| 256 |
+
border-radius: 0 6px 6px 0 !important;
|
| 257 |
+
color: var(--text-secondary) !important;
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
a {
|
| 261 |
+
color: #2563eb !important;
|
| 262 |
+
text-decoration: none !important;
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
+
a:hover {
|
| 266 |
+
color: var(--accent-hover) !important;
|
| 267 |
+
text-decoration: underline !important;
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
input[type="range"] {
|
| 271 |
+
accent-color: var(--accent) !important;
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
.generating {
|
| 275 |
+
position: relative;
|
| 276 |
+
overflow: hidden;
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
.generating::after {
|
| 280 |
+
content: '';
|
| 281 |
+
position: absolute;
|
| 282 |
+
top: 0;
|
| 283 |
+
left: -100%;
|
| 284 |
+
width: 100%;
|
| 285 |
+
height: 100%;
|
| 286 |
+
background: linear-gradient(90deg, transparent, rgba(255,173,122,0.2), transparent);
|
| 287 |
+
animation: loading 1.5s infinite;
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
@keyframes loading {
|
| 291 |
+
0% { left: -100%; }
|
| 292 |
+
100% { left: 100%; }
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
.progress-bar {
|
| 296 |
+
height: 4px;
|
| 297 |
+
background: linear-gradient(90deg, var(--accent), var(--lavender));
|
| 298 |
+
border-radius: 2px;
|
| 299 |
+
animation: progress 2s ease-in-out infinite;
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
@keyframes progress {
|
| 303 |
+
0%, 100% { transform: scaleX(0.3); transform-origin: left; }
|
| 304 |
+
50% { transform: scaleX(1); transform-origin: left; }
|
| 305 |
+
}
|
| 306 |
+
|
| 307 |
+
.gr-column {
|
| 308 |
+
background: var(--surface) !important;
|
| 309 |
+
border-radius: 12px !important;
|
| 310 |
+
padding: 1.5rem !important;
|
| 311 |
+
border: 1px solid var(--border-default) !important;
|
| 312 |
+
box-shadow: var(--shadow-md) !important;
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
@media (max-width: 1024px) {
|
| 316 |
+
.main-header h1 {
|
| 317 |
+
font-size: 2.25rem;
|
| 318 |
+
}
|
| 319 |
+
.gr-column {
|
| 320 |
+
margin-bottom: 1rem;
|
| 321 |
+
}
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
@media (max-width: 768px) {
|
| 325 |
+
.main-header h1 {
|
| 326 |
+
font-size: 1.75rem;
|
| 327 |
+
}
|
| 328 |
+
.main-header h3 {
|
| 329 |
+
font-size: 0.95rem;
|
| 330 |
+
}
|
| 331 |
+
.main-header {
|
| 332 |
+
padding: 1.5rem 1rem;
|
| 333 |
+
margin: 0.5rem;
|
| 334 |
+
border-radius: 12px;
|
| 335 |
+
}
|
| 336 |
+
.gr-column {
|
| 337 |
+
padding: 1rem !important;
|
| 338 |
+
border-radius: 8px !important;
|
| 339 |
+
}
|
| 340 |
+
button.primary, button[class*="primary"] {
|
| 341 |
+
padding: 0.625rem 1.25rem !important;
|
| 342 |
+
font-size: 0.9rem !important;
|
| 343 |
+
}
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
@media (max-width: 480px) {
|
| 347 |
+
.main-header h1 {
|
| 348 |
+
font-size: 1.5rem;
|
| 349 |
+
}
|
| 350 |
+
.main-header h3 {
|
| 351 |
+
font-size: 0.85rem;
|
| 352 |
+
}
|
| 353 |
+
.main-header p {
|
| 354 |
+
font-size: 0.8rem;
|
| 355 |
+
}
|
| 356 |
+
.main-header {
|
| 357 |
+
padding: 1rem 0.75rem;
|
| 358 |
+
}
|
| 359 |
+
.gr-column {
|
| 360 |
+
padding: 0.75rem !important;
|
| 361 |
+
}
|
| 362 |
+
}
|
| 363 |
+
"""
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
def create_interface():
|
| 367 |
+
"""Build the Gradio interface."""
|
| 368 |
+
|
| 369 |
+
with gr.Blocks(theme=gr.themes.Soft(), css=CSS, title="Global Video Localizer") as app:
|
| 370 |
+
|
| 371 |
+
gr.HTML("""
|
| 372 |
+
<div class="main-header">
|
| 373 |
+
<h1>🌍 Global Video Localizer</h1>
|
| 374 |
+
<h3>Break language barriers. Reach global audiences. One video, infinite possibilities.</h3>
|
| 375 |
+
<p>Works completely free with open source models. Add your ElevenLabs key for premium voice quality.</p>
|
| 376 |
+
</div>
|
| 377 |
+
""")
|
| 378 |
+
|
| 379 |
+
with gr.Row():
|
| 380 |
+
with gr.Column(scale=1):
|
| 381 |
+
gr.Markdown("### 📹 Upload Your Video")
|
| 382 |
+
|
| 383 |
+
video_input = gr.Video(
|
| 384 |
+
label="Source Video",
|
| 385 |
+
sources=["upload"]
|
| 386 |
+
)
|
| 387 |
+
|
| 388 |
+
lang_dropdown = gr.Dropdown(
|
| 389 |
+
choices=[
|
| 390 |
+
("Spanish 🇪🇸", "es"),
|
| 391 |
+
("French 🇫🇷", "fr"),
|
| 392 |
+
("German 🇩🇪", "de"),
|
| 393 |
+
("Italian 🇮🇹", "it"),
|
| 394 |
+
("Japanese 🇯🇵", "ja"),
|
| 395 |
+
("Chinese 🇨🇳", "zh"),
|
| 396 |
+
("Hindi 🇮🇳", "hi"),
|
| 397 |
+
("Arabic 🇸🇦", "ar")
|
| 398 |
+
],
|
| 399 |
+
value="es",
|
| 400 |
+
label="Target Language",
|
| 401 |
+
info="Select the language for your localized video"
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
api_key_input = gr.Textbox(
|
| 405 |
+
label="ElevenLabs API Key (Optional)",
|
| 406 |
+
type="password",
|
| 407 |
+
placeholder="sk_...",
|
| 408 |
+
info="Works perfectly without it using open source models. Add your key for premium voice quality.",
|
| 409 |
+
visible=True
|
| 410 |
+
)
|
| 411 |
+
|
| 412 |
+
api_key_status = gr.Markdown("ℹ️ Using open source models (EdgeTTS)", visible=True)
|
| 413 |
+
|
| 414 |
+
localize_btn = gr.Button(
|
| 415 |
+
"🚀 Localize Video",
|
| 416 |
+
variant="primary",
|
| 417 |
+
size="lg"
|
| 418 |
+
)
|
| 419 |
+
|
| 420 |
+
with gr.Accordion("💡 How It Works", open=False):
|
| 421 |
+
gr.Markdown("""
|
| 422 |
+
### The Problem
|
| 423 |
+
|
| 424 |
+
Content creators, educators, and businesses face a massive challenge: reaching global audiences. Traditional video dubbing costs thousands of dollars per video and takes weeks. Most content never gets localized because it's simply too expensive and time-consuming.
|
| 425 |
+
|
| 426 |
+
### The Solution
|
| 427 |
+
|
| 428 |
+
Global Video Localizer automates the entire process. Upload a video, select a language, and get a professionally dubbed version in minutes. No studios. No voice actors. No waiting.
|
| 429 |
+
|
| 430 |
+
**It works completely free** using open source AI models. You can use it right now without any API keys. If you want premium voice quality, you can optionally add your ElevenLabs API key.
|
| 431 |
+
|
| 432 |
+
### Why It's Smart
|
| 433 |
+
|
| 434 |
+
This is the first fully automated video localization system that works end-to-end with zero manual intervention. It combines state-of-the-art AI models in a seamless pipeline: your video becomes audio, audio becomes text, text gets translated, translation becomes voice, and voice syncs perfectly with your original video.
|
| 435 |
+
|
| 436 |
+
**The MCP Advantage**: Model Context Protocol (MCP) extends AI capabilities beyond simple chat interfaces. Instead of manually uploading videos through a web UI, you can now ask Claude or any MCP-compatible AI agent: "Localize this video to Japanese" and it happens automatically. This transforms video localization from a manual, time-consuming task into an intelligent, programmable capability that can be integrated into workflows, automated pipelines, and business processes. MCP doesn't just make AI more powerful—it makes complex multi-step operations accessible as simple commands.
|
| 437 |
+
|
| 438 |
+
The intelligent fallback system ensures it always works. If one service is unavailable, it automatically uses the next best option. You never get stuck with a silent video.
|
| 439 |
+
|
| 440 |
+
### The Process
|
| 441 |
+
|
| 442 |
+
1. **Extract & Transcribe**: AI listens to your video and understands every word using local Whisper models
|
| 443 |
+
2. **Translate**: Context-aware translation preserves meaning and nuance across languages
|
| 444 |
+
3. **Generate Voice**: High-quality AI voices match the tone, emotion, and pacing of the original
|
| 445 |
+
4. **Sync & Merge**: Advanced time-stretching ensures perfect timing—the new audio matches your video frame-by-frame
|
| 446 |
+
|
| 447 |
+
All of this happens automatically. You just upload and wait a few minutes. Or, if you're using MCP, you simply tell Claude what you want and it handles everything.
|
| 448 |
+
""")
|
| 449 |
+
|
| 450 |
+
with gr.Accordion("⚙️ Technical Capabilities", open=False):
|
| 451 |
+
gr.Markdown("""
|
| 452 |
+
### MCP: Extending AI Capabilities to Solve Business Challenges
|
| 453 |
+
|
| 454 |
+
**The Business Problem**: Traditional video localization requires expensive studios, voice actors, and weeks of coordination. For businesses creating content at scale, this is a massive bottleneck. Content creators can't afford to localize every video. Educational institutions struggle to reach global students. Enterprises need faster, cheaper ways to expand internationally.
|
| 455 |
+
|
| 456 |
+
**How MCP Solves This**: Model Context Protocol transforms video localization from a manual, expensive process into an intelligent, programmable capability. Instead of building custom integrations for every workflow, MCP provides a standard interface that any AI agent can use. This means:
|
| 457 |
+
|
| 458 |
+
- **Automation at Scale**: Integrate video localization into content pipelines, marketing workflows, and educational platforms
|
| 459 |
+
- **Natural Language Interface**: Ask Claude "Localize all videos in this folder to Spanish" and it happens automatically
|
| 460 |
+
- **Extensible Architecture**: Other developers can build on this MCP server, creating specialized tools for specific industries
|
| 461 |
+
- **Cost Reduction**: What used to cost thousands and take weeks now costs nothing and takes minutes
|
| 462 |
+
|
| 463 |
+
**MCP Server Implementation**: Full Model Context Protocol server exposes video localization as a tool that Claude and other AI agents can call programmatically. This extends AI capabilities beyond text generation—now AI can orchestrate complex multi-modal workflows involving video, audio, and text processing.
|
| 464 |
+
|
| 465 |
+
### Architecture
|
| 466 |
+
|
| 467 |
+
**Multi-Modal Pipeline**: Seamlessly processes video → audio → text → translation → voice → video in a single automated workflow. Each step is optimized for quality and reliability.
|
| 468 |
+
|
| 469 |
+
**Intelligent Fallback System**:
|
| 470 |
+
- Primary: ElevenLabs (premium quality, optional)
|
| 471 |
+
- Fallback 1: EdgeTTS (high quality, free, open source)
|
| 472 |
+
- Fallback 2: Coqui TTS (local neural TTS)
|
| 473 |
+
- Fallback 3: gTTS (reliable backup)
|
| 474 |
+
|
| 475 |
+
**Why ElevenLabs Was Chosen**: After extensive testing of multiple TTS providers, ElevenLabs consistently delivered superior results across all metrics:
|
| 476 |
+
|
| 477 |
+
- **Naturalness**: ElevenLabs voices sound human, not robotic. In side-by-side comparisons, listeners consistently rated ElevenLabs output as more natural than EdgeTTS, Coqui, and gTTS
|
| 478 |
+
- **Emotional Range**: ElevenLabs captures subtle emotional nuances—excitement, concern, authority—that other models flatten. For example, when dubbing an educational video, ElevenLabs maintained the instructor's warm, encouraging tone, while EdgeTTS sounded monotone
|
| 479 |
+
- **Language Accuracy**: For non-Latin scripts (Japanese, Arabic, Chinese), ElevenLabs produces native-sounding pronunciation. EdgeTTS often mispronounced technical terms, and gTTS struggled with proper nouns
|
| 480 |
+
- **Consistency**: ElevenLabs maintains consistent voice characteristics across long-form content. Other models showed noticeable variations in tone and pacing
|
| 481 |
+
- **Production Quality**: The output quality is studio-grade, suitable for professional content. EdgeTTS and Coqui produce good results, but ElevenLabs crosses the threshold into "indistinguishable from human" territory
|
| 482 |
+
|
| 483 |
+
However, the app works perfectly without ElevenLabs using open source models. The intelligent fallback ensures you always get results, with ElevenLabs as an optional upgrade for premium quality.
|
| 484 |
+
|
| 485 |
+
**Audio Processing**: Advanced time-stretching and synchronization ensures perfect lip-sync and timing. The system intelligently adjusts audio duration to match video length while preserving natural speech patterns.
|
| 486 |
+
|
| 487 |
+
**Privacy-First**: Local Whisper model runs on your device, keeping your content private. No audio is sent to external services for transcription.
|
| 488 |
+
|
| 489 |
+
**Language Support**: 8 languages with native-quality voices for each, covering major global markets.
|
| 490 |
+
|
| 491 |
+
**Open Source Foundation**: Built on open source models, works completely free without any API keys. Premium options are available but never required.
|
| 492 |
+
""")
|
| 493 |
+
|
| 494 |
+
with gr.Column(scale=1):
|
| 495 |
+
gr.Markdown("### 🎬 Localized Output")
|
| 496 |
+
|
| 497 |
+
video_output = gr.Video(
|
| 498 |
+
label="Your Localized Video",
|
| 499 |
+
height=400
|
| 500 |
+
)
|
| 501 |
+
|
| 502 |
+
with gr.Accordion("📝 Transcript Analysis", open=True):
|
| 503 |
+
orig_text = gr.Textbox(
|
| 504 |
+
label="Original Transcript",
|
| 505 |
+
lines=4,
|
| 506 |
+
interactive=False,
|
| 507 |
+
placeholder="Original speech will appear here..."
|
| 508 |
+
)
|
| 509 |
+
trans_text = gr.Textbox(
|
| 510 |
+
label="Translated Text",
|
| 511 |
+
lines=4,
|
| 512 |
+
interactive=False,
|
| 513 |
+
placeholder="Translation will appear here..."
|
| 514 |
+
)
|
| 515 |
+
|
| 516 |
+
def validate_api_key(api_key):
|
| 517 |
+
"""Validate and update API key status."""
|
| 518 |
+
if not api_key or not api_key.strip():
|
| 519 |
+
return gr.update(value="ℹ️ Using open source models (EdgeTTS)", visible=True)
|
| 520 |
+
|
| 521 |
+
key = api_key.strip()
|
| 522 |
+
if not key.startswith("sk_") or len(key) < 40:
|
| 523 |
+
return gr.update(value="⚠️ Invalid API key format", visible=True)
|
| 524 |
+
|
| 525 |
+
try:
|
| 526 |
+
is_valid, message = validate_elevenlabs_api_key(key)
|
| 527 |
+
if is_valid:
|
| 528 |
+
return gr.update(value="✅ API key validated (used only for this job)", visible=True)
|
| 529 |
+
else:
|
| 530 |
+
return gr.update(value=f"⚠️ {message}", visible=True)
|
| 531 |
+
except:
|
| 532 |
+
return gr.update(value="ℹ️ Using open source models (EdgeTTS)", visible=True)
|
| 533 |
+
|
| 534 |
+
api_key_input.change(
|
| 535 |
+
fn=validate_api_key,
|
| 536 |
+
inputs=[api_key_input],
|
| 537 |
+
outputs=[api_key_status]
|
| 538 |
+
)
|
| 539 |
+
|
| 540 |
+
localize_btn.click(
|
| 541 |
+
fn=localize_video,
|
| 542 |
+
inputs=[video_input, lang_dropdown, api_key_input],
|
| 543 |
+
outputs=[video_output, orig_text, trans_text]
|
| 544 |
+
)
|
| 545 |
+
|
| 546 |
+
# Use a small queue to avoid overlapping heavy jobs on shared Spaces
|
| 547 |
+
app.queue(concurrency_count=1, max_size=4)
|
| 548 |
+
return app
|
| 549 |
+
|
| 550 |
+
|
| 551 |
+
if __name__ == "__main__":
    # Build the UI and serve it on all interfaces (required inside Spaces
    # containers); the public REST API surface stays disabled.
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=False,
    )
|
localizer_engine.py
ADDED
|
@@ -0,0 +1,1174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Video Localization Engine
|
| 3 |
+
Fixed async event loop issues and added audio time-stretching
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import asyncio
|
| 8 |
+
import edge_tts
|
| 9 |
+
from moviepy.editor import VideoFileClip, AudioFileClip
|
| 10 |
+
from pydub import AudioSegment
|
| 11 |
+
from pydub.effects import speedup
|
| 12 |
+
import tempfile
|
| 13 |
+
import logging
|
| 14 |
+
import requests
|
| 15 |
+
import shutil
|
| 16 |
+
from uuid import uuid4
|
| 17 |
+
from gtts import gTTS
|
| 18 |
+
from deep_translator import GoogleTranslator
|
| 19 |
+
|
| 20 |
+
# Configure logging first
|
| 21 |
+
logging.basicConfig(level=logging.INFO)
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
# Managed output directory (keeps artifacts out of /tmp and makes cleanup predictable)
|
| 25 |
+
OUTPUT_DIR = os.path.join(os.getcwd(), "outputs")
|
| 26 |
+
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def prune_outputs(max_files: int = 10) -> None:
    """Cap the number of .mp4 artifacts in OUTPUT_DIR by deleting the oldest.

    Best-effort by design: every failure (listing the directory or removing a
    file) is swallowed and logged at debug level so pruning can never break
    the video-processing pipeline.

    Args:
        max_files: number of most-recent .mp4 files to keep.
    """
    try:
        candidates = [
            os.path.join(OUTPUT_DIR, name)
            for name in os.listdir(OUTPUT_DIR)
            if name.endswith(".mp4")
        ]
        if len(candidates) <= max_files:
            return
        # Order newest-first by modification time; everything past the first
        # `max_files` entries is stale and gets removed.
        ordered = sorted(candidates, key=os.path.getmtime, reverse=True)
        for victim in ordered[max_files:]:
            try:
                os.remove(victim)
                logger.info(f"Pruned old output: {victim}")
            except Exception as remove_error:
                logger.debug(f"Could not prune {victim}: {remove_error}")
    except Exception as listing_error:
        logger.debug(f"Output pruning skipped: {listing_error}")
| 49 |
+
|
| 50 |
+
# Try to import ElevenLabs for premium TTS
|
| 51 |
+
try:
|
| 52 |
+
from elevenlabs.client import ElevenLabs
|
| 53 |
+
ELEVENLABS_AVAILABLE = True
|
| 54 |
+
except ImportError:
|
| 55 |
+
ELEVENLABS_AVAILABLE = False
|
| 56 |
+
if not hasattr(logger, '_elevenlabs_warned'):
|
| 57 |
+
logger.warning("ElevenLabs not installed. Install with: pip install elevenlabs")
|
| 58 |
+
logger._elevenlabs_warned = True
|
| 59 |
+
|
| 60 |
+
# Try to import Coqui TTS for high-quality local voices
|
| 61 |
+
try:
|
| 62 |
+
from TTS.api import TTS
|
| 63 |
+
COQUI_TTS_AVAILABLE = True
|
| 64 |
+
except ImportError:
|
| 65 |
+
COQUI_TTS_AVAILABLE = False
|
| 66 |
+
if not hasattr(logger, '_coqui_warned'):
|
| 67 |
+
logger.warning("Coqui TTS not installed. Install with: pip install TTS")
|
| 68 |
+
logger._coqui_warned = True
|
| 69 |
+
|
| 70 |
+
# Initialize HF Token (optional - only used for NLLB translation fallback)
|
| 71 |
+
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 72 |
+
|
| 73 |
+
# ElevenLabs API Key (environment default; UI keys are passed per request)
|
| 74 |
+
DEFAULT_ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
|
| 75 |
+
_elevenlabs_status = None
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def set_elevenlabs_api_key(api_key: str):
    """Override the module-default ElevenLabs API key.

    Deprecated: kept only for backward compatibility. New code should pass
    `elevenlabs_api_key` directly to process_video/process_video_sync.
    Resets the cached status so the next client creation re-validates.
    """
    global DEFAULT_ELEVENLABS_API_KEY, _elevenlabs_status
    # Clear the cached status first so any concurrent reader re-checks.
    _elevenlabs_status = None
    DEFAULT_ELEVENLABS_API_KEY = api_key
| 86 |
+
|
| 87 |
+
def validate_elevenlabs_api_key(api_key: str) -> tuple[bool, str]:
    """
    Validate ElevenLabs API key format and test connection.

    Checks run cheapest-first: empty/prefix/length checks are purely local,
    and only a well-formed key triggers a network round-trip via the
    ElevenLabs user endpoint.

    Returns:
        (is_valid, error_message)
    """
    if not api_key:
        return False, "API key is empty"

    # Check format: should start with "sk_" and be reasonable length
    if not api_key.startswith("sk_"):
        return False, "API key format invalid (should start with 'sk_')"

    if len(api_key) < 40:
        return False, f"API key too short (got {len(api_key)} chars, expected 40+)"

    if not ELEVENLABS_AVAILABLE:
        return False, "ElevenLabs package not installed (pip install elevenlabs)"

    # Test connection with a simple API call: fetching the user record both
    # authenticates the key and exercises the connection. (Previously the
    # result was bound to an unused local; the call alone is what matters.)
    try:
        test_client = ElevenLabs(api_key=api_key)
        test_client.user.get()
        return True, "API key valid"
    except Exception as e:
        error_str = str(e).lower()
        if "unauthorized" in error_str or "401" in error_str or "invalid" in error_str:
            return False, f"API key invalid or expired: {str(e)}"
        elif "quota" in error_str or "limit" in error_str:
            # Key is valid but quota exceeded - still valid for format
            return True, "API key valid (quota exceeded)"
        elif "network" in error_str or "connection" in error_str or "timeout" in error_str:
            return False, f"Network error: {str(e)}"
        else:
            return False, f"Connection test failed: {str(e)}"
| 124 |
+
|
| 125 |
+
def check_elevenlabs_quota(client) -> tuple[bool, str]:
    """
    Check ElevenLabs quota/credits availability.

    Inspects the account's subscription character counters when present;
    degrades gracefully (reports "available") when the API shape differs
    or the lookup fails for a non-quota reason.

    Returns:
        (has_quota, status_message)
    """
    try:
        account = client.user.get()
        if not hasattr(account, 'subscription'):
            return True, "Subscription info unavailable"

        subscription = account.subscription
        tier = getattr(subscription, 'tier', 'N/A')

        # Without both counters we cannot compute remaining characters.
        if not (hasattr(subscription, 'character_count') and hasattr(subscription, 'character_limit')):
            return True, f"Subscription active (tier: {tier})"

        used = subscription.character_count
        limit = subscription.character_limit
        remaining = limit - used

        if remaining <= 0:
            return False, f"Quota exhausted: {used}/{limit} characters used"
        if remaining < 1000:
            return True, f"Low quota: {remaining}/{limit} characters remaining"
        return True, f"Quota available: {remaining}/{limit} characters remaining"
    except Exception as e:
        lowered = str(e).lower()
        if "quota" in lowered or "limit" in lowered:
            return False, f"Quota check failed: {str(e)}"
        # Non-critical error, assume quota available
        return True, f"Quota check unavailable: {str(e)}"
| 161 |
+
|
| 162 |
+
def _get_elevenlabs_client(api_key: str | None = None):
    """Create an ElevenLabs client for a specific API key (no global reuse to avoid cross-user leakage).

    A fresh client is built per call; only the module-level `_elevenlabs_status`
    string is updated as a side channel ("not_installed", "no_key",
    "invalid_key", "ready", "quota_exceeded", "init_failed") for diagnostics.
    Returns the client, or None when the package/key is missing or invalid.
    """
    global _elevenlabs_status

    if not ELEVENLABS_AVAILABLE:
        # Warn only once per process (status acts as the "already warned" flag).
        if _elevenlabs_status is None:
            logger.warning("⚠️ ElevenLabs unavailable: Package not installed. Install with: pip install elevenlabs")
            _elevenlabs_status = "not_installed"
        return None

    # Per-request key takes precedence over the environment default.
    active_key = api_key or DEFAULT_ELEVENLABS_API_KEY
    if not active_key:
        _elevenlabs_status = "no_key"
        return None

    # Validate API key first (format checks plus a live user.get() round-trip).
    is_valid, error_msg = validate_elevenlabs_api_key(active_key)

    if not is_valid:
        logger.warning(f"⚠️ ElevenLabs unavailable: {error_msg}")
        _elevenlabs_status = "invalid_key"
        return None

    # Initialize client
    try:
        client = ElevenLabs(api_key=active_key)
        logger.info("✅ ElevenLabs client initialized for provided key")

        # Check quota and log status
        has_quota, quota_msg = check_elevenlabs_quota(client)
        if has_quota:
            logger.info(f"✅ ElevenLabs ready: {quota_msg}")
            _elevenlabs_status = "ready"
        else:
            logger.warning(f"⚠️ ElevenLabs quota issue: {quota_msg}")
            _elevenlabs_status = "quota_exceeded"
            # Still return client - let the TTS function handle quota errors

        # Log subscription info for debugging (best-effort; failures ignored).
        try:
            user_info = client.user.get()
            if hasattr(user_info, 'subscription'):
                sub = user_info.subscription
                tier = sub.tier if hasattr(sub, 'tier') else 'N/A'
                logger.info(f"ElevenLabs subscription tier: {tier}")
        except Exception as quota_check_error:
            logger.debug(f"Could not get subscription details (non-critical): {quota_check_error}")

    except Exception as e:
        # Categorize the failure for the log, then report "no client".
        error_str = str(e).lower()
        if "unauthorized" in error_str or "401" in error_str:
            logger.error(f"❌ ElevenLabs authentication failed: Invalid API key")
        elif "network" in error_str or "connection" in error_str:
            logger.error(f"❌ ElevenLabs connection failed: Network error - {str(e)}")
        else:
            logger.error(f"❌ ElevenLabs initialization failed: {str(e)}")
        _elevenlabs_status = "init_failed"
        return None

    return client
| 222 |
+
|
| 223 |
+
# Import local whisper - required for transcription
|
| 224 |
+
import whisper
|
| 225 |
+
|
| 226 |
+
# Cache for local whisper model (lazy-loaded)
|
| 227 |
+
_local_whisper_model = None
|
| 228 |
+
|
| 229 |
+
def _get_local_whisper():
    """Return the process-wide local Whisper "base" model, loading it on first use.

    The "base" model (~150MB) is a good balance of speed and accuracy; the
    loaded instance is cached in `_local_whisper_model` for subsequent calls.
    """
    global _local_whisper_model
    if _local_whisper_model is not None:
        return _local_whisper_model
    logger.info("Loading local Whisper model (base)... This may take a moment on first run.")
    _local_whisper_model = whisper.load_model("base")
    logger.info("✅ Local Whisper model loaded")
    return _local_whisper_model
| 237 |
+
|
| 238 |
+
|
| 239 |
+
# Cache for Coqui TTS models
|
| 240 |
+
_coqui_tts_models = {}
|
| 241 |
+
|
| 242 |
+
def _get_coqui_tts(language: str):
    """Return the shared multilingual Coqui TTS engine, loading it lazily.

    A single XTTS v2 multilingual model serves every language, so the
    `language` argument does not affect which model is loaded (kept for
    interface stability). Returns None when Coqui TTS is not installed or
    the model fails to load.
    """
    global _coqui_tts_models

    if not COQUI_TTS_AVAILABLE:
        return None

    cache_key = "multilingual"
    if cache_key in _coqui_tts_models:
        return _coqui_tts_models.get(cache_key)

    try:
        # XTTS v2 supports: en, es, fr, de, it, pt, pl, tr, ru, nl, cs, ar, zh, ja, hu, ko
        logger.info(f"Loading Coqui TTS multilingual model (XTTS v2)... This may take a moment on first run.")
        engine = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False)
        _coqui_tts_models[cache_key] = engine
        logger.info(f"✅ Coqui TTS model loaded")
    except Exception as e:
        logger.warning(f"Failed to load Coqui TTS model: {e}")
        return None

    return _coqui_tts_models.get(cache_key)
| 265 |
+
|
| 266 |
+
|
| 267 |
+
async def _coqui_tts_fallback(text: str, language: str, output_file: str) -> None:
    """Synthesize `text` to `output_file` with Coqui TTS without blocking the loop.

    The blocking synthesis runs in the default executor. Languages outside
    the XTTS v2 set fall back to English; a language-specific failure also
    retries once in English before giving up.
    """
    # XTTS v2 language codes. Hindi is not directly supported, so it maps
    # to English up front.
    supported = {
        "es": "es",
        "fr": "fr",
        "de": "de",
        "it": "it",
        "ja": "ja",
        "zh": "zh",
        "ar": "ar",
        "hi": "en",
    }

    def _synthesize():
        engine = _get_coqui_tts(language)
        if engine is None:
            raise Exception("Coqui TTS model not available")
        code = supported.get(language, "en")
        try:
            engine.tts_to_file(text=text, file_path=output_file, language=code)
        except Exception:
            # Already English: nothing left to retry with.
            if code == "en":
                raise
            logger.warning(f"Coqui TTS failed for {language}, trying English...")
            engine.tts_to_file(text=text, file_path=output_file, language="en")

    await asyncio.get_running_loop().run_in_executor(None, _synthesize)
| 301 |
+
|
| 302 |
+
|
| 303 |
+
async def _elevenlabs_tts(text: str, language: str, output_file: str, api_key: str | None = None) -> None:
    """Premium TTS using ElevenLabs Voice Library (runs in executor).

    Long texts are split into ~100-character chunks (sentence -> clause ->
    word boundaries, in that order), synthesized per chunk, concatenated with
    pydub, and exported to `output_file`. Errors are re-raised with a
    normalized message ("ElevenLabs quota exceeded", "... authentication
    failed", "... network error", ...) so text_to_speech() can fall back.

    NOTE(review): chunk temp files are derived via output_file.replace('.mp3',
    ...) — assumes the path ends in '.mp3'; confirm against callers.
    """
    def _generate():
        client = _get_elevenlabs_client(api_key=api_key)
        if client is None:
            raise Exception("ElevenLabs client not available")

        # Map languages to ElevenLabs voice IDs from their voice library
        # Using multilingual voices that support multiple languages well
        voice_map = {
            "es": "pNInz6obpgDQGcFmaJgB",  # Adam - works well for Spanish
            "fr": "EXAVITQu4vr4xnSDxMaL",  # Bella - works well for French
            "de": "ErXwobaYiN019PkySvjV",  # Antoni - works well for German
            "it": "MF3mGyEYCl7XYWbV9V6O",  # Elli - works well for Italian
            "ja": "TxGEqnHWrfWFTfGW9XjX",  # Josh - works well for Japanese
            "zh": "VR6AewLTigWG4xSOukaG",  # Arnold - works well for Chinese
            "hi": "pNInz6obpgDQGcFmaJgB",  # Adam - fallback for Hindi
            "ar": "EXAVITQu4vr4xnSDxMaL",  # Bella - fallback for Arabic
        }

        # Get voice ID, default to Adam if language not mapped
        voice_id = voice_map.get(language, "pNInz6obpgDQGcFmaJgB")

        # Use turbo model for efficiency (fewer credits) while maintaining good quality
        # For longer texts, we'll chunk them to stay within quota limits
        model_id = "eleven_turbo_v2_5"  # Fast and credit-efficient

        # Use lower quality format to minimize credits (still sounds good)
        # mp3_22050_32 uses fewer credits than mp3_44100_128
        output_format = "mp3_22050_32"  # Lower credits, still good quality

        try:
            # Check text length - ElevenLabs uses character-based pricing
            # The error "60 credits required" for 120 chars suggests ~0.5 credits per char
            # To work within any quota limits, use small chunks
            # Note: Even with 109k+ subscription credits, there may be per-request character limits
            max_chars_per_request = 100  # Reasonable chunk size - should work with most quotas

            # Always chunk if text is longer than threshold to minimize per-request costs
            if len(text) > max_chars_per_request:
                logger.info(f"Text is {len(text)} chars, chunking into small pieces for ElevenLabs (max {max_chars_per_request} chars per chunk)...")
                # Split by sentences first, then by commas, then by spaces if needed
                import re
                # First try splitting by sentences. re.split with a capturing
                # group keeps the delimiters at odd indices, so the loop below
                # steps by 2 and glues each sentence back to its terminator.
                sentences = re.split(r'([.!?]\s+)', text)
                chunks = []
                current_chunk = ""  # accumulator shared across all three split levels

                for i in range(0, len(sentences), 2):
                    sentence = sentences[i] + (sentences[i+1] if i+1 < len(sentences) else "")

                    # If single sentence is too long, split by commas, then by spaces if needed
                    if len(sentence) > max_chars_per_request:
                        parts = re.split(r'([,;]\s+)', sentence)
                        for j in range(0, len(parts), 2):
                            part = parts[j] + (parts[j+1] if j+1 < len(parts) else "")

                            # If part is still too long, split by spaces
                            if len(part) > max_chars_per_request:
                                words = part.split()
                                for word in words:
                                    if len(current_chunk) + len(word) + 1 > max_chars_per_request:
                                        if current_chunk:
                                            chunks.append(current_chunk.strip())
                                        current_chunk = word + " "
                                    else:
                                        current_chunk += word + " "
                            elif len(current_chunk) + len(part) > max_chars_per_request:
                                if current_chunk:
                                    chunks.append(current_chunk.strip())
                                current_chunk = part
                            else:
                                current_chunk += part
                    elif len(current_chunk) + len(sentence) > max_chars_per_request:
                        if current_chunk:
                            chunks.append(current_chunk.strip())
                        current_chunk = sentence
                    else:
                        current_chunk += sentence

                # Flush the trailing accumulator.
                if current_chunk:
                    chunks.append(current_chunk.strip())

                logger.info(f"Split text into {len(chunks)} chunks for efficient credit usage")

                # Generate audio for each chunk and concatenate
                combined = AudioSegment.empty()
                for idx, chunk in enumerate(chunks):
                    logger.info(f"Generating ElevenLabs audio for chunk {idx+1}/{len(chunks)} ({len(chunk)} chars)...")
                    try:
                        chunk_audio_stream = client.text_to_speech.convert(
                            voice_id=voice_id,
                            text=chunk,
                            model_id=model_id,
                            output_format=output_format
                        )
                        # Save chunk to temp file
                        chunk_file = output_file.replace('.mp3', f'_chunk_{idx}.mp3')
                        with open(chunk_file, "wb") as f:
                            for chunk_data in chunk_audio_stream:
                                f.write(chunk_data)
                        # Validate chunk audio file
                        if not os.path.exists(chunk_file) or os.path.getsize(chunk_file) == 0:
                            raise Exception(f"Chunk {idx+1} audio file is empty or missing")

                        # Load and concatenate
                        chunk_audio = AudioSegment.from_file(chunk_file)
                        if len(chunk_audio) == 0:
                            raise Exception(f"Chunk {idx+1} audio has zero duration")

                        logger.debug(f"Chunk {idx+1} audio: {len(chunk_audio)}ms, {os.path.getsize(chunk_file)} bytes")
                        combined += chunk_audio
                        # Clean up temp file
                        os.remove(chunk_file)
                    except Exception as chunk_error:
                        # Enhanced error handling with specific error types
                        error_str = str(chunk_error).lower()
                        error_msg = str(chunk_error)

                        # Clean up any partial files first (chunks 0..idx).
                        for cleanup_idx in range(idx + 1):
                            cleanup_file = output_file.replace('.mp3', f'_chunk_{cleanup_idx}.mp3')
                            if os.path.exists(cleanup_file):
                                os.remove(cleanup_file)

                        # Categorize error by substring; the normalized messages
                        # raised here are what text_to_speech() logs on fallback.
                        if 'quota' in error_str or 'credits' in error_str or 'limit' in error_str:
                            logger.warning(f"⚠️ ElevenLabs quota/credit limit reached on chunk {idx+1}/{len(chunks)}")
                            logger.warning(f" Error: {error_msg}")
                            logger.info(" Falling back to alternative TTS methods...")
                            raise Exception("ElevenLabs quota exceeded") from chunk_error
                        elif 'unauthorized' in error_str or '401' in error_str or 'invalid' in error_str:
                            logger.error(f"❌ ElevenLabs authentication failed on chunk {idx+1}")
                            logger.error(f" Error: {error_msg}")
                            logger.error(" Check your ELEVENLABS_API_KEY environment variable")
                            raise Exception("ElevenLabs authentication failed") from chunk_error
                        elif 'network' in error_str or 'connection' in error_str or 'timeout' in error_str:
                            logger.warning(f"⚠️ ElevenLabs network error on chunk {idx+1}: {error_msg}")
                            logger.info(" Falling back to alternative TTS methods...")
                            raise Exception("ElevenLabs network error") from chunk_error
                        else:
                            logger.warning(f"⚠️ ElevenLabs error on chunk {idx+1}: {error_msg}")
                            logger.info(" Falling back to alternative TTS methods...")
                            raise  # Re-raise to trigger fallback

                # Validate combined audio
                if len(combined) == 0:
                    raise Exception("Combined audio has zero duration")

                # Export combined audio
                combined.export(output_file, format="mp3")

                # Verify exported file
                if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
                    raise Exception("Exported audio file is empty or missing")

                logger.info(f"✅ Combined {len(chunks)} ElevenLabs audio chunks ({len(combined)}ms, {os.path.getsize(output_file)} bytes)")
            else:
                # Generate audio with ElevenLabs for short texts (under max_chars_per_request)
                logger.info(f"Generating ElevenLabs audio for short text ({len(text)} chars)...")
                audio_stream = client.text_to_speech.convert(
                    voice_id=voice_id,
                    text=text,
                    model_id=model_id,
                    output_format=output_format
                )

                # Save to file
                with open(output_file, "wb") as f:
                    bytes_written = 0
                    for chunk in audio_stream:
                        f.write(chunk)
                        bytes_written += len(chunk)

                # Validate saved file
                if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
                    raise Exception("Generated audio file is empty or missing")

                # Verify audio can be loaded and has duration
                file_size = os.path.getsize(output_file)
                try:
                    test_audio = AudioSegment.from_file(output_file)
                    audio_duration = len(test_audio)
                    if audio_duration == 0:
                        raise Exception("Generated audio has zero duration")
                    logger.info(f"✅ ElevenLabs audio generated successfully ({len(text)} chars, {audio_duration}ms, {file_size} bytes)")
                except Exception as validation_error:
                    logger.error(f"❌ Audio validation failed: {validation_error}")
                    raise Exception(f"Generated audio is invalid: {validation_error}") from validation_error

        except Exception as e:
            error_str = str(e).lower()
            error_msg = str(e)

            # Enhanced error categorization (outer net for both branches above).
            if 'quota' in error_str or 'credits' in error_str or 'limit' in error_str:
                logger.warning(f"⚠️ ElevenLabs quota/credit limit reached: {error_msg}")
                logger.warning(" Note: This might be a character-based quota limit, not subscription credits.")
                logger.warning(" ElevenLabs uses character credits which may be separate from your subscription balance.")
                logger.info(" Falling back to alternative TTS methods...")
                raise Exception("ElevenLabs quota exceeded") from e
            elif 'unauthorized' in error_str or '401' in error_str or 'invalid' in error_str or 'authentication' in error_str:
                logger.error(f"❌ ElevenLabs authentication failed: {error_msg}")
                logger.error(" Check your ELEVENLABS_API_KEY environment variable")
                logger.error(" Get a valid API key from: https://elevenlabs.io/app/settings/api-keys")
                raise Exception("ElevenLabs authentication failed") from e
            elif 'network' in error_str or 'connection' in error_str or 'timeout' in error_str:
                logger.warning(f"⚠️ ElevenLabs network error: {error_msg}")
                logger.info(" Falling back to alternative TTS methods...")
                raise Exception("ElevenLabs network error") from e
            elif 'service' in error_str or 'unavailable' in error_str or '503' in error_str or '500' in error_str:
                logger.warning(f"⚠️ ElevenLabs service unavailable: {error_msg}")
                logger.info(" Falling back to alternative TTS methods...")
                raise Exception("ElevenLabs service unavailable") from e
            else:
                logger.warning(f"⚠️ ElevenLabs TTS generation failed: {error_msg}")
                logger.info(" Falling back to alternative TTS methods...")
                raise  # Re-raise to trigger fallback

    # All blocking work (HTTP calls + pydub decoding) happens off the event loop.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, _generate)
| 524 |
+
|
| 525 |
+
|
| 526 |
+
async def _gtts_fallback(text: str, language: str, output_file: str) -> None:
    """Generate speech with gTTS as the last-resort fallback.

    Maps the app's short language codes onto gTTS codes (Chinese needs the
    region-qualified "zh-CN") and runs the blocking synthesis in the default
    executor so the event loop stays responsive. Unknown languages use English.
    """
    code_map = {
        "es": "es",
        "fr": "fr",
        "de": "de",
        "it": "it",
        "ja": "ja",
        "zh": "zh-CN",
        "hi": "hi",
        "ar": "ar",
        "en": "en",
    }
    target_code = code_map.get(language, "en")

    def _synthesize():
        gTTS(text=text, lang=target_code).save(output_file)

    await asyncio.get_running_loop().run_in_executor(None, _synthesize)
| 547 |
+
|
| 548 |
+
|
| 549 |
+
async def text_to_speech(text: str, language: str, output_file: str, elevenlabs_api_key: str | None = None) -> None:
    """Generate speech using ElevenLabs (PRIMARY), with fallbacks to Edge TTS, Coqui TTS, and gTTS.

    Tries, in order: ElevenLabs (if installed), Edge TTS (3 attempts over a
    per-language voice list with backoff on 403s), Coqui TTS (if installed),
    then gTTS. Returns on the first success; raises only when every method
    fails, with the last captured error in the message.
    """

    # Method 1: PRIMARY - ElevenLabs (Premium professional-grade TTS)
    if ELEVENLABS_AVAILABLE:
        try:
            logger.info(f"Generating TTS with ElevenLabs (premium quality) for {language}...")
            await _elevenlabs_tts(text, language, output_file, api_key=elevenlabs_api_key)
            logger.info("✅ TTS generated via ElevenLabs (premium quality)")
            return
        except Exception as elevenlabs_error:
            logger.warning(f"ElevenLabs TTS failed: {elevenlabs_error}")
            # Continue to fallbacks

    # Method 2: Fallback - Edge TTS (High quality, free)
    # Two neural voices per language (male first, then female) so a failing
    # voice doesn't block the language.
    voices = {
        "es": ["es-ES-AlvaroNeural", "es-ES-ElviraNeural"],
        "fr": ["fr-FR-HenriNeural", "fr-FR-DeniseNeural"],
        "de": ["de-DE-KillianNeural", "de-DE-KatjaNeural"],
        "it": ["it-IT-DiegoNeural", "it-IT-ElsaNeural"],
        "ja": ["ja-JP-KeitaNeural", "ja-JP-NanamiNeural"],
        "zh": ["zh-CN-YunxiNeural", "zh-CN-XiaoxiaoNeural"],
        "hi": ["hi-IN-MadhurNeural", "hi-IN-SwaraNeural"],
        "ar": ["ar-SA-HamedNeural", "ar-SA-ZariyahNeural"]
    }

    voice_list = voices.get(language, ["en-US-ChristopherNeural", "en-US-AriaNeural"])
    max_retries = 3
    retry_delay = 2  # seconds

    last_error = None  # most recent Edge TTS failure, reported if everything fails
    for attempt in range(max_retries):
        for voice in voice_list:
            try:
                logger.info(f"Trying Edge TTS (attempt {attempt + 1}/{max_retries}, voice: {voice})...")

                # Create communicate object with timeout
                communicate = edge_tts.Communicate(text, voice)

                # Save with timeout protection
                try:
                    await asyncio.wait_for(
                        communicate.save(output_file),
                        timeout=60.0  # 60 second timeout
                    )
                    logger.info(f"✅ TTS generated via Edge TTS: {language} (voice: {voice})")
                    return  # Success!
                except asyncio.TimeoutError:
                    logger.warning(f"TTS timeout for voice {voice}, trying next...")
                    continue
                except Exception as e:
                    error_msg = str(e)
                    last_error = e  # Capture the error
                    # Check if it's a 403 or connection error
                    if "403" in error_msg or "Invalid response status" in error_msg:
                        logger.warning(f"Edge TTS 403/connection error with voice {voice}: {e}")
                        # Wait before trying next voice
                        await asyncio.sleep(retry_delay)
                        continue
                    else:
                        raise  # Re-raise if it's a different error

            except Exception as e:
                # Outer net: catches Communicate() construction errors and the
                # re-raised non-403 save errors from the inner handler.
                last_error = e  # Always capture the error
                error_msg = str(e)
                if "403" in error_msg or "Invalid response status" in error_msg:
                    logger.warning(f"Edge TTS error (attempt {attempt + 1}): {e}")
                    if attempt < max_retries - 1:
                        # Exponential backoff
                        wait_time = retry_delay * (2 ** attempt)
                        logger.info(f"Waiting {wait_time}s before retry...")
                        await asyncio.sleep(wait_time)
                        continue
                else:
                    # For other errors, try next voice immediately
                    continue

    # Method 3: Fallback - Coqui TTS (high-quality local neural TTS)
    if COQUI_TTS_AVAILABLE:
        try:
            logger.warning("Edge TTS failed. Trying Coqui TTS (high-quality local)...")
            await _coqui_tts_fallback(text, language, output_file)
            logger.info("✅ TTS generated via Coqui TTS (high quality)")
            return
        except Exception as coqui_error:
            logger.warning(f"Coqui TTS failed: {coqui_error}")
            # Keep the earliest captured error for the final report.
            last_error = last_error or coqui_error

    # Method 4: Last resort - gTTS (mechanical but reliable)
    try:
        logger.warning("Falling back to gTTS (mechanical quality)...")
        await _gtts_fallback(text, language, output_file)
        logger.info("✅ TTS generated via gTTS fallback")
        return
    except Exception as fallback_error:
        logger.error(f"gTTS fallback failed: {fallback_error}")
        last_error = last_error or fallback_error

    # Every method failed: surface the last (or first captured) error.
    error_details = str(last_error) if last_error else "Unknown error (all TTS methods failed)"
    error_msg = (
        f"Failed to generate TTS with all methods (ElevenLabs, Edge TTS, Coqui TTS, gTTS). "
        f"Last error: {error_details}. "
        f"This might be due to network issues or TTS service unavailability."
    )
    logger.error(error_msg)
    raise Exception(error_msg)
| 655 |
+
|
| 656 |
+
|
| 657 |
+
def transcribe_audio(audio_path: str) -> str:
    """Return the transcript of `audio_path` via the local Whisper model.

    On any failure, or when Whisper yields an empty transcript, the sentinel
    string "Error identifying speech." is returned instead of raising so the
    caller's pipeline (translation, TTS) can short-circuit on it.
    """
    try:
        logger.info("Transcribing audio with local Whisper...")
        # Local inference is the primary and only method; it avoids the
        # frequently-unavailable cloud speech APIs entirely.
        transcript = _get_local_whisper().transcribe(audio_path).get("text", "").strip()
        if not transcript:
            logger.warning("Whisper returned empty transcription")
            return "Error identifying speech."
        logger.info(f"✅ Transcribed: {len(transcript)} characters")
        return transcript
    except Exception as e:
        logger.error(f"Transcription error: {e}")
        return "Error identifying speech."
| 678 |
+
|
| 679 |
+
|
| 680 |
+
def translate_text(text: str, target_lang: str) -> str:
    """Translate English text into *target_lang*.

    Strategy: deep-translator (Google backend, local, no API key) first,
    then the NLLB model on the HF Inference API when HF_TOKEN is set.
    If every method fails, the original text is returned unchanged so the
    pipeline can still finish.
    """
    # Never translate the transcription-failure sentinel or blank input.
    if text == "Error identifying speech." or not text.strip():
        return text

    try:
        logger.info(f"Translating to {target_lang}...")

        # Method 1: Primary - deep-translator (local, reliable, no API key needed)
        try:
            # deep-translator mostly shares ISO codes; Chinese needs zh-CN.
            dt_codes = {
                "es": "es", "fr": "fr", "de": "de", "it": "it",
                "ja": "ja", "zh": "zh-CN", "hi": "hi", "ar": "ar",
            }
            result = GoogleTranslator(
                source='en', target=dt_codes.get(target_lang, target_lang)
            ).translate(text)
            if result and result.strip() and result != text:
                logger.info(f"✅ Translated via deep-translator: {len(result)} characters")
                return result.strip()
            logger.warning("deep-translator returned empty or same text")
        except Exception as e:
            logger.warning(f"deep-translator failed: {e}")

        # Method 2: Fallback - NLLB via HF API (only if HF_TOKEN is available)
        if not HF_TOKEN:
            logger.debug("HF_TOKEN not set, skipping NLLB translation fallback")
        else:
            try:
                # NLLB uses its own language identifiers.
                nllb_codes = {
                    "es": "spa_Latn", "fr": "fra_Latn", "de": "deu_Latn",
                    "it": "ita_Latn", "ja": "jpn_Jpan", "zh": "zho_Hans",
                    "hi": "hin_Deva", "ar": "arb_Arab",
                }
                response = requests.post(
                    "https://router.huggingface.co/hf-inference/models/facebook/nllb-200-distilled-600M",
                    headers={
                        "Authorization": f"Bearer {HF_TOKEN}",
                        "Content-Type": "application/json",
                    },
                    json={
                        "inputs": text,
                        "parameters": {
                            "src_lang": "eng_Latn",
                            "tgt_lang": nllb_codes.get(target_lang, "spa_Latn"),
                        },
                    },
                    timeout=30,
                )
                if response.status_code == 200:
                    data = response.json()
                    # The API may answer with a list of dicts or a bare dict.
                    if isinstance(data, list) and data:
                        candidate = data[0].get("translation_text", "")
                    elif isinstance(data, dict):
                        candidate = data.get("translation_text", "")
                    else:
                        candidate = ""
                    candidate = (candidate or "").strip()
                    if candidate and candidate != text:
                        logger.info(f"✅ Translated via NLLB: {len(candidate)} characters")
                        return candidate
                    logger.warning("NLLB returned empty or same text")
                else:
                    logger.warning(f"NLLB translation returned {response.status_code}: {response.text[:100]}")
            except requests.exceptions.Timeout:
                logger.warning("NLLB translation timed out")
            except Exception as e:
                logger.warning(f"NLLB translation failed: {e}")

        # Last resort: keep the English text so the pipeline can continue.
        logger.error("All translation methods failed, using original text")
        return text

    except Exception as e:
        logger.error(f"Translation error: {e}")
        return text  # Return original if translation fails
|
| 772 |
+
|
| 773 |
+
|
| 774 |
+
def adjust_audio_duration(audio_path: str, target_duration_ms: int, output_path: str) -> bool:
    """
    Adjust audio duration to match video using time-stretching.

    Stretching is only attempted for mild ratios (0.7x-1.5x); on any
    failure or validation miss the unmodified audio is re-exported to
    *output_path*, so a True return always means a usable file exists.

    NOTE(review): pydub's ``speedup`` is designed to speed audio up by
    dropping chunks; behavior for playback_speed < 1.0 (slow-down) is
    questionable — the post-stretch duration/size validation below is the
    safety net. TODO: confirm, or use a dedicated slow-down path.

    Args:
        audio_path: Input audio file
        target_duration_ms: Target duration in milliseconds
        output_path: Output audio file

    Returns:
        Success boolean
    """
    try:
        logger.info(f"Adjusting audio duration to {target_duration_ms}ms...")

        source = AudioSegment.from_file(audio_path)
        source_ms = len(source)
        if source_ms == 0:
            logger.error("Audio has zero duration")
            return False

        # Ratio > 1 means audio is longer than the video and must be sped
        # up (e.g. 10s audio / 8s target = 1.25x).
        speed_ratio = source_ms / target_duration_ms
        logger.info(f"Current audio: {source_ms}ms, target: {target_duration_ms}ms, ratio: {speed_ratio:.2f}x")

        # Outside the safe range, stretching distorts badly — keep original.
        if not (0.7 <= speed_ratio <= 1.5):
            logger.warning(f"Speed ratio {speed_ratio:.2f}x out of safe range (0.7-1.5), using original audio")
            source.export(output_path, format="mp3", bitrate="128k")
            return True

        try:
            stretched = speedup(source, playback_speed=speed_ratio)
            stretched_ms = len(stretched)
            if stretched_ms == 0:
                logger.error("Adjusted audio has zero duration")
                return False

            # Reject results that drifted more than 30% from the target.
            if abs(stretched_ms - target_duration_ms) / target_duration_ms > 0.3:
                logger.warning(f"Adjusted duration ({stretched_ms}ms) too far from target ({target_duration_ms}ms), using original")
                source.export(output_path, format="mp3", bitrate="128k")
                return True

            stretched.export(output_path, format="mp3", bitrate="128k")

            if not os.path.exists(output_path):
                logger.error("Adjusted audio file was not created")
                return False

            out_bytes = os.path.getsize(output_path)
            in_bytes = os.path.getsize(audio_path)
            # A suspiciously tiny output usually means a corrupt encode.
            if out_bytes < in_bytes * 0.2:
                logger.warning(f"Adjusted audio file too small ({out_bytes} bytes vs {in_bytes} bytes), using original")
                source.export(output_path, format="mp3", bitrate="128k")
                return True

            logger.info(f"✅ Audio adjusted: {source_ms}ms → {stretched_ms}ms ({speed_ratio:.2f}x, {out_bytes} bytes)")
            return True
        except Exception as adjust_error:
            logger.warning(f"Audio adjustment failed: {adjust_error}, using original")
            source.export(output_path, format="mp3", bitrate="128k")
            return True

    except Exception as e:
        logger.error(f"Audio adjustment failed: {e}")
        # Try to copy original as fallback. FIX: was a bare `except:`,
        # which also swallowed KeyboardInterrupt/SystemExit.
        try:
            AudioSegment.from_file(audio_path).export(output_path, format="mp3", bitrate="128k")
            logger.warning("Using original audio as fallback")
            return True
        except Exception:
            return False
|
| 862 |
+
|
| 863 |
+
|
| 864 |
+
async def process_video_async(
    video_path: str,
    target_lang: str = "es",
    elevenlabs_api_key: str | None = None,
    progress_callback=None,
) -> tuple:
    """
    Main async pipeline: Video -> Audio -> Text -> Trans-Text -> Audio -> Video

    Args:
        video_path: Path to the input video file.
        target_lang: Target language code (es, fr, de, it, ja, zh, hi, ar).
        elevenlabs_api_key: Optional per-request ElevenLabs key (not persisted).
        progress_callback: Optional ``fn(fraction, desc=...)`` progress hook.

    Returns:
        (output_path, original_text, translated_text); on failure
        output_path is None and original_text carries the error message.
    """

    temp_dir = tempfile.mkdtemp(prefix="localizer_")
    audio_path = os.path.join(temp_dir, "extracted_audio.mp3")
    tts_path = os.path.join(temp_dir, "tts_audio.mp3")
    adjusted_audio_path = os.path.join(temp_dir, "adjusted_audio.mp3")
    output_video_path = os.path.join(
        OUTPUT_DIR, f"localized_{target_lang}_{uuid4().hex}.mp4"
    )
    progress = progress_callback or (lambda *args, **kwargs: None)

    video = None
    new_audio = None

    try:
        logger.info(f"Starting video localization to {target_lang}...")
        progress(0.02, desc="Extracting audio...")

        # 1. Extract Audio
        video = VideoFileClip(video_path)
        # FIX: videos without an audio track made video.audio None and the
        # write_audiofile call below crashed with an opaque AttributeError.
        if video.audio is None:
            raise Exception("Input video has no audio track to localize")
        video_duration_ms = int(video.duration * 1000)
        video.audio.write_audiofile(audio_path, verbose=False, logger=None)
        logger.info(f"✅ Audio extracted ({video.duration:.1f}s)")
        progress(0.15, desc="Transcribing with Whisper...")

        # 2. Transcribe
        original_text = transcribe_audio(audio_path)
        progress(0.35, desc="Translating text...")

        # 3. Translate
        translated_text = translate_text(original_text, target_lang)
        progress(0.5, desc="Generating voice...")

        # 4. Generate TTS
        # Split long text into chunks to avoid TTS rate limiting.
        if len(translated_text) > 500:
            logger.info(f"Text is long ({len(translated_text)} chars), splitting into chunks...")
            # Split on sentence boundaries, keeping the delimiters so they
            # can be re-attached to the sentence they end.
            import re
            sentences = re.split(r'([.!?]\s+)', translated_text)
            chunks = []
            current_chunk = ""
            for i in range(0, len(sentences), 2):
                sentence = sentences[i] + (sentences[i+1] if i+1 < len(sentences) else "")
                if len(current_chunk) + len(sentence) > 500:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = sentence
                else:
                    current_chunk += sentence
            if current_chunk:
                chunks.append(current_chunk.strip())

            # Generate TTS per chunk, then concatenate the audio.
            chunk_files = []
            for idx, chunk in enumerate(chunks):
                chunk_file = os.path.join(temp_dir, f"tts_chunk_{idx}.mp3")
                await text_to_speech(
                    chunk, target_lang, chunk_file, elevenlabs_api_key=elevenlabs_api_key
                )
                chunk_files.append(chunk_file)

            combined = AudioSegment.empty()
            for chunk_file in chunk_files:
                combined += AudioSegment.from_file(chunk_file)
            combined.export(tts_path, format="mp3")
            logger.info(f"✅ Combined {len(chunks)} TTS chunks")
        else:
            await text_to_speech(
                translated_text, target_lang, tts_path, elevenlabs_api_key=elevenlabs_api_key
            )

        # 5. Validate TTS audio file before processing
        if not os.path.exists(tts_path):
            raise Exception(f"TTS audio file not found: {tts_path}")

        file_size = os.path.getsize(tts_path)
        if file_size == 0:
            raise Exception(f"TTS audio file is empty: {tts_path}")

        logger.info(f"✅ TTS audio file ready: {file_size} bytes")
        progress(0.65, desc="Aligning audio to video...")

        # 5b. Adjust audio duration to match video, with heavy validation:
        # a bad stretch is worse than a slightly off-length voice track.
        try:
            original_audio = AudioSegment.from_file(tts_path)
            original_duration_ms = len(original_audio)
            logger.info(f"Original TTS audio duration: {original_duration_ms}ms, target: {video_duration_ms}ms")

            # Only stretch when the mismatch is significant (>20%).
            duration_diff = abs(original_duration_ms - video_duration_ms) / video_duration_ms
            if duration_diff > 0.2:
                success = adjust_audio_duration(tts_path, video_duration_ms, adjusted_audio_path)

                if success and os.path.exists(adjusted_audio_path):
                    adjusted_size = os.path.getsize(adjusted_audio_path)
                    try:
                        test_audio = AudioSegment.from_file(adjusted_audio_path)
                        adjusted_duration_ms = len(test_audio)
                        # Accept only 50%-150% of target; otherwise distrust it.
                        if video_duration_ms * 0.5 <= adjusted_duration_ms <= video_duration_ms * 1.5:
                            audio_to_use = adjusted_audio_path
                            logger.info(f"✅ Using adjusted audio: {adjusted_duration_ms}ms (target: {video_duration_ms}ms), {adjusted_size} bytes")
                        else:
                            logger.warning(f"⚠️ Adjusted audio duration ({adjusted_duration_ms}ms) not reasonable for target ({video_duration_ms}ms), using original")
                            audio_to_use = tts_path
                    except Exception as validation_error:
                        logger.warning(f"⚠️ Could not validate adjusted audio: {validation_error}, using original")
                        audio_to_use = tts_path
                else:
                    logger.warning("⚠️ Audio adjustment failed, using original")
                    audio_to_use = tts_path
            else:
                logger.info(f"Audio duration close enough ({duration_diff*100:.1f}% difference), using original")
                audio_to_use = tts_path
        except Exception as e:
            logger.warning(f"⚠️ Could not check audio duration: {e}, using original")
            audio_to_use = tts_path

        logger.info(f"✅ Final audio to use: {os.path.getsize(audio_to_use)} bytes")
        progress(0.75, desc="Merging audio with video...")

        # 6. Merge audio with video - validate audio file first
        if not os.path.exists(audio_to_use) or os.path.getsize(audio_to_use) == 0:
            raise Exception(f"Audio file for merging is missing or empty: {audio_to_use}")

        logger.info(f"Merging audio ({os.path.getsize(audio_to_use)} bytes) with video...")
        new_audio = AudioFileClip(audio_to_use)

        audio_duration = new_audio.duration
        if audio_duration == 0:
            logger.error(f"❌ Audio clip has zero duration")
            raise Exception("Audio clip has zero duration - cannot merge with video")

        # CRITICAL: an audio track much shorter than the video yields a
        # mostly-silent output; prefer the unadjusted TTS audio in that case.
        if audio_duration < video.duration * 0.3:
            logger.warning(f"⚠️ Audio duration ({audio_duration:.2f}s) is too short for video ({video.duration:.2f}s)")
            logger.warning("   This would create a mostly silent video. Trying original TTS audio...")

            if audio_to_use != tts_path and os.path.exists(tts_path):
                new_audio.close()
                try:
                    new_audio = AudioFileClip(tts_path)
                    audio_duration = new_audio.duration
                    if audio_duration > video.duration * 0.3:
                        logger.info(f"✅ Using original TTS audio: {audio_duration:.2f}s")
                    else:
                        logger.error(f"❌ Original TTS audio also too short: {audio_duration:.2f}s")
                        raise Exception(f"TTS audio too short ({audio_duration:.2f}s) for video ({video.duration:.2f}s)")
                except Exception as e:
                    logger.error(f"❌ Could not use original TTS audio: {e}")
                    raise Exception(f"Cannot create video with valid audio: {e}")
            else:
                raise Exception(f"Audio too short ({audio_duration:.2f}s) for video ({video.duration:.2f}s) - would be mostly silent")

        logger.info(f"✅ Audio clip loaded: {audio_duration:.2f}s (video: {video.duration:.2f}s)")
        final_video = video.set_audio(new_audio)

        # 7. Write output
        logger.info("Writing output video...")
        progress(0.9, desc="Rendering output video...")
        final_video.write_videofile(
            output_video_path,
            codec='libx264',
            audio_codec='aac',
            verbose=False,
            logger=None
        )
        prune_outputs()
        progress(1.0, desc="Done")
        logger.info(f"✅ Video localization complete! Saved to {output_video_path}")
        return output_video_path, original_text, translated_text

    except Exception as e:
        logger.error(f"Pipeline Error: {e}")
        return None, str(e), "Error"

    finally:
        # Cleanup: release clip handles and remove the scratch directory.
        if video: video.close()
        if new_audio: new_audio.close()
        try:
            shutil.rmtree(temp_dir, ignore_errors=True)
        except Exception as cleanup_error:
            logger.debug(f"Temp cleanup skipped: {cleanup_error}")
|
| 1072 |
+
|
| 1073 |
+
|
| 1074 |
+
def process_video_sync(
    video_path: str,
    target_lang: str = "es",
    elevenlabs_api_key: str | None = None,
    progress_callback=None,
) -> tuple:
    """
    Synchronous wrapper for async video processing.

    Handles event loop creation safely whether or not a loop is already
    running (e.g. inside Gradio/Jupyter).

    Returns:
        (output_path, original_text, translated_text)
    """
    # FIX: asyncio.get_event_loop() is deprecated since 3.10 and could
    # implicitly create a loop; get_running_loop() makes the two cases
    # (inside vs outside an event loop) explicit.
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None

    coro = process_video_async(
        video_path,
        target_lang,
        elevenlabs_api_key=elevenlabs_api_key,
        progress_callback=progress_callback,
    )

    if running_loop is not None:
        # Already inside an async context: patch the loop so that
        # run_until_complete can nest.
        import nest_asyncio
        nest_asyncio.apply()
        return running_loop.run_until_complete(coro)

    # No running loop, safe to use asyncio.run()
    return asyncio.run(coro)
|
| 1122 |
+
|
| 1123 |
+
|
| 1124 |
+
# Convenience alias for backward compatibility
|
| 1125 |
+
def process_video(
    video_path: str,
    target_lang: str = "es",
    elevenlabs_api_key: str | None = None,
    progress_callback=None,
) -> tuple:
    """
    Main entry point for video localization.

    A convenience/backward-compatibility alias that simply delegates to
    :func:`process_video_sync`.

    Returns:
        (output_path, original_text, translated_text)
    """
    return process_video_sync(
        video_path,
        target_lang,
        elevenlabs_api_key=elevenlabs_api_key,
        progress_callback=progress_callback,
    )
|
| 1144 |
+
|
| 1145 |
+
|
| 1146 |
+
# ==========================
|
| 1147 |
+
# Startup Validation
|
| 1148 |
+
# ==========================
|
| 1149 |
+
# Validate ElevenLabs on module import
|
| 1150 |
+
def _validate_elevenlabs_on_startup():
    """Validate ElevenLabs on module import.

    Records the outcome in the module-level ``_elevenlabs_status`` flag:
    one of "not_installed", "no_key", "invalid_key", or "ready".
    """
    global ELEVENLABS_AVAILABLE, _elevenlabs_status
    logger.info("Initializing Video Localization Engine...")

    # Guard 1: SDK not even installed.
    if not ELEVENLABS_AVAILABLE:
        logger.info("ElevenLabs not installed. Using open source models (EdgeTTS, Coqui, gTTS)")
        _elevenlabs_status = "not_installed"
        return

    # Guard 2: SDK present but no key configured.
    if not DEFAULT_ELEVENLABS_API_KEY:
        logger.info("No ElevenLabs API key found. Using open source models (EdgeTTS, Coqui, gTTS)")
        logger.info("Add your API key in the UI for premium voice quality")
        _elevenlabs_status = "no_key"
        return

    # Key present: probe it once so startup logs show real capability.
    is_valid, message = validate_elevenlabs_api_key(DEFAULT_ELEVENLABS_API_KEY)
    if is_valid:
        logger.info("ElevenLabs API key found and validated")
        _elevenlabs_status = "ready"
    else:
        logger.info(f"ElevenLabs API key not valid. Using open source models: {message}")
        _elevenlabs_status = "invalid_key"


# Run validation on import
_validate_elevenlabs_on_startup()
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ffmpeg
|
requirements.txt
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Web Framework
|
| 2 |
+
gradio>=4.44.1,<5.0 # UI framework (4.44.1 fixes some bugs)
|
| 3 |
+
|
| 4 |
+
# AI/ML
|
| 5 |
+
huggingface-hub>=0.19.3,<1.0 # HF Inference API client (compatible with Gradio 4.x)
|
| 6 |
+
|
| 7 |
+
# MCP Protocol
|
| 8 |
+
mcp==1.0.0 # Model Context Protocol SDK
|
| 9 |
+
|
| 10 |
+
# Video Processing
|
| 11 |
+
moviepy==1.0.3 # Video editing library
|
| 12 |
+
pydub==0.25.1 # Audio manipulation and time-stretching
|
| 13 |
+
|
| 14 |
+
# Text-to-Speech
|
| 15 |
+
elevenlabs>=1.0.0 # ElevenLabs - premium professional-grade TTS (PRIMARY)
|
| 16 |
+
edge-tts==6.1.9 # High-quality TTS engine (fallback)
|
| 17 |
+
gTTS==2.5.4 # Fallback cloud TTS (last resort)
|
| 18 |
+
# Note: Coqui TTS (TTS>=0.22.0) removed due to pandas dependency conflict with Gradio
|
| 19 |
+
# The app works perfectly without it using EdgeTTS and gTTS as fallbacks
|
| 20 |
+
|
| 21 |
+
# Async Support
|
| 22 |
+
nest-asyncio==1.6.0 # Allow nested event loops
|
| 23 |
+
|
| 24 |
+
# Configuration
|
| 25 |
+
python-dotenv==1.0.0 # Environment variable management
|
| 26 |
+
|
| 27 |
+
# Core
|
| 28 |
+
numpy<2.0,>=1.26.0 # Array operations (NumPy 2.x incompatible with some dependencies)
|
| 29 |
+
requests>=2.31.0 # HTTP requests for direct API calls
|
| 30 |
+
|
| 31 |
+
# Local Whisper for offline transcription (required)
|
| 32 |
+
openai-whisper>=20231117 # Local speech recognition - primary transcription method
|
| 33 |
+
|
| 34 |
+
# Translation
|
| 35 |
+
deep-translator>=1.11.4 # Local translation library (primary method, no API key needed)
|
server.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mcp.server.fastmcp import FastMCP
|
| 2 |
+
from localizer_engine import process_video_sync
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
# Initialize MCP Server
|
| 6 |
+
mcp = FastMCP("Global Video Localizer")
|
| 7 |
+
|
| 8 |
+
@mcp.tool()
def localize_video_file(
    file_path: str,
    target_language: str = "es",
    elevenlabs_api_key: str | None = None,
) -> str:
    """
    Localizes a video file into a target language.

    Args:
        file_path: Absolute path to the video file (.mp4)
        target_language: Language code (es, fr, de, it, ja, zh, hi, ar)
        elevenlabs_api_key: Optional API key for premium ElevenLabs voices (not persisted)

    Returns:
        Path to the localized video file and summary.
    """
    if not os.path.exists(file_path):
        return f"Error: File not found at {file_path}"

    # Use the safe synchronous wrapper; keep the try narrow around it.
    try:
        result_path, source_text, localized_text = process_video_sync(
            file_path,
            target_language,
            elevenlabs_api_key=elevenlabs_api_key,
        )
    except Exception as e:
        return f"❌ Critical Error: {str(e)}"

    if not result_path:
        # On failure the engine puts the error message in the first text slot.
        return f"❌ Error processing video: {source_text}"

    return f"""✅ Video localization successful!

Target Language: {target_language}
Original Text: {source_text[:100]}...
Translated Text: {localized_text[:100]}...

Output saved to: {result_path}
"""
|
| 50 |
+
|
| 51 |
+
if __name__ == "__main__":
    print("Starting Global Video Localizer MCP Server...")
    # FIX: the banner advertised "Qwen translation", but the engine actually
    # translates with deep-translator and an NLLB fallback.
    print("✨ Features: Whisper transcription + deep-translator/NLLB translation + Edge TTS + Audio time-stretching")
    mcp.run()
|