Upload 26 files
- .dockerignore +55 -0
- .gitattributes +5 -0
- .gitignore +8 -0
- Dockerfile +24 -0
- LICENSE +21 -0
- README.md +59 -10
- api_server/auth_middleware.py +27 -0
- api_server/v1_media_router.py +755 -0
- api_server/v1_utils_router.py +167 -0
- assets/anton.ttf +3 -0
- assets/icon_volume.png +3 -0
- assets/noto.ttf +3 -0
- assets/noto_hindi.ttf +3 -0
- assets/person.png +3 -0
- cuda.Dockerfile +45 -0
- requirements.txt +16 -0
- server.py +57 -0
- utils/image.py +386 -0
- utils/proxy.py +0 -0
- video/builder.py +347 -0
- video/caption.py +354 -0
- video/config.py +53 -0
- video/media.py +850 -0
- video/storage.py +323 -0
- video/stt.py +41 -0
- video/tts.py +443 -0
- video/tts_chatterbox.py +256 -0
.dockerignore
ADDED
@@ -0,0 +1,55 @@
+# Git
+.git
+.gitignore
+
+# Docker
+Dockerfile
+.dockerignore
+._.DS_Store
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+.pytest_cache
+
+# Generated files
+*.mp4
+!bgvideo.mp4
+!bgvideo2.mp4
+!video.mp4
+!video2.mp4
+*.wav
+!0.wav
+multi.mp4
+tmp/
+!captions/
+
+# Editor files
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Temporary files
+.DS_Store
+.cache/
+*.tmp
+*.bak
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/anton.ttf filter=lfs diff=lfs merge=lfs -text
+assets/icon_volume.png filter=lfs diff=lfs merge=lfs -text
+assets/noto_hindi.ttf filter=lfs diff=lfs merge=lfs -text
+assets/noto.ttf filter=lfs diff=lfs merge=lfs -text
+assets/person.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,8 @@
+__pycache__
+.hypothesis
+.venv
+media
+tmp
+captions
+.DS_Store
+._.DS_Store
Dockerfile
ADDED
@@ -0,0 +1,24 @@
+FROM python:3.10-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y \
+    fonts-ebgaramond \
+    ffmpeg \
+    libsndfile1 \
+    fonts-dejavu \
+    build-essential \
+    g++ \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY api_server /app/api_server
+COPY utils /app/utils
+COPY video /app/video
+COPY server.py /app/server.py
+
+ENV PYTHONUNBUFFERED=1
+
+CMD ["fastapi", "run", "server.py", "--host", "0.0.0.0", "--port", "8000"]
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 David Gyori
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,10 +1,59 @@
-
-
-
-
-
-
-
-
-
-
+# AI Agents A-Z No-Code Tools (V1)
+
+Video editing tools to use with no-code tools like n8n, Zapier, and Make. Brought to you by [AI Agents A-Z](https://www.youtube.com/@aiagentsaz).
+
+## [📚 Join our Skool community for the premium edition of the server and other premium content](https://www.skool.com/ai-agents-az/about)
+
+[Watch the YouTube video featuring this project](https://www.youtube.com/watch?v=1-UuldAM6fQ)
+
+### Be part of a growing community and help us create more content like this
+
+# Starting the project
+
+## Using Docker
+
+```
+docker run --rm -p 8000:8000 -it gyoridavid/ai-agents-no-code-tools:latest
+```
+
+If you have an NVIDIA GPU and have the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) installed, you can run the server with GPU support:
+
+```
+docker run --rm --gpus=all -e NVIDIA_VISIBLE_DEVICES=all -e NVIDIA_DRIVER_CAPABILITIES=all -p 8000:8000 -it gyoridavid/ai-agents-no-code-tools:latest-cuda
+```
+
+## With Python
+
+1. Clone the repository
+2. Create a virtual environment
+```bash
+python -m venv venv
+```
+3. Activate the virtual environment
+   - On Windows:
+```bash
+venv\Scripts\activate
+```
+   - On macOS/Linux:
+```bash
+source venv/bin/activate
+```
+4. Install the dependencies
+```bash
+pip install -r requirements.txt
+```
+5. Run the application
+```bash
+fastapi dev server.py --host 0.0.0.0
+```
+
+# Documentation
+
+After starting the project, you can access the documentation at [http://localhost:8000/docs](http://localhost:8000/docs).
+
+# Contributing
+
+While PRs are welcome, please note that due to the nature of the project, I may not be able to review them in a timely manner. If you have any questions or suggestions, feel free to open an issue.
+
+# License
+This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
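Once the server is running with either method above, a quick way to verify it is responding is to hit the `/health` endpoint defined in `server.py`. A minimal smoke test, assuming the default host and port from the README:

```python
# Smoke test against a locally running instance (default port 8000).
import requests

resp = requests.get("http://localhost:8000/health", timeout=5)
print(resp.status_code, resp.json())  # expected: 200 {'status': 'ok'}
```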
api_server/auth_middleware.py
ADDED
@@ -0,0 +1,27 @@
+from fastapi import Request, status
+from fastapi.responses import JSONResponse
+from loguru import logger
+import os
+
+
+auth_tokens = os.getenv("AUTH_TOKENS", "").split(",") if os.getenv("AUTH_TOKENS") else []
+
+async def auth_middleware(request: Request, call_next):
+    # skip authentication if the auth_tokens list is empty
+    if not len(auth_tokens):
+        return await call_next(request)
+    # authenticate all requests except the /health endpoint
+    if request.url.path != "/health":
+        auth_token = request.headers.get("Authorization")
+        logger.bind(
+            path=request.url.path,
+            method=request.method,
+            auth_token=auth_token,
+        ).debug("Received request")
+        if not auth_token or auth_token not in auth_tokens:
+            return JSONResponse(
+                status_code=status.HTTP_401_UNAUTHORIZED,
+                content={"error": "Unauthorized"},
+            )
+    response = await call_next(request)
+    return response
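Note that the middleware compares the raw `Authorization` header value against the comma-separated `AUTH_TOKENS` list, so the header carries the bare token with no `Bearer ` prefix, and only `/health` is exempt. A client-side sketch, with a hypothetical token value:

```python
# Calling an authenticated instance started with e.g. AUTH_TOKENS="secret1,secret2".
# "secret1" below is a hypothetical token; any path except /health requires the header.
import requests

headers = {"Authorization": "secret1"}
resp = requests.get("http://localhost:8000/", headers=headers)
print(resp.status_code)  # 401 without a matching token, 200 with one
```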
api_server/v1_media_router.py
ADDED
@@ -0,0 +1,755 @@
+from fastapi import Query, Request, status, APIRouter, UploadFile, File, Form, BackgroundTasks
+from fastapi.responses import JSONResponse, StreamingResponse
+from typing import Literal, Optional
+import os
+from loguru import logger
+import matplotlib.font_manager as fm
+
+from video.tts import TTS
+from video.tts_chatterbox import TTSChatterbox
+from video.stt import STT
+from video.storage import Storage
+from video.caption import Caption
+from video.media import MediaUtils
+from video.builder import VideoBuilder
+from utils.image import resize_image_cover
+
+CHUNK_SIZE = 1024 * 1024 * 10  # 10MB chunks
+
+def iterfile(path: str):
+    with open(path, mode="rb") as file:
+        while chunk := file.read(CHUNK_SIZE):
+            yield chunk
+
+
+v1_media_api_router = APIRouter()
+
+storage_path = os.getenv("STORAGE_PATH", os.path.join(os.path.abspath(os.getcwd()), "media"))
+
+storage = Storage(
+    storage_path=storage_path,
+)
+stt = STT()
+tts_manager = TTS()
+tts_chatterbox = TTSChatterbox()
+
+@v1_media_api_router.post("/audio-tools/transcribe")
+def transcribe(
+    audio_file: UploadFile = File(..., description="Audio file to transcribe"),
+    language: Optional[str] = Form(None, description="Language code (optional)"),
+):
+    """
+    Transcribe an audio file to text.
+    """
+    logger.bind(language=language, filename=audio_file.filename).info(
+        "Transcribing audio file"
+    )
+    captions, duration = stt.transcribe(audio_file.file, beam_size=5, language=language)
+    transcription = "".join([cap["text"] for cap in captions])
+
+    return {
+        "transcription": transcription,
+        "duration": duration,
+    }
+
+@v1_media_api_router.get("/audio-tools/tts/kokoro/voices")
+def get_kokoro_voices():
+    voices = tts_manager.valid_kokoro_voices()
+    return {"voices": voices}
+
+
+@v1_media_api_router.post("/audio-tools/tts/kokoro")
+def generate_kokoro_tts(
+    background_tasks: BackgroundTasks,
+    text: str = Form(..., description="Text to convert to speech"),
+    voice: Optional[str] = Form(None, description="Voice name for kokoro TTS"),
+    speed: Optional[float] = Form(None, description="Speed for kokoro TTS"),
+):
+    """
+    Generate audio from text using the Kokoro TTS engine.
+    """
+    if not voice:
+        voice = "af_heart"
+    voices = tts_manager.valid_kokoro_voices()
+    if voice not in voices:
+        return JSONResponse(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            content={"error": f"Invalid voice: {voice}. Valid voices: {voices}"},
+        )
+    audio_id, audio_path = storage.create_media_filename_with_id(
+        media_type="audio", file_extension=".wav"
+    )
+    tmp_file_id = storage.create_tmp_file(audio_id)
+
+    def bg_task():
+        tts_manager.kokoro(
+            text=text,
+            output_path=audio_path,
+            voice=voice,
+            speed=speed if speed else 1.0,
+        )
+        storage.delete_media(tmp_file_id)
+
+    logger.info(f"Adding background task for TTS generation with ID: {audio_id}")
+    background_tasks.add_task(bg_task)
+    logger.info(f"Background task added for TTS generation with ID: {audio_id}")
+
+    return {"file_id": audio_id}
+
+
+@v1_media_api_router.post("/audio-tools/tts/chatterbox")
+def generate_chatterbox_tts(
+    background_tasks: BackgroundTasks,
+    text: str = Form(..., description="Text to convert to speech"),
+    sample_audio_id: Optional[str] = Form(
+        None, description="Sample audio ID for voice cloning"
+    ),
+    sample_audio_file: Optional[UploadFile] = File(
+        None, description="Sample audio file for voice cloning"
+    ),
+    exaggeration: Optional[float] = Form(
+        0.5, description="Exaggeration factor for voice cloning (default: 0.5)"
+    ),
+    cfg_weight: Optional[float] = Form(0.5, description="CFG weight for voice cloning (default: 0.5)"),
+    temperature: Optional[float] = Form(
+        0.8, description="Temperature for voice cloning (default: 0.8)"
+    ),
+    chunk_chars: Optional[int] = Form(1024, description="Max characters per chunk (default: 1024)"),
+    chunk_silence_ms: Optional[int] = Form(
+        350, description="Silence duration between chunks in milliseconds (default: 350)"
+    ),
+):
+    """
+    Generate audio from text using Chatterbox TTS.
+    """
+    audio_id, audio_path = storage.create_media_filename_with_id(
+        media_type="audio", file_extension=".wav"
+    )
+
+    sample_audio_path = None
+    if sample_audio_file:
+        if not sample_audio_file.filename.endswith(".wav"):
+            return JSONResponse(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                content={"error": "Sample audio file must be a .wav file."},
+            )
+        sample_audio_id = storage.upload_media(
+            media_type="tmp",
+            media_data=sample_audio_file.file.read(),
+            file_extension=".wav",
+        )
+        sample_audio_path = storage.get_media_path(sample_audio_id)
+    elif sample_audio_id:
+        if not storage.media_exists(sample_audio_id):
+            return JSONResponse(
+                status_code=status.HTTP_404_NOT_FOUND,
+                content={"error": f"Sample audio with ID {sample_audio_id} not found."},
+            )
+        sample_audio_path = storage.get_media_path(sample_audio_id)
+
+    tmp_file_id = storage.create_tmp_file(audio_id)
+
+    def bg_task():
+        try:
+            tts_chatterbox.chatterbox(
+                text=text,
+                output_path=audio_path,
+                sample_audio_path=sample_audio_path,
+                exaggeration=exaggeration,
+                cfg_weight=cfg_weight,
+                temperature=temperature,
+                chunk_chars=chunk_chars,
+                chunk_silence_ms=chunk_silence_ms,
+            )
+        except Exception as e:
+            logger.error(f"Error in Chatterbox TTS: {e}")
+        finally:
+            storage.delete_media(tmp_file_id)
+
+    logger.info(f"Adding background task for Chatterbox TTS generation with ID: {audio_id}")
+    background_tasks.add_task(bg_task)
+    logger.info(f"Background task added for Chatterbox TTS generation with ID: {audio_id}")
+
+    return {"file_id": audio_id}
+
+
+@v1_media_api_router.post("/storage")
+def upload_file(
+    file: Optional[UploadFile] = File(None, description="File to upload"),
+    url: Optional[str] = Form(None, description="URL of the file to upload (optional)"),
+    media_type: Literal["image", "video", "audio"] = Form(
+        ..., description="Type of media being uploaded"
+    ),
+):
+    """
+    Upload a file and return its ID.
+    """
+    if media_type not in ["image", "video", "audio"]:
+        return JSONResponse(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            content={"error": f"Invalid media type: {media_type}"},
+        )
+    if file:
+        file_id = storage.upload_media(
+            media_type=media_type,
+            media_data=file.file.read(),
+            file_extension=os.path.splitext(file.filename)[1],
+        )
+
+        return {"file_id": file_id}
+    elif url:
+        if not storage.is_valid_url(url):
+            return JSONResponse(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                content={"error": f"Invalid URL: {url}"},
+            )
+        file_id = storage.upload_media_from_url(media_type=media_type, url=url)
+        return {"file_id": file_id}
+    # neither a file nor a URL was provided
+    return JSONResponse(
+        status_code=status.HTTP_400_BAD_REQUEST,
+        content={"error": "Either a file or a url must be provided."},
+    )
+
+
+@v1_media_api_router.get("/storage/{file_id}")
+def download_file(file_id: str):
+    """
+    Download a file by its ID.
+    """
+    if not storage.media_exists(file_id):
+        return JSONResponse(
+            status_code=status.HTTP_404_NOT_FOUND,
+            content={"error": f"File with ID {file_id} not found."},
+        )
+
+    file_path = storage.get_media_path(file_id)
+    return StreamingResponse(
+        iterfile(file_path),
+        media_type="application/octet-stream",
+        headers={
+            "Content-Disposition": f"attachment; filename={os.path.basename(file_path)}"
+        },
+    )
+
+
+@v1_media_api_router.delete("/storage/{file_id}")
+def delete_file(file_id: str):
+    """
+    Delete a file by its ID.
+    """
+    if storage.media_exists(file_id):
+        storage.delete_media(file_id)
+    return {"status": "success"}
+
+
+@v1_media_api_router.get("/storage/{file_id}/status")
+def file_status(file_id: str):
+    """
+    Check the status of a file by its ID.
+    """
+    tmp_id = storage.create_tmp_file_id(file_id)
+    if storage.media_exists(tmp_id):
+        return {"status": "processing"}
+    elif storage.media_exists(file_id):
+        return {"status": "ready"}
+    return {"status": "not_found"}
+
+
+@v1_media_api_router.post("/video-tools/merge")
+def merge_videos(
+    background_tasks: BackgroundTasks,
+    video_ids: str = Form(..., description="Comma-separated list of video IDs to merge"),
+    background_music_id: Optional[str] = Form(
+        None, description="Background music ID (optional)"
+    ),
+    background_music_volume: Optional[float] = Form(
+        0.5, description="Volume for background music (0.0 to 1.0)"
+    ),
+):
+    """
+    Merge multiple videos into one.
+    """
+    video_ids = video_ids.split(",") if video_ids else []
+    if not video_ids:
+        return JSONResponse(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            content={"error": "At least one video ID is required."},
+        )
+
+    merged_video_id, merged_video_path = storage.create_media_filename_with_id(
+        media_type="video", file_extension=".mp4"
+    )
+
+    video_paths = []
+    for video_id in video_ids:
+        if not storage.media_exists(video_id):
+            return JSONResponse(
+                status_code=status.HTTP_404_NOT_FOUND,
+                content={"error": f"Video with ID {video_id} not found."},
+            )
+        video_paths.append(storage.get_media_path(video_id))
+
+    if background_music_id and not storage.media_exists(background_music_id):
+        return JSONResponse(
+            status_code=status.HTTP_404_NOT_FOUND,
+            content={
+                "error": f"Background music with ID {background_music_id} not found."
+            },
+        )
+    background_music_path = (
+        storage.get_media_path(background_music_id) if background_music_id else None
+    )
+
+    utils = MediaUtils()
+
+    temp_file_id = storage.create_tmp_file(merged_video_id)
+
+    def bg_task():
+        utils.merge_videos(
+            video_paths=video_paths,
+            output_path=merged_video_path,
+            background_music_path=background_music_path,
+            background_music_volume=background_music_volume,
+        )
+        storage.delete_media(temp_file_id)
+
+    logger.info(f"Adding background task for video merge with ID: {merged_video_id}")
+    background_tasks.add_task(bg_task)
+    logger.info(f"Background task added for video merge with ID: {merged_video_id}")
+
+    return {"file_id": merged_video_id}
+
+
+@v1_media_api_router.get('/fonts')
+def list_fonts():
+    fonts = set()
+    for fname in fm.findSystemFonts(fontpaths=None, fontext='ttf'):
+        try:
+            prop = fm.FontProperties(fname=fname)
+            name = prop.get_name()
+            fonts.add(name)
+        except RuntimeError:
+            continue
+    return {"fonts": sorted(fonts)}
+
+@v1_media_api_router.post("/video-tools/generate/tts-captioned-video")
+def generate_captioned_video(
+    background_tasks: BackgroundTasks,
+    background_id: str = Form(..., description="Background image ID"),
+    text: Optional[str] = Form(None, description="Text to generate video from"),
+    width: Optional[int] = Form(1080, description="Width of the video (default: 1080)"),
+    height: Optional[int] = Form(
+        1920, description="Height of the video (default: 1920)"
+    ),
+    audio_id: Optional[str] = Form(
+        None, description="Audio ID for the video (optional)"
+    ),
+    kokoro_voice: Optional[str] = Form(
+        "af_heart", description="Voice for kokoro TTS (default: af_heart)"
+    ),
+    kokoro_speed: Optional[float] = Form(
+        1.0, description="Speed for kokoro TTS (default: 1.0)"
+    ),
+    language: Optional[str] = Form(
+        None, description="Language code for STT (optional, e.g. 'en', 'fr', 'de'), defaults to None (auto-detect language if audio_id is provided)"
+    ),
+    image_effect: Optional[str] = Form("ken_burns", description="Effect to apply to the background image, options: ken_burns, pan (default: 'ken_burns')"),
+    # Flattened subtitle configuration options
+    caption_config_line_count: Optional[int] = Form(1, description="Number of lines per subtitle segment (default: 1)", ge=1, le=5),
+    caption_config_line_max_length: Optional[int] = Form(1, description="Maximum characters per line (default: 1)", ge=1, le=200),
+    caption_config_font_size: Optional[int] = Form(120, description="Font size for subtitles (default: 120)", ge=8, le=200),
+    caption_config_font_name: Optional[str] = Form("Arial", description="Font family name (default: 'Arial', see the available fonts from the /fonts endpoint)"),
+    caption_config_font_bold: Optional[bool] = Form(True, description="Whether to use bold font (default: True)"),
+    caption_config_font_italic: Optional[bool] = Form(False, description="Whether to use italic font (default: False)"),
+    caption_config_font_color: Optional[str] = Form("#fff", description="Font color in hex format (default: '#fff')"),
+    caption_config_subtitle_position: Optional[Literal["top", "center", "bottom"]] = Form("top", description="Vertical position of subtitles (default: 'top')"),
+    caption_config_shadow_color: Optional[str] = Form("#000", description="Shadow color in hex format (default: '#000')"),
+    caption_config_shadow_transparency: Optional[float] = Form(0.4, description="Shadow transparency from 0.0 to 1.0 (default: 0.4)", ge=0.0, le=1.0),
+    caption_config_shadow_blur: Optional[int] = Form(10, description="Shadow blur radius (default: 10)", ge=0, le=20),
+    caption_config_stroke_color: Optional[str] = Form(None, description="Stroke/outline color in hex format (defaults to '#000' if not set)"),
+    caption_config_stroke_size: Optional[int] = Form(5, description="Stroke/outline size (default: 5)", ge=0, le=10),
+):
+    """
+    Generate a captioned video from text and a background image.
+    """
+    # Build subtitle options from individual parameters
+    parsed_subtitle_options = {}
+
+    # Only include non-None values
+    if caption_config_line_count is not None:
+        parsed_subtitle_options['lines'] = caption_config_line_count
+    if caption_config_line_max_length is not None:
+        parsed_subtitle_options['max_length'] = caption_config_line_max_length
+    if caption_config_font_size is not None:
+        parsed_subtitle_options['font_size'] = caption_config_font_size
+    if caption_config_font_name is not None:
+        parsed_subtitle_options['font_name'] = caption_config_font_name
+    if caption_config_font_bold is not None:
+        parsed_subtitle_options['font_bold'] = caption_config_font_bold
+    if caption_config_font_italic is not None:
+        parsed_subtitle_options['font_italic'] = caption_config_font_italic
+    if caption_config_font_color is not None:
+        parsed_subtitle_options['font_color'] = caption_config_font_color
+    if caption_config_subtitle_position is not None:
+        parsed_subtitle_options['subtitle_position'] = caption_config_subtitle_position
+    if caption_config_shadow_color is not None:
+        parsed_subtitle_options['shadow_color'] = caption_config_shadow_color
+    if caption_config_shadow_transparency is not None:
+        parsed_subtitle_options['shadow_transparency'] = caption_config_shadow_transparency
+    if caption_config_shadow_blur is not None:
+        parsed_subtitle_options['shadow_blur'] = caption_config_shadow_blur
+    if caption_config_stroke_color is not None:
+        parsed_subtitle_options['stroke_color'] = caption_config_stroke_color
+    if caption_config_stroke_size is not None:
+        parsed_subtitle_options['stroke_size'] = caption_config_stroke_size
+
+    if audio_id and not storage.media_exists(audio_id):
+        return JSONResponse(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            content={"error": f"Audio with ID {audio_id} not found."},
+        )
+    if not audio_id and kokoro_voice not in tts_manager.valid_kokoro_voices():
+        return JSONResponse(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            content={"error": f"Invalid voice: {kokoro_voice}."},
+        )
+    media_type = storage.get_media_type(background_id)
+    if media_type not in ["image"]:
+        return JSONResponse(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            content={"error": f"Invalid media type: {media_type}"},
+        )
+    if not storage.media_exists(background_id):
+        return JSONResponse(
+            status_code=status.HTTP_404_NOT_FOUND,
+            content={"error": f"Background image with ID {background_id} not found."},
+        )
+
+    output_id, output_path = storage.create_media_filename_with_id(
+        media_type="video", file_extension=".mp4"
+    )
+    dimensions = (width, height)
+    builder = VideoBuilder(
+        dimensions=dimensions,
+    )
+    builder.set_media_utils(MediaUtils())
+
+    tmp_file_id = storage.create_tmp_file(output_id)
+
+    def bg_task(
+        tmp_file_id: str = tmp_file_id,
+    ):
+        tmp_file_ids = [tmp_file_id]
+
+        # set audio, generate captions
+        captions = None
+        tts_audio_id = audio_id
+        from video.tts import LANGUAGE_VOICE_MAP
+        lang_config = LANGUAGE_VOICE_MAP.get(kokoro_voice, {})
+        international = lang_config.get("international", False)
+
+        if tts_audio_id:
+            audio_path = storage.get_media_path(tts_audio_id)
+            captions = stt.transcribe(audio_path=audio_path, language=language)[0]
+            builder.set_audio(audio_path)
+        # generate TTS and set audio
+        else:
+            tts_audio_id, audio_path = storage.create_media_filename_with_id(
+                media_type="audio", file_extension=".wav"
+            )
+            tmp_file_ids.append(tts_audio_id)
+            captions = tts_manager.kokoro(
+                text=text,
+                output_path=audio_path,
+                voice=kokoro_voice,
+                speed=kokoro_speed,
+            )[0]
+            if international:
+                # use whisper to create captions
+                iso_lang_code = lang_config.get("iso639_1")
+                captions = stt.transcribe(audio_path=audio_path, language=iso_lang_code)[0]
+
+            builder.set_audio(audio_path)
+
+        # create subtitle
+        captionsManager = Caption()
+        subtitle_id, subtitle_path = storage.create_media_filename_with_id(
+            media_type="tmp", file_extension=".ass"
+        )
+        tmp_file_ids.append(subtitle_id)
+
+        # create segments based on language
+        if international:
+            segments = captionsManager.create_subtitle_segments_english(
+                captions=captions,
+                lines=parsed_subtitle_options.get('lines', 1),
+                max_length=parsed_subtitle_options.get('max_length', 1),
+            )
+        else:
+            segments = captionsManager.create_subtitle_segments_international(
+                captions=captions,
+                lines=parsed_subtitle_options.get('lines', 1),
+                max_length=parsed_subtitle_options.get('max_length', 1),
+            )
+
+        captionsManager.create_subtitle(
+            segments=segments,
+            output_path=subtitle_path,
+            dimensions=dimensions,
+            font_size=parsed_subtitle_options.get('font_size', 120),
+            shadow_blur=parsed_subtitle_options.get('shadow_blur', 10),
+            stroke_size=parsed_subtitle_options.get('stroke_size', 5),
+            shadow_color=parsed_subtitle_options.get('shadow_color', "#000"),
+            stroke_color=parsed_subtitle_options.get('stroke_color', "#000"),
+            font_name=parsed_subtitle_options.get('font_name', "Arial"),
+            font_bold=parsed_subtitle_options.get('font_bold', True),
+            font_italic=parsed_subtitle_options.get('font_italic', False),
+            subtitle_position=parsed_subtitle_options.get('subtitle_position', "top"),
+            font_color=parsed_subtitle_options.get('font_color', "#fff"),
+            shadow_transparency=parsed_subtitle_options.get('shadow_transparency', 0.4),
+        )
+        builder.set_captions(
+            file_path=subtitle_path,
+        )
+
+        # resize background image if needed
+        background_path = storage.get_media_path(background_id)
+        utils = MediaUtils()
+        info = utils.get_video_info(background_path)
+        if info.get("width", 0) != width or info.get("height", 0) != height:
+            logger.bind(
+                image_width=info.get("width", 0),
+                image_height=info.get("height", 0),
+                target_width=width,
+                target_height=height,
+            ).debug(
+                "Resizing background image to fit video dimensions"
+            )
+            _, resized_background_path = storage.create_media_filename_with_id(
+                media_type="image", file_extension=".jpg"
+            )
+            resize_image_cover(
+                image_path=background_path,
+                output_path=resized_background_path,
+                target_width=width,
+                target_height=height,
+            )
+            background_path = resized_background_path
+
+        builder.set_background_image(
+            background_path,
+            effect_config={
+                "effect": image_effect,
+            }
+        )
+
+        builder.set_output_path(output_path)
+
+        builder.execute()
+
+        for tmp_file_id in tmp_file_ids:
+            if storage.media_exists(tmp_file_id):
+                storage.delete_media(tmp_file_id)
+
+    logger.info(f"Adding background task for captioned video generation with ID: {output_id}")
+    background_tasks.add_task(bg_task, tmp_file_id=tmp_file_id)
+    logger.info(f"Background task added for captioned video generation with ID: {output_id}")
+
+    return {
+        "file_id": output_id,
+    }
+
+# https://ffmpeg.org/ffmpeg-filters.html#colorkey
+@v1_media_api_router.post("/video-tools/add-colorkey-overlay")
+def add_colorkey_overlay(
+    background_tasks: BackgroundTasks,
+    video_id: str = Form(..., description="Video ID to overlay"),
+    overlay_video_id: str = Form(..., description="Overlay video ID"),
+    color: Optional[str] = Form(
+        "green", description="Set the color for which alpha will be set to 0 (full transparency). Use the name of the color or a hex code (e.g. 'red' or '#ff0000')"
+    ),
+    similarity: Optional[float] = Form(
+        0.1, description="Set the radius from the key color within which other colors also have full transparency (default: 0.1)"
+    ),
+    blend: Optional[float] = Form(
+        0.1, description="Set how the alpha value for pixels that fall outside the similarity radius is computed (default: 0.1)"
+    ),
+):
+    """
+    Overlay a video on a video with the specified colorkey and intensity.
+    """
+
+    if not storage.media_exists(video_id):
+        return JSONResponse(
+            status_code=status.HTTP_404_NOT_FOUND,
+            content={"error": f"Video with ID {video_id} not found."},
+        )
+    if not storage.media_exists(overlay_video_id):
+        return JSONResponse(
+            status_code=status.HTTP_404_NOT_FOUND,
+            content={"error": f"Overlay video with ID {overlay_video_id} not found."},
+        )
+
+    video_path = storage.get_media_path(video_id)
+    overlay_video_path = storage.get_media_path(overlay_video_id)
+
+    output_id, output_path = storage.create_media_filename_with_id(
+        media_type="video", file_extension=".mp4"
+    )
+
+    tmp_file_id = storage.create_tmp_file(output_id)
+
+    def bg_task():
+        utils = MediaUtils()
+        utils.colorkey_overlay(
+            input_video_path=video_path,
+            overlay_video_path=overlay_video_path,
+            output_video_path=output_path,
+            color=color,
+            similarity=similarity,
+            blend=blend,
+        )
+        storage.delete_media(tmp_file_id)
+
+    logger.info(f"Adding background task for colorkey overlay with ID: {output_id}")
+    background_tasks.add_task(bg_task)
+    logger.info(f"Background task added for colorkey overlay with ID: {output_id}")
+
+    return {
+        "file_id": output_id,
+    }
+
+@v1_media_api_router.get("/video-tools/extract-frame/{video_id}")
+def extract_frame(
+    video_id: str,
+    timestamp: Optional[float] = Query(1.0, description="Timestamp in seconds to extract frame from (default: 1.0)")
+):
+    """
+    Extract a frame from a video at a specified timestamp.
+
+    Args:
+        video_id: Video ID to extract frame from
+        timestamp: Optional timestamp in seconds to extract frame from (default: first frame)
+    """
+    if not storage.media_exists(video_id):
+        return JSONResponse(
+            status_code=status.HTTP_404_NOT_FOUND,
+            content={"error": f"Video with ID {video_id} not found."},
+        )
+
+    video_path = storage.get_media_path(video_id)
+
+    _, output_path = storage.create_media_filename_with_id(
+        media_type="image", file_extension=".jpg"
+    )
+
+    utils = MediaUtils()
+    video_info = utils.get_video_info(video_path)
+    if video_info.get("duration", 0) <= float(timestamp):
+        timestamp = video_info.get("duration", 0) - 0.3
+
+    success = utils.extract_frame(
+        video_path=video_path,
+        output_path=output_path,
+        time_seconds=timestamp,
+    )
+
+    if not success:
+        return JSONResponse(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            content={"error": "Failed to extract frame from video."},
+        )
+
+    # Load file into memory
+    with open(output_path, "rb") as file:
+        file_data = file.read()
+
+    # Remove the output file
+    os.remove(output_path)
+
+    # Create streaming response with appropriate headers
+    from io import BytesIO
+    return StreamingResponse(
+        BytesIO(file_data),
+        media_type="image/jpeg",
+        headers={
+            "Content-Disposition": f"attachment; filename=frame_{video_id}_{timestamp or 'first'}.jpg"
+        },
+    )
+
+# extract x number of frames from the video, equally spaced
+@v1_media_api_router.post('/video-tools/extract-frames')
+def extract_frame_from_url(
+    url: str = Form(..., description="URL of the video to extract frames from"),
+    amount: int = Form(5, description="Number of frames to extract from the video (default: 5)"),
+    length_seconds: Optional[float] = Form(None, description="Length of the video in seconds (optional)"),
+    stitch: Optional[bool] = Form(False, description="Whether to stitch the frames into a single image (default: False)")
+):
+    template_id, template_path = storage.create_media_template(
+        media_type="image", file_extension=".jpg"
+    )
+    utils = MediaUtils()
+
+    if not length_seconds:
+        video_info = utils.get_video_info(url)
+        length_seconds = video_info.get("duration", 0)
+
+    utils.extract_frames(
+        video_path=url,
+        length_seconds=length_seconds,
+        amount=amount,
+        output_template=template_path,
+    )
+
+    image_ids = []
+    for i in range(amount):
+        padded_index = str(i + 1).zfill(2)
+        image_id = template_id.replace("%02d", padded_index)
+        image_ids.append(image_id)
+
+    return {
+        "message": f"Extracted {amount} frames from the video at {url}. The frames are saved in the template directory.",
+        "template_id": template_id,
+        "template_path": template_path,
+        "image_ids": image_ids,
+    }
+
+
+@v1_media_api_router.get("/video-tools/info/{file_id}")
+def get_video_info(file_id: str):
+    """
+    Get information about a video file.
+    """
+    if not storage.media_exists(file_id):
+        return JSONResponse(
+            status_code=status.HTTP_404_NOT_FOUND,
+            content={"error": f"Video with ID {file_id} not found."},
+        )
+
+    video_path = storage.get_media_path(file_id)
+
+    utils = MediaUtils()
+    info = utils.get_video_info(video_path)
+
+    return info
+
+@v1_media_api_router.get("/audio-tools/info/{file_id}")
+def get_audio_info(file_id: str):
+    """
+    Get information about an audio file.
+    """
+    if not storage.media_exists(file_id):
+        return JSONResponse(
+            status_code=status.HTTP_404_NOT_FOUND,
+            content={"error": f"Audio with ID {file_id} not found."},
+        )
+
+    audio_path = storage.get_media_path(file_id)
+
+    utils = MediaUtils()
+    info = utils.get_audio_info(audio_path)
+
+    return info
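The endpoints in this router share one async pattern: they return a `file_id` immediately, do the heavy work in a FastAPI background task, and use a temporary marker file so that `/storage/{file_id}/status` reports `processing` until the task deletes it. A client-side sketch of that workflow, assuming the router's mount prefix (set in the part of `server.py` not shown in this diff) is empty:

```python
# Generate Kokoro TTS, poll until the background task finishes, then download.
# BASE and the empty prefix are assumptions about the local deployment.
import time
import requests

BASE = "http://localhost:8000"

r = requests.post(f"{BASE}/audio-tools/tts/kokoro",
                  data={"text": "Hello world", "voice": "af_heart"})
file_id = r.json()["file_id"]

# status flips from "processing" to "ready" once the tmp marker is deleted
while requests.get(f"{BASE}/storage/{file_id}/status").json()["status"] == "processing":
    time.sleep(1)

with open("tts.wav", "wb") as f:
    f.write(requests.get(f"{BASE}/storage/{file_id}").content)
```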
api_server/v1_utils_router.py
ADDED
@@ -0,0 +1,167 @@
+import os
+from fastapi import BackgroundTasks, Form, status, APIRouter
+from fastapi.responses import JSONResponse, StreamingResponse
+from loguru import logger
+from video.storage import Storage
+from youtube_transcript_api import YouTubeTranscriptApi
+
+storage_path = os.getenv("STORAGE_PATH", os.path.join(os.path.abspath(os.getcwd()), "media"))
+
+storage = Storage(
+    storage_path=storage_path,
+)
+
+v1_utils_router = APIRouter()
+ytt_api = YouTubeTranscriptApi()
+
+@v1_utils_router.get("/youtube-transcript")
+def get_youtube_transcript(
+    video_id: str,
+):
+    """
+    Get a YouTube video transcript by video ID.
+    """
+    try:
+        fetched_transcript = ytt_api.fetch(video_id)
+        return {
+            "video_id": video_id,
+            "transcript": fetched_transcript
+        }
+    except Exception as e:
+        logger.error(f"Error fetching transcript for video {video_id}: {e}")
+        return JSONResponse(
+            status_code=status.HTTP_404_NOT_FOUND,
+            content={"error": f"Transcript for video {video_id} not found."},
+        )
+
+@v1_utils_router.post("/stitch-images")
+def stitch_images(
+    image_urls: str = Form(..., description="Comma-separated list of image URLs to stitch together"),
+    max_width: int = Form(1920, description="Maximum width of the final stitched image"),
+    max_height: int = Form(1080, description="Maximum height of the final stitched image"),
+):
+    """
+    Stitch multiple images into one.
+    """
+    if not image_urls:
+        return JSONResponse(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            content={"error": "No image URLs provided."}
+        )
+
+    image_urls = [url.strip() for url in image_urls.split(",") if url.strip()]
+
+    from utils.image import stitch_images as stitch_images_util
+    try:
+        stitched_image = stitch_images_util(image_urls, max_width, max_height)
+
+        # Convert PIL image to JPEG format in memory
+        from io import BytesIO
+        img_buffer = BytesIO()
+        stitched_image.save(img_buffer, format='JPEG', quality=95)
+        img_buffer.seek(0)
+
+        return StreamingResponse(
+            img_buffer,
+            media_type="image/jpeg",
+            headers={
+                "Content-Disposition": "attachment; filename=stitched.jpg"
+            },
+        )
+    except Exception as e:
+        logger.error(f"Error stitching images: {e}")
+        return JSONResponse(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            content={"error": "Failed to stitch images."}
+        )
+
+@v1_utils_router.post("/make-image-imperfect")
+def image_unaize(
+    background_tasks: BackgroundTasks,
+    image_id: str = Form(..., description="ID of the image to unaize"),
+    enhance_color: float = Form(None, description="Strength of the color enhancement (0-2). 0 means black and white, 1 means no change, 2 means full color enhancement"),
+    enhance_contrast: float = Form(None, description="Strength of the contrast enhancement (0-2)"),
+    noise_strength: int = Form(0, description="Strength of the noise to apply to the image (0-100)"),
+):
+    """
+    Remove AI-generated artifacts from an image.
+    """
+    if not image_id:
+        return JSONResponse(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            content={"error": "No image ID provided."}
+        )
+
+    image_path = storage.get_media_path(image_id)
+
+    jpg_id, jpg_path = storage.create_media_filename_with_id(
+        media_type="image", file_extension=".jpg"
+    )
+    tmp_file_id = storage.create_tmp_file(jpg_id)
+
+    from utils.image import make_image_imperfect
+
+    def bg_task():
+        try:
+            imperfect_image = make_image_imperfect(
+                image_path,
+                enhance_color=enhance_color,
+                enhance_contrast=enhance_contrast,
+                noise_strength=noise_strength
+            )
+            imperfect_image.save(jpg_path, format='JPEG', quality=95)
+        except Exception as e:
+            logger.error(f"Error making image imperfect: {e}")
+        finally:
+            storage.delete_media(tmp_file_id)
+
+    background_tasks.add_task(bg_task)
+    return {
+        "file_id": jpg_id,
+    }
+
+@v1_utils_router.post("/convert/pcm/wav")
+def convert_pcm_to_wav(
+    background_tasks: BackgroundTasks,
+    pcm_id: str = Form(..., description="ID of the PCM audio file to convert"),
+    sample_rate: int = Form(24000, description="Sample rate of the PCM audio"),
+    channels: int = Form(1, description="Number of audio channels (1 for mono, 2 for stereo)"),
+    target_sample_rate: int = Form(44100, description="Target sample rate for the WAV audio"),
+):
+    """
+    Convert PCM audio to WAV format.
+    """
+    if not pcm_id or not storage.media_exists(pcm_id):
+        return JSONResponse(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            content={"error": "PCM audio file not found."}
+        )
+
+    from video.media import MediaUtils
+    utils = MediaUtils()
+
+    wav_id, wav_path = storage.create_media_filename_with_id(
+        media_type="audio", file_extension=".wav"
+    )
+    tmp_file_id = storage.create_tmp_file(wav_id)
+
+    def bg_task():
+        try:
+            utils.convert_pcm_to_wav(
+                input_pcm_path=storage.get_media_path(pcm_id),
+                output_wav_path=wav_path,
+                sample_rate=sample_rate,
+                channels=channels,
+                target_sample_rate=target_sample_rate
+            )
+        except Exception as e:
+            logger.error(f"Error converting PCM to WAV: {e}")
+        finally:
+            storage.delete_media(tmp_file_id)
+
+    background_tasks.add_task(bg_task)
+
+    return {
+        "file_id": wav_id,
+    }
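`MediaUtils.convert_pcm_to_wav` lives in `video/media.py`, which is part of this commit but whose body is not shown in this diff, so the following is only a guess at the underlying operation: wrapping headerless raw PCM in a WAV container via ffmpeg (the image installs ffmpeg) and resampling. The 16-bit little-endian (`s16le`) sample format is an assumption.

```python
# Hypothetical equivalent of the PCM-to-WAV conversion: raw PCM carries no
# header, so the format, rate, and channel count must be supplied explicitly.
import subprocess

def pcm_to_wav(pcm_path, wav_path, sample_rate=24000, channels=1, target_sample_rate=44100):
    subprocess.run([
        "ffmpeg", "-y",
        "-f", "s16le",                  # assumed raw sample format
        "-ar", str(sample_rate),        # input rate of the raw stream
        "-ac", str(channels),
        "-i", pcm_path,
        "-ar", str(target_sample_rate), # resample on output
        wav_path,
    ], check=True)
```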
assets/anton.ttf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28beb8f6542f642ba4143bd4a1d1cfc7be7b1dedc951096efd8e0942502ea1bf
+size 161588
assets/icon_volume.png
ADDED
(binary file tracked with Git LFS; pointer details not shown)
assets/noto.ttf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cf8b2a0576d5680284ab03a7a8219499d59bbe981a79bb3dc0031f251c39736
+size 10560616
assets/noto_hindi.ttf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b8cac46a1c86d2533a616b1fcf4e1926b8e39bda69034508b0df96791f56d97
+size 2044548
assets/person.png
ADDED
(binary file tracked with Git LFS; pointer details not shown)
cuda.Dockerfile
ADDED
@@ -0,0 +1,45 @@
ARG CUDA=12.3.1
ARG OS=ubuntu22.04
ARG RUNIMAGE=${CUDA}-runtime-${OS}

FROM nvidia/cuda:${RUNIMAGE}
ARG CUDA
ARG OS
USER root

RUN apt update && apt install -y \
    fonts-ebgaramond \
    build-essential \
    g++ \
    curl \
    wget \
    git \
    python3.10 \
    python3-pip \
    python3-dev \
    python3.10-gdbm \
    ffmpeg \
    libsndfile1 \
    fonts-dejavu \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

RUN ln -sf /usr/bin/python3 /usr/bin/python
RUN ln -sf /usr/bin/pip3 /usr/bin/pip

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

ENV LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/nvidia/cudnn/lib:$LD_LIBRARY_PATH

COPY api_server /app/api_server
COPY utils /app/utils
COPY video /app/video
COPY server.py /app/server.py

ENV PYTHONUNBUFFERED=1

EXPOSE 8000

CMD ["fastapi", "run", "server.py", "--host", "0.0.0.0", "--port", "8000"]
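
A small smoke test (not part of the upload) that could be run inside the resulting image to confirm that torch, pulled in via torchaudio in requirements.txt, sees the GPU and the cuDNN libraries referenced by LD_LIBRARY_PATH above:

import torch

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
    print("cuDNN version:", torch.backends.cudnn.version())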
requirements.txt
ADDED
@@ -0,0 +1,16 @@
uuid
numpy
kokoro
soundfile
fastapi[standard]
loguru
chatterbox-tts >= 0.1.2
faster_whisper
torchaudio
requests_tor
requests[socks]
youtube-transcript-api
matplotlib
Pillow
nltk
imageio
server.py
ADDED
@@ -0,0 +1,57 @@
import asyncio
from contextlib import asynccontextmanager
from fastapi import FastAPI, APIRouter
import sys
from loguru import logger

from api_server.auth_middleware import auth_middleware
from api_server.v1_utils_router import v1_utils_router
from api_server.v1_media_router import v1_media_api_router
from video.config import device

logger.remove()
logger.add(
    sys.stdout,
    colorize=True,
    format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level> | <blue>{extra}</blue>",
    level="DEBUG",
)

logger.info("This server was created by the 'AI Agents A-Z' YouTube channel")
logger.info("https://www.youtube.com/@aiagentsaz")
logger.info("Using device: {}", device)

@asynccontextmanager
async def lifespan(app: FastAPI):
    logger.info("Starting up the server...")
    yield
    logger.info("Shutting down the server...")

app = FastAPI(lifespan=lifespan)


# add middleware to app, besides the /health endpoint
app.middleware("http")(auth_middleware)

@app.api_route("/", methods=["GET", "HEAD"])
def root():
    return {
        "message": "Welcome to the AI Agents A-Z No-Code Server",
        "version": "0.3.5",
        "documentation": "/docs",
        "created_by": "https://www.youtube.com/@aiagentsaz"
    }

@app.api_route("/health", methods=["GET", "HEAD"])
def healthcheck():
    return {"status": "ok"}

api_router = APIRouter()
v1_api_router = APIRouter()

# todo auto-delete files after 30 minutes (env var)

v1_api_router.include_router(v1_media_api_router, prefix="/media", tags=["media"])
v1_api_router.include_router(v1_utils_router, prefix="/utils", tags=["utils"])
api_router.include_router(v1_api_router, prefix="/v1", tags=["v1"])
app.include_router(api_router, prefix="/api", tags=["api"])
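
Given the nesting above, media routes live under /api/v1/media and utility routes under /api/v1/utils. A quick sketch (not part of the upload) using FastAPI's TestClient (available with fastapi[standard], which pulls in httpx); only /health is assumed to bypass auth_middleware, per the comment above:

from fastapi.testclient import TestClient

from server import app

client = TestClient(app)
print(client.get("/health").json())  # {"status": "ok"} - exempt from auth
# any other route goes through auth_middleware first, e.g. (hypothetical header scheme):
# client.get("/", headers={"Authorization": "Bearer <token>"})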
utils/image.py
ADDED
@@ -0,0 +1,386 @@
import numpy as np
import requests
from PIL import Image, ImageEnhance, ImageFilter, ImageDraw, ImageChops, ImageOps, ImageFont
from io import BytesIO
import math


def stitch_images(
    image_urls: list[str],
    max_width: int = 1920,
    max_height: int = 1080
):
    """
    Stitch multiple images into a single image.
    Downloads images from URLs, arranges them in a grid, and resizes proportionally to fit max dimensions.

    Args:
        image_urls: List of image URLs to download and stitch
        max_width: Maximum width of the final stitched image
        max_height: Maximum height of the final stitched image

    Returns:
        PIL Image object of the stitched result
    """
    if not image_urls:
        raise ValueError("No image URLs provided")

    # Download and open all images
    images = []
    for url in image_urls:
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            # Convert to RGB if necessary
            if img.mode != 'RGB':
                img = img.convert('RGB')
            images.append(img)
        except Exception as e:
            print(f"Failed to download image from {url}: {e}")
            continue

    if not images:
        raise ValueError("No valid images could be downloaded")

    # Calculate optimal grid dimensions
    num_images = len(images)
    cols = math.ceil(math.sqrt(num_images))
    rows = math.ceil(num_images / cols)

    # Find the maximum dimensions among all images to ensure consistent sizing
    max_img_width = max(img.width for img in images)
    max_img_height = max(img.height for img in images)

    # Calculate the size for each cell in the grid
    cell_width = max_img_width
    cell_height = max_img_height

    # Create the stitched image canvas
    canvas_width = cols * cell_width
    canvas_height = rows * cell_height
    stitched = Image.new('RGB', (canvas_width, canvas_height), color='white')

    # Place images in the grid
    for i, img in enumerate(images):
        row = i // cols
        col = i % cols

        # Calculate position for this image
        x = col * cell_width
        y = row * cell_height

        # Resize image to fit cell while maintaining aspect ratio
        img_resized = resize_image_to_fit(img, cell_width, cell_height)

        # Center the image in the cell
        offset_x = (cell_width - img_resized.width) // 2
        offset_y = (cell_height - img_resized.height) // 2

        stitched.paste(img_resized, (x + offset_x, y + offset_y))

    # Resize the final stitched image to fit within max dimensions
    final_image = resize_image_to_fit(stitched, max_width, max_height)

    return final_image

def resize_image_cover(
    image_path: str,
    target_width: int,
    target_height: int,
    output_path: str,
) -> Image.Image:
    """
    Resize an image to fill the specified dimensions while maintaining aspect ratio.
    The image is scaled to cover the entire target area and cropped to fit.

    Args:
        image_path: Path of the image file to resize
        target_width: Target width
        target_height: Target height
        output_path: Path to save the resized image to

    Returns:
        Resized and cropped PIL Image object
    """
    image = Image.open(image_path)
    # Calculate the scaling factor to cover the entire target area
    width_ratio = target_width / image.width
    height_ratio = target_height / image.height
    scale_factor = max(width_ratio, height_ratio)  # Use max to ensure coverage

    # Scale the image
    new_width = int(image.width * scale_factor)
    new_height = int(image.height * scale_factor)
    scaled_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

    # Calculate crop box to center the image
    left = (new_width - target_width) // 2
    top = (new_height - target_height) // 2
    right = left + target_width
    bottom = top + target_height

    # Crop the image to the target dimensions
    cropped_image = scaled_image.crop((left, top, right, bottom))

    # Convert to RGB if the image has transparency (RGBA mode)
    if cropped_image.mode == 'RGBA':
        # Create a white background and paste the image on it
        rgb_image = Image.new('RGB', cropped_image.size, (255, 255, 255))
        rgb_image.paste(cropped_image, mask=cropped_image.split()[-1])  # Use alpha channel as mask
        cropped_image = rgb_image

    cropped_image.save(output_path)
    return cropped_image

def resize_image_to_fit(image: Image.Image, max_width: int, max_height: int) -> Image.Image:
    """
    Resize an image to fit within the specified dimensions while maintaining aspect ratio.

    Args:
        image: PIL Image object to resize
        max_width: Maximum width
        max_height: Maximum height

    Returns:
        Resized PIL Image object
    """
    # Calculate the scaling factor to fit within max dimensions
    width_ratio = max_width / image.width
    height_ratio = max_height / image.height
    scale_factor = min(width_ratio, height_ratio)

    # Only resize if the image is larger than max dimensions
    if scale_factor < 1:
        new_width = int(image.width * scale_factor)
        new_height = int(image.height * scale_factor)
        return image.resize((new_width, new_height), Image.Resampling.LANCZOS)

    return image

def cup_of_coffee_tone(img):
    sepia = ImageOps.colorize(img.convert("L"), "#704214", "#C0A080")
    return Image.blend(img, sepia, alpha=0.2)  # tweak alpha

def chromatic_aberration(img, shift=2):
    r, g, b = img.split()
    # Use transform with AFFINE to shift the channels
    r = r.transform(img.size, Image.AFFINE, (1, 0, -shift, 0, 1, 0))
    b = b.transform(img.size, Image.AFFINE, (1, 0, shift, 0, 1, 0))
    return Image.merge("RGB", (r, g, b))

def make_image_imperfect(
    image_path: str,
    enhance_color: float = None,
    enhance_contrast: float = None,
    noise_strength: int = 15
) -> Image.Image:
    """
    Make an image look less AI-generated by layering in small imperfections:
    optional color/contrast adjustments, sharpen plus a slight blur, grayscale
    noise, a sepia tone, and a one-pixel chromatic aberration.

    Args:
        image_path: Path of the image to process
        enhance_color: Optional color enhancement factor (1.0 leaves color unchanged)
        enhance_contrast: Optional contrast enhancement factor (1.0 leaves contrast unchanged)
        noise_strength: Peak amplitude of the added grayscale noise

    Returns:
        PIL Image object of the processed result
    """
    try:
        img = Image.open(image_path)

        if enhance_color is not None:
            img = ImageEnhance.Color(img).enhance(enhance_color)
        if enhance_contrast is not None:
            img = ImageEnhance.Contrast(img).enhance(enhance_contrast)

        img = img.filter(ImageFilter.SHARPEN)
        img = img.filter(ImageFilter.GaussianBlur(radius=0.5))

        if img.mode != 'RGB':
            img = img.convert('RGB')
        img_array = np.array(img)
        h, w, c = img_array.shape
        grayscale_noise = np.random.randint(-noise_strength, noise_strength + 1, (h, w), dtype='int16')
        noise = np.stack([grayscale_noise] * c, axis=2)
        noisy_array = img_array.astype('int16') + noise
        noisy_array = np.clip(noisy_array, 0, 255).astype('uint8')
        img = Image.fromarray(noisy_array)

        img = cup_of_coffee_tone(img)
        img = chromatic_aberration(img, shift=1)

        return img

    except Exception as e:
        print(f"Failed to process image from {image_path}: {e}")
        raise ValueError("Failed to make image imperfect") from e

def create_text_image(
    text: str,
    size: tuple[int, int] = (1920, 1080),
    font_size: int = 120,
    font_color: str = "white",
    font_path: str = None
) -> Image.Image:
    """
    Create an image with centered text.

    Args:
        text: Text to display on the image
        size: (width, height) of the image
        font_size: Size of the font
        font_color: Color of the text
        font_path: Optional path to a TrueType font file

    Returns:
        PIL Image object with the text centered
    """
    img = Image.new('RGB', size, color='black')
    draw = ImageDraw.Draw(img)

    font = ImageFont.load_default(size=font_size)
    if font_path:
        font = ImageFont.truetype(font_path, font_size)
    font_bbox = font.getbbox(text)
    text_width = font_bbox[2] - font_bbox[0]
    text_height = font_bbox[3] - font_bbox[1]
    x = (size[0] - text_width) // 2
    y = (size[1] - text_height) // 2
    draw.text((x, y), text, fill=font_color, font=font)

    return img

def make_image_wobbly(
    image: Image.Image,
    wobble_amount: float = 3.0
) -> Image.Image:
    """
    Apply a subtle wobble/distortion effect to an image, like viewing through water or a warped mirror.

    Args:
        image: PIL Image object to distort
        wobble_amount: Strength of the wobble effect (0.5-10.0, higher = more distortion)

    Returns:
        PIL Image object with wobble effect applied
    """
    if image.mode != 'RGB':
        image = image.convert('RGB')

    width, height = image.size
    img_array = np.array(image)

    # Create coordinate grids
    x_coords = np.arange(width)
    y_coords = np.arange(height)
    x_grid, y_grid = np.meshgrid(x_coords, y_coords)

    # Create random wave patterns optimized for text
    # Generate random parameters for each wave to ensure variety

    # Random wave frequencies and phases for horizontal waves
    freq1_h = np.random.uniform(2, 5)  # Random frequency between 2-5
    freq2_h = np.random.uniform(5, 10)  # Random frequency between 5-10
    phase1_h = np.random.uniform(0, 2 * np.pi)  # Random phase
    phase2_h = np.random.uniform(0, 2 * np.pi)  # Random phase

    wave_x1 = wobble_amount * 0.3 * np.sin(2 * np.pi * y_grid / (height / freq1_h) + phase1_h)
    wave_x2 = wobble_amount * 0.1 * np.sin(2 * np.pi * y_grid / (height / freq2_h) + phase2_h)

    # Random wave frequencies and phases for vertical waves
    freq1_v = np.random.uniform(2, 6)  # Random frequency between 2-6
    freq2_v = np.random.uniform(6, 12)  # Random frequency between 6-12
    phase1_v = np.random.uniform(0, 2 * np.pi)  # Random phase
    phase2_v = np.random.uniform(0, 2 * np.pi)  # Random phase

    wave_y1 = wobble_amount * 0.3 * np.sin(2 * np.pi * x_grid / (width / freq1_v) + phase1_v)
    wave_y2 = wobble_amount * 0.1 * np.sin(2 * np.pi * x_grid / (width / freq2_v) + phase2_v)

    # Random circular ripples with random centers and frequencies
    center_x = width // 2 + np.random.randint(-width//4, width//4)
    center_y = height // 2 + np.random.randint(-height//4, height//4)
    ripple_freq = np.random.uniform(80, 120)  # Random ripple frequency
    ripple_phase = np.random.uniform(0, 2 * np.pi)  # Random ripple phase

    distance = np.sqrt((x_grid - center_x)**2 + (y_grid - center_y)**2)
    ripple_x = wobble_amount * 0.15 * np.sin(2 * np.pi * distance / ripple_freq + ripple_phase)
    ripple_y = wobble_amount * 0.15 * np.cos(2 * np.pi * distance / ripple_freq + ripple_phase)

    # Random noise for text preservation - NO FIXED SEED
    noise_x = np.random.normal(0, wobble_amount * 0.05, (height, width))
    noise_y = np.random.normal(0, wobble_amount * 0.05, (height, width))

    # Combine all distortions
    total_x_offset = wave_x1 + wave_x2 + ripple_x + noise_x
    total_y_offset = wave_y1 + wave_y2 + ripple_y + noise_y

    # Apply the distortion with proper boundary handling
    new_x_coords = x_grid + total_x_offset
    new_y_coords = y_grid + total_y_offset

    # Use scipy.ndimage.map_coordinates for efficient interpolation
    try:
        from scipy.ndimage import map_coordinates

        # Create coordinate arrays for map_coordinates (expects [y, x] order)
        coords = np.array([new_y_coords, new_x_coords])

        # Apply the transformation to each color channel with adaptive interpolation
        # Use progressively smoother interpolation for higher wobble amounts
        distorted_array = np.zeros_like(img_array)

        # Choose interpolation method based on wobble amount for smoothest results
        if wobble_amount <= 1.5:
            # For very subtle wobbles, use nearest neighbor to preserve text sharpness
            interpolation_order = 0
        elif wobble_amount <= 3.0:
            # For moderate wobbles, use linear interpolation
            interpolation_order = 1
        else:
            # For strong wobbles, use cubic interpolation for smoothest edges
            interpolation_order = 3

        for channel in range(img_array.shape[2]):
            distorted_array[:, :, channel] = map_coordinates(
                img_array[:, :, channel],
                coords,
                order=interpolation_order,
                mode='reflect',  # Mirror edges instead of clipping
                prefilter=True if interpolation_order > 1 else False  # Use prefilter for cubic
            )

        result_img = Image.fromarray(distorted_array.astype(np.uint8))

        # Post-process for smoother edges at higher wobble amounts
        if wobble_amount > 2.0:
            # Apply a very subtle Gaussian blur to smooth any remaining artifacts
            result_img = result_img.filter(ImageFilter.GaussianBlur(radius=0.3))
            # Then apply gentle sharpening to maintain text readability
            result_img = result_img.filter(ImageFilter.UnsharpMask(radius=0.8, percent=60, threshold=1))
        elif wobble_amount > 1.5:
            # For moderate wobbles, just apply gentle sharpening
            result_img = result_img.filter(ImageFilter.UnsharpMask(radius=0.5, percent=40, threshold=0))

        return result_img

    except ImportError:
        # Fallback to PIL's transform if scipy is not available
        # This is much faster than the pixel-by-pixel approach
        from PIL.Image import AFFINE

        # For a simple approximation, apply a slight transform
        # This won't be as sophisticated but will be much faster
        transformed = image.transform(
            image.size,
            AFFINE,
            (1, 0.02 * wobble_amount/10, 0.02 * wobble_amount/10, 1, 0, 0),
            resample=Image.BILINEAR
        )

        # Apply a slight rotation for additional wobble with random angle
        angle = wobble_amount * 0.3 * np.random.uniform(-1, 1)  # Random rotation
        rotated = transformed.rotate(angle, resample=Image.BILINEAR, expand=False)

        return rotated
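
A short usage sketch for the helpers above (not part of the upload); the URLs and paths are placeholders:

from utils.image import create_text_image, make_image_imperfect, stitch_images

# stitch two remote images into a grid bounded by 1920x1080
grid = stitch_images(
    ["https://example.com/a.jpg", "https://example.com/b.jpg"],
    max_width=1920,
    max_height=1080,
)
grid.save("grid.jpg")

# layer in imperfections: slight desaturation, noise, sepia tone, channel shift
processed = make_image_imperfect("grid.jpg", enhance_color=0.9, enhance_contrast=1.05)
processed.save("grid_imperfect.jpg")

# black card with centered text, using the bundled Anton font
card = create_text_image("Hello", size=(1280, 720), font_size=96, font_path="assets/anton.ttf")
card.save("card.png")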
utils/proxy.py
ADDED
File without changes
video/builder.py
ADDED
@@ -0,0 +1,347 @@
from video.media import MediaUtils
import time
from loguru import logger


class VideoBuilder:
    """
    Builder class for constructing FFmpeg video commands with a fluent interface.
    """

    def __init__(self, dimensions: tuple[int, int], ffmpeg_path="ffmpeg"):
        if not isinstance(dimensions, tuple) or len(dimensions) != 2:
            raise ValueError("Dimensions must be a tuple of (width, height).")

        self.width, self.height = dimensions
        self.ffmpeg_path = ffmpeg_path

        # Components
        self.background = None
        self.audio_file = None
        self.captions = None
        self.output_path = "output.mp4"

        # Internal state
        self.media_utils = None

    def set_media_utils(self, media_utils: MediaUtils):
        """Set the media manager for duration calculations."""
        self.media_utils = media_utils
        return self

    def set_background_image(self, file_path: str, effect_config: dict = None):
        """Set background as an image with optional visual effects.

        Args:
            file_path: Path to the image file
            effect_config: Configuration for visual effects. Supported effects:
                - Ken Burns (zoom): {"effect": "ken_burns", "zoom_factor": 0.001, "direction": "zoom-to-top-left"}
                - Pan: {"effect": "pan", "direction": "left-to-right", "speed": "normal"}
        """
        self.background = {
            "type": "image",
            "file": file_path,
            "effect_config": effect_config or {"effect": "ken_burns"},  # Default to Ken Burns for backward compatibility
        }
        return self

    def set_background_video(self, file_path: str):
        """Set background as a video file."""
        self.background = {"type": "video", "file": file_path}
        return self

    def set_audio(self, file_path: str):
        """Set audio file."""
        self.audio_file = file_path
        return self

    def set_captions(
        self,
        file_path: str = None,
        config: dict = None,
    ):
        """Set caption subtitles

        Args:
            file_path: Path to subtitle file
            config: Optional configuration dict
        """
        self.captions = {
            "file": file_path,
            **(config or {}),
        }
        return self

    def set_output_path(self, output_path: str):
        """Set output file path."""
        self.output_path = output_path
        return self

    def build_command(self):
        """Build the complete FFmpeg command."""
        if not self.background:
            raise ValueError("Background must be set (image or video).")

        if not self.audio_file and not self.captions:
            raise ValueError(
                "At least one of audio_file or captions must be provided."
            )

        # Validate combinations
        if self.background["type"] == "image" and not self.audio_file:
            raise ValueError("Audio file must be provided if background is an image.")

        if (
            self.background["type"] == "video"
            and not self.audio_file
            and self.captions is None
        ):
            raise ValueError(
                "Audio file or captions must be provided if background is a video."
            )

        # Get audio duration if audio file is provided
        audio_duration = None
        if self.audio_file:
            if not self.media_utils:
                raise ValueError(
                    "Media manager must be set to determine audio duration."
                )
            media_info = self.media_utils.get_audio_info(self.audio_file)
            audio_duration = media_info.get("duration")
            if not audio_duration:
                raise ValueError("Could not determine audio duration")

        # Build command
        cmd = [self.ffmpeg_path, "-y"]

        filter_parts = []
        input_index = 0

        # Add background input
        if self.background["type"] == "image":
            cmd.extend(
                ["-loop", "1", "-t", str(audio_duration), "-i", self.background["file"]]
            )

            # Get effect configuration with backward compatibility
            effect_config = self.background.get("effect_config", {"effect": "ken_burns"})

            # Handle backward compatibility for old ken_burns config
            if "ken_burns" in self.background and "effect_config" not in self.background:
                # Old format: {"ken_burns": {"zoom_factor": 0.001, "direction": "zoom-to-top-left"}}
                old_ken_burns = self.background.get("ken_burns", {})
                effect_config = {
                    "effect": "ken_burns",
                    "zoom_factor": old_ken_burns.get("zoom_factor", 0.001),
                    "direction": old_ken_burns.get("direction", "zoom-to-top-left")
                }

            effect_type = effect_config.get("effect", "ken_burns")

            fps = 25
            duration_frames = int(audio_duration * fps)

            if effect_type == "ken_burns":
                # Ken Burns (zoom) effect
                zoom_factor = effect_config.get("zoom_factor", 0.001)
                direction = effect_config.get("direction", "zoom-to-top-left")

                # todo without upscaling we can't use the top and center zooms. upscaling increases the render time
                zoom_expressions = {
                    "zoom-to-top": f"z='zoom+{zoom_factor}':x=iw/2-(iw/zoom/2):y=0",
                    "zoom-to-center": f"z='zoom+{zoom_factor}':x=iw/2-(iw/zoom/2):y=ih/2-(ih/zoom/2)",
                    "zoom-to-top-left": f"z='zoom+{zoom_factor}':x=0:y=0",
                }
                zoom_expr = zoom_expressions.get(direction, zoom_expressions["zoom-to-top-left"])

                zoompan_d = duration_frames + 1
                filter_parts.append(
                    f"[{input_index}]scale={self.width}:-2,setsar=1:1,"
                    f"crop={self.width}:{self.height},"
                    f"zoompan={zoom_expr}:d={zoompan_d}:s={self.width}x{self.height}:fps={fps}[bg]"
                )

            elif effect_type == "pan":
                # Pan effect - camera moves across the image
                direction = effect_config.get("direction", "left-to-right")
                speed = effect_config.get("speed", "normal")

                # Speed multipliers
                speed_multipliers = {
                    "slow": 0.5,
                    "normal": 1.0,
                    "fast": 2.0
                }
                speed_mult = speed_multipliers.get(speed, 1.0)

                # Calculate pan distance based on direction
                # We'll scale the image larger to allow for panning
                scale_factor = 1.3  # Scale image 30% larger to allow room for panning
                scaled_width = int(self.width * scale_factor)
                scaled_height = int(self.height * scale_factor)

                # Pan expressions for different directions
                if direction == "left-to-right":
                    # Start from left, move to right
                    start_x = 0
                    end_x = scaled_width - self.width
                    start_y = (scaled_height - self.height) // 2
                    end_y = start_y
                elif direction == "right-to-left":
                    # Start from right, move to left
                    start_x = scaled_width - self.width
                    end_x = 0
                    start_y = (scaled_height - self.height) // 2
                    end_y = start_y
                elif direction == "top-to-bottom":
                    # Start from top, move to bottom
                    start_x = (scaled_width - self.width) // 2
                    end_x = start_x
                    start_y = 0
                    end_y = scaled_height - self.height
                elif direction == "bottom-to-top":
                    # Start from bottom, move to top
                    start_x = (scaled_width - self.width) // 2
                    end_x = start_x
                    start_y = scaled_height - self.height
                    end_y = 0
                else:
                    # Default to left-to-right
                    start_x = 0
                    end_x = scaled_width - self.width
                    start_y = (scaled_height - self.height) // 2
                    end_y = start_y

                # Create pan expression
                # Linear interpolation from start to end position over the duration
                pan_x_expr = f"{start_x}+({end_x}-{start_x})*t/{audio_duration}*{speed_mult}"
                pan_y_expr = f"{start_y}+({end_y}-{start_y})*t/{audio_duration}*{speed_mult}"

                filter_parts.append(
                    f"[{input_index}]scale={scaled_width}:{scaled_height},setsar=1:1,"
                    f"crop={self.width}:{self.height}:{pan_x_expr}:{pan_y_expr}[bg]"
                )

            else:
                # No effect, just scale and crop
                filter_parts.append(
                    f"[{input_index}]scale={self.width}:{self.height},setsar=1:1[bg]"
                )

        elif self.background["type"] == "video":
            if audio_duration:
                cmd.extend(
                    [
                        "-stream_loop",
                        "-1",
                        "-t",
                        str(audio_duration),
                        "-i",
                        self.background["file"],
                    ]
                )
            else:
                cmd.extend(["-i", self.background["file"]])
            filter_parts.append(f"[{input_index}]scale={self.width}:{self.height}[bg]")

        input_index += 1
        current_video = "[bg]"

        # Add audio input
        audio_input_index = None
        if self.audio_file:
            cmd.extend(["-i", self.audio_file])
            audio_input_index = input_index
            input_index += 1

        # Add subtitles or caption images if provided
        if self.captions:
            subtitle_file = self.captions.get("file")
            if subtitle_file:
                filter_parts.append(f"{current_video}subtitles={subtitle_file}[v]")
                current_video = "[v]"
        else:
            # Rename final video output
            if current_video == "[bg]":
                current_video = "[v]"
                filter_parts.append("[bg]copy[v]")

        # Build filter complex
        if filter_parts:
            cmd.extend(["-filter_complex", ";".join(filter_parts)])

        # Map video and audio
        cmd.extend(["-map", current_video])
        if audio_input_index is not None:
            cmd.extend(["-map", f"{audio_input_index}:a"])

        # Video codec settings
        cmd.extend(["-c:v", "libx264", "-preset", "ultrafast"])

        cmd.extend(["-crf", "23", "-pix_fmt", "yuv420p"])

        # Audio codec settings
        if self.audio_file:
            cmd.extend(["-c:a", "aac", "-b:a", "192k"])
        if audio_duration:
            cmd.extend(["-t", str(audio_duration)])

        cmd.append(self.output_path)
        return cmd

    def execute(self):
        """Build and execute the FFmpeg command using MediaUtils for progress tracking."""
        if not self.media_utils:
            logger.error("MediaUtils must be set before executing video build")
            return False

        start = time.time()
        context_logger = logger.bind(
            dimensions=(self.width, self.height),
            background_type=self.background.get("type") if self.background else None,
            has_audio=bool(self.audio_file),
            has_captions=bool(self.captions),
            output_path=self.output_path,
            youtube_channel="https://www.youtube.com/@aiagentsaz"
        )

        try:
            context_logger.debug("building video with VideoBuilder")
            cmd = self.build_command()

            # Calculate expected duration for progress tracking
            expected_duration = None
            if self.audio_file:
                audio_info = self.media_utils.get_audio_info(self.audio_file)
                expected_duration = audio_info.get("duration")
            elif self.background and self.background.get("type") == "video":
                video_info = self.media_utils.get_video_info(self.background["file"])
                expected_duration = video_info.get("duration")

            context_logger.bind(
                command=" ".join(cmd),
                expected_duration=expected_duration,
            ).debug("executing video build command")
            # Execute using MediaUtils for proper logging and progress tracking
            success = self.media_utils.execute_ffmpeg_command(
                cmd,
                "build video",
                expected_duration=expected_duration,
                show_progress=True,
            )

            if success:
                context_logger.bind(execution_time=time.time() - start).info(
                    "video built successfully"
                )
                return True
            else:
                context_logger.error("failed to build video")
                return False

        except Exception as e:
            context_logger.bind(error=str(e), execution_time=time.time() - start).error(
                "error during video rendering"
            )
            return False
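
A minimal sketch of the fluent interface (not part of the upload); the media paths are placeholders, and MediaUtils comes from video/media.py below:

from video.builder import VideoBuilder
from video.media import MediaUtils

ok = (
    VideoBuilder(dimensions=(1080, 1920))      # portrait format
    .set_media_utils(MediaUtils())             # required to read the audio duration
    .set_background_image(
        "background.jpg",
        effect_config={"effect": "pan", "direction": "left-to-right", "speed": "slow"},
    )
    .set_audio("voiceover.wav")
    .set_captions("captions.ass")
    .set_output_path("final.mp4")
    .execute()
)
print("rendered" if ok else "failed")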
video/caption.py
ADDED
@@ -0,0 +1,354 @@
import string
from typing import Dict, List, Tuple

from loguru import logger


class Caption:
    def is_punctuation(self, text):
        return text in string.punctuation

    def create_subtitle_segments_english(
        self, captions: List[Dict], max_length=80, lines=2
    ):
        """
        Breaks up the captions into segments of max_length characters
        on two lines and merges punctuation into the preceding word
        """

        if not captions:
            return []

        segments = []
        current_segment_texts = ["" for _ in range(lines)]
        current_line = 0
        segment_start_ts = captions[0]["start_ts"]
        segment_end_ts = captions[0]["end_ts"]

        for caption in captions:
            text = caption["text"]
            start_ts = caption["start_ts"]
            end_ts = caption["end_ts"]

            # Update the segment end timestamp
            segment_end_ts = end_ts

            # If the caption is a punctuation, merge it with the current line
            if self.is_punctuation(text):
                if current_line < lines and current_segment_texts[current_line]:
                    current_segment_texts[current_line] += text
                continue

            # If the line is too long, move to the next one
            if (
                current_line < lines
                and len(current_segment_texts[current_line] + text) > max_length
            ):
                current_line += 1

            # If we've filled all lines, save the current segment and start a new one
            if current_line >= lines:
                segments.append(
                    {
                        "text": current_segment_texts,
                        "start_ts": segment_start_ts,
                        "end_ts": segment_end_ts,
                    }
                )

                # Reset for next segment
                current_segment_texts = ["" for _ in range(lines)]
                current_line = 0
                # Add a small gap (0.05s) between segments to prevent overlap
                segment_start_ts = start_ts + 0.05

            # Add the text to the current segment
            if current_line < lines:
                current_segment_texts[current_line] += (
                    " " if current_segment_texts[current_line] else ""
                )
                current_segment_texts[current_line] += text

        # Add the last segment if there's any content
        if any(current_segment_texts):
            segments.append(
                {
                    "text": current_segment_texts,
                    "start_ts": segment_start_ts,
                    "end_ts": segment_end_ts,
                }
            )

        # Post-processing to ensure no overlaps by adjusting end times if needed
        for i in range(len(segments) - 1):
            if segments[i]["end_ts"] >= segments[i + 1]["start_ts"]:
                segments[i]["end_ts"] = segments[i + 1]["start_ts"] - 0.05

        return segments

    def create_subtitle_segments_international(
        self, captions: List[Dict], max_length=80, lines=2
    ):
        """
        Breaks up international captions (full sentences) into smaller segments that fit
        within max_length characters per line, with proper timing distribution.

        Handles both space-delimited languages like English and character-based languages like Chinese.

        Args:
            captions: List of caption dictionaries with text, start_ts, and end_ts
            max_length: Maximum number of characters per line
            lines: Number of lines per segment

        Returns:
            List of subtitle segments
        """
        if not captions:
            return []

        segments = []

        for caption in captions:
            text = caption["text"].strip()
            start_ts = caption["start_ts"]
            end_ts = caption["end_ts"]
            duration = end_ts - start_ts

            # Check if text is using Chinese/Japanese/Korean characters (CJK)
            # For CJK, we'll split by characters rather than words
            is_cjk = any("\u4e00" <= char <= "\u9fff" for char in text)

            parts = []
            if is_cjk:
                # For CJK languages, process character by character
                current_part = ""
                for char in text:
                    if len(current_part + char) > max_length:
                        parts.append(current_part)
                        current_part = char
                    else:
                        current_part += char

                # Add the last part if not empty
                if current_part:
                    parts.append(current_part)
            else:
                # Original word-based splitting for languages with spaces
                words = text.split()
                current_part = ""

                for word in words:
                    # If adding this word would exceed max_length, start a new part
                    if len(current_part + " " + word) > max_length and current_part:
                        parts.append(current_part.strip())
                        current_part = word
                    else:
                        # Add space if not the first word in the part
                        if current_part:
                            current_part += " "
                        current_part += word

                # Add the last part if not empty
                if current_part:
                    parts.append(current_part.strip())

            # Group parts into segments with 'lines' number of lines per segment
            segment_parts = []
            for i in range(0, len(parts), lines):
                segment_parts.append(parts[i : i + lines])

            # Calculate time proportionally based on segment text length
            total_chars = sum(len("".join(part_group)) for part_group in segment_parts)

            current_time = start_ts
            for i, part_group in enumerate(segment_parts):
                # Get character count for this segment group
                segment_chars = len("".join(part_group))

                # Calculate time proportionally, but ensure at least a minimum duration
                if total_chars > 0:
                    segment_duration = (segment_chars / total_chars) * duration
                    segment_duration = max(
                        segment_duration, 0.5
                    )  # Ensure minimum duration of 0.5s
                else:
                    segment_duration = duration / len(segment_parts)

                segment_start = current_time
                segment_end = segment_start + segment_duration

                # Move current time forward for next segment
                current_time = segment_end

                # Create segment with proper text array format for the subtitle renderer
                segment_text = part_group + [""] * (lines - len(part_group))

                segments.append(
                    {
                        "text": segment_text,
                        "start_ts": segment_start,
                        "end_ts": segment_end,
                    }
                )

        # Ensure no overlaps between segments by adjusting end times if needed
        for i in range(len(segments) - 1):
            if segments[i]["end_ts"] >= segments[i + 1]["start_ts"]:
                segments[i]["end_ts"] = segments[i + 1]["start_ts"] - 0.05

        return segments

    @staticmethod
    def hex_to_ass(hex_color: str, alpha: float = 1.0) -> str:
        """
        Convert a hex color + opacity to ASS &HAABBGGRR format.

        :param hex_color: CSS-style color string, e.g. "#FFA07A" or "00ff00"
        :param alpha: opacity from 0.0 (fully transparent) to 1.0 (opaque)
        :return: ASS color string, e.g. "&H7F7AA0FF" for "#FFA07A" at alpha 0.5
        """

        # strip leading '#' if present
        hex_color = hex_color.lstrip('#')

        # support 3-digit shorthand like 'f0a'
        if len(hex_color) == 3:
            hex_color = ''.join([c*2 for c in hex_color])

        if len(hex_color) != 6:
            raise ValueError("hex_color must be in 'RRGGBB' or 'RGB' format")

        # parse RGB
        r = int(hex_color[0:2], 16)
        g = int(hex_color[2:4], 16)
        b = int(hex_color[4:6], 16)

        # ASS alpha is inverted: 00=opaque, FF=transparent,
        # so we invert the user's opacity (1.0 = opaque)
        a = int((1.0 - alpha) * 255)
        a = max(0, min(255, a))

        # build BGR and alpha bytes
        aa = f"{a:02X}"
        bb = f"{b:02X}"
        gg = f"{g:02X}"
        rr = f"{r:02X}"

        return f"&H{aa}{bb}{gg}{rr}"

    def create_subtitle(
        self,
        segments,
        dimensions: Tuple[int, int],
        output_path: str,
        font_size=24,
        font_color="#fff",
        shadow_color="#000",
        shadow_transparency=0.1,
        shadow_blur=0,
        stroke_color="#000",
        stroke_size=0,
        font_name="Arial",
        font_bold=True,
        font_italic=False,
        subtitle_position="center",
    ):
        width, height = dimensions
        bold_value = -1 if font_bold else 0
        italic_value = -1 if font_italic else 0

        position_from_top = 0.2
        if subtitle_position == "center":
            position_from_top = 0.45
        if subtitle_position == "bottom":
            position_from_top = 0.75

        ass_content = """[Script Info]
ScriptType: v4.00+
PlayResX: {width}
PlayResY: {height}

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,{font_name},{font_size},{font_color},&H000000FF,{stroke_color},&H00000000,{bold},{italic},0,0,100,100,0,0,1,{stroke_size},0,8,20,20,20,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
""".format(
            width=width,
            height=height,
            font_size=font_size,
            font_color=self.hex_to_ass(font_color),
            stroke_color=self.hex_to_ass(stroke_color),
            stroke_size=stroke_size,
            font_name=font_name,
            bold=bold_value,
            italic=italic_value
        )

        pos_x = int(width / 2)
        pos_y = int(height * position_from_top)

        # Process each segment and add to the subtitle file
        for segment in segments:
            start_time = self.format_time(segment["start_ts"])
            end_time = self.format_time(segment["end_ts"])

            # Create text with line breaks
            text_lines = segment["text"]
            formatted_text = ""
            for i, line in enumerate(text_lines):
                if line:  # Only add non-empty lines
                    if i > 0:  # Add line break if not the first line
                        formatted_text += "\\N"
                    formatted_text += line

            # Create shadow if shadow_blur is specified or if we want a drop shadow effect
            if shadow_blur > 0 or shadow_transparency < 1.0:
                # Convert shadow color with transparency
                shadow_color_ass = self.hex_to_ass(shadow_color, shadow_transparency)

                # Offset shadow position slightly for drop shadow effect
                shadow_pos_x = pos_x + 2
                shadow_pos_y = pos_y + 2

                # For shadow text, use shadow color only for primary color and set proper alpha
                # Only apply shadow color to primary color (\1c) and use alpha for transparency
                shadow_override_tags = f"\\pos({shadow_pos_x},{shadow_pos_y})\\1c{shadow_color_ass}\\bord0"

                # Add alpha transparency if needed
                if shadow_transparency > 0:
                    alpha_hex = hex(int((1.0 - shadow_transparency) * 255))[2:].upper().zfill(2)
                    shadow_override_tags += f"\\1a&H{alpha_hex}&"

                if shadow_blur > 0:
                    shadow_override_tags += f"\\blur{shadow_blur}"

                shadow_formatted_text = f"{{{shadow_override_tags}}}" + formatted_text

                # Add shadow dialogue line first (so it appears behind)
                ass_content += f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{shadow_formatted_text}\n"

            # Create main text layer
            main_override_tags = f"\\pos({pos_x},{pos_y})"
            main_formatted_text = f"{{{main_override_tags}}}" + formatted_text

            # Add main dialogue line (appears on top)
            ass_content += f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{main_formatted_text}\n"

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(ass_content)

        logger.debug("subtitle (ass) was created with drop shadow")

    def format_time(self, seconds):
        """
        Convert seconds to ASS time format (H:MM:SS.cc)
        """
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        centisecs = int((seconds % 1) * 100)

        return f"{hours}:{minutes:02d}:{secs:02d}.{centisecs:02d}"
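
A worked sketch (not part of the upload) for the pieces that are easy to check by hand: hex_to_ass("#FFA07A", alpha=0.5) inverts the opacity to the byte 0x7F (int(0.5 * 255)) and reorders RGB to BGR, and format_time truncates to centiseconds:

from video.caption import Caption

cap = Caption()

print(cap.hex_to_ass("#FFA07A", alpha=0.5))  # &H7F7AA0FF
print(cap.format_time(125.5))                # 0:02:05.50

# word-level captions; the trailing period merges into the preceding word
words = [
    {"text": "Hello", "start_ts": 0.0, "end_ts": 0.4},
    {"text": "world", "start_ts": 0.4, "end_ts": 0.8},
    {"text": ".", "start_ts": 0.8, "end_ts": 0.8},
]
segments = cap.create_subtitle_segments_english(words, max_length=20, lines=2)
print(segments)  # one segment: text ["Hello world.", ""], 0.0 -> 0.8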
video/config.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import torch
from loguru import logger

device = "cpu"
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
num_cores = os.cpu_count()
if os.path.exists("/sys/fs/cgroup/cpu.max"):
    with open("/sys/fs/cgroup/cpu.max", "r") as f:
        line = f.readline()
        if len(line.split()) == 2:
            if line.split()[0] == "max":
                logger.info(
                    "File /sys/fs/cgroup/cpu.max has max value, using os.cpu_count()"
                )
            else:
                cpu_max = int(line.split()[0])
                cpu_period = int(line.split()[1])
                num_cores = cpu_max // cpu_period
                logger.info("Using {} cores", num_cores)
        else:
            logger.warning(
                "File /sys/fs/cgroup/cpu.max does not have 2 values, using os.cpu_count()"
            )
else:
    logger.info("File /sys/fs/cgroup/cpu.max not found, using os.cpu_count()")

logger.info("number of CPU cores: {}", num_cores)
num_threads = os.environ.get("NUM_THREADS", num_cores)
logger.info("number of threads to use with torch: {}", num_threads)
torch.set_num_threads(int(num_threads))
torch.set_num_interop_threads(int(num_threads))

map_location = torch.device(device)

torch_load_original = torch.load


def patched_torch_load(*args, **kwargs):
    if "map_location" not in kwargs:
        kwargs["map_location"] = map_location
    return torch_load_original(*args, **kwargs)


torch.load = patched_torch_load

whisper_model = os.environ.get("WHISPER_MODEL", "small")
whisper_compute_type = os.environ.get("WHISPER_COMPUTE_TYPE", "int8")
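
Note: for context on the cpu.max handling above, under cgroup v2 the file holds "<quota> <period>" in microseconds, or "max <period>" when unlimited. A small sketch with illustrative values:

# "200000 100000" means a quota of 200000us per 100000us period, i.e. 2 CPUs
line = "200000 100000"
cpu_max, cpu_period = (int(v) for v in line.split())
assert cpu_max // cpu_period == 2

# "max 100000" means no quota; config.py then falls back to os.cpu_count()
assert "max 100000".split()[0] == "max"
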
video/media.py
ADDED
@@ -0,0 +1,850 @@
import subprocess
import json
import time
from loguru import logger


class MediaUtils:
    def __init__(self, ffmpeg_path="ffmpeg"):
        """
        Initializes the MediaUtils class.

        Args:
            ffmpeg_path: Path to the ffmpeg executable
        """
        self.ffmpeg_path = ffmpeg_path

    def merge_videos(
        self,
        video_paths: list,
        output_path: str,
        background_music_path: str = None,
        background_music_volume: float = 0.5,
    ) -> bool:
        """
        Merges multiple video files into one, optionally with background music.

        Args:
            video_paths: List of paths to video files to merge
            output_path: Path for the merged output video
            background_music_path: Optional path to background music file
            background_music_volume: Volume level for background music (0.0 to 1.0, default 0.5)

        Returns:
            bool: True if successful, False otherwise
        """
        if not video_paths:
            logger.error("no video paths provided for merging")
            return False

        start = time.time()
        context_logger = logger.bind(
            number_of_videos=len(video_paths),
            output_path=output_path,
            background_music=bool(background_music_path),
            background_music_volume=background_music_volume,
        )

        try:
            # Get dimensions from the first video
            first_video_info = self.get_video_info(video_paths[0])
            if not first_video_info:
                context_logger.error("failed to get video info from first video")
                return False

            target_width = first_video_info.get("width", 1080)
            target_height = first_video_info.get("height", 1920)
            target_dimensions = f"{target_width}:{target_height}"

            context_logger.bind(
                target_width=target_width, target_height=target_height
            ).debug("using dimensions from first video")

            # Base command
            cmd = [self.ffmpeg_path, "-y"]

            # Add input video files
            for video_path in video_paths:
                cmd.extend(["-i", video_path])

            # Add background music if provided
            music_input_index = None
            if background_music_path:
                cmd.extend(["-stream_loop", "-1", "-i", background_music_path])
                music_input_index = len(video_paths)

            # Create filter complex for concatenating videos with re-encoding
            if len(video_paths) == 1:
                # Single video - re-encode to ensure consistency
                # Check if the video has audio
                audio_info = self.get_audio_info(video_paths[0])
                has_audio = bool(audio_info.get("duration", 0) > 0)

                if background_music_path:
                    if has_audio:
                        cmd.extend(
                            [
                                "-filter_complex",
                                f"[0:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30[v];[{music_input_index}:a]volume={background_music_volume}[bg];[0:a][bg]amix=inputs=2:duration=first[a]",
                                "-map",
                                "[v]",
                                "-map",
                                "[a]",
                            ]
                        )
                    else:
                        # No audio in video, just use background music
                        cmd.extend(
                            [
                                "-filter_complex",
                                f"[0:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30[v];[{music_input_index}:a]volume={background_music_volume}[a]",
                                "-map",
                                "[v]",
                                "-map",
                                "[a]",
                            ]
                        )
                else:
                    if has_audio:
                        cmd.extend(
                            [
                                "-filter_complex",
                                f"[0:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30[v]",
                                "-map",
                                "[v]",
                                "-map",
                                "0:a",
                            ]
                        )
                    else:
                        # No audio in video and no background music, create silent audio
                        video_info = self.get_video_info(video_paths[0])
                        video_duration = video_info.get("duration", 10)  # fallback to 10 seconds
                        cmd.extend(
                            [
                                "-filter_complex",
                                f"[0:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30[v];anullsrc=channel_layout=stereo:sample_rate=48000:duration={video_duration}[a]",
                                "-map",
                                "[v]",
                                "-map",
                                "[a]",
                            ]
                        )
            else:
                # Multiple videos - normalize and concatenate with re-encoding
                # First, check which videos have audio streams
                videos_with_audio = []
                for i, video_path in enumerate(video_paths):
                    video_info = self.get_video_info(video_path)
                    # Check if video has audio by trying to get audio info
                    audio_info = self.get_audio_info(video_path)
                    has_audio = bool(audio_info.get("duration", 0) > 0)
                    videos_with_audio.append(has_audio)
                    context_logger.bind(video_index=i, has_audio=has_audio).debug("checked audio stream")

                # Create normalized video streams for each input
                normalize_filters = []
                for i in range(len(video_paths)):
                    normalize_filters.append(
                        f"[{i}:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30,format=yuv420p[v{i}n]"
                    )

                # Create audio streams for videos without audio (silent audio)
                audio_filters = []
                for i in range(len(video_paths)):
                    if not videos_with_audio[i]:
                        # Get video duration for silent audio generation
                        video_info = self.get_video_info(video_paths[i])
                        video_duration = video_info.get("duration", 10)  # fallback to 10 seconds
                        audio_filters.append(f"anullsrc=channel_layout=stereo:sample_rate=48000:duration={video_duration}[a{i}n]")
                    else:
                        audio_filters.append(f"[{i}:a]aformat=sample_rates=48000:channel_layouts=stereo[a{i}n]")

                # Create the concat filter using normalized streams
                concat_inputs = ""
                for i in range(len(video_paths)):
                    concat_inputs += f"[v{i}n][a{i}n]"

                # Combine all filters
                all_filters = normalize_filters + audio_filters
                filter_complex = (
                    ";".join(all_filters)
                    + f";{concat_inputs}concat=n={len(video_paths)}:v=1:a=1[v][a]"
                )

                if background_music_path:
                    # Mix the concatenated audio with background music
                    filter_complex += f";[{music_input_index}:a]volume={background_music_volume}[bg];[a][bg]amix=inputs=2:duration=first[final_a]"
                    cmd.extend(
                        [
                            "-filter_complex",
                            filter_complex,
                            "-map",
                            "[v]",
                            "-map",
                            "[final_a]",
                        ]
                    )
                else:
                    cmd.extend(
                        [
                            "-filter_complex",
                            filter_complex,
                            "-map",
                            "[v]",
                            "-map",
                            "[a]",
                        ]
                    )

            # Video codec settings
            cmd.extend(
                [
                    "-c:v",
                    "libx264",
                    "-preset",
                    "veryfast",
                    "-crf",
                    "23",
                ]
            )

            # Audio codec settings
            cmd.extend(["-c:a", "aac", "-b:a", "192k"])

            # Other settings
            cmd.extend(["-pix_fmt", "yuv420p", output_path])

            # calculate expected duration for progress tracking
            expected_duration = 0
            for video_path in video_paths:
                video_info = self.get_video_info(video_path)
                expected_duration += video_info.get("duration", 0)

            # Execute the command using the shared helper
            success = self.execute_ffmpeg_command(
                cmd,
                "merge videos",
                expected_duration=expected_duration,
                show_progress=True,
            )

            if success:
                context_logger.bind(execution_time=time.time() - start).debug(
                    "videos merged successfully",
                )
                return True
            else:
                context_logger.error("ffmpeg failed to merge videos")
                return False

        except Exception as e:
            context_logger.bind(error=str(e)).error(
                "error merging videos",
            )
            return False

    def get_video_info(self, file_path: str) -> dict:
        """
        Retrieves video information such as duration, width, height, codec, fps, etc.

        Args:
            file_path: Path to the video file

        Returns:
            Dictionary containing video information
        """
        try:
            cmd = [
                "ffprobe",
                "-v",
                "quiet",
                "-print_format",
                "json",
                "-show_format",
                "-show_streams",
                "-select_streams",
                "v:0",  # Select first video stream
                file_path,
            ]

            success, stdout, stderr = self.execute_ffprobe_command(
                cmd, "get video info"
            )

            if not success:
                raise Exception(f"ffprobe failed: {stderr}")

            probe_data = json.loads(stdout)

            # Extract format information
            format_info = probe_data.get("format", {})
            streams = probe_data.get("streams", [])

            if not streams:
                raise Exception("No video stream found in file")

            video_stream = streams[0]

            # avg_frame_rate is a rational string like "30000/1001"; reduce it to a float
            fps_num, _, fps_den = video_stream.get("avg_frame_rate", "0/1").partition("/")
            fps = float(fps_num) / float(fps_den) if fps_den not in ("", "0") else 0.0

            video_info = {
                "duration": float(format_info.get("duration", 0)),
                "width": video_stream.get("width"),
                "height": video_stream.get("height"),
                "fps": fps,
                "aspect_ratio": video_stream.get("display_aspect_ratio", "1:1"),
                "codec": video_stream.get("codec_name"),
            }

            return video_info

        except Exception as e:
            logger.bind(file_path=file_path, error=str(e)).error(
                "error getting video info"
            )
            return {}

    def get_audio_info(self, file_path: str) -> dict:
        """
        Retrieves audio information such as duration, codec, bitrate, sample rate, channels, etc.

        Args:
            file_path: Path to the audio file

        Returns:
            Dictionary containing audio information
        """
        try:
            cmd = [
                "ffprobe",
                "-v",
                "quiet",
                "-print_format",
                "json",
                "-show_format",
                "-show_streams",
                "-select_streams",
                "a:0",  # Select first audio stream
                file_path,
            ]

            success, stdout, stderr = self.execute_ffprobe_command(
                cmd, "get audio info"
            )

            if not success:
                raise Exception(f"ffprobe failed: {stderr}")

            probe_data = json.loads(stdout)

            # Extract format information
            format_info = probe_data.get("format", {})
            streams = probe_data.get("streams", [])

            if not streams:
                raise Exception("No audio stream found in file")

            audio_stream = streams[0]

            audio_info = {
                "duration": float(format_info.get("duration", 0)),
                "channels": audio_stream.get("channels", 0),
                "sample_rate": audio_stream.get("sample_rate", "0"),
                "codec": audio_stream.get("codec_name", ""),
                "bitrate": audio_stream.get("bit_rate", "0"),
            }

            return audio_info

        except Exception as e:
            logger.bind(file_path=file_path, error=str(e)).error(
                "Error getting audio info"
            )
            return {}

    def extract_frame(
        self,
        video_path: str,
        output_path: str,
        time_seconds: float = 0.0,
    ) -> bool:
        """
        Extracts a frame from a video at a specified time.

        Args:
            video_path: Path to the input video file
            output_path: Path for the extracted frame image
            time_seconds: Time in seconds to extract the frame (default: 0.0)

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Base command
            cmd = [self.ffmpeg_path, "-y"]

            # Add input video file
            cmd.extend(["-i", video_path])

            # Seek to the specified time and extract one frame
            cmd.extend(
                [
                    "-ss",
                    str(time_seconds),  # Seek to time
                    "-vframes",
                    "1",  # Extract only one frame
                    "-q:v",
                    "2",  # High quality (scale 1-31, lower is better)
                    output_path,
                ]
            )

            # Execute the command using the shared helper
            success = self.execute_ffmpeg_command(
                cmd,
                "extract frame",
                show_progress=False,  # No progress tracking for single frame extraction
            )

            if success:
                logger.bind(video_path=video_path, time_seconds=time_seconds).debug(
                    "frame extracted successfully"
                )
                return True
            else:
                logger.bind(video_path=video_path, time_seconds=time_seconds).error(
                    "failed to extract frame from video"
                )
                return False

        except Exception as e:
            logger.bind(error=str(e)).error("Error extracting frame from video")
            return False

    def extract_frames(
        self,
        video_path: str,
        output_template: str,
        amount: int = 5,
        length_seconds: float = None,
    ) -> bool:
        """
        Extracts a number of frames evenly distributed across the video.

        Args:
            video_path: Path to the input video file
            output_template: Template for output image files (e.g., "frame-%03d.jpg")
            amount: Number of frames to extract (default: 5)
            length_seconds: Length of the video in seconds (optional, if not provided will be calculated)

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Get video duration if not provided
            if length_seconds is None:
                video_info = self.get_video_info(video_path)
                length_seconds = video_info.get("duration", 0)

            if length_seconds <= 0:
                logger.error("invalid video duration for frame extraction")
                return False

            # Calculate frame interval (time between frames)
            # This gives us the correct fps rate to extract exactly 'amount' frames
            # evenly distributed across the video duration
            frame_interval = length_seconds / amount

            # fps=1/frame_interval extracts one frame every frame_interval seconds
            cmd = [
                self.ffmpeg_path,
                "-y",
                "-i",
                video_path,
                "-vf",
                f"fps=1/{frame_interval}",
                "-vframes",
                str(amount),
                "-qscale:v",
                "2",  # High quality
                output_template,
            ]

            # Execute the command using the shared helper
            success = self.execute_ffmpeg_command(
                cmd,
                "extract frames",
                expected_duration=length_seconds,
                show_progress=True,
            )

            if success:
                logger.bind(video_path=video_path, amount=amount).debug(
                    "frames extracted successfully"
                )
                return True
            else:
                logger.bind(video_path=video_path, amount=amount).error(
                    "failed to extract frames from video"
                )
                return False

        except Exception as e:
            logger.bind(error=str(e)).error("Error extracting frames from video")
            return False

    def format_time(self, seconds: float) -> str:
        """
        Format seconds into HH:MM:SS format.

        Args:
            seconds: Time in seconds

        Returns:
            Formatted time string
        """
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        seconds = int(seconds % 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    def execute_ffmpeg_command(
        self,
        cmd: list,
        operation_name: str,
        expected_duration: float = None,
        show_progress: bool = True,
    ) -> bool:
        """
        Execute an ffmpeg command with proper logging and progress tracking.

        Args:
            cmd: The ffmpeg command as a list
            operation_name: Name of the operation for logging
            expected_duration: Expected duration for progress calculation
            show_progress: Whether to show progress information

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            logger.bind(command=" ".join(cmd), operation=operation_name).debug(
                f"executing ffmpeg command for {operation_name}"
            )

            process = subprocess.Popen(
                cmd,
                stderr=subprocess.PIPE,
                text=True,
            )

            # Process the output line by line as it becomes available
            for line in process.stderr:
                # Extract time information for progress tracking
                if (
                    show_progress
                    and expected_duration
                    and "time=" in line
                    and "speed=" in line
                ):
                    try:
                        # Extract the time information
                        time_str = line.split("time=")[1].split(" ")[0]
                        # Convert HH:MM:SS.MS format to seconds
                        h, m, s = time_str.split(":")
                        seconds = float(h) * 3600 + float(m) * 60 + float(s)

                        # Calculate progress percentage
                        progress = min(100, (seconds / expected_duration) * 100)
                        logger.info(
                            f"{operation_name}: {progress:.2f}% complete (Time: {time_str} / Total: {self.format_time(expected_duration)})"
                        )
                    except (ValueError, IndexError):
                        # If parsing fails, continue silently
                        pass
                elif any(
                    keyword in line
                    for keyword in [
                        # Skip initialization information
                        "ffmpeg version",
                        "built with",
                        "configuration:",
                        "libav",
                        "Input #",
                        "Metadata:",
                        "Duration:",
                        "Stream #",
                        "Press [q]",
                        "Output #",
                        "Stream mapping:",
                        # Skip processing details
                        "frame=",
                        "fps=",
                        "[libx264",
                        "kb/s:",
                        "Qavg:",
                        "video:",
                        "audio:",
                        "subtitle:",
                        "frame I:",
                        "frame P:",
                        "mb I",
                        "mb P",
                        "coded y,",
                        "i16 v,h,dc,p:",
                        "i8c dc,h,v,p:",
                        "compatible_brands:",
                        "encoder",
                        "Side data:",
                        "libswscale",
                        "libswresample",
                        "libpostproc",
                        # Additional patterns to filter
                        "ffmpeg: libswscale",
                        "ffmpeg: libswresample",
                        "ffmpeg: libpostproc",
                    ]
                ):
                    # Skip all technical output lines
                    pass
                else:
                    # Only print important messages (like errors and warnings)
                    # that don't match any of the filtered patterns
                    if not line.strip() or line.strip().startswith("["):
                        continue

                    # Skip header lines that describe inputs
                    if ":" in line and any(
                        header in line
                        for header in [
                            "major_brand",
                            "minor_version",
                            "creation_time",
                            "handler_name",
                            "vendor_id",
                            "Duration",
                            "bitrate",
                        ]
                    ):
                        continue

                    logger.debug(f"ffmpeg: {line.strip()}")

            # Wait for the process to complete and check the return code
            return_code = process.wait()
            if return_code != 0:
                logger.bind(return_code=return_code, operation=operation_name).error(
                    f"ffmpeg exited with code: {return_code} for {operation_name}"
                )
                return False

            logger.bind(operation=operation_name).debug(
                f"{operation_name} completed successfully"
            )
            return True

        except Exception as e:
            logger.bind(error=str(e), operation=operation_name).error(
                f"error executing ffmpeg command for {operation_name}"
            )
            return False

    def execute_ffprobe_command(
        self, cmd: list, operation_name: str
    ) -> tuple[bool, str, str]:
        """
        Execute an ffprobe command with proper logging.

        Args:
            cmd: The ffprobe command as a list
            operation_name: Name of the operation for logging

        Returns:
            tuple: (success, stdout, stderr)
        """
        try:
            logger.bind(command=" ".join(cmd), operation=operation_name).debug(
                f"executing ffprobe command for {operation_name}"
            )

            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
            stdout, stderr = process.communicate()

            if process.returncode != 0:
                logger.bind(stderr=stderr, operation=operation_name).error(
                    f"ffprobe failed for {operation_name}"
                )
                return False, stdout, stderr

            logger.bind(operation=operation_name).debug(
                f"{operation_name} completed successfully"
            )
            return True, stdout, stderr

        except Exception as e:
            logger.bind(error=str(e), operation=operation_name).error(
                f"error executing ffprobe command for {operation_name}"
            )
            return False, "", str(e)

    @staticmethod
    def is_hex_color(color: str) -> bool:
        """
        Checks if the given color string is a valid hex color.

        Args:
            color: Color string to check (callers strip any leading '#' first)

        Returns:
            bool: True if it's a hex color, False otherwise
        """
        # Validate every character of the '#'-free string
        return bool(color) and all(
            c in "0123456789abcdefABCDEF" for c in color
        )

    def colorkey_overlay(
        self,
        input_video_path: str,
        overlay_video_path: str,
        output_video_path: str,
        color: str = "green",
        similarity: float = 0.1,
        blend: float = 0.1,
    ):
        """
        Applies a colorkey overlay to a video using FFmpeg. Equivalent command:

        ffmpeg -i input.mp4 -stream_loop -1 -i black_dust.mp4 \
            -filter_complex "[1]colorkey=0x000000:0.1:0.1[ckout];[0][ckout]overlay" \
            -shortest \
            -c:v libx264 -preset ultrafast -crf 18 \
            -c:a copy \
            output.mp4
        """
        start = time.time()
        info = self.get_video_info(input_video_path)
        video_duration = info.get("duration", 0)

        if not video_duration:
            logger.error("failed to get video duration from input video")
            return False

        color = color.lstrip("#")
        if self.is_hex_color(color):
            color = f"0x{color.upper()}"

        context_logger = logger.bind(
            input_video_path=input_video_path,
            overlay_video_path=overlay_video_path,
            output_video_path=output_video_path,
            video_duration=video_duration,
            color=color,
            similarity=similarity,
            blend=blend,
        )
        context_logger.debug("Starting colorkey overlay process")

        cmd = [
            self.ffmpeg_path, "-y",
            "-i", input_video_path,
            "-stream_loop", "-1",
            "-i", overlay_video_path,
            "-filter_complex", f"[1:v]colorkey={color}:{similarity}:{blend}[ckout];[0:v][ckout]overlay=eof_action=repeat[v]",
            "-map", "[v]",
            "-map", "0:a",
            "-c:v", "libx264",
            "-preset", "ultrafast",
            "-crf", "18",
            "-c:a", "copy",
            "-t", f"{video_duration}s",
            output_video_path,
        ]

        try:
            success = self.execute_ffmpeg_command(
                cmd,
                "add colorkey overlay to video",
                expected_duration=video_duration,
                show_progress=True,
            )

            if success:
                context_logger.bind(execution_time=time.time() - start).debug(
                    "colorkey overlay added successfully",
                )
                return True
            else:
                context_logger.error("ffmpeg failed to create colorkey overlay")
                return False

        except Exception as e:
            context_logger.bind(error=str(e)).error(
                "error adding colorkey overlay to video",
            )
            return False

    def convert_pcm_to_wav(
        self,
        input_pcm_path: str,
        output_wav_path: str,
        sample_rate: int = 24000,
        channels: int = 1,
        target_sample_rate: int = 44100,
    ) -> bool:
        """
        Converts raw PCM (s16le) audio to a stereo WAV file, equivalent to:

        ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm -ar 44100 -ac 2 out_44k_stereo.wav
        """
        start = time.time()
        context_logger = logger.bind(
            input_pcm_path=input_pcm_path,
            output_wav_path=output_wav_path,
            sample_rate=sample_rate,
            channels=channels,
            target_sample_rate=target_sample_rate,
        )
        context_logger.debug("Starting PCM to WAV conversion")

        cmd = [
            self.ffmpeg_path, "-y",
            "-f", "s16le",
            "-ar", str(sample_rate),
            "-ac", str(channels),
            "-i", input_pcm_path,
            "-ar", str(target_sample_rate),
            "-ac", "2",  # Convert to stereo
            output_wav_path,
        ]

        try:
            success = self.execute_ffmpeg_command(
                cmd,
                "convert PCM to WAV",
                show_progress=False,
            )

            if success:
                context_logger.bind(execution_time=time.time() - start).debug(
                    "PCM to WAV conversion successful",
                )
                return True
            else:
                context_logger.error("ffmpeg failed to convert PCM to WAV")
                return False

        except Exception as e:
            context_logger.bind(error=str(e)).error(
                "error converting PCM to WAV",
            )
            return False
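
Note: a hypothetical usage sketch of the MediaUtils class above; the file paths are placeholders, and ffmpeg/ffprobe are assumed to be on PATH:

from video.media import MediaUtils

utils = MediaUtils()

# Probe a clip (returns {} on failure)
info = utils.get_video_info("clips/intro.mp4")
print(info.get("width"), info.get("height"), info.get("duration"))

# Concatenate two clips, normalized to the first clip's dimensions,
# with looped background music mixed in at 30% volume
ok = utils.merge_videos(
    ["clips/intro.mp4", "clips/outro.mp4"],
    "merged.mp4",
    background_music_path="music/theme.mp3",
    background_music_volume=0.3,
)
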
video/storage.py
ADDED
@@ -0,0 +1,323 @@
from typing import Tuple
import uuid
import os
import requests


class MediaType:
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
    TMP = "tmp"


class Storage:
    def __init__(self, storage_path):
        self.storage_path = storage_path
        os.makedirs(self.storage_path, exist_ok=True)
        # make all the subdirectories for the media types
        for media_type in [
            MediaType.IMAGE,
            MediaType.VIDEO,
            MediaType.AUDIO,
            MediaType.TMP,
        ]:
            os.makedirs(os.path.join(self.storage_path, media_type), exist_ok=True)

    def _validate_media_id(self, media_id: str) -> tuple[str, str]:
        """
        Validates and parses a media ID to prevent path traversal attacks.

        Args:
            media_id (str): Media ID to validate

        Returns:
            tuple[str, str]: (media_type, filename)

        Raises:
            ValueError: If media_id is invalid or contains path traversal attempts
        """
        if not media_id or "_" not in media_id:
            raise ValueError("Invalid media ID format")

        media_type, filename = media_id.split("_", 1)

        # Validate media type
        valid_types = [MediaType.IMAGE, MediaType.VIDEO, MediaType.AUDIO, MediaType.TMP]
        if media_type not in valid_types:
            raise ValueError(f"Invalid media type: {media_type}")

        # Prevent path traversal by checking for dangerous patterns
        if ".." in filename or "/" in filename or "\\" in filename:
            raise ValueError(
                "Filename contains invalid characters or path traversal attempt"
            )

        # Additional validation: filename should not be empty and should be reasonable
        if not filename or len(filename) > 255:
            raise ValueError("Invalid filename")

        return media_type, filename

    def _get_safe_file_path(self, media_id: str) -> str:
        """
        Gets a safe file path for the given media ID after validation.

        Args:
            media_id (str): Media ID to get path for

        Returns:
            str: Safe file path
        """
        media_type, filename = self._validate_media_id(media_id)
        file_path = os.path.join(self.storage_path, media_type, filename)

        # Double-check that the resolved path is within the storage directory
        resolved_path = os.path.abspath(file_path)
        storage_abs_path = os.path.abspath(self.storage_path)

        if not resolved_path.startswith(storage_abs_path):
            raise ValueError("Path traversal attempt detected")

        return file_path

    def upload_media(
        self, media_type: MediaType, media_data: bytes, file_extension: str = ""
    ) -> str:
        """
        Uploads media to the server.

        Args:
            media_type (str): Type of media, e.g., 'image' or 'video'.
            media_data (bytes): Binary data of the media file.
            file_extension (str): File extension, e.g., '.jpg', '.mp4', '.wav'.

        Returns:
            str: Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.
        """
        # Validate media type
        valid_types = [MediaType.IMAGE, MediaType.VIDEO, MediaType.AUDIO, MediaType.TMP]
        if media_type not in valid_types:
            raise ValueError(f"Invalid media type: {media_type}")

        # Validate file extension to prevent path traversal
        if file_extension and (
            ".." in file_extension or "/" in file_extension or "\\" in file_extension
        ):
            raise ValueError("File extension contains invalid characters")

        asset_id = str(uuid.uuid4())
        filename = f"{asset_id}{file_extension}" if file_extension else asset_id
        file_path = os.path.join(self.storage_path, media_type, filename)

        # Additional safety check
        resolved_path = os.path.abspath(file_path)
        storage_abs_path = os.path.abspath(self.storage_path)
        if not resolved_path.startswith(storage_abs_path):
            raise ValueError("Path traversal attempt detected")

        with open(file_path, "wb") as f:
            f.write(media_data)

        media_id = f"{media_type}_{filename}"
        return media_id

    def get_media(self, media_id: str) -> bytes:
        """
        Retrieves media by ID.

        Args:
            media_id (str): Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.

        Returns:
            bytes: Binary data of the media file.
        """
        file_path = self._get_safe_file_path(media_id)

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Media file {media_id} not found.")

        with open(file_path, "rb") as f:
            return f.read()

    def delete_media(self, media_id: str) -> None:
        """
        Deletes media by ID.

        Args:
            media_id (str): Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.
        """
        file_path = self._get_safe_file_path(media_id)

        if os.path.exists(file_path):
            os.remove(file_path)
        else:
            raise FileNotFoundError(f"Media file {media_id} not found.")

    def media_exists(self, media_id: str) -> bool:
        """
        Checks if media exists by ID.

        Args:
            media_id (str): Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.

        Returns:
            bool: True if media exists, False otherwise.
        """
        try:
            file_path = self._get_safe_file_path(media_id)
            return os.path.exists(file_path)
        except ValueError:
            return False

    def get_media_path(self, media_id: str) -> str:
        """
        Gets the file path of the media by ID.

        Args:
            media_id (str): Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.

        Returns:
            str: Full file path of the media.
        """
        return self._get_safe_file_path(media_id)

    ### untested
    def create_media_filename(
        self, media_type: MediaType, file_extension: str = ""
    ) -> str:
        # Validate media type
        valid_types = [MediaType.IMAGE, MediaType.VIDEO, MediaType.AUDIO, MediaType.TMP]
        if media_type not in valid_types:
            raise ValueError(f"Invalid media type: {media_type}")

        # Validate file extension to prevent path traversal
        if file_extension and (
            ".." in file_extension or "/" in file_extension or "\\" in file_extension
        ):
            raise ValueError("File extension contains invalid characters")

        asset_id = str(uuid.uuid4())
        filename = f"{asset_id}{file_extension}" if file_extension else asset_id
        return f"{media_type}_{filename}"

    def create_media_filename_with_id(
        self, media_type: MediaType, file_extension: str = ""
    ) -> Tuple[str, str]:
        file_id = self.create_media_filename(media_type, file_extension)
        return file_id, self.get_media_path(file_id)
|
| 210 |
+
def create_media_template(
|
| 211 |
+
self, media_type: MediaType, file_extension: str
|
| 212 |
+
) -> str:
|
| 213 |
+
"""
|
| 214 |
+
Creates a media template filename for the given media type and file extension.
|
| 215 |
+
Args:
|
| 216 |
+
media_type (MediaType): Type of media, e.g., MediaType.IMAGE.
|
| 217 |
+
file_extension (str): File extension, e.g., '.jpg', '.mp4'.
|
| 218 |
+
):
|
| 219 |
+
Returns:
|
| 220 |
+
|
| 221 |
+
"""
|
| 222 |
+
if not file_extension.startswith("."):
|
| 223 |
+
file_extension = "." + file_extension
|
| 224 |
+
|
| 225 |
+
valid_types = [MediaType.IMAGE, MediaType.VIDEO, MediaType.AUDIO, MediaType.TMP]
|
| 226 |
+
if media_type not in valid_types:
|
| 227 |
+
raise ValueError(f"Invalid media type: {media_type}")
|
| 228 |
+
|
| 229 |
+
if file_extension and (
|
| 230 |
+
".." in file_extension or "/" in file_extension or "\\" in file_extension
|
| 231 |
+
):
|
| 232 |
+
raise ValueError("File extension contains invalid characters")
|
| 233 |
+
|
| 234 |
+
asset_id = str(uuid.uuid4())
|
| 235 |
+
filename = f"{asset_id}-%02d{file_extension}" if file_extension else f"{asset_id}-%02d"
|
| 236 |
+
file_path = os.path.join(
|
| 237 |
+
self.storage_path, media_type, filename
|
| 238 |
+
)
|
| 239 |
+
return filename, file_path
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def create_tmp_file_id(self, media_id: str) -> str:
|
| 243 |
+
"""
|
| 244 |
+
Creates a temporary filename for media upload.
|
| 245 |
+
|
| 246 |
+
Args:
|
| 247 |
+
media_id (str): Media ID to create a temporary filename for.
|
| 248 |
+
|
| 249 |
+
Returns:
|
| 250 |
+
str: Temporary media ID.
|
| 251 |
+
"""
|
| 252 |
+
return f"{media_id}.tmp"
|
| 253 |
+
|
| 254 |
+
def create_tmp_file(self, media_id: str) -> str:
|
| 255 |
+
"""
|
| 256 |
+
Creates a temporary file for media upload.
|
| 257 |
+
|
| 258 |
+
Args:
|
| 259 |
+
media_id (str): Media ID to create a temporary file for.
|
| 260 |
+
|
| 261 |
+
Returns:
|
| 262 |
+
str: Temporary media ID.
|
| 263 |
+
"""
|
| 264 |
+
tmp_id = f"{media_id}.tmp"
|
| 265 |
+
tmp_path = self.get_media_path(tmp_id)
|
| 266 |
+
|
| 267 |
+
with open(tmp_path, "wb") as f:
|
| 268 |
+
pass
|
| 269 |
+
return tmp_id
|
| 270 |
+
|
| 271 |
+
def get_media_type(self, media_id: str) -> MediaType:
|
| 272 |
+
"""
|
| 273 |
+
Gets the media type of the given media ID.
|
| 274 |
+
|
| 275 |
+
Args:
|
| 276 |
+
media_id (str): Media ID to get the type for.
|
| 277 |
+
|
| 278 |
+
Returns:
|
| 279 |
+
MediaType: The type of the media.
|
| 280 |
+
"""
|
| 281 |
+
media_type, _ = self._validate_media_id(media_id)
|
| 282 |
+
return media_type
|
| 283 |
+
|
| 284 |
+
def is_valid_url(self, url: str) -> bool:
|
| 285 |
+
"""
|
| 286 |
+
Validates a URL to ensure it is well-formed.
|
| 287 |
+
|
| 288 |
+
Args:
|
| 289 |
+
url (str): The URL to validate.
|
| 290 |
+
|
| 291 |
+
Returns:
|
| 292 |
+
bool: True if the URL is valid, False otherwise.
|
| 293 |
+
"""
|
| 294 |
+
from urllib.parse import urlparse
|
| 295 |
+
|
| 296 |
+
try:
|
| 297 |
+
result = urlparse(url)
|
| 298 |
+
return all([result.scheme, result.netloc])
|
| 299 |
+
except Exception:
|
| 300 |
+
return False
|
| 301 |
+
|
| 302 |
+
def upload_media_from_url(
|
| 303 |
+
self, media_type: MediaType, url: str
|
| 304 |
+
) -> str:
|
| 305 |
+
"""
|
| 306 |
+
Uploads media from a URL.
|
| 307 |
+
|
| 308 |
+
Args:
|
| 309 |
+
media_type (MediaType): Type of media, e.g., MediaType.IMAGE.
|
| 310 |
+
url (str): URL of the media file.
|
| 311 |
+
|
| 312 |
+
Returns:
|
| 313 |
+
str: Media ID, e.g., 'image_12345.jpg'.
|
| 314 |
+
"""
|
| 315 |
+
if not self.is_valid_url(url):
|
| 316 |
+
raise ValueError("Invalid URL")
|
| 317 |
+
|
| 318 |
+
response = requests.get(url)
|
| 319 |
+
if response.status_code != 200:
|
| 320 |
+
raise ValueError(f"Failed to download media from {url}")
|
| 321 |
+
|
| 322 |
+
file_extension = os.path.splitext(url)[1]
|
| 323 |
+
return self.upload_media(media_type, response.content, file_extension)
|
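
Note: a hypothetical round trip through the Storage API above; the storage root and payload bytes are placeholders:

from video.storage import Storage, MediaType

storage = Storage("./media")

media_id = storage.upload_media(MediaType.IMAGE, b"...image bytes...", ".png")
assert storage.media_exists(media_id)

path = storage.get_media_path(media_id)  # e.g. ./media/image/<uuid>.png
data = storage.get_media(media_id)
storage.delete_media(media_id)
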
video/stt.py
ADDED
@@ -0,0 +1,41 @@
from faster_whisper import WhisperModel
from loguru import logger
from video.config import device, whisper_model, whisper_compute_type


class STT:
    def __init__(self):
        self.model = WhisperModel(
            model_size_or_path=whisper_model,
            compute_type=whisper_compute_type
        )

    def transcribe(self, audio_path, language=None, beam_size=5):
        logger.bind(
            device=device.type,
            model_size=whisper_model,
            compute_type=whisper_compute_type,
            audio_path=audio_path,
            language=language,
        ).debug(
            "transcribing audio with Whisper model",
        )
        segments, info = self.model.transcribe(
            audio_path,
            beam_size=beam_size,
            word_timestamps=True,
            language=language,
        )

        duration = info.duration
        captions = []
        for segment in segments:
            for word in segment.words:
                captions.append(
                    {
                        "text": word.word,
                        "start_ts": word.start,
                        "end_ts": word.end,
                    }
                )
        return captions, duration
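
Note: a hypothetical transcription call against the STT wrapper above; the audio path is a placeholder:

from video.stt import STT

stt = STT()
captions, duration = stt.transcribe("voiceover.wav", language="en")
for word in captions[:3]:
    print(f"{word['start_ts']:.2f}-{word['end_ts']:.2f}: {word['text']}")
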
video/tts.py
ADDED
@@ -0,0 +1,443 @@
| 1 |
+
import re
|
| 2 |
+
import time
|
| 3 |
+
import warnings
|
| 4 |
+
from typing import List
|
| 5 |
+
from kokoro import KPipeline
|
| 6 |
+
import numpy as np
|
| 7 |
+
import soundfile as sf
|
| 8 |
+
from loguru import logger
|
| 9 |
+
import torchaudio as ta
|
| 10 |
+
from chatterbox.tts import ChatterboxTTS
|
| 11 |
+
from video.config import device
|
| 12 |
+
|
| 13 |
+
# Suppress PyTorch warnings
|
| 14 |
+
warnings.filterwarnings("ignore")
|
| 15 |
+
|
| 16 |
+
LANGUAGE_CONFIG = {
|
| 17 |
+
"en-us": {
|
| 18 |
+
"lang_code": "a",
|
| 19 |
+
"international": False,
|
| 20 |
+
"iso639_1": "en",
|
| 21 |
+
},
|
| 22 |
+
"en": {
|
| 23 |
+
"lang_code": "a",
|
| 24 |
+
"international": False,
|
| 25 |
+
"iso639_1": "en",
|
| 26 |
+
},
|
| 27 |
+
"en-gb": {
|
| 28 |
+
"lang_code": "b",
|
| 29 |
+
"international": False,
|
| 30 |
+
"iso639_1": "en",
|
| 31 |
+
},
|
| 32 |
+
"es": {"lang_code": "e", "international": True, "iso639_1": "es"},
|
| 33 |
+
"fr": {"lang_code": "f", "international": True, "iso639_1": "fr"},
|
| 34 |
+
"hi": {"lang_code": "h", "international": True, "iso639_1": "hi"},
|
| 35 |
+
"it": {"lang_code": "i", "international": True, "iso639_1": "it"},
|
| 36 |
+
"pt": {"lang_code": "p", "international": True, "iso639_1": "pt"},
|
| 37 |
+
"ja": {"lang_code": "j", "international": True, "iso639_1": "ja"},
|
| 38 |
+
"zh": {"lang_code": "z", "international": True, "iso639_1": "zh"},
|
| 39 |
+
}
|
| 40 |
+
LANGUAGE_VOICE_CONFIG = {
|
| 41 |
+
"en-us": [
|
| 42 |
+
"af_heart",
|
| 43 |
+
"af_alloy",
|
| 44 |
+
"af_aoede",
|
| 45 |
+
"af_bella",
|
| 46 |
+
"af_jessica",
|
| 47 |
+
"af_kore",
|
| 48 |
+
"af_nicole",
|
| 49 |
+
"af_nova",
|
| 50 |
+
"af_river",
|
| 51 |
+
"af_sarah",
|
| 52 |
+
"af_sky",
|
| 53 |
+
"am_adam",
|
| 54 |
+
"am_echo",
|
| 55 |
+
"am_eric",
|
| 56 |
+
"am_fenrir",
|
| 57 |
+
"am_liam",
|
| 58 |
+
"am_michael",
|
| 59 |
+
"am_onyx",
|
| 60 |
+
"am_puck",
|
| 61 |
+
"am_santa",
|
| 62 |
+
],
|
| 63 |
+
"en-gb": [
|
| 64 |
+
"bf_alice",
|
| 65 |
+
"bf_emma",
|
| 66 |
+
"bf_isabella",
|
| 67 |
+
"bf_lily",
|
| 68 |
+
"bm_daniel",
|
| 69 |
+
"bm_fable",
|
| 70 |
+
"bm_george",
|
| 71 |
+
"bm_lewis",
|
| 72 |
+
],
|
| 73 |
+
"zh": [
|
| 74 |
+
"zf_xiaobei",
|
| 75 |
+
"zf_xiaoni",
|
| 76 |
+
"zf_xiaoxiao",
|
| 77 |
+
"zf_xiaoyi",
|
| 78 |
+
"zm_yunjian",
|
| 79 |
+
"zm_yunxi",
|
| 80 |
+
"zm_yunxia",
|
| 81 |
+
"zm_yunyang",
|
| 82 |
+
],
|
| 83 |
+
"es": ["ef_dora", "em_alex", "em_santa"],
|
| 84 |
+
"fr": ["ff_siwis"],
|
| 85 |
+
"it": ["if_sara", "im_nicola"],
|
| 86 |
+
"pt": ["pf_dora", "pm_alex", "pm_santa"],
|
| 87 |
+
"hi": ["hf_alpha", "hf_beta", "hm_omega", "hm_psi"],
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
LANGUAGE_VOICE_MAP = {}
|
| 91 |
+
for lang, voices in LANGUAGE_VOICE_CONFIG.items():
|
| 92 |
+
for voice in voices:
|
| 93 |
+
if lang in LANGUAGE_CONFIG:
|
| 94 |
+
LANGUAGE_VOICE_MAP[voice] = LANGUAGE_CONFIG[lang]
|
| 95 |
+
else:
|
| 96 |
+
print(f"Warning: Language {lang} not found in LANGUAGE_CONFIG")
|
| 97 |
+
|
| 98 |
+
|
class TTS:
    def break_text_into_sentences(self, text, lang_code) -> List[str]:
        """
        Sentence splitting with language-specific boundary patterns and
        protection for common abbreviations.
        """
        if not text or not text.strip():
            return []

        # Language-specific sentence boundary patterns
        patterns = {
            "a": r"(?<=[.!?])\s+(?=[A-Z_])",  # English
            "e": r"(?<=[.!?])\s+(?=[A-ZÁÉÍÓÚÑÜ¿¡_])",  # Spanish - allow inverted punctuation after boundaries
            "f": r"(?<=[.!?])\s+(?=[A-ZÁÀÂÄÇÉÈÊËÏÎÔÖÙÛÜŸ_])",  # French
            "h": r"(?<=[।!?])\s+",  # Hindi: split after the devanagari danda
            "i": r"(?<=[.!?])\s+(?=[A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞß_])",  # Italian
            "p": r"(?<=[.!?])\s+(?=[A-ZÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ_])",  # Portuguese
            "z": r"(?<=[。!?])",  # Chinese: split after Chinese punctuation
        }

        # Common abbreviations that shouldn't trigger sentence breaks
        abbreviations = {
            "a": {  # English
                "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Sr.", "Jr.", "Inc.", "Corp.",
                "Ltd.", "Co.", "etc.", "vs.", "eg.", "i.e.", "e.g.", "Vol.", "Ch.",
                "Fig.", "No.", "p.", "pp.",
            },
            "e": {"Sr.", "Sra.", "Dr.", "Dra.", "Prof.", "etc.", "pág.", "art.", "núm.", "cap.", "vol."},  # Spanish
            "f": {"M.", "Mme.", "Dr.", "Prof.", "etc.", "art.", "p.", "vol.", "ch.", "fig.", "n°"},  # French
            "h": {"श्री", "श्रीमती", "डॉ.", "प्रो.", "etc.", "पृ.", "अध."},  # Hindi
            "i": {"Sig.", "Sig.ra", "Dr.", "Prof.", "ecc.", "pag.", "art.", "n.", "vol.", "cap.", "fig."},  # Italian
            "p": {"Sr.", "Sra.", "Dr.", "Dra.", "Prof.", "etc.", "pág.", "art.", "n.º", "vol.", "cap."},  # Portuguese
            "z": {"先生", "女士", "博士", "教授", "等等", "第", "页", "章"},  # Chinese
        }

        abbrevs = abbreviations.get(lang_code, set())

        # Protect abbreviations by temporarily replacing them with placeholders.
        # Longest first, so e.g. "pp." is protected before "p." can match inside it.
        protected_text = text
        replacements = {}
        for i, abbrev in enumerate(sorted(abbrevs, key=len, reverse=True)):
            placeholder = f"__ABBREV_{i}__"
            protected_text = protected_text.replace(abbrev, placeholder)
            replacements[placeholder] = abbrev

        # Split on the language's boundary pattern (default: English)
        pattern = patterns.get(lang_code, patterns["a"])
        sentences = re.split(pattern, protected_text.strip())

        # Restore abbreviations and clean up
        restored_sentences = []
        for sentence in sentences:
            for placeholder, original in replacements.items():
                sentence = sentence.replace(placeholder, original)
            sentence = sentence.strip()
            if sentence:
                restored_sentences.append(sentence)

        return restored_sentences if restored_sentences else [text.strip()]

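A minimal usage sketch of the splitter (the input string is hypothetical; "a" is the English lang code from LANGUAGE_CONFIG):

tts = TTS()
tts.break_text_into_sentences("Dr. Smith arrived at 9. He met Mrs. Jones. They left.", "a")
# -> ['Dr. Smith arrived at 9.', 'He met Mrs. Jones.', 'They left.']
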
    def kokoro_international(
        self, text: str, output_path: str, voice: str, lang_code: str, speed=1
    ) -> tuple[List[dict], float]:
        if not text or not text.strip():
            raise ValueError("Text cannot be empty or whitespace")
        # The voice determines the effective language code.
        lang_code = LANGUAGE_VOICE_MAP.get(voice, {}).get("lang_code")
        if not lang_code:
            raise ValueError(f"Voice '{voice}' not found in LANGUAGE_VOICE_MAP")
        start = time.time()
        context_logger = logger.bind(
            voice=voice,
            speed=speed,
            text_length=len(text),
        )
        context_logger.debug("Starting TTS generation (international) with kokoro")
        sentences = self.break_text_into_sentences(text, lang_code)
        context_logger.debug(
            "Text split into sentences",
            sentences=sentences,
            num_sentences=len(sentences),
        )

        # Generate the audio for each sentence
        audio_data = []
        captions = []
        full_audio_length = 0
        pipeline = KPipeline(
            lang_code=lang_code, repo_id="hexgrad/Kokoro-82M", device=device.type
        )
        for sentence in sentences:
            context_logger.debug(
                "Processing sentence",
                sentence=sentence,
                voice=voice,
                speed=speed,
            )
            generator = pipeline(sentence, voice=voice, speed=speed)

            for result in generator:
                context_logger.debug("Generated audio for sentence")
                data = result.audio
                audio_length = len(data) / 24000  # Kokoro outputs 24 kHz audio
                audio_data.append(data)
                # No per-token timestamps here, so each caption spans the whole sentence.
                captions.append(
                    {
                        "text": sentence,
                        "start_ts": full_audio_length,
                        "end_ts": full_audio_length + audio_length,
                    }
                )
                full_audio_length += audio_length

        context_logger = context_logger.bind(
            execution_time=time.time() - start,
            audio_length=full_audio_length,
            speedup=full_audio_length / (time.time() - start),
        )
        context_logger.debug("TTS generation (international) completed with kokoro")

        # Duplicate the mono channel to produce a stereo WAV.
        audio_data = np.concatenate(audio_data)
        audio_data = np.column_stack((audio_data, audio_data))
        sf.write(output_path, audio_data, 24000, format="WAV")
        return captions, full_audio_length

    def kokoro_english(
        self, text: str, output_path: str, voice="af_heart", speed=1
    ) -> tuple[List[dict], float]:
        if not text or not text.strip():
            raise ValueError("Text cannot be empty or whitespace")
        lang_code = LANGUAGE_VOICE_MAP.get(voice, {}).get("lang_code")
        if not lang_code:
            raise ValueError(f"Voice '{voice}' not found in LANGUAGE_VOICE_MAP")
        if lang_code != "a":
            raise NotImplementedError(
                f"TTS for language code '{lang_code}' is not implemented."
            )
        start = time.time()

        context_logger = logger.bind(
            voice=voice,
            speed=speed,
            text_length=len(text),
            device=device.type,
        )

        context_logger.debug("Starting TTS generation with kokoro")
        pipeline = KPipeline(
            lang_code=lang_code, repo_id="hexgrad/Kokoro-82M", device=device.type
        )

        generator = pipeline(text, voice=voice, speed=speed)

        captions = []
        audio_data = []
        full_audio_length = 0
        for result in generator:
            data = result.audio
            audio_length = len(data) / 24000  # Kokoro outputs 24 kHz audio
            audio_data.append(data)
            if result.tokens:
                for t in result.tokens:
                    if t.start_ts is None or t.end_ts is None:
                        # Tokens without timestamps are merged into the previous caption.
                        if captions:
                            captions[-1]["text"] += t.text
                            captions[-1]["end_ts"] = full_audio_length + audio_length
                        continue
                    try:
                        captions.append(
                            {
                                "text": t.text,
                                "start_ts": full_audio_length + t.start_ts,
                                "end_ts": full_audio_length + t.end_ts,
                            }
                        )
                    except Exception as e:
                        logger.error("Error processing token: {}, Error: {}", t, e)
                        raise ValueError(f"Error processing token: {t}, Error: {e}")
            full_audio_length += audio_length

        # Duplicate the mono channel to produce a stereo WAV.
        audio_data = np.concatenate(audio_data)
        audio_data = np.column_stack((audio_data, audio_data))
        sf.write(output_path, audio_data, 24000, format="WAV")
        context_logger.bind(
            execution_time=time.time() - start,
            audio_length=full_audio_length,
            speedup=full_audio_length / (time.time() - start),
            youtube_channel="https://www.youtube.com/@aiagentsaz",
        ).debug("TTS generation completed with kokoro")
        return captions, full_audio_length

    def kokoro(
        self, text: str, output_path: str, voice="af_heart", speed=1
    ) -> tuple[List[dict], float]:
        """Dispatch to the English or international Kokoro path based on the voice."""
        if not text or not text.strip():
            raise ValueError("Text cannot be empty or whitespace")
        lang_code = LANGUAGE_VOICE_MAP.get(voice, {}).get("lang_code")
        if not lang_code:
            raise ValueError(f"Voice '{voice}' not found in LANGUAGE_VOICE_MAP")
        if lang_code == "a":
            return self.kokoro_english(text, output_path, voice, speed)
        return self.kokoro_international(text, output_path, voice, lang_code, speed)

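A minimal usage sketch of the dispatcher (the text and output path are hypothetical); both paths write a stereo 24 kHz WAV and return sentence- or token-level captions:

tts = TTS()
captions, duration = tts.kokoro(
    "Hello world. This is a test.", "/tmp/narration.wav", voice="af_heart"
)
# Each caption is {"text": ..., "start_ts": ..., "end_ts": ...} with timestamps in seconds.
print(f"{duration:.2f}s of audio, {len(captions)} captions")
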
    def chatterbox(
        self,
        text: str,
        output_path: str,
        sample_audio_path: str = None,
        exaggeration=0.5,
        cfg_weight=0.5,
        temperature=0.8,
    ):
        start = time.time()
        context_logger = logger.bind(
            text_length=len(text),
            sample_audio_path=sample_audio_path,
            exaggeration=exaggeration,
            cfg_weight=cfg_weight,
            temperature=temperature,
            model="ChatterboxTTS",
            language="en-US",
            device=device.type,
        )
        context_logger.debug("Starting TTS generation with Chatterbox")
        model = ChatterboxTTS.from_pretrained(device=device.type)

        # An optional reference sample enables voice cloning.
        if sample_audio_path:
            wav = model.generate(
                text,
                audio_prompt_path=sample_audio_path,
                exaggeration=exaggeration,
                cfg_weight=cfg_weight,
                temperature=temperature,
            )
        else:
            wav = model.generate(
                text,
                exaggeration=exaggeration,
                cfg_weight=cfg_weight,
                temperature=temperature,
            )

        # Duplicate the mono channel to produce stereo output.
        if wav.dim() == 2 and wav.shape[0] == 1:
            wav = wav.repeat(2, 1)
        elif wav.dim() == 1:
            wav = wav.unsqueeze(0).repeat(2, 1)

        audio_length = wav.shape[1] / model.sr
        ta.save(output_path, wav, model.sr)
        context_logger.bind(
            execution_time=time.time() - start,
            audio_length=audio_length,
            speedup=audio_length / (time.time() - start),
            youtube_channel="https://www.youtube.com/@aiagentsaz",
        ).debug("TTS generation with Chatterbox completed")

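A minimal usage sketch for voice cloning (both paths are hypothetical; omit sample_audio_path to use the default Chatterbox voice):

tts = TTS()
tts.chatterbox(
    "Welcome back.",
    "/tmp/voiceover.wav",
    sample_audio_path="/tmp/reference_voice.wav",  # hypothetical reference clip
)
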
    def valid_kokoro_voices(self, lang_code=None) -> List[str]:
        """
        Return the valid voices for the given language code,
        or all voices if no language code is provided.
        """
        if lang_code:
            return LANGUAGE_VOICE_CONFIG.get(lang_code, [])
        return [voice for voices in LANGUAGE_VOICE_CONFIG.values() for voice in voices]
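For example (values follow directly from LANGUAGE_VOICE_CONFIG; note that lang_code here is a key such as "en-gb", not a single-letter Kokoro code):

TTS().valid_kokoro_voices("fr")   # -> ['ff_siwis']
len(TTS().valid_kokoro_voices())  # -> 49 voices across all configured languages
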
video/tts_chatterbox.py
ADDED
@@ -0,0 +1,256 @@
import os
import time
import traceback
import warnings
from typing import List, Optional

import nltk
import torch
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS
from loguru import logger

from video.config import device

# Suppress PyTorch warnings
warnings.filterwarnings("ignore")


class TTSChatterbox:
    def __init__(self):
        """Initialize ChatterboxTTS and ensure NLTK data is available."""
        self.ensure_nltk_data()
        logger.debug("ChatterboxTTS initialized")

    def ensure_nltk_data(self):
        """Ensure the NLTK punkt tokenizer is available."""
        try:
            nltk.data.find("tokenizers/punkt")
            nltk.data.find("tokenizers/punkt_tab")
            logger.debug("NLTK punkt tokenizer found")
        except LookupError:
            logger.debug("Downloading NLTK punkt tokenizer...")
            try:
                nltk.download("punkt", quiet=True)
                nltk.download("punkt_tab", quiet=True)
                logger.debug("NLTK punkt tokenizer downloaded successfully")
            except Exception as e:
                logger.error(f"Failed to download NLTK punkt tokenizer: {e}")
                raise

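Note: punkt_tab is the non-pickle tokenizer data used by newer NLTK releases, so fetching both punkt and punkt_tab keeps nltk.sent_tokenize working across NLTK versions.
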
    def split_text_into_chunks(self, text: str, max_chars_per_chunk: int = 300) -> List[str]:
        """Split text into chunks that respect sentence boundaries (sentences are never broken)."""
        try:
            sentences = nltk.sent_tokenize(text)
            # Filter out empty sentences and strip whitespace
            sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

            chunks = []
            current_chunk = ""

            for sentence in sentences:
                # If adding this sentence would exceed the limit, finalize the current chunk
                if current_chunk and len(current_chunk) + len(sentence) + 1 > max_chars_per_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = sentence
                else:
                    # Otherwise append the sentence to the current chunk
                    current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence

            # Add the last chunk if it's not empty
            if current_chunk.strip():
                chunks.append(current_chunk.strip())

            logger.debug(
                f"Text split into {len(chunks)} chunks "
                f"(max {max_chars_per_chunk} chars each, preserving sentences)"
            )
            return chunks
        except Exception as e:
            logger.error(f"Error splitting text: {e}")
            # Fallback: return the original text as a single chunk
            return [text]

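A quick sketch of the chunking behavior (hypothetical text; the greedy packing keeps whole sentences together):

tb = TTSChatterbox()
tb.split_text_into_chunks("One. Two. Three.", max_chars_per_chunk=10)
# -> ['One. Two.', 'Three.']
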
    def generate_audio_chunk(
        self,
        text_chunk: str,
        model: ChatterboxTTS,
        audio_prompt_path: Optional[str] = None,
        temperature: float = 0.8,
        cfg_weight: float = 0.5,
        exaggeration: float = 0.5,
    ) -> Optional[torch.Tensor]:
        """Generate an audio tensor for a single text chunk; returns None on failure."""
        try:
            logger.debug(f"Generating audio for chunk: {text_chunk[:50]}...")

            # Use the audio prompt only if the file actually exists
            effective_prompt_path = None
            if audio_prompt_path and os.path.exists(audio_prompt_path):
                effective_prompt_path = audio_prompt_path
            elif audio_prompt_path:
                logger.warning(f"Audio prompt path not found: {audio_prompt_path}")

            # Generate audio
            wav_tensor = model.generate(
                text_chunk,
                audio_prompt_path=effective_prompt_path,
                temperature=temperature,
                cfg_weight=cfg_weight,
                exaggeration=exaggeration,
            )

            # Ensure the tensor is on CPU and in float format
            wav_tensor_cpu = wav_tensor.cpu().float()

            # Ensure the tensor is 2D: [channels, samples]
            if wav_tensor_cpu.ndim == 1:
                wav_tensor_cpu = wav_tensor_cpu.unsqueeze(0)
            elif wav_tensor_cpu.ndim > 2:
                logger.warning(f"Unexpected tensor shape {wav_tensor_cpu.shape}, attempting to fix")
                wav_tensor_cpu = wav_tensor_cpu.squeeze()
                if wav_tensor_cpu.ndim == 1:
                    wav_tensor_cpu = wav_tensor_cpu.unsqueeze(0)
                elif wav_tensor_cpu.ndim != 2 or wav_tensor_cpu.shape[0] != 1:
                    logger.error(f"Could not reshape tensor {wav_tensor.shape} to [1, N]")
                    return None

            return wav_tensor_cpu

        except Exception as e:
            logger.error(f"Error generating audio chunk: {e}")
            logger.error(traceback.format_exc())
            return None

    def text_to_speech_pipeline(
        self,
        text: str,
        model: ChatterboxTTS,
        max_chars_per_chunk: int = 1024,
        inter_chunk_silence_ms: int = 350,
        audio_prompt_path: Optional[str] = None,
        temperature: float = 0.8,
        cfg_weight: float = 0.5,
        exaggeration: float = 0.5,
    ) -> Optional[torch.Tensor]:
        """Convert text to speech with chunking support; returns the full audio tensor or None."""
        try:
            # Split text into chunks
            text_chunks = self.split_text_into_chunks(text, max_chars_per_chunk)

            if not text_chunks:
                logger.error("No text chunks to process")
                return None

            all_audio_tensors = []
            sample_rate = model.sr

            logger.debug(f"Processing {len(text_chunks)} chunks at {sample_rate} Hz")

            for i, chunk_text in enumerate(text_chunks):
                logger.debug(f"Processing chunk {i + 1}/{len(text_chunks)}")

                chunk_tensor = self.generate_audio_chunk(
                    chunk_text,
                    model,
                    audio_prompt_path,
                    temperature,
                    cfg_weight,
                    exaggeration,
                )

                if chunk_tensor is None:
                    logger.warning(f"Skipping chunk {i + 1} due to generation error")
                    continue

                all_audio_tensors.append(chunk_tensor)

                # Add silence between chunks (except after the last chunk)
                if i < len(text_chunks) - 1 and inter_chunk_silence_ms > 0:
                    silence_samples = int(sample_rate * inter_chunk_silence_ms / 1000.0)
                    silence_tensor = torch.zeros(
                        (1, silence_samples),
                        dtype=chunk_tensor.dtype,
                        device=chunk_tensor.device,
                    )
                    all_audio_tensors.append(silence_tensor)

            if not all_audio_tensors:
                logger.error("No audio tensors generated")
                return None

            # Concatenate all audio tensors along the time axis
            logger.debug("Concatenating audio tensors...")
            final_audio_tensor = torch.cat(all_audio_tensors, dim=1)

            logger.debug(f"Final audio shape: {final_audio_tensor.shape}")
            return final_audio_tensor

        except Exception as e:
            logger.error(f"Error in text-to-speech pipeline: {e}")
            logger.error(traceback.format_exc())
            return None

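Calling the pipeline directly returns a [1, N] float tensor that the caller must save; the chatterbox wrapper below handles that. A minimal sketch (the output path is hypothetical):

model = ChatterboxTTS.from_pretrained(device=device.type)
wav = TTSChatterbox().text_to_speech_pipeline("A long script...", model)
if wav is not None:
    ta.save("/tmp/out.wav", wav, model.sr)
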
    def chatterbox(
        self,
        text: str,
        output_path: str,
        sample_audio_path: Optional[str] = None,
        exaggeration=0.5,
        cfg_weight=0.5,
        temperature=0.8,
        chunk_chars: int = 1024,
        chunk_silence_ms: int = 350,
    ):
        start = time.time()
        context_logger = logger.bind(
            text_length=len(text),
            sample_audio_path=sample_audio_path,
            exaggeration=exaggeration,
            cfg_weight=cfg_weight,
            temperature=temperature,
            model="ChatterboxTTS",
            language="en-US",
            device=device.type,
        )
        context_logger.debug("Starting TTS generation with Chatterbox")
        model = ChatterboxTTS.from_pretrained(device=device.type)

        # The pipeline accepts audio_prompt_path=None, so the cloned-voice and
        # default-voice cases go through the same call.
        wav = self.text_to_speech_pipeline(
            text,
            model,
            audio_prompt_path=sample_audio_path,
            temperature=temperature,
            cfg_weight=cfg_weight,
            exaggeration=exaggeration,
            max_chars_per_chunk=chunk_chars,
            inter_chunk_silence_ms=chunk_silence_ms,
        )
        if wav is None:
            raise RuntimeError("Chatterbox TTS pipeline produced no audio")

        # Duplicate the mono channel to produce stereo output.
        if wav.dim() == 2 and wav.shape[0] == 1:
            wav = wav.repeat(2, 1)
        elif wav.dim() == 1:
            wav = wav.unsqueeze(0).repeat(2, 1)

        audio_length = wav.shape[1] / model.sr
        ta.save(output_path, wav, model.sr)
        context_logger.bind(
            execution_time=time.time() - start,
            audio_length=audio_length,
            speedup=audio_length / (time.time() - start),
            youtube_channel="https://www.youtube.com/@aiagentsaz",
        ).debug("TTS generation with Chatterbox completed")
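A minimal end-to-end sketch (the text and path are hypothetical):

tts = TTSChatterbox()
tts.chatterbox(
    "This long script will be split into sentence-preserving chunks.",
    "/tmp/chatterbox_voiceover.wav",
    chunk_chars=300,       # smaller chunks, more frequent pauses
    chunk_silence_ms=350,  # silence inserted between chunks
)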