sam12345324 committed on
Commit 0c8f7e3 · verified · 1 parent: 1d98c06

Upload 26 files
.dockerignore ADDED
@@ -0,0 +1,55 @@
+ # Git
+ .git
+ .gitignore
+
+ # Docker
+ Dockerfile
+ .dockerignore
+ ._.DS_Store
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ .pytest_cache
+
+ # Generated files
+ *.mp4
+ !bgvideo.mp4
+ !bgvideo2.mp4
+ !video.mp4
+ !video2.mp4
+ *.wav
+ !0.wav
+ multi.mp4
+ tmp/
+ !captions/
+
+ # Editor files
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Temporary files
+ .DS_Store
+ .cache/
+ *.tmp
+ *.bak
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/anton.ttf filter=lfs diff=lfs merge=lfs -text
+ assets/icon_volume.png filter=lfs diff=lfs merge=lfs -text
+ assets/noto_hindi.ttf filter=lfs diff=lfs merge=lfs -text
+ assets/noto.ttf filter=lfs diff=lfs merge=lfs -text
+ assets/person.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,8 @@
+ __pycache__
+ .hypothesis
+ .venv
+ media
+ tmp
+ captions
+ .DS_Store
+ ._.DS_Store
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ RUN apt-get update && apt-get install -y \
+     fonts-ebgaramond \
+     ffmpeg \
+     libsndfile1 \
+     fonts-dejavu \
+     build-essential \
+     g++ \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY api_server /app/api_server
+ COPY utils /app/utils
+ COPY video /app/video
+ COPY server.py /app/server.py
+
+ ENV PYTHONUNBUFFERED=1
+
+ CMD ["fastapi", "run", "server.py", "--host", "0.0.0.0", "--port", "8000"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 David Gyori
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,10 +1,59 @@
- ---
- title: Shortsrender
- emoji: 📚
- colorFrom: blue
- colorTo: green
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # AI Agents A-Z No-Code Tools (V1)
+
+ Video editing tools to use with no-code tools like n8n, Zapier, and Make. Brought to you by [AI Agents A-Z](https://www.youtube.com/@aiagentsaz).
+
+ ## [📚 Join our Skool community for the premium edition of the server and other premium content](https://www.skool.com/ai-agents-az/about)
+
+ [Watch the YouTube video featuring this project](https://www.youtube.com/watch?v=1-UuldAM6fQ)
+
+ ### Be part of a growing community and help us create more content like this
+
+ # Starting the project
+
+ ## Using Docker
+
+ ```
+ docker run --rm -p 8000:8000 -it gyoridavid/ai-agents-no-code-tools:latest
+ ```
+
+ If you have an NVIDIA GPU and the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) installed, you can run the server with GPU support:
+
+ ```
+ docker run --rm --gpus=all -e NVIDIA_VISIBLE_DEVICES=all -e NVIDIA_DRIVER_CAPABILITIES=all -p 8000:8000 -it gyoridavid/ai-agents-no-code-tools:latest-cuda
+ ```
+
+ ## With Python
+
+ 1. Clone the repository
+ 2. Create a virtual environment
+    ```bash
+    python -m venv venv
+    ```
+ 3. Activate the virtual environment
+    - On Windows:
+      ```bash
+      venv\Scripts\activate
+      ```
+    - On macOS/Linux:
+      ```bash
+      source venv/bin/activate
+      ```
+ 4. Install the dependencies
+    ```bash
+    pip install -r requirements.txt
+    ```
+ 5. Run the application
+    ```bash
+    fastapi dev server.py --host 0.0.0.0
+    ```
+
+ # Documentation
+
+ After starting the project, you can access the documentation at [http://localhost:8000/docs](http://localhost:8000/docs).
+
+ # Contributing
+
+ While PRs are welcome, please note that due to the nature of the project, I may not be able to review them in a timely manner. If you have any questions or suggestions, feel free to open an issue.
+
+ # License
+
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
api_server/auth_middleware.py ADDED
@@ -0,0 +1,27 @@
+ from fastapi import Request, status
+ from fastapi.responses import JSONResponse
+ from loguru import logger
+ import os
+
+
+ auth_tokens = os.getenv("AUTH_TOKENS", "").split(",") if os.getenv("AUTH_TOKENS") else []
+
+ async def auth_middleware(request: Request, call_next):
+     # skip authentication if the auth_tokens list is empty
+     if not auth_tokens:
+         return await call_next(request)
+     # authenticate all requests except the /health endpoint
+     if request.url.path != "/health":
+         auth_token = request.headers.get("Authorization")
+         logger.bind(
+             path=request.url.path,
+             method=request.method,
+             auth_token=auth_token,
+         ).debug("Received request")
+         if not auth_token or auth_token not in auth_tokens:
+             return JSONResponse(
+                 status_code=status.HTTP_401_UNAUTHORIZED,
+                 content={"error": "Unauthorized"},
+             )
+     response = await call_next(request)
+     return response
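
Worth noting for clients: the middleware matches the raw `Authorization` header value against the comma-separated `AUTH_TOKENS` entries, with no `Bearer ` prefix stripping, so the token must be sent verbatim. A minimal client sketch, assuming a local server started with a hypothetical `AUTH_TOKENS=secret1`:

```python
import requests

# hypothetical token; must exactly match one of the AUTH_TOKENS values
headers = {"Authorization": "secret1"}

# /health is exempt from authentication
print(requests.get("http://localhost:8000/health").json())

# every other route requires the header; without it the server returns 401
resp = requests.get("http://localhost:8000/", headers=headers)
print(resp.status_code, resp.json())
```
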
api_server/v1_media_router.py ADDED
@@ -0,0 +1,755 @@
+ from fastapi import Query, status, APIRouter, UploadFile, File, Form, BackgroundTasks
+ from fastapi.responses import JSONResponse, StreamingResponse
+ from typing import Literal, Optional
+ import os
+ from loguru import logger
+ import matplotlib.font_manager as fm
+
+ from video.tts import TTS
+ from video.tts_chatterbox import TTSChatterbox
+ from video.stt import STT
+ from video.storage import Storage
+ from video.caption import Caption
+ from video.media import MediaUtils
+ from video.builder import VideoBuilder
+ from utils.image import resize_image_cover
+
+ CHUNK_SIZE = 1024 * 1024 * 10  # 10MB chunks
+
+ def iterfile(path: str):
+     with open(path, mode="rb") as file:
+         while chunk := file.read(CHUNK_SIZE):
+             yield chunk
+
+
+ v1_media_api_router = APIRouter()
+
+ storage_path = os.getenv("STORAGE_PATH", os.path.join(os.path.abspath(os.getcwd()), "media"))
+
+ storage = Storage(
+     storage_path=storage_path,
+ )
+ stt = STT()
+ tts_manager = TTS()
+ tts_chatterbox = TTSChatterbox()
+
+ @v1_media_api_router.post("/audio-tools/transcribe")
+ def transcribe(
+     audio_file: UploadFile = File(..., description="Audio file to transcribe"),
+     language: Optional[str] = Form(None, description="Language code (optional)"),
+ ):
+     """
+     Transcribe an audio file to text.
+     """
+     logger.bind(language=language, filename=audio_file.filename).info(
+         "Transcribing audio file"
+     )
+     captions, duration = stt.transcribe(audio_file.file, beam_size=5, language=language)
+     transcription = "".join([cap["text"] for cap in captions])
+
+     return {
+         "transcription": transcription,
+         "duration": duration,
+     }
+
+ @v1_media_api_router.get("/audio-tools/tts/kokoro/voices")
+ def get_kokoro_voices():
+     voices = tts_manager.valid_kokoro_voices()
+     return {"voices": voices}
+
+
+ @v1_media_api_router.post("/audio-tools/tts/kokoro")
+ def generate_kokoro_tts(
+     background_tasks: BackgroundTasks,
+     text: str = Form(..., description="Text to convert to speech"),
+     voice: Optional[str] = Form(None, description="Voice name for kokoro TTS"),
+     speed: Optional[float] = Form(None, description="Speed for kokoro TTS"),
+ ):
+     """
+     Generate audio from text using the kokoro TTS engine.
+     """
+     if not voice:
+         voice = "af_heart"
+     voices = tts_manager.valid_kokoro_voices()
+     if voice not in voices:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": f"Invalid voice: {voice}. Valid voices: {voices}"},
+         )
+     audio_id, audio_path = storage.create_media_filename_with_id(
+         media_type="audio", file_extension=".wav"
+     )
+     tmp_file_id = storage.create_tmp_file(audio_id)
+
+     def bg_task():
+         tts_manager.kokoro(
+             text=text,
+             output_path=audio_path,
+             voice=voice,
+             speed=speed if speed else 1.0,
+         )
+         storage.delete_media(tmp_file_id)
+
+     logger.info(f"Adding background task for TTS generation with ID: {audio_id}")
+     background_tasks.add_task(bg_task)
+     logger.info(f"Background task added for TTS generation with ID: {audio_id}")
+
+     return {"file_id": audio_id}
+
+
+ @v1_media_api_router.post("/audio-tools/tts/chatterbox")
+ def generate_chatterbox_tts(
+     background_tasks: BackgroundTasks,
+     text: str = Form(..., description="Text to convert to speech"),
+     sample_audio_id: Optional[str] = Form(
+         None, description="Sample audio ID for voice cloning"
+     ),
+     sample_audio_file: Optional[UploadFile] = File(
+         None, description="Sample audio file for voice cloning"
+     ),
+     exaggeration: Optional[float] = Form(
+         0.5, description="Exaggeration factor for voice cloning (default: 0.5)"
+     ),
+     cfg_weight: Optional[float] = Form(0.5, description="CFG weight for voice cloning (default: 0.5)"),
+     temperature: Optional[float] = Form(
+         0.8, description="Temperature for voice cloning (default: 0.8)"
+     ),
+     chunk_chars: Optional[int] = Form(1024, description="Max characters per chunk (default: 1024)"),
+     chunk_silence_ms: Optional[int] = Form(
+         350, description="Silence duration between chunks in milliseconds (default: 350)"
+     )
+ ):
+     """
+     Generate audio from text using Chatterbox TTS.
+     """
+     audio_id, audio_path = storage.create_media_filename_with_id(
+         media_type="audio", file_extension=".wav"
+     )
+
+     sample_audio_path = None
+     if sample_audio_file:
+         if not sample_audio_file.filename.endswith(".wav"):
+             return JSONResponse(
+                 status_code=status.HTTP_400_BAD_REQUEST,
+                 content={"error": "Sample audio file must be a .wav file."},
+             )
+         sample_audio_id = storage.upload_media(
+             media_type="tmp",
+             media_data=sample_audio_file.file.read(),
+             file_extension=".wav",
+         )
+         sample_audio_path = storage.get_media_path(sample_audio_id)
+     elif sample_audio_id:
+         if not storage.media_exists(sample_audio_id):
+             return JSONResponse(
+                 status_code=status.HTTP_404_NOT_FOUND,
+                 content={"error": f"Sample audio with ID {sample_audio_id} not found."},
+             )
+         sample_audio_path = storage.get_media_path(sample_audio_id)
+
+     tmp_file_id = storage.create_tmp_file(audio_id)
+
+     def bg_task():
+         try:
+             tts_chatterbox.chatterbox(
+                 text=text,
+                 output_path=audio_path,
+                 sample_audio_path=sample_audio_path,
+                 exaggeration=exaggeration,
+                 cfg_weight=cfg_weight,
+                 temperature=temperature,
+                 chunk_chars=chunk_chars,
+                 chunk_silence_ms=chunk_silence_ms,
+             )
+         except Exception as e:
+             logger.error(f"Error in Chatterbox TTS: {e}")
+         finally:
+             storage.delete_media(tmp_file_id)
+
+     logger.info(f"Adding background task for Chatterbox TTS generation with ID: {audio_id}")
+     background_tasks.add_task(bg_task)
+     logger.info(f"Background task added for Chatterbox TTS generation with ID: {audio_id}")
+
+     return {"file_id": audio_id}
+
+
+ @v1_media_api_router.post("/storage")
+ def upload_file(
+     file: Optional[UploadFile] = File(None, description="File to upload"),
+     url: Optional[str] = Form(None, description="URL of the file to upload (optional)"),
+     media_type: Literal["image", "video", "audio"] = Form(
+         ..., description="Type of media being uploaded"
+     ),
+ ):
+     """
+     Upload a file and return its ID.
+     """
+     if media_type not in ["image", "video", "audio"]:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": f"Invalid media type: {media_type}"},
+         )
+     if file:
+         file_id = storage.upload_media(
+             media_type=media_type,
+             media_data=file.file.read(),
+             file_extension=os.path.splitext(file.filename)[1],
+         )
+
+         return {"file_id": file_id}
+     elif url:
+         if not storage.is_valid_url(url):
+             return JSONResponse(
+                 status_code=status.HTTP_400_BAD_REQUEST,
+                 content={"error": f"Invalid URL: {url}"},
+             )
+         file_id = storage.upload_media_from_url(media_type=media_type, url=url)
+         return {"file_id": file_id}
+     # neither a file nor a URL was supplied
+     return JSONResponse(
+         status_code=status.HTTP_400_BAD_REQUEST,
+         content={"error": "Either a file or a URL must be provided."},
+     )
+
+
+ @v1_media_api_router.get("/storage/{file_id}")
+ def download_file(file_id: str):
+     """
+     Download a file by its ID.
+     """
+     if not storage.media_exists(file_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"File with ID {file_id} not found."},
+         )
+
+     file_path = storage.get_media_path(file_id)
+     return StreamingResponse(
+         iterfile(file_path),
+         media_type="application/octet-stream",
+         headers={
+             "Content-Disposition": f"attachment; filename={os.path.basename(file_path)}"
+         },
+     )
+
+
+ @v1_media_api_router.delete("/storage/{file_id}")
+ def delete_file(file_id: str):
+     """
+     Delete a file by its ID.
+     """
+     if storage.media_exists(file_id):
+         storage.delete_media(file_id)
+     return {"status": "success"}
+
+
+ @v1_media_api_router.get("/storage/{file_id}/status")
+ def file_status(file_id: str):
+     """
+     Check the status of a file by its ID.
+     """
+     tmp_id = storage.create_tmp_file_id(file_id)
+     if storage.media_exists(tmp_id):
+         return {"status": "processing"}
+     elif storage.media_exists(file_id):
+         return {"status": "ready"}
+     return {"status": "not_found"}
+
+
+ @v1_media_api_router.post("/video-tools/merge")
+ def merge_videos(
+     background_tasks: BackgroundTasks,
+     video_ids: str = Form(..., description="Comma-separated list of video IDs to merge"),
+     background_music_id: Optional[str] = Form(
+         None, description="Background music ID (optional)"
+     ),
+     background_music_volume: Optional[float] = Form(
+         0.5, description="Volume for background music (0.0 to 1.0)"
+     ),
+ ):
+     """
+     Merge multiple videos into one.
+     """
+     video_ids = video_ids.split(",") if video_ids else []
+     if not video_ids:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": "At least one video ID is required."},
+         )
+
+     merged_video_id, merged_video_path = storage.create_media_filename_with_id(
+         media_type="video", file_extension=".mp4"
+     )
+
+     video_paths = []
+     for video_id in video_ids:
+         if not storage.media_exists(video_id):
+             return JSONResponse(
+                 status_code=status.HTTP_404_NOT_FOUND,
+                 content={"error": f"Video with ID {video_id} not found."},
+             )
+         video_paths.append(storage.get_media_path(video_id))
+
+     if background_music_id and not storage.media_exists(background_music_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={
+                 "error": f"Background music with ID {background_music_id} not found."
+             },
+         )
+     background_music_path = (
+         storage.get_media_path(background_music_id) if background_music_id else None
+     )
+
+     utils = MediaUtils()
+
+     temp_file_id = storage.create_tmp_file(merged_video_id)
+
+     def bg_task():
+         utils.merge_videos(
+             video_paths=video_paths,
+             output_path=merged_video_path,
+             background_music_path=background_music_path,
+             background_music_volume=background_music_volume,
+         )
+         storage.delete_media(temp_file_id)
+
+     logger.info(f"Adding background task for video merge with ID: {merged_video_id}")
+     background_tasks.add_task(bg_task)
+     logger.info(f"Background task added for video merge with ID: {merged_video_id}")
+
+     return {"file_id": merged_video_id}
+
+
+ @v1_media_api_router.get('/fonts')
+ def list_fonts():
+     fonts = set()
+     for fname in fm.findSystemFonts(fontpaths=None, fontext='ttf'):
+         try:
+             prop = fm.FontProperties(fname=fname)
+             name = prop.get_name()
+             fonts.add(name)
+         except RuntimeError:
+             continue
+     return {"fonts": sorted(fonts)}
+
+ @v1_media_api_router.post("/video-tools/generate/tts-captioned-video")
+ def generate_captioned_video(
+     background_tasks: BackgroundTasks,
+     background_id: str = Form(..., description="Background image ID"),
+     text: Optional[str] = Form(None, description="Text to generate video from"),
+     width: Optional[int] = Form(1080, description="Width of the video (default: 1080)"),
+     height: Optional[int] = Form(
+         1920, description="Height of the video (default: 1920)"
+     ),
+     audio_id: Optional[str] = Form(
+         None, description="Audio ID for the video (optional)"
+     ),
+     kokoro_voice: Optional[str] = Form(
+         "af_heart", description="Voice for kokoro TTS (default: af_heart)"
+     ),
+     kokoro_speed: Optional[float] = Form(
+         1.0, description="Speed for kokoro TTS (default: 1.0)"
+     ),
+     language: Optional[str] = Form(
+         None, description="Language code for STT (optional, e.g. 'en', 'fr', 'de'); defaults to None (auto-detect language if audio_id is provided)"
+     ),
+     image_effect: Optional[str] = Form("ken_burns", description="Effect to apply to the background image; options: ken_burns, pan (default: 'ken_burns')"),
+     # Flattened subtitle configuration options
+     caption_config_line_count: Optional[int] = Form(1, description="Number of lines per subtitle segment (default: 1)", ge=1, le=5),
+     caption_config_line_max_length: Optional[int] = Form(1, description="Maximum characters per line (default: 1)", ge=1, le=200),
+     caption_config_font_size: Optional[int] = Form(120, description="Font size for subtitles (default: 120)", ge=8, le=200),
+     caption_config_font_name: Optional[str] = Form("Arial", description="Font family name (default: 'Arial'; see the available fonts from the /fonts endpoint)"),
+     caption_config_font_bold: Optional[bool] = Form(True, description="Whether to use bold font (default: True)"),
+     caption_config_font_italic: Optional[bool] = Form(False, description="Whether to use italic font (default: False)"),
+     caption_config_font_color: Optional[str] = Form("#fff", description="Font color in hex format (default: '#fff')"),
+     caption_config_subtitle_position: Optional[Literal["top", "center", "bottom"]] = Form("top", description="Vertical position of subtitles (default: 'top')"),
+     caption_config_shadow_color: Optional[str] = Form("#000", description="Shadow color in hex format (default: '#000')"),
+     caption_config_shadow_transparency: Optional[float] = Form(0.4, description="Shadow transparency from 0.0 to 1.0 (default: 0.4)", ge=0.0, le=1.0),
+     caption_config_shadow_blur: Optional[int] = Form(10, description="Shadow blur radius (default: 10)", ge=0, le=20),
+     caption_config_stroke_color: Optional[str] = Form(None, description="Stroke/outline color in hex format (falls back to '#000' when unset)"),
+     caption_config_stroke_size: Optional[int] = Form(5, description="Stroke/outline size (default: 5)", ge=0, le=10),
+ ):
+     """
+     Generate a captioned video from text and a background image.
+     """
+     # Build subtitle options from the individual parameters
+     parsed_subtitle_options = {}
+
+     # Only include non-None values
+     if caption_config_line_count is not None:
+         parsed_subtitle_options['lines'] = caption_config_line_count
+     if caption_config_line_max_length is not None:
+         parsed_subtitle_options['max_length'] = caption_config_line_max_length
+     if caption_config_font_size is not None:
+         parsed_subtitle_options['font_size'] = caption_config_font_size
+     if caption_config_font_name is not None:
+         parsed_subtitle_options['font_name'] = caption_config_font_name
+     if caption_config_font_bold is not None:
+         parsed_subtitle_options['font_bold'] = caption_config_font_bold
+     if caption_config_font_italic is not None:
+         parsed_subtitle_options['font_italic'] = caption_config_font_italic
+     if caption_config_font_color is not None:
+         parsed_subtitle_options['font_color'] = caption_config_font_color
+     if caption_config_subtitle_position is not None:
+         parsed_subtitle_options['subtitle_position'] = caption_config_subtitle_position
+     if caption_config_shadow_color is not None:
+         parsed_subtitle_options['shadow_color'] = caption_config_shadow_color
+     if caption_config_shadow_transparency is not None:
+         parsed_subtitle_options['shadow_transparency'] = caption_config_shadow_transparency
+     if caption_config_shadow_blur is not None:
+         parsed_subtitle_options['shadow_blur'] = caption_config_shadow_blur
+     if caption_config_stroke_color is not None:
+         parsed_subtitle_options['stroke_color'] = caption_config_stroke_color
+     if caption_config_stroke_size is not None:
+         parsed_subtitle_options['stroke_size'] = caption_config_stroke_size
+
+     if audio_id and not storage.media_exists(audio_id):
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": f"Audio with ID {audio_id} not found."},
+         )
+     if not audio_id and kokoro_voice not in tts_manager.valid_kokoro_voices():
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": f"Invalid voice: {kokoro_voice}."},
+         )
+     media_type = storage.get_media_type(background_id)
+     if media_type not in ["image"]:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": f"Invalid media type: {media_type}"},
+         )
+     if not storage.media_exists(background_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Background image with ID {background_id} not found."},
+         )
+
+     output_id, output_path = storage.create_media_filename_with_id(
+         media_type="video", file_extension=".mp4"
+     )
+     dimensions = (width, height)
+     builder = VideoBuilder(
+         dimensions=dimensions,
+     )
+     builder.set_media_utils(MediaUtils())
+
+     tmp_file_id = storage.create_tmp_file(output_id)
+
+     def bg_task(
+         tmp_file_id: str = tmp_file_id,
+     ):
+         tmp_file_ids = [tmp_file_id]
+
+         # set audio, generate captions
+         captions = None
+         tts_audio_id = audio_id
+         from video.tts import LANGUAGE_VOICE_MAP
+         lang_config = LANGUAGE_VOICE_MAP.get(kokoro_voice, {})
+         international = lang_config.get("international", False)
+
+         if tts_audio_id:
+             audio_path = storage.get_media_path(tts_audio_id)
+             captions = stt.transcribe(audio_path=audio_path, language=language)[0]
+             builder.set_audio(audio_path)
+         # generate TTS and set audio
+         else:
+             tts_audio_id, audio_path = storage.create_media_filename_with_id(
+                 media_type="audio", file_extension=".wav"
+             )
+             tmp_file_ids.append(tts_audio_id)
+             captions = tts_manager.kokoro(
+                 text=text,
+                 output_path=audio_path,
+                 voice=kokoro_voice,
+                 speed=kokoro_speed,
+             )[0]
+             if international:
+                 # use whisper to create captions
+                 iso_lang_code = lang_config.get("iso639_1")
+                 captions = stt.transcribe(audio_path=audio_path, language=iso_lang_code)[0]
+
+             builder.set_audio(audio_path)
+
+         # create subtitle
+         captionsManager = Caption()
+         subtitle_id, subtitle_path = storage.create_media_filename_with_id(
+             media_type="tmp", file_extension=".ass"
+         )
+         tmp_file_ids.append(subtitle_id)
+
+         # create segments based on language
+         if international:
+             segments = captionsManager.create_subtitle_segments_english(
+                 captions=captions,
+                 lines=parsed_subtitle_options.get('lines', 1),
+                 max_length=parsed_subtitle_options.get('max_length', 1),
+             )
+         else:
+             segments = captionsManager.create_subtitle_segments_international(
+                 captions=captions,
+                 lines=parsed_subtitle_options.get('lines', 1),
+                 max_length=parsed_subtitle_options.get('max_length', 1),
+             )
+
+         captionsManager.create_subtitle(
+             segments=segments,
+             output_path=subtitle_path,
+             dimensions=dimensions,
+             font_size=parsed_subtitle_options.get('font_size', 120),
+             shadow_blur=parsed_subtitle_options.get('shadow_blur', 10),
+             stroke_size=parsed_subtitle_options.get('stroke_size', 5),
+             shadow_color=parsed_subtitle_options.get('shadow_color', "#000"),
+             stroke_color=parsed_subtitle_options.get('stroke_color', "#000"),
+             font_name=parsed_subtitle_options.get('font_name', "Arial"),
+             font_bold=parsed_subtitle_options.get('font_bold', True),
+             font_italic=parsed_subtitle_options.get('font_italic', False),
+             subtitle_position=parsed_subtitle_options.get('subtitle_position', "top"),
+             font_color=parsed_subtitle_options.get('font_color', "#fff"),
+             shadow_transparency=parsed_subtitle_options.get('shadow_transparency', 0.4),
+         )
+         builder.set_captions(
+             file_path=subtitle_path,
+         )
+
+         # resize background image if needed
+         background_path = storage.get_media_path(background_id)
+         utils = MediaUtils()
+         info = utils.get_video_info(background_path)
+         if info.get("width", 0) != width or info.get("height", 0) != height:
+             logger.bind(
+                 image_width=info.get("width", 0),
+                 image_height=info.get("height", 0),
+                 target_width=width,
+                 target_height=height,
+             ).debug(
+                 "Resizing background image to fit video dimensions"
+             )
+             _, resized_background_path = storage.create_media_filename_with_id(
+                 media_type="image", file_extension=".jpg"
+             )
+             resize_image_cover(
+                 image_path=background_path,
+                 output_path=resized_background_path,
+                 target_width=width,
+                 target_height=height,
+             )
+             background_path = resized_background_path
+
+         builder.set_background_image(
+             background_path,
+             effect_config={
+                 "effect": image_effect,
+             }
+         )
+
+         builder.set_output_path(output_path)
+
+         builder.execute()
+
+         for tmp_file_id in tmp_file_ids:
+             if storage.media_exists(tmp_file_id):
+                 storage.delete_media(tmp_file_id)
+
+     logger.info(f"Adding background task for captioned video generation with ID: {output_id}")
+     background_tasks.add_task(bg_task, tmp_file_id=tmp_file_id)
+     logger.info(f"Background task added for captioned video generation with ID: {output_id}")
+
+     return {
+         "file_id": output_id,
+     }
+
+ # https://ffmpeg.org/ffmpeg-filters.html#colorkey
+ @v1_media_api_router.post("/video-tools/add-colorkey-overlay")
+ def add_colorkey_overlay(
+     background_tasks: BackgroundTasks,
+     video_id: str = Form(..., description="Video ID to overlay"),
+     overlay_video_id: str = Form(..., description="Overlay video ID"),
+     color: Optional[str] = Form(
+         "green", description="Set the color for which alpha will be set to 0 (full transparency). Use the name of the color or a hex code (e.g. 'red' or '#ff0000')"
+     ),
+     similarity: Optional[float] = Form(
+         0.1, description="Set the radius from the key color within which other colors also have full transparency (default: 0.1)"
+     ),
+     blend: Optional[float] = Form(
+         0.1, description="Set how the alpha value for pixels that fall outside the similarity radius is computed (default: 0.1)"
+     ),
+ ):
+     """
+     Overlay a video on a video with the specified colorkey and intensity.
+     """
+     if not storage.media_exists(video_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Video with ID {video_id} not found."},
+         )
+     if not storage.media_exists(overlay_video_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Overlay video with ID {overlay_video_id} not found."},
+         )
+
+     video_path = storage.get_media_path(video_id)
+     overlay_video_path = storage.get_media_path(overlay_video_id)
+
+     output_id, output_path = storage.create_media_filename_with_id(
+         media_type="video", file_extension=".mp4"
+     )
+
+     tmp_file_id = storage.create_tmp_file(output_id)
+
+     def bg_task():
+         utils = MediaUtils()
+         utils.colorkey_overlay(
+             input_video_path=video_path,
+             overlay_video_path=overlay_video_path,
+             output_video_path=output_path,
+             color=color,
+             similarity=similarity,
+             blend=blend,
+         )
+         storage.delete_media(tmp_file_id)
+
+     logger.info(f"Adding background task for colorkey overlay with ID: {output_id}")
+     background_tasks.add_task(bg_task)
+     logger.info(f"Background task added for colorkey overlay with ID: {output_id}")
+
+     return {
+         "file_id": output_id,
+     }
+
+ @v1_media_api_router.get("/video-tools/extract-frame/{video_id}")
+ def extract_frame(
+     video_id: str,
+     timestamp: Optional[float] = Query(1.0, description="Timestamp in seconds to extract the frame from (default: 1.0)")
+ ):
+     """
+     Extract a frame from a video at a specified timestamp.
+
+     Args:
+         video_id: Video ID to extract the frame from
+         timestamp: Optional timestamp in seconds to extract the frame from (default: 1.0)
+     """
+     if not storage.media_exists(video_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Video with ID {video_id} not found."},
+         )
+
+     video_path = storage.get_media_path(video_id)
+
+     _, output_path = storage.create_media_filename_with_id(
+         media_type="image", file_extension=".jpg"
+     )
+
+     utils = MediaUtils()
+     video_info = utils.get_video_info(video_path)
+     if video_info.get("duration", 0) <= float(timestamp):
+         timestamp = video_info.get("duration", 0) - 0.3
+
+     success = utils.extract_frame(
+         video_path=video_path,
+         output_path=output_path,
+         time_seconds=timestamp,
+     )
+
+     if not success:
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={"error": "Failed to extract frame from video."},
+         )
+
+     # Load the file into memory
+     with open(output_path, "rb") as file:
+         file_data = file.read()
+
+     # Remove the output file
+     os.remove(output_path)
+
+     # Create a streaming response with appropriate headers
+     from io import BytesIO
+     return StreamingResponse(
+         BytesIO(file_data),
+         media_type="image/jpeg",
+         headers={
+             "Content-Disposition": f"attachment; filename=frame_{video_id}_{timestamp or 'first'}.jpg"
+         },
+     )
+
+ # extract x number of frames from the video, equally spaced
+ @v1_media_api_router.post('/video-tools/extract-frames')
+ def extract_frame_from_url(
+     url: str = Form(..., description="URL of the video to extract frames from"),
+     amount: int = Form(5, description="Number of frames to extract from the video (default: 5)"),
+     length_seconds: Optional[float] = Form(None, description="Length of the video in seconds (optional)"),
+     stitch: Optional[bool] = Form(False, description="Whether to stitch the frames into a single image (default: False)")
+ ):
+     template_id, template_path = storage.create_media_template(
+         media_type="image", file_extension=".jpg"
+     )
+     utils = MediaUtils()
+
+     if not length_seconds:
+         video_info = utils.get_video_info(url)
+         length_seconds = video_info.get("duration", 0)
+
+     utils.extract_frames(
+         video_path=url,
+         length_seconds=length_seconds,
+         amount=amount,
+         output_template=template_path,
+     )
+
+     image_ids = []
+     for i in range(amount):
+         padded_index = str(i + 1).zfill(2)
+         image_id = template_id.replace("%02d", padded_index)
+         image_ids.append(image_id)
+
+     return {
+         "message": f"Extracted {amount} frames from the video at {url}. The frames are saved in the template directory.",
+         "template_id": template_id,
+         "template_path": template_path,
+         "image_ids": image_ids,
+     }
+
+
+ @v1_media_api_router.get("/video-tools/info/{file_id}")
+ def get_video_info(file_id: str):
+     """
+     Get information about a video file.
+     """
+     if not storage.media_exists(file_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Video with ID {file_id} not found."},
+         )
+
+     video_path = storage.get_media_path(file_id)
+
+     utils = MediaUtils()
+     info = utils.get_video_info(video_path)
+
+     return info
+
+ @v1_media_api_router.get("/audio-tools/info/{file_id}")
+ def get_audio_info(file_id: str):
+     """
+     Get information about an audio file.
+     """
+     if not storage.media_exists(file_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Audio with ID {file_id} not found."},
+         )
+
+     audio_path = storage.get_media_path(file_id)
+
+     utils = MediaUtils()
+     info = utils.get_audio_info(audio_path)
+
+     return info
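
All of the generation endpoints in this router follow the same contract: they return a `file_id` immediately, run the heavy work in a background task, and keep a temporary marker file around so `/storage/{file_id}/status` reports `processing` until the task finishes. A minimal polling sketch against the kokoro TTS endpoint (the base URL and text are assumptions; the `/api/v1/media` prefix comes from how server.py mounts this router):

```python
import time
import requests

BASE = "http://localhost:8000/api/v1/media"  # assumed local deployment

# kick off TTS generation; the server responds before the audio exists
file_id = requests.post(
    f"{BASE}/audio-tools/tts/kokoro",
    data={"text": "Hello from the no-code tools server", "voice": "af_heart"},
).json()["file_id"]

# poll until the background task deletes its tmp marker ("processing" -> "ready")
while requests.get(f"{BASE}/storage/{file_id}/status").json()["status"] == "processing":
    time.sleep(1)

# stream the finished WAV to disk
with open("speech.wav", "wb") as out:
    out.write(requests.get(f"{BASE}/storage/{file_id}").content)
```
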
api_server/v1_utils_router.py ADDED
@@ -0,0 +1,167 @@
+ import os
+ from typing import Optional
+ from fastapi import BackgroundTasks, Form, status, APIRouter
+ from fastapi.responses import JSONResponse, StreamingResponse
+ from loguru import logger
+ from video.storage import Storage
+ from youtube_transcript_api import YouTubeTranscriptApi
+
+ storage_path = os.getenv("STORAGE_PATH", os.path.join(os.path.abspath(os.getcwd()), "media"))
+
+ storage = Storage(
+     storage_path=storage_path,
+ )
+
+ v1_utils_router = APIRouter()
+ ytt_api = YouTubeTranscriptApi()
+
+ @v1_utils_router.get("/youtube-transcript")
+ def get_youtube_transcript(
+     video_id: str,
+ ):
+     """
+     Get a YouTube video transcript by video ID.
+     """
+     try:
+         fetched_transcript = ytt_api.fetch(video_id)
+         return {
+             "video_id": video_id,
+             "transcript": fetched_transcript
+         }
+     except Exception as e:
+         logger.error(f"Error fetching transcript for video {video_id}: {e}")
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Transcript for video {video_id} not found."},
+         )
+
+ @v1_utils_router.post("/stitch-images")
+ def stitch_images(
+     image_urls: str = Form(..., description="Comma-separated list of image URLs to stitch together"),
+     max_width: int = Form(1920, description="Maximum width of the final stitched image"),
+     max_height: int = Form(1080, description="Maximum height of the final stitched image"),
+ ):
+     """
+     Stitch multiple images into one.
+     """
+     if not image_urls:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": "No image URLs provided."}
+         )
+
+     image_urls = [url.strip() for url in image_urls.split(",") if url.strip()]
+
+     from utils.image import stitch_images as stitch_images_util
+     try:
+         stitched_image = stitch_images_util(image_urls, max_width, max_height)
+
+         # Convert the PIL image to JPEG format in memory
+         from io import BytesIO
+         img_buffer = BytesIO()
+         stitched_image.save(img_buffer, format='JPEG', quality=95)
+         img_buffer.seek(0)
+
+         return StreamingResponse(
+             img_buffer,
+             media_type="image/jpeg",
+             headers={
+                 "Content-Disposition": "attachment; filename=stitched.jpg"
+             },
+         )
+     except Exception as e:
+         logger.error(f"Error stitching images: {e}")
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={"error": "Failed to stitch images."}
+         )
+
+ @v1_utils_router.post("/make-image-imperfect")
+ def image_unaize(
+     background_tasks: BackgroundTasks,
+     image_id: str = Form(..., description="ID of the image to unaize"),
+     enhance_color: Optional[float] = Form(None, description="Strength of the color enhancement (0-2); 0 means black and white, 1 means no change, 2 means full color enhancement"),
+     enhance_contrast: Optional[float] = Form(None, description="Strength of the contrast enhancement (0-2)"),
+     noise_strength: int = Form(0, description="Strength of the noise to apply to the image (0-100)"),
+ ):
+     """
+     Remove AI-generated artifacts from an image.
+     """
+     if not image_id:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": "No image ID provided."}
+         )
+
+     image_path = storage.get_media_path(image_id)
+
+     jpg_id, jpg_path = storage.create_media_filename_with_id(
+         media_type="image", file_extension=".jpg"
+     )
+     tmp_file_id = storage.create_tmp_file(jpg_id)
+
+     from utils.image import make_image_imperfect
+
+     def bg_task():
+         try:
+             imperfect_image = make_image_imperfect(
+                 image_path,
+                 enhance_color=enhance_color,
+                 enhance_contrast=enhance_contrast,
+                 noise_strength=noise_strength
+             )
+             imperfect_image.save(jpg_path, format='JPEG', quality=95)
+         except Exception as e:
+             logger.error(f"Error making image imperfect: {e}")
+         finally:
+             storage.delete_media(tmp_file_id)
+
+     background_tasks.add_task(bg_task)
+     return {
+         "file_id": jpg_id,
+     }
+
+ @v1_utils_router.post("/convert/pcm/wav")
+ def convert_pcm_to_wav(
+     background_tasks: BackgroundTasks,
+     pcm_id: str = Form(..., description="ID of the PCM audio file to convert"),
+     sample_rate: int = Form(24000, description="Sample rate of the PCM audio"),
+     channels: int = Form(1, description="Number of audio channels (1 for mono, 2 for stereo)"),
+     target_sample_rate: int = Form(44100, description="Target sample rate for the WAV audio"),
+ ):
+     """
+     Convert PCM audio to WAV format.
+     """
+     if not pcm_id or not storage.media_exists(pcm_id):
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": "PCM audio file not found."}
+         )
+
+     from video.media import MediaUtils
+     utils = MediaUtils()
+
+     wav_id, wav_path = storage.create_media_filename_with_id(
+         media_type="audio", file_extension=".wav"
+     )
+     tmp_file_id = storage.create_tmp_file(wav_id)
+
+     def bg_task():
+         try:
+             utils.convert_pcm_to_wav(
+                 input_pcm_path=storage.get_media_path(pcm_id),
+                 output_wav_path=wav_path,
+                 sample_rate=sample_rate,
+                 channels=channels,
+                 target_sample_rate=target_sample_rate
+             )
+         except Exception as e:
+             logger.error(f"Error converting PCM to WAV: {e}")
+         finally:
+             storage.delete_media(tmp_file_id)
+
+     background_tasks.add_task(bg_task)
+
+     return {
+         "file_id": wav_id,
+     }
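
For the utils router (mounted at `/api/v1/utils` by server.py), a quick sketch of fetching a YouTube transcript; the host and the video ID are assumptions:

```python
import requests

BASE = "http://localhost:8000/api/v1/utils"  # assumed local deployment

# fetch the transcript of a YouTube video by its ID (hypothetical ID shown)
resp = requests.get(f"{BASE}/youtube-transcript", params={"video_id": "dQw4w9WgXcQ"})
print(resp.json())  # {"video_id": ..., "transcript": ...} or a 404 error body
```
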
assets/anton.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28beb8f6542f642ba4143bd4a1d1cfc7be7b1dedc951096efd8e0942502ea1bf
+ size 161588
assets/icon_volume.png ADDED

Git LFS Details
  • SHA256: 019d2a13e54354427b30d02f527ec3e81aa5f1af278c2045b8600dc0a4aa651a
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
assets/noto.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5cf8b2a0576d5680284ab03a7a8219499d59bbe981a79bb3dc0031f251c39736
+ size 10560616
assets/noto_hindi.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b8cac46a1c86d2533a616b1fcf4e1926b8e39bda69034508b0df96791f56d97
+ size 2044548
assets/person.png ADDED

Git LFS Details
  • SHA256: ebee7d3b260c84247653ae91731c40a2e42fe41b093a8e0002fd54fc472b7002
  • Pointer size: 132 Bytes
  • Size of remote file: 1.96 MB
cuda.Dockerfile ADDED
@@ -0,0 +1,45 @@
+ ARG CUDA=12.3.1
+ ARG OS=ubuntu22.04
+ ARG RUNIMAGE=${CUDA}-runtime-${OS}
+
+ FROM nvidia/cuda:${RUNIMAGE}
+ ARG CUDA
+ ARG OS
+ USER root
+
+ RUN apt update && apt install -y \
+     fonts-ebgaramond \
+     build-essential \
+     g++ \
+     curl \
+     wget \
+     git \
+     python3.10 \
+     python3-pip \
+     python3-dev \
+     python3.10-gdbm \
+     ffmpeg \
+     libsndfile1 \
+     fonts-dejavu \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ RUN ln -sf /usr/bin/python3 /usr/bin/python
+ RUN ln -sf /usr/bin/pip3 /usr/bin/pip
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ ENV LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/nvidia/cudnn/lib:$LD_LIBRARY_PATH
+
+ COPY api_server /app/api_server
+ COPY utils /app/utils
+ COPY video /app/video
+ COPY server.py /app/server.py
+
+ ENV PYTHONUNBUFFERED=1
+
+ EXPOSE 8000
+
+ CMD ["fastapi", "run", "server.py", "--host", "0.0.0.0", "--port", "8000"]
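
A note on the CUDA image: it only helps if the container can actually see a GPU (i.e. it was started with `--gpus=all` as in the README). A quick in-container diagnostic sketch, assuming torch is present via the torchaudio dependency:

```python
import torch

# True only when the container was started with GPU access and the driver is visible
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```
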
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ uuid
+ numpy
+ kokoro
+ soundfile
+ fastapi[standard]
+ loguru
+ chatterbox-tts >= 0.1.2
+ faster_whisper
+ torchaudio
+ requests_tor
+ requests[socks]
+ youtube-transcript-api
+ matplotlib
+ Pillow
+ nltk
+ imageio
server.py ADDED
@@ -0,0 +1,57 @@
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI, APIRouter
+ import sys
+ from loguru import logger
+
+ from api_server.auth_middleware import auth_middleware
+ from api_server.v1_utils_router import v1_utils_router
+ from api_server.v1_media_router import v1_media_api_router
+ from video.config import device
+
+ logger.remove()
+ logger.add(
+     sys.stdout,
+     colorize=True,
+     format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level> | <blue>{extra}</blue>",
+     level="DEBUG",
+ )
+
+ logger.info("This server was created by the 'AI Agents A-Z' YouTube channel")
+ logger.info("https://www.youtube.com/@aiagentsaz")
+ logger.info("Using device: {}", device)
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     logger.info("Starting up the server...")
+     yield
+     logger.info("Shutting down the server...")
+
+ app = FastAPI(lifespan=lifespan)
+
+
+ # add the auth middleware to the app; it exempts the /health endpoint
+ app.middleware("http")(auth_middleware)
+
+ @app.api_route("/", methods=["GET", "HEAD"])
+ def root():
+     return {
+         "message": "Welcome to the AI Agents A-Z No-Code Server",
+         "version": "0.3.5",
+         "documentation": "/docs",
+         "created_by": "https://www.youtube.com/@aiagentsaz"
+     }
+
+ @app.api_route("/health", methods=["GET", "HEAD"])
+ def healthcheck():
+     return {"status": "ok"}
+
+ api_router = APIRouter()
+ v1_api_router = APIRouter()
+
+ # todo: auto-delete files after 30 minutes (env var)
+
+ v1_api_router.include_router(v1_media_api_router, prefix="/media", tags=["media"])
+ v1_api_router.include_router(v1_utils_router, prefix="/utils", tags=["utils"])
+ api_router.include_router(v1_api_router, prefix="/v1", tags=["v1"])
+ app.include_router(api_router, prefix="/api", tags=["api"])
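
Given how the routers nest, media endpoints end up under `/api/v1/media/...` and utils endpoints under `/api/v1/utils/...`, while `/` and `/health` stay at the root. A small smoke test, assuming a local deployment with `AUTH_TOKENS` unset:

```python
import requests

base = "http://localhost:8000"  # assumed local deployment

print(requests.get(f"{base}/health").json())       # {'status': 'ok'}, always unauthenticated
print(requests.get(f"{base}/").json()["version"])  # '0.3.5'
```
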
utils/image.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import requests
3
+ from PIL import Image, ImageEnhance, ImageFilter, ImageDraw, ImageChops, ImageOps, ImageFont
4
+ from io import BytesIO
5
+ import math
6
+
7
+
8
+ def stitch_images(
9
+ image_urls: list[str],
10
+ max_width: int = 1920,
11
+ max_height: int = 1080
12
+ ):
13
+ """
14
+ Stitch multiple images into a single image.
15
+ Downloads images from URLs, arranges them in a grid, and resizes proportionally to fit max dimensions.
16
+
17
+ Args:
18
+ image_urls: List of image URLs to download and stitch
19
+ max_width: Maximum width of the final stitched image
20
+ max_height: Maximum height of the final stitched image
21
+
22
+ Returns:
23
+ PIL Image object of the stitched result
24
+ """
25
+ if not image_urls:
26
+ raise ValueError("No image URLs provided")
27
+
28
+ # Download and open all images
29
+ images = []
30
+ for url in image_urls:
31
+ try:
32
+ response = requests.get(url, timeout=30)
33
+ response.raise_for_status()
34
+ img = Image.open(BytesIO(response.content))
35
+ # Convert to RGB if necessary
36
+ if img.mode != 'RGB':
37
+ img = img.convert('RGB')
38
+ images.append(img)
39
+ except Exception as e:
40
+ print(f"Failed to download image from {url}: {e}")
41
+ continue
42
+
43
+ if not images:
44
+ raise ValueError("No valid images could be downloaded")
45
+
46
+ # Calculate optimal grid dimensions
47
+ num_images = len(images)
48
+ cols = math.ceil(math.sqrt(num_images))
49
+ rows = math.ceil(num_images / cols)
50
+
51
+ # Find the maximum dimensions among all images to ensure consistent sizing
52
+ max_img_width = max(img.width for img in images)
53
+ max_img_height = max(img.height for img in images)
54
+
55
+ # Calculate the size for each cell in the grid
56
+ cell_width = max_img_width
57
+ cell_height = max_img_height
58
+
59
+ # Create the stitched image canvas
60
+ canvas_width = cols * cell_width
61
+ canvas_height = rows * cell_height
62
+ stitched = Image.new('RGB', (canvas_width, canvas_height), color='white')
63
+
64
+ # Place images in the grid
65
+ for i, img in enumerate(images):
66
+ row = i // cols
67
+ col = i % cols
68
+
69
+ # Calculate position for this image
70
+ x = col * cell_width
71
+ y = row * cell_height
72
+
73
+ # Resize image to fit cell while maintaining aspect ratio
74
+ img_resized = resize_image_to_fit(img, cell_width, cell_height)
75
+
76
+ # Center the image in the cell
77
+ offset_x = (cell_width - img_resized.width) // 2
78
+ offset_y = (cell_height - img_resized.height) // 2
79
+
80
+ stitched.paste(img_resized, (x + offset_x, y + offset_y))
81
+
82
+ # Resize the final stitched image to fit within max dimensions
83
+ final_image = resize_image_to_fit(stitched, max_width, max_height)
84
+
85
+ return final_image
86
+
87
+ def resize_image_cover(
88
+ image_path: str,
89
+ target_width: int,
90
+ target_height: int,
91
+ output_path: str,
92
+ ) -> Image.Image:
93
+ """
94
+ Resize an image to fill the specified dimensions while maintaining aspect ratio.
95
+ The image is scaled to cover the entire target area and cropped to fit.
96
+
97
+ Args:
98
+ image: PIL Image object to resize
99
+ target_width: Target width
100
+ target_height: Target height
101
+
102
+ Returns:
103
+ Resized and cropped PIL Image object
104
+ """
105
+ image = Image.open(image_path)
106
+ # Calculate the scaling factor to cover the entire target area
107
+ width_ratio = target_width / image.width
108
+ height_ratio = target_height / image.height
109
+ scale_factor = max(width_ratio, height_ratio) # Use max to ensure coverage
110
+
111
+ # Scale the image
112
+ new_width = int(image.width * scale_factor)
113
+ new_height = int(image.height * scale_factor)
114
+ scaled_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
115
+
116
+ # Calculate crop box to center the image
117
+ left = (new_width - target_width) // 2
118
+ top = (new_height - target_height) // 2
119
+ right = left + target_width
120
+ bottom = top + target_height
121
+
122
+ # Crop the image to the target dimensions
123
+ cropped_image = scaled_image.crop((left, top, right, bottom))
124
+
125
+ # Convert to RGB if the image has transparency (RGBA mode)
126
+ if cropped_image.mode == 'RGBA':
127
+ # Create a white background and paste the image on it
128
+ rgb_image = Image.new('RGB', cropped_image.size, (255, 255, 255))
129
+ rgb_image.paste(cropped_image, mask=cropped_image.split()[-1]) # Use alpha channel as mask
130
+ cropped_image = rgb_image
131
+
132
+ cropped_image.save(output_path)
133
+
134
+ def resize_image_to_fit(image: Image.Image, max_width: int, max_height: int) -> Image.Image:
135
+ """
136
+ Resize an image to fit within the specified dimensions while maintaining aspect ratio.
137
+
138
+ Args:
139
+ image: PIL Image object to resize
140
+ max_width: Maximum width
141
+ max_height: Maximum height
142
+
143
+ Returns:
144
+ Resized PIL Image object
145
+ """
146
+ # Calculate the scaling factor to fit within max dimensions
147
+ width_ratio = max_width / image.width
148
+ height_ratio = max_height / image.height
149
+ scale_factor = min(width_ratio, height_ratio)
150
+
151
+ # Only resize if the image is larger than max dimensions
152
+ if scale_factor < 1:
153
+ new_width = int(image.width * scale_factor)
154
+ new_height = int(image.height * scale_factor)
155
+ return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
156
+
157
+ return image
158
+
159
+ def cup_of_coffee_tone(img):
160
+ sepia = ImageOps.colorize(img.convert("L"), "#704214", "#C0A080")
161
+ return Image.blend(img, sepia, alpha=0.2) # tweak alpha
162
+
163
+ def chromatic_aberration(img, shift=2):
164
+ r, g, b = img.split()
165
+ # Use transform with AFFINE to shift the channels
166
+ r = r.transform(img.size, Image.AFFINE, (1, 0, -shift, 0, 1, 0))
167
+ b = b.transform(img.size, Image.AFFINE, (1, 0, shift, 0, 1, 0))
168
+ return Image.merge("RGB", (r, g, b))
169
+
170
+ def make_image_imperfect(
171
+ image_path: str,
172
+ enhance_color: float = None,
173
+ enhance_contrast: float = None,
174
+ noise_strength: int = 15
175
+ ) -> Image.Image:
176
+ """
177
+ Remove AI-generated artifacts from an image.
178
+ This is a placeholder function. Actual implementation would depend on the specific algorithm used.
179
+
180
+ Args:
181
+ image_url: URL of the image to process
182
+
183
+ Returns:
184
+ PIL Image object of the processed result
185
+ """
186
+ try:
187
+ img = Image.open(image_path)
188
+
189
+ if enhance_color is not None:
190
+ img = ImageEnhance.Color(img).enhance(enhance_color)
191
+ if enhance_contrast is not None:
192
+ img = ImageEnhance.Contrast(img).enhance(enhance_contrast)
193
+
194
+ img = img.filter(ImageFilter.SHARPEN)
195
+ img = img.filter(ImageFilter.GaussianBlur(radius=0.5))
196
+
197
+ if img.mode != 'RGB':
198
+ img = img.convert('RGB')
199
+ img_array = np.array(img)
200
+ h, w, c = img_array.shape
201
+ grayscale_noise = np.random.randint(-noise_strength, noise_strength + 1, (h, w), dtype='int16')
202
+ noise = np.stack([grayscale_noise] * c, axis=2)
203
+ noisy_array = img_array.astype('int16') + noise
204
+ noisy_array = np.clip(noisy_array, 0, 255).astype('uint8')
205
+ img = Image.fromarray(noisy_array)
206
+
207
+ img = cup_of_coffee_tone(img)
208
+ img = chromatic_aberration(img, shift=1)
209
+
210
+ return img
211
+
212
+ except Exception as e:
213
+ print(f"Failed to process image from {image_path}: {e}")
214
+ raise ValueError("Failed to unaize image") from e
215
+
216
+ def create_text_image(
217
+ text: str,
218
+ size: tuple[int, int] = (1920, 1080),
219
+ font_size: int = 120,
220
+ font_color: str = "white",
221
+ font_path: str = None
222
+ ) -> Image.Image:
223
+ """
224
+ Create an image with centered text.
225
+
226
+ Args:
227
+ text: Text to display on the image
228
+ width: Width of the image
229
+ height: Height of the image
230
+ font_size: Size of the font
231
+ font_color: Color of the text
232
+
233
+ Returns:
234
+ PIL Image object with the text centered
235
+ """
236
+ img = Image.new('RGB', size, color='black')
237
+ draw = ImageDraw.Draw(img)
238
+
239
+ font = ImageFont.load_default(size=font_size)
240
+ if font_path:
241
+ font = ImageFont.truetype(font_path, font_size)
242
+ font_bbox = font.getbbox(text)
243
+ text_width = font_bbox[2] - font_bbox[0]
244
+ text_height = font_bbox[3] - font_bbox[1]
245
+ x = (size[0] - text_width) // 2
246
+ y = (size[1] - text_height) // 2
247
+ draw.text((x, y), text, fill=font_color, font=font)
248
+
249
+ return img
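A short usage sketch; the font path is a placeholder (omit it to fall back to Pillow's default font):

    # Hypothetical title card at vertical-video dimensions.
    card = create_text_image(
        "CHAPTER ONE",
        size=(1080, 1920),
        font_size=96,
        font_color="#FFD700",
        font_path="/path/to/font.ttf",  # placeholder; None uses the default font
    )
    card.save("title_card.png")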
250
+
251
+ def make_image_wobbly(
252
+ image: Image.Image,
253
+ wobble_amount: float = 3.0
254
+ ) -> Image.Image:
255
+ """
256
+ Apply a subtle wobble/distortion effect to an image, like viewing through water or a warped mirror.
257
+
258
+ Args:
259
+ image: PIL Image object to distort
260
+ wobble_amount: Strength of the wobble effect (0.5-10.0, higher = more distortion)
261
+
262
+ Returns:
263
+ PIL Image object with wobble effect applied
264
+ """
265
+ if image.mode != 'RGB':
266
+ image = image.convert('RGB')
267
+
268
+ width, height = image.size
269
+ img_array = np.array(image)
270
+
271
+ # Create coordinate grids
272
+ x_coords = np.arange(width)
273
+ y_coords = np.arange(height)
274
+ x_grid, y_grid = np.meshgrid(x_coords, y_coords)
275
+
276
+ # Create random wave patterns optimized for text
277
+ # Generate random parameters for each wave to ensure variety
278
+
279
+ # Random wave frequencies and phases for horizontal waves
280
+ freq1_h = np.random.uniform(2, 5) # Random frequency between 2-5
281
+ freq2_h = np.random.uniform(5, 10) # Random frequency between 5-10
282
+ phase1_h = np.random.uniform(0, 2 * np.pi) # Random phase
283
+ phase2_h = np.random.uniform(0, 2 * np.pi) # Random phase
284
+
285
+ wave_x1 = wobble_amount * 0.3 * np.sin(2 * np.pi * y_grid / (height / freq1_h) + phase1_h)
286
+ wave_x2 = wobble_amount * 0.1 * np.sin(2 * np.pi * y_grid / (height / freq2_h) + phase2_h)
287
+
288
+ # Random wave frequencies and phases for vertical waves
289
+ freq1_v = np.random.uniform(2, 6) # Random frequency between 2-6
290
+ freq2_v = np.random.uniform(6, 12) # Random frequency between 6-12
291
+ phase1_v = np.random.uniform(0, 2 * np.pi) # Random phase
292
+ phase2_v = np.random.uniform(0, 2 * np.pi) # Random phase
293
+
294
+ wave_y1 = wobble_amount * 0.3 * np.sin(2 * np.pi * x_grid / (width / freq1_v) + phase1_v)
295
+ wave_y2 = wobble_amount * 0.1 * np.sin(2 * np.pi * x_grid / (width / freq2_v) + phase2_v)
296
+
297
+ # Random circular ripples with random centers and frequencies
298
+ center_x = width // 2 + np.random.randint(-width//4, width//4)
299
+ center_y = height // 2 + np.random.randint(-height//4, height//4)
300
+ ripple_freq = np.random.uniform(80, 120) # Random ripple frequency
301
+ ripple_phase = np.random.uniform(0, 2 * np.pi) # Random ripple phase
302
+
303
+ distance = np.sqrt((x_grid - center_x)**2 + (y_grid - center_y)**2)
304
+ ripple_x = wobble_amount * 0.15 * np.sin(2 * np.pi * distance / ripple_freq + ripple_phase)
305
+ ripple_y = wobble_amount * 0.15 * np.cos(2 * np.pi * distance / ripple_freq + ripple_phase)
306
+
307
+ # Random noise for text preservation - NO FIXED SEED
308
+ noise_x = np.random.normal(0, wobble_amount * 0.05, (height, width))
309
+ noise_y = np.random.normal(0, wobble_amount * 0.05, (height, width))
310
+
311
+ # Combine all distortions
312
+ total_x_offset = wave_x1 + wave_x2 + ripple_x + noise_x
313
+ total_y_offset = wave_y1 + wave_y2 + ripple_y + noise_y
314
+
315
+ # Apply the distortion with proper boundary handling
316
+ new_x_coords = x_grid + total_x_offset
317
+ new_y_coords = y_grid + total_y_offset
318
+
319
+ # Use scipy.ndimage.map_coordinates for efficient interpolation
320
+ try:
321
+ from scipy.ndimage import map_coordinates
322
+
323
+ # Create coordinate arrays for map_coordinates (expects [y, x] order)
324
+ coords = np.array([new_y_coords, new_x_coords])
325
+
326
+ # Apply the transformation to each color channel with adaptive interpolation
327
+ # Use progressively smoother interpolation for higher wobble amounts
328
+ distorted_array = np.zeros_like(img_array)
329
+
330
+ # Choose interpolation method based on wobble amount for smoothest results
331
+ if wobble_amount <= 1.5:
332
+ # For very subtle wobbles, use nearest neighbor to preserve text sharpness
333
+ interpolation_order = 0
334
+ elif wobble_amount <= 3.0:
335
+ # For moderate wobbles, use linear interpolation
336
+ interpolation_order = 1
337
+ else:
338
+ # For strong wobbles, use cubic interpolation for smoothest edges
339
+ interpolation_order = 3
340
+
341
+ for channel in range(img_array.shape[2]):
342
+ distorted_array[:, :, channel] = map_coordinates(
343
+ img_array[:, :, channel],
344
+ coords,
345
+ order=interpolation_order,
346
+ mode='reflect', # Mirror edges instead of clipping
347
+ prefilter=True if interpolation_order > 1 else False # Use prefilter for cubic
348
+ )
349
+
350
+ result_img = Image.fromarray(distorted_array.astype(np.uint8))
351
+
352
+ # Post-process for smoother edges at higher wobble amounts
353
+ if wobble_amount > 2.0:
354
+ # Apply a very subtle Gaussian blur to smooth any remaining artifacts
355
+ result_img = result_img.filter(ImageFilter.GaussianBlur(radius=0.3))
356
+ # Then apply gentle sharpening to maintain text readability
357
+ result_img = result_img.filter(ImageFilter.UnsharpMask(radius=0.8, percent=60, threshold=1))
358
+ elif wobble_amount > 1.5:
359
+ # For moderate wobbles, just apply gentle sharpening
360
+ result_img = result_img.filter(ImageFilter.UnsharpMask(radius=0.5, percent=40, threshold=0))
361
+
362
+ return result_img
363
+
365
+
366
+ except ImportError:
367
+ # Fallback to PIL's transform if scipy is not available
368
+ # This is much faster than the pixel-by-pixel approach
369
+ from PIL.Image import AFFINE
370
+
371
+ # For a simple approximation, apply a slight transform
372
+ # This won't be as sophisticated but will be much faster
373
+ transformed = image.transform(
374
+ image.size,
375
+ AFFINE,
376
+ (1, 0.02 * wobble_amount/10, 0.02 * wobble_amount/10, 1, 0, 0),
377
+ resample=Image.BILINEAR
378
+ )
379
+
380
+ # Apply a slight rotation for additional wobble with random angle
381
+ angle = wobble_amount * 0.3 * np.random.uniform(-1, 1) # Random rotation
382
+ rotated = transformed.rotate(angle, resample=Image.BILINEAR, expand=False)
383
+
384
+ return rotated
385
+
386
+
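A sketch tying the two helpers together; wobble_amount=2.5 lands in the linear-interpolation branch, and the function silently falls back to the affine approximation if scipy is absent:

    # Hypothetical: give rendered text a hand-drawn feel.
    card = create_text_image("Hello", size=(640, 360), font_size=72)
    wobbled = make_image_wobbly(card, wobble_amount=2.5)
    wobbled.save("hello_wobbly.png")                 # illustrative output path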
utils/proxy.py ADDED
File without changes
video/builder.py ADDED
@@ -0,0 +1,347 @@
1
+ from video.media import MediaUtils
2
+ import time
3
+ from loguru import logger
4
+
5
+
6
+ class VideoBuilder:
7
+ """
8
+ Builder class for constructing FFmpeg video commands with a fluent interface.
9
+ """
10
+
11
+ def __init__(self, dimensions: tuple[int, int], ffmpeg_path="ffmpeg"):
12
+ if not isinstance(dimensions, tuple) or len(dimensions) != 2:
13
+ raise ValueError("Dimensions must be a tuple of (width, height).")
14
+
15
+ self.width, self.height = dimensions
16
+ self.ffmpeg_path = ffmpeg_path
17
+
18
+ # Components
19
+ self.background = None
20
+ self.audio_file = None
21
+ self.captions = None
22
+ self.output_path = "output.mp4"
23
+
24
+ # Internal state
25
+ self.media_utils = None
26
+
27
+ def set_media_utils(self, media_utils: MediaUtils):
28
+ """Set the media manager for duration calculations."""
29
+ self.media_utils = media_utils
30
+ return self
31
+
32
+ def set_background_image(self, file_path: str, effect_config: dict = None):
33
+ """Set background as an image with optional visual effects.
34
+
35
+ Args:
36
+ file_path: Path to the image file
37
+ effect_config: Configuration for visual effects. Supported effects:
38
+ - Ken Burns (zoom): {"effect": "ken_burns", "zoom_factor": 0.001, "direction": "zoom-to-top-left"}
39
+ - Pan: {"effect": "pan", "direction": "left-to-right", "speed": "normal"}
40
+ """
41
+ self.background = {
42
+ "type": "image",
43
+ "file": file_path,
44
+ "effect_config": effect_config or {"effect": "ken_burns"}, # Default to Ken Burns for backward compatibility
45
+ }
46
+ return self
47
+
48
+ def set_background_video(self, file_path: str):
49
+ """Set background as a video file."""
50
+ self.background = {"type": "video", "file": file_path}
51
+ return self
52
+
53
+ def set_audio(self, file_path: str):
54
+ """Set audio file."""
55
+ self.audio_file = file_path
56
+ return self
57
+
58
+ def set_captions(
59
+ self,
60
+ file_path: str = None,
61
+ config: dict = None,
62
+ ):
63
+ """Set caption subtitles
64
+
65
+ Args:
66
+ file_path: Path to subtitle file
67
+ config: Optional configuration dict
68
+ """
69
+ self.captions = {
70
+ "file": file_path,
71
+ **(config or {}),
72
+ }
73
+ return self
74
+
75
+ def set_output_path(self, output_path: str):
76
+ """Set output file path."""
77
+ self.output_path = output_path
78
+ return self
79
+
80
+ def build_command(self):
81
+ """Build the complete FFmpeg command."""
82
+ if not self.background:
83
+ raise ValueError("Background must be set (image or video).")
84
+
85
+ if not self.audio_file and not self.captions:
86
+ raise ValueError(
87
+ "At least one of audio_file, or captions must be provided."
88
+ )
89
+
90
+ # Validate combinations
91
+ if self.background["type"] == "image" and not self.audio_file:
92
+ raise ValueError("Audio file must be provided if background is an image.")
93
+
94
+ if (
95
+ self.background["type"] == "video"
96
+ and not self.audio_file
97
+ and self.captions is None
98
+ ):
99
+ raise ValueError(
100
+ "Audio file or captions must be provided if background is a video."
101
+ )
102
+
103
+ # Get audio duration if audio file is provided
104
+ audio_duration = None
105
+ if self.audio_file:
106
+ if not self.media_utils:
107
+ raise ValueError(
108
+ "Media manager must be set to determine audio duration."
109
+ )
110
+ media_info = self.media_utils.get_audio_info(self.audio_file)
111
+ audio_duration = media_info.get("duration")
112
+ if not audio_duration:
113
+ raise ValueError("Could not determine audio duration")
114
+
115
+ # Build command
116
+ cmd = [self.ffmpeg_path, "-y"]
117
+
118
+ filter_parts = []
119
+ input_index = 0
120
+
121
+ # Add background input
122
+ if self.background["type"] == "image":
123
+ cmd.extend(
124
+ ["-loop", "1", "-t", str(audio_duration), "-i", self.background["file"]]
125
+ )
126
+
127
+ # Get effect configuration with backward compatibility
128
+ effect_config = self.background.get("effect_config", {"effect": "ken_burns"})
129
+
130
+ # Handle backward compatibility for old ken_burns config
131
+ if "ken_burns" in self.background and "effect_config" not in self.background:
132
+ # Old format: {"ken_burns": {"zoom_factor": 0.001, "direction": "zoom-to-top-left"}}
133
+ old_ken_burns = self.background.get("ken_burns", {})
134
+ effect_config = {
135
+ "effect": "ken_burns",
136
+ "zoom_factor": old_ken_burns.get("zoom_factor", 0.001),
137
+ "direction": old_ken_burns.get("direction", "zoom-to-top-left")
138
+ }
139
+
140
+ effect_type = effect_config.get("effect", "ken_burns")
141
+
142
+ fps = 25
143
+ duration_frames = int(audio_duration * fps)
144
+
145
+ if effect_type == "ken_burns":
146
+ # Ken Burns (zoom) effect
147
+ zoom_factor = effect_config.get("zoom_factor", 0.001)
148
+ direction = effect_config.get("direction", "zoom-to-top-left")
149
+
150
+ # TODO: without upscaling we can't use the top- and center-zoom variants; upscaling increases render time
151
+ zoom_expressions = {
152
+ "zoom-to-top": f"z='zoom+{zoom_factor}':x=iw/2-(iw/zoom/2):y=0",
153
+ "zoom-to-center": f"z='zoom+{zoom_factor}':x=iw/2-(iw/zoom/2):y=ih/2-(ih/zoom/2)",
154
+ "zoom-to-top-left": f"z='zoom+{zoom_factor}':x=0:y=0",
155
+ }
156
+ zoom_expr = zoom_expressions.get(direction, zoom_expressions["zoom-to-top-left"])
157
+
158
+ zoompan_d = duration_frames + 1
159
+ filter_parts.append(
160
+ f"[{input_index}]scale={self.width}:-2,setsar=1:1,"
161
+ f"crop={self.width}:{self.height},"
162
+ f"zoompan={zoom_expr}:d={zoompan_d}:s={self.width}x{self.height}:fps={fps}[bg]"
163
+ )
164
+
165
+ elif effect_type == "pan":
166
+ # Pan effect - camera moves across the image
167
+ direction = effect_config.get("direction", "left-to-right")
168
+ speed = effect_config.get("speed", "normal")
169
+
170
+ # Speed multipliers
171
+ speed_multipliers = {
172
+ "slow": 0.5,
173
+ "normal": 1.0,
174
+ "fast": 2.0
175
+ }
176
+ speed_mult = speed_multipliers.get(speed, 1.0)
177
+
178
+ # Calculate pan distance based on direction
179
+ # We'll scale the image larger to allow for panning
180
+ scale_factor = 1.3 # Scale image 30% larger to allow room for panning
181
+ scaled_width = int(self.width * scale_factor)
182
+ scaled_height = int(self.height * scale_factor)
183
+
184
+ # Pan expressions for different directions
185
+ if direction == "left-to-right":
186
+ # Start from left, move to right
187
+ start_x = 0
188
+ end_x = scaled_width - self.width
189
+ start_y = (scaled_height - self.height) // 2
190
+ end_y = start_y
191
+ elif direction == "right-to-left":
192
+ # Start from right, move to left
193
+ start_x = scaled_width - self.width
194
+ end_x = 0
195
+ start_y = (scaled_height - self.height) // 2
196
+ end_y = start_y
197
+ elif direction == "top-to-bottom":
198
+ # Start from top, move to bottom
199
+ start_x = (scaled_width - self.width) // 2
200
+ end_x = start_x
201
+ start_y = 0
202
+ end_y = scaled_height - self.height
203
+ elif direction == "bottom-to-top":
204
+ # Start from bottom, move to top
205
+ start_x = (scaled_width - self.width) // 2
206
+ end_x = start_x
207
+ start_y = scaled_height - self.height
208
+ end_y = 0
209
+ else:
210
+ # Default to left-to-right
211
+ start_x = 0
212
+ end_x = scaled_width - self.width
213
+ start_y = (scaled_height - self.height) // 2
214
+ end_y = start_y
215
+
216
+ # Create pan expression
217
+ # Linear interpolation from start to end position over the duration
218
+ pan_x_expr = f"{start_x}+({end_x}-{start_x})*t/{audio_duration}*{speed_mult}"
219
+ pan_y_expr = f"{start_y}+({end_y}-{start_y})*t/{audio_duration}*{speed_mult}"
220
+
221
+ filter_parts.append(
222
+ f"[{input_index}]scale={scaled_width}:{scaled_height},setsar=1:1,"
223
+ f"crop={self.width}:{self.height}:{pan_x_expr}:{pan_y_expr}[bg]"
224
+ )
225
+
226
+ else:
227
+ # No effect, just scale and crop
228
+ filter_parts.append(
229
+ f"[{input_index}]scale={self.width}:{self.height},setsar=1:1[bg]"
230
+ )
231
+
232
+ elif self.background["type"] == "video":
233
+ if audio_duration:
234
+ cmd.extend(
235
+ [
236
+ "-stream_loop",
237
+ "-1",
238
+ "-t",
239
+ str(audio_duration),
240
+ "-i",
241
+ self.background["file"],
242
+ ]
243
+ )
244
+ else:
245
+ cmd.extend(["-i", self.background["file"]])
246
+ filter_parts.append(f"[{input_index}]scale={self.width}:{self.height}[bg]")
247
+
248
+ input_index += 1
249
+ current_video = "[bg]"
250
+
251
+ # Add audio input
252
+ audio_input_index = None
253
+ if self.audio_file:
254
+ cmd.extend(["-i", self.audio_file])
255
+ audio_input_index = input_index
256
+ input_index += 1
257
+
258
+ # Add subtitles or caption images if provided
259
+ if self.captions:
260
+ subtitle_file = self.captions.get("file")
261
+ if subtitle_file:
262
+ filter_parts.append(f"{current_video}subtitles={subtitle_file}[v]")
263
+ current_video = "[v]"
264
+ else:
265
+ # Rename final video output
266
+ if current_video == "[bg]":
267
+ current_video = "[v]"
268
+ filter_parts.append(f"[bg]copy[v]")
269
+
270
+ # Build filter complex
271
+ if filter_parts:
272
+ cmd.extend(["-filter_complex", ";".join(filter_parts)])
273
+
274
+ # Map video and audio
275
+ cmd.extend(["-map", current_video])
276
+ if audio_input_index is not None:
277
+ cmd.extend(["-map", f"{audio_input_index}:a"])
278
+
279
+ # Video codec settings
280
+ cmd.extend(["-c:v", "libx264", "-preset", "ultrafast"])
281
+
282
+ cmd.extend(["-crf", "23", "-pix_fmt", "yuv420p"])
283
+
284
+ # Audio codec settings
285
+ if self.audio_file:
286
+ cmd.extend(["-c:a", "aac", "-b:a", "192k"])
287
+ if audio_duration:
288
+ cmd.extend(["-t", str(audio_duration)])
289
+
290
+ cmd.append(self.output_path)
291
+ return cmd
292
+
293
+ def execute(self):
294
+ """Build and execute the FFmpeg command using MediaUtils for progress tracking."""
295
+ if not self.media_utils:
296
+ logger.error("MediaUtils must be set before executing video build")
297
+ return False
298
+
299
+ start = time.time()
300
+ context_logger = logger.bind(
301
+ dimensions=(self.width, self.height),
302
+ background_type=self.background.get("type") if self.background else None,
303
+ has_audio=bool(self.audio_file),
304
+ has_captions=bool(self.captions),
305
+ output_path=self.output_path,
307
+ )
308
+
309
+ try:
310
+ context_logger.debug("building video with VideoBuilder")
311
+ cmd = self.build_command()
312
+
313
+ # Calculate expected duration for progress tracking
314
+ expected_duration = None
315
+ if self.audio_file:
316
+ audio_info = self.media_utils.get_audio_info(self.audio_file)
317
+ expected_duration = audio_info.get("duration")
318
+ elif self.background and self.background.get("type") == "video":
319
+ video_info = self.media_utils.get_video_info(self.background["file"])
320
+ expected_duration = video_info.get("duration")
321
+
322
+ context_logger.bind(
323
+ command=" ".join(cmd),
324
+ expected_duration=expected_duration,
325
+ ).debug("executing video build command")
326
+ # Execute using MediaUtils for proper logging and progress tracking
327
+ success = self.media_utils.execute_ffmpeg_command(
328
+ cmd,
329
+ "build video",
330
+ expected_duration=expected_duration,
331
+ show_progress=True,
332
+ )
333
+
334
+ if success:
335
+ context_logger.bind(execution_time=time.time() - start).info(
336
+ "video built successfully"
337
+ )
338
+ return True
339
+ else:
340
+ context_logger.error("failed to build video")
341
+ return False
342
+
343
+ except Exception as e:
344
+ context_logger.bind(error=str(e), execution_time=time.time() - start).error(
345
+ "error during video rendering"
346
+ )
347
+ return False
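A hedged sketch of the fluent interface end to end; all file paths are placeholders and the effect config uses only keys handled above:

    # Hypothetical wiring of VideoBuilder (paths are placeholders).
    utils = MediaUtils(ffmpeg_path="ffmpeg")
    ok = (
        VideoBuilder(dimensions=(1080, 1920))
        .set_media_utils(utils)
        .set_background_image("bg.jpg", {"effect": "pan", "direction": "left-to-right", "speed": "slow"})
        .set_audio("narration.wav")
        .set_captions("captions.ass")
        .set_output_path("short.mp4")
        .execute()
    )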
video/caption.py ADDED
@@ -0,0 +1,354 @@
1
+ import string
2
+ from typing import List, Dict, Tuple
3
+ from loguru import logger
4
+
6
+
7
+
8
+ class Caption:
9
+ def is_punctuation(self, text):
10
+ return text in string.punctuation
11
+
12
+ def create_subtitle_segments_english(
13
+ self, captions: List[Dict], max_length=80, lines=2
14
+ ):
15
+ """
16
+ Breaks up the captions into segments of max_length characters
17
+ on two lines and merge punctuation with the last word
18
+ """
19
+
20
+ if not captions:
21
+ return []
22
+
23
+ segments = []
24
+ current_segment_texts = ["" for _ in range(lines)]
25
+ current_line = 0
26
+ segment_start_ts = captions[0]["start_ts"]
27
+ segment_end_ts = captions[0]["end_ts"]
28
+
29
+ for caption in captions:
30
+ text = caption["text"]
31
+ start_ts = caption["start_ts"]
32
+ end_ts = caption["end_ts"]
33
+
34
+ # Update the segment end timestamp
35
+ segment_end_ts = end_ts
36
+
37
+ # If the caption is a punctuation, merge it with the current line
38
+ if self.is_punctuation(text):
39
+ if current_line < lines and current_segment_texts[current_line]:
40
+ current_segment_texts[current_line] += text
41
+ continue
42
+
43
+ # If the line is too long, move to the next one
44
+ if (
45
+ current_line < lines
46
+ and len(current_segment_texts[current_line] + text) > max_length
47
+ ):
48
+ current_line += 1
49
+
50
+ # If we've filled all lines, save the current segment and start a new one
51
+ if current_line >= lines:
52
+ segments.append(
53
+ {
54
+ "text": current_segment_texts,
55
+ "start_ts": segment_start_ts,
56
+ "end_ts": segment_end_ts,
57
+ }
58
+ )
59
+
60
+ # Reset for next segment
61
+ current_segment_texts = ["" for _ in range(lines)]
62
+ current_line = 0
63
+ # Add a small gap (0.05s) between segments to prevent overlap
64
+ segment_start_ts = start_ts + 0.05
65
+
66
+ # Add the text to the current segment
67
+ if current_line < lines:
68
+ current_segment_texts[current_line] += (
69
+ " " if current_segment_texts[current_line] else ""
70
+ )
71
+ current_segment_texts[current_line] += text
72
+
73
+ # Add the last segment if there's any content
74
+ if any(current_segment_texts):
75
+ segments.append(
76
+ {
77
+ "text": current_segment_texts,
78
+ "start_ts": segment_start_ts,
79
+ "end_ts": segment_end_ts,
80
+ }
81
+ )
82
+
83
+ # Post-processing to ensure no overlaps by adjusting end times if needed
84
+ for i in range(len(segments) - 1):
85
+ if segments[i]["end_ts"] >= segments[i + 1]["start_ts"]:
86
+ segments[i]["end_ts"] = segments[i + 1]["start_ts"] - 0.05
87
+
88
+ return segments
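A worked example with fabricated word-level timestamps (the shape a Whisper-style transcriber emits); tracing the loop, the comma merges into "world" and "again" overflows onto line two:

    # Sketch: word-level captions -> two-line display segments.
    captions = [
        {"text": "Hello", "start_ts": 0.0, "end_ts": 0.4},
        {"text": "world", "start_ts": 0.4, "end_ts": 0.8},
        {"text": ",",     "start_ts": 0.8, "end_ts": 0.8},  # punctuation, merged in place
        {"text": "again", "start_ts": 0.9, "end_ts": 1.3},
    ]
    segments = Caption().create_subtitle_segments_english(captions, max_length=12, lines=2)
    # -> [{"text": ["Hello world,", "again"], "start_ts": 0.0, "end_ts": 1.3}]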
89
+
90
+ def create_subtitle_segments_international(
91
+ self, captions: List[Dict], max_length=80, lines=2
92
+ ):
93
+ """
94
+ Breaks up international captions (full sentences) into smaller segments that fit
95
+ within max_length characters per line, with proper timing distribution.
96
+
97
+ Handles both space-delimited languages like English and character-based languages like Chinese.
98
+
99
+ Args:
100
+ captions: List of caption dictionaries with text, start_ts, and end_ts
101
+ max_length: Maximum number of characters per line
102
+ lines: Number of lines per segment
103
+
104
+ Returns:
105
+ List of subtitle segments
106
+ """
107
+ if not captions:
108
+ return []
109
+
110
+ segments = []
111
+
112
+ for caption in captions:
113
+ text = caption["text"].strip()
114
+ start_ts = caption["start_ts"]
115
+ end_ts = caption["end_ts"]
116
+ duration = end_ts - start_ts
117
+
118
+ # Check if text is using Chinese/Japanese/Korean characters (CJK)
119
+ # For CJK, we'll split by characters rather than words
120
+ is_cjk = any("\u4e00" <= char <= "\u9fff" for char in text)
121
+
122
+ parts = []
123
+ if is_cjk:
124
+ # For CJK languages, process character by character
125
+ current_part = ""
126
+ for char in text:
127
+ if len(current_part + char) > max_length:
128
+ parts.append(current_part)
129
+ current_part = char
130
+ else:
131
+ current_part += char
132
+
133
+ # Add the last part if not empty
134
+ if current_part:
135
+ parts.append(current_part)
136
+ else:
137
+ # Original word-based splitting for languages with spaces
138
+ words = text.split()
139
+ current_part = ""
140
+
141
+ for word in words:
142
+ # If adding this word would exceed max_length, start a new part
143
+ if len(current_part + " " + word) > max_length and current_part:
144
+ parts.append(current_part.strip())
145
+ current_part = word
146
+ else:
147
+ # Add space if not the first word in the part
148
+ if current_part:
149
+ current_part += " "
150
+ current_part += word
151
+
152
+ # Add the last part if not empty
153
+ if current_part:
154
+ parts.append(current_part.strip())
155
+
156
+ # Group parts into segments with 'lines' number of lines per segment
157
+ segment_parts = []
158
+ for i in range(0, len(parts), lines):
159
+ segment_parts.append(parts[i : i + lines])
160
+
161
+ # Calculate time proportionally based on segment text length
162
+ total_chars = sum(len("".join(part_group)) for part_group in segment_parts)
163
+
164
+ current_time = start_ts
165
+ for i, part_group in enumerate(segment_parts):
166
+ # Get character count for this segment group
167
+ segment_chars = len("".join(part_group))
168
+
169
+ # Calculate time proportionally, but ensure at least a minimum duration
170
+ if total_chars > 0:
171
+ segment_duration = (segment_chars / total_chars) * duration
172
+ segment_duration = max(
173
+ segment_duration, 0.5
174
+ ) # Ensure minimum duration of 0.5s
175
+ else:
176
+ segment_duration = duration / len(segment_parts)
177
+
178
+ segment_start = current_time
179
+ segment_end = segment_start + segment_duration
180
+
181
+ # Move current time forward for next segment
182
+ current_time = segment_end
183
+
184
+ # Create segment with proper text array format for the subtitle renderer
185
+ segment_text = part_group + [""] * (lines - len(part_group))
186
+
187
+ segments.append(
188
+ {
189
+ "text": segment_text,
190
+ "start_ts": segment_start,
191
+ "end_ts": segment_end,
192
+ }
193
+ )
194
+
195
+ # Ensure no overlaps between segments by adjusting end times if needed
196
+ for i in range(len(segments) - 1):
197
+ if segments[i]["end_ts"] >= segments[i + 1]["start_ts"]:
198
+ segments[i]["end_ts"] = segments[i + 1]["start_ts"] - 0.05
199
+
200
+ return segments
201
+
202
+ @staticmethod
203
+ def hex_to_ass(hex_color: str, alpha: float = 1.0) -> str:
204
+ """
205
+ Convert a hex color + opacity to ASS &HAABBGGRR format.
206
+
207
+ :param hex_color: CSS-style color string, e.g. "#FFA07A" or "00ff00"
208
+ :param alpha: opacity from 1.0 (fully opaque) to 0.0 (fully transparent)
209
+ :return: ASS color string, e.g. "&H8014C8FF"
210
+ """
211
+
212
+ # strip leading '#' if present
213
+ hex_color = hex_color.lstrip('#')
214
+
215
+ # support 3-digit shorthand like 'f0a'
216
+ if len(hex_color) == 3:
217
+ hex_color = ''.join([c*2 for c in hex_color])
218
+
219
+ if len(hex_color) != 6:
220
+ raise ValueError("hex_color must be in 'RRGGBB' or 'RGB' format")
221
+
222
+ # parse RGB
223
+ r = int(hex_color[0:2], 16)
224
+ g = int(hex_color[2:4], 16)
225
+ b = int(hex_color[4:6], 16)
226
+
227
+ # ASS alpha is inverted: 00=opaque, FF=transparent
228
+ # so we invert the user's opacity (1.0 opaque -> 00, 0.0 transparent -> FF)
229
+ a = int((1.0 - alpha) * 255)
230
+ a = max(0, min(255, a))
231
+
232
+ # build BGR and alpha bytes
233
+ aa = f"{a:02X}"
234
+ bb = f"{b:02X}"
235
+ gg = f"{g:02X}"
236
+ rr = f"{r:02X}"
237
+
238
+ return f"&H{aa}{bb}{gg}{rr}"
239
+
240
+ def create_subtitle(
241
+ self,
242
+ segments,
243
+ dimensions: Tuple[int, int],
244
+ output_path: str,
245
+ font_size=24,
246
+ font_color="#fff",
247
+ shadow_color="#000",
248
+ shadow_transparency=0.1,
249
+ shadow_blur=0,
250
+ stroke_color="#000",
251
+ stroke_size=0,
252
+ font_name="Arial",
253
+ font_bold=True,
254
+ font_italic=False,
255
+ subtitle_position="center",
256
+ ):
257
+ width, height = dimensions
258
+ bold_value = -1 if font_bold else 0
259
+ italic_value = -1 if font_italic else 0
260
+
261
+ position_from_top = 0.2
262
+ if subtitle_position == "center":
263
+ position_from_top = 0.45
264
+ if subtitle_position == "bottom":
265
+ position_from_top = 0.75
266
+
267
+ ass_content = """[Script Info]
268
+ ScriptType: v4.00+
269
+ PlayResX: {width}
270
+ PlayResY: {height}
271
+
272
+ [V4+ Styles]
273
+ Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
274
+ Style: Default,{font_name},{font_size},{font_color},&H000000FF,{stroke_color},&H00000000,{bold},{italic},0,0,100,100,0,0,1,{stroke_size},0,8,20,20,20,1
275
+
276
+ [Events]
277
+ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
278
+ """.format(
279
+ width=width,
280
+ height=height,
281
+ font_size=font_size,
282
+ font_color=self.hex_to_ass(font_color),
283
+ stroke_color=self.hex_to_ass(stroke_color),
284
+ stroke_size=stroke_size,
285
+ font_name=font_name,
286
+ bold=bold_value,
287
+ italic=italic_value
288
+ )
289
+
290
+ pos_x = int(width / 2)
291
+ pos_y = int(height * position_from_top)
292
+
293
+ # Process each segment and add to the subtitle file
294
+ for segment in segments:
295
+ start_time = self.format_time(segment["start_ts"])
296
+ end_time = self.format_time(segment["end_ts"])
297
+
298
+ # Create text with line breaks
299
+ text_lines = segment["text"]
300
+ formatted_text = ""
301
+ for i, line in enumerate(text_lines):
302
+ if line: # Only add non-empty lines
303
+ if i > 0: # Add line break if not the first line
304
+ formatted_text += "\\N"
305
+ formatted_text += line
306
+
307
+ # Create shadow if shadow_blur is specified or if we want a drop shadow effect
308
+ if shadow_blur > 0 or shadow_transparency < 1.0:
309
+ # Convert shadow color with transparency
310
+ shadow_color_ass = self.hex_to_ass(shadow_color, shadow_transparency)
311
+
312
+ # Offset shadow position slightly for drop shadow effect
313
+ shadow_pos_x = pos_x + 2
314
+ shadow_pos_y = pos_y + 2
315
+
316
+ # For shadow text, use shadow color only for primary color and set proper alpha
317
+ # Only apply shadow color to primary color (\1c) and use alpha for transparency
318
+ shadow_override_tags = f"\\pos({shadow_pos_x},{shadow_pos_y})\\1c{shadow_color_ass}\\bord0"
319
+
320
+ # Add alpha transparency if needed
321
+ if shadow_transparency > 0:
322
+ alpha_hex = hex(int((1.0 - shadow_transparency) * 255))[2:].upper().zfill(2)
323
+ shadow_override_tags += f"\\1a&H{alpha_hex}&"
324
+
325
+ if shadow_blur > 0:
326
+ shadow_override_tags += f"\\blur{shadow_blur}"
327
+
328
+ shadow_formatted_text = f"{{{shadow_override_tags}}}" + formatted_text
329
+
330
+ # Add shadow dialogue line first (so it appears behind)
331
+ ass_content += f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{shadow_formatted_text}\n"
332
+
333
+ # Create main text layer
334
+ main_override_tags = f"\\pos({pos_x},{pos_y})"
335
+ main_formatted_text = f"{{{main_override_tags}}}" + formatted_text
336
+
337
+ # Add main dialogue line (appears on top)
338
+ ass_content += f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{main_formatted_text}\n"
339
+
340
+ with open(output_path, "w", encoding="utf-8") as f:
341
+ f.write(ass_content)
342
+
343
+ logger.debug("subtitle (ass) was created with drop shadow")
344
+
345
+ def format_time(self, seconds):
346
+ """
347
+ Convert seconds to ASS time format (H:MM:SS.cc)
348
+ """
349
+ hours = int(seconds // 3600)
350
+ minutes = int((seconds % 3600) // 60)
351
+ secs = int(seconds % 60)
352
+ centisecs = int((seconds % 1) * 100)
353
+
354
+ return f"{hours}:{minutes:02d}:{secs:02d}.{centisecs:02d}"
video/config.py ADDED
@@ -0,0 +1,53 @@
1
+ import os
2
+ import torch
3
+ from loguru import logger
4
+
5
+ # Pick the best available torch device
6
+ if torch.cuda.is_available():
7
+ device = torch.device("cuda")
8
+ elif torch.backends.mps.is_available():
9
+ device = torch.device("mps")
10
+ else:
11
+ device = torch.device("cpu")
12
+ num_cores = os.cpu_count()
13
+ if os.path.exists("/sys/fs/cgroup/cpu.max"):
14
+ with open("/sys/fs/cgroup/cpu.max", "r") as f:
15
+ line = f.readline()
16
+ if len(line.split()) == 2:
17
+ if line.split()[0] == "max":
18
+ logger.info(
19
+ "File /sys/fs/cgroup/cpu.max has max value, using os.cpu_count()"
20
+ )
21
+ else:
22
+ cpu_max = int(line.split()[0])
23
+ cpu_period = int(line.split()[1])
24
+ num_cores = cpu_max // cpu_period
25
+ logger.info("Using {} cores", num_cores)
26
+ else:
27
+ logger.warning(
28
+ "File /sys/fs/cgroup/cpu.max does not have 2 values, using os.cpu_count()"
29
+ )
30
+ else:
31
+ logger.info("File /sys/fs/cgroup/cpu.max not found, using os.cpu_count()")
32
+
33
+ logger.info("number of CPU cores: {}", num_cores)
34
+ num_threads = os.environ.get("NUM_THREADS", num_cores)
35
+ logger.info("number of threads to use with torch: {}", num_threads)
36
+ torch.set_num_threads(int(num_threads))
37
+ torch.set_num_interop_threads(int(num_threads))
38
+
39
+ map_location = torch.device(device)
40
+
41
+ torch_load_original = torch.load
42
+
43
+
44
+ def patched_torch_load(*args, **kwargs):
45
+ if "map_location" not in kwargs:
46
+ kwargs["map_location"] = map_location
47
+ return torch_load_original(*args, **kwargs)
48
+
49
+
50
+ torch.load = patched_torch_load
51
+
52
+ whisper_model = os.environ.get("WHISPER_MODEL", "small")
53
+ whisper_compute_type = os.environ.get("WHISPER_COMPUTE_TYPE", "int8")
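The cpu.max arithmetic above in isolation, as a self-contained sketch (the sample strings are made up):

    # quota/period -> usable cores; "max" means unlimited.
    def cores_from_cpu_max(line: str, fallback: int) -> int:
        parts = line.split()
        if len(parts) == 2 and parts[0] != "max":
            return int(parts[0]) // int(parts[1])
        return fallback

    assert cores_from_cpu_max("200000 100000", 8) == 2  # two CPUs' worth of quota
    assert cores_from_cpu_max("max 100000", 8) == 8     # unlimited -> os.cpu_count()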
video/media.py ADDED
@@ -0,0 +1,850 @@
1
+ import subprocess
2
+ import json
3
+ import time
4
+ from loguru import logger
5
+
6
+
7
+ class MediaUtils:
8
+ def __init__(self, ffmpeg_path="ffmpeg"):
9
+ """
10
+ Initializes the MediaUtils class.
11
+
12
+ Args:
13
+ ffmpeg_path: Path to the ffmpeg executable
14
+ """
15
+ self.ffmpeg_path = ffmpeg_path
16
+
17
+ def merge_videos(
18
+ self,
19
+ video_paths: list,
20
+ output_path: str,
21
+ background_music_path: str = None,
22
+ background_music_volume: float = 0.5,
23
+ ) -> bool:
24
+ """
25
+ Merges multiple video files into one, optionally with background music.
26
+
27
+ Args:
28
+ video_paths: List of paths to video files to merge
29
+ output_path: Path for the merged output video
30
+ background_music_path: Optional path to background music file
31
+ background_music_volume: Volume level for background music (0.0 to 1.0, default 0.5)
32
+
33
+ Returns:
34
+ bool: True if successful, False otherwise
35
+ """
36
+ if not video_paths:
37
+ logger.error("no video paths provided for merging")
38
+ return False
39
+
40
+ start = time.time()
41
+ context_logger = logger.bind(
42
+ number_of_videos=len(video_paths),
43
+ output_path=output_path,
44
+ background_music=bool(background_music_path),
45
+ background_music_volume=background_music_volume,
46
+ )
47
+
48
+ try:
49
+ # Get dimensions from the first video
50
+ first_video_info = self.get_video_info(video_paths[0])
51
+ if not first_video_info:
52
+ context_logger.error("failed to get video info from first video")
53
+ return False
54
+
55
+ target_width = first_video_info.get("width", 1080)
56
+ target_height = first_video_info.get("height", 1920)
57
+ target_dimensions = f"{target_width}:{target_height}"
58
+
59
+ context_logger.bind(
60
+ target_width=target_width, target_height=target_height
61
+ ).debug("using dimensions from first video")
62
+
63
+ # Base command
64
+ cmd = [self.ffmpeg_path, "-y"]
65
+
66
+ # Add input video files
67
+ for video_path in video_paths:
68
+ cmd.extend(["-i", video_path])
69
+
70
+ # Add background music if provided
71
+ music_input_index = None
72
+ if background_music_path:
73
+ cmd.extend(["-stream_loop", "-1", "-i", background_music_path])
74
+ music_input_index = len(video_paths)
75
+
76
+ # Create filter complex for concatenating videos with re-encoding
77
+ if len(video_paths) == 1:
78
+ # Single video - re-encode to ensure consistency
79
+ # Check if the video has audio
80
+ audio_info = self.get_audio_info(video_paths[0])
81
+ has_audio = bool(audio_info.get('duration', 0) > 0)
82
+
83
+ if background_music_path:
84
+ if has_audio:
85
+ cmd.extend(
86
+ [
87
+ "-filter_complex",
88
+ f"[0:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30[v];[{music_input_index}:a]volume={background_music_volume}[bg];[0:a][bg]amix=inputs=2:duration=first[a]",
89
+ "-map",
90
+ "[v]",
91
+ "-map",
92
+ "[a]",
93
+ ]
94
+ )
95
+ else:
96
+ # No audio in video, just use background music
97
+ cmd.extend(
98
+ [
99
+ "-filter_complex",
100
+ f"[0:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30[v];[{music_input_index}:a]volume={background_music_volume}[a]",
101
+ "-map",
102
+ "[v]",
103
+ "-map",
104
+ "[a]",
105
+ ]
106
+ )
107
+ else:
108
+ if has_audio:
109
+ cmd.extend(
110
+ [
111
+ "-filter_complex",
112
+ f"[0:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30[v]",
113
+ "-map",
114
+ "[v]",
115
+ "-map",
116
+ "0:a",
117
+ ]
118
+ )
119
+ else:
120
+ # No audio in video and no background music, create silent audio
121
+ video_info = self.get_video_info(video_paths[0])
122
+ video_duration = video_info.get('duration', 10) # fallback to 10 seconds
123
+ cmd.extend(
124
+ [
125
+ "-filter_complex",
126
+ f"[0:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30[v];anullsrc=channel_layout=stereo:sample_rate=48000:duration={video_duration}[a]",
127
+ "-map",
128
+ "[v]",
129
+ "-map",
130
+ "[a]",
131
+ ]
132
+ )
133
+ else:
134
+ # Multiple videos - normalize and concatenate with re-encoding
135
+ # First, check which videos have audio streams
136
+ videos_with_audio = []
137
+ for i, video_path in enumerate(video_paths):
138
+ video_info = self.get_video_info(video_path)
139
+ # Check if video has audio by trying to get audio info
140
+ audio_info = self.get_audio_info(video_path)
141
+ has_audio = bool(audio_info.get('duration', 0) > 0)
142
+ videos_with_audio.append(has_audio)
143
+ context_logger.bind(video_index=i, has_audio=has_audio).debug("checked audio stream")
144
+
145
+ # Create normalized video streams for each input
146
+ normalize_filters = []
147
+ for i in range(len(video_paths)):
148
+ normalize_filters.append(
149
+ f"[{i}:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30,format=yuv420p[v{i}n]"
150
+ )
151
+
152
+ # Create audio streams for videos without audio (silent audio)
153
+ audio_filters = []
154
+ for i in range(len(video_paths)):
155
+ if not videos_with_audio[i]:
156
+ # Get video duration for silent audio generation
157
+ video_info = self.get_video_info(video_paths[i])
158
+ video_duration = video_info.get('duration', 10) # fallback to 10 seconds
159
+ audio_filters.append(f"anullsrc=channel_layout=stereo:sample_rate=48000:duration={video_duration}[a{i}n]")
160
+ else:
161
+ audio_filters.append(f"[{i}:a]aformat=sample_rates=48000:channel_layouts=stereo[a{i}n]")
162
+
163
+ # Create the concat filter using normalized streams
164
+ concat_inputs = ""
165
+ for i in range(len(video_paths)):
166
+ concat_inputs += f"[v{i}n][a{i}n]"
167
+
168
+ # Combine all filters
169
+ all_filters = normalize_filters + audio_filters
170
+ filter_complex = (
171
+ ";".join(all_filters)
172
+ + f";{concat_inputs}concat=n={len(video_paths)}:v=1:a=1[v][a]"
173
+ )
174
+
175
+ if background_music_path:
176
+ # Mix the concatenated audio with background music
177
+ filter_complex += f";[{music_input_index}:a]volume={background_music_volume}[bg];[a][bg]amix=inputs=2:duration=first[final_a]"
178
+ cmd.extend(
179
+ [
180
+ "-filter_complex",
181
+ filter_complex,
182
+ "-map",
183
+ "[v]",
184
+ "-map",
185
+ "[final_a]",
186
+ ]
187
+ )
188
+ else:
189
+ cmd.extend(
190
+ [
191
+ "-filter_complex",
192
+ filter_complex,
193
+ "-map",
194
+ "[v]",
195
+ "-map",
196
+ "[a]",
197
+ ]
198
+ )
199
+
200
+ # Video codec settings
201
+ cmd.extend(
202
+ [
203
+ "-c:v",
204
+ "libx264",
205
+ "-preset",
206
+ "veryfast",
207
+ "-crf",
208
+ "23",
209
+ ]
210
+ )
211
+
212
+ # Audio codec settings
213
+ cmd.extend(["-c:a", "aac", "-b:a", "192k"])
214
+
215
+ # Other settings
216
+ cmd.extend(["-pix_fmt", "yuv420p", output_path])
217
+
218
+ # Execute the command using the new method
219
+
220
+ # calculate expected duration for progress tracking
221
+ expected_duration = 0
222
+ for video_path in video_paths:
223
+ video_info = self.get_video_info(video_path)
224
+ expected_duration += video_info.get("duration", 0)
225
+
226
+ success = self.execute_ffmpeg_command(
227
+ cmd,
228
+ "merge videos",
229
+ expected_duration=expected_duration,
230
+ show_progress=True,
231
+ )
232
+
233
+ if success:
234
+ context_logger.bind(execution_time=time.time() - start).debug(
235
+ "videos merged successfully",
236
+ )
237
+ return True
238
+ else:
239
+ context_logger.error("ffmpeg failed to merge videos")
240
+ return False
241
+
242
+ except Exception as e:
243
+ context_logger.bind(error=str(e)).error(
244
+ "error merging videos",
245
+ )
246
+ return False
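An illustrative call; clip and music paths are placeholders:

    # Hypothetical merge of three clips under quiet background music.
    utils = MediaUtils()
    utils.merge_videos(
        ["intro.mp4", "main.mp4", "outro.mp4"],
        "final.mp4",
        background_music_path="bed.mp3",
        background_music_volume=0.2,
    )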
247
+
248
+ def get_video_info(self, file_path: str) -> dict:
249
+ """
250
+ Retrieves video information such as duration, width, height, codec, fps, etc.
251
+
252
+ Args:
253
+ file_path: Path to the video file
254
+
255
+ Returns:
256
+ Dictionary containing video information
257
+ """
258
+ try:
259
+ cmd = [
260
+ "ffprobe",
261
+ "-v",
262
+ "quiet",
263
+ "-print_format",
264
+ "json",
265
+ "-show_format",
266
+ "-show_streams",
267
+ "-select_streams",
268
+ "v:0", # Select first video stream
269
+ file_path,
270
+ ]
271
+
272
+ success, stdout, stderr = self.execute_ffprobe_command(
273
+ cmd, "get video info"
274
+ )
275
+
276
+ if not success:
277
+ raise Exception(f"ffprobe failed: {stderr}")
278
+
279
+ probe_data = json.loads(stdout)
280
+
281
+ # Extract format information
282
+ format_info = probe_data.get("format", {})
283
+ streams = probe_data.get("streams", [])
284
+
285
+ if not streams:
286
+ raise Exception("No video stream found in file")
287
+
288
+ video_stream = streams[0]
289
+
290
+ video_info = {
291
+ "duration": float(format_info.get("duration", 0)),
292
+ "width": video_stream.get("width"),
293
+ "height": video_stream.get("height"),
294
+ "fps": video_stream.get("avg_frame_rate", "0/1").split("/")[0],
295
+ "aspect_ratio": video_stream.get("display_aspect_ratio", "1:1"),
296
+ "codec": video_stream.get("codec_name"),
297
+ }
298
+
299
+ return video_info
300
+
301
+ except Exception as e:
302
+ logger.bind(file_path=file_path, error=str(e)).error(
303
+ "error getting video info"
304
+ )
305
+ return {}
306
+
307
+ def get_audio_info(self, file_path: str) -> dict:
308
+ """
309
+ Retrieves audio information such as duration, codec, bitrate, sample rate, channels, etc.
310
+
311
+ Args:
312
+ file_path: Path to the audio file
313
+
314
+ Returns:
315
+ Dictionary containing audio information
316
+ """
317
+ try:
318
+ cmd = [
319
+ "ffprobe",
320
+ "-v",
321
+ "quiet",
322
+ "-print_format",
323
+ "json",
324
+ "-show_format",
325
+ "-show_streams",
326
+ "-select_streams",
327
+ "a:0", # Select first audio stream
328
+ file_path,
329
+ ]
330
+
331
+ success, stdout, stderr = self.execute_ffprobe_command(
332
+ cmd, "get audio info"
333
+ )
334
+
335
+ if not success:
336
+ raise Exception(f"ffprobe failed: {stderr}")
337
+
338
+ probe_data = json.loads(stdout)
339
+
340
+ # Extract format information
341
+ format_info = probe_data.get("format", {})
342
+ streams = probe_data.get("streams", [])
343
+
344
+ if not streams:
345
+ raise Exception("No audio stream found in file")
346
+
347
+ audio_stream = streams[0]
348
+
349
+ audio_info = {
350
+ "duration": float(format_info.get("duration", 0)),
351
+ "channels": audio_stream.get("channels", 0),
352
+ "sample_rate": audio_stream.get("sample_rate", "0"),
353
+ "codec": audio_stream.get("codec_name", ""),
354
+ "bitrate": audio_stream.get("bit_rate", "0"),
355
+ }
356
+
357
+ return audio_info
358
+
359
+ except Exception as e:
360
+ logger.bind(file_path=file_path, error=str(e)).error(
361
+ "Error getting audio info"
362
+ )
363
+ return {}
364
+
365
+ def extract_frame(
366
+ self,
367
+ video_path: str,
368
+ output_path: str,
369
+ time_seconds: float = 0.0,
370
+ ) -> bool:
371
+ """
372
+ Extracts a frame from a video at a specified time.
373
+
374
+ Args:
375
+ video_path: Path to the input video file
376
+ output_path: Path for the extracted frame image
377
+ time_seconds: Time in seconds to extract the frame (default: 0.0)
378
+
379
+ Returns:
380
+ bool: True if successful, False otherwise
381
+ """
382
+ try:
383
+ # Base command
384
+ cmd = [self.ffmpeg_path, "-y"]
385
+
386
+ # Add input video file
387
+ cmd.extend(["-i", video_path])
388
+
389
+ # Seek to the specified time and extract one frame
390
+ cmd.extend(
391
+ [
392
+ "-ss",
393
+ str(time_seconds), # Seek to time
394
+ "-vframes",
395
+ "1", # Extract only one frame
396
+ "-q:v",
397
+ "2", # High quality (scale 1-31, lower is better)
398
+ output_path,
399
+ ]
400
+ )
401
+
402
+ # Execute the command using the new method
403
+ success = self.execute_ffmpeg_command(
404
+ cmd,
405
+ "extract frame",
406
+ show_progress=False, # No progress tracking for single frame extraction
407
+ )
408
+
409
+ if success:
410
+ logger.bind(video_path=video_path, time_seconds=time_seconds).debug(
411
+ "frame extracted successfully"
412
+ )
413
+ return True
414
+ else:
415
+ logger.bind(video_path=video_path, time_seconds=time_seconds).error(
416
+ "failed to extract frame from video"
417
+ )
418
+ return False
419
+
420
+ except Exception as e:
421
+ logger.bind(error=str(e)).error("Error extracting frame from video")
422
+ return False
423
+
424
+ def extract_frames(
425
+ self,
426
+ video_path: str,
427
+ output_template: str,
428
+ amount: int = 5,
429
+ length_seconds: float = None,
430
+ ) -> bool:
431
+ """
432
+ Extracts a given number of frames evenly spaced across the video.
+
+ Args:
433
+ video_path: Path to the input video file
434
+ output_template: Template for output image files (e.g., "frame-%03d.jpg")
435
+ amount: Number of frames to extract (default: 5)
436
+ length_seconds: Length of the video in seconds (optional, if not provided will be calculated)
437
+
438
+ Returns:
439
+ bool: True if successful, False otherwise
440
+ """
441
+ try:
442
+ # Get video duration if not provided
443
+ if length_seconds is None:
444
+ video_info = self.get_video_info(video_path)
445
+ length_seconds = video_info.get("duration", 0)
446
+
447
+ if length_seconds <= 0:
448
+ logger.error("invalid video duration for frame extraction")
449
+ return False
450
+
451
+ # Calculate frame interval (time between frames)
452
+ # This gives us the correct fps rate to extract exactly 'amount' frames
453
+ # evenly distributed across the video duration
454
+ frame_interval = length_seconds / amount
455
+
456
+ # Base command - using the corrected fps calculation
457
+ # fps=1/frame_interval extracts one frame every frame_interval seconds
458
+ cmd = [
459
+ self.ffmpeg_path,
460
+ "-y",
461
+ "-i",
462
+ video_path,
463
+ "-vf",
464
+ f"fps=1/{frame_interval}",
465
+ "-vframes",
466
+ str(amount),
467
+ "-qscale:v",
468
+ "2", # High quality
469
+ output_template,
470
+ ]
471
+
472
+ # Execute the command using the new method
473
+ success = self.execute_ffmpeg_command(
474
+ cmd,
475
+ "extract frames",
476
+ expected_duration=length_seconds,
477
+ show_progress=True,
478
+ )
479
+
480
+ if success:
481
+ logger.bind(video_path=video_path, amount=amount).debug(
482
+ "frames extracted successfully"
483
+ )
484
+ return True
485
+ else:
486
+ logger.bind(video_path=video_path, amount=amount).error(
487
+ "failed to extract frames from video"
488
+ )
489
+ return False
490
+
491
+ except Exception as e:
492
+ logger.bind(error=str(e)).error("Error extracting frames from video")
493
+ return False
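A usage sketch; the %03d printf pattern in the template numbers the output files:

    # Hypothetical: five evenly spaced stills from a finished render.
    utils = MediaUtils()
    utils.extract_frames("final.mp4", "tmp/frame-%03d.jpg", amount=5)
    # writes tmp/frame-001.jpg ... tmp/frame-005.jpg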
494
+
495
+ def format_time(self, seconds: float) -> str:
496
+ """
497
+ Format seconds into HH:MM:SS format.
498
+
499
+ Args:
500
+ seconds: Time in seconds
501
+
502
+ Returns:
503
+ Formatted time string
504
+ """
505
+ hours = int(seconds // 3600)
506
+ minutes = int((seconds % 3600) // 60)
507
+ seconds = int(seconds % 60)
508
+ return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
509
+
510
+ def execute_ffmpeg_command(
511
+ self,
512
+ cmd: list,
513
+ operation_name: str,
514
+ expected_duration: float = None,
515
+ show_progress: bool = True,
516
+ ) -> bool:
517
+ """
518
+ Execute an ffmpeg command with proper logging and progress tracking.
519
+
520
+ Args:
521
+ cmd: The ffmpeg command as a list
522
+ operation_name: Name of the operation for logging
523
+ expected_duration: Expected duration for progress calculation
524
+ show_progress: Whether to show progress information
525
+
526
+ Returns:
527
+ bool: True if successful, False otherwise
528
+ """
529
+ try:
530
+ logger.bind(command=" ".join(cmd), operation=operation_name).debug(
531
+ f"executing ffmpeg command for {operation_name}"
532
+ )
533
+
534
+ process = subprocess.Popen(
535
+ cmd,
536
+ stderr=subprocess.PIPE,
537
+ universal_newlines=True,
538
+ text=True,
539
+ )
540
+
541
+ # Process the output line by line as it becomes available
542
+ for line in process.stderr:
543
+ # Extract time information for progress tracking
544
+ if (
545
+ show_progress
546
+ and expected_duration
547
+ and "time=" in line
548
+ and "speed=" in line
549
+ ):
550
+ try:
551
+ # Extract the time information
552
+ time_str = line.split("time=")[1].split(" ")[0]
553
+ # Convert HH:MM:SS.MS format to seconds
554
+ h, m, s = time_str.split(":")
555
+ seconds = float(h) * 3600 + float(m) * 60 + float(s)
556
+
557
+ # Calculate progress percentage
558
+ progress = min(100, (seconds / expected_duration) * 100)
559
+ logger.info(
560
+ f"{operation_name}: {progress:.2f}% complete (Time: {time_str} / Total: {self.format_time(expected_duration)})"
561
+ )
562
+ except (ValueError, IndexError):
563
+ # If parsing fails, continue silently
564
+ pass
565
+ elif any(
566
+ keyword in line
567
+ for keyword in [
568
+ # Skip initialization information
569
+ "ffmpeg version",
570
+ "built with",
571
+ "configuration:",
572
+ "libav",
573
+ "Input #",
574
+ "Metadata:",
575
+ "Duration:",
576
+ "Stream #",
577
+ "Press [q]",
578
+ "Output #",
579
+ "Stream mapping:",
580
+ # Skip processing details
581
+ "frame=",
582
+ "fps=",
583
+ "[libx264",
584
+ "kb/s:",
585
+ "Qavg:",
586
+ "video:",
587
+ "audio:",
588
+ "subtitle:",
589
+ "frame I:",
590
+ "frame P:",
591
+ "mb I",
592
+ "mb P",
593
+ "coded y,",
594
+ "i16 v,h,dc,p:",
595
+ "i8c dc,h,v,p:",
596
+ "compatible_brands:",
597
+ "encoder",
598
+ "Side data:",
599
+ "libswscale",
600
+ "libswresample",
601
+ "libpostproc",
602
+ # Additional patterns to filter
603
+ "ffmpeg: libswscale",
604
+ "ffmpeg: libswresample",
605
+ "ffmpeg: libpostproc",
606
+ ]
607
+ ):
608
+ # Skip all technical output lines
609
+ pass
610
+ else:
611
+ # Only print important messages (like errors and warnings)
612
+ # that don't match any of the filtered patterns
613
+ if not line.strip() or line.strip().startswith("["):
614
+ continue
615
+
616
+ # Skip header lines that describe inputs
617
+ if ":" in line and any(
618
+ header in line
619
+ for header in [
620
+ "major_brand",
621
+ "minor_version",
622
+ "creation_time",
623
+ "handler_name",
624
+ "vendor_id",
625
+ "Duration",
626
+ "bitrate",
627
+ ]
628
+ ):
629
+ continue
630
+
631
+ logger.debug(f"ffmpeg: {line.strip()}")
632
+
633
+ # Wait for the process to complete and check the return code
634
+ return_code = process.wait()
635
+ if return_code != 0:
636
+ logger.bind(return_code=return_code, operation=operation_name).error(
637
+ f"ffmpeg exited with code: {return_code} for {operation_name}"
638
+ )
639
+ return False
640
+
641
+ logger.bind(operation=operation_name).debug(
642
+ f"{operation_name} completed successfully"
643
+ )
644
+ return True
645
+
646
+ except Exception as e:
647
+ logger.bind(error=str(e), operation=operation_name).error(
648
+ f"error executing ffmpeg command for {operation_name}"
649
+ )
650
+ return False
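The "time=" extraction above, demonstrated on a typical ffmpeg stderr line (the line itself is fabricated):

    # Standalone illustration of the progress parsing.
    line = "frame= 120 fps= 30 q=28.0 size= 512kB time=00:00:04.00 bitrate=1048.6kbits/s speed=1.2x"
    time_str = line.split("time=")[1].split(" ")[0]        # "00:00:04.00"
    h, m, s = time_str.split(":")
    seconds = float(h) * 3600 + float(m) * 60 + float(s)   # 4.0
    print(f"{min(100, seconds / 10.0 * 100):.1f}% of a 10s render")  # 40.0%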
651
+
652
+ def execute_ffprobe_command(
653
+ self, cmd: list, operation_name: str
654
+ ) -> tuple[bool, str, str]:
655
+ """
656
+ Execute an ffprobe command with proper logging.
657
+
658
+ Args:
659
+ cmd: The ffprobe command as a list
660
+ operation_name: Name of the operation for logging
661
+
662
+ Returns:
663
+ tuple: (success, stdout, stderr)
664
+ """
665
+ try:
666
+ logger.bind(command=" ".join(cmd), operation=operation_name).debug(
667
+ f"executing ffprobe command for {operation_name}"
668
+ )
669
+
670
+ process = subprocess.Popen(
671
+ cmd,
672
+ stdout=subprocess.PIPE,
673
+ stderr=subprocess.PIPE,
674
+ text=True,
675
+ )
676
+ stdout, stderr = process.communicate()
677
+
678
+ if process.returncode != 0:
679
+ logger.bind(stderr=stderr, operation=operation_name).error(
680
+ f"ffprobe failed for {operation_name}"
681
+ )
682
+ return False, stdout, stderr
683
+
684
+ logger.bind(operation=operation_name).debug(
685
+ f"{operation_name} completed successfully"
686
+ )
687
+ return True, stdout, stderr
688
+
689
+ except Exception as e:
690
+ logger.bind(error=str(e), operation=operation_name).error(
691
+ f"error executing ffprobe command for {operation_name}"
692
+ )
693
+ return False, "", str(e)
694
+
695
+ @staticmethod
696
+ def is_hex_color(color: str) -> bool:
697
+ """
698
+ Checks if the given color string is a valid hex color.
699
+
700
+ Args:
701
+ color: Color string to check
702
+
703
+ Returns:
704
+ bool: True if it's a hex color, False otherwise
705
+ """
706
+ return len(color.lstrip("#")) in (3, 6) and all(
707
+ c in "0123456789abcdefABCDEF" for c in color.lstrip("#")
708
+ )
709
+
710
+ def colorkey_overlay(
711
+ self,
712
+ input_video_path: str,
713
+ overlay_video_path: str,
714
+ output_video_path: str,
715
+ color: str = "green",
716
+ similarity: float = 0.1,
717
+ blend: float = 0.1,
718
+ ):
719
+ """
720
+ Applies a colorkey overlay to a video using FFmpeg.
721
+ """
722
+
723
+ """
724
+ ffmpeg -i input.mp4 -stream_loop -1 -i black_dust.mp4 \
725
+ -filter_complex "[1]colorkey=0x000000:0.1:0.1[ckout];[0][ckout]overlay" \
726
+ -shortest \
727
+ -c:v libx264 -preset ultrafast -crf 18 \
728
+ -c:a copy \
729
+ output.mp4
730
+ """
731
+
732
+ start = time.time()
733
+ info = self.get_video_info(input_video_path)
734
+ video_duration = info.get("duration", 0)
735
+
736
+ if not video_duration:
737
+ logger.error("failed to get video duration from input video")
738
+ return False
739
+
740
+ color = color.lstrip("#")
741
+ if self.is_hex_color(color):
742
+ color = f"0x{color.upper()}"
743
+
744
+ context_logger = logger.bind(
745
+ input_video_path=input_video_path,
746
+ overlay_video_path=overlay_video_path,
747
+ output_video_path=output_video_path,
748
+ video_duration=video_duration,
749
+ color=color,
750
+ similarity=similarity,
751
+ blend=blend,
752
+ )
753
+ context_logger.debug("Starting colorkey overlay process")
754
+
755
+ context_logger = context_logger.bind(
756
+ video_duration=video_duration,
757
+ )
758
+
759
+ cmd = [
760
+ self.ffmpeg_path, "-y",
761
+ "-i", input_video_path,
762
+ "-stream_loop", "-1",
763
+ "-i", overlay_video_path,
764
+ "-filter_complex", f"[1:v]colorkey={color}:{similarity}:{blend}[ckout];[0:v][ckout]overlay=eof_action=repeat[v]",
765
+ "-map", "[v]",
766
+ "-map", "0:a",
767
+ "-c:v", "libx264",
768
+ "-preset", "ultrafast",
769
+ "-crf", "18",
770
+ "-c:a", "copy",
771
+ "-t", f"{video_duration}s",
772
+ output_video_path,
773
+ ]
774
+
775
+ try:
776
+ success = self.execute_ffmpeg_command(
777
+ cmd,
778
+ "add colorkey overlay to video",
779
+ expected_duration=video_duration,
780
+ show_progress=True,
781
+ )
782
+
783
+ if success:
784
+ context_logger.bind(execution_time=time.time() - start).debug(
785
+ "colorkey overlay added successfully",
786
+ )
787
+ return True
788
+ else:
789
+ context_logger.error("ffmpeg failed to create colorkey overlay")
790
+ return False
791
+
792
+ except Exception as e:
793
+ context_logger.bind(error=str(e)).error(
794
+ "error adding colorkey overlay to video",
795
+ )
796
+ return False
797
+
798
+ def convert_pcm_to_wav(
799
+ self,
800
+ input_pcm_path: str,
801
+ output_wav_path: str,
802
+ sample_rate: int = 24000,
803
+ channels: int = 1,
804
+ target_sample_rate: int = 44100,
805
+ ) -> bool:
806
+ """
807
+ ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm -ar 44100 -ac 2 out_44k_stereo.wav
808
+ """
809
+ start = time.time()
810
+ context_logger = logger.bind(
811
+ input_pcm_path=input_pcm_path,
812
+ output_wav_path=output_wav_path,
813
+ sample_rate=sample_rate,
814
+ channels=channels,
815
+ target_sample_rate=target_sample_rate,
816
+ )
817
+ context_logger.debug("Starting PCM to WAV conversion")
818
+
819
+ cmd = [
820
+ self.ffmpeg_path, "-y",
821
+ "-f", "s16le",
822
+ "-ar", str(sample_rate),
823
+ "-ac", str(channels),
824
+ "-i", input_pcm_path,
825
+ "-ar", str(target_sample_rate),
826
+ "-ac", "2", # Convert to stereo
827
+ output_wav_path,
828
+ ]
829
+
830
+ try:
831
+ success = self.execute_ffmpeg_command(
832
+ cmd,
833
+ "convert PCM to WAV",
834
+ show_progress=False,
835
+ )
836
+
837
+ if success:
838
+ context_logger.bind(execution_time=time.time() - start).debug(
839
+ "PCM to WAV conversion successful",
840
+ )
841
+ return True
842
+ else:
843
+ context_logger.error("ffmpeg failed to convert PCM to WAV")
844
+ return False
845
+
846
+ except Exception as e:
847
+ context_logger.bind(error=str(e)).error(
848
+ "error converting PCM to WAV",
849
+ )
850
+ return False
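A short usage sketch of the helpers above. The wrapper's class and constructor are not shown in this diff, so `ff` here stands for an already-constructed instance; only the method names, arguments, and return shapes come from the code above.

    # Probe a file's duration with execute_ffprobe_command; ffprobe prints just the number.
    ok, stdout, _stderr = ff.execute_ffprobe_command(
        [
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            "input.mp4",
        ],
        "probe duration",
    )
    if ok:
        print(f"duration: {float(stdout.strip()):.2f}s")

    # Key out a green-screen overlay; hex colors are normalized to 0xRRGGBB internally.
    ff.colorkey_overlay("input.mp4", "overlay.mp4", "output.mp4", color="#00FF00")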
video/storage.py ADDED
@@ -0,0 +1,323 @@
1
+ from typing import Tuple
2
+ import uuid
3
+ import os
4
+ import requests
5
+
6
+
7
+ class MediaType:
8
+ IMAGE = "image"
9
+ VIDEO = "video"
10
+ AUDIO = "audio"
11
+ TMP = "tmp"
12
+
13
+
14
+ class Storage:
15
+ def __init__(self, storage_path):
16
+ self.storage_path = storage_path
17
+ os.makedirs(self.storage_path, exist_ok=True)
18
+ # make all the subdirectories for the media types
19
+ for media_type in [
20
+ MediaType.IMAGE,
21
+ MediaType.VIDEO,
22
+ MediaType.AUDIO,
23
+ MediaType.TMP,
24
+ ]:
25
+ os.makedirs(os.path.join(self.storage_path, media_type), exist_ok=True)
26
+
27
+ def _validate_media_id(self, media_id: str) -> tuple[str, str]:
28
+ """
29
+ Validates and parses a media ID to prevent path traversal attacks.
30
+
31
+ Args:
32
+ media_id (str): Media ID to validate
33
+
34
+ Returns:
35
+ tuple[str, str]: (media_type, filename)
36
+
37
+ Raises:
38
+ ValueError: If media_id is invalid or contains path traversal attempts
39
+ """
40
+ if not media_id or "_" not in media_id:
41
+ raise ValueError("Invalid media ID format")
42
+
43
+ media_type, filename = media_id.split("_", 1)
44
+
45
+ # Validate media type
46
+ valid_types = [MediaType.IMAGE, MediaType.VIDEO, MediaType.AUDIO, MediaType.TMP]
47
+ if media_type not in valid_types:
48
+ raise ValueError(f"Invalid media type: {media_type}")
49
+
50
+ # Prevent path traversal by checking for dangerous patterns
51
+ if ".." in filename or "/" in filename or "\\" in filename:
52
+ raise ValueError(
53
+ "Filename contains invalid characters or path traversal attempt"
54
+ )
55
+
56
+ # Additional validation: filename should not be empty and should be reasonable
57
+ if not filename or len(filename) > 255:
58
+ raise ValueError("Invalid filename")
59
+
60
+ return media_type, filename
61
+
62
+ def _get_safe_file_path(self, media_id: str) -> str:
63
+ """
64
+ Gets a safe file path for the given media ID after validation.
65
+
66
+ Args:
67
+ media_id (str): Media ID to get path for
68
+
69
+ Returns:
70
+ str: Safe file path
71
+ """
72
+ media_type, filename = self._validate_media_id(media_id)
73
+ file_path = os.path.join(self.storage_path, media_type, filename)
74
+
75
+ # Double-check that the resolved path is within the storage directory
76
+ resolved_path = os.path.abspath(file_path)
77
+ storage_abs_path = os.path.abspath(self.storage_path)
78
+
79
+ if not resolved_path.startswith(storage_abs_path):
80
+ raise ValueError("Path traversal attempt detected")
81
+
82
+ return file_path
83
+
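+ # Illustrative examples (not part of the original commit):
+ # _get_safe_file_path("image_abc.png") -> "<storage_path>/image/abc.png"
+ # _get_safe_file_path("image_../etc/passwd") -> raises ValueError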
84
+ def upload_media(
85
+ self, media_type: MediaType, media_data: bytes, file_extension: str = ""
86
+ ) -> str:
87
+ """
88
+ Uploads media to the server.
89
+
90
+ Args:
91
+ media_type (str): Type of media, e.g., 'image' or 'video'.
92
+ media_data (bytes): Binary data of the media file.
93
+ file_extension (str): File extension, e.g., '.jpg', '.mp4', '.wav'.
94
+
95
+ Returns:
96
+ str: Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.
97
+ """
98
+ # Validate media type
99
+ valid_types = [MediaType.IMAGE, MediaType.VIDEO, MediaType.AUDIO, MediaType.TMP]
100
+ if media_type not in valid_types:
101
+ raise ValueError(f"Invalid media type: {media_type}")
102
+
103
+ # Validate file extension to prevent path traversal
104
+ if file_extension and (
105
+ ".." in file_extension or "/" in file_extension or "\\" in file_extension
106
+ ):
107
+ raise ValueError("File extension contains invalid characters")
108
+
109
+ asset_id = str(uuid.uuid4())
110
+ filename = f"{asset_id}{file_extension}" if file_extension else asset_id
111
+ file_path = os.path.join(self.storage_path, media_type, filename)
112
+
113
+ # Additional safety check
114
+ resolved_path = os.path.abspath(file_path)
115
+ storage_abs_path = os.path.abspath(self.storage_path)
116
+ if not resolved_path.startswith(storage_abs_path):
117
+ raise ValueError("Path traversal attempt detected")
118
+
119
+ with open(file_path, "wb") as f:
120
+ f.write(media_data)
121
+
122
+ media_id = f"{media_type}_{filename}"
123
+ return media_id
124
+
125
+ def get_media(self, media_id: str) -> bytes:
126
+ """
127
+ Retrieves media by ID.
128
+
129
+ Args:
130
+ media_id (str): Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.
131
+
132
+ Returns:
133
+ bytes: Binary data of the media file.
134
+ """
135
+ file_path = self._get_safe_file_path(media_id)
136
+
137
+ if not os.path.exists(file_path):
138
+ raise FileNotFoundError(f"Media file {media_id} not found.")
139
+
140
+ with open(file_path, "rb") as f:
141
+ return f.read()
142
+
143
+ def delete_media(self, media_id: str) -> None:
144
+ """
145
+ Deletes media by ID.
146
+
147
+ Args:
148
+ media_id (str): Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.
149
+ """
150
+ file_path = self._get_safe_file_path(media_id)
151
+
152
+ if os.path.exists(file_path):
153
+ os.remove(file_path)
154
+ else:
155
+ raise FileNotFoundError(f"Media file {media_id} not found.")
156
+
157
+ def media_exists(self, media_id: str) -> bool:
158
+ """
159
+ Checks if media exists by ID.
160
+
161
+ Args:
162
+ media_id (str): Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.
163
+
164
+ Returns:
165
+ bool: True if media exists, False otherwise.
166
+ """
167
+ try:
168
+ file_path = self._get_safe_file_path(media_id)
169
+ return os.path.exists(file_path)
170
+ except ValueError:
171
+ return False
172
+
173
+ def get_media_path(self, media_id: str) -> str:
174
+ """
175
+ Gets the file path of the media by ID.
176
+
177
+ Args:
178
+ media_id (str): Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.
179
+
180
+ Returns:
181
+ str: Full file path of the media.
182
+ """
183
+ return self._get_safe_file_path(media_id)
184
+
185
+ ### untested
186
+ def create_media_filename(
187
+ self, media_type: MediaType, file_extension: str = ""
188
+ ) -> str:
189
+ # Validate media type
190
+ valid_types = [MediaType.IMAGE, MediaType.VIDEO, MediaType.AUDIO, MediaType.TMP]
191
+ if media_type not in valid_types:
192
+ raise ValueError(f"Invalid media type: {media_type}")
193
+
194
+ # Validate file extension to prevent path traversal
195
+ if file_extension and (
196
+ ".." in file_extension or "/" in file_extension or "\\" in file_extension
197
+ ):
198
+ raise ValueError("File extension contains invalid characters")
199
+
200
+ asset_id = str(uuid.uuid4())
201
+ filename = f"{asset_id}{file_extension}" if file_extension else asset_id
202
+ return f"{media_type}_{filename}"
203
+
204
+ def create_media_filename_with_id(
205
+ self, media_type: MediaType, file_extension: str = ""
206
+ ) -> Tuple[str, str]:
207
+ file_id = self.create_media_filename(media_type, file_extension)
208
+ return file_id, self.get_media_path(file_id)
209
+
210
+ def create_media_template(
211
+ self, media_type: MediaType, file_extension: str
212
+ ) -> Tuple[str, str]:
213
+ """
214
+ Creates a media template filename for the given media type and file extension.
215
+ Args:
216
+ media_type (MediaType): Type of media, e.g., MediaType.IMAGE.
217
+ file_extension (str): File extension, e.g., '.jpg', '.mp4'.
218
+
219
+ Returns:
220
+ tuple[str, str]: (template filename with a %02d sequence placeholder, full file path)
221
+ """
222
+ if not file_extension.startswith("."):
223
+ file_extension = "." + file_extension
224
+
225
+ valid_types = [MediaType.IMAGE, MediaType.VIDEO, MediaType.AUDIO, MediaType.TMP]
226
+ if media_type not in valid_types:
227
+ raise ValueError(f"Invalid media type: {media_type}")
228
+
229
+ if file_extension and (
230
+ ".." in file_extension or "/" in file_extension or "\\" in file_extension
231
+ ):
232
+ raise ValueError("File extension contains invalid characters")
233
+
234
+ asset_id = str(uuid.uuid4())
235
+ filename = f"{asset_id}-%02d{file_extension}" if file_extension else f"{asset_id}-%02d"
236
+ file_path = os.path.join(
237
+ self.storage_path, media_type, filename
238
+ )
239
+ return filename, file_path
240
+
241
+
242
+ def create_tmp_file_id(self, media_id: str) -> str:
243
+ """
244
+ Creates a temporary filename for media upload.
245
+
246
+ Args:
247
+ media_id (str): Media ID to create a temporary filename for.
248
+
249
+ Returns:
250
+ str: Temporary media ID.
251
+ """
252
+ return f"{media_id}.tmp"
253
+
254
+ def create_tmp_file(self, media_id: str) -> str:
255
+ """
256
+ Creates a temporary file for media upload.
257
+
258
+ Args:
259
+ media_id (str): Media ID to create a temporary file for.
260
+
261
+ Returns:
262
+ str: Temporary media ID.
263
+ """
264
+ tmp_id = f"{media_id}.tmp"
265
+ tmp_path = self.get_media_path(tmp_id)
266
+
267
+ with open(tmp_path, "wb") as f:
268
+ pass
269
+ return tmp_id
270
+
271
+ def get_media_type(self, media_id: str) -> MediaType:
272
+ """
273
+ Gets the media type of the given media ID.
274
+
275
+ Args:
276
+ media_id (str): Media ID to get the type for.
277
+
278
+ Returns:
279
+ MediaType: The type of the media.
280
+ """
281
+ media_type, _ = self._validate_media_id(media_id)
282
+ return media_type
283
+
284
+ def is_valid_url(self, url: str) -> bool:
285
+ """
286
+ Validates a URL to ensure it is well-formed.
287
+
288
+ Args:
289
+ url (str): The URL to validate.
290
+
291
+ Returns:
292
+ bool: True if the URL is valid, False otherwise.
293
+ """
294
+ from urllib.parse import urlparse
295
+
296
+ try:
297
+ result = urlparse(url)
298
+ return all([result.scheme, result.netloc])
299
+ except Exception:
300
+ return False
301
+
302
+ def upload_media_from_url(
303
+ self, media_type: MediaType, url: str
304
+ ) -> str:
305
+ """
306
+ Uploads media from a URL.
307
+
308
+ Args:
309
+ media_type (MediaType): Type of media, e.g., MediaType.IMAGE.
310
+ url (str): URL of the media file.
311
+
312
+ Returns:
313
+ str: Media ID, e.g., 'image_12345.jpg'.
314
+ """
315
+ if not self.is_valid_url(url):
316
+ raise ValueError("Invalid URL")
317
+
318
+ response = requests.get(url, timeout=30)  # avoid hanging indefinitely on slow hosts
319
+ if response.status_code != 200:
320
+ raise ValueError(f"Failed to download media from {url}")
321
+
322
+ from urllib.parse import urlparse
+ # Take the extension from the URL path so query strings don't leak into it
+ file_extension = os.path.splitext(urlparse(url).path)[1]
323
+ return self.upload_media(media_type, response.content, file_extension)
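A short usage sketch of `Storage` (the byte payload and paths are illustrative; the class, methods, and the `"{type}_{uuid}{ext}"` ID shape come from the code above):

    from video.storage import Storage, MediaType

    storage = Storage("./media")
    media_id = storage.upload_media(MediaType.IMAGE, b"\x89PNG...", ".png")
    # media_id looks like "image_<uuid>.png"
    local_path = storage.get_media_path(media_id)
    assert storage.media_exists(media_id)
    storage.delete_media(media_id)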
video/stt.py ADDED
@@ -0,0 +1,41 @@
1
+ from faster_whisper import WhisperModel
2
+ from loguru import logger
3
+ from video.config import device, whisper_model, whisper_compute_type
4
+
5
+
6
+ class STT:
7
+ def __init__(self):
8
+ self.model = WhisperModel(
9
+ model_size_or_path=whisper_model,
10
+ compute_type=whisper_compute_type
11
+ )
12
+
13
+ def transcribe(self, audio_path, language=None, beam_size=5):
14
+ logger.bind(
15
+ device=device.type,
16
+ model_size=whisper_model,
17
+ compute_type=whisper_compute_type,
18
+ audio_path=audio_path,
19
+ language=language,
20
+ ).debug(
21
+ "transcribing audio with Whisper model",
22
+ )
23
+ segments, info = self.model.transcribe(
24
+ audio_path,
25
+ beam_size=beam_size,
26
+ word_timestamps=True,
27
+ language=language,
28
+ )
29
+
30
+ duration = info.duration
31
+ captions = []
32
+ for segment in segments:
33
+ for word in segment.words:
34
+ captions.append(
35
+ {
36
+ "text": word.word,
37
+ "start_ts": word.start,
38
+ "end_ts": word.end,
39
+ }
40
+ )
41
+ return captions, duration
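A short usage sketch of `STT` (the audio path is illustrative; the class, the `transcribe` signature, and the caption shape come from the code above):

    from video.stt import STT

    stt = STT()
    captions, duration = stt.transcribe("clip.wav", language="en")
    # captions is a list of word-level dicts, e.g.
    # [{"text": " hello", "start_ts": 0.0, "end_ts": 0.42}, ...]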
video/tts.py ADDED
@@ -0,0 +1,443 @@
1
+ import re
2
+ import time
3
+ import warnings
4
+ from typing import List
5
+ from kokoro import KPipeline
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from loguru import logger
9
+ import torchaudio as ta
10
+ from chatterbox.tts import ChatterboxTTS
11
+ from video.config import device
12
+
13
+ # Suppress PyTorch warnings
14
+ warnings.filterwarnings("ignore")
15
+
16
+ LANGUAGE_CONFIG = {
17
+ "en-us": {
18
+ "lang_code": "a",
19
+ "international": False,
20
+ "iso639_1": "en",
21
+ },
22
+ "en": {
23
+ "lang_code": "a",
24
+ "international": False,
25
+ "iso639_1": "en",
26
+ },
27
+ "en-gb": {
28
+ "lang_code": "b",
29
+ "international": False,
30
+ "iso639_1": "en",
31
+ },
32
+ "es": {"lang_code": "e", "international": True, "iso639_1": "es"},
33
+ "fr": {"lang_code": "f", "international": True, "iso639_1": "fr"},
34
+ "hi": {"lang_code": "h", "international": True, "iso639_1": "hi"},
35
+ "it": {"lang_code": "i", "international": True, "iso639_1": "it"},
36
+ "pt": {"lang_code": "p", "international": True, "iso639_1": "pt"},
37
+ "ja": {"lang_code": "j", "international": True, "iso639_1": "ja"},
38
+ "zh": {"lang_code": "z", "international": True, "iso639_1": "zh"},
39
+ }
40
+ LANGUAGE_VOICE_CONFIG = {
41
+ "en-us": [
42
+ "af_heart",
43
+ "af_alloy",
44
+ "af_aoede",
45
+ "af_bella",
46
+ "af_jessica",
47
+ "af_kore",
48
+ "af_nicole",
49
+ "af_nova",
50
+ "af_river",
51
+ "af_sarah",
52
+ "af_sky",
53
+ "am_adam",
54
+ "am_echo",
55
+ "am_eric",
56
+ "am_fenrir",
57
+ "am_liam",
58
+ "am_michael",
59
+ "am_onyx",
60
+ "am_puck",
61
+ "am_santa",
62
+ ],
63
+ "en-gb": [
64
+ "bf_alice",
65
+ "bf_emma",
66
+ "bf_isabella",
67
+ "bf_lily",
68
+ "bm_daniel",
69
+ "bm_fable",
70
+ "bm_george",
71
+ "bm_lewis",
72
+ ],
73
+ "zh": [
74
+ "zf_xiaobei",
75
+ "zf_xiaoni",
76
+ "zf_xiaoxiao",
77
+ "zf_xiaoyi",
78
+ "zm_yunjian",
79
+ "zm_yunxi",
80
+ "zm_yunxia",
81
+ "zm_yunyang",
82
+ ],
83
+ "es": ["ef_dora", "em_alex", "em_santa"],
84
+ "fr": ["ff_siwis"],
85
+ "it": ["if_sara", "im_nicola"],
86
+ "pt": ["pf_dora", "pm_alex", "pm_santa"],
87
+ "hi": ["hf_alpha", "hf_beta", "hm_omega", "hm_psi"],
88
+ }
89
+
90
+ LANGUAGE_VOICE_MAP = {}
91
+ for lang, voices in LANGUAGE_VOICE_CONFIG.items():
92
+ for voice in voices:
93
+ if lang in LANGUAGE_CONFIG:
94
+ LANGUAGE_VOICE_MAP[voice] = LANGUAGE_CONFIG[lang]
95
+ else:
96
+ print(f"Warning: Language {lang} not found in LANGUAGE_CONFIG")
97
+
98
+
99
+ class TTS:
100
+ def break_text_into_sentences(self, text, lang_code) -> List[str]:
101
+ """
102
+ Advanced sentence splitting with better handling of abbreviations and edge cases.
103
+ """
104
+ if not text or not text.strip():
105
+ return []
106
+
107
+ # Language-specific sentence boundary patterns
108
+ patterns = {
109
+ "a": r"(?<=[.!?])\s+(?=[A-Z_])", # English
110
+ "e": r"(?<=[.!?])\s+(?=[A-ZÁÉÍÓÚÑÜ¿¡_])", # Spanish - allow inverted punctuation after boundaries
111
+ "f": r"(?<=[.!?])\s+(?=[A-ZÁÀÂÄÇÉÈÊËÏÎÔÖÙÛÜŸ_])", # French
112
+ "h": r"(?<=[।!?])\s+", # Hindi: Split after devanagari danda
113
+ "i": r"(?<=[.!?])\s+(?=[A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞß_])", # Italian
114
+ "p": r"(?<=[.!?])\s+(?=[A-ZÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ_])", # Portuguese
115
+ "z": r"(?<=[。!?])", # Chinese: Split after Chinese punctuation
116
+ }
117
+
118
+ # Common abbreviations that shouldn't trigger sentence breaks
119
+ abbreviations = {
120
+ "a": {
121
+ "Mr.",
122
+ "Mrs.",
123
+ "Ms.",
124
+ "Dr.",
125
+ "Prof.",
126
+ "Sr.",
127
+ "Jr.",
128
+ "Inc.",
129
+ "Corp.",
130
+ "Ltd.",
131
+ "Co.",
132
+ "etc.",
133
+ "vs.",
134
+ "eg.",
135
+ "i.e.",
136
+ "e.g.",
137
+ "Vol.",
138
+ "Ch.",
139
+ "Fig.",
140
+ "No.",
141
+ "p.",
142
+ "pp.",
143
+ }, # English
144
+ "e": {
145
+ "Sr.",
146
+ "Sra.",
147
+ "Dr.",
148
+ "Dra.",
149
+ "Prof.",
150
+ "etc.",
151
+ "pág.",
152
+ "art.",
153
+ "núm.",
154
+ "cap.",
155
+ "vol.",
156
+ }, # Spanish
157
+ "f": {
158
+ "M.",
159
+ "Mme.",
160
+ "Dr.",
161
+ "Prof.",
162
+ "etc.",
163
+ "art.",
164
+ "p.",
165
+ "vol.",
166
+ "ch.",
167
+ "fig.",
168
+ "n°",
169
+ }, # French
170
+ "h": {"श्री", "श्रीमती", "डॉ.", "प्रो.", "etc.", "पृ.", "अध."}, # Hindi
171
+ "i": {
172
+ "Sig.",
173
+ "Sig.ra",
174
+ "Dr.",
175
+ "Prof.",
176
+ "ecc.",
177
+ "pag.",
178
+ "art.",
179
+ "n.",
180
+ "vol.",
181
+ "cap.",
182
+ "fig.",
183
+ }, # Italian
184
+ "p": {
185
+ "Sr.",
186
+ "Sra.",
187
+ "Dr.",
188
+ "Dra.",
189
+ "Prof.",
190
+ "etc.",
191
+ "pág.",
192
+ "art.",
193
+ "n.º",
194
+ "vol.",
195
+ "cap.",
196
+ }, # Portuguese
197
+ "z": {"先生", "女士", "博士", "教授", "等等", "第", "页", "章"}, # Chinese
198
+ }
199
+
200
+ abbrevs = abbreviations.get(lang_code, set())
201
+
202
+ # Protect abbreviations by temporarily replacing them
203
+ protected_text = text
204
+ replacements = {}
205
+ for i, abbrev in enumerate(abbrevs):
206
+ placeholder = f"__ABBREV_{i}__"
207
+ protected_text = protected_text.replace(abbrev, placeholder)
208
+ replacements[placeholder] = abbrev
209
+
210
+ # Apply the regex splitting
211
+ pattern = patterns.get(lang_code, patterns["a"])
212
+ sentences = re.split(pattern, protected_text.strip())
213
+
214
+ # Restore abbreviations and clean up
215
+ restored_sentences = []
216
+ for sentence in sentences:
217
+ for placeholder, original in replacements.items():
218
+ sentence = sentence.replace(placeholder, original)
219
+ sentence = sentence.strip()
220
+ if sentence:
221
+ restored_sentences.append(sentence)
222
+
223
+ return restored_sentences if restored_sentences else [text.strip()]
224
+
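+ # Illustrative example (not part of the original commit):
+ # break_text_into_sentences("Dr. Smith arrived. He left.", "a")
+ # -> ["Dr. Smith arrived.", "He left."]; the protected "Dr." abbreviation
+ # does not trigger a split, only the real sentence boundary does.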
225
+ def kokoro_international(
226
+ self, text: str, output_path: str, voice: str, lang_code: str, speed=1
227
+ ) -> tuple[List[dict], float]:
228
+ if not text or not text.strip():
229
+ raise ValueError("Text cannot be empty or whitespace")
230
+ lang_code = LANGUAGE_VOICE_MAP.get(voice, {}).get("lang_code")
231
+ if not lang_code:
232
+ raise ValueError(f"Voice '{voice}' not found in LANGUAGE_VOICE_MAP")
233
+ start = time.time()
234
+ context_logger = logger.bind(
235
+ voice=voice,
236
+ speed=speed,
237
+ text_length=len(text),
238
+ )
239
+ context_logger.debug("Starting TTS generation (international) with kokoro")
240
+ sentences = self.break_text_into_sentences(text, lang_code)
241
+ context_logger.bind(
242
+ sentences=sentences,
243
+ num_sentences=len(sentences),
244
+ ).debug("Text split into sentences")
246
+
247
+ # generate the audio for each sentence
248
+ audio_data = []
249
+ captions = []
250
+ full_audio_length = 0
251
+ pipeline = KPipeline(lang_code=lang_code, repo_id="hexgrad/Kokoro-82M", device=device)
252
+ for sentence in sentences:
253
+ context_logger.bind(
254
+ sentence=sentence,
255
+ voice=voice,
256
+ speed=speed,
257
+ ).debug("Processing sentence")
259
+ generator = pipeline(sentence, voice=voice, speed=speed)
260
+
261
+ for i, result in enumerate(generator):
262
+ context_logger.debug(
263
+ "Generated audio for sentence",
264
+ )
265
+ data = result.audio
266
+ audio_length = len(data) / 24000
267
+ audio_data.append(data)
268
+ # since there are no tokens, we can just use the sentence as the text
269
+ captions.append(
270
+ {
271
+ "text": sentence,
272
+ "start_ts": full_audio_length,
273
+ "end_ts": full_audio_length + audio_length,
274
+ }
275
+ )
276
+ full_audio_length += audio_length
277
+
278
+ context_logger = context_logger.bind(
279
+ execution_time=time.time() - start,
280
+ audio_length=full_audio_length,
281
+ speedup=full_audio_length / (time.time() - start),
282
+ )
283
+ context_logger.debug(
284
+ "TTS generation (international) completed with kokoro",
285
+ )
286
+
287
+ audio_data = np.concatenate(audio_data)
288
+ audio_data = np.column_stack((audio_data, audio_data))
289
+ sf.write(output_path, audio_data, 24000, format="WAV")
290
+ return captions, full_audio_length
291
+
292
+ def kokoro_english(
293
+ self, text: str, output_path: str, voice="af_heart", speed=1
294
+ ) -> tuple[List[dict], float]:
295
+ if not text or not text.strip():
296
+ raise ValueError("Text cannot be empty or whitespace")
297
+ lang_code = LANGUAGE_VOICE_MAP.get(voice, {}).get("lang_code")
298
+ if not lang_code:
299
+ raise ValueError(f"Voice '{voice}' not found in LANGUAGE_VOICE_MAP")
300
+ if lang_code != "a":
301
+ raise NotImplementedError(
302
+ f"TTS for language code '{lang_code}' is not implemented."
303
+ )
304
+ start = time.time()
305
+
306
+ context_logger = logger.bind(
307
+ voice=voice,
308
+ speed=speed,
309
+ text_length=len(text),
310
+ device=device.type,
311
+ )
312
+
313
+ context_logger.debug("Starting TTS generation with kokoro")
314
+ pipeline = KPipeline(lang_code=lang_code, repo_id="hexgrad/Kokoro-82M", device=device.type)
317
+
318
+ generator = pipeline(text, voice=voice, speed=speed)
319
+
320
+ captions = []
321
+ audio_data = []
322
+ full_audio_length = 0
323
+ for _, result in enumerate(generator):
324
+ data = result.audio
325
+ audio_length = len(data) / 24000
326
+ audio_data.append(data)
327
+ if result.tokens:
328
+ tokens = result.tokens
329
+ for t in tokens:
330
+ if t.start_ts is None or t.end_ts is None:
331
+ if captions:
332
+ captions[-1]["text"] += t.text
333
+ captions[-1]["end_ts"] = full_audio_length + audio_length
334
+ continue
335
+ try:
336
+ captions.append(
337
+ {
338
+ "text": t.text,
339
+ "start_ts": full_audio_length + t.start_ts,
340
+ "end_ts": full_audio_length + t.end_ts,
341
+ }
342
+ )
343
+ except Exception as e:
344
+ logger.error(
345
+ "Error processing token: {}, Error: {}",
346
+ t,
347
+ e,
348
+ )
349
+ raise ValueError(f"Error processing token: {t}, Error: {e}")
350
+ full_audio_length += audio_length
351
+
352
+ audio_data = np.concatenate(audio_data)
353
+ audio_data = np.column_stack((audio_data, audio_data))
354
+ sf.write(output_path, audio_data, 24000, format="WAV")
355
+ context_logger.bind(
356
+ execution_time=time.time() - start,
357
+ audio_length=full_audio_length,
358
+ speedup=full_audio_length / (time.time() - start),
359
+ youtube_channel="https://www.youtube.com/@aiagentsaz"
360
+ ).debug(
361
+ "TTS generation completed with kokoro",
362
+ )
363
+ return captions, full_audio_length
364
+
365
+ def kokoro(
366
+ self, text: str, output_path: str, voice="af_heart", speed=1
367
+ ) -> tuple[List[dict], float]:
368
+ if not text or not text.strip():
369
+ raise ValueError("Text cannot be empty or whitespace")
370
+ lang_code = LANGUAGE_VOICE_MAP.get(voice, {}).get("lang_code")
371
+ if not lang_code:
372
+ raise ValueError(f"Voice '{voice}' not found in LANGUAGE_VOICE_MAP")
373
+ if lang_code == "a":
374
+ return self.kokoro_english(text, output_path, voice, speed)
375
+ else:
376
+ return self.kokoro_international(text, output_path, voice, lang_code, speed)
377
+
378
+ def chatterbox(
379
+ self,
380
+ text: str,
381
+ output_path: str,
382
+ sample_audio_path: str = None,
383
+ exaggeration=0.5,
384
+ cfg_weight=0.5,
385
+ temperature=0.8,
386
+ ):
387
+ start = time.time()
388
+ context_logger = logger.bind(
389
+ text_length=len(text),
390
+ sample_audio_path=sample_audio_path,
391
+ exaggeration=exaggeration,
392
+ cfg_weight=cfg_weight,
393
+ temperature=temperature,
394
+ model="ChatterboxTTS",
395
+ language="en-US",
396
+ device=device.type,
397
+ )
398
+ context_logger.debug("starting TTS generation with Chatterbox")
399
+ model = ChatterboxTTS.from_pretrained(device=device.type)
400
+
401
+ if sample_audio_path:
402
+ wav = model.generate(
403
+ text,
404
+ audio_prompt_path=sample_audio_path,
405
+ exaggeration=exaggeration,
406
+ cfg_weight=cfg_weight,
407
+ temperature=temperature,
408
+ )
409
+ else:
410
+ wav = model.generate(
411
+ text,
412
+ exaggeration=exaggeration,
413
+ cfg_weight=cfg_weight,
414
+ temperature=temperature,
415
+ )
416
+
417
+ if wav.dim() == 2 and wav.shape[0] == 1:
418
+ wav = wav.repeat(2, 1)
419
+ elif wav.dim() == 1:
420
+ wav = wav.unsqueeze(0).repeat(2, 1)
421
+
422
+ audio_length = wav.shape[1] / model.sr
423
+ ta.save(output_path, wav, model.sr)
424
+ context_logger.bind(
425
+ execution_time=time.time() - start,
426
+ audio_length=audio_length,
427
+ speedup=audio_length / (time.time() - start),
428
+ youtube_channel="https://www.youtube.com/@aiagentsaz"
429
+ ).debug(
430
+ "TTS generation with Chatterbox completed",
431
+ )
432
+
433
+ def valid_kokoro_voices(self, lang_code=None) -> List[str]:
434
+ """
435
+ Returns a list of valid voices for the given language code.
436
+ If no language code is provided, returns all voices.
437
+ """
438
+ if lang_code:
439
+ return LANGUAGE_VOICE_CONFIG.get(lang_code, [])
440
+ else:
441
+ return [
442
+ voice for voices in LANGUAGE_VOICE_CONFIG.values() for voice in voices
443
+ ]
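A short usage sketch of the Kokoro entry point (the text and output path are illustrative). `kokoro` picks the English or international path from the voice's language code and returns the captions plus the audio length in seconds:

    from video.tts import TTS

    tts = TTS()
    captions, audio_length = tts.kokoro(
        "Hello world. This is a test.",
        "narration.wav",
        voice="af_heart",  # an en-us voice, so the English path is used
        speed=1,
    )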
video/tts_chatterbox.py ADDED
@@ -0,0 +1,256 @@
1
+ import os
2
+ import time
3
+ import traceback
4
+ import warnings
5
+ from loguru import logger
6
+ import torchaudio as ta
7
+ from chatterbox.tts import ChatterboxTTS
8
+ from video.config import device
9
+ import nltk
10
+ import torch
11
+ from typing import List, Optional
12
+
13
+ # Suppress PyTorch warnings
14
+ warnings.filterwarnings("ignore")
15
+
16
+ class TTSChatterbox:
17
+ def __init__(self):
18
+ """Initialize ChatterboxTTS and ensure NLTK data is available."""
19
+ self.ensure_nltk_data()
20
+ logger.debug("ChatterboxTTS initialized")
21
+
22
+ def ensure_nltk_data(self):
23
+ """Ensure NLTK punkt tokenizer is available."""
24
+ try:
25
+ nltk.data.find('tokenizers/punkt')
26
+ nltk.data.find('tokenizers/punkt_tab')
27
+ logger.debug("NLTK punkt tokenizer found")
28
+ except LookupError:
29
+ logger.debug("Downloading NLTK punkt tokenizer...")
30
+ try:
31
+ nltk.download('punkt', quiet=True)
32
+ nltk.download('punkt_tab', quiet=True)
33
+ logger.debug("NLTK punkt tokenizer downloaded successfully")
34
+ except Exception as e:
35
+ logger.error(f"Failed to download NLTK punkt tokenizer: {e}")
36
+ raise
37
+
38
+ def split_text_into_chunks(self, text: str, max_chars_per_chunk: int = 300) -> List[str]:
39
+ """Split text into chunks respecting sentence boundaries without breaking sentences."""
40
+ try:
41
+ sentences = nltk.sent_tokenize(text)
42
+ # Filter out empty sentences and strip whitespace
43
+ sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
44
+
45
+ chunks = []
46
+ current_chunk = ""
47
+
48
+ for sentence in sentences:
49
+ # If adding this sentence would exceed the limit, finalize current chunk
50
+ if current_chunk and len(current_chunk) + len(sentence) + 1 > max_chars_per_chunk:
51
+ chunks.append(current_chunk.strip())
52
+ current_chunk = sentence
53
+ else:
54
+ # Add sentence to current chunk
55
+ if current_chunk:
56
+ current_chunk += " " + sentence
57
+ else:
58
+ current_chunk = sentence
59
+
60
+ # Add the last chunk if it's not empty
61
+ if current_chunk.strip():
62
+ chunks.append(current_chunk.strip())
63
+
64
+ logger.debug(f"Text split into {len(chunks)} chunks (max {max_chars_per_chunk} chars each, preserving sentences)")
65
+ return chunks
66
+ except Exception as e:
67
+ logger.error(f"Error splitting text: {e}")
68
+ # Fallback: return original text as single chunk
69
+ return [text]
70
+
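+ # Illustrative example (not part of the original commit): with
+ # max_chars_per_chunk=40, "One short sentence. Another short sentence here."
+ # -> ["One short sentence.", "Another short sentence here."]; sentences are
+ # grouped up to the limit but never split internally.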
71
+ def generate_audio_chunk(
72
+ self,
73
+ text_chunk: str,
74
+ model: ChatterboxTTS,
75
+ audio_prompt_path: Optional[str] = None,
76
+ temperature: float = 0.8,
77
+ cfg_weight: float = 0.5,
78
+ exaggeration: float = 0.5
79
+ ) -> Optional[torch.Tensor]:
80
+ """Generate audio tensor for a single text chunk."""
81
+ try:
82
+ logger.debug(f"Generating audio for chunk: {text_chunk[:50]}...")
83
+
84
+
85
+ # Check if audio prompt exists
86
+ effective_prompt_path = None
87
+ if audio_prompt_path and os.path.exists(audio_prompt_path):
88
+ effective_prompt_path = audio_prompt_path
89
+ elif audio_prompt_path:
90
+ logger.warning(f"Audio prompt path not found: {audio_prompt_path}")
91
+
92
+ # Generate audio
93
+ wav_tensor = model.generate(
94
+ text_chunk,
95
+ audio_prompt_path=effective_prompt_path,
96
+ temperature=temperature,
97
+ cfg_weight=cfg_weight,
98
+ exaggeration=exaggeration
99
+ )
100
+
101
+ # Ensure tensor is on CPU and properly shaped
102
+ wav_tensor_cpu = wav_tensor.cpu().float()
103
+
104
+ # Ensure tensor is 2D: [channels, samples]
105
+ if wav_tensor_cpu.ndim == 1:
106
+ wav_tensor_cpu = wav_tensor_cpu.unsqueeze(0)
107
+ elif wav_tensor_cpu.ndim > 2:
108
+ logger.warning(f"Unexpected tensor shape {wav_tensor_cpu.shape}, attempting to fix")
109
+ wav_tensor_cpu = wav_tensor_cpu.squeeze()
110
+ if wav_tensor_cpu.ndim == 1:
111
+ wav_tensor_cpu = wav_tensor_cpu.unsqueeze(0)
112
+ elif wav_tensor_cpu.ndim != 2 or wav_tensor_cpu.shape[0] != 1:
113
+ logger.error(f"Could not reshape tensor {wav_tensor.shape} to [1, N]")
114
+ return None
115
+
116
+ return wav_tensor_cpu
117
+
118
+ except Exception as e:
119
+ logger.error(f"Error generating audio chunk: {e}")
120
+ logger.error(traceback.format_exc())
121
+ return None
122
+
123
+ def text_to_speech_pipeline(
124
+ self,
125
+ text: str,
126
+ model: ChatterboxTTS,
127
+ max_chars_per_chunk: int = 1024,
128
+ inter_chunk_silence_ms: int = 350,
129
+ audio_prompt_path: Optional[str] = None,
130
+ temperature: float = 0.8,
131
+ cfg_weight: float = 0.5,
132
+ exaggeration: float = 0.5
133
+ ) -> Optional[torch.Tensor]:
134
+ """Convert text to speech with chunking support."""
135
+ try:
136
+ # Split text into chunks
137
+ text_chunks = self.split_text_into_chunks(text, max_chars_per_chunk)
138
+
139
+ if not text_chunks:
140
+ logger.error("No text chunks to process")
141
+ return None
142
+
143
+ all_audio_tensors = []
144
+ sample_rate = model.sr
145
+
146
+ logger.debug(f"Processing {len(text_chunks)} chunks at {sample_rate} Hz")
147
+
148
+ for i, chunk_text in enumerate(text_chunks):
149
+ logger.debug(f"Processing chunk {i+1}/{len(text_chunks)}")
150
+
151
+ chunk_tensor = self.generate_audio_chunk(
152
+ chunk_text,
153
+ model,
154
+ audio_prompt_path,
155
+ temperature,
156
+ cfg_weight,
157
+ exaggeration
158
+ )
159
+
160
+ if chunk_tensor is None:
161
+ logger.warning(f"Skipping chunk {i+1} due to generation error")
162
+ continue
163
+
164
+ all_audio_tensors.append(chunk_tensor)
165
+
166
+ # Add silence between chunks (except after the last chunk)
167
+ if i < len(text_chunks) - 1 and inter_chunk_silence_ms > 0:
168
+ silence_samples = int(sample_rate * inter_chunk_silence_ms / 1000.0)
169
+ silence_tensor = torch.zeros(
170
+ (1, silence_samples),
171
+ dtype=chunk_tensor.dtype,
172
+ device=chunk_tensor.device
173
+ )
174
+ all_audio_tensors.append(silence_tensor)
175
+
176
+ if not all_audio_tensors:
177
+ logger.error("No audio tensors generated")
178
+ return None
179
+
180
+ # Concatenate all audio tensors
181
+ logger.debug("Concatenating audio tensors...")
182
+ final_audio_tensor = torch.cat(all_audio_tensors, dim=1)
183
+
184
+ logger.debug(f"Final audio shape: {final_audio_tensor.shape}")
185
+ return final_audio_tensor
186
+
187
+ except Exception as e:
188
+ logger.error(f"Error in text-to-speech pipeline: {e}")
189
+ logger.error(traceback.format_exc())
190
+ return None
191
+
192
+
193
+ def chatterbox(
194
+ self,
195
+ text: str,
196
+ output_path: str,
197
+ sample_audio_path: str = None,
198
+ exaggeration=0.5,
199
+ cfg_weight=0.5,
200
+ temperature=0.8,
201
+ chunk_chars: int = 1024,
202
+ chunk_silence_ms: int = 350,
203
+ ):
204
+ start = time.time()
205
+ context_logger = logger.bind(
206
+ text_length=len(text),
207
+ sample_audio_path=sample_audio_path,
208
+ exaggeration=exaggeration,
209
+ cfg_weight=cfg_weight,
210
+ temperature=temperature,
211
+ model="ChatterboxTTS",
212
+ language="en-US",
213
+ device=device.type,
214
+ )
215
+ context_logger.debug("starting TTS generation with Chatterbox")
216
+ model = ChatterboxTTS.from_pretrained(device=device.type)
217
+
218
+ if sample_audio_path:
219
+ wav = self.text_to_speech_pipeline(
220
+ text,
221
+ model,
222
+ audio_prompt_path=sample_audio_path,
223
+ temperature=temperature,
224
+ cfg_weight=cfg_weight,
225
+ exaggeration=exaggeration,
226
+ max_chars_per_chunk=chunk_chars,
227
+ inter_chunk_silence_ms=chunk_silence_ms
228
+ )
229
+ else:
230
+ wav = self.text_to_speech_pipeline(
231
+ text,
232
+ model,
233
+ temperature=temperature,
234
+ cfg_weight=cfg_weight,
235
+ exaggeration=exaggeration,
236
+ max_chars_per_chunk=chunk_chars,
237
+ inter_chunk_silence_ms=chunk_silence_ms
238
+ )
239
+
240
+ if wav.dim() == 2 and wav.shape[0] == 1:
241
+ wav = wav.repeat(2, 1)
242
+ elif wav.dim() == 1:
243
+ wav = wav.unsqueeze(0).repeat(2, 1)
244
+
245
+ audio_length = wav.shape[1] / model.sr
246
+ ta.save(output_path, wav, model.sr)
247
+ context_logger.bind(
248
+ execution_time=time.time() - start,
249
+ audio_length=audio_length,
250
+ speedup=audio_length / (time.time() - start),
251
+ youtube_channel="https://www.youtube.com/@aiagentsaz"
252
+ ).debug(
253
+ "TTS generation with Chatterbox completed",
254
+ )
255
+
256
+
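A short usage sketch of the chunked Chatterbox path (text, paths, and the prompt file are illustrative). Long scripts are split at sentence boundaries, synthesized chunk by chunk, and joined with short silences:

    from video.tts_chatterbox import TTSChatterbox

    tts = TTSChatterbox()
    tts.chatterbox(
        "First sentence of a long script. " * 50,
        "narration.wav",
        sample_audio_path="voice_sample.wav",  # optional voice-cloning prompt
        chunk_chars=1024,
        chunk_silence_ms=350,
    )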