sam12345324 committed on
Commit 0c8f7e3 · verified · 1 parent: 1d98c06

Upload 26 files
.dockerignore ADDED
@@ -0,0 +1,55 @@
+ # Git
+ .git
+ .gitignore
+
+ # Docker
+ Dockerfile
+ .dockerignore
+ ._.DS_Store
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ .pytest_cache
+
+ # Generated files
+ *.mp4
+ !bgvideo.mp4
+ !bgvideo2.mp4
+ !video.mp4
+ !video2.mp4
+ *.wav
+ !0.wav
+ multi.mp4
+ tmp/
+ !captions/
+
+ # Editor files
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # Temporary files
+ .DS_Store
+ .cache/
+ *.tmp
+ *.bak
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/anton.ttf filter=lfs diff=lfs merge=lfs -text
+ assets/icon_volume.png filter=lfs diff=lfs merge=lfs -text
+ assets/noto_hindi.ttf filter=lfs diff=lfs merge=lfs -text
+ assets/noto.ttf filter=lfs diff=lfs merge=lfs -text
+ assets/person.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,8 @@
+ __pycache__
+ .hypothesis
+ .venv
+ media
+ tmp
+ captions
+ .DS_Store
+ ._.DS_Store
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.10-slim
+
+ WORKDIR /app
+
+ RUN apt-get update && apt-get install -y \
+     fonts-ebgaramond \
+     ffmpeg \
+     libsndfile1 \
+     fonts-dejavu \
+     build-essential \
+     g++ \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY api_server /app/api_server
+ COPY utils /app/utils
+ COPY video /app/video
+ COPY server.py /app/server.py
+
+ ENV PYTHONUNBUFFERED=1
+
+ CMD ["fastapi", "run", "server.py", "--host", "0.0.0.0", "--port", "8000"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 David Gyori
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,10 +1,59 @@
- ---
- title: Shortsrender
- emoji: 📚
- colorFrom: blue
- colorTo: green
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # AI Agents A-Z No-Code Tools (V1)
+
+ Video editing tools to use with no-code tools like n8n, Zapier, and Make. Brought to you by [AI Agents A-Z](https://www.youtube.com/@aiagentsaz).
+
+ ## [📚 Join our Skool community for the premium edition of the server and other premium content](https://www.skool.com/ai-agents-az/about)
+
+ [Watch the YouTube video featuring this project](https://www.youtube.com/watch?v=1-UuldAM6fQ)
+
+ ### Be part of a growing community and help us create more content like this
+
+ # Starting the project
+
+ ## Using Docker
+
+ ```
+ docker run --rm -p 8000:8000 -it gyoridavid/ai-agents-no-code-tools:latest
+ ```
+
+ If you have an NVIDIA GPU and the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) installed, you can run the server with GPU support:
+
+ ```
+ docker run --rm --gpus=all -e NVIDIA_VISIBLE_DEVICES=all -e NVIDIA_DRIVER_CAPABILITIES=all -p 8000:8000 -it gyoridavid/ai-agents-no-code-tools:latest-cuda
+ ```
+
+ ## With Python
+
+ 1. Clone the repository
+ 2. Create a virtual environment
+    ```bash
+    python -m venv venv
+    ```
+ 3. Activate the virtual environment
+    - On Windows:
+      ```bash
+      venv\Scripts\activate
+      ```
+    - On macOS/Linux:
+      ```bash
+      source venv/bin/activate
+      ```
+ 4. Install the dependencies
+    ```bash
+    pip install -r requirements.txt
+    ```
+ 5. Run the application
+    ```bash
+    fastapi dev server.py --host 0.0.0.0
+    ```
+
+ # Documentation
+
+ After starting the project, you can access the documentation at [http://localhost:8000/docs](http://localhost:8000/docs).
+
+ # Contributing
+
+ While PRs are welcome, please note that due to the nature of the project, I may not be able to review them in a timely manner. If you have any questions or suggestions, feel free to open an issue.
+
+ # License
+
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
api_server/auth_middleware.py ADDED
@@ -0,0 +1,27 @@
+ from fastapi import Request, status
+ from fastapi.responses import JSONResponse
+ from loguru import logger
+ import os
+
+
+ auth_tokens = os.getenv("AUTH_TOKENS", "").split(",") if os.getenv("AUTH_TOKENS") else []
+
+ async def auth_middleware(request: Request, call_next):
+     # skip authentication if the auth_tokens list is empty
+     if not auth_tokens:
+         return await call_next(request)
+     # authenticate all requests except the /health endpoint
+     if request.url.path != "/health":
+         auth_token = request.headers.get("Authorization")
+         logger.bind(
+             path=request.url.path,
+             method=request.method,
+             auth_token=auth_token,
+         ).debug("Received request")
+         if not auth_token or auth_token not in auth_tokens:
+             return JSONResponse(
+                 status_code=status.HTTP_401_UNAUTHORIZED,
+                 content={"error": "Unauthorized"},
+             )
+     response = await call_next(request)
+     return response
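
Worth noting for clients: the middleware matches the raw `Authorization` header value against the comma-separated `AUTH_TOKENS` entries, with no `Bearer ` prefix stripping, so the token must be sent verbatim. A minimal client sketch, assuming a local server started with a hypothetical `AUTH_TOKENS=secret1`:

```python
import requests

# hypothetical token; must exactly match one of the AUTH_TOKENS values
headers = {"Authorization": "secret1"}

# /health is exempt from authentication
print(requests.get("http://localhost:8000/health").json())

# every other route requires the header; without it the server returns 401
resp = requests.get("http://localhost:8000/", headers=headers)
print(resp.status_code, resp.json())
```
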
api_server/v1_media_router.py ADDED
@@ -0,0 +1,755 @@
+ from fastapi import Query, status, APIRouter, UploadFile, File, Form, BackgroundTasks
+ from fastapi.responses import JSONResponse, StreamingResponse
+ from typing import Literal, Optional
+ import os
+ from loguru import logger
+ import matplotlib.font_manager as fm
+
+ from video.tts import TTS
+ from video.tts_chatterbox import TTSChatterbox
+ from video.stt import STT
+ from video.storage import Storage
+ from video.caption import Caption
+ from video.media import MediaUtils
+ from video.builder import VideoBuilder
+ from utils.image import resize_image_cover
+
+ CHUNK_SIZE = 1024 * 1024 * 10  # 10MB chunks
+
+ def iterfile(path: str):
+     with open(path, mode="rb") as file:
+         while chunk := file.read(CHUNK_SIZE):
+             yield chunk
+
+
+ v1_media_api_router = APIRouter()
+
+ storage_path = os.getenv("STORAGE_PATH", os.path.join(os.path.abspath(os.getcwd()), "media"))
+
+ storage = Storage(
+     storage_path=storage_path,
+ )
+ stt = STT()
+ tts_manager = TTS()
+ tts_chatterbox = TTSChatterbox()
+
+ @v1_media_api_router.post("/audio-tools/transcribe")
+ def transcribe(
+     audio_file: UploadFile = File(..., description="Audio file to transcribe"),
+     language: Optional[str] = Form(None, description="Language code (optional)"),
+ ):
+     """
+     Transcribe an audio file to text.
+     """
+     logger.bind(language=language, filename=audio_file.filename).info(
+         "Transcribing audio file"
+     )
+     captions, duration = stt.transcribe(audio_file.file, beam_size=5, language=language)
+     transcription = "".join([cap["text"] for cap in captions])
+
+     return {
+         "transcription": transcription,
+         "duration": duration,
+     }
+
+ @v1_media_api_router.get("/audio-tools/tts/kokoro/voices")
+ def get_kokoro_voices():
+     voices = tts_manager.valid_kokoro_voices()
+     return {"voices": voices}
+
+
+ @v1_media_api_router.post("/audio-tools/tts/kokoro")
+ def generate_kokoro_tts(
+     background_tasks: BackgroundTasks,
+     text: str = Form(..., description="Text to convert to speech"),
+     voice: Optional[str] = Form(None, description="Voice name for kokoro TTS"),
+     speed: Optional[float] = Form(None, description="Speed for kokoro TTS"),
+ ):
+     """
+     Generate audio from text using the kokoro TTS engine.
+     """
+     if not voice:
+         voice = "af_heart"
+     voices = tts_manager.valid_kokoro_voices()
+     if voice not in voices:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": f"Invalid voice: {voice}. Valid voices: {voices}"},
+         )
+     audio_id, audio_path = storage.create_media_filename_with_id(
+         media_type="audio", file_extension=".wav"
+     )
+     tmp_file_id = storage.create_tmp_file(audio_id)
+
+     def bg_task():
+         tts_manager.kokoro(
+             text=text,
+             output_path=audio_path,
+             voice=voice,
+             speed=speed if speed else 1.0,
+         )
+         storage.delete_media(tmp_file_id)
+
+     logger.info(f"Adding background task for TTS generation with ID: {audio_id}")
+     background_tasks.add_task(bg_task)
+     logger.info(f"Background task added for TTS generation with ID: {audio_id}")
+
+     return {"file_id": audio_id}
+
+
+ @v1_media_api_router.post("/audio-tools/tts/chatterbox")
+ def generate_chatterbox_tts(
+     background_tasks: BackgroundTasks,
+     text: str = Form(..., description="Text to convert to speech"),
+     sample_audio_id: Optional[str] = Form(
+         None, description="Sample audio ID for voice cloning"
+     ),
+     sample_audio_file: Optional[UploadFile] = File(
+         None, description="Sample audio file for voice cloning"
+     ),
+     exaggeration: Optional[float] = Form(
+         0.5, description="Exaggeration factor for voice cloning (default: 0.5)"
+     ),
+     cfg_weight: Optional[float] = Form(0.5, description="CFG weight for voice cloning (default: 0.5)"),
+     temperature: Optional[float] = Form(
+         0.8, description="Temperature for voice cloning (default: 0.8)"
+     ),
+     chunk_chars: Optional[int] = Form(1024, description="Max characters per chunk (default: 1024)"),
+     chunk_silence_ms: Optional[int] = Form(
+         350, description="Silence duration between chunks in milliseconds (default: 350)"
+     )
+ ):
+     """
+     Generate audio from text using Chatterbox TTS.
+     """
+     audio_id, audio_path = storage.create_media_filename_with_id(
+         media_type="audio", file_extension=".wav"
+     )
+
+     sample_audio_path = None
+     if sample_audio_file:
+         if not sample_audio_file.filename.endswith(".wav"):
+             return JSONResponse(
+                 status_code=status.HTTP_400_BAD_REQUEST,
+                 content={"error": "Sample audio file must be a .wav file."},
+             )
+         sample_audio_id = storage.upload_media(
+             media_type="tmp",
+             media_data=sample_audio_file.file.read(),
+             file_extension=".wav",
+         )
+         sample_audio_path = storage.get_media_path(sample_audio_id)
+     elif sample_audio_id:
+         if not storage.media_exists(sample_audio_id):
+             return JSONResponse(
+                 status_code=status.HTTP_404_NOT_FOUND,
+                 content={"error": f"Sample audio with ID {sample_audio_id} not found."},
+             )
+         sample_audio_path = storage.get_media_path(sample_audio_id)
+
+     tmp_file_id = storage.create_tmp_file(audio_id)
+
+     def bg_task():
+         try:
+             tts_chatterbox.chatterbox(
+                 text=text,
+                 output_path=audio_path,
+                 sample_audio_path=sample_audio_path,
+                 exaggeration=exaggeration,
+                 cfg_weight=cfg_weight,
+                 temperature=temperature,
+                 chunk_chars=chunk_chars,
+                 chunk_silence_ms=chunk_silence_ms,
+             )
+         except Exception as e:
+             logger.error(f"Error in Chatterbox TTS: {e}")
+         finally:
+             storage.delete_media(tmp_file_id)
+
+     logger.info(f"Adding background task for Chatterbox TTS generation with ID: {audio_id}")
+     background_tasks.add_task(bg_task)
+     logger.info(f"Background task added for Chatterbox TTS generation with ID: {audio_id}")
+
+     return {"file_id": audio_id}
+
+
+ @v1_media_api_router.post("/storage")
+ def upload_file(
+     file: Optional[UploadFile] = File(None, description="File to upload"),
+     url: Optional[str] = Form(None, description="URL of the file to upload (optional)"),
+     media_type: Literal["image", "video", "audio"] = Form(
+         ..., description="Type of media being uploaded"
+     ),
+ ):
+     """
+     Upload a file and return its ID.
+     """
+     if media_type not in ["image", "video", "audio"]:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": f"Invalid media type: {media_type}"},
+         )
+     if file:
+         file_id = storage.upload_media(
+             media_type=media_type,
+             media_data=file.file.read(),
+             file_extension=os.path.splitext(file.filename)[1],
+         )
+
+         return {"file_id": file_id}
+     elif url:
+         if not storage.is_valid_url(url):
+             return JSONResponse(
+                 status_code=status.HTTP_400_BAD_REQUEST,
+                 content={"error": f"Invalid URL: {url}"},
+             )
+         file_id = storage.upload_media_from_url(media_type=media_type, url=url)
+         return {"file_id": file_id}
+     # neither a file nor a URL was supplied
+     return JSONResponse(
+         status_code=status.HTTP_400_BAD_REQUEST,
+         content={"error": "Either a file or a URL must be provided."},
+     )
+
+
+ @v1_media_api_router.get("/storage/{file_id}")
+ def download_file(file_id: str):
+     """
+     Download a file by its ID.
+     """
+     if not storage.media_exists(file_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"File with ID {file_id} not found."},
+         )
+
+     file_path = storage.get_media_path(file_id)
+     return StreamingResponse(
+         iterfile(file_path),
+         media_type="application/octet-stream",
+         headers={
+             "Content-Disposition": f"attachment; filename={os.path.basename(file_path)}"
+         },
+     )
+
+
+ @v1_media_api_router.delete("/storage/{file_id}")
+ def delete_file(file_id: str):
+     """
+     Delete a file by its ID.
+     """
+     if storage.media_exists(file_id):
+         storage.delete_media(file_id)
+     return {"status": "success"}
+
+
+ @v1_media_api_router.get("/storage/{file_id}/status")
+ def file_status(file_id: str):
+     """
+     Check the status of a file by its ID.
+     """
+     tmp_id = storage.create_tmp_file_id(file_id)
+     if storage.media_exists(tmp_id):
+         return {"status": "processing"}
+     elif storage.media_exists(file_id):
+         return {"status": "ready"}
+     return {"status": "not_found"}
+
+
+ @v1_media_api_router.post("/video-tools/merge")
+ def merge_videos(
+     background_tasks: BackgroundTasks,
+     video_ids: str = Form(..., description="Comma-separated list of video IDs to merge"),
+     background_music_id: Optional[str] = Form(
+         None, description="Background music ID (optional)"
+     ),
+     background_music_volume: Optional[float] = Form(
+         0.5, description="Volume for background music (0.0 to 1.0)"
+     ),
+ ):
+     """
+     Merge multiple videos into one.
+     """
+     video_ids = video_ids.split(",") if video_ids else []
+     if not video_ids:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": "At least one video ID is required."},
+         )
+
+     merged_video_id, merged_video_path = storage.create_media_filename_with_id(
+         media_type="video", file_extension=".mp4"
+     )
+
+     video_paths = []
+     for video_id in video_ids:
+         if not storage.media_exists(video_id):
+             return JSONResponse(
+                 status_code=status.HTTP_404_NOT_FOUND,
+                 content={"error": f"Video with ID {video_id} not found."},
+             )
+         video_paths.append(storage.get_media_path(video_id))
+
+     if background_music_id and not storage.media_exists(background_music_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={
+                 "error": f"Background music with ID {background_music_id} not found."
+             },
+         )
+     background_music_path = (
+         storage.get_media_path(background_music_id) if background_music_id else None
+     )
+
+     utils = MediaUtils()
+
+     temp_file_id = storage.create_tmp_file(merged_video_id)
+
+     def bg_task():
+         utils.merge_videos(
+             video_paths=video_paths,
+             output_path=merged_video_path,
+             background_music_path=background_music_path,
+             background_music_volume=background_music_volume,
+         )
+         storage.delete_media(temp_file_id)
+
+     logger.info(f"Adding background task for video merge with ID: {merged_video_id}")
+     background_tasks.add_task(bg_task)
+     logger.info(f"Background task added for video merge with ID: {merged_video_id}")
+
+     return {"file_id": merged_video_id}
+
+
+ @v1_media_api_router.get('/fonts')
+ def list_fonts():
+     fonts = set()
+     for fname in fm.findSystemFonts(fontpaths=None, fontext='ttf'):
+         try:
+             prop = fm.FontProperties(fname=fname)
+             name = prop.get_name()
+             fonts.add(name)
+         except RuntimeError:
+             continue
+     return {"fonts": sorted(fonts)}
+
+ @v1_media_api_router.post("/video-tools/generate/tts-captioned-video")
+ def generate_captioned_video(
+     background_tasks: BackgroundTasks,
+     background_id: str = Form(..., description="Background image ID"),
+     text: Optional[str] = Form(None, description="Text to generate video from"),
+     width: Optional[int] = Form(1080, description="Width of the video (default: 1080)"),
+     height: Optional[int] = Form(
+         1920, description="Height of the video (default: 1920)"
+     ),
+     audio_id: Optional[str] = Form(
+         None, description="Audio ID for the video (optional)"
+     ),
+     kokoro_voice: Optional[str] = Form(
+         "af_heart", description="Voice for kokoro TTS (default: af_heart)"
+     ),
+     kokoro_speed: Optional[float] = Form(
+         1.0, description="Speed for kokoro TTS (default: 1.0)"
+     ),
+     language: Optional[str] = Form(
+         None, description="Language code for STT (optional, e.g. 'en', 'fr', 'de'); defaults to None (auto-detect language if audio_id is provided)"
+     ),
+     image_effect: Optional[str] = Form("ken_burns", description="Effect to apply to the background image; options: ken_burns, pan (default: 'ken_burns')"),
+     # Flattened subtitle configuration options
+     caption_config_line_count: Optional[int] = Form(1, description="Number of lines per subtitle segment (default: 1)", ge=1, le=5),
+     caption_config_line_max_length: Optional[int] = Form(1, description="Maximum characters per line (default: 1)", ge=1, le=200),
+     caption_config_font_size: Optional[int] = Form(120, description="Font size for subtitles (default: 120)", ge=8, le=200),
+     caption_config_font_name: Optional[str] = Form("Arial", description="Font family name (default: 'Arial'; see the available fonts from the /fonts endpoint)"),
+     caption_config_font_bold: Optional[bool] = Form(True, description="Whether to use bold font (default: True)"),
+     caption_config_font_italic: Optional[bool] = Form(False, description="Whether to use italic font (default: False)"),
+     caption_config_font_color: Optional[str] = Form("#fff", description="Font color in hex format (default: '#fff')"),
+     caption_config_subtitle_position: Optional[Literal["top", "center", "bottom"]] = Form("top", description="Vertical position of subtitles (default: 'top')"),
+     caption_config_shadow_color: Optional[str] = Form("#000", description="Shadow color in hex format (default: '#000')"),
+     caption_config_shadow_transparency: Optional[float] = Form(0.4, description="Shadow transparency from 0.0 to 1.0 (default: 0.4)", ge=0.0, le=1.0),
+     caption_config_shadow_blur: Optional[int] = Form(10, description="Shadow blur radius (default: 10)", ge=0, le=20),
+     caption_config_stroke_color: Optional[str] = Form(None, description="Stroke/outline color in hex format (falls back to '#000' when unset)"),
+     caption_config_stroke_size: Optional[int] = Form(5, description="Stroke/outline size (default: 5)", ge=0, le=10),
+ ):
+     """
+     Generate a captioned video from text and a background image.
+     """
+     # Build subtitle options from the individual parameters
+     parsed_subtitle_options = {}
+
+     # Only include non-None values
+     if caption_config_line_count is not None:
+         parsed_subtitle_options['lines'] = caption_config_line_count
+     if caption_config_line_max_length is not None:
+         parsed_subtitle_options['max_length'] = caption_config_line_max_length
+     if caption_config_font_size is not None:
+         parsed_subtitle_options['font_size'] = caption_config_font_size
+     if caption_config_font_name is not None:
+         parsed_subtitle_options['font_name'] = caption_config_font_name
+     if caption_config_font_bold is not None:
+         parsed_subtitle_options['font_bold'] = caption_config_font_bold
+     if caption_config_font_italic is not None:
+         parsed_subtitle_options['font_italic'] = caption_config_font_italic
+     if caption_config_font_color is not None:
+         parsed_subtitle_options['font_color'] = caption_config_font_color
+     if caption_config_subtitle_position is not None:
+         parsed_subtitle_options['subtitle_position'] = caption_config_subtitle_position
+     if caption_config_shadow_color is not None:
+         parsed_subtitle_options['shadow_color'] = caption_config_shadow_color
+     if caption_config_shadow_transparency is not None:
+         parsed_subtitle_options['shadow_transparency'] = caption_config_shadow_transparency
+     if caption_config_shadow_blur is not None:
+         parsed_subtitle_options['shadow_blur'] = caption_config_shadow_blur
+     if caption_config_stroke_color is not None:
+         parsed_subtitle_options['stroke_color'] = caption_config_stroke_color
+     if caption_config_stroke_size is not None:
+         parsed_subtitle_options['stroke_size'] = caption_config_stroke_size
+
+     if audio_id and not storage.media_exists(audio_id):
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": f"Audio with ID {audio_id} not found."},
+         )
+     if not audio_id and kokoro_voice not in tts_manager.valid_kokoro_voices():
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": f"Invalid voice: {kokoro_voice}."},
+         )
+     media_type = storage.get_media_type(background_id)
+     if media_type not in ["image"]:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": f"Invalid media type: {media_type}"},
+         )
+     if not storage.media_exists(background_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Background image with ID {background_id} not found."},
+         )
+
+     output_id, output_path = storage.create_media_filename_with_id(
+         media_type="video", file_extension=".mp4"
+     )
+     dimensions = (width, height)
+     builder = VideoBuilder(
+         dimensions=dimensions,
+     )
+     builder.set_media_utils(MediaUtils())
+
+     tmp_file_id = storage.create_tmp_file(output_id)
+
+     def bg_task(
+         tmp_file_id: str = tmp_file_id,
+     ):
+         tmp_file_ids = [tmp_file_id]
+
+         # set audio, generate captions
+         captions = None
+         tts_audio_id = audio_id
+         from video.tts import LANGUAGE_VOICE_MAP
+         lang_config = LANGUAGE_VOICE_MAP.get(kokoro_voice, {})
+         international = lang_config.get("international", False)
+
+         if tts_audio_id:
+             audio_path = storage.get_media_path(tts_audio_id)
+             captions = stt.transcribe(audio_path=audio_path, language=language)[0]
+             builder.set_audio(audio_path)
+         # generate TTS and set audio
+         else:
+             tts_audio_id, audio_path = storage.create_media_filename_with_id(
+                 media_type="audio", file_extension=".wav"
+             )
+             tmp_file_ids.append(tts_audio_id)
+             captions = tts_manager.kokoro(
+                 text=text,
+                 output_path=audio_path,
+                 voice=kokoro_voice,
+                 speed=kokoro_speed,
+             )[0]
+             if international:
+                 # use whisper to create captions
+                 iso_lang_code = lang_config.get("iso639_1")
+                 captions = stt.transcribe(audio_path=audio_path, language=iso_lang_code)[0]
+
+             builder.set_audio(audio_path)
+
+         # create subtitle
+         captionsManager = Caption()
+         subtitle_id, subtitle_path = storage.create_media_filename_with_id(
+             media_type="tmp", file_extension=".ass"
+         )
+         tmp_file_ids.append(subtitle_id)
+
+         # create segments based on language
+         if international:
+             segments = captionsManager.create_subtitle_segments_english(
+                 captions=captions,
+                 lines=parsed_subtitle_options.get('lines', 1),
+                 max_length=parsed_subtitle_options.get('max_length', 1),
+             )
+         else:
+             segments = captionsManager.create_subtitle_segments_international(
+                 captions=captions,
+                 lines=parsed_subtitle_options.get('lines', 1),
+                 max_length=parsed_subtitle_options.get('max_length', 1),
+             )
+
+         captionsManager.create_subtitle(
+             segments=segments,
+             output_path=subtitle_path,
+             dimensions=dimensions,
+             font_size=parsed_subtitle_options.get('font_size', 120),
+             shadow_blur=parsed_subtitle_options.get('shadow_blur', 10),
+             stroke_size=parsed_subtitle_options.get('stroke_size', 5),
+             shadow_color=parsed_subtitle_options.get('shadow_color', "#000"),
+             stroke_color=parsed_subtitle_options.get('stroke_color', "#000"),
+             font_name=parsed_subtitle_options.get('font_name', "Arial"),
+             font_bold=parsed_subtitle_options.get('font_bold', True),
+             font_italic=parsed_subtitle_options.get('font_italic', False),
+             subtitle_position=parsed_subtitle_options.get('subtitle_position', "top"),
+             font_color=parsed_subtitle_options.get('font_color', "#fff"),
+             shadow_transparency=parsed_subtitle_options.get('shadow_transparency', 0.4),
+         )
+         builder.set_captions(
+             file_path=subtitle_path,
+         )
+
+         # resize background image if needed
+         background_path = storage.get_media_path(background_id)
+         utils = MediaUtils()
+         info = utils.get_video_info(background_path)
+         if info.get("width", 0) != width or info.get("height", 0) != height:
+             logger.bind(
+                 image_width=info.get("width", 0),
+                 image_height=info.get("height", 0),
+                 target_width=width,
+                 target_height=height,
+             ).debug(
+                 "Resizing background image to fit video dimensions"
+             )
+             _, resized_background_path = storage.create_media_filename_with_id(
+                 media_type="image", file_extension=".jpg"
+             )
+             resize_image_cover(
+                 image_path=background_path,
+                 output_path=resized_background_path,
+                 target_width=width,
+                 target_height=height,
+             )
+             background_path = resized_background_path
+
+         builder.set_background_image(
+             background_path,
+             effect_config={
+                 "effect": image_effect,
+             }
+         )
+
+         builder.set_output_path(output_path)
+
+         builder.execute()
+
+         for tmp_file_id in tmp_file_ids:
+             if storage.media_exists(tmp_file_id):
+                 storage.delete_media(tmp_file_id)
+
+     logger.info(f"Adding background task for captioned video generation with ID: {output_id}")
+     background_tasks.add_task(bg_task, tmp_file_id=tmp_file_id)
+     logger.info(f"Background task added for captioned video generation with ID: {output_id}")
+
+     return {
+         "file_id": output_id,
+     }
+
+ # https://ffmpeg.org/ffmpeg-filters.html#colorkey
+ @v1_media_api_router.post("/video-tools/add-colorkey-overlay")
+ def add_colorkey_overlay(
+     background_tasks: BackgroundTasks,
+     video_id: str = Form(..., description="Video ID to overlay"),
+     overlay_video_id: str = Form(..., description="Overlay video ID"),
+     color: Optional[str] = Form(
+         "green", description="Set the color for which alpha will be set to 0 (full transparency). Use the name of the color or a hex code (e.g. 'red' or '#ff0000')"
+     ),
+     similarity: Optional[float] = Form(
+         0.1, description="Set the radius from the key color within which other colors also have full transparency (default: 0.1)"
+     ),
+     blend: Optional[float] = Form(
+         0.1, description="Set how the alpha value for pixels that fall outside the similarity radius is computed (default: 0.1)"
+     ),
+ ):
+     """
+     Overlay a video on a video with the specified colorkey and intensity.
+     """
+     if not storage.media_exists(video_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Video with ID {video_id} not found."},
+         )
+     if not storage.media_exists(overlay_video_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Overlay video with ID {overlay_video_id} not found."},
+         )
+
+     video_path = storage.get_media_path(video_id)
+     overlay_video_path = storage.get_media_path(overlay_video_id)
+
+     output_id, output_path = storage.create_media_filename_with_id(
+         media_type="video", file_extension=".mp4"
+     )
+
+     tmp_file_id = storage.create_tmp_file(output_id)
+
+     def bg_task():
+         utils = MediaUtils()
+         utils.colorkey_overlay(
+             input_video_path=video_path,
+             overlay_video_path=overlay_video_path,
+             output_video_path=output_path,
+             color=color,
+             similarity=similarity,
+             blend=blend,
+         )
+         storage.delete_media(tmp_file_id)
+
+     logger.info(f"Adding background task for colorkey overlay with ID: {output_id}")
+     background_tasks.add_task(bg_task)
+     logger.info(f"Background task added for colorkey overlay with ID: {output_id}")
+
+     return {
+         "file_id": output_id,
+     }
+
+ @v1_media_api_router.get("/video-tools/extract-frame/{video_id}")
+ def extract_frame(
+     video_id: str,
+     timestamp: Optional[float] = Query(1.0, description="Timestamp in seconds to extract the frame from (default: 1.0)")
+ ):
+     """
+     Extract a frame from a video at a specified timestamp.
+
+     Args:
+         video_id: Video ID to extract the frame from
+         timestamp: Optional timestamp in seconds to extract the frame from (default: 1.0)
+     """
+     if not storage.media_exists(video_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Video with ID {video_id} not found."},
+         )
+
+     video_path = storage.get_media_path(video_id)
+
+     _, output_path = storage.create_media_filename_with_id(
+         media_type="image", file_extension=".jpg"
+     )
+
+     utils = MediaUtils()
+     video_info = utils.get_video_info(video_path)
+     if video_info.get("duration", 0) <= float(timestamp):
+         timestamp = video_info.get("duration", 0) - 0.3
+
+     success = utils.extract_frame(
+         video_path=video_path,
+         output_path=output_path,
+         time_seconds=timestamp,
+     )
+
+     if not success:
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={"error": "Failed to extract frame from video."},
+         )
+
+     # Load the file into memory
+     with open(output_path, "rb") as file:
+         file_data = file.read()
+
+     # Remove the output file
+     os.remove(output_path)
+
+     # Create a streaming response with appropriate headers
+     from io import BytesIO
+     return StreamingResponse(
+         BytesIO(file_data),
+         media_type="image/jpeg",
+         headers={
+             "Content-Disposition": f"attachment; filename=frame_{video_id}_{timestamp or 'first'}.jpg"
+         },
+     )
+
+ # extract x number of frames from the video, equally spaced
+ @v1_media_api_router.post('/video-tools/extract-frames')
+ def extract_frame_from_url(
+     url: str = Form(..., description="URL of the video to extract frames from"),
+     amount: int = Form(5, description="Number of frames to extract from the video (default: 5)"),
+     length_seconds: Optional[float] = Form(None, description="Length of the video in seconds (optional)"),
+     stitch: Optional[bool] = Form(False, description="Whether to stitch the frames into a single image (default: False)")
+ ):
+     template_id, template_path = storage.create_media_template(
+         media_type="image", file_extension=".jpg"
+     )
+     utils = MediaUtils()
+
+     if not length_seconds:
+         video_info = utils.get_video_info(url)
+         length_seconds = video_info.get("duration", 0)
+
+     utils.extract_frames(
+         video_path=url,
+         length_seconds=length_seconds,
+         amount=amount,
+         output_template=template_path,
+     )
+
+     image_ids = []
+     for i in range(amount):
+         padded_index = str(i + 1).zfill(2)
+         image_id = template_id.replace("%02d", padded_index)
+         image_ids.append(image_id)
+
+     return {
+         "message": f"Extracted {amount} frames from the video at {url}. The frames are saved in the template directory.",
+         "template_id": template_id,
+         "template_path": template_path,
+         "image_ids": image_ids,
+     }
+
+
+ @v1_media_api_router.get("/video-tools/info/{file_id}")
+ def get_video_info(file_id: str):
+     """
+     Get information about a video file.
+     """
+     if not storage.media_exists(file_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Video with ID {file_id} not found."},
+         )
+
+     video_path = storage.get_media_path(file_id)
+
+     utils = MediaUtils()
+     info = utils.get_video_info(video_path)
+
+     return info
+
+ @v1_media_api_router.get("/audio-tools/info/{file_id}")
+ def get_audio_info(file_id: str):
+     """
+     Get information about an audio file.
+     """
+     if not storage.media_exists(file_id):
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Audio with ID {file_id} not found."},
+         )
+
+     audio_path = storage.get_media_path(file_id)
+
+     utils = MediaUtils()
+     info = utils.get_audio_info(audio_path)
+
+     return info
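
All of the generation endpoints in this router follow the same contract: they return a `file_id` immediately, run the heavy work in a background task, and keep a temporary marker file around so `/storage/{file_id}/status` reports `processing` until the task finishes. A minimal polling sketch against the kokoro TTS endpoint (the base URL and text are assumptions; the `/api/v1/media` prefix comes from how server.py mounts this router):

```python
import time
import requests

BASE = "http://localhost:8000/api/v1/media"  # assumed local deployment

# kick off TTS generation; the server responds before the audio exists
file_id = requests.post(
    f"{BASE}/audio-tools/tts/kokoro",
    data={"text": "Hello from the no-code tools server", "voice": "af_heart"},
).json()["file_id"]

# poll until the background task deletes its tmp marker ("processing" -> "ready")
while requests.get(f"{BASE}/storage/{file_id}/status").json()["status"] == "processing":
    time.sleep(1)

# stream the finished WAV to disk
with open("speech.wav", "wb") as out:
    out.write(requests.get(f"{BASE}/storage/{file_id}").content)
```
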
api_server/v1_utils_router.py ADDED
@@ -0,0 +1,167 @@
+ import os
+ from typing import Optional
+ from fastapi import BackgroundTasks, Form, status, APIRouter
+ from fastapi.responses import JSONResponse, StreamingResponse
+ from loguru import logger
+ from video.storage import Storage
+ from youtube_transcript_api import YouTubeTranscriptApi
+
+ storage_path = os.getenv("STORAGE_PATH", os.path.join(os.path.abspath(os.getcwd()), "media"))
+
+ storage = Storage(
+     storage_path=storage_path,
+ )
+
+ v1_utils_router = APIRouter()
+ ytt_api = YouTubeTranscriptApi()
+
+ @v1_utils_router.get("/youtube-transcript")
+ def get_youtube_transcript(
+     video_id: str,
+ ):
+     """
+     Get a YouTube video transcript by video ID.
+     """
+     try:
+         fetched_transcript = ytt_api.fetch(video_id)
+         return {
+             "video_id": video_id,
+             "transcript": fetched_transcript
+         }
+     except Exception as e:
+         logger.error(f"Error fetching transcript for video {video_id}: {e}")
+         return JSONResponse(
+             status_code=status.HTTP_404_NOT_FOUND,
+             content={"error": f"Transcript for video {video_id} not found."},
+         )
+
+ @v1_utils_router.post("/stitch-images")
+ def stitch_images(
+     image_urls: str = Form(..., description="Comma-separated list of image URLs to stitch together"),
+     max_width: int = Form(1920, description="Maximum width of the final stitched image"),
+     max_height: int = Form(1080, description="Maximum height of the final stitched image"),
+ ):
+     """
+     Stitch multiple images into one.
+     """
+     if not image_urls:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": "No image URLs provided."}
+         )
+
+     image_urls = [url.strip() for url in image_urls.split(",") if url.strip()]
+
+     from utils.image import stitch_images as stitch_images_util
+     try:
+         stitched_image = stitch_images_util(image_urls, max_width, max_height)
+
+         # Convert the PIL image to JPEG format in memory
+         from io import BytesIO
+         img_buffer = BytesIO()
+         stitched_image.save(img_buffer, format='JPEG', quality=95)
+         img_buffer.seek(0)
+
+         return StreamingResponse(
+             img_buffer,
+             media_type="image/jpeg",
+             headers={
+                 "Content-Disposition": "attachment; filename=stitched.jpg"
+             },
+         )
+     except Exception as e:
+         logger.error(f"Error stitching images: {e}")
+         return JSONResponse(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             content={"error": "Failed to stitch images."}
+         )
+
+ @v1_utils_router.post("/make-image-imperfect")
+ def image_unaize(
+     background_tasks: BackgroundTasks,
+     image_id: str = Form(..., description="ID of the image to unaize"),
+     enhance_color: Optional[float] = Form(None, description="Strength of the color enhancement (0-2); 0 means black and white, 1 means no change, 2 means full color enhancement"),
+     enhance_contrast: Optional[float] = Form(None, description="Strength of the contrast enhancement (0-2)"),
+     noise_strength: int = Form(0, description="Strength of the noise to apply to the image (0-100)"),
+ ):
+     """
+     Remove AI-generated artifacts from an image.
+     """
+     if not image_id:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": "No image ID provided."}
+         )
+
+     image_path = storage.get_media_path(image_id)
+
+     jpg_id, jpg_path = storage.create_media_filename_with_id(
+         media_type="image", file_extension=".jpg"
+     )
+     tmp_file_id = storage.create_tmp_file(jpg_id)
+
+     from utils.image import make_image_imperfect
+
+     def bg_task():
+         try:
+             imperfect_image = make_image_imperfect(
+                 image_path,
+                 enhance_color=enhance_color,
+                 enhance_contrast=enhance_contrast,
+                 noise_strength=noise_strength
+             )
+             imperfect_image.save(jpg_path, format='JPEG', quality=95)
+         except Exception as e:
+             logger.error(f"Error making image imperfect: {e}")
+         finally:
+             storage.delete_media(tmp_file_id)
+
+     background_tasks.add_task(bg_task)
+     return {
+         "file_id": jpg_id,
+     }
+
+ @v1_utils_router.post("/convert/pcm/wav")
+ def convert_pcm_to_wav(
+     background_tasks: BackgroundTasks,
+     pcm_id: str = Form(..., description="ID of the PCM audio file to convert"),
+     sample_rate: int = Form(24000, description="Sample rate of the PCM audio"),
+     channels: int = Form(1, description="Number of audio channels (1 for mono, 2 for stereo)"),
+     target_sample_rate: int = Form(44100, description="Target sample rate for the WAV audio"),
+ ):
+     """
+     Convert PCM audio to WAV format.
+     """
+     if not pcm_id or not storage.media_exists(pcm_id):
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"error": "PCM audio file not found."}
+         )
+
+     from video.media import MediaUtils
+     utils = MediaUtils()
+
+     wav_id, wav_path = storage.create_media_filename_with_id(
+         media_type="audio", file_extension=".wav"
+     )
+     tmp_file_id = storage.create_tmp_file(wav_id)
+
+     def bg_task():
+         try:
+             utils.convert_pcm_to_wav(
+                 input_pcm_path=storage.get_media_path(pcm_id),
+                 output_wav_path=wav_path,
+                 sample_rate=sample_rate,
+                 channels=channels,
+                 target_sample_rate=target_sample_rate
+             )
+         except Exception as e:
+             logger.error(f"Error converting PCM to WAV: {e}")
+         finally:
+             storage.delete_media(tmp_file_id)
+
+     background_tasks.add_task(bg_task)
+
+     return {
+         "file_id": wav_id,
+     }
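
For the utils router (mounted at `/api/v1/utils` by server.py), a quick sketch of fetching a YouTube transcript; the host and the video ID are assumptions:

```python
import requests

BASE = "http://localhost:8000/api/v1/utils"  # assumed local deployment

# fetch the transcript of a YouTube video by its ID (hypothetical ID shown)
resp = requests.get(f"{BASE}/youtube-transcript", params={"video_id": "dQw4w9WgXcQ"})
print(resp.json())  # {"video_id": ..., "transcript": ...} or a 404 error body
```
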
assets/anton.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28beb8f6542f642ba4143bd4a1d1cfc7be7b1dedc951096efd8e0942502ea1bf
+ size 161588
assets/icon_volume.png ADDED

Git LFS Details
  • SHA256: 019d2a13e54354427b30d02f527ec3e81aa5f1af278c2045b8600dc0a4aa651a
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
assets/noto.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5cf8b2a0576d5680284ab03a7a8219499d59bbe981a79bb3dc0031f251c39736
+ size 10560616
assets/noto_hindi.ttf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b8cac46a1c86d2533a616b1fcf4e1926b8e39bda69034508b0df96791f56d97
+ size 2044548
assets/person.png ADDED

Git LFS Details
  • SHA256: ebee7d3b260c84247653ae91731c40a2e42fe41b093a8e0002fd54fc472b7002
  • Pointer size: 132 Bytes
  • Size of remote file: 1.96 MB
cuda.Dockerfile ADDED
@@ -0,0 +1,45 @@
+ ARG CUDA=12.3.1
+ ARG OS=ubuntu22.04
+ ARG RUNIMAGE=${CUDA}-runtime-${OS}
+
+ FROM nvidia/cuda:${RUNIMAGE}
+ ARG CUDA
+ ARG OS
+ USER root
+
+ RUN apt update && apt install -y \
+     fonts-ebgaramond \
+     build-essential \
+     g++ \
+     curl \
+     wget \
+     git \
+     python3.10 \
+     python3-pip \
+     python3-dev \
+     python3.10-gdbm \
+     ffmpeg \
+     libsndfile1 \
+     fonts-dejavu \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ RUN ln -sf /usr/bin/python3 /usr/bin/python
+ RUN ln -sf /usr/bin/pip3 /usr/bin/pip
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ ENV LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/nvidia/cudnn/lib:$LD_LIBRARY_PATH
+
+ COPY api_server /app/api_server
+ COPY utils /app/utils
+ COPY video /app/video
+ COPY server.py /app/server.py
+
+ ENV PYTHONUNBUFFERED=1
+
+ EXPOSE 8000
+
+ CMD ["fastapi", "run", "server.py", "--host", "0.0.0.0", "--port", "8000"]
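
A note on the CUDA image: it only helps if the container can actually see a GPU (i.e. it was started with `--gpus=all` as in the README). A quick in-container diagnostic sketch, assuming torch is present via the torchaudio dependency:

```python
import torch

# True only when the container was started with GPU access and the driver is visible
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```
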
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ uuid
+ numpy
+ kokoro
+ soundfile
+ fastapi[standard]
+ loguru
+ chatterbox-tts >= 0.1.2
+ faster_whisper
+ torchaudio
+ requests_tor
+ requests[socks]
+ youtube-transcript-api
+ matplotlib
+ Pillow
+ nltk
+ imageio
server.py ADDED
@@ -0,0 +1,57 @@
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI, APIRouter
+ import sys
+ from loguru import logger
+
+ from api_server.auth_middleware import auth_middleware
+ from api_server.v1_utils_router import v1_utils_router
+ from api_server.v1_media_router import v1_media_api_router
+ from video.config import device
+
+ logger.remove()
+ logger.add(
+     sys.stdout,
+     colorize=True,
+     format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level> | <blue>{extra}</blue>",
+     level="DEBUG",
+ )
+
+ logger.info("This server was created by the 'AI Agents A-Z' YouTube channel")
+ logger.info("https://www.youtube.com/@aiagentsaz")
+ logger.info("Using device: {}", device)
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     logger.info("Starting up the server...")
+     yield
+     logger.info("Shutting down the server...")
+
+ app = FastAPI(lifespan=lifespan)
+
+
+ # add the auth middleware to the app; it exempts the /health endpoint
+ app.middleware("http")(auth_middleware)
+
+ @app.api_route("/", methods=["GET", "HEAD"])
+ def root():
+     return {
+         "message": "Welcome to the AI Agents A-Z No-Code Server",
+         "version": "0.3.5",
+         "documentation": "/docs",
+         "created_by": "https://www.youtube.com/@aiagentsaz"
+     }
+
+ @app.api_route("/health", methods=["GET", "HEAD"])
+ def healthcheck():
+     return {"status": "ok"}
+
+ api_router = APIRouter()
+ v1_api_router = APIRouter()
+
+ # todo: auto-delete files after 30 minutes (env var)
+
+ v1_api_router.include_router(v1_media_api_router, prefix="/media", tags=["media"])
+ v1_api_router.include_router(v1_utils_router, prefix="/utils", tags=["utils"])
+ api_router.include_router(v1_api_router, prefix="/v1", tags=["v1"])
+ app.include_router(api_router, prefix="/api", tags=["api"])
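
Given how the routers nest, media endpoints end up under `/api/v1/media/...` and utils endpoints under `/api/v1/utils/...`, while `/` and `/health` stay at the root. A small smoke test, assuming a local deployment with `AUTH_TOKENS` unset:

```python
import requests

base = "http://localhost:8000"  # assumed local deployment

print(requests.get(f"{base}/health").json())       # {'status': 'ok'}, always unauthenticated
print(requests.get(f"{base}/").json()["version"])  # '0.3.5'
```
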
utils/image.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import requests
3
+ from PIL import Image, ImageEnhance, ImageFilter, ImageDraw, ImageChops, ImageOps, ImageFont
4
+ from io import BytesIO
5
+ import math
6
+
7
+
8
+ def stitch_images(
9
+ image_urls: list[str],
10
+ max_width: int = 1920,
11
+ max_height: int = 1080
12
+ ):
13
+ """
14
+ Stitch multiple images into a single image.
15
+ Downloads images from URLs, arranges them in a grid, and resizes proportionally to fit max dimensions.
16
+
17
+ Args:
18
+ image_urls: List of image URLs to download and stitch
19
+ max_width: Maximum width of the final stitched image
20
+ max_height: Maximum height of the final stitched image
21
+
22
+ Returns:
23
+ PIL Image object of the stitched result
24
+ """
25
+ if not image_urls:
26
+ raise ValueError("No image URLs provided")
27
+
28
+ # Download and open all images
29
+ images = []
30
+ for url in image_urls:
31
+ try:
32
+ response = requests.get(url, timeout=30)
33
+ response.raise_for_status()
34
+ img = Image.open(BytesIO(response.content))
35
+ # Convert to RGB if necessary
36
+ if img.mode != 'RGB':
37
+ img = img.convert('RGB')
38
+ images.append(img)
39
+ except Exception as e:
40
+ print(f"Failed to download image from {url}: {e}")
41
+ continue
42
+
43
+ if not images:
44
+ raise ValueError("No valid images could be downloaded")
45
+
46
+ # Calculate optimal grid dimensions
47
+ num_images = len(images)
48
+ cols = math.ceil(math.sqrt(num_images))
49
+ rows = math.ceil(num_images / cols)
50
+
51
+ # Find the maximum dimensions among all images to ensure consistent sizing
52
+ max_img_width = max(img.width for img in images)
53
+ max_img_height = max(img.height for img in images)
54
+
55
+ # Calculate the size for each cell in the grid
56
+ cell_width = max_img_width
57
+ cell_height = max_img_height
58
+
59
+ # Create the stitched image canvas
60
+ canvas_width = cols * cell_width
61
+ canvas_height = rows * cell_height
62
+ stitched = Image.new('RGB', (canvas_width, canvas_height), color='white')
63
+
64
+ # Place images in the grid
65
+ for i, img in enumerate(images):
66
+ row = i // cols
67
+ col = i % cols
68
+
69
+ # Calculate position for this image
70
+ x = col * cell_width
71
+ y = row * cell_height
72
+
73
+ # Resize image to fit cell while maintaining aspect ratio
74
+ img_resized = resize_image_to_fit(img, cell_width, cell_height)
75
+
76
+ # Center the image in the cell
77
+ offset_x = (cell_width - img_resized.width) // 2
78
+ offset_y = (cell_height - img_resized.height) // 2
79
+
80
+ stitched.paste(img_resized, (x + offset_x, y + offset_y))
81
+
82
+ # Resize the final stitched image to fit within max dimensions
83
+ final_image = resize_image_to_fit(stitched, max_width, max_height)
84
+
85
+ return final_image
86
+
87
+ def resize_image_cover(
88
+ image_path: str,
89
+ target_width: int,
90
+ target_height: int,
91
+ output_path: str,
92
+ ) -> Image.Image:
93
+ """
94
+ Resize an image to fill the specified dimensions while maintaining aspect ratio.
95
+ The image is scaled to cover the entire target area and cropped to fit.
96
+
97
+ Args:
98
+ image: PIL Image object to resize
99
+ target_width: Target width
100
+ target_height: Target height
101
+
102
+ Returns:
103
+ Resized and cropped PIL Image object
104
+ """
105
+ image = Image.open(image_path)
106
+ # Calculate the scaling factor to cover the entire target area
107
+ width_ratio = target_width / image.width
108
+ height_ratio = target_height / image.height
109
+ scale_factor = max(width_ratio, height_ratio) # Use max to ensure coverage
110
+
111
+ # Scale the image
112
+ new_width = int(image.width * scale_factor)
113
+ new_height = int(image.height * scale_factor)
114
+ scaled_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
115
+
116
+ # Calculate crop box to center the image
117
+ left = (new_width - target_width) // 2
118
+ top = (new_height - target_height) // 2
119
+ right = left + target_width
120
+ bottom = top + target_height
121
+
122
+ # Crop the image to the target dimensions
123
+ cropped_image = scaled_image.crop((left, top, right, bottom))
124
+
125
+ # Convert to RGB if the image has transparency (RGBA mode)
126
+ if cropped_image.mode == 'RGBA':
127
+ # Create a white background and paste the image on it
128
+ rgb_image = Image.new('RGB', cropped_image.size, (255, 255, 255))
129
+ rgb_image.paste(cropped_image, mask=cropped_image.split()[-1]) # Use alpha channel as mask
130
+ cropped_image = rgb_image
131
+
132
+ cropped_image.save(output_path)
133
+
134
+ def resize_image_to_fit(image: Image.Image, max_width: int, max_height: int) -> Image.Image:
135
+ """
136
+ Resize an image to fit within the specified dimensions while maintaining aspect ratio.
137
+
138
+ Args:
139
+ image: PIL Image object to resize
140
+ max_width: Maximum width
141
+ max_height: Maximum height
142
+
143
+ Returns:
144
+ Resized PIL Image object
145
+ """
146
+ # Calculate the scaling factor to fit within max dimensions
147
+ width_ratio = max_width / image.width
148
+ height_ratio = max_height / image.height
149
+ scale_factor = min(width_ratio, height_ratio)
150
+
151
+ # Only resize if the image is larger than max dimensions
152
+ if scale_factor < 1:
153
+ new_width = int(image.width * scale_factor)
154
+ new_height = int(image.height * scale_factor)
155
+ return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
156
+
157
+ return image
158
+
159
+ def cup_of_coffee_tone(img):
160
+ sepia = ImageOps.colorize(img.convert("L"), "#704214", "#C0A080")
161
+ return Image.blend(img, sepia, alpha=0.2) # tweak alpha
162
+
163
+ def chromatic_aberration(img, shift=2):
164
+ r, g, b = img.split()
165
+ # Use transform with AFFINE to shift the channels
166
+ r = r.transform(img.size, Image.AFFINE, (1, 0, -shift, 0, 1, 0))
167
+ b = b.transform(img.size, Image.AFFINE, (1, 0, shift, 0, 1, 0))
168
+ return Image.merge("RGB", (r, g, b))
169
+
170
+ def make_image_imperfect(
171
+ image_path: str,
172
+ enhance_color: float = None,
173
+ enhance_contrast: float = None,
174
+ noise_strength: int = 15
175
+ ) -> Image.Image:
176
+ """
177
+ Remove AI-generated artifacts from an image.
178
+ This is a placeholder function. Actual implementation would depend on the specific algorithm used.
179
+
180
+ Args:
181
+ image_url: URL of the image to process
182
+
183
+ Returns:
184
+ PIL Image object of the processed result
185
+ """
186
+ try:
187
+ img = Image.open(image_path)
188
+
189
+ if enhance_color is not None:
190
+ img = ImageEnhance.Color(img).enhance(enhance_color)
191
+ if enhance_contrast is not None:
192
+ img = ImageEnhance.Contrast(img).enhance(enhance_contrast)
193
+
194
+ img = img.filter(ImageFilter.SHARPEN)
195
+ img = img.filter(ImageFilter.GaussianBlur(radius=0.5))
196
+
197
+ if img.mode != 'RGB':
198
+ img = img.convert('RGB')
199
+ img_array = np.array(img)
200
+ h, w, c = img_array.shape
201
+ grayscale_noise = np.random.randint(-noise_strength, noise_strength + 1, (h, w), dtype='int16')
202
+ noise = np.stack([grayscale_noise] * c, axis=2)
203
+ noisy_array = img_array.astype('int16') + noise
204
+ noisy_array = np.clip(noisy_array, 0, 255).astype('uint8')
205
+ img = Image.fromarray(noisy_array)
206
+
207
+ img = cup_of_coffee_tone(img)
208
+ img = chromatic_aberration(img, shift=1)
209
+
210
+ return img
211
+
212
+ except Exception as e:
213
+ print(f"Failed to process image from {image_path}: {e}")
214
+ raise ValueError("Failed to unaize image") from e
215
+
216
+ def create_text_image(
217
+ text: str,
218
+ size: tuple[int, int] = (1920, 1080),
219
+ font_size: int = 120,
220
+ font_color: str = "white",
221
+ font_path: str = None
222
+ ) -> Image.Image:
223
+ """
224
+ Create an image with centered text.
225
+
226
+ Args:
227
+ text: Text to display on the image
228
+ width: Width of the image
229
+ height: Height of the image
230
+ font_size: Size of the font
231
+ font_color: Color of the text
232
+
233
+ Returns:
234
+ PIL Image object with the text centered
235
+ """
236
+ img = Image.new('RGB', size, color='black')
237
+ draw = ImageDraw.Draw(img)
238
+
239
+ font = ImageFont.load_default(size=font_size)
240
+ if font_path:
241
+ font = ImageFont.truetype(font_path, font_size)
242
+ font_bbox = font.getbbox(text)
243
+ text_width = font_bbox[2] - font_bbox[0]
244
+ text_height = font_bbox[3] - font_bbox[1]
245
+ x = (size[0] - text_width) // 2
246
+ y = (size[1] - text_height) // 2
247
+ draw.text((x, y), text, fill=font_color, font=font)
248
+
249
+ return img
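A short usage sketch; the font path is a placeholder (omit it to fall back to Pillow's default font):

    # Hypothetical title card at vertical-video dimensions.
    card = create_text_image(
        "CHAPTER ONE",
        size=(1080, 1920),
        font_size=96,
        font_color="#FFD700",
        font_path="/path/to/font.ttf",  # placeholder; None uses the default font
    )
    card.save("title_card.png")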
250
+
251
+ def make_image_wobbly(
252
+ image: Image.Image,
253
+ wobble_amount: float = 3.0
254
+ ) -> Image.Image:
255
+ """
256
+ Apply a subtle wobble/distortion effect to an image, like viewing through water or a warped mirror.
257
+
258
+ Args:
259
+ image: PIL Image object to distort
260
+ wobble_amount: Strength of the wobble effect (0.5-10.0, higher = more distortion)
261
+
262
+ Returns:
263
+ PIL Image object with wobble effect applied
264
+ """
265
+ if image.mode != 'RGB':
266
+ image = image.convert('RGB')
267
+
268
+ width, height = image.size
269
+ img_array = np.array(image)
270
+
271
+ # Create coordinate grids
272
+ x_coords = np.arange(width)
273
+ y_coords = np.arange(height)
274
+ x_grid, y_grid = np.meshgrid(x_coords, y_coords)
275
+
276
+ # Create random wave patterns optimized for text
277
+ # Generate random parameters for each wave to ensure variety
278
+
279
+ # Random wave frequencies and phases for horizontal waves
280
+ freq1_h = np.random.uniform(2, 5) # Random frequency between 2-5
281
+ freq2_h = np.random.uniform(5, 10) # Random frequency between 5-10
282
+ phase1_h = np.random.uniform(0, 2 * np.pi) # Random phase
283
+ phase2_h = np.random.uniform(0, 2 * np.pi) # Random phase
284
+
285
+ wave_x1 = wobble_amount * 0.3 * np.sin(2 * np.pi * y_grid / (height / freq1_h) + phase1_h)
286
+ wave_x2 = wobble_amount * 0.1 * np.sin(2 * np.pi * y_grid / (height / freq2_h) + phase2_h)
287
+
288
+ # Random wave frequencies and phases for vertical waves
289
+ freq1_v = np.random.uniform(2, 6) # Random frequency between 2-6
290
+ freq2_v = np.random.uniform(6, 12) # Random frequency between 6-12
291
+ phase1_v = np.random.uniform(0, 2 * np.pi) # Random phase
292
+ phase2_v = np.random.uniform(0, 2 * np.pi) # Random phase
293
+
294
+ wave_y1 = wobble_amount * 0.3 * np.sin(2 * np.pi * x_grid / (width / freq1_v) + phase1_v)
295
+ wave_y2 = wobble_amount * 0.1 * np.sin(2 * np.pi * x_grid / (width / freq2_v) + phase2_v)
296
+
297
+ # Random circular ripples with random centers and frequencies
298
+ center_x = width // 2 + np.random.randint(-width//4, width//4)
299
+ center_y = height // 2 + np.random.randint(-height//4, height//4)
300
+ ripple_freq = np.random.uniform(80, 120) # Random ripple frequency
301
+ ripple_phase = np.random.uniform(0, 2 * np.pi) # Random ripple phase
302
+
303
+ distance = np.sqrt((x_grid - center_x)**2 + (y_grid - center_y)**2)
304
+ ripple_x = wobble_amount * 0.15 * np.sin(2 * np.pi * distance / ripple_freq + ripple_phase)
305
+ ripple_y = wobble_amount * 0.15 * np.cos(2 * np.pi * distance / ripple_freq + ripple_phase)
306
+
307
+ # Random noise for text preservation - NO FIXED SEED
308
+ noise_x = np.random.normal(0, wobble_amount * 0.05, (height, width))
309
+ noise_y = np.random.normal(0, wobble_amount * 0.05, (height, width))
310
+
311
+ # Combine all distortions
312
+ total_x_offset = wave_x1 + wave_x2 + ripple_x + noise_x
313
+ total_y_offset = wave_y1 + wave_y2 + ripple_y + noise_y
314
+
315
+ # Apply the distortion with proper boundary handling
316
+ new_x_coords = x_grid + total_x_offset
317
+ new_y_coords = y_grid + total_y_offset
318
+
319
+ # Use scipy.ndimage.map_coordinates for efficient interpolation
320
+ try:
321
+ from scipy.ndimage import map_coordinates
322
+
323
+ # Create coordinate arrays for map_coordinates (expects [y, x] order)
324
+ coords = np.array([new_y_coords, new_x_coords])
325
+
326
+ # Apply the transformation to each color channel with adaptive interpolation
327
+ # Use progressively smoother interpolation for higher wobble amounts
328
+ distorted_array = np.zeros_like(img_array)
329
+
330
+ # Choose interpolation method based on wobble amount for smoothest results
331
+ if wobble_amount <= 1.5:
332
+ # For very subtle wobbles, use nearest neighbor to preserve text sharpness
333
+ interpolation_order = 0
334
+ elif wobble_amount <= 3.0:
335
+ # For moderate wobbles, use linear interpolation
336
+ interpolation_order = 1
337
+ else:
338
+ # For strong wobbles, use cubic interpolation for smoothest edges
339
+ interpolation_order = 3
340
+
341
+ for channel in range(img_array.shape[2]):
342
+ distorted_array[:, :, channel] = map_coordinates(
343
+ img_array[:, :, channel],
344
+ coords,
345
+ order=interpolation_order,
346
+ mode='reflect', # Mirror edges instead of clipping
347
+ prefilter=True if interpolation_order > 1 else False # Use prefilter for cubic
348
+ )
349
+
350
+ result_img = Image.fromarray(distorted_array.astype(np.uint8))
351
+
352
+ # Post-process for smoother edges at higher wobble amounts
353
+ if wobble_amount > 2.0:
354
+ # Apply a very subtle Gaussian blur to smooth any remaining artifacts
355
+ result_img = result_img.filter(ImageFilter.GaussianBlur(radius=0.3))
356
+ # Then apply gentle sharpening to maintain text readability
357
+ result_img = result_img.filter(ImageFilter.UnsharpMask(radius=0.8, percent=60, threshold=1))
358
+ elif wobble_amount > 1.5:
359
+ # For moderate wobbles, just apply gentle sharpening
360
+ result_img = result_img.filter(ImageFilter.UnsharpMask(radius=0.5, percent=40, threshold=0))
361
+
362
+ return result_img
363
+
365
+
366
+ except ImportError:
367
+ # Fallback to PIL's transform if scipy is not available
368
+ # This is much faster than the pixel-by-pixel approach
369
+ from PIL.Image import AFFINE
370
+
371
+ # For a simple approximation, apply a slight transform
372
+ # This won't be as sophisticated but will be much faster
373
+ transformed = image.transform(
374
+ image.size,
375
+ AFFINE,
376
+ (1, 0.02 * wobble_amount/10, 0.02 * wobble_amount/10, 1, 0, 0),
377
+ resample=Image.BILINEAR
378
+ )
379
+
380
+ # Apply a slight rotation for additional wobble with random angle
381
+ angle = wobble_amount * 0.3 * np.random.uniform(-1, 1) # Random rotation
382
+ rotated = transformed.rotate(angle, resample=Image.BILINEAR, expand=False)
383
+
384
+ return rotated
385
+
386
+
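A sketch tying the two helpers together; wobble_amount=2.5 lands in the linear-interpolation branch, and the function silently falls back to the affine approximation if scipy is absent:

    # Hypothetical: give rendered text a hand-drawn feel.
    card = create_text_image("Hello", size=(640, 360), font_size=72)
    wobbled = make_image_wobbly(card, wobble_amount=2.5)
    wobbled.save("hello_wobbly.png")                 # illustrative output path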
utils/proxy.py ADDED
File without changes
video/builder.py ADDED
@@ -0,0 +1,347 @@
1
+ from video.media import MediaUtils
2
+ import time
3
+ from loguru import logger
4
+
5
+
6
+ class VideoBuilder:
7
+ """
8
+ Builder class for constructing FFmpeg video commands with a fluent interface.
9
+ """
10
+
11
+ def __init__(self, dimensions: tuple[int, int], ffmpeg_path="ffmpeg"):
12
+ if not isinstance(dimensions, tuple) or len(dimensions) != 2:
13
+ raise ValueError("Dimensions must be a tuple of (width, height).")
14
+
15
+ self.width, self.height = dimensions
16
+ self.ffmpeg_path = ffmpeg_path
17
+
18
+ # Components
19
+ self.background = None
20
+ self.audio_file = None
21
+ self.captions = None
22
+ self.output_path = "output.mp4"
23
+
24
+ # Internal state
25
+ self.media_utils = None
26
+
27
+ def set_media_utils(self, media_utils: MediaUtils):
28
+ """Set the media manager for duration calculations."""
29
+ self.media_utils = media_utils
30
+ return self
31
+
32
+ def set_background_image(self, file_path: str, effect_config: dict = None):
33
+ """Set background as an image with optional visual effects.
34
+
35
+ Args:
36
+ file_path: Path to the image file
37
+ effect_config: Configuration for visual effects. Supported effects:
38
+ - Ken Burns (zoom): {"effect": "ken_burns", "zoom_factor": 0.001, "direction": "zoom-to-top-left"}
39
+ - Pan: {"effect": "pan", "direction": "left-to-right", "speed": "normal"}
40
+ """
41
+ self.background = {
42
+ "type": "image",
43
+ "file": file_path,
44
+ "effect_config": effect_config or {"effect": "ken_burns"}, # Default to Ken Burns for backward compatibility
45
+ }
46
+ return self
47
+
48
+ def set_background_video(self, file_path: str):
49
+ """Set background as a video file."""
50
+ self.background = {"type": "video", "file": file_path}
51
+ return self
52
+
53
+ def set_audio(self, file_path: str):
54
+ """Set audio file."""
55
+ self.audio_file = file_path
56
+ return self
57
+
58
+ def set_captions(
59
+ self,
60
+ file_path: str = None,
61
+ config: dict = None,
62
+ ):
63
+ """Set caption subtitles
64
+
65
+ Args:
66
+ file_path: Path to subtitle file
67
+ config: Optional configuration dict
68
+ """
69
+ self.captions = {
70
+ "file": file_path,
71
+ **(config or {}),
72
+ }
73
+ return self
74
+
75
+ def set_output_path(self, output_path: str):
76
+ """Set output file path."""
77
+ self.output_path = output_path
78
+ return self
79
+
80
+ def build_command(self):
81
+ """Build the complete FFmpeg command."""
82
+ if not self.background:
83
+ raise ValueError("Background must be set (image or video).")
84
+
85
+ if not self.audio_file and not self.captions:
86
+ raise ValueError(
87
+ "At least one of audio_file, or captions must be provided."
88
+ )
89
+
90
+ # Validate combinations
91
+ if self.background["type"] == "image" and not self.audio_file:
92
+ raise ValueError("Audio file must be provided if background is an image.")
93
+
94
+ if (
95
+ self.background["type"] == "video"
96
+ and not self.audio_file
97
+ and self.captions is None
98
+ ):
99
+ raise ValueError(
100
+ "Audio file or captions must be provided if background is a video."
101
+ )
102
+
103
+ # Get audio duration if audio file is provided
104
+ audio_duration = None
105
+ if self.audio_file:
106
+ if not self.media_utils:
107
+ raise ValueError(
108
+ "Media manager must be set to determine audio duration."
109
+ )
110
+ media_info = self.media_utils.get_audio_info(self.audio_file)
111
+ audio_duration = media_info.get("duration")
112
+ if not audio_duration:
113
+ raise ValueError("Could not determine audio duration")
114
+
115
+ # Build command
116
+ cmd = [self.ffmpeg_path, "-y"]
117
+
118
+ filter_parts = []
119
+ input_index = 0
120
+
121
+ # Add background input
122
+ if self.background["type"] == "image":
123
+ cmd.extend(
124
+ ["-loop", "1", "-t", str(audio_duration), "-i", self.background["file"]]
125
+ )
126
+
127
+ # Get effect configuration with backward compatibility
128
+ effect_config = self.background.get("effect_config", {"effect": "ken_burns"})
129
+
130
+ # Handle backward compatibility for old ken_burns config
131
+ if "ken_burns" in self.background and "effect_config" not in self.background:
132
+ # Old format: {"ken_burns": {"zoom_factor": 0.001, "direction": "zoom-to-top-left"}}
133
+ old_ken_burns = self.background.get("ken_burns", {})
134
+ effect_config = {
135
+ "effect": "ken_burns",
136
+ "zoom_factor": old_ken_burns.get("zoom_factor", 0.001),
137
+ "direction": old_ken_burns.get("direction", "zoom-to-top-left")
138
+ }
139
+
140
+ effect_type = effect_config.get("effect", "ken_burns")
141
+
142
+ fps = 25
143
+ duration_frames = int(audio_duration * fps)
144
+
145
+ if effect_type == "ken_burns":
146
+ # Ken Burns (zoom) effect
147
+ zoom_factor = effect_config.get("zoom_factor", 0.001)
148
+ direction = effect_config.get("direction", "zoom-to-top-left")
149
+
150
+ # TODO: without upscaling we can't use the top- and center-zoom variants; upscaling increases render time
151
+ zoom_expressions = {
152
+ "zoom-to-top": f"z='zoom+{zoom_factor}':x=iw/2-(iw/zoom/2):y=0",
153
+ "zoom-to-center": f"z='zoom+{zoom_factor}':x=iw/2-(iw/zoom/2):y=ih/2-(ih/zoom/2)",
154
+ "zoom-to-top-left": f"z='zoom+{zoom_factor}':x=0:y=0",
155
+ }
156
+ zoom_expr = zoom_expressions.get(direction, zoom_expressions["zoom-to-top-left"])
157
+
158
+ zoompan_d = duration_frames + 1
159
+ filter_parts.append(
160
+ f"[{input_index}]scale={self.width}:-2,setsar=1:1,"
161
+ f"crop={self.width}:{self.height},"
162
+ f"zoompan={zoom_expr}:d={zoompan_d}:s={self.width}x{self.height}:fps={fps}[bg]"
163
+ )
164
+
165
+ elif effect_type == "pan":
166
+ # Pan effect - camera moves across the image
167
+ direction = effect_config.get("direction", "left-to-right")
168
+ speed = effect_config.get("speed", "normal")
169
+
170
+ # Speed multipliers
171
+ speed_multipliers = {
172
+ "slow": 0.5,
173
+ "normal": 1.0,
174
+ "fast": 2.0
175
+ }
176
+ speed_mult = speed_multipliers.get(speed, 1.0)
177
+
178
+ # Calculate pan distance based on direction
179
+ # We'll scale the image larger to allow for panning
180
+ scale_factor = 1.3 # Scale image 30% larger to allow room for panning
181
+ scaled_width = int(self.width * scale_factor)
182
+ scaled_height = int(self.height * scale_factor)
183
+
184
+ # Pan expressions for different directions
185
+ if direction == "left-to-right":
186
+ # Start from left, move to right
187
+ start_x = 0
188
+ end_x = scaled_width - self.width
189
+ start_y = (scaled_height - self.height) // 2
190
+ end_y = start_y
191
+ elif direction == "right-to-left":
192
+ # Start from right, move to left
193
+ start_x = scaled_width - self.width
194
+ end_x = 0
195
+ start_y = (scaled_height - self.height) // 2
196
+ end_y = start_y
197
+ elif direction == "top-to-bottom":
198
+ # Start from top, move to bottom
199
+ start_x = (scaled_width - self.width) // 2
200
+ end_x = start_x
201
+ start_y = 0
202
+ end_y = scaled_height - self.height
203
+ elif direction == "bottom-to-top":
204
+ # Start from bottom, move to top
205
+ start_x = (scaled_width - self.width) // 2
206
+ end_x = start_x
207
+ start_y = scaled_height - self.height
208
+ end_y = 0
209
+ else:
210
+ # Default to left-to-right
211
+ start_x = 0
212
+ end_x = scaled_width - self.width
213
+ start_y = (scaled_height - self.height) // 2
214
+ end_y = start_y
215
+
216
+ # Create pan expression
217
+ # Linear interpolation from start to end position over the duration
218
+ pan_x_expr = f"{start_x}+({end_x}-{start_x})*t/{audio_duration}*{speed_mult}"
219
+ pan_y_expr = f"{start_y}+({end_y}-{start_y})*t/{audio_duration}*{speed_mult}"
220
+
221
+ filter_parts.append(
222
+ f"[{input_index}]scale={scaled_width}:{scaled_height},setsar=1:1,"
223
+ f"crop={self.width}:{self.height}:{pan_x_expr}:{pan_y_expr}[bg]"
224
+ )
225
+
226
+ else:
227
+ # No effect, just scale and crop
228
+ filter_parts.append(
229
+ f"[{input_index}]scale={self.width}:{self.height},setsar=1:1[bg]"
230
+ )
231
+
232
+ elif self.background["type"] == "video":
233
+ if audio_duration:
234
+ cmd.extend(
235
+ [
236
+ "-stream_loop",
237
+ "-1",
238
+ "-t",
239
+ str(audio_duration),
240
+ "-i",
241
+ self.background["file"],
242
+ ]
243
+ )
244
+ else:
245
+ cmd.extend(["-i", self.background["file"]])
246
+ filter_parts.append(f"[{input_index}]scale={self.width}:{self.height}[bg]")
247
+
248
+ input_index += 1
249
+ current_video = "[bg]"
250
+
251
+ # Add audio input
252
+ audio_input_index = None
253
+ if self.audio_file:
254
+ cmd.extend(["-i", self.audio_file])
255
+ audio_input_index = input_index
256
+ input_index += 1
257
+
258
+ # Add subtitles or caption images if provided
259
+ if self.captions:
260
+ subtitle_file = self.captions.get("file")
261
+ if subtitle_file:
262
+ filter_parts.append(f"{current_video}subtitles={subtitle_file}[v]")
263
+ current_video = "[v]"
264
+ else:
265
+ # Rename final video output
266
+ if current_video == "[bg]":
267
+ current_video = "[v]"
268
+ filter_parts.append(f"[bg]copy[v]")
269
+
270
+ # Build filter complex
271
+ if filter_parts:
272
+ cmd.extend(["-filter_complex", ";".join(filter_parts)])
273
+
274
+ # Map video and audio
275
+ cmd.extend(["-map", current_video])
276
+ if audio_input_index is not None:
277
+ cmd.extend(["-map", f"{audio_input_index}:a"])
278
+
279
+ # Video codec settings
280
+ cmd.extend(["-c:v", "libx264", "-preset", "ultrafast"])
281
+
282
+ cmd.extend(["-crf", "23", "-pix_fmt", "yuv420p"])
283
+
284
+ # Audio codec settings
285
+ if self.audio_file:
286
+ cmd.extend(["-c:a", "aac", "-b:a", "192k"])
287
+ if audio_duration:
288
+ cmd.extend(["-t", str(audio_duration)])
289
+
290
+ cmd.append(self.output_path)
291
+ return cmd
292
+
293
+ def execute(self):
294
+ """Build and execute the FFmpeg command using MediaUtils for progress tracking."""
295
+ if not self.media_utils:
296
+ logger.error("MediaUtils must be set before executing video build")
297
+ return False
298
+
299
+ start = time.time()
300
+ context_logger = logger.bind(
301
+ dimensions=(self.width, self.height),
302
+ background_type=self.background.get("type") if self.background else None,
303
+ has_audio=bool(self.audio_file),
304
+ has_captions=bool(self.captions),
305
+ output_path=self.output_path,
307
+ )
308
+
309
+ try:
310
+ context_logger.debug("building video with VideoBuilder")
311
+ cmd = self.build_command()
312
+
313
+ # Calculate expected duration for progress tracking
314
+ expected_duration = None
315
+ if self.audio_file:
316
+ audio_info = self.media_utils.get_audio_info(self.audio_file)
317
+ expected_duration = audio_info.get("duration")
318
+ elif self.background and self.background.get("type") == "video":
319
+ video_info = self.media_utils.get_video_info(self.background["file"])
320
+ expected_duration = video_info.get("duration")
321
+
322
+ context_logger.bind(
323
+ command=" ".join(cmd),
324
+ expected_duration=expected_duration,
325
+ ).debug("executing video build command")
326
+ # Execute using MediaUtils for proper logging and progress tracking
327
+ success = self.media_utils.execute_ffmpeg_command(
328
+ cmd,
329
+ "build video",
330
+ expected_duration=expected_duration,
331
+ show_progress=True,
332
+ )
333
+
334
+ if success:
335
+ context_logger.bind(execution_time=time.time() - start).info(
336
+ "video built successfully"
337
+ )
338
+ return True
339
+ else:
340
+ context_logger.error("failed to build video")
341
+ return False
342
+
343
+ except Exception as e:
344
+ context_logger.bind(error=str(e), execution_time=time.time() - start).error(
345
+ "error during video rendering"
346
+ )
347
+ return False
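A hedged sketch of the fluent interface end to end; all file paths are placeholders and the effect config uses only keys handled above:

    # Hypothetical wiring of VideoBuilder (paths are placeholders).
    utils = MediaUtils(ffmpeg_path="ffmpeg")
    ok = (
        VideoBuilder(dimensions=(1080, 1920))
        .set_media_utils(utils)
        .set_background_image("bg.jpg", {"effect": "pan", "direction": "left-to-right", "speed": "slow"})
        .set_audio("narration.wav")
        .set_captions("captions.ass")
        .set_output_path("short.mp4")
        .execute()
    )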
video/caption.py ADDED
@@ -0,0 +1,354 @@
1
+ import string
2
+ from typing import List, Dict, Tuple
3
+ from loguru import logger
4
+
6
+
7
+
8
+ class Caption:
9
+ def is_punctuation(self, text):
10
+ return text in string.punctuation
11
+
12
+ def create_subtitle_segments_english(
13
+ self, captions: List[Dict], max_length=80, lines=2
14
+ ):
15
+ """
16
+ Breaks up the captions into segments of max_length characters
17
+ on two lines and merge punctuation with the last word
18
+ """
19
+
20
+ if not captions:
21
+ return []
22
+
23
+ segments = []
24
+ current_segment_texts = ["" for _ in range(lines)]
25
+ current_line = 0
26
+ segment_start_ts = captions[0]["start_ts"]
27
+ segment_end_ts = captions[0]["end_ts"]
28
+
29
+ for caption in captions:
30
+ text = caption["text"]
31
+ start_ts = caption["start_ts"]
32
+ end_ts = caption["end_ts"]
33
+
34
+ # Update the segment end timestamp
35
+ segment_end_ts = end_ts
36
+
37
+ # If the caption is a punctuation, merge it with the current line
38
+ if self.is_punctuation(text):
39
+ if current_line < lines and current_segment_texts[current_line]:
40
+ current_segment_texts[current_line] += text
41
+ continue
42
+
43
+ # If the line is too long, move to the next one
44
+ if (
45
+ current_line < lines
46
+ and len(current_segment_texts[current_line] + text) > max_length
47
+ ):
48
+ current_line += 1
49
+
50
+ # If we've filled all lines, save the current segment and start a new one
51
+ if current_line >= lines:
52
+ segments.append(
53
+ {
54
+ "text": current_segment_texts,
55
+ "start_ts": segment_start_ts,
56
+ "end_ts": segment_end_ts,
57
+ }
58
+ )
59
+
60
+ # Reset for next segment
61
+ current_segment_texts = ["" for _ in range(lines)]
62
+ current_line = 0
63
+ # Add a small gap (0.05s) between segments to prevent overlap
64
+ segment_start_ts = start_ts + 0.05
65
+
66
+ # Add the text to the current segment
67
+ if current_line < lines:
68
+ current_segment_texts[current_line] += (
69
+ " " if current_segment_texts[current_line] else ""
70
+ )
71
+ current_segment_texts[current_line] += text
72
+
73
+ # Add the last segment if there's any content
74
+ if any(current_segment_texts):
75
+ segments.append(
76
+ {
77
+ "text": current_segment_texts,
78
+ "start_ts": segment_start_ts,
79
+ "end_ts": segment_end_ts,
80
+ }
81
+ )
82
+
83
+ # Post-processing to ensure no overlaps by adjusting end times if needed
84
+ for i in range(len(segments) - 1):
85
+ if segments[i]["end_ts"] >= segments[i + 1]["start_ts"]:
86
+ segments[i]["end_ts"] = segments[i + 1]["start_ts"] - 0.05
87
+
88
+ return segments
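A worked example with fabricated word-level timestamps (the shape a Whisper-style transcriber emits); tracing the loop, the comma merges into "world" and "again" overflows onto line two:

    # Sketch: word-level captions -> two-line display segments.
    captions = [
        {"text": "Hello", "start_ts": 0.0, "end_ts": 0.4},
        {"text": "world", "start_ts": 0.4, "end_ts": 0.8},
        {"text": ",",     "start_ts": 0.8, "end_ts": 0.8},  # punctuation, merged in place
        {"text": "again", "start_ts": 0.9, "end_ts": 1.3},
    ]
    segments = Caption().create_subtitle_segments_english(captions, max_length=12, lines=2)
    # -> [{"text": ["Hello world,", "again"], "start_ts": 0.0, "end_ts": 1.3}]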
89
+
90
+ def create_subtitle_segments_international(
91
+ self, captions: List[Dict], max_length=80, lines=2
92
+ ):
93
+ """
94
+ Breaks up international captions (full sentences) into smaller segments that fit
95
+ within max_length characters per line, with proper timing distribution.
96
+
97
+ Handles both space-delimited languages like English and character-based languages like Chinese.
98
+
99
+ Args:
100
+ captions: List of caption dictionaries with text, start_ts, and end_ts
101
+ max_length: Maximum number of characters per line
102
+ lines: Number of lines per segment
103
+
104
+ Returns:
105
+ List of subtitle segments
106
+ """
107
+ if not captions:
108
+ return []
109
+
110
+ segments = []
111
+
112
+ for caption in captions:
113
+ text = caption["text"].strip()
114
+ start_ts = caption["start_ts"]
115
+ end_ts = caption["end_ts"]
116
+ duration = end_ts - start_ts
117
+
118
+ # Check if text is using Chinese/Japanese/Korean characters (CJK)
119
+ # For CJK, we'll split by characters rather than words
120
+ is_cjk = any("\u4e00" <= char <= "\u9fff" for char in text)
121
+
122
+ parts = []
123
+ if is_cjk:
124
+ # For CJK languages, process character by character
125
+ current_part = ""
126
+ for char in text:
127
+ if len(current_part + char) > max_length:
128
+ parts.append(current_part)
129
+ current_part = char
130
+ else:
131
+ current_part += char
132
+
133
+ # Add the last part if not empty
134
+ if current_part:
135
+ parts.append(current_part)
136
+ else:
137
+ # Original word-based splitting for languages with spaces
138
+ words = text.split()
139
+ current_part = ""
140
+
141
+ for word in words:
142
+ # If adding this word would exceed max_length, start a new part
143
+ if len(current_part + " " + word) > max_length and current_part:
144
+ parts.append(current_part.strip())
145
+ current_part = word
146
+ else:
147
+ # Add space if not the first word in the part
148
+ if current_part:
149
+ current_part += " "
150
+ current_part += word
151
+
152
+ # Add the last part if not empty
153
+ if current_part:
154
+ parts.append(current_part.strip())
155
+
156
+ # Group parts into segments with 'lines' number of lines per segment
157
+ segment_parts = []
158
+ for i in range(0, len(parts), lines):
159
+ segment_parts.append(parts[i : i + lines])
160
+
161
+ # Calculate time proportionally based on segment text length
162
+ total_chars = sum(len("".join(part_group)) for part_group in segment_parts)
163
+
164
+ current_time = start_ts
165
+ for i, part_group in enumerate(segment_parts):
166
+ # Get character count for this segment group
167
+ segment_chars = len("".join(part_group))
168
+
169
+ # Calculate time proportionally, but ensure at least a minimum duration
170
+ if total_chars > 0:
171
+ segment_duration = (segment_chars / total_chars) * duration
172
+ segment_duration = max(
173
+ segment_duration, 0.5
174
+ ) # Ensure minimum duration of 0.5s
175
+ else:
176
+ segment_duration = duration / len(segment_parts)
177
+
178
+ segment_start = current_time
179
+ segment_end = segment_start + segment_duration
180
+
181
+ # Move current time forward for next segment
182
+ current_time = segment_end
183
+
184
+ # Create segment with proper text array format for the subtitle renderer
185
+ segment_text = part_group + [""] * (lines - len(part_group))
186
+
187
+ segments.append(
188
+ {
189
+ "text": segment_text,
190
+ "start_ts": segment_start,
191
+ "end_ts": segment_end,
192
+ }
193
+ )
194
+
195
+ # Ensure no overlaps between segments by adjusting end times if needed
196
+ for i in range(len(segments) - 1):
197
+ if segments[i]["end_ts"] >= segments[i + 1]["start_ts"]:
198
+ segments[i]["end_ts"] = segments[i + 1]["start_ts"] - 0.05
199
+
200
+ return segments
201
+
202
+ @staticmethod
203
+ def hex_to_ass(hex_color: str, alpha: float = 1.0) -> str:
204
+ """
205
+ Convert a hex color + opacity to ASS &HAABBGGRR format.
206
+
207
+ :param hex_color: CSS-style color string, e.g. "#FFA07A" or "00ff00"
208
+ :param alpha: opacity from 1.0 (fully opaque) to 0.0 (fully transparent)
209
+ :return: ASS color string, e.g. "&H8014C8FF"
210
+ """
211
+
212
+ # strip leading '#' if present
213
+ hex_color = hex_color.lstrip('#')
214
+
215
+ # support 3-digit shorthand like 'f0a'
216
+ if len(hex_color) == 3:
217
+ hex_color = ''.join([c*2 for c in hex_color])
218
+
219
+ if len(hex_color) != 6:
220
+ raise ValueError("hex_color must be in 'RRGGBB' or 'RGB' format")
221
+
222
+ # parse RGB
223
+ r = int(hex_color[0:2], 16)
224
+ g = int(hex_color[2:4], 16)
225
+ b = int(hex_color[4:6], 16)
226
+
227
+ # ASS alpha is inverted: 00=opaque, FF=transparent
228
+ # so we invert the user's opacity (1.0 opaque -> 00, 0.0 transparent -> FF)
229
+ a = int((1.0 - alpha) * 255)
230
+ a = max(0, min(255, a))
231
+
232
+ # build BGR and alpha bytes
233
+ aa = f"{a:02X}"
234
+ bb = f"{b:02X}"
235
+ gg = f"{g:02X}"
236
+ rr = f"{r:02X}"
237
+
238
+ return f"&H{aa}{bb}{gg}{rr}"
239
+
240
+ def create_subtitle(
241
+ self,
242
+ segments,
243
+ dimensions: Tuple[int, int],
244
+ output_path: str,
245
+ font_size=24,
246
+ font_color="#fff",
247
+ shadow_color="#000",
248
+ shadow_transparency=0.1,
249
+ shadow_blur=0,
250
+ stroke_color="#000",
251
+ stroke_size=0,
252
+ font_name="Arial",
253
+ font_bold=True,
254
+ font_italic=False,
255
+ subtitle_position="center",
256
+ ):
257
+ width, height = dimensions
258
+ bold_value = -1 if font_bold else 0
259
+ italic_value = -1 if font_italic else 0
260
+
261
+ position_from_top = 0.2
262
+ if subtitle_position == "center":
263
+ position_from_top = 0.45
264
+ if subtitle_position == "bottom":
265
+ position_from_top = 0.75
266
+
267
+ ass_content = """[Script Info]
268
+ ScriptType: v4.00+
269
+ PlayResX: {width}
270
+ PlayResY: {height}
271
+
272
+ [V4+ Styles]
273
+ Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
274
+ Style: Default,{font_name},{font_size},{font_color},&H000000FF,{stroke_color},&H00000000,{bold},{italic},0,0,100,100,0,0,1,{stroke_size},0,8,20,20,20,1
275
+
276
+ [Events]
277
+ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
278
+ """.format(
279
+ width=width,
280
+ height=height,
281
+ font_size=font_size,
282
+ font_color=self.hex_to_ass(font_color),
283
+ stroke_color=self.hex_to_ass(stroke_color),
284
+ stroke_size=stroke_size,
285
+ font_name=font_name,
286
+ bold=bold_value,
287
+ italic=italic_value
288
+ )
289
+
290
+ pos_x = int(width / 2)
291
+ pos_y = int(height * position_from_top)
292
+
293
+ # Process each segment and add to the subtitle file
294
+ for segment in segments:
295
+ start_time = self.format_time(segment["start_ts"])
296
+ end_time = self.format_time(segment["end_ts"])
297
+
298
+ # Create text with line breaks
299
+ text_lines = segment["text"]
300
+ formatted_text = ""
301
+ for i, line in enumerate(text_lines):
302
+ if line: # Only add non-empty lines
303
+ if i > 0: # Add line break if not the first line
304
+ formatted_text += "\\N"
305
+ formatted_text += line
306
+
307
+ # Create shadow if shadow_blur is specified or if we want a drop shadow effect
308
+ if shadow_blur > 0 or shadow_transparency < 1.0:
309
+ # Convert shadow color with transparency
310
+ shadow_color_ass = self.hex_to_ass(shadow_color, shadow_transparency)
311
+
312
+ # Offset shadow position slightly for drop shadow effect
313
+ shadow_pos_x = pos_x + 2
314
+ shadow_pos_y = pos_y + 2
315
+
316
+ # For shadow text, use shadow color only for primary color and set proper alpha
317
+ # Only apply shadow color to primary color (\1c) and use alpha for transparency
318
+ shadow_override_tags = f"\\pos({shadow_pos_x},{shadow_pos_y})\\1c{shadow_color_ass}\\bord0"
319
+
320
+ # Add alpha transparency if needed
321
+ if shadow_transparency > 0:
322
+ alpha_hex = hex(int((1.0 - shadow_transparency) * 255))[2:].upper().zfill(2)
323
+ shadow_override_tags += f"\\1a&H{alpha_hex}&"
324
+
325
+ if shadow_blur > 0:
326
+ shadow_override_tags += f"\\blur{shadow_blur}"
327
+
328
+ shadow_formatted_text = f"{{{shadow_override_tags}}}" + formatted_text
329
+
330
+ # Add shadow dialogue line first (so it appears behind)
331
+ ass_content += f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{shadow_formatted_text}\n"
332
+
333
+ # Create main text layer
334
+ main_override_tags = f"\\pos({pos_x},{pos_y})"
335
+ main_formatted_text = f"{{{main_override_tags}}}" + formatted_text
336
+
337
+ # Add main dialogue line (appears on top)
338
+ ass_content += f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,{main_formatted_text}\n"
339
+
340
+ with open(output_path, "w", encoding="utf-8") as f:
341
+ f.write(ass_content)
342
+
343
+ logger.debug("subtitle (ass) was created with drop shadow")
344
+
345
+ def format_time(self, seconds):
346
+ """
347
+ Convert seconds to ASS time format (H:MM:SS.cc)
348
+ """
349
+ hours = int(seconds // 3600)
350
+ minutes = int((seconds % 3600) // 60)
351
+ secs = int(seconds % 60)
352
+ centisecs = int((seconds % 1) * 100)
353
+
354
+ return f"{hours}:{minutes:02d}:{secs:02d}.{centisecs:02d}"
video/config.py ADDED
@@ -0,0 +1,53 @@
1
+ import os
2
+ import torch
3
+ from loguru import logger
4
+
5
+ # Pick the best available torch device
6
+ if torch.cuda.is_available():
7
+ device = torch.device("cuda")
8
+ elif torch.backends.mps.is_available():
9
+ device = torch.device("mps")
10
+ else:
11
+ device = torch.device("cpu")
12
+ num_cores = os.cpu_count()
13
+ if os.path.exists("/sys/fs/cgroup/cpu.max"):
14
+ with open("/sys/fs/cgroup/cpu.max", "r") as f:
15
+ line = f.readline()
16
+ if len(line.split()) == 2:
17
+ if line.split()[0] == "max":
18
+ logger.info(
19
+ "File /sys/fs/cgroup/cpu.max has max value, using os.cpu_count()"
20
+ )
21
+ else:
22
+ cpu_max = int(line.split()[0])
23
+ cpu_period = int(line.split()[1])
24
+ num_cores = cpu_max // cpu_period
25
+ logger.info("Using {} cores", num_cores)
26
+ else:
27
+ logger.warning(
28
+ "File /sys/fs/cgroup/cpu.max does not have 2 values, using os.cpu_count()"
29
+ )
30
+ else:
31
+ logger.info("File /sys/fs/cgroup/cpu.max not found, using os.cpu_count()")
32
+
33
+ logger.info("number of CPU cores: {}", num_cores)
34
+ num_threads = os.environ.get("NUM_THREADS", num_cores)
35
+ logger.info("number of threads to use with torch: {}", num_threads)
36
+ torch.set_num_threads(int(num_threads))
37
+ torch.set_num_interop_threads(int(num_threads))
38
+
39
+ map_location = torch.device(device)
40
+
41
+ torch_load_original = torch.load
42
+
43
+
44
+ def patched_torch_load(*args, **kwargs):
45
+ if "map_location" not in kwargs:
46
+ kwargs["map_location"] = map_location
47
+ return torch_load_original(*args, **kwargs)
48
+
49
+
50
+ torch.load = patched_torch_load
51
+
52
+ whisper_model = os.environ.get("WHISPER_MODEL", "small")
53
+ whisper_compute_type = os.environ.get("WHISPER_COMPUTE_TYPE", "int8")
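The cpu.max arithmetic above in isolation, as a self-contained sketch (the sample strings are made up):

    # quota/period -> usable cores; "max" means unlimited.
    def cores_from_cpu_max(line: str, fallback: int) -> int:
        parts = line.split()
        if len(parts) == 2 and parts[0] != "max":
            return int(parts[0]) // int(parts[1])
        return fallback

    assert cores_from_cpu_max("200000 100000", 8) == 2  # two CPUs' worth of quota
    assert cores_from_cpu_max("max 100000", 8) == 8     # unlimited -> os.cpu_count()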
video/media.py ADDED
@@ -0,0 +1,850 @@
1
+ import subprocess
2
+ import json
3
+ import time
4
+ from loguru import logger
5
+
6
+
7
+ class MediaUtils:
8
+ def __init__(self, ffmpeg_path="ffmpeg"):
9
+ """
10
+ Initializes the MediaUtils class.
11
+
12
+ Args:
13
+ ffmpeg_path: Path to the ffmpeg executable
14
+ """
15
+ self.ffmpeg_path = ffmpeg_path
16
+
17
+ def merge_videos(
18
+ self,
19
+ video_paths: list,
20
+ output_path: str,
21
+ background_music_path: str = None,
22
+ background_music_volume: float = 0.5,
23
+ ) -> bool:
24
+ """
25
+ Merges multiple video files into one, optionally with background music.
26
+
27
+ Args:
28
+ video_paths: List of paths to video files to merge
29
+ output_path: Path for the merged output video
30
+ background_music_path: Optional path to background music file
31
+ background_music_volume: Volume level for background music (0.0 to 1.0, default 0.5)
32
+
33
+ Returns:
34
+ bool: True if successful, False otherwise
35
+ """
36
+ if not video_paths:
37
+ logger.error("no video paths provided for merging")
38
+ return False
39
+
40
+ start = time.time()
41
+ context_logger = logger.bind(
42
+ number_of_videos=len(video_paths),
43
+ output_path=output_path,
44
+ background_music=bool(background_music_path),
45
+ background_music_volume=background_music_volume,
46
+ )
47
+
48
+ try:
49
+ # Get dimensions from the first video
50
+ first_video_info = self.get_video_info(video_paths[0])
51
+ if not first_video_info:
52
+ context_logger.error("failed to get video info from first video")
53
+ return False
54
+
55
+ target_width = first_video_info.get("width", 1080)
56
+ target_height = first_video_info.get("height", 1920)
57
+ target_dimensions = f"{target_width}:{target_height}"
58
+
59
+ context_logger.bind(
60
+ target_width=target_width, target_height=target_height
61
+ ).debug("using dimensions from first video")
62
+
63
+ # Base command
64
+ cmd = [self.ffmpeg_path, "-y"]
65
+
66
+ # Add input video files
67
+ for video_path in video_paths:
68
+ cmd.extend(["-i", video_path])
69
+
70
+ # Add background music if provided
71
+ music_input_index = None
72
+ if background_music_path:
73
+ cmd.extend(["-stream_loop", "-1", "-i", background_music_path])
74
+ music_input_index = len(video_paths)
75
+
76
+ # Create filter complex for concatenating videos with re-encoding
77
+ if len(video_paths) == 1:
78
+ # Single video - re-encode to ensure consistency
79
+ # Check if the video has audio
80
+ audio_info = self.get_audio_info(video_paths[0])
81
+ has_audio = bool(audio_info.get('duration', 0) > 0)
82
+
83
+ if background_music_path:
84
+ if has_audio:
85
+ cmd.extend(
86
+ [
87
+ "-filter_complex",
88
+ f"[0:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30[v];[{music_input_index}:a]volume={background_music_volume}[bg];[0:a][bg]amix=inputs=2:duration=first[a]",
89
+ "-map",
90
+ "[v]",
91
+ "-map",
92
+ "[a]",
93
+ ]
94
+ )
95
+ else:
96
+ # No audio in video, just use background music
97
+ cmd.extend(
98
+ [
99
+ "-filter_complex",
100
+ f"[0:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30[v];[{music_input_index}:a]volume={background_music_volume}[a]",
101
+ "-map",
102
+ "[v]",
103
+ "-map",
104
+ "[a]",
105
+ ]
106
+ )
107
+ else:
108
+ if has_audio:
109
+ cmd.extend(
110
+ [
111
+ "-filter_complex",
112
+ f"[0:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30[v]",
113
+ "-map",
114
+ "[v]",
115
+ "-map",
116
+ "0:a",
117
+ ]
118
+ )
119
+ else:
120
+ # No audio in video and no background music, create silent audio
121
+ video_info = self.get_video_info(video_paths[0])
122
+ video_duration = video_info.get('duration', 10) # fallback to 10 seconds
123
+ cmd.extend(
124
+ [
125
+ "-filter_complex",
126
+ f"[0:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30[v];anullsrc=channel_layout=stereo:sample_rate=48000:duration={video_duration}[a]",
127
+ "-map",
128
+ "[v]",
129
+ "-map",
130
+ "[a]",
131
+ ]
132
+ )
133
+ else:
134
+ # Multiple videos - normalize and concatenate with re-encoding
135
+ # First, check which videos have audio streams
136
+ videos_with_audio = []
137
+ for i, video_path in enumerate(video_paths):
138
+ video_info = self.get_video_info(video_path)
139
+ # Check if video has audio by trying to get audio info
140
+ audio_info = self.get_audio_info(video_path)
141
+ has_audio = bool(audio_info.get('duration', 0) > 0)
142
+ videos_with_audio.append(has_audio)
143
+ context_logger.bind(video_index=i, has_audio=has_audio).debug("checked audio stream")
144
+
145
+ # Create normalized video streams for each input
146
+ normalize_filters = []
147
+ for i in range(len(video_paths)):
148
+ normalize_filters.append(
149
+ f"[{i}:v]scale={target_dimensions}:force_original_aspect_ratio=decrease,pad={target_dimensions}:(ow-iw)/2:(oh-ih)/2:black,fps=30,format=yuv420p[v{i}n]"
150
+ )
151
+
152
+ # Create audio streams for videos without audio (silent audio)
153
+ audio_filters = []
154
+ for i in range(len(video_paths)):
155
+ if not videos_with_audio[i]:
156
+ # Get video duration for silent audio generation
157
+ video_info = self.get_video_info(video_paths[i])
158
+ video_duration = video_info.get('duration', 10) # fallback to 10 seconds
159
+ audio_filters.append(f"anullsrc=channel_layout=stereo:sample_rate=48000:duration={video_duration}[a{i}n]")
160
+ else:
161
+ audio_filters.append(f"[{i}:a]aformat=sample_rates=48000:channel_layouts=stereo[a{i}n]")
162
+
163
+ # Create the concat filter using normalized streams
164
+ concat_inputs = ""
165
+ for i in range(len(video_paths)):
166
+ concat_inputs += f"[v{i}n][a{i}n]"
167
+
168
+ # Combine all filters
169
+ all_filters = normalize_filters + audio_filters
170
+ filter_complex = (
171
+ ";".join(all_filters)
172
+ + f";{concat_inputs}concat=n={len(video_paths)}:v=1:a=1[v][a]"
173
+ )
174
+
175
+ if background_music_path:
176
+ # Mix the concatenated audio with background music
177
+ filter_complex += f";[{music_input_index}:a]volume={background_music_volume}[bg];[a][bg]amix=inputs=2:duration=first[final_a]"
178
+ cmd.extend(
179
+ [
180
+ "-filter_complex",
181
+ filter_complex,
182
+ "-map",
183
+ "[v]",
184
+ "-map",
185
+ "[final_a]",
186
+ ]
187
+ )
188
+ else:
189
+ cmd.extend(
190
+ [
191
+ "-filter_complex",
192
+ filter_complex,
193
+ "-map",
194
+ "[v]",
195
+ "-map",
196
+ "[a]",
197
+ ]
198
+ )
199
+
200
+ # Video codec settings
201
+ cmd.extend(
202
+ [
203
+ "-c:v",
204
+ "libx264",
205
+ "-preset",
206
+ "veryfast",
207
+ "-crf",
208
+ "23",
209
+ ]
210
+ )
211
+
212
+ # Audio codec settings
213
+ cmd.extend(["-c:a", "aac", "-b:a", "192k"])
214
+
215
+ # Other settings
216
+ cmd.extend(["-pix_fmt", "yuv420p", output_path])
217
+
218
+ # Execute the command using the new method
219
+
220
+ # calculate expected duration for progress tracking
221
+ expected_duration = 0
222
+ for video_path in video_paths:
223
+ video_info = self.get_video_info(video_path)
224
+ expected_duration += video_info.get("duration", 0)
225
+
226
+ success = self.execute_ffmpeg_command(
227
+ cmd,
228
+ "merge videos",
229
+ expected_duration=expected_duration,
230
+ show_progress=True,
231
+ )
232
+
233
+ if success:
234
+ context_logger.bind(execution_time=time.time() - start).debug(
235
+ "videos merged successfully",
236
+ )
237
+ return True
238
+ else:
239
+ context_logger.error("ffmpeg failed to merge videos")
240
+ return False
241
+
242
+ except Exception as e:
243
+ context_logger.bind(error=str(e)).error(
244
+ "error merging videos",
245
+ )
246
+ return False
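An illustrative call; clip and music paths are placeholders:

    # Hypothetical merge of three clips under quiet background music.
    utils = MediaUtils()
    utils.merge_videos(
        ["intro.mp4", "main.mp4", "outro.mp4"],
        "final.mp4",
        background_music_path="bed.mp3",
        background_music_volume=0.2,
    )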
247
+
248
+ def get_video_info(self, file_path: str) -> dict:
249
+ """
250
+ Retrieves video information such as duration, width, height, codec, fps, etc.
251
+
252
+ Args:
253
+ file_path: Path to the video file
254
+
255
+ Returns:
256
+ Dictionary containing video information
257
+ """
258
+ try:
259
+ cmd = [
260
+ "ffprobe",
261
+ "-v",
262
+ "quiet",
263
+ "-print_format",
264
+ "json",
265
+ "-show_format",
266
+ "-show_streams",
267
+ "-select_streams",
268
+ "v:0", # Select first video stream
269
+ file_path,
270
+ ]
271
+
272
+ success, stdout, stderr = self.execute_ffprobe_command(
273
+ cmd, "get video info"
274
+ )
275
+
276
+ if not success:
277
+ raise Exception(f"ffprobe failed: {stderr}")
278
+
279
+ probe_data = json.loads(stdout)
280
+
281
+ # Extract format information
282
+ format_info = probe_data.get("format", {})
283
+ streams = probe_data.get("streams", [])
284
+
285
+ if not streams:
286
+ raise Exception("No video stream found in file")
287
+
288
+ video_stream = streams[0]
289
+
290
+ video_info = {
291
+ "duration": float(format_info.get("duration", 0)),
292
+ "width": video_stream.get("width"),
293
+ "height": video_stream.get("height"),
294
+ "fps": video_stream.get("avg_frame_rate", "0/1").split("/")[0],
295
+ "aspect_ratio": video_stream.get("display_aspect_ratio", "1:1"),
296
+ "codec": video_stream.get("codec_name"),
297
+ }
298
+
299
+ return video_info
300
+
301
+ except Exception as e:
302
+ logger.bind(file_path=file_path, error=str(e)).error(
303
+ "error getting video info"
304
+ )
305
+ return {}
306
+
307
+ def get_audio_info(self, file_path: str) -> dict:
308
+ """
309
+ Retrieves audio information such as duration, codec, bitrate, sample rate, channels, etc.
310
+
311
+ Args:
312
+ file_path: Path to the audio file
313
+
314
+ Returns:
315
+ Dictionary containing audio information
316
+ """
317
+ try:
318
+ cmd = [
319
+ "ffprobe",
320
+ "-v",
321
+ "quiet",
322
+ "-print_format",
323
+ "json",
324
+ "-show_format",
325
+ "-show_streams",
326
+ "-select_streams",
327
+ "a:0", # Select first audio stream
328
+ file_path,
329
+ ]
330
+
331
+ success, stdout, stderr = self.execute_ffprobe_command(
332
+ cmd, "get audio info"
333
+ )
334
+
335
+ if not success:
336
+ raise Exception(f"ffprobe failed: {stderr}")
337
+
338
+ probe_data = json.loads(stdout)
339
+
340
+ # Extract format information
341
+ format_info = probe_data.get("format", {})
342
+ streams = probe_data.get("streams", [])
343
+
344
+ if not streams:
345
+ raise Exception("No audio stream found in file")
346
+
347
+ audio_stream = streams[0]
348
+
349
+ audio_info = {
350
+ "duration": float(format_info.get("duration", 0)),
351
+ "channels": audio_stream.get("channels", 0),
352
+ "sample_rate": audio_stream.get("sample_rate", "0"),
353
+ "codec": audio_stream.get("codec_name", ""),
354
+ "bitrate": audio_stream.get("bit_rate", "0"),
355
+ }
356
+
357
+ return audio_info
358
+
359
+ except Exception as e:
360
+ logger.bind(file_path=file_path, error=str(e)).error(
361
+ "Error getting audio info"
362
+ )
363
+ return {}
364
+
365
+ def extract_frame(
366
+ self,
367
+ video_path: str,
368
+ output_path: str,
369
+ time_seconds: float = 0.0,
370
+ ) -> bool:
371
+ """
372
+ Extracts a frame from a video at a specified time.
373
+
374
+ Args:
375
+ video_path: Path to the input video file
376
+ output_path: Path for the extracted frame image
377
+ time_seconds: Time in seconds to extract the frame (default: 0.0)
378
+
379
+ Returns:
380
+ bool: True if successful, False otherwise
381
+ """
382
+ try:
383
+ # Base command
384
+ cmd = [self.ffmpeg_path, "-y"]
385
+
386
+ # Add input video file
387
+ cmd.extend(["-i", video_path])
388
+
389
+ # Seek to the specified time and extract one frame
390
+ cmd.extend(
391
+ [
392
+ "-ss",
393
+ str(time_seconds), # Seek to time
394
+ "-vframes",
395
+ "1", # Extract only one frame
396
+ "-q:v",
397
+ "2", # High quality (scale 1-31, lower is better)
398
+ output_path,
399
+ ]
400
+ )
401
+
402
+ # Execute the command using the new method
403
+ success = self.execute_ffmpeg_command(
404
+ cmd,
405
+ "extract frame",
406
+ show_progress=False, # No progress tracking for single frame extraction
407
+ )
408
+
409
+ if success:
410
+ logger.bind(video_path=video_path, time_seconds=time_seconds).debug(
411
+ "frame extracted successfully"
412
+ )
413
+ return True
414
+ else:
415
+ logger.bind(video_path=video_path, time_seconds=time_seconds).error(
416
+ "failed to extract frame from video"
417
+ )
418
+ return False
419
+
420
+ except Exception as e:
421
+ logger.bind(error=str(e)).error("Error extracting frame from video")
422
+ return False
423
+
424
+ def extract_frames(
425
+ self,
426
+ video_path: str,
427
+ output_template: str,
428
+ amount: int = 5,
429
+ length_seconds: float = None,
430
+ ) -> bool:
431
+ """
432
+ Extracts a given number of frames evenly spaced across the video.
+
+ Args:
433
+ video_path: Path to the input video file
434
+ output_template: Template for output image files (e.g., "frame-%03d.jpg")
435
+ amount: Number of frames to extract (default: 5)
436
+ length_seconds: Length of the video in seconds (optional, if not provided will be calculated)
437
+
438
+ Returns:
439
+ bool: True if successful, False otherwise
440
+ """
441
+ try:
442
+ # Get video duration if not provided
443
+ if length_seconds is None:
444
+ video_info = self.get_video_info(video_path)
445
+ length_seconds = video_info.get("duration", 0)
446
+
447
+ if length_seconds <= 0:
448
+ logger.error("invalid video duration for frame extraction")
449
+ return False
450
+
451
+ # Calculate frame interval (time between frames)
452
+ # This gives us the correct fps rate to extract exactly 'amount' frames
453
+ # evenly distributed across the video duration
454
+ frame_interval = length_seconds / amount
455
+
456
+ # Base command - using the corrected fps calculation
457
+ # fps=1/frame_interval extracts one frame every frame_interval seconds
458
+ cmd = [
459
+ self.ffmpeg_path,
460
+ "-y",
461
+ "-i",
462
+ video_path,
463
+ "-vf",
464
+ f"fps=1/{frame_interval}",
465
+ "-vframes",
466
+ str(amount),
467
+ "-qscale:v",
468
+ "2", # High quality
469
+ output_template,
470
+ ]
471
+
472
+ # Execute the command using the new method
473
+ success = self.execute_ffmpeg_command(
474
+ cmd,
475
+ "extract frames",
476
+ expected_duration=length_seconds,
477
+ show_progress=True,
478
+ )
479
+
480
+ if success:
481
+ logger.bind(video_path=video_path, amount=amount).debug(
482
+ "frames extracted successfully"
483
+ )
484
+ return True
485
+ else:
486
+ logger.bind(video_path=video_path, amount=amount).error(
487
+ "failed to extract frames from video"
488
+ )
489
+ return False
490
+
491
+ except Exception as e:
492
+ logger.bind(error=str(e)).error("Error extracting frames from video")
493
+ return False
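A usage sketch; the %03d printf pattern in the template numbers the output files:

    # Hypothetical: five evenly spaced stills from a finished render.
    utils = MediaUtils()
    utils.extract_frames("final.mp4", "tmp/frame-%03d.jpg", amount=5)
    # writes tmp/frame-001.jpg ... tmp/frame-005.jpg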
494
+
495
+ def format_time(self, seconds: float) -> str:
496
+ """
497
+ Format seconds into HH:MM:SS format.
498
+
499
+ Args:
500
+ seconds: Time in seconds
501
+
502
+ Returns:
503
+ Formatted time string
504
+ """
505
+ hours = int(seconds // 3600)
506
+ minutes = int((seconds % 3600) // 60)
507
+ seconds = int(seconds % 60)
508
+ return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
509
+
510
+ def execute_ffmpeg_command(
511
+ self,
512
+ cmd: list,
513
+ operation_name: str,
514
+ expected_duration: float = None,
515
+ show_progress: bool = True,
516
+ ) -> bool:
517
+ """
518
+ Execute an ffmpeg command with proper logging and progress tracking.
519
+
520
+ Args:
521
+ cmd: The ffmpeg command as a list
522
+ operation_name: Name of the operation for logging
523
+ expected_duration: Expected duration for progress calculation
524
+ show_progress: Whether to show progress information
525
+
526
+ Returns:
527
+ bool: True if successful, False otherwise
528
+ """
529
+ try:
530
+ logger.bind(command=" ".join(cmd), operation=operation_name).debug(
531
+ f"executing ffmpeg command for {operation_name}"
532
+ )
533
+
534
+ process = subprocess.Popen(
535
+ cmd,
536
+ stderr=subprocess.PIPE,
537
+ universal_newlines=True,
538
+ text=True,
539
+ )
540
+
541
+ # Process the output line by line as it becomes available
542
+ for line in process.stderr:
543
+ # Extract time information for progress tracking
544
+ if (
545
+ show_progress
546
+ and expected_duration
547
+ and "time=" in line
548
+ and "speed=" in line
549
+ ):
550
+ try:
551
+ # Extract the time information
552
+ time_str = line.split("time=")[1].split(" ")[0]
553
+ # Convert HH:MM:SS.MS format to seconds
554
+ h, m, s = time_str.split(":")
555
+ seconds = float(h) * 3600 + float(m) * 60 + float(s)
556
+
557
+ # Calculate progress percentage
558
+ progress = min(100, (seconds / expected_duration) * 100)
559
+ logger.info(
560
+ f"{operation_name}: {progress:.2f}% complete (Time: {time_str} / Total: {self.format_time(expected_duration)})"
561
+ )
562
+ except (ValueError, IndexError):
563
+ # If parsing fails, continue silently
564
+ pass
565
+ elif any(
566
+ keyword in line
567
+ for keyword in [
568
+ # Skip initialization information
569
+ "ffmpeg version",
570
+ "built with",
571
+ "configuration:",
572
+ "libav",
573
+ "Input #",
574
+ "Metadata:",
575
+ "Duration:",
576
+ "Stream #",
577
+ "Press [q]",
578
+ "Output #",
579
+ "Stream mapping:",
580
+ # Skip processing details
581
+ "frame=",
582
+ "fps=",
583
+ "[libx264",
584
+ "kb/s:",
585
+ "Qavg:",
586
+ "video:",
587
+ "audio:",
588
+ "subtitle:",
589
+ "frame I:",
590
+ "frame P:",
591
+ "mb I",
592
+ "mb P",
593
+ "coded y,",
594
+ "i16 v,h,dc,p:",
595
+ "i8c dc,h,v,p:",
596
+ "compatible_brands:",
597
+ "encoder",
598
+ "Side data:",
599
+ "libswscale",
600
+ "libswresample",
601
+ "libpostproc",
602
+ # Additional patterns to filter
603
+ "ffmpeg: libswscale",
604
+ "ffmpeg: libswresample",
605
+ "ffmpeg: libpostproc",
606
+ ]
607
+ ):
608
+ # Skip all technical output lines
609
+ pass
610
+ else:
611
+ # Only print important messages (like errors and warnings)
612
+ # that don't match any of the filtered patterns
613
+ if not line.strip() or line.strip().startswith("["):
614
+ continue
615
+
616
+ # Skip header lines that describe inputs
617
+ if ":" in line and any(
618
+ header in line
619
+ for header in [
620
+ "major_brand",
621
+ "minor_version",
622
+ "creation_time",
623
+ "handler_name",
624
+ "vendor_id",
625
+ "Duration",
626
+ "bitrate",
627
+ ]
628
+ ):
629
+ continue
630
+
631
+ logger.debug(f"ffmpeg: {line.strip()}")
632
+
633
+ # Wait for the process to complete and check the return code
634
+ return_code = process.wait()
635
+ if return_code != 0:
636
+ logger.bind(return_code=return_code, operation=operation_name).error(
637
+ f"ffmpeg exited with code: {return_code} for {operation_name}"
638
+ )
639
+ return False
640
+
641
+ logger.bind(operation=operation_name).debug(
642
+ f"{operation_name} completed successfully"
643
+ )
644
+ return True
645
+
646
+ except Exception as e:
647
+ logger.bind(error=str(e), operation=operation_name).error(
648
+ f"error executing ffmpeg command for {operation_name}"
649
+ )
650
+ return False
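The "time=" extraction above, demonstrated on a typical ffmpeg stderr line (the line itself is fabricated):

    # Standalone illustration of the progress parsing.
    line = "frame= 120 fps= 30 q=28.0 size= 512kB time=00:00:04.00 bitrate=1048.6kbits/s speed=1.2x"
    time_str = line.split("time=")[1].split(" ")[0]        # "00:00:04.00"
    h, m, s = time_str.split(":")
    seconds = float(h) * 3600 + float(m) * 60 + float(s)   # 4.0
    print(f"{min(100, seconds / 10.0 * 100):.1f}% of a 10s render")  # 40.0%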
651
+
652
+ def execute_ffprobe_command(
653
+ self, cmd: list, operation_name: str
654
+ ) -> tuple[bool, str, str]:
655
+ """
656
+ Execute an ffprobe command with proper logging.
657
+
658
+ Args:
659
+ cmd: The ffprobe command as a list
660
+ operation_name: Name of the operation for logging
661
+
662
+ Returns:
663
+ tuple: (success, stdout, stderr)
664
+ """
665
+ try:
666
+ logger.bind(command=" ".join(cmd), operation=operation_name).debug(
667
+ f"executing ffprobe command for {operation_name}"
668
+ )
669
+
670
+ process = subprocess.Popen(
671
+ cmd,
672
+ stdout=subprocess.PIPE,
673
+ stderr=subprocess.PIPE,
674
+ text=True,
675
+ )
676
+ stdout, stderr = process.communicate()
677
+
678
+ if process.returncode != 0:
679
+ logger.bind(stderr=stderr, operation=operation_name).error(
680
+ f"ffprobe failed for {operation_name}"
681
+ )
682
+ return False, stdout, stderr
683
+
684
+ logger.bind(operation=operation_name).debug(
685
+ f"{operation_name} completed successfully"
686
+ )
687
+ return True, stdout, stderr
688
+
689
+ except Exception as e:
690
+ logger.bind(error=str(e), operation=operation_name).error(
691
+ f"error executing ffprobe command for {operation_name}"
692
+ )
693
+ return False, "", str(e)
694
+
695
+ @staticmethod
696
+ def is_hex_color(color: str) -> bool:
697
+ """
698
+ Checks if the given color string is a valid hex color.
699
+
700
+ Args:
701
+ color: Color string to check
702
+
703
+ Returns:
704
+ bool: True if it's a hex color, False otherwise
705
+ """
706
+ return len(color.lstrip("#")) in (3, 6) and all(
707
+ c in "0123456789abcdefABCDEF" for c in color.lstrip("#")
708
+ )
709
+
710
+ def colorkey_overlay(
711
+ self,
712
+ input_video_path: str,
713
+ overlay_video_path: str,
714
+ output_video_path: str,
715
+ color: str = "green",
716
+ similarity: float = 0.1,
717
+ blend: float = 0.1,
718
+ ):
719
+ """
720
+ Applies a colorkey overlay to a video using FFmpeg.
721
+ """
722
+
723
+ """
724
+ ffmpeg -i input.mp4 -stream_loop -1 -i black_dust.mp4 \
725
+ -filter_complex "[1]colorkey=0x000000:0.1:0.1[ckout];[0][ckout]overlay" \
726
+ -shortest \
727
+ -c:v libx264 -preset ultrafast -crf 18 \
728
+ -c:a copy \
729
+ output.mp4
730
+ """
731
+
732
+ start = time.time()
733
+ info = self.get_video_info(input_video_path)
734
+ video_duration = info.get("duration", 0)
735
+
736
+ if not video_duration:
737
+ logger.error("failed to get video duration from input video")
738
+ return False
739
+
740
+ color = color.lstrip("#")
741
+ if self.is_hex_color(color):
742
+ color = f"0x{color.upper()}"
743
+
744
+ context_logger = logger.bind(
745
+ input_video_path=input_video_path,
746
+ overlay_video_path=overlay_video_path,
747
+ output_video_path=output_video_path,
748
+ video_duration=video_duration,
749
+ color=color,
750
+ similarity=similarity,
751
+ blend=blend,
752
+ )
753
+ context_logger.debug("Starting colorkey overlay process")
754
+
755
+ context_logger = context_logger.bind(
756
+ video_duration=video_duration,
757
+ )
758
+
759
+ cmd = [
760
+ self.ffmpeg_path, "-y",
761
+ "-i", input_video_path,
762
+ "-stream_loop", "-1",
763
+ "-i", overlay_video_path,
764
+ "-filter_complex", f"[1:v]colorkey={color}:{similarity}:{blend}[ckout];[0:v][ckout]overlay=eof_action=repeat[v]",
765
+ "-map", "[v]",
766
+ "-map", "0:a",
767
+ "-c:v", "libx264",
768
+ "-preset", "ultrafast",
769
+ "-crf", "18",
770
+ "-c:a", "copy",
771
+ "-t", f"{video_duration}s",
772
+ output_video_path,
773
+ ]
774
+
775
+ try:
776
+ success = self.execute_ffmpeg_command(
777
+ cmd,
778
+ "add colorkey overlay to video",
779
+ expected_duration=video_duration,
780
+ show_progress=True,
781
+ )
782
+
783
+ if success:
784
+ context_logger.bind(execution_time=time.time() - start).debug(
785
+ "colorkey overlay added successfully",
786
+ )
787
+ return True
788
+ else:
789
+ context_logger.error("ffmpeg failed to create colorkey overlay")
790
+ return False
791
+
792
+ except Exception as e:
793
+ context_logger.bind(error=str(e)).error(
794
+ "error adding colorkey overlay to video",
795
+ )
796
+ return False
797
+
798
+ def convert_pcm_to_wav(
799
+ self,
800
+ input_pcm_path: str,
801
+ output_wav_path: str,
802
+ sample_rate: int = 24000,
803
+ channels: int = 1,
804
+ target_sample_rate: int = 44100,
805
+ ) -> bool:
806
+ """
807
+ ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm -ar 44100 -ac 2 out_44k_stereo.wav
808
+ """
809
+ start = time.time()
810
+ context_logger = logger.bind(
811
+ input_pcm_path=input_pcm_path,
812
+ output_wav_path=output_wav_path,
813
+ sample_rate=sample_rate,
814
+ channels=channels,
815
+ target_sample_rate=target_sample_rate,
816
+ )
817
+ context_logger.debug("Starting PCM to WAV conversion")
818
+
819
+ cmd = [
820
+ self.ffmpeg_path, "-y",
821
+ "-f", "s16le",
822
+ "-ar", str(sample_rate),
823
+ "-ac", str(channels),
824
+ "-i", input_pcm_path,
825
+ "-ar", str(target_sample_rate),
826
+ "-ac", "2", # Convert to stereo
827
+ output_wav_path,
828
+ ]
829
+
830
+ try:
831
+ success = self.execute_ffmpeg_command(
832
+ cmd,
833
+ "convert PCM to WAV",
834
+ show_progress=False,
835
+ )
836
+
837
+ if success:
838
+ context_logger.bind(execution_time=time.time() - start).debug(
839
+ "PCM to WAV conversion successful",
840
+ )
841
+ return True
842
+ else:
843
+ context_logger.error("ffmpeg failed to convert PCM to WAV")
844
+ return False
845
+
846
+ except Exception as e:
847
+ context_logger.bind(error=str(e)).error(
848
+ "error converting PCM to WAV",
849
+ )
850
+ return False
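A short usage sketch of the helpers above. The wrapper's class and constructor are not shown in this diff, so `ff` here stands for an already-constructed instance; only the method names, arguments, and return shapes come from the code above.

    # Probe a file's duration with execute_ffprobe_command; ffprobe prints just the number.
    ok, stdout, _stderr = ff.execute_ffprobe_command(
        [
            "ffprobe", "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            "input.mp4",
        ],
        "probe duration",
    )
    if ok:
        print(f"duration: {float(stdout.strip()):.2f}s")

    # Key out a green-screen overlay; hex colors are normalized to 0xRRGGBB internally.
    ff.colorkey_overlay("input.mp4", "overlay.mp4", "output.mp4", color="#00FF00")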
video/storage.py ADDED
@@ -0,0 +1,323 @@
1
+ from typing import Tuple
2
+ import uuid
3
+ import os
4
+ import requests
5
+
6
+
7
+ class MediaType:
8
+ IMAGE = "image"
9
+ VIDEO = "video"
10
+ AUDIO = "audio"
11
+ TMP = "tmp"
12
+
13
+
14
+ class Storage:
15
+ def __init__(self, storage_path):
16
+ self.storage_path = storage_path
17
+ os.makedirs(self.storage_path, exist_ok=True)
18
+ # make all the subdirectories for the media types
19
+ for media_type in [
20
+ MediaType.IMAGE,
21
+ MediaType.VIDEO,
22
+ MediaType.AUDIO,
23
+ MediaType.TMP,
24
+ ]:
25
+ os.makedirs(os.path.join(self.storage_path, media_type), exist_ok=True)
26
+
27
+ def _validate_media_id(self, media_id: str) -> tuple[str, str]:
28
+ """
29
+ Validates and parses a media ID to prevent path traversal attacks.
30
+
31
+ Args:
32
+ media_id (str): Media ID to validate
33
+
34
+ Returns:
35
+ tuple[str, str]: (media_type, filename)
36
+
37
+ Raises:
38
+ ValueError: If media_id is invalid or contains path traversal attempts
39
+ """
40
+ if not media_id or "_" not in media_id:
41
+ raise ValueError("Invalid media ID format")
42
+
43
+ media_type, filename = media_id.split("_", 1)
44
+
45
+ # Validate media type
46
+ valid_types = [MediaType.IMAGE, MediaType.VIDEO, MediaType.AUDIO, MediaType.TMP]
47
+ if media_type not in valid_types:
48
+ raise ValueError(f"Invalid media type: {media_type}")
49
+
50
+ # Prevent path traversal by checking for dangerous patterns
51
+ if ".." in filename or "/" in filename or "\\" in filename:
52
+ raise ValueError(
53
+ "Filename contains invalid characters or path traversal attempt"
54
+ )
55
+
56
+ # Additional validation: filename should not be empty and should be reasonable
57
+ if not filename or len(filename) > 255:
58
+ raise ValueError("Invalid filename")
59
+
60
+ return media_type, filename
61
+
62
+ def _get_safe_file_path(self, media_id: str) -> str:
63
+ """
64
+ Gets a safe file path for the given media ID after validation.
65
+
66
+ Args:
67
+ media_id (str): Media ID to get path for
68
+
69
+ Returns:
70
+ str: Safe file path
71
+ """
72
+ media_type, filename = self._validate_media_id(media_id)
73
+ file_path = os.path.join(self.storage_path, media_type, filename)
74
+
75
+ # Double-check that the resolved path is within the storage directory
76
+ resolved_path = os.path.abspath(file_path)
77
+ storage_abs_path = os.path.abspath(self.storage_path)
78
+
79
+ if not resolved_path.startswith(storage_abs_path):
80
+ raise ValueError("Path traversal attempt detected")
81
+
82
+ return file_path
83
+
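+ # Illustrative examples (not part of the original commit):
+ # _get_safe_file_path("image_abc.png") -> "<storage_path>/image/abc.png"
+ # _get_safe_file_path("image_../etc/passwd") -> raises ValueError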
84
+ def upload_media(
85
+ self, media_type: MediaType, media_data: bytes, file_extension: str = ""
86
+ ) -> str:
87
+ """
88
+ Uploads media to the server.
89
+
90
+ Args:
91
+ media_type (str): Type of media, e.g., 'image' or 'video'.
92
+ media_data (bytes): Binary data of the media file.
93
+ file_extension (str): File extension, e.g., '.jpg', '.mp4', '.wav'.
94
+
95
+ Returns:
96
+ str: Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.
97
+ """
98
+ # Validate media type
99
+ valid_types = [MediaType.IMAGE, MediaType.VIDEO, MediaType.AUDIO, MediaType.TMP]
100
+ if media_type not in valid_types:
101
+ raise ValueError(f"Invalid media type: {media_type}")
102
+
103
+ # Validate file extension to prevent path traversal
104
+ if file_extension and (
105
+ ".." in file_extension or "/" in file_extension or "\\" in file_extension
106
+ ):
107
+ raise ValueError("File extension contains invalid characters")
108
+
109
+ asset_id = str(uuid.uuid4())
110
+ filename = f"{asset_id}{file_extension}" if file_extension else asset_id
111
+ file_path = os.path.join(self.storage_path, media_type, filename)
112
+
113
+ # Additional safety check
114
+ resolved_path = os.path.abspath(file_path)
115
+ storage_abs_path = os.path.abspath(self.storage_path)
116
+ if not resolved_path.startswith(storage_abs_path):
117
+ raise ValueError("Path traversal attempt detected")
118
+
119
+ with open(file_path, "wb") as f:
120
+ f.write(media_data)
121
+
122
+ media_id = f"{media_type}_{filename}"
123
+ return media_id
124
+
125
+ def get_media(self, media_id: str) -> bytes:
126
+ """
127
+ Retrieves media by ID.
128
+
129
+ Args:
130
+ media_id (str): Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.
131
+
132
+ Returns:
133
+ bytes: Binary data of the media file.
134
+ """
135
+ file_path = self._get_safe_file_path(media_id)
136
+
137
+ if not os.path.exists(file_path):
138
+ raise FileNotFoundError(f"Media file {media_id} not found.")
139
+
140
+ with open(file_path, "rb") as f:
141
+ return f.read()
142
+
143
+ def delete_media(self, media_id: str) -> None:
144
+ """
145
+ Deletes media by ID.
146
+
147
+ Args:
148
+ media_id (str): Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.
149
+ """
150
+ file_path = self._get_safe_file_path(media_id)
151
+
152
+ if os.path.exists(file_path):
153
+ os.remove(file_path)
154
+ else:
155
+ raise FileNotFoundError(f"Media file {media_id} not found.")
156
+
157
+ def media_exists(self, media_id: str) -> bool:
158
+ """
159
+ Checks if media exists by ID.
160
+
161
+ Args:
162
+ media_id (str): Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.
163
+
164
+ Returns:
165
+ bool: True if media exists, False otherwise.
166
+ """
167
+ try:
168
+ file_path = self._get_safe_file_path(media_id)
169
+ return os.path.exists(file_path)
170
+ except ValueError:
171
+ return False
172
+
173
+ def get_media_path(self, media_id: str) -> str:
174
+ """
175
+ Gets the file path of the media by ID.
176
+
177
+ Args:
178
+ media_id (str): Media ID, e.g., 'image_12345.jpg' or 'video_67890.mp4'.
179
+
180
+ Returns:
181
+ str: Full file path of the media.
182
+ """
183
+ return self._get_safe_file_path(media_id)
184
+
185
+ ### untested
186
+ def create_media_filename(
187
+ self, media_type: MediaType, file_extension: str = ""
188
+ ) -> str:
189
+ # Validate media type
190
+ valid_types = [MediaType.IMAGE, MediaType.VIDEO, MediaType.AUDIO, MediaType.TMP]
191
+ if media_type not in valid_types:
192
+ raise ValueError(f"Invalid media type: {media_type}")
193
+
194
+ # Validate file extension to prevent path traversal
195
+ if file_extension and (
196
+ ".." in file_extension or "/" in file_extension or "\\" in file_extension
197
+ ):
198
+ raise ValueError("File extension contains invalid characters")
199
+
200
+ asset_id = str(uuid.uuid4())
201
+ filename = f"{asset_id}{file_extension}" if file_extension else asset_id
202
+ return f"{media_type}_{filename}"
203
+
204
+ def create_media_filename_with_id(
205
+ self, media_type: MediaType, file_extension: str = ""
206
+ ) -> Tuple[str, str]:
207
+ file_id = self.create_media_filename(media_type, file_extension)
208
+ return file_id, self.get_media_path(file_id)
209
+
210
+ def create_media_template(
211
+ self, media_type: MediaType, file_extension: str
212
+ ) -> Tuple[str, str]:
213
+ """
214
+ Creates a media template filename for the given media type and file extension.
215
+ Args:
216
+ media_type (MediaType): Type of media, e.g., MediaType.IMAGE.
217
+ file_extension (str): File extension, e.g., '.jpg', '.mp4'.
218
+
219
+ Returns:
220
+ tuple[str, str]: (template filename with a %02d sequence placeholder, full file path)
221
+ """
222
+ if not file_extension.startswith("."):
223
+ file_extension = "." + file_extension
224
+
225
+ valid_types = [MediaType.IMAGE, MediaType.VIDEO, MediaType.AUDIO, MediaType.TMP]
226
+ if media_type not in valid_types:
227
+ raise ValueError(f"Invalid media type: {media_type}")
228
+
229
+ if file_extension and (
230
+ ".." in file_extension or "/" in file_extension or "\\" in file_extension
231
+ ):
232
+ raise ValueError("File extension contains invalid characters")
233
+
234
+ asset_id = str(uuid.uuid4())
235
+ filename = f"{asset_id}-%02d{file_extension}" if file_extension else f"{asset_id}-%02d"
236
+ file_path = os.path.join(
237
+ self.storage_path, media_type, filename
238
+ )
239
+ return filename, file_path
240
+
241
+
242
+ def create_tmp_file_id(self, media_id: str) -> str:
243
+ """
244
+ Creates a temporary filename for media upload.
245
+
246
+ Args:
247
+ media_id (str): Media ID to create a temporary filename for.
248
+
249
+ Returns:
250
+ str: Temporary media ID.
251
+ """
252
+ return f"{media_id}.tmp"
253
+
254
+ def create_tmp_file(self, media_id: str) -> str:
255
+ """
256
+ Creates a temporary file for media upload.
257
+
258
+ Args:
259
+ media_id (str): Media ID to create a temporary file for.
260
+
261
+ Returns:
262
+ str: Temporary media ID.
263
+ """
264
+ tmp_id = f"{media_id}.tmp"
265
+ tmp_path = self.get_media_path(tmp_id)
266
+
267
+ with open(tmp_path, "wb") as f:
268
+ pass
269
+ return tmp_id
270
+
271
+ def get_media_type(self, media_id: str) -> MediaType:
272
+ """
273
+ Gets the media type of the given media ID.
274
+
275
+ Args:
276
+ media_id (str): Media ID to get the type for.
277
+
278
+ Returns:
279
+ MediaType: The type of the media.
280
+ """
281
+ media_type, _ = self._validate_media_id(media_id)
282
+ return media_type
283
+
284
+ def is_valid_url(self, url: str) -> bool:
285
+ """
286
+ Validates a URL to ensure it is well-formed.
287
+
288
+ Args:
289
+ url (str): The URL to validate.
290
+
291
+ Returns:
292
+ bool: True if the URL is valid, False otherwise.
293
+ """
294
+ from urllib.parse import urlparse
295
+
296
+ try:
297
+ result = urlparse(url)
298
+ return all([result.scheme, result.netloc])
299
+ except Exception:
300
+ return False
301
+
302
+ def upload_media_from_url(
303
+ self, media_type: MediaType, url: str
304
+ ) -> str:
305
+ """
306
+ Uploads media from a URL.
307
+
308
+ Args:
309
+ media_type (MediaType): Type of media, e.g., MediaType.IMAGE.
310
+ url (str): URL of the media file.
311
+
312
+ Returns:
313
+ str: Media ID, e.g., 'image_12345.jpg'.
314
+ """
315
+ if not self.is_valid_url(url):
316
+ raise ValueError("Invalid URL")
317
+
318
+ response = requests.get(url, timeout=30)  # avoid hanging indefinitely on slow hosts
319
+ if response.status_code != 200:
320
+ raise ValueError(f"Failed to download media from {url}")
321
+
322
+ from urllib.parse import urlparse
+ # Take the extension from the URL path so query strings don't leak into it
+ file_extension = os.path.splitext(urlparse(url).path)[1]
323
+ return self.upload_media(media_type, response.content, file_extension)
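A short usage sketch of `Storage` (the byte payload and paths are illustrative; the class, methods, and the `"{type}_{uuid}{ext}"` ID shape come from the code above):

    from video.storage import Storage, MediaType

    storage = Storage("./media")
    media_id = storage.upload_media(MediaType.IMAGE, b"\x89PNG...", ".png")
    # media_id looks like "image_<uuid>.png"
    local_path = storage.get_media_path(media_id)
    assert storage.media_exists(media_id)
    storage.delete_media(media_id)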
video/stt.py ADDED
@@ -0,0 +1,41 @@
1
+ from faster_whisper import WhisperModel
2
+ from loguru import logger
3
+ from video.config import device, whisper_model, whisper_compute_type
4
+
5
+
6
+ class STT:
7
+ def __init__(self):
8
+ self.model = WhisperModel(
9
+ model_size_or_path=whisper_model,
10
+ compute_type=whisper_compute_type
11
+ )
12
+
13
+ def transcribe(self, audio_path, language=None, beam_size=5):
14
+ logger.bind(
15
+ device=device.type,
16
+ model_size=whisper_model,
17
+ compute_type=whisper_compute_type,
18
+ audio_path=audio_path,
19
+ language=language,
20
+ ).debug(
21
+ "transcribing audio with Whisper model",
22
+ )
23
+ segments, info = self.model.transcribe(
24
+ audio_path,
25
+ beam_size=beam_size,
26
+ word_timestamps=True,
27
+ language=language,
28
+ )
29
+
30
+ duration = info.duration
31
+ captions = []
32
+ for segment in segments:
33
+ for word in segment.words:
34
+ captions.append(
35
+ {
36
+ "text": word.word,
37
+ "start_ts": word.start,
38
+ "end_ts": word.end,
39
+ }
40
+ )
41
+ return captions, duration
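A short usage sketch of `STT` (the audio path is illustrative; the class, the `transcribe` signature, and the caption shape come from the code above):

    from video.stt import STT

    stt = STT()
    captions, duration = stt.transcribe("clip.wav", language="en")
    # captions is a list of word-level dicts, e.g.
    # [{"text": " hello", "start_ts": 0.0, "end_ts": 0.42}, ...]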
video/tts.py ADDED
@@ -0,0 +1,443 @@
1
+ import re
2
+ import time
3
+ import warnings
4
+ from typing import List
5
+ from kokoro import KPipeline
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from loguru import logger
9
+ import torchaudio as ta
10
+ from chatterbox.tts import ChatterboxTTS
11
+ from video.config import device
12
+
13
+ # Suppress PyTorch warnings
14
+ warnings.filterwarnings("ignore")
15
+
16
+ LANGUAGE_CONFIG = {
17
+ "en-us": {
18
+ "lang_code": "a",
19
+ "international": False,
20
+ "iso639_1": "en",
21
+ },
22
+ "en": {
23
+ "lang_code": "a",
24
+ "international": False,
25
+ "iso639_1": "en",
26
+ },
27
+ "en-gb": {
28
+ "lang_code": "b",
29
+ "international": False,
30
+ "iso639_1": "en",
31
+ },
32
+ "es": {"lang_code": "e", "international": True, "iso639_1": "es"},
33
+ "fr": {"lang_code": "f", "international": True, "iso639_1": "fr"},
34
+ "hi": {"lang_code": "h", "international": True, "iso639_1": "hi"},
35
+ "it": {"lang_code": "i", "international": True, "iso639_1": "it"},
36
+ "pt": {"lang_code": "p", "international": True, "iso639_1": "pt"},
37
+ "ja": {"lang_code": "j", "international": True, "iso639_1": "ja"},
38
+ "zh": {"lang_code": "z", "international": True, "iso639_1": "zh"},
39
+ }
40
+ LANGUAGE_VOICE_CONFIG = {
41
+ "en-us": [
42
+ "af_heart",
43
+ "af_alloy",
44
+ "af_aoede",
45
+ "af_bella",
46
+ "af_jessica",
47
+ "af_kore",
48
+ "af_nicole",
49
+ "af_nova",
50
+ "af_river",
51
+ "af_sarah",
52
+ "af_sky",
53
+ "am_adam",
54
+ "am_echo",
55
+ "am_eric",
56
+ "am_fenrir",
57
+ "am_liam",
58
+ "am_michael",
59
+ "am_onyx",
60
+ "am_puck",
61
+ "am_santa",
62
+ ],
63
+ "en-gb": [
64
+ "bf_alice",
65
+ "bf_emma",
66
+ "bf_isabella",
67
+ "bf_lily",
68
+ "bm_daniel",
69
+ "bm_fable",
70
+ "bm_george",
71
+ "bm_lewis",
72
+ ],
73
+ "zh": [
74
+ "zf_xiaobei",
75
+ "zf_xiaoni",
76
+ "zf_xiaoxiao",
77
+ "zf_xiaoyi",
78
+ "zm_yunjian",
79
+ "zm_yunxi",
80
+ "zm_yunxia",
81
+ "zm_yunyang",
82
+ ],
83
+ "es": ["ef_dora", "em_alex", "em_santa"],
84
+ "fr": ["ff_siwis"],
85
+ "it": ["if_sara", "im_nicola"],
86
+ "pt": ["pf_dora", "pm_alex", "pm_santa"],
87
+ "hi": ["hf_alpha", "hf_beta", "hm_omega", "hm_psi"],
88
+ }
89
+
90
+ LANGUAGE_VOICE_MAP = {}
91
+ for lang, voices in LANGUAGE_VOICE_CONFIG.items():
92
+ for voice in voices:
93
+ if lang in LANGUAGE_CONFIG:
94
+ LANGUAGE_VOICE_MAP[voice] = LANGUAGE_CONFIG[lang]
95
+ else:
96
+ print(f"Warning: Language {lang} not found in LANGUAGE_CONFIG")
97
+
98
+
99
+ class TTS:
100
+ def break_text_into_sentences(self, text, lang_code) -> List[str]:
101
+ """
102
+ Advanced sentence splitting with better handling of abbreviations and edge cases.
103
+ """
104
+ if not text or not text.strip():
105
+ return []
106
+
107
+ # Language-specific sentence boundary patterns
108
+ patterns = {
109
+ "a": r"(?<=[.!?])\s+(?=[A-Z_])", # English
110
+ "e": r"(?<=[.!?])\s+(?=[A-ZÁÉÍÓÚÑÜ¿¡_])", # Spanish - allow inverted punctuation after boundaries
111
+ "f": r"(?<=[.!?])\s+(?=[A-ZÁÀÂÄÇÉÈÊËÏÎÔÖÙÛÜŸ_])", # French
112
+ "h": r"(?<=[।!?])\s+", # Hindi: Split after devanagari danda
113
+ "i": r"(?<=[.!?])\s+(?=[A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞß_])", # Italian
114
+ "p": r"(?<=[.!?])\s+(?=[A-ZÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ_])", # Portuguese
115
+ "z": r"(?<=[。!?])", # Chinese: Split after Chinese punctuation
116
+ }
117
+
118
+ # Common abbreviations that shouldn't trigger sentence breaks
119
+ abbreviations = {
120
+ "a": {
121
+ "Mr.",
122
+ "Mrs.",
123
+ "Ms.",
124
+ "Dr.",
125
+ "Prof.",
126
+ "Sr.",
127
+ "Jr.",
128
+ "Inc.",
129
+ "Corp.",
130
+ "Ltd.",
131
+ "Co.",
132
+ "etc.",
133
+ "vs.",
134
+ "eg.",
135
+ "i.e.",
136
+ "e.g.",
137
+ "Vol.",
138
+ "Ch.",
139
+ "Fig.",
140
+ "No.",
141
+ "p.",
142
+ "pp.",
143
+ }, # English
144
+ "e": {
145
+ "Sr.",
146
+ "Sra.",
147
+ "Dr.",
148
+ "Dra.",
149
+ "Prof.",
150
+ "etc.",
151
+ "pág.",
152
+ "art.",
153
+ "núm.",
154
+ "cap.",
155
+ "vol.",
156
+ }, # Spanish
157
+ "f": {
158
+ "M.",
159
+ "Mme.",
160
+ "Dr.",
161
+ "Prof.",
162
+ "etc.",
163
+ "art.",
164
+ "p.",
165
+ "vol.",
166
+ "ch.",
167
+ "fig.",
168
+ "n°",
169
+ }, # French
170
+ "h": {"श्री", "श्रीमती", "डॉ.", "प्रो.", "etc.", "पृ.", "अध."}, # Hindi
171
+ "i": {
172
+ "Sig.",
173
+ "Sig.ra",
174
+ "Dr.",
175
+ "Prof.",
176
+ "ecc.",
177
+ "pag.",
178
+ "art.",
179
+ "n.",
180
+ "vol.",
181
+ "cap.",
182
+ "fig.",
183
+ }, # Italian
184
+ "p": {
185
+ "Sr.",
186
+ "Sra.",
187
+ "Dr.",
188
+ "Dra.",
189
+ "Prof.",
190
+ "etc.",
191
+ "pág.",
192
+ "art.",
193
+ "n.º",
194
+ "vol.",
195
+ "cap.",
196
+ }, # Portuguese
197
+ "z": {"先生", "女士", "博士", "教授", "等等", "第", "页", "章"}, # Chinese
198
+ }
199
+
200
+ abbrevs = abbreviations.get(lang_code, set())
201
+
202
+ # Protect abbreviations by temporarily replacing them
203
+ protected_text = text
204
+ replacements = {}
205
+ for i, abbrev in enumerate(abbrevs):
206
+ placeholder = f"__ABBREV_{i}__"
207
+ protected_text = protected_text.replace(abbrev, placeholder)
208
+ replacements[placeholder] = abbrev
209
+
210
+ # Apply the regex splitting
211
+ pattern = patterns.get(lang_code, patterns["a"])
212
+ sentences = re.split(pattern, protected_text.strip())
213
+
214
+ # Restore abbreviations and clean up
215
+ restored_sentences = []
216
+ for sentence in sentences:
217
+ for placeholder, original in replacements.items():
218
+ sentence = sentence.replace(placeholder, original)
219
+ sentence = sentence.strip()
220
+ if sentence:
221
+ restored_sentences.append(sentence)
222
+
223
+ return restored_sentences if restored_sentences else [text.strip()]
224
+
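+ # Illustrative example (not part of the original commit):
+ # break_text_into_sentences("Dr. Smith arrived. He left.", "a")
+ # -> ["Dr. Smith arrived.", "He left."]; the protected "Dr." abbreviation
+ # does not trigger a split, only the real sentence boundary does.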
225
+ def kokoro_international(
226
+ self, text: str, output_path: str, voice: str, lang_code: str, speed=1
227
+ ) -> tuple[List[dict], float]:
228
+ if not text or not text.strip():
229
+ raise ValueError("Text cannot be empty or whitespace")
230
+ lang_code = LANGUAGE_VOICE_MAP.get(voice, {}).get("lang_code")
231
+ if not lang_code:
232
+ raise ValueError(f"Voice '{voice}' not found in LANGUAGE_VOICE_MAP")
233
+ start = time.time()
234
+ context_logger = logger.bind(
235
+ voice=voice,
236
+ speed=speed,
237
+ text_length=len(text),
238
+ )
239
+ context_logger.debug("Starting TTS generation (international) with kokoro")
240
+ sentences = self.break_text_into_sentences(text, lang_code)
241
+ context_logger.bind(
242
+ sentences=sentences,
243
+ num_sentences=len(sentences),
244
+ ).debug("Text split into sentences")
246
+
247
+ # generate the audio for each sentence
248
+ audio_data = []
249
+ captions = []
250
+ full_audio_length = 0
251
+ pipeline = KPipeline(lang_code=lang_code, repo_id="hexgrad/Kokoro-82M", device=device)
252
+ for sentence in sentences:
253
+ context_logger.bind(
254
+ sentence=sentence,
255
+ voice=voice,
256
+ speed=speed,
257
+ ).debug("Processing sentence")
259
+ generator = pipeline(sentence, voice=voice, speed=speed)
260
+
261
+ for i, result in enumerate(generator):
262
+ context_logger.debug(
263
+ "Generated audio for sentence",
264
+ )
265
+ data = result.audio
266
+ audio_length = len(data) / 24000
267
+ audio_data.append(data)
268
+ # since there are no tokens, we can just use the sentence as the text
269
+ captions.append(
270
+ {
271
+ "text": sentence,
272
+ "start_ts": full_audio_length,
273
+ "end_ts": full_audio_length + audio_length,
274
+ }
275
+ )
276
+ full_audio_length += audio_length
277
+
278
+ context_logger = context_logger.bind(
279
+ execution_time=time.time() - start,
280
+ audio_length=full_audio_length,
281
+ speedup=full_audio_length / (time.time() - start),
282
+ )
283
+ context_logger.debug(
284
+ "TTS generation (international) completed with kokoro",
285
+ )
286
+
287
+ audio_data = np.concatenate(audio_data)
288
+ audio_data = np.column_stack((audio_data, audio_data))
289
+ sf.write(output_path, audio_data, 24000, format="WAV")
290
+ return captions, full_audio_length
291
+
292
+ def kokoro_english(
293
+ self, text: str, output_path: str, voice="af_heart", speed=1
294
+ ) -> tuple[List[dict], float]:
295
+ if not text or not text.strip():
296
+ raise ValueError("Text cannot be empty or whitespace")
297
+ lang_code = LANGUAGE_VOICE_MAP.get(voice, {}).get("lang_code")
298
+ if not lang_code:
299
+ raise ValueError(f"Voice '{voice}' not found in LANGUAGE_VOICE_MAP")
300
+ if lang_code != "a":
301
+ raise NotImplementedError(
302
+ f"TTS for language code '{lang_code}' is not implemented."
303
+ )
304
+ start = time.time()
305
+
306
+ context_logger = logger.bind(
307
+ voice=voice,
308
+ speed=speed,
309
+ text_length=len(text),
310
+ device=device.type,
311
+ )
312
+
313
+ context_logger.debug("Starting TTS generation with kokoro")
314
+ pipeline = KPipeline(lang_code=lang_code, repo_id="hexgrad/Kokoro-82M", device=device.type)
317
+
318
+ generator = pipeline(text, voice=voice, speed=speed)
319
+
320
+ captions = []
321
+ audio_data = []
322
+ full_audio_length = 0
323
+ for _, result in enumerate(generator):
324
+ data = result.audio
325
+ audio_length = len(data) / 24000
326
+ audio_data.append(data)
327
+ if result.tokens:
328
+ tokens = result.tokens
329
+ for t in tokens:
330
+ if t.start_ts is None or t.end_ts is None:
331
+ if captions:
332
+ captions[-1]["text"] += t.text
333
+ captions[-1]["end_ts"] = full_audio_length + audio_length
334
+ continue
335
+ try:
336
+ captions.append(
337
+ {
338
+ "text": t.text,
339
+ "start_ts": full_audio_length + t.start_ts,
340
+ "end_ts": full_audio_length + t.end_ts,
341
+ }
342
+ )
343
+ except Exception as e:
344
+ logger.error(
345
+ "Error processing token: {}, Error: {}",
346
+ t,
347
+ e,
348
+ )
349
+ raise ValueError(f"Error processing token: {t}, Error: {e}")
350
+ full_audio_length += audio_length
351
+
352
+ audio_data = np.concatenate(audio_data)
353
+ audio_data = np.column_stack((audio_data, audio_data))
354
+ sf.write(output_path, audio_data, 24000, format="WAV")
355
+ context_logger.bind(
356
+ execution_time=time.time() - start,
357
+ audio_length=full_audio_length,
358
+ speedup=full_audio_length / (time.time() - start),
359
+ youtube_channel="https://www.youtube.com/@aiagentsaz"
360
+ ).debug(
361
+ "TTS generation completed with kokoro",
362
+ )
363
+ return captions, full_audio_length
364
+
365
+ def kokoro(
366
+ self, text: str, output_path: str, voice="af_heart", speed=1
367
+ ) -> tuple[List[dict], float]:
368
+ if not text or not text.strip():
369
+ raise ValueError("Text cannot be empty or whitespace")
370
+ lang_code = LANGUAGE_VOICE_MAP.get(voice, {}).get("lang_code")
371
+ if not lang_code:
372
+ raise ValueError(f"Voice '{voice}' not found in LANGUAGE_VOICE_MAP")
373
+ if lang_code == "a":
374
+ return self.kokoro_english(text, output_path, voice, speed)
375
+ else:
376
+ return self.kokoro_international(text, output_path, voice, lang_code, speed)
377
+
378
+ def chatterbox(
379
+ self,
380
+ text: str,
381
+ output_path: str,
382
+ sample_audio_path: str = None,
383
+ exaggeration=0.5,
384
+ cfg_weight=0.5,
385
+ temperature=0.8,
386
+ ):
387
+ start = time.time()
388
+ context_logger = logger.bind(
389
+ text_length=len(text),
390
+ sample_audio_path=sample_audio_path,
391
+ exaggeration=exaggeration,
392
+ cfg_weight=cfg_weight,
393
+ temperature=temperature,
394
+ model="ChatterboxTTS",
395
+ language="en-US",
396
+ device=device.type,
397
+ )
398
+ context_logger.debug("starting TTS generation with Chatterbox")
399
+ model = ChatterboxTTS.from_pretrained(device=device.type)
400
+
401
+ if sample_audio_path:
402
+ wav = model.generate(
403
+ text,
404
+ audio_prompt_path=sample_audio_path,
405
+ exaggeration=exaggeration,
406
+ cfg_weight=cfg_weight,
407
+ temperature=temperature,
408
+ )
409
+ else:
410
+ wav = model.generate(
411
+ text,
412
+ exaggeration=exaggeration,
413
+ cfg_weight=cfg_weight,
414
+ temperature=temperature,
415
+ )
416
+
417
+ if wav.dim() == 2 and wav.shape[0] == 1:
418
+ wav = wav.repeat(2, 1)
419
+ elif wav.dim() == 1:
420
+ wav = wav.unsqueeze(0).repeat(2, 1)
421
+
422
+ audio_length = wav.shape[1] / model.sr
423
+ ta.save(output_path, wav, model.sr)
424
+ context_logger.bind(
425
+ execution_time=time.time() - start,
426
+ audio_length=audio_length,
427
+ speedup=audio_length / (time.time() - start),
428
+ youtube_channel="https://www.youtube.com/@aiagentsaz"
429
+ ).debug(
430
+ "TTS generation with Chatterbox completed",
431
+ )
432
+
433
+ def valid_kokoro_voices(self, lang_code=None) -> List[str]:
434
+ """
435
+ Returns a list of valid voices for the given language code.
436
+ If no language code is provided, returns all voices.
437
+ """
438
+ if lang_code:
439
+ return LANGUAGE_VOICE_CONFIG.get(lang_code, [])
440
+ else:
441
+ return [
442
+ voice for voices in LANGUAGE_VOICE_CONFIG.values() for voice in voices
443
+ ]
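A short usage sketch of the Kokoro entry point (the text and output path are illustrative). `kokoro` picks the English or international path from the voice's language code and returns the captions plus the audio length in seconds:

    from video.tts import TTS

    tts = TTS()
    captions, audio_length = tts.kokoro(
        "Hello world. This is a test.",
        "narration.wav",
        voice="af_heart",  # an en-us voice, so the English path is used
        speed=1,
    )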
video/tts_chatterbox.py ADDED
@@ -0,0 +1,256 @@
1
+ import os
2
+ import time
3
+ import traceback
4
+ import warnings
5
+ from loguru import logger
6
+ import torchaudio as ta
7
+ from chatterbox.tts import ChatterboxTTS
8
+ from video.config import device
9
+ import nltk
10
+ import torch
11
+ from typing import List, Optional
12
+
13
+ # Suppress PyTorch warnings
14
+ warnings.filterwarnings("ignore")
15
+
16
+ class TTSChatterbox:
17
+ def __init__(self):
18
+ """Initialize ChatterboxTTS and ensure NLTK data is available."""
19
+ self.ensure_nltk_data()
20
+ logger.debug("ChatterboxTTS initialized")
21
+
22
+ def ensure_nltk_data(self):
23
+ """Ensure NLTK punkt tokenizer is available."""
24
+ try:
25
+ nltk.data.find('tokenizers/punkt')
26
+ nltk.data.find('tokenizers/punkt_tab')
27
+ logger.debug("NLTK punkt tokenizer found")
28
+ except LookupError:
29
+ logger.debug("Downloading NLTK punkt tokenizer...")
30
+ try:
31
+ nltk.download('punkt', quiet=True)
32
+ nltk.download('punkt_tab', quiet=True)
33
+ logger.debug("NLTK punkt tokenizer downloaded successfully")
34
+ except Exception as e:
35
+ logger.error(f"Failed to download NLTK punkt tokenizer: {e}")
36
+ raise
37
+
38
+ def split_text_into_chunks(self, text: str, max_chars_per_chunk: int = 300) -> List[str]:
39
+ """Split text into chunks respecting sentence boundaries without breaking sentences."""
40
+ try:
41
+ sentences = nltk.sent_tokenize(text)
42
+ # Filter out empty sentences and strip whitespace
43
+ sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
44
+
45
+ chunks = []
46
+ current_chunk = ""
47
+
48
+ for sentence in sentences:
49
+ # If adding this sentence would exceed the limit, finalize current chunk
50
+ if current_chunk and len(current_chunk) + len(sentence) + 1 > max_chars_per_chunk:
51
+ chunks.append(current_chunk.strip())
52
+ current_chunk = sentence
53
+ else:
54
+ # Add sentence to current chunk
55
+ if current_chunk:
56
+ current_chunk += " " + sentence
57
+ else:
58
+ current_chunk = sentence
59
+
60
+ # Add the last chunk if it's not empty
61
+ if current_chunk.strip():
62
+ chunks.append(current_chunk.strip())
63
+
64
+ logger.debug(f"Text split into {len(chunks)} chunks (max {max_chars_per_chunk} chars each, preserving sentences)")
65
+ return chunks
66
+ except Exception as e:
67
+ logger.error(f"Error splitting text: {e}")
68
+ # Fallback: return original text as single chunk
69
+ return [text]
70
+
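+ # Illustrative example (not part of the original commit): with
+ # max_chars_per_chunk=40, "One short sentence. Another short sentence here."
+ # -> ["One short sentence.", "Another short sentence here."]; sentences are
+ # grouped up to the limit but never split internally.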
71
+ def generate_audio_chunk(
72
+ self,
73
+ text_chunk: str,
74
+ model: ChatterboxTTS,
75
+ audio_prompt_path: Optional[str] = None,
76
+ temperature: float = 0.8,
77
+ cfg_weight: float = 0.5,
78
+ exaggeration: float = 0.5
79
+ ) -> Optional[torch.Tensor]:
80
+ """Generate audio tensor for a single text chunk."""
81
+ try:
82
+ logger.debug(f"Generating audio for chunk: {text_chunk[:50]}...")
83
+
84
+
85
+ # Check if audio prompt exists
86
+ effective_prompt_path = None
87
+ if audio_prompt_path and os.path.exists(audio_prompt_path):
88
+ effective_prompt_path = audio_prompt_path
89
+ elif audio_prompt_path:
90
+ logger.warning(f"Audio prompt path not found: {audio_prompt_path}")
91
+
92
+ # Generate audio
93
+ wav_tensor = model.generate(
94
+ text_chunk,
95
+ audio_prompt_path=effective_prompt_path,
96
+ temperature=temperature,
97
+ cfg_weight=cfg_weight,
98
+ exaggeration=exaggeration
99
+ )
100
+
101
+ # Ensure tensor is on CPU and properly shaped
102
+ wav_tensor_cpu = wav_tensor.cpu().float()
103
+
104
+ # Ensure tensor is 2D: [channels, samples]
105
+ if wav_tensor_cpu.ndim == 1:
106
+ wav_tensor_cpu = wav_tensor_cpu.unsqueeze(0)
107
+ elif wav_tensor_cpu.ndim > 2:
108
+ logger.warning(f"Unexpected tensor shape {wav_tensor_cpu.shape}, attempting to fix")
109
+ wav_tensor_cpu = wav_tensor_cpu.squeeze()
110
+ if wav_tensor_cpu.ndim == 1:
111
+ wav_tensor_cpu = wav_tensor_cpu.unsqueeze(0)
112
+ elif wav_tensor_cpu.ndim != 2 or wav_tensor_cpu.shape[0] != 1:
113
+ logger.error(f"Could not reshape tensor {wav_tensor.shape} to [1, N]")
114
+ return None
115
+
116
+ return wav_tensor_cpu
117
+
118
+ except Exception as e:
119
+ logger.error(f"Error generating audio chunk: {e}")
120
+ logger.error(traceback.format_exc())
121
+ return None
122
+
123
+ def text_to_speech_pipeline(
124
+ self,
125
+ text: str,
126
+ model: ChatterboxTTS,
127
+ max_chars_per_chunk: int = 1024,
128
+ inter_chunk_silence_ms: int = 350,
129
+ audio_prompt_path: Optional[str] = None,
130
+ temperature: float = 0.8,
131
+ cfg_weight: float = 0.5,
132
+ exaggeration: float = 0.5
133
+ ) -> Optional[torch.Tensor]:
134
+ """Convert text to speech with chunking support."""
135
+ try:
136
+ # Split text into chunks
137
+ text_chunks = self.split_text_into_chunks(text, max_chars_per_chunk)
138
+
139
+ if not text_chunks:
140
+ logger.error("No text chunks to process")
141
+ return None
142
+
143
+ all_audio_tensors = []
144
+ sample_rate = model.sr
145
+
146
+ logger.debug(f"Processing {len(text_chunks)} chunks at {sample_rate} Hz")
147
+
148
+ for i, chunk_text in enumerate(text_chunks):
149
+ logger.debug(f"Processing chunk {i+1}/{len(text_chunks)}")
150
+
151
+ chunk_tensor = self.generate_audio_chunk(
152
+ chunk_text,
153
+ model,
154
+ audio_prompt_path,
155
+ temperature,
156
+ cfg_weight,
157
+ exaggeration
158
+ )
159
+
160
+ if chunk_tensor is None:
161
+ logger.warning(f"Skipping chunk {i+1} due to generation error")
162
+ continue
163
+
164
+ all_audio_tensors.append(chunk_tensor)
165
+
166
+ # Add silence between chunks (except after the last chunk)
167
+ if i < len(text_chunks) - 1 and inter_chunk_silence_ms > 0:
168
+ silence_samples = int(sample_rate * inter_chunk_silence_ms / 1000.0)
169
+ silence_tensor = torch.zeros(
170
+ (1, silence_samples),
171
+ dtype=chunk_tensor.dtype,
172
+ device=chunk_tensor.device
173
+ )
174
+ all_audio_tensors.append(silence_tensor)
175
+
176
+ if not all_audio_tensors:
177
+ logger.error("No audio tensors generated")
178
+ return None
179
+
180
+ # Concatenate all audio tensors
181
+ logger.debug("Concatenating audio tensors...")
182
+ final_audio_tensor = torch.cat(all_audio_tensors, dim=1)
183
+
184
+ logger.debug(f"Final audio shape: {final_audio_tensor.shape}")
185
+ return final_audio_tensor
186
+
187
+ except Exception as e:
188
+ logger.error(f"Error in text-to-speech pipeline: {e}")
189
+ logger.error(traceback.format_exc())
190
+ return None
191
+
192
+
193
+ def chatterbox(
194
+ self,
195
+ text: str,
196
+ output_path: str,
197
+ sample_audio_path: str = None,
198
+ exaggeration=0.5,
199
+ cfg_weight=0.5,
200
+ temperature=0.8,
201
+ chunk_chars: int = 1024,
202
+ chunk_silence_ms: int = 350,
203
+ ):
204
+ start = time.time()
205
+ context_logger = logger.bind(
206
+ text_length=len(text),
207
+ sample_audio_path=sample_audio_path,
208
+ exaggeration=exaggeration,
209
+ cfg_weight=cfg_weight,
210
+ temperature=temperature,
211
+ model="ChatterboxTTS",
212
+ language="en-US",
213
+ device=device.type,
214
+ )
215
+ context_logger.debug("starting TTS generation with Chatterbox")
216
+ model = ChatterboxTTS.from_pretrained(device=device.type)
217
+
218
+ if sample_audio_path:
219
+ wav = self.text_to_speech_pipeline(
220
+ text,
221
+ model,
222
+ audio_prompt_path=sample_audio_path,
223
+ temperature=temperature,
224
+ cfg_weight=cfg_weight,
225
+ exaggeration=exaggeration,
226
+ max_chars_per_chunk=chunk_chars,
227
+ inter_chunk_silence_ms=chunk_silence_ms
228
+ )
229
+ else:
230
+ wav = self.text_to_speech_pipeline(
231
+ text,
232
+ model,
233
+ temperature=temperature,
234
+ cfg_weight=cfg_weight,
235
+ exaggeration=exaggeration,
236
+ max_chars_per_chunk=chunk_chars,
237
+ inter_chunk_silence_ms=chunk_silence_ms
238
+ )
239
+
240
+ if wav.dim() == 2 and wav.shape[0] == 1:
241
+ wav = wav.repeat(2, 1)
242
+ elif wav.dim() == 1:
243
+ wav = wav.unsqueeze(0).repeat(2, 1)
244
+
245
+ audio_length = wav.shape[1] / model.sr
246
+ ta.save(output_path, wav, model.sr)
247
+ context_logger.bind(
248
+ execution_time=time.time() - start,
249
+ audio_length=audio_length,
250
+ speedup=audio_length / (time.time() - start),
251
+ youtube_channel="https://www.youtube.com/@aiagentsaz"
252
+ ).debug(
253
+ "TTS generation with Chatterbox completed",
254
+ )
255
+
256
+
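A short usage sketch of the chunked Chatterbox path (text, paths, and the prompt file are illustrative). Long scripts are split at sentence boundaries, synthesized chunk by chunk, and joined with short silences:

    from video.tts_chatterbox import TTSChatterbox

    tts = TTSChatterbox()
    tts.chatterbox(
        "First sentence of a long script. " * 50,
        "narration.wav",
        sample_audio_path="voice_sample.wav",  # optional voice-cloning prompt
        chunk_chars=1024,
        chunk_silence_ms=350,
    )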