Spaces:
Paused
Paused
squash commit
Browse files- .env.example +0 -1
- .github/workflows/check-file-size.yml +23 -0
- .github/workflows/lint.yml +43 -0
- .github/workflows/sync-to-hf.yml +24 -0
- .gitignore +5 -1
- ADDING_MCP_TOOLS.md +0 -204
- IMPLEMENTATION_PLAN.md +0 -388
- README.md +116 -23
- lefthook.yml +14 -0
- poetry.lock +0 -0
- pyproject.toml +54 -0
- src/app/app.py +327 -3
- src/app/tools/frame_extractor.py +152 -0
- src/app/tools/langchain_tools.py +196 -0
- src/app/tools/music_selector.py +187 -0
- src/app/tools/script_generator.py +210 -0
- src/app/tools/subtitle_creator.py +274 -0
- src/app/tools/text_to_speech.py +531 -0
- src/app/tools/thumbnail_generator.py +212 -0
- src/app/tools/video_clipper.py +102 -0
- src/app/tools/video_composer.py +495 -0
- src/app/tools/video_script_generator.py +430 -0
- src/app/tools/video_summarizer.py +27 -1
- src/app/workflow.py +355 -0
- src/app/workflow_ui.py +153 -0
- tests/README.md +70 -0
- tests/__init__.py +1 -0
- tests/conftest.py +104 -0
- tests/test_adk_integration.py +104 -0
- tests/test_frame_extractor.py +186 -0
- tests/test_music_selector.py +512 -0
- tests/test_script_generator.py +298 -0
- tests/test_subtitle_creator.py +435 -0
- tests/test_text_to_speech.py +638 -0
- tests/test_thumbnail_generator.py +341 -0
- tests/test_video_clipper.py +235 -0
- tests/test_video_composer.py +393 -0
- tests/test_video_summarizer.py +295 -0
.env.example
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
GOOGLE_API_KEY=*****
|
|
|
|
|
|
.github/workflows/check-file-size.yml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Check file size
|
| 2 |
+
on:
|
| 3 |
+
pull_request:
|
| 4 |
+
branches: [main]
|
| 5 |
+
|
| 6 |
+
# Allow manual trigger from Actions tab
|
| 7 |
+
workflow_dispatch:
|
| 8 |
+
|
| 9 |
+
permissions: write-all
|
| 10 |
+
|
| 11 |
+
jobs:
|
| 12 |
+
check-file-size:
|
| 13 |
+
runs-on: ubuntu-latest
|
| 14 |
+
steps:
|
| 15 |
+
- uses: actions/checkout@v6
|
| 16 |
+
with:
|
| 17 |
+
fetch-depth: 0
|
| 18 |
+
lfs: true
|
| 19 |
+
|
| 20 |
+
- name: Check large files
|
| 21 |
+
uses: ppremk/lfs-warning@v3.3
|
| 22 |
+
with:
|
| 23 |
+
filesizelimit: 10485760 # 10MB limit for HF Spaces sync
|
.github/workflows/lint.yml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Lint
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [main]
|
| 6 |
+
pull_request:
|
| 7 |
+
branches: [main]
|
| 8 |
+
|
| 9 |
+
# Allow manual trigger from Actions tab
|
| 10 |
+
workflow_dispatch:
|
| 11 |
+
|
| 12 |
+
jobs:
|
| 13 |
+
lint:
|
| 14 |
+
runs-on: ubuntu-latest
|
| 15 |
+
steps:
|
| 16 |
+
- uses: actions/checkout@v6
|
| 17 |
+
|
| 18 |
+
- name: Set up Python
|
| 19 |
+
id: setup-python
|
| 20 |
+
uses: actions/setup-python@v6
|
| 21 |
+
with:
|
| 22 |
+
python-version: "3.12"
|
| 23 |
+
|
| 24 |
+
- name: Install Poetry
|
| 25 |
+
uses: snok/install-poetry@v1
|
| 26 |
+
with:
|
| 27 |
+
version: latest
|
| 28 |
+
virtualenvs-create: false
|
| 29 |
+
virtualenvs-in-project: false
|
| 30 |
+
|
| 31 |
+
- name: Load cached venv
|
| 32 |
+
id: cached-poetry-dependencies
|
| 33 |
+
uses: actions/cache@v4
|
| 34 |
+
with:
|
| 35 |
+
path: .venv
|
| 36 |
+
key: venv-${{ runner.os }}-3.12-${{ hashFiles('**/poetry.lock') }}
|
| 37 |
+
|
| 38 |
+
- name: Install dependencies
|
| 39 |
+
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
|
| 40 |
+
run: poetry install --no-interaction --no-root
|
| 41 |
+
|
| 42 |
+
- name: Run black check
|
| 43 |
+
run: poetry run black --check src/ tests/
|
.github/workflows/sync-to-hf.yml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync to Hugging Face Space
|
| 2 |
+
on:
|
| 3 |
+
push:
|
| 4 |
+
branches: [main]
|
| 5 |
+
|
| 6 |
+
# Allow manual trigger from Actions tab
|
| 7 |
+
workflow_dispatch:
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
sync-to-hub:
|
| 11 |
+
runs-on: ubuntu-latest
|
| 12 |
+
steps:
|
| 13 |
+
- uses: actions/checkout@v6
|
| 14 |
+
with:
|
| 15 |
+
fetch-depth: 0
|
| 16 |
+
lfs: true
|
| 17 |
+
|
| 18 |
+
- name: Push to Hugging Face Space
|
| 19 |
+
env:
|
| 20 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 21 |
+
run: |
|
| 22 |
+
git config --global user.email "github-actions[bot]@users.noreply.github.com"
|
| 23 |
+
git config --global user.name "github-actions[bot]"
|
| 24 |
+
git push https://MCP-1st-Birthday:$HF_TOKEN@huggingface.co/spaces/MCP-1st-Birthday/vidzly main --force
|
.gitignore
CHANGED
|
@@ -1,3 +1,7 @@
|
|
| 1 |
__pycache__/
|
| 2 |
|
| 3 |
-
.env
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
__pycache__/
|
| 2 |
|
| 3 |
+
.env
|
| 4 |
+
|
| 5 |
+
.vscode/
|
| 6 |
+
|
| 7 |
+
tests/data/
|
ADDING_MCP_TOOLS.md
DELETED
|
@@ -1,204 +0,0 @@
|
|
| 1 |
-
# Adding a New MCP Tool
|
| 2 |
-
|
| 3 |
-
This guide will walk you through creating and integrating a new MCP (Model Context Protocol) tool into the Vidzly application.
|
| 4 |
-
|
| 5 |
-
## Overview
|
| 6 |
-
|
| 7 |
-
MCP tools in this project are Python functions that can be exposed through Gradio's MCP server. Each tool is a standalone function that performs a specific task and can be used by AI agents or directly through the Gradio interface.
|
| 8 |
-
|
| 9 |
-
## Step-by-Step Guide
|
| 10 |
-
|
| 11 |
-
### Step 1: Create the Tool Function
|
| 12 |
-
|
| 13 |
-
Create a new Python file in the `src/app/tools/` directory. Name it descriptively (e.g., `word_reverser.py`, `text_analyzer.py`).
|
| 14 |
-
|
| 15 |
-
**Example: Creating `word_reverser.py`**
|
| 16 |
-
|
| 17 |
-
```python
|
| 18 |
-
def word_reverser(text):
|
| 19 |
-
"""
|
| 20 |
-
Reverse the order of words in a given text.
|
| 21 |
-
|
| 22 |
-
Args:
|
| 23 |
-
text (str): The input text to reverse
|
| 24 |
-
|
| 25 |
-
Returns:
|
| 26 |
-
str: The text with words in reverse order
|
| 27 |
-
"""
|
| 28 |
-
words = text.split()
|
| 29 |
-
reversed_words = ' '.join(reversed(words))
|
| 30 |
-
return reversed_words
|
| 31 |
-
```
|
| 32 |
-
|
| 33 |
-
**Key Requirements:**
|
| 34 |
-
|
| 35 |
-
- The function should have a clear, descriptive name
|
| 36 |
-
- Include a docstring explaining what the function does
|
| 37 |
-
- Document all parameters in the docstring
|
| 38 |
-
- Document the return value
|
| 39 |
-
- Keep the function focused on a single task
|
| 40 |
-
|
| 41 |
-
### Step 2: Import the Tool in `app.py`
|
| 42 |
-
|
| 43 |
-
Open `src/app/app.py` and add an import statement at the top:
|
| 44 |
-
|
| 45 |
-
```python
|
| 46 |
-
from tools.word_reverser import word_reverser
|
| 47 |
-
```
|
| 48 |
-
|
| 49 |
-
### Step 3: Create a Gradio Interface
|
| 50 |
-
|
| 51 |
-
In `app.py`, within the `with gr.Tab("MCP Tools"):` block, add a new tab for your tool:
|
| 52 |
-
|
| 53 |
-
```python
|
| 54 |
-
with gr.Tab("Demo Word Reverser"):
|
| 55 |
-
gr.Interface(
|
| 56 |
-
fn=word_reverser,
|
| 57 |
-
inputs=[gr.Textbox("Hello world from Vidzly")],
|
| 58 |
-
outputs=[gr.Textbox()],
|
| 59 |
-
title="Word Reverser",
|
| 60 |
-
description="Enter text to reverse the order of words.",
|
| 61 |
-
api_name="word_reverser",
|
| 62 |
-
)
|
| 63 |
-
```
|
| 64 |
-
|
| 65 |
-
**Gradio Interface Parameters:**
|
| 66 |
-
|
| 67 |
-
- `fn`: Your tool function
|
| 68 |
-
- `inputs`: List of Gradio input components (e.g., `gr.Textbox()`, `gr.Number()`, `gr.Slider()`)
|
| 69 |
-
- `outputs`: List of Gradio output components
|
| 70 |
-
- `title`: Display title for the interface
|
| 71 |
-
- `description`: Helpful description for users
|
| 72 |
-
- `api_name`: Unique API endpoint name (used for MCP server)
|
| 73 |
-
|
| 74 |
-
### Step 4: Test Your Tool
|
| 75 |
-
|
| 76 |
-
1. Run the application:
|
| 77 |
-
|
| 78 |
-
```bash
|
| 79 |
-
poetry run python src/app/app.py
|
| 80 |
-
```
|
| 81 |
-
|
| 82 |
-
2. Navigate to the "MCP Tools" tab in the web interface
|
| 83 |
-
3. Click on your new tool's tab
|
| 84 |
-
4. Test the tool with various inputs
|
| 85 |
-
5. Verify the MCP server exposes your tool correctly
|
| 86 |
-
|
| 87 |
-
## Complete Example
|
| 88 |
-
|
| 89 |
-
Here's a complete example showing how the existing `letter_counter` tool is implemented:
|
| 90 |
-
|
| 91 |
-
**File: `src/app/tools/letter_counter.py`**
|
| 92 |
-
|
| 93 |
-
```python
|
| 94 |
-
def letter_counter(word, letter):
|
| 95 |
-
"""
|
| 96 |
-
Count the number of occurrences of a letter in a word or text.
|
| 97 |
-
|
| 98 |
-
Args:
|
| 99 |
-
word (str): The input text to search through
|
| 100 |
-
letter (str): The letter to search for
|
| 101 |
-
|
| 102 |
-
Returns:
|
| 103 |
-
str: A message indicating how many times the letter appears
|
| 104 |
-
"""
|
| 105 |
-
word = word.lower()
|
| 106 |
-
letter = letter.lower()
|
| 107 |
-
count = word.count(letter)
|
| 108 |
-
return count
|
| 109 |
-
```
|
| 110 |
-
|
| 111 |
-
**Integration in `app.py`:**
|
| 112 |
-
|
| 113 |
-
```python
|
| 114 |
-
from tools.letter_counter import letter_counter
|
| 115 |
-
|
| 116 |
-
# ... inside the MCP Tools tab ...
|
| 117 |
-
with gr.Tab("Demo Letter Counter"):
|
| 118 |
-
gr.Interface(
|
| 119 |
-
fn=letter_counter,
|
| 120 |
-
inputs=[gr.Textbox("strawberry"), gr.Textbox("r")],
|
| 121 |
-
outputs=[gr.Textbox()],
|
| 122 |
-
title="Letter Counter",
|
| 123 |
-
description="Enter text and a letter to count how many times the letter appears in the text.",
|
| 124 |
-
api_name="predict",
|
| 125 |
-
)
|
| 126 |
-
```
|
| 127 |
-
|
| 128 |
-
## Best Practices
|
| 129 |
-
|
| 130 |
-
1. **Function Design:**
|
| 131 |
-
|
| 132 |
-
- Keep functions pure when possible (no side effects)
|
| 133 |
-
- Handle edge cases and invalid inputs gracefully
|
| 134 |
-
- Return meaningful error messages if needed
|
| 135 |
-
|
| 136 |
-
2. **Documentation:**
|
| 137 |
-
|
| 138 |
-
- Write clear docstrings with parameter descriptions
|
| 139 |
-
- Include examples in docstrings if helpful
|
| 140 |
-
- Use type hints if possible (optional but recommended)
|
| 141 |
-
|
| 142 |
-
3. **Naming:**
|
| 143 |
-
|
| 144 |
-
- Use descriptive function names (e.g., `word_reverser` not `rev`)
|
| 145 |
-
- Use descriptive file names matching the function name
|
| 146 |
-
- Use clear `api_name` values for MCP endpoints
|
| 147 |
-
|
| 148 |
-
4. **Testing:**
|
| 149 |
-
|
| 150 |
-
- Test with various inputs including edge cases
|
| 151 |
-
- Verify the tool works both in the Gradio UI and via MCP
|
| 152 |
-
- Check that error handling works correctly
|
| 153 |
-
|
| 154 |
-
5. **Gradio Components:**
|
| 155 |
-
- Choose appropriate input/output components:
|
| 156 |
-
- `gr.Textbox()` for text input/output
|
| 157 |
-
- `gr.Number()` for numeric values
|
| 158 |
-
- `gr.Slider()` for numeric ranges
|
| 159 |
-
- `gr.Checkbox()` for boolean values
|
| 160 |
-
- `gr.Dropdown()` for selections
|
| 161 |
-
- Provide default values in inputs for better UX
|
| 162 |
-
|
| 163 |
-
## Advanced: Multiple Parameters
|
| 164 |
-
|
| 165 |
-
If your tool requires multiple parameters, simply add them to your function signature and corresponding Gradio inputs:
|
| 166 |
-
|
| 167 |
-
```python
|
| 168 |
-
def text_processor(text, operation, case_sensitive):
|
| 169 |
-
"""
|
| 170 |
-
Process text with various operations.
|
| 171 |
-
|
| 172 |
-
Args:
|
| 173 |
-
text (str): The input text
|
| 174 |
-
operation (str): Operation to perform ('upper', 'lower', 'reverse')
|
| 175 |
-
case_sensitive (bool): Whether operations should be case sensitive
|
| 176 |
-
|
| 177 |
-
Returns:
|
| 178 |
-
str: Processed text
|
| 179 |
-
"""
|
| 180 |
-
# Implementation here
|
| 181 |
-
pass
|
| 182 |
-
```
|
| 183 |
-
|
| 184 |
-
```python
|
| 185 |
-
gr.Interface(
|
| 186 |
-
fn=text_processor,
|
| 187 |
-
inputs=[
|
| 188 |
-
gr.Textbox("Hello World"),
|
| 189 |
-
gr.Dropdown(["upper", "lower", "reverse"]),
|
| 190 |
-
gr.Checkbox(False)
|
| 191 |
-
],
|
| 192 |
-
outputs=[gr.Textbox()],
|
| 193 |
-
title="Text Processor",
|
| 194 |
-
description="Process text with various operations.",
|
| 195 |
-
api_name="text_processor",
|
| 196 |
-
)
|
| 197 |
-
```
|
| 198 |
-
|
| 199 |
-
## Troubleshooting
|
| 200 |
-
|
| 201 |
-
- **Import errors:** Make sure your function is in the `src/app/tools/` directory and the import path matches
|
| 202 |
-
- **MCP not exposing tool:** Verify `api_name` is unique and the function is properly defined
|
| 203 |
-
- **Type errors:** Ensure your function's return type matches the Gradio output component type
|
| 204 |
-
- **UI not showing:** Check that the tab is properly nested within `with gr.Tab("MCP Tools"):`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
IMPLEMENTATION_PLAN.md
DELETED
|
@@ -1,388 +0,0 @@
|
|
| 1 |
-
# Complete Vidzly Workflow Implementation Plan
|
| 2 |
-
|
| 3 |
-
## System Architecture Overview
|
| 4 |
-
|
| 5 |
-
The Vidzly system will process user-uploaded videos and descriptions to create polished 30-second short videos through the following workflow:
|
| 6 |
-
|
| 7 |
-
1. **Input**: Multiple video files + user description
|
| 8 |
-
2. **Analysis**: Understand video content and user requirements
|
| 9 |
-
3. **Script Generation**: Create detailed script/storyboard for the final video
|
| 10 |
-
4. **Selection**: Choose relevant scenes and music based on script
|
| 11 |
-
5. **Processing**: Clip, edit, and combine videos according to script
|
| 12 |
-
6. **Output**: Final 30-second video
|
| 13 |
-
|
| 14 |
-
## Required MCP Tools
|
| 15 |
-
|
| 16 |
-
### Phase 1: Video Analysis Tools
|
| 17 |
-
|
| 18 |
-
#### 1.1 Video Summarizer (`video_summarizer.py`)
|
| 19 |
-
|
| 20 |
-
- **Purpose**: Analyze video content and generate text summaries
|
| 21 |
-
- **Input**: Video file path
|
| 22 |
-
- **Output**: JSON with video summary, key scenes, detected objects/activities, mood tags
|
| 23 |
-
- **Technology**: OpenCV for frame extraction, Google Gemini Vision API for analysis
|
| 24 |
-
- **Returns**: Structured summary including duration, scene descriptions, visual elements
|
| 25 |
-
|
| 26 |
-
#### 1.2 Video Metadata Extractor (`video_metadata.py`)
|
| 27 |
-
|
| 28 |
-
- **Purpose**: Extract technical metadata from videos
|
| 29 |
-
- **Input**: Video file path
|
| 30 |
-
- **Output**: Duration, resolution, fps, codec, file size
|
| 31 |
-
- **Technology**: MoviePy or ffmpeg-python
|
| 32 |
-
- **Returns**: Technical specifications for processing decisions
|
| 33 |
-
|
| 34 |
-
### Phase 2: Content Understanding Tools
|
| 35 |
-
|
| 36 |
-
#### 2.1 Description Parser (`description_parser.py`)
|
| 37 |
-
|
| 38 |
-
- **Purpose**: Parse and understand user descriptions to extract requirements
|
| 39 |
-
- **Input**: User description text
|
| 40 |
-
- **Output**: Structured requirements (mood, style, key elements, target length)
|
| 41 |
-
- **Technology**: Google Gemini API for natural language understanding
|
| 42 |
-
- **Returns**: JSON with extracted mood, style preferences, key topics, pacing
|
| 43 |
-
|
| 44 |
-
#### 2.2 Scene Matcher (`scene_matcher.py`)
|
| 45 |
-
|
| 46 |
-
- **Purpose**: Match video scenes to user requirements
|
| 47 |
-
- **Input**: Video summaries + parsed requirements
|
| 48 |
-
- **Output**: List of matching scenes with timestamps and relevance scores
|
| 49 |
-
- **Technology**: Semantic similarity matching using embeddings
|
| 50 |
-
- **Returns**: Ranked list of scene segments to use
|
| 51 |
-
|
| 52 |
-
### Phase 3: Script Generation & Planning
|
| 53 |
-
|
| 54 |
-
#### 3.1 Video Script Generator (`video_script_generator.py`)
|
| 55 |
-
|
| 56 |
-
- **Purpose**: Create a detailed script/storyboard for the final 30-second video
|
| 57 |
-
- **Input**: Video summaries, matched scenes, parsed user requirements
|
| 58 |
-
- **Output**: Detailed script with scene sequence, timings, transitions, and structure
|
| 59 |
-
- **Technology**: Google Gemini API for intelligent script generation
|
| 60 |
-
- **Returns**: JSON script containing:
|
| 61 |
-
- Scene sequence with source video and timestamps
|
| 62 |
-
- Duration for each scene segment (must sum to ~30 seconds)
|
| 63 |
-
- Transition types between scenes (cut, fade, crossfade, etc.)
|
| 64 |
-
- Pacing and rhythm plan
|
| 65 |
-
- Music synchronization points (beat markers, mood changes)
|
| 66 |
-
- Overall narrative structure and flow
|
| 67 |
-
- Visual style recommendations
|
| 68 |
-
|
| 69 |
-
### Phase 4: Video Processing Tools
|
| 70 |
-
|
| 71 |
-
#### 4.1 Video Clipper (`video_clipper.py`)
|
| 72 |
-
|
| 73 |
-
- **Purpose**: Extract specific segments from videos based on script
|
| 74 |
-
- **Input**: Video path, start time, end time
|
| 75 |
-
- **Output**: Clipped video file path
|
| 76 |
-
- **Technology**: MoviePy or ffmpeg-python
|
| 77 |
-
- **Returns**: Path to clipped video segment
|
| 78 |
-
|
| 79 |
-
#### 4.2 Scene Selector (`scene_selector.py`)
|
| 80 |
-
|
| 81 |
-
- **Purpose**: Intelligently select best scenes to fit 30-second target (if script needs refinement)
|
| 82 |
-
- **Input**: List of matched scenes, target duration (30s), script requirements
|
| 83 |
-
- **Output**: Optimized scene selection with timestamps
|
| 84 |
-
- **Technology**: Algorithm to maximize relevance while fitting duration
|
| 85 |
-
- **Returns**: Final scene list with precise timestamps aligned to script
|
| 86 |
-
|
| 87 |
-
### Phase 5: Audio & Composition Tools
|
| 88 |
-
|
| 89 |
-
#### 5.1 Music Selector (`music_selector.py`)
|
| 90 |
-
|
| 91 |
-
- **Purpose**: Select appropriate background music based on mood/style from script
|
| 92 |
-
- **Input**: Mood/style tags, target duration, script rhythm requirements
|
| 93 |
-
- **Output**: Music file path or URL
|
| 94 |
-
- **Technology**: Music library/database or API (e.g., Free Music Archive, YouTube Audio Library)
|
| 95 |
-
- **Returns**: Music file path with BPM and mood information
|
| 96 |
-
|
| 97 |
-
#### 5.2 Video Composer (`video_composer.py`)
|
| 98 |
-
|
| 99 |
-
- **Purpose**: Combine video clips, add music, apply transitions according to script
|
| 100 |
-
- **Input**: List of video clip paths, music path, script with transitions
|
| 101 |
-
- **Output**: Final composed video file path
|
| 102 |
-
- **Technology**: MoviePy for video composition, transitions, audio mixing
|
| 103 |
-
- **Returns**: Path to final 30-second video
|
| 104 |
-
|
| 105 |
-
### Phase 6: Workflow Orchestration
|
| 106 |
-
|
| 107 |
-
#### 6.1 Video Workflow Orchestrator (`video_workflow.py`)
|
| 108 |
-
|
| 109 |
-
- **Purpose**: Main workflow that coordinates all tools
|
| 110 |
-
- **Input**: List of video files, user description
|
| 111 |
-
- **Output**: Final video file path, processing summary, generated script
|
| 112 |
-
- **Technology**: Orchestrates all MCP tools in sequence
|
| 113 |
-
- **Returns**: Final video, processing report, and script JSON
|
| 114 |
-
|
| 115 |
-
## Implementation Phases
|
| 116 |
-
|
| 117 |
-
### Phase 1: Foundation (Dependencies & Basic Tools)
|
| 118 |
-
|
| 119 |
-
1. Add video processing dependencies (opencv-python, moviepy, ffmpeg-python, numpy, pillow)
|
| 120 |
-
2. Implement Video Metadata Extractor
|
| 121 |
-
3. Implement Video Summarizer
|
| 122 |
-
4. Set up temporary file storage system
|
| 123 |
-
|
| 124 |
-
### Phase 2: Understanding Layer
|
| 125 |
-
|
| 126 |
-
1. Implement Description Parser
|
| 127 |
-
2. Implement Scene Matcher
|
| 128 |
-
3. Test analysis and matching pipeline
|
| 129 |
-
|
| 130 |
-
### Phase 3: Script Generation
|
| 131 |
-
|
| 132 |
-
1. Implement Video Script Generator
|
| 133 |
-
2. Test script generation with various inputs
|
| 134 |
-
3. Validate script timing (must sum to ~30 seconds)
|
| 135 |
-
|
| 136 |
-
### Phase 4: Processing Layer
|
| 137 |
-
|
| 138 |
-
1. Implement Video Clipper
|
| 139 |
-
2. Implement Scene Selector
|
| 140 |
-
3. Test video clipping and selection based on scripts
|
| 141 |
-
|
| 142 |
-
### Phase 5: Composition Layer
|
| 143 |
-
|
| 144 |
-
1. Implement Music Selector (start with simple mood-based selection)
|
| 145 |
-
2. Implement Video Composer
|
| 146 |
-
3. Test full composition pipeline with script
|
| 147 |
-
|
| 148 |
-
### Phase 6: Integration & UI
|
| 149 |
-
|
| 150 |
-
1. Implement Video Workflow Orchestrator
|
| 151 |
-
2. Create main Vidzly UI in app.py (upload, description, progress, script preview, output)
|
| 152 |
-
3. Integrate all MCP tools into Gradio interface
|
| 153 |
-
4. Add error handling and user feedback
|
| 154 |
-
|
| 155 |
-
### Phase 7: Polish & Optimization
|
| 156 |
-
|
| 157 |
-
1. Add progress tracking
|
| 158 |
-
2. Optimize processing speed
|
| 159 |
-
3. Add preview functionality
|
| 160 |
-
4. Improve error messages and edge case handling
|
| 161 |
-
5. Add script editing/refinement capability
|
| 162 |
-
|
| 163 |
-
## File Structure
|
| 164 |
-
|
| 165 |
-
```
|
| 166 |
-
src/app/
|
| 167 |
-
├── app.py # Main Gradio app with Vidzly workflow UI
|
| 168 |
-
├── introduction.py # Existing intro component
|
| 169 |
-
├── tools/
|
| 170 |
-
│ ├── __init__.py
|
| 171 |
-
│ ├── video_metadata.py # Extract video technical info
|
| 172 |
-
│ ├── video_summarizer.py # Analyze and summarize video content
|
| 173 |
-
│ ├── description_parser.py # Parse user descriptions
|
| 174 |
-
│ ├── scene_matcher.py # Match scenes to requirements
|
| 175 |
-
│ ├── video_script_generator.py # Generate detailed video script
|
| 176 |
-
│ ├── video_clipper.py # Extract video segments
|
| 177 |
-
│ ├── scene_selector.py # Select optimal scenes for 30s video
|
| 178 |
-
│ ├── music_selector.py # Choose background music
|
| 179 |
-
│ ├── video_composer.py # Combine clips and add music
|
| 180 |
-
│ └── video_workflow.py # Main workflow orchestrator
|
| 181 |
-
└── utils/
|
| 182 |
-
├── __init__.py
|
| 183 |
-
├── file_manager.py # Handle temporary file storage
|
| 184 |
-
└── video_utils.py # Shared video processing utilities
|
| 185 |
-
```
|
| 186 |
-
|
| 187 |
-
## File Storage Strategy
|
| 188 |
-
|
| 189 |
-
### Storage Locations
|
| 190 |
-
|
| 191 |
-
1. **Uploaded Videos (Gradio Temporary Directory)**
|
| 192 |
-
|
| 193 |
-
- Gradio automatically stores uploaded files in system temp directory
|
| 194 |
-
- Can be customized with `GRADIO_TEMP_DIR` environment variable
|
| 195 |
-
- Files accessible via file paths returned by `gr.File` component
|
| 196 |
-
- These are temporary and may be cleaned up by Gradio
|
| 197 |
-
|
| 198 |
-
2. **Processing Working Directory**
|
| 199 |
-
|
| 200 |
-
- Create: `src/app/temp/` or use system temp with unique session IDs
|
| 201 |
-
- Store: Clipped video segments, intermediate processing files
|
| 202 |
-
- Structure: `temp/{session_id}/clips/`, `temp/{session_id}/final/`
|
| 203 |
-
- Cleanup: Delete after processing completes or on session timeout
|
| 204 |
-
|
| 205 |
-
3. **Final Output Storage**
|
| 206 |
-
- Store final videos in: `src/app/outputs/` or `temp/{session_id}/final/`
|
| 207 |
-
- Return file path to Gradio `gr.Video` component for display/download
|
| 208 |
-
- Gradio will serve the file from this path
|
| 209 |
-
- Implement cleanup policy (e.g., delete after 24 hours)
|
| 210 |
-
|
| 211 |
-
### File Manager Implementation
|
| 212 |
-
|
| 213 |
-
The `file_manager.py` utility should:
|
| 214 |
-
|
| 215 |
-
- Create session-based temporary directories
|
| 216 |
-
- Generate unique file names to avoid conflicts
|
| 217 |
-
- Provide cleanup functions (session cleanup, old file cleanup)
|
| 218 |
-
- Handle path resolution (absolute paths for Gradio serving)
|
| 219 |
-
- Track file lifecycle (uploaded, processing, final, deleted)
|
| 220 |
-
|
| 221 |
-
### Gradio File Handling
|
| 222 |
-
|
| 223 |
-
- **Input**: Use `gr.File(file_count="multiple")` for video uploads
|
| 224 |
-
- Returns list of file paths (temporary Gradio paths)
|
| 225 |
-
- Copy to working directory immediately for processing
|
| 226 |
-
- **Output**: Use `gr.Video()` component
|
| 227 |
-
- Accepts file path (absolute or relative to working directory)
|
| 228 |
-
- Gradio serves the file and provides download capability
|
| 229 |
-
- File must exist at the path when component renders
|
| 230 |
-
|
| 231 |
-
### Best Practices
|
| 232 |
-
|
| 233 |
-
1. **Immediate Copy**: Copy uploaded files from Gradio temp to working directory
|
| 234 |
-
2. **Absolute Paths**: Use absolute paths for all file operations
|
| 235 |
-
3. **Session Management**: Use unique session IDs to isolate user workflows
|
| 236 |
-
4. **Cleanup Strategy**:
|
| 237 |
-
- Clean up intermediate files after final video is created
|
| 238 |
-
- Keep final videos for a retention period (e.g., 1 hour)
|
| 239 |
-
- Implement background cleanup task for old files
|
| 240 |
-
5. **Error Handling**: Ensure cleanup happens even if processing fails
|
| 241 |
-
|
| 242 |
-
## Technical Stack
|
| 243 |
-
|
| 244 |
-
### Dependencies to Add
|
| 245 |
-
|
| 246 |
-
- `opencv-python` - Video frame extraction and basic processing
|
| 247 |
-
- `moviepy` - Video editing, clipping, composition
|
| 248 |
-
- `ffmpeg-python` - Alternative/additional video processing
|
| 249 |
-
- `numpy` - Array operations for video processing
|
| 250 |
-
- `pillow` - Image processing for frame analysis
|
| 251 |
-
|
| 252 |
-
### Existing Dependencies
|
| 253 |
-
|
| 254 |
-
- `gradio` (with MCP) - UI and MCP server
|
| 255 |
-
- `google-genai` - AI analysis and understanding
|
| 256 |
-
|
| 257 |
-
## Main Workflow UI Design
|
| 258 |
-
|
| 259 |
-
The "Vidzly" tab in app.py will include:
|
| 260 |
-
|
| 261 |
-
1. **Video Upload Section**: Multiple file upload component
|
| 262 |
-
2. **Description Input**: Text area for user description
|
| 263 |
-
3. **Process Button**: Trigger workflow
|
| 264 |
-
4. **Progress Display**: Show current step (analyzing, generating script, selecting scenes, composing, etc.)
|
| 265 |
-
5. **Script Preview**: Display generated script (optional, collapsible)
|
| 266 |
-
6. **Output Video**: Display final 30-second video
|
| 267 |
-
7. **Download Button**: Allow user to download result
|
| 268 |
-
|
| 269 |
-
## Workflow Sequence
|
| 270 |
-
|
| 271 |
-
1. User uploads multiple videos
|
| 272 |
-
2. User provides description
|
| 273 |
-
3. System analyzes all videos (parallel processing)
|
| 274 |
-
4. System parses user description
|
| 275 |
-
5. System matches scenes to requirements
|
| 276 |
-
6. **System generates detailed script for 30-second video**
|
| 277 |
-
7. System clips selected scenes based on script
|
| 278 |
-
8. System selects appropriate music based on script
|
| 279 |
-
9. System composes final video according to script
|
| 280 |
-
10. System returns final video and script
|
| 281 |
-
|
| 282 |
-
## Script Format Example
|
| 283 |
-
|
| 284 |
-
```json
|
| 285 |
-
{
|
| 286 |
-
"total_duration": 30.0,
|
| 287 |
-
"scenes": [
|
| 288 |
-
{
|
| 289 |
-
"scene_id": 1,
|
| 290 |
-
"source_video": "video1.mp4",
|
| 291 |
-
"start_time": 5.2,
|
| 292 |
-
"end_time": 8.5,
|
| 293 |
-
"duration": 3.3,
|
| 294 |
-
"description": "Opening shot of landscape",
|
| 295 |
-
"transition_in": "fade",
|
| 296 |
-
"transition_out": "crossfade"
|
| 297 |
-
},
|
| 298 |
-
{
|
| 299 |
-
"scene_id": 2,
|
| 300 |
-
"source_video": "video2.mp4",
|
| 301 |
-
"start_time": 12.0,
|
| 302 |
-
"end_time": 18.5,
|
| 303 |
-
"duration": 6.5,
|
| 304 |
-
"description": "Action sequence",
|
| 305 |
-
"transition_in": "crossfade",
|
| 306 |
-
"transition_out": "cut"
|
| 307 |
-
}
|
| 308 |
-
],
|
| 309 |
-
"music": {
|
| 310 |
-
"mood": "energetic",
|
| 311 |
-
"bpm": 120,
|
| 312 |
-
"sync_points": [0.0, 7.5, 15.0, 22.5, 30.0]
|
| 313 |
-
},
|
| 314 |
-
"pacing": "fast",
|
| 315 |
-
"narrative_structure": "hook -> build -> climax -> resolution"
|
| 316 |
-
}
|
| 317 |
-
```
|
| 318 |
-
|
| 319 |
-
## Error Handling Strategy
|
| 320 |
-
|
| 321 |
-
- Validate video file formats on upload
|
| 322 |
-
- Handle corrupted or unsupported videos gracefully
|
| 323 |
-
- Provide clear error messages for each failure point
|
| 324 |
-
- Allow partial success (e.g., if one video fails, continue with others)
|
| 325 |
-
- Timeout handling for long processing operations
|
| 326 |
-
- Validate script timing (must sum to ~30 seconds)
|
| 327 |
-
- Handle script generation failures with fallback
|
| 328 |
-
|
| 329 |
-
## Performance Considerations
|
| 330 |
-
|
| 331 |
-
- Process videos in parallel where possible
|
| 332 |
-
- Use efficient frame sampling (not every frame)
|
| 333 |
-
- Cache video summaries to avoid re-analysis
|
| 334 |
-
- Optimize video composition for speed
|
| 335 |
-
- Consider async processing for long operations
|
| 336 |
-
- Generate script before heavy processing to validate feasibility
|
| 337 |
-
|
| 338 |
-
## Testing Strategy
|
| 339 |
-
|
| 340 |
-
- Unit tests for each MCP tool
|
| 341 |
-
- Integration tests for workflow steps
|
| 342 |
-
- Test script generation with various inputs
|
| 343 |
-
- End-to-end test with sample videos
|
| 344 |
-
- Test with various video formats and sizes
|
| 345 |
-
- Test edge cases (very short videos, very long videos, etc.)
|
| 346 |
-
- Test script validation (timing, scene availability)
|
| 347 |
-
|
| 348 |
-
## Implementation Todos
|
| 349 |
-
|
| 350 |
-
1. **Phase 1: Foundation**
|
| 351 |
-
|
| 352 |
-
- Add video processing dependencies to pyproject.toml
|
| 353 |
-
- Implement video_metadata.py
|
| 354 |
-
- Implement video_summarizer.py
|
| 355 |
-
- Create utils/file_manager.py
|
| 356 |
-
|
| 357 |
-
2. **Phase 2: Understanding**
|
| 358 |
-
|
| 359 |
-
- Implement description_parser.py
|
| 360 |
-
- Implement scene_matcher.py
|
| 361 |
-
|
| 362 |
-
3. **Phase 3: Script Generation**
|
| 363 |
-
|
| 364 |
-
- Implement video_script_generator.py
|
| 365 |
-
- Add script validation logic
|
| 366 |
-
|
| 367 |
-
4. **Phase 4: Processing**
|
| 368 |
-
|
| 369 |
-
- Implement video_clipper.py
|
| 370 |
-
- Implement scene_selector.py
|
| 371 |
-
|
| 372 |
-
5. **Phase 5: Composition**
|
| 373 |
-
|
| 374 |
-
- Implement music_selector.py
|
| 375 |
-
- Implement video_composer.py
|
| 376 |
-
|
| 377 |
-
6. **Phase 6: Integration**
|
| 378 |
-
|
| 379 |
-
- Implement video_workflow.py
|
| 380 |
-
- Create main Vidzly UI in app.py
|
| 381 |
-
- Integrate all MCP tools
|
| 382 |
-
- Add error handling
|
| 383 |
-
|
| 384 |
-
7. **Phase 7: Polish**
|
| 385 |
-
- Add progress tracking
|
| 386 |
-
- Optimize performance
|
| 387 |
-
- Add preview functionality
|
| 388 |
-
- Improve UX
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,18 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# 🎬 Vidzly - Your AI-Powered Short Video Creator
|
| 2 |
|
| 3 |
> **Transform raw footage into viral-ready content in seconds. No skills required. No expensive gear needed. Just your vision and our AI.**
|
| 4 |
|
| 5 |
## ✨ What is Vidzly?
|
| 6 |
|
| 7 |
-
Vidzly is an intelligent automation platform that revolutionizes short-form video creation. Whether you're a micro-influencer, content creator, or business owner, Vidzly transforms your raw clips into polished, engaging
|
| 8 |
|
| 9 |
### 🚀 Why Vidzly?
|
| 10 |
|
| 11 |
-
- **Zero Learning Curve**: No video editing skills? No problem.
|
| 12 |
-
- **AI-Powered Magic**: Advanced AI handles cutting, transitions, music
|
| 13 |
- **Lightning Fast**: What takes hours in traditional editing software takes minutes with Vidzly.
|
| 14 |
- **Professional Quality**: Get studio-quality results without the studio price tag.
|
| 15 |
-
- **
|
| 16 |
|
| 17 |
### 🎯 Perfect For
|
| 18 |
|
|
@@ -24,24 +28,35 @@ Vidzly is an intelligent automation platform that revolutionizes short-form vide
|
|
| 24 |
|
| 25 |
## 🎬 How It Works
|
| 26 |
|
| 27 |
-
1. **Upload Your Raw Footage** - Drop your clips
|
| 28 |
-
2. **Describe Your Vision** -
|
| 29 |
-
3. **AI
|
| 30 |
-
|
|
|
|
|
|
|
| 31 |
|
| 32 |
-
## 🛠️
|
| 33 |
|
| 34 |
-
|
| 35 |
-
- 🎵 **Music Synchronization**: Perfect beat-matching and audio enhancement
|
| 36 |
-
- 🎨 **Style Transfer**: Apply filters and effects that match your brand
|
| 37 |
-
- ⚡ **Real-Time Preview**: See your video come together as it's being created
|
| 38 |
-
- 📊 **Analytics Ready**: Optimized for maximum engagement
|
| 39 |
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
|
| 43 |
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
## Setup
|
| 47 |
|
|
@@ -115,27 +130,105 @@ To update all dependencies to their latest compatible versions:
|
|
| 115 |
poetry update
|
| 116 |
```
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
### Setting Up Environment Variables
|
| 119 |
|
| 120 |
Create a `.env` file in the root directory and add your environment variables.
|
| 121 |
|
| 122 |
```bash
|
| 123 |
GOOGLE_API_KEY=your_google_api_key
|
|
|
|
| 124 |
```
|
| 125 |
|
|
|
|
|
|
|
| 126 |
### Running the Application
|
| 127 |
|
|
|
|
|
|
|
| 128 |
```bash
|
| 129 |
poetry run python src/app/app.py
|
| 130 |
```
|
| 131 |
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
### Contributors
|
| 137 |
|
| 138 |
-
- 🐱 [honghanhh](https://github.com/honghanhh)🐱
|
| 139 |
-
- 🦊 [nvti](https://github.com/nvti)🦊
|
| 140 |
-
- 🐻 [Nlag](https://github.com/NLag)🐻
|
| 141 |
-
- 🐰 [DaphneeCh](https://github.com/DaphneeCh)🐰
|
|
|
|
| 1 |
+
<p align="center">
|
| 2 |
+
<img src="https://cdn.tihado.com/app.png" alt="Vidzly Logo"/>
|
| 3 |
+
</p>
|
| 4 |
+
|
| 5 |
# 🎬 Vidzly - Your AI-Powered Short Video Creator
|
| 6 |
|
| 7 |
> **Transform raw footage into viral-ready content in seconds. No skills required. No expensive gear needed. Just your vision and our AI.**
|
| 8 |
|
| 9 |
## ✨ What is Vidzly?
|
| 10 |
|
| 11 |
+
Vidzly is an intelligent automation platform that revolutionizes short-form video creation. Whether you're a micro-influencer, content creator, or business owner, Vidzly transforms your raw clips into polished, engaging videos that stop the scroll.
|
| 12 |
|
| 13 |
### 🚀 Why Vidzly?
|
| 14 |
|
| 15 |
+
- **Zero Learning Curve**: No video editing skills? No problem. Use our intuitive web interface.
|
| 16 |
+
- **AI-Powered Magic**: Advanced AI handles video analysis, cutting, transitions, music generation, and thumbnail creation automatically.
|
| 17 |
- **Lightning Fast**: What takes hours in traditional editing software takes minutes with Vidzly.
|
| 18 |
- **Professional Quality**: Get studio-quality results without the studio price tag.
|
| 19 |
+
- **MCP Tools Integration**: All tools are available as MCP (Model Context Protocol) tools for AI agent integration.
|
| 20 |
|
| 21 |
### 🎯 Perfect For
|
| 22 |
|
|
|
|
| 28 |
|
| 29 |
## 🎬 How It Works
|
| 30 |
|
| 31 |
+
1. **Upload Your Raw Footage** - Drop your clips through the Gradio web interface
|
| 32 |
+
2. **Describe Your Vision** - Optionally provide a description of the mood, style, or vibe you want
|
| 33 |
+
3. **AI Agents Work Their Magic** - Our two-agent system intelligently plans and executes:
|
| 34 |
+
- **Script Writer/Director Agent**: Analyzes videos, creates composition scripts, and generates music
|
| 35 |
+
- **Video Editor Agent**: Executes video composition, extracts frames, and generates thumbnails
|
| 36 |
+
4. **Get Your Masterpiece** - Receive a polished video with thumbnail overlay on the first frame
|
| 37 |
|
| 38 |
+
## 🛠️ Available Tools
|
| 39 |
|
| 40 |
+
Vidzly provides a comprehensive suite of MCP tools accessible through a Gradio web interface:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
+
- 🎥 **Video Summarizer**: Uses Google Gemini AI to analyze video content and generate detailed summaries including key scenes, detected objects, mood tags, and recommended thumbnail timestamps
|
| 43 |
+
- ✂️ **Video Clipper**: Extract specific segments from videos by specifying start and end times
|
| 44 |
+
- 🖼️ **Frame Extractor**: Extract representative frames from videos, with AI-powered selection or manual timestamp specification
|
| 45 |
+
- 🎨 **Thumbnail Generator**: Automatically generate engaging thumbnails with AI-generated text and stickers based on video frames and summaries
|
| 46 |
+
- 🎬 **Video Composer**: Combine multiple video clips with transitions (fade, crossfade, cut) and optional background music according to a JSON script. Supports optional thumbnail image overlay on the first frame
|
| 47 |
+
- 🎵 **Music Selector**: Generate background music and sound effects using ElevenLabs API based on mood, style, duration, BPM, and other parameters
|
| 48 |
|
| 49 |
+
## 🏗️ Architecture
|
| 50 |
|
| 51 |
+
- **Web Interface**: Built with Gradio (with MCP server support)
|
| 52 |
+
- **AI Agents**: Google ADK (Agent Development Kit) with two specialized agents:
|
| 53 |
+
- **Script Writer/Director Agent**: Plans video composition using video analysis and script generation
|
| 54 |
+
- **Video Editor Agent**: Executes video composition, frame extraction, and thumbnail generation
|
| 55 |
+
- **AI Integration**: Google Gemini for video understanding, analysis, and script generation
|
| 56 |
+
- **Audio Generation**: ElevenLabs API for music and sound effect generation
|
| 57 |
+
- **Video Processing**: MoviePy for video editing and composition
|
| 58 |
+
- **Image Processing**: OpenCV and Pillow for frame extraction and thumbnail generation
|
| 59 |
+
- **Testing**: Comprehensive pytest test suite with unit and integration tests
|
| 60 |
|
| 61 |
## Setup
|
| 62 |
|
|
|
|
| 130 |
poetry update
|
| 131 |
```
|
| 132 |
|
| 133 |
+
### Code Formatting with Black and Lefthook
|
| 134 |
+
|
| 135 |
+
This project uses [Black](https://black.readthedocs.io/) for code formatting and [Lefthook](https://github.com/evilmartians/lefthook) for git hooks to automatically format code before commits.
|
| 136 |
+
|
| 137 |
+
After installing dependencies, set up lefthook:
|
| 138 |
+
|
| 139 |
+
```bash
|
| 140 |
+
poetry run lefthook install
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
This will install git hooks that will:
|
| 144 |
+
|
| 145 |
+
- **Before commit**: Automatically format staged Python files with Black
|
| 146 |
+
- **Before push**: Check that all Python files in `src/` and `tests/` are properly formatted
|
| 147 |
+
|
| 148 |
+
To manually format code:
|
| 149 |
+
|
| 150 |
+
```bash
|
| 151 |
+
poetry run black src/ tests/
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
To check formatting without making changes:
|
| 155 |
+
|
| 156 |
+
```bash
|
| 157 |
+
poetry run black --check src/ tests/
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
### Setting Up Environment Variables
|
| 161 |
|
| 162 |
Create a `.env` file in the root directory and add your environment variables.
|
| 163 |
|
| 164 |
```bash
|
| 165 |
GOOGLE_API_KEY=your_google_api_key
|
| 166 |
+
ELEVENLABS_API_KEY=your_elevenlabs_api_key
|
| 167 |
```
|
| 168 |
|
| 169 |
+
**Note**: The application requires Google ADK (Agent Development Kit) to be installed. The `google-adk` package is required for the two-agent workflow architecture.
|
| 170 |
+
|
| 171 |
### Running the Application
|
| 172 |
|
| 173 |
+
Start the Gradio web interface:
|
| 174 |
+
|
| 175 |
```bash
|
| 176 |
poetry run python src/app/app.py
|
| 177 |
```
|
| 178 |
|
| 179 |
+
This will launch a web interface with:
|
| 180 |
+
|
| 181 |
+
- **Vidzly Tab**: Project introduction and overview
|
| 182 |
+
- **MCP Tools Tab**: Access to all 6 video processing tools
|
| 183 |
+
|
| 184 |
+
The application runs with MCP server support, allowing AI agents to interact with the tools programmatically.
|
| 185 |
+
|
| 186 |
+
## Testing
|
| 187 |
+
|
| 188 |
+
This project includes comprehensive unit and integration tests. See [tests/README.md](tests/README.md) for detailed testing documentation.
|
| 189 |
+
|
| 190 |
+
### Running Tests
|
| 191 |
+
|
| 192 |
+
Run all tests:
|
| 193 |
+
|
| 194 |
+
```bash
|
| 195 |
+
poetry run pytest
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
Run with coverage:
|
| 199 |
+
|
| 200 |
+
```bash
|
| 201 |
+
poetry run pytest --cov=src/app/tools --cov-report=html
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
Run specific test file:
|
| 205 |
+
|
| 206 |
+
```bash
|
| 207 |
+
poetry run pytest tests/test_video_summarizer.py
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
### Test Structure
|
| 211 |
+
|
| 212 |
+
- **Unit Tests**: Mocked tests for input validation, error handling, and logic
|
| 213 |
+
- **Integration Tests**: Real video file tests for actual functionality
|
| 214 |
+
- All tools have corresponding test files in the `tests/` directory
|
| 215 |
+
|
| 216 |
+
## Technology Stack
|
| 217 |
|
| 218 |
+
- **Python 3.12+**: Core language
|
| 219 |
+
- **Gradio 6.0+**: Web interface with MCP support
|
| 220 |
+
- **Google ADK (Agent Development Kit)**: Two-agent architecture for intelligent workflow orchestration
|
| 221 |
+
- **Google Gemini API**: Video understanding, analysis, script generation, and thumbnail creation
|
| 222 |
+
- **ElevenLabs API**: Music and sound effect generation
|
| 223 |
+
- **MoviePy 2.2.1**: Video editing, composition, and image overlay
|
| 224 |
+
- **OpenCV 4.12+**: Video processing and frame extraction
|
| 225 |
+
- **Pillow 11**: Image processing for thumbnails
|
| 226 |
+
- **Poetry**: Dependency management
|
| 227 |
+
- **pytest**: Testing framework
|
| 228 |
|
| 229 |
### Contributors
|
| 230 |
|
| 231 |
+
- 🐱 [honghanhh](https://github.com/honghanhh) 🐱
|
| 232 |
+
- 🦊 [nvti](https://github.com/nvti) 🦊
|
| 233 |
+
- 🐻 [Nlag](https://github.com/NLag) 🐻
|
| 234 |
+
- 🐰 [DaphneeCh](https://github.com/DaphneeCh) 🐰
|
lefthook.yml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pre-commit:
|
| 2 |
+
parallel: true
|
| 3 |
+
commands:
|
| 4 |
+
black:
|
| 5 |
+
run: poetry run black {staged_files}
|
| 6 |
+
glob: "*.py"
|
| 7 |
+
stage_fixed: true
|
| 8 |
+
|
| 9 |
+
pre-push:
|
| 10 |
+
parallel: true
|
| 11 |
+
commands:
|
| 12 |
+
black-check:
|
| 13 |
+
run: poetry run black --check src/ tests/
|
| 14 |
+
glob: "*.py"
|
poetry.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
CHANGED
|
@@ -14,8 +14,62 @@ opencv-python = "^4.12.0.88"
|
|
| 14 |
moviepy = "^2.2.1"
|
| 15 |
pillow = "11"
|
| 16 |
python-dotenv = "^1.0.0"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
[build-system]
|
| 20 |
requires = ["poetry-core"]
|
| 21 |
build-backend = "poetry.core.masonry.api"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
moviepy = "^2.2.1"
|
| 15 |
pillow = "11"
|
| 16 |
python-dotenv = "^1.0.0"
|
| 17 |
+
elevenlabs = "^2.24.0"
|
| 18 |
+
gtts = "^2.5.0"
|
| 19 |
+
langchain-google-genai = "^3.2.0"
|
| 20 |
+
langchain-core = "^1.1.0"
|
| 21 |
+
langchain = "^1.1.0"
|
| 22 |
|
| 23 |
|
| 24 |
+
[tool.poetry.group.dev.dependencies]
|
| 25 |
+
pytest = "^9.0.1"
|
| 26 |
+
black = "^24.0.0"
|
| 27 |
+
|
| 28 |
[build-system]
|
| 29 |
requires = ["poetry-core"]
|
| 30 |
build-backend = "poetry.core.masonry.api"
|
| 31 |
+
|
| 32 |
+
[dependency-groups]
|
| 33 |
+
dev = [
|
| 34 |
+
"pytest (>=9.0.1,<10.0.0)",
|
| 35 |
+
"pytest-mock (>=3.15.1,<4.0.0)",
|
| 36 |
+
"pytest-cov (>=7.0.0,<8.0.0)",
|
| 37 |
+
"black (>=24.0.0,<25.0.0)",
|
| 38 |
+
"lefthook (>=2.0.4,<3.0.0)"
|
| 39 |
+
]
|
| 40 |
+
|
| 41 |
+
[tool.pytest.ini_options]
|
| 42 |
+
testpaths = ["tests"]
|
| 43 |
+
python_files = ["test_*.py"]
|
| 44 |
+
python_classes = ["Test*"]
|
| 45 |
+
python_functions = ["test_*"]
|
| 46 |
+
addopts = [
|
| 47 |
+
"-v",
|
| 48 |
+
"--strict-markers",
|
| 49 |
+
"--tb=short",
|
| 50 |
+
]
|
| 51 |
+
markers = [
|
| 52 |
+
"unit: Unit tests",
|
| 53 |
+
"integration: Integration tests",
|
| 54 |
+
]
|
| 55 |
+
|
| 56 |
+
[tool.black]
|
| 57 |
+
line-length = 88
|
| 58 |
+
target-version = ['py312']
|
| 59 |
+
include = '\.pyi?$'
|
| 60 |
+
extend-exclude = '''
|
| 61 |
+
/(
|
| 62 |
+
# directories
|
| 63 |
+
\.eggs
|
| 64 |
+
| \.git
|
| 65 |
+
| \.hg
|
| 66 |
+
| \.mypy_cache
|
| 67 |
+
| \.tox
|
| 68 |
+
| \.venv
|
| 69 |
+
| venv
|
| 70 |
+
| _build
|
| 71 |
+
| buck-out
|
| 72 |
+
| build
|
| 73 |
+
| dist
|
| 74 |
+
)/
|
| 75 |
+
'''
|
src/app/app.py
CHANGED
|
@@ -7,15 +7,61 @@ load_dotenv()
|
|
| 7 |
import gradio as gr
|
| 8 |
from introduction import introduction
|
| 9 |
from tools.video_summarizer import video_summarizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
with gr.Blocks() as demo:
|
| 13 |
with gr.Tab("Vidzly"):
|
| 14 |
-
#
|
| 15 |
-
# Agent full workflow here
|
| 16 |
-
|
| 17 |
introduction()
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
with gr.Tab("MCP Tools"):
|
| 20 |
with gr.Tab("Video Summarizer"):
|
| 21 |
gr.Interface(
|
|
@@ -36,6 +82,284 @@ with gr.Blocks() as demo:
|
|
| 36 |
api_name="video_summarizer",
|
| 37 |
)
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
if __name__ == "__main__":
|
| 41 |
demo.launch(mcp_server=True)
|
|
|
|
| 7 |
import gradio as gr
|
| 8 |
from introduction import introduction
|
| 9 |
from tools.video_summarizer import video_summarizer
|
| 10 |
+
from tools.video_clipper import video_clipper
|
| 11 |
+
from tools.frame_extractor import frame_extractor
|
| 12 |
+
from tools.thumbnail_generator import thumbnail_generator
|
| 13 |
+
from tools.video_composer import video_composer
|
| 14 |
+
from tools.music_selector import music_selector
|
| 15 |
+
from tools.video_script_generator import video_script_generator
|
| 16 |
+
from workflow_ui import workflow_ui
|
| 17 |
+
from tools.text_to_speech import text_to_speech_simple
|
| 18 |
+
from tools.script_generator import script_generator
|
| 19 |
+
from tools.subtitle_creator import subtitle_creator
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def text_to_speech_wrapper(
|
| 23 |
+
text, voice, language, speed, format_type, generate_segments
|
| 24 |
+
):
|
| 25 |
+
"""
|
| 26 |
+
Wrapper to return audio file for Gradio.
|
| 27 |
+
"""
|
| 28 |
+
result = text_to_speech_simple(
|
| 29 |
+
text, voice, language, speed, format_type, generate_segments
|
| 30 |
+
)
|
| 31 |
+
# Always return audio file path (Gradio will render audio player)
|
| 32 |
+
return result
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def video_composer_wrapper(script, video_clips, music_path=None):
|
| 36 |
+
"""
|
| 37 |
+
Wrapper to return both video and script JSON for easy workflow.
|
| 38 |
+
"""
|
| 39 |
+
video_path = video_composer(video_clips, music_path)
|
| 40 |
+
# Return video path and the original script (for subtitle generator)
|
| 41 |
+
return video_path
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def frame_extractor_wrapper(video_input, thumbnail_timeframe=None):
|
| 45 |
+
"""
|
| 46 |
+
Wrapper function for frame_extractor to handle Gradio interface inputs.
|
| 47 |
+
Maps the Gradio inputs to the correct function parameters.
|
| 48 |
+
"""
|
| 49 |
+
return frame_extractor(
|
| 50 |
+
video_input=video_input,
|
| 51 |
+
output_path=None,
|
| 52 |
+
thumbnail_timeframe=thumbnail_timeframe,
|
| 53 |
+
)
|
| 54 |
|
| 55 |
|
| 56 |
with gr.Blocks() as demo:
|
| 57 |
with gr.Tab("Vidzly"):
|
| 58 |
+
# Introduction section
|
|
|
|
|
|
|
| 59 |
introduction()
|
| 60 |
|
| 61 |
+
# Full workflow UI
|
| 62 |
+
gr.Markdown("---")
|
| 63 |
+
workflow_ui()
|
| 64 |
+
|
| 65 |
with gr.Tab("MCP Tools"):
|
| 66 |
with gr.Tab("Video Summarizer"):
|
| 67 |
gr.Interface(
|
|
|
|
| 82 |
api_name="video_summarizer",
|
| 83 |
)
|
| 84 |
|
| 85 |
+
with gr.Tab("Video Clipper"):
|
| 86 |
+
gr.Interface(
|
| 87 |
+
fn=video_clipper,
|
| 88 |
+
inputs=[
|
| 89 |
+
gr.Video(label="Upload Video"),
|
| 90 |
+
gr.Number(
|
| 91 |
+
value=0.0,
|
| 92 |
+
label="Start Time (seconds)",
|
| 93 |
+
minimum=0.0,
|
| 94 |
+
step=0.1,
|
| 95 |
+
),
|
| 96 |
+
gr.Number(
|
| 97 |
+
value=10.0,
|
| 98 |
+
label="End Time (seconds)",
|
| 99 |
+
minimum=0.1,
|
| 100 |
+
step=0.1,
|
| 101 |
+
),
|
| 102 |
+
],
|
| 103 |
+
outputs=[gr.Video(label="Clipped Video")],
|
| 104 |
+
title="Video Clipper",
|
| 105 |
+
description="Extract a specific segment from a video file. Enter the start and end times in seconds to create a clipped version of your video.",
|
| 106 |
+
api_name="video_clipper",
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
with gr.Tab("Frame Extractor"):
|
| 110 |
+
gr.Interface(
|
| 111 |
+
fn=frame_extractor_wrapper,
|
| 112 |
+
inputs=[
|
| 113 |
+
gr.Video(label="Upload Video"),
|
| 114 |
+
gr.Number(
|
| 115 |
+
value=None,
|
| 116 |
+
label="Thumbnail Timeframe (Optional - seconds)",
|
| 117 |
+
minimum=0.0,
|
| 118 |
+
step=0.1,
|
| 119 |
+
info="Optional: Provide a timestamp in seconds to extract a frame at that specific time. If not provided, AI will analyze the video to find the best frame. You can get this value from the 'thumbnail_timeframe' field in Video Summarizer's JSON output.",
|
| 120 |
+
),
|
| 121 |
+
],
|
| 122 |
+
outputs=[gr.Image(label="Extracted Frame", type="filepath")],
|
| 123 |
+
title="Frame Extractor",
|
| 124 |
+
description="Extract a representative frame from a video. If you provide a thumbnail timeframe (e.g., from Video Summarizer), it will use that timestamp directly without calling AI. Otherwise, it uses Gemini AI to analyze the video and select the most engaging frame.",
|
| 125 |
+
api_name="frame_extractor",
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
with gr.Tab("Thumbnail Generation"):
|
| 129 |
+
gr.Interface(
|
| 130 |
+
fn=thumbnail_generator,
|
| 131 |
+
inputs=[
|
| 132 |
+
gr.Image(
|
| 133 |
+
label="Frame Image",
|
| 134 |
+
type="filepath",
|
| 135 |
+
),
|
| 136 |
+
gr.Textbox(
|
| 137 |
+
label="Video Summary",
|
| 138 |
+
placeholder="Enter the video summary text here...",
|
| 139 |
+
lines=10,
|
| 140 |
+
info="Enter the video summary text (extract the 'summary' field from Video Summarizer JSON output).",
|
| 141 |
+
),
|
| 142 |
+
],
|
| 143 |
+
outputs=[gr.Image(label="Generated Thumbnail", type="filepath")],
|
| 144 |
+
title="Thumbnail Generation",
|
| 145 |
+
description="Automatically generate engaging thumbnails with AI-generated text and stickers. Uses Gemini AI to analyze the frame and video summary to create context-aware thumbnail designs with optimal text placement and sticker recommendations.",
|
| 146 |
+
api_name="thumbnail_generation",
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
with gr.Tab("Video Composer"):
|
| 150 |
+
gr.Interface(
|
| 151 |
+
fn=video_composer_wrapper,
|
| 152 |
+
inputs=[
|
| 153 |
+
gr.Textbox(
|
| 154 |
+
label="Script (JSON)",
|
| 155 |
+
placeholder='{"total_duration": 30.0, "scenes": [{"scene_id": 1, "source_video": 0, "start_time": 0.0, "end_time": 5.0, "duration": 5.0, "transition_in": "fade", "transition_out": "crossfade"}]}',
|
| 156 |
+
lines=10,
|
| 157 |
+
),
|
| 158 |
+
gr.File(
|
| 159 |
+
label="Video Clips (Required - source videos)",
|
| 160 |
+
file_count="multiple",
|
| 161 |
+
file_types=["video"],
|
| 162 |
+
),
|
| 163 |
+
gr.File(
|
| 164 |
+
label="Music File (Optional)",
|
| 165 |
+
file_count="single",
|
| 166 |
+
file_types=["audio"],
|
| 167 |
+
),
|
| 168 |
+
gr.Image(
|
| 169 |
+
label="Thumbnail Image (Optional)",
|
| 170 |
+
type="filepath",
|
| 171 |
+
),
|
| 172 |
+
],
|
| 173 |
+
outputs=[
|
| 174 |
+
gr.Video(label="Composed Video"),
|
| 175 |
+
gr.Textbox(
|
| 176 |
+
label="Script JSON (Copy this to Subtitle Generator)", lines=10
|
| 177 |
+
),
|
| 178 |
+
],
|
| 179 |
+
title="Video Composer",
|
| 180 |
+
description="Combine video clips, add music, and apply transitions according to a script. Upload source videos, then provide a JSON script where each scene's 'source_video' references a video by index (0-based) or filename. The same video can be used in multiple scenes with different time ranges. The script JSON output can be copied directly to Subtitle Generator.",
|
| 181 |
+
api_name="video_composer",
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
with gr.Tab("Music Selector"):
|
| 185 |
+
gr.Interface(
|
| 186 |
+
fn=music_selector,
|
| 187 |
+
inputs=[
|
| 188 |
+
gr.Textbox(
|
| 189 |
+
label="Mood",
|
| 190 |
+
placeholder="energetic, calm, dramatic, fun",
|
| 191 |
+
value="energetic",
|
| 192 |
+
info="Enter mood tags (comma-separated) or a single mood",
|
| 193 |
+
),
|
| 194 |
+
gr.Textbox(
|
| 195 |
+
label="Style (Optional)",
|
| 196 |
+
placeholder="cinematic, modern, retro",
|
| 197 |
+
value="",
|
| 198 |
+
),
|
| 199 |
+
gr.Number(
|
| 200 |
+
value=5.0,
|
| 201 |
+
label="Target Duration (seconds)",
|
| 202 |
+
minimum=1.0,
|
| 203 |
+
maximum=30.0,
|
| 204 |
+
step=0.5,
|
| 205 |
+
info="Maximum 30 seconds for ElevenLabs",
|
| 206 |
+
),
|
| 207 |
+
gr.Number(
|
| 208 |
+
value=60,
|
| 209 |
+
label="BPM (Optional)",
|
| 210 |
+
minimum=60,
|
| 211 |
+
maximum=200,
|
| 212 |
+
step=1,
|
| 213 |
+
info="Beats per minute for rhythm matching (optional)",
|
| 214 |
+
),
|
| 215 |
+
gr.Checkbox(
|
| 216 |
+
value=True,
|
| 217 |
+
label="Looping",
|
| 218 |
+
info="Enable seamless looping for continuous playback",
|
| 219 |
+
),
|
| 220 |
+
gr.Slider(
|
| 221 |
+
value=0.3,
|
| 222 |
+
label="Prompt Influence",
|
| 223 |
+
minimum=0,
|
| 224 |
+
maximum=1,
|
| 225 |
+
step=0.01,
|
| 226 |
+
info="How closely the output matches the prompt (0-1)",
|
| 227 |
+
),
|
| 228 |
+
],
|
| 229 |
+
outputs=[gr.Audio(label="Generated Sound Effect (MP3)")],
|
| 230 |
+
title="Music Selector",
|
| 231 |
+
description="Generate background sound effects using ElevenLabs API based on mood, style, and duration. The generated audio can be used as background music or sound effects for videos. Requires ELEVENLABS_API_KEY in your .env file.",
|
| 232 |
+
api_name="music_selector",
|
| 233 |
+
)
|
| 234 |
+
|
| 235 |
+
with gr.Tab("Video Script Generator"):
|
| 236 |
+
gr.Interface(
|
| 237 |
+
fn=video_script_generator,
|
| 238 |
+
inputs=[
|
| 239 |
+
gr.Textbox(
|
| 240 |
+
label="Video Summaries (JSON)",
|
| 241 |
+
placeholder='[{"duration": 30.0, "summary": "...", "mood_tags": ["energetic"]}] or paste JSON string from Video Summarizer',
|
| 242 |
+
lines=15,
|
| 243 |
+
info="Enter video summaries as JSON. Can be a single JSON string, or a JSON array of summary objects. You can copy the JSON output from Video Summarizer tool.",
|
| 244 |
+
),
|
| 245 |
+
gr.Textbox(
|
| 246 |
+
label="User Description (Optional)",
|
| 247 |
+
placeholder="e.g., Create an energetic and fast-paced video with dynamic transitions...",
|
| 248 |
+
lines=3,
|
| 249 |
+
info="Optional description of desired mood, style, or content for the final video",
|
| 250 |
+
),
|
| 251 |
+
gr.Number(
|
| 252 |
+
value=30.0,
|
| 253 |
+
label="Target Duration (seconds)",
|
| 254 |
+
minimum=1.0,
|
| 255 |
+
maximum=300.0,
|
| 256 |
+
step=0.5,
|
| 257 |
+
info="Target duration for the final video in seconds",
|
| 258 |
+
),
|
| 259 |
+
],
|
| 260 |
+
outputs=[gr.Textbox(label="Generated Script (JSON)", lines=20)],
|
| 261 |
+
title="Video Script Generator",
|
| 262 |
+
description="Create a detailed script/storyboard for a video composition. Uses Google Gemini AI to intelligently generate a script based on video summaries and user requirements. The script includes scene sequences, timings, transitions, music configuration, pacing, and narrative structure. Requires GOOGLE_API_KEY in your .env file.",
|
| 263 |
+
api_name="video_script_generator",
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
with gr.Tab("Text-to-Speech"):
|
| 267 |
+
gr.Interface(
|
| 268 |
+
fn=text_to_speech_wrapper,
|
| 269 |
+
inputs=[
|
| 270 |
+
gr.Textbox(
|
| 271 |
+
label="Text or Subtitle Content",
|
| 272 |
+
placeholder='Enter text or paste subtitle content (SRT/VTT/JSON)...\n\nPlain text example:\n"Welcome to our video tutorial on AI."\n\nSRT example:\n1\n00:00:00,000 --> 00:00:03,500\nWelcome to our video.\n\n2\n00:00:03,500 --> 00:00:07,000\nToday we will learn.',
|
| 273 |
+
lines=8,
|
| 274 |
+
info="Enter plain text OR paste subtitle content. Format will be auto-detected. All subtitle dialogues will be combined into narration.",
|
| 275 |
+
),
|
| 276 |
+
gr.Radio(
|
| 277 |
+
choices=["neutral", "male", "female"],
|
| 278 |
+
value="neutral",
|
| 279 |
+
label="Voice Type",
|
| 280 |
+
info="Select voice accent: Male (British), Female (Australian), or Neutral (US)",
|
| 281 |
+
),
|
| 282 |
+
gr.Dropdown(
|
| 283 |
+
choices=[
|
| 284 |
+
("English", "en"),
|
| 285 |
+
("Spanish", "es"),
|
| 286 |
+
("French", "fr"),
|
| 287 |
+
("German", "de"),
|
| 288 |
+
("Italian", "it"),
|
| 289 |
+
("Portuguese", "pt"),
|
| 290 |
+
("Chinese", "zh"),
|
| 291 |
+
("Japanese", "ja"),
|
| 292 |
+
("Korean", "ko"),
|
| 293 |
+
("Arabic", "ar"),
|
| 294 |
+
],
|
| 295 |
+
value="en",
|
| 296 |
+
label="Language",
|
| 297 |
+
info="Select the language for text-to-speech conversion",
|
| 298 |
+
),
|
| 299 |
+
gr.Radio(
|
| 300 |
+
choices=["normal", "slow"],
|
| 301 |
+
value="normal",
|
| 302 |
+
label="Speed",
|
| 303 |
+
info="Speech speed: Normal or Slow (for learning/clarity)",
|
| 304 |
+
),
|
| 305 |
+
gr.Radio(
|
| 306 |
+
choices=["auto", "text", "srt", "vtt", "json"],
|
| 307 |
+
value="auto",
|
| 308 |
+
label="Input Format",
|
| 309 |
+
info="Auto-detect format or manually specify: Plain text, SRT subtitle, VTT subtitle, or JSON scenario",
|
| 310 |
+
),
|
| 311 |
+
gr.Checkbox(
|
| 312 |
+
value=False,
|
| 313 |
+
label="Generate Timed Segments",
|
| 314 |
+
info="Create separate audio files for each subtitle segment with timing info (for video synchronization). Only works with subtitle input formats.",
|
| 315 |
+
),
|
| 316 |
+
],
|
| 317 |
+
outputs=[gr.Audio(label="Generated Audio", type="filepath")],
|
| 318 |
+
title="Text-to-Speech Converter",
|
| 319 |
+
description="Convert text or subtitles to audio using Google Text-to-Speech. Supports plain text, SRT, VTT, and JSON formats. Enable 'Generate Timed Segments' to create individual audio files for each subtitle with timing metadata (perfect for video synchronization with Video Composer output).",
|
| 320 |
+
api_name="text_to_speech",
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
with gr.Tab("Script Generator"):
|
| 324 |
+
gr.Interface(
|
| 325 |
+
fn=script_generator,
|
| 326 |
+
inputs=[
|
| 327 |
+
gr.File(
|
| 328 |
+
label="Video Materials (Required - upload multiple videos)",
|
| 329 |
+
file_count="multiple",
|
| 330 |
+
file_types=["video"],
|
| 331 |
+
),
|
| 332 |
+
gr.Textbox(
|
| 333 |
+
label="User Prompt (Optional)",
|
| 334 |
+
placeholder="e.g., 'Create an energetic travel montage with upbeat pacing' or 'Make a dramatic product reveal video'",
|
| 335 |
+
lines=3,
|
| 336 |
+
info="Optional: Provide specific instructions or creative direction. If left empty, the AI will generate a script based on the video content analysis.",
|
| 337 |
+
),
|
| 338 |
+
],
|
| 339 |
+
outputs=[gr.Textbox(label="Video Production Script (JSON)", lines=25)],
|
| 340 |
+
title="Script Generator",
|
| 341 |
+
description="Generate comprehensive video production scripts from multiple video materials. Upload your source videos and optionally provide creative direction. The AI will analyze the content and create a detailed script including scene breakdowns, timing, transitions, audio recommendations, and visual effects. Outputs both structured JSON and narrative formats.",
|
| 342 |
+
api_name="script_generator",
|
| 343 |
+
)
|
| 344 |
+
|
| 345 |
+
with gr.Tab("Subtitle Creator"):
|
| 346 |
+
gr.Interface(
|
| 347 |
+
fn=subtitle_creator,
|
| 348 |
+
inputs=[
|
| 349 |
+
gr.Video(label="Upload Video"),
|
| 350 |
+
gr.Textbox(
|
| 351 |
+
label="Transcript (JSON)",
|
| 352 |
+
placeholder='{"subtitles": [{"start": 0.0, "end": 2.5, "text": "Hello!", "position": "bottom", "fontsize": 48, "color": "white"}], "default_style": {"fontsize": 48, "color": "white", "bg_color": "#00000042", "position": "bottom", "transparent": true}}',
|
| 353 |
+
lines=15,
|
| 354 |
+
info="Provide subtitle transcript in JSON format with timestamps, text, and optional styling (position, font, fontsize, color, bg_color, stroke_color, stroke_width).",
|
| 355 |
+
),
|
| 356 |
+
],
|
| 357 |
+
outputs=[gr.Video(label="Video with Subtitles")],
|
| 358 |
+
title="Subtitle Creator",
|
| 359 |
+
description="Add customizable subtitles to your videos. Upload a video and provide a JSON transcript with timestamps, text content, and styling options. Supports multiple subtitle segments with individual positioning (top/center/bottom), fonts, colors, and background styling.",
|
| 360 |
+
api_name="subtitle_creator",
|
| 361 |
+
)
|
| 362 |
+
|
| 363 |
|
| 364 |
if __name__ == "__main__":
|
| 365 |
demo.launch(mcp_server=True)
|
src/app/tools/frame_extractor.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Optional
|
| 6 |
+
import mimetypes
|
| 7 |
+
import google.genai as genai
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def frame_extractor(
|
| 11 |
+
video_input,
|
| 12 |
+
output_path: Optional[str] = None,
|
| 13 |
+
thumbnail_timeframe: Optional[float] = None,
|
| 14 |
+
) -> str:
|
| 15 |
+
"""
|
| 16 |
+
Extract a representative frame from video.
|
| 17 |
+
|
| 18 |
+
If thumbnail_timeframe is provided, uses that timestamp directly. Otherwise,
|
| 19 |
+
uses Gemini AI to analyze the video and determine the best timestamp for
|
| 20 |
+
frame extraction.
|
| 21 |
+
|
| 22 |
+
Args:
|
| 23 |
+
video_input: Video file path (str) or tuple (video_path, subtitle_path) from Gradio
|
| 24 |
+
output_path: Optional output path for frame image
|
| 25 |
+
thumbnail_timeframe: Optional timestamp in seconds to use for frame extraction.
|
| 26 |
+
If provided, skips AI analysis and uses this timestamp directly.
|
| 27 |
+
|
| 28 |
+
Returns:
|
| 29 |
+
str: Path to extracted frame image (PNG format)
|
| 30 |
+
"""
|
| 31 |
+
try:
|
| 32 |
+
# Handle Gradio video input format (can be tuple or string)
|
| 33 |
+
if isinstance(video_input, tuple):
|
| 34 |
+
video_path = video_input[0]
|
| 35 |
+
elif isinstance(video_input, str):
|
| 36 |
+
video_path = video_input
|
| 37 |
+
else:
|
| 38 |
+
raise ValueError("Invalid video input format")
|
| 39 |
+
|
| 40 |
+
# Validate video file exists
|
| 41 |
+
if not video_path or not os.path.exists(video_path):
|
| 42 |
+
raise ValueError(f"Video file not found: {video_path}")
|
| 43 |
+
|
| 44 |
+
# Get video metadata
|
| 45 |
+
cap = cv2.VideoCapture(video_path)
|
| 46 |
+
if not cap.isOpened():
|
| 47 |
+
raise ValueError(f"Could not open video file: {video_path}")
|
| 48 |
+
|
| 49 |
+
fps = cap.get(cv2.CAP_PROP_FPS)
|
| 50 |
+
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 51 |
+
duration = total_frames / fps if fps > 0 else 0
|
| 52 |
+
|
| 53 |
+
if duration == 0:
|
| 54 |
+
cap.release()
|
| 55 |
+
raise ValueError("Video has zero duration")
|
| 56 |
+
|
| 57 |
+
cap.release()
|
| 58 |
+
|
| 59 |
+
# Use provided thumbnail_timeframe if available, otherwise use Gemini API
|
| 60 |
+
if thumbnail_timeframe is not None:
|
| 61 |
+
# Use the provided timestamp directly
|
| 62 |
+
best_timestamp = float(thumbnail_timeframe)
|
| 63 |
+
# Ensure timestamp is within video duration
|
| 64 |
+
best_timestamp = max(0.0, min(best_timestamp, duration - 0.1))
|
| 65 |
+
else:
|
| 66 |
+
api_key = os.getenv("GOOGLE_API_KEY")
|
| 67 |
+
if not api_key:
|
| 68 |
+
raise ValueError(
|
| 69 |
+
"GOOGLE_API_KEY environment variable is required for AI frame extraction when thumbnail_timeframe is not provided"
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
# Use Gemini Vision API to analyze video and get best timestamp
|
| 73 |
+
client = genai.Client(api_key=api_key)
|
| 74 |
+
|
| 75 |
+
# Read video file as bytes
|
| 76 |
+
with open(video_path, "rb") as f:
|
| 77 |
+
video_data = f.read()
|
| 78 |
+
|
| 79 |
+
# Determine MIME type
|
| 80 |
+
mime_type, _ = mimetypes.guess_type(video_path)
|
| 81 |
+
if not mime_type or not mime_type.startswith("video/"):
|
| 82 |
+
# Default to mp4 if cannot determine
|
| 83 |
+
mime_type = "video/mp4"
|
| 84 |
+
|
| 85 |
+
# Create prompt asking for best timestamp
|
| 86 |
+
prompt = f"""Analyze this video and identify the best timestamp (in seconds) to extract a representative, engaging frame for a thumbnail.
|
| 87 |
+
|
| 88 |
+
Consider these factors:
|
| 89 |
+
- Visual appeal and composition quality
|
| 90 |
+
- Subject clarity and focus
|
| 91 |
+
- Color and lighting quality
|
| 92 |
+
- Overall engagement and representativeness of the video content
|
| 93 |
+
- Avoid frames with motion blur or poor quality
|
| 94 |
+
|
| 95 |
+
The video duration is approximately {duration:.2f} seconds.
|
| 96 |
+
|
| 97 |
+
Respond with ONLY the timestamp in seconds as a number (e.g., "12.5" or "8.3"). Do not include any other text or explanation."""
|
| 98 |
+
|
| 99 |
+
# Create VideoMetadata with fps parameter for efficient processing
|
| 100 |
+
# Using 2.0 fps for good balance between speed and accuracy
|
| 101 |
+
video_metadata = genai.types.VideoMetadata(fps=2.0)
|
| 102 |
+
video_blob = genai.types.Blob(data=video_data, mime_type=mime_type)
|
| 103 |
+
video_part = genai.types.Part(
|
| 104 |
+
inline_data=video_blob,
|
| 105 |
+
videoMetadata=video_metadata,
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
# Use Gemini's native video understanding to analyze the entire video
|
| 109 |
+
response = client.models.generate_content(
|
| 110 |
+
model="gemini-2.5-flash-lite",
|
| 111 |
+
contents=[prompt, video_part],
|
| 112 |
+
)
|
| 113 |
+
|
| 114 |
+
# Extract timestamp from response
|
| 115 |
+
response_text = response.text.strip()
|
| 116 |
+
|
| 117 |
+
# Try to extract numeric timestamp from response
|
| 118 |
+
# Look for patterns like "12.5", "8.3", "15", etc.
|
| 119 |
+
timestamp_match = re.search(r"(\d+\.?\d*)", response_text)
|
| 120 |
+
if timestamp_match:
|
| 121 |
+
best_timestamp = float(timestamp_match.group(1))
|
| 122 |
+
# Ensure timestamp is within video duration
|
| 123 |
+
best_timestamp = max(0.0, min(best_timestamp, duration - 0.1))
|
| 124 |
+
else:
|
| 125 |
+
# Fallback: use middle of video if we can't parse the response
|
| 126 |
+
best_timestamp = duration / 2
|
| 127 |
+
|
| 128 |
+
# Extract frame at the selected timestamp
|
| 129 |
+
cap = cv2.VideoCapture(video_path)
|
| 130 |
+
frame_number = int(best_timestamp * fps)
|
| 131 |
+
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
|
| 132 |
+
ret, best_frame = cap.read()
|
| 133 |
+
cap.release()
|
| 134 |
+
|
| 135 |
+
if not ret:
|
| 136 |
+
raise ValueError(f"Could not extract frame at timestamp {best_timestamp}s")
|
| 137 |
+
|
| 138 |
+
# Generate output path if not provided
|
| 139 |
+
if output_path is None:
|
| 140 |
+
video_name = Path(video_path).stem
|
| 141 |
+
output_dir = Path(video_path).parent / "frames"
|
| 142 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 143 |
+
output_path = str(
|
| 144 |
+
output_dir / f"{video_name}_frame_ai_{int(best_timestamp)}s.png"
|
| 145 |
+
)
|
| 146 |
+
|
| 147 |
+
# Save selected frame
|
| 148 |
+
cv2.imwrite(output_path, best_frame)
|
| 149 |
+
return output_path
|
| 150 |
+
|
| 151 |
+
except Exception as e:
|
| 152 |
+
raise Exception(f"Error extracting frame: {str(e)}")
|
src/app/tools/langchain_tools.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LangChain tool wrappers for video processing tools.
|
| 3 |
+
|
| 4 |
+
This module wraps existing video processing tools as LangChain tools
|
| 5 |
+
so they can be used by the LangChain ReAct agent.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
from typing import Optional, List, Union
|
| 10 |
+
from langchain_core.tools import tool
|
| 11 |
+
|
| 12 |
+
from .video_summarizer import video_summarizer
|
| 13 |
+
from .video_script_generator import video_script_generator
|
| 14 |
+
from .music_selector import music_selector
|
| 15 |
+
from .frame_extractor import frame_extractor
|
| 16 |
+
from .thumbnail_generator import thumbnail_generator
|
| 17 |
+
from .video_composer import video_composer
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@tool
|
| 21 |
+
def video_summarizer_tool(video_path: str, fps: float = 2.0) -> str:
|
| 22 |
+
"""
|
| 23 |
+
Analyze video content and generate a comprehensive summary.
|
| 24 |
+
|
| 25 |
+
This tool analyzes a video file and returns a JSON summary containing:
|
| 26 |
+
- Overall video description
|
| 27 |
+
- Key scenes and moments with timestamps
|
| 28 |
+
- Detected objects, people, and activities
|
| 29 |
+
- Mood and style tags (e.g., energetic, calm, dramatic, fun)
|
| 30 |
+
- Visual style description
|
| 31 |
+
- Recommended thumbnail timestamp (in seconds)
|
| 32 |
+
|
| 33 |
+
Args:
|
| 34 |
+
video_path: Path to the video file to analyze
|
| 35 |
+
fps: Frames per second for video processing (default: 2.0, range: 0.1-24.0)
|
| 36 |
+
|
| 37 |
+
Returns:
|
| 38 |
+
JSON string containing video summary with all analysis details
|
| 39 |
+
"""
|
| 40 |
+
return video_summarizer(video_path, fps=fps)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@tool
|
| 44 |
+
def video_script_generator_tool(
|
| 45 |
+
video_summaries: str,
|
| 46 |
+
user_description: Optional[str] = None,
|
| 47 |
+
target_duration: float = 30.0,
|
| 48 |
+
) -> str:
|
| 49 |
+
"""
|
| 50 |
+
Generate a detailed video composition script from video summaries.
|
| 51 |
+
|
| 52 |
+
This tool creates a comprehensive script/storyboard for video composition that includes:
|
| 53 |
+
- Scene sequence with source video references and timestamps
|
| 54 |
+
- Duration for each scene segment (sums to approximately target_duration)
|
| 55 |
+
- Transition types between scenes (cut, fade, crossfade)
|
| 56 |
+
- Pacing and rhythm plan
|
| 57 |
+
- Music synchronization points
|
| 58 |
+
- Overall narrative structure and flow
|
| 59 |
+
- Visual style recommendations
|
| 60 |
+
|
| 61 |
+
Args:
|
| 62 |
+
video_summaries: JSON string containing video summaries (can be single summary or array)
|
| 63 |
+
user_description: Optional description of desired mood, style, or content
|
| 64 |
+
target_duration: Target duration in seconds for the final video (default: 30.0)
|
| 65 |
+
|
| 66 |
+
Returns:
|
| 67 |
+
JSON string containing detailed script with scene information and composition details
|
| 68 |
+
"""
|
| 69 |
+
return video_script_generator(video_summaries, user_description, target_duration)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
@tool
|
| 73 |
+
def music_selector_tool(
|
| 74 |
+
mood: str,
|
| 75 |
+
style: Optional[str] = None,
|
| 76 |
+
target_duration: float = 30.0,
|
| 77 |
+
bpm: Optional[int] = None,
|
| 78 |
+
looping: bool = True,
|
| 79 |
+
prompt_influence: float = 0.3,
|
| 80 |
+
) -> str:
|
| 81 |
+
"""
|
| 82 |
+
Generate background music or sound effects matching the video's mood and style.
|
| 83 |
+
|
| 84 |
+
This tool uses ElevenLabs API to generate appropriate background music that matches
|
| 85 |
+
the video content. The music is generated based on mood tags, style preferences,
|
| 86 |
+
and duration requirements.
|
| 87 |
+
|
| 88 |
+
Args:
|
| 89 |
+
mood: Mood tags describing the desired mood (e.g., "energetic", "calm, dramatic")
|
| 90 |
+
style: Optional style description (e.g., "cinematic", "modern", "retro")
|
| 91 |
+
target_duration: Target duration in seconds (default: 30.0, max: 30.0)
|
| 92 |
+
bpm: Optional beats per minute for rhythm matching
|
| 93 |
+
looping: Whether the sound effect should be loopable (default: True)
|
| 94 |
+
prompt_influence: How closely output matches prompt (0-1, default: 0.3)
|
| 95 |
+
|
| 96 |
+
Returns:
|
| 97 |
+
Path to the generated audio file (MP3 format)
|
| 98 |
+
"""
|
| 99 |
+
return music_selector(
|
| 100 |
+
mood=mood,
|
| 101 |
+
style=style,
|
| 102 |
+
target_duration=target_duration,
|
| 103 |
+
bpm=bpm,
|
| 104 |
+
looping=looping,
|
| 105 |
+
prompt_influence=prompt_influence,
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
@tool
|
| 110 |
+
def frame_extractor_tool(
|
| 111 |
+
video_path: str, thumbnail_timeframe: Optional[float] = None
|
| 112 |
+
) -> str:
|
| 113 |
+
"""
|
| 114 |
+
Extract a representative frame from a video for thumbnail creation.
|
| 115 |
+
|
| 116 |
+
This tool extracts a frame from a video at a specific timestamp. If no timestamp
|
| 117 |
+
is provided, it uses AI to analyze the video and select the best frame.
|
| 118 |
+
|
| 119 |
+
Args:
|
| 120 |
+
video_path: Path to the video file
|
| 121 |
+
thumbnail_timeframe: Optional timestamp in seconds to extract frame.
|
| 122 |
+
If not provided, AI will select the best frame.
|
| 123 |
+
|
| 124 |
+
Returns:
|
| 125 |
+
Path to the extracted frame image (PNG format)
|
| 126 |
+
"""
|
| 127 |
+
return frame_extractor(video_path, thumbnail_timeframe=thumbnail_timeframe)
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
@tool
|
| 131 |
+
def thumbnail_generator_tool(image_path: str, summary: str) -> str:
|
| 132 |
+
"""
|
| 133 |
+
Generate an engaging thumbnail image with text overlays and stickers.
|
| 134 |
+
|
| 135 |
+
This tool creates a professional thumbnail image using the provided frame image
|
| 136 |
+
as a background. It adds catchy text, stickers, and visual elements based on the
|
| 137 |
+
video summary to create an attention-grabbing thumbnail.
|
| 138 |
+
|
| 139 |
+
Args:
|
| 140 |
+
image_path: Path to the frame image to use as background
|
| 141 |
+
summary: Text summary of the video content (used to generate appropriate text and stickers)
|
| 142 |
+
|
| 143 |
+
Returns:
|
| 144 |
+
Path to the generated thumbnail image (PNG format)
|
| 145 |
+
"""
|
| 146 |
+
return thumbnail_generator(image_path, summary)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
@tool
|
| 150 |
+
def video_composer_tool(
|
| 151 |
+
script: str,
|
| 152 |
+
video_clips: str,
|
| 153 |
+
music_path: Optional[str] = None,
|
| 154 |
+
thumbnail_image: Optional[str] = None,
|
| 155 |
+
) -> str:
|
| 156 |
+
"""
|
| 157 |
+
Compose a final video from multiple clips according to a script.
|
| 158 |
+
|
| 159 |
+
This tool combines video clips, adds music, applies transitions, and optionally
|
| 160 |
+
overlays a thumbnail image on the first frame. It follows the script exactly to
|
| 161 |
+
create the final composed video.
|
| 162 |
+
|
| 163 |
+
Args:
|
| 164 |
+
script: JSON string containing scene information with transitions and timing
|
| 165 |
+
video_clips: JSON string array of video file paths, or comma-separated paths
|
| 166 |
+
music_path: Optional path to background music file
|
| 167 |
+
thumbnail_image: Optional path to thumbnail image to overlay on first frame
|
| 168 |
+
|
| 169 |
+
Returns:
|
| 170 |
+
Path to the final composed video file
|
| 171 |
+
"""
|
| 172 |
+
# Handle video_clips - can be JSON array string or comma-separated paths
|
| 173 |
+
if video_clips.strip().startswith("["):
|
| 174 |
+
# JSON array
|
| 175 |
+
clips_list = json.loads(video_clips)
|
| 176 |
+
else:
|
| 177 |
+
# Comma-separated paths
|
| 178 |
+
clips_list = [path.strip() for path in video_clips.split(",") if path.strip()]
|
| 179 |
+
|
| 180 |
+
return video_composer(
|
| 181 |
+
script=script,
|
| 182 |
+
video_clips=clips_list,
|
| 183 |
+
music_path=music_path,
|
| 184 |
+
thumbnail_image=thumbnail_image,
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# List of all tools for easy import
|
| 189 |
+
ALL_TOOLS = [
|
| 190 |
+
video_summarizer_tool,
|
| 191 |
+
video_script_generator_tool,
|
| 192 |
+
music_selector_tool,
|
| 193 |
+
frame_extractor_tool,
|
| 194 |
+
thumbnail_generator_tool,
|
| 195 |
+
video_composer_tool,
|
| 196 |
+
]
|
src/app/tools/music_selector.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import tempfile
|
| 3 |
+
import time
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Optional, List, Union
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
# Load environment variables
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
try:
|
| 12 |
+
from elevenlabs import ElevenLabs
|
| 13 |
+
except ImportError:
|
| 14 |
+
ElevenLabs = None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def music_selector(
|
| 18 |
+
mood: Union[str, List[str]] = "energetic",
|
| 19 |
+
style: Optional[str] = None,
|
| 20 |
+
target_duration: float = 30.0,
|
| 21 |
+
bpm: Optional[int] = None,
|
| 22 |
+
looping: bool = True,
|
| 23 |
+
prompt_influence: float = 0.3,
|
| 24 |
+
output_path: Optional[str] = None,
|
| 25 |
+
) -> str:
|
| 26 |
+
"""
|
| 27 |
+
Generate appropriate background sound effects based on mood/style from script.
|
| 28 |
+
Uses ElevenLabs API to generate sound effects that match the video's mood and style.
|
| 29 |
+
|
| 30 |
+
Args:
|
| 31 |
+
mood: Mood tags (str or list of str) describing the desired mood.
|
| 32 |
+
Can be comma-separated string or list (e.g., "energetic", "calm, dramatic, fun")
|
| 33 |
+
style: Optional style description (e.g., "cinematic", "modern", "retro")
|
| 34 |
+
target_duration: Target duration in seconds (default: 30.0, max: 30.0 for ElevenLabs)
|
| 35 |
+
bpm: Optional beats per minute for rhythm matching (used in prompt generation)
|
| 36 |
+
sync_points: Optional list of sync points in seconds for beat alignment (used in prompt)
|
| 37 |
+
looping: Whether the sound effect should be loopable (default: True).
|
| 38 |
+
Maps to ElevenLabs API parameter "loop"
|
| 39 |
+
prompt_influence: How closely the output should match the prompt (0-1, default: 0.3).
|
| 40 |
+
Higher values = more literal interpretation, lower = more creative
|
| 41 |
+
output_path: Optional path where the audio file should be saved.
|
| 42 |
+
If not provided, saves to a temporary file.
|
| 43 |
+
|
| 44 |
+
Returns:
|
| 45 |
+
str: Path to the generated audio file (MP3 format)
|
| 46 |
+
|
| 47 |
+
Raises:
|
| 48 |
+
ImportError: If elevenlabs package is not installed
|
| 49 |
+
ValueError: If ELEVENLABS_API_KEY is not set in environment
|
| 50 |
+
Exception: If sound effect generation fails
|
| 51 |
+
"""
|
| 52 |
+
try:
|
| 53 |
+
# Check if ElevenLabs is available
|
| 54 |
+
if ElevenLabs is None:
|
| 55 |
+
raise ImportError(
|
| 56 |
+
"elevenlabs package is not installed. Install it with: pip install elevenlabs"
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
# Get API key from environment
|
| 60 |
+
api_key = os.getenv("ELEVENLABS_API_KEY")
|
| 61 |
+
if not api_key:
|
| 62 |
+
raise ValueError(
|
| 63 |
+
"ELEVENLABS_API_KEY environment variable is not set. "
|
| 64 |
+
"Please set it in your .env file or environment."
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
# Initialize ElevenLabs client
|
| 68 |
+
client = ElevenLabs(api_key=api_key)
|
| 69 |
+
|
| 70 |
+
# Process mood input (can be string or list)
|
| 71 |
+
if isinstance(mood, str):
|
| 72 |
+
# Handle comma-separated mood tags
|
| 73 |
+
mood_tags = [m.strip() for m in mood.split(",") if m.strip()]
|
| 74 |
+
if not mood_tags:
|
| 75 |
+
mood_tags = ["energetic"] # Default
|
| 76 |
+
elif isinstance(mood, list):
|
| 77 |
+
mood_tags = [str(m).strip() for m in mood if str(m).strip()]
|
| 78 |
+
if not mood_tags:
|
| 79 |
+
mood_tags = ["energetic"] # Default
|
| 80 |
+
else:
|
| 81 |
+
mood_tags = ["energetic"] # Default
|
| 82 |
+
|
| 83 |
+
# Build prompt for sound effect generation
|
| 84 |
+
prompt_parts = []
|
| 85 |
+
|
| 86 |
+
# Add mood description
|
| 87 |
+
mood_description = ", ".join(mood_tags)
|
| 88 |
+
prompt_parts.append(f"{mood_description} background sound")
|
| 89 |
+
|
| 90 |
+
# Add style if provided
|
| 91 |
+
if style and str(style).strip():
|
| 92 |
+
prompt_parts.append(f"{str(style).strip()} style")
|
| 93 |
+
|
| 94 |
+
# Add rhythm information if provided
|
| 95 |
+
if bpm is not None and bpm > 0:
|
| 96 |
+
prompt_parts.append(f"{int(bpm)} BPM rhythm")
|
| 97 |
+
|
| 98 |
+
# Combine into final prompt
|
| 99 |
+
prompt = ", ".join(prompt_parts)
|
| 100 |
+
|
| 101 |
+
# Clamp duration to ElevenLabs limits (max 30 seconds)
|
| 102 |
+
if target_duration > 30.0:
|
| 103 |
+
target_duration = 30.0
|
| 104 |
+
elif target_duration <= 0:
|
| 105 |
+
target_duration = 5.0 # Minimum reasonable duration
|
| 106 |
+
|
| 107 |
+
# Clamp prompt_influence to valid range (0-1)
|
| 108 |
+
prompt_influence = max(0, min(1, prompt_influence))
|
| 109 |
+
|
| 110 |
+
# Generate sound effect using ElevenLabs API
|
| 111 |
+
# According to ElevenLabs API documentation:
|
| 112 |
+
# - Required: text
|
| 113 |
+
# - Optional: duration_seconds (0.5 to 30 seconds)
|
| 114 |
+
# - Optional: loop (boolean) - enables seamless looping
|
| 115 |
+
# - Optional: prompt_influence (float 0-1) - how closely output matches prompt
|
| 116 |
+
# - Optional: output_format (e.g., "mp3_44100_128")
|
| 117 |
+
|
| 118 |
+
# Build parameters for the API call
|
| 119 |
+
api_params = {
|
| 120 |
+
"text": prompt,
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
# Add duration_seconds parameter (must be between 0.5 and 30 seconds)
|
| 124 |
+
if target_duration and 0.5 <= target_duration <= 30.0:
|
| 125 |
+
api_params["duration_seconds"] = target_duration
|
| 126 |
+
|
| 127 |
+
# Add looping parameter (API uses "loop" not "looping")
|
| 128 |
+
if looping:
|
| 129 |
+
api_params["loop"] = looping
|
| 130 |
+
|
| 131 |
+
# Add prompt_influence parameter (0-1 range)
|
| 132 |
+
if prompt_influence is not None:
|
| 133 |
+
api_params["prompt_influence"] = prompt_influence
|
| 134 |
+
|
| 135 |
+
# Add output format for MP3
|
| 136 |
+
api_params["output_format"] = "mp3_44100_128"
|
| 137 |
+
|
| 138 |
+
# Call the API
|
| 139 |
+
audio_data = client.text_to_sound_effects.convert(**api_params)
|
| 140 |
+
|
| 141 |
+
# Determine output path
|
| 142 |
+
if output_path is None:
|
| 143 |
+
# Create a temporary file with appropriate extension
|
| 144 |
+
temp_dir = tempfile.gettempdir()
|
| 145 |
+
timestamp = int(time.time())
|
| 146 |
+
output_path = os.path.join(
|
| 147 |
+
temp_dir,
|
| 148 |
+
f"sound_effect_{mood_tags[0]}_{int(target_duration)}s_{timestamp}.mp3",
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
# Ensure output directory exists
|
| 152 |
+
output_dir = os.path.dirname(output_path)
|
| 153 |
+
if output_dir and not os.path.exists(output_dir):
|
| 154 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 155 |
+
|
| 156 |
+
# Save audio data to file
|
| 157 |
+
# The audio_data should be bytes that can be written directly
|
| 158 |
+
if isinstance(audio_data, bytes):
|
| 159 |
+
with open(output_path, "wb") as f:
|
| 160 |
+
f.write(audio_data)
|
| 161 |
+
elif hasattr(audio_data, "__iter__"):
|
| 162 |
+
# If it's an iterable (generator, list, etc.), read all chunks
|
| 163 |
+
with open(output_path, "wb") as f:
|
| 164 |
+
for chunk in audio_data:
|
| 165 |
+
if isinstance(chunk, bytes):
|
| 166 |
+
f.write(chunk)
|
| 167 |
+
else:
|
| 168 |
+
# Try to convert to bytes
|
| 169 |
+
try:
|
| 170 |
+
f.write(bytes(chunk))
|
| 171 |
+
except (TypeError, ValueError):
|
| 172 |
+
# If conversion fails, try string encoding
|
| 173 |
+
if isinstance(chunk, str):
|
| 174 |
+
f.write(chunk.encode())
|
| 175 |
+
else:
|
| 176 |
+
# Try to write directly
|
| 177 |
+
with open(output_path, "wb") as f:
|
| 178 |
+
try:
|
| 179 |
+
f.write(bytes(audio_data))
|
| 180 |
+
except (TypeError, ValueError):
|
| 181 |
+
raise ValueError(f"Unexpected audio data type: {type(audio_data)}")
|
| 182 |
+
|
| 183 |
+
# Return absolute path
|
| 184 |
+
return os.path.abspath(output_path)
|
| 185 |
+
|
| 186 |
+
except Exception as e:
|
| 187 |
+
raise Exception(f"Error generating sound effect: {str(e)}")
|
src/app/tools/script_generator.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import mimetypes
|
| 5 |
+
import google.genai as genai
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def script_generator(video_inputs, user_prompt: Optional[str] = None) -> str:
|
| 10 |
+
"""
|
| 11 |
+
Generate a detailed video script based on multiple video materials.
|
| 12 |
+
Uses Google Gemini's native video understanding to analyze material videos
|
| 13 |
+
and create a comprehensive script for making a short video.
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
video_inputs: List of video file paths or Gradio video inputs
|
| 17 |
+
user_prompt (str, optional): User's custom prompt/request. If not provided,
|
| 18 |
+
AI will generate a script based on material analysis.
|
| 19 |
+
|
| 20 |
+
Returns:
|
| 21 |
+
str: JSON string containing a detailed video script with scene breakdowns,
|
| 22 |
+
timing, transitions, and creative suggestions
|
| 23 |
+
"""
|
| 24 |
+
try:
|
| 25 |
+
# Handle various input formats
|
| 26 |
+
if not video_inputs:
|
| 27 |
+
return json.dumps({"error": "No video files provided"})
|
| 28 |
+
|
| 29 |
+
# Normalize video inputs to list of paths
|
| 30 |
+
video_paths = []
|
| 31 |
+
if isinstance(video_inputs, list):
|
| 32 |
+
for video_input in video_inputs:
|
| 33 |
+
if isinstance(video_input, tuple):
|
| 34 |
+
video_paths.append(video_input[0])
|
| 35 |
+
elif isinstance(video_input, str):
|
| 36 |
+
video_paths.append(video_input)
|
| 37 |
+
elif isinstance(video_inputs, str):
|
| 38 |
+
video_paths = [video_inputs]
|
| 39 |
+
elif isinstance(video_inputs, tuple):
|
| 40 |
+
video_paths = [video_inputs[0]]
|
| 41 |
+
else:
|
| 42 |
+
return json.dumps({"error": "Invalid video input format"})
|
| 43 |
+
|
| 44 |
+
# Validate all video files exist and extract metadata
|
| 45 |
+
videos_metadata = []
|
| 46 |
+
for idx, video_path in enumerate(video_paths):
|
| 47 |
+
if not os.path.exists(video_path):
|
| 48 |
+
return json.dumps({"error": f"Video file not found: {video_path}"})
|
| 49 |
+
|
| 50 |
+
cap = cv2.VideoCapture(video_path)
|
| 51 |
+
if not cap.isOpened():
|
| 52 |
+
return json.dumps({"error": f"Could not open video file: {video_path}"})
|
| 53 |
+
|
| 54 |
+
video_fps = cap.get(cv2.CAP_PROP_FPS)
|
| 55 |
+
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 56 |
+
duration = frame_count / video_fps if video_fps > 0 else 0
|
| 57 |
+
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
| 58 |
+
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
| 59 |
+
cap.release()
|
| 60 |
+
|
| 61 |
+
videos_metadata.append(
|
| 62 |
+
{
|
| 63 |
+
"index": idx,
|
| 64 |
+
"filename": os.path.basename(video_path),
|
| 65 |
+
"duration": round(duration, 2),
|
| 66 |
+
"resolution": f"{width}x{height}",
|
| 67 |
+
"fps": round(video_fps, 2),
|
| 68 |
+
"frame_count": frame_count,
|
| 69 |
+
}
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
# Check for API key
|
| 73 |
+
api_key = os.getenv("GOOGLE_API_KEY")
|
| 74 |
+
if not api_key:
|
| 75 |
+
return json.dumps(
|
| 76 |
+
{
|
| 77 |
+
"error": "GOOGLE_API_KEY environment variable not set",
|
| 78 |
+
"videos_analyzed": videos_metadata,
|
| 79 |
+
"script": "AI script generation requires GOOGLE_API_KEY",
|
| 80 |
+
}
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
# Initialize Gemini client
|
| 84 |
+
client = genai.Client(api_key=api_key)
|
| 85 |
+
|
| 86 |
+
# Prepare video parts for multimodal input
|
| 87 |
+
video_parts = []
|
| 88 |
+
for video_path in video_paths:
|
| 89 |
+
with open(video_path, "rb") as f:
|
| 90 |
+
video_data = f.read()
|
| 91 |
+
|
| 92 |
+
mime_type, _ = mimetypes.guess_type(video_path)
|
| 93 |
+
if not mime_type or not mime_type.startswith("video/"):
|
| 94 |
+
mime_type = "video/mp4"
|
| 95 |
+
|
| 96 |
+
video_metadata = genai.types.VideoMetadata(fps=2.0)
|
| 97 |
+
video_blob = genai.types.Blob(data=video_data, mime_type=mime_type)
|
| 98 |
+
video_part = genai.types.Part(
|
| 99 |
+
inline_data=video_blob,
|
| 100 |
+
videoMetadata=video_metadata,
|
| 101 |
+
)
|
| 102 |
+
video_parts.append(video_part)
|
| 103 |
+
|
| 104 |
+
# Create comprehensive prompt
|
| 105 |
+
if user_prompt and user_prompt.strip():
|
| 106 |
+
# Use user's prompt
|
| 107 |
+
base_prompt = f"""User Request: {user_prompt}
|
| 108 |
+
|
| 109 |
+
Based on the user's request above and the provided video materials, create a detailed video production script."""
|
| 110 |
+
else:
|
| 111 |
+
# Generate default prompt
|
| 112 |
+
base_prompt = """Analyze the provided video materials and create a comprehensive video production script for making an engaging short video."""
|
| 113 |
+
|
| 114 |
+
full_prompt = f"""{base_prompt}
|
| 115 |
+
|
| 116 |
+
I have {len(video_paths)} video file(s) as source material. Please analyze each video and create a detailed script that includes:
|
| 117 |
+
|
| 118 |
+
1. **Concept Overview**: Describe the overall theme, message, and creative direction for the final video.
|
| 119 |
+
|
| 120 |
+
2. **Target Duration**: Recommend optimal video length based on content (typically 15-60 seconds for short-form).
|
| 121 |
+
|
| 122 |
+
3. **Scene Breakdown**: For each scene in the final video, specify:
|
| 123 |
+
- Scene number and description
|
| 124 |
+
- Which source video to use (reference by index: 0, 1, 2, etc.)
|
| 125 |
+
- Exact start and end timestamps from the source video
|
| 126 |
+
- Duration of the scene
|
| 127 |
+
- Visual description and key moments
|
| 128 |
+
- Suggested transitions (e.g., "fade", "crossfade", "wipe", "zoom")
|
| 129 |
+
|
| 130 |
+
4. **Audio Recommendations**:
|
| 131 |
+
- Background music mood and style
|
| 132 |
+
- BPM (beats per minute) suggestion
|
| 133 |
+
- Volume levels and audio effects
|
| 134 |
+
- Any voiceover or text-to-speech suggestions
|
| 135 |
+
|
| 136 |
+
5. **Text Overlays**: Suggest any text, captions, or titles to add, including:
|
| 137 |
+
- Text content
|
| 138 |
+
- Timing (when to appear)
|
| 139 |
+
- Style suggestions (font, size, position, animation)
|
| 140 |
+
|
| 141 |
+
6. **Visual Effects**: Recommend any filters, color grading, speed adjustments, or special effects.
|
| 142 |
+
|
| 143 |
+
7. **Pacing & Flow**: Explain the rhythm and flow of the video, including any build-ups, climaxes, or emotional arcs.
|
| 144 |
+
|
| 145 |
+
8. **Call-to-Action**: Suggest ending elements (e.g., logo, text, link, subscribe prompt).
|
| 146 |
+
|
| 147 |
+
Please provide the script in a structured JSON format that includes:
|
| 148 |
+
- "concept": overall theme and message
|
| 149 |
+
- "target_duration": recommended total duration in seconds
|
| 150 |
+
- "total_duration": sum of all scene durations
|
| 151 |
+
- "scenes": array of scene objects with fields:
|
| 152 |
+
- "scene_id": integer
|
| 153 |
+
- "source_video": index of source video (0-based)
|
| 154 |
+
- "start_time": start timestamp in source video (seconds)
|
| 155 |
+
- "end_time": end timestamp in source video (seconds)
|
| 156 |
+
- "duration": scene duration (seconds)
|
| 157 |
+
- "description": what happens in this scene
|
| 158 |
+
- "transition_in": transition effect when entering scene
|
| 159 |
+
- "transition_out": transition effect when exiting scene
|
| 160 |
+
- "audio": object with "mood", "style", "bpm", "volume"
|
| 161 |
+
- "text_overlays": array of text overlay objects
|
| 162 |
+
- "visual_effects": array of suggested effects
|
| 163 |
+
- "call_to_action": string description
|
| 164 |
+
|
| 165 |
+
Also provide a human-readable narrative version of the script."""
|
| 166 |
+
|
| 167 |
+
# Call Gemini API with all video materials
|
| 168 |
+
contents = [full_prompt] + video_parts
|
| 169 |
+
response = client.models.generate_content(
|
| 170 |
+
model="gemini-2.5-flash-lite",
|
| 171 |
+
contents=contents, # type: ignore
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
# Parse response
|
| 175 |
+
script_text: str = response.text if response.text else ""
|
| 176 |
+
|
| 177 |
+
# Try to extract JSON if present
|
| 178 |
+
json_match = None
|
| 179 |
+
if "```json" in script_text:
|
| 180 |
+
# Extract JSON code block
|
| 181 |
+
import re
|
| 182 |
+
|
| 183 |
+
json_pattern = r"```json\s*([\s\S]*?)\s*```"
|
| 184 |
+
match = re.search(json_pattern, script_text)
|
| 185 |
+
if match:
|
| 186 |
+
json_match = match.group(1)
|
| 187 |
+
|
| 188 |
+
# Structure the response
|
| 189 |
+
result = {
|
| 190 |
+
"videos_analyzed": videos_metadata,
|
| 191 |
+
"user_prompt": (
|
| 192 |
+
user_prompt if user_prompt else "Auto-generated based on materials"
|
| 193 |
+
),
|
| 194 |
+
"script_narrative": script_text,
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
# If we found structured JSON, try to parse and include it
|
| 198 |
+
if json_match:
|
| 199 |
+
try:
|
| 200 |
+
structured_script = json.loads(json_match)
|
| 201 |
+
result["structured_script"] = structured_script
|
| 202 |
+
except json.JSONDecodeError:
|
| 203 |
+
result["structured_script_parse_error"] = (
|
| 204 |
+
"Could not parse JSON from response"
|
| 205 |
+
)
|
| 206 |
+
|
| 207 |
+
return json.dumps(result, indent=2)
|
| 208 |
+
|
| 209 |
+
except Exception as e:
|
| 210 |
+
return json.dumps({"error": f"Error generating script: {str(e)}"})
|
src/app/tools/subtitle_creator.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import tempfile
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import List, Optional, Union, TypedDict, Tuple
|
| 6 |
+
from moviepy import VideoFileClip, TextClip, CompositeVideoClip
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class SubtitleSegment(TypedDict, total=False):
|
| 10 |
+
"""Subtitle segment definition with timing and styling."""
|
| 11 |
+
|
| 12 |
+
start: float # Start time in seconds
|
| 13 |
+
end: float # End time in seconds
|
| 14 |
+
text: str # Subtitle text content
|
| 15 |
+
position: Optional[str] # Position: 'bottom', 'top', 'center', or tuple (x, y)
|
| 16 |
+
font: Optional[str] # Font name (default: 'Arial')
|
| 17 |
+
fontsize: Optional[int] # Font size (default: 48)
|
| 18 |
+
color: Optional[str] # Text color (default: 'white')
|
| 19 |
+
bg_color: Optional[str] # Background color (default: 'black')
|
| 20 |
+
stroke_color: Optional[str] # Stroke/outline color (optional)
|
| 21 |
+
stroke_width: Optional[int] # Stroke width (optional)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class RequiredSubtitleSegment(TypedDict):
|
| 25 |
+
"""Required fields for a subtitle segment."""
|
| 26 |
+
|
| 27 |
+
start: float
|
| 28 |
+
end: float
|
| 29 |
+
text: str
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class TranscriptData(TypedDict, total=False):
|
| 33 |
+
"""Complete transcript structure with subtitle segments."""
|
| 34 |
+
|
| 35 |
+
subtitles: List[SubtitleSegment]
|
| 36 |
+
default_style: Optional[dict] # Default styling for all subtitles
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Type alias for Gradio video input
|
| 40 |
+
GradioVideoInput = Union[
|
| 41 |
+
str, # Single file path
|
| 42 |
+
Tuple[str, str], # Gradio video format: (video_path, subtitle_path)
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def subtitle_creator(
|
| 47 |
+
video_input: GradioVideoInput,
|
| 48 |
+
transcript_json: str,
|
| 49 |
+
output_path: Optional[str] = None,
|
| 50 |
+
) -> str:
|
| 51 |
+
"""
|
| 52 |
+
Add subtitles to a video based on a JSON transcript with timestamps.
|
| 53 |
+
Creates text overlays at specified times with customizable styling.
|
| 54 |
+
|
| 55 |
+
Args:
|
| 56 |
+
video_input: Video file path (str) or tuple (video_path, subtitle_path) from Gradio
|
| 57 |
+
transcript_json (str): JSON string containing subtitle segments with timing and styling
|
| 58 |
+
output_path (str, optional): Path where the subtitled video should be saved.
|
| 59 |
+
If not provided, saves to a temporary file.
|
| 60 |
+
|
| 61 |
+
Returns:
|
| 62 |
+
str: Path to the subtitled video file
|
| 63 |
+
|
| 64 |
+
Example transcript JSON format:
|
| 65 |
+
{
|
| 66 |
+
"subtitles": [
|
| 67 |
+
{
|
| 68 |
+
"start": 0.0,
|
| 69 |
+
"end": 2.5,
|
| 70 |
+
"text": "Hello, welcome to the video!",
|
| 71 |
+
"position": "bottom",
|
| 72 |
+
"fontsize": 48,
|
| 73 |
+
"color": "white"
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"start": 2.5,
|
| 77 |
+
"end": 5.0,
|
| 78 |
+
"text": "This is a subtitle example.",
|
| 79 |
+
"fontsize": 52,
|
| 80 |
+
"color": "yellow"
|
| 81 |
+
}
|
| 82 |
+
],
|
| 83 |
+
"default_style": {
|
| 84 |
+
"fontsize": 48,
|
| 85 |
+
"color": "white",
|
| 86 |
+
"bg_color": "black",
|
| 87 |
+
"position": "bottom",
|
| 88 |
+
"transparent": True
|
| 89 |
+
}
|
| 90 |
+
}
|
| 91 |
+
"""
|
| 92 |
+
try:
|
| 93 |
+
# Handle Gradio video input format
|
| 94 |
+
if isinstance(video_input, tuple):
|
| 95 |
+
video_path = video_input[0]
|
| 96 |
+
elif isinstance(video_input, str):
|
| 97 |
+
video_path = video_input
|
| 98 |
+
else:
|
| 99 |
+
raise ValueError("Invalid video input format. Expected string or tuple.")
|
| 100 |
+
|
| 101 |
+
# Validate video file exists
|
| 102 |
+
if not video_path or not os.path.exists(video_path):
|
| 103 |
+
raise FileNotFoundError(f"Video file not found: {video_path}")
|
| 104 |
+
|
| 105 |
+
# Parse transcript JSON
|
| 106 |
+
try:
|
| 107 |
+
transcript_data: TranscriptData = json.loads(transcript_json)
|
| 108 |
+
except json.JSONDecodeError as e:
|
| 109 |
+
raise ValueError(f"Invalid JSON format in transcript: {str(e)}")
|
| 110 |
+
|
| 111 |
+
# Validate transcript structure
|
| 112 |
+
if "subtitles" not in transcript_data or not transcript_data["subtitles"]:
|
| 113 |
+
raise ValueError(
|
| 114 |
+
"Transcript must contain 'subtitles' array with at least one subtitle"
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
subtitles = transcript_data["subtitles"]
|
| 118 |
+
default_style = transcript_data.get("default_style", {})
|
| 119 |
+
|
| 120 |
+
# Set default styling values
|
| 121 |
+
default_font = default_style.get("font", None) if default_style else None
|
| 122 |
+
default_fontsize = default_style.get("fontsize", 48) if default_style else 48
|
| 123 |
+
default_color = (
|
| 124 |
+
default_style.get("color", "white") if default_style else "white"
|
| 125 |
+
)
|
| 126 |
+
default_bg_color = (
|
| 127 |
+
default_style.get("bg_color", "black") if default_style else "black"
|
| 128 |
+
)
|
| 129 |
+
default_position = (
|
| 130 |
+
default_style.get("position", "bottom") if default_style else "bottom"
|
| 131 |
+
)
|
| 132 |
+
default_transparent = (
|
| 133 |
+
default_style.get("transparent", True) if default_style else True
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
# Load the video
|
| 137 |
+
video = VideoFileClip(video_path)
|
| 138 |
+
video_duration = video.duration
|
| 139 |
+
video_width, video_height = video.size
|
| 140 |
+
|
| 141 |
+
# Validate all subtitle timings
|
| 142 |
+
for idx, subtitle in enumerate(subtitles):
|
| 143 |
+
if (
|
| 144 |
+
"start" not in subtitle
|
| 145 |
+
or "end" not in subtitle
|
| 146 |
+
or "text" not in subtitle
|
| 147 |
+
):
|
| 148 |
+
raise ValueError(
|
| 149 |
+
f"Subtitle {idx} must have 'start', 'end', and 'text' fields"
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
start = subtitle["start"]
|
| 153 |
+
end = subtitle["end"]
|
| 154 |
+
|
| 155 |
+
if start < 0 or end < 0:
|
| 156 |
+
raise ValueError(f"Subtitle {idx}: start and end times must be >= 0")
|
| 157 |
+
|
| 158 |
+
if end <= start:
|
| 159 |
+
raise ValueError(
|
| 160 |
+
f"Subtitle {idx}: end time must be greater than start time"
|
| 161 |
+
)
|
| 162 |
+
|
| 163 |
+
if start >= video_duration:
|
| 164 |
+
raise ValueError(
|
| 165 |
+
f"Subtitle {idx}: start time {start}s exceeds video duration {video_duration}s"
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
# Create text clips for each subtitle
|
| 169 |
+
text_clips = []
|
| 170 |
+
|
| 171 |
+
for idx, subtitle in enumerate(subtitles):
|
| 172 |
+
# Access required fields with validation already done above
|
| 173 |
+
text = str(subtitle.get("text", ""))
|
| 174 |
+
start = float(subtitle.get("start", 0.0))
|
| 175 |
+
end = float(subtitle.get("end", 0.0))
|
| 176 |
+
|
| 177 |
+
# Get styling for this subtitle (use segment-specific or default)
|
| 178 |
+
font = subtitle.get("font", default_font)
|
| 179 |
+
fontsize = int(subtitle.get("fontsize", default_fontsize)) # type: ignore
|
| 180 |
+
color = str(subtitle.get("color", default_color))
|
| 181 |
+
bg_color = subtitle.get("bg_color", default_bg_color)
|
| 182 |
+
position = subtitle.get("position", default_position)
|
| 183 |
+
stroke_color = subtitle.get("stroke_color")
|
| 184 |
+
stroke_width = int(subtitle.get("stroke_width", 2)) # type: ignore
|
| 185 |
+
transparent = subtitle.get("transparent", default_transparent)
|
| 186 |
+
|
| 187 |
+
# Clamp end time to video duration
|
| 188 |
+
if end > video_duration:
|
| 189 |
+
end = video_duration
|
| 190 |
+
|
| 191 |
+
# Calculate position
|
| 192 |
+
if isinstance(position, str):
|
| 193 |
+
if position == "bottom":
|
| 194 |
+
text_position = ("center", video_height - 100)
|
| 195 |
+
elif position == "top":
|
| 196 |
+
text_position = ("center", 100)
|
| 197 |
+
elif position == "center":
|
| 198 |
+
text_position = ("center", "center")
|
| 199 |
+
else:
|
| 200 |
+
# Default to bottom if invalid
|
| 201 |
+
text_position = ("center", video_height - 100)
|
| 202 |
+
elif isinstance(position, (list, tuple)) and len(position) == 2:
|
| 203 |
+
text_position = tuple(position)
|
| 204 |
+
else:
|
| 205 |
+
text_position = ("center", video_height - 100)
|
| 206 |
+
|
| 207 |
+
# Create text clip with styling
|
| 208 |
+
try:
|
| 209 |
+
text_clip = TextClip(
|
| 210 |
+
text=text,
|
| 211 |
+
font=font,
|
| 212 |
+
font_size=fontsize,
|
| 213 |
+
color=color,
|
| 214 |
+
bg_color=bg_color,
|
| 215 |
+
stroke_color=stroke_color if stroke_color else None,
|
| 216 |
+
stroke_width=stroke_width if stroke_color and stroke_width else 0,
|
| 217 |
+
size=(video_width - 100, None), # Max width with padding
|
| 218 |
+
method="caption", # Wrap text
|
| 219 |
+
transparent=transparent,
|
| 220 |
+
)
|
| 221 |
+
|
| 222 |
+
# Set timing and position
|
| 223 |
+
text_clip = text_clip.with_start(start).with_end(end)
|
| 224 |
+
text_clip = text_clip.with_position(text_position)
|
| 225 |
+
|
| 226 |
+
text_clips.append(text_clip)
|
| 227 |
+
|
| 228 |
+
except Exception as e:
|
| 229 |
+
raise ValueError(
|
| 230 |
+
f"Error creating subtitle {idx} ('{text[:30]}...'): {str(e)}"
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
# Composite video with all text overlays
|
| 234 |
+
final_video = CompositeVideoClip([video] + text_clips)
|
| 235 |
+
|
| 236 |
+
# Determine output path
|
| 237 |
+
if output_path is None:
|
| 238 |
+
# Create temporary file
|
| 239 |
+
temp_dir = tempfile.gettempdir()
|
| 240 |
+
output_path = os.path.join(
|
| 241 |
+
temp_dir, f"subtitled_video_{os.urandom(8).hex()}.mp4"
|
| 242 |
+
)
|
| 243 |
+
else:
|
| 244 |
+
# Ensure output directory exists
|
| 245 |
+
output_dir = os.path.dirname(output_path)
|
| 246 |
+
if output_dir:
|
| 247 |
+
Path(output_dir).mkdir(parents=True, exist_ok=True)
|
| 248 |
+
|
| 249 |
+
# Write the final video with subtitles
|
| 250 |
+
final_video.write_videofile(
|
| 251 |
+
output_path,
|
| 252 |
+
codec="libx264",
|
| 253 |
+
audio_codec="aac",
|
| 254 |
+
temp_audiofile=os.path.join(
|
| 255 |
+
tempfile.gettempdir(), f"temp_audio_{os.urandom(8).hex()}.m4a"
|
| 256 |
+
),
|
| 257 |
+
remove_temp=True,
|
| 258 |
+
fps=video.fps,
|
| 259 |
+
)
|
| 260 |
+
|
| 261 |
+
# Clean up
|
| 262 |
+
video.close()
|
| 263 |
+
final_video.close()
|
| 264 |
+
for clip in text_clips:
|
| 265 |
+
clip.close()
|
| 266 |
+
|
| 267 |
+
return output_path
|
| 268 |
+
|
| 269 |
+
except FileNotFoundError as e:
|
| 270 |
+
raise e
|
| 271 |
+
except ValueError as e:
|
| 272 |
+
raise e
|
| 273 |
+
except Exception as e:
|
| 274 |
+
raise RuntimeError(f"Error creating subtitled video: {str(e)}")
|
src/app/tools/text_to_speech.py
ADDED
|
@@ -0,0 +1,531 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text-to-Speech Converter
|
| 3 |
+
Converts text transcription to audio using Google's Text-to-Speech API.
|
| 4 |
+
Supports plain text, SRT, VTT, and JSON subtitle formats.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import json
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Literal, Optional
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def parse_srt(content: str) -> list[str]:
|
| 15 |
+
"""Parse SRT subtitle content and extract dialogue text."""
|
| 16 |
+
dialogues = []
|
| 17 |
+
blocks = content.strip().split("\n\n")
|
| 18 |
+
|
| 19 |
+
for block in blocks:
|
| 20 |
+
lines = block.strip().split("\n")
|
| 21 |
+
if len(lines) < 3:
|
| 22 |
+
continue
|
| 23 |
+
|
| 24 |
+
# Get dialogue (lines after timestamp)
|
| 25 |
+
dialogue = " ".join(lines[2:])
|
| 26 |
+
# Remove speaker labels
|
| 27 |
+
dialogue = re.sub(r"\[.*?\]|\(.*?\)|^.*?:", "", dialogue).strip()
|
| 28 |
+
if dialogue:
|
| 29 |
+
dialogues.append(dialogue)
|
| 30 |
+
|
| 31 |
+
return dialogues
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def parse_srt_with_timing(content: str) -> list[dict]:
|
| 35 |
+
"""Parse SRT subtitle content with timing information."""
|
| 36 |
+
segments = []
|
| 37 |
+
blocks = content.strip().split("\n\n")
|
| 38 |
+
|
| 39 |
+
for block in blocks:
|
| 40 |
+
lines = block.strip().split("\n")
|
| 41 |
+
if len(lines) < 3:
|
| 42 |
+
continue
|
| 43 |
+
|
| 44 |
+
# Parse timestamp line (format: 00:00:00,000 --> 00:00:05,000)
|
| 45 |
+
timestamp_line = lines[1]
|
| 46 |
+
if "-->" not in timestamp_line:
|
| 47 |
+
continue
|
| 48 |
+
|
| 49 |
+
times = timestamp_line.split("-->")
|
| 50 |
+
if len(times) != 2:
|
| 51 |
+
continue
|
| 52 |
+
|
| 53 |
+
start_time = _parse_timestamp_to_seconds(times[0].strip())
|
| 54 |
+
end_time = _parse_timestamp_to_seconds(times[1].strip())
|
| 55 |
+
|
| 56 |
+
# Get dialogue
|
| 57 |
+
dialogue = " ".join(lines[2:])
|
| 58 |
+
# Remove speaker labels
|
| 59 |
+
dialogue = re.sub(r"\[.*?\]|\(.*?\)|^.*?:", "", dialogue).strip()
|
| 60 |
+
|
| 61 |
+
if dialogue and start_time is not None and end_time is not None:
|
| 62 |
+
segments.append(
|
| 63 |
+
{"start_time": start_time, "end_time": end_time, "dialogue": dialogue}
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
return segments
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def parse_vtt(content: str) -> list[str]:
|
| 70 |
+
"""Parse VTT subtitle content and extract dialogue text."""
|
| 71 |
+
dialogues = []
|
| 72 |
+
# Remove WEBVTT header
|
| 73 |
+
content = re.sub(r"^WEBVTT.*?\n\n", "", content, flags=re.MULTILINE)
|
| 74 |
+
blocks = content.strip().split("\n\n")
|
| 75 |
+
|
| 76 |
+
for block in blocks:
|
| 77 |
+
lines = block.strip().split("\n")
|
| 78 |
+
if len(lines) < 2:
|
| 79 |
+
continue
|
| 80 |
+
|
| 81 |
+
# Find timestamp line
|
| 82 |
+
timestamp_idx = -1
|
| 83 |
+
for i, line in enumerate(lines):
|
| 84 |
+
if "-->" in line:
|
| 85 |
+
timestamp_idx = i
|
| 86 |
+
break
|
| 87 |
+
|
| 88 |
+
if timestamp_idx == -1:
|
| 89 |
+
continue
|
| 90 |
+
|
| 91 |
+
# Get dialogue after timestamp
|
| 92 |
+
dialogue = " ".join(lines[timestamp_idx + 1 :])
|
| 93 |
+
# Remove speaker labels
|
| 94 |
+
dialogue = re.sub(r"\[.*?\]|\(.*?\)|^.*?:", "", dialogue).strip()
|
| 95 |
+
if dialogue:
|
| 96 |
+
dialogues.append(dialogue)
|
| 97 |
+
|
| 98 |
+
return dialogues
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def parse_vtt_with_timing(content: str) -> list[dict]:
|
| 102 |
+
"""Parse VTT subtitle content with timing information."""
|
| 103 |
+
segments = []
|
| 104 |
+
# Remove WEBVTT header
|
| 105 |
+
content = re.sub(r"^WEBVTT.*?\n\n", "", content, flags=re.MULTILINE)
|
| 106 |
+
blocks = content.strip().split("\n\n")
|
| 107 |
+
|
| 108 |
+
for block in blocks:
|
| 109 |
+
lines = block.strip().split("\n")
|
| 110 |
+
if len(lines) < 2:
|
| 111 |
+
continue
|
| 112 |
+
|
| 113 |
+
# Find timestamp line
|
| 114 |
+
timestamp_idx = -1
|
| 115 |
+
for i, line in enumerate(lines):
|
| 116 |
+
if "-->" in line:
|
| 117 |
+
timestamp_idx = i
|
| 118 |
+
break
|
| 119 |
+
|
| 120 |
+
if timestamp_idx == -1:
|
| 121 |
+
continue
|
| 122 |
+
|
| 123 |
+
# Parse timestamp
|
| 124 |
+
timestamp_line = lines[timestamp_idx]
|
| 125 |
+
times = timestamp_line.split("-->")
|
| 126 |
+
if len(times) != 2:
|
| 127 |
+
continue
|
| 128 |
+
|
| 129 |
+
start_time = _parse_timestamp_to_seconds(times[0].strip())
|
| 130 |
+
end_time = _parse_timestamp_to_seconds(times[1].strip())
|
| 131 |
+
|
| 132 |
+
# Get dialogue
|
| 133 |
+
dialogue = " ".join(lines[timestamp_idx + 1 :])
|
| 134 |
+
# Remove speaker labels
|
| 135 |
+
dialogue = re.sub(r"\[.*?\]|\(.*?\)|^.*?:", "", dialogue).strip()
|
| 136 |
+
|
| 137 |
+
if dialogue and start_time is not None and end_time is not None:
|
| 138 |
+
segments.append(
|
| 139 |
+
{"start_time": start_time, "end_time": end_time, "dialogue": dialogue}
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
return segments
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def parse_json_scenario(content: str) -> list[str]:
|
| 146 |
+
"""Parse JSON scenario format and extract dialogue text."""
|
| 147 |
+
try:
|
| 148 |
+
data = json.loads(content)
|
| 149 |
+
if isinstance(data, str):
|
| 150 |
+
data = json.loads(data)
|
| 151 |
+
|
| 152 |
+
dialogues = []
|
| 153 |
+
if "scenes" in data:
|
| 154 |
+
for scene in data["scenes"]:
|
| 155 |
+
if "dialogue" in scene and scene["dialogue"]:
|
| 156 |
+
dialogues.append(scene["dialogue"])
|
| 157 |
+
|
| 158 |
+
return dialogues
|
| 159 |
+
except (json.JSONDecodeError, KeyError, TypeError):
|
| 160 |
+
raise ValueError("Invalid JSON scenario format")
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def parse_json_with_timing(content: str) -> list[dict]:
|
| 164 |
+
"""Parse JSON scenario format with timing information."""
|
| 165 |
+
try:
|
| 166 |
+
data = json.loads(content)
|
| 167 |
+
if isinstance(data, str):
|
| 168 |
+
data = json.loads(data)
|
| 169 |
+
|
| 170 |
+
segments = []
|
| 171 |
+
if "scenes" in data:
|
| 172 |
+
for scene in data["scenes"]:
|
| 173 |
+
start_time = scene.get("start_time")
|
| 174 |
+
end_time = scene.get("end_time")
|
| 175 |
+
duration = scene.get("duration")
|
| 176 |
+
dialogue = scene.get("dialogue", "")
|
| 177 |
+
|
| 178 |
+
# Calculate end_time if not provided but duration is
|
| 179 |
+
if end_time is None and duration is not None and start_time is not None:
|
| 180 |
+
end_time = start_time + duration
|
| 181 |
+
|
| 182 |
+
# Skip if no timing info
|
| 183 |
+
if start_time is None or end_time is None:
|
| 184 |
+
continue
|
| 185 |
+
|
| 186 |
+
# Use placeholder if no dialogue
|
| 187 |
+
if not dialogue:
|
| 188 |
+
dialogue = f"Scene {scene.get('scene_id', '?')}"
|
| 189 |
+
|
| 190 |
+
segments.append(
|
| 191 |
+
{
|
| 192 |
+
"start_time": float(start_time),
|
| 193 |
+
"end_time": float(end_time),
|
| 194 |
+
"dialogue": dialogue,
|
| 195 |
+
}
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
return segments
|
| 199 |
+
except (json.JSONDecodeError, KeyError, TypeError, ValueError):
|
| 200 |
+
raise ValueError("Invalid JSON scenario format")
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def _parse_timestamp_to_seconds(timestamp: str) -> Optional[float]:
|
| 204 |
+
"""
|
| 205 |
+
Convert timestamp string to seconds.
|
| 206 |
+
Supports formats: HH:MM:SS,mmm or HH:MM:SS.mmm
|
| 207 |
+
"""
|
| 208 |
+
try:
|
| 209 |
+
# Replace comma with dot for milliseconds
|
| 210 |
+
timestamp = timestamp.replace(",", ".")
|
| 211 |
+
|
| 212 |
+
# Parse time components
|
| 213 |
+
parts = timestamp.split(":")
|
| 214 |
+
if len(parts) != 3:
|
| 215 |
+
return None
|
| 216 |
+
|
| 217 |
+
hours = int(parts[0])
|
| 218 |
+
minutes = int(parts[1])
|
| 219 |
+
seconds_parts = parts[2].split(".")
|
| 220 |
+
seconds = int(seconds_parts[0])
|
| 221 |
+
milliseconds = int(seconds_parts[1]) if len(seconds_parts) > 1 else 0
|
| 222 |
+
|
| 223 |
+
# Convert to total seconds
|
| 224 |
+
total_seconds = hours * 3600 + minutes * 60 + seconds + milliseconds / 1000.0
|
| 225 |
+
return total_seconds
|
| 226 |
+
except (ValueError, IndexError):
|
| 227 |
+
return None
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def detect_format(content: str) -> str:
|
| 231 |
+
"""Auto-detect subtitle format from content."""
|
| 232 |
+
content_stripped = content.strip()
|
| 233 |
+
|
| 234 |
+
if content_stripped.startswith("WEBVTT"):
|
| 235 |
+
return "vtt"
|
| 236 |
+
elif content_stripped.startswith("{"):
|
| 237 |
+
return "json"
|
| 238 |
+
elif re.search(r"^\d+\s*\n\d{2}:\d{2}:\d{2},\d{3}\s*-->", content, re.MULTILINE):
|
| 239 |
+
return "srt"
|
| 240 |
+
else:
|
| 241 |
+
return "text"
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def text_to_speech(
|
| 245 |
+
text: str,
|
| 246 |
+
voice: Literal["male", "female", "neutral"] = "neutral",
|
| 247 |
+
speed: float = 1.0,
|
| 248 |
+
output_path: Optional[str] = None,
|
| 249 |
+
) -> str:
|
| 250 |
+
"""
|
| 251 |
+
Convert text to speech audio.
|
| 252 |
+
|
| 253 |
+
Args:
|
| 254 |
+
text: The text to convert to speech
|
| 255 |
+
voice: Voice type - "male", "female", or "neutral"
|
| 256 |
+
speed: Speech speed (0.5 to 2.0, where 1.0 is normal speed)
|
| 257 |
+
output_path: Optional custom output path for the audio file
|
| 258 |
+
|
| 259 |
+
Returns:
|
| 260 |
+
Path to the generated audio file
|
| 261 |
+
|
| 262 |
+
Raises:
|
| 263 |
+
ValueError: If text is empty or parameters are invalid
|
| 264 |
+
RuntimeError: If audio generation fails
|
| 265 |
+
"""
|
| 266 |
+
# Validate inputs
|
| 267 |
+
if not text or not text.strip():
|
| 268 |
+
raise ValueError("Text cannot be empty")
|
| 269 |
+
|
| 270 |
+
if not (0.5 <= speed <= 2.0):
|
| 271 |
+
raise ValueError("Speed must be between 0.5 and 2.0")
|
| 272 |
+
|
| 273 |
+
if voice not in ["male", "female", "neutral"]:
|
| 274 |
+
raise ValueError("Voice must be 'male', 'female', or 'neutral'")
|
| 275 |
+
|
| 276 |
+
# Import genai only when needed for advanced features
|
| 277 |
+
try:
|
| 278 |
+
import google.generativeai as genai
|
| 279 |
+
except ImportError:
|
| 280 |
+
raise ImportError("Please install google-genai: pip install google-genai")
|
| 281 |
+
|
| 282 |
+
# Configure API
|
| 283 |
+
api_key = os.getenv("GEMINI_API_KEY")
|
| 284 |
+
if not api_key:
|
| 285 |
+
raise ValueError("GEMINI_API_KEY environment variable is not set")
|
| 286 |
+
|
| 287 |
+
genai.configure(api_key=api_key)
|
| 288 |
+
|
| 289 |
+
# Determine output path
|
| 290 |
+
if output_path is None:
|
| 291 |
+
output_dir = Path("outputs/audio")
|
| 292 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 293 |
+
output_path = str(output_dir / "generated_speech.mp3")
|
| 294 |
+
else:
|
| 295 |
+
# Ensure directory exists
|
| 296 |
+
output_dir = Path(output_path).parent
|
| 297 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 298 |
+
|
| 299 |
+
try:
|
| 300 |
+
# Use Gemini to generate audio description/narration prompt
|
| 301 |
+
# Since Gemini doesn't have direct TTS, we'll use a workaround with available APIs
|
| 302 |
+
# For now, we'll create a simple implementation that can be extended
|
| 303 |
+
|
| 304 |
+
# Voice characteristics mapping
|
| 305 |
+
voice_prompts = {
|
| 306 |
+
"male": "deep, masculine, authoritative voice",
|
| 307 |
+
"female": "clear, feminine, warm voice",
|
| 308 |
+
"neutral": "balanced, professional, clear voice",
|
| 309 |
+
}
|
| 310 |
+
|
| 311 |
+
speed_description = ""
|
| 312 |
+
if speed < 0.8:
|
| 313 |
+
speed_description = "speaking slowly and clearly"
|
| 314 |
+
elif speed > 1.2:
|
| 315 |
+
speed_description = "speaking at a brisk pace"
|
| 316 |
+
else:
|
| 317 |
+
speed_description = "speaking at a normal pace"
|
| 318 |
+
|
| 319 |
+
# Generate enhanced text with voice instructions
|
| 320 |
+
model = genai.GenerativeModel("gemini-1.5-flash")
|
| 321 |
+
|
| 322 |
+
prompt = f"""
|
| 323 |
+
Prepare this text for text-to-speech conversion with a {voice_prompts[voice]}, {speed_description}:
|
| 324 |
+
|
| 325 |
+
Text: {text}
|
| 326 |
+
|
| 327 |
+
Provide the text in a format optimized for speech synthesis, with appropriate pauses marked by commas and periods.
|
| 328 |
+
Only return the optimized text, nothing else.
|
| 329 |
+
"""
|
| 330 |
+
|
| 331 |
+
response = model.generate_content(prompt)
|
| 332 |
+
optimized_text = response.text.strip()
|
| 333 |
+
|
| 334 |
+
# Note: Since Google Gemini doesn't provide direct TTS API in the SDK,
|
| 335 |
+
# we'll save the optimized text and return a placeholder
|
| 336 |
+
# In production, you would integrate with Google Cloud Text-to-Speech API
|
| 337 |
+
# or another TTS service like ElevenLabs, Azure TTS, etc.
|
| 338 |
+
|
| 339 |
+
# For now, create a text file with instructions
|
| 340 |
+
text_output_path = output_path.replace(".mp3", ".txt")
|
| 341 |
+
with open(text_output_path, "w", encoding="utf-8") as f:
|
| 342 |
+
f.write(f"Voice: {voice}\n")
|
| 343 |
+
f.write(f"Speed: {speed}x\n")
|
| 344 |
+
f.write(f"Original Text:\n{text}\n\n")
|
| 345 |
+
f.write(f"Optimized for TTS:\n{optimized_text}\n")
|
| 346 |
+
|
| 347 |
+
# Return the path with a note
|
| 348 |
+
result_message = f"""
|
| 349 |
+
Audio generation prepared successfully!
|
| 350 |
+
|
| 351 |
+
Text file created at: {text_output_path}
|
| 352 |
+
|
| 353 |
+
To generate actual audio, you need to:
|
| 354 |
+
1. Install Google Cloud Text-to-Speech: pip install google-cloud-texttospeech
|
| 355 |
+
2. Set up Google Cloud credentials
|
| 356 |
+
3. Or use alternative TTS services like:
|
| 357 |
+
- ElevenLabs (https://elevenlabs.io/)
|
| 358 |
+
- Azure Speech Services
|
| 359 |
+
- Amazon Polly
|
| 360 |
+
- gTTS (free, but limited)
|
| 361 |
+
|
| 362 |
+
The text has been optimized for {voice} voice at {speed}x speed.
|
| 363 |
+
"""
|
| 364 |
+
|
| 365 |
+
return result_message
|
| 366 |
+
|
| 367 |
+
except Exception as e:
|
| 368 |
+
raise RuntimeError(f"Failed to generate audio: {str(e)}")
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def text_to_speech_simple(
|
| 372 |
+
text: str,
|
| 373 |
+
voice: str = "neutral",
|
| 374 |
+
language: str = "en",
|
| 375 |
+
speed: str = "normal",
|
| 376 |
+
format_type: str = "text",
|
| 377 |
+
generate_segments: bool = False,
|
| 378 |
+
) -> str:
|
| 379 |
+
"""
|
| 380 |
+
Text-to-speech using gTTS (Google Text-to-Speech) - free tier.
|
| 381 |
+
Supports plain text, SRT, VTT, and JSON subtitle formats.
|
| 382 |
+
Can generate single audio file or timed segments matching subtitle timing.
|
| 383 |
+
|
| 384 |
+
Args:
|
| 385 |
+
text: The text/subtitle content to convert to speech
|
| 386 |
+
voice: Voice type - "male", "female", or "neutral" (affects language variant)
|
| 387 |
+
language: Language code (e.g., "en" for English, "es" for Spanish)
|
| 388 |
+
speed: Speech speed - "normal" or "slow"
|
| 389 |
+
format_type: Input format - "text", "srt", "vtt", "json", or "auto" to detect
|
| 390 |
+
generate_segments: If True and input is subtitles, generates individual audio files
|
| 391 |
+
for each subtitle segment with timing info (for video sync)
|
| 392 |
+
|
| 393 |
+
Returns:
|
| 394 |
+
Path to the generated audio file(s).
|
| 395 |
+
- If generate_segments=False: Returns single combined audio file path
|
| 396 |
+
- If generate_segments=True: Returns JSON string with audio segments and timing info
|
| 397 |
+
|
| 398 |
+
Note:
|
| 399 |
+
gTTS doesn't directly support voice gender, but we can use different
|
| 400 |
+
language variants (TLDs) that may sound slightly different:
|
| 401 |
+
- male: uses .co.uk (British English - often perceived as more masculine)
|
| 402 |
+
- female: uses .com.au (Australian English - often perceived as more feminine)
|
| 403 |
+
- neutral: uses .com (Standard English)
|
| 404 |
+
|
| 405 |
+
Timed Audio Segments Format (when generate_segments=True):
|
| 406 |
+
{
|
| 407 |
+
"segments": [
|
| 408 |
+
{
|
| 409 |
+
"segment_id": 1,
|
| 410 |
+
"start_time": 0.0,
|
| 411 |
+
"end_time": 5.0,
|
| 412 |
+
"dialogue": "Welcome to our video",
|
| 413 |
+
"audio_file": "outputs/audio/segment_1.mp3"
|
| 414 |
+
},
|
| 415 |
+
...
|
| 416 |
+
],
|
| 417 |
+
"total_duration": 15.0
|
| 418 |
+
}
|
| 419 |
+
"""
|
| 420 |
+
try:
|
| 421 |
+
from gtts import gTTS
|
| 422 |
+
except ImportError:
|
| 423 |
+
return "Please install gTTS: pip install gtts"
|
| 424 |
+
|
| 425 |
+
if not text or not text.strip():
|
| 426 |
+
raise ValueError("Text cannot be empty")
|
| 427 |
+
|
| 428 |
+
# Auto-detect format if requested
|
| 429 |
+
if format_type == "auto":
|
| 430 |
+
format_type = detect_format(text)
|
| 431 |
+
|
| 432 |
+
# Parse subtitles based on format
|
| 433 |
+
subtitle_segments = []
|
| 434 |
+
if format_type == "srt":
|
| 435 |
+
dialogues = parse_srt(text)
|
| 436 |
+
if not dialogues:
|
| 437 |
+
raise ValueError("No dialogue found in SRT content")
|
| 438 |
+
if generate_segments:
|
| 439 |
+
subtitle_segments = parse_srt_with_timing(text)
|
| 440 |
+
if not subtitle_segments:
|
| 441 |
+
raise ValueError("Failed to parse SRT timing information")
|
| 442 |
+
else:
|
| 443 |
+
final_text = " ".join(dialogues)
|
| 444 |
+
elif format_type == "vtt":
|
| 445 |
+
dialogues = parse_vtt(text)
|
| 446 |
+
if not dialogues:
|
| 447 |
+
raise ValueError("No dialogue found in VTT content")
|
| 448 |
+
if generate_segments:
|
| 449 |
+
subtitle_segments = parse_vtt_with_timing(text)
|
| 450 |
+
if not subtitle_segments:
|
| 451 |
+
raise ValueError("Failed to parse VTT timing information")
|
| 452 |
+
else:
|
| 453 |
+
final_text = " ".join(dialogues)
|
| 454 |
+
elif format_type == "json":
|
| 455 |
+
dialogues = parse_json_scenario(text)
|
| 456 |
+
if not dialogues:
|
| 457 |
+
raise ValueError("No dialogue found in JSON content")
|
| 458 |
+
if generate_segments:
|
| 459 |
+
subtitle_segments = parse_json_with_timing(text)
|
| 460 |
+
if not subtitle_segments:
|
| 461 |
+
raise ValueError("Failed to parse JSON timing information")
|
| 462 |
+
else:
|
| 463 |
+
final_text = " ".join(dialogues)
|
| 464 |
+
else:
|
| 465 |
+
# Plain text - use as is
|
| 466 |
+
final_text = text
|
| 467 |
+
generate_segments = False # Can't generate segments without timing info
|
| 468 |
+
|
| 469 |
+
# Map voice preference to gTTS TLD (top-level domain)
|
| 470 |
+
# Different accents can give perception of different voice characteristics
|
| 471 |
+
voice_tld_map = {
|
| 472 |
+
"male": "co.uk", # British English (deeper/masculine perception)
|
| 473 |
+
"female": "com.au", # Australian English (lighter/feminine perception)
|
| 474 |
+
"neutral": "com", # US English (neutral)
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
tld = voice_tld_map.get(voice, "com")
|
| 478 |
+
slow = speed == "slow"
|
| 479 |
+
|
| 480 |
+
# Create output directory
|
| 481 |
+
output_dir = Path("outputs/audio")
|
| 482 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 483 |
+
|
| 484 |
+
try:
|
| 485 |
+
if generate_segments and subtitle_segments:
|
| 486 |
+
# Generate individual audio files for each subtitle segment
|
| 487 |
+
result_segments = []
|
| 488 |
+
total_duration = 0.0
|
| 489 |
+
|
| 490 |
+
for idx, segment in enumerate(subtitle_segments, 1):
|
| 491 |
+
segment_text = segment["dialogue"]
|
| 492 |
+
start_time = segment["start_time"]
|
| 493 |
+
end_time = segment["end_time"]
|
| 494 |
+
total_duration = max(total_duration, end_time)
|
| 495 |
+
|
| 496 |
+
# Generate audio for this segment
|
| 497 |
+
segment_path = str(output_dir / f"segment_{idx}.mp3")
|
| 498 |
+
tts = gTTS(text=segment_text, lang=language, slow=slow, tld=tld)
|
| 499 |
+
tts.save(segment_path)
|
| 500 |
+
|
| 501 |
+
result_segments.append(
|
| 502 |
+
{
|
| 503 |
+
"segment_id": idx,
|
| 504 |
+
"start_time": start_time,
|
| 505 |
+
"end_time": end_time,
|
| 506 |
+
"duration": end_time - start_time,
|
| 507 |
+
"dialogue": segment_text,
|
| 508 |
+
"audio_file": segment_path,
|
| 509 |
+
}
|
| 510 |
+
)
|
| 511 |
+
|
| 512 |
+
# Return JSON with segment information
|
| 513 |
+
result = {
|
| 514 |
+
"segments": result_segments,
|
| 515 |
+
"total_duration": total_duration,
|
| 516 |
+
"voice": voice,
|
| 517 |
+
"language": language,
|
| 518 |
+
"speed": speed,
|
| 519 |
+
}
|
| 520 |
+
return json.dumps(result, indent=2)
|
| 521 |
+
else:
|
| 522 |
+
# Generate single combined audio file
|
| 523 |
+
output_path = str(output_dir / "generated_speech.mp3")
|
| 524 |
+
tts = gTTS(text=final_text, lang=language, slow=slow, tld=tld)
|
| 525 |
+
tts.save(output_path)
|
| 526 |
+
|
| 527 |
+
# Return the path so Gradio can load the audio file
|
| 528 |
+
return output_path
|
| 529 |
+
|
| 530 |
+
except Exception as e:
|
| 531 |
+
raise RuntimeError(f"Failed to generate audio: {str(e)}")
|
src/app/tools/thumbnail_generator.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import tempfile
|
| 3 |
+
import time
|
| 4 |
+
from typing import Optional
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
import google.genai as genai
|
| 7 |
+
from PIL import Image
|
| 8 |
+
from io import BytesIO
|
| 9 |
+
import mimetypes
|
| 10 |
+
|
| 11 |
+
# Load environment variables
|
| 12 |
+
load_dotenv()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def thumbnail_generator(
|
| 16 |
+
image_input,
|
| 17 |
+
summary: str,
|
| 18 |
+
output_path: Optional[str] = None,
|
| 19 |
+
) -> str:
|
| 20 |
+
"""
|
| 21 |
+
Generate a highly engaging and funny thumbnail image for a TikTok video/social media post.
|
| 22 |
+
Uses Gemini AI to generate the complete thumbnail image directly, using the input image as
|
| 23 |
+
a background and adding strategically placed text overlays and humorous stickers/emojis.
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
image_input: Image file path (str) or tuple from Gradio - used as the background image
|
| 27 |
+
summary: Text summary of the video content (used to generate appropriate text and stickers)
|
| 28 |
+
output_path: Optional path where the thumbnail should be saved.
|
| 29 |
+
If not provided, saves to a temporary file.
|
| 30 |
+
|
| 31 |
+
Returns:
|
| 32 |
+
str: Path to the generated thumbnail image (PNG format)
|
| 33 |
+
|
| 34 |
+
Raises:
|
| 35 |
+
ValueError: If image file not found or GOOGLE_API_KEY not set
|
| 36 |
+
Exception: If thumbnail generation fails
|
| 37 |
+
"""
|
| 38 |
+
try:
|
| 39 |
+
# Handle Gradio image input format (can be tuple or string)
|
| 40 |
+
if isinstance(image_input, tuple):
|
| 41 |
+
image_path = image_input[0]
|
| 42 |
+
elif isinstance(image_input, str):
|
| 43 |
+
image_path = image_input
|
| 44 |
+
else:
|
| 45 |
+
raise ValueError("Invalid image input format")
|
| 46 |
+
|
| 47 |
+
# Validate image file exists
|
| 48 |
+
if not image_path or not os.path.exists(image_path):
|
| 49 |
+
raise ValueError(f"Image file not found: {image_path}")
|
| 50 |
+
|
| 51 |
+
# Get API key from environment
|
| 52 |
+
api_key = os.getenv("GOOGLE_API_KEY")
|
| 53 |
+
if not api_key:
|
| 54 |
+
raise ValueError(
|
| 55 |
+
"GOOGLE_API_KEY environment variable is not set. "
|
| 56 |
+
"Please set it in your .env file or environment."
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
# Initialize Gemini client
|
| 60 |
+
client = genai.Client(api_key=api_key)
|
| 61 |
+
|
| 62 |
+
# Read image file as bytes
|
| 63 |
+
with open(image_path, "rb") as f:
|
| 64 |
+
image_data = f.read()
|
| 65 |
+
|
| 66 |
+
# Determine MIME type
|
| 67 |
+
mime_type, _ = mimetypes.guess_type(image_path)
|
| 68 |
+
if not mime_type or not mime_type.startswith("image/"):
|
| 69 |
+
# Default to png if cannot determine
|
| 70 |
+
mime_type = "image/png"
|
| 71 |
+
|
| 72 |
+
# Create comprehensive prompt for thumbnail generation
|
| 73 |
+
prompt = f"""Generate a highly engaging and funny thumbnail image for a TikTok video/social media post.
|
| 74 |
+
|
| 75 |
+
Use the provided background image as the foundation, and create a complete thumbnail that includes:
|
| 76 |
+
|
| 77 |
+
BACKGROUND IMAGE: Use the provided image as the base. This is a high-quality, captivating photograph that should fill the entire thumbnail area.
|
| 78 |
+
|
| 79 |
+
TEXT OVERLAY: Add prominent, dramatic text that is:
|
| 80 |
+
- Catchy, attention-grabbing, and creates curiosity based on this video summary: "{summary}"
|
| 81 |
+
- Keep it short (3-8 words max) and impactful
|
| 82 |
+
- Use bright, contrasting colors (neon yellow, electric green, vibrant orange) with a dark outline for maximum readability
|
| 83 |
+
- Position the text in an empty or less busy area (upper left, upper right, or bottom) so it doesn't obscure key subjects
|
| 84 |
+
- Make the text bold and large enough to be easily readable
|
| 85 |
+
|
| 86 |
+
STICKER/EMOJI: Add a large, expressive, high-contrast sticker or emoji that enhances the comedic effect:
|
| 87 |
+
- Options: 🚨 (siren/emergency), 😰 (sweating face), ☠️ (skull), 😱 (screaming), 🔥 (fire), ⚠️ (warning), 💥 (explosion)
|
| 88 |
+
- Position it near the main subject or focal point, as if it's an extension of the dramatic reaction
|
| 89 |
+
- Make it large and prominent without covering the main subject entirely
|
| 90 |
+
|
| 91 |
+
OVERALL DESIGN:
|
| 92 |
+
- Vibe: absurd, dramatic, high-energy, slightly chaotic
|
| 93 |
+
- Colors: vibrant and eye-catching
|
| 94 |
+
- Style: professional yet intentionally over-the-top
|
| 95 |
+
- Integration: all overlays should be seamlessly integrated for comedic effect
|
| 96 |
+
- The final image should be engaging, funny, and make viewers curious to watch the video
|
| 97 |
+
|
| 98 |
+
Generate the complete thumbnail image with all these elements integrated."""
|
| 99 |
+
|
| 100 |
+
# Create image blob
|
| 101 |
+
image_blob = genai.types.Blob(data=image_data, mime_type=mime_type)
|
| 102 |
+
image_part = genai.types.Part(inline_data=image_blob)
|
| 103 |
+
|
| 104 |
+
# Use Gemini to generate the thumbnail image directly
|
| 105 |
+
# Use gemini-2.5-flash-image model which supports image generation
|
| 106 |
+
response = None
|
| 107 |
+
last_error = None
|
| 108 |
+
|
| 109 |
+
# Try with response_modalities first
|
| 110 |
+
try:
|
| 111 |
+
response = client.models.generate_content(
|
| 112 |
+
model="gemini-2.5-flash-image",
|
| 113 |
+
contents=[prompt, image_part],
|
| 114 |
+
config={
|
| 115 |
+
"response_modalities": ["IMAGE"],
|
| 116 |
+
},
|
| 117 |
+
)
|
| 118 |
+
except (AttributeError, TypeError) as e:
|
| 119 |
+
# If response_modalities attribute error, try without config
|
| 120 |
+
last_error = e
|
| 121 |
+
try:
|
| 122 |
+
response = client.models.generate_content(
|
| 123 |
+
model="gemini-2.5-flash-image",
|
| 124 |
+
contents=[prompt, image_part],
|
| 125 |
+
)
|
| 126 |
+
except Exception as e2:
|
| 127 |
+
raise Exception(
|
| 128 |
+
f"Failed to generate image with Gemini (response_modalities error: {str(e)}): {str(e2)}"
|
| 129 |
+
)
|
| 130 |
+
except Exception as e:
|
| 131 |
+
# For other errors, also try without response_modalities
|
| 132 |
+
last_error = e
|
| 133 |
+
try:
|
| 134 |
+
response = client.models.generate_content(
|
| 135 |
+
model="gemini-2.5-flash-image",
|
| 136 |
+
contents=[prompt, image_part],
|
| 137 |
+
)
|
| 138 |
+
except Exception as e2:
|
| 139 |
+
raise Exception(
|
| 140 |
+
f"Failed to generate image with Gemini: {str(e)} (fallback also failed: {str(e2)})"
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
# Check if response is None
|
| 144 |
+
if response is None:
|
| 145 |
+
raise Exception("Gemini API returned None response")
|
| 146 |
+
|
| 147 |
+
# Extract the generated image from the response
|
| 148 |
+
generated_image = None
|
| 149 |
+
|
| 150 |
+
# Check if response has image data
|
| 151 |
+
if hasattr(response, "candidates") and response.candidates:
|
| 152 |
+
for candidate in response.candidates:
|
| 153 |
+
if hasattr(candidate, "content") and candidate.content:
|
| 154 |
+
if hasattr(candidate.content, "parts"):
|
| 155 |
+
for part in candidate.content.parts:
|
| 156 |
+
# Check for inline_data (image data)
|
| 157 |
+
if hasattr(part, "inline_data") and part.inline_data:
|
| 158 |
+
generated_image = part.inline_data.data
|
| 159 |
+
break
|
| 160 |
+
# Check for blob (alternative format)
|
| 161 |
+
elif hasattr(part, "blob") and part.blob:
|
| 162 |
+
generated_image = part.blob.data
|
| 163 |
+
break
|
| 164 |
+
|
| 165 |
+
# If no image found in candidates, try alternative response structure
|
| 166 |
+
if generated_image is None:
|
| 167 |
+
# Try to get image from response directly
|
| 168 |
+
if hasattr(response, "parts"):
|
| 169 |
+
for part in response.parts:
|
| 170 |
+
if hasattr(part, "inline_data") and part.inline_data:
|
| 171 |
+
generated_image = part.inline_data.data
|
| 172 |
+
break
|
| 173 |
+
elif hasattr(part, "blob") and part.blob:
|
| 174 |
+
generated_image = part.blob.data
|
| 175 |
+
break
|
| 176 |
+
|
| 177 |
+
# If still no image, the model might have returned text describing the image
|
| 178 |
+
# In that case, we'll need to handle it differently or raise an error
|
| 179 |
+
if generated_image is None:
|
| 180 |
+
# Check if response has text (might indicate the model didn't generate an image)
|
| 181 |
+
if hasattr(response, "text") and response.text:
|
| 182 |
+
raise Exception(
|
| 183 |
+
f"Gemini returned text instead of image. This model may not support image generation. "
|
| 184 |
+
f"Response: {response.text[:200]}"
|
| 185 |
+
)
|
| 186 |
+
else:
|
| 187 |
+
raise Exception(
|
| 188 |
+
"Failed to extract generated image from Gemini response"
|
| 189 |
+
)
|
| 190 |
+
|
| 191 |
+
# Convert image data to PIL Image
|
| 192 |
+
thumbnail = Image.open(BytesIO(generated_image)).convert("RGB")
|
| 193 |
+
|
| 194 |
+
# Determine output path
|
| 195 |
+
if output_path is None:
|
| 196 |
+
temp_dir = tempfile.gettempdir()
|
| 197 |
+
timestamp = int(time.time())
|
| 198 |
+
output_path = os.path.join(temp_dir, f"thumbnail_{timestamp}.png")
|
| 199 |
+
|
| 200 |
+
# Ensure output directory exists
|
| 201 |
+
output_dir = os.path.dirname(output_path)
|
| 202 |
+
if output_dir and not os.path.exists(output_dir):
|
| 203 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 204 |
+
|
| 205 |
+
# Save thumbnail
|
| 206 |
+
thumbnail.save(output_path, "PNG", quality=95)
|
| 207 |
+
|
| 208 |
+
# Return absolute path
|
| 209 |
+
return os.path.abspath(output_path)
|
| 210 |
+
|
| 211 |
+
except Exception as e:
|
| 212 |
+
raise Exception(f"Error generating thumbnail: {str(e)}")
|
src/app/tools/video_clipper.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import tempfile
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from moviepy import VideoFileClip
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def video_clipper(
|
| 8 |
+
video_input, start_time: float, end_time: float, output_path: str = None
|
| 9 |
+
) -> str:
|
| 10 |
+
"""
|
| 11 |
+
Extract a specific segment from a video file based on start and end times.
|
| 12 |
+
|
| 13 |
+
Args:
|
| 14 |
+
video_input: Video file path (str) or tuple (video_path, subtitle_path) from Gradio
|
| 15 |
+
start_time (float): Start time in seconds (0-based)
|
| 16 |
+
end_time (float): End time in seconds (must be > start_time)
|
| 17 |
+
output_path (str, optional): Path where the clipped video should be saved.
|
| 18 |
+
If not provided, saves to a temporary file.
|
| 19 |
+
|
| 20 |
+
Returns:
|
| 21 |
+
str: Path to the clipped video file
|
| 22 |
+
"""
|
| 23 |
+
try:
|
| 24 |
+
# Handle Gradio video input format (can be tuple or string)
|
| 25 |
+
if isinstance(video_input, tuple):
|
| 26 |
+
video_path = video_input[0]
|
| 27 |
+
elif isinstance(video_input, str):
|
| 28 |
+
video_path = video_input
|
| 29 |
+
else:
|
| 30 |
+
raise ValueError("Invalid video input format. Expected string or tuple.")
|
| 31 |
+
|
| 32 |
+
# Validate video file exists
|
| 33 |
+
if not video_path or not os.path.exists(video_path):
|
| 34 |
+
raise FileNotFoundError(f"Video file not found: {video_path}")
|
| 35 |
+
|
| 36 |
+
# Validate time parameters
|
| 37 |
+
if start_time < 0:
|
| 38 |
+
raise ValueError("Start time must be >= 0")
|
| 39 |
+
|
| 40 |
+
if end_time <= start_time:
|
| 41 |
+
raise ValueError("End time must be greater than start time")
|
| 42 |
+
|
| 43 |
+
# Load the video file
|
| 44 |
+
video = VideoFileClip(video_path)
|
| 45 |
+
|
| 46 |
+
# Validate time range against video duration
|
| 47 |
+
video_duration = video.duration
|
| 48 |
+
if start_time >= video_duration:
|
| 49 |
+
video.close()
|
| 50 |
+
raise ValueError(
|
| 51 |
+
f"Start time ({start_time}s) exceeds video duration ({video_duration:.2f}s)"
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
# Clamp end_time to video duration if necessary
|
| 55 |
+
if end_time > video_duration:
|
| 56 |
+
end_time = video_duration
|
| 57 |
+
|
| 58 |
+
# Extract the segment (using subclipped for MoviePy 2.1.2+)
|
| 59 |
+
clipped_video = video.subclipped(start_time, end_time)
|
| 60 |
+
|
| 61 |
+
# Determine output path
|
| 62 |
+
if output_path is None:
|
| 63 |
+
# Create a temporary file with appropriate extension
|
| 64 |
+
video_ext = Path(video_path).suffix or ".mp4"
|
| 65 |
+
temp_dir = tempfile.gettempdir()
|
| 66 |
+
output_path = os.path.join(
|
| 67 |
+
temp_dir,
|
| 68 |
+
f"clipped_{os.path.basename(video_path)}_{start_time}_{end_time}{video_ext}",
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
# Ensure output directory exists
|
| 72 |
+
output_dir = os.path.dirname(output_path)
|
| 73 |
+
if output_dir and not os.path.exists(output_dir):
|
| 74 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 75 |
+
|
| 76 |
+
# Write the clipped video
|
| 77 |
+
clipped_video.write_videofile(
|
| 78 |
+
output_path,
|
| 79 |
+
codec="libx264",
|
| 80 |
+
audio_codec="aac",
|
| 81 |
+
temp_audiofile=tempfile.mktemp(suffix=".m4a"),
|
| 82 |
+
remove_temp=True,
|
| 83 |
+
logger=None,
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
# Clean up
|
| 87 |
+
clipped_video.close()
|
| 88 |
+
video.close()
|
| 89 |
+
|
| 90 |
+
# Return absolute path
|
| 91 |
+
return os.path.abspath(output_path)
|
| 92 |
+
|
| 93 |
+
except Exception as e:
|
| 94 |
+
# Clean up video object if it exists
|
| 95 |
+
try:
|
| 96 |
+
if "video" in locals() and video is not None:
|
| 97 |
+
video.close()
|
| 98 |
+
if "clipped_video" in locals() and clipped_video is not None:
|
| 99 |
+
clipped_video.close()
|
| 100 |
+
except:
|
| 101 |
+
pass
|
| 102 |
+
raise Exception(f"Error clipping video: {str(e)}")
|
src/app/tools/video_composer.py
ADDED
|
@@ -0,0 +1,495 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import tempfile
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import List, Optional, Union, TypedDict, Literal, Tuple
|
| 6 |
+
from moviepy import (
|
| 7 |
+
VideoFileClip,
|
| 8 |
+
CompositeVideoClip,
|
| 9 |
+
AudioFileClip,
|
| 10 |
+
concatenate_videoclips,
|
| 11 |
+
ImageClip,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
from moviepy.audio import concatenate_audioclips
|
| 16 |
+
except ImportError:
|
| 17 |
+
from moviepy import concatenate_audioclips
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Type definitions for script structure
|
| 21 |
+
TransitionType = Literal["cut", "fade", "crossfade"]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class Scene(TypedDict, total=False):
|
| 25 |
+
"""Scene definition in the video composition script."""
|
| 26 |
+
|
| 27 |
+
scene_id: int
|
| 28 |
+
source_video: Union[
|
| 29 |
+
int, str
|
| 30 |
+
] # Index (int) or filename (str) referencing video_clips
|
| 31 |
+
start_time: float
|
| 32 |
+
end_time: Optional[float]
|
| 33 |
+
duration: Optional[float]
|
| 34 |
+
transition_in: TransitionType
|
| 35 |
+
transition_out: TransitionType
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class Music(TypedDict, total=False):
|
| 39 |
+
"""Music configuration in the video composition script."""
|
| 40 |
+
|
| 41 |
+
mood: str
|
| 42 |
+
bpm: int
|
| 43 |
+
sync_points: List[float]
|
| 44 |
+
volume: float
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class ScriptData(TypedDict, total=False):
|
| 48 |
+
"""Complete script structure for video composition."""
|
| 49 |
+
|
| 50 |
+
total_duration: float
|
| 51 |
+
scenes: List[Scene]
|
| 52 |
+
music: Music
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# Type aliases for Gradio file inputs
|
| 56 |
+
GradioVideoInput = Union[
|
| 57 |
+
str, # Single file path
|
| 58 |
+
Tuple[str, str], # Gradio video format: (video_path, subtitle_path)
|
| 59 |
+
List[Union[str, Tuple[str, str]]], # List of files
|
| 60 |
+
]
|
| 61 |
+
|
| 62 |
+
GradioMusicInput = Union[
|
| 63 |
+
str, # File path
|
| 64 |
+
Tuple[str, ...], # Gradio file format: (file_path, ...)
|
| 65 |
+
]
|
| 66 |
+
|
| 67 |
+
GradioImageInput = Union[
|
| 68 |
+
str, # File path
|
| 69 |
+
Tuple[str, ...], # Gradio file format: (file_path, ...)
|
| 70 |
+
]
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def video_composer(
|
| 74 |
+
script: Union[str, ScriptData],
|
| 75 |
+
video_clips: GradioVideoInput,
|
| 76 |
+
music_path: Optional[GradioMusicInput] = None,
|
| 77 |
+
thumbnail_image: Optional[GradioImageInput] = None,
|
| 78 |
+
output_path: Optional[str] = None,
|
| 79 |
+
) -> str:
|
| 80 |
+
"""
|
| 81 |
+
Combine video clips, add music, and apply transitions according to a script.
|
| 82 |
+
Creates a final composed video from multiple video segments.
|
| 83 |
+
|
| 84 |
+
Args:
|
| 85 |
+
script: Script JSON string or dict containing scene information with transitions.
|
| 86 |
+
Expected format:
|
| 87 |
+
{
|
| 88 |
+
"total_duration": 30.0,
|
| 89 |
+
"scenes": [
|
| 90 |
+
{
|
| 91 |
+
"scene_id": 1,
|
| 92 |
+
"source_video": 0, # Index into video_clips list, or filename
|
| 93 |
+
"start_time": 5.2,
|
| 94 |
+
"end_time": 8.5,
|
| 95 |
+
"duration": 3.3,
|
| 96 |
+
"transition_in": "fade",
|
| 97 |
+
"transition_out": "crossfade"
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"scene_id": 2,
|
| 101 |
+
"source_video": 0, # Same video can be used in multiple scenes
|
| 102 |
+
"start_time": 10.0,
|
| 103 |
+
"end_time": 15.0,
|
| 104 |
+
"duration": 5.0,
|
| 105 |
+
"transition_in": "crossfade",
|
| 106 |
+
"transition_out": "fade"
|
| 107 |
+
},
|
| 108 |
+
...
|
| 109 |
+
],
|
| 110 |
+
"music": {
|
| 111 |
+
"mood": "energetic",
|
| 112 |
+
"bpm": 120,
|
| 113 |
+
"sync_points": [0.0, 7.5, 15.0, 22.5, 30.0]
|
| 114 |
+
}
|
| 115 |
+
}
|
| 116 |
+
Note: source_video can be:
|
| 117 |
+
- An integer index (0-based) into the video_clips list
|
| 118 |
+
- A filename (string) that matches the basename of one of the videos in video_clips
|
| 119 |
+
The same source_video can be used in multiple scenes with different
|
| 120 |
+
time ranges. Each scene will extract its own clip from the referenced video.
|
| 121 |
+
video_clips: Required list of source video file paths or single path.
|
| 122 |
+
Each scene's source_video references a video from this list.
|
| 123 |
+
Can be a list, single string (from Gradio File component).
|
| 124 |
+
music_path: Optional path to background music file. If provided, will be added
|
| 125 |
+
to the final video. Can be a string path or tuple (from Gradio File component).
|
| 126 |
+
thumbnail_image: Optional path to thumbnail image file. If provided, will be overlaid
|
| 127 |
+
on the first frame of the video. Can be a string path or tuple (from Gradio File component).
|
| 128 |
+
output_path: Optional path where the composed video should be saved.
|
| 129 |
+
If not provided, saves to a temporary file.
|
| 130 |
+
|
| 131 |
+
Returns:
|
| 132 |
+
str: Path to the final composed video file
|
| 133 |
+
"""
|
| 134 |
+
try:
|
| 135 |
+
# Handle Gradio file input formats
|
| 136 |
+
if isinstance(video_clips, tuple):
|
| 137 |
+
# Single tuple from Gradio: (video_path, subtitle_path)
|
| 138 |
+
video_clips = [video_clips[0]] if video_clips else []
|
| 139 |
+
elif isinstance(video_clips, str):
|
| 140 |
+
# Single file path
|
| 141 |
+
video_clips = [video_clips]
|
| 142 |
+
elif isinstance(video_clips, list):
|
| 143 |
+
# List of file paths (may contain tuples from Gradio)
|
| 144 |
+
processed_clips = []
|
| 145 |
+
for clip in video_clips:
|
| 146 |
+
if isinstance(clip, tuple):
|
| 147 |
+
# Gradio video format: (video_path, subtitle_path)
|
| 148 |
+
processed_clips.append(clip[0])
|
| 149 |
+
elif isinstance(clip, str):
|
| 150 |
+
processed_clips.append(clip)
|
| 151 |
+
video_clips = processed_clips if processed_clips else []
|
| 152 |
+
else:
|
| 153 |
+
raise ValueError(
|
| 154 |
+
"video_clips must be a string, tuple, or list of strings/tuples"
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
# Validate video_clips is not empty
|
| 158 |
+
if not video_clips:
|
| 159 |
+
raise ValueError("video_clips is required and cannot be empty")
|
| 160 |
+
|
| 161 |
+
# Validate all video files exist
|
| 162 |
+
for clip_path in video_clips:
|
| 163 |
+
if not os.path.exists(clip_path):
|
| 164 |
+
raise FileNotFoundError(f"Video clip not found: {clip_path}")
|
| 165 |
+
|
| 166 |
+
# Handle Gradio music file input format
|
| 167 |
+
if music_path is not None:
|
| 168 |
+
if isinstance(music_path, tuple):
|
| 169 |
+
# Gradio file format: (file_path, ...)
|
| 170 |
+
music_path = music_path[0] if music_path else None
|
| 171 |
+
elif not isinstance(music_path, str):
|
| 172 |
+
music_path = None
|
| 173 |
+
|
| 174 |
+
# Handle Gradio thumbnail image input format
|
| 175 |
+
thumbnail_path = None
|
| 176 |
+
if thumbnail_image is not None:
|
| 177 |
+
if isinstance(thumbnail_image, tuple):
|
| 178 |
+
# Gradio file format: (file_path, ...)
|
| 179 |
+
thumbnail_path = thumbnail_image[0] if thumbnail_image else None
|
| 180 |
+
elif isinstance(thumbnail_image, str):
|
| 181 |
+
thumbnail_path = thumbnail_image
|
| 182 |
+
else:
|
| 183 |
+
thumbnail_path = None
|
| 184 |
+
|
| 185 |
+
# Validate thumbnail image exists if provided
|
| 186 |
+
if thumbnail_path and not os.path.exists(thumbnail_path):
|
| 187 |
+
raise FileNotFoundError(f"Thumbnail image not found: {thumbnail_path}")
|
| 188 |
+
|
| 189 |
+
# Parse script if it's a string
|
| 190 |
+
if isinstance(script, str):
|
| 191 |
+
try:
|
| 192 |
+
script_data: ScriptData = json.loads(script)
|
| 193 |
+
except json.JSONDecodeError:
|
| 194 |
+
raise ValueError("Invalid JSON format for script")
|
| 195 |
+
else:
|
| 196 |
+
script_data = script
|
| 197 |
+
|
| 198 |
+
# Validate script structure
|
| 199 |
+
if not isinstance(script_data, dict) or "scenes" not in script_data:
|
| 200 |
+
raise ValueError(
|
| 201 |
+
"Script must contain a 'scenes' key with scene information"
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
scenes: List[Scene] = script_data.get("scenes", [])
|
| 205 |
+
if not scenes:
|
| 206 |
+
raise ValueError("Script must contain at least one scene")
|
| 207 |
+
|
| 208 |
+
# Helper function to resolve source_video reference
|
| 209 |
+
def resolve_source_video(
|
| 210 |
+
source_video_ref: Union[int, str], video_clips_list: List[str]
|
| 211 |
+
) -> str:
|
| 212 |
+
"""Resolve source_video reference to actual video path.
|
| 213 |
+
|
| 214 |
+
Args:
|
| 215 |
+
source_video_ref: Can be an integer index or a filename string
|
| 216 |
+
video_clips_list: List of video file paths
|
| 217 |
+
|
| 218 |
+
Returns:
|
| 219 |
+
str: Path to the source video
|
| 220 |
+
"""
|
| 221 |
+
if isinstance(source_video_ref, int):
|
| 222 |
+
# Index-based reference
|
| 223 |
+
if source_video_ref < 0 or source_video_ref >= len(video_clips_list):
|
| 224 |
+
raise ValueError(
|
| 225 |
+
f"source_video index {source_video_ref} is out of range. "
|
| 226 |
+
f"Must be between 0 and {len(video_clips_list) - 1}"
|
| 227 |
+
)
|
| 228 |
+
return video_clips_list[source_video_ref]
|
| 229 |
+
elif isinstance(source_video_ref, str):
|
| 230 |
+
# Filename-based reference - match by basename
|
| 231 |
+
for clip_path in video_clips_list:
|
| 232 |
+
if os.path.basename(clip_path) == source_video_ref:
|
| 233 |
+
return clip_path
|
| 234 |
+
# Also try matching the full path
|
| 235 |
+
if clip_path == source_video_ref:
|
| 236 |
+
return clip_path
|
| 237 |
+
raise ValueError(
|
| 238 |
+
f"source_video '{source_video_ref}' not found in video_clips. "
|
| 239 |
+
f"Available videos: {[os.path.basename(v) for v in video_clips_list]}"
|
| 240 |
+
)
|
| 241 |
+
else:
|
| 242 |
+
raise ValueError(
|
| 243 |
+
f"source_video must be an integer index or filename string, "
|
| 244 |
+
f"got {type(source_video_ref).__name__}"
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
# Extract clips from source videos based on script
|
| 248 |
+
clip_paths = []
|
| 249 |
+
for scene in scenes:
|
| 250 |
+
source_video_ref = scene.get("source_video")
|
| 251 |
+
start_time = scene.get("start_time", 0.0)
|
| 252 |
+
end_time = scene.get("end_time")
|
| 253 |
+
duration = scene.get("duration")
|
| 254 |
+
|
| 255 |
+
if source_video_ref is None:
|
| 256 |
+
raise ValueError(
|
| 257 |
+
f"Scene {scene.get('scene_id', 'unknown')} missing 'source_video'"
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
# Resolve source_video reference to actual video path
|
| 261 |
+
source_video = resolve_source_video(source_video_ref, video_clips)
|
| 262 |
+
|
| 263 |
+
# Calculate end_time from duration if not provided
|
| 264 |
+
if end_time is None and duration is not None:
|
| 265 |
+
end_time = start_time + duration
|
| 266 |
+
elif end_time is None:
|
| 267 |
+
# Load video to get duration
|
| 268 |
+
temp_video = VideoFileClip(source_video)
|
| 269 |
+
end_time = temp_video.duration
|
| 270 |
+
temp_video.close()
|
| 271 |
+
|
| 272 |
+
# Clip the video segment
|
| 273 |
+
from .video_clipper import video_clipper
|
| 274 |
+
|
| 275 |
+
clipped_path = video_clipper(source_video, start_time, end_time)
|
| 276 |
+
clip_paths.append(clipped_path)
|
| 277 |
+
|
| 278 |
+
# Load all video clips
|
| 279 |
+
video_clips_loaded = []
|
| 280 |
+
for clip_path in clip_paths:
|
| 281 |
+
if not os.path.exists(clip_path):
|
| 282 |
+
raise FileNotFoundError(f"Video clip not found: {clip_path}")
|
| 283 |
+
video_clips_loaded.append(VideoFileClip(clip_path))
|
| 284 |
+
|
| 285 |
+
# Apply transitions and compose clips
|
| 286 |
+
transition_duration = 0.5 # Default transition duration in seconds
|
| 287 |
+
has_crossfade = False
|
| 288 |
+
|
| 289 |
+
# Check if any scene uses crossfade
|
| 290 |
+
for scene in scenes:
|
| 291 |
+
if (
|
| 292 |
+
scene.get("transition_in") == "crossfade"
|
| 293 |
+
or scene.get("transition_out") == "crossfade"
|
| 294 |
+
):
|
| 295 |
+
has_crossfade = True
|
| 296 |
+
break
|
| 297 |
+
|
| 298 |
+
processed_clips = []
|
| 299 |
+
|
| 300 |
+
for i, (clip, scene) in enumerate(zip(video_clips_loaded, scenes)):
|
| 301 |
+
transition_in = scene.get("transition_in", "cut")
|
| 302 |
+
transition_out = scene.get("transition_out", "cut")
|
| 303 |
+
|
| 304 |
+
# Apply transition in (except for first clip)
|
| 305 |
+
if i > 0 and transition_in != "cut":
|
| 306 |
+
if transition_in in ("fade", "crossfade"):
|
| 307 |
+
# Try fadein if available, otherwise skip transition
|
| 308 |
+
if hasattr(clip, "fadein"):
|
| 309 |
+
clip = clip.fadein(transition_duration)
|
| 310 |
+
# If fadein not available, continue without transition
|
| 311 |
+
|
| 312 |
+
# Apply transition out (except for last clip)
|
| 313 |
+
if i < len(video_clips_loaded) - 1 and transition_out != "cut":
|
| 314 |
+
if transition_out in ("fade", "crossfade"):
|
| 315 |
+
# Try fadeout if available, otherwise skip transition
|
| 316 |
+
if hasattr(clip, "fadeout"):
|
| 317 |
+
clip = clip.fadeout(transition_duration)
|
| 318 |
+
# If fadeout not available, continue without transition
|
| 319 |
+
|
| 320 |
+
processed_clips.append(clip)
|
| 321 |
+
|
| 322 |
+
# Compose clips based on transition type
|
| 323 |
+
if has_crossfade:
|
| 324 |
+
# Use CompositeVideoClip for crossfades (overlapping clips)
|
| 325 |
+
final_clips = []
|
| 326 |
+
current_time = 0.0
|
| 327 |
+
|
| 328 |
+
for i, (clip, scene) in enumerate(zip(processed_clips, scenes)):
|
| 329 |
+
transition_in = scene.get("transition_in", "cut")
|
| 330 |
+
|
| 331 |
+
if i > 0 and transition_in == "crossfade":
|
| 332 |
+
# Overlap for crossfade
|
| 333 |
+
clip_start = current_time - transition_duration
|
| 334 |
+
else:
|
| 335 |
+
clip_start = current_time
|
| 336 |
+
|
| 337 |
+
clip = clip.with_start(clip_start)
|
| 338 |
+
final_clips.append(clip)
|
| 339 |
+
current_time = clip_start + clip.duration
|
| 340 |
+
|
| 341 |
+
final_video = CompositeVideoClip(final_clips)
|
| 342 |
+
else:
|
| 343 |
+
# Use concatenate_videoclips for simple sequential composition
|
| 344 |
+
final_video = concatenate_videoclips(processed_clips, method="compose")
|
| 345 |
+
|
| 346 |
+
# Add thumbnail image to first frame if provided
|
| 347 |
+
if thumbnail_path and os.path.exists(thumbnail_path):
|
| 348 |
+
try:
|
| 349 |
+
from PIL import Image as PILImage
|
| 350 |
+
|
| 351 |
+
# Get video dimensions
|
| 352 |
+
video_width = final_video.w
|
| 353 |
+
video_height = final_video.h
|
| 354 |
+
|
| 355 |
+
# Load and resize thumbnail image using PIL
|
| 356 |
+
pil_image = PILImage.open(thumbnail_path)
|
| 357 |
+
pil_image = pil_image.resize(
|
| 358 |
+
(video_width, video_height), PILImage.Resampling.LANCZOS
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
# Save resized image to temporary file
|
| 362 |
+
import tempfile
|
| 363 |
+
|
| 364 |
+
temp_thumbnail = tempfile.NamedTemporaryFile(
|
| 365 |
+
suffix=".png", delete=False
|
| 366 |
+
)
|
| 367 |
+
pil_image.save(temp_thumbnail.name, "PNG")
|
| 368 |
+
temp_thumbnail.close()
|
| 369 |
+
|
| 370 |
+
# Load the resized thumbnail as ImageClip
|
| 371 |
+
thumbnail_clip = ImageClip(temp_thumbnail.name)
|
| 372 |
+
|
| 373 |
+
# Set duration to match one frame duration (very short)
|
| 374 |
+
# This ensures it only appears on the first frame
|
| 375 |
+
# MoviePy 2.x uses with_duration instead of set_duration
|
| 376 |
+
fps = final_video.fps if final_video.fps > 0 else 30.0
|
| 377 |
+
frame_duration = 1.0 / fps
|
| 378 |
+
thumbnail_clip = thumbnail_clip.with_duration(frame_duration)
|
| 379 |
+
|
| 380 |
+
# Position at the start (t=0) so it overlays the first frame
|
| 381 |
+
# MoviePy 2.x uses with_start instead of set_start
|
| 382 |
+
thumbnail_clip = thumbnail_clip.with_start(0)
|
| 383 |
+
|
| 384 |
+
# Composite the thumbnail over the video
|
| 385 |
+
# The thumbnail will appear on top of the first frame
|
| 386 |
+
final_video = CompositeVideoClip([final_video, thumbnail_clip])
|
| 387 |
+
|
| 388 |
+
# Clean up temporary file after composition
|
| 389 |
+
try:
|
| 390 |
+
os.unlink(temp_thumbnail.name)
|
| 391 |
+
except:
|
| 392 |
+
pass
|
| 393 |
+
except Exception as e:
|
| 394 |
+
# If thumbnail overlay fails, continue without thumbnail
|
| 395 |
+
print(f"Warning: Could not add thumbnail image: {str(e)}")
|
| 396 |
+
|
| 397 |
+
# Add music if provided
|
| 398 |
+
if music_path and os.path.exists(music_path):
|
| 399 |
+
try:
|
| 400 |
+
audio_clip = AudioFileClip(music_path)
|
| 401 |
+
video_duration = final_video.duration
|
| 402 |
+
|
| 403 |
+
# Trim or loop music to match video duration
|
| 404 |
+
if audio_clip.duration > video_duration:
|
| 405 |
+
audio_clip = audio_clip.subclipped(0, video_duration)
|
| 406 |
+
elif audio_clip.duration < video_duration:
|
| 407 |
+
# Loop the music if it's shorter than video
|
| 408 |
+
loops_needed = int(video_duration / audio_clip.duration) + 1
|
| 409 |
+
audio_clips = [audio_clip] * loops_needed
|
| 410 |
+
audio_clip = concatenate_audioclips(audio_clips).subclipped(
|
| 411 |
+
0, video_duration
|
| 412 |
+
)
|
| 413 |
+
|
| 414 |
+
# Set audio volume (can be adjusted based on script music settings)
|
| 415 |
+
audio_volume = script_data.get("music", {}).get("volume", 0.5)
|
| 416 |
+
|
| 417 |
+
# Apply volume adjustment using MoviePy 2.x API
|
| 418 |
+
# MoviePy 2.1.2+ uses with_volume_scaled instead of volumex
|
| 419 |
+
audio_clip = audio_clip.with_volume_scaled(audio_volume)
|
| 420 |
+
|
| 421 |
+
# Combine video with audio
|
| 422 |
+
# MoviePy 2.x uses with_audio instead of set_audio
|
| 423 |
+
final_video = final_video.with_audio(audio_clip)
|
| 424 |
+
except Exception as e:
|
| 425 |
+
# If music loading fails, continue without music
|
| 426 |
+
print(f"Warning: Could not add music: {str(e)}")
|
| 427 |
+
|
| 428 |
+
# Determine output path
|
| 429 |
+
if output_path is None:
|
| 430 |
+
video_ext = ".mp4"
|
| 431 |
+
temp_dir = tempfile.gettempdir()
|
| 432 |
+
output_path = os.path.join(
|
| 433 |
+
temp_dir, f"composed_video_{os.getpid()}{video_ext}"
|
| 434 |
+
)
|
| 435 |
+
|
| 436 |
+
# Ensure output directory exists
|
| 437 |
+
output_dir = os.path.dirname(output_path)
|
| 438 |
+
if output_dir and not os.path.exists(output_dir):
|
| 439 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 440 |
+
|
| 441 |
+
# Write the final composed video
|
| 442 |
+
final_video.write_videofile(
|
| 443 |
+
output_path,
|
| 444 |
+
codec="libx264",
|
| 445 |
+
audio_codec="aac",
|
| 446 |
+
temp_audiofile=tempfile.mktemp(suffix=".m4a"),
|
| 447 |
+
remove_temp=True,
|
| 448 |
+
logger=None,
|
| 449 |
+
)
|
| 450 |
+
|
| 451 |
+
# Clean up
|
| 452 |
+
final_video.close()
|
| 453 |
+
for clip in video_clips_loaded:
|
| 454 |
+
clip.close()
|
| 455 |
+
if music_path and "audio_clip" in locals():
|
| 456 |
+
try:
|
| 457 |
+
audio_clip.close()
|
| 458 |
+
except:
|
| 459 |
+
pass
|
| 460 |
+
|
| 461 |
+
# Clean up temporary clipped files (always created from source videos)
|
| 462 |
+
for clip_path in clip_paths:
|
| 463 |
+
try:
|
| 464 |
+
if os.path.exists(clip_path) and "clipped_" in os.path.basename(
|
| 465 |
+
clip_path
|
| 466 |
+
):
|
| 467 |
+
os.remove(clip_path)
|
| 468 |
+
except:
|
| 469 |
+
pass
|
| 470 |
+
|
| 471 |
+
# Return absolute path
|
| 472 |
+
return os.path.abspath(output_path)
|
| 473 |
+
|
| 474 |
+
except Exception as e:
|
| 475 |
+
# Clean up video objects if they exist
|
| 476 |
+
try:
|
| 477 |
+
if "video_clips_loaded" in locals():
|
| 478 |
+
for clip in video_clips_loaded:
|
| 479 |
+
try:
|
| 480 |
+
clip.close()
|
| 481 |
+
except:
|
| 482 |
+
pass
|
| 483 |
+
if "final_video" in locals():
|
| 484 |
+
try:
|
| 485 |
+
final_video.close()
|
| 486 |
+
except:
|
| 487 |
+
pass
|
| 488 |
+
if "audio_clip" in locals():
|
| 489 |
+
try:
|
| 490 |
+
audio_clip.close()
|
| 491 |
+
except:
|
| 492 |
+
pass
|
| 493 |
+
except:
|
| 494 |
+
pass
|
| 495 |
+
raise Exception(f"Error composing video: {str(e)}")
|
src/app/tools/video_script_generator.py
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from typing import Optional, List, Union
|
| 5 |
+
import google.genai as genai
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _extract_and_parse_json(text: str) -> Optional[Union[dict, list]]:
|
| 9 |
+
"""
|
| 10 |
+
Extract and parse JSON from text that might contain extra content.
|
| 11 |
+
Handles cases where JSON is wrapped in markdown, has extra text, or multiple objects.
|
| 12 |
+
"""
|
| 13 |
+
if not text or not isinstance(text, str):
|
| 14 |
+
return None
|
| 15 |
+
|
| 16 |
+
text = text.strip()
|
| 17 |
+
|
| 18 |
+
# Try direct parsing first
|
| 19 |
+
try:
|
| 20 |
+
return json.loads(text)
|
| 21 |
+
except json.JSONDecodeError:
|
| 22 |
+
pass
|
| 23 |
+
|
| 24 |
+
# Try to extract JSON array by finding balanced brackets
|
| 25 |
+
bracket_count = 0
|
| 26 |
+
array_start = -1
|
| 27 |
+
for i, char in enumerate(text):
|
| 28 |
+
if char == "[":
|
| 29 |
+
if bracket_count == 0:
|
| 30 |
+
array_start = i
|
| 31 |
+
bracket_count += 1
|
| 32 |
+
elif char == "]":
|
| 33 |
+
bracket_count -= 1
|
| 34 |
+
if bracket_count == 0 and array_start >= 0:
|
| 35 |
+
array_str = text[array_start : i + 1]
|
| 36 |
+
try:
|
| 37 |
+
return json.loads(array_str)
|
| 38 |
+
except json.JSONDecodeError:
|
| 39 |
+
pass
|
| 40 |
+
array_start = -1
|
| 41 |
+
|
| 42 |
+
# Try to find multiple JSON objects and combine them into an array
|
| 43 |
+
# This handles cases where objects are concatenated: {}{}
|
| 44 |
+
objects = []
|
| 45 |
+
brace_count = 0
|
| 46 |
+
start_idx = -1
|
| 47 |
+
|
| 48 |
+
for i, char in enumerate(text):
|
| 49 |
+
if char == "{":
|
| 50 |
+
if brace_count == 0:
|
| 51 |
+
start_idx = i
|
| 52 |
+
brace_count += 1
|
| 53 |
+
elif char == "}":
|
| 54 |
+
brace_count -= 1
|
| 55 |
+
if brace_count == 0 and start_idx >= 0:
|
| 56 |
+
obj_str = text[start_idx : i + 1]
|
| 57 |
+
try:
|
| 58 |
+
obj = json.loads(obj_str)
|
| 59 |
+
objects.append(obj)
|
| 60 |
+
except json.JSONDecodeError:
|
| 61 |
+
pass
|
| 62 |
+
start_idx = -1
|
| 63 |
+
|
| 64 |
+
if objects:
|
| 65 |
+
# If we found multiple objects, return as list
|
| 66 |
+
# If only one, return it directly (will be wrapped in list by caller)
|
| 67 |
+
return objects if len(objects) > 1 else objects[0]
|
| 68 |
+
|
| 69 |
+
# Try to extract a single JSON object by finding balanced braces
|
| 70 |
+
brace_count = 0
|
| 71 |
+
obj_start = -1
|
| 72 |
+
for i, char in enumerate(text):
|
| 73 |
+
if char == "{":
|
| 74 |
+
if brace_count == 0:
|
| 75 |
+
obj_start = i
|
| 76 |
+
brace_count += 1
|
| 77 |
+
elif char == "}":
|
| 78 |
+
brace_count -= 1
|
| 79 |
+
if brace_count == 0 and obj_start >= 0:
|
| 80 |
+
obj_str = text[obj_start : i + 1]
|
| 81 |
+
try:
|
| 82 |
+
return json.loads(obj_str)
|
| 83 |
+
except json.JSONDecodeError:
|
| 84 |
+
pass
|
| 85 |
+
obj_start = -1
|
| 86 |
+
|
| 87 |
+
return None
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def video_script_generator(
|
| 91 |
+
video_summaries: Union[str, List[dict], List[str]],
|
| 92 |
+
user_description: Optional[str] = None,
|
| 93 |
+
target_duration: float = 30.0,
|
| 94 |
+
) -> str:
|
| 95 |
+
"""
|
| 96 |
+
Create a detailed script/storyboard for the final 30-second video.
|
| 97 |
+
Uses Google Gemini API to intelligently generate a script based on video summaries
|
| 98 |
+
and user requirements.
|
| 99 |
+
|
| 100 |
+
Args:
|
| 101 |
+
video_summaries: Video summaries from video_summarizer tool.
|
| 102 |
+
Can be:
|
| 103 |
+
- JSON string (single summary)
|
| 104 |
+
- List of dict objects (multiple summaries)
|
| 105 |
+
- List of JSON strings (multiple summaries)
|
| 106 |
+
user_description: Optional user description of desired mood/style/content
|
| 107 |
+
target_duration: Target duration in seconds (default: 30.0)
|
| 108 |
+
|
| 109 |
+
Returns:
|
| 110 |
+
str: JSON string containing detailed script with:
|
| 111 |
+
- Scene sequence with source video and timestamps
|
| 112 |
+
- Duration for each scene segment (must sum to ~target_duration seconds)
|
| 113 |
+
- Transition types between scenes (cut, fade, crossfade)
|
| 114 |
+
- Pacing and rhythm plan
|
| 115 |
+
- Music synchronization points (beat markers, mood changes)
|
| 116 |
+
- Overall narrative structure and flow
|
| 117 |
+
- Visual style recommendations
|
| 118 |
+
|
| 119 |
+
Example output format:
|
| 120 |
+
{
|
| 121 |
+
"total_duration": 30.0,
|
| 122 |
+
"scenes": [
|
| 123 |
+
{
|
| 124 |
+
"scene_id": 1,
|
| 125 |
+
"source_video": 0,
|
| 126 |
+
"start_time": 5.2,
|
| 127 |
+
"end_time": 8.5,
|
| 128 |
+
"duration": 3.3,
|
| 129 |
+
"description": "Opening shot of landscape",
|
| 130 |
+
"transition_in": "fade",
|
| 131 |
+
"transition_out": "crossfade"
|
| 132 |
+
},
|
| 133 |
+
...
|
| 134 |
+
],
|
| 135 |
+
"music": {
|
| 136 |
+
"mood": "energetic",
|
| 137 |
+
"bpm": 120,
|
| 138 |
+
"sync_points": [0.0, 7.5, 15.0, 22.5, 30.0],
|
| 139 |
+
"volume": 0.5
|
| 140 |
+
},
|
| 141 |
+
"pacing": "fast",
|
| 142 |
+
"narrative_structure": "hook -> build -> climax -> resolution",
|
| 143 |
+
"visual_style": "bright, colorful, dynamic"
|
| 144 |
+
}
|
| 145 |
+
"""
|
| 146 |
+
try:
|
| 147 |
+
# Parse video summaries input
|
| 148 |
+
summaries_list = []
|
| 149 |
+
if isinstance(video_summaries, str):
|
| 150 |
+
# JSON string - could be a single object or an array
|
| 151 |
+
# Use robust parsing to handle malformed JSON
|
| 152 |
+
parsed = _extract_and_parse_json(video_summaries)
|
| 153 |
+
|
| 154 |
+
if parsed is None:
|
| 155 |
+
raise ValueError(
|
| 156 |
+
f"Invalid JSON format for video_summaries. "
|
| 157 |
+
f"Could not parse: {video_summaries[:200]}..."
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
if isinstance(parsed, list):
|
| 161 |
+
# It's a JSON array
|
| 162 |
+
summaries_list = parsed
|
| 163 |
+
else:
|
| 164 |
+
# It's a single JSON object
|
| 165 |
+
summaries_list = [parsed]
|
| 166 |
+
elif isinstance(video_summaries, list):
|
| 167 |
+
# List of summaries
|
| 168 |
+
for summary in video_summaries:
|
| 169 |
+
if isinstance(summary, str):
|
| 170 |
+
# Use robust parsing for string summaries
|
| 171 |
+
parsed = _extract_and_parse_json(summary)
|
| 172 |
+
if parsed is None:
|
| 173 |
+
raise ValueError(
|
| 174 |
+
f"Invalid JSON format in video_summaries: {summary[:200]}..."
|
| 175 |
+
)
|
| 176 |
+
# If parsed is a list, extend; if it's a dict, append
|
| 177 |
+
if isinstance(parsed, list):
|
| 178 |
+
summaries_list.extend(parsed)
|
| 179 |
+
else:
|
| 180 |
+
summaries_list.append(parsed)
|
| 181 |
+
elif isinstance(summary, dict):
|
| 182 |
+
summaries_list.append(summary)
|
| 183 |
+
else:
|
| 184 |
+
raise ValueError(
|
| 185 |
+
f"Invalid summary type: {type(summary).__name__}. "
|
| 186 |
+
"Expected dict or JSON string."
|
| 187 |
+
)
|
| 188 |
+
else:
|
| 189 |
+
raise ValueError(
|
| 190 |
+
f"Invalid video_summaries type: {type(video_summaries).__name__}. "
|
| 191 |
+
"Expected str, list of dicts, or list of JSON strings."
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
if not summaries_list:
|
| 195 |
+
raise ValueError("No video summaries provided")
|
| 196 |
+
|
| 197 |
+
# Validate target_duration
|
| 198 |
+
if target_duration <= 0:
|
| 199 |
+
raise ValueError("target_duration must be greater than 0")
|
| 200 |
+
|
| 201 |
+
# Get API key
|
| 202 |
+
api_key = os.getenv("GOOGLE_API_KEY")
|
| 203 |
+
if not api_key:
|
| 204 |
+
# Fallback: create a simple script using first video
|
| 205 |
+
summary = summaries_list[0]
|
| 206 |
+
duration = summary.get("duration", target_duration)
|
| 207 |
+
clip_duration = min(duration, target_duration)
|
| 208 |
+
|
| 209 |
+
# Extract mood from summary
|
| 210 |
+
mood_tags = summary.get("mood_tags", ["energetic"])
|
| 211 |
+
mood = mood_tags[0] if mood_tags else "energetic"
|
| 212 |
+
|
| 213 |
+
fallback_script = {
|
| 214 |
+
"total_duration": clip_duration,
|
| 215 |
+
"scenes": [
|
| 216 |
+
{
|
| 217 |
+
"scene_id": 1,
|
| 218 |
+
"source_video": 0,
|
| 219 |
+
"start_time": 0.0,
|
| 220 |
+
"end_time": clip_duration,
|
| 221 |
+
"duration": clip_duration,
|
| 222 |
+
"description": summary.get("summary", "Video clip")[:100],
|
| 223 |
+
"transition_in": "fade",
|
| 224 |
+
"transition_out": "fade",
|
| 225 |
+
}
|
| 226 |
+
],
|
| 227 |
+
"music": {
|
| 228 |
+
"mood": mood,
|
| 229 |
+
"volume": 0.5,
|
| 230 |
+
},
|
| 231 |
+
"pacing": "moderate",
|
| 232 |
+
"narrative_structure": "single scene",
|
| 233 |
+
}
|
| 234 |
+
return json.dumps(fallback_script, indent=2)
|
| 235 |
+
|
| 236 |
+
# Initialize Gemini client
|
| 237 |
+
client = genai.Client(api_key=api_key)
|
| 238 |
+
|
| 239 |
+
# Build prompt for script generation
|
| 240 |
+
summaries_text = "\n\n".join(
|
| 241 |
+
[
|
| 242 |
+
f"Video {i+1}:\n{json.dumps(s, indent=2)}"
|
| 243 |
+
for i, s in enumerate(summaries_list)
|
| 244 |
+
]
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
user_desc_text = (
|
| 248 |
+
f"\n\nUser Description: {user_description}" if user_description else ""
|
| 249 |
+
)
|
| 250 |
+
|
| 251 |
+
prompt = f"""You are a professional video editor creating a {target_duration}-second short-form video.
|
| 252 |
+
|
| 253 |
+
Here are the video summaries:
|
| 254 |
+
{summaries_text}
|
| 255 |
+
{user_desc_text}
|
| 256 |
+
|
| 257 |
+
Create a detailed video composition script that:
|
| 258 |
+
1. Selects the most engaging and relevant scenes from the videos
|
| 259 |
+
2. Creates a coherent narrative flow with a clear structure (hook -> build -> climax -> resolution)
|
| 260 |
+
3. Uses appropriate transitions (cut, fade, or crossfade) between scenes
|
| 261 |
+
4. Ensures the total duration is approximately {target_duration} seconds (within ±2 seconds)
|
| 262 |
+
5. Distributes scenes evenly across the duration, considering pacing
|
| 263 |
+
6. Identifies music mood, BPM, and sync points for rhythm matching
|
| 264 |
+
7. Provides visual style recommendations based on the content
|
| 265 |
+
|
| 266 |
+
Return ONLY a valid JSON object with this exact structure:
|
| 267 |
+
{{
|
| 268 |
+
"total_duration": {target_duration},
|
| 269 |
+
"scenes": [
|
| 270 |
+
{{
|
| 271 |
+
"scene_id": 1,
|
| 272 |
+
"source_video": 0,
|
| 273 |
+
"start_time": 0.0,
|
| 274 |
+
"end_time": 5.0,
|
| 275 |
+
"duration": 5.0,
|
| 276 |
+
"description": "Brief description of what happens in this scene",
|
| 277 |
+
"transition_in": "fade",
|
| 278 |
+
"transition_out": "crossfade"
|
| 279 |
+
}},
|
| 280 |
+
{{
|
| 281 |
+
"scene_id": 2,
|
| 282 |
+
"source_video": 1,
|
| 283 |
+
"start_time": 10.0,
|
| 284 |
+
"end_time": 15.0,
|
| 285 |
+
"duration": 5.0,
|
| 286 |
+
"description": "Brief description of what happens in this scene",
|
| 287 |
+
"transition_in": "crossfade",
|
| 288 |
+
"transition_out": "fade"
|
| 289 |
+
}}
|
| 290 |
+
],
|
| 291 |
+
"music": {{
|
| 292 |
+
"mood": "energetic",
|
| 293 |
+
"bpm": 120,
|
| 294 |
+
"sync_points": [0.0, 7.5, 15.0, 22.5, 30.0],
|
| 295 |
+
"volume": 0.5
|
| 296 |
+
}},
|
| 297 |
+
"pacing": "fast",
|
| 298 |
+
"narrative_structure": "hook -> build -> climax -> resolution",
|
| 299 |
+
"visual_style": "bright, colorful, dynamic"
|
| 300 |
+
}}
|
| 301 |
+
|
| 302 |
+
Rules:
|
| 303 |
+
- source_video is 0-based index (0 for first video, 1 for second, etc.)
|
| 304 |
+
- Each scene must have start_time, end_time, and duration
|
| 305 |
+
- Total of all scene durations should be approximately {target_duration} seconds (±2 seconds tolerance)
|
| 306 |
+
- Use transitions: "cut", "fade", or "crossfade"
|
| 307 |
+
- Extract mood tags from the video summaries for the music section
|
| 308 |
+
- sync_points should be evenly distributed or aligned to scene transitions
|
| 309 |
+
- pacing should be one of: "slow", "moderate", "fast", "very-fast"
|
| 310 |
+
- narrative_structure should describe the flow (e.g., "hook -> build -> climax -> resolution")
|
| 311 |
+
- visual_style should describe the aesthetic (e.g., "bright, colorful, dynamic" or "dark, moody, cinematic")
|
| 312 |
+
- Return ONLY the JSON, no other text or markdown formatting"""
|
| 313 |
+
|
| 314 |
+
# Generate script using Gemini
|
| 315 |
+
response = client.models.generate_content(
|
| 316 |
+
model="gemini-2.5-flash-lite",
|
| 317 |
+
contents=[prompt],
|
| 318 |
+
)
|
| 319 |
+
|
| 320 |
+
# Extract JSON from response
|
| 321 |
+
response_text = response.text.strip()
|
| 322 |
+
|
| 323 |
+
# Try to extract JSON if wrapped in markdown code blocks
|
| 324 |
+
if "```json" in response_text:
|
| 325 |
+
response_text = response_text.split("```json")[1].split("```")[0].strip()
|
| 326 |
+
elif "```" in response_text:
|
| 327 |
+
response_text = response_text.split("```")[1].split("```")[0].strip()
|
| 328 |
+
|
| 329 |
+
script = json.loads(response_text)
|
| 330 |
+
|
| 331 |
+
# Validate script structure
|
| 332 |
+
if not isinstance(script, dict):
|
| 333 |
+
raise ValueError("Generated script is not a valid dictionary")
|
| 334 |
+
|
| 335 |
+
# Ensure required fields exist
|
| 336 |
+
if "total_duration" not in script:
|
| 337 |
+
script["total_duration"] = target_duration
|
| 338 |
+
|
| 339 |
+
if "scenes" not in script:
|
| 340 |
+
raise ValueError("Generated script missing 'scenes' field")
|
| 341 |
+
|
| 342 |
+
if not isinstance(script["scenes"], list) or len(script["scenes"]) == 0:
|
| 343 |
+
raise ValueError("Generated script must contain at least one scene")
|
| 344 |
+
|
| 345 |
+
# Validate scene durations sum to approximately target_duration
|
| 346 |
+
total_scene_duration = sum(
|
| 347 |
+
scene.get("duration", 0) for scene in script["scenes"]
|
| 348 |
+
)
|
| 349 |
+
if abs(total_scene_duration - target_duration) > 5.0:
|
| 350 |
+
# Adjust durations proportionally if they're way off
|
| 351 |
+
if total_scene_duration > 0:
|
| 352 |
+
scale_factor = target_duration / total_scene_duration
|
| 353 |
+
for scene in script["scenes"]:
|
| 354 |
+
if "duration" in scene:
|
| 355 |
+
scene["duration"] = round(scene["duration"] * scale_factor, 2)
|
| 356 |
+
if "start_time" in scene and "end_time" in scene:
|
| 357 |
+
# Recalculate end_time based on scaled duration
|
| 358 |
+
scene["end_time"] = round(
|
| 359 |
+
scene["start_time"] + scene["duration"], 2
|
| 360 |
+
)
|
| 361 |
+
script["total_duration"] = target_duration
|
| 362 |
+
|
| 363 |
+
# Ensure music section exists
|
| 364 |
+
if "music" not in script:
|
| 365 |
+
# Extract mood from summaries
|
| 366 |
+
mood_tags = []
|
| 367 |
+
for summary in summaries_list:
|
| 368 |
+
tags = summary.get("mood_tags", [])
|
| 369 |
+
if isinstance(tags, list):
|
| 370 |
+
mood_tags.extend(tags)
|
| 371 |
+
mood = mood_tags[0] if mood_tags else "energetic"
|
| 372 |
+
script["music"] = {
|
| 373 |
+
"mood": mood,
|
| 374 |
+
"volume": 0.5,
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
# Add optional fields if missing
|
| 378 |
+
if "pacing" not in script:
|
| 379 |
+
script["pacing"] = "moderate"
|
| 380 |
+
|
| 381 |
+
if "narrative_structure" not in script:
|
| 382 |
+
script["narrative_structure"] = "linear"
|
| 383 |
+
|
| 384 |
+
if "visual_style" not in script:
|
| 385 |
+
script["visual_style"] = "standard"
|
| 386 |
+
|
| 387 |
+
return json.dumps(script, indent=2)
|
| 388 |
+
|
| 389 |
+
except json.JSONDecodeError as e:
|
| 390 |
+
# Fallback to simple script
|
| 391 |
+
if not summaries_list:
|
| 392 |
+
raise ValueError("No video summaries provided")
|
| 393 |
+
|
| 394 |
+
summary = summaries_list[0]
|
| 395 |
+
duration = summary.get("duration", target_duration)
|
| 396 |
+
clip_duration = min(duration, target_duration)
|
| 397 |
+
|
| 398 |
+
# Extract mood from summary
|
| 399 |
+
mood_tags = summary.get("mood_tags", ["energetic"])
|
| 400 |
+
mood = mood_tags[0] if mood_tags else "energetic"
|
| 401 |
+
|
| 402 |
+
fallback_script = {
|
| 403 |
+
"total_duration": clip_duration,
|
| 404 |
+
"scenes": [
|
| 405 |
+
{
|
| 406 |
+
"scene_id": 1,
|
| 407 |
+
"source_video": 0,
|
| 408 |
+
"start_time": 0.0,
|
| 409 |
+
"end_time": clip_duration,
|
| 410 |
+
"duration": clip_duration,
|
| 411 |
+
"description": (
|
| 412 |
+
summary.get("summary", "Video clip")[:100]
|
| 413 |
+
if isinstance(summary.get("summary"), str)
|
| 414 |
+
else "Video clip"
|
| 415 |
+
),
|
| 416 |
+
"transition_in": "fade",
|
| 417 |
+
"transition_out": "fade",
|
| 418 |
+
}
|
| 419 |
+
],
|
| 420 |
+
"music": {
|
| 421 |
+
"mood": mood,
|
| 422 |
+
"volume": 0.5,
|
| 423 |
+
},
|
| 424 |
+
"pacing": "moderate",
|
| 425 |
+
"narrative_structure": "single scene",
|
| 426 |
+
}
|
| 427 |
+
return json.dumps(fallback_script, indent=2)
|
| 428 |
+
|
| 429 |
+
except Exception as e:
|
| 430 |
+
raise Exception(f"Error generating video script: {str(e)}")
|
src/app/tools/video_summarizer.py
CHANGED
|
@@ -2,6 +2,7 @@ import cv2
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
import mimetypes
|
|
|
|
| 5 |
import google.genai as genai
|
| 6 |
|
| 7 |
|
|
@@ -15,7 +16,7 @@ def video_summarizer(video_input, fps: float = 2.0) -> str:
|
|
| 15 |
fps (float): Frames per second for video processing by Gemini (default: 2.0, range: 0.1-24.0)
|
| 16 |
|
| 17 |
Returns:
|
| 18 |
-
str: JSON string containing video summary with key scenes, detected objects/activities,
|
| 19 |
"""
|
| 20 |
try:
|
| 21 |
# Handle Gradio video input format (can be tuple or string)
|
|
@@ -47,6 +48,8 @@ def video_summarizer(video_input, fps: float = 2.0) -> str:
|
|
| 47 |
api_key = os.getenv("GOOGLE_API_KEY")
|
| 48 |
if not api_key:
|
| 49 |
# Fallback: return basic metadata without AI analysis
|
|
|
|
|
|
|
| 50 |
return json.dumps(
|
| 51 |
{
|
| 52 |
"duration": round(duration, 2),
|
|
@@ -57,6 +60,7 @@ def video_summarizer(video_input, fps: float = 2.0) -> str:
|
|
| 57 |
"key_scenes": [],
|
| 58 |
"detected_objects": [],
|
| 59 |
"mood_tags": [],
|
|
|
|
| 60 |
}
|
| 61 |
)
|
| 62 |
|
|
@@ -81,6 +85,7 @@ def video_summarizer(video_input, fps: float = 2.0) -> str:
|
|
| 81 |
3. Detected objects/activities - List the main objects, people, activities, or subjects visible in the video
|
| 82 |
4. Mood and style tags - Identify the mood and style (e.g., energetic, calm, dramatic, fun, professional, casual, bright, dark, colorful, minimalist, fast-paced, slow-paced)
|
| 83 |
5. Visual style description - Describe the visual aesthetics, color palette, lighting, and overall style
|
|
|
|
| 84 |
|
| 85 |
Format your response as a structured, detailed summary that captures the essence of the video."""
|
| 86 |
|
|
@@ -120,6 +125,26 @@ Format your response as a structured, detailed summary that captures the essence
|
|
| 120 |
mood for mood in mood_keywords if mood.lower() in summary_text.lower()
|
| 121 |
]
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
# Structure the response
|
| 124 |
result = {
|
| 125 |
"duration": round(duration, 2),
|
|
@@ -128,6 +153,7 @@ Format your response as a structured, detailed summary that captures the essence
|
|
| 128 |
"frame_count": frame_count,
|
| 129 |
"summary": summary_text,
|
| 130 |
"mood_tags": detected_moods if detected_moods else ["general"],
|
|
|
|
| 131 |
}
|
| 132 |
|
| 133 |
return json.dumps(result, indent=2)
|
|
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
import mimetypes
|
| 5 |
+
import re
|
| 6 |
import google.genai as genai
|
| 7 |
|
| 8 |
|
|
|
|
| 16 |
fps (float): Frames per second for video processing by Gemini (default: 2.0, range: 0.1-24.0)
|
| 17 |
|
| 18 |
Returns:
|
| 19 |
+
str: JSON string containing video summary with key scenes, detected objects/activities, mood tags, and thumbnail_timeframe (in seconds)
|
| 20 |
"""
|
| 21 |
try:
|
| 22 |
# Handle Gradio video input format (can be tuple or string)
|
|
|
|
| 48 |
api_key = os.getenv("GOOGLE_API_KEY")
|
| 49 |
if not api_key:
|
| 50 |
# Fallback: return basic metadata without AI analysis
|
| 51 |
+
# Use middle of video as default thumbnail timeframe
|
| 52 |
+
thumbnail_timeframe = round(duration / 2, 2) if duration > 0 else 0
|
| 53 |
return json.dumps(
|
| 54 |
{
|
| 55 |
"duration": round(duration, 2),
|
|
|
|
| 60 |
"key_scenes": [],
|
| 61 |
"detected_objects": [],
|
| 62 |
"mood_tags": [],
|
| 63 |
+
"thumbnail_timeframe": thumbnail_timeframe,
|
| 64 |
}
|
| 65 |
)
|
| 66 |
|
|
|
|
| 85 |
3. Detected objects/activities - List the main objects, people, activities, or subjects visible in the video
|
| 86 |
4. Mood and style tags - Identify the mood and style (e.g., energetic, calm, dramatic, fun, professional, casual, bright, dark, colorful, minimalist, fast-paced, slow-paced)
|
| 87 |
5. Visual style description - Describe the visual aesthetics, color palette, lighting, and overall style
|
| 88 |
+
6. Recommended thumbnail timestamp - Suggest the best timestamp (in seconds) to use as a thumbnail. This should be a visually representative moment that captures the essence of the video. Format your answer as: "THUMBNAIL_TIMESTAMP: X.XX seconds" where X.XX is the timestamp.
|
| 89 |
|
| 90 |
Format your response as a structured, detailed summary that captures the essence of the video."""
|
| 91 |
|
|
|
|
| 125 |
mood for mood in mood_keywords if mood.lower() in summary_text.lower()
|
| 126 |
]
|
| 127 |
|
| 128 |
+
# Extract thumbnail timestamp from response
|
| 129 |
+
thumbnail_timeframe = None
|
| 130 |
+
# Try to find "THUMBNAIL_TIMESTAMP: X.XX seconds" pattern
|
| 131 |
+
timestamp_pattern = r"THUMBNAIL_TIMESTAMP:\s*([\d.]+)\s*seconds?"
|
| 132 |
+
match = re.search(timestamp_pattern, summary_text, re.IGNORECASE)
|
| 133 |
+
if match:
|
| 134 |
+
try:
|
| 135 |
+
thumbnail_timeframe = float(match.group(1))
|
| 136 |
+
# Ensure timestamp is within video duration
|
| 137 |
+
if thumbnail_timeframe > duration:
|
| 138 |
+
thumbnail_timeframe = duration / 2
|
| 139 |
+
elif thumbnail_timeframe < 0:
|
| 140 |
+
thumbnail_timeframe = 0
|
| 141 |
+
except ValueError:
|
| 142 |
+
thumbnail_timeframe = None
|
| 143 |
+
|
| 144 |
+
# Fallback: use middle of video if extraction failed
|
| 145 |
+
if thumbnail_timeframe is None:
|
| 146 |
+
thumbnail_timeframe = round(duration / 2, 2) if duration > 0 else 0
|
| 147 |
+
|
| 148 |
# Structure the response
|
| 149 |
result = {
|
| 150 |
"duration": round(duration, 2),
|
|
|
|
| 153 |
"frame_count": frame_count,
|
| 154 |
"summary": summary_text,
|
| 155 |
"mood_tags": detected_moods if detected_moods else ["general"],
|
| 156 |
+
"thumbnail_timeframe": round(thumbnail_timeframe, 2),
|
| 157 |
}
|
| 158 |
|
| 159 |
return json.dumps(result, indent=2)
|
src/app/workflow.py
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent workflow for video creation using LangChain agent.
|
| 3 |
+
|
| 4 |
+
This module implements the main workflow that orchestrates video processing
|
| 5 |
+
tools to create polished videos from raw footage using a central agent.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import json
|
| 10 |
+
import re
|
| 11 |
+
from typing import List, Optional, Generator, Tuple
|
| 12 |
+
from dotenv import load_dotenv
|
| 13 |
+
|
| 14 |
+
# Load environment variables
|
| 15 |
+
load_dotenv()
|
| 16 |
+
|
| 17 |
+
# LangChain v1.0 uses create_agent instead of create_react_agent
|
| 18 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 19 |
+
from langchain.agents import create_agent
|
| 20 |
+
|
| 21 |
+
from tools.langchain_tools import ALL_TOOLS
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _normalize_video_inputs(video_inputs) -> List[str]:
|
| 25 |
+
"""
|
| 26 |
+
Normalize video inputs from Gradio format to list of absolute paths.
|
| 27 |
+
|
| 28 |
+
Args:
|
| 29 |
+
video_inputs: Can be:
|
| 30 |
+
- List of file paths
|
| 31 |
+
- List of tuples (from Gradio: (video_path, subtitle_path))
|
| 32 |
+
- Single file path
|
| 33 |
+
- Single tuple
|
| 34 |
+
|
| 35 |
+
Returns:
|
| 36 |
+
List of absolute file paths
|
| 37 |
+
"""
|
| 38 |
+
if not video_inputs:
|
| 39 |
+
return []
|
| 40 |
+
|
| 41 |
+
# Handle single input
|
| 42 |
+
if not isinstance(video_inputs, list):
|
| 43 |
+
video_inputs = [video_inputs]
|
| 44 |
+
|
| 45 |
+
normalized = []
|
| 46 |
+
for item in video_inputs:
|
| 47 |
+
if isinstance(item, tuple):
|
| 48 |
+
# Gradio format: (video_path, subtitle_path)
|
| 49 |
+
video_path = item[0]
|
| 50 |
+
elif isinstance(item, str):
|
| 51 |
+
video_path = item
|
| 52 |
+
else:
|
| 53 |
+
continue
|
| 54 |
+
|
| 55 |
+
# Convert to absolute path
|
| 56 |
+
if video_path and os.path.exists(video_path):
|
| 57 |
+
normalized.append(os.path.abspath(video_path))
|
| 58 |
+
|
| 59 |
+
return normalized
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _extract_json_from_text(text: str) -> Optional[str]:
|
| 63 |
+
"""Extract JSON from text if present."""
|
| 64 |
+
if "{" in text and "}" in text:
|
| 65 |
+
start_idx = text.find("{")
|
| 66 |
+
end_idx = text.rfind("}") + 1
|
| 67 |
+
if start_idx >= 0 and end_idx > start_idx:
|
| 68 |
+
json_str = text[start_idx:end_idx]
|
| 69 |
+
try:
|
| 70 |
+
json.loads(json_str)
|
| 71 |
+
return json_str
|
| 72 |
+
except json.JSONDecodeError:
|
| 73 |
+
pass
|
| 74 |
+
return None
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _extract_file_path_from_text(text: str, extensions: List[str]) -> Optional[str]:
|
| 78 |
+
"""Extract file path from text if present."""
|
| 79 |
+
# First check if the text itself is a valid path
|
| 80 |
+
if os.path.exists(text.strip()):
|
| 81 |
+
return text.strip()
|
| 82 |
+
|
| 83 |
+
# Try to find path pattern in text
|
| 84 |
+
pattern = r"([/\\][^\s]+\.(" + "|".join(extensions) + "))"
|
| 85 |
+
match = re.search(pattern, text)
|
| 86 |
+
if match:
|
| 87 |
+
return match.group(1)
|
| 88 |
+
return None
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def agent_workflow(
|
| 92 |
+
video_inputs,
|
| 93 |
+
user_description: Optional[str] = None,
|
| 94 |
+
target_duration: float = 30.0,
|
| 95 |
+
generate_music: bool = True,
|
| 96 |
+
) -> Generator[Tuple[Optional[str], str, str, str, str], None, None]:
|
| 97 |
+
"""
|
| 98 |
+
Main agent workflow that orchestrates video creation using a central agent.
|
| 99 |
+
|
| 100 |
+
This is a generator function that yields progress updates as the workflow progresses.
|
| 101 |
+
Each yield contains: (final_path, summary_json, script_json, thumbnail_path, status)
|
| 102 |
+
|
| 103 |
+
Args:
|
| 104 |
+
video_inputs: Video file(s) from Gradio (can be list, tuple, or string)
|
| 105 |
+
user_description: Optional description of desired mood, style, or content
|
| 106 |
+
target_duration: Target duration in seconds for final video
|
| 107 |
+
generate_music: Whether to generate background music
|
| 108 |
+
|
| 109 |
+
Yields:
|
| 110 |
+
Tuple of (final_path, summary_json, script_json, thumbnail_path, status)
|
| 111 |
+
- final_path: Path to final video (None until complete)
|
| 112 |
+
- summary_json: JSON string of video summaries
|
| 113 |
+
- script_json: JSON string of generated script
|
| 114 |
+
- thumbnail_path: Path to thumbnail image (None until generated)
|
| 115 |
+
- status: Status message for UI
|
| 116 |
+
"""
|
| 117 |
+
# Initialize outputs
|
| 118 |
+
final_path = None
|
| 119 |
+
summary_json = ""
|
| 120 |
+
script_json = ""
|
| 121 |
+
thumbnail_path = None
|
| 122 |
+
status = "Starting workflow...\n"
|
| 123 |
+
|
| 124 |
+
try:
|
| 125 |
+
# Normalize video inputs
|
| 126 |
+
status += "📥 Processing video inputs...\n"
|
| 127 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 128 |
+
|
| 129 |
+
video_paths = _normalize_video_inputs(video_inputs)
|
| 130 |
+
if not video_paths:
|
| 131 |
+
status += "❌ No valid video files found.\n"
|
| 132 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 133 |
+
return
|
| 134 |
+
|
| 135 |
+
status += f"✅ Found {len(video_paths)} video file(s).\n"
|
| 136 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 137 |
+
|
| 138 |
+
# Get API key
|
| 139 |
+
api_key = os.getenv("GOOGLE_API_KEY")
|
| 140 |
+
if not api_key:
|
| 141 |
+
status += "❌ GOOGLE_API_KEY not found in environment.\n"
|
| 142 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 143 |
+
return
|
| 144 |
+
|
| 145 |
+
# Create LLM
|
| 146 |
+
llm = ChatGoogleGenerativeAI(
|
| 147 |
+
model="gemini-2.5-flash-lite",
|
| 148 |
+
google_api_key=api_key,
|
| 149 |
+
temperature=0.7,
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
# Create central agent with all tools
|
| 153 |
+
status += "\n🤖 Initializing AI agent...\n"
|
| 154 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 155 |
+
|
| 156 |
+
agent = create_agent(llm, tools=ALL_TOOLS)
|
| 157 |
+
|
| 158 |
+
# Build comprehensive workflow prompt
|
| 159 |
+
video_paths_str = "\n".join([f"- {path}" for path in video_paths])
|
| 160 |
+
user_desc_text = (
|
| 161 |
+
f"\nUser Description: {user_description}" if user_description else ""
|
| 162 |
+
)
|
| 163 |
+
music_instruction = (
|
| 164 |
+
"Generate background music matching the video's mood and style."
|
| 165 |
+
if generate_music
|
| 166 |
+
else "Do not generate background music."
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
workflow_prompt = f"""You are a professional video editor creating short-form videos. Your task is to transform raw video footage into a polished, engaging video.
|
| 170 |
+
|
| 171 |
+
WORKFLOW TASKS (execute in order):
|
| 172 |
+
|
| 173 |
+
Step 1. VIDEO ANALYSIS using the video_summarizer_tool
|
| 174 |
+
- Video files to analyze:
|
| 175 |
+
{video_paths_str}
|
| 176 |
+
- Use fps=2.0 for analysis
|
| 177 |
+
- Collect all video summaries
|
| 178 |
+
|
| 179 |
+
Step 2. SCRIPT GENERATION using the video_script_generator_tool
|
| 180 |
+
- Target duration: {target_duration} seconds
|
| 181 |
+
- The script should include:
|
| 182 |
+
* Scene sequences with source video references and timestamps
|
| 183 |
+
* Transitions (cut, fade, crossfade)
|
| 184 |
+
* Music configuration with mood and style
|
| 185 |
+
* Pacing and narrative structure
|
| 186 |
+
{user_desc_text}
|
| 187 |
+
|
| 188 |
+
Step 3. BACKGROUND MUSIC GENERATION (if enabled) using the music_selector_tool:
|
| 189 |
+
- {music_instruction}
|
| 190 |
+
- Extract mood from the script or video summaries
|
| 191 |
+
- Target duration: {target_duration} seconds
|
| 192 |
+
|
| 193 |
+
Step 4. FRAME EXTRACTION using the frame_extractor_tool
|
| 194 |
+
- Extract a representative frame from the first video
|
| 195 |
+
- Use the thumbnail_timeframe from the first video's summary if available
|
| 196 |
+
|
| 197 |
+
Step 5. THUMBNAIL GENERATION using thumbnail_generator_tool
|
| 198 |
+
- Use the extracted frame and video summary
|
| 199 |
+
|
| 200 |
+
Step 6. VIDEO COMPOSITION using the video_composer_tool
|
| 201 |
+
- Provide the script JSON, video clips, music (if generated), and thumbnail
|
| 202 |
+
- The video_clips parameter should be a JSON array of video file paths
|
| 203 |
+
|
| 204 |
+
IMPORTANT INSTRUCTIONS:
|
| 205 |
+
- Execute all steps in sequence
|
| 206 |
+
- Use the tools to accomplish each task
|
| 207 |
+
- Extract and preserve JSON outputs (scripts, summaries) for the final result
|
| 208 |
+
- Extract file paths from tool outputs (music, thumbnail, final video)
|
| 209 |
+
- Think step by step and use the appropriate tools for each task
|
| 210 |
+
- Do not skip any steps
|
| 211 |
+
- Report progress as you complete each step
|
| 212 |
+
|
| 213 |
+
Begin by analyzing the videos."""
|
| 214 |
+
|
| 215 |
+
status += "\n🎬 Starting video creation workflow...\n"
|
| 216 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 217 |
+
|
| 218 |
+
# Stream agent execution for real-time updates
|
| 219 |
+
status += "🤖 Agent is working...\n"
|
| 220 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 221 |
+
|
| 222 |
+
# Invoke agent and stream results
|
| 223 |
+
result = agent.invoke(
|
| 224 |
+
{"messages": [{"role": "user", "content": workflow_prompt}]}
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
+
print(result)
|
| 228 |
+
|
| 229 |
+
# Process agent result to extract outputs
|
| 230 |
+
if result and "messages" in result:
|
| 231 |
+
# Track tool outputs
|
| 232 |
+
summaries = []
|
| 233 |
+
music_path = None
|
| 234 |
+
|
| 235 |
+
# Extract information from all messages (including tool messages)
|
| 236 |
+
for message in result["messages"]:
|
| 237 |
+
# Check if this is a tool message (contains tool output)
|
| 238 |
+
message_type = (
|
| 239 |
+
getattr(message, "type", None) if hasattr(message, "type") else None
|
| 240 |
+
)
|
| 241 |
+
content = (
|
| 242 |
+
message.content if hasattr(message, "content") else str(message)
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
# Update status with agent's progress
|
| 246 |
+
if content:
|
| 247 |
+
# Check for tool outputs - video_summarizer returns JSON
|
| 248 |
+
if (
|
| 249 |
+
"video_summarizer" in str(message).lower()
|
| 250 |
+
or "summary" in content.lower()
|
| 251 |
+
):
|
| 252 |
+
extracted_json = _extract_json_from_text(content)
|
| 253 |
+
if extracted_json:
|
| 254 |
+
try:
|
| 255 |
+
parsed = json.loads(extracted_json)
|
| 256 |
+
if isinstance(parsed, list):
|
| 257 |
+
summaries.extend(parsed)
|
| 258 |
+
elif isinstance(parsed, dict) and "summary" in parsed:
|
| 259 |
+
summaries.append(parsed)
|
| 260 |
+
except:
|
| 261 |
+
pass
|
| 262 |
+
|
| 263 |
+
# Check for script output
|
| 264 |
+
if "script" in content.lower() or "scenes" in content:
|
| 265 |
+
extracted_json = _extract_json_from_text(content)
|
| 266 |
+
if extracted_json:
|
| 267 |
+
try:
|
| 268 |
+
parsed = json.loads(extracted_json)
|
| 269 |
+
if "scenes" in parsed:
|
| 270 |
+
script_json = extracted_json
|
| 271 |
+
status += "✅ Script generated.\n"
|
| 272 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 273 |
+
except:
|
| 274 |
+
pass
|
| 275 |
+
|
| 276 |
+
# Check for music file path
|
| 277 |
+
if "music" in content.lower() or "sound" in content.lower():
|
| 278 |
+
music_path = _extract_file_path_from_text(
|
| 279 |
+
content, ["mp3", "wav", "m4a"]
|
| 280 |
+
)
|
| 281 |
+
if music_path and os.path.exists(music_path):
|
| 282 |
+
status += f"✅ Music generated.\n"
|
| 283 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 284 |
+
|
| 285 |
+
# Check for thumbnail file path
|
| 286 |
+
if "thumbnail" in content.lower() or "frame" in content.lower():
|
| 287 |
+
thumb_path = _extract_file_path_from_text(
|
| 288 |
+
content, ["png", "jpg", "jpeg"]
|
| 289 |
+
)
|
| 290 |
+
if thumb_path and os.path.exists(thumb_path):
|
| 291 |
+
thumbnail_path = thumb_path
|
| 292 |
+
status += f"✅ Thumbnail generated.\n"
|
| 293 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 294 |
+
|
| 295 |
+
# Check for final video path
|
| 296 |
+
if (
|
| 297 |
+
"compose" in content.lower()
|
| 298 |
+
or "final" in content.lower()
|
| 299 |
+
or "video" in content.lower()
|
| 300 |
+
):
|
| 301 |
+
video_path = _extract_file_path_from_text(
|
| 302 |
+
content, ["mp4", "avi", "mov"]
|
| 303 |
+
)
|
| 304 |
+
if video_path and os.path.exists(video_path):
|
| 305 |
+
final_path = video_path
|
| 306 |
+
status += f"✅ Final video created.\n"
|
| 307 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 308 |
+
|
| 309 |
+
# Compile summaries if collected
|
| 310 |
+
if summaries:
|
| 311 |
+
summary_json = json.dumps(summaries, indent=2)
|
| 312 |
+
status += f"✅ Analyzed {len(summaries)} video(s).\n"
|
| 313 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 314 |
+
|
| 315 |
+
# Final extraction from last message as fallback
|
| 316 |
+
if result["messages"]:
|
| 317 |
+
last_message = result["messages"][-1]
|
| 318 |
+
final_content = (
|
| 319 |
+
last_message.content
|
| 320 |
+
if hasattr(last_message, "content")
|
| 321 |
+
else str(last_message)
|
| 322 |
+
)
|
| 323 |
+
|
| 324 |
+
# Final attempt to extract missing outputs
|
| 325 |
+
if not script_json:
|
| 326 |
+
script_json = _extract_json_from_text(final_content) or ""
|
| 327 |
+
|
| 328 |
+
if not summary_json:
|
| 329 |
+
summary_json = _extract_json_from_text(final_content) or ""
|
| 330 |
+
|
| 331 |
+
if not thumbnail_path:
|
| 332 |
+
thumbnail_path = _extract_file_path_from_text(
|
| 333 |
+
final_content, ["png", "jpg", "jpeg"]
|
| 334 |
+
)
|
| 335 |
+
|
| 336 |
+
if not final_path:
|
| 337 |
+
final_path = _extract_file_path_from_text(
|
| 338 |
+
final_content, ["mp4", "avi", "mov"]
|
| 339 |
+
)
|
| 340 |
+
|
| 341 |
+
# Final status update
|
| 342 |
+
if final_path:
|
| 343 |
+
status += "\n✅ Video creation complete! 🎉\n"
|
| 344 |
+
else:
|
| 345 |
+
status += "\n⚠️ Workflow completed, but final video path not found.\n"
|
| 346 |
+
status += "Check agent output for details.\n"
|
| 347 |
+
|
| 348 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
| 349 |
+
|
| 350 |
+
except Exception as e:
|
| 351 |
+
status += f"\n❌ Workflow error: {str(e)}\n"
|
| 352 |
+
import traceback
|
| 353 |
+
|
| 354 |
+
status += f"\nDetails:\n{traceback.format_exc()}\n"
|
| 355 |
+
yield final_path, summary_json, script_json, thumbnail_path, status
|
src/app/workflow_ui.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import traceback
|
| 3 |
+
from workflow import agent_workflow
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def run_workflow(videos, description, duration, music):
|
| 7 |
+
"""
|
| 8 |
+
Generator function that runs the workflow and yields progress updates in real-time.
|
| 9 |
+
|
| 10 |
+
According to Gradio's streaming outputs guide, generator functions yield values
|
| 11 |
+
that update the UI in real-time as they're produced.
|
| 12 |
+
"""
|
| 13 |
+
# Handle empty or None input
|
| 14 |
+
if not videos or (isinstance(videos, list) and len(videos) == 0):
|
| 15 |
+
yield None, "❌ Please upload at least one video file.", "", "", None
|
| 16 |
+
return
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
# Iterate through the generator from agent_workflow
|
| 20 |
+
# This will yield progress updates in real-time
|
| 21 |
+
for (
|
| 22 |
+
final_path,
|
| 23 |
+
summary_json,
|
| 24 |
+
script_json,
|
| 25 |
+
thumbnail_path,
|
| 26 |
+
status,
|
| 27 |
+
) in agent_workflow(
|
| 28 |
+
video_inputs=videos,
|
| 29 |
+
user_description=description.strip() if description else None,
|
| 30 |
+
target_duration=float(duration),
|
| 31 |
+
generate_music=bool(music),
|
| 32 |
+
):
|
| 33 |
+
# Yield each progress update to Gradio
|
| 34 |
+
# Gradio will automatically update the UI with each yield
|
| 35 |
+
yield final_path, status, summary_json, script_json, thumbnail_path
|
| 36 |
+
|
| 37 |
+
except Exception as e:
|
| 38 |
+
error_msg = f"❌ Error: {str(e)}\n\nDetails: {traceback.format_exc()}"
|
| 39 |
+
yield None, error_msg, "", "", None
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def workflow_ui():
|
| 43 |
+
"""Create the full workflow UI interface."""
|
| 44 |
+
with gr.Column():
|
| 45 |
+
# Header
|
| 46 |
+
gr.Markdown(
|
| 47 |
+
"""
|
| 48 |
+
# 🤖 Vidzly - AI Agent Workflow
|
| 49 |
+
|
| 50 |
+
Transform your raw footage into a polished video with AI agent-powered editing.
|
| 51 |
+
Our intelligent agent uses MCP tools to analyze, plan, and create your video automatically.
|
| 52 |
+
Upload your videos, describe your vision, and let the AI agent handle the rest!
|
| 53 |
+
"""
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
with gr.Row():
|
| 57 |
+
with gr.Column(scale=1):
|
| 58 |
+
# Input section
|
| 59 |
+
gr.Markdown("### 📥 Input")
|
| 60 |
+
|
| 61 |
+
video_input = gr.File(
|
| 62 |
+
label="Upload Video(s)",
|
| 63 |
+
file_count="multiple",
|
| 64 |
+
file_types=["video"],
|
| 65 |
+
height=200,
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
user_description = gr.Textbox(
|
| 69 |
+
label="Describe Your Vision (Optional)",
|
| 70 |
+
placeholder="e.g., energetic and fast-paced, calm and cinematic, fun and colorful...",
|
| 71 |
+
lines=3,
|
| 72 |
+
info="Describe the mood, style, or vibe you want for your video",
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
with gr.Row():
|
| 76 |
+
target_duration = gr.Slider(
|
| 77 |
+
value=30.0,
|
| 78 |
+
label="Target Duration (seconds)",
|
| 79 |
+
minimum=5.0,
|
| 80 |
+
maximum=60.0,
|
| 81 |
+
step=1.0,
|
| 82 |
+
info="Length of the final video",
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
generate_music = gr.Checkbox(
|
| 86 |
+
value=True,
|
| 87 |
+
label="Generate Background Music",
|
| 88 |
+
info="Automatically generate music matching the video mood",
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
create_btn = gr.Button(
|
| 92 |
+
"🎬 Create Video",
|
| 93 |
+
variant="primary",
|
| 94 |
+
size="lg",
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
+
gr.Markdown("### 📝 Status")
|
| 98 |
+
progress_status = gr.Textbox(
|
| 99 |
+
label="Status",
|
| 100 |
+
value="Ready to create your video!",
|
| 101 |
+
interactive=False,
|
| 102 |
+
lines=10,
|
| 103 |
+
max_lines=20,
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
with gr.Column(scale=2):
|
| 107 |
+
# Output section
|
| 108 |
+
gr.Markdown("### 🟢 Output")
|
| 109 |
+
|
| 110 |
+
thumbnail_image = gr.Image(
|
| 111 |
+
label="Generated Thumbnail",
|
| 112 |
+
type="filepath",
|
| 113 |
+
height=400,
|
| 114 |
+
visible=True,
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
final_video = gr.Video(
|
| 118 |
+
label="Final Video",
|
| 119 |
+
height=400,
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
with gr.Accordion("📋 Details", open=False):
|
| 123 |
+
summary_display = gr.Textbox(
|
| 124 |
+
label="Video Analysis Summary",
|
| 125 |
+
lines=10,
|
| 126 |
+
max_lines=20,
|
| 127 |
+
interactive=False,
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
script_display = gr.Textbox(
|
| 131 |
+
label="Generated Script",
|
| 132 |
+
lines=10,
|
| 133 |
+
max_lines=20,
|
| 134 |
+
interactive=False,
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
# Connect the button to the workflow
|
| 138 |
+
create_btn.click(
|
| 139 |
+
fn=run_workflow,
|
| 140 |
+
inputs=[
|
| 141 |
+
video_input,
|
| 142 |
+
user_description,
|
| 143 |
+
target_duration,
|
| 144 |
+
generate_music,
|
| 145 |
+
],
|
| 146 |
+
outputs=[
|
| 147 |
+
final_video,
|
| 148 |
+
progress_status,
|
| 149 |
+
summary_display,
|
| 150 |
+
script_display,
|
| 151 |
+
thumbnail_image,
|
| 152 |
+
],
|
| 153 |
+
)
|
tests/README.md
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Tests
|
| 2 |
+
|
| 3 |
+
This directory contains unit tests for the Vidzly tools.
|
| 4 |
+
|
| 5 |
+
## Running Tests
|
| 6 |
+
|
| 7 |
+
### Run all tests
|
| 8 |
+
```bash
|
| 9 |
+
poetry run pytest
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
### Run specific test file
|
| 13 |
+
```bash
|
| 14 |
+
poetry run pytest tests/test_video_clipper.py
|
| 15 |
+
poetry run pytest tests/test_video_summarizer.py
|
| 16 |
+
poetry run pytest tests/test_video_composer.py
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
### Run with coverage
|
| 20 |
+
```bash
|
| 21 |
+
poetry run pytest --cov=src/app/tools --cov-report=html
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
### Run with verbose output
|
| 25 |
+
```bash
|
| 26 |
+
poetry run pytest -v
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
### Run specific test
|
| 30 |
+
```bash
|
| 31 |
+
poetry run pytest tests/test_video_clipper.py::TestVideoClipper::test_video_clipper_with_string_path
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
## Test Structure
|
| 35 |
+
|
| 36 |
+
- `conftest.py`: Shared pytest fixtures and configuration
|
| 37 |
+
- `test_video_clipper.py`: Unit tests and integration tests for the video_clipper tool
|
| 38 |
+
- `test_video_summarizer.py`: Unit tests and integration tests for the video_summarizer tool
|
| 39 |
+
- `test_video_composer.py`: Unit tests and integration tests for the video_composer tool
|
| 40 |
+
- `data/dodo.MOV`: Real video file used for integration tests
|
| 41 |
+
- `data/dodo_2.mov`: Additional real video file used for integration tests
|
| 42 |
+
|
| 43 |
+
## Test Coverage
|
| 44 |
+
|
| 45 |
+
### Unit Tests (Mocked)
|
| 46 |
+
The unit tests use mocking to avoid requiring actual video files or API keys:
|
| 47 |
+
- **video_clipper**: Tests input validation, file handling, time range validation, and error handling
|
| 48 |
+
- **video_summarizer**: Tests input validation, API integration (mocked), metadata extraction, and mood tag detection
|
| 49 |
+
- **video_composer**: Tests script validation, source_video reference resolution (index/filename), error handling, and scene validation
|
| 50 |
+
|
| 51 |
+
### Integration Tests (Real Video)
|
| 52 |
+
The integration tests use the real video files from `tests/data/`:
|
| 53 |
+
- **video_clipper**: Tests actual video clipping functionality with real video file
|
| 54 |
+
- **video_summarizer**: Tests actual video analysis with real video file (works with or without API key)
|
| 55 |
+
- **video_composer**: Tests actual video composition with real video files, including:
|
| 56 |
+
- Basic composition with multiple scenes from the same video
|
| 57 |
+
- Crossfade transitions
|
| 58 |
+
- Multiple source videos
|
| 59 |
+
- Three scenes from two different source videos
|
| 60 |
+
- Pre-clipped video composition
|
| 61 |
+
|
| 62 |
+
## Notes
|
| 63 |
+
|
| 64 |
+
- **Unit tests** use `unittest.mock` to mock external dependencies (MoviePy, OpenCV, Google Gemini API)
|
| 65 |
+
- **Integration tests** use the real video files in `tests/data/` to test actual functionality
|
| 66 |
+
- Temporary files are created and cleaned up automatically via pytest fixtures
|
| 67 |
+
- Integration tests verify that the tools work correctly with real video files
|
| 68 |
+
- The video_summarizer integration test with API key is skipped if `GOOGLE_API_KEY` is not set
|
| 69 |
+
- **video_composer** requires `video_clips` parameter (list of source videos) and uses `source_video` in scenes to reference videos by index (0-based) or filename
|
| 70 |
+
|
tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Tests package
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pytest configuration and shared fixtures for tests.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import tempfile
|
| 7 |
+
import pytest
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@pytest.fixture
|
| 12 |
+
def temp_video_file():
|
| 13 |
+
"""Create a temporary video file for testing."""
|
| 14 |
+
# Create a minimal test video file
|
| 15 |
+
# Note: In real tests, you might want to use an actual small video file
|
| 16 |
+
temp_dir = tempfile.mkdtemp()
|
| 17 |
+
video_path = os.path.join(temp_dir, "test_video.mp4")
|
| 18 |
+
|
| 19 |
+
# Create an empty file (in real scenario, this would be a valid video)
|
| 20 |
+
# For actual testing, you'd want to use a real small video file
|
| 21 |
+
Path(video_path).touch()
|
| 22 |
+
|
| 23 |
+
yield video_path
|
| 24 |
+
|
| 25 |
+
# Cleanup
|
| 26 |
+
if os.path.exists(video_path):
|
| 27 |
+
os.remove(video_path)
|
| 28 |
+
if os.path.exists(temp_dir):
|
| 29 |
+
os.rmdir(temp_dir)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
@pytest.fixture
|
| 33 |
+
def temp_output_dir():
|
| 34 |
+
"""Create a temporary directory for test outputs."""
|
| 35 |
+
temp_dir = tempfile.mkdtemp()
|
| 36 |
+
yield temp_dir
|
| 37 |
+
|
| 38 |
+
# Cleanup
|
| 39 |
+
import shutil
|
| 40 |
+
|
| 41 |
+
if os.path.exists(temp_dir):
|
| 42 |
+
shutil.rmtree(temp_dir)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@pytest.fixture
|
| 46 |
+
def mock_video_duration():
|
| 47 |
+
"""Mock video duration in seconds."""
|
| 48 |
+
return 30.0
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
@pytest.fixture
|
| 52 |
+
def sample_video_metadata():
|
| 53 |
+
"""Sample video metadata for testing."""
|
| 54 |
+
return {
|
| 55 |
+
"duration": 30.0,
|
| 56 |
+
"resolution": "1920x1080",
|
| 57 |
+
"fps": 30.0,
|
| 58 |
+
"frame_count": 900,
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@pytest.fixture
|
| 63 |
+
def real_video_file():
|
| 64 |
+
"""Get path to real video file in tests/data directory."""
|
| 65 |
+
test_dir = os.path.dirname(os.path.abspath(__file__))
|
| 66 |
+
video_path = os.path.join(test_dir, "data", "dodo.MOV")
|
| 67 |
+
|
| 68 |
+
if not os.path.exists(video_path):
|
| 69 |
+
pytest.skip(f"Test video file not found: {video_path}")
|
| 70 |
+
|
| 71 |
+
return video_path
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@pytest.fixture
|
| 75 |
+
def real_video_file_2():
|
| 76 |
+
"""Get path to second real video file in tests/data directory."""
|
| 77 |
+
test_dir = os.path.dirname(os.path.abspath(__file__))
|
| 78 |
+
video_path = os.path.join(test_dir, "data", "dodo_2.mov")
|
| 79 |
+
|
| 80 |
+
if not os.path.exists(video_path):
|
| 81 |
+
pytest.skip(f"Test video file not found: {video_path}")
|
| 82 |
+
|
| 83 |
+
return video_path
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
@pytest.fixture
|
| 87 |
+
def temp_image_file():
|
| 88 |
+
"""Create a temporary image file for testing."""
|
| 89 |
+
from PIL import Image
|
| 90 |
+
|
| 91 |
+
temp_dir = tempfile.mkdtemp()
|
| 92 |
+
image_path = os.path.join(temp_dir, "test_image.png")
|
| 93 |
+
|
| 94 |
+
# Create a minimal test image
|
| 95 |
+
img = Image.new("RGB", (100, 100), color="red")
|
| 96 |
+
img.save(image_path)
|
| 97 |
+
|
| 98 |
+
yield image_path
|
| 99 |
+
|
| 100 |
+
# Cleanup
|
| 101 |
+
if os.path.exists(image_path):
|
| 102 |
+
os.remove(image_path)
|
| 103 |
+
if os.path.exists(temp_dir):
|
| 104 |
+
os.rmdir(temp_dir)
|
tests/test_adk_integration.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test ADK integration with session manager.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
| 10 |
+
|
| 11 |
+
from dotenv import load_dotenv
|
| 12 |
+
|
| 13 |
+
load_dotenv()
|
| 14 |
+
|
| 15 |
+
from google.adk.agents import LlmAgent
|
| 16 |
+
from google.adk.tools import FunctionTool
|
| 17 |
+
import sys
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
|
| 20 |
+
sys.path.insert(0, str(Path(__file__).parent.parent / "src" / "app"))
|
| 21 |
+
|
| 22 |
+
from adk_session_manager import ADKSessionManager
|
| 23 |
+
from agent_helpers import invoke_agent_simple
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def test_session_manager():
|
| 27 |
+
"""Test session manager creation."""
|
| 28 |
+
print("=== Testing Session Manager ===")
|
| 29 |
+
try:
|
| 30 |
+
manager = ADKSessionManager()
|
| 31 |
+
print("✅ Session manager created")
|
| 32 |
+
|
| 33 |
+
session_id = manager.create_session()
|
| 34 |
+
print(f"✅ Session created: {session_id}")
|
| 35 |
+
|
| 36 |
+
session = manager.get_session(session_id)
|
| 37 |
+
print(f"✅ Session retrieved: {type(session)}")
|
| 38 |
+
|
| 39 |
+
return manager
|
| 40 |
+
except Exception as e:
|
| 41 |
+
print(f"❌ Session manager test failed: {e}")
|
| 42 |
+
import traceback
|
| 43 |
+
|
| 44 |
+
traceback.print_exc()
|
| 45 |
+
return None
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def test_simple_agent_invocation():
|
| 49 |
+
"""Test simple agent invocation."""
|
| 50 |
+
print("\n=== Testing Simple Agent Invocation ===")
|
| 51 |
+
try:
|
| 52 |
+
# Create a simple tool
|
| 53 |
+
def add(a: int, b: int) -> int:
|
| 54 |
+
"""Add two numbers."""
|
| 55 |
+
return a + b
|
| 56 |
+
|
| 57 |
+
tool = FunctionTool(func=add)
|
| 58 |
+
|
| 59 |
+
# Create agent
|
| 60 |
+
agent = LlmAgent(
|
| 61 |
+
model="gemini-2.5-flash-lite",
|
| 62 |
+
name="test_agent",
|
| 63 |
+
instruction="You are a helpful calculator. Use the add tool when asked to add numbers.",
|
| 64 |
+
tools=[tool],
|
| 65 |
+
)
|
| 66 |
+
print("✅ Agent created")
|
| 67 |
+
|
| 68 |
+
# Test invocation
|
| 69 |
+
try:
|
| 70 |
+
result = invoke_agent_simple(agent, "What is 5 + 3?")
|
| 71 |
+
print(f"✅ Agent invocation succeeded")
|
| 72 |
+
print(f" Result: {result[:200]}")
|
| 73 |
+
return True
|
| 74 |
+
except Exception as e:
|
| 75 |
+
print(f"⚠️ Agent invocation failed: {e}")
|
| 76 |
+
print(" This is expected if ADK requires additional setup")
|
| 77 |
+
import traceback
|
| 78 |
+
|
| 79 |
+
traceback.print_exc()
|
| 80 |
+
return False
|
| 81 |
+
|
| 82 |
+
except Exception as e:
|
| 83 |
+
print(f"❌ Test failed: {e}")
|
| 84 |
+
import traceback
|
| 85 |
+
|
| 86 |
+
traceback.print_exc()
|
| 87 |
+
return False
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
if __name__ == "__main__":
|
| 91 |
+
print("=" * 60)
|
| 92 |
+
print("ADK Integration Test")
|
| 93 |
+
print("=" * 60)
|
| 94 |
+
|
| 95 |
+
manager = test_session_manager()
|
| 96 |
+
success = test_simple_agent_invocation()
|
| 97 |
+
|
| 98 |
+
print("\n" + "=" * 60)
|
| 99 |
+
print("Test Summary")
|
| 100 |
+
print("=" * 60)
|
| 101 |
+
print(f"Session Manager: {'✅' if manager else '❌'}")
|
| 102 |
+
print(f"Agent Invocation: {'✅' if success else '⚠️'}")
|
| 103 |
+
print("\nNote: Agent invocation may fail if ADK requires additional")
|
| 104 |
+
print("message handling or context setup. This is expected during development.")
|
tests/test_frame_extractor.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for frame_extractor tool.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import tempfile
|
| 7 |
+
import pytest
|
| 8 |
+
from unittest.mock import Mock, patch
|
| 9 |
+
import numpy as np
|
| 10 |
+
import sys
|
| 11 |
+
|
| 12 |
+
# Add src to path to import modules
|
| 13 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
| 14 |
+
|
| 15 |
+
from app.tools.frame_extractor import frame_extractor
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class TestFrameExtractor:
|
| 19 |
+
"""Test cases for frame_extractor main function."""
|
| 20 |
+
|
| 21 |
+
def test_frame_extractor_with_api_key(self, temp_video_file, temp_output_dir):
|
| 22 |
+
"""Test frame_extractor with API key."""
|
| 23 |
+
with (
|
| 24 |
+
patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}),
|
| 25 |
+
patch("app.tools.frame_extractor.cv2.VideoCapture") as mock_capture,
|
| 26 |
+
patch("app.tools.frame_extractor.cv2.imwrite") as mock_imwrite,
|
| 27 |
+
patch("app.tools.frame_extractor.genai.Client") as mock_client,
|
| 28 |
+
patch("app.tools.frame_extractor.genai.types.Blob") as mock_blob,
|
| 29 |
+
patch(
|
| 30 |
+
"app.tools.frame_extractor.genai.types.VideoMetadata"
|
| 31 |
+
) as mock_video_metadata,
|
| 32 |
+
patch("app.tools.frame_extractor.genai.types.Part") as mock_part,
|
| 33 |
+
patch("app.tools.frame_extractor.mimetypes.guess_type") as mock_guess_type,
|
| 34 |
+
patch("builtins.open", create=True) as mock_open,
|
| 35 |
+
):
|
| 36 |
+
|
| 37 |
+
# Setup video capture mock (called twice: once for metadata, once for frame extraction)
|
| 38 |
+
mock_cap = Mock()
|
| 39 |
+
mock_cap.isOpened.return_value = True
|
| 40 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 41 |
+
5: 30.0, # FPS
|
| 42 |
+
7: 900, # Frame count
|
| 43 |
+
}.get(prop, 0)
|
| 44 |
+
mock_cap.read.return_value = (
|
| 45 |
+
True,
|
| 46 |
+
np.ones((100, 100, 3), dtype=np.uint8) * 150,
|
| 47 |
+
)
|
| 48 |
+
mock_cap.set.return_value = True
|
| 49 |
+
mock_capture.return_value = mock_cap
|
| 50 |
+
|
| 51 |
+
# Setup file open mock for reading video file
|
| 52 |
+
mock_file = Mock()
|
| 53 |
+
mock_file.read.return_value = b"video file data"
|
| 54 |
+
mock_file.__enter__ = Mock(return_value=mock_file)
|
| 55 |
+
mock_file.__exit__ = Mock(return_value=None)
|
| 56 |
+
mock_open.return_value = mock_file
|
| 57 |
+
|
| 58 |
+
# Setup genai types mocks
|
| 59 |
+
mock_blob_instance = Mock()
|
| 60 |
+
mock_video_metadata_instance = Mock()
|
| 61 |
+
mock_part_instance = Mock()
|
| 62 |
+
mock_blob.return_value = mock_blob_instance
|
| 63 |
+
mock_video_metadata.return_value = mock_video_metadata_instance
|
| 64 |
+
mock_part.return_value = mock_part_instance
|
| 65 |
+
|
| 66 |
+
# Setup mimetypes mock
|
| 67 |
+
mock_guess_type.return_value = ("video/mp4", None)
|
| 68 |
+
|
| 69 |
+
# Setup Gemini API mock - returns timestamp
|
| 70 |
+
mock_genai_client = Mock()
|
| 71 |
+
mock_response = Mock()
|
| 72 |
+
mock_response.text = "15.5" # Return timestamp in seconds
|
| 73 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 74 |
+
mock_client.return_value = mock_genai_client
|
| 75 |
+
|
| 76 |
+
result = frame_extractor(temp_video_file)
|
| 77 |
+
|
| 78 |
+
assert isinstance(result, str)
|
| 79 |
+
# VideoCapture should be called twice: once for metadata, once for frame extraction
|
| 80 |
+
assert mock_capture.call_count == 2
|
| 81 |
+
# imwrite is called once for the final output frame
|
| 82 |
+
assert mock_imwrite.call_count == 1
|
| 83 |
+
|
| 84 |
+
# Cleanup frames directory if it was created
|
| 85 |
+
import shutil
|
| 86 |
+
|
| 87 |
+
frames_dir = os.path.join(os.path.dirname(temp_video_file), "frames")
|
| 88 |
+
if os.path.exists(frames_dir):
|
| 89 |
+
shutil.rmtree(frames_dir)
|
| 90 |
+
|
| 91 |
+
def test_frame_extractor_without_api_key(self, temp_video_file):
|
| 92 |
+
"""Test frame_extractor raises error without API key."""
|
| 93 |
+
with (
|
| 94 |
+
patch.dict(os.environ, {}, clear=True),
|
| 95 |
+
patch("app.tools.frame_extractor.cv2.VideoCapture") as mock_capture,
|
| 96 |
+
):
|
| 97 |
+
|
| 98 |
+
# Mock video capture to allow metadata extraction, so we reach the API key check
|
| 99 |
+
mock_cap = Mock()
|
| 100 |
+
mock_cap.isOpened.return_value = True
|
| 101 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 102 |
+
5: 30.0, # FPS
|
| 103 |
+
7: 900, # Frame count
|
| 104 |
+
}.get(prop, 0)
|
| 105 |
+
mock_capture.return_value = mock_cap
|
| 106 |
+
|
| 107 |
+
with pytest.raises(Exception) as exc_info:
|
| 108 |
+
frame_extractor(temp_video_file)
|
| 109 |
+
|
| 110 |
+
assert "GOOGLE_API_KEY" in str(exc_info.value)
|
| 111 |
+
|
| 112 |
+
def test_frame_extractor_invalid_input_format(self):
|
| 113 |
+
"""Test frame_extractor with invalid input format."""
|
| 114 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 115 |
+
with pytest.raises(Exception) as exc_info:
|
| 116 |
+
frame_extractor(123)
|
| 117 |
+
|
| 118 |
+
assert "Invalid video input format" in str(exc_info.value)
|
| 119 |
+
|
| 120 |
+
def test_frame_extractor_file_not_found(self):
|
| 121 |
+
"""Test frame_extractor with non-existent file."""
|
| 122 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 123 |
+
with pytest.raises(Exception) as exc_info:
|
| 124 |
+
frame_extractor("/nonexistent/video.mp4")
|
| 125 |
+
|
| 126 |
+
assert "Video file not found" in str(exc_info.value)
|
| 127 |
+
|
| 128 |
+
def test_frame_extractor_cannot_open_video(self, temp_video_file):
|
| 129 |
+
"""Test frame_extractor when video cannot be opened."""
|
| 130 |
+
with (
|
| 131 |
+
patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}),
|
| 132 |
+
patch("app.tools.frame_extractor.cv2.VideoCapture") as mock_capture,
|
| 133 |
+
):
|
| 134 |
+
|
| 135 |
+
mock_cap = Mock()
|
| 136 |
+
mock_cap.isOpened.return_value = False
|
| 137 |
+
mock_capture.return_value = mock_cap
|
| 138 |
+
|
| 139 |
+
with pytest.raises(Exception) as exc_info:
|
| 140 |
+
frame_extractor(temp_video_file)
|
| 141 |
+
|
| 142 |
+
assert "Could not open video file" in str(exc_info.value)
|
| 143 |
+
|
| 144 |
+
def test_frame_extractor_zero_duration(self, temp_video_file):
|
| 145 |
+
"""Test frame_extractor with zero duration video."""
|
| 146 |
+
with (
|
| 147 |
+
patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}),
|
| 148 |
+
patch("app.tools.frame_extractor.cv2.VideoCapture") as mock_capture,
|
| 149 |
+
):
|
| 150 |
+
|
| 151 |
+
mock_cap = Mock()
|
| 152 |
+
mock_cap.isOpened.return_value = True
|
| 153 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 154 |
+
5: 0.0, # Zero FPS
|
| 155 |
+
7: 0, # Zero frames
|
| 156 |
+
}.get(prop, 0)
|
| 157 |
+
mock_capture.return_value = mock_cap
|
| 158 |
+
|
| 159 |
+
with pytest.raises(Exception) as exc_info:
|
| 160 |
+
frame_extractor(temp_video_file)
|
| 161 |
+
|
| 162 |
+
assert "zero duration" in str(exc_info.value).lower()
|
| 163 |
+
mock_cap.release.assert_called_once()
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
class TestFrameExtractorIntegration:
|
| 167 |
+
"""Integration tests for frame_extractor using real video files."""
|
| 168 |
+
|
| 169 |
+
@pytest.mark.skipif(
|
| 170 |
+
not os.getenv("GOOGLE_API_KEY"),
|
| 171 |
+
reason="GOOGLE_API_KEY not set, skipping AI test",
|
| 172 |
+
)
|
| 173 |
+
def test_frame_extractor_real_video_ai(self, real_video_file):
|
| 174 |
+
"""Test frame_extractor with real video file using AI."""
|
| 175 |
+
result = frame_extractor(real_video_file)
|
| 176 |
+
|
| 177 |
+
assert os.path.exists(result)
|
| 178 |
+
assert os.path.isabs(result)
|
| 179 |
+
assert result.endswith(".png")
|
| 180 |
+
assert os.path.getsize(result) > 0
|
| 181 |
+
# Cleanup
|
| 182 |
+
if os.path.exists(result):
|
| 183 |
+
os.remove(result)
|
| 184 |
+
frames_dir = os.path.dirname(result)
|
| 185 |
+
if os.path.exists(frames_dir) and not os.listdir(frames_dir):
|
| 186 |
+
os.rmdir(frames_dir)
|
tests/test_music_selector.py
ADDED
|
@@ -0,0 +1,512 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for music_selector tool.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import tempfile
|
| 7 |
+
import pytest
|
| 8 |
+
from unittest.mock import Mock, patch, MagicMock
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
# Add src to path to import modules
|
| 12 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
| 13 |
+
|
| 14 |
+
from app.tools.music_selector import music_selector
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class TestMusicSelector:
|
| 18 |
+
"""Test cases for music_selector function."""
|
| 19 |
+
|
| 20 |
+
def test_music_selector_basic(self, temp_output_dir):
|
| 21 |
+
"""Test music_selector with basic parameters."""
|
| 22 |
+
with (
|
| 23 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 24 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 25 |
+
):
|
| 26 |
+
|
| 27 |
+
# Setup mock
|
| 28 |
+
mock_client = Mock()
|
| 29 |
+
mock_text_to_sound = Mock()
|
| 30 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 31 |
+
mock_text_to_sound.convert = mock_convert
|
| 32 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 33 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 34 |
+
|
| 35 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 36 |
+
result = music_selector(
|
| 37 |
+
mood="energetic", target_duration=10.0, output_path=output_path
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
# Assertions
|
| 41 |
+
assert os.path.isabs(result)
|
| 42 |
+
assert result == os.path.abspath(output_path)
|
| 43 |
+
assert os.path.exists(result)
|
| 44 |
+
mock_elevenlabs_class.assert_called_once_with(api_key="test_key")
|
| 45 |
+
mock_convert.assert_called_once()
|
| 46 |
+
|
| 47 |
+
# Check API was called with correct parameters
|
| 48 |
+
call_kwargs = mock_convert.call_args[1]
|
| 49 |
+
assert "text" in call_kwargs
|
| 50 |
+
assert "energetic" in call_kwargs["text"]
|
| 51 |
+
assert call_kwargs["duration_seconds"] == 10.0
|
| 52 |
+
assert call_kwargs["output_format"] == "mp3_44100_128"
|
| 53 |
+
|
| 54 |
+
@pytest.mark.parametrize(
|
| 55 |
+
"mood_input,expected_moods",
|
| 56 |
+
[
|
| 57 |
+
("energetic, calm, dramatic", ["energetic", "calm", "dramatic"]),
|
| 58 |
+
(["fun", "professional"], ["fun", "professional"]),
|
| 59 |
+
],
|
| 60 |
+
)
|
| 61 |
+
def test_music_selector_mood_formats(
|
| 62 |
+
self, temp_output_dir, mood_input, expected_moods
|
| 63 |
+
):
|
| 64 |
+
"""Test music_selector with different mood input formats."""
|
| 65 |
+
with (
|
| 66 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 67 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 68 |
+
):
|
| 69 |
+
|
| 70 |
+
mock_client = Mock()
|
| 71 |
+
mock_text_to_sound = Mock()
|
| 72 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 73 |
+
mock_text_to_sound.convert = mock_convert
|
| 74 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 75 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 76 |
+
|
| 77 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 78 |
+
result = music_selector(
|
| 79 |
+
mood=mood_input,
|
| 80 |
+
target_duration=15.0,
|
| 81 |
+
output_path=output_path,
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
assert os.path.exists(result)
|
| 85 |
+
call_kwargs = mock_convert.call_args[1]
|
| 86 |
+
for expected_mood in expected_moods:
|
| 87 |
+
assert expected_mood in call_kwargs["text"]
|
| 88 |
+
|
| 89 |
+
def test_music_selector_with_style(self, temp_output_dir):
|
| 90 |
+
"""Test music_selector with style parameter."""
|
| 91 |
+
with (
|
| 92 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 93 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 94 |
+
):
|
| 95 |
+
|
| 96 |
+
mock_client = Mock()
|
| 97 |
+
mock_text_to_sound = Mock()
|
| 98 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 99 |
+
mock_text_to_sound.convert = mock_convert
|
| 100 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 101 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 102 |
+
|
| 103 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 104 |
+
result = music_selector(
|
| 105 |
+
mood="energetic",
|
| 106 |
+
style="cinematic",
|
| 107 |
+
target_duration=25.0,
|
| 108 |
+
output_path=output_path,
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
assert os.path.exists(result)
|
| 112 |
+
call_kwargs = mock_convert.call_args[1]
|
| 113 |
+
assert "cinematic" in call_kwargs["text"]
|
| 114 |
+
assert "style" in call_kwargs["text"]
|
| 115 |
+
|
| 116 |
+
def test_music_selector_with_bpm(self, temp_output_dir):
|
| 117 |
+
"""Test music_selector with BPM parameter."""
|
| 118 |
+
with (
|
| 119 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 120 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 121 |
+
):
|
| 122 |
+
|
| 123 |
+
mock_client = Mock()
|
| 124 |
+
mock_text_to_sound = Mock()
|
| 125 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 126 |
+
mock_text_to_sound.convert = mock_convert
|
| 127 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 128 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 129 |
+
|
| 130 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 131 |
+
result = music_selector(
|
| 132 |
+
mood="energetic", bpm=120, target_duration=15.0, output_path=output_path
|
| 133 |
+
)
|
| 134 |
+
|
| 135 |
+
assert os.path.exists(result)
|
| 136 |
+
call_kwargs = mock_convert.call_args[1]
|
| 137 |
+
assert "120" in call_kwargs["text"]
|
| 138 |
+
assert "BPM" in call_kwargs["text"]
|
| 139 |
+
|
| 140 |
+
def test_music_selector_with_looping(self, temp_output_dir):
|
| 141 |
+
"""Test music_selector with looping enabled."""
|
| 142 |
+
with (
|
| 143 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 144 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 145 |
+
):
|
| 146 |
+
|
| 147 |
+
mock_client = Mock()
|
| 148 |
+
mock_text_to_sound = Mock()
|
| 149 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 150 |
+
mock_text_to_sound.convert = mock_convert
|
| 151 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 152 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 153 |
+
|
| 154 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 155 |
+
result = music_selector(
|
| 156 |
+
mood="energetic",
|
| 157 |
+
looping=True,
|
| 158 |
+
target_duration=10.0,
|
| 159 |
+
output_path=output_path,
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
assert os.path.exists(result)
|
| 163 |
+
call_kwargs = mock_convert.call_args[1]
|
| 164 |
+
assert call_kwargs["loop"] is True
|
| 165 |
+
|
| 166 |
+
def test_music_selector_with_prompt_influence(self, temp_output_dir):
|
| 167 |
+
"""Test music_selector with prompt_influence parameter."""
|
| 168 |
+
with (
|
| 169 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 170 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 171 |
+
):
|
| 172 |
+
|
| 173 |
+
mock_client = Mock()
|
| 174 |
+
mock_text_to_sound = Mock()
|
| 175 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 176 |
+
mock_text_to_sound.convert = mock_convert
|
| 177 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 178 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 179 |
+
|
| 180 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 181 |
+
result = music_selector(
|
| 182 |
+
mood="energetic",
|
| 183 |
+
prompt_influence=0.7,
|
| 184 |
+
target_duration=10.0,
|
| 185 |
+
output_path=output_path,
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
assert os.path.exists(result)
|
| 189 |
+
call_kwargs = mock_convert.call_args[1]
|
| 190 |
+
assert call_kwargs["prompt_influence"] == 0.7
|
| 191 |
+
|
| 192 |
+
def test_music_selector_without_output_path(self, temp_output_dir):
|
| 193 |
+
"""Test music_selector generates output path when not provided."""
|
| 194 |
+
with (
|
| 195 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 196 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 197 |
+
):
|
| 198 |
+
|
| 199 |
+
mock_client = Mock()
|
| 200 |
+
mock_text_to_sound = Mock()
|
| 201 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 202 |
+
mock_text_to_sound.convert = mock_convert
|
| 203 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 204 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 205 |
+
|
| 206 |
+
result = music_selector(mood="energetic", target_duration=10.0)
|
| 207 |
+
|
| 208 |
+
assert os.path.isabs(result)
|
| 209 |
+
assert os.path.exists(result)
|
| 210 |
+
assert result.endswith(".mp3")
|
| 211 |
+
# Should contain mood in filename
|
| 212 |
+
assert "energetic" in os.path.basename(result).lower()
|
| 213 |
+
|
| 214 |
+
def test_music_selector_duration_clamping_max(self, temp_output_dir):
|
| 215 |
+
"""Test music_selector clamps duration to maximum 30 seconds."""
|
| 216 |
+
with (
|
| 217 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 218 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 219 |
+
):
|
| 220 |
+
|
| 221 |
+
mock_client = Mock()
|
| 222 |
+
mock_text_to_sound = Mock()
|
| 223 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 224 |
+
mock_text_to_sound.convert = mock_convert
|
| 225 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 226 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 227 |
+
|
| 228 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 229 |
+
result = music_selector(
|
| 230 |
+
mood="energetic",
|
| 231 |
+
target_duration=50.0, # Exceeds max
|
| 232 |
+
output_path=output_path,
|
| 233 |
+
)
|
| 234 |
+
|
| 235 |
+
assert os.path.exists(result)
|
| 236 |
+
call_kwargs = mock_convert.call_args[1]
|
| 237 |
+
assert call_kwargs["duration_seconds"] == 30.0
|
| 238 |
+
|
| 239 |
+
def test_music_selector_prompt_influence_clamping(self, temp_output_dir):
|
| 240 |
+
"""Test music_selector clamps prompt_influence to 0-1 range."""
|
| 241 |
+
with (
|
| 242 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 243 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 244 |
+
):
|
| 245 |
+
|
| 246 |
+
mock_client = Mock()
|
| 247 |
+
mock_text_to_sound = Mock()
|
| 248 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 249 |
+
mock_text_to_sound.convert = mock_convert
|
| 250 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 251 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 252 |
+
|
| 253 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 254 |
+
|
| 255 |
+
# Test with value > 1
|
| 256 |
+
result = music_selector(
|
| 257 |
+
mood="energetic",
|
| 258 |
+
prompt_influence=2.0,
|
| 259 |
+
target_duration=10.0,
|
| 260 |
+
output_path=output_path,
|
| 261 |
+
)
|
| 262 |
+
|
| 263 |
+
assert os.path.exists(result)
|
| 264 |
+
call_kwargs = mock_convert.call_args[1]
|
| 265 |
+
assert call_kwargs["prompt_influence"] == 1.0
|
| 266 |
+
|
| 267 |
+
# Test with value < 0
|
| 268 |
+
result = music_selector(
|
| 269 |
+
mood="energetic",
|
| 270 |
+
prompt_influence=-0.5,
|
| 271 |
+
target_duration=10.0,
|
| 272 |
+
output_path=output_path,
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
+
call_kwargs = mock_convert.call_args[1]
|
| 276 |
+
assert call_kwargs["prompt_influence"] == 0.0
|
| 277 |
+
|
| 278 |
+
def test_music_selector_empty_mood_defaults(self, temp_output_dir):
|
| 279 |
+
"""Test music_selector uses default mood when empty."""
|
| 280 |
+
with (
|
| 281 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 282 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 283 |
+
):
|
| 284 |
+
|
| 285 |
+
mock_client = Mock()
|
| 286 |
+
mock_text_to_sound = Mock()
|
| 287 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 288 |
+
mock_text_to_sound.convert = mock_convert
|
| 289 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 290 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 291 |
+
|
| 292 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 293 |
+
result = music_selector(
|
| 294 |
+
mood="", target_duration=10.0, output_path=output_path
|
| 295 |
+
)
|
| 296 |
+
|
| 297 |
+
assert os.path.exists(result)
|
| 298 |
+
call_kwargs = mock_convert.call_args[1]
|
| 299 |
+
assert "energetic" in call_kwargs["text"]
|
| 300 |
+
|
| 301 |
+
def test_music_selector_with_sync_points(self, temp_output_dir):
|
| 302 |
+
"""Test music_selector with sync_points parameter."""
|
| 303 |
+
with (
|
| 304 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 305 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 306 |
+
):
|
| 307 |
+
|
| 308 |
+
mock_client = Mock()
|
| 309 |
+
mock_text_to_sound = Mock()
|
| 310 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 311 |
+
mock_text_to_sound.convert = mock_convert
|
| 312 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 313 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 314 |
+
|
| 315 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 316 |
+
result = music_selector(
|
| 317 |
+
mood="energetic",
|
| 318 |
+
sync_points=[0.0, 5.0, 10.0],
|
| 319 |
+
target_duration=10.0,
|
| 320 |
+
output_path=output_path,
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
assert os.path.exists(result)
|
| 324 |
+
call_kwargs = mock_convert.call_args[1]
|
| 325 |
+
assert "beat markers" in call_kwargs["text"]
|
| 326 |
+
|
| 327 |
+
def test_music_selector_creates_output_directory(self, temp_output_dir):
|
| 328 |
+
"""Test music_selector creates output directory if it doesn't exist."""
|
| 329 |
+
with (
|
| 330 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 331 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 332 |
+
):
|
| 333 |
+
|
| 334 |
+
mock_client = Mock()
|
| 335 |
+
mock_text_to_sound = Mock()
|
| 336 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 337 |
+
mock_text_to_sound.convert = mock_convert
|
| 338 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 339 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 340 |
+
|
| 341 |
+
output_dir = os.path.join(temp_output_dir, "nested", "path")
|
| 342 |
+
output_path = os.path.join(output_dir, "test_sound.mp3")
|
| 343 |
+
|
| 344 |
+
result = music_selector(
|
| 345 |
+
mood="energetic", target_duration=10.0, output_path=output_path
|
| 346 |
+
)
|
| 347 |
+
|
| 348 |
+
assert os.path.exists(output_dir)
|
| 349 |
+
assert os.path.exists(result)
|
| 350 |
+
|
| 351 |
+
def test_music_selector_no_api_key(self):
|
| 352 |
+
"""Test music_selector raises error when API key is missing."""
|
| 353 |
+
with patch.dict(os.environ, {}, clear=True):
|
| 354 |
+
with pytest.raises(Exception) as exc_info:
|
| 355 |
+
music_selector(mood="energetic", target_duration=10.0)
|
| 356 |
+
|
| 357 |
+
assert "ELEVENLABS_API_KEY" in str(exc_info.value)
|
| 358 |
+
|
| 359 |
+
def test_music_selector_elevenlabs_not_installed(self):
|
| 360 |
+
"""Test music_selector raises error when elevenlabs is not installed."""
|
| 361 |
+
with (
|
| 362 |
+
patch("app.tools.music_selector.ElevenLabs", None),
|
| 363 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 364 |
+
):
|
| 365 |
+
|
| 366 |
+
with pytest.raises(Exception) as exc_info:
|
| 367 |
+
music_selector(mood="energetic", target_duration=10.0)
|
| 368 |
+
|
| 369 |
+
assert "elevenlabs package is not installed" in str(exc_info.value)
|
| 370 |
+
|
| 371 |
+
def test_music_selector_api_error_handling(self, temp_output_dir):
|
| 372 |
+
"""Test music_selector handles API errors gracefully."""
|
| 373 |
+
with (
|
| 374 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 375 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 376 |
+
):
|
| 377 |
+
|
| 378 |
+
mock_client = Mock()
|
| 379 |
+
mock_text_to_sound = Mock()
|
| 380 |
+
mock_convert = Mock(side_effect=Exception("API Error"))
|
| 381 |
+
mock_text_to_sound.convert = mock_convert
|
| 382 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 383 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 384 |
+
|
| 385 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 386 |
+
with pytest.raises(Exception) as exc_info:
|
| 387 |
+
music_selector(
|
| 388 |
+
mood="energetic", target_duration=10.0, output_path=output_path
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
+
assert "Error generating sound effect" in str(exc_info.value)
|
| 392 |
+
|
| 393 |
+
def test_music_selector_audio_data_bytes(self, temp_output_dir):
|
| 394 |
+
"""Test music_selector handles bytes audio data."""
|
| 395 |
+
with (
|
| 396 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 397 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 398 |
+
):
|
| 399 |
+
|
| 400 |
+
mock_client = Mock()
|
| 401 |
+
mock_text_to_sound = Mock()
|
| 402 |
+
mock_convert = Mock(return_value=b"fake_audio_bytes_data")
|
| 403 |
+
mock_text_to_sound.convert = mock_convert
|
| 404 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 405 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 406 |
+
|
| 407 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 408 |
+
result = music_selector(
|
| 409 |
+
mood="energetic", target_duration=10.0, output_path=output_path
|
| 410 |
+
)
|
| 411 |
+
|
| 412 |
+
assert os.path.exists(result)
|
| 413 |
+
# Verify file was written
|
| 414 |
+
with open(result, "rb") as f:
|
| 415 |
+
assert f.read() == b"fake_audio_bytes_data"
|
| 416 |
+
|
| 417 |
+
def test_music_selector_audio_data_iterable(self, temp_output_dir):
|
| 418 |
+
"""Test music_selector handles iterable audio data (generator/list)."""
|
| 419 |
+
with (
|
| 420 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 421 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 422 |
+
):
|
| 423 |
+
|
| 424 |
+
mock_client = Mock()
|
| 425 |
+
mock_text_to_sound = Mock()
|
| 426 |
+
# Return list of byte chunks
|
| 427 |
+
mock_convert = Mock(return_value=[b"chunk1", b"chunk2", b"chunk3"])
|
| 428 |
+
mock_text_to_sound.convert = mock_convert
|
| 429 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 430 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 431 |
+
|
| 432 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 433 |
+
result = music_selector(
|
| 434 |
+
mood="energetic", target_duration=10.0, output_path=output_path
|
| 435 |
+
)
|
| 436 |
+
|
| 437 |
+
assert os.path.exists(result)
|
| 438 |
+
# Verify file contains all chunks
|
| 439 |
+
with open(result, "rb") as f:
|
| 440 |
+
content = f.read()
|
| 441 |
+
assert b"chunk1" in content
|
| 442 |
+
assert b"chunk2" in content
|
| 443 |
+
assert b"chunk3" in content
|
| 444 |
+
|
| 445 |
+
def test_music_selector_all_parameters(self, temp_output_dir):
|
| 446 |
+
"""Test music_selector with all parameters provided."""
|
| 447 |
+
with (
|
| 448 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 449 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 450 |
+
):
|
| 451 |
+
|
| 452 |
+
mock_client = Mock()
|
| 453 |
+
mock_text_to_sound = Mock()
|
| 454 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 455 |
+
mock_text_to_sound.convert = mock_convert
|
| 456 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 457 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 458 |
+
|
| 459 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 460 |
+
result = music_selector(
|
| 461 |
+
mood="energetic, calm",
|
| 462 |
+
style="cinematic",
|
| 463 |
+
target_duration=25.0,
|
| 464 |
+
bpm=120,
|
| 465 |
+
sync_points=[0.0, 5.0, 10.0],
|
| 466 |
+
looping=True,
|
| 467 |
+
prompt_influence=0.5,
|
| 468 |
+
output_path=output_path,
|
| 469 |
+
)
|
| 470 |
+
|
| 471 |
+
assert os.path.exists(result)
|
| 472 |
+
call_kwargs = mock_convert.call_args[1]
|
| 473 |
+
|
| 474 |
+
# Verify all parameters are in the prompt or API call
|
| 475 |
+
assert "energetic" in call_kwargs["text"]
|
| 476 |
+
assert "calm" in call_kwargs["text"]
|
| 477 |
+
assert "cinematic" in call_kwargs["text"]
|
| 478 |
+
assert "120" in call_kwargs["text"]
|
| 479 |
+
assert "beat markers" in call_kwargs["text"]
|
| 480 |
+
assert call_kwargs["duration_seconds"] == 25.0
|
| 481 |
+
assert call_kwargs["loop"] is True
|
| 482 |
+
assert call_kwargs["prompt_influence"] == 0.5
|
| 483 |
+
assert call_kwargs["output_format"] == "mp3_44100_128"
|
| 484 |
+
|
| 485 |
+
def test_music_selector_looping_false(self, temp_output_dir):
|
| 486 |
+
"""Test music_selector with looping disabled."""
|
| 487 |
+
with (
|
| 488 |
+
patch("app.tools.music_selector.ElevenLabs") as mock_elevenlabs_class,
|
| 489 |
+
patch.dict(os.environ, {"ELEVENLABS_API_KEY": "test_key"}),
|
| 490 |
+
):
|
| 491 |
+
|
| 492 |
+
mock_client = Mock()
|
| 493 |
+
mock_text_to_sound = Mock()
|
| 494 |
+
mock_convert = Mock(return_value=b"fake_audio_data")
|
| 495 |
+
mock_text_to_sound.convert = mock_convert
|
| 496 |
+
mock_client.text_to_sound_effects = mock_text_to_sound
|
| 497 |
+
mock_elevenlabs_class.return_value = mock_client
|
| 498 |
+
|
| 499 |
+
output_path = os.path.join(temp_output_dir, "test_sound.mp3")
|
| 500 |
+
result = music_selector(
|
| 501 |
+
mood="energetic",
|
| 502 |
+
looping=False,
|
| 503 |
+
target_duration=10.0,
|
| 504 |
+
output_path=output_path,
|
| 505 |
+
)
|
| 506 |
+
|
| 507 |
+
assert os.path.exists(result)
|
| 508 |
+
call_kwargs = mock_convert.call_args[1]
|
| 509 |
+
# When looping is False, loop parameter should not be in the call
|
| 510 |
+
# (or should be False if included)
|
| 511 |
+
if "loop" in call_kwargs:
|
| 512 |
+
assert call_kwargs["loop"] is False
|
tests/test_script_generator.py
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for script_generator tool.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
import pytest
|
| 8 |
+
from unittest.mock import Mock, patch, MagicMock
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
# Add src to path to import modules
|
| 12 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
| 13 |
+
|
| 14 |
+
from app.tools.script_generator import script_generator
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class TestScriptGenerator:
|
| 18 |
+
"""Test cases for script_generator function."""
|
| 19 |
+
|
| 20 |
+
def test_script_generator_with_multiple_videos(self, temp_video_file):
|
| 21 |
+
"""Test script_generator with multiple video inputs."""
|
| 22 |
+
with (
|
| 23 |
+
patch("app.tools.script_generator.cv2.VideoCapture") as mock_capture,
|
| 24 |
+
patch("app.tools.script_generator.genai.Client") as mock_client,
|
| 25 |
+
):
|
| 26 |
+
|
| 27 |
+
mock_cap = Mock()
|
| 28 |
+
mock_cap.isOpened.return_value = True
|
| 29 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 30 |
+
5: 30.0,
|
| 31 |
+
7: 900,
|
| 32 |
+
3: 1920,
|
| 33 |
+
4: 1080,
|
| 34 |
+
}.get(prop, 0)
|
| 35 |
+
mock_capture.return_value = mock_cap
|
| 36 |
+
|
| 37 |
+
mock_genai_client = Mock()
|
| 38 |
+
mock_response = Mock()
|
| 39 |
+
mock_response.text = """Here's a comprehensive video script:
|
| 40 |
+
|
| 41 |
+
```json
|
| 42 |
+
{
|
| 43 |
+
"concept": "Energetic travel montage",
|
| 44 |
+
"target_duration": 30.0,
|
| 45 |
+
"total_duration": 30.0,
|
| 46 |
+
"scenes": [
|
| 47 |
+
{
|
| 48 |
+
"scene_id": 1,
|
| 49 |
+
"source_video": 0,
|
| 50 |
+
"start_time": 0.0,
|
| 51 |
+
"end_time": 10.0,
|
| 52 |
+
"duration": 10.0,
|
| 53 |
+
"description": "Opening scene",
|
| 54 |
+
"transition_in": "fade",
|
| 55 |
+
"transition_out": "crossfade"
|
| 56 |
+
}
|
| 57 |
+
],
|
| 58 |
+
"audio": {
|
| 59 |
+
"mood": "energetic",
|
| 60 |
+
"style": "upbeat",
|
| 61 |
+
"bpm": 120,
|
| 62 |
+
"volume": 0.7
|
| 63 |
+
},
|
| 64 |
+
"text_overlays": [],
|
| 65 |
+
"visual_effects": ["color_grading"],
|
| 66 |
+
"call_to_action": "Subscribe for more"
|
| 67 |
+
}
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
This is a narrative description of the script."""
|
| 71 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 72 |
+
mock_client.return_value = mock_genai_client
|
| 73 |
+
|
| 74 |
+
video_inputs = [temp_video_file, temp_video_file]
|
| 75 |
+
|
| 76 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 77 |
+
result = script_generator(
|
| 78 |
+
video_inputs, user_prompt="Create an energetic video"
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
result_json = json.loads(result)
|
| 82 |
+
assert "videos_analyzed" in result_json
|
| 83 |
+
assert "script_narrative" in result_json
|
| 84 |
+
assert len(result_json["videos_analyzed"]) == 2
|
| 85 |
+
|
| 86 |
+
def test_script_generator_without_prompt(self, temp_video_file):
|
| 87 |
+
"""Test script_generator without user prompt."""
|
| 88 |
+
with (
|
| 89 |
+
patch("app.tools.script_generator.cv2.VideoCapture") as mock_capture,
|
| 90 |
+
patch("app.tools.script_generator.genai.Client") as mock_client,
|
| 91 |
+
):
|
| 92 |
+
|
| 93 |
+
mock_cap = Mock()
|
| 94 |
+
mock_cap.isOpened.return_value = True
|
| 95 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 96 |
+
5: 30.0,
|
| 97 |
+
7: 900,
|
| 98 |
+
3: 1920,
|
| 99 |
+
4: 1080,
|
| 100 |
+
}.get(prop, 0)
|
| 101 |
+
mock_capture.return_value = mock_cap
|
| 102 |
+
|
| 103 |
+
mock_genai_client = Mock()
|
| 104 |
+
mock_response = Mock()
|
| 105 |
+
mock_response.text = "Auto-generated script based on video analysis."
|
| 106 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 107 |
+
mock_client.return_value = mock_genai_client
|
| 108 |
+
|
| 109 |
+
video_inputs = [temp_video_file]
|
| 110 |
+
|
| 111 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 112 |
+
result = script_generator(video_inputs)
|
| 113 |
+
|
| 114 |
+
result_json = json.loads(result)
|
| 115 |
+
assert "videos_analyzed" in result_json
|
| 116 |
+
assert "script_narrative" in result_json
|
| 117 |
+
assert result_json["user_prompt"] == "Auto-generated based on materials"
|
| 118 |
+
|
| 119 |
+
def test_script_generator_with_string_input(self, temp_video_file):
|
| 120 |
+
"""Test script_generator with single string video input."""
|
| 121 |
+
with (
|
| 122 |
+
patch("app.tools.script_generator.cv2.VideoCapture") as mock_capture,
|
| 123 |
+
patch("app.tools.script_generator.genai.Client") as mock_client,
|
| 124 |
+
):
|
| 125 |
+
|
| 126 |
+
mock_cap = Mock()
|
| 127 |
+
mock_cap.isOpened.return_value = True
|
| 128 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 129 |
+
5: 30.0,
|
| 130 |
+
7: 900,
|
| 131 |
+
3: 1920,
|
| 132 |
+
4: 1080,
|
| 133 |
+
}.get(prop, 0)
|
| 134 |
+
mock_capture.return_value = mock_cap
|
| 135 |
+
|
| 136 |
+
mock_genai_client = Mock()
|
| 137 |
+
mock_response = Mock()
|
| 138 |
+
mock_response.text = "Generated script."
|
| 139 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 140 |
+
mock_client.return_value = mock_genai_client
|
| 141 |
+
|
| 142 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 143 |
+
result = script_generator(temp_video_file)
|
| 144 |
+
|
| 145 |
+
result_json = json.loads(result)
|
| 146 |
+
assert "videos_analyzed" in result_json
|
| 147 |
+
assert len(result_json["videos_analyzed"]) == 1
|
| 148 |
+
|
| 149 |
+
def test_script_generator_with_tuple_input(self, temp_video_file):
|
| 150 |
+
"""Test script_generator with tuple input (Gradio format)."""
|
| 151 |
+
with (
|
| 152 |
+
patch("app.tools.script_generator.cv2.VideoCapture") as mock_capture,
|
| 153 |
+
patch("app.tools.script_generator.genai.Client") as mock_client,
|
| 154 |
+
):
|
| 155 |
+
|
| 156 |
+
mock_cap = Mock()
|
| 157 |
+
mock_cap.isOpened.return_value = True
|
| 158 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 159 |
+
5: 30.0,
|
| 160 |
+
7: 900,
|
| 161 |
+
3: 1920,
|
| 162 |
+
4: 1080,
|
| 163 |
+
}.get(prop, 0)
|
| 164 |
+
mock_capture.return_value = mock_cap
|
| 165 |
+
|
| 166 |
+
mock_genai_client = Mock()
|
| 167 |
+
mock_response = Mock()
|
| 168 |
+
mock_response.text = "Generated script."
|
| 169 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 170 |
+
mock_client.return_value = mock_genai_client
|
| 171 |
+
|
| 172 |
+
video_input = (temp_video_file, "subtitle.srt")
|
| 173 |
+
|
| 174 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 175 |
+
result = script_generator(video_input)
|
| 176 |
+
|
| 177 |
+
result_json = json.loads(result)
|
| 178 |
+
assert "videos_analyzed" in result_json
|
| 179 |
+
|
| 180 |
+
def test_script_generator_with_empty_input(self):
|
| 181 |
+
"""Test script_generator with no video input."""
|
| 182 |
+
result = script_generator([])
|
| 183 |
+
result_json = json.loads(result)
|
| 184 |
+
assert "error" in result_json
|
| 185 |
+
assert result_json["error"] == "No video files provided"
|
| 186 |
+
|
| 187 |
+
def test_script_generator_with_nonexistent_file(self):
|
| 188 |
+
"""Test script_generator with nonexistent video file."""
|
| 189 |
+
result = script_generator(["/nonexistent/video.mp4"])
|
| 190 |
+
result_json = json.loads(result)
|
| 191 |
+
assert "error" in result_json
|
| 192 |
+
assert "not found" in result_json["error"]
|
| 193 |
+
|
| 194 |
+
def test_script_generator_without_api_key(self, temp_video_file):
|
| 195 |
+
"""Test script_generator without GOOGLE_API_KEY."""
|
| 196 |
+
with patch("app.tools.script_generator.cv2.VideoCapture") as mock_capture:
|
| 197 |
+
mock_cap = Mock()
|
| 198 |
+
mock_cap.isOpened.return_value = True
|
| 199 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 200 |
+
5: 30.0,
|
| 201 |
+
7: 900,
|
| 202 |
+
3: 1920,
|
| 203 |
+
4: 1080,
|
| 204 |
+
}.get(prop, 0)
|
| 205 |
+
mock_capture.return_value = mock_cap
|
| 206 |
+
|
| 207 |
+
with patch.dict(os.environ, {}, clear=True):
|
| 208 |
+
result = script_generator([temp_video_file])
|
| 209 |
+
|
| 210 |
+
result_json = json.loads(result)
|
| 211 |
+
assert "error" in result_json
|
| 212 |
+
assert "GOOGLE_API_KEY" in result_json["error"]
|
| 213 |
+
assert "videos_analyzed" in result_json
|
| 214 |
+
|
| 215 |
+
def test_script_generator_with_invalid_video(self, temp_video_file):
|
| 216 |
+
"""Test script_generator with video that cannot be opened."""
|
| 217 |
+
with patch("app.tools.script_generator.cv2.VideoCapture") as mock_capture:
|
| 218 |
+
mock_cap = Mock()
|
| 219 |
+
mock_cap.isOpened.return_value = False
|
| 220 |
+
mock_capture.return_value = mock_cap
|
| 221 |
+
|
| 222 |
+
result = script_generator([temp_video_file])
|
| 223 |
+
result_json = json.loads(result)
|
| 224 |
+
assert "error" in result_json
|
| 225 |
+
assert "Could not open video file" in result_json["error"]
|
| 226 |
+
|
| 227 |
+
def test_script_generator_structured_script_parsing(self, temp_video_file):
|
| 228 |
+
"""Test that structured JSON is properly extracted and parsed."""
|
| 229 |
+
with (
|
| 230 |
+
patch("app.tools.script_generator.cv2.VideoCapture") as mock_capture,
|
| 231 |
+
patch("app.tools.script_generator.genai.Client") as mock_client,
|
| 232 |
+
):
|
| 233 |
+
|
| 234 |
+
mock_cap = Mock()
|
| 235 |
+
mock_cap.isOpened.return_value = True
|
| 236 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 237 |
+
5: 30.0,
|
| 238 |
+
7: 900,
|
| 239 |
+
3: 1920,
|
| 240 |
+
4: 1080,
|
| 241 |
+
}.get(prop, 0)
|
| 242 |
+
mock_capture.return_value = mock_cap
|
| 243 |
+
|
| 244 |
+
mock_genai_client = Mock()
|
| 245 |
+
mock_response = Mock()
|
| 246 |
+
mock_response.text = """Here's the script:
|
| 247 |
+
|
| 248 |
+
```json
|
| 249 |
+
{
|
| 250 |
+
"concept": "Test concept",
|
| 251 |
+
"target_duration": 30.0,
|
| 252 |
+
"scenes": []
|
| 253 |
+
}
|
| 254 |
+
```
|
| 255 |
+
|
| 256 |
+
Narrative description."""
|
| 257 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 258 |
+
mock_client.return_value = mock_genai_client
|
| 259 |
+
|
| 260 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 261 |
+
result = script_generator([temp_video_file])
|
| 262 |
+
|
| 263 |
+
result_json = json.loads(result)
|
| 264 |
+
assert "structured_script" in result_json
|
| 265 |
+
assert result_json["structured_script"]["concept"] == "Test concept"
|
| 266 |
+
assert result_json["structured_script"]["target_duration"] == 30.0
|
| 267 |
+
|
| 268 |
+
def test_script_generator_with_custom_prompt(self, temp_video_file):
|
| 269 |
+
"""Test script_generator with custom user prompt."""
|
| 270 |
+
with (
|
| 271 |
+
patch("app.tools.script_generator.cv2.VideoCapture") as mock_capture,
|
| 272 |
+
patch("app.tools.script_generator.genai.Client") as mock_client,
|
| 273 |
+
):
|
| 274 |
+
|
| 275 |
+
mock_cap = Mock()
|
| 276 |
+
mock_cap.isOpened.return_value = True
|
| 277 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 278 |
+
5: 30.0,
|
| 279 |
+
7: 900,
|
| 280 |
+
3: 1920,
|
| 281 |
+
4: 1080,
|
| 282 |
+
}.get(prop, 0)
|
| 283 |
+
mock_capture.return_value = mock_cap
|
| 284 |
+
|
| 285 |
+
mock_genai_client = Mock()
|
| 286 |
+
mock_response = Mock()
|
| 287 |
+
mock_response.text = "Custom prompt response."
|
| 288 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 289 |
+
mock_client.return_value = mock_genai_client
|
| 290 |
+
|
| 291 |
+
custom_prompt = "Create a dramatic product reveal"
|
| 292 |
+
|
| 293 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 294 |
+
result = script_generator([temp_video_file], user_prompt=custom_prompt)
|
| 295 |
+
|
| 296 |
+
result_json = json.loads(result)
|
| 297 |
+
assert result_json["user_prompt"] == custom_prompt
|
| 298 |
+
assert "script_narrative" in result_json
|
tests/test_subtitle_creator.py
ADDED
|
@@ -0,0 +1,435 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for subtitle_creator tool.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
import pytest
|
| 8 |
+
from unittest.mock import Mock, patch, MagicMock, call
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
# Add src to path to import modules
|
| 12 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
| 13 |
+
|
| 14 |
+
from app.tools.subtitle_creator import subtitle_creator
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class TestSubtitleCreator:
|
| 18 |
+
"""Test cases for subtitle_creator function."""
|
| 19 |
+
|
| 20 |
+
@pytest.fixture
|
| 21 |
+
def simple_transcript(self):
|
| 22 |
+
"""Simple transcript JSON for testing."""
|
| 23 |
+
return json.dumps(
|
| 24 |
+
{
|
| 25 |
+
"subtitles": [
|
| 26 |
+
{
|
| 27 |
+
"start": 0.0,
|
| 28 |
+
"end": 2.5,
|
| 29 |
+
"text": "Hello, welcome!",
|
| 30 |
+
"position": "bottom",
|
| 31 |
+
"fontsize": 48,
|
| 32 |
+
"color": "white",
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"start": 2.5,
|
| 36 |
+
"end": 5.0,
|
| 37 |
+
"text": "This is a test.",
|
| 38 |
+
"position": "top",
|
| 39 |
+
"fontsize": 52,
|
| 40 |
+
"color": "yellow",
|
| 41 |
+
},
|
| 42 |
+
],
|
| 43 |
+
"default_style": {
|
| 44 |
+
"font": "Arial",
|
| 45 |
+
"fontsize": 48,
|
| 46 |
+
"color": "white",
|
| 47 |
+
"bg_color": "black",
|
| 48 |
+
},
|
| 49 |
+
}
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
@pytest.fixture
|
| 53 |
+
def minimal_transcript(self):
|
| 54 |
+
"""Minimal transcript with just required fields."""
|
| 55 |
+
return json.dumps(
|
| 56 |
+
{"subtitles": [{"start": 0.0, "end": 2.0, "text": "Test subtitle"}]}
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
def test_subtitle_creator_with_simple_transcript(
|
| 60 |
+
self, temp_video_file, simple_transcript
|
| 61 |
+
):
|
| 62 |
+
"""Test subtitle_creator with a simple transcript."""
|
| 63 |
+
with (
|
| 64 |
+
patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,
|
| 65 |
+
patch("app.tools.subtitle_creator.TextClip") as mock_text_clip,
|
| 66 |
+
patch("app.tools.subtitle_creator.CompositeVideoClip") as mock_composite,
|
| 67 |
+
):
|
| 68 |
+
# Mock video
|
| 69 |
+
mock_video = Mock()
|
| 70 |
+
mock_video.duration = 10.0
|
| 71 |
+
mock_video.size = (1920, 1080)
|
| 72 |
+
mock_video.fps = 30.0
|
| 73 |
+
mock_video_clip.return_value = mock_video
|
| 74 |
+
|
| 75 |
+
# Mock text clips
|
| 76 |
+
mock_text = Mock()
|
| 77 |
+
mock_text.with_start.return_value = mock_text
|
| 78 |
+
mock_text.with_end.return_value = mock_text
|
| 79 |
+
mock_text.with_position.return_value = mock_text
|
| 80 |
+
mock_text_clip.return_value = mock_text
|
| 81 |
+
|
| 82 |
+
# Mock composite
|
| 83 |
+
mock_final = Mock()
|
| 84 |
+
mock_composite.return_value = mock_final
|
| 85 |
+
|
| 86 |
+
result = subtitle_creator(temp_video_file, simple_transcript)
|
| 87 |
+
|
| 88 |
+
# Verify video was loaded
|
| 89 |
+
mock_video_clip.assert_called_once_with(temp_video_file)
|
| 90 |
+
|
| 91 |
+
# Verify text clips were created (2 subtitles)
|
| 92 |
+
assert mock_text_clip.call_count == 2
|
| 93 |
+
|
| 94 |
+
# Verify composite was created
|
| 95 |
+
mock_composite.assert_called_once()
|
| 96 |
+
|
| 97 |
+
# Verify video was written
|
| 98 |
+
mock_final.write_videofile.assert_called_once()
|
| 99 |
+
|
| 100 |
+
# Verify cleanup
|
| 101 |
+
mock_video.close.assert_called_once()
|
| 102 |
+
mock_final.close.assert_called_once()
|
| 103 |
+
|
| 104 |
+
assert result.endswith(".mp4")
|
| 105 |
+
assert os.path.exists(result) or result.startswith("/tmp/")
|
| 106 |
+
|
| 107 |
+
def test_subtitle_creator_with_minimal_transcript(
|
| 108 |
+
self, temp_video_file, minimal_transcript
|
| 109 |
+
):
|
| 110 |
+
"""Test subtitle_creator with minimal transcript (no styling)."""
|
| 111 |
+
with (
|
| 112 |
+
patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,
|
| 113 |
+
patch("app.tools.subtitle_creator.TextClip") as mock_text_clip,
|
| 114 |
+
patch("app.tools.subtitle_creator.CompositeVideoClip") as mock_composite,
|
| 115 |
+
):
|
| 116 |
+
mock_video = Mock()
|
| 117 |
+
mock_video.duration = 10.0
|
| 118 |
+
mock_video.size = (1920, 1080)
|
| 119 |
+
mock_video.fps = 30.0
|
| 120 |
+
mock_video_clip.return_value = mock_video
|
| 121 |
+
|
| 122 |
+
mock_text = Mock()
|
| 123 |
+
mock_text.with_start.return_value = mock_text
|
| 124 |
+
mock_text.with_end.return_value = mock_text
|
| 125 |
+
mock_text.with_position.return_value = mock_text
|
| 126 |
+
mock_text_clip.return_value = mock_text
|
| 127 |
+
|
| 128 |
+
mock_final = Mock()
|
| 129 |
+
mock_composite.return_value = mock_final
|
| 130 |
+
|
| 131 |
+
result = subtitle_creator(temp_video_file, minimal_transcript)
|
| 132 |
+
|
| 133 |
+
# Verify text clip was created with defaults
|
| 134 |
+
mock_text_clip.assert_called_once()
|
| 135 |
+
call_kwargs = mock_text_clip.call_args[1]
|
| 136 |
+
assert call_kwargs["font"] == "Arial"
|
| 137 |
+
assert call_kwargs["font_size"] == 48
|
| 138 |
+
assert call_kwargs["color"] == "white"
|
| 139 |
+
assert call_kwargs["bg_color"] == "black"
|
| 140 |
+
|
| 141 |
+
assert result.endswith(".mp4")
|
| 142 |
+
|
| 143 |
+
def test_subtitle_creator_with_tuple_input(
|
| 144 |
+
self, temp_video_file, simple_transcript
|
| 145 |
+
):
|
| 146 |
+
"""Test subtitle_creator with tuple input (Gradio format)."""
|
| 147 |
+
with (
|
| 148 |
+
patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,
|
| 149 |
+
patch("app.tools.subtitle_creator.TextClip") as mock_text_clip,
|
| 150 |
+
patch("app.tools.subtitle_creator.CompositeVideoClip") as mock_composite,
|
| 151 |
+
):
|
| 152 |
+
mock_video = Mock()
|
| 153 |
+
mock_video.duration = 10.0
|
| 154 |
+
mock_video.size = (1920, 1080)
|
| 155 |
+
mock_video.fps = 30.0
|
| 156 |
+
mock_video_clip.return_value = mock_video
|
| 157 |
+
|
| 158 |
+
mock_text = Mock()
|
| 159 |
+
mock_text.with_start.return_value = mock_text
|
| 160 |
+
mock_text.with_end.return_value = mock_text
|
| 161 |
+
mock_text.with_position.return_value = mock_text
|
| 162 |
+
mock_text_clip.return_value = mock_text
|
| 163 |
+
|
| 164 |
+
mock_final = Mock()
|
| 165 |
+
mock_composite.return_value = mock_final
|
| 166 |
+
|
| 167 |
+
video_input = (temp_video_file, "subtitle.srt")
|
| 168 |
+
|
| 169 |
+
result = subtitle_creator(video_input, simple_transcript)
|
| 170 |
+
|
| 171 |
+
mock_video_clip.assert_called_once_with(temp_video_file)
|
| 172 |
+
assert result.endswith(".mp4")
|
| 173 |
+
|
| 174 |
+
def test_subtitle_creator_with_custom_output_path(
|
| 175 |
+
self, temp_video_file, minimal_transcript, tmp_path
|
| 176 |
+
):
|
| 177 |
+
"""Test subtitle_creator with custom output path."""
|
| 178 |
+
with (
|
| 179 |
+
patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,
|
| 180 |
+
patch("app.tools.subtitle_creator.TextClip") as mock_text_clip,
|
| 181 |
+
patch("app.tools.subtitle_creator.CompositeVideoClip") as mock_composite,
|
| 182 |
+
):
|
| 183 |
+
mock_video = Mock()
|
| 184 |
+
mock_video.duration = 10.0
|
| 185 |
+
mock_video.size = (1920, 1080)
|
| 186 |
+
mock_video.fps = 30.0
|
| 187 |
+
mock_video_clip.return_value = mock_video
|
| 188 |
+
|
| 189 |
+
mock_text = Mock()
|
| 190 |
+
mock_text.with_start.return_value = mock_text
|
| 191 |
+
mock_text.with_end.return_value = mock_text
|
| 192 |
+
mock_text.with_position.return_value = mock_text
|
| 193 |
+
mock_text_clip.return_value = mock_text
|
| 194 |
+
|
| 195 |
+
mock_final = Mock()
|
| 196 |
+
mock_composite.return_value = mock_final
|
| 197 |
+
|
| 198 |
+
output_path = str(tmp_path / "custom_subtitled.mp4")
|
| 199 |
+
|
| 200 |
+
result = subtitle_creator(temp_video_file, minimal_transcript, output_path)
|
| 201 |
+
|
| 202 |
+
assert result == output_path
|
| 203 |
+
mock_final.write_videofile.assert_called_once()
|
| 204 |
+
|
| 205 |
+
def test_subtitle_creator_with_nonexistent_file(self, simple_transcript):
|
| 206 |
+
"""Test subtitle_creator with nonexistent video file."""
|
| 207 |
+
with pytest.raises(FileNotFoundError) as exc_info:
|
| 208 |
+
subtitle_creator("/nonexistent/video.mp4", simple_transcript)
|
| 209 |
+
assert "not found" in str(exc_info.value)
|
| 210 |
+
|
| 211 |
+
def test_subtitle_creator_with_invalid_json(self, temp_video_file):
|
| 212 |
+
"""Test subtitle_creator with invalid JSON transcript."""
|
| 213 |
+
invalid_json = "{ this is not valid json }"
|
| 214 |
+
|
| 215 |
+
with pytest.raises(ValueError) as exc_info:
|
| 216 |
+
subtitle_creator(temp_video_file, invalid_json)
|
| 217 |
+
assert "Invalid JSON format" in str(exc_info.value)
|
| 218 |
+
|
| 219 |
+
def test_subtitle_creator_with_missing_subtitles_array(self, temp_video_file):
|
| 220 |
+
"""Test subtitle_creator with missing subtitles array."""
|
| 221 |
+
transcript = json.dumps({"default_style": {"font": "Arial"}})
|
| 222 |
+
|
| 223 |
+
with (patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,):
|
| 224 |
+
mock_video = Mock()
|
| 225 |
+
mock_video.duration = 10.0
|
| 226 |
+
mock_video.size = (1920, 1080)
|
| 227 |
+
mock_video_clip.return_value = mock_video
|
| 228 |
+
|
| 229 |
+
with pytest.raises(ValueError) as exc_info:
|
| 230 |
+
subtitle_creator(temp_video_file, transcript)
|
| 231 |
+
assert "must contain 'subtitles' array" in str(exc_info.value)
|
| 232 |
+
|
| 233 |
+
def test_subtitle_creator_with_empty_subtitles(self, temp_video_file):
|
| 234 |
+
"""Test subtitle_creator with empty subtitles array."""
|
| 235 |
+
transcript = json.dumps({"subtitles": []})
|
| 236 |
+
|
| 237 |
+
with (patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,):
|
| 238 |
+
mock_video = Mock()
|
| 239 |
+
mock_video.duration = 10.0
|
| 240 |
+
mock_video.size = (1920, 1080)
|
| 241 |
+
mock_video_clip.return_value = mock_video
|
| 242 |
+
|
| 243 |
+
with pytest.raises(ValueError) as exc_info:
|
| 244 |
+
subtitle_creator(temp_video_file, transcript)
|
| 245 |
+
assert "must contain 'subtitles' array" in str(exc_info.value)
|
| 246 |
+
|
| 247 |
+
def test_subtitle_creator_with_missing_required_fields(self, temp_video_file):
|
| 248 |
+
"""Test subtitle_creator with subtitle missing required fields."""
|
| 249 |
+
transcript = json.dumps(
|
| 250 |
+
{"subtitles": [{"start": 0.0, "text": "Missing end time"}]}
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
with (patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,):
|
| 254 |
+
mock_video = Mock()
|
| 255 |
+
mock_video.duration = 10.0
|
| 256 |
+
mock_video.size = (1920, 1080)
|
| 257 |
+
mock_video_clip.return_value = mock_video
|
| 258 |
+
|
| 259 |
+
with pytest.raises(ValueError) as exc_info:
|
| 260 |
+
subtitle_creator(temp_video_file, transcript)
|
| 261 |
+
assert "must have 'start', 'end', and 'text' fields" in str(exc_info.value)
|
| 262 |
+
|
| 263 |
+
def test_subtitle_creator_with_negative_times(self, temp_video_file):
|
| 264 |
+
"""Test subtitle_creator with negative start/end times."""
|
| 265 |
+
transcript = json.dumps(
|
| 266 |
+
{"subtitles": [{"start": -1.0, "end": 2.0, "text": "Negative start"}]}
|
| 267 |
+
)
|
| 268 |
+
|
| 269 |
+
with (patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,):
|
| 270 |
+
mock_video = Mock()
|
| 271 |
+
mock_video.duration = 10.0
|
| 272 |
+
mock_video.size = (1920, 1080)
|
| 273 |
+
mock_video_clip.return_value = mock_video
|
| 274 |
+
|
| 275 |
+
with pytest.raises(ValueError) as exc_info:
|
| 276 |
+
subtitle_creator(temp_video_file, transcript)
|
| 277 |
+
assert "must be >= 0" in str(exc_info.value)
|
| 278 |
+
|
| 279 |
+
def test_subtitle_creator_with_invalid_time_range(self, temp_video_file):
|
| 280 |
+
"""Test subtitle_creator with end time before start time."""
|
| 281 |
+
transcript = json.dumps(
|
| 282 |
+
{"subtitles": [{"start": 5.0, "end": 2.0, "text": "Invalid range"}]}
|
| 283 |
+
)
|
| 284 |
+
|
| 285 |
+
with (patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,):
|
| 286 |
+
mock_video = Mock()
|
| 287 |
+
mock_video.duration = 10.0
|
| 288 |
+
mock_video.size = (1920, 1080)
|
| 289 |
+
mock_video_clip.return_value = mock_video
|
| 290 |
+
|
| 291 |
+
with pytest.raises(ValueError) as exc_info:
|
| 292 |
+
subtitle_creator(temp_video_file, transcript)
|
| 293 |
+
assert "end time must be greater than start time" in str(exc_info.value)
|
| 294 |
+
|
| 295 |
+
def test_subtitle_creator_with_time_exceeding_duration(self, temp_video_file):
|
| 296 |
+
"""Test subtitle_creator with start time exceeding video duration."""
|
| 297 |
+
transcript = json.dumps(
|
| 298 |
+
{"subtitles": [{"start": 15.0, "end": 20.0, "text": "Beyond video"}]}
|
| 299 |
+
)
|
| 300 |
+
|
| 301 |
+
with (patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,):
|
| 302 |
+
mock_video = Mock()
|
| 303 |
+
mock_video.duration = 10.0
|
| 304 |
+
mock_video.size = (1920, 1080)
|
| 305 |
+
mock_video_clip.return_value = mock_video
|
| 306 |
+
|
| 307 |
+
with pytest.raises(ValueError) as exc_info:
|
| 308 |
+
subtitle_creator(temp_video_file, transcript)
|
| 309 |
+
assert "exceeds video duration" in str(exc_info.value)
|
| 310 |
+
|
| 311 |
+
def test_subtitle_creator_clamps_end_time(self, temp_video_file):
|
| 312 |
+
"""Test that end time is clamped to video duration."""
|
| 313 |
+
transcript = json.dumps(
|
| 314 |
+
{"subtitles": [{"start": 8.0, "end": 15.0, "text": "End beyond duration"}]}
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
+
with (
|
| 318 |
+
patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,
|
| 319 |
+
patch("app.tools.subtitle_creator.TextClip") as mock_text_clip,
|
| 320 |
+
patch("app.tools.subtitle_creator.CompositeVideoClip") as mock_composite,
|
| 321 |
+
):
|
| 322 |
+
mock_video = Mock()
|
| 323 |
+
mock_video.duration = 10.0
|
| 324 |
+
mock_video.size = (1920, 1080)
|
| 325 |
+
mock_video.fps = 30.0
|
| 326 |
+
mock_video_clip.return_value = mock_video
|
| 327 |
+
|
| 328 |
+
mock_text = Mock()
|
| 329 |
+
mock_text.with_start.return_value = mock_text
|
| 330 |
+
mock_text.with_end.return_value = mock_text
|
| 331 |
+
mock_text.with_position.return_value = mock_text
|
| 332 |
+
mock_text_clip.return_value = mock_text
|
| 333 |
+
|
| 334 |
+
mock_final = Mock()
|
| 335 |
+
mock_composite.return_value = mock_final
|
| 336 |
+
|
| 337 |
+
result = subtitle_creator(temp_video_file, transcript)
|
| 338 |
+
|
| 339 |
+
# Verify end time was clamped by checking with_end was called with video duration
|
| 340 |
+
mock_text.with_end.assert_called_with(10.0)
|
| 341 |
+
|
| 342 |
+
def test_subtitle_creator_with_different_positions(self, temp_video_file):
|
| 343 |
+
"""Test subtitle_creator with different position options."""
|
| 344 |
+
transcript = json.dumps(
|
| 345 |
+
{
|
| 346 |
+
"subtitles": [
|
| 347 |
+
{"start": 0.0, "end": 1.0, "text": "Bottom", "position": "bottom"},
|
| 348 |
+
{"start": 1.0, "end": 2.0, "text": "Top", "position": "top"},
|
| 349 |
+
{"start": 2.0, "end": 3.0, "text": "Center", "position": "center"},
|
| 350 |
+
{
|
| 351 |
+
"start": 3.0,
|
| 352 |
+
"end": 4.0,
|
| 353 |
+
"text": "Custom",
|
| 354 |
+
"position": [100, 200],
|
| 355 |
+
},
|
| 356 |
+
]
|
| 357 |
+
}
|
| 358 |
+
)
|
| 359 |
+
|
| 360 |
+
with (
|
| 361 |
+
patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,
|
| 362 |
+
patch("app.tools.subtitle_creator.TextClip") as mock_text_clip,
|
| 363 |
+
patch("app.tools.subtitle_creator.CompositeVideoClip") as mock_composite,
|
| 364 |
+
):
|
| 365 |
+
mock_video = Mock()
|
| 366 |
+
mock_video.duration = 10.0
|
| 367 |
+
mock_video.size = (1920, 1080)
|
| 368 |
+
mock_video.fps = 30.0
|
| 369 |
+
mock_video_clip.return_value = mock_video
|
| 370 |
+
|
| 371 |
+
mock_text = Mock()
|
| 372 |
+
mock_text.with_start.return_value = mock_text
|
| 373 |
+
mock_text.with_end.return_value = mock_text
|
| 374 |
+
mock_text.with_position.return_value = mock_text
|
| 375 |
+
mock_text_clip.return_value = mock_text
|
| 376 |
+
|
| 377 |
+
mock_final = Mock()
|
| 378 |
+
mock_composite.return_value = mock_final
|
| 379 |
+
|
| 380 |
+
result = subtitle_creator(temp_video_file, transcript)
|
| 381 |
+
|
| 382 |
+
# Verify 4 text clips were created
|
| 383 |
+
assert mock_text_clip.call_count == 4
|
| 384 |
+
|
| 385 |
+
# Verify with_position was called 4 times with different positions
|
| 386 |
+
assert mock_text.with_position.call_count == 4
|
| 387 |
+
|
| 388 |
+
def test_subtitle_creator_with_stroke_styling(self, temp_video_file):
|
| 389 |
+
"""Test subtitle_creator with stroke/outline styling."""
|
| 390 |
+
transcript = json.dumps(
|
| 391 |
+
{
|
| 392 |
+
"subtitles": [
|
| 393 |
+
{
|
| 394 |
+
"start": 0.0,
|
| 395 |
+
"end": 2.0,
|
| 396 |
+
"text": "Outlined text",
|
| 397 |
+
"stroke_color": "black",
|
| 398 |
+
"stroke_width": 3,
|
| 399 |
+
}
|
| 400 |
+
]
|
| 401 |
+
}
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
with (
|
| 405 |
+
patch("app.tools.subtitle_creator.VideoFileClip") as mock_video_clip,
|
| 406 |
+
patch("app.tools.subtitle_creator.TextClip") as mock_text_clip,
|
| 407 |
+
patch("app.tools.subtitle_creator.CompositeVideoClip") as mock_composite,
|
| 408 |
+
):
|
| 409 |
+
mock_video = Mock()
|
| 410 |
+
mock_video.duration = 10.0
|
| 411 |
+
mock_video.size = (1920, 1080)
|
| 412 |
+
mock_video.fps = 30.0
|
| 413 |
+
mock_video_clip.return_value = mock_video
|
| 414 |
+
|
| 415 |
+
mock_text = Mock()
|
| 416 |
+
mock_text.with_start.return_value = mock_text
|
| 417 |
+
mock_text.with_end.return_value = mock_text
|
| 418 |
+
mock_text.with_position.return_value = mock_text
|
| 419 |
+
mock_text_clip.return_value = mock_text
|
| 420 |
+
|
| 421 |
+
mock_final = Mock()
|
| 422 |
+
mock_composite.return_value = mock_final
|
| 423 |
+
|
| 424 |
+
result = subtitle_creator(temp_video_file, transcript)
|
| 425 |
+
|
| 426 |
+
# Verify stroke parameters were passed
|
| 427 |
+
call_kwargs = mock_text_clip.call_args[1]
|
| 428 |
+
assert call_kwargs["stroke_color"] == "black"
|
| 429 |
+
assert call_kwargs["stroke_width"] == 3
|
| 430 |
+
|
| 431 |
+
def test_subtitle_creator_with_invalid_video_input(self, simple_transcript):
|
| 432 |
+
"""Test subtitle_creator with invalid video input type."""
|
| 433 |
+
with pytest.raises(ValueError) as exc_info:
|
| 434 |
+
subtitle_creator(12345, simple_transcript) # type: ignore
|
| 435 |
+
assert "Invalid video input format" in str(exc_info.value)
|
tests/test_text_to_speech.py
ADDED
|
@@ -0,0 +1,638 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for the Text-to-Speech converter.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
import pytest
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from unittest.mock import Mock, patch, MagicMock
|
| 10 |
+
from src.app.tools.text_to_speech import text_to_speech_simple
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class TestTextToSpeechSimple:
|
| 14 |
+
"""Test suite for the simple text-to-speech function using gTTS."""
|
| 15 |
+
|
| 16 |
+
def test_basic_text_to_speech(self, tmp_path, monkeypatch):
|
| 17 |
+
"""Test basic text-to-speech conversion."""
|
| 18 |
+
# Mock gTTS
|
| 19 |
+
mock_tts = Mock()
|
| 20 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 21 |
+
|
| 22 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 23 |
+
# Change output directory to tmp_path
|
| 24 |
+
monkeypatch.setattr(
|
| 25 |
+
"src.app.tools.text_to_speech.Path",
|
| 26 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
result = text_to_speech_simple("Hello, world!")
|
| 30 |
+
|
| 31 |
+
# Verify gTTS was called correctly with default parameters
|
| 32 |
+
mock_gtts_class.assert_called_once_with(
|
| 33 |
+
text="Hello, world!", lang="en", slow=False, tld="com"
|
| 34 |
+
)
|
| 35 |
+
mock_tts.save.assert_called_once()
|
| 36 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 37 |
+
|
| 38 |
+
def test_srt_subtitle_conversion(self, tmp_path, monkeypatch):
|
| 39 |
+
"""Test conversion of SRT subtitle format."""
|
| 40 |
+
srt_content = """1
|
| 41 |
+
00:00:00,000 --> 00:00:03,500
|
| 42 |
+
Welcome to our video.
|
| 43 |
+
|
| 44 |
+
2
|
| 45 |
+
00:00:03,500 --> 00:00:07,000
|
| 46 |
+
Today we will learn something new."""
|
| 47 |
+
|
| 48 |
+
mock_tts = Mock()
|
| 49 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 50 |
+
|
| 51 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 52 |
+
monkeypatch.setattr(
|
| 53 |
+
"src.app.tools.text_to_speech.Path",
|
| 54 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
result = text_to_speech_simple(srt_content, format_type="srt")
|
| 58 |
+
|
| 59 |
+
# Check that dialogues were combined
|
| 60 |
+
call_args = mock_gtts_class.call_args
|
| 61 |
+
combined_text = call_args[1]["text"]
|
| 62 |
+
assert "Welcome to our video" in combined_text
|
| 63 |
+
assert "Today we will learn" in combined_text
|
| 64 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 65 |
+
|
| 66 |
+
def test_vtt_subtitle_conversion(self, tmp_path, monkeypatch):
|
| 67 |
+
"""Test conversion of VTT subtitle format."""
|
| 68 |
+
vtt_content = """WEBVTT
|
| 69 |
+
|
| 70 |
+
00:00:00.000 --> 00:00:03.500
|
| 71 |
+
Hello world.
|
| 72 |
+
|
| 73 |
+
00:00:03.500 --> 00:00:07.000
|
| 74 |
+
Welcome to the tutorial."""
|
| 75 |
+
|
| 76 |
+
mock_tts = Mock()
|
| 77 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 78 |
+
|
| 79 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 80 |
+
monkeypatch.setattr(
|
| 81 |
+
"src.app.tools.text_to_speech.Path",
|
| 82 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
result = text_to_speech_simple(vtt_content, format_type="vtt")
|
| 86 |
+
|
| 87 |
+
call_args = mock_gtts_class.call_args
|
| 88 |
+
combined_text = call_args[1]["text"]
|
| 89 |
+
assert "Hello world" in combined_text
|
| 90 |
+
assert "Welcome to the tutorial" in combined_text
|
| 91 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 92 |
+
|
| 93 |
+
def test_json_subtitle_conversion(self, tmp_path, monkeypatch):
|
| 94 |
+
"""Test conversion of JSON scenario format."""
|
| 95 |
+
json_content = (
|
| 96 |
+
'{"scenes": [{"dialogue": "First line"}, {"dialogue": "Second line"}]}'
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
mock_tts = Mock()
|
| 100 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 101 |
+
|
| 102 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 103 |
+
monkeypatch.setattr(
|
| 104 |
+
"src.app.tools.text_to_speech.Path",
|
| 105 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
result = text_to_speech_simple(json_content, format_type="json")
|
| 109 |
+
|
| 110 |
+
call_args = mock_gtts_class.call_args
|
| 111 |
+
combined_text = call_args[1]["text"]
|
| 112 |
+
assert "First line" in combined_text
|
| 113 |
+
assert "Second line" in combined_text
|
| 114 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 115 |
+
|
| 116 |
+
def test_auto_detect_srt(self, tmp_path, monkeypatch):
|
| 117 |
+
"""Test auto-detection of SRT format."""
|
| 118 |
+
srt_content = """1
|
| 119 |
+
00:00:00,000 --> 00:00:03,500
|
| 120 |
+
Auto-detected SRT."""
|
| 121 |
+
|
| 122 |
+
mock_tts = Mock()
|
| 123 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 124 |
+
|
| 125 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 126 |
+
monkeypatch.setattr(
|
| 127 |
+
"src.app.tools.text_to_speech.Path",
|
| 128 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
result = text_to_speech_simple(srt_content, format_type="auto")
|
| 132 |
+
|
| 133 |
+
mock_gtts_class.assert_called_once()
|
| 134 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 135 |
+
|
| 136 |
+
def test_auto_detect_vtt(self, tmp_path, monkeypatch):
|
| 137 |
+
"""Test auto-detection of VTT format."""
|
| 138 |
+
vtt_content = """WEBVTT
|
| 139 |
+
|
| 140 |
+
00:00:00.000 --> 00:00:03.500
|
| 141 |
+
Auto-detected VTT."""
|
| 142 |
+
|
| 143 |
+
mock_tts = Mock()
|
| 144 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 145 |
+
|
| 146 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 147 |
+
monkeypatch.setattr(
|
| 148 |
+
"src.app.tools.text_to_speech.Path",
|
| 149 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
result = text_to_speech_simple(vtt_content, format_type="auto")
|
| 153 |
+
|
| 154 |
+
mock_gtts_class.assert_called_once()
|
| 155 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 156 |
+
|
| 157 |
+
def test_empty_text_raises_error(self):
|
| 158 |
+
"""Test that empty text raises ValueError."""
|
| 159 |
+
with pytest.raises(ValueError, match="Text cannot be empty"):
|
| 160 |
+
text_to_speech_simple("")
|
| 161 |
+
|
| 162 |
+
def test_whitespace_only_text_raises_error(self):
|
| 163 |
+
"""Test that whitespace-only text raises ValueError."""
|
| 164 |
+
with pytest.raises(ValueError, match="Text cannot be empty"):
|
| 165 |
+
text_to_speech_simple(" ")
|
| 166 |
+
|
| 167 |
+
def test_long_text_conversion(self, tmp_path, monkeypatch):
|
| 168 |
+
"""Test conversion of longer text."""
|
| 169 |
+
long_text = "This is a longer piece of text. " * 10
|
| 170 |
+
|
| 171 |
+
mock_tts = Mock()
|
| 172 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 173 |
+
|
| 174 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 175 |
+
monkeypatch.setattr(
|
| 176 |
+
"src.app.tools.text_to_speech.Path",
|
| 177 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
result = text_to_speech_simple(long_text)
|
| 181 |
+
|
| 182 |
+
mock_gtts_class.assert_called_once()
|
| 183 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 184 |
+
|
| 185 |
+
def test_gtts_not_installed(self):
|
| 186 |
+
"""Test behavior when gTTS is not installed."""
|
| 187 |
+
# Mock the entire import to fail
|
| 188 |
+
with patch.dict("sys.modules", {"gtts": None}):
|
| 189 |
+
with patch(
|
| 190 |
+
"builtins.__import__", side_effect=ImportError("No module named 'gtts'")
|
| 191 |
+
):
|
| 192 |
+
result = text_to_speech_simple("test")
|
| 193 |
+
assert "Please install gTTS" in result
|
| 194 |
+
|
| 195 |
+
def test_gtts_save_error(self):
|
| 196 |
+
"""Test handling of gTTS save errors."""
|
| 197 |
+
mock_tts = Mock()
|
| 198 |
+
mock_tts.save.side_effect = Exception("Save failed")
|
| 199 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 200 |
+
|
| 201 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 202 |
+
with pytest.raises(RuntimeError, match="Failed to generate audio"):
|
| 203 |
+
text_to_speech_simple("test")
|
| 204 |
+
|
| 205 |
+
def test_special_characters_in_text(self, tmp_path, monkeypatch):
|
| 206 |
+
"""Test text with special characters."""
|
| 207 |
+
text_with_special = "Hello! How are you? I'm fine, thanks. 😊"
|
| 208 |
+
|
| 209 |
+
mock_tts = Mock()
|
| 210 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 211 |
+
|
| 212 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 213 |
+
monkeypatch.setattr(
|
| 214 |
+
"src.app.tools.text_to_speech.Path",
|
| 215 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
+
result = text_to_speech_simple(text_with_special)
|
| 219 |
+
|
| 220 |
+
mock_gtts_class.assert_called_once_with(
|
| 221 |
+
text=text_with_special, lang="en", slow=False, tld="com"
|
| 222 |
+
)
|
| 223 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 224 |
+
|
| 225 |
+
def test_multiline_text(self, tmp_path, monkeypatch):
|
| 226 |
+
"""Test text with multiple lines."""
|
| 227 |
+
multiline_text = """Line one.
|
| 228 |
+
Line two.
|
| 229 |
+
Line three."""
|
| 230 |
+
|
| 231 |
+
mock_tts = Mock()
|
| 232 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 233 |
+
|
| 234 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 235 |
+
monkeypatch.setattr(
|
| 236 |
+
"src.app.tools.text_to_speech.Path",
|
| 237 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 238 |
+
)
|
| 239 |
+
|
| 240 |
+
result = text_to_speech_simple(multiline_text)
|
| 241 |
+
|
| 242 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 243 |
+
|
| 244 |
+
def test_output_directory_creation(self, tmp_path, monkeypatch):
|
| 245 |
+
"""Test that output directory is created if it doesn't exist."""
|
| 246 |
+
output_dir = tmp_path / "outputs" / "audio"
|
| 247 |
+
|
| 248 |
+
mock_tts = Mock()
|
| 249 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 250 |
+
|
| 251 |
+
def mock_path(path_str):
|
| 252 |
+
if "outputs/audio" in str(path_str):
|
| 253 |
+
return output_dir
|
| 254 |
+
return Path(path_str)
|
| 255 |
+
|
| 256 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 257 |
+
with patch("src.app.tools.text_to_speech.Path", side_effect=mock_path):
|
| 258 |
+
text_to_speech_simple("test")
|
| 259 |
+
|
| 260 |
+
# Verify directory would be created
|
| 261 |
+
assert mock_tts.save.called
|
| 262 |
+
|
| 263 |
+
def test_numbers_and_punctuation(self, tmp_path, monkeypatch):
|
| 264 |
+
"""Test text with numbers and various punctuation."""
|
| 265 |
+
text = "The year is 2024! Count: 1, 2, 3... Ready? Let's go!"
|
| 266 |
+
|
| 267 |
+
mock_tts = Mock()
|
| 268 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 269 |
+
|
| 270 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 271 |
+
monkeypatch.setattr(
|
| 272 |
+
"src.app.tools.text_to_speech.Path",
|
| 273 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
result = text_to_speech_simple(text)
|
| 277 |
+
|
| 278 |
+
mock_gtts_class.assert_called_once_with(
|
| 279 |
+
text=text, lang="en", slow=False, tld="com"
|
| 280 |
+
)
|
| 281 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 282 |
+
|
| 283 |
+
def test_male_voice_selection(self, tmp_path, monkeypatch):
|
| 284 |
+
"""Test male voice selection uses correct TLD."""
|
| 285 |
+
mock_tts = Mock()
|
| 286 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 287 |
+
|
| 288 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 289 |
+
monkeypatch.setattr(
|
| 290 |
+
"src.app.tools.text_to_speech.Path",
|
| 291 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 292 |
+
)
|
| 293 |
+
|
| 294 |
+
result = text_to_speech_simple("Hello", voice="male")
|
| 295 |
+
|
| 296 |
+
mock_gtts_class.assert_called_once_with(
|
| 297 |
+
text="Hello",
|
| 298 |
+
lang="en",
|
| 299 |
+
slow=False,
|
| 300 |
+
tld="co.uk", # British English for male voice
|
| 301 |
+
)
|
| 302 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 303 |
+
|
| 304 |
+
def test_female_voice_selection(self, tmp_path, monkeypatch):
|
| 305 |
+
"""Test female voice selection uses correct TLD."""
|
| 306 |
+
mock_tts = Mock()
|
| 307 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 308 |
+
|
| 309 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 310 |
+
monkeypatch.setattr(
|
| 311 |
+
"src.app.tools.text_to_speech.Path",
|
| 312 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 313 |
+
)
|
| 314 |
+
|
| 315 |
+
result = text_to_speech_simple("Hello", voice="female")
|
| 316 |
+
|
| 317 |
+
mock_gtts_class.assert_called_once_with(
|
| 318 |
+
text="Hello",
|
| 319 |
+
lang="en",
|
| 320 |
+
slow=False,
|
| 321 |
+
tld="com.au", # Australian English for female voice
|
| 322 |
+
)
|
| 323 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 324 |
+
|
| 325 |
+
def test_neutral_voice_selection(self, tmp_path, monkeypatch):
|
| 326 |
+
"""Test neutral voice selection uses correct TLD."""
|
| 327 |
+
mock_tts = Mock()
|
| 328 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 329 |
+
|
| 330 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 331 |
+
monkeypatch.setattr(
|
| 332 |
+
"src.app.tools.text_to_speech.Path",
|
| 333 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 334 |
+
)
|
| 335 |
+
|
| 336 |
+
result = text_to_speech_simple("Hello", voice="neutral")
|
| 337 |
+
|
| 338 |
+
mock_gtts_class.assert_called_once_with(
|
| 339 |
+
text="Hello",
|
| 340 |
+
lang="en",
|
| 341 |
+
slow=False,
|
| 342 |
+
tld="com", # US English for neutral voice
|
| 343 |
+
)
|
| 344 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 345 |
+
|
| 346 |
+
def test_language_selection(self, tmp_path, monkeypatch):
|
| 347 |
+
"""Test different language selection."""
|
| 348 |
+
mock_tts = Mock()
|
| 349 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 350 |
+
|
| 351 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 352 |
+
monkeypatch.setattr(
|
| 353 |
+
"src.app.tools.text_to_speech.Path",
|
| 354 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 355 |
+
)
|
| 356 |
+
|
| 357 |
+
result = text_to_speech_simple("Hola", language="es")
|
| 358 |
+
|
| 359 |
+
mock_gtts_class.assert_called_once_with(
|
| 360 |
+
text="Hola", lang="es", slow=False, tld="com"
|
| 361 |
+
)
|
| 362 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 363 |
+
|
| 364 |
+
def test_slow_speed_selection(self, tmp_path, monkeypatch):
|
| 365 |
+
"""Test slow speed selection."""
|
| 366 |
+
mock_tts = Mock()
|
| 367 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 368 |
+
|
| 369 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 370 |
+
monkeypatch.setattr(
|
| 371 |
+
"src.app.tools.text_to_speech.Path",
|
| 372 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 373 |
+
)
|
| 374 |
+
|
| 375 |
+
result = text_to_speech_simple("Hello", speed="slow")
|
| 376 |
+
|
| 377 |
+
mock_gtts_class.assert_called_once_with(
|
| 378 |
+
text="Hello", lang="en", slow=True, tld="com" # Slow speed enabled
|
| 379 |
+
)
|
| 380 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 381 |
+
|
| 382 |
+
def test_combined_options(self, tmp_path, monkeypatch):
|
| 383 |
+
"""Test combining voice, language, and speed options."""
|
| 384 |
+
mock_tts = Mock()
|
| 385 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 386 |
+
|
| 387 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 388 |
+
monkeypatch.setattr(
|
| 389 |
+
"src.app.tools.text_to_speech.Path",
|
| 390 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 391 |
+
)
|
| 392 |
+
|
| 393 |
+
result = text_to_speech_simple(
|
| 394 |
+
"Bonjour", voice="female", language="fr", speed="slow"
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
+
mock_gtts_class.assert_called_once_with(
|
| 398 |
+
text="Bonjour", lang="fr", slow=True, tld="com.au"
|
| 399 |
+
)
|
| 400 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 401 |
+
|
| 402 |
+
|
| 403 |
+
class TestTimedAudioSegments:
|
| 404 |
+
"""Test suite for timed audio segment generation."""
|
| 405 |
+
|
| 406 |
+
def test_srt_timed_segments(self, tmp_path, monkeypatch):
|
| 407 |
+
"""Test generating timed audio segments from SRT format."""
|
| 408 |
+
srt_content = """1
|
| 409 |
+
00:00:00,000 --> 00:00:03,500
|
| 410 |
+
Welcome to our video.
|
| 411 |
+
|
| 412 |
+
2
|
| 413 |
+
00:00:03,500 --> 00:00:07,000
|
| 414 |
+
Today we will learn something new."""
|
| 415 |
+
|
| 416 |
+
mock_tts = Mock()
|
| 417 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 418 |
+
|
| 419 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 420 |
+
monkeypatch.setattr(
|
| 421 |
+
"src.app.tools.text_to_speech.Path",
|
| 422 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 423 |
+
)
|
| 424 |
+
|
| 425 |
+
result = text_to_speech_simple(
|
| 426 |
+
srt_content, format_type="srt", generate_segments=True
|
| 427 |
+
)
|
| 428 |
+
|
| 429 |
+
# Parse JSON result
|
| 430 |
+
result_data = json.loads(result)
|
| 431 |
+
|
| 432 |
+
# Verify structure
|
| 433 |
+
assert "segments" in result_data
|
| 434 |
+
assert len(result_data["segments"]) == 2
|
| 435 |
+
|
| 436 |
+
# Check first segment
|
| 437 |
+
segment1 = result_data["segments"][0]
|
| 438 |
+
assert segment1["segment_id"] == 1
|
| 439 |
+
assert segment1["start_time"] == 0.0
|
| 440 |
+
assert segment1["end_time"] == 3.5
|
| 441 |
+
assert segment1["duration"] == 3.5
|
| 442 |
+
assert segment1["dialogue"] == "Welcome to our video."
|
| 443 |
+
assert "segment_1.mp3" in segment1["audio_file"]
|
| 444 |
+
|
| 445 |
+
# Check second segment
|
| 446 |
+
segment2 = result_data["segments"][1]
|
| 447 |
+
assert segment2["segment_id"] == 2
|
| 448 |
+
assert segment2["start_time"] == 3.5
|
| 449 |
+
assert segment2["end_time"] == 7.0
|
| 450 |
+
assert segment2["duration"] == 3.5
|
| 451 |
+
assert segment2["dialogue"] == "Today we will learn something new."
|
| 452 |
+
assert "segment_2.mp3" in segment2["audio_file"]
|
| 453 |
+
|
| 454 |
+
# Verify gTTS was called twice (once per segment)
|
| 455 |
+
assert mock_gtts_class.call_count == 2
|
| 456 |
+
|
| 457 |
+
def test_vtt_timed_segments(self, tmp_path, monkeypatch):
|
| 458 |
+
"""Test generating timed audio segments from VTT format."""
|
| 459 |
+
vtt_content = """WEBVTT
|
| 460 |
+
|
| 461 |
+
1
|
| 462 |
+
00:00:00.000 --> 00:00:02.500
|
| 463 |
+
First subtitle here.
|
| 464 |
+
|
| 465 |
+
2
|
| 466 |
+
00:00:02.500 --> 00:00:05.000
|
| 467 |
+
Second subtitle here."""
|
| 468 |
+
|
| 469 |
+
mock_tts = Mock()
|
| 470 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 471 |
+
|
| 472 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 473 |
+
monkeypatch.setattr(
|
| 474 |
+
"src.app.tools.text_to_speech.Path",
|
| 475 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 476 |
+
)
|
| 477 |
+
|
| 478 |
+
result = text_to_speech_simple(
|
| 479 |
+
vtt_content, format_type="vtt", generate_segments=True
|
| 480 |
+
)
|
| 481 |
+
|
| 482 |
+
# Parse JSON result
|
| 483 |
+
result_data = json.loads(result)
|
| 484 |
+
|
| 485 |
+
# Verify structure
|
| 486 |
+
assert "segments" in result_data
|
| 487 |
+
assert len(result_data["segments"]) == 2
|
| 488 |
+
|
| 489 |
+
# Check timing
|
| 490 |
+
segment1 = result_data["segments"][0]
|
| 491 |
+
assert segment1["start_time"] == 0.0
|
| 492 |
+
assert segment1["end_time"] == 2.5
|
| 493 |
+
assert segment1["dialogue"] == "First subtitle here."
|
| 494 |
+
|
| 495 |
+
segment2 = result_data["segments"][1]
|
| 496 |
+
assert segment2["start_time"] == 2.5
|
| 497 |
+
assert segment2["end_time"] == 5.0
|
| 498 |
+
assert segment2["dialogue"] == "Second subtitle here."
|
| 499 |
+
|
| 500 |
+
def test_json_timed_segments(self, tmp_path, monkeypatch):
|
| 501 |
+
"""Test generating timed audio segments from JSON format."""
|
| 502 |
+
json_content = json.dumps(
|
| 503 |
+
{
|
| 504 |
+
"scenes": [
|
| 505 |
+
{
|
| 506 |
+
"scene_id": 1,
|
| 507 |
+
"start_time": 0.0,
|
| 508 |
+
"end_time": 4.0,
|
| 509 |
+
"dialogue": "Scene one dialogue.",
|
| 510 |
+
},
|
| 511 |
+
{
|
| 512 |
+
"scene_id": 2,
|
| 513 |
+
"start_time": 4.0,
|
| 514 |
+
"duration": 3.0,
|
| 515 |
+
"dialogue": "Scene two dialogue.",
|
| 516 |
+
},
|
| 517 |
+
]
|
| 518 |
+
}
|
| 519 |
+
)
|
| 520 |
+
|
| 521 |
+
mock_tts = Mock()
|
| 522 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 523 |
+
|
| 524 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 525 |
+
monkeypatch.setattr(
|
| 526 |
+
"src.app.tools.text_to_speech.Path",
|
| 527 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 528 |
+
)
|
| 529 |
+
|
| 530 |
+
result = text_to_speech_simple(
|
| 531 |
+
json_content, format_type="json", generate_segments=True
|
| 532 |
+
)
|
| 533 |
+
|
| 534 |
+
# Parse JSON result
|
| 535 |
+
result_data = json.loads(result)
|
| 536 |
+
|
| 537 |
+
# Verify structure
|
| 538 |
+
assert "segments" in result_data
|
| 539 |
+
assert len(result_data["segments"]) == 2
|
| 540 |
+
|
| 541 |
+
# Check first segment
|
| 542 |
+
segment1 = result_data["segments"][0]
|
| 543 |
+
assert segment1["start_time"] == 0.0
|
| 544 |
+
assert segment1["end_time"] == 4.0
|
| 545 |
+
assert segment1["duration"] == 4.0
|
| 546 |
+
assert segment1["dialogue"] == "Scene one dialogue."
|
| 547 |
+
|
| 548 |
+
# Check second segment (end_time calculated from duration)
|
| 549 |
+
segment2 = result_data["segments"][1]
|
| 550 |
+
assert segment2["start_time"] == 4.0
|
| 551 |
+
assert segment2["end_time"] == 7.0
|
| 552 |
+
assert segment2["duration"] == 3.0
|
| 553 |
+
assert segment2["dialogue"] == "Scene two dialogue."
|
| 554 |
+
|
| 555 |
+
def test_auto_detect_srt_with_segments(self, tmp_path, monkeypatch):
|
| 556 |
+
"""Test auto-detection of SRT format with segment generation."""
|
| 557 |
+
srt_content = """1
|
| 558 |
+
00:00:00,000 --> 00:00:02,000
|
| 559 |
+
Auto-detected SRT."""
|
| 560 |
+
|
| 561 |
+
mock_tts = Mock()
|
| 562 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 563 |
+
|
| 564 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 565 |
+
monkeypatch.setattr(
|
| 566 |
+
"src.app.tools.text_to_speech.Path",
|
| 567 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 568 |
+
)
|
| 569 |
+
|
| 570 |
+
result = text_to_speech_simple(
|
| 571 |
+
srt_content, format_type="auto", generate_segments=True
|
| 572 |
+
)
|
| 573 |
+
|
| 574 |
+
# Parse JSON result
|
| 575 |
+
result_data = json.loads(result)
|
| 576 |
+
|
| 577 |
+
# Verify auto-detection worked
|
| 578 |
+
assert "segments" in result_data
|
| 579 |
+
assert len(result_data["segments"]) == 1
|
| 580 |
+
assert result_data["segments"][0]["dialogue"] == "Auto-detected SRT."
|
| 581 |
+
|
| 582 |
+
def test_plain_text_with_segments_returns_single_file(self, tmp_path, monkeypatch):
|
| 583 |
+
"""Test that plain text with generate_segments=True returns single file path."""
|
| 584 |
+
mock_tts = Mock()
|
| 585 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 586 |
+
|
| 587 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 588 |
+
monkeypatch.setattr(
|
| 589 |
+
"src.app.tools.text_to_speech.Path",
|
| 590 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 591 |
+
)
|
| 592 |
+
|
| 593 |
+
result = text_to_speech_simple(
|
| 594 |
+
"Plain text", format_type="text", generate_segments=True
|
| 595 |
+
)
|
| 596 |
+
|
| 597 |
+
# Plain text should return file path, not JSON
|
| 598 |
+
assert "generated_speech.mp3" in result or result.endswith(".mp3")
|
| 599 |
+
assert not result.startswith("{")
|
| 600 |
+
|
| 601 |
+
def test_empty_subtitle_segments(self, tmp_path, monkeypatch):
|
| 602 |
+
"""Test handling of empty dialogue in timed segments (uses placeholder)."""
|
| 603 |
+
json_content = json.dumps(
|
| 604 |
+
{
|
| 605 |
+
"scenes": [
|
| 606 |
+
{"scene_id": 1, "start_time": 0.0, "end_time": 2.0, "dialogue": ""},
|
| 607 |
+
{
|
| 608 |
+
"scene_id": 2,
|
| 609 |
+
"start_time": 2.0,
|
| 610 |
+
"end_time": 4.0,
|
| 611 |
+
"dialogue": "Valid dialogue",
|
| 612 |
+
},
|
| 613 |
+
]
|
| 614 |
+
}
|
| 615 |
+
)
|
| 616 |
+
|
| 617 |
+
mock_tts = Mock()
|
| 618 |
+
mock_gtts_class = Mock(return_value=mock_tts)
|
| 619 |
+
|
| 620 |
+
with patch("gtts.gTTS", mock_gtts_class):
|
| 621 |
+
monkeypatch.setattr(
|
| 622 |
+
"src.app.tools.text_to_speech.Path",
|
| 623 |
+
lambda x: tmp_path / x if "outputs/audio" in x else Path(x),
|
| 624 |
+
)
|
| 625 |
+
|
| 626 |
+
result = text_to_speech_simple(
|
| 627 |
+
json_content, format_type="json", generate_segments=True
|
| 628 |
+
)
|
| 629 |
+
|
| 630 |
+
# Parse JSON result
|
| 631 |
+
result_data = json.loads(result)
|
| 632 |
+
|
| 633 |
+
# Both segments should be generated (empty dialogue gets placeholder)
|
| 634 |
+
assert len(result_data["segments"]) == 2
|
| 635 |
+
assert (
|
| 636 |
+
result_data["segments"][0]["dialogue"] == "Scene 1"
|
| 637 |
+
) # Placeholder for empty dialogue
|
| 638 |
+
assert result_data["segments"][1]["dialogue"] == "Valid dialogue"
|
tests/test_thumbnail_generator.py
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for thumbnail_generator tool.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import tempfile
|
| 7 |
+
import pytest
|
| 8 |
+
from unittest.mock import Mock, patch, MagicMock, mock_open
|
| 9 |
+
import sys
|
| 10 |
+
from io import BytesIO
|
| 11 |
+
|
| 12 |
+
# Add src to path to import modules
|
| 13 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
| 14 |
+
|
| 15 |
+
from app.tools.thumbnail_generator import thumbnail_generator
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class TestThumbnailGenerator:
|
| 19 |
+
"""Test cases for thumbnail_generator function."""
|
| 20 |
+
|
| 21 |
+
def test_thumbnail_generator_with_tuple_input(self, temp_image_file):
|
| 22 |
+
"""Test thumbnail_generator with tuple input (Gradio format)."""
|
| 23 |
+
with (
|
| 24 |
+
patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}),
|
| 25 |
+
patch("app.tools.thumbnail_generator.genai.Client") as mock_client,
|
| 26 |
+
patch("app.tools.thumbnail_generator.Image.open") as mock_image_open,
|
| 27 |
+
patch(
|
| 28 |
+
"app.tools.thumbnail_generator.mimetypes.guess_type"
|
| 29 |
+
) as mock_guess_type,
|
| 30 |
+
patch("builtins.open", mock_open(read_data=b"fake image data")),
|
| 31 |
+
):
|
| 32 |
+
|
| 33 |
+
mock_guess_type.return_value = ("image/png", None)
|
| 34 |
+
|
| 35 |
+
mock_genai_client = Mock()
|
| 36 |
+
mock_response = Mock()
|
| 37 |
+
mock_candidate = Mock()
|
| 38 |
+
mock_content = Mock()
|
| 39 |
+
mock_part = Mock()
|
| 40 |
+
mock_inline_data = Mock()
|
| 41 |
+
mock_inline_data.data = b"fake generated image data"
|
| 42 |
+
mock_part.inline_data = mock_inline_data
|
| 43 |
+
mock_content.parts = [mock_part]
|
| 44 |
+
mock_candidate.content = mock_content
|
| 45 |
+
mock_response.candidates = [mock_candidate]
|
| 46 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 47 |
+
mock_client.return_value = mock_genai_client
|
| 48 |
+
|
| 49 |
+
mock_image = Mock()
|
| 50 |
+
mock_image.convert.return_value = mock_image
|
| 51 |
+
mock_image_open.return_value = mock_image
|
| 52 |
+
|
| 53 |
+
image_input = (temp_image_file, "subtitle.srt")
|
| 54 |
+
summary = "An exciting adventure"
|
| 55 |
+
|
| 56 |
+
result = thumbnail_generator(image_input, summary)
|
| 57 |
+
|
| 58 |
+
assert os.path.isabs(result)
|
| 59 |
+
assert "thumbnail_" in result
|
| 60 |
+
assert result.endswith(".png")
|
| 61 |
+
|
| 62 |
+
def test_thumbnail_generator_without_output_path(self, temp_image_file):
|
| 63 |
+
"""Test thumbnail_generator generates output path when not provided."""
|
| 64 |
+
with (
|
| 65 |
+
patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}),
|
| 66 |
+
patch("app.tools.thumbnail_generator.genai.Client") as mock_client,
|
| 67 |
+
patch("app.tools.thumbnail_generator.Image.open") as mock_image_open,
|
| 68 |
+
patch(
|
| 69 |
+
"app.tools.thumbnail_generator.mimetypes.guess_type"
|
| 70 |
+
) as mock_guess_type,
|
| 71 |
+
patch("builtins.open", mock_open(read_data=b"fake image data")),
|
| 72 |
+
):
|
| 73 |
+
|
| 74 |
+
mock_guess_type.return_value = ("image/png", None)
|
| 75 |
+
|
| 76 |
+
mock_genai_client = Mock()
|
| 77 |
+
mock_response = Mock()
|
| 78 |
+
mock_candidate = Mock()
|
| 79 |
+
mock_content = Mock()
|
| 80 |
+
mock_part = Mock()
|
| 81 |
+
mock_inline_data = Mock()
|
| 82 |
+
mock_inline_data.data = b"fake generated image data"
|
| 83 |
+
mock_part.inline_data = mock_inline_data
|
| 84 |
+
mock_content.parts = [mock_part]
|
| 85 |
+
mock_candidate.content = mock_content
|
| 86 |
+
mock_response.candidates = [mock_candidate]
|
| 87 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 88 |
+
mock_client.return_value = mock_genai_client
|
| 89 |
+
|
| 90 |
+
mock_image = Mock()
|
| 91 |
+
mock_image.convert.return_value = mock_image
|
| 92 |
+
mock_image_open.return_value = mock_image
|
| 93 |
+
|
| 94 |
+
summary = "A dramatic moment"
|
| 95 |
+
|
| 96 |
+
result = thumbnail_generator(temp_image_file, summary)
|
| 97 |
+
|
| 98 |
+
assert os.path.isabs(result)
|
| 99 |
+
assert "thumbnail_" in result
|
| 100 |
+
assert result.endswith(".png")
|
| 101 |
+
|
| 102 |
+
def test_thumbnail_generator_invalid_input_format(self):
|
| 103 |
+
"""Test thumbnail_generator with invalid input format."""
|
| 104 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 105 |
+
with pytest.raises(Exception) as exc_info:
|
| 106 |
+
thumbnail_generator(123, "summary") # Invalid input type
|
| 107 |
+
|
| 108 |
+
assert "Invalid image input format" in str(exc_info.value)
|
| 109 |
+
|
| 110 |
+
def test_thumbnail_generator_file_not_found(self):
|
| 111 |
+
"""Test thumbnail_generator with non-existent file."""
|
| 112 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 113 |
+
with pytest.raises(Exception) as exc_info:
|
| 114 |
+
thumbnail_generator("/nonexistent/image.png", "summary")
|
| 115 |
+
|
| 116 |
+
assert "Image file not found" in str(exc_info.value)
|
| 117 |
+
|
| 118 |
+
def test_thumbnail_generator_without_api_key(self, temp_image_file):
|
| 119 |
+
"""Test thumbnail_generator raises error without API key."""
|
| 120 |
+
with patch.dict(os.environ, {}, clear=True):
|
| 121 |
+
with pytest.raises(Exception) as exc_info:
|
| 122 |
+
thumbnail_generator(temp_image_file, "summary")
|
| 123 |
+
|
| 124 |
+
assert "GOOGLE_API_KEY" in str(exc_info.value)
|
| 125 |
+
|
| 126 |
+
def test_thumbnail_generator_api_failure(self, temp_image_file):
|
| 127 |
+
"""Test thumbnail_generator handles API failures."""
|
| 128 |
+
with (
|
| 129 |
+
patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}),
|
| 130 |
+
patch("app.tools.thumbnail_generator.genai.Client") as mock_client,
|
| 131 |
+
patch(
|
| 132 |
+
"app.tools.thumbnail_generator.mimetypes.guess_type"
|
| 133 |
+
) as mock_guess_type,
|
| 134 |
+
patch("builtins.open", mock_open(read_data=b"fake image data")),
|
| 135 |
+
):
|
| 136 |
+
|
| 137 |
+
mock_guess_type.return_value = ("image/png", None)
|
| 138 |
+
|
| 139 |
+
mock_genai_client = Mock()
|
| 140 |
+
mock_genai_client.models.generate_content.side_effect = Exception(
|
| 141 |
+
"API Error"
|
| 142 |
+
)
|
| 143 |
+
mock_client.return_value = mock_genai_client
|
| 144 |
+
|
| 145 |
+
with pytest.raises(Exception) as exc_info:
|
| 146 |
+
thumbnail_generator(temp_image_file, "summary")
|
| 147 |
+
|
| 148 |
+
assert "Error generating thumbnail" in str(
|
| 149 |
+
exc_info.value
|
| 150 |
+
) or "API Error" in str(exc_info.value)
|
| 151 |
+
|
| 152 |
+
def test_thumbnail_generator_no_image_in_response(self, temp_image_file):
|
| 153 |
+
"""Test thumbnail_generator when API doesn't return image data."""
|
| 154 |
+
with (
|
| 155 |
+
patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}),
|
| 156 |
+
patch("app.tools.thumbnail_generator.genai.Client") as mock_client,
|
| 157 |
+
patch(
|
| 158 |
+
"app.tools.thumbnail_generator.mimetypes.guess_type"
|
| 159 |
+
) as mock_guess_type,
|
| 160 |
+
patch("builtins.open", mock_open(read_data=b"fake image data")),
|
| 161 |
+
):
|
| 162 |
+
|
| 163 |
+
mock_guess_type.return_value = ("image/png", None)
|
| 164 |
+
|
| 165 |
+
mock_genai_client = Mock()
|
| 166 |
+
mock_response = Mock()
|
| 167 |
+
mock_response.candidates = [] # No candidates
|
| 168 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 169 |
+
mock_client.return_value = mock_genai_client
|
| 170 |
+
|
| 171 |
+
with pytest.raises(Exception) as exc_info:
|
| 172 |
+
thumbnail_generator(temp_image_file, "summary")
|
| 173 |
+
|
| 174 |
+
assert "Failed to extract generated image" in str(
|
| 175 |
+
exc_info.value
|
| 176 |
+
) or "Error generating thumbnail" in str(exc_info.value)
|
| 177 |
+
|
| 178 |
+
def test_thumbnail_generator_creates_output_directory(
|
| 179 |
+
self, temp_image_file, temp_output_dir
|
| 180 |
+
):
|
| 181 |
+
"""Test thumbnail_generator creates output directory if it doesn't exist."""
|
| 182 |
+
with (
|
| 183 |
+
patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}),
|
| 184 |
+
patch("app.tools.thumbnail_generator.genai.Client") as mock_client,
|
| 185 |
+
patch("app.tools.thumbnail_generator.Image.open") as mock_image_open,
|
| 186 |
+
patch(
|
| 187 |
+
"app.tools.thumbnail_generator.mimetypes.guess_type"
|
| 188 |
+
) as mock_guess_type,
|
| 189 |
+
patch("builtins.open", mock_open(read_data=b"fake image data")),
|
| 190 |
+
):
|
| 191 |
+
|
| 192 |
+
mock_guess_type.return_value = ("image/png", None)
|
| 193 |
+
|
| 194 |
+
mock_genai_client = Mock()
|
| 195 |
+
mock_response = Mock()
|
| 196 |
+
mock_candidate = Mock()
|
| 197 |
+
mock_content = Mock()
|
| 198 |
+
mock_part = Mock()
|
| 199 |
+
mock_inline_data = Mock()
|
| 200 |
+
mock_inline_data.data = b"fake generated image data"
|
| 201 |
+
mock_part.inline_data = mock_inline_data
|
| 202 |
+
mock_content.parts = [mock_part]
|
| 203 |
+
mock_candidate.content = mock_content
|
| 204 |
+
mock_response.candidates = [mock_candidate]
|
| 205 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 206 |
+
mock_client.return_value = mock_genai_client
|
| 207 |
+
|
| 208 |
+
mock_image = Mock()
|
| 209 |
+
mock_image.convert.return_value = mock_image
|
| 210 |
+
mock_image_open.return_value = mock_image
|
| 211 |
+
|
| 212 |
+
output_dir = os.path.join(temp_output_dir, "nested", "path")
|
| 213 |
+
output_path = os.path.join(output_dir, "thumbnail.png")
|
| 214 |
+
|
| 215 |
+
result = thumbnail_generator(temp_image_file, "summary", output_path)
|
| 216 |
+
|
| 217 |
+
assert os.path.exists(output_dir)
|
| 218 |
+
assert os.path.isabs(result)
|
| 219 |
+
|
| 220 |
+
def test_thumbnail_generator_with_blob_format(self, temp_image_file):
|
| 221 |
+
"""Test thumbnail_generator handles blob format in response."""
|
| 222 |
+
with (
|
| 223 |
+
patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}),
|
| 224 |
+
patch("app.tools.thumbnail_generator.genai.Client") as mock_client,
|
| 225 |
+
patch("app.tools.thumbnail_generator.Image.open") as mock_image_open,
|
| 226 |
+
patch(
|
| 227 |
+
"app.tools.thumbnail_generator.mimetypes.guess_type"
|
| 228 |
+
) as mock_guess_type,
|
| 229 |
+
patch("builtins.open", mock_open(read_data=b"fake image data")),
|
| 230 |
+
):
|
| 231 |
+
|
| 232 |
+
mock_guess_type.return_value = ("image/png", None)
|
| 233 |
+
|
| 234 |
+
mock_genai_client = Mock()
|
| 235 |
+
mock_response = Mock()
|
| 236 |
+
mock_candidate = Mock()
|
| 237 |
+
mock_content = Mock()
|
| 238 |
+
mock_part = Mock()
|
| 239 |
+
# Test blob format instead of inline_data
|
| 240 |
+
mock_blob = Mock()
|
| 241 |
+
mock_blob.data = b"fake generated image data"
|
| 242 |
+
mock_part.blob = mock_blob
|
| 243 |
+
mock_part.inline_data = None
|
| 244 |
+
mock_content.parts = [mock_part]
|
| 245 |
+
mock_candidate.content = mock_content
|
| 246 |
+
mock_response.candidates = [mock_candidate]
|
| 247 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 248 |
+
mock_client.return_value = mock_genai_client
|
| 249 |
+
|
| 250 |
+
mock_image = Mock()
|
| 251 |
+
mock_image.convert.return_value = mock_image
|
| 252 |
+
mock_image_open.return_value = mock_image
|
| 253 |
+
|
| 254 |
+
result = thumbnail_generator(temp_image_file, "summary")
|
| 255 |
+
|
| 256 |
+
assert os.path.isabs(result)
|
| 257 |
+
assert result.endswith(".png")
|
| 258 |
+
|
| 259 |
+
def test_thumbnail_generator_api_fallback_without_response_modalities(
|
| 260 |
+
self, temp_image_file
|
| 261 |
+
):
|
| 262 |
+
"""Test thumbnail_generator falls back when response_modalities fails."""
|
| 263 |
+
with (
|
| 264 |
+
patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}),
|
| 265 |
+
patch("app.tools.thumbnail_generator.genai.Client") as mock_client,
|
| 266 |
+
patch("app.tools.thumbnail_generator.Image.open") as mock_image_open,
|
| 267 |
+
patch(
|
| 268 |
+
"app.tools.thumbnail_generator.mimetypes.guess_type"
|
| 269 |
+
) as mock_guess_type,
|
| 270 |
+
patch("builtins.open", mock_open(read_data=b"fake image data")),
|
| 271 |
+
):
|
| 272 |
+
|
| 273 |
+
mock_guess_type.return_value = ("image/png", None)
|
| 274 |
+
|
| 275 |
+
mock_genai_client = Mock()
|
| 276 |
+
mock_response = Mock()
|
| 277 |
+
mock_candidate = Mock()
|
| 278 |
+
mock_content = Mock()
|
| 279 |
+
mock_part = Mock()
|
| 280 |
+
mock_inline_data = Mock()
|
| 281 |
+
mock_inline_data.data = b"fake generated image data"
|
| 282 |
+
mock_part.inline_data = mock_inline_data
|
| 283 |
+
mock_content.parts = [mock_part]
|
| 284 |
+
mock_candidate.content = mock_content
|
| 285 |
+
mock_response.candidates = [mock_candidate]
|
| 286 |
+
|
| 287 |
+
# First call fails, second succeeds
|
| 288 |
+
mock_genai_client.models.generate_content.side_effect = [
|
| 289 |
+
Exception("response_modalities not supported"),
|
| 290 |
+
mock_response,
|
| 291 |
+
]
|
| 292 |
+
mock_client.return_value = mock_genai_client
|
| 293 |
+
|
| 294 |
+
mock_image = Mock()
|
| 295 |
+
mock_image.convert.return_value = mock_image
|
| 296 |
+
mock_image_open.return_value = mock_image
|
| 297 |
+
|
| 298 |
+
result = thumbnail_generator(temp_image_file, "summary")
|
| 299 |
+
|
| 300 |
+
assert os.path.isabs(result)
|
| 301 |
+
# Should have been called twice (first with response_modalities, second without)
|
| 302 |
+
assert mock_genai_client.models.generate_content.call_count == 2
|
| 303 |
+
|
| 304 |
+
def test_thumbnail_generator_mime_type_detection(self, temp_image_file):
|
| 305 |
+
"""Test thumbnail_generator handles different image MIME types."""
|
| 306 |
+
with (
|
| 307 |
+
patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}),
|
| 308 |
+
patch("app.tools.thumbnail_generator.genai.Client") as mock_client,
|
| 309 |
+
patch("app.tools.thumbnail_generator.Image.open") as mock_image_open,
|
| 310 |
+
patch(
|
| 311 |
+
"app.tools.thumbnail_generator.mimetypes.guess_type"
|
| 312 |
+
) as mock_guess_type,
|
| 313 |
+
patch("builtins.open", mock_open(read_data=b"fake image data")),
|
| 314 |
+
):
|
| 315 |
+
|
| 316 |
+
# Test with JPEG MIME type
|
| 317 |
+
mock_guess_type.return_value = ("image/jpeg", None)
|
| 318 |
+
|
| 319 |
+
mock_genai_client = Mock()
|
| 320 |
+
mock_response = Mock()
|
| 321 |
+
mock_candidate = Mock()
|
| 322 |
+
mock_content = Mock()
|
| 323 |
+
mock_part = Mock()
|
| 324 |
+
mock_inline_data = Mock()
|
| 325 |
+
mock_inline_data.data = b"fake generated image data"
|
| 326 |
+
mock_part.inline_data = mock_inline_data
|
| 327 |
+
mock_content.parts = [mock_part]
|
| 328 |
+
mock_candidate.content = mock_content
|
| 329 |
+
mock_response.candidates = [mock_candidate]
|
| 330 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 331 |
+
mock_client.return_value = mock_genai_client
|
| 332 |
+
|
| 333 |
+
mock_image = Mock()
|
| 334 |
+
mock_image.convert.return_value = mock_image
|
| 335 |
+
mock_image_open.return_value = mock_image
|
| 336 |
+
|
| 337 |
+
result = thumbnail_generator(temp_image_file, "summary")
|
| 338 |
+
|
| 339 |
+
assert os.path.isabs(result)
|
| 340 |
+
# Verify MIME type was used
|
| 341 |
+
mock_guess_type.assert_called_once()
|
tests/test_video_clipper.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for video_clipper tool.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import tempfile
|
| 7 |
+
import pytest
|
| 8 |
+
from unittest.mock import Mock, patch, MagicMock
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
# Add src to path to import modules
|
| 12 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
| 13 |
+
|
| 14 |
+
from app.tools.video_clipper import video_clipper
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class TestVideoClipper:
|
| 18 |
+
"""Test cases for video_clipper function."""
|
| 19 |
+
|
| 20 |
+
def test_video_clipper_with_tuple_input(self, temp_video_file, mock_video_duration):
|
| 21 |
+
"""Test video_clipper with tuple input (Gradio format)."""
|
| 22 |
+
with patch("app.tools.video_clipper.VideoFileClip") as mock_video_clip:
|
| 23 |
+
mock_video = Mock()
|
| 24 |
+
mock_video.duration = mock_video_duration
|
| 25 |
+
mock_clipped = Mock()
|
| 26 |
+
mock_video.subclipped.return_value = mock_clipped
|
| 27 |
+
mock_video_clip.return_value = mock_video
|
| 28 |
+
|
| 29 |
+
video_input = (temp_video_file, "subtitle.srt")
|
| 30 |
+
start_time = 0.0
|
| 31 |
+
end_time = 10.0
|
| 32 |
+
output_path = tempfile.mktemp(suffix=".mp4")
|
| 33 |
+
|
| 34 |
+
result = video_clipper(video_input, start_time, end_time, output_path)
|
| 35 |
+
|
| 36 |
+
assert os.path.isabs(result)
|
| 37 |
+
assert result == os.path.abspath(output_path)
|
| 38 |
+
mock_video_clip.assert_called_once_with(temp_video_file)
|
| 39 |
+
|
| 40 |
+
def test_video_clipper_without_output_path(
|
| 41 |
+
self, temp_video_file, mock_video_duration
|
| 42 |
+
):
|
| 43 |
+
"""Test video_clipper generates output path when not provided."""
|
| 44 |
+
with patch("app.tools.video_clipper.VideoFileClip") as mock_video_clip:
|
| 45 |
+
mock_video = Mock()
|
| 46 |
+
mock_video.duration = mock_video_duration
|
| 47 |
+
mock_clipped = Mock()
|
| 48 |
+
mock_video.subclipped.return_value = mock_clipped
|
| 49 |
+
mock_video_clip.return_value = mock_video
|
| 50 |
+
|
| 51 |
+
start_time = 2.5
|
| 52 |
+
end_time = 7.5
|
| 53 |
+
|
| 54 |
+
result = video_clipper(temp_video_file, start_time, end_time)
|
| 55 |
+
|
| 56 |
+
assert os.path.isabs(result)
|
| 57 |
+
assert result.endswith(".mp4") # Verify extension preserved
|
| 58 |
+
assert "clipped" in result.lower() or os.path.basename(result).startswith(
|
| 59 |
+
"clipped_"
|
| 60 |
+
)
|
| 61 |
+
mock_clipped.write_videofile.assert_called_once()
|
| 62 |
+
|
| 63 |
+
def test_video_clipper_invalid_input_format(self):
|
| 64 |
+
"""Test video_clipper with invalid input format."""
|
| 65 |
+
with pytest.raises(Exception) as exc_info:
|
| 66 |
+
video_clipper(123, 0.0, 10.0) # Invalid input type
|
| 67 |
+
|
| 68 |
+
assert "Invalid video input format" in str(exc_info.value)
|
| 69 |
+
|
| 70 |
+
def test_video_clipper_file_not_found(self):
|
| 71 |
+
"""Test video_clipper with non-existent file."""
|
| 72 |
+
with pytest.raises(Exception) as exc_info:
|
| 73 |
+
video_clipper("/nonexistent/video.mp4", 0.0, 10.0)
|
| 74 |
+
|
| 75 |
+
assert "Video file not found" in str(exc_info.value)
|
| 76 |
+
|
| 77 |
+
def test_video_clipper_negative_start_time(self, temp_video_file):
|
| 78 |
+
"""Test video_clipper with negative start time."""
|
| 79 |
+
with pytest.raises(Exception) as exc_info:
|
| 80 |
+
video_clipper(temp_video_file, -1.0, 10.0)
|
| 81 |
+
|
| 82 |
+
assert "Start time must be >= 0" in str(exc_info.value)
|
| 83 |
+
|
| 84 |
+
def test_video_clipper_end_time_less_than_start(self, temp_video_file):
|
| 85 |
+
"""Test video_clipper with end time less than start time."""
|
| 86 |
+
with pytest.raises(Exception) as exc_info:
|
| 87 |
+
video_clipper(temp_video_file, 10.0, 5.0)
|
| 88 |
+
|
| 89 |
+
assert "End time must be greater than start time" in str(exc_info.value)
|
| 90 |
+
|
| 91 |
+
def test_video_clipper_start_time_exceeds_duration(
|
| 92 |
+
self, temp_video_file, mock_video_duration
|
| 93 |
+
):
|
| 94 |
+
"""Test video_clipper when start time exceeds video duration."""
|
| 95 |
+
with patch("app.tools.video_clipper.VideoFileClip") as mock_video_clip:
|
| 96 |
+
mock_video = Mock()
|
| 97 |
+
mock_video.duration = mock_video_duration
|
| 98 |
+
mock_video_clip.return_value = mock_video
|
| 99 |
+
|
| 100 |
+
start_time = mock_video_duration + 10.0
|
| 101 |
+
end_time = mock_video_duration + 20.0
|
| 102 |
+
|
| 103 |
+
with pytest.raises(Exception) as exc_info:
|
| 104 |
+
video_clipper(temp_video_file, start_time, end_time)
|
| 105 |
+
|
| 106 |
+
assert "exceeds video duration" in str(exc_info.value)
|
| 107 |
+
# close() may be called multiple times in error handling, so just check it was called
|
| 108 |
+
assert mock_video.close.call_count >= 1
|
| 109 |
+
|
| 110 |
+
def test_video_clipper_end_time_clamped_to_duration(
|
| 111 |
+
self, temp_video_file, mock_video_duration
|
| 112 |
+
):
|
| 113 |
+
"""Test video_clipper clamps end_time to video duration."""
|
| 114 |
+
with patch("app.tools.video_clipper.VideoFileClip") as mock_video_clip:
|
| 115 |
+
mock_video = Mock()
|
| 116 |
+
mock_video.duration = mock_video_duration
|
| 117 |
+
mock_clipped = Mock()
|
| 118 |
+
mock_video.subclipped.return_value = mock_clipped
|
| 119 |
+
mock_video_clip.return_value = mock_video
|
| 120 |
+
|
| 121 |
+
start_time = 5.0
|
| 122 |
+
end_time = mock_video_duration + 10.0 # Exceeds duration
|
| 123 |
+
output_path = tempfile.mktemp(suffix=".mp4")
|
| 124 |
+
|
| 125 |
+
result = video_clipper(temp_video_file, start_time, end_time, output_path)
|
| 126 |
+
|
| 127 |
+
# Should clamp end_time to duration
|
| 128 |
+
mock_video.subclipped.assert_called_once_with(
|
| 129 |
+
start_time, mock_video_duration
|
| 130 |
+
)
|
| 131 |
+
assert os.path.isabs(result)
|
| 132 |
+
|
| 133 |
+
def test_video_clipper_creates_output_directory(
|
| 134 |
+
self, temp_video_file, temp_output_dir, mock_video_duration
|
| 135 |
+
):
|
| 136 |
+
"""Test video_clipper creates output directory if it doesn't exist."""
|
| 137 |
+
with patch("app.tools.video_clipper.VideoFileClip") as mock_video_clip:
|
| 138 |
+
mock_video = Mock()
|
| 139 |
+
mock_video.duration = mock_video_duration
|
| 140 |
+
mock_clipped = Mock()
|
| 141 |
+
mock_video.subclipped.return_value = mock_clipped
|
| 142 |
+
mock_video_clip.return_value = mock_video
|
| 143 |
+
|
| 144 |
+
output_dir = os.path.join(temp_output_dir, "nested", "path")
|
| 145 |
+
output_path = os.path.join(output_dir, "output.mp4")
|
| 146 |
+
|
| 147 |
+
result = video_clipper(temp_video_file, 0.0, 10.0, output_path)
|
| 148 |
+
|
| 149 |
+
assert os.path.exists(output_dir)
|
| 150 |
+
assert os.path.isabs(result)
|
| 151 |
+
|
| 152 |
+
def test_video_clipper_cleanup_on_error(self, temp_video_file, mock_video_duration):
|
| 153 |
+
"""Test video_clipper properly cleans up resources on error."""
|
| 154 |
+
with patch("app.tools.video_clipper.VideoFileClip") as mock_video_clip:
|
| 155 |
+
mock_video = Mock()
|
| 156 |
+
mock_video.duration = mock_video_duration
|
| 157 |
+
mock_clipped = Mock()
|
| 158 |
+
mock_clipped.write_videofile.side_effect = Exception("Write error")
|
| 159 |
+
mock_video.subclipped.return_value = mock_clipped
|
| 160 |
+
mock_video_clip.return_value = mock_video
|
| 161 |
+
|
| 162 |
+
with pytest.raises(Exception) as exc_info:
|
| 163 |
+
video_clipper(temp_video_file, 0.0, 10.0)
|
| 164 |
+
|
| 165 |
+
assert "Error clipping video" in str(exc_info.value)
|
| 166 |
+
# Verify cleanup was attempted
|
| 167 |
+
mock_clipped.close.assert_called_once()
|
| 168 |
+
mock_video.close.assert_called_once()
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
class TestVideoClipperIntegration:
|
| 172 |
+
"""Integration tests for video_clipper using real video files."""
|
| 173 |
+
|
| 174 |
+
def test_video_clipper_real_video_basic_clip(
|
| 175 |
+
self, real_video_file, temp_output_dir
|
| 176 |
+
):
|
| 177 |
+
"""Test video_clipper with real video file - basic clipping."""
|
| 178 |
+
output_path = os.path.join(temp_output_dir, "clipped_output.mp4")
|
| 179 |
+
start_time = 1.0
|
| 180 |
+
end_time = 3.0
|
| 181 |
+
|
| 182 |
+
result = video_clipper(real_video_file, start_time, end_time, output_path)
|
| 183 |
+
|
| 184 |
+
# Assertions
|
| 185 |
+
assert os.path.exists(result), f"Clipped video file should exist at {result}"
|
| 186 |
+
assert os.path.isabs(result)
|
| 187 |
+
assert result == os.path.abspath(output_path)
|
| 188 |
+
assert os.path.getsize(result) > 0, "Clipped video should have content"
|
| 189 |
+
|
| 190 |
+
def test_video_clipper_real_video_short_clip(
|
| 191 |
+
self, real_video_file, temp_output_dir
|
| 192 |
+
):
|
| 193 |
+
"""Test video_clipper with real video file - very short clip."""
|
| 194 |
+
output_path = os.path.join(temp_output_dir, "short_clip.mp4")
|
| 195 |
+
start_time = 0.0
|
| 196 |
+
end_time = 0.5 # Half second clip
|
| 197 |
+
|
| 198 |
+
result = video_clipper(real_video_file, start_time, end_time, output_path)
|
| 199 |
+
|
| 200 |
+
assert os.path.exists(result)
|
| 201 |
+
assert os.path.getsize(result) > 0
|
| 202 |
+
|
| 203 |
+
def test_video_clipper_real_video_validation(self, real_video_file):
|
| 204 |
+
"""Test video_clipper validation with real video file."""
|
| 205 |
+
from moviepy import VideoFileClip
|
| 206 |
+
|
| 207 |
+
# Get actual video duration
|
| 208 |
+
with VideoFileClip(real_video_file) as video:
|
| 209 |
+
actual_duration = video.duration
|
| 210 |
+
|
| 211 |
+
# Test with invalid start time (exceeds duration)
|
| 212 |
+
with pytest.raises(Exception) as exc_info:
|
| 213 |
+
video_clipper(real_video_file, actual_duration + 1.0, actual_duration + 2.0)
|
| 214 |
+
|
| 215 |
+
assert "exceeds video duration" in str(exc_info.value)
|
| 216 |
+
|
| 217 |
+
def test_video_clipper_real_video_end_time_clamping(
|
| 218 |
+
self, real_video_file, temp_output_dir
|
| 219 |
+
):
|
| 220 |
+
"""Test video_clipper clamps end_time to video duration with real video."""
|
| 221 |
+
from moviepy import VideoFileClip
|
| 222 |
+
|
| 223 |
+
# Get actual video duration
|
| 224 |
+
with VideoFileClip(real_video_file) as video:
|
| 225 |
+
actual_duration = video.duration
|
| 226 |
+
|
| 227 |
+
output_path = os.path.join(temp_output_dir, "clamped_output.mp4")
|
| 228 |
+
start_time = max(0.0, actual_duration - 2.0) # Start 2 seconds before end
|
| 229 |
+
end_time = actual_duration + 10.0 # End time exceeds duration
|
| 230 |
+
|
| 231 |
+
# Should not raise error, should clamp end_time
|
| 232 |
+
result = video_clipper(real_video_file, start_time, end_time, output_path)
|
| 233 |
+
|
| 234 |
+
assert os.path.exists(result)
|
| 235 |
+
assert os.path.getsize(result) > 0
|
tests/test_video_composer.py
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for video_composer tool.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
import tempfile
|
| 8 |
+
import pytest
|
| 9 |
+
from unittest.mock import Mock, patch, MagicMock
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import sys
|
| 12 |
+
|
| 13 |
+
# Add src to path to import modules
|
| 14 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
| 15 |
+
|
| 16 |
+
from app.tools.video_composer import video_composer
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class TestVideoComposer:
|
| 20 |
+
"""Test cases for video_composer function."""
|
| 21 |
+
|
| 22 |
+
@pytest.fixture
|
| 23 |
+
def sample_script(self):
|
| 24 |
+
"""Sample script JSON for testing."""
|
| 25 |
+
return {
|
| 26 |
+
"total_duration": 30.0,
|
| 27 |
+
"scenes": [
|
| 28 |
+
{
|
| 29 |
+
"scene_id": 1,
|
| 30 |
+
"source_video": 0, # Reference first video by index
|
| 31 |
+
"start_time": 0.0,
|
| 32 |
+
"end_time": 5.0,
|
| 33 |
+
"duration": 5.0,
|
| 34 |
+
"transition_in": "fade",
|
| 35 |
+
"transition_out": "crossfade",
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"scene_id": 2,
|
| 39 |
+
"source_video": 1, # Reference second video by index
|
| 40 |
+
"start_time": 10.0,
|
| 41 |
+
"end_time": 15.0,
|
| 42 |
+
"duration": 5.0,
|
| 43 |
+
"transition_in": "crossfade",
|
| 44 |
+
"transition_out": "cut",
|
| 45 |
+
},
|
| 46 |
+
],
|
| 47 |
+
"music": {"mood": "energetic", "bpm": 120, "volume": 0.5},
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
@pytest.fixture
|
| 51 |
+
def sample_script_json(self, sample_script):
|
| 52 |
+
"""Sample script as JSON string."""
|
| 53 |
+
return json.dumps(sample_script)
|
| 54 |
+
|
| 55 |
+
def test_video_composer_missing_scenes_key(self, temp_output_dir):
|
| 56 |
+
"""Test video_composer with script missing 'scenes' key."""
|
| 57 |
+
invalid_script = {"total_duration": 30.0}
|
| 58 |
+
clip1_path = os.path.join(temp_output_dir, "clip1.mp4")
|
| 59 |
+
Path(clip1_path).touch()
|
| 60 |
+
|
| 61 |
+
with pytest.raises(Exception) as exc_info:
|
| 62 |
+
video_composer(invalid_script, video_clips=[clip1_path])
|
| 63 |
+
|
| 64 |
+
assert "Script must contain a 'scenes' key" in str(exc_info.value)
|
| 65 |
+
|
| 66 |
+
def test_video_composer_empty_scenes(self, temp_output_dir):
|
| 67 |
+
"""Test video_composer with empty scenes list."""
|
| 68 |
+
invalid_script = {"scenes": []}
|
| 69 |
+
clip1_path = os.path.join(temp_output_dir, "clip1.mp4")
|
| 70 |
+
Path(clip1_path).touch()
|
| 71 |
+
|
| 72 |
+
with pytest.raises(Exception) as exc_info:
|
| 73 |
+
video_composer(invalid_script, video_clips=[clip1_path])
|
| 74 |
+
|
| 75 |
+
assert "Script must contain at least one scene" in str(exc_info.value)
|
| 76 |
+
|
| 77 |
+
def test_video_composer_invalid_source_video_index(
|
| 78 |
+
self, sample_script, temp_output_dir
|
| 79 |
+
):
|
| 80 |
+
"""Test video_composer with invalid source_video index."""
|
| 81 |
+
clip1_path = os.path.join(temp_output_dir, "clip1.mp4")
|
| 82 |
+
Path(clip1_path).touch()
|
| 83 |
+
|
| 84 |
+
# Update script to use index 2 (out of range for single video)
|
| 85 |
+
sample_script["scenes"][0]["source_video"] = 2
|
| 86 |
+
|
| 87 |
+
with pytest.raises(Exception) as exc_info:
|
| 88 |
+
video_composer(sample_script, video_clips=[clip1_path])
|
| 89 |
+
|
| 90 |
+
assert "source_video index" in str(exc_info.value)
|
| 91 |
+
assert "out of range" in str(exc_info.value)
|
| 92 |
+
|
| 93 |
+
def test_video_composer_missing_source_video(self, temp_output_dir):
|
| 94 |
+
"""Test video_composer with scene missing source_video."""
|
| 95 |
+
script = {
|
| 96 |
+
"scenes": [
|
| 97 |
+
{
|
| 98 |
+
"scene_id": 1,
|
| 99 |
+
"start_time": 0.0,
|
| 100 |
+
"end_time": 5.0,
|
| 101 |
+
}
|
| 102 |
+
]
|
| 103 |
+
}
|
| 104 |
+
clip1_path = os.path.join(temp_output_dir, "clip1.mp4")
|
| 105 |
+
Path(clip1_path).touch()
|
| 106 |
+
|
| 107 |
+
with pytest.raises(Exception) as exc_info:
|
| 108 |
+
video_composer(script, video_clips=[clip1_path])
|
| 109 |
+
|
| 110 |
+
assert "missing 'source_video'" in str(exc_info.value)
|
| 111 |
+
|
| 112 |
+
def test_video_composer_source_video_not_found(self, temp_output_dir):
|
| 113 |
+
"""Test video_composer with source_video filename not found in video_clips."""
|
| 114 |
+
script = {
|
| 115 |
+
"scenes": [
|
| 116 |
+
{
|
| 117 |
+
"scene_id": 1,
|
| 118 |
+
"source_video": "nonexistent.mp4",
|
| 119 |
+
"start_time": 0.0,
|
| 120 |
+
"end_time": 5.0,
|
| 121 |
+
}
|
| 122 |
+
]
|
| 123 |
+
}
|
| 124 |
+
clip1_path = os.path.join(temp_output_dir, "clip1.mp4")
|
| 125 |
+
Path(clip1_path).touch()
|
| 126 |
+
|
| 127 |
+
with pytest.raises(Exception) as exc_info:
|
| 128 |
+
video_composer(script, video_clips=[clip1_path])
|
| 129 |
+
|
| 130 |
+
assert "source_video" in str(exc_info.value)
|
| 131 |
+
assert "not found in video_clips" in str(exc_info.value)
|
| 132 |
+
|
| 133 |
+
def test_video_composer_clip_not_found(self, sample_script, temp_output_dir):
|
| 134 |
+
"""Test video_composer with non-existent clip file in video_clips."""
|
| 135 |
+
nonexistent_clip = os.path.join(temp_output_dir, "nonexistent.mp4")
|
| 136 |
+
clip1_path = os.path.join(temp_output_dir, "clip1.mp4")
|
| 137 |
+
Path(clip1_path).touch()
|
| 138 |
+
|
| 139 |
+
with pytest.raises(Exception) as exc_info:
|
| 140 |
+
video_composer(sample_script, video_clips=[clip1_path, nonexistent_clip])
|
| 141 |
+
|
| 142 |
+
assert "Video clip not found" in str(exc_info.value)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
class TestVideoComposerIntegration:
|
| 146 |
+
"""Integration tests for video_composer using real video files."""
|
| 147 |
+
|
| 148 |
+
def test_video_composer_real_video_basic_composition(
|
| 149 |
+
self, real_video_file, temp_output_dir
|
| 150 |
+
):
|
| 151 |
+
"""Test video_composer with real video file - basic composition."""
|
| 152 |
+
script = {
|
| 153 |
+
"scenes": [
|
| 154 |
+
{
|
| 155 |
+
"scene_id": 1,
|
| 156 |
+
"source_video": 0, # Reference first video by index
|
| 157 |
+
"start_time": 0.0,
|
| 158 |
+
"end_time": 2.0,
|
| 159 |
+
"duration": 2.0,
|
| 160 |
+
"transition_in": "cut",
|
| 161 |
+
"transition_out": "fade",
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"scene_id": 2,
|
| 165 |
+
"source_video": 0, # Same video, different time range
|
| 166 |
+
"start_time": 2.0,
|
| 167 |
+
"end_time": 4.0,
|
| 168 |
+
"duration": 2.0,
|
| 169 |
+
"transition_in": "fade",
|
| 170 |
+
"transition_out": "cut",
|
| 171 |
+
},
|
| 172 |
+
]
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
output_path = os.path.join(temp_output_dir, "composed_output.mp4")
|
| 176 |
+
|
| 177 |
+
result = video_composer(
|
| 178 |
+
script, video_clips=[real_video_file], output_path=output_path
|
| 179 |
+
)
|
| 180 |
+
|
| 181 |
+
# Assertions
|
| 182 |
+
assert os.path.exists(result), f"Composed video file should exist at {result}"
|
| 183 |
+
assert os.path.isabs(result)
|
| 184 |
+
assert result == os.path.abspath(output_path)
|
| 185 |
+
assert os.path.getsize(result) > 0, "Composed video should have content"
|
| 186 |
+
|
| 187 |
+
def test_video_composer_real_video_with_preclipped(
|
| 188 |
+
self, real_video_file, temp_output_dir
|
| 189 |
+
):
|
| 190 |
+
"""Test video_composer with real video using pre-clipped clips."""
|
| 191 |
+
from app.tools.video_clipper import video_clipper
|
| 192 |
+
|
| 193 |
+
# Create pre-clipped videos
|
| 194 |
+
clip1_path = os.path.join(temp_output_dir, "clip1.mp4")
|
| 195 |
+
clip2_path = os.path.join(temp_output_dir, "clip2.mp4")
|
| 196 |
+
|
| 197 |
+
video_clipper(real_video_file, 0.0, 2.0, clip1_path)
|
| 198 |
+
video_clipper(real_video_file, 2.0, 4.0, clip2_path)
|
| 199 |
+
|
| 200 |
+
script = {
|
| 201 |
+
"scenes": [
|
| 202 |
+
{
|
| 203 |
+
"scene_id": 1,
|
| 204 |
+
"source_video": 0, # Reference first video
|
| 205 |
+
"start_time": 0.0,
|
| 206 |
+
"end_time": 2.0,
|
| 207 |
+
"transition_in": "cut",
|
| 208 |
+
"transition_out": "fade",
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"scene_id": 2,
|
| 212 |
+
"source_video": 1, # Reference second video
|
| 213 |
+
"start_time": 0.0,
|
| 214 |
+
"end_time": 2.0,
|
| 215 |
+
"transition_in": "fade",
|
| 216 |
+
"transition_out": "cut",
|
| 217 |
+
},
|
| 218 |
+
]
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
output_path = os.path.join(temp_output_dir, "composed_preclipped.mp4")
|
| 222 |
+
|
| 223 |
+
result = video_composer(
|
| 224 |
+
script,
|
| 225 |
+
video_clips=[clip1_path, clip2_path],
|
| 226 |
+
output_path=output_path,
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
assert os.path.exists(result)
|
| 230 |
+
assert os.path.getsize(result) > 0
|
| 231 |
+
|
| 232 |
+
def test_video_composer_real_video_crossfade(
|
| 233 |
+
self, real_video_file, temp_output_dir
|
| 234 |
+
):
|
| 235 |
+
"""Test video_composer with real video using crossfade transitions."""
|
| 236 |
+
script = {
|
| 237 |
+
"scenes": [
|
| 238 |
+
{
|
| 239 |
+
"scene_id": 1,
|
| 240 |
+
"source_video": 0, # Reference first video by index
|
| 241 |
+
"start_time": 0.0,
|
| 242 |
+
"end_time": 1.5,
|
| 243 |
+
"transition_in": "cut",
|
| 244 |
+
"transition_out": "crossfade",
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"scene_id": 2,
|
| 248 |
+
"source_video": 0, # Same video, different time range
|
| 249 |
+
"start_time": 1.5,
|
| 250 |
+
"end_time": 3.0,
|
| 251 |
+
"transition_in": "crossfade",
|
| 252 |
+
"transition_out": "cut",
|
| 253 |
+
},
|
| 254 |
+
]
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
output_path = os.path.join(temp_output_dir, "composed_crossfade.mp4")
|
| 258 |
+
|
| 259 |
+
result = video_composer(
|
| 260 |
+
script, video_clips=[real_video_file], output_path=output_path
|
| 261 |
+
)
|
| 262 |
+
|
| 263 |
+
assert os.path.exists(result)
|
| 264 |
+
assert os.path.getsize(result) > 0
|
| 265 |
+
|
| 266 |
+
def test_video_composer_multiple_source_videos(
|
| 267 |
+
self, real_video_file, real_video_file_2, temp_output_dir
|
| 268 |
+
):
|
| 269 |
+
"""Test video_composer composing from multiple different source videos."""
|
| 270 |
+
script = {
|
| 271 |
+
"scenes": [
|
| 272 |
+
{
|
| 273 |
+
"scene_id": 1,
|
| 274 |
+
"source_video": 0, # Reference first video by index
|
| 275 |
+
"start_time": 0.0,
|
| 276 |
+
"end_time": 2.0,
|
| 277 |
+
"duration": 2.0,
|
| 278 |
+
"transition_in": "fade",
|
| 279 |
+
"transition_out": "crossfade",
|
| 280 |
+
},
|
| 281 |
+
{
|
| 282 |
+
"scene_id": 2,
|
| 283 |
+
"source_video": 1, # Reference second video by index
|
| 284 |
+
"start_time": 0.0,
|
| 285 |
+
"end_time": 2.0,
|
| 286 |
+
"duration": 2.0,
|
| 287 |
+
"transition_in": "crossfade",
|
| 288 |
+
"transition_out": "fade",
|
| 289 |
+
},
|
| 290 |
+
]
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
output_path = os.path.join(temp_output_dir, "composed_multiple_sources.mp4")
|
| 294 |
+
|
| 295 |
+
result = video_composer(
|
| 296 |
+
script,
|
| 297 |
+
video_clips=[real_video_file, real_video_file_2],
|
| 298 |
+
output_path=output_path,
|
| 299 |
+
)
|
| 300 |
+
|
| 301 |
+
assert os.path.exists(result)
|
| 302 |
+
assert os.path.isabs(result)
|
| 303 |
+
assert result == os.path.abspath(output_path)
|
| 304 |
+
assert os.path.getsize(result) > 0
|
| 305 |
+
|
| 306 |
+
def test_video_composer_real_video_with_second_file_preclipped(
|
| 307 |
+
self, real_video_file, real_video_file_2, temp_output_dir
|
| 308 |
+
):
|
| 309 |
+
"""Test video_composer with clips from both video files."""
|
| 310 |
+
from app.tools.video_clipper import video_clipper
|
| 311 |
+
|
| 312 |
+
# Create pre-clipped videos from both sources
|
| 313 |
+
clip1_path = os.path.join(temp_output_dir, "clip1_from_dodo1.mp4")
|
| 314 |
+
clip2_path = os.path.join(temp_output_dir, "clip2_from_dodo2.mp4")
|
| 315 |
+
|
| 316 |
+
video_clipper(real_video_file, 0.0, 2.0, clip1_path)
|
| 317 |
+
video_clipper(real_video_file_2, 0.0, 2.0, clip2_path)
|
| 318 |
+
|
| 319 |
+
script = {
|
| 320 |
+
"scenes": [
|
| 321 |
+
{
|
| 322 |
+
"scene_id": 1,
|
| 323 |
+
"source_video": 0, # Reference first video
|
| 324 |
+
"start_time": 0.0,
|
| 325 |
+
"end_time": 2.0,
|
| 326 |
+
"transition_in": "cut",
|
| 327 |
+
"transition_out": "crossfade",
|
| 328 |
+
},
|
| 329 |
+
{
|
| 330 |
+
"scene_id": 2,
|
| 331 |
+
"source_video": 1, # Reference second video
|
| 332 |
+
"start_time": 0.0,
|
| 333 |
+
"end_time": 2.0,
|
| 334 |
+
"transition_in": "crossfade",
|
| 335 |
+
"transition_out": "cut",
|
| 336 |
+
},
|
| 337 |
+
]
|
| 338 |
+
}
|
| 339 |
+
|
| 340 |
+
output_path = os.path.join(temp_output_dir, "composed_two_sources.mp4")
|
| 341 |
+
|
| 342 |
+
result = video_composer(
|
| 343 |
+
script,
|
| 344 |
+
video_clips=[clip1_path, clip2_path],
|
| 345 |
+
output_path=output_path,
|
| 346 |
+
)
|
| 347 |
+
|
| 348 |
+
assert os.path.exists(result)
|
| 349 |
+
assert os.path.getsize(result) > 0
|
| 350 |
+
|
| 351 |
+
def test_video_composer_real_video_three_scenes_from_two_files(
|
| 352 |
+
self, real_video_file, real_video_file_2, temp_output_dir
|
| 353 |
+
):
|
| 354 |
+
"""Test video_composer with three scenes from two different source videos."""
|
| 355 |
+
script = {
|
| 356 |
+
"scenes": [
|
| 357 |
+
{
|
| 358 |
+
"scene_id": 1,
|
| 359 |
+
"source_video": 0, # Reference first video by index
|
| 360 |
+
"start_time": 0.0,
|
| 361 |
+
"end_time": 1.5,
|
| 362 |
+
"transition_in": "fade",
|
| 363 |
+
"transition_out": "crossfade",
|
| 364 |
+
},
|
| 365 |
+
{
|
| 366 |
+
"scene_id": 2,
|
| 367 |
+
"source_video": 1, # Reference second video by index
|
| 368 |
+
"start_time": 0.0,
|
| 369 |
+
"end_time": 1.5,
|
| 370 |
+
"transition_in": "crossfade",
|
| 371 |
+
"transition_out": "crossfade",
|
| 372 |
+
},
|
| 373 |
+
{
|
| 374 |
+
"scene_id": 3,
|
| 375 |
+
"source_video": 0, # Reference first video again, different time range
|
| 376 |
+
"start_time": 1.5,
|
| 377 |
+
"end_time": 3.0,
|
| 378 |
+
"transition_in": "crossfade",
|
| 379 |
+
"transition_out": "fade",
|
| 380 |
+
},
|
| 381 |
+
]
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
output_path = os.path.join(temp_output_dir, "composed_three_scenes.mp4")
|
| 385 |
+
|
| 386 |
+
result = video_composer(
|
| 387 |
+
script,
|
| 388 |
+
video_clips=[real_video_file, real_video_file_2],
|
| 389 |
+
output_path=output_path,
|
| 390 |
+
)
|
| 391 |
+
|
| 392 |
+
assert os.path.exists(result)
|
| 393 |
+
assert os.path.getsize(result) > 0
|
tests/test_video_summarizer.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unit tests for video_summarizer tool.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
import pytest
|
| 8 |
+
from unittest.mock import Mock, patch, MagicMock
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
# Add src to path to import modules
|
| 12 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
| 13 |
+
|
| 14 |
+
from app.tools.video_summarizer import video_summarizer
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class TestVideoSummarizer:
|
| 18 |
+
"""Test cases for video_summarizer function."""
|
| 19 |
+
|
| 20 |
+
def test_video_summarizer_with_tuple_input(self, temp_video_file):
|
| 21 |
+
"""Test video_summarizer with tuple input (Gradio format)."""
|
| 22 |
+
with (
|
| 23 |
+
patch("app.tools.video_summarizer.cv2.VideoCapture") as mock_capture,
|
| 24 |
+
patch("app.tools.video_summarizer.genai.Client") as mock_client,
|
| 25 |
+
):
|
| 26 |
+
|
| 27 |
+
mock_cap = Mock()
|
| 28 |
+
mock_cap.isOpened.return_value = True
|
| 29 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 30 |
+
5: 30.0,
|
| 31 |
+
7: 900,
|
| 32 |
+
3: 1920,
|
| 33 |
+
4: 1080,
|
| 34 |
+
}.get(prop, 0)
|
| 35 |
+
mock_capture.return_value = mock_cap
|
| 36 |
+
|
| 37 |
+
mock_genai_client = Mock()
|
| 38 |
+
mock_response = Mock()
|
| 39 |
+
mock_response.text = "Test summary"
|
| 40 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 41 |
+
mock_client.return_value = mock_genai_client
|
| 42 |
+
|
| 43 |
+
video_input = (temp_video_file, "subtitle.srt")
|
| 44 |
+
|
| 45 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 46 |
+
result = video_summarizer(video_input, fps=2.0)
|
| 47 |
+
|
| 48 |
+
result_json = json.loads(result)
|
| 49 |
+
assert "summary" in result_json
|
| 50 |
+
|
| 51 |
+
def test_video_summarizer_invalid_input_format(self):
|
| 52 |
+
"""Test video_summarizer with invalid input format."""
|
| 53 |
+
result = video_summarizer(123, fps=2.0)
|
| 54 |
+
result_json = json.loads(result)
|
| 55 |
+
assert "error" in result_json
|
| 56 |
+
assert "Invalid video input format" in result_json["error"]
|
| 57 |
+
|
| 58 |
+
def test_video_summarizer_file_not_found(self):
|
| 59 |
+
"""Test video_summarizer with non-existent file."""
|
| 60 |
+
result = video_summarizer("/nonexistent/video.mp4", fps=2.0)
|
| 61 |
+
result_json = json.loads(result)
|
| 62 |
+
assert "error" in result_json
|
| 63 |
+
assert "Video file not found" in result_json["error"]
|
| 64 |
+
|
| 65 |
+
def test_video_summarizer_cannot_open_video(self, temp_video_file):
|
| 66 |
+
"""Test video_summarizer when video cannot be opened."""
|
| 67 |
+
with patch("app.tools.video_summarizer.cv2.VideoCapture") as mock_capture:
|
| 68 |
+
mock_cap = Mock()
|
| 69 |
+
mock_cap.isOpened.return_value = False
|
| 70 |
+
mock_capture.return_value = mock_cap
|
| 71 |
+
|
| 72 |
+
result = video_summarizer(temp_video_file, fps=2.0)
|
| 73 |
+
result_json = json.loads(result)
|
| 74 |
+
|
| 75 |
+
assert "error" in result_json
|
| 76 |
+
assert "Could not open video file" in result_json["error"]
|
| 77 |
+
|
| 78 |
+
def test_video_summarizer_no_api_key(self, temp_video_file):
|
| 79 |
+
"""Test video_summarizer without API key (fallback mode)."""
|
| 80 |
+
with patch("app.tools.video_summarizer.cv2.VideoCapture") as mock_capture:
|
| 81 |
+
mock_cap = Mock()
|
| 82 |
+
mock_cap.isOpened.return_value = True
|
| 83 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 84 |
+
5: 30.0,
|
| 85 |
+
7: 900,
|
| 86 |
+
3: 1920,
|
| 87 |
+
4: 1080,
|
| 88 |
+
}.get(prop, 0)
|
| 89 |
+
mock_capture.return_value = mock_cap
|
| 90 |
+
|
| 91 |
+
with patch.dict(os.environ, {}, clear=True):
|
| 92 |
+
result = video_summarizer(temp_video_file, fps=2.0)
|
| 93 |
+
|
| 94 |
+
result_json = json.loads(result)
|
| 95 |
+
|
| 96 |
+
assert "duration" in result_json
|
| 97 |
+
assert "summary" in result_json
|
| 98 |
+
assert "Video analysis requires GOOGLE_API_KEY" in result_json["summary"]
|
| 99 |
+
assert result_json["mood_tags"] == []
|
| 100 |
+
|
| 101 |
+
def test_video_summarizer_extracts_mood_tags(self, temp_video_file):
|
| 102 |
+
"""Test video_summarizer extracts mood tags from summary."""
|
| 103 |
+
with (
|
| 104 |
+
patch("app.tools.video_summarizer.cv2.VideoCapture") as mock_capture,
|
| 105 |
+
patch("app.tools.video_summarizer.genai.Client") as mock_client,
|
| 106 |
+
):
|
| 107 |
+
|
| 108 |
+
mock_cap = Mock()
|
| 109 |
+
mock_cap.isOpened.return_value = True
|
| 110 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 111 |
+
5: 30.0,
|
| 112 |
+
7: 900,
|
| 113 |
+
3: 1920,
|
| 114 |
+
4: 1080,
|
| 115 |
+
}.get(prop, 0)
|
| 116 |
+
mock_capture.return_value = mock_cap
|
| 117 |
+
|
| 118 |
+
mock_genai_client = Mock()
|
| 119 |
+
mock_response = Mock()
|
| 120 |
+
# Include mood keywords in summary
|
| 121 |
+
mock_response.text = "This is an energetic and fast-paced video with bright colors and fun activities."
|
| 122 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 123 |
+
mock_client.return_value = mock_genai_client
|
| 124 |
+
|
| 125 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 126 |
+
result = video_summarizer(temp_video_file, fps=2.0)
|
| 127 |
+
|
| 128 |
+
result_json = json.loads(result)
|
| 129 |
+
|
| 130 |
+
assert "mood_tags" in result_json
|
| 131 |
+
assert isinstance(result_json["mood_tags"], list)
|
| 132 |
+
# Should detect some mood tags
|
| 133 |
+
assert len(result_json["mood_tags"]) > 0
|
| 134 |
+
|
| 135 |
+
def test_video_summarizer_default_mood_tags(self, temp_video_file):
|
| 136 |
+
"""Test video_summarizer uses default mood tag when none detected."""
|
| 137 |
+
with (
|
| 138 |
+
patch("app.tools.video_summarizer.cv2.VideoCapture") as mock_capture,
|
| 139 |
+
patch("app.tools.video_summarizer.genai.Client") as mock_client,
|
| 140 |
+
):
|
| 141 |
+
|
| 142 |
+
mock_cap = Mock()
|
| 143 |
+
mock_cap.isOpened.return_value = True
|
| 144 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 145 |
+
5: 30.0,
|
| 146 |
+
7: 900,
|
| 147 |
+
3: 1920,
|
| 148 |
+
4: 1080,
|
| 149 |
+
}.get(prop, 0)
|
| 150 |
+
mock_capture.return_value = mock_cap
|
| 151 |
+
|
| 152 |
+
mock_genai_client = Mock()
|
| 153 |
+
mock_response = Mock()
|
| 154 |
+
# Summary without mood keywords
|
| 155 |
+
mock_response.text = (
|
| 156 |
+
"This is a regular video without specific mood indicators."
|
| 157 |
+
)
|
| 158 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 159 |
+
mock_client.return_value = mock_genai_client
|
| 160 |
+
|
| 161 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 162 |
+
result = video_summarizer(temp_video_file, fps=2.0)
|
| 163 |
+
|
| 164 |
+
result_json = json.loads(result)
|
| 165 |
+
|
| 166 |
+
assert "mood_tags" in result_json
|
| 167 |
+
assert result_json["mood_tags"] == ["general"]
|
| 168 |
+
|
| 169 |
+
def test_video_summarizer_custom_fps(self, temp_video_file):
|
| 170 |
+
"""Test video_summarizer with custom fps parameter."""
|
| 171 |
+
with (
|
| 172 |
+
patch("app.tools.video_summarizer.cv2.VideoCapture") as mock_capture,
|
| 173 |
+
patch("app.tools.video_summarizer.genai.Client") as mock_client,
|
| 174 |
+
):
|
| 175 |
+
|
| 176 |
+
mock_cap = Mock()
|
| 177 |
+
mock_cap.isOpened.return_value = True
|
| 178 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 179 |
+
5: 30.0,
|
| 180 |
+
7: 900,
|
| 181 |
+
3: 1920,
|
| 182 |
+
4: 1080,
|
| 183 |
+
}.get(prop, 0)
|
| 184 |
+
mock_capture.return_value = mock_cap
|
| 185 |
+
|
| 186 |
+
mock_genai_client = Mock()
|
| 187 |
+
mock_response = Mock()
|
| 188 |
+
mock_response.text = "Test summary"
|
| 189 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 190 |
+
mock_client.return_value = mock_genai_client
|
| 191 |
+
|
| 192 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 193 |
+
result = video_summarizer(temp_video_file, fps=5.0)
|
| 194 |
+
|
| 195 |
+
result_json = json.loads(result)
|
| 196 |
+
assert "summary" in result_json
|
| 197 |
+
|
| 198 |
+
# Verify fps was passed to VideoMetadata
|
| 199 |
+
call_args = mock_genai_client.models.generate_content.call_args
|
| 200 |
+
assert call_args is not None
|
| 201 |
+
|
| 202 |
+
def test_video_summarizer_error_handling(self, temp_video_file):
|
| 203 |
+
"""Test video_summarizer handles exceptions gracefully."""
|
| 204 |
+
with patch("app.tools.video_summarizer.cv2.VideoCapture") as mock_capture:
|
| 205 |
+
mock_cap = Mock()
|
| 206 |
+
mock_cap.isOpened.side_effect = Exception("Unexpected error")
|
| 207 |
+
mock_capture.return_value = mock_cap
|
| 208 |
+
|
| 209 |
+
result = video_summarizer(temp_video_file, fps=2.0)
|
| 210 |
+
result_json = json.loads(result)
|
| 211 |
+
|
| 212 |
+
assert "error" in result_json
|
| 213 |
+
assert "Error processing video" in result_json["error"]
|
| 214 |
+
|
| 215 |
+
def test_video_summarizer_metadata_extraction(self, temp_video_file):
|
| 216 |
+
"""Test video_summarizer extracts correct metadata."""
|
| 217 |
+
with (
|
| 218 |
+
patch("app.tools.video_summarizer.cv2.VideoCapture") as mock_capture,
|
| 219 |
+
patch("app.tools.video_summarizer.genai.Client") as mock_client,
|
| 220 |
+
):
|
| 221 |
+
|
| 222 |
+
mock_cap = Mock()
|
| 223 |
+
mock_cap.isOpened.return_value = True
|
| 224 |
+
mock_cap.get.side_effect = lambda prop: {
|
| 225 |
+
5: 24.0, # FPS
|
| 226 |
+
7: 720, # Frame count
|
| 227 |
+
3: 1280, # Width
|
| 228 |
+
4: 720, # Height
|
| 229 |
+
}.get(prop, 0)
|
| 230 |
+
mock_capture.return_value = mock_cap
|
| 231 |
+
|
| 232 |
+
mock_genai_client = Mock()
|
| 233 |
+
mock_response = Mock()
|
| 234 |
+
mock_response.text = "Test summary"
|
| 235 |
+
mock_genai_client.models.generate_content.return_value = mock_response
|
| 236 |
+
mock_client.return_value = mock_genai_client
|
| 237 |
+
|
| 238 |
+
with patch.dict(os.environ, {"GOOGLE_API_KEY": "test_key"}):
|
| 239 |
+
result = video_summarizer(temp_video_file, fps=2.0)
|
| 240 |
+
|
| 241 |
+
result_json = json.loads(result)
|
| 242 |
+
|
| 243 |
+
assert result_json["fps"] == 24.0
|
| 244 |
+
assert result_json["frame_count"] == 720
|
| 245 |
+
assert result_json["resolution"] == "1280x720"
|
| 246 |
+
assert result_json["duration"] == pytest.approx(
|
| 247 |
+
30.0, rel=0.1
|
| 248 |
+
) # 720/24 = 30
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
class TestVideoSummarizerIntegration:
|
| 252 |
+
"""Integration tests for video_summarizer using real video files."""
|
| 253 |
+
|
| 254 |
+
def test_video_summarizer_real_video_basic(self, real_video_file):
|
| 255 |
+
"""Test video_summarizer with real video file - basic functionality."""
|
| 256 |
+
result = video_summarizer(real_video_file, fps=2.0)
|
| 257 |
+
result_json = json.loads(result)
|
| 258 |
+
|
| 259 |
+
# Should return valid JSON with metadata
|
| 260 |
+
assert "duration" in result_json
|
| 261 |
+
assert "resolution" in result_json
|
| 262 |
+
assert "fps" in result_json
|
| 263 |
+
assert "frame_count" in result_json
|
| 264 |
+
assert isinstance(result_json["duration"], (int, float))
|
| 265 |
+
assert result_json["duration"] > 0
|
| 266 |
+
|
| 267 |
+
def test_video_summarizer_real_video_no_api_key(self, real_video_file):
|
| 268 |
+
"""Test video_summarizer with real video file without API key (fallback mode)."""
|
| 269 |
+
with patch.dict(os.environ, {}, clear=True):
|
| 270 |
+
result = video_summarizer(real_video_file, fps=2.0)
|
| 271 |
+
|
| 272 |
+
result_json = json.loads(result)
|
| 273 |
+
|
| 274 |
+
# Should still return metadata even without API key
|
| 275 |
+
assert "duration" in result_json
|
| 276 |
+
assert "resolution" in result_json
|
| 277 |
+
assert "summary" in result_json
|
| 278 |
+
assert "Video analysis requires GOOGLE_API_KEY" in result_json["summary"]
|
| 279 |
+
assert result_json["mood_tags"] == []
|
| 280 |
+
|
| 281 |
+
@pytest.mark.skipif(
|
| 282 |
+
not os.getenv("GOOGLE_API_KEY"),
|
| 283 |
+
reason="GOOGLE_API_KEY not set, skipping API test",
|
| 284 |
+
)
|
| 285 |
+
def test_video_summarizer_real_video_with_api(self, real_video_file):
|
| 286 |
+
"""Test video_summarizer with real video file and API key (if available)."""
|
| 287 |
+
result = video_summarizer(real_video_file, fps=2.0)
|
| 288 |
+
result_json = json.loads(result)
|
| 289 |
+
|
| 290 |
+
# If API key is available, should get actual summary
|
| 291 |
+
if "error" not in result_json:
|
| 292 |
+
assert "summary" in result_json
|
| 293 |
+
assert len(result_json["summary"]) > 0
|
| 294 |
+
assert "mood_tags" in result_json
|
| 295 |
+
assert isinstance(result_json["mood_tags"], list)
|