# vidzly/src/app/tools/video_script_generator.py
# Last change: tthhanh — "chore: black reformating" (commit 5a459dd)
import os
import json
import re
from typing import Optional, List, Union
import google.genai as genai
def _extract_and_parse_json(text: str) -> Optional[Union[dict, list]]:
"""
Extract and parse JSON from text that might contain extra content.
Handles cases where JSON is wrapped in markdown, has extra text, multiple objects,
or wrapped in tool response format like {"tool_name_response": {...}}.
"""
if not text or not isinstance(text, str):
return None
text = text.strip()
# Try direct parsing first
try:
parsed = json.loads(text)
# Check if it's wrapped in a tool response format
# e.g., {"video_summarizer_tool_response": {...}} or [{"video_summarizer_tool_response": {...}}]
if isinstance(parsed, dict):
# Check if it's a single tool response wrapper
if len(parsed) == 1:
key = list(parsed.keys())[0]
if "_tool_response" in key.lower() or "_response" in key.lower():
# Extract the actual data from the wrapper
return parsed[key]
elif isinstance(parsed, list) and len(parsed) > 0:
# Check if list contains wrapped responses
unwrapped = []
for item in parsed:
if isinstance(item, dict) and len(item) == 1:
key = list(item.keys())[0]
if "_tool_response" in key.lower() or "_response" in key.lower():
unwrapped.append(item[key])
else:
unwrapped.append(item)
else:
unwrapped.append(item)
return unwrapped if unwrapped else parsed
return parsed
except json.JSONDecodeError:
pass
# Try to extract JSON array by finding balanced brackets
bracket_count = 0
array_start = -1
for i, char in enumerate(text):
if char == "[":
if bracket_count == 0:
array_start = i
bracket_count += 1
elif char == "]":
bracket_count -= 1
if bracket_count == 0 and array_start >= 0:
array_str = text[array_start : i + 1]
try:
return json.loads(array_str)
except json.JSONDecodeError:
pass
array_start = -1
# Try to find multiple JSON objects and combine them into an array
# This handles cases where objects are concatenated: {}{}
objects = []
brace_count = 0
start_idx = -1
for i, char in enumerate(text):
if char == "{":
if brace_count == 0:
start_idx = i
brace_count += 1
elif char == "}":
brace_count -= 1
if brace_count == 0 and start_idx >= 0:
obj_str = text[start_idx : i + 1]
try:
obj = json.loads(obj_str)
objects.append(obj)
except json.JSONDecodeError:
pass
start_idx = -1
if objects:
# If we found multiple objects, return as list
# If only one, return it directly (will be wrapped in list by caller)
return objects if len(objects) > 1 else objects[0]
# Try to extract a single JSON object by finding balanced braces
brace_count = 0
obj_start = -1
for i, char in enumerate(text):
if char == "{":
if brace_count == 0:
obj_start = i
brace_count += 1
elif char == "}":
brace_count -= 1
if brace_count == 0 and obj_start >= 0:
obj_str = text[obj_start : i + 1]
try:
return json.loads(obj_str)
except json.JSONDecodeError:
pass
obj_start = -1
return None
def _unwrap_tool_response(summary: dict) -> dict:
    """Return the payload of a single-key {"..._response": {...}} wrapper, else *summary* unchanged."""
    if len(summary) == 1:
        key = next(iter(summary))
        # "_tool_response" contains "_response", so one substring check suffices.
        if "_response" in key.lower():
            return summary[key]
    return summary


def _coerce_summaries(
    video_summaries: Union[str, List[dict], List[str]]
) -> List[dict]:
    """Normalize the accepted input shapes into a flat list of summary dicts.

    Accepts a JSON string (object or array), a list of dicts, or a list of
    JSON strings. Tool-response wrappers are unwrapped along the way.

    Raises:
        ValueError: on unparseable JSON or an unsupported element/input type.
    """
    summaries_list: List[dict] = []
    if isinstance(video_summaries, str):
        parsed = _extract_and_parse_json(video_summaries)
        if parsed is None:
            raise ValueError(
                f"Invalid JSON format for video_summaries. "
                f"Could not parse: {video_summaries[:200]}..."
            )
        summaries_list = parsed if isinstance(parsed, list) else [parsed]
    elif isinstance(video_summaries, list):
        for summary in video_summaries:
            if isinstance(summary, str):
                parsed = _extract_and_parse_json(summary)
                if parsed is None:
                    raise ValueError(
                        f"Invalid JSON format in video_summaries: {summary[:200]}..."
                    )
                # A string may itself decode to several summaries.
                if isinstance(parsed, list):
                    summaries_list.extend(parsed)
                else:
                    summaries_list.append(parsed)
            elif isinstance(summary, dict):
                summaries_list.append(_unwrap_tool_response(summary))
            else:
                raise ValueError(
                    f"Invalid summary type: {type(summary).__name__}. "
                    "Expected dict or JSON string."
                )
    else:
        raise ValueError(
            f"Invalid video_summaries type: {type(video_summaries).__name__}. "
            "Expected str, list of dicts, or list of JSON strings."
        )
    return summaries_list


def _build_fallback_script(summary, target_duration: float) -> str:
    """Build a minimal single-scene script (JSON string) from one summary.

    Used both when no API key is configured and when the model's response
    cannot be parsed. Tolerates a non-dict summary and a non-string
    "summary" field (the original no-API-key path crashed on the latter).
    """
    if not isinstance(summary, dict):
        summary = {}
    duration = summary.get("duration", target_duration)
    clip_duration = min(duration, target_duration)
    # Extract mood from summary; default to "energetic".
    mood_tags = summary.get("mood_tags", ["energetic"])
    mood = mood_tags[0] if mood_tags else "energetic"
    raw_summary = summary.get("summary")
    description = raw_summary[:100] if isinstance(raw_summary, str) else "Video clip"
    fallback_script = {
        "total_duration": clip_duration,
        "scenes": [
            {
                "scene_id": 1,
                "source_video": 0,
                "start_time": 0.0,
                "end_time": clip_duration,
                "duration": clip_duration,
                "description": description,
                "transition_in": "fade",
                "transition_out": "fade",
            }
        ],
        "music": {
            "mood": mood,
            "volume": 0.5,
        },
        "pacing": "moderate",
        "narrative_structure": "single scene",
    }
    return json.dumps(fallback_script, indent=2)


def _clamp_scene_timestamps(
    scene: dict, video_durations: dict, num_videos: int
) -> None:
    """Clamp a scene's source index and timestamps to real video bounds, in place.

    The LLM may hallucinate out-of-range indices or timestamps; this enforces
    0 <= source_video < num_videos and 0 <= start < end <= video duration.
    """
    source_video_idx = scene.get("source_video")
    if isinstance(source_video_idx, int):
        # Clamp index into [0, num_videos - 1].
        scene["source_video"] = min(max(source_video_idx, 0), max(0, num_videos - 1))
    elif source_video_idx is None:
        # Missing index: default to the first video.
        scene["source_video"] = 0

    validated_idx = scene.get("source_video")
    if not (isinstance(validated_idx, int) and validated_idx in video_durations):
        # Non-integer index we could not repair — leave timestamps untouched.
        return
    video_duration = video_durations[validated_idx]
    start_time = scene.get("start_time", 0.0)
    end_time = scene.get("end_time")
    scene_duration = scene.get("duration")
    if start_time >= video_duration:
        # start_time beyond the clip: use the last <=2 s of the video.
        # (The original had a three-way if/elif/else here whose branches all
        # assigned the same values — collapsed to one assignment.)
        clip_duration = min(2.0, video_duration)
        scene["start_time"] = max(0.0, video_duration - clip_duration)
        scene["end_time"] = video_duration
        scene["duration"] = video_duration - scene["start_time"]
    else:
        # Clamp start_time into [0, duration - 0.1].
        scene["start_time"] = max(0.0, min(start_time, video_duration - 0.1))
        if end_time is None:
            # Derive end from the declared duration, else run to the clip end.
            if scene_duration:
                calculated_end_time = scene["start_time"] + scene_duration
            else:
                calculated_end_time = video_duration
        else:
            calculated_end_time = end_time
        # Guarantee at least 0.1 s of footage, capped at the clip end.
        scene["end_time"] = max(
            scene["start_time"] + 0.1,
            min(calculated_end_time, video_duration),
        )
        scene["duration"] = scene["end_time"] - scene["start_time"]


def video_script_generator(
    video_summaries: Union[str, List[dict], List[str]],
    user_description: Optional[str] = None,
    target_duration: float = 30.0,
) -> str:
    """
    Create a detailed script/storyboard for the final 30-second video.

    Uses Google Gemini API to intelligently generate a script based on video
    summaries and user requirements. When no GOOGLE_API_KEY is configured, or
    when the model response cannot be parsed, a simple single-scene fallback
    script built from the first summary is returned instead.

    Args:
        video_summaries: Video summaries from video_summarizer tool.
            Can be:
            - JSON string (single summary)
            - List of dict objects (multiple summaries)
            - List of JSON strings (multiple summaries)
        user_description: Optional user description of desired mood/style/content
        target_duration: Target duration in seconds (default: 30.0)

    Returns:
        str: JSON string containing a detailed script with:
            - Scene sequence with source video and timestamps
            - Duration for each scene segment (sums to ~target_duration seconds)
            - Transition types between scenes (cut, fade, crossfade)
            - Music synchronization info (mood, bpm, sync points)
            - Pacing, narrative structure, and visual style recommendations

    Raises:
        Exception: wrapping any underlying error (invalid input types,
            unparseable summaries, API failures), with the cause chained.

    Example output format:
        {
            "total_duration": 30.0,
            "scenes": [
                {
                    "scene_id": 1,
                    "source_video": 0,
                    "start_time": 5.2,
                    "end_time": 8.5,
                    "duration": 3.3,
                    "description": "Opening shot of landscape",
                    "transition_in": "fade",
                    "transition_out": "crossfade"
                },
                ...
            ],
            "music": {
                "mood": "energetic",
                "bpm": 120,
                "sync_points": [0.0, 7.5, 15.0, 22.5, 30.0],
                "volume": 0.5
            },
            "pacing": "fast",
            "narrative_structure": "hook -> build -> climax -> resolution",
            "visual_style": "bright, colorful, dynamic"
        }
    """
    try:
        summaries_list = _coerce_summaries(video_summaries)
        if not summaries_list:
            raise ValueError("No video summaries provided")
        if target_duration <= 0:
            raise ValueError("target_duration must be greater than 0")

        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            # No API access: degrade gracefully to a single-scene script.
            return _build_fallback_script(summaries_list[0], target_duration)

        # Initialize Gemini client and build the generation prompt.
        client = genai.Client(api_key=api_key)
        summaries_text = "\n\n".join(
            f"Video {i+1}:\n{json.dumps(s, indent=2)}"
            for i, s in enumerate(summaries_list)
        )
        user_desc_text = (
            f"\n\nUser Description: {user_description}" if user_description else ""
        )
        prompt = f"""You are a professional video editor creating a {target_duration}-second short-form video.
Here are the video summaries:
{summaries_text}
{user_desc_text}
Create a detailed video composition script that:
1. Selects the most engaging and relevant scenes from the videos
2. Creates a coherent narrative flow with a clear structure (hook -> build -> climax -> resolution)
3. Uses appropriate transitions (cut, fade, or crossfade) between scenes
4. Ensures the total duration is approximately {target_duration} seconds (within ±2 seconds)
5. Distributes scenes evenly across the duration, considering pacing
6. Identifies music mood, BPM, and sync points for rhythm matching
7. Provides visual style recommendations based on the content
Return ONLY a valid JSON object with this exact structure:
{{
    "total_duration": {target_duration},
    "scenes": [
        {{
            "scene_id": 1,
            "source_video": 0,
            "start_time": 0.0,
            "end_time": 5.0,
            "duration": 5.0,
            "description": "Brief description of what happens in this scene",
            "transition_in": "fade",
            "transition_out": "crossfade"
        }},
        {{
            "scene_id": 2,
            "source_video": 1,
            "start_time": 10.0,
            "end_time": 15.0,
            "duration": 5.0,
            "description": "Brief description of what happens in this scene",
            "transition_in": "crossfade",
            "transition_out": "fade"
        }}
    ],
    "music": {{
        "mood": "energetic",
        "bpm": 120,
        "sync_points": [0.0, 7.5, 15.0, 22.5, 30.0],
        "volume": 0.5
    }},
    "pacing": "fast",
    "narrative_structure": "hook -> build -> climax -> resolution",
    "visual_style": "bright, colorful, dynamic"
}}
Rules:
- source_video is 0-based index (0 for first video, 1 for second, etc.)
- Each scene must have start_time, end_time, and duration
- CRITICAL: start_time and end_time MUST be within the actual video duration. Check the "duration" field in each video summary to ensure timestamps don't exceed it.
- For example, if a video has duration 5.2 seconds, start_time must be < 5.2 and end_time must be <= 5.2
- Total of all scene durations should be approximately {target_duration} seconds (±2 seconds tolerance)
- Use transitions: "cut", "fade", or "crossfade"
- Extract mood tags from the video summaries for the music section
- sync_points should be evenly distributed or aligned to scene transitions
- pacing should be one of: "slow", "moderate", "fast", "very-fast"
- narrative_structure should describe the flow (e.g., "hook -> build -> climax -> resolution")
- visual_style should describe the aesthetic (e.g., "bright, colorful, dynamic" or "dark, moody, cinematic")
- Return ONLY the JSON, no other text or markdown formatting"""

        # Generate script using Gemini.
        response = client.models.generate_content(
            model="gemini-2.5-flash-lite",
            contents=[prompt],
        )
        response_text = response.text.strip()
        # Strip markdown code fences if the model wrapped the JSON anyway.
        if "```json" in response_text:
            response_text = response_text.split("```json")[1].split("```")[0].strip()
        elif "```" in response_text:
            response_text = response_text.split("```")[1].split("```")[0].strip()
        script = json.loads(response_text)

        # Validate the overall script structure.
        if not isinstance(script, dict):
            raise ValueError("Generated script is not a valid dictionary")
        if "total_duration" not in script:
            script["total_duration"] = target_duration
        if "scenes" not in script:
            raise ValueError("Generated script missing 'scenes' field")
        if not isinstance(script["scenes"], list) or len(script["scenes"]) == 0:
            raise ValueError("Generated script must contain at least one scene")

        # Clamp every scene to the real source-video durations.
        video_durations = {
            i: summary.get("duration", 0.0)
            for i, summary in enumerate(summaries_list)
        }
        num_videos = len(summaries_list)
        for scene in script["scenes"]:
            _clamp_scene_timestamps(scene, video_durations, num_videos)

        # If the total is way off (>5 s), rescale durations proportionally.
        total_scene_duration = sum(
            scene.get("duration", 0) for scene in script["scenes"]
        )
        if abs(total_scene_duration - target_duration) > 5.0:
            if total_scene_duration > 0:
                scale_factor = target_duration / total_scene_duration
                for scene in script["scenes"]:
                    if "duration" in scene:
                        scene["duration"] = round(scene["duration"] * scale_factor, 2)
                        if "start_time" in scene and "end_time" in scene:
                            # Recalculate end_time based on scaled duration.
                            scene["end_time"] = round(
                                scene["start_time"] + scene["duration"], 2
                            )
            script["total_duration"] = target_duration

        # Ensure the music section exists; derive mood from the summaries.
        if "music" not in script:
            mood_tags = []
            for summary in summaries_list:
                tags = summary.get("mood_tags", [])
                if isinstance(tags, list):
                    mood_tags.extend(tags)
            mood = mood_tags[0] if mood_tags else "energetic"
            script["music"] = {
                "mood": mood,
                "volume": 0.5,
            }

        # Fill in optional fields if missing.
        if "pacing" not in script:
            script["pacing"] = "moderate"
        if "narrative_structure" not in script:
            script["narrative_structure"] = "linear"
        if "visual_style" not in script:
            script["visual_style"] = "standard"
        return json.dumps(script, indent=2)
    except json.JSONDecodeError:
        # The model returned unparseable JSON — degrade to the fallback script.
        if not summaries_list:
            raise ValueError("No video summaries provided")
        return _build_fallback_script(summaries_list[0], target_duration)
    except Exception as e:
        # Preserve the original contract (callers catch Exception) but chain
        # the cause for debuggability.
        raise Exception(f"Error generating video script: {str(e)}") from e