# vidzly/src/app/tools/video_script_generator.py
# Last change: tthhanh — "chore: black reformating" (commit 5a459dd)
import os
import json
import re
from typing import Optional, List, Union
import google.genai as genai
def _extract_and_parse_json(text: str) -> Optional[Union[dict, list]]:
"""
Extract and parse JSON from text that might contain extra content.
Handles cases where JSON is wrapped in markdown, has extra text, multiple objects,
or wrapped in tool response format like {"tool_name_response": {...}}.
"""
if not text or not isinstance(text, str):
return None
text = text.strip()
# Try direct parsing first
try:
parsed = json.loads(text)
# Check if it's wrapped in a tool response format
# e.g., {"video_summarizer_tool_response": {...}} or [{"video_summarizer_tool_response": {...}}]
if isinstance(parsed, dict):
# Check if it's a single tool response wrapper
if len(parsed) == 1:
key = list(parsed.keys())[0]
if "_tool_response" in key.lower() or "_response" in key.lower():
# Extract the actual data from the wrapper
return parsed[key]
elif isinstance(parsed, list) and len(parsed) > 0:
# Check if list contains wrapped responses
unwrapped = []
for item in parsed:
if isinstance(item, dict) and len(item) == 1:
key = list(item.keys())[0]
if "_tool_response" in key.lower() or "_response" in key.lower():
unwrapped.append(item[key])
else:
unwrapped.append(item)
else:
unwrapped.append(item)
return unwrapped if unwrapped else parsed
return parsed
except json.JSONDecodeError:
pass
# Try to extract JSON array by finding balanced brackets
bracket_count = 0
array_start = -1
for i, char in enumerate(text):
if char == "[":
if bracket_count == 0:
array_start = i
bracket_count += 1
elif char == "]":
bracket_count -= 1
if bracket_count == 0 and array_start >= 0:
array_str = text[array_start : i + 1]
try:
return json.loads(array_str)
except json.JSONDecodeError:
pass
array_start = -1
# Try to find multiple JSON objects and combine them into an array
# This handles cases where objects are concatenated: {}{}
objects = []
brace_count = 0
start_idx = -1
for i, char in enumerate(text):
if char == "{":
if brace_count == 0:
start_idx = i
brace_count += 1
elif char == "}":
brace_count -= 1
if brace_count == 0 and start_idx >= 0:
obj_str = text[start_idx : i + 1]
try:
obj = json.loads(obj_str)
objects.append(obj)
except json.JSONDecodeError:
pass
start_idx = -1
if objects:
# If we found multiple objects, return as list
# If only one, return it directly (will be wrapped in list by caller)
return objects if len(objects) > 1 else objects[0]
# Try to extract a single JSON object by finding balanced braces
brace_count = 0
obj_start = -1
for i, char in enumerate(text):
if char == "{":
if brace_count == 0:
obj_start = i
brace_count += 1
elif char == "}":
brace_count -= 1
if brace_count == 0 and obj_start >= 0:
obj_str = text[obj_start : i + 1]
try:
return json.loads(obj_str)
except json.JSONDecodeError:
pass
obj_start = -1
return None
def _unwrap_tool_response(summary: dict) -> dict:
    """Return the payload of a single-key {"..._response": {...}} wrapper, else *summary* unchanged."""
    if len(summary) == 1:
        key = next(iter(summary))
        # "_tool_response" contains "_response", so one substring check suffices.
        if "_response" in key.lower():
            return summary[key]
    return summary


def _coerce_summaries(
    video_summaries: Union[str, List[dict], List[str]]
) -> List[dict]:
    """Normalize the accepted input shapes into a flat list of summary dicts.

    Accepts a JSON string (object or array), a list of dicts, or a list of
    JSON strings. Tool-response wrappers are unwrapped along the way.

    Raises:
        ValueError: on unparseable JSON or an unsupported element/input type.
    """
    summaries_list: List[dict] = []
    if isinstance(video_summaries, str):
        parsed = _extract_and_parse_json(video_summaries)
        if parsed is None:
            raise ValueError(
                f"Invalid JSON format for video_summaries. "
                f"Could not parse: {video_summaries[:200]}..."
            )
        summaries_list = parsed if isinstance(parsed, list) else [parsed]
    elif isinstance(video_summaries, list):
        for summary in video_summaries:
            if isinstance(summary, str):
                parsed = _extract_and_parse_json(summary)
                if parsed is None:
                    raise ValueError(
                        f"Invalid JSON format in video_summaries: {summary[:200]}..."
                    )
                # A string may itself decode to several summaries.
                if isinstance(parsed, list):
                    summaries_list.extend(parsed)
                else:
                    summaries_list.append(parsed)
            elif isinstance(summary, dict):
                summaries_list.append(_unwrap_tool_response(summary))
            else:
                raise ValueError(
                    f"Invalid summary type: {type(summary).__name__}. "
                    "Expected dict or JSON string."
                )
    else:
        raise ValueError(
            f"Invalid video_summaries type: {type(video_summaries).__name__}. "
            "Expected str, list of dicts, or list of JSON strings."
        )
    return summaries_list


def _build_fallback_script(summary, target_duration: float) -> str:
    """Build a minimal single-scene script (JSON string) from one summary.

    Used both when no API key is configured and when the model's response
    cannot be parsed. Tolerates a non-dict summary and a non-string
    "summary" field (the original no-API-key path crashed on the latter).
    """
    if not isinstance(summary, dict):
        summary = {}
    duration = summary.get("duration", target_duration)
    clip_duration = min(duration, target_duration)
    # Extract mood from summary; default to "energetic".
    mood_tags = summary.get("mood_tags", ["energetic"])
    mood = mood_tags[0] if mood_tags else "energetic"
    raw_summary = summary.get("summary")
    description = raw_summary[:100] if isinstance(raw_summary, str) else "Video clip"
    fallback_script = {
        "total_duration": clip_duration,
        "scenes": [
            {
                "scene_id": 1,
                "source_video": 0,
                "start_time": 0.0,
                "end_time": clip_duration,
                "duration": clip_duration,
                "description": description,
                "transition_in": "fade",
                "transition_out": "fade",
            }
        ],
        "music": {
            "mood": mood,
            "volume": 0.5,
        },
        "pacing": "moderate",
        "narrative_structure": "single scene",
    }
    return json.dumps(fallback_script, indent=2)


def _clamp_scene_timestamps(
    scene: dict, video_durations: dict, num_videos: int
) -> None:
    """Clamp a scene's source index and timestamps to real video bounds, in place.

    The LLM may hallucinate out-of-range indices or timestamps; this enforces
    0 <= source_video < num_videos and 0 <= start < end <= video duration.
    """
    source_video_idx = scene.get("source_video")
    if isinstance(source_video_idx, int):
        # Clamp index into [0, num_videos - 1].
        scene["source_video"] = min(max(source_video_idx, 0), max(0, num_videos - 1))
    elif source_video_idx is None:
        # Missing index: default to the first video.
        scene["source_video"] = 0

    validated_idx = scene.get("source_video")
    if not (isinstance(validated_idx, int) and validated_idx in video_durations):
        # Non-integer index we could not repair — leave timestamps untouched.
        return
    video_duration = video_durations[validated_idx]
    start_time = scene.get("start_time", 0.0)
    end_time = scene.get("end_time")
    scene_duration = scene.get("duration")
    if start_time >= video_duration:
        # start_time beyond the clip: use the last <=2 s of the video.
        # (The original had a three-way if/elif/else here whose branches all
        # assigned the same values — collapsed to one assignment.)
        clip_duration = min(2.0, video_duration)
        scene["start_time"] = max(0.0, video_duration - clip_duration)
        scene["end_time"] = video_duration
        scene["duration"] = video_duration - scene["start_time"]
    else:
        # Clamp start_time into [0, duration - 0.1].
        scene["start_time"] = max(0.0, min(start_time, video_duration - 0.1))
        if end_time is None:
            # Derive end from the declared duration, else run to the clip end.
            if scene_duration:
                calculated_end_time = scene["start_time"] + scene_duration
            else:
                calculated_end_time = video_duration
        else:
            calculated_end_time = end_time
        # Guarantee at least 0.1 s of footage, capped at the clip end.
        scene["end_time"] = max(
            scene["start_time"] + 0.1,
            min(calculated_end_time, video_duration),
        )
        scene["duration"] = scene["end_time"] - scene["start_time"]


def video_script_generator(
    video_summaries: Union[str, List[dict], List[str]],
    user_description: Optional[str] = None,
    target_duration: float = 30.0,
) -> str:
    """
    Create a detailed script/storyboard for the final 30-second video.

    Uses Google Gemini API to intelligently generate a script based on video
    summaries and user requirements. When no GOOGLE_API_KEY is configured, or
    when the model response cannot be parsed, a simple single-scene fallback
    script built from the first summary is returned instead.

    Args:
        video_summaries: Video summaries from video_summarizer tool.
            Can be:
            - JSON string (single summary)
            - List of dict objects (multiple summaries)
            - List of JSON strings (multiple summaries)
        user_description: Optional user description of desired mood/style/content
        target_duration: Target duration in seconds (default: 30.0)

    Returns:
        str: JSON string containing a detailed script with:
            - Scene sequence with source video and timestamps
            - Duration for each scene segment (sums to ~target_duration seconds)
            - Transition types between scenes (cut, fade, crossfade)
            - Music synchronization info (mood, bpm, sync points)
            - Pacing, narrative structure, and visual style recommendations

    Raises:
        Exception: wrapping any underlying error (invalid input types,
            unparseable summaries, API failures), with the cause chained.

    Example output format:
        {
            "total_duration": 30.0,
            "scenes": [
                {
                    "scene_id": 1,
                    "source_video": 0,
                    "start_time": 5.2,
                    "end_time": 8.5,
                    "duration": 3.3,
                    "description": "Opening shot of landscape",
                    "transition_in": "fade",
                    "transition_out": "crossfade"
                },
                ...
            ],
            "music": {
                "mood": "energetic",
                "bpm": 120,
                "sync_points": [0.0, 7.5, 15.0, 22.5, 30.0],
                "volume": 0.5
            },
            "pacing": "fast",
            "narrative_structure": "hook -> build -> climax -> resolution",
            "visual_style": "bright, colorful, dynamic"
        }
    """
    try:
        summaries_list = _coerce_summaries(video_summaries)
        if not summaries_list:
            raise ValueError("No video summaries provided")
        if target_duration <= 0:
            raise ValueError("target_duration must be greater than 0")

        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            # No API access: degrade gracefully to a single-scene script.
            return _build_fallback_script(summaries_list[0], target_duration)

        # Initialize Gemini client and build the generation prompt.
        client = genai.Client(api_key=api_key)
        summaries_text = "\n\n".join(
            f"Video {i+1}:\n{json.dumps(s, indent=2)}"
            for i, s in enumerate(summaries_list)
        )
        user_desc_text = (
            f"\n\nUser Description: {user_description}" if user_description else ""
        )
        prompt = f"""You are a professional video editor creating a {target_duration}-second short-form video.
Here are the video summaries:
{summaries_text}
{user_desc_text}
Create a detailed video composition script that:
1. Selects the most engaging and relevant scenes from the videos
2. Creates a coherent narrative flow with a clear structure (hook -> build -> climax -> resolution)
3. Uses appropriate transitions (cut, fade, or crossfade) between scenes
4. Ensures the total duration is approximately {target_duration} seconds (within ±2 seconds)
5. Distributes scenes evenly across the duration, considering pacing
6. Identifies music mood, BPM, and sync points for rhythm matching
7. Provides visual style recommendations based on the content
Return ONLY a valid JSON object with this exact structure:
{{
    "total_duration": {target_duration},
    "scenes": [
        {{
            "scene_id": 1,
            "source_video": 0,
            "start_time": 0.0,
            "end_time": 5.0,
            "duration": 5.0,
            "description": "Brief description of what happens in this scene",
            "transition_in": "fade",
            "transition_out": "crossfade"
        }},
        {{
            "scene_id": 2,
            "source_video": 1,
            "start_time": 10.0,
            "end_time": 15.0,
            "duration": 5.0,
            "description": "Brief description of what happens in this scene",
            "transition_in": "crossfade",
            "transition_out": "fade"
        }}
    ],
    "music": {{
        "mood": "energetic",
        "bpm": 120,
        "sync_points": [0.0, 7.5, 15.0, 22.5, 30.0],
        "volume": 0.5
    }},
    "pacing": "fast",
    "narrative_structure": "hook -> build -> climax -> resolution",
    "visual_style": "bright, colorful, dynamic"
}}
Rules:
- source_video is 0-based index (0 for first video, 1 for second, etc.)
- Each scene must have start_time, end_time, and duration
- CRITICAL: start_time and end_time MUST be within the actual video duration. Check the "duration" field in each video summary to ensure timestamps don't exceed it.
- For example, if a video has duration 5.2 seconds, start_time must be < 5.2 and end_time must be <= 5.2
- Total of all scene durations should be approximately {target_duration} seconds (±2 seconds tolerance)
- Use transitions: "cut", "fade", or "crossfade"
- Extract mood tags from the video summaries for the music section
- sync_points should be evenly distributed or aligned to scene transitions
- pacing should be one of: "slow", "moderate", "fast", "very-fast"
- narrative_structure should describe the flow (e.g., "hook -> build -> climax -> resolution")
- visual_style should describe the aesthetic (e.g., "bright, colorful, dynamic" or "dark, moody, cinematic")
- Return ONLY the JSON, no other text or markdown formatting"""

        # Generate script using Gemini.
        response = client.models.generate_content(
            model="gemini-2.5-flash-lite",
            contents=[prompt],
        )
        response_text = response.text.strip()
        # Strip markdown code fences if the model wrapped the JSON anyway.
        if "```json" in response_text:
            response_text = response_text.split("```json")[1].split("```")[0].strip()
        elif "```" in response_text:
            response_text = response_text.split("```")[1].split("```")[0].strip()
        script = json.loads(response_text)

        # Validate the overall script structure.
        if not isinstance(script, dict):
            raise ValueError("Generated script is not a valid dictionary")
        if "total_duration" not in script:
            script["total_duration"] = target_duration
        if "scenes" not in script:
            raise ValueError("Generated script missing 'scenes' field")
        if not isinstance(script["scenes"], list) or len(script["scenes"]) == 0:
            raise ValueError("Generated script must contain at least one scene")

        # Clamp every scene to the real source-video durations.
        video_durations = {
            i: summary.get("duration", 0.0)
            for i, summary in enumerate(summaries_list)
        }
        num_videos = len(summaries_list)
        for scene in script["scenes"]:
            _clamp_scene_timestamps(scene, video_durations, num_videos)

        # If the total is way off (>5 s), rescale durations proportionally.
        total_scene_duration = sum(
            scene.get("duration", 0) for scene in script["scenes"]
        )
        if abs(total_scene_duration - target_duration) > 5.0:
            if total_scene_duration > 0:
                scale_factor = target_duration / total_scene_duration
                for scene in script["scenes"]:
                    if "duration" in scene:
                        scene["duration"] = round(scene["duration"] * scale_factor, 2)
                        if "start_time" in scene and "end_time" in scene:
                            # Recalculate end_time based on scaled duration.
                            scene["end_time"] = round(
                                scene["start_time"] + scene["duration"], 2
                            )
            script["total_duration"] = target_duration

        # Ensure the music section exists; derive mood from the summaries.
        if "music" not in script:
            mood_tags = []
            for summary in summaries_list:
                tags = summary.get("mood_tags", [])
                if isinstance(tags, list):
                    mood_tags.extend(tags)
            mood = mood_tags[0] if mood_tags else "energetic"
            script["music"] = {
                "mood": mood,
                "volume": 0.5,
            }

        # Fill in optional fields if missing.
        if "pacing" not in script:
            script["pacing"] = "moderate"
        if "narrative_structure" not in script:
            script["narrative_structure"] = "linear"
        if "visual_style" not in script:
            script["visual_style"] = "standard"
        return json.dumps(script, indent=2)
    except json.JSONDecodeError:
        # The model returned unparseable JSON — degrade to the fallback script.
        if not summaries_list:
            raise ValueError("No video summaries provided")
        return _build_fallback_script(summaries_list[0], target_duration)
    except Exception as e:
        # Preserve the original contract (callers catch Exception) but chain
        # the cause for debuggability.
        raise Exception(f"Error generating video script: {str(e)}") from e