import os
import json
import re  # NOTE(review): `re` appears unused in this view of the file — confirm before removing
from typing import Any, Optional, List, Union

import google.genai as genai


def _unwrap_tool_response(obj: Any) -> Any:
    """Return the payload of a single-key tool-response wrapper, else *obj*.

    Agent frameworks sometimes wrap tool output as
    ``{"video_summarizer_tool_response": {...}}`` or ``{"*_response": {...}}``;
    this strips that one layer. Non-dict and multi-key values pass through.
    """
    if isinstance(obj, dict) and len(obj) == 1:
        key = next(iter(obj))
        # "_tool_response" contains "_response", so a single substring test
        # covers both wrapper spellings the original code checked for.
        if "_response" in key.lower():
            return obj[key]
    return obj


def _parse_balanced_spans(text: str, open_ch: str, close_ch: str) -> list:
    """Decode every balanced top-level ``open_ch``...``close_ch`` span in *text*.

    Scans for spans where the bracket depth returns to zero and attempts
    ``json.loads`` on each; spans that fail to decode are silently skipped.

    Returns:
        list: the successfully decoded values, in order of appearance.
    """
    decoded: list = []
    depth = 0
    start = -1
    for i, ch in enumerate(text):
        if ch == open_ch:
            if depth == 0:
                start = i
            depth += 1
        elif ch == close_ch:
            depth -= 1
            if depth == 0 and start >= 0:
                try:
                    decoded.append(json.loads(text[start : i + 1]))
                except json.JSONDecodeError:
                    pass
                start = -1
    return decoded


def _extract_and_parse_json(text: str) -> Optional[Union[dict, list]]:
    """Extract and parse JSON from text that might contain extra content.

    Handles JSON wrapped in extra prose, multiple concatenated objects
    (``{}{}`` → list), and tool-response wrappers like
    ``{"video_summarizer_tool_response": {...}}`` (unwrapped, including
    per-element inside a top-level array).

    Returns:
        The parsed value (dict, list, or other JSON scalar), or ``None``
        when nothing parseable is found.
    """
    if not text or not isinstance(text, str):
        return None
    text = text.strip()

    # Fast path: the whole string is valid JSON.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(parsed, list):
            return [_unwrap_tool_response(item) for item in parsed]
        return _unwrap_tool_response(parsed)

    # Salvage path 1: first balanced [...] span that decodes.
    arrays = _parse_balanced_spans(text, "[", "]")
    if arrays:
        return arrays[0]

    # Salvage path 2: concatenated {...}{...} objects. A single object is
    # returned directly, several as a list. (The original code had a third,
    # identical single-object scan after this one; it was unreachable dead
    # code — this scan already tried every balanced brace span — and has
    # been removed.)
    objects = _parse_balanced_spans(text, "{", "}")
    if objects:
        return objects if len(objects) > 1 else objects[0]
    return None


def _coerce_summaries(video_summaries: Union[str, List[dict], List[str]]) -> list:
    """Normalise the accepted ``video_summaries`` shapes into a flat list.

    Accepts a JSON string (object or array), a list of dicts, or a list of
    JSON strings; tool-response wrappers are unwrapped.

    Raises:
        ValueError: on unparseable JSON or an unsupported element type.
    """
    if isinstance(video_summaries, str):
        parsed = _extract_and_parse_json(video_summaries)
        if parsed is None:
            raise ValueError(
                f"Invalid JSON format for video_summaries. "
                f"Could not parse: {video_summaries[:200]}..."
            )
        return parsed if isinstance(parsed, list) else [parsed]

    if isinstance(video_summaries, list):
        flat: list = []
        for summary in video_summaries:
            if isinstance(summary, str):
                parsed = _extract_and_parse_json(summary)
                if parsed is None:
                    raise ValueError(
                        f"Invalid JSON format in video_summaries: {summary[:200]}..."
                    )
                # A string may decode to several objects; flatten them.
                if isinstance(parsed, list):
                    flat.extend(parsed)
                else:
                    flat.append(parsed)
            elif isinstance(summary, dict):
                flat.append(_unwrap_tool_response(summary))
            else:
                raise ValueError(
                    f"Invalid summary type: {type(summary).__name__}. "
                    "Expected dict or JSON string."
                )
        return flat

    raise ValueError(
        f"Invalid video_summaries type: {type(video_summaries).__name__}. "
        "Expected str, list of dicts, or list of JSON strings."
    )


def _fallback_script(summary: Any, target_duration: float) -> dict:
    """Build a single-scene script from one summary, without the LLM.

    Used both when no API key is configured and when the model response is
    not valid JSON. Defensive against malformed summaries: non-dict input,
    non-str "summary" text and non-list "mood_tags" all degrade to defaults
    (fix: the no-API-key path previously lacked these guards).
    """
    if not isinstance(summary, dict):
        summary = {}
    duration = summary.get("duration", target_duration)
    clip_duration = min(duration, target_duration)

    mood_tags = summary.get("mood_tags", ["energetic"])
    mood = (
        mood_tags[0]
        if isinstance(mood_tags, list) and mood_tags
        else "energetic"
    )
    raw_text = summary.get("summary", "Video clip")
    description = raw_text[:100] if isinstance(raw_text, str) else "Video clip"

    return {
        "total_duration": clip_duration,
        "scenes": [
            {
                "scene_id": 1,
                "source_video": 0,
                "start_time": 0.0,
                "end_time": clip_duration,
                "duration": clip_duration,
                "description": description,
                "transition_in": "fade",
                "transition_out": "fade",
            }
        ],
        "music": {
            "mood": mood,
            "volume": 0.5,
        },
        "pacing": "moderate",
        "narrative_structure": "single scene",
    }


def _build_prompt(
    summaries_list: list,
    user_description: Optional[str],
    target_duration: float,
) -> str:
    """Assemble the Gemini prompt: task description, per-video summaries,
    the exact JSON schema to return, and the editing rules."""
    summaries_text = "\n\n".join(
        f"Video {i + 1}:\n{json.dumps(s, indent=2)}"
        for i, s in enumerate(summaries_list)
    )
    user_desc_text = (
        f"\n\nUser Description: {user_description}" if user_description else ""
    )
    return f"""You are a professional video editor creating a {target_duration}-second short-form video.

Here are the video summaries:

{summaries_text}
{user_desc_text}

Create a detailed video composition script that:
1. Selects the most engaging and relevant scenes from the videos
2. Creates a coherent narrative flow with a clear structure (hook -> build -> climax -> resolution)
3. Uses appropriate transitions (cut, fade, or crossfade) between scenes
4. Ensures the total duration is approximately {target_duration} seconds (within ±2 seconds)
5. Distributes scenes evenly across the duration, considering pacing
6. Identifies music mood, BPM, and sync points for rhythm matching
7. Provides visual style recommendations based on the content

Return ONLY a valid JSON object with this exact structure:
{{
  "total_duration": {target_duration},
  "scenes": [
    {{
      "scene_id": 1,
      "source_video": 0,
      "start_time": 0.0,
      "end_time": 5.0,
      "duration": 5.0,
      "description": "Brief description of what happens in this scene",
      "transition_in": "fade",
      "transition_out": "crossfade"
    }},
    {{
      "scene_id": 2,
      "source_video": 1,
      "start_time": 10.0,
      "end_time": 15.0,
      "duration": 5.0,
      "description": "Brief description of what happens in this scene",
      "transition_in": "crossfade",
      "transition_out": "fade"
    }}
  ],
  "music": {{
    "mood": "energetic",
    "bpm": 120,
    "sync_points": [0.0, 7.5, 15.0, 22.5, 30.0],
    "volume": 0.5
  }},
  "pacing": "fast",
  "narrative_structure": "hook -> build -> climax -> resolution",
  "visual_style": "bright, colorful, dynamic"
}}

Rules:
- source_video is 0-based index (0 for first video, 1 for second, etc.)
- Each scene must have start_time, end_time, and duration
- CRITICAL: start_time and end_time MUST be within the actual video duration. Check the "duration" field in each video summary to ensure timestamps don't exceed it.
- For example, if a video has duration 5.2 seconds, start_time must be < 5.2 and end_time must be <= 5.2
- Total of all scene durations should be approximately {target_duration} seconds (±2 seconds tolerance)
- Use transitions: "cut", "fade", or "crossfade"
- Extract mood tags from the video summaries for the music section
- sync_points should be evenly distributed or aligned to scene transitions
- pacing should be one of: "slow", "moderate", "fast", "very-fast"
- narrative_structure should describe the flow (e.g., "hook -> build -> climax -> resolution")
- visual_style should describe the aesthetic (e.g., "bright, colorful, dynamic" or "dark, moody, cinematic")
- Return ONLY the JSON, no other text or markdown formatting"""


def _strip_code_fences(text: str) -> str:
    """Return the JSON payload from *text*, unwrapping a markdown
    ``` / ```json code fence if the model added one."""
    if "```json" in text:
        return text.split("```json")[1].split("```")[0].strip()
    if "```" in text:
        return text.split("```")[1].split("```")[0].strip()
    return text


def _clamp_scene_bounds(scenes: list, summaries_list: list) -> None:
    """Clamp each scene's source index and timestamps to its video (in place).

    - ``source_video`` is clamped to [0, len(summaries) - 1]; missing → 0.
    - If ``start_time`` is past the end of the video, the last <=2 s of the
      video are used instead.
    - Otherwise start/end are clamped into the video, end is derived from
      ``duration`` when absent, and ``duration`` is recomputed.
    """
    video_durations = {
        i: (s.get("duration", 0.0) if isinstance(s, dict) else 0.0)
        for i, s in enumerate(summaries_list)
    }
    last_index = max(0, len(summaries_list) - 1)

    for scene in scenes:
        source_idx = scene.get("source_video")
        if isinstance(source_idx, int):
            scene["source_video"] = min(max(source_idx, 0), last_index)
        elif source_idx is None:
            # Missing index: default to the first video.
            scene["source_video"] = 0

        source_idx = scene.get("source_video")
        if not (isinstance(source_idx, int) and source_idx in video_durations):
            continue  # non-int index from the model: leave timestamps alone

        video_duration = video_durations[source_idx]
        start_time = scene.get("start_time", 0.0)
        scene_duration = scene.get("duration")

        if start_time >= video_duration:
            # Start is past the end: use the last portion of the video
            # (last 2 seconds, or the whole video if shorter). The original
            # code branched three ways here with identical bodies; collapsed.
            clip = min(2.0, video_duration)
            scene["start_time"] = max(0.0, video_duration - clip)
            scene["end_time"] = video_duration
            scene["duration"] = video_duration - scene["start_time"]
        else:
            scene["start_time"] = max(0.0, min(start_time, video_duration - 0.1))
            end_time = scene.get("end_time")
            if end_time is None:
                # Derive the end from duration when given, else run to the end.
                end_time = (
                    scene["start_time"] + scene_duration
                    if scene_duration
                    else video_duration
                )
            # Keep at least a 0.1 s clip, never past the end of the video.
            scene["end_time"] = max(
                scene["start_time"] + 0.1, min(end_time, video_duration)
            )
            scene["duration"] = scene["end_time"] - scene["start_time"]


def _rescale_durations(script: dict, target_duration: float) -> None:
    """Proportionally rescale scene durations (in place) when their sum is
    more than 5 s away from *target_duration*."""
    total = sum(scene.get("duration", 0) for scene in script["scenes"])
    if abs(total - target_duration) <= 5.0 or total <= 0:
        return
    scale = target_duration / total
    for scene in script["scenes"]:
        if "duration" in scene:
            scene["duration"] = round(scene["duration"] * scale, 2)
            if "start_time" in scene and "end_time" in scene:
                # Keep end_time consistent with the scaled duration.
                scene["end_time"] = round(
                    scene["start_time"] + scene["duration"], 2
                )
    script["total_duration"] = target_duration


def _apply_default_sections(script: dict, summaries_list: list) -> None:
    """Fill in missing optional sections of the script (in place):
    music (mood pooled from the summaries' mood_tags), pacing,
    narrative_structure and visual_style."""
    if "music" not in script:
        mood_tags: list = []
        for summary in summaries_list:
            tags = summary.get("mood_tags", []) if isinstance(summary, dict) else []
            if isinstance(tags, list):
                mood_tags.extend(tags)
        script["music"] = {
            "mood": mood_tags[0] if mood_tags else "energetic",
            "volume": 0.5,
        }
    script.setdefault("pacing", "moderate")
    script.setdefault("narrative_structure", "linear")
    script.setdefault("visual_style", "standard")


def video_script_generator(
    video_summaries: Union[str, List[dict], List[str]],
    user_description: Optional[str] = None,
    target_duration: float = 30.0,
) -> str:
    """Create a detailed script/storyboard for the final short-form video.

    Uses the Google Gemini API to generate a scene-by-scene composition
    script from video summaries. When no ``GOOGLE_API_KEY`` is configured,
    or the model's response is not valid JSON, a simple single-scene
    fallback script built from the first summary is returned instead.

    Args:
        video_summaries: Summaries from the video_summarizer tool. Accepts a
            JSON string (object or array), a list of dicts, or a list of
            JSON strings; ``{"*_tool_response": {...}}`` wrappers are
            unwrapped automatically.
        user_description: Optional description of the desired
            mood/style/content.
        target_duration: Target duration in seconds (default: 30.0).

    Returns:
        str: JSON string with ``total_duration``, ``scenes`` (each scene has
        scene_id, source_video, start_time, end_time, duration, description,
        transition_in, transition_out), ``music`` (mood, bpm, sync_points,
        volume), ``pacing``, ``narrative_structure`` and ``visual_style``.
        Scene timestamps are validated against each source video's duration
        and scene durations are rescaled toward ``target_duration``.

    Raises:
        Exception: wrapping any underlying failure, with the original
            exception chained as the cause.
    """
    summaries_list: list = []
    try:
        summaries_list = _coerce_summaries(video_summaries)
        if not summaries_list:
            raise ValueError("No video summaries provided")
        if target_duration <= 0:
            raise ValueError("target_duration must be greater than 0")

        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            # No credentials: degrade gracefully to a deterministic
            # single-scene script from the first summary.
            return json.dumps(
                _fallback_script(summaries_list[0], target_duration), indent=2
            )

        client = genai.Client(api_key=api_key)
        prompt = _build_prompt(summaries_list, user_description, target_duration)
        response = client.models.generate_content(
            model="gemini-2.5-flash-lite",
            contents=[prompt],
        )

        # May raise json.JSONDecodeError → handled by the fallback below.
        script = json.loads(_strip_code_fences(response.text.strip()))

        # Validate the generated structure before post-processing.
        if not isinstance(script, dict):
            raise ValueError("Generated script is not a valid dictionary")
        if "scenes" not in script:
            raise ValueError("Generated script missing 'scenes' field")
        if not isinstance(script["scenes"], list) or not script["scenes"]:
            raise ValueError("Generated script must contain at least one scene")
        script.setdefault("total_duration", target_duration)

        _clamp_scene_bounds(script["scenes"], summaries_list)
        _rescale_durations(script, target_duration)
        _apply_default_sections(script, summaries_list)

        return json.dumps(script, indent=2)
    except json.JSONDecodeError:
        # The model returned unparseable text: fall back to a simple script.
        if not summaries_list:
            raise ValueError("No video summaries provided")
        return json.dumps(
            _fallback_script(summaries_list[0], target_duration), indent=2
        )
    except Exception as e:
        # Fix: chain the cause so the original traceback is preserved.
        raise Exception(f"Error generating video script: {str(e)}") from e