# NOTE: "Spaces: Sleeping" banner below was hosting-platform page residue
# captured during export; preserved here as a comment so the module parses.
# Spaces: Sleeping / Sleeping
| """ | |
| Advanced Prompt Generator using GPT-4o | |
| Structured JSON generation with strict validation | |
| """ | |
| import re | |
| import base64 | |
| from typing import List, Optional, Dict, Any | |
| from pydantic import BaseModel, Field | |
| from openai import OpenAI | |
| import os | |
class VeoInputs(BaseModel):
    """Input parameters for video generation.

    Carries the raw narration script plus the style/continuity knobs.
    The whole model is dumped verbatim (``model_dump``) into the GPT-4o
    prompt as the "STYLE SETTINGS" section.
    """
    # NOTE(review): camelCase names presumably mirror a client-side JSON
    # payload — confirm before renaming to snake_case.
    script: str  # full narration script; split into ~8s segments downstream
    style: str  # visual/tonal style label
    jsonFormat: str = 'standard'  # output-format knob forwarded in the prompt
    continuationMode: bool = True  # whether segments should chain continuously
    voiceType: Optional[str] = None
    energyLevel: Optional[str] = None
    settingMode: str = 'single'  # 'single' keeps one location across segments
    cameraStyle: Optional[str] = None
    energyArc: Optional[str] = None
    narrativeStyle: Optional[str] = None
    accentRegion: Optional[str] = None
class ContinuityMarkers(BaseModel):
    """Markers for maintaining continuity between segments.

    Start/end pairs let consecutive segments hand off pose, expression,
    and gesture so the character appears continuous across cuts.
    """
    start_position: str  # body position at the first frame of this segment
    end_position: str  # body position at the last frame (next segment's start)
    start_expression: str  # facial expression entering the segment
    end_expression: str  # facial expression leaving the segment
    start_gesture: str
    end_gesture: str
    # Free-form note on the location state across the cut —
    # TODO(review): confirm the exact convention expected by consumers.
    location_status: str
class SegmentInfo(BaseModel):
    """Basic segment information."""
    # Segment index (presumably 1-based — the prompt enumerates from 1).
    segment_number: int
    total_segments: int  # must equal the overall segment count (validated)
    duration: str  # expected to be exactly "00:00-00:08" (validated)
    location: str
    continuity_markers: ContinuityMarkers
class CharacterDescription(BaseModel):
    """Detailed character description.

    ``physical`` and ``clothing`` must be copy-paste identical across all
    segments (enforced by validate_segments_payload); ``current_state``
    and ``voice_matching`` are segment-specific.
    """
    # Descriptions previously advertised 200+/150+/100+/100+ words, which
    # contradicted both the prompt text and the enforced MIN_WORDS minimums
    # (150/100/50/50); they now match the validator.
    physical: str = Field(..., description="150+ words")
    clothing: str = Field(..., description="100+ words")
    current_state: str = Field(..., description="50+ words, segment-specific")
    voice_matching: str = Field(..., description="50+ words, segment-specific")
class SynchronizedActions(BaseModel):
    """Time-synced actions for each 2-second window of the 8s segment.

    Python identifiers cannot contain ':' or '-', so each window is stored
    under a safe field name and exposed via its time-range alias; dumping
    with model_dump(by_alias=True) restores the "0:00-0:02"-style keys
    that the validator requires.
    """
    f0000_0002: str = Field(alias="0:00-0:02")  # seconds 0-2
    f0002_0004: str = Field(alias="0:02-0:04")  # seconds 2-4
    f0004_0006: str = Field(alias="0:04-0:06")  # seconds 4-6
    f0006_0008: str = Field(alias="0:06-0:08")  # seconds 6-8
    class Config:
        # Accept either the field name or the alias when populating.
        populate_by_name = True
class ActionTimeline(BaseModel):
    """Detailed action timeline for the segment."""
    dialogue: str  # exact pre-split dialogue for this segment (no overlap)
    synchronized_actions: SynchronizedActions
    # Description previously said "50+ words" but the validator (MIN_WORDS)
    # and the prompt both enforce 40; aligned to the enforced minimum.
    micro_expressions: str = Field(..., description="40+ words")
    breathing_rhythm: str
    location_transition: str
    continuity_checkpoint: str
class SceneContinuity(BaseModel):
    """Scene and camera details.

    environment / lighting_state / props_in_frame / background_elements
    must be identical across all segments (enforced for environment by
    validate_segments_payload).
    """
    # Descriptions previously advertised 250+/75+/50+/75+/50+ words, which
    # contradicted the prompt text and the enforced MIN_WORDS minimums
    # (150/50/40/40/40); they now match the validator.
    environment: str = Field(..., description="150+ words")
    camera_position: str = Field(..., description="50+ words")
    camera_movement: str = Field(..., description="detailed movement path")
    lighting_state: str = Field(..., description="40+ words")
    props_in_frame: str = Field(..., description="40+ words")
    background_elements: str = Field(..., description="40+ words")
    spatial_relationships: str
class Segment(BaseModel):
    """Complete segment specification: info, character, scene, and timeline."""
    segment_info: SegmentInfo
    character_description: CharacterDescription
    scene_continuity: SceneContinuity
    action_timeline: ActionTimeline
class SegmentsPayload(BaseModel):
    """Top-level payload: the ordered list of all generated segments.

    Used as the structured-output schema for the GPT-4o parse call.
    """
    segments: List[Segment]
def split_script_into_segments(
    script: str,
    seconds_per_segment: int = 8,
    words_per_second: float = 2.2
) -> List[str]:
    """Split a script into roughly fixed-duration dialogue chunks.

    Sentences are packed greedily until the word budget
    (seconds_per_segment * words_per_second, floored at 14 words) would
    be exceeded. In dev environments the result is capped at 2 segments
    for faster iteration.

    Args:
        script: Full script text.
        seconds_per_segment: Target duration per segment.
        words_per_second: Speaking rate (adjust for VO tempo).

    Returns:
        Non-empty list of script chunks (falls back to the whole script).
    """
    pieces = re.split(r'(?<=[.!?])\s+', script.strip())
    sentences = [piece.strip() for piece in pieces if piece.strip()]
    word_budget = max(14, int(seconds_per_segment * words_per_second))

    chunks: List[str] = []
    pending: List[str] = []
    pending_words = 0
    for sentence in sentences:
        n_words = len(sentence.split())
        # Flush the current chunk before it would overflow the budget.
        if pending and pending_words + n_words > word_budget:
            chunks.append(" ".join(pending))
            pending, pending_words = [], 0
        pending.append(sentence)
        pending_words += n_words
    if pending:
        chunks.append(" ".join(pending))

    # DEV: cap at 2 segments for faster testing; PROD: keep everything.
    env_name = os.getenv('ENVIRONMENT', 'dev').lower()
    dev_mode = env_name in ('dev', 'development')
    if dev_mode and len(chunks) > 2:
        print(f"β οΈ DEV MODE: Limiting from {len(chunks)} to 2 segments")
        chunks = chunks[:2]
    elif not dev_mode:
        print(f"β PROD MODE: Generating all {len(chunks)} segments")

    return chunks or [script.strip()]
def build_prompt(inputs: "VeoInputs", segment_texts: List[str]) -> str:
    """
    Build the user prompt for GPT-4o.

    Args:
        inputs: Video generation inputs (script + style knobs)
        segment_texts: Pre-split, mutually exclusive dialogue per segment

    Returns:
        Formatted prompt string (header + per-segment dialogue lines + footer)
    """
    N = len(segment_texts)
    # Raw dict of all style knobs; embedded verbatim in "STYLE SETTINGS" below.
    knobs = inputs.model_dump()
    header = f"""
You are a STRICT production-grade JSON generator for Veo 3 video prompts.
β οΈ CRITICAL: Your output will be VALIDATED. ANY field under minimum word count will be REJECTED.
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
π¨ CRITICAL: CHARACTER MUST MATCH REFERENCE IMAGE EXACTLY π¨
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
A REFERENCE IMAGE IS PROVIDED. You MUST:
1. ANALYZE the image carefully and describe the EXACT person you see
2. Use the SAME character description for ALL {N} segments (copy-paste identical text)
3. Include SPECIFIC details from the image:
- EXACT hair color (e.g., "strawberry blonde", "auburn", "dark brown")
- EXACT eye color (e.g., "green", "blue", "brown")
- EXACT facial features (freckles, skin tone, face shape)
- EXACT clothing visible in the image (color, pattern, style)
- EXACT age appearance (not generic "mid-thirties")
4. DO NOT invent or change ANY physical features
5. The generated video MUST show the SAME person as the reference image
β οΈ If the character description doesn't match the reference image, the video will be REJECTED.
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
MANDATORY WORD COUNT REQUIREMENTS - WILL BE VALIDATED
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
character_description.physical: MINIMUM 150 WORDS (MUST describe the EXACT person in the reference image - IDENTICAL across ALL {N} segments)
character_description.clothing: MINIMUM 100 WORDS (MUST describe the EXACT clothing in the reference image - IDENTICAL across ALL {N} segments)
character_description.current_state: MINIMUM 50 WORDS (segment-specific)
character_description.voice_matching: MINIMUM 50 WORDS (segment-specific)
scene_continuity.environment: MINIMUM 150 WORDS (MUST be IDENTICAL across ALL {N} segments - same location throughout)
scene_continuity.camera_position: MINIMUM 50 WORDS (MUST be consistent framing)
scene_continuity.lighting_state: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
scene_continuity.props_in_frame: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
scene_continuity.background_elements: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
action_timeline.micro_expressions: MINIMUM 40 WORDS
β οΈ If ANY field has fewer words than the minimum, the ENTIRE payload will be REJECTED.
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
WHAT 200+ WORDS LOOKS LIKE (EXAMPLE FOR PHYSICAL):
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
"A person in their mid-thirties with a warm, approachable presence that immediately puts viewers at ease. Their facial structure features high, defined cheekbones and a strong, angular jawline that conveys confidence without appearing intimidating. They have expressive, almond-shaped eyes with a rich brown color that sparkles with intelligence and authenticity when discussing financial topics. Their eyebrows are naturally shaped and animated, often raising slightly when emphasizing important points about debt relief. The person maintains excellent posture throughout, sitting or standing with shoulders back and spine straight, projecting both professionalism and relatability. Their skin tone is natural and even, with a healthy glow that suggests good self-care. They have a genuine, engaging smile that reaches their eyes, creating authentic crow's feet at the corners when they express enthusiasm about helping people save money. Their hair is styled in a modern, professional manner that doesn't distract from their message. The person's hands are visible during gestures, with natural, purposeful movements that emphasize key phrases about the tariff relief program. They maintain steady eye contact with the camera, creating a direct connection with viewers. Their overall appearance suggests someone who is both knowledgeable about financial matters and genuinely invested in helping others achieve debt freedom." (200+ words)
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
WHAT 150+ WORDS LOOKS LIKE (EXAMPLE FOR ENVIRONMENT):
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
"A contemporary, warmly-lit interior space that exudes professionalism while maintaining an approachable, comfortable atmosphere perfect for discussing personal finance topics. The setting features soft, natural daylight streaming through large windows, creating gentle highlights and shadows that add depth and dimension to the frame. The background wall showcases a sophisticated neutral color paletteβthink warm beige or soft gray tonesβthat doesn't compete for attention but provides visual interest through subtle texture. The space includes carefully curated elements of modern interior design: perhaps a sleek bookshelf with financial publications, a tasteful piece of abstract art that adds color without distraction, and contemporary furniture pieces that suggest success and stability. The floor is likely hardwood or high-quality laminate, polished to reflect light subtly. The lighting setup combines natural window light with strategically placed LED panels or softboxes that eliminate harsh shadows while maintaining a natural, lifestyle aesthetic rather than an overly commercial look. The depth of field is moderate, keeping the subject sharp while softly blurring background elements to maintain focus. Environmental sound design would capture subtle ambient noiseβperhaps distant city sounds or soft office ambianceβthat grounds the viewer in a real, authentic space. The overall atmosphere suggests a professional consultation setting where important financial decisions are made, yet feels intimate and personal enough that viewers can imagine having this conversation in their own homes. Every element of the environment reinforces the credibility of the debt relief message while maintaining the authentic, UGC-style feel that drives engagement. The space is clutter-free but lived-in, striking the perfect balance between aspirational and relatable." (250+ words)
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
YOUR TASK
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Generate EXACTLY {N} segments. Each segment MUST meet ALL word count requirements above.
Duration: "00:00-00:08" for each segment
Synchronized actions: MUST have keys "0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08"
Total segments: Set segment_info.total_segments = {N} on EVERY segment
β οΈ CRITICAL DIALOGUE RULE - NO OVERLAP:
- Each segment's action_timeline.dialogue MUST contain ONLY the text assigned below
- NEVER repeat any words or sentences from previous segments
- NEVER include any words or sentences from the next segment
- Each segment's dialogue is MUTUALLY EXCLUSIVE - zero overlap allowed
- The dialogue for each segment is PRE-SPLIT below - use it EXACTLY as given
SCRIPT TO SEGMENT:
\"\"\"{inputs.script.strip()}\"\"\"
STYLE SETTINGS:
{knobs}
SEGMENTS TO GENERATE (USE DIALOGUE EXACTLY AS SHOWN - NO OVERLAP):
"""
    seg_lines = "\n".join([f"- Segment {i+1} dialogue (EXACT): \"{t}\"" for i, t in enumerate(segment_texts)])
    # BUGFIX: the footer is a plain (non-f) string, so the previous "{{"/"}}"
    # escapes were emitted literally as doubled braces in the JSON example
    # shown to the model; single braces are correct here.
    footer = """
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
CRITICAL REMINDER BEFORE YOU GENERATE
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
π¨ CHARACTER CONSISTENCY IS MANDATORY:
- physical description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- clothing description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- environment description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- The person MUST look IDENTICAL in every segment (same face, hair, clothes, setting)
β CHECK EVERY FIELD FOR MINIMUM WORD COUNTS
β physical: 150+ words | clothing: 100+ words
β current_state: 50+ words | voice_matching: 50+ words
β environment: 150+ words (MOST IMPORTANT - BE VERY DETAILED)
β camera_position: 50+ words | lighting_state: 40+ words
β props_in_frame: 40+ words | background_elements: 40+ words
β micro_expressions: 40+ words
π¨ CRITICAL: NO BLUR TRANSITIONS - REMINDER π¨
- Every segment starts SHARP and CLEAR at 0:00
- camera_movement must describe movement from an already-focused state
- synchronized_actions["0:00-0:02"] must begin with subject in sharp focus
- NO fade-in, NO blur, NO gradual focus at segment start
β οΈ VALIDATION WILL COUNT EVERY WORD. Generate MORE than minimum to be safe!
β οΈ Describe the EXACT person in the reference image - do not invent features!
OUTPUT FORMAT:
Return ONLY valid JSON (no markdown, no code blocks):
{
  "segments": [ { ... } ]
}
"""
    return header + seg_lines + footer
| # Minimum word counts for validation | |
| MIN_WORDS = { | |
| ("character_description", "physical"): 150, | |
| ("character_description", "clothing"): 100, | |
| ("character_description", "current_state"): 50, | |
| ("character_description", "voice_matching"): 50, | |
| ("scene_continuity", "environment"): 150, | |
| ("scene_continuity", "camera_position"): 50, | |
| ("scene_continuity", "lighting_state"): 40, | |
| ("scene_continuity", "props_in_frame"): 40, | |
| ("scene_continuity", "background_elements"): 40, | |
| ("action_timeline", "micro_expressions"): 40, | |
| } | |
| def _word_count(text: str) -> int: | |
| """Count words in text""" | |
| return len(re.findall(r"\b\w+\b", text or "")) | |
| def validate_segments_payload( | |
| payload: Dict[str, Any], | |
| expected_segments: int | |
| ) -> List[str]: | |
| """ | |
| Validate the generated payload against strict rules | |
| Args: | |
| payload: Generated payload | |
| expected_segments: Expected number of segments | |
| Returns: | |
| List of validation errors (empty if valid) | |
| """ | |
| errors: List[str] = [] | |
| segs = payload.get("segments", []) | |
| if len(segs) != expected_segments: | |
| errors.append(f"Expected {expected_segments} segments, got {len(segs)}.") | |
| required_sync_keys = {"0:00-0:02", "0:02-0:04", "0:04-0:06", "0:06-0:08"} | |
| physical_blocks, clothing_blocks, environment_blocks = [], [], [] | |
| for i, seg in enumerate(segs, start=1): | |
| # Check segment info | |
| si = seg.get("segment_info", {}) | |
| if si.get("duration") != "00:00-00:08": | |
| errors.append(f"Segment {i}: duration must be 00:00-00:08.") | |
| if si.get("total_segments") != expected_segments: | |
| errors.append( | |
| f"Segment {i}: total_segments should be {expected_segments}, " | |
| f"got {si.get('total_segments')}." | |
| ) | |
| # Check synchronized actions keys | |
| sync = seg.get("action_timeline", {}).get("synchronized_actions", {}) | |
| if set(sync.keys()) != required_sync_keys: | |
| errors.append( | |
| f"Segment {i}: synchronized_actions must have keys " | |
| f"{sorted(required_sync_keys)}." | |
| ) | |
| # Word-count checks | |
| for (section, field), minw in MIN_WORDS.items(): | |
| text = seg.get(section, {}).get(field, "") | |
| wc = _word_count(text) | |
| if wc < minw: | |
| errors.append( | |
| f"Segment {i}: {section}.{field} must be >= {minw} words (got {wc})." | |
| ) | |
| # Collect for uniformity check | |
| ch = seg.get("character_description", {}) | |
| sc = seg.get("scene_continuity", {}) | |
| physical_blocks.append(ch.get("physical", "")) | |
| clothing_blocks.append(ch.get("clothing", "")) | |
| environment_blocks.append(sc.get("environment", "")) | |
| # Uniformity across segments - CRITICAL for visual consistency | |
| if expected_segments > 1: | |
| if len(set(physical_blocks)) > 1: | |
| errors.append( | |
| "π¨ `character_description.physical` must be EXACTLY identical " | |
| "across all segments - character is changing!" | |
| ) | |
| if len(set(clothing_blocks)) > 1: | |
| errors.append( | |
| "π¨ `character_description.clothing` must be EXACTLY identical " | |
| "across all segments - clothing is changing!" | |
| ) | |
| if len(set(environment_blocks)) > 1: | |
| errors.append( | |
| "π¨ `scene_continuity.environment` must be EXACTLY identical " | |
| "across all segments - location is changing!" | |
| ) | |
| return errors | |
def generate_segments_payload(
    inputs: VeoInputs,
    image_bytes: Optional[bytes] = None,
    model: str = "gpt-4o",
    api_key: Optional[str] = None
) -> Dict[str, Any]:
    """
    Generate segments payload using GPT-4o with structured output.

    WARNING-ONLY MODE: Validation errors are logged but don't block generation.
    This allows the system to work with whatever GPT-4o generates.

    Args:
        inputs: Video generation inputs
        image_bytes: Optional reference image bytes (sent inline as a base64
            data URL; the media type is hard-coded to image/jpeg below)
        model: OpenAI model to use
        api_key: OpenAI API key (or from env OPENAI_API_KEY)
    Returns:
        Segments payload (always returns, even if validation warnings exist)
    Raises:
        Exception: If API call fails (network, auth, etc.)
    """
    # Initialize OpenAI client (explicit key wins over the environment variable)
    client = OpenAI(api_key=api_key or os.getenv('OPENAI_API_KEY'))
    # Split script into segments (may be capped at 2 when ENVIRONMENT is dev)
    segment_texts = split_script_into_segments(inputs.script, seconds_per_segment=8)
    N = len(segment_texts)
    print(f"π Generating {N} segments...")
    # Build prompt
    user_prompt = build_prompt(inputs, segment_texts)
    # Call GPT-4o (WARNING-ONLY validation - no retries, no blocking)
    print(f"π€ Calling GPT-4o to generate {N} segments...")
    # Prepare messages: one system message plus one multimodal user message
    system_content = "You are a precise JSON-only generator that must satisfy a strict schema and explicit segment count."
    messages = [
        {
            "role": "system",
            "content": system_content
        },
        {
            # User content is a list so text and image parts can be appended below.
            "role": "user",
            "content": []
        }
    ]
    # Add text prompt
    messages[1]["content"].append({
        "type": "text",
        "text": user_prompt
    })
    # Add image if provided (inlined as a base64 data URL)
    if image_bytes:
        encoded_image = base64.b64encode(image_bytes).decode("utf-8")
        messages[1]["content"].append({
            "type": "image_url",
            "image_url": {
                # NOTE(review): media type is always image/jpeg — confirm
                # callers never pass PNG/WebP bytes here.
                "url": f"data:image/jpeg;base64,{encoded_image}"
            }
        })
    # Call GPT-4o with structured output parsed against the SegmentsPayload schema
    response = client.beta.chat.completions.parse(
        model=model,
        response_format=SegmentsPayload,
        messages=messages,
    )
    parsed_obj = response.choices[0].message.parsed
    # by_alias=True restores the "0:00-0:02"-style keys declared on SynchronizedActions.
    payload = parsed_obj.model_dump(by_alias=True)
    print(f"β GPT-4o generated {N} segments successfully")
    # DEBUG: Show actual word counts for first segment
    if payload.get("segments"):
        seg = payload["segments"][0]
        cd = seg.get("character_description", {})
        sc = seg.get("scene_continuity", {})
        print(f"π Sample word counts (Segment 1):")
        print(f" physical: {_word_count(cd.get('physical', ''))} words")
        print(f" clothing: {_word_count(cd.get('clothing', ''))} words")
        print(f" current_state: {_word_count(cd.get('current_state', ''))} words")
        print(f" environment: {_word_count(sc.get('environment', ''))} words")
        print(f" camera_position: {_word_count(sc.get('camera_position', ''))} words")
    # Run validation (WARNING-ONLY - doesn't block generation)
    errors = validate_segments_payload(payload, N)
    if errors:
        # Log warnings but DON'T block generation
        print(f"\nβ οΈ VALIDATION WARNINGS ({len(errors)} issues found):")
        print(f"β οΈ These are non-blocking - generation will continue")
        for i, error in enumerate(errors[:10], 1):  # Show first 10
            print(f" {i}. {error}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more warnings")
        print(f"β Proceeding with generation despite warnings\n")
    else:
        print(f"β All validation checks passed!")
    # ALWAYS return payload (even with warnings)
    return payload