# Spaces: userIdc2024 / Video_AdGenesis_App
# Sleeping
# File size: 22,392 Bytes
# 91d209c

"""
Advanced Prompt Generator using GPT-4o
Structured JSON generation with strict validation
"""

import base64
import os
import re
from typing import List, Optional, Dict, Any

from openai import OpenAI
from pydantic import BaseModel, ConfigDict, Field


class VeoInputs(BaseModel):
    """Input parameters for video generation"""
    # build_prompt() dumps every field (via model_dump()) into the prompt's
    # "STYLE SETTINGS" section; within this file only `script` is read directly.
    script: str  # full script text; split into per-segment dialogue before prompting
    style: str
    jsonFormat: str = 'standard'
    continuationMode: bool = True
    # The Optional knobs below default to None and are passed through to the
    # prompt as-is; nothing in this file interprets their values.
    voiceType: Optional[str] = None
    energyLevel: Optional[str] = None
    settingMode: str = 'single'
    cameraStyle: Optional[str] = None
    energyArc: Optional[str] = None
    narrativeStyle: Optional[str] = None
    accentRegion: Optional[str] = None


class ContinuityMarkers(BaseModel):
    """Markers for maintaining continuity between segments"""
    # Paired start_*/end_* free-text values for one segment. All are required
    # strings; validate_segments_payload does not inspect any of them.
    # NOTE(review): presumably one segment's end_* values are meant to match the
    # next segment's start_* values - confirm against the consumer of this payload.
    start_position: str
    end_position: str
    start_expression: str
    end_expression: str
    start_gesture: str
    end_gesture: str
    location_status: str


class SegmentInfo(BaseModel):
    """Basic segment information"""
    # Not checked by validate_segments_payload.
    # NOTE(review): presumably 1-based, matching the prompt's "Segment 1..N" labels - confirm.
    segment_number: int
    # validate_segments_payload expects this to equal the requested segment
    # count on every segment.
    total_segments: int
    # validate_segments_payload expects the literal string "00:00-00:08".
    duration: str
    location: str
    continuity_markers: ContinuityMarkers


class CharacterDescription(BaseModel):
    """Detailed character description"""
    # The Field descriptions state higher targets than the minimums actually
    # checked through MIN_WORDS (150 / 100 / 50 / 50 words respectively).
    # NOTE(review): the descriptions are presumably surfaced to the model through
    # the response_format schema - confirm before rewording them.
    # `physical` and `clothing` must be verbatim identical across segments
    # (checked by validate_segments_payload); the other two are per-segment.
    physical: str = Field(..., description="200+ words")
    clothing: str = Field(..., description="150+ words")
    current_state: str = Field(..., description="100+ words, segment-specific")
    voice_matching: str = Field(..., description="100+ words, segment-specific")


class SynchronizedActions(BaseModel):
    """Time-synced actions throughout the segment"""
    # Pydantic v2 configuration; replaces the deprecated inner `class Config`,
    # which still works in v2 but emits a deprecation warning.
    # populate_by_name lets instances be built from either the Python field
    # names (f0000_0002, ...) or the "0:00-0:02"-style aliases.
    model_config = ConfigDict(populate_by_name=True)

    # The timestamp keys are not valid Python identifiers, so every field
    # carries the real JSON key as its alias. validate_segments_payload expects
    # exactly these four alias keys in the dumped payload.
    f0000_0002: str = Field(alias="0:00-0:02")
    f0002_0004: str = Field(alias="0:02-0:04")
    f0004_0006: str = Field(alias="0:04-0:06")
    f0006_0008: str = Field(alias="0:06-0:08")


class ActionTimeline(BaseModel):
    """Detailed action timeline for the segment"""
    # The prompt built by build_prompt() instructs the model to copy the
    # pre-split segment text here verbatim; validate_segments_payload does not
    # verify that it did.
    dialogue: str
    # Four 2-second slots keyed "0:00-0:02" ... "0:06-0:08".
    synchronized_actions: SynchronizedActions
    # The description says 50+ words; the minimum checked via MIN_WORDS is 40.
    micro_expressions: str = Field(..., description="50+ words")
    breathing_rhythm: str
    location_transition: str
    continuity_checkpoint: str


class SceneContinuity(BaseModel):
    """Scene and camera details"""
    # Field descriptions state higher targets than the minimums checked via
    # MIN_WORDS (environment 150, camera_position 50, lighting_state 40,
    # props_in_frame 40, background_elements 40).
    # `environment` must be verbatim identical across segments (checked by
    # validate_segments_payload). camera_movement and spatial_relationships
    # have no word-count check.
    environment: str = Field(..., description="250+ words")
    camera_position: str = Field(..., description="75+ words")
    camera_movement: str = Field(..., description="detailed movement path")
    lighting_state: str = Field(..., description="50+ words")
    props_in_frame: str = Field(..., description="75+ words")
    background_elements: str = Field(..., description="50+ words")
    spatial_relationships: str


class Segment(BaseModel):
    """Complete segment specification"""
    # The four section names below are the top-level keys that
    # validate_segments_payload and MIN_WORDS look up in each dumped segment.
    segment_info: SegmentInfo
    character_description: CharacterDescription
    scene_continuity: SceneContinuity
    action_timeline: ActionTimeline


class SegmentsPayload(BaseModel):
    """Complete payload with all segments"""
    # Passed as `response_format` in generate_segments_payload, so this is the
    # root shape of the model's structured output.
    segments: List[Segment]


def split_script_into_segments(
    script: str,
    seconds_per_segment: int = 8,
    words_per_second: float = 2.2
) -> List[str]:
    """
    Chunk a script into sentence-aligned pieces, one per video segment

    Sentences are never cut: whole sentences are packed into a segment until
    adding the next one would exceed the word budget, so a single long
    sentence can still exceed it.

    Args:
        script: Full script text
        seconds_per_segment: Target duration per segment
        words_per_second: Speaking rate (adjust for VO tempo)

    Returns:
        List of script segments (the stripped script itself if nothing was split)
    """
    stripped_script = script.strip()

    # Break after sentence-ending punctuation and discard empty pieces
    pieces = (piece.strip() for piece in re.split(r'(?<=[.!?])\s+', stripped_script))
    sentences = [piece for piece in pieces if piece]

    # Word budget per segment; never lower than 14 words
    word_budget = max(14, int(seconds_per_segment * words_per_second))

    segments: List[str] = []
    bucket: List[str] = []
    bucket_words = 0

    for sentence in sentences:
        sentence_words = len(sentence.split())
        if bucket and bucket_words + sentence_words > word_budget:
            # Budget would overflow: close the current segment, start a new one
            segments.append(" ".join(bucket))
            bucket = [sentence]
            bucket_words = sentence_words
        else:
            bucket.append(sentence)
            bucket_words += sentence_words

    if bucket:
        segments.append(" ".join(bucket))

    # ENVIRONMENT (default "dev") controls how many segments survive:
    #   dev / development -> at most 2 segments, for faster testing
    #   anything else     -> all segments
    mode = os.getenv('ENVIRONMENT', 'dev').lower()
    if mode in ('dev', 'development'):
        if len(segments) > 2:
            print(f"⚠️  DEV MODE: Limiting from {len(segments)} to 2 segments")
            segments = segments[:2]
    else:
        print(f"✅ PROD MODE: Generating all {len(segments)} segments")

    if segments:
        return segments
    return [stripped_script]


def build_prompt(inputs: VeoInputs, segment_texts: List[str]) -> str:
    """
    Build the user prompt text sent to GPT-4o

    The prompt is assembled from three parts: an f-string header (rules,
    word-count minimums, worked examples, the full script and the style
    settings), one line per pre-split segment dialogue, and a static footer
    (reminders plus the expected output shape).

    Args:
        inputs: Video generation inputs
        segment_texts: List of segment scripts

    Returns:
        Formatted prompt string
    """
    N = len(segment_texts)
    # NOTE: model_dump() includes every field, so the script text also appears
    # inside the STYLE SETTINGS dict in addition to the SCRIPT TO SEGMENT block.
    knobs = inputs.model_dump()
    
    header = f"""
You are a STRICT production-grade JSON generator for Veo 3 video prompts.

⚠️ CRITICAL: Your output will be VALIDATED. ANY field under minimum word count will be REJECTED.

═══════════════════════════════════════════════════════════
🚨 CRITICAL: CHARACTER MUST MATCH REFERENCE IMAGE EXACTLY 🚨
═══════════════════════════════════════════════════════════

A REFERENCE IMAGE IS PROVIDED. You MUST:
1. ANALYZE the image carefully and describe the EXACT person you see
2. Use the SAME character description for ALL {N} segments (copy-paste identical text)
3. Include SPECIFIC details from the image:
   - EXACT hair color (e.g., "strawberry blonde", "auburn", "dark brown")
   - EXACT eye color (e.g., "green", "blue", "brown")
   - EXACT facial features (freckles, skin tone, face shape)
   - EXACT clothing visible in the image (color, pattern, style)
   - EXACT age appearance (not generic "mid-thirties")
4. DO NOT invent or change ANY physical features
5. The generated video MUST show the SAME person as the reference image

⚠️ If the character description doesn't match the reference image, the video will be REJECTED.

═══════════════════════════════════════════════════════════
MANDATORY WORD COUNT REQUIREMENTS - WILL BE VALIDATED
═══════════════════════════════════════════════════════════

character_description.physical: MINIMUM 150 WORDS (MUST describe the EXACT person in the reference image - IDENTICAL across ALL {N} segments)
character_description.clothing: MINIMUM 100 WORDS (MUST describe the EXACT clothing in the reference image - IDENTICAL across ALL {N} segments)
character_description.current_state: MINIMUM 50 WORDS (segment-specific)
character_description.voice_matching: MINIMUM 50 WORDS (segment-specific)

scene_continuity.environment: MINIMUM 150 WORDS (MUST be IDENTICAL across ALL {N} segments - same location throughout)
scene_continuity.camera_position: MINIMUM 50 WORDS (MUST be consistent framing)
scene_continuity.lighting_state: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
scene_continuity.props_in_frame: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
scene_continuity.background_elements: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)

action_timeline.micro_expressions: MINIMUM 40 WORDS

⚠️ If ANY field has fewer words than the minimum, the ENTIRE payload will be REJECTED.

═══════════════════════════════════════════════════════════
WHAT 200+ WORDS LOOKS LIKE (EXAMPLE FOR PHYSICAL):
═══════════════════════════════════════════════════════════

"A person in their mid-thirties with a warm, approachable presence that immediately puts viewers at ease. Their facial structure features high, defined cheekbones and a strong, angular jawline that conveys confidence without appearing intimidating. They have expressive, almond-shaped eyes with a rich brown color that sparkles with intelligence and authenticity when discussing financial topics. Their eyebrows are naturally shaped and animated, often raising slightly when emphasizing important points about debt relief. The person maintains excellent posture throughout, sitting or standing with shoulders back and spine straight, projecting both professionalism and relatability. Their skin tone is natural and even, with a healthy glow that suggests good self-care. They have a genuine, engaging smile that reaches their eyes, creating authentic crow's feet at the corners when they express enthusiasm about helping people save money. Their hair is styled in a modern, professional manner that doesn't distract from their message. The person's hands are visible during gestures, with natural, purposeful movements that emphasize key phrases about the tariff relief program. They maintain steady eye contact with the camera, creating a direct connection with viewers. Their overall appearance suggests someone who is both knowledgeable about financial matters and genuinely invested in helping others achieve debt freedom." (200+ words)

═══════════════════════════════════════════════════════════
WHAT 150+ WORDS LOOKS LIKE (EXAMPLE FOR ENVIRONMENT):
═══════════════════════════════════════════════════════════

"A contemporary, warmly-lit interior space that exudes professionalism while maintaining an approachable, comfortable atmosphere perfect for discussing personal finance topics. The setting features soft, natural daylight streaming through large windows, creating gentle highlights and shadows that add depth and dimension to the frame. The background wall showcases a sophisticated neutral color palette—think warm beige or soft gray tones—that doesn't compete for attention but provides visual interest through subtle texture. The space includes carefully curated elements of modern interior design: perhaps a sleek bookshelf with financial publications, a tasteful piece of abstract art that adds color without distraction, and contemporary furniture pieces that suggest success and stability. The floor is likely hardwood or high-quality laminate, polished to reflect light subtly. The lighting setup combines natural window light with strategically placed LED panels or softboxes that eliminate harsh shadows while maintaining a natural, lifestyle aesthetic rather than an overly commercial look. The depth of field is moderate, keeping the subject sharp while softly blurring background elements to maintain focus. Environmental sound design would capture subtle ambient noise—perhaps distant city sounds or soft office ambiance—that grounds the viewer in a real, authentic space. The overall atmosphere suggests a professional consultation setting where important financial decisions are made, yet feels intimate and personal enough that viewers can imagine having this conversation in their own homes. Every element of the environment reinforces the credibility of the debt relief message while maintaining the authentic, UGC-style feel that drives engagement. The space is clutter-free but lived-in, striking the perfect balance between aspirational and relatable." (250+ words)

═══════════════════════════════════════════════════════════
YOUR TASK
═══════════════════════════════════════════════════════════

Generate EXACTLY {N} segments. Each segment MUST meet ALL word count requirements above.

Duration: "00:00-00:08" for each segment
Synchronized actions: MUST have keys "0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08"
Total segments: Set segment_info.total_segments = {N} on EVERY segment

⚠️ CRITICAL DIALOGUE RULE - NO OVERLAP:
- Each segment's action_timeline.dialogue MUST contain ONLY the text assigned below
- NEVER repeat any words or sentences from previous segments
- NEVER include any words or sentences from the next segment
- Each segment's dialogue is MUTUALLY EXCLUSIVE - zero overlap allowed
- The dialogue for each segment is PRE-SPLIT below - use it EXACTLY as given

SCRIPT TO SEGMENT:
\"\"\"{inputs.script.strip()}\"\"\"

STYLE SETTINGS:
{knobs}

SEGMENTS TO GENERATE (USE DIALOGUE EXACTLY AS SHOWN - NO OVERLAP):
"""
    
    # One bullet per pre-split dialogue chunk, numbered from 1
    seg_lines = "\n".join(
        f"- Segment {number} dialogue (EXACT): \"{text}\""
        for number, text in enumerate(segment_texts, start=1)
    )
    
    # The footer is a plain (non-f) string, so braces are written once:
    # doubling them here would put literal "{{ ... }}" into the prompt.
    footer = """

═══════════════════════════════════════════════════════════
CRITICAL REMINDER BEFORE YOU GENERATE
═══════════════════════════════════════════════════════════

🚨 CHARACTER CONSISTENCY IS MANDATORY:
- physical description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- clothing description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- environment description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- The person MUST look IDENTICAL in every segment (same face, hair, clothes, setting)

✅ CHECK EVERY FIELD FOR MINIMUM WORD COUNTS
✅ physical: 150+ words | clothing: 100+ words  
✅ current_state: 50+ words | voice_matching: 50+ words
✅ environment: 150+ words (MOST IMPORTANT - BE VERY DETAILED)
✅ camera_position: 50+ words | lighting_state: 40+ words
✅ props_in_frame: 40+ words | background_elements: 40+ words
✅ micro_expressions: 40+ words

🚨 CRITICAL: NO BLUR TRANSITIONS - REMINDER 🚨
- Every segment starts SHARP and CLEAR at 0:00
- camera_movement must describe movement from an already-focused state
- synchronized_actions["0:00-0:02"] must begin with subject in sharp focus
- NO fade-in, NO blur, NO gradual focus at segment start

⚠️ VALIDATION WILL COUNT EVERY WORD. Generate MORE than minimum to be safe!
⚠️ Describe the EXACT person in the reference image - do not invent features!

OUTPUT FORMAT:
Return ONLY valid JSON (no markdown, no code blocks):
{
  "segments": [ { ... } ]
}
"""
    
    return header + seg_lines + footer


# Minimum word counts for validation
# Keys are (section, field) pairs looked up as segment[section][field] by
# validate_segments_payload; values are the minimum number of words, counted
# with _word_count. The numbers mirror the "MINIMUM n WORDS" lines in the
# prompt built by build_prompt, so change both together.
MIN_WORDS = {
    ("character_description", "physical"): 150,
    ("character_description", "clothing"): 100,
    ("character_description", "current_state"): 50,
    ("character_description", "voice_matching"): 50,
    ("scene_continuity", "environment"): 150,
    ("scene_continuity", "camera_position"): 50,
    ("scene_continuity", "lighting_state"): 40,
    ("scene_continuity", "props_in_frame"): 40,
    ("scene_continuity", "background_elements"): 40,
    ("action_timeline", "micro_expressions"): 40,
}


def _word_count(text: str) -> int:
    """Count words in text"""
    return len(re.findall(r"\b\w+\b", text or ""))


def validate_segments_payload(
    payload: Dict[str, Any],
    expected_segments: int
) -> List[str]:
    """
    Check a generated payload against the rules stated in the prompt

    Checks, in order: segment count; per segment the duration string,
    total_segments, the synchronized_actions keys and the MIN_WORDS minimums;
    and finally (when more than one segment is expected) that the physical,
    clothing and environment texts are identical in every segment.

    Args:
        payload: Generated payload
        expected_segments: Expected number of segments

    Returns:
        List of validation errors (empty if valid)
    """
    segments = payload.get("segments", [])
    problems: List[str] = []

    if len(segments) != expected_segments:
        problems.append(f"Expected {expected_segments} segments, got {len(segments)}.")

    wanted_sync_keys = {"0:00-0:02", "0:02-0:04", "0:04-0:06", "0:06-0:08"}

    # Fields that must be byte-identical in every segment, with the complaint
    # reported when they are not. Order here is the order of the reported errors.
    uniform_fields = (
        (
            ("character_description", "physical"),
            "🚨 `character_description.physical` must be EXACTLY identical "
            "across all segments - character is changing!",
        ),
        (
            ("character_description", "clothing"),
            "🚨 `character_description.clothing` must be EXACTLY identical "
            "across all segments - clothing is changing!",
        ),
        (
            ("scene_continuity", "environment"),
            "🚨 `scene_continuity.environment` must be EXACTLY identical "
            "across all segments - location is changing!",
        ),
    )
    collected: Dict[Any, List[Any]] = {key: [] for key, _ in uniform_fields}

    for number, segment in enumerate(segments, start=1):
        # Segment info: fixed duration string and matching total
        info = segment.get("segment_info", {})
        if info.get("duration") != "00:00-00:08":
            problems.append(f"Segment {number}: duration must be 00:00-00:08.")
        if info.get("total_segments") != expected_segments:
            problems.append(
                f"Segment {number}: total_segments should be {expected_segments}, "
                f"got {info.get('total_segments')}."
            )

        # Synchronized actions must carry exactly the four 2-second keys
        timeline = segment.get("action_timeline", {})
        present_keys = set(timeline.get("synchronized_actions", {}).keys())
        if present_keys != wanted_sync_keys:
            problems.append(
                f"Segment {number}: synchronized_actions must have keys "
                f"{sorted(wanted_sync_keys)}."
            )

        # Minimum word counts per (section, field)
        for (section, field), minimum in MIN_WORDS.items():
            count = _word_count(segment.get(section, {}).get(field, ""))
            if count < minimum:
                problems.append(
                    f"Segment {number}: {section}.{field} must be >= {minimum} words (got {count})."
                )

        # Remember the texts that have to stay constant across segments
        for (section, field), seen in collected.items():
            seen.append(segment.get(section, {}).get(field, ""))

    # Uniformity across segments - CRITICAL for visual consistency
    if expected_segments > 1:
        for key, complaint in uniform_fields:
            if len(set(collected[key])) > 1:
                problems.append(complaint)

    return problems


def _guess_image_mime(image_bytes: bytes) -> str:
    """Guess the image MIME type from its leading magic bytes (JPEG if unrecognised)"""
    if image_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if image_bytes[:6] in (b"GIF87a", b"GIF89a"):
        return "image/gif"
    if image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
        return "image/webp"
    # Fallback keeps the previous behaviour of always declaring JPEG
    return "image/jpeg"


def generate_segments_payload(
    inputs: VeoInputs,
    image_bytes: Optional[bytes] = None,
    model: str = "gpt-4o",
    api_key: Optional[str] = None
) -> Dict[str, Any]:
    """
    Generate segments payload using GPT-4o with structured output
    
    The script is split into segments, a prompt is built from them, and the
    model is asked for a SegmentsPayload. The parsed result is dumped to a
    plain dict (using the "0:00-0:02"-style alias keys) and validated.
    
    WARNING-ONLY MODE: Validation errors are logged but don't block generation.
    This allows the system to work with whatever GPT-4o generates.
    
    Args:
        inputs: Video generation inputs
        image_bytes: Optional reference image bytes (PNG, GIF and WebP are
            detected from their header; anything else is sent as JPEG)
        model: OpenAI model to use
        api_key: OpenAI API key (or from env)
    
    Returns:
        Segments payload (always returned when the model produced one, even if
        validation warnings exist)
    
    Raises:
        RuntimeError: If the response carries no parsed payload (e.g. the
            model refused the request)
        Exception: If API call fails (network, auth, etc.)
    """
    # Initialize OpenAI client
    client = OpenAI(api_key=api_key or os.getenv('OPENAI_API_KEY'))
    
    # Split script into segments
    segment_texts = split_script_into_segments(inputs.script, seconds_per_segment=8)
    N = len(segment_texts)
    
    print(f"📝 Generating {N} segments...")
    
    # Build prompt
    user_prompt = build_prompt(inputs, segment_texts)
    
    # Call GPT-4o (WARNING-ONLY validation - no retries, no blocking)
    print(f"🤖 Calling GPT-4o to generate {N} segments...")
    
    # Prepare messages
    system_content = "You are a precise JSON-only generator that must satisfy a strict schema and explicit segment count."
    
    # The user message is multi-part: the text prompt, then the optional image
    user_parts: List[Dict[str, Any]] = [
        {
            "type": "text",
            "text": user_prompt
        }
    ]
    
    # Add image if provided
    if image_bytes:
        encoded_image = base64.b64encode(image_bytes).decode("utf-8")
        mime_type = _guess_image_mime(image_bytes)
        user_parts.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{encoded_image}"
            }
        })
    
    messages = [
        {
            "role": "system",
            "content": system_content
        },
        {
            "role": "user",
            "content": user_parts
        }
    ]
    
    # Call GPT-4o with structured output
    response = client.beta.chat.completions.parse(
        model=model,
        response_format=SegmentsPayload,
        messages=messages,
    )
    
    message = response.choices[0].message
    parsed_obj = message.parsed
    if parsed_obj is None:
        # No structured result (typically a refusal): fail with a clear message
        # instead of an AttributeError on None below.
        refusal = getattr(message, "refusal", None)
        detail = f" (refusal: {refusal})" if refusal else ""
        raise RuntimeError(f"GPT-4o returned no structured payload{detail}")
    
    payload = parsed_obj.model_dump(by_alias=True)
    
    # Report what was actually generated, which may differ from the N requested
    generated = len(payload.get("segments", []))
    print(f"✅ GPT-4o generated {generated} segments successfully")
    
    # DEBUG: Show actual word counts for first segment
    if payload.get("segments"):
        seg = payload["segments"][0]
        cd = seg.get("character_description", {})
        sc = seg.get("scene_continuity", {})
        print("📊 Sample word counts (Segment 1):")
        print(f"   physical: {_word_count(cd.get('physical', ''))} words")
        print(f"   clothing: {_word_count(cd.get('clothing', ''))} words")
        print(f"   current_state: {_word_count(cd.get('current_state', ''))} words")
        print(f"   environment: {_word_count(sc.get('environment', ''))} words")
        print(f"   camera_position: {_word_count(sc.get('camera_position', ''))} words")
    
    # Run validation (WARNING-ONLY - doesn't block generation)
    errors = validate_segments_payload(payload, N)
    
    if errors:
        # Log warnings but DON'T block generation
        print(f"\n⚠️  VALIDATION WARNINGS ({len(errors)} issues found):")
        print("⚠️  These are non-blocking - generation will continue")
        for i, error in enumerate(errors[:10], 1):  # Show first 10
            print(f"   {i}. {error}")
        if len(errors) > 10:
            print(f"   ... and {len(errors) - 10} more warnings")
        print("✅ Proceeding with generation despite warnings\n")
    else:
        print("✅ All validation checks passed!")
    
    # ALWAYS return payload (even with warnings)
    return payload