File size: 22,392 Bytes
91d209c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
"""
Advanced Prompt Generator using GPT-4o
Structured JSON generation with strict validation
"""

import re
import base64
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field
from openai import OpenAI
import os


class VeoInputs(BaseModel):
    """Input parameters for video generation"""
    # Only `script` and `style` are required; every other field has a default.
    # Within this file `script` is split into per-segment dialogue and the
    # whole model is dumped verbatim into the prompt's "STYLE SETTINGS"
    # section (see build_prompt); no other field is interpreted individually.
    # NOTE(review): the camelCase names presumably mirror an external request
    # payload — confirm against the caller before renaming.
    script: str  # full script text to be spoken across all segments
    style: str
    jsonFormat: str = 'standard'
    continuationMode: bool = True
    voiceType: Optional[str] = None
    energyLevel: Optional[str] = None
    settingMode: str = 'single'
    cameraStyle: Optional[str] = None
    energyArc: Optional[str] = None
    narrativeStyle: Optional[str] = None
    accentRegion: Optional[str] = None


class ContinuityMarkers(BaseModel):
    """Markers for maintaining continuity between segments"""
    # All fields are required free-text strings. They come in start/end pairs
    # (position, expression, gesture) plus a single location status; nothing
    # in this file validates their content.
    start_position: str
    end_position: str
    start_expression: str
    end_expression: str
    start_gesture: str
    end_gesture: str
    location_status: str


class SegmentInfo(BaseModel):
    """Basic segment information"""
    segment_number: int
    # validate_segments_payload reports a warning unless this equals the number
    # of segments the script was split into.
    total_segments: int
    # validate_segments_payload expects the literal string "00:00-00:08".
    duration: str
    location: str
    continuity_markers: ContinuityMarkers


class CharacterDescription(BaseModel):
    """Detailed character description"""
    # validate_segments_payload requires `physical` and `clothing` to be
    # identical across all segments; `current_state` and `voice_matching`
    # are described as segment-specific.
    # NOTE(review): the word counts in these Field descriptions (200/150/100/100)
    # are higher than the minimums actually checked via MIN_WORDS (150/100/50/50)
    # and quoted in the prompt — confirm which numbers are intended.
    physical: str = Field(..., description="200+ words")
    clothing: str = Field(..., description="150+ words")
    current_state: str = Field(..., description="100+ words, segment-specific")
    voice_matching: str = Field(..., description="100+ words, segment-specific")


class SynchronizedActions(BaseModel):
    """Time-synced actions throughout the segment"""
    # "0:00-0:02" style keys are not valid Python identifiers, so each
    # two-second window gets an identifier-safe field name plus an alias
    # holding the real key. generate_segments_payload dumps with
    # by_alias=True, so the payload carries the alias keys that
    # validate_segments_payload checks for.
    f0000_0002: str = Field(alias="0:00-0:02")
    f0002_0004: str = Field(alias="0:02-0:04")
    f0004_0006: str = Field(alias="0:04-0:06")
    f0006_0008: str = Field(alias="0:06-0:08")

    # NOTE(review): class-based Config is the pydantic v1 spelling, while the
    # rest of the file uses v2 APIs (model_dump) — consider migrating to
    # `model_config` once confirmed against the installed pydantic version.
    class Config:
        # Accept either the field name or its alias when populating the model.
        populate_by_name = True


class ActionTimeline(BaseModel):
    """Detailed action timeline for the segment"""
    # The prompt instructs the model to copy the pre-split segment text into
    # `dialogue` exactly; this file does not verify that it did.
    dialogue: str
    synchronized_actions: SynchronizedActions
    # NOTE(review): description says 50+ words but MIN_WORDS checks for 40 —
    # confirm which number is intended.
    micro_expressions: str = Field(..., description="50+ words")
    breathing_rhythm: str
    location_transition: str
    continuity_checkpoint: str


class SceneContinuity(BaseModel):
    """Scene and camera details"""
    # validate_segments_payload requires `environment` to be identical across
    # all segments. The prompt also asks for identical lighting, props and
    # background text, but those are not checked for uniformity in this file.
    # NOTE(review): the word counts in these Field descriptions
    # (250/75/50/75/50) are higher than the minimums checked via MIN_WORDS
    # (150/50/40/40/40) — confirm which numbers are intended.
    environment: str = Field(..., description="250+ words")
    camera_position: str = Field(..., description="75+ words")
    camera_movement: str = Field(..., description="detailed movement path")
    lighting_state: str = Field(..., description="50+ words")
    props_in_frame: str = Field(..., description="75+ words")
    background_elements: str = Field(..., description="50+ words")
    spatial_relationships: str


class Segment(BaseModel):
    """Complete segment specification"""
    # One entry per generated video segment. The four section names below are
    # the top-level keys that validate_segments_payload and MIN_WORDS refer to.
    segment_info: SegmentInfo
    character_description: CharacterDescription
    scene_continuity: SceneContinuity
    action_timeline: ActionTimeline


class SegmentsPayload(BaseModel):
    """Complete payload with all segments"""
    # Root model passed as `response_format` to the OpenAI parse call in
    # generate_segments_payload; the parsed instance is then dumped to a dict.
    segments: List[Segment]


def split_script_into_segments(
    script: str,
    seconds_per_segment: int = 8,
    words_per_second: float = 2.2
) -> List[str]:
    """
    Split script into segments based on timing

    Sentences (split after ``.``, ``!`` or ``?``) are packed greedily into
    segments of at most ``seconds_per_segment * words_per_second`` words,
    with a floor of 14 words per segment. A sentence is never split: one that
    exceeds the budget on its own becomes a single over-long segment.

    The result also depends on the ``ENVIRONMENT`` environment variable:
    when it is unset, ``dev`` or ``development`` (case-insensitive) at most
    the first 2 segments are returned; any other value returns all segments.
    A status line is printed in either mode when applicable.

    Args:
        script: Full script text
        seconds_per_segment: Target duration per segment
        words_per_second: Speaking rate (adjust for VO tempo)

    Returns:
        List of script segments; if no sentence is found, a one-element list
        holding the stripped script (possibly the empty string)
    """
    text = script.strip()
    sentences = [
        piece.strip()
        for piece in re.split(r'(?<=[.!?])\s+', text)
        if piece.strip()
    ]

    # Word budget per segment; never below 14 so short segments are avoided
    # even for very small duration / rate arguments.
    target = max(14, int(seconds_per_segment * words_per_second))

    segments: List[str] = []
    current: List[str] = []
    current_words = 0

    for sentence in sentences:
        words = len(sentence.split())
        # Close the running segment before it would overflow the budget.
        if current and current_words + words > target:
            segments.append(" ".join(current))
            current, current_words = [], 0
        current.append(sentence)
        current_words += words

    if current:
        segments.append(" ".join(current))

    # Environment-based segment limiting
    # In DEV mode: limit to 2 segments for faster testing
    # In PROD mode: generate all segments
    environment = os.getenv('ENVIRONMENT', 'dev').lower()
    is_dev_mode = environment in ('dev', 'development')

    if is_dev_mode and len(segments) > 2:
        print(f"⚠️  DEV MODE: Limiting from {len(segments)} to 2 segments")
        segments = segments[:2]
    elif not is_dev_mode:
        # The check-mark was previously stored as mojibake ("βœ…").
        print(f"✅ PROD MODE: Generating all {len(segments)} segments")

    return segments or [text]


def build_prompt(inputs: VeoInputs, segment_texts: List[str]) -> str:
    """
    Build the prompt text sent to GPT-4o as the user message

    The prompt is assembled from three parts: a header (rules, word-count
    minimums, worked examples, the full script and a dump of all inputs),
    one line per pre-split segment with its exact dialogue, and a footer
    (final reminders and the expected output shape).

    Args:
        inputs: Video generation inputs
        segment_texts: List of segment scripts

    Returns:
        Formatted prompt string
    """
    N = len(segment_texts)
    # Full dump of the inputs; note this includes `script`, which is also
    # embedded separately under "SCRIPT TO SEGMENT".
    knobs = inputs.model_dump()

    # NOTE(review): the example headings below mention 200+/250+ words while
    # the stated minimums are 150 — left as-is, confirm the intended numbers.
    header = f"""
You are a STRICT production-grade JSON generator for Veo 3 video prompts.

⚠️ CRITICAL: Your output will be VALIDATED. ANY field under minimum word count will be REJECTED.

═══════════════════════════════════════════════════════════
🚨 CRITICAL: CHARACTER MUST MATCH REFERENCE IMAGE EXACTLY 🚨
═══════════════════════════════════════════════════════════

A REFERENCE IMAGE IS PROVIDED. You MUST:
1. ANALYZE the image carefully and describe the EXACT person you see
2. Use the SAME character description for ALL {N} segments (copy-paste identical text)
3. Include SPECIFIC details from the image:
   - EXACT hair color (e.g., "strawberry blonde", "auburn", "dark brown")
   - EXACT eye color (e.g., "green", "blue", "brown")
   - EXACT facial features (freckles, skin tone, face shape)
   - EXACT clothing visible in the image (color, pattern, style)
   - EXACT age appearance (not generic "mid-thirties")
4. DO NOT invent or change ANY physical features
5. The generated video MUST show the SAME person as the reference image

⚠️ If the character description doesn't match the reference image, the video will be REJECTED.

═══════════════════════════════════════════════════════════
MANDATORY WORD COUNT REQUIREMENTS - WILL BE VALIDATED
═══════════════════════════════════════════════════════════

character_description.physical: MINIMUM 150 WORDS (MUST describe the EXACT person in the reference image - IDENTICAL across ALL {N} segments)
character_description.clothing: MINIMUM 100 WORDS (MUST describe the EXACT clothing in the reference image - IDENTICAL across ALL {N} segments)
character_description.current_state: MINIMUM 50 WORDS (segment-specific)
character_description.voice_matching: MINIMUM 50 WORDS (segment-specific)

scene_continuity.environment: MINIMUM 150 WORDS (MUST be IDENTICAL across ALL {N} segments - same location throughout)
scene_continuity.camera_position: MINIMUM 50 WORDS (MUST be consistent framing)
scene_continuity.lighting_state: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
scene_continuity.props_in_frame: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
scene_continuity.background_elements: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)

action_timeline.micro_expressions: MINIMUM 40 WORDS

⚠️ If ANY field has fewer words than the minimum, the ENTIRE payload will be REJECTED.

═══════════════════════════════════════════════════════════
WHAT 200+ WORDS LOOKS LIKE (EXAMPLE FOR PHYSICAL):
═══════════════════════════════════════════════════════════

"A person in their mid-thirties with a warm, approachable presence that immediately puts viewers at ease. Their facial structure features high, defined cheekbones and a strong, angular jawline that conveys confidence without appearing intimidating. They have expressive, almond-shaped eyes with a rich brown color that sparkles with intelligence and authenticity when discussing financial topics. Their eyebrows are naturally shaped and animated, often raising slightly when emphasizing important points about debt relief. The person maintains excellent posture throughout, sitting or standing with shoulders back and spine straight, projecting both professionalism and relatability. Their skin tone is natural and even, with a healthy glow that suggests good self-care. They have a genuine, engaging smile that reaches their eyes, creating authentic crow's feet at the corners when they express enthusiasm about helping people save money. Their hair is styled in a modern, professional manner that doesn't distract from their message. The person's hands are visible during gestures, with natural, purposeful movements that emphasize key phrases about the tariff relief program. They maintain steady eye contact with the camera, creating a direct connection with viewers. Their overall appearance suggests someone who is both knowledgeable about financial matters and genuinely invested in helping others achieve debt freedom." (200+ words)

═══════════════════════════════════════════════════════════
WHAT 150+ WORDS LOOKS LIKE (EXAMPLE FOR ENVIRONMENT):
═══════════════════════════════════════════════════════════

"A contemporary, warmly-lit interior space that exudes professionalism while maintaining an approachable, comfortable atmosphere perfect for discussing personal finance topics. The setting features soft, natural daylight streaming through large windows, creating gentle highlights and shadows that add depth and dimension to the frame. The background wall showcases a sophisticated neutral color palette—think warm beige or soft gray tones—that doesn't compete for attention but provides visual interest through subtle texture. The space includes carefully curated elements of modern interior design: perhaps a sleek bookshelf with financial publications, a tasteful piece of abstract art that adds color without distraction, and contemporary furniture pieces that suggest success and stability. The floor is likely hardwood or high-quality laminate, polished to reflect light subtly. The lighting setup combines natural window light with strategically placed LED panels or softboxes that eliminate harsh shadows while maintaining a natural, lifestyle aesthetic rather than an overly commercial look. The depth of field is moderate, keeping the subject sharp while softly blurring background elements to maintain focus. Environmental sound design would capture subtle ambient noise—perhaps distant city sounds or soft office ambiance—that grounds the viewer in a real, authentic space. The overall atmosphere suggests a professional consultation setting where important financial decisions are made, yet feels intimate and personal enough that viewers can imagine having this conversation in their own homes. Every element of the environment reinforces the credibility of the debt relief message while maintaining the authentic, UGC-style feel that drives engagement. The space is clutter-free but lived-in, striking the perfect balance between aspirational and relatable." (250+ words)

═══════════════════════════════════════════════════════════
YOUR TASK
═══════════════════════════════════════════════════════════

Generate EXACTLY {N} segments. Each segment MUST meet ALL word count requirements above.

Duration: "00:00-00:08" for each segment
Synchronized actions: MUST have keys "0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08"
Total segments: Set segment_info.total_segments = {N} on EVERY segment

⚠️ CRITICAL DIALOGUE RULE - NO OVERLAP:
- Each segment's action_timeline.dialogue MUST contain ONLY the text assigned below
- NEVER repeat any words or sentences from previous segments
- NEVER include any words or sentences from the next segment
- Each segment's dialogue is MUTUALLY EXCLUSIVE - zero overlap allowed
- The dialogue for each segment is PRE-SPLIT below - use it EXACTLY as given

SCRIPT TO SEGMENT:
\"\"\"{inputs.script.strip()}\"\"\"

STYLE SETTINGS:
{knobs}

SEGMENTS TO GENERATE (USE DIALOGUE EXACTLY AS SHOWN - NO OVERLAP):
"""

    seg_lines = "\n".join(
        f'- Segment {number} dialogue (EXACT): "{text}"'
        for number, text in enumerate(segment_texts, start=1)
    )

    # Plain (non-f) string: braces are written singly here. The previous
    # version used f-string style "{{ }}" escapes, which leaked doubled braces
    # into the JSON example shown to the model.
    footer = """

═══════════════════════════════════════════════════════════
CRITICAL REMINDER BEFORE YOU GENERATE
═══════════════════════════════════════════════════════════

🚨 CHARACTER CONSISTENCY IS MANDATORY:
- physical description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- clothing description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- environment description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- The person MUST look IDENTICAL in every segment (same face, hair, clothes, setting)

✅ CHECK EVERY FIELD FOR MINIMUM WORD COUNTS
✅ physical: 150+ words | clothing: 100+ words  
✅ current_state: 50+ words | voice_matching: 50+ words
✅ environment: 150+ words (MOST IMPORTANT - BE VERY DETAILED)
✅ camera_position: 50+ words | lighting_state: 40+ words
✅ props_in_frame: 40+ words | background_elements: 40+ words
✅ micro_expressions: 40+ words

🚨 CRITICAL: NO BLUR TRANSITIONS - REMINDER 🚨
- Every segment starts SHARP and CLEAR at 0:00
- camera_movement must describe movement from an already-focused state
- synchronized_actions["0:00-0:02"] must begin with subject in sharp focus
- NO fade-in, NO blur, NO gradual focus at segment start

⚠️ VALIDATION WILL COUNT EVERY WORD. Generate MORE than minimum to be safe!
⚠️ Describe the EXACT person in the reference image - do not invent features!

OUTPUT FORMAT:
Return ONLY valid JSON (no markdown, no code blocks):
{
  "segments": [ { ... } ]
}
"""

    return header + seg_lines + footer


# Minimum word counts for validation, grouped by payload section.
_MIN_WORDS_BY_SECTION = {
    "character_description": {
        "physical": 150,
        "clothing": 100,
        "current_state": 50,
        "voice_matching": 50,
    },
    "scene_continuity": {
        "environment": 150,
        "camera_position": 50,
        "lighting_state": 40,
        "props_in_frame": 40,
        "background_elements": 40,
    },
    "action_timeline": {
        "micro_expressions": 40,
    },
}

# Flattened view keyed by (section, field); insertion order follows the
# grouping above, which fixes the order in which word-count errors are reported.
MIN_WORDS = {
    (section, field): minimum
    for section, fields in _MIN_WORDS_BY_SECTION.items()
    for field, minimum in fields.items()
}


def _word_count(text: str) -> int:
    """Count words in text"""
    return len(re.findall(r"\b\w+\b", text or ""))


def validate_segments_payload(
    payload: Dict[str, Any],
    expected_segments: int
) -> List[str]:
    """
    Check a generated payload against the structural and word-count rules

    Checks, in order: overall segment count; then per segment the duration
    string, the declared total, the synchronized-action keys and the minimum
    word counts from MIN_WORDS; finally (only when more than one segment is
    expected) that the physical, clothing and environment texts are the same
    in every segment.

    Args:
        payload: Dict with a "segments" list (a missing key counts as empty)
        expected_segments: Number of segments the payload should contain

    Returns:
        Problem descriptions in the order they were found (empty if valid)
    """
    problems: List[str] = []
    segments = payload.get("segments", [])
    found = len(segments)

    if found != expected_segments:
        problems.append(f"Expected {expected_segments} segments, got {found}.")

    sync_windows = {"0:00-0:02", "0:02-0:04", "0:04-0:06", "0:06-0:08"}

    # Fields that must read the same in every segment, each paired with the
    # message reported when they do not.
    must_match = [
        (
            ("character_description", "physical"),
            "🚨 `character_description.physical` must be EXACTLY identical "
            "across all segments - character is changing!",
        ),
        (
            ("character_description", "clothing"),
            "🚨 `character_description.clothing` must be EXACTLY identical "
            "across all segments - clothing is changing!",
        ),
        (
            ("scene_continuity", "environment"),
            "🚨 `scene_continuity.environment` must be EXACTLY identical "
            "across all segments - location is changing!",
        ),
    ]
    observed = {path: [] for path, _ in must_match}

    for number, segment in enumerate(segments, start=1):
        # Segment header: fixed duration string and consistent total.
        info = segment.get("segment_info", {})
        if info.get("duration") != "00:00-00:08":
            problems.append(f"Segment {number}: duration must be 00:00-00:08.")
        declared_total = info.get("total_segments")
        if declared_total != expected_segments:
            problems.append(
                f"Segment {number}: total_segments should be {expected_segments}, "
                f"got {declared_total}."
            )

        # Exactly the four two-second windows must be present.
        timeline = segment.get("action_timeline", {})
        windows = timeline.get("synchronized_actions", {})
        if set(windows.keys()) != sync_windows:
            problems.append(
                f"Segment {number}: synchronized_actions must have keys "
                f"{sorted(sync_windows)}."
            )

        # Minimum length of each long-form text field.
        for (section, field), minimum in MIN_WORDS.items():
            count = _word_count(segment.get(section, {}).get(field, ""))
            if count < minimum:
                problems.append(
                    f"Segment {number}: {section}.{field} must be >= {minimum} words (got {count})."
                )

        # Remember the texts that are compared across segments afterwards.
        for section, field in observed:
            observed[(section, field)].append(
                segment.get(section, {}).get(field, "")
            )

    # Uniformity across segments - CRITICAL for visual consistency
    if expected_segments > 1:
        problems.extend(
            message
            for path, message in must_match
            if len(set(observed[path])) > 1
        )

    return problems


def _detect_image_mime(image_bytes: bytes) -> str:
    """Guess an image MIME type from the leading magic bytes.

    Recognises PNG, GIF and WebP signatures; anything else falls back to
    "image/jpeg", which is the value that used to be hard-coded.
    """
    if image_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if image_bytes.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
        return "image/webp"
    return "image/jpeg"


def generate_segments_payload(
    inputs: VeoInputs,
    image_bytes: Optional[bytes] = None,
    model: str = "gpt-4o",
    api_key: Optional[str] = None
) -> Dict[str, Any]:
    """
    Generate segments payload using GPT-4o with structured output

    WARNING-ONLY MODE: Validation errors are logged but don't block generation.
    This allows the system to work with whatever GPT-4o generates.

    The script is split with split_script_into_segments (so the number of
    segments also depends on the ENVIRONMENT variable), a single request is
    made with SegmentsPayload as the response format, and the parsed result
    is returned as a dict using the field aliases as keys.

    Args:
        inputs: Video generation inputs
        image_bytes: Optional reference image bytes
        model: OpenAI model to use
        api_key: OpenAI API key (or from env)

    Returns:
        Segments payload (always returns, even if validation warnings exist)

    Raises:
        RuntimeError: If the response carries no parsed payload (for example
            because the model refused the request)
        Exception: If API call fails (network, auth, etc.)
    """
    # Initialize OpenAI client
    client = OpenAI(api_key=api_key or os.getenv('OPENAI_API_KEY'))

    # Split script into segments
    segment_texts = split_script_into_segments(inputs.script, seconds_per_segment=8)
    N = len(segment_texts)

    print(f"📝 Generating {N} segments...")

    # Build prompt
    user_prompt = build_prompt(inputs, segment_texts)

    # Call GPT-4o (WARNING-ONLY validation - no retries, no blocking)
    print(f"🤖 Calling GPT-4o to generate {N} segments...")

    system_content = "You are a precise JSON-only generator that must satisfy a strict schema and explicit segment count."

    # The user message is multi-part: the text prompt, then optionally the
    # reference image as a base64 data URL.
    user_content: List[Dict[str, Any]] = [
        {
            "type": "text",
            "text": user_prompt
        }
    ]

    if image_bytes:
        encoded_image = base64.b64encode(image_bytes).decode("utf-8")
        # Label the data URL with the real image type instead of assuming JPEG.
        mime_type = _detect_image_mime(image_bytes)
        user_content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{encoded_image}"
            }
        })

    messages = [
        {
            "role": "system",
            "content": system_content
        },
        {
            "role": "user",
            "content": user_content
        }
    ]

    # Call GPT-4o with structured output
    response = client.beta.chat.completions.parse(
        model=model,
        response_format=SegmentsPayload,
        messages=messages,
    )

    message = response.choices[0].message
    parsed_obj = message.parsed
    if parsed_obj is None:
        # `parsed` is empty when the model declines to answer; surface that
        # instead of failing later with an AttributeError on None.
        refusal = getattr(message, "refusal", None)
        raise RuntimeError(
            f"Model returned no structured payload (refusal: {refusal!r})"
        )

    # by_alias=True so synchronized_actions uses the "0:00-0:02" style keys.
    payload = parsed_obj.model_dump(by_alias=True)

    # Report what was actually produced; a mismatch with N is flagged by the
    # validation below.
    generated = len(payload.get("segments", []))
    print(f"✅ GPT-4o generated {generated} segments successfully")

    # DEBUG: Show actual word counts for first segment
    if payload.get("segments"):
        seg = payload["segments"][0]
        cd = seg.get("character_description", {})
        sc = seg.get("scene_continuity", {})
        print("📊 Sample word counts (Segment 1):")
        print(f"   physical: {_word_count(cd.get('physical', ''))} words")
        print(f"   clothing: {_word_count(cd.get('clothing', ''))} words")
        print(f"   current_state: {_word_count(cd.get('current_state', ''))} words")
        print(f"   environment: {_word_count(sc.get('environment', ''))} words")
        print(f"   camera_position: {_word_count(sc.get('camera_position', ''))} words")

    # Run validation (WARNING-ONLY - doesn't block generation)
    errors = validate_segments_payload(payload, N)

    if errors:
        # Log warnings but DON'T block generation
        print(f"\n⚠️  VALIDATION WARNINGS ({len(errors)} issues found):")
        print("⚠️  These are non-blocking - generation will continue")
        for i, error in enumerate(errors[:10], 1):  # Show first 10
            print(f"   {i}. {error}")
        if len(errors) > 10:
            print(f"   ... and {len(errors) - 10} more warnings")
        print("✅ Proceeding with generation despite warnings\n")
    else:
        print("✅ All validation checks passed!")

    # ALWAYS return payload (even with warnings)
    return payload