# Video_AdGenesis_App / utils / prompt_generator.py
# Repository page header (scrape residue): uploaded by sushilideaclan01,
# commit 91d209c ("first push").
"""
Advanced Prompt Generator using GPT-4o
Structured JSON generation with strict validation
"""
import re
import base64
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field
from openai import OpenAI
import os
class VeoInputs(BaseModel):
    """Input parameters for video generation.

    Only ``script`` is read directly in this module (it is split into
    per-segment dialogue and quoted in the prompt). The remaining fields are
    not interpreted here; they are consumed via ``model_dump()`` when the
    prompt is built and shown to the model as style settings.
    """
    # Full voice-over script; split into timed segments before prompting.
    script: str
    # Free-form style label; passed through to the prompt unchanged.
    style: str
    # NOTE(review): presumably selects an output JSON layout; this module
    # never branches on it - confirm against the caller.
    jsonFormat: str = 'standard'
    # NOTE(review): not read in this module beyond the prompt dump.
    continuationMode: bool = True
    # Optional voice/delivery hints (passed through, may be None).
    voiceType: Optional[str] = None
    energyLevel: Optional[str] = None
    # NOTE(review): 'single' presumably means one location for all segments,
    # which matches the prompt's "IDENTICAL environment" rule - confirm.
    settingMode: str = 'single'
    # Optional cinematography / narrative hints (passed through, may be None).
    cameraStyle: Optional[str] = None
    energyArc: Optional[str] = None
    narrativeStyle: Optional[str] = None
    accentRegion: Optional[str] = None
class ContinuityMarkers(BaseModel):
    """Markers for maintaining continuity between segments.

    All fields are required free-form strings produced by the model. This
    module does not inspect or validate their content.
    """
    # Subject position at the first / last frame of the segment.
    start_position: str
    end_position: str
    # Facial expression at the first / last frame of the segment.
    start_expression: str
    end_expression: str
    # Gesture at the first / last frame of the segment.
    start_gesture: str
    end_gesture: str
    # NOTE(review): presumably states whether the location is unchanged from
    # the previous segment - format is not constrained here.
    location_status: str
class SegmentInfo(BaseModel):
    """Basic segment information (index, count, timing, location)."""
    # Index of this segment; presumably 1-based - not checked in this module.
    segment_number: int
    # Must equal the number of requested segments; checked by
    # validate_segments_payload.
    total_segments: int
    # validate_segments_payload expects the literal string "00:00-00:08".
    duration: str
    # Free-form location label for the segment.
    location: str
    # Start/end state markers used to stitch consecutive segments together.
    continuity_markers: ContinuityMarkers
class CharacterDescription(BaseModel):
    """Detailed character description.

    The ``description`` strings become part of the JSON schema sent to the
    model as length guidance; they are not enforced by Pydantic.

    NOTE(review): the word counts stated here (200+/150+/100+) are higher
    than the minimums checked through ``MIN_WORDS`` (150/100/50/50) -
    confirm which set of numbers is intended.
    """
    # Appearance; validation expects identical text in every segment.
    physical: str = Field(..., description="200+ words")
    # Outfit; validation expects identical text in every segment.
    clothing: str = Field(..., description="150+ words")
    # Segment-specific state of the character.
    current_state: str = Field(..., description="100+ words, segment-specific")
    # Segment-specific voice notes.
    voice_matching: str = Field(..., description="100+ words, segment-specific")
class SynchronizedActions(BaseModel):
    """Time-synced actions throughout the segment.

    Each field covers one two-second window of the eight-second segment.
    The JSON keys are the time ranges themselves (``"0:00-0:02"`` ...), which
    are not valid Python identifiers, so each field carries an alias and the
    model can be populated by either the alias or the attribute name.
    """
    # Pydantic v2 configuration attribute. The nested ``class Config`` form is
    # deprecated in v2 and emits a deprecation warning on class creation.
    model_config = {"populate_by_name": True}

    f0000_0002: str = Field(alias="0:00-0:02")
    f0002_0004: str = Field(alias="0:02-0:04")
    f0004_0006: str = Field(alias="0:04-0:06")
    f0006_0008: str = Field(alias="0:06-0:08")
class ActionTimeline(BaseModel):
    """Detailed action timeline for the segment."""
    # Spoken text for this segment; the prompt instructs the model to copy the
    # pre-split dialogue verbatim.
    dialogue: str
    # Actions keyed by two-second time window.
    synchronized_actions: SynchronizedActions
    # NOTE(review): description says 50+ words, MIN_WORDS checks for 40.
    micro_expressions: str = Field(..., description="50+ words")
    # Free-form strings; not validated in this module.
    breathing_rhythm: str
    location_transition: str
    continuity_checkpoint: str
class SceneContinuity(BaseModel):
    """Scene and camera details.

    The ``description`` strings are schema-level guidance for the model only.

    NOTE(review): the counts stated here (250+/75+/50+/75+/50+) exceed the
    minimums checked through ``MIN_WORDS`` (150/50/40/40/40) - confirm which
    is intended.
    """
    # Location description; validation expects identical text in every segment.
    environment: str = Field(..., description="250+ words")
    camera_position: str = Field(..., description="75+ words")
    # No word-count check is applied to camera_movement.
    camera_movement: str = Field(..., description="detailed movement path")
    lighting_state: str = Field(..., description="50+ words")
    props_in_frame: str = Field(..., description="75+ words")
    background_elements: str = Field(..., description="50+ words")
    # Free-form; not validated in this module.
    spatial_relationships: str
class Segment(BaseModel):
    """Complete segment specification.

    The four attribute names double as the section keys looked up by
    ``validate_segments_payload`` and ``MIN_WORDS``.
    """
    segment_info: SegmentInfo
    character_description: CharacterDescription
    scene_continuity: SceneContinuity
    action_timeline: ActionTimeline
class SegmentsPayload(BaseModel):
    """Complete payload with all segments.

    Passed as ``response_format`` to the structured-output call in
    ``generate_segments_payload``, so this model defines the JSON shape the
    LLM must return.
    """
    # One entry per script segment, in script order.
    segments: List[Segment]
def split_script_into_segments(
    script: str,
    seconds_per_segment: int = 8,
    words_per_second: float = 2.2
) -> List[str]:
    """
    Split script into segments based on timing

    Sentences (split after ``.``, ``!`` or ``?``) are packed greedily into
    segments until the next sentence would exceed the per-segment word
    budget. Sentences are never split, so a single sentence longer than the
    budget becomes its own over-long segment.

    When the ``ENVIRONMENT`` variable is unset, ``dev`` or ``development``
    (case-insensitive), at most the first two segments are returned to keep
    test runs fast; any other value returns all segments.

    Args:
        script: Full script text
        seconds_per_segment: Target duration per segment
        words_per_second: Speaking rate (adjust for VO tempo)
    Returns:
        List of script segments; ``[script.strip()]`` when no sentence
        could be extracted (e.g. empty input)
    """
    sentences = [
        sentence.strip()
        for sentence in re.split(r'(?<=[.!?])\s+', script.strip())
        if sentence.strip()
    ]

    # Word budget per segment; floored at 14 words so a very slow speaking
    # rate cannot produce degenerate one-or-two-word segments.
    target = max(14, int(seconds_per_segment * words_per_second))

    segments: List[str] = []
    current: List[str] = []
    current_words = 0
    for sentence in sentences:
        sentence_words = len(sentence.split())
        # Flush before adding when this sentence would overflow the budget.
        if current and current_words + sentence_words > target:
            segments.append(" ".join(current))
            current, current_words = [], 0
        current.append(sentence)
        current_words += sentence_words
    if current:
        segments.append(" ".join(current))

    # Environment-based segment limiting
    # In DEV mode: limit to 2 segments for faster testing
    # In PROD mode: generate all segments
    environment = os.getenv('ENVIRONMENT', 'dev').strip().lower()
    is_dev_mode = environment in ('dev', 'development')
    if is_dev_mode and len(segments) > 2:
        print(f"⚠️ DEV MODE: Limiting from {len(segments)} to 2 segments")
        segments = segments[:2]
    elif not is_dev_mode:
        print(f"✅ PROD MODE: Generating all {len(segments)} segments")

    return segments or [script.strip()]
def build_prompt(inputs: VeoInputs, segment_texts: List[str]) -> str:
    """
    Build the user prompt for GPT-4o

    The prompt is assembled from three parts: a header with the rules, word
    count minimums, the full script and the style settings; one line per
    pre-split segment with its exact dialogue; and a footer that repeats the
    critical rules and shows the expected JSON envelope.

    Args:
        inputs: Video generation inputs
        segment_texts: List of segment scripts
    Returns:
        Formatted prompt string
    """
    N = len(segment_texts)
    # The script is already quoted verbatim under "SCRIPT TO SEGMENT", so it
    # is left out of the style-settings dump to avoid sending it twice.
    knobs = inputs.model_dump(exclude={"script"})
    header = f"""
You are a STRICT production-grade JSON generator for Veo 3 video prompts.
⚠️ CRITICAL: Your output will be VALIDATED. ANY field under minimum word count will be REJECTED.
═══════════════════════════════════════════════════════════
🚨 CRITICAL: CHARACTER MUST MATCH REFERENCE IMAGE EXACTLY 🚨
═══════════════════════════════════════════════════════════
A REFERENCE IMAGE IS PROVIDED. You MUST:
1. ANALYZE the image carefully and describe the EXACT person you see
2. Use the SAME character description for ALL {N} segments (copy-paste identical text)
3. Include SPECIFIC details from the image:
- EXACT hair color (e.g., "strawberry blonde", "auburn", "dark brown")
- EXACT eye color (e.g., "green", "blue", "brown")
- EXACT facial features (freckles, skin tone, face shape)
- EXACT clothing visible in the image (color, pattern, style)
- EXACT age appearance (not generic "mid-thirties")
4. DO NOT invent or change ANY physical features
5. The generated video MUST show the SAME person as the reference image
⚠️ If the character description doesn't match the reference image, the video will be REJECTED.
═══════════════════════════════════════════════════════════
MANDATORY WORD COUNT REQUIREMENTS - WILL BE VALIDATED
═══════════════════════════════════════════════════════════
character_description.physical: MINIMUM 150 WORDS (MUST describe the EXACT person in the reference image - IDENTICAL across ALL {N} segments)
character_description.clothing: MINIMUM 100 WORDS (MUST describe the EXACT clothing in the reference image - IDENTICAL across ALL {N} segments)
character_description.current_state: MINIMUM 50 WORDS (segment-specific)
character_description.voice_matching: MINIMUM 50 WORDS (segment-specific)
scene_continuity.environment: MINIMUM 150 WORDS (MUST be IDENTICAL across ALL {N} segments - same location throughout)
scene_continuity.camera_position: MINIMUM 50 WORDS (MUST be consistent framing)
scene_continuity.lighting_state: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
scene_continuity.props_in_frame: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
scene_continuity.background_elements: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
action_timeline.micro_expressions: MINIMUM 40 WORDS
⚠️ If ANY field has fewer words than the minimum, the ENTIRE payload will be REJECTED.
═══════════════════════════════════════════════════════════
WHAT 200+ WORDS LOOKS LIKE (EXAMPLE FOR PHYSICAL):
═══════════════════════════════════════════════════════════
"A person in their mid-thirties with a warm, approachable presence that immediately puts viewers at ease. Their facial structure features high, defined cheekbones and a strong, angular jawline that conveys confidence without appearing intimidating. They have expressive, almond-shaped eyes with a rich brown color that sparkles with intelligence and authenticity when discussing financial topics. Their eyebrows are naturally shaped and animated, often raising slightly when emphasizing important points about debt relief. The person maintains excellent posture throughout, sitting or standing with shoulders back and spine straight, projecting both professionalism and relatability. Their skin tone is natural and even, with a healthy glow that suggests good self-care. They have a genuine, engaging smile that reaches their eyes, creating authentic crow's feet at the corners when they express enthusiasm about helping people save money. Their hair is styled in a modern, professional manner that doesn't distract from their message. The person's hands are visible during gestures, with natural, purposeful movements that emphasize key phrases about the tariff relief program. They maintain steady eye contact with the camera, creating a direct connection with viewers. Their overall appearance suggests someone who is both knowledgeable about financial matters and genuinely invested in helping others achieve debt freedom." (200+ words)
═══════════════════════════════════════════════════════════
WHAT 150+ WORDS LOOKS LIKE (EXAMPLE FOR ENVIRONMENT):
═══════════════════════════════════════════════════════════
"A contemporary, warmly-lit interior space that exudes professionalism while maintaining an approachable, comfortable atmosphere perfect for discussing personal finance topics. The setting features soft, natural daylight streaming through large windows, creating gentle highlights and shadows that add depth and dimension to the frame. The background wall showcases a sophisticated neutral color palette—think warm beige or soft gray tones—that doesn't compete for attention but provides visual interest through subtle texture. The space includes carefully curated elements of modern interior design: perhaps a sleek bookshelf with financial publications, a tasteful piece of abstract art that adds color without distraction, and contemporary furniture pieces that suggest success and stability. The floor is likely hardwood or high-quality laminate, polished to reflect light subtly. The lighting setup combines natural window light with strategically placed LED panels or softboxes that eliminate harsh shadows while maintaining a natural, lifestyle aesthetic rather than an overly commercial look. The depth of field is moderate, keeping the subject sharp while softly blurring background elements to maintain focus. Environmental sound design would capture subtle ambient noise—perhaps distant city sounds or soft office ambiance—that grounds the viewer in a real, authentic space. The overall atmosphere suggests a professional consultation setting where important financial decisions are made, yet feels intimate and personal enough that viewers can imagine having this conversation in their own homes. Every element of the environment reinforces the credibility of the debt relief message while maintaining the authentic, UGC-style feel that drives engagement. The space is clutter-free but lived-in, striking the perfect balance between aspirational and relatable." (250+ words)
═══════════════════════════════════════════════════════════
YOUR TASK
═══════════════════════════════════════════════════════════
Generate EXACTLY {N} segments. Each segment MUST meet ALL word count requirements above.
Duration: "00:00-00:08" for each segment
Synchronized actions: MUST have keys "0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08"
Total segments: Set segment_info.total_segments = {N} on EVERY segment
⚠️ CRITICAL DIALOGUE RULE - NO OVERLAP:
- Each segment's action_timeline.dialogue MUST contain ONLY the text assigned below
- NEVER repeat any words or sentences from previous segments
- NEVER include any words or sentences from the next segment
- Each segment's dialogue is MUTUALLY EXCLUSIVE - zero overlap allowed
- The dialogue for each segment is PRE-SPLIT below - use it EXACTLY as given
SCRIPT TO SEGMENT:
\"\"\"{inputs.script.strip()}\"\"\"
STYLE SETTINGS:
{knobs}
SEGMENTS TO GENERATE (USE DIALOGUE EXACTLY AS SHOWN - NO OVERLAP):
"""
    seg_lines = "\n".join(
        f"- Segment {number} dialogue (EXACT): \"{text}\""
        for number, text in enumerate(segment_texts, start=1)
    )
    # The footer is a plain string (not an f-string), so the JSON envelope
    # uses single braces; doubled braces would reach the model literally.
    footer = """
═══════════════════════════════════════════════════════════
CRITICAL REMINDER BEFORE YOU GENERATE
═══════════════════════════════════════════════════════════
🚨 CHARACTER CONSISTENCY IS MANDATORY:
- physical description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- clothing description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- environment description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- The person MUST look IDENTICAL in every segment (same face, hair, clothes, setting)
✅ CHECK EVERY FIELD FOR MINIMUM WORD COUNTS
✅ physical: 150+ words | clothing: 100+ words
✅ current_state: 50+ words | voice_matching: 50+ words
✅ environment: 150+ words (MOST IMPORTANT - BE VERY DETAILED)
✅ camera_position: 50+ words | lighting_state: 40+ words
✅ props_in_frame: 40+ words | background_elements: 40+ words
✅ micro_expressions: 40+ words
🚨 CRITICAL: NO BLUR TRANSITIONS - REMINDER 🚨
- Every segment starts SHARP and CLEAR at 0:00
- camera_movement must describe movement from an already-focused state
- synchronized_actions["0:00-0:02"] must begin with subject in sharp focus
- NO fade-in, NO blur, NO gradual focus at segment start
⚠️ VALIDATION WILL COUNT EVERY WORD. Generate MORE than minimum to be safe!
⚠️ Describe the EXACT person in the reference image - do not invent features!
OUTPUT FORMAT:
Return ONLY valid JSON (no markdown, no code blocks):
{
"segments": [ { ... } ]
}
"""
    return header + seg_lines + footer
# Minimum word counts for validation, keyed by (section, field) inside a
# segment dict. Declared grouped by section and flattened into tuple keys.
MIN_WORDS = {
    (section, field): minimum
    for section, field_minimums in {
        "character_description": {
            "physical": 150,
            "clothing": 100,
            "current_state": 50,
            "voice_matching": 50,
        },
        "scene_continuity": {
            "environment": 150,
            "camera_position": 50,
            "lighting_state": 40,
            "props_in_frame": 40,
            "background_elements": 40,
        },
        "action_timeline": {
            "micro_expressions": 40,
        },
    }.items()
    for field, minimum in field_minimums.items()
}
def _word_count(text: str) -> int:
    """Return how many runs of word characters *text* contains.

    Falsy input (``None`` or an empty string) counts as zero words.
    Punctuation splits runs, so ``"don't"`` counts as two words.
    """
    if not text:
        return 0
    return sum(1 for _ in re.finditer(r"\b\w+\b", text))
def validate_segments_payload(
    payload: Dict[str, Any],
    expected_segments: int
) -> List[str]:
    """
    Check a generated payload against the segment rules

    Reports, in order: a segment-count mismatch; then per segment the
    duration, ``total_segments``, the synchronized-action keys and every
    ``MIN_WORDS`` minimum; finally whether the physical, clothing and
    environment texts are identical across segments.

    Args:
        payload: Generated payload (dict with a "segments" list)
        expected_segments: Expected number of segments
    Returns:
        List of validation errors (empty if valid)
    """
    segments = payload.get("segments", [])
    problems: List[str] = []

    if len(segments) != expected_segments:
        problems.append(f"Expected {expected_segments} segments, got {len(segments)}.")

    sync_keys = {"0:00-0:02", "0:02-0:04", "0:04-0:06", "0:06-0:08"}

    # (section, field, what "is changing" if it differs, values seen so far)
    uniform_fields = [
        ("character_description", "physical", "character", []),
        ("character_description", "clothing", "clothing", []),
        ("scene_continuity", "environment", "location", []),
    ]

    for number, segment in enumerate(segments, start=1):
        # Segment info
        info = segment.get("segment_info", {})
        if info.get("duration") != "00:00-00:08":
            problems.append(f"Segment {number}: duration must be 00:00-00:08.")
        declared_total = info.get("total_segments")
        if declared_total != expected_segments:
            problems.append(
                f"Segment {number}: total_segments should be {expected_segments}, "
                f"got {declared_total}."
            )

        # Synchronized-action windows must match exactly (no extra, none missing)
        timeline = segment.get("action_timeline", {})
        present_keys = set(timeline.get("synchronized_actions", {}).keys())
        if present_keys != sync_keys:
            problems.append(
                f"Segment {number}: synchronized_actions must have keys "
                f"{sorted(sync_keys)}."
            )

        # Minimum word counts
        for (section, field), minimum in MIN_WORDS.items():
            count = _word_count(segment.get(section, {}).get(field, ""))
            if count < minimum:
                problems.append(
                    f"Segment {number}: {section}.{field} must be >= {minimum} words (got {count})."
                )

        # Remember the texts that have to stay constant across segments
        for section, field, _subject, seen in uniform_fields:
            seen.append(segment.get(section, {}).get(field, ""))

    # Uniformity across segments - CRITICAL for visual consistency
    if expected_segments > 1:
        for section, field, subject, seen in uniform_fields:
            if len(set(seen)) > 1:
                problems.append(
                    f"🚨 `{section}.{field}` must be EXACTLY identical "
                    f"across all segments - {subject} is changing!"
                )

    return problems
def _guess_image_mime(image_bytes: bytes) -> str:
    """Return the MIME type implied by the image's magic bytes (JPEG if unknown)."""
    if image_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if image_bytes.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
        return "image/webp"
    return "image/jpeg"


def generate_segments_payload(
    inputs: VeoInputs,
    image_bytes: Optional[bytes] = None,
    model: str = "gpt-4o",
    api_key: Optional[str] = None
) -> Dict[str, Any]:
    """
    Generate segments payload using GPT-4o with structured output

    WARNING-ONLY MODE: Validation errors are logged but don't block generation.
    This allows the system to work with whatever GPT-4o generates.

    Args:
        inputs: Video generation inputs
        image_bytes: Optional reference image bytes (JPEG, PNG, GIF or WebP;
            anything unrecognised is labelled as JPEG)
        model: OpenAI model to use
        api_key: OpenAI API key (or from the OPENAI_API_KEY env variable)
    Returns:
        Segments payload as a dict dumped by alias, so synchronized-action
        keys are the "0:00-0:02" style strings (always returned, even if
        validation warnings exist)
    Raises:
        RuntimeError: If the model returns no parsed payload (e.g. a refusal)
        Exception: If API call fails (network, auth, etc.)
    """
    # Initialize OpenAI client
    client = OpenAI(api_key=api_key or os.getenv('OPENAI_API_KEY'))

    # Split script into segments
    segment_texts = split_script_into_segments(inputs.script, seconds_per_segment=8)
    N = len(segment_texts)
    print(f"📝 Generating {N} segments...")

    # Build prompt
    user_prompt = build_prompt(inputs, segment_texts)

    # Call GPT-4o (WARNING-ONLY validation - no retries, no blocking)
    print(f"🤖 Calling GPT-4o to generate {N} segments...")

    # Prepare messages: text prompt first, then the optional reference image
    system_content = "You are a precise JSON-only generator that must satisfy a strict schema and explicit segment count."
    user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_prompt}]
    if image_bytes:
        encoded_image = base64.b64encode(image_bytes).decode("utf-8")
        # Label the data URL with the real format instead of assuming JPEG.
        mime_type = _guess_image_mime(image_bytes)
        user_content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{encoded_image}"
            }
        })
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]

    # Call GPT-4o with structured output
    response = client.beta.chat.completions.parse(
        model=model,
        response_format=SegmentsPayload,
        messages=messages,
    )
    message = response.choices[0].message
    parsed_obj = message.parsed
    if parsed_obj is None:
        # Happens when the model refuses or returns nothing parsable; fail
        # with a clear message instead of an AttributeError on None.
        reason = getattr(message, "refusal", None) or "no structured output returned"
        raise RuntimeError(f"GPT-4o did not return a parsable segments payload: {reason}")
    payload = parsed_obj.model_dump(by_alias=True)

    # Report what was actually generated, not what was requested.
    generated = len(payload.get("segments") or [])
    print(f"✅ GPT-4o generated {generated} segments successfully")

    # DEBUG: Show actual word counts for first segment
    if payload.get("segments"):
        seg = payload["segments"][0]
        cd = seg.get("character_description", {})
        sc = seg.get("scene_continuity", {})
        print("📊 Sample word counts (Segment 1):")
        print(f" physical: {_word_count(cd.get('physical', ''))} words")
        print(f" clothing: {_word_count(cd.get('clothing', ''))} words")
        print(f" current_state: {_word_count(cd.get('current_state', ''))} words")
        print(f" environment: {_word_count(sc.get('environment', ''))} words")
        print(f" camera_position: {_word_count(sc.get('camera_position', ''))} words")

    # Run validation (WARNING-ONLY - doesn't block generation)
    errors = validate_segments_payload(payload, N)
    if errors:
        # Log warnings but DON'T block generation
        print(f"\n⚠️ VALIDATION WARNINGS ({len(errors)} issues found):")
        print("⚠️ These are non-blocking - generation will continue")
        for i, error in enumerate(errors[:10], 1):  # Show first 10
            print(f" {i}. {error}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more warnings")
        print("✅ Proceeding with generation despite warnings\n")
    else:
        print("✅ All validation checks passed!")

    # ALWAYS return payload (even with warnings)
    return payload