# NOTE: "Spaces: Sleeping" banner below was hosting-platform page residue
# captured during export; preserved here as a comment so the module parses.
# Spaces: Sleeping / Sleeping
| """ | |
| Advanced Prompt Generator using GPT-4o | |
| Structured JSON generation with strict validation | |
| """ | |
| import re | |
| import base64 | |
| from typing import List, Optional, Dict, Any | |
| from pydantic import BaseModel, Field | |
| from openai import OpenAI | |
| import os | |
class VeoInputs(BaseModel):
    """Input parameters for video generation.

    Carries the raw narration script plus the style/continuity knobs.
    The whole model is dumped verbatim (``model_dump``) into the GPT-4o
    prompt as the "STYLE SETTINGS" section.
    """
    # NOTE(review): camelCase names presumably mirror a client-side JSON
    # payload — confirm before renaming to snake_case.
    script: str  # full narration script; split into ~8s segments downstream
    style: str  # visual/tonal style label
    jsonFormat: str = 'standard'  # output-format knob forwarded in the prompt
    continuationMode: bool = True  # whether segments should chain continuously
    voiceType: Optional[str] = None
    energyLevel: Optional[str] = None
    settingMode: str = 'single'  # 'single' keeps one location across segments
    cameraStyle: Optional[str] = None
    energyArc: Optional[str] = None
    narrativeStyle: Optional[str] = None
    accentRegion: Optional[str] = None
class ContinuityMarkers(BaseModel):
    """Markers for maintaining continuity between segments.

    Start/end pairs let consecutive segments hand off pose, expression,
    and gesture so the character appears continuous across cuts.
    """
    start_position: str  # body position at the first frame of this segment
    end_position: str  # body position at the last frame (next segment's start)
    start_expression: str  # facial expression entering the segment
    end_expression: str  # facial expression leaving the segment
    start_gesture: str
    end_gesture: str
    # Free-form note on the location state across the cut —
    # TODO(review): confirm the exact convention expected by consumers.
    location_status: str
class SegmentInfo(BaseModel):
    """Basic segment information."""
    # Segment index (presumably 1-based — the prompt enumerates from 1).
    segment_number: int
    total_segments: int  # must equal the overall segment count (validated)
    duration: str  # expected to be exactly "00:00-00:08" (validated)
    location: str
    continuity_markers: ContinuityMarkers
class CharacterDescription(BaseModel):
    """Detailed character description.

    ``physical`` and ``clothing`` must be copy-paste identical across all
    segments (enforced by validate_segments_payload); ``current_state``
    and ``voice_matching`` are segment-specific.
    """
    # Descriptions previously advertised 200+/150+/100+/100+ words, which
    # contradicted both the prompt text and the enforced MIN_WORDS minimums
    # (150/100/50/50); they now match the validator.
    physical: str = Field(..., description="150+ words")
    clothing: str = Field(..., description="100+ words")
    current_state: str = Field(..., description="50+ words, segment-specific")
    voice_matching: str = Field(..., description="50+ words, segment-specific")
class SynchronizedActions(BaseModel):
    """Time-synced actions for each 2-second window of the 8s segment.

    Python identifiers cannot contain ':' or '-', so each window is stored
    under a safe field name and exposed via its time-range alias; dumping
    with model_dump(by_alias=True) restores the "0:00-0:02"-style keys
    that the validator requires.
    """
    f0000_0002: str = Field(alias="0:00-0:02")  # seconds 0-2
    f0002_0004: str = Field(alias="0:02-0:04")  # seconds 2-4
    f0004_0006: str = Field(alias="0:04-0:06")  # seconds 4-6
    f0006_0008: str = Field(alias="0:06-0:08")  # seconds 6-8
    class Config:
        # Accept either the field name or the alias when populating.
        populate_by_name = True
class ActionTimeline(BaseModel):
    """Detailed action timeline for the segment."""
    dialogue: str  # exact pre-split dialogue for this segment (no overlap)
    synchronized_actions: SynchronizedActions
    # Description previously said "50+ words" but the validator (MIN_WORDS)
    # and the prompt both enforce 40; aligned to the enforced minimum.
    micro_expressions: str = Field(..., description="40+ words")
    breathing_rhythm: str
    location_transition: str
    continuity_checkpoint: str
class SceneContinuity(BaseModel):
    """Scene and camera details.

    environment / lighting_state / props_in_frame / background_elements
    must be identical across all segments (enforced for environment by
    validate_segments_payload).
    """
    # Descriptions previously advertised 250+/75+/50+/75+/50+ words, which
    # contradicted the prompt text and the enforced MIN_WORDS minimums
    # (150/50/40/40/40); they now match the validator.
    environment: str = Field(..., description="150+ words")
    camera_position: str = Field(..., description="50+ words")
    camera_movement: str = Field(..., description="detailed movement path")
    lighting_state: str = Field(..., description="40+ words")
    props_in_frame: str = Field(..., description="40+ words")
    background_elements: str = Field(..., description="40+ words")
    spatial_relationships: str
class Segment(BaseModel):
    """Complete segment specification: info, character, scene, and timeline."""
    segment_info: SegmentInfo
    character_description: CharacterDescription
    scene_continuity: SceneContinuity
    action_timeline: ActionTimeline
class SegmentsPayload(BaseModel):
    """Top-level payload: the ordered list of all generated segments.

    Used as the structured-output schema for the GPT-4o parse call.
    """
    segments: List[Segment]
def split_script_into_segments(
    script: str,
    seconds_per_segment: int = 8,
    words_per_second: float = 2.2
) -> List[str]:
    """Split a script into roughly fixed-duration dialogue chunks.

    Sentences are packed greedily until the word budget
    (seconds_per_segment * words_per_second, floored at 14 words) would
    be exceeded. In dev environments the result is capped at 2 segments
    for faster iteration.

    Args:
        script: Full script text.
        seconds_per_segment: Target duration per segment.
        words_per_second: Speaking rate (adjust for VO tempo).

    Returns:
        Non-empty list of script chunks (falls back to the whole script).
    """
    pieces = re.split(r'(?<=[.!?])\s+', script.strip())
    sentences = [piece.strip() for piece in pieces if piece.strip()]
    word_budget = max(14, int(seconds_per_segment * words_per_second))

    chunks: List[str] = []
    pending: List[str] = []
    pending_words = 0
    for sentence in sentences:
        n_words = len(sentence.split())
        # Flush the current chunk before it would overflow the budget.
        if pending and pending_words + n_words > word_budget:
            chunks.append(" ".join(pending))
            pending, pending_words = [], 0
        pending.append(sentence)
        pending_words += n_words
    if pending:
        chunks.append(" ".join(pending))

    # DEV: cap at 2 segments for faster testing; PROD: keep everything.
    env_name = os.getenv('ENVIRONMENT', 'dev').lower()
    dev_mode = env_name in ('dev', 'development')
    if dev_mode and len(chunks) > 2:
        print(f"β οΈ DEV MODE: Limiting from {len(chunks)} to 2 segments")
        chunks = chunks[:2]
    elif not dev_mode:
        print(f"β PROD MODE: Generating all {len(chunks)} segments")

    return chunks or [script.strip()]
def build_prompt(inputs: "VeoInputs", segment_texts: List[str]) -> str:
    """
    Build the user prompt for GPT-4o.

    Args:
        inputs: Video generation inputs (script + style knobs)
        segment_texts: Pre-split, mutually exclusive dialogue per segment

    Returns:
        Formatted prompt string (header + per-segment dialogue lines + footer)
    """
    N = len(segment_texts)
    # Raw dict of all style knobs; embedded verbatim in "STYLE SETTINGS" below.
    knobs = inputs.model_dump()
    header = f"""
You are a STRICT production-grade JSON generator for Veo 3 video prompts.
β οΈ CRITICAL: Your output will be VALIDATED. ANY field under minimum word count will be REJECTED.
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
π¨ CRITICAL: CHARACTER MUST MATCH REFERENCE IMAGE EXACTLY π¨
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
A REFERENCE IMAGE IS PROVIDED. You MUST:
1. ANALYZE the image carefully and describe the EXACT person you see
2. Use the SAME character description for ALL {N} segments (copy-paste identical text)
3. Include SPECIFIC details from the image:
- EXACT hair color (e.g., "strawberry blonde", "auburn", "dark brown")
- EXACT eye color (e.g., "green", "blue", "brown")
- EXACT facial features (freckles, skin tone, face shape)
- EXACT clothing visible in the image (color, pattern, style)
- EXACT age appearance (not generic "mid-thirties")
4. DO NOT invent or change ANY physical features
5. The generated video MUST show the SAME person as the reference image
β οΈ If the character description doesn't match the reference image, the video will be REJECTED.
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
MANDATORY WORD COUNT REQUIREMENTS - WILL BE VALIDATED
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
character_description.physical: MINIMUM 150 WORDS (MUST describe the EXACT person in the reference image - IDENTICAL across ALL {N} segments)
character_description.clothing: MINIMUM 100 WORDS (MUST describe the EXACT clothing in the reference image - IDENTICAL across ALL {N} segments)
character_description.current_state: MINIMUM 50 WORDS (segment-specific)
character_description.voice_matching: MINIMUM 50 WORDS (segment-specific)
scene_continuity.environment: MINIMUM 150 WORDS (MUST be IDENTICAL across ALL {N} segments - same location throughout)
scene_continuity.camera_position: MINIMUM 50 WORDS (MUST be consistent framing)
scene_continuity.lighting_state: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
scene_continuity.props_in_frame: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
scene_continuity.background_elements: MINIMUM 40 WORDS (MUST be IDENTICAL across ALL {N} segments)
action_timeline.micro_expressions: MINIMUM 40 WORDS
β οΈ If ANY field has fewer words than the minimum, the ENTIRE payload will be REJECTED.
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
WHAT 200+ WORDS LOOKS LIKE (EXAMPLE FOR PHYSICAL):
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
"A person in their mid-thirties with a warm, approachable presence that immediately puts viewers at ease. Their facial structure features high, defined cheekbones and a strong, angular jawline that conveys confidence without appearing intimidating. They have expressive, almond-shaped eyes with a rich brown color that sparkles with intelligence and authenticity when discussing financial topics. Their eyebrows are naturally shaped and animated, often raising slightly when emphasizing important points about debt relief. The person maintains excellent posture throughout, sitting or standing with shoulders back and spine straight, projecting both professionalism and relatability. Their skin tone is natural and even, with a healthy glow that suggests good self-care. They have a genuine, engaging smile that reaches their eyes, creating authentic crow's feet at the corners when they express enthusiasm about helping people save money. Their hair is styled in a modern, professional manner that doesn't distract from their message. The person's hands are visible during gestures, with natural, purposeful movements that emphasize key phrases about the tariff relief program. They maintain steady eye contact with the camera, creating a direct connection with viewers. Their overall appearance suggests someone who is both knowledgeable about financial matters and genuinely invested in helping others achieve debt freedom." (200+ words)
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
WHAT 150+ WORDS LOOKS LIKE (EXAMPLE FOR ENVIRONMENT):
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
"A contemporary, warmly-lit interior space that exudes professionalism while maintaining an approachable, comfortable atmosphere perfect for discussing personal finance topics. The setting features soft, natural daylight streaming through large windows, creating gentle highlights and shadows that add depth and dimension to the frame. The background wall showcases a sophisticated neutral color paletteβthink warm beige or soft gray tonesβthat doesn't compete for attention but provides visual interest through subtle texture. The space includes carefully curated elements of modern interior design: perhaps a sleek bookshelf with financial publications, a tasteful piece of abstract art that adds color without distraction, and contemporary furniture pieces that suggest success and stability. The floor is likely hardwood or high-quality laminate, polished to reflect light subtly. The lighting setup combines natural window light with strategically placed LED panels or softboxes that eliminate harsh shadows while maintaining a natural, lifestyle aesthetic rather than an overly commercial look. The depth of field is moderate, keeping the subject sharp while softly blurring background elements to maintain focus. Environmental sound design would capture subtle ambient noiseβperhaps distant city sounds or soft office ambianceβthat grounds the viewer in a real, authentic space. The overall atmosphere suggests a professional consultation setting where important financial decisions are made, yet feels intimate and personal enough that viewers can imagine having this conversation in their own homes. Every element of the environment reinforces the credibility of the debt relief message while maintaining the authentic, UGC-style feel that drives engagement. The space is clutter-free but lived-in, striking the perfect balance between aspirational and relatable." (250+ words)
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
YOUR TASK
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
Generate EXACTLY {N} segments. Each segment MUST meet ALL word count requirements above.
Duration: "00:00-00:08" for each segment
Synchronized actions: MUST have keys "0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08"
Total segments: Set segment_info.total_segments = {N} on EVERY segment
β οΈ CRITICAL DIALOGUE RULE - NO OVERLAP:
- Each segment's action_timeline.dialogue MUST contain ONLY the text assigned below
- NEVER repeat any words or sentences from previous segments
- NEVER include any words or sentences from the next segment
- Each segment's dialogue is MUTUALLY EXCLUSIVE - zero overlap allowed
- The dialogue for each segment is PRE-SPLIT below - use it EXACTLY as given
SCRIPT TO SEGMENT:
\"\"\"{inputs.script.strip()}\"\"\"
STYLE SETTINGS:
{knobs}
SEGMENTS TO GENERATE (USE DIALOGUE EXACTLY AS SHOWN - NO OVERLAP):
"""
    seg_lines = "\n".join([f"- Segment {i+1} dialogue (EXACT): \"{t}\"" for i, t in enumerate(segment_texts)])
    # BUGFIX: the footer is a plain (non-f) string, so the previous "{{"/"}}"
    # escapes were emitted literally as doubled braces in the JSON example
    # shown to the model; single braces are correct here.
    footer = """
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
CRITICAL REMINDER BEFORE YOU GENERATE
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
π¨ CHARACTER CONSISTENCY IS MANDATORY:
- physical description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- clothing description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- environment description: COPY-PASTE THE EXACT SAME TEXT for ALL segments
- The person MUST look IDENTICAL in every segment (same face, hair, clothes, setting)
β CHECK EVERY FIELD FOR MINIMUM WORD COUNTS
β physical: 150+ words | clothing: 100+ words
β current_state: 50+ words | voice_matching: 50+ words
β environment: 150+ words (MOST IMPORTANT - BE VERY DETAILED)
β camera_position: 50+ words | lighting_state: 40+ words
β props_in_frame: 40+ words | background_elements: 40+ words
β micro_expressions: 40+ words
π¨ CRITICAL: NO BLUR TRANSITIONS - REMINDER π¨
- Every segment starts SHARP and CLEAR at 0:00
- camera_movement must describe movement from an already-focused state
- synchronized_actions["0:00-0:02"] must begin with subject in sharp focus
- NO fade-in, NO blur, NO gradual focus at segment start
β οΈ VALIDATION WILL COUNT EVERY WORD. Generate MORE than minimum to be safe!
β οΈ Describe the EXACT person in the reference image - do not invent features!
OUTPUT FORMAT:
Return ONLY valid JSON (no markdown, no code blocks):
{
  "segments": [ { ... } ]
}
"""
    return header + seg_lines + footer
| # Minimum word counts for validation | |
| MIN_WORDS = { | |
| ("character_description", "physical"): 150, | |
| ("character_description", "clothing"): 100, | |
| ("character_description", "current_state"): 50, | |
| ("character_description", "voice_matching"): 50, | |
| ("scene_continuity", "environment"): 150, | |
| ("scene_continuity", "camera_position"): 50, | |
| ("scene_continuity", "lighting_state"): 40, | |
| ("scene_continuity", "props_in_frame"): 40, | |
| ("scene_continuity", "background_elements"): 40, | |
| ("action_timeline", "micro_expressions"): 40, | |
| } | |
| def _word_count(text: str) -> int: | |
| """Count words in text""" | |
| return len(re.findall(r"\b\w+\b", text or "")) | |
| def validate_segments_payload( | |
| payload: Dict[str, Any], | |
| expected_segments: int | |
| ) -> List[str]: | |
| """ | |
| Validate the generated payload against strict rules | |
| Args: | |
| payload: Generated payload | |
| expected_segments: Expected number of segments | |
| Returns: | |
| List of validation errors (empty if valid) | |
| """ | |
| errors: List[str] = [] | |
| segs = payload.get("segments", []) | |
| if len(segs) != expected_segments: | |
| errors.append(f"Expected {expected_segments} segments, got {len(segs)}.") | |
| required_sync_keys = {"0:00-0:02", "0:02-0:04", "0:04-0:06", "0:06-0:08"} | |
| physical_blocks, clothing_blocks, environment_blocks = [], [], [] | |
| for i, seg in enumerate(segs, start=1): | |
| # Check segment info | |
| si = seg.get("segment_info", {}) | |
| if si.get("duration") != "00:00-00:08": | |
| errors.append(f"Segment {i}: duration must be 00:00-00:08.") | |
| if si.get("total_segments") != expected_segments: | |
| errors.append( | |
| f"Segment {i}: total_segments should be {expected_segments}, " | |
| f"got {si.get('total_segments')}." | |
| ) | |
| # Check synchronized actions keys | |
| sync = seg.get("action_timeline", {}).get("synchronized_actions", {}) | |
| if set(sync.keys()) != required_sync_keys: | |
| errors.append( | |
| f"Segment {i}: synchronized_actions must have keys " | |
| f"{sorted(required_sync_keys)}." | |
| ) | |
| # Word-count checks | |
| for (section, field), minw in MIN_WORDS.items(): | |
| text = seg.get(section, {}).get(field, "") | |
| wc = _word_count(text) | |
| if wc < minw: | |
| errors.append( | |
| f"Segment {i}: {section}.{field} must be >= {minw} words (got {wc})." | |
| ) | |
| # Collect for uniformity check | |
| ch = seg.get("character_description", {}) | |
| sc = seg.get("scene_continuity", {}) | |
| physical_blocks.append(ch.get("physical", "")) | |
| clothing_blocks.append(ch.get("clothing", "")) | |
| environment_blocks.append(sc.get("environment", "")) | |
| # Uniformity across segments - CRITICAL for visual consistency | |
| if expected_segments > 1: | |
| if len(set(physical_blocks)) > 1: | |
| errors.append( | |
| "π¨ `character_description.physical` must be EXACTLY identical " | |
| "across all segments - character is changing!" | |
| ) | |
| if len(set(clothing_blocks)) > 1: | |
| errors.append( | |
| "π¨ `character_description.clothing` must be EXACTLY identical " | |
| "across all segments - clothing is changing!" | |
| ) | |
| if len(set(environment_blocks)) > 1: | |
| errors.append( | |
| "π¨ `scene_continuity.environment` must be EXACTLY identical " | |
| "across all segments - location is changing!" | |
| ) | |
| return errors | |
def generate_segments_payload(
    inputs: VeoInputs,
    image_bytes: Optional[bytes] = None,
    model: str = "gpt-4o",
    api_key: Optional[str] = None
) -> Dict[str, Any]:
    """
    Generate segments payload using GPT-4o with structured output.

    WARNING-ONLY MODE: Validation errors are logged but don't block generation.
    This allows the system to work with whatever GPT-4o generates.

    Args:
        inputs: Video generation inputs
        image_bytes: Optional reference image bytes (sent inline as a base64
            data URL; the media type is hard-coded to image/jpeg below)
        model: OpenAI model to use
        api_key: OpenAI API key (or from env OPENAI_API_KEY)
    Returns:
        Segments payload (always returns, even if validation warnings exist)
    Raises:
        Exception: If API call fails (network, auth, etc.)
    """
    # Initialize OpenAI client (explicit key wins over the environment variable)
    client = OpenAI(api_key=api_key or os.getenv('OPENAI_API_KEY'))
    # Split script into segments (may be capped at 2 when ENVIRONMENT is dev)
    segment_texts = split_script_into_segments(inputs.script, seconds_per_segment=8)
    N = len(segment_texts)
    print(f"π Generating {N} segments...")
    # Build prompt
    user_prompt = build_prompt(inputs, segment_texts)
    # Call GPT-4o (WARNING-ONLY validation - no retries, no blocking)
    print(f"π€ Calling GPT-4o to generate {N} segments...")
    # Prepare messages: one system message plus one multimodal user message
    system_content = "You are a precise JSON-only generator that must satisfy a strict schema and explicit segment count."
    messages = [
        {
            "role": "system",
            "content": system_content
        },
        {
            # User content is a list so text and image parts can be appended below.
            "role": "user",
            "content": []
        }
    ]
    # Add text prompt
    messages[1]["content"].append({
        "type": "text",
        "text": user_prompt
    })
    # Add image if provided (inlined as a base64 data URL)
    if image_bytes:
        encoded_image = base64.b64encode(image_bytes).decode("utf-8")
        messages[1]["content"].append({
            "type": "image_url",
            "image_url": {
                # NOTE(review): media type is always image/jpeg — confirm
                # callers never pass PNG/WebP bytes here.
                "url": f"data:image/jpeg;base64,{encoded_image}"
            }
        })
    # Call GPT-4o with structured output parsed against the SegmentsPayload schema
    response = client.beta.chat.completions.parse(
        model=model,
        response_format=SegmentsPayload,
        messages=messages,
    )
    parsed_obj = response.choices[0].message.parsed
    # by_alias=True restores the "0:00-0:02"-style keys declared on SynchronizedActions.
    payload = parsed_obj.model_dump(by_alias=True)
    print(f"β GPT-4o generated {N} segments successfully")
    # DEBUG: Show actual word counts for first segment
    if payload.get("segments"):
        seg = payload["segments"][0]
        cd = seg.get("character_description", {})
        sc = seg.get("scene_continuity", {})
        print(f"π Sample word counts (Segment 1):")
        print(f" physical: {_word_count(cd.get('physical', ''))} words")
        print(f" clothing: {_word_count(cd.get('clothing', ''))} words")
        print(f" current_state: {_word_count(cd.get('current_state', ''))} words")
        print(f" environment: {_word_count(sc.get('environment', ''))} words")
        print(f" camera_position: {_word_count(sc.get('camera_position', ''))} words")
    # Run validation (WARNING-ONLY - doesn't block generation)
    errors = validate_segments_payload(payload, N)
    if errors:
        # Log warnings but DON'T block generation
        print(f"\nβ οΈ VALIDATION WARNINGS ({len(errors)} issues found):")
        print(f"β οΈ These are non-blocking - generation will continue")
        for i, error in enumerate(errors[:10], 1):  # Show first 10
            print(f" {i}. {error}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more warnings")
        print(f"β Proceeding with generation despite warnings\n")
    else:
        print(f"β All validation checks passed!")
    # ALWAYS return payload (even with warnings)
    return payload