Spaces:
Sleeping
Sleeping
File size: 8,495 Bytes
d856b59 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 |
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field
from openai import OpenAI
import os
import re
from dotenv import load_dotenv
import base64
# Load variables from a local .env file (if any) into the process environment
# before the client below reads its API key.
load_dotenv()
# Module-level OpenAI client used by generate_segments_payload.
# os.getenv returns None when OPENAI_API_KEY is unset, so api_key may be None here.
gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
class VeoInputs(BaseModel):
    """User-supplied settings for one prompt-generation request.

    In this module ``script`` is split into per-segment texts and embedded in
    the prompt, and the whole model is dumped with ``model_dump()`` into the
    prompt's settings section. None of the other fields is interpreted by the
    code shown here; they are forwarded to the LLM as-is.
    """
    script: str  # full script text; required
    style: str  # required; forwarded to the prompt unchanged
    jsonFormat: str = 'standard'
    continuationMode: bool = True  # the prompt text asks for a continuity checkpoint when this is true
    voiceType: Optional[str] = None
    energyLevel: Optional[str] = None
    settingMode: str = 'single'
    cameraStyle: Optional[str] = None
    energyArc: Optional[str] = None
    narrativeStyle: Optional[str] = None
    accentRegion: Optional[str] = None
class ContinuityMarkers(BaseModel):
    """Free-text continuity markers for a single segment.

    Nested under ``SegmentInfo.continuity_markers``. Every field is a required
    string; the content is produced by the LLM and is not validated beyond type.
    """
    start_position: str
    end_position: str
    start_expression: str
    end_expression: str
    start_gesture: str
    end_gesture: str
    location_status: str
class SegmentInfo(BaseModel):
    """Bookkeeping fields that identify a segment within the full set."""
    segment_number: int
    total_segments: int  # the prompt instructs the LLM to set this to the number of script segments
    duration: str  # the prompt pins this to one fixed time-range string for every segment
    location: str
    continuity_markers: ContinuityMarkers
class CharacterDescription(BaseModel):
    """Per-segment character description.

    The word minimums noted below are requested in the prompt text; the model
    itself only enforces that each field is a string.
    """
    current_state: str  # 100+ words, segment-specific
    voice_matching: str  # 100+ words, segment-specific
class SynchronizedActions(BaseModel):
    """Actions for each two-second window of an eight-second segment.

    The serialized JSON keys are time ranges such as ``"0:00-0:02"``, which are
    not valid Python identifiers, so each field carries an alias. With
    ``populate_by_name`` enabled the model can be built from either the alias
    or the Python field name.
    """
    # Pydantic v2 configuration. This replaces the nested `class Config`, which
    # v2 still accepts but reports as deprecated.
    model_config = {"populate_by_name": True}

    # Use legal Python identifiers; map to exact JSON keys with aliases
    f0000_0002: str = Field(alias="0:00-0:02")
    f0002_0004: str = Field(alias="0:02-0:04")
    f0004_0006: str = Field(alias="0:04-0:06")
    f0006_0008: str = Field(alias="0:06-0:08")
class ActionTimeline(BaseModel):
    """Dialogue and time-aligned performance notes for one segment."""
    dialogue: str  # the prompt asks for dialogue that fits the segment naturally, with breath points
    synchronized_actions: SynchronizedActions  # one entry per two-second window
    micro_expressions: str  # 50+ words
    breathing_rhythm: str
    location_transition: str
    continuity_checkpoint: str  # the prompt requests this when continuationMode is true
class SceneContinuity(BaseModel):
    """Scene, camera and lighting description for one segment.

    The word minimums noted below are requested in the prompt text; the model
    itself only enforces that each field is a string.
    """
    environment: str  # 250+ words
    camera_position: str  # 75+ words
    camera_movement: str  # detailed movement path
    lighting_state: str  # 50+ words
    background_elements: str  # 50+ words
    spatial_relationships: str
class Segment(BaseModel):
    """One complete segment: bookkeeping, character, scene and action timeline."""
    segment_info: SegmentInfo
    character_description: CharacterDescription
    scene_continuity: SceneContinuity
    action_timeline: ActionTimeline
class SegmentsPayload(BaseModel):
    """Top-level response schema: the list of generated segments.

    Passed as ``response_format`` to the OpenAI parse call in
    ``generate_segments_payload``.
    """
    segments: List[Segment]
def split_script_into_segments(script: str, seconds_per_segment: int = 8, words_per_second: float = 2.2) -> List[str]:
    """
    Group whole sentences into buckets of roughly
    ``seconds_per_segment * words_per_second`` words each.

    Sentences are detected by whitespace following ``.``, ``!`` or ``?`` and
    are never split: a sentence that alone exceeds the budget becomes its own
    (oversized) segment. The word budget is never lower than 14. When the
    script contains no sentences at all, the stripped script is returned as
    the single segment.
    """
    cleaned = script.strip()
    word_budget = max(14, int(seconds_per_segment * words_per_second))

    buckets: List[List[str]] = []
    used = 0  # words already placed in the last bucket
    for piece in re.split(r'(?<=[.!?])\s+', cleaned):
        sentence = piece.strip()
        if not sentence:
            continue
        n_words = len(sentence.split())
        if buckets and used + n_words <= word_budget:
            # Still fits: extend the bucket currently being filled.
            buckets[-1].append(sentence)
            used += n_words
        else:
            # First sentence, or the budget would overflow: open a new bucket.
            buckets.append([sentence])
            used = n_words

    if not buckets:
        return [cleaned]
    return [" ".join(bucket) for bucket in buckets]
def build_prompt(inputs: VeoInputs, segment_texts: List[str]) -> str:
    """Assemble the user prompt sent to the LLM for segment generation.

    The prompt consists of a header (role, hard rules, the full script and a
    dump of all input settings), one ``- Segment i: text`` line per entry of
    ``segment_texts``, and a footer describing the expected JSON shape.

    Args:
        inputs: Request settings; ``inputs.script`` is embedded verbatim and
            ``inputs.model_dump()`` is rendered as the settings section.
        segment_texts: Script text for each segment, in order.

    Returns:
        The complete prompt string.
    """
    N = len(segment_texts)
    knobs = inputs.model_dump()
    # Every rule below describes an 8-second segment so that the prompt agrees
    # with itself and with the SynchronizedActions schema, which only has the
    # four windows 0:00-0:08. A fifth "0:08-0:10" key cannot be represented.
    header = f"""
You are a senior performance-marketing video director who writes segment-accurate, production-grade JSON prompts for Veo 3.
Return ONLY JSON that parses into the provided schema. Do not add fields. No markdown.
Task: Build prompts for exactly {N} segments of 8 seconds each.
Hard rules for EVERY segment:
- "duration" MUST be "00:00-00:08"
- "current_state" = 100+ words, segment-specific
- "voice_matching" = 100+ words, segment-specific
- "environment" = 250+ words; "camera_position" = 75+ words; "lighting_state" = 50+ words min
- "camera_movement" = concrete, timestamped path (pan/tilt/dolly/handheld/steadicam)
- "synchronized_actions" must have exactly these keys: "0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08"
- Dialogue must fit in 8s naturally with breath points.
- If continuationMode is true, include a continuity checkpoint aligning next segment’s start.
- Set "segment_info.total_segments" = {N} on each segment.
- Based on the character image provide select everything as asked.
FULL SCRIPT:
\"\"\"{inputs.script.strip()}\"\"\"
AUTHORITATIVE SETTINGS (must be reflected):
{knobs}
SEGMENT LINES (cover in exactly 8 seconds each):
"""
    seg_lines = "\n".join([f"- Segment {i+1}: {t}" for i, t in enumerate(segment_texts)])
    footer = """
OUTPUT:
Return JSON only as:
{
"segments": [ { ... per-segment object exactly matching the schema ... } ]
}
"""
    return header + seg_lines + footer
# ---------- Validator (segment count, durations, keys, word counts, uniformity) ----------
# Minimum word counts per (section, field) of a segment. The keys mirror the
# string fields that carry a word-count note on the Segment schema; entries
# for fields the schema does not define (physical, clothing, props_in_frame)
# were dropped because no schema-conformant payload could ever satisfy them.
MIN_WORDS = {
    ("character_description", "current_state"): 100,
    ("character_description", "voice_matching"): 100,
    ("scene_continuity", "environment"): 250,
    ("scene_continuity", "camera_position"): 75,
    ("scene_continuity", "lighting_state"): 50,
    ("scene_continuity", "background_elements"): 50,
    ("action_timeline", "micro_expressions"): 50,
}


def _word_count(text: str) -> int:
    """Return the number of word tokens in ``text``; ``None`` or empty counts as 0."""
    return len(re.findall(r"\b\w+\b", text or ""))


def validate_segments_payload(payload: Dict[str, Any], expected_segments: int) -> List[str]:
    """Check a dumped segments payload against the generation rules.

    Checks performed: segment count, the fixed 8-second ``duration`` string,
    ``total_segments``, the exact set of ``synchronized_actions`` keys, the
    minimum word counts listed in ``MIN_WORDS``, and that any
    ``character_description.physical`` / ``clothing`` values are identical
    across segments.

    Args:
        payload: Dict with a ``"segments"`` list of per-segment dicts keyed by
            their JSON (alias) names. Missing or ``None`` sections are treated
            as empty.
        expected_segments: Number of segments the payload must contain.

    Returns:
        A list of human-readable error messages; empty when the payload passes.
    """
    errors: List[str] = []
    # `or []` also covers an explicit `"segments": None`.
    segs = payload.get("segments") or []
    if len(segs) != expected_segments:
        errors.append(f"Expected {expected_segments} segments, got {len(segs)}.")

    # Segments are 8 seconds long: four two-second windows, matching the
    # aliases declared on SynchronizedActions.
    expected_duration = "00:00-00:08"
    required_sync_keys = {"0:00-0:02", "0:02-0:04", "0:04-0:06", "0:06-0:08"}
    physical_blocks, clothing_blocks = [], []

    for i, seg in enumerate(segs, start=1):
        si = seg.get("segment_info") or {}
        if si.get("duration") != expected_duration:
            errors.append(f"Segment {i}: duration must be {expected_duration}.")
        if si.get("total_segments") != expected_segments:
            errors.append(f"Segment {i}: total_segments should be {expected_segments}, got {si.get('total_segments')}.")

        sync = (seg.get("action_timeline") or {}).get("synchronized_actions") or {}
        if set(sync) != required_sync_keys:
            errors.append(f"Segment {i}: synchronized_actions must have keys {sorted(required_sync_keys)}.")

        # Word-count checks
        for (section, field), minw in MIN_WORDS.items():
            text = (seg.get(section) or {}).get(field, "")
            wc = _word_count(text)
            if wc < minw:
                errors.append(f"Segment {i}: {section}.{field} must be >= {minw} words (got {wc}).")

        ch = seg.get("character_description") or {}
        physical_blocks.append(ch.get("physical", ""))
        clothing_blocks.append(ch.get("clothing", ""))

    # Uniformity across segments. `physical` and `clothing` are not fields of
    # CharacterDescription, so for schema-conformant payloads every value is ""
    # and this check passes; it still guards payloads that carry those keys.
    if expected_segments > 1:
        if len(set(physical_blocks)) > 1:
            errors.append("`character_description.physical` must be EXACTLY identical across all segments.")
        if len(set(clothing_blocks)) > 1:
            errors.append("`character_description.clothing` must be EXACTLY identical across all segments.")
    return errors
def _image_data_url(image) -> str:
    """Return a base64 ``data:`` URL for ``image``.

    ``image`` may be a filesystem path (read in binary mode) or raw image
    bytes. The MIME type is guessed from the path's extension and falls back
    to ``image/jpeg``, which is also used for raw bytes.
    """
    import mimetypes  # function-scope import: not among the module's top-level imports

    if isinstance(image, (bytes, bytearray)):
        raw, mime = bytes(image), "image/jpeg"
    else:
        with open(image, "rb") as fh:
            raw = fh.read()
        guessed, _ = mimetypes.guess_type(str(image))
        mime = guessed if guessed and guessed.startswith("image/") else "image/jpeg"
    encoded = base64.b64encode(raw).decode("utf-8")
    return f"data:{mime};base64,{encoded}"


def generate_segments_payload(
    inputs: VeoInputs,
    image_path: Optional[str] = None,
    model: str = "gpt-4o",
) -> Dict[str, Any]:
    """Generate the per-segment prompt payload for a script via the OpenAI API.

    The script is split into 8-second segments, a prompt is built for them,
    and the model is asked for a response parsed into ``SegmentsPayload``.

    Args:
        inputs: Request settings, including the script.
        image_path: Path to the character image. Raw image bytes are accepted
            as well. When ``None`` the request is sent without an image.
        model: OpenAI model name to use.

    Returns:
        The parsed payload dumped to a dict using the JSON alias keys.

    Raises:
        OSError: If ``image_path`` cannot be read.
        RuntimeError: If the model returned no parsed content (for example a
            refusal).
    """
    segment_texts = split_script_into_segments(inputs.script, seconds_per_segment=8)
    user_prompt = build_prompt(inputs, segment_texts)

    user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_prompt}]
    if image_path is not None:
        # b64encode needs bytes, so the file is read first rather than
        # encoding the path string itself.
        user_content.append(
            {"type": "image_url", "image_url": {"url": _image_data_url(image_path)}}
        )

    completion = gpt_client.beta.chat.completions.parse(
        model=model,
        response_format=SegmentsPayload,
        messages=[
            {"role": "system", "content": "You are a precise JSON-only generator that must satisfy a strict schema and explicit segment count."},
            {"role": "user", "content": user_content},
        ],
    )
    message = completion.choices[0].message
    parsed_obj = message.parsed
    if parsed_obj is None:
        # `parsed` is None when the model produced no schema-conformant content.
        refusal = getattr(message, "refusal", None)
        raise RuntimeError(f"Model returned no parsed segments payload (refusal: {refusal!r}).")
    return parsed_obj.model_dump(by_alias=True)