Spaces:
Sleeping
Sleeping
| from typing import List, Optional, Dict, Any | |
| from pydantic import BaseModel, Field | |
| from openai import OpenAI | |
| import os | |
| import re | |
| from dotenv import load_dotenv | |
| import base64 | |
# Load variables from a .env file (if one is found) into the process
# environment so the API-key lookup below can see them.
load_dotenv()
# Module-level OpenAI client shared by the functions in this file. The key is
# read from OPENAI_API_KEY; os.getenv returns None when the variable is unset.
gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# User-supplied settings for one prompt-generation request.
# build_prompt() dumps every field of this model into the prompt text as the
# "AUTHORITATIVE SETTINGS" block, so each value is shown to the LLM verbatim.
class VeoInputs(BaseModel):
    # Full script text: split into per-segment lines by
    # split_script_into_segments() and also embedded whole in the prompt.
    script: str
    style: str
    jsonFormat: str = 'standard'
    # Referenced by the prompt rule that asks for a continuity checkpoint.
    continuationMode: bool = True
    # The remaining knobs are optional; in the code visible here they are only
    # forwarded to the LLM through the settings dump.
    voiceType: Optional[str] = None
    energyLevel: Optional[str] = None
    settingMode: str = 'single'
    cameraStyle: Optional[str] = None
    energyArc: Optional[str] = None
    narrativeStyle: Optional[str] = None
    accentRegion: Optional[str] = None
# Start/end state markers for a single segment; every field is a required
# free-text string filled in by the LLM.
# NOTE(review): presumably the end_* values of one segment are meant to line up
# with the start_* values of the next one — confirm against the consumer.
class ContinuityMarkers(BaseModel):
    start_position: str
    end_position: str
    start_expression: str
    end_expression: str
    start_gesture: str
    end_gesture: str
    location_status: str
# Bookkeeping for one segment of the generated payload.
class SegmentInfo(BaseModel):
    segment_number: int
    # The prompt instructs the LLM to set this to the number of segments, and
    # validate_segments_payload() checks it against the expected count.
    total_segments: int
    # Time-range string; validate_segments_payload() compares it to a fixed literal.
    duration: str
    location: str
    continuity_markers: ContinuityMarkers
# Per-segment description of the on-screen character. Both fields are long-form
# text; the minimum lengths noted below are requested in the prompt and listed
# in MIN_WORDS.
class CharacterDescription(BaseModel):
    current_state: str  # 100+ words, segment-specific
    voice_matching: str  # 100+ words, segment-specific
# Actions for each 2-second slice of a segment (0:00 through 0:08).
# The JSON keys ("0:00-0:02", ...) are not legal Python identifiers, so each
# field has an identifier-safe name and is mapped to its exact JSON key through
# an alias. Dumping with by_alias=True yields the time-range keys.
class SynchronizedActions(BaseModel):
    # Pydantic v2 configuration. The nested `class Config` form is deprecated
    # in v2 and emits a deprecation warning; `model_config` is the supported
    # equivalent. populate_by_name lets callers construct the model with either
    # the field name or the alias.
    model_config = {"populate_by_name": True}

    # Use legal Python identifiers; map to exact JSON keys with aliases
    f0000_0002: str = Field(alias="0:00-0:02")
    f0002_0004: str = Field(alias="0:02-0:04")
    f0004_0006: str = Field(alias="0:04-0:06")
    f0006_0008: str = Field(alias="0:06-0:08")
# What is said and done during one segment.
class ActionTimeline(BaseModel):
    dialogue: str
    # Per-time-slice actions keyed by time range (see SynchronizedActions).
    synchronized_actions: SynchronizedActions
    micro_expressions: str  # 50+ words
    breathing_rhythm: str
    location_transition: str
    # The prompt asks for this checkpoint to align with the next segment's
    # start when continuationMode is true.
    continuity_checkpoint: str
# Scene, camera and lighting description for one segment. The word minimums
# noted below are the ones requested in the prompt and/or listed in MIN_WORDS.
class SceneContinuity(BaseModel):
    environment: str  # 250+ words
    camera_position: str  # 75+ words
    camera_movement: str  # detailed movement path
    lighting_state: str  # 50+ words
    background_elements: str  # 50+ words
    spatial_relationships: str
# One complete segment: bookkeeping, character, scene and action timeline.
# The four attribute names are also the section keys that
# validate_segments_payload() looks up in the dumped payload.
class Segment(BaseModel):
    segment_info: SegmentInfo
    character_description: CharacterDescription
    scene_continuity: SceneContinuity
    action_timeline: ActionTimeline
# Top-level response schema: passed as `response_format` to the OpenAI parse
# call in generate_segments_payload(), so the LLM reply is parsed into it.
class SegmentsPayload(BaseModel):
    segments: List[Segment]
def split_script_into_segments(script: str, seconds_per_segment: int = 8, words_per_second: float = 2.2) -> List[str]:
    """
    Group the script's sentences into chunks sized for one video segment each.

    The script is split after sentence-ending punctuation (., !, ?). Sentences
    are appended to the current chunk until adding the next one would exceed
    the word budget, ``max(14, int(seconds_per_segment * words_per_second))``;
    the chunk is then closed and a new one started. Sentences are never split,
    so a single sentence longer than the budget forms its own oversized chunk.

    Returns the chunks in script order. When the script yields no sentences at
    all, a one-element list holding the stripped script is returned.
    """
    cleaned = script.strip()
    # Floor of 14 words keeps very small budgets from producing tiny chunks.
    word_budget = max(14, int(seconds_per_segment * words_per_second))

    chunks: List[str] = []
    pending: List[str] = []
    pending_words = 0
    for piece in re.split(r'(?<=[.!?])\s+', cleaned):
        sentence = piece.strip()
        if not sentence:
            continue
        sentence_words = len(sentence.split())
        # Close the current chunk first if this sentence would overflow it.
        if pending and pending_words + sentence_words > word_budget:
            chunks.append(" ".join(pending))
            pending = []
            pending_words = 0
        pending.append(sentence)
        pending_words += sentence_words

    if pending:
        chunks.append(" ".join(pending))
    return chunks if chunks else [cleaned]
def build_prompt(inputs: VeoInputs, segment_texts: List[str]) -> str:
    """Assemble the user prompt sent to the LLM for segment generation.

    The prompt consists of a header (role, hard rules, the full script and a
    dump of every setting in ``inputs``), one ``- Segment i: text`` line per
    entry of ``segment_texts``, and a footer describing the JSON envelope.

    The hard rules are kept consistent with the response schema: segments are
    8 seconds long, so the duration literal is "00:00-00:08", dialogue must fit
    in 8s, and ``synchronized_actions`` lists exactly the four 2-second keys
    that the schema defines. Asking for a fifth "0:08-0:10" key would
    contradict the schema the reply is parsed into.

    Args:
        inputs: Request settings; ``inputs.script`` is embedded verbatim and
            ``inputs.model_dump()`` is rendered as the settings block.
        segment_texts: Script text for each segment, in order.

    Returns:
        The complete prompt string.
    """
    N = len(segment_texts)
    knobs = inputs.model_dump()
    header = f"""
You are a senior performance-marketing video director who writes segment-accurate, production-grade JSON prompts for Veo 3.
Return ONLY JSON that parses into the provided schema. Do not add fields. No markdown.
Task: Build prompts for exactly {N} segments of 8 seconds each.
Hard rules for EVERY segment:
- "duration" MUST be "00:00-00:08"
- "current_state" = 100+ words, segment-specific
- "voice_matching" = 100+ words, segment-specific
- "environment" = 250+ words; "camera_position" = 75+ words; "lighting_state" = 50+ words min
- "camera_movement" = concrete, timestamped path (pan/tilt/dolly/handheld/steadicam)
- "synchronized_actions" must have exactly these keys: "0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08"
- Dialogue must fit in 8s naturally with breath points.
- If continuationMode is true, include a continuity checkpoint aligning next segment’s start.
- Set "segment_info.total_segments" = {N} on each segment.
- Based on the character image provided, select everything as asked.
FULL SCRIPT:
\"\"\"{inputs.script.strip()}\"\"\"
AUTHORITATIVE SETTINGS (must be reflected):
{knobs}
SEGMENT LINES (cover in exactly 8 seconds each):
"""
    seg_lines = "\n".join(f"- Segment {i}: {t}" for i, t in enumerate(segment_texts, start=1))
    # Plain (non-f) string: the braces below are literal JSON braces.
    footer = """
OUTPUT:
Return JSON only as:
{
"segments": [ { ... per-segment object exactly matching the schema ... } ]
}
"""
    return header + seg_lines + footer
| # ---------- Validator (segment count, durations, keys, word counts, uniformity) ---------- | |
| MIN_WORDS = { | |
| ("character_description", "physical"): 200, | |
| ("character_description", "clothing"): 150, | |
| ("character_description", "current_state"): 100, | |
| ("character_description", "voice_matching"): 100, | |
| ("scene_continuity", "environment"): 250, | |
| ("scene_continuity", "camera_position"): 75, | |
| ("scene_continuity", "lighting_state"): 50, | |
| ("scene_continuity", "props_in_frame"): 75, | |
| ("scene_continuity", "background_elements"): 50, | |
| ("action_timeline", "micro_expressions"): 50, | |
| } | |
| def _word_count(text: str) -> int: | |
| return len(re.findall(r"\b\w+\b", text or "")) | |
| def validate_segments_payload(payload: Dict[str, Any], expected_segments: int) -> List[str]: | |
| errors: List[str] = [] | |
| segs = payload.get("segments", []) | |
| if len(segs) != expected_segments: | |
| errors.append(f"Expected {expected_segments} segments, got {len(segs)}.") | |
| required_sync_keys = {"0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08", "0:08-0:10"} | |
| physical_blocks, clothing_blocks = [], [] | |
| for i, seg in enumerate(segs, start=1): | |
| si = seg.get("segment_info", {}) | |
| if si.get("duration") != "00:00-00:10": | |
| errors.append(f"Segment {i}: duration must be 00:00-00:10.") | |
| if si.get("total_segments") != expected_segments: | |
| errors.append(f"Segment {i}: total_segments should be {expected_segments}, got {si.get('total_segments')}.") | |
| sync = seg.get("action_timeline", {}).get("synchronized_actions", {}) | |
| if set(sync.keys()) != required_sync_keys: | |
| errors.append(f"Segment {i}: synchronized_actions must have keys {sorted(required_sync_keys)}.") | |
| # Word-count checks | |
| for (section, field), minw in MIN_WORDS.items(): | |
| text = seg.get(section, {}).get(field, "") | |
| wc = _word_count(text) | |
| if wc < minw: | |
| errors.append(f"Segment {i}: {section}.{field} must be >= {minw} words (got {wc}).") | |
| ch = seg.get("character_description", {}) | |
| physical_blocks.append(ch.get("physical", "")) | |
| clothing_blocks.append(ch.get("clothing", "")) | |
| # Uniformity across segments | |
| if expected_segments > 1: | |
| if len(set(physical_blocks)) > 1: | |
| errors.append("`character_description.physical` must be EXACTLY identical across all segments.") | |
| if len(set(clothing_blocks)) > 1: | |
| errors.append("`character_description.clothing` must be EXACTLY identical across all segments.") | |
| return errors | |
def generate_segments_payload(
    inputs: VeoInputs,
    image_path: Optional[str] = None,
    model: str = "gpt-4o",
) -> Dict[str, Any]:
    """Generate the per-segment prompt payload for a script via the OpenAI API.

    The script is split into 8-second segments, a prompt is built for them,
    and the model is asked for a structured reply that is parsed into
    ``SegmentsPayload``.

    Args:
        inputs: Request settings, including the script to segment.
        image_path: Path of the character reference image to attach. The file
            is read and sent as a base64 data URL. Raw image bytes are also
            accepted for backward compatibility. When ``None``, the request is
            sent without an image.
        model: OpenAI model name to use.

    Returns:
        The parsed payload dumped to a plain dict, using the schema's aliases
        as keys (e.g. "0:00-0:02").

    Raises:
        OSError: If ``image_path`` cannot be opened or read.
        RuntimeError: If the model returns no parsed payload (e.g. a refusal).
    """
    import mimetypes  # local import: only needed to label an attached image

    segment_texts = split_script_into_segments(inputs.script, seconds_per_segment=8)
    user_prompt = build_prompt(inputs, segment_texts)

    user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_prompt}]
    if image_path is not None:
        if isinstance(image_path, (bytes, bytearray, memoryview)):
            # Caller handed over the image data itself.
            image_bytes = bytes(image_path)
            mime_type = None
        else:
            # b64encode needs the file's bytes, not the path string.
            with open(image_path, "rb") as image_file:
                image_bytes = image_file.read()
            mime_type, _ = mimetypes.guess_type(os.fspath(image_path))
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"  # previous hard-coded default
        encoded_image = base64.b64encode(image_bytes).decode("utf-8")
        user_content.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{encoded_image}"},
            }
        )

    completion = gpt_client.beta.chat.completions.parse(
        model=model,
        response_format=SegmentsPayload,
        messages=[
            {"role": "system", "content": "You are a precise JSON-only generator that must satisfy a strict schema and explicit segment count."},
            {"role": "user", "content": user_content},
        ],
    )
    message = completion.choices[0].message
    parsed_obj = message.parsed
    if parsed_obj is None:
        # `parsed` is empty when the model refuses or the reply cannot be parsed.
        refusal = getattr(message, "refusal", None)
        raise RuntimeError(f"Model returned no parsed segments payload (refusal: {refusal!r}).")
    return parsed_obj.model_dump(by_alias=True)