Spaces:

userIdc2024
/

Video-Generator-Tools

Sleeping

File size: 8,495 Bytes

d856b59

from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field
from openai import OpenAI
import os
import re
from dotenv import load_dotenv
import base64

# Run python-dotenv's loader before the API key is read from the environment
# on the next statement (presumably so a local .env file can supply it).
load_dotenv()

# Module-level OpenAI client shared by every call in this file.
# NOTE(review): os.getenv returns None when OPENAI_API_KEY is unset — confirm
# how the OpenAI client behaves in that case for this deployment.
gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

class VeoInputs(BaseModel):
    # Request parameters for prompt generation.
    # `script` is split into per-segment lines and embedded in the prompt; the
    # remaining knobs are forwarded to the model as prompt settings.
    # Field order is significant: it is the order the settings are dumped in.
    script: str                             # full voice-over / dialogue text
    style: str
    jsonFormat: str = 'standard'
    continuationMode: bool = True           # prompt asks for a continuity checkpoint when true
    # Optional knobs below default to None (unspecified).
    # NOTE(review): accepted values are not constrained here — confirm with the caller/UI.
    voiceType: Optional[str] = None
    energyLevel: Optional[str] = None
    settingMode: str = 'single'
    cameraStyle: Optional[str] = None
    energyArc: Optional[str] = None
    narrativeStyle: Optional[str] = None
    accentRegion: Optional[str] = None

class ContinuityMarkers(BaseModel):
    # Start/end markers stored on each segment's `segment_info`.
    # All values are free-form strings written by the model; presumably they
    # describe the character's pose, expression and gesture at the segment
    # boundaries so adjacent segments can be lined up — TODO confirm.
    start_position: str
    end_position: str
    start_expression: str
    end_expression: str
    start_gesture: str
    end_gesture: str
    location_status: str

class SegmentInfo(BaseModel):
    # Bookkeeping for one segment of the generated payload.
    segment_number: int
    total_segments: int     # the prompt instructs the model to set this to the segment count on every segment
    duration: str           # time span as text; the prompt dictates the exact string
    location: str
    continuity_markers: ContinuityMarkers

class CharacterDescription(BaseModel):
    # Per-segment character description. The word minimums below are targets
    # stated in the prompt; the field types themselves only require a string.
    current_state: str     # 100+ words, segment-specific
    voice_matching: str    # 100+ words, segment-specific

class SynchronizedActions(BaseModel):
    # One action description per 2-second window of an 8-second segment.
    # The JSON keys ("0:00-0:02", ...) are not legal Python identifiers, so each
    # field uses an identifier-safe name and is mapped to its exact JSON key
    # through an alias.
    f0000_0002: str = Field(alias="0:00-0:02")
    f0002_0004: str = Field(alias="0:02-0:04")
    f0004_0006: str = Field(alias="0:04-0:06")
    f0006_0008: str = Field(alias="0:06-0:08")

    # Allow population by field name as well as by alias.
    # Pydantic v2 style configuration: the inner `class Config` form is
    # deprecated in v2 (which this file already relies on via `model_dump`).
    # A plain dict is used so no additional import is required.
    model_config = {"populate_by_name": True}

class ActionTimeline(BaseModel):
    # What is said and done during one segment.
    dialogue: str                               # spoken line(s) for this segment
    synchronized_actions: SynchronizedActions   # actions keyed by 2-second window
    micro_expressions: str   # 50+ words
    breathing_rhythm: str
    location_transition: str
    continuity_checkpoint: str                  # the prompt requests this when continuationMode is true

class SceneContinuity(BaseModel):
    # Scene and camera description for one segment. The word minimums are
    # targets communicated to the model; the types only require strings.
    environment: str           # 250+ words
    camera_position: str       # 75+ words
    camera_movement: str       # detailed movement path
    lighting_state: str        # 50+ words
    background_elements: str   # 50+ words
    spatial_relationships: str

class Segment(BaseModel):
    # One complete segment: bookkeeping, character, scene and timeline sections.
    segment_info: SegmentInfo
    character_description: CharacterDescription
    scene_continuity: SceneContinuity
    action_timeline: ActionTimeline

class SegmentsPayload(BaseModel):
    # Top-level response object: passed as `response_format` to the OpenAI
    # parse call and dumped (by alias) into the dict returned to callers.
    segments: List[Segment]

def split_script_into_segments(script: str, seconds_per_segment: int = 8, words_per_second: float = 2.2) -> List[str]:
    """
    Greedily group whole sentences into segments of roughly
    ``seconds_per_segment * words_per_second`` words (never fewer than 14).

    Sentences are detected at whitespace following ``.``, ``!`` or ``?`` and are
    never split; a sentence that would push the running word count past the
    budget starts a new segment. A single over-long sentence therefore forms
    its own (over-budget) segment. When no sentence is found, the stripped
    script is returned as the only segment.
    """
    cleaned = script.strip()
    word_budget = max(14, int(seconds_per_segment * words_per_second))  # minimal guard

    buckets: List[List[str]] = []
    words_in_bucket = 0
    for piece in re.split(r'(?<=[.!?])\s+', cleaned):
        sentence = piece.strip()
        if not sentence:
            continue
        n_words = len(sentence.split())
        if not buckets or words_in_bucket + n_words > word_budget:
            # First sentence, or the open bucket would overflow: open a new one.
            buckets.append([sentence])
            words_in_bucket = n_words
        else:
            buckets[-1].append(sentence)
            words_in_bucket += n_words

    if not buckets:
        return [cleaned]
    return [" ".join(bucket) for bucket in buckets]

def build_prompt(inputs: VeoInputs, segment_texts: List[str]) -> str:
    """
    Build the user prompt asking the model for one JSON object per segment.

    Args:
        inputs: The generation request; its script is embedded verbatim and its
            other fields are listed as authoritative settings.
        segment_texts: The script already split into per-segment lines; the
            number of entries is the number of segments requested.

    Returns:
        The complete prompt text (rules, full script, settings, one line per
        segment, and the expected output envelope).
    """
    segment_count = len(segment_texts)
    # The script is already included verbatim below, so it is left out of the
    # settings dump instead of being sent a second time in repr() form.
    knobs = inputs.model_dump(exclude={"script"})
    # All timing rules describe the same 8-second segment and the four 2-second
    # windows that the response schema defines for synchronized_actions.
    header = f"""
You are a senior performance-marketing video director who writes segment-accurate, production-grade JSON prompts for Veo 3.
Return ONLY JSON that parses into the provided schema. Do not add fields. No markdown.

Task: Build prompts for exactly {segment_count} segments of 8 seconds each.
Hard rules for EVERY segment:
- "duration" MUST be "00:00-00:08"
- "current_state" = 100+ words, segment-specific
- "voice_matching" = 100+ words, segment-specific
- "environment" = 250+ words; "camera_position" = 75+ words; "lighting_state" = 50+ words min
- "camera_movement" = concrete, timestamped path (pan/tilt/dolly/handheld/steadicam)
- "synchronized_actions" must have exactly these keys: "0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08"
- Dialogue must fit in 8s naturally with breath points.
- If continuationMode is true, include a continuity checkpoint aligning next segment’s start.
- Set "segment_info.total_segments" = {segment_count} on each segment.
- Based on the character image provided, select everything as asked.

FULL SCRIPT:
\"\"\"{inputs.script.strip()}\"\"\"

AUTHORITATIVE SETTINGS (must be reflected):
{knobs}

SEGMENT LINES (cover in exactly 8 seconds each):
"""
    seg_lines = "\n".join(f"- Segment {i}: {t}" for i, t in enumerate(segment_texts, start=1))

    footer = """
OUTPUT:
Return JSON only as:
{
  "segments": [ { ... per-segment object exactly matching the schema ... } ]
}
"""
    return header + seg_lines + footer


# ---------- Validator (segment count, durations, keys, word counts, uniformity) ----------

# Minimum word counts per (section, field) of a segment.
# Only fields that exist in the response models are listed: a threshold for a
# field the schema cannot contain would count as 0 words and be reported as an
# error on every segment.
MIN_WORDS = {
    ("character_description", "current_state"): 100,
    ("character_description", "voice_matching"): 100,
    ("scene_continuity", "environment"): 250,
    ("scene_continuity", "camera_position"): 75,
    ("scene_continuity", "lighting_state"): 50,
    ("scene_continuity", "background_elements"): 50,
    ("action_timeline", "micro_expressions"): 50,
}

def _word_count(text: str) -> int:
    """Return the number of ``\\w+`` runs in *text*; falsy input counts as 0."""
    if not text:
        return 0
    return sum(1 for _ in re.finditer(r"\b\w+\b", text))

def _is_zero_to_n_span(duration: Any, seconds: int) -> bool:
    """Return True when *duration* is an "M:SS-M:SS" span running from 0 to *seconds*."""
    if not isinstance(duration, str):
        return False
    match = re.fullmatch(r"\s*(\d+):(\d+)\s*-\s*(\d+):(\d+)\s*", duration)
    if match is None:
        return False
    start_min, start_sec, end_min, end_sec = (int(part) for part in match.groups())
    return start_min * 60 + start_sec == 0 and end_min * 60 + end_sec == seconds


def validate_segments_payload(payload: Dict[str, Any], expected_segments: int) -> List[str]:
    """
    Check a dumped segments payload and return human-readable problems.

    Checks performed:
      * the number of segments equals ``expected_segments``;
      * each segment's duration spans 0 to 8 seconds (zero-padding is not
        significant, so "00:00-00:08" and "00:00-00:8" both pass);
      * ``total_segments`` equals ``expected_segments``;
      * ``synchronized_actions`` has exactly the four 2-second window keys;
      * every (section, field) in ``MIN_WORDS`` meets its minimum word count
        (a missing section or field counts as 0 words);
      * ``physical`` / ``clothing`` character blocks, when present, are
        identical across segments.

    Args:
        payload: Dict with a ``"segments"`` list, keyed by JSON alias names.
        expected_segments: Number of segments the payload must contain.

    Returns:
        A list of error messages; empty when the payload passes every check.
    """
    errors: List[str] = []
    # `or` guards treat an explicit None the same as a missing key, so a
    # malformed payload yields error messages rather than an AttributeError.
    segs = payload.get("segments") or []
    if len(segs) != expected_segments:
        errors.append(f"Expected {expected_segments} segments, got {len(segs)}.")

    # Must match the aliases of SynchronizedActions: four windows covering 8 s.
    required_sync_keys = {"0:00-0:02", "0:02-0:04", "0:04-0:06", "0:06-0:08"}
    physical_blocks, clothing_blocks = [], []

    for i, seg in enumerate(segs, start=1):
        si = seg.get("segment_info") or {}
        if not _is_zero_to_n_span(si.get("duration"), 8):
            errors.append(f"Segment {i}: duration must be 00:00-00:08.")
        if si.get("total_segments") != expected_segments:
            errors.append(f"Segment {i}: total_segments should be {expected_segments}, got {si.get('total_segments')}.")

        sync = (seg.get("action_timeline") or {}).get("synchronized_actions") or {}
        if set(sync) != required_sync_keys:
            errors.append(f"Segment {i}: synchronized_actions must have keys {sorted(required_sync_keys)}.")

        # Word-count checks
        for (section, field), minw in MIN_WORDS.items():
            text = (seg.get(section) or {}).get(field, "")
            wc = _word_count(text)
            if wc < minw:
                errors.append(f"Segment {i}: {section}.{field} must be >= {minw} words (got {wc}).")

        ch = seg.get("character_description") or {}
        physical_blocks.append(ch.get("physical", ""))
        clothing_blocks.append(ch.get("clothing", ""))

    # Uniformity across segments
    if expected_segments > 1:
        if len(set(physical_blocks)) > 1:
            errors.append("`character_description.physical` must be EXACTLY identical across all segments.")
        if len(set(clothing_blocks)) > 1:
            errors.append("`character_description.clothing` must be EXACTLY identical across all segments.")

    return errors

def _image_data_url(image: Any) -> Optional[str]:
    """Return *image* (a file path or raw bytes) as a base64 data URL, or None if empty."""
    if not image:
        return None
    if isinstance(image, (bytes, bytearray, memoryview)):
        # Raw image bytes: no filename to inspect, so keep the JPEG label.
        raw, mime = bytes(image), "image/jpeg"
    else:
        import mimetypes  # local import: only needed for path inputs

        path = os.fspath(image)
        with open(path, "rb") as fh:
            raw = fh.read()
        mime = mimetypes.guess_type(path)[0] or ""
        if not mime.startswith("image/"):
            mime = "image/jpeg"
    return f"data:{mime};base64,{base64.b64encode(raw).decode('utf-8')}"


def generate_segments_payload(
    inputs: VeoInputs,
    image_path: Optional[str] = None,
    model: str = "gpt-4o",
) -> Dict[str, Any]:
    """
    Generate the per-segment prompt payload for a script via the OpenAI API.

    The script is split into 8-second segments, a prompt is built from them,
    and the model is asked for a response matching ``SegmentsPayload``.

    Args:
        inputs: Script and style settings for the generation.
        image_path: Path to the character reference image. Raw image bytes are
            also accepted, since that is the only input the previous
            implementation could encode. When None or empty, the request is
            sent without an image.
        model: Name of the OpenAI model to call.

    Returns:
        The parsed payload as a dict, with fields keyed by their JSON aliases.

    Raises:
        OSError: If ``image_path`` is a path that cannot be read.
        RuntimeError: If the model returns no parsable payload (e.g. a refusal).
    """
    segment_texts = split_script_into_segments(inputs.script, seconds_per_segment=8)
    user_prompt = build_prompt(inputs, segment_texts)

    user_content: List[Dict[str, Any]] = [{"type": "text", "text": user_prompt}]
    image_url = _image_data_url(image_path)
    if image_url is not None:
        user_content.append({"type": "image_url", "image_url": {"url": image_url}})

    completion = gpt_client.beta.chat.completions.parse(
        model=model,
        response_format=SegmentsPayload,
        messages=[
            {"role": "system", "content": "You are a precise JSON-only generator that must satisfy a strict schema and explicit segment count."},
            {"role": "user", "content": user_content},
        ],
    )
    message = completion.choices[0].message
    if message.parsed is None:
        # `parsed` is empty when the model refuses or produces no structured output.
        refusal = getattr(message, "refusal", None)
        raise RuntimeError(f"Model returned no parsable segments payload (refusal: {refusal!r}).")

    return message.parsed.model_dump(by_alias=True)