Spaces:

userIdc2024
/

Video-Generator-Tools

Sleeping

App Files Files Community

Video-Generator-Tools / src /prompt_generator.py

userIdc2024

Upload 5 files

d856b59 verified 3 months ago

raw

history blame contribute delete

8.5 kB

	import base64
	import mimetypes
	import os
	import re
	from typing import Any, Dict, List, Optional, Union

	from dotenv import load_dotenv
	from openai import OpenAI
	from pydantic import BaseModel, ConfigDict, Field

	# Load variables from a local .env file (if present) into the process
	# environment before the client below reads OPENAI_API_KEY.
	load_dotenv()

	# Module-level OpenAI client used by generate_segments_payload; the API key
	# is taken from the OPENAI_API_KEY environment variable.
	gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

	class VeoInputs(BaseModel):
	    """Caller-supplied generation settings.

	    ``build_prompt`` renders ``model_dump()`` of this model into the prompt,
	    so the field order and defaults below are part of what the LLM sees.
	    """

	    # Required inputs.
	    script: str
	    style: str

	    # Settings with defaults.
	    jsonFormat: str = "standard"
	    continuationMode: bool = True
	    voiceType: Optional[str] = None
	    energyLevel: Optional[str] = None
	    settingMode: str = "single"
	    cameraStyle: Optional[str] = None
	    energyArc: Optional[str] = None
	    narrativeStyle: Optional[str] = None
	    accentRegion: Optional[str] = None

	class ContinuityMarkers(BaseModel):
	    """Start/end markers recorded for one segment (all free-text strings)."""

	    # Position at the segment boundaries.
	    start_position: str
	    end_position: str

	    # Expression at the segment boundaries.
	    start_expression: str
	    end_expression: str

	    # Gesture at the segment boundaries.
	    start_gesture: str
	    end_gesture: str

	    location_status: str

	class SegmentInfo(BaseModel):
	    """Bookkeeping for one segment: its index, the total count, its
	    duration string, its location and its continuity markers."""

	    segment_number: int
	    total_segments: int
	    duration: str
	    location: str
	    continuity_markers: ContinuityMarkers

	class CharacterDescription(BaseModel):
	    """Segment-specific character text blocks."""

	    # 100+ words, segment-specific
	    current_state: str
	    # 100+ words, segment-specific
	    voice_matching: str

	class SynchronizedActions(BaseModel):
	    """Action text for each 2-second window of an 8-second segment.

	    The JSON keys ("0:00-0:02", ...) are not legal Python identifiers, so
	    each field has an identifier-safe name and maps to the exact JSON key
	    through its alias.
	    """

	    # pydantic v2 configuration (replaces the deprecated nested
	    # ``class Config``): allow population by field name as well as by alias.
	    model_config = ConfigDict(populate_by_name=True)

	    f0000_0002: str = Field(alias="0:00-0:02")
	    f0002_0004: str = Field(alias="0:02-0:04")
	    f0004_0006: str = Field(alias="0:04-0:06")
	    f0006_0008: str = Field(alias="0:06-0:08")

	class ActionTimeline(BaseModel):
	    """Dialogue plus the time-coded actions that accompany it."""

	    dialogue: str
	    synchronized_actions: SynchronizedActions
	    # 50+ words
	    micro_expressions: str
	    breathing_rhythm: str
	    location_transition: str
	    continuity_checkpoint: str

	class SceneContinuity(BaseModel):
	    """Scene, camera and lighting text blocks for one segment."""

	    # 250+ words
	    environment: str
	    # 75+ words
	    camera_position: str
	    # detailed movement path
	    camera_movement: str
	    # 50+ words
	    lighting_state: str
	    # 50+ words
	    background_elements: str
	    spatial_relationships: str

	class Segment(BaseModel):
	    """One complete segment: info, character, scene and action timeline."""

	    segment_info: SegmentInfo
	    character_description: CharacterDescription
	    scene_continuity: SceneContinuity
	    action_timeline: ActionTimeline

	class SegmentsPayload(BaseModel):
	    """Top-level response model: the ordered list of segments.

	    Passed as ``response_format`` to the OpenAI parse call in
	    ``generate_segments_payload``.
	    """

	    segments: List[Segment]

	def split_script_into_segments(script: str, seconds_per_segment: int = 8, words_per_second: float = 2.2) -> List[str]:
	    """
	    Greedily pack whole sentences into segments of roughly
	    ``seconds_per_segment * words_per_second`` words (never fewer than a
	    14-word budget).

	    Sentences are split after ``.``, ``!`` or ``?`` followed by whitespace and
	    are never broken up, so a single sentence longer than the budget becomes
	    its own (over-long) segment. When no sentence is found, the stripped
	    script is returned as the only segment.
	    """
	    cleaned = script.strip()
	    budget = max(14, int(seconds_per_segment * words_per_second))  # minimal guard

	    buckets: List[List[str]] = []
	    used = 0  # words already placed in the last bucket
	    for piece in re.split(r'(?<=[.!?])\s+', cleaned):
	        sentence = piece.strip()
	        if not sentence:
	            continue
	        n_words = len(sentence.split())
	        if buckets and used + n_words <= budget:
	            # Still fits: extend the current segment.
	            buckets[-1].append(sentence)
	            used += n_words
	        else:
	            # First sentence, or the budget would be exceeded: open a new one.
	            buckets.append([sentence])
	            used = n_words

	    if not buckets:
	        return [cleaned]
	    return [" ".join(bucket) for bucket in buckets]

	def build_prompt(inputs: VeoInputs, segment_texts: List[str]) -> str:
	    """Assemble the user prompt that asks the LLM for one JSON object per segment.

	    The timing rules in the prompt are kept in line with the 8-second
	    ``SynchronizedActions`` schema: the duration string is "00:00-00:08" and
	    only the four 2-second keys up to "0:06-0:08" are requested. Asking for a
	    "0:08-0:10" key, as an earlier revision did, contradicts the response
	    schema, which has no such field.

	    Args:
	        inputs: Generation settings. ``inputs.script`` is quoted in full and
	            ``inputs.model_dump()`` is rendered as the authoritative settings.
	        segment_texts: Script text for each segment, in order; its length is
	            the number of segments requested.

	    Returns:
	        The prompt text: rules header, one bullet per segment, output footer.
	    """
	    N = len(segment_texts)
	    knobs = inputs.model_dump()
	    header = f"""
	You are a senior performance-marketing video director who writes segment-accurate, production-grade JSON prompts for Veo 3.
	Return ONLY JSON that parses into the provided schema. Do not add fields. No markdown.

	Task: Build prompts for exactly {N} segments of 8 seconds each.
	Hard rules for EVERY segment:
	- "duration" MUST be "00:00-00:08"
	- "current_state" = 100+ words, segment-specific
	- "voice_matching" = 100+ words, segment-specific
	- "environment" = 250+ words; "camera_position" = 75+ words; "lighting_state" = 50+ words min
	- "camera_movement" = concrete, timestamped path (pan/tilt/dolly/handheld/steadicam)
	- "synchronized_actions" must have exactly these keys: "0:00-0:02","0:02-0:04","0:04-0:06","0:06-0:08"
	- Dialogue must fit in 8s naturally with breath points.
	- If continuationMode is true, include a continuity checkpoint aligning next segment’s start.
	- Set "segment_info.total_segments" = {N} on each segment.
	- Based on the character image provide select everything as asked.
	FULL SCRIPT:
	\"\"\"{inputs.script.strip()}\"\"\"

	AUTHORITATIVE SETTINGS (must be reflected):
	{knobs}

	SEGMENT LINES (cover in exactly 8 seconds each):
	"""
	    seg_lines = "\n".join(f"- Segment {i}: {t}" for i, t in enumerate(segment_texts, start=1))

	    # Plain (non-f) string: the braces below are literal JSON braces.
	    footer = """
	OUTPUT:
	Return JSON only as:
	{
	"segments": [ { ... per-segment object exactly matching the schema ... } ]
	}
	"""
	    return header + seg_lines + footer


	# ---------- Validator (segment count, durations, keys, word counts, uniformity) ----------

	# Minimum word counts per (section, field). Only fields that exist on the
	# Segment schema are listed; entries for fields the schema does not define
	# (physical, clothing, props_in_frame) made every schema-valid payload fail.
	MIN_WORDS = {
	    ("character_description", "current_state"): 100,
	    ("character_description", "voice_matching"): 100,
	    ("scene_continuity", "environment"): 250,
	    ("scene_continuity", "camera_position"): 75,
	    ("scene_continuity", "lighting_state"): 50,
	    ("scene_continuity", "background_elements"): 50,
	    ("action_timeline", "micro_expressions"): 50,
	}

	# Width, in seconds, of one synchronized_actions window ("0:00-0:02", ...).
	_SYNC_WINDOW_SECONDS = 2


	def _word_count(text: Any) -> int:
	    """Return the number of ``\\w+`` words in *text*; non-strings count as 0."""
	    if not isinstance(text, str):
	        return 0
	    return len(re.findall(r"\b\w+\b", text))


	def _as_dict(value: Any) -> Dict[str, Any]:
	    """Return *value* if it is a dict, else an empty dict (tolerates None)."""
	    return value if isinstance(value, dict) else {}


	def validate_segments_payload(
	    payload: Dict[str, Any],
	    expected_segments: int,
	    seconds_per_segment: int = 8,
	) -> List[str]:
	    """Check a segments payload against the prompt's hard rules.

	    Checks performed: segment count, per-segment duration string,
	    ``total_segments``, the exact set of ``synchronized_actions`` keys, the
	    minimum word counts in ``MIN_WORDS``, and - when the optional
	    ``character_description.physical`` / ``clothing`` fields are supplied -
	    that they are identical across segments.

	    Args:
	        payload: Dict with a ``"segments"`` list (e.g. the result of
	            ``SegmentsPayload.model_dump(by_alias=True)``).
	        expected_segments: Number of segments the payload must contain.
	        seconds_per_segment: Segment length the duration string and the
	            2-second sync keys are derived from. Defaults to 8, matching the
	            ``SynchronizedActions`` schema; pass 10 for the older
	            "00:00-00:10" / five-key layout.

	    Returns:
	        Human-readable error messages; an empty list means the payload passed.
	    """
	    errors: List[str] = []
	    segs = _as_dict(payload).get("segments") or []
	    if len(segs) != expected_segments:
	        errors.append(f"Expected {expected_segments} segments, got {len(segs)}.")

	    expected_duration = f"00:00-00:{seconds_per_segment:02d}"
	    required_sync_keys = {
	        f"0:{start:02d}-0:{min(start + _SYNC_WINDOW_SECONDS, seconds_per_segment):02d}"
	        for start in range(0, seconds_per_segment, _SYNC_WINDOW_SECONDS)
	    }
	    physical_blocks, clothing_blocks = [], []

	    for i, raw_seg in enumerate(segs, start=1):
	        seg = _as_dict(raw_seg)

	        si = _as_dict(seg.get("segment_info"))
	        if si.get("duration") != expected_duration:
	            errors.append(f"Segment {i}: duration must be {expected_duration}.")
	        if si.get("total_segments") != expected_segments:
	            errors.append(f"Segment {i}: total_segments should be {expected_segments}, got {si.get('total_segments')}.")

	        timeline = _as_dict(seg.get("action_timeline"))
	        sync = _as_dict(timeline.get("synchronized_actions"))
	        if set(sync) != required_sync_keys:
	            errors.append(f"Segment {i}: synchronized_actions must have keys {sorted(required_sync_keys)}.")

	        # Word-count checks
	        for (section, field), minw in MIN_WORDS.items():
	            wc = _word_count(_as_dict(seg.get(section)).get(field, ""))
	            if wc < minw:
	                errors.append(f"Segment {i}: {section}.{field} must be >= {minw} words (got {wc}).")

	        # Optional fields outside the schema: absent values all collapse to ""
	        # and therefore never trip the uniformity check below.
	        ch = _as_dict(seg.get("character_description"))
	        physical_blocks.append(ch.get("physical", ""))
	        clothing_blocks.append(ch.get("clothing", ""))

	    # Uniformity across segments
	    if expected_segments > 1:
	        if len(set(physical_blocks)) > 1:
	            errors.append("`character_description.physical` must be EXACTLY identical across all segments.")
	        if len(set(clothing_blocks)) > 1:
	            errors.append("`character_description.clothing` must be EXACTLY identical across all segments.")

	    return errors

	def _image_content_part(image: Optional[Union[str, bytes]]) -> Optional[Dict[str, Any]]:
	    """Build the ``image_url`` message part for *image*, or None when absent.

	    Raw bytes are embedded as-is and labelled ``image/jpeg``; anything else is
	    treated as a filesystem path, read in binary mode, and labelled with the
	    MIME type guessed from its extension (falling back to ``image/jpeg``).
	    """
	    if image is None:
	        return None

	    if isinstance(image, (bytes, bytearray, memoryview)):
	        raw, mime = bytes(image), "image/jpeg"
	    else:
	        path = os.fspath(image)
	        with open(path, "rb") as fh:
	            raw = fh.read()
	        guessed, _ = mimetypes.guess_type(path)
	        mime = guessed if guessed and guessed.startswith("image/") else "image/jpeg"

	    encoded = base64.b64encode(raw).decode("utf-8")
	    return {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{encoded}"}}


	def generate_segments_payload(
	    inputs: VeoInputs,
	    image_path: Optional[Union[str, bytes]] = None,
	    model: str = "gpt-4o",
	) -> Dict[str, Any]:
	    """Split the script into 8-second segments and ask the LLM for their prompts.

	    Args:
	        inputs: Generation settings, including the script to split.
	        image_path: Path to the character image. Raw image bytes are also
	            accepted. When None, the request is sent without an image part
	            (previously this raised ``TypeError`` in ``base64.b64encode``,
	            as did passing a ``str`` path).
	        model: OpenAI model name passed to the parse call.

	    Returns:
	        The parsed ``SegmentsPayload`` as a dict, dumped by alias so the
	        ``synchronized_actions`` keys are the "0:00-0:02"-style strings.

	    Raises:
	        OSError: If ``image_path`` is a path that cannot be read.
	        RuntimeError: If the response carries no parsed payload (for example
	            a refusal).
	    """
	    segment_texts = split_script_into_segments(inputs.script, seconds_per_segment=8)

	    user_content: List[Dict[str, Any]] = [
	        {"type": "text", "text": build_prompt(inputs, segment_texts)},
	    ]
	    image_part = _image_content_part(image_path)
	    if image_part is not None:
	        user_content.append(image_part)

	    completion = gpt_client.beta.chat.completions.parse(
	        model=model,
	        response_format=SegmentsPayload,
	        messages=[
	            {"role": "system", "content": "You are a precise JSON-only generator that must satisfy a strict schema and explicit segment count."},
	            {"role": "user", "content": user_content},
	        ],
	    )

	    message = completion.choices[0].message
	    if message.parsed is None:
	        # Surface the reason instead of failing later with an AttributeError.
	        raise RuntimeError(
	            f"Model returned no parsed payload (refusal: {getattr(message, 'refusal', None)!r})."
	        )
	    return message.parsed.model_dump(by_alias=True)