File size: 18,352 Bytes
06ba7ea | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 | from typing import Dict, List, Literal, Any, Annotated, Optional, Union, ClassVar, Type, Tuple
from pydantic import BaseModel, Field, model_validator, constr, conlist
class VideoMetadata(BaseModel):
    """Metadata for a video asset, including its optional audio track."""

    width: int = Field(description="Width")
    height: int = Field(description="Height")
    duration: float = Field(description="Duration (milliseconds)")
    fps: float = Field(description="Video frame rate per second")
    has_audio: bool = Field(default=False, description="Whether audio track is present")
    audio_sample_rate_hz: Optional[int] = Field(
        default=None,
        gt=0,
        description="Audio sample rate (Hz), common values: 44100, 48000",
    )

    @model_validator(mode='after')
    def validate_audio_sample_rate(self):
        """Reject a video that claims to carry audio but omits the sample rate."""
        if self.audio_sample_rate_hz is None and self.has_audio:
            raise ValueError('audio_sample_rate_hz must be provided when video contains audio')
        return self
class ImageMetadata(BaseModel):
    """Metadata for a still-image asset."""

    width: Annotated[int, Field(description="Width")]
    height: Annotated[int, Field(description="Height")]
class Media(BaseModel):
    """Single media"""
    # Unique identifier for this asset.
    media_id: str
    # Location of the media file.
    path: str
    # Asset kind; "unknown" is an explicit fallback value.
    media_type: Literal["video", "image", "audio", "unknown"]
    # Type-specific metadata. NOTE(review): no AudioMetadata model exists here,
    # so it is unclear what "audio" items use — confirm with callers.
    metadata: Union[VideoMetadata, ImageMetadata]
    # Free-form additional attributes.
    extra_info: Optional[Dict[str, Any]] = None
class SourceRef(BaseModel):
    """ Original media reference information """
    # ID of the source Media this reference points at.
    media_id: str
    # Referenced span within the source. NOTE(review): units are not stated
    # here; sibling models use milliseconds — confirm.
    start: float
    end: float
    duration: float
    # Optional pixel dimensions of the referenced source.
    height: Optional[int] = None
    width: Optional[int] = None
class Clip(BaseModel):
    """A single clip cut from (or representing) one piece of source media."""

    clip_id: str
    language: Union[str, None] = None
    caption: Annotated[str, Field(default="", description="Caption describing the media")]
    media_type: str
    path: str
    fps: Union[float, None] = None
    extra_info: Annotated[Optional[Dict[str, Any]], Field(default=None, description="Extra metadata")]
class SubtitleUnit(BaseModel):
    """One atomic subtitle segment within a group's script."""

    unit_id: Annotated[str, Field(
        description="Unique identifier for subtitle unit",
        example="subtitle_0001",
    )]
    index_in_group: Annotated[int, Field(
        ge=0,
        description="Sequential index within current group (starting from 0)",
        example=0,
    )]
    text: Annotated[str, Field(
        description="Text content of this subtitle unit",
        example="The cat doesn't understand what KPI means",
    )]
class GroupClips(BaseModel):
    """An ordered group of clips sharing one visual/editing intent."""

    group_id: Annotated[str, Field(
        description="Unique identifier for the group",
        example="group_0001",
    )]
    summary: Annotated[str, Field(
        description="Description of the group's visual style, emotional tone, or editing intent",
        example="Start with the calmest, most healing shots to establish the mood.",
    )]
    clip_ids: Annotated[List[str], Field(
        description="List of video clip IDs used in this group, arranged in playback order",
        example=["clip_0003", "clip_0002"],
    )]
class GroupScript(BaseModel):
    """Group script content.

    Holds the raw narration text for one group plus its segmentation into
    subtitle units.
    """

    group_id: str = Field(
        ...,
        description="Unique identifier for the group",
        example="group_0001"
    )
    raw_text: str = Field(
        ...,
        description="original script content for this group",
        example="The cat doesn't understand what KPI means, the cat only knows the sun is shining today"
    )
    # Bug fix: the annotation was a bare `List`, which disables per-element
    # validation; the items are SubtitleUnit objects (declared above).
    subtitle_units: List[SubtitleUnit] = Field(
        ...,
        description="List of subtitle segmentation units for precise control of subtitle display rhythm"
    )
class Voiceover(BaseModel):
    """A generated narration audio clip attached to one group."""

    group_id: Annotated[str, Field(description="Group ID, e.g., group_0001")]
    voiceover_id: Annotated[str, Field(description="Voiceover ID, e.g., voiceover_0001")]
    path: Annotated[str, Field(description="Voiceover file path")]
    duration: Annotated[int, Field(gt=0, description="Voiceover duration (milliseconds)")]
class BGM(BaseModel):
    """A background-music track plus its tempo/beat information."""

    bgm_id: Annotated[str, Field(description="BGM ID, e.g., bgm_0003")]
    path: Annotated[str, Field(description="BGM file path")]
    duration: Annotated[int, Field(gt=0, description="BGM duration (milliseconds)")]
    bpm: Annotated[float, Field(gt=0, description="Beats per minute")]
    beats: Annotated[List[int], Field(default_factory=list, description="List of beat timestamps (milliseconds)")]
class TimeWindow(BaseModel):
    """A start/end time span expressed in milliseconds."""

    start: Annotated[int, Field(description="Start time (milliseconds)")]
    end: Annotated[int, Field(description="End time (milliseconds)")]
class AudioMix(BaseModel):
    """Mixing parameters applied to one audio track."""

    gain_db: Annotated[float, Field(default=0.0, description="Gain in decibels")]
    ducking: Annotated[Optional[Any], Field(default=None, description="Ducking effect configuration")]
class ClipTrack(BaseModel):
    """A clip placed on the output timeline."""
    clip_id: str
    # Span taken from the source clip.
    source_window: TimeWindow
    # Span the clip occupies on the rendered timeline.
    timeline_window: TimeWindow
class BgmTrack(BaseModel):
    """A background-music segment placed on the timeline with mix settings."""
    bgm_id: str
    # Span the music occupies on the rendered timeline.
    timeline_window: TimeWindow
    # Volume/ducking configuration for this segment.
    mix: AudioMix
class SubtitleTrack(BaseModel):
    """A subtitle line shown during a timeline span."""
    text: str
    timeline_window: TimeWindow
class VoiceoverTrack(BaseModel):
    """A voiceover segment placed on the timeline."""
    # NOTE(review): named media_id here, while Voiceover uses voiceover_id —
    # confirm which identifier this refers to.
    media_id: str
    timeline_window: TimeWindow
class TimelineTracks(BaseModel):
    """All per-kind track lists that make up one planned timeline."""

    video: Annotated[List[ClipTrack], Field(default_factory=list)]
    subtitles: Annotated[List[SubtitleTrack], Field(default_factory=list)]
    voiceover: Annotated[List[VoiceoverTrack], Field(default_factory=list)]
    bgm: Annotated[List[BgmTrack], Field(default_factory=list)]
class BaseInput(BaseModel):
    """Common base for tool inputs; carries the execution-mode switch."""

    mode: Annotated[
        Literal["auto", "skip", "default"],
        Field(
            default="auto",
            description="auto: Automatic mode; skip: Skip mode; default: Default mode",
        ),
    ]
class LoadMediaInput(BaseInput):
    """Input for the load-media step; only the inherited `mode` applies."""
    ...
class SearchMediaInput(BaseInput):
    """Options for searching stock media from pexels."""

    mode: Annotated[
        Literal["auto", "skip", "default"],
        Field(
            default="auto",
            description="auto: Automatically search media from pexels; skip: skip search; default: skip search",
        ),
    ]
    photo_number: int = Field(default=0, description="The number of images the user wants to obtain")
    video_number: int = Field(default=5, description="The number of videos the user wants to obtain")
    search_keyword: str = Field(default="scenery", description="Keyword of the media the user wants to obtain. Only one keyword is allowed; multiple keywords are not permitted.")
    orientation: Annotated[
        Literal["landscape", "portrait"],
        Field(
            default="landscape",
            description="landscape: The screen is wider horizontally and narrower vertically, making it suitable for computer screens, landscape images, etc;portrait: The screen is higher vertically and narrower horizontally, making it suitable for mobile browsing and close-up shots of people.",
        ),
    ]
    min_video_duration: int = Field(default=1, description="The shortest duration of footage requested by the user in seconds.")
    max_video_duration: int = Field(default=30, description="The longest duration of footage requested by the user in seconds.")
class LoadMediaOutput(BaseModel):
    """Result of loading media assets."""

    media: Annotated[List[Media], Field(default_factory=list, description="List of media")]
class SplitShotsInput(BaseInput):
    """Options for segmenting source media into shots."""

    mode: Annotated[
        Literal["auto", "skip", "default"],
        Field(
            default="auto",
            description="auto: Automatically segment shots based on scene changes, treat images as single shots; skip: Do not segment shots; default: Use default segmentation method",
        ),
    ]
    min_shot_duration: int = Field(default=1000, description="Segmented shots must not be shorter than this duration (unit: milliseconds)")
    max_shot_duration: int = Field(default=10000, description="If a single shot exceeds this duration, force segmentation (unit: milliseconds)")
class SplitShotsOutput(BaseModel):
    """Result of shot segmentation."""

    clip_captions: Annotated[List[Clip], Field(default_factory=list, description="List of clips after splitting shots")]
    # Free-form overall summary strings.
    overall: Dict[str, str]
class UnderstandClipsInput(BaseModel):
    """Options for generating clip descriptions."""

    mode: Annotated[
        Literal["auto", "skip", "default"],
        Field(
            default="auto",
            description="auto: Generate descriptions based on media content; skip: Do not generate descriptions; default: Use default description generation method",
        ),
    ]
class UnderstandClipsOutput(BaseModel):
    """Result of the clip-understanding step."""

    clip_captions: Annotated[List[Clip], Field(default_factory=list, description="List of clips after understanding clips")]
    # Free-form overall summary strings.
    overall: Dict[str, str]
class FilterClipsInput(BaseModel):
    """Options for filtering the clip pool."""

    mode: Annotated[
        Literal["auto", "skip", "default"],
        Field(
            default="auto",
            description="auto: Filter clips based on user requirements; skip: Skip filtering; default: Use default filtering method",
        ),
    ]
    user_request: str = Field(default="", description="User's requirements for clip filtering; if none provided, formulate one based on media materials and other editing requirements.")
class FilterClipsOutput(BaseModel):
    """Result of clip filtering."""

    clip_captions: List[Clip] = Field(default_factory=list, description="List of clips")
    # Bug fix: `overall` was declared twice; the second declaration merely
    # re-annotated (shadowed) the first, so one copy is dropped.
    overall: Dict[str, str]
class GroupClipsInput(BaseModel):
    """Options for ordering/grouping clips."""

    mode: Annotated[
        Literal["auto", "skip", "default"],
        Field(
            default="auto",
            description="auto: Organize clips in a logical order based on narrative flow of media content and user's sequencing requirements; skip: Skip sorting; default: Use default ordering method",
        ),
    ]
    user_request: str = Field(default="", description="User's requirements for media organization order; if none provided, arrange in a logical narrative sequence following standard conventions.")
class GroupClipsOutput(BaseModel):
    """Result of grouping clips."""

    # Bug fix: the description said "List of clips" (copy-paste from the other
    # *Output models) but the field holds GroupClips entries.
    groups: List[GroupClips] = Field(default_factory=list, description="List of groups")
class GenerateScriptInput(BaseModel):
    """Options for generating the narration script and title."""

    mode: Literal["auto", "skip", "default"] = Field(
        default="auto",
        description="auto: Generate appropriate script based on media content and user's script requirements; skip: Skip, do not add subtitles; default: Use default script"
    )
    user_request: Annotated[str, Field(default="", description="User's requirements for the script.")]
    # Idiom fix: use default_factory instead of the mutable `{}` literal —
    # runtime-equivalent in pydantic, but the factory form is the
    # recommended, unambiguous pattern for mutable defaults.
    custom_script: Dict[str, Any] = Field(
        default_factory=dict,
        description="If user has specific character-level editing requirements for script/title, pass the edited custom script and title through this parameter. Format should be based on the original script generation output format but with the subtitle_units field removed. In this case, mode must use `auto`, other modes are prohibited"
    )
class GenerateScriptOutput(BaseModel):
    """Result of script generation: per-group scripts plus an optional title."""
    # One script object per clip group.
    group_scripts: List[GroupScript]
    # NOTE(review): Optional but without a default, so callers must pass
    # `title` explicitly (possibly None) — confirm this is intended.
    title: Optional[str]
class GenerateVoiceoverInput(BaseInput):
    """Options for voiceover (TTS) generation."""

    mode: Annotated[
        Literal["auto", "skip", "default"],
        Field(
            default="auto",
            description="auto: Generate appropriate voiceover based on media content and user's voice requirements; skip: Skip voiceover; default: Use default voiceover",
        ),
    ]
    user_request: str = Field(default="", description="User's requirements for voiceover.")
class RecommendScriptTemplateInput(BaseInput):
    """Options for selecting a copywriting (script) template."""

    mode: Literal["auto", "skip", "default"] = Field(
        default="auto",
        description="auto: Select an appropriate copywriting template based on the material content and user's requirements for voiceover style; skip: Skip;"
    )
    user_request: Annotated[str, Field(default="", description="User's specific requirements for the script style.")]
    filter_include: Annotated[
        Dict[str, List[str]],
        Field(
            description=(
                "Positive filter conditions. Multiple dimensions are combined with AND, "
                "multiple values within the same dimension are combined with OR.\n"
                "Supported dimensions:\n"
                "- tags: category, one or more of "
                "[Life, Food, Beauty, Entertainment, Travel, Tech, Business, Vehicle, Health, Family, Pets, Knowledge]"
            )
        )
    ] = {}
    # Bug fix: annotation was List[Union[str]] — a one-member Union is
    # degenerate and collapses to plain `str`, matching filter_include.
    filter_exclude: Annotated[
        Dict[str, List[str]],
        Field(
            description=(
                "Negative filter conditions. Items matching these conditions will be excluded. "
                "The semantics are the same as filter_include. "
                "Supported dimensions: tags, id."
            )
        )
    ] = {}
class GenerateVoiceoverOutput(BaseModel):
    """Result of voiceover generation."""

    voiceover: Annotated[List[Voiceover], Field(default_factory=list, description="Voiceover list")]
class SelectBGMInput(BaseInput):
    """Options for choosing background music."""

    mode: Annotated[
        Literal["auto", "skip", "default"],
        Field(
            default="auto",
            description="auto: Select appropriate music based on media content and user's music requirements; skip: Do not use music; default: Use default music",
        ),
    ]
    user_request: str = Field(default="", description="User's requirements for background music.")
    # Values may be strings (mood/scene/genre/lang) or ints (id).
    filter_include: Annotated[
        Dict[str, List[Union[str, int]]],
        Field(
            description=(
                "Positive filter conditions. Multiple dimensions are combined with AND, "
                "multiple values within the same dimension are combined with OR.\n"
                "Supported dimensions:\n"
                "- mood: music emotion, one or more of "
                "[Dynamic, Chill, Happy, Sorrow, Romantic, Calm, Excited, Healing, Inspirational]\n"
                "- scene: usage scene, one or more of "
                "[Vlog, Travel, Relaxing, Emotion, Transition, Outdoor, Cafe, Evening, Scenery, Food, Date, Club]\n"
                "- genre: music genre, one or more of "
                "[Pop, BGM, Electronic, R&B/Soul, Hip Hop/Rap, Rock, Jazz, Folk, Classical, Chinese Style]\n"
                "- lang: lyric language, one or more of [bgm, en, zh, ko, ja]\n"
                "- id: specific music ids (int)"
            )
        )
    ] = {}
    filter_exclude: Annotated[
        Dict[str, List[Union[str, int]]],
        Field(
            description=(
                "Negative filter conditions. Items matching these conditions will be excluded. "
                "The semantics are the same as filter_include. "
                "Supported dimensions: mood, scene, genre, lang, id."
            )
        )
    ] = {}
class SelectBGMOutput(BaseModel):
    """Result of background-music selection."""

    bgm: Annotated[List[BGM], Field(default_factory=list, description="BGM list")]
class RecommendTransitionInput(BaseInput):
    """Options for transition recommendation."""

    mode: Annotated[
        Literal["auto", "skip", "default"],
        Field(
            default="auto",
            description="auto: add fade in and fade out transitions at beginning and end; skip: Do not use transitions; default: Use default transitions",
        ),
    ]
    duration: int = Field(default=1000, description="Duration of the transition in milliseconds")
class RecommendTransitionOutput(BaseInput):
    """Output of the transition-recommendation step.

    NOTE(review): this inherits from BaseInput (so its schema carries a
    `mode` field) rather than BaseModel like the other *Output models —
    looks like a copy-paste slip; confirm before changing the base class,
    since that would alter the published schema.
    """
    ...
class RecommendTextInput(BaseInput):
    """Options for choosing subtitle font style and color."""

    # NOTE(review): "skip" is accepted by the Literal but not documented in
    # the description — confirm whether it should be listed.
    mode: Literal["auto", "skip", "default"] = Field(
        default="auto",
        description="auto: Select appropriate font style and color based on user's subtitle font style requirements; default: Use default font",
    )
    user_request: Annotated[str, Field(default="", description="User's requirements for font style")]
    filter_include: Annotated[
        Dict[str, List[Union[str, int]]],
        Field(
            description=(
                "Positive filter conditions. Multiple dimensions are combined with AND, "
                "multiple values within the same dimension are combined with OR.\n"
                "Supported dimensions:\n"
                # Bug fix: the adjacent fragments previously concatenated to
                # "one or more[Creative..." — the "of " separator was missing
                # (siblings all read "one or more of [...]").
                "- class: Font type, one or more of "
                "[Creative, Handwriting, Calligraphy, Basic]\n"
            )
        )
    ] = {}
class RecommendTextOutput(BaseInput):
    """Output of the text/font recommendation step.

    NOTE(review): inherits BaseInput (exposing a `mode` field) instead of
    BaseModel like the other *Output models — likely a copy-paste slip;
    confirm before changing, as it would alter the published schema.
    """
    ...
class PlanTimelineInput(BaseInput):
    """Options controlling timeline planning."""

    use_beats: bool = Field(default=True, description="Whether clip transitions should sync with BGM beats")
class PlanTimelineOutput(BaseModel):
    """Result of timeline planning."""

    tracks: Annotated[List[TimelineTracks], Field(default_factory=list, description="Timeline track collection")]
class RenderVideoInput(BaseInput):
    """Rendering options: canvas geometry, encoding, subtitle styling, audio mix."""

    aspect_ratio: Annotated[str | None, Field(
        default=None,
        description="When explicitly specified, forces the canvas to one of 16:9, 4:3, 1:1, 3:4, 9:16. If unset, the system automatically infers the most suitable aspect ratio."
    )]
    output_max_dimension_px: Annotated[int | None, Field(
        default=None,
        description="Maximum output size in pixels (longest side); defaults to 1080 and works with the aspect ratio."
    )]
    # Cleaned up the odd `description="" \` backslash continuation; plain
    # implicit concatenation yields the identical string.
    clip_compose_mode: Annotated[str, Field(
        default="padding",
        description=(
            "How to fit media into the canvas: "
            "'padding' keeps aspect ratio and fills empty areas with a solid color; "
            "'crop' center-crops media to match the canvas aspect ratio."
        )
    )]
    # Bug fix: was Tuple[int] — a 1-tuple — contradicting both the (R, G, B)
    # default (0, 0, 0) and the description; an RGB color is a 3-tuple.
    bg_color: Annotated[Tuple[int, int, int] | List[int] | None, Field(
        default=(0, 0, 0),
        description="Background color for canvas padding, specified as an (R, G, B) tuple (no alpha channel)."
    )]
    crf: Annotated[int, Field(
        default=23,
        description="CRF value (10–30), lower = better quality, larger file"
    )]

    # font parameters
    font_color: Annotated[Tuple[int, int, int, int], Field(
        default=(255, 255, 255, 255),
        description="Font color, RGBA format (R, G, B, A), values range 0-255"
    )]
    font_size: Annotated[int, Field(
        default=40,
        description="Font size in pixels. Recommended range: 28–120."
    )]
    # Bug fix: the description claimed "Defaults to 80" while the declared
    # default is 270; the text now matches the code.
    margin_bottom: Annotated[int, Field(
        default=270,
        description="Bottom margin for subtitles in pixels. Defaults to 270; valid range: 40–1040."
    )]
    stroke_width: Annotated[int, Field(
        default=2,
        description="Text stroke width (px), typically 0–8"
    )]
    stroke_color: Annotated[Tuple[int, int, int, int], Field(
        default=(0, 0, 0, 255),
        description="Text stroke color in RGBA format",
    )]

    # audio
    bgm_volume_scale: Annotated[float, Field(
        default=0.25,
        description="Background music volume multiplier, range 0.0–3.0 (1.0 = default volume)"
    )]
    tts_volume_scale: Annotated[float, Field(
        default=2.0,
        description="TTS volume multiplier, range 0.0–3.0 (1.0 = default volume)"
    )]
    include_video_audio: Annotated[bool, Field(
        default=False,
        description="Whether to include the original video audio track"
    )]
|