# File size: 18,352 Bytes
# 06ba7ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
from typing import Dict, List, Literal, Any, Annotated, Optional, Union, ClassVar, Type, Tuple
from pydantic import BaseModel, Field, model_validator, constr, conlist


class VideoMetadata(BaseModel):
    """Video metadata"""
    # Frame dimensions; no unit or bounds are declared on these two fields.
    width: int = Field(description="Width")
    height: int = Field(description="Height")
    # Per the field descriptions: duration is in milliseconds, fps is frames per second.
    duration: float = Field(description="Duration (milliseconds)")
    fps: float = Field(description="Video frame rate per second")
    has_audio: bool = Field(description="Whether audio track is present", default=False)

    # Optional on its own, but must be strictly positive when supplied (gt=0).
    audio_sample_rate_hz: Optional[int] = Field(
        default=None,
        description="Audio sample rate (Hz), common values: 44100, 48000",
        gt=0,
    )

    @model_validator(mode='after')
    def validate_audio_sample_rate(self):
        """Reject a video flagged as having audio but lacking a sample rate.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: If ``has_audio`` is true and ``audio_sample_rate_hz`` is None.
        """
        rate_missing = self.audio_sample_rate_hz is None
        if self.has_audio and rate_missing:
            raise ValueError('audio_sample_rate_hz must be provided when video contains audio')
        return self

class ImageMetadata(BaseModel):
    """Image metadata"""
    # Both dimensions are required integers; no unit or bounds are declared here.
    width: int = Field(description="Width")
    height: int = Field(description="Height")


class Media(BaseModel):
    """Single media"""
    media_id: str
    path: str
    # Restricted to exactly these four literal values.
    media_type: Literal["video", "image", "audio", "unknown"]
    # Required, and must validate as either VideoMetadata or ImageMetadata.
    # NOTE(review): there is no metadata model for the "audio"/"unknown" media types —
    # confirm what callers supply for those.
    metadata: Union[VideoMetadata, ImageMetadata]
    # Free-form extras; omitted (None) by default.
    extra_info: Optional[Dict[str, Any]] = None


class SourceRef(BaseModel):
    """ Original media reference information """
    media_id: str
    # NOTE(review): the unit of start/end/duration is not stated on this model
    # (other models in this file document milliseconds) — confirm with the producer.
    start: float
    end: float
    duration: float
    # Optional frame size; both default to None when not provided.
    height: Optional[int] = None
    width: Optional[int] = None


class Clip(BaseModel):
    # One clip record: a required id, media type and path, plus optional
    # language, caption, fps and free-form extras.
    clip_id: str
    language: Optional[str] = None
    caption: str = Field(
        description="Caption describing the media",
        default="",
    )
    # Plain string here (not restricted to a Literal set, unlike Media.media_type).
    media_type: str
    path: str
    fps: Optional[float] = None
    extra_info: Optional[Dict[str, Any]] = Field(
        description="Extra metadata",
        default=None,
    )


class SubtitleUnit(BaseModel):
    """Subtitle segmentation unit"""
    # Sample values are passed through ``json_schema_extra`` because handing arbitrary
    # keywords such as ``example=`` to ``Field`` is deprecated in pydantic v2. The
    # generated JSON schema still carries the same ``example`` key.
    unit_id: str = Field(
        ...,
        description="Unique identifier for subtitle unit",
        json_schema_extra={"example": "subtitle_0001"},
    )
    # Zero-based position inside the owning group; negative values are rejected (ge=0).
    index_in_group: int = Field(
        ...,
        ge=0,
        description="Sequential index within current group (starting from 0)",
        json_schema_extra={"example": 0},
    )
    text: str = Field(
        ...,
        description="Text content of this subtitle unit",
        json_schema_extra={"example": "The cat doesn't understand what KPI means"},
    )


class GroupClips(BaseModel):
    """Video group - Visual material organization"""
    # Sample values are passed through ``json_schema_extra`` because handing arbitrary
    # keywords such as ``example=`` to ``Field`` is deprecated in pydantic v2. The
    # generated JSON schema still carries the same ``example`` key.
    group_id: str = Field(
        ...,
        description="Unique identifier for the group",
        json_schema_extra={"example": "group_0001"},
    )
    summary: str = Field(
        ...,
        description="Description of the group's visual style, emotional tone, or editing intent",
        json_schema_extra={
            "example": "Start with the calmest, most healing shots to establish the mood."
        },
    )
    # Order is significant: the description defines it as playback order.
    clip_ids: List[str] = Field(
        ...,
        description="List of video clip IDs used in this group, arranged in playback order",
        json_schema_extra={"example": ["clip_0003", "clip_0002"]},
    )


class GroupScript(BaseModel):
    """Group script content"""
    # Sample values are passed through ``json_schema_extra`` because handing arbitrary
    # keywords such as ``example=`` to ``Field`` is deprecated in pydantic v2. The
    # generated JSON schema still carries the same ``example`` key.
    group_id: str = Field(
        ...,
        description="Unique identifier for the group",
        json_schema_extra={"example": "group_0001"},
    )
    raw_text: str = Field(
        ...,
        description="original script content for this group",
        json_schema_extra={
            "example": "The cat doesn't understand what KPI means, the cat only knows the sun is shining today"
        },
    )
    # NOTE(review): the element type is unconstrained (bare ``List``), so items are not
    # validated. ``List[SubtitleUnit]`` looks like the intent, but tightening it would
    # turn dict items into model instances for existing callers — confirm before changing.
    subtitle_units: List = Field(
        ...,
        description="List of subtitle segmentation units for precise control of subtitle display rhythm"
    )


class Voiceover(BaseModel):
    """Single voiceover/narration item"""
    # All four fields are required.
    group_id: str = Field(description="Group ID, e.g., group_0001")
    voiceover_id: str = Field(description="Voiceover ID, e.g., voiceover_0001")
    path: str = Field(description="Voiceover file path")
    # Whole milliseconds, strictly positive (gt=0).
    duration: int = Field(
        gt=0,
        description="Voiceover duration (milliseconds)",
    )


class BGM(BaseModel):
    """Background music"""
    bgm_id: str = Field(description="BGM ID, e.g., bgm_0003")
    path: str = Field(description="BGM file path")
    # duration and bpm are required and must both be strictly positive (gt=0).
    duration: int = Field(
        gt=0,
        description="BGM duration (milliseconds)",
    )
    bpm: float = Field(
        gt=0,
        description="Beats per minute",
    )
    # Defaults to an empty list when omitted.
    beats: List[int] = Field(
        description="List of beat timestamps (milliseconds)",
        default_factory=list,
    )


class TimeWindow(BaseModel):
    # A start/end pair in integer milliseconds; both bounds are required.
    # Nothing here checks that end >= start.
    # (Written as a comment: pydantic would publish a class docstring as the
    # model's JSON-schema description.)
    start: int = Field(description="Start time (milliseconds)")
    end: int = Field(description="End time (milliseconds)")


class AudioMix(BaseModel):
    # Mixing parameters for an audio track: a gain in dB (0.0 when omitted)
    # and an optional, untyped ducking configuration.
    gain_db: float = Field(
        description="Gain in decibels",
        default=0.0,
    )
    # Typed as Any, so the shape of the ducking config is not validated here.
    ducking: Optional[Any] = Field(
        description="Ducking effect configuration",
        default=None,
    )


class ClipTrack(BaseModel):
    # Places one clip on the timeline: `source_window` and `timeline_window`
    # are both TimeWindow values (integer milliseconds).
    # NOTE(review): presumably source_window is the range taken from the clip and
    # timeline_window is where it lands in the output — confirm with the planner.
    clip_id: str
    source_window: TimeWindow
    timeline_window: TimeWindow


class BgmTrack(BaseModel):
    # One background-music entry on the timeline; all three fields are required,
    # including the AudioMix settings.
    bgm_id: str
    timeline_window: TimeWindow
    mix: AudioMix


class SubtitleTrack(BaseModel):
    # One subtitle entry: its text and the timeline window it is attached to.
    text: str
    timeline_window: TimeWindow


class VoiceoverTrack(BaseModel):
    # One voiceover entry on the timeline.
    # NOTE(review): keyed by `media_id`, whereas the Voiceover model exposes
    # `voiceover_id` — confirm which identifier is stored here.
    media_id: str
    timeline_window: TimeWindow


class TimelineTracks(BaseModel):
    # Four parallel track lists; each one defaults to a fresh empty list when omitted.
    video: List[ClipTrack] = Field(default_factory=list)
    subtitles: List[SubtitleTrack] = Field(default_factory=list)
    voiceover: List[VoiceoverTrack] = Field(default_factory=list)
    bgm: List[BgmTrack] = Field(default_factory=list)


class BaseInput(BaseModel):
    # Common base for step inputs: contributes a `mode` selector limited to
    # "auto" / "skip" / "default", falling back to "auto" when omitted.
    # (Written as a comment: pydantic would publish a class docstring as the
    # model's JSON-schema description.)
    mode: Literal["auto", "skip", "default"] = Field(
        description="auto: Automatic mode; skip: Skip mode; default: Default mode",
        default="auto",
    )


class LoadMediaInput(BaseInput):
    # Adds nothing of its own; the only field is `mode`, inherited from BaseInput.
    ...

class SearchMediaInput(BaseInput):
    # Parameters for a stock-media search (the mode description names pexels).
    # Overrides the inherited `mode` only to give it a step-specific description.
    mode: Literal["auto", "skip", "default"] = Field(
        description="auto: Automatically search media from pexels; skip: skip search; default: skip search",
        default="auto",
    )
    # How many images / videos to fetch; the defaults ask for videos only.
    photo_number: Annotated[
        int,
        Field(description="The number of images the user wants to obtain", default=0),
    ]
    video_number: Annotated[
        int,
        Field(description="The number of videos the user wants to obtain", default=5),
    ]
    search_keyword: Annotated[
        str,
        Field(
            description="Keyword of the media the user wants to obtain. Only one keyword is allowed; multiple keywords are not permitted.",
            default="scenery",
        ),
    ]
    orientation: Literal["landscape", "portrait"] = Field(
        description="landscape: The screen is wider horizontally and narrower vertically, making it suitable for computer screens, landscape images, etc;portrait: The screen is higher vertically and narrower horizontally, making it suitable for mobile browsing and close-up shots of people.",
        default="landscape",
    )
    # Duration bounds are in seconds; nothing here enforces min <= max.
    min_video_duration: Annotated[
        int,
        Field(
            description="The shortest duration of footage requested by the user in seconds.",
            default=1,
        ),
    ]
    max_video_duration: Annotated[
        int,
        Field(
            description="The longest duration of footage requested by the user in seconds.",
            default=30,
        ),
    ]

class LoadMediaOutput(BaseModel):
    # Output of the media-loading step: a list of Media entries, empty when omitted.
    media: List[Media] = Field(description="List of media", default_factory=list)


class SplitShotsInput(BaseInput):
    # Settings for shot splitting; overrides the inherited `mode` only to give it
    # a step-specific description.
    mode: Literal["auto", "skip", "default"] = Field(
        description="auto: Automatically segment shots based on scene changes, treat images as single shots; skip: Do not segment shots; default: Use default segmentation method",
        default="auto",
    )
    # Both bounds are in milliseconds; nothing here enforces min <= max.
    min_shot_duration: Annotated[
        int,
        Field(
            description="Segmented shots must not be shorter than this duration (unit: milliseconds)",
            default=1000,
        ),
    ]
    max_shot_duration: Annotated[
        int,
        Field(
            description="If a single shot exceeds this duration, force segmentation (unit: milliseconds)",
            default=10000,
        ),
    ]

class SplitShotsOutput(BaseModel):
    # Output of shot splitting: the resulting clips plus a required string map.
    clip_captions: List[Clip] = Field(
        description="List of clips after splitting shots",
        default_factory=list,
    )
    # Required (no default). NOTE(review): the expected keys are not defined in this file.
    overall: Dict[str, str]


class UnderstandClipsInput(BaseModel):
    # Input for clip understanding: a single `mode` selector.
    # NOTE(review): derives from BaseModel rather than BaseInput, unlike most other
    # *Input models in this file — confirm whether that is intentional.
    mode: Literal["auto", "skip", "default"] = Field(
        description="auto: Generate descriptions based on media content; skip: Do not generate descriptions; default: Use default description generation method",
        default="auto",
    )

class UnderstandClipsOutput(BaseModel):
    # Output of clip understanding: the clips plus a required string map.
    clip_captions: List[Clip] = Field(
        description="List of clips after understanding clips",
        default_factory=list,
    )
    # Required (no default). NOTE(review): the expected keys are not defined in this file.
    overall: Dict[str, str]

class FilterClipsInput(BaseModel):
    # Input for clip filtering: a `mode` selector plus a free-text request.
    # NOTE(review): derives from BaseModel rather than BaseInput, unlike most other
    # *Input models in this file — confirm whether that is intentional.
    mode: Literal["auto", "skip", "default"] = Field(
        default="auto",
        description="auto: Filter clips based on user requirements; skip: Skip filtering; default: Use default filtering method"
    )
    # The default is declared once, inside Field, matching the other `user_request`
    # fields in this file (it used to be repeated as a trailing `= ""` assignment).
    user_request: Annotated[str, Field(default="", description="User's requirements for clip filtering; if none provided, formulate one based on media materials and other editing requirements.")]

class FilterClipsOutput(BaseModel):
    # Output of clip filtering: the remaining clips plus a required string map.
    clip_captions: List[Clip] = Field(default_factory=list, description="List of clips")
    # Required (no default). Declared once; an identical duplicate line was removed.
    overall: Dict[str, str]


class GroupClipsInput(BaseModel):
    # Input for clip grouping/ordering: a `mode` selector plus a free-text request.
    # NOTE(review): derives from BaseModel rather than BaseInput, unlike most other
    # *Input models in this file — confirm whether that is intentional.
    mode: Literal["auto", "skip", "default"] = Field(
        description="auto: Organize clips in a logical order based on narrative flow of media content and user's sequencing requirements; skip: Skip sorting; default: Use default ordering method",
        default="auto",
    )
    user_request: Annotated[
        str,
        Field(
            description="User's requirements for media organization order; if none provided, arrange in a logical narrative sequence following standard conventions.",
            default="",
        ),
    ]

class GroupClipsOutput(BaseModel):
    # Output of clip grouping: a list of GroupClips entries, empty when omitted.
    # NOTE(review): the description text says "List of clips" although the items
    # are groups — left untouched because it is part of the emitted schema.
    groups: List[GroupClips] = Field(
        description="List of clips",
        default_factory=list,
    )


class GenerateScriptInput(BaseModel):
    # Input for script generation: mode, a free-text request and an optional
    # caller-edited script.
    # NOTE(review): derives from BaseModel rather than BaseInput, unlike most other
    # *Input models in this file — confirm whether that is intentional.
    mode: Literal["auto", "skip", "default"] = Field(
        description="auto: Generate appropriate script based on media content and user's script requirements; skip: Skip, do not add subtitles; default: Use default script",
        default="auto",
    )
    user_request: Annotated[
        str,
        Field(description="User's requirements for the script.", default=""),
    ]
    # The literal `{}` default is kept as-is so the published schema default is unchanged.
    custom_script: Dict[str, Any] = Field(
        description="If user has specific character-level editing requirements for script/title, pass the edited custom script and title through this parameter. Format should be based on the original script generation output format but with the subtitle_units field removed. In this case, mode must use `auto`, other modes are prohibited",
        default={},
    )

class GenerateScriptOutput(BaseModel):
    # Output of script generation. Neither field declares a default.
    group_scripts: List[GroupScript]
    # Optional[str] with no default: under pydantic v2 the key must still be supplied,
    # though None is an accepted value.
    title: Optional[str]


class GenerateVoiceoverInput(BaseInput):
    # Input for voiceover generation; overrides the inherited `mode` only to give it
    # a step-specific description.
    mode: Literal["auto", "skip", "default"] = Field(
        description="auto: Generate appropriate voiceover based on media content and user's voice requirements; skip: Skip voiceover; default: Use default voiceover",
        default="auto",
    )
    user_request: Annotated[
        str,
        Field(description="User's requirements for voiceover.", default=""),
    ]

class RecommendScriptTemplateInput(BaseInput):
    # Input for picking a script (copywriting) template; overrides the inherited
    # `mode` only to give it a step-specific description.
    mode: Literal["auto", "skip", "default"] = Field(
        default="auto",
        description="auto: Select an appropriate copywriting template based on the material content and user's requirements for voiceover style; skip: Skip;"
    )
    user_request: Annotated[str, Field(default="", description="User's specific requirements for the script style.")]
    # Include filter: dimension name -> accepted values. Empty dict when omitted.
    filter_include: Annotated[
        Dict[str, List[str]],
        Field(
            description=(
                "Positive filter conditions. Multiple dimensions are combined with AND, "
                "multiple values within the same dimension are combined with OR.\n"
                "Supported dimensions:\n"
                "- tags: category, one or more of "
                "[Life, Food, Beauty, Entertainment, Travel, Tech, Business, Vehicle, Health, Family, Pets, Knowledge]"
            )
        )
    ] = {}
    # Exclude filter. The value type is spelled `List[str]`, matching filter_include;
    # the previous single-member `Union[str]` resolved to plain `str` anyway.
    # NOTE(review): the description lists an `id` dimension that filter_include does
    # not mention — confirm which dimensions the backend really accepts.
    filter_exclude: Annotated[
        Dict[str, List[str]],
        Field(
            description=(
                "Negative filter conditions. Items matching these conditions will be excluded. "
                "The semantics are the same as filter_include. "
                "Supported dimensions: tags, id."
            )
        )
    ] = {}


class GenerateVoiceoverOutput(BaseModel):
    # Output of voiceover generation: a list of Voiceover items, empty when omitted.
    voiceover: List[Voiceover] = Field(
        description="Voiceover list",
        default_factory=list,
    )


class SelectBGMInput(BaseInput):
    # Input for background-music selection; overrides the inherited `mode` only to
    # give it a step-specific description.
    mode: Literal["auto", "skip", "default"] = Field(
        description="auto: Select appropriate music based on media content and user's music requirements; skip: Do not use music; default: Use default music",
        default="auto",
    )
    user_request: Annotated[
        str,
        Field(description="User's requirements for background music.", default=""),
    ]
    # Include filter: dimension name -> accepted values (strings, or ints for ids).
    # Empty dict when omitted.
    filter_include: Annotated[
        Dict[str, List[Union[str, int]]],
        Field(description=(
            "Positive filter conditions. Multiple dimensions are combined with AND, "
            "multiple values within the same dimension are combined with OR.\n"
            "Supported dimensions:\n"
            "- mood: music emotion, one or more of "
            "[Dynamic, Chill, Happy, Sorrow, Romantic, Calm, Excited, Healing, Inspirational]\n"
            "- scene: usage scene, one or more of "
            "[Vlog, Travel, Relaxing, Emotion, Transition, Outdoor, Cafe, Evening, Scenery, Food, Date, Club]\n"
            "- genre: music genre, one or more of "
            "[Pop, BGM, Electronic, R&B/Soul, Hip Hop/Rap, Rock, Jazz, Folk, Classical, Chinese Style]\n"
            "- lang: lyric language, one or more of [bgm, en, zh, ko, ja]\n"
            "- id: specific music ids (int)"
        )),
    ] = {}
    # Exclude filter with the same shape as filter_include.
    filter_exclude: Annotated[
        Dict[str, List[Union[str, int]]],
        Field(description=(
            "Negative filter conditions. Items matching these conditions will be excluded. "
            "The semantics are the same as filter_include. "
            "Supported dimensions: mood, scene, genre, lang, id."
        )),
    ] = {}

class SelectBGMOutput(BaseModel):
    # Output of background-music selection: a list of BGM items, empty when omitted.
    bgm: List[BGM] = Field(
        description="BGM list",
        default_factory=list,
    )


class RecommendTransitionInput(BaseInput):
    # Input for transition recommendation; overrides the inherited `mode` only to
    # give it a step-specific description.
    mode: Literal["auto", "skip", "default"] = Field(
        description="auto: add fade in and fade out transitions at beginning and end; skip: Do not use transitions; default: Use default transitions",
        default="auto",
    )
    # Transition length in milliseconds; 1000 when omitted.
    duration: Annotated[
        int,
        Field(description="Duration of the transition in milliseconds", default=1000),
    ]

class RecommendTransitionOutput(BaseInput):
    # Declares no fields of its own.
    # NOTE(review): this *output* model derives from BaseInput and therefore carries
    # the inherited `mode` field — confirm whether BaseModel was intended.
    ...


class RecommendTextInput(BaseInput):
    # Input for subtitle font recommendation; overrides the inherited `mode` only to
    # give it a step-specific description.
    # NOTE(review): the mode description does not explain "skip", although the
    # Literal still accepts it — confirm the intended behaviour.
    mode: Literal["auto", "skip", "default"] = Field(
        default="auto",
        description="auto: Select appropriate font style and color based on user's subtitle font style requirements; default: Use default font",
    )
    user_request: Annotated[str, Field(default="", description="User's requirements for font style")]
    # Include filter: dimension name -> accepted values. Empty dict when omitted.
    filter_include: Annotated[
        Dict[str, List[Union[str, int]]],
        Field(
            description=(
                "Positive filter conditions. Multiple dimensions are combined with AND, "
                "multiple values within the same dimension are combined with OR.\n"
                "Supported dimensions:\n"
                # " of " added: the pieces used to join as "one or more[Creative, ...",
                # unlike the equivalent descriptions elsewhere in this file.
                "- class: Font type, one or more of "
                "[Creative, Handwriting, Calligraphy, Basic]\n"
            )
        )
    ] = {}

class RecommendTextOutput(BaseInput):
    # Declares no fields of its own.
    # NOTE(review): this *output* model derives from BaseInput and therefore carries
    # the inherited `mode` field — confirm whether BaseModel was intended.
    ...

class PlanTimelineInput(BaseInput):
    # Input for timeline planning: the inherited `mode` plus a beat-sync switch
    # that is on by default.
    use_beats: Annotated[
        bool,
        Field(
            description="Whether clip transitions should sync with BGM beats",
            default=True,
        ),
    ]

class PlanTimelineOutput(BaseModel):
    # Output of timeline planning: a list of TimelineTracks, empty when omitted.
    tracks: List[TimelineTracks] = Field(
        description="Timeline track collection",
        default_factory=list,
    )

class RenderVideoInput(BaseInput):
    # Rendering options: canvas, subtitle font and audio-mix settings, on top of the
    # `mode` field inherited from BaseInput. Every field has a default.
    # NOTE(review): ranges quoted in the descriptions (CRF 10–30, volume 0.0–3.0, ...)
    # are advisory text only; no ge/le constraint enforces them here.

    # canvas
    aspect_ratio: Annotated[str | None, Field(
        default=None,
        description="When explicitly specified, forces the canvas to one of 16:9, 4:3, 1:1, 3:4, 9:16. If unset, the system automatically infers the most suitable aspect ratio."
    )]
    # None by default; the description says 1080 is applied in that case.
    # NOTE(review): that fallback is not visible in this file — confirm in the renderer.
    output_max_dimension_px: Annotated[int | None, Field(
        default=None,
        description="Maximum output size in pixels (longest side); defaults to 1080 and works with the aspect ratio."
    )]
    clip_compose_mode: Annotated[str, Field(
        default="padding",
        description=(
            "How to fit media into the canvas: "
            "'padding' keeps aspect ratio and fills empty areas with a solid color; "
            "'crop' center-crops media to match the canvas aspect ratio."
        )
    )]
    # The tuple arm is a 3-item (R, G, B) tuple so it matches the default and the
    # description; it was `Tuple[int]`, which describes a one-element tuple.
    # Lists of ints and None are still accepted.
    bg_color: Annotated[Tuple[int, int, int] | List[int] | None, Field(
        default=(0, 0, 0),
        description="Background color for canvas padding, specified as an (R, G, B) tuple (no alpha channel)."
    )]
    crf: Annotated[int, Field(
        default=23,
        description="CRF value (10–30), lower = better quality, larger file"
    )]

    # font parameters
    font_color: Annotated[Tuple[int, int, int, int], Field(
        default=(255, 255, 255, 255),
        description="Font color, RGBA format (R, G, B, A), values range 0-255")
    ]
    font_size: Annotated[int, Field(
        default=40,
        description="Font size in pixels. Recommended range: 28–120."
    )]
    # Description now states the real default (270); it previously claimed 80.
    margin_bottom: Annotated[int, Field(
        default=270,
        description="Bottom margin for subtitles in pixels. Defaults to 270; valid range: 40–1040."
    )]
    stroke_width: Annotated[int, Field(
        default=2,
        description="Text stroke width (px), typically 0–8"
    )]
    stroke_color: Annotated[Tuple[int, int, int, int], Field(
        default=(0, 0, 0, 255),
        description="Text stroke color in RGBA format",
    )]

    # audio
    bgm_volume_scale: Annotated[float, Field(
        default=0.25,
        description="Background music volume multiplier, range 0.0–3.0 (1.0 = default volume)"
    )]
    tts_volume_scale: Annotated[float, Field(
        default=2.0,
        description="TTS volume multiplier, range 0.0–3.0 (1.0 = default volume)"
    )]
    include_video_audio: Annotated[bool, Field(
        default=False,
        description="Whether to include the original video audio track"
    )]