Manim-Agent / backend /services /sync_engine_logic.py
github-actions[bot]
[API] Cuong2004/Manim-Agent @ 1d7c417 (run 25583057312)
9bed109
import logging
from typing import Any, cast
from shared.schemas.planner_output import PlannerOutput
from shared.schemas.voice_segments import VoiceSegmentTimestamps
logger = logging.getLogger(__name__)
def validate_sync_duration(
video_duration: float, audio_duration: float, threshold: float = 0.5
) -> dict[str, Any]:
"""
Checks if the generated video duration matches the expected audio duration.
"""
if video_duration is None or audio_duration is None:
return {
"video_duration": video_duration,
"audio_duration": audio_duration,
"diff": 0,
"sync_issue": False,
"error": "Missing duration data",
}
diff = abs(video_duration - audio_duration)
is_issue = diff > threshold
return {
"video_duration": round(video_duration, 3),
"audio_duration": round(audio_duration, 3),
"diff": round(diff, 3),
"sync_issue": is_issue,
"threshold": threshold,
}
def align_beats_to_audio(plan: PlannerOutput, timestamps: VoiceSegmentTimestamps) -> dict[str, Any]:
"""
Core sync logic: Maps narrative beats from the execution plan to physical audio timestamps.
Returns a structured 'sync_segments' dictionary.
"""
segments = []
# Version 2 voice timestamps provide paragraph-level 'segments'
ts_segments = timestamps.segments
for i, beat in enumerate(plan.beats):
if i < len(ts_segments):
ts = ts_segments[i]
segments.append(
{
"step_label": beat.step_label,
"start": ts.start,
"end": ts.end,
"duration": round(ts.end - ts.start, 3),
"text_ref": ts.text,
}
)
else:
# Fallback if plan has more beats than voice segments
prev_end = cast(float, segments[-1]["end"]) if segments else 0.0
segments.append(
{
"step_label": beat.step_label,
"start": prev_end,
"end": prev_end + 2.0,
"duration": 2.0,
"text_ref": "[fallback/unvoiced]",
}
)
return {
"version": "1",
"beats": segments,
"total_audio_duration": round(ts_segments[-1].end if ts_segments else 0.0, 3),
}