from typing import List, Dict


class SegmentSynchronizer:
    """Pair extracted video frames with the transcript text spoken during them."""

    def synchronize(
        self,
        frames: List[Dict],  # [{"timestamp": 0.0, "path": "...", "description": "..."}]
        transcript: List[Dict],  # [{"start": 0.0, "end": 3.2, "text": "..."}]
        *,
        default_interval: float = 2.0,
    ) -> List[Dict]:
        """Create unified segments combining visual and speech information.

        Each frame opens a time window that runs from its own timestamp to the
        timestamp of the next frame in the list. The last frame has no
        successor, so its window is ``default_interval`` seconds long. Frames
        are used in the order given; this method does not sort them.

        A transcript entry contributes its text to a segment when its
        ``[start, end]`` range overlaps the segment's window. The comparison
        is strict, so an entry that merely touches a window boundary is not
        included. Texts of all overlapping entries are joined with single
        spaces, in transcript order.

        Args:
            frames: Dicts with ``"timestamp"``, ``"path"`` and
                ``"description"`` keys.
            transcript: Dicts with ``"start"``, ``"end"`` and ``"text"`` keys.
            default_interval: Window length applied to the final frame.

        Returns:
            One dict per frame, for example::

                {
                    "start": 0.0,
                    "end": 2.0,
                    "frame_path": "/tmp/frame_001.jpg",
                    "visual": "Woman looking frustrated in kitchen",
                    "speech": "Tired of everyday exhaustion?",
                }

            ``"speech"`` is ``None`` when no transcript text overlaps the
            window (or when the overlapping text is empty after stripping).
        """
        segments = []
        last_index = len(frames) - 1

        for i, frame in enumerate(frames):
            timestamp = frame['timestamp']

            # The window closes where the next frame begins; the final frame
            # falls back to a fixed-length window.
            if i < last_index:
                end_time = frames[i + 1]['timestamp']
            else:
                end_time = timestamp + default_interval

            # Collect every transcript entry whose time range overlaps the
            # window, then join once instead of concatenating in the loop.
            speech_text = " ".join(
                t['text']
                for t in transcript
                if t['end'] > timestamp and t['start'] < end_time
            ).strip()

            segments.append({
                "start": timestamp,
                "end": end_time,
                "frame_path": frame['path'],
                "visual": frame['description'],
                "speech": speech_text if speech_text else None,
            })

        return segments