|
|
from typing import List, Dict |
|
|
|
|
|
|
|
|
class SegmentSynchronizer:
    """Aligns extracted video frames with transcript speech segments."""

    def synchronize(
        self,
        frames: List[Dict],
        transcript: List[Dict],
        default_last_duration: float = 2.0,
    ) -> List[Dict]:
        """
        Create unified segments combining visual descriptions and speech.

        Each frame defines a segment spanning from its timestamp to the
        next frame's timestamp; the final frame's segment lasts
        ``default_last_duration`` seconds, since no following frame bounds
        it. Every transcript entry whose time interval overlaps a
        segment's window contributes its text to that segment's speech.

        Args:
            frames: Frame dicts with 'timestamp', 'path', and
                'description' keys — presumably sorted by timestamp
                (TODO confirm with caller; out-of-order frames would
                produce inverted segment windows).
            transcript: Speech dicts with 'start', 'end', and 'text' keys.
            default_last_duration: Duration in seconds assigned to the
                final frame's segment.

        Returns:
            List of synchronized segments:
            [
                {
                    "start": 0.0,
                    "end": 2.0,
                    "frame_path": "/tmp/frame_001.jpg",
                    "visual": "Woman looking frustrated in kitchen",
                    "speech": "Tired of everyday exhaustion?"
                },
                ...
            ]
            "speech" is None when no transcript entry overlaps the
            segment's window.
        """
        segments = []

        for i, frame in enumerate(frames):
            start_time = frame['timestamp']

            # A segment ends where the next frame begins; the last frame
            # gets a fixed fallback duration since nothing bounds it.
            if i < len(frames) - 1:
                end_time = frames[i + 1]['timestamp']
            else:
                end_time = start_time + default_last_duration

            # Collect text from every transcript entry overlapping
            # (start_time, end_time). Strict inequalities exclude entries
            # that merely touch a boundary. Joining once avoids the
            # quadratic cost of repeated string concatenation.
            overlapping = [
                t['text']
                for t in transcript
                if t['end'] > start_time and t['start'] < end_time
            ]
            speech_text = " ".join(overlapping).strip()

            segments.append({
                "start": start_time,
                "end": end_time,
                "frame_path": frame['path'],
                "visual": frame['description'],
                "speech": speech_text if speech_text else None,
            })

        return segments
|
|
|