File size: 1,760 Bytes
6bdfadc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
from typing import List, Dict
class SegmentSynchronizer:
    """Combine sampled video frames with the transcript text that overlaps them."""

    def synchronize(
        self,
        frames: List[Dict],  # [{"timestamp": 0.0, "path": "...", "description": "..."}]
        transcript: List[Dict],  # [{"start": 0.0, "end": 3.2, "text": "..."}]
        *,
        default_interval: float = 2.0,
    ) -> List[Dict]:
        """
        Create unified segments with visual + speech.

        Each frame becomes one segment. A segment starts at the frame's
        timestamp and ends at the timestamp of the next frame in the list;
        the last frame has no successor, so it ends ``default_interval``
        after its own timestamp. Frames are processed in the order given
        (they are not sorted here).

        Args:
            frames: Frame dicts with "timestamp", "path" and "description" keys.
            transcript: Speech dicts with "start", "end" and "text" keys.
            default_interval: Duration assigned to the final frame's segment.
                Keyword-only; defaults to 2.0, the previously hard-coded value.

        Returns:
            List of synchronized segments:
            [
                {
                    "start": 0.0,
                    "end": 2.0,
                    "frame_path": "/tmp/frame_001.jpg",
                    "visual": "Woman looking frustrated in kitchen",
                    "speech": "Tired of everyday exhaustion?"
                },
                ...
            ]
            "speech" is None when the overlapping text is empty after
            stripping whitespace (including when nothing overlaps).

        Raises:
            KeyError: If a frame or transcript dict lacks an expected key.
        """
        segments = []
        last_index = len(frames) - 1
        for i, frame in enumerate(frames):
            start_time = frame['timestamp']

            # Segment runs until the next frame, or for the default interval
            # when this is the final frame.
            if i < last_index:
                end_time = frames[i + 1]['timestamp']
            else:
                end_time = start_time + default_interval

            # Strict inequalities: an utterance that merely touches a window
            # boundary is not counted as overlapping that window.
            overlapping = [
                entry['text']
                for entry in transcript
                if entry['end'] > start_time and entry['start'] < end_time
            ]
            speech_text = " ".join(overlapping).strip()

            segments.append({
                "start": start_time,
                "end": end_time,
                "frame_path": frame['path'],
                "visual": frame['description'],
                "speech": speech_text or None,
            })
        return segments
|