Whisper Transcriber Bot committed on
Commit
d33fc74
·
1 Parent(s): 72f1983

Group words into proper subtitle segments

Browse files

- Add group_words_into_segments() to combine word-level timestamps
- Group by: sentence punctuation, max 12 words, max 7 seconds
- Update SRT and VTT formatters to use grouped segments
- Much easier to edit subtitles now

Files changed (1) hide show
  1. utils/formatters.py +137 -13
utils/formatters.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
  from typing import Dict, List, Any, Optional
3
  from datetime import timedelta
4
  import logging
@@ -10,6 +11,97 @@ logger = logging.getLogger(__name__)
10
  class SubtitleFormatter:
11
  """Format transcription results into various subtitle formats"""
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  @staticmethod
14
  def format_timestamp_srt(seconds: float) -> str:
15
  """
@@ -69,9 +161,12 @@ class SubtitleFormatter:
69
  srt_content = []
70
  chunks = result.get('chunks', [])
71
 
72
- for idx, chunk in enumerate(chunks, 1):
73
- timestamp = chunk.get('timestamp', (0, 0))
74
- text = chunk.get('text', '').strip()
 
 
 
75
 
76
  if not text:
77
  continue
@@ -79,10 +174,14 @@ class SubtitleFormatter:
79
  start_time = timestamp[0] if timestamp[0] is not None else 0.0
80
  end_time = timestamp[1] if timestamp[1] is not None else start_time + 1.0
81
 
82
- # Add speaker label if available
83
- if speaker_labels and idx - 1 in speaker_labels:
84
- speaker = speaker_labels[idx - 1]
85
- text = f"[{speaker}]: {text}"
 
 
 
 
86
 
87
  # Format: index, timestamp, text
88
  srt_content.append(f"{idx}")
@@ -95,6 +194,25 @@ class SubtitleFormatter:
95
 
96
  return "\n".join(srt_content)
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  @staticmethod
99
  def to_vtt(result: Dict[str, Any], speaker_labels: Optional[Dict] = None) -> str:
100
  """
@@ -110,9 +228,12 @@ class SubtitleFormatter:
110
  vtt_content = ["WEBVTT", ""]
111
  chunks = result.get('chunks', [])
112
 
113
- for idx, chunk in enumerate(chunks):
114
- timestamp = chunk.get('timestamp', (0, 0))
115
- text = chunk.get('text', '').strip()
 
 
 
116
 
117
  if not text:
118
  continue
@@ -121,9 +242,12 @@ class SubtitleFormatter:
121
  end_time = timestamp[1] if timestamp[1] is not None else start_time + 1.0
122
 
123
  # Add speaker label if available
124
- if speaker_labels and idx in speaker_labels:
125
- speaker = speaker_labels[idx]
126
- text = f"<v {speaker}>{text}</v>"
 
 
 
127
 
128
  # Format: timestamp, text
129
  vtt_content.append(
 
1
  import json
2
+ import re
3
  from typing import Dict, List, Any, Optional
4
  from datetime import timedelta
5
  import logging
 
11
  class SubtitleFormatter:
12
  """Format transcription results into various subtitle formats"""
13
 
14
+ # Settings for grouping words into subtitle segments
15
+ MAX_WORDS_PER_SEGMENT = 12
16
+ MAX_DURATION_SECONDS = 7.0
17
+ MIN_DURATION_SECONDS = 1.0
18
+
19
+ @staticmethod
20
+ def group_words_into_segments(chunks: List[Dict]) -> List[Dict]:
21
+ """
22
+ Group word-level chunks into proper subtitle segments.
23
+
24
+ Groups by:
25
+ - Sentence-ending punctuation (. ! ? etc.)
26
+ - Maximum words per segment
27
+ - Maximum duration per segment
28
+
29
+ Args:
30
+ chunks: List of word-level chunks with timestamps
31
+
32
+ Returns:
33
+ List of grouped segments with combined text and timestamps
34
+ """
35
+ if not chunks:
36
+ return []
37
+
38
+ segments = []
39
+ current_segment = {
40
+ 'words': [],
41
+ 'start': None,
42
+ 'end': None,
43
+ 'text': ''
44
+ }
45
+
46
+ sentence_endings = re.compile(r'[.!?;:]$')
47
+
48
+ for chunk in chunks:
49
+ text = chunk.get('text', '').strip()
50
+ if not text:
51
+ continue
52
+
53
+ timestamp = chunk.get('timestamp', (None, None))
54
+ word_start = timestamp[0] if timestamp[0] is not None else 0.0
55
+ word_end = timestamp[1] if timestamp[1] is not None else word_start + 0.5
56
+
57
+ # Initialize segment start time
58
+ if current_segment['start'] is None:
59
+ current_segment['start'] = word_start
60
+
61
+ current_segment['words'].append(text)
62
+ current_segment['end'] = word_end
63
+
64
+ # Calculate current segment duration
65
+ duration = current_segment['end'] - current_segment['start']
66
+ word_count = len(current_segment['words'])
67
+
68
+ # Check if we should end the current segment
69
+ should_end_segment = (
70
+ sentence_endings.search(text) or # Sentence ending punctuation
71
+ word_count >= SubtitleFormatter.MAX_WORDS_PER_SEGMENT or # Max words reached
72
+ duration >= SubtitleFormatter.MAX_DURATION_SECONDS # Max duration reached
73
+ )
74
+
75
+ if should_end_segment:
76
+ # Finalize current segment
77
+ current_segment['text'] = ' '.join(current_segment['words'])
78
+ # Clean up double spaces
79
+ current_segment['text'] = re.sub(r'\s+', ' ', current_segment['text']).strip()
80
+
81
+ segments.append({
82
+ 'timestamp': (current_segment['start'], current_segment['end']),
83
+ 'text': current_segment['text']
84
+ })
85
+
86
+ # Reset for next segment
87
+ current_segment = {
88
+ 'words': [],
89
+ 'start': None,
90
+ 'end': None,
91
+ 'text': ''
92
+ }
93
+
94
+ # Don't forget the last segment if there are remaining words
95
+ if current_segment['words']:
96
+ current_segment['text'] = ' '.join(current_segment['words'])
97
+ current_segment['text'] = re.sub(r'\s+', ' ', current_segment['text']).strip()
98
+ segments.append({
99
+ 'timestamp': (current_segment['start'], current_segment['end']),
100
+ 'text': current_segment['text']
101
+ })
102
+
103
+ return segments
104
+
105
  @staticmethod
106
  def format_timestamp_srt(seconds: float) -> str:
107
  """
 
161
  srt_content = []
162
  chunks = result.get('chunks', [])
163
 
164
+ # Group words into proper subtitle segments
165
+ segments = SubtitleFormatter.group_words_into_segments(chunks)
166
+
167
+ for idx, segment in enumerate(segments, 1):
168
+ timestamp = segment.get('timestamp', (0, 0))
169
+ text = segment.get('text', '').strip()
170
 
171
  if not text:
172
  continue
 
174
  start_time = timestamp[0] if timestamp[0] is not None else 0.0
175
  end_time = timestamp[1] if timestamp[1] is not None else start_time + 1.0
176
 
177
+ # Add speaker label if available (simplified for grouped segments)
178
+ if speaker_labels:
179
+ # Find the most common speaker in this time range
180
+ speaker = SubtitleFormatter._get_speaker_for_segment(
181
+ speaker_labels, chunks, start_time, end_time
182
+ )
183
+ if speaker:
184
+ text = f"[{speaker}]: {text}"
185
 
186
  # Format: index, timestamp, text
187
  srt_content.append(f"{idx}")
 
194
 
195
  return "\n".join(srt_content)
196
 
197
+ @staticmethod
198
+ def _get_speaker_for_segment(
199
+ speaker_labels: Dict,
200
+ chunks: List[Dict],
201
+ start_time: float,
202
+ end_time: float
203
+ ) -> Optional[str]:
204
+ """Get the most common speaker for a time segment"""
205
+ speakers = []
206
+ for idx, chunk in enumerate(chunks):
207
+ ts = chunk.get('timestamp', (None, None))
208
+ if ts[0] is not None and start_time <= ts[0] <= end_time:
209
+ if idx in speaker_labels:
210
+ speakers.append(speaker_labels[idx])
211
+ if speakers:
212
+ # Return most common speaker
213
+ return max(set(speakers), key=speakers.count)
214
+ return None
215
+
216
  @staticmethod
217
  def to_vtt(result: Dict[str, Any], speaker_labels: Optional[Dict] = None) -> str:
218
  """
 
228
  vtt_content = ["WEBVTT", ""]
229
  chunks = result.get('chunks', [])
230
 
231
+ # Group words into proper subtitle segments
232
+ segments = SubtitleFormatter.group_words_into_segments(chunks)
233
+
234
+ for idx, segment in enumerate(segments):
235
+ timestamp = segment.get('timestamp', (0, 0))
236
+ text = segment.get('text', '').strip()
237
 
238
  if not text:
239
  continue
 
242
  end_time = timestamp[1] if timestamp[1] is not None else start_time + 1.0
243
 
244
  # Add speaker label if available
245
+ if speaker_labels:
246
+ speaker = SubtitleFormatter._get_speaker_for_segment(
247
+ speaker_labels, chunks, start_time, end_time
248
+ )
249
+ if speaker:
250
+ text = f"<v {speaker}>{text}</v>"
251
 
252
  # Format: timestamp, text
253
  vtt_content.append(