Spaces:

xTHExBEASTx
/

Whisper-Transcriber

Runtime error

Whisper Transcriber Bot committed on Dec 29, 2025

Commit

d33fc74

1 Parent(s): 72f1983

Group words into proper subtitle segments

- Add group_words_into_segments() to combine word-level timestamps
- Group by: sentence punctuation, max 12 words, max 7 seconds
- Update SRT and VTT formatters to use grouped segments
- Much easier to edit subtitles now

Files changed (1) hide show

utils/formatters.py +137 -13

utils/formatters.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import json
 from typing import Dict, List, Any, Optional
 from datetime import timedelta
 import logging
@@ -10,6 +11,97 @@ logger = logging.getLogger(__name__)
 class SubtitleFormatter:
     """Format transcription results into various subtitle formats"""
     @staticmethod
     def format_timestamp_srt(seconds: float) -> str:
         """
@@ -69,9 +161,12 @@ class SubtitleFormatter:
         srt_content = []
         chunks = result.get('chunks', [])
-        for idx, chunk in enumerate(chunks, 1):
-            timestamp = chunk.get('timestamp', (0, 0))
-            text = chunk.get('text', '').strip()
             if not text:
                 continue
@@ -79,10 +174,14 @@ class SubtitleFormatter:
             start_time = timestamp[0] if timestamp[0] is not None else 0.0
             end_time = timestamp[1] if timestamp[1] is not None else start_time + 1.0
-            # Add speaker label if available
-            if speaker_labels and idx - 1 in speaker_labels:
-                speaker = speaker_labels[idx - 1]
-                text = f"[{speaker}]: {text}"
             # Format: index, timestamp, text
             srt_content.append(f"{idx}")
@@ -95,6 +194,25 @@ class SubtitleFormatter:
         return "\n".join(srt_content)
     @staticmethod
     def to_vtt(result: Dict[str, Any], speaker_labels: Optional[Dict] = None) -> str:
         """
@@ -110,9 +228,12 @@ class SubtitleFormatter:
         vtt_content = ["WEBVTT", ""]
         chunks = result.get('chunks', [])
-        for idx, chunk in enumerate(chunks):
-            timestamp = chunk.get('timestamp', (0, 0))
-            text = chunk.get('text', '').strip()
             if not text:
                 continue
@@ -121,9 +242,12 @@ class SubtitleFormatter:
             end_time = timestamp[1] if timestamp[1] is not None else start_time + 1.0
             # Add speaker label if available
-            if speaker_labels and idx in speaker_labels:
-                speaker = speaker_labels[idx]
-                text = f"<v {speaker}>{text}</v>"
             # Format: timestamp, text
             vtt_content.append(

 import json
+import re
 from typing import Dict, List, Any, Optional
 from datetime import timedelta
 import logging
 class SubtitleFormatter:
     """Format transcription results into various subtitle formats"""
+    # Settings for grouping words into subtitle segments
+    MAX_WORDS_PER_SEGMENT = 12
+    MAX_DURATION_SECONDS = 7.0
+    MIN_DURATION_SECONDS = 1.0
+    @staticmethod
+    def group_words_into_segments(chunks: List[Dict]) -> List[Dict]:
+        """
+        Group word-level chunks into proper subtitle segments.
+        Groups by:
+        - Sentence-ending punctuation (. ! ? etc.)
+        - Maximum words per segment
+        - Maximum duration per segment
+        Args:
+            chunks: List of word-level chunks with timestamps
+        Returns:
+            List of grouped segments with combined text and timestamps
+        """
+        if not chunks:
+            return []
+        segments = []
+        current_segment = {
+            'words': [],
+            'start': None,
+            'end': None,
+            'text': ''
+        }
+        sentence_endings = re.compile(r'[.!?;:]$')
+        for chunk in chunks:
+            text = chunk.get('text', '').strip()
+            if not text:
+                continue
+            timestamp = chunk.get('timestamp', (None, None))
+            word_start = timestamp[0] if timestamp[0] is not None else 0.0
+            word_end = timestamp[1] if timestamp[1] is not None else word_start + 0.5
+            # Initialize segment start time
+            if current_segment['start'] is None:
+                current_segment['start'] = word_start
+            current_segment['words'].append(text)
+            current_segment['end'] = word_end
+            # Calculate current segment duration
+            duration = current_segment['end'] - current_segment['start']
+            word_count = len(current_segment['words'])
+            # Check if we should end the current segment
+            should_end_segment = (
+                sentence_endings.search(text) or  # Sentence ending punctuation
+                word_count >= SubtitleFormatter.MAX_WORDS_PER_SEGMENT or  # Max words reached
+                duration >= SubtitleFormatter.MAX_DURATION_SECONDS  # Max duration reached
+            )
+            if should_end_segment:
+                # Finalize current segment
+                current_segment['text'] = ' '.join(current_segment['words'])
+                # Clean up double spaces
+                current_segment['text'] = re.sub(r'\s+', ' ', current_segment['text']).strip()
+                segments.append({
+                    'timestamp': (current_segment['start'], current_segment['end']),
+                    'text': current_segment['text']
+                })
+                # Reset for next segment
+                current_segment = {
+                    'words': [],
+                    'start': None,
+                    'end': None,
+                    'text': ''
+                }
+        # Don't forget the last segment if there are remaining words
+        if current_segment['words']:
+            current_segment['text'] = ' '.join(current_segment['words'])
+            current_segment['text'] = re.sub(r'\s+', ' ', current_segment['text']).strip()
+            segments.append({
+                'timestamp': (current_segment['start'], current_segment['end']),
+                'text': current_segment['text']
+            })
+        return segments
     @staticmethod
     def format_timestamp_srt(seconds: float) -> str:
         """
         srt_content = []
         chunks = result.get('chunks', [])
+        # Group words into proper subtitle segments
+        segments = SubtitleFormatter.group_words_into_segments(chunks)
+        for idx, segment in enumerate(segments, 1):
+            timestamp = segment.get('timestamp', (0, 0))
+            text = segment.get('text', '').strip()
             if not text:
                 continue
             start_time = timestamp[0] if timestamp[0] is not None else 0.0
             end_time = timestamp[1] if timestamp[1] is not None else start_time + 1.0
+            # Add speaker label if available (simplified for grouped segments)
+            if speaker_labels:
+                # Find the most common speaker in this time range
+                speaker = SubtitleFormatter._get_speaker_for_segment(
+                    speaker_labels, chunks, start_time, end_time
+                )
+                if speaker:
+                    text = f"[{speaker}]: {text}"
             # Format: index, timestamp, text
             srt_content.append(f"{idx}")
         return "\n".join(srt_content)
+    @staticmethod
+    def _get_speaker_for_segment(
+        speaker_labels: Dict,
+        chunks: List[Dict],
+        start_time: float,
+        end_time: float
+    ) -> Optional[str]:
+        """Get the most common speaker for a time segment"""
+        speakers = []
+        for idx, chunk in enumerate(chunks):
+            ts = chunk.get('timestamp', (None, None))
+            if ts[0] is not None and start_time <= ts[0] <= end_time:
+                if idx in speaker_labels:
+                    speakers.append(speaker_labels[idx])
+        if speakers:
+            # Return most common speaker
+            return max(set(speakers), key=speakers.count)
+        return None
     @staticmethod
     def to_vtt(result: Dict[str, Any], speaker_labels: Optional[Dict] = None) -> str:
         """
         vtt_content = ["WEBVTT", ""]
         chunks = result.get('chunks', [])
+        # Group words into proper subtitle segments
+        segments = SubtitleFormatter.group_words_into_segments(chunks)
+        for idx, segment in enumerate(segments):
+            timestamp = segment.get('timestamp', (0, 0))
+            text = segment.get('text', '').strip()
             if not text:
                 continue
             end_time = timestamp[1] if timestamp[1] is not None else start_time + 1.0
             # Add speaker label if available
+            if speaker_labels:
+                speaker = SubtitleFormatter._get_speaker_for_segment(
+                    speaker_labels, chunks, start_time, end_time
+                )
+                if speaker:
+                    text = f"<v {speaker}>{text}</v>"
             # Format: timestamp, text
             vtt_content.append(