Rasta02 committed on
Commit
e53e238
·
verified ·
1 Parent(s): 1c865d3

Upload backend/subtitle_extractor.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. backend/subtitle_extractor.py +265 -0
backend/subtitle_extractor.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Subtitle Extractor Module
3
+ Extracts subtitles from videos using OCR and generates SRT files
4
+ """
5
+
6
+ import cv2
7
+ import sys
8
+ import os
9
+ from pathlib import Path
10
+ from collections import defaultdict
11
+
12
+ # Add backend to path
13
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
14
+
15
+ from backend.main import SubtitleDetect
16
+
17
+
18
class SubtitleExtractor:
    """Extract hard-coded subtitles from a video via OCR and generate SRT files.

    Subtitle-region detection is delegated to ``SubtitleDetect`` (imported from
    ``backend.main``); text recognition uses a lazily constructed PaddleOCR
    ``TextRecognizer``.
    """

    def __init__(self, video_path, sub_area=None):
        """
        Initialize subtitle extractor.

        Args:
            video_path: Path to video file.
            sub_area: Optional subtitle area (ymin, ymax, xmin, xmax).
        """
        self.video_path = video_path
        self.sub_area = sub_area
        self.detector = SubtitleDetect(video_path, sub_area)

        # The capture stays open for seeking during extraction; release it
        # with close() (also attempted from __del__) so the underlying OS
        # handle is not leaked.
        self.video_cap = cv2.VideoCapture(video_path)
        self.fps = self.video_cap.get(cv2.CAP_PROP_FPS)
        self.frame_count = int(self.video_cap.get(cv2.CAP_PROP_FRAME_COUNT))

    def close(self):
        """Release the OpenCV video capture. Safe to call more than once."""
        cap = getattr(self, 'video_cap', None)
        if cap is not None:
            cap.release()
            self.video_cap = None

    def __del__(self):
        # Best-effort cleanup; a finalizer must never raise.
        try:
            self.close()
        except Exception:
            pass

    @property
    def text_recognizer(self):
        """Lazily build and cache the PaddleOCR text recognizer."""
        if not hasattr(self, '_text_recognizer'):
            import paddle
            paddle.disable_signal_handler()
            from paddleocr.tools.infer import utility
            from paddleocr.tools.infer.predict_rec import TextRecognizer
            import importlib
            import config
            # Reload so config edits made after process start are honored.
            importlib.reload(config)

            args = utility.parse_args()
            args.rec_algorithm = 'CRNN'
            # Prefer an explicit REC_MODEL_PATH; otherwise fall back to the
            # conventional ch_rec directory under the detection model base.
            args.rec_model_dir = config.REC_MODEL_PATH if hasattr(config, 'REC_MODEL_PATH') else os.path.join(config.DET_MODEL_BASE, config.MODEL_VERSION, 'ch_rec')
            args.use_onnx = len(config.ONNX_PROVIDERS) > 0
            args.onnx_providers = config.ONNX_PROVIDERS

            self._text_recognizer = TextRecognizer(args)
        return self._text_recognizer

    def extract_text_from_frame(self, frame, boxes):
        """
        Extract text from a frame using OCR.

        Args:
            frame: Video frame (numpy array).
            boxes: List of detected text boxes [(xmin, xmax, ymin, ymax), ...].

        Returns:
            List of extracted text strings (only recognitions with
            confidence above 0.5 are kept).
        """
        texts = []

        for box in boxes:
            xmin, xmax, ymin, ymax = box

            # Crop the text region; a degenerate/out-of-range box yields an
            # empty array and is skipped.
            text_region = frame[ymin:ymax, xmin:xmax]
            if text_region.size == 0:
                continue

            try:
                # Run OCR on the cropped region.
                rec_result, _ = self.text_recognizer([text_region])
                if rec_result and len(rec_result) > 0:
                    text, confidence = rec_result[0]
                    # Only accept if confidence > 50% to reduce garbage text.
                    if confidence > 0.5:
                        texts.append(text)
            except Exception as e:
                # An OCR failure on one box must not abort the whole frame.
                print(f"Warning: OCR failed for box {box}: {e}")
                continue

        return texts

    def format_timestamp(self, seconds):
        """
        Convert seconds to SRT timestamp format (HH:MM:SS,mmm).

        Args:
            seconds: Time in seconds (float).

        Returns:
            Formatted timestamp string.
        """
        # Round to the nearest millisecond first so e.g. 1.9996 s becomes
        # 00:00:02,000 instead of truncating to 00:00:01,999.
        total_millis = int(round(seconds * 1000))
        total_secs, millis = divmod(total_millis, 1000)
        hours, rem = divmod(total_secs, 3600)
        minutes, secs = divmod(rem, 60)

        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    def extract_subtitles(self, progress_callback=None):
        """
        Extract subtitles with OCR and timestamps.

        Args:
            progress_callback: Optional callable(progress: float, message: str)
                invoked after each processed subtitle range.

        Returns:
            List of subtitle dictionaries with 'start', 'end', 'text' keys
            (times in seconds).

        Raises:
            ValueError: If the video reports a non-positive FPS (timestamps
                would be undefined).
        """
        print("[Subtitle Extractor] Starting subtitle extraction...")

        # Guard: a broken container can report fps == 0, which would raise a
        # bare ZeroDivisionError below — fail with a clear message instead.
        if not self.fps or self.fps <= 0:
            raise ValueError(f"Invalid FPS ({self.fps}) for video: {self.video_path}")

        # Detect subtitle regions per frame.
        subtitle_frame_dict = self.detector.find_subtitle_frame_no()

        if not subtitle_frame_dict:
            print("[Subtitle Extractor] No subtitles detected!")
            return []

        print(f"[Subtitle Extractor] Found subtitles in {len(subtitle_frame_dict)} frames")

        # Group continuous frames with same text.
        subtitles = []
        current_subtitle = None

        # Reset video capture to the beginning before seeking.
        self.video_cap.set(cv2.CAP_PROP_POS_FRAMES, 0)

        # Find continuous ranges of frames sharing the same subtitle mask.
        continuous_ranges = self.detector.find_continuous_ranges_with_same_mask(subtitle_frame_dict)

        for start_frame, end_frame in continuous_ranges:
            # Seek to the first frame of the range (frame numbers are 1-based).
            self.video_cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame - 1)
            ret, frame = self.video_cap.read()

            if not ret:
                continue

            # OCR only the boxes detected for the range's first frame — the
            # range shares one mask, so one frame is representative.
            boxes = subtitle_frame_dict.get(start_frame, [])
            texts = self.extract_text_from_frame(frame, boxes)
            combined_text = " ".join(texts).strip()

            if not combined_text:
                continue

            # Calculate timestamps from 1-based frame numbers.
            start_time = (start_frame - 1) / self.fps
            end_time = end_frame / self.fps

            # Merge with the previous segment when the text is identical and
            # the gap is under one second (handles detection flicker).
            if (current_subtitle and
                    current_subtitle['text'] == combined_text and
                    abs(start_time - current_subtitle['end']) < 1.0):
                # Extend the previous segment's end time.
                current_subtitle['end'] = end_time
            else:
                # Flush the previous segment, then start a new one.
                if current_subtitle:
                    subtitles.append(current_subtitle)

                current_subtitle = {
                    'start': start_time,
                    'end': end_time,
                    'text': combined_text
                }

            if progress_callback:
                progress = end_frame / self.frame_count
                # +1 counts the in-progress segment not yet flushed to the list.
                progress_callback(progress, f"Extracting subtitles: {len(subtitles)+1} found")

        # Flush the last pending segment.
        if current_subtitle:
            subtitles.append(current_subtitle)

        print(f"[Subtitle Extractor] Extracted {len(subtitles)} subtitle segments")
        return subtitles

    def generate_srt(self, subtitles, output_path):
        """
        Generate an SRT file from subtitles.

        Args:
            subtitles: List of subtitle dictionaries ('start', 'end', 'text').
            output_path: Path to save SRT file.

        Returns:
            Path to the generated SRT file (same as output_path).
        """
        print(f"[Subtitle Extractor] Generating SRT file: {output_path}")

        with open(output_path, 'w', encoding='utf-8') as f:
            for i, sub in enumerate(subtitles, 1):
                # SRT entry: index, "start --> end" line, text, blank line.
                f.write(f"{i}\n")

                start_ts = self.format_timestamp(sub['start'])
                end_ts = self.format_timestamp(sub['end'])
                f.write(f"{start_ts} --> {end_ts}\n")

                f.write(f"{sub['text']}\n")
                f.write("\n")

        print(f"[Subtitle Extractor] SRT file saved: {output_path}")
        return output_path

    def extract_to_srt(self, output_path=None, progress_callback=None):
        """
        Complete extraction pipeline: detect -> OCR -> generate SRT.

        Args:
            output_path: Optional custom output path for the SRT file;
                defaults to "<video_name>_subtitles.srt" beside the video.
            progress_callback: Optional callback for progress updates.

        Returns:
            Path (str) to the generated SRT file.
        """
        # Default output path: next to the input video.
        if output_path is None:
            video_name = Path(self.video_path).stem
            output_dir = Path(self.video_path).parent
            output_path = output_dir / f"{video_name}_subtitles.srt"

        subtitles = self.extract_subtitles(progress_callback)

        if not subtitles:
            # Write a placeholder file so callers always get a path that exists.
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write("# No subtitles detected\n")
            return str(output_path)

        return self.generate_srt(subtitles, str(output_path))
255
+
256
if __name__ == '__main__':
    import sys

    # Minimal CLI: exactly one positional argument naming the input video.
    if len(sys.argv) < 2:
        print("Usage: python subtitle_extractor.py <video_path>")
        sys.exit(1)

    input_video = sys.argv[1]
    srt_path = SubtitleExtractor(input_video).extract_to_srt()
    print(f"Subtitles extracted to: {srt_path}")