dataku / backend /subtitle_extractor.py

Upload backend/subtitle_extractor.py with huggingface_hub

e53e238 verified 2 months ago

9.05 kB

	"""
	Subtitle Extractor Module
	Extracts subtitles from videos using OCR and generates SRT files
	"""

	import cv2
	import sys
	import os
	from pathlib import Path
	from collections import defaultdict

	# Add backend to path
	sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

	from backend.main import SubtitleDetect


	class SubtitleExtractor:
	    """Extract subtitles from a video via OCR and write them as an SRT file.

	    Locating subtitle boxes is delegated to ``SubtitleDetect``; this class
	    runs text recognition on those boxes, merges adjacent ranges that carry
	    the same text and serialises the result.

	    The instance owns an OpenCV capture handle. Call :meth:`close` (or use
	    the instance as a context manager) to release it when finished.
	    """

	    # Recognition results are kept only when their confidence exceeds this.
	    MIN_CONFIDENCE = 0.5
	    # Ranges with identical text are merged when the gap between the end of
	    # one and the start of the next is below this many seconds.
	    MERGE_GAP_SECONDS = 1.0

	    def __init__(self, video_path, sub_area=None):
	        """
	        Initialize subtitle extractor

	        Args:
	            video_path: Path to video file
	            sub_area: Optional subtitle area (ymin, ymax, xmin, xmax)
	        """
	        self.video_path = video_path
	        self.sub_area = sub_area
	        self.detector = SubtitleDetect(video_path, sub_area)

	        # Video properties used for seeking, timestamps and progress.
	        self.video_cap = cv2.VideoCapture(video_path)
	        self.fps = self.video_cap.get(cv2.CAP_PROP_FPS)
	        self.frame_count = int(self.video_cap.get(cv2.CAP_PROP_FRAME_COUNT))

	    def close(self):
	        """Release the video capture handle held by this extractor.

	        Safe to call on a partially constructed instance. After closing,
	        frame reads fail, so :meth:`extract_subtitles` yields no text.
	        """
	        capture = getattr(self, 'video_cap', None)
	        if capture is not None:
	            capture.release()

	    def __enter__(self):
	        """Return the extractor itself so it can be used in a ``with`` block."""
	        return self

	    def __exit__(self, exc_type, exc_value, traceback):
	        """Release the capture on leaving the ``with`` block; never suppresses errors."""
	        self.close()
	        return False

	    @property
	    def text_recognizer(self):
	        """Lazy load PaddleOCR text recognizer

	        The recognizer is built on first access and cached on the instance
	        as ``_text_recognizer``; later accesses return the cached object.
	        """
	        if not hasattr(self, '_text_recognizer'):
	            # Heavy imports are deferred until OCR is actually needed.
	            import paddle
	            paddle.disable_signal_handler()
	            from paddleocr.tools.infer import utility
	            from paddleocr.tools.infer.predict_rec import TextRecognizer
	            import importlib
	            import config
	            # Reload so the current contents of the config module are used.
	            importlib.reload(config)

	            # NOTE(review): parse_args() presumably reads sys.argv - confirm
	            # that extra command-line arguments (e.g. the video path passed
	            # to this script) do not make it exit.
	            args = utility.parse_args()
	            args.rec_algorithm = 'CRNN'
	            if hasattr(config, 'REC_MODEL_PATH'):
	                args.rec_model_dir = config.REC_MODEL_PATH
	            else:
	                # Fall back to the recognition model next to the detection models.
	                args.rec_model_dir = os.path.join(
	                    config.DET_MODEL_BASE, config.MODEL_VERSION, 'ch_rec'
	                )
	            args.use_onnx = len(config.ONNX_PROVIDERS) > 0
	            args.onnx_providers = config.ONNX_PROVIDERS

	            self._text_recognizer = TextRecognizer(args)
	        return self._text_recognizer

	    def extract_text_from_frame(self, frame, boxes):
	        """
	        Extract text from frame using OCR

	        Each box is cropped from the frame and passed to the recognizer on
	        its own. Empty crops, failed recognitions and results whose
	        confidence is not above ``MIN_CONFIDENCE`` are skipped.

	        Args:
	            frame: Video frame (numpy array)
	            boxes: List of detected text boxes [(xmin, xmax, ymin, ymax), ...]

	        Returns:
	            List of extracted text strings
	        """
	        texts = []

	        for box in boxes:
	            xmin, xmax, ymin, ymax = box

	            # Negative indices would wrap around in array slicing and crop
	            # the wrong part of the frame, so clamp them to the frame origin.
	            left, right = max(xmin, 0), max(xmax, 0)
	            top, bottom = max(ymin, 0), max(ymax, 0)

	            text_region = frame[top:bottom, left:right]
	            if text_region.size == 0:
	                continue

	            try:
	                rec_result, _ = self.text_recognizer([text_region])
	            except Exception as e:
	                # Best effort: one unreadable box must not abort the frame.
	                print(f"Warning: OCR failed for box {box}: {e}")
	                continue

	            if not rec_result:
	                continue

	            text, confidence = rec_result[0]
	            if confidence > self.MIN_CONFIDENCE:
	                texts.append(text)

	        return texts

	    def format_timestamp(self, seconds):
	        """
	        Convert seconds to SRT timestamp format (HH:MM:SS,mmm)

	        The value is rounded to the nearest millisecond once and then split
	        into fields, so float representation error cannot drop a
	        millisecond (1.001 -> ``00:00:01,001``) and rounding carries into
	        the seconds/minutes/hours fields. Negative input is clamped to zero.

	        Args:
	            seconds: Time in seconds (float)

	        Returns:
	            Formatted timestamp string
	        """
	        total_millis = max(0, int(round(seconds * 1000)))
	        total_secs, millis = divmod(total_millis, 1000)
	        total_minutes, secs = divmod(total_secs, 60)
	        hours, minutes = divmod(total_minutes, 60)

	        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

	    def extract_subtitles(self, progress_callback=None):
	        """
	        Extract subtitles with OCR and timestamps

	        The detector supplies the frames containing subtitle boxes and the
	        continuous frame ranges. OCR is run on the first frame of each range
	        only; ranges with identical text that are less than
	        ``MERGE_GAP_SECONDS`` apart are merged into one entry.

	        Args:
	            progress_callback: Optional callback function for progress updates,
	                called as ``progress_callback(fraction, message)`` after each
	                range that produced text

	        Returns:
	            List of subtitle dictionaries with 'start', 'end', 'text' keys
	            (times in seconds)

	        Raises:
	            ZeroDivisionError: if subtitle ranges were found but the capture
	                reported an fps of 0.
	        """
	        print("[Subtitle Extractor] Starting subtitle extraction...")

	        # Detect subtitle regions
	        subtitle_frame_dict = self.detector.find_subtitle_frame_no()

	        if not subtitle_frame_dict:
	            print("[Subtitle Extractor] No subtitles detected!")
	            return []

	        print(f"[Subtitle Extractor] Found subtitles in {len(subtitle_frame_dict)} frames")

	        subtitles = []
	        current_subtitle = None

	        continuous_ranges = self.detector.find_continuous_ranges_with_same_mask(subtitle_frame_dict)

	        for start_frame, end_frame in continuous_ranges:
	            # The code treats detector frame numbers as 1-based, hence the
	            # "- 1"; clamp so a frame number of 0 cannot yield a negative
	            # seek position or start time.
	            first_index = max(start_frame - 1, 0)
	            self.video_cap.set(cv2.CAP_PROP_POS_FRAMES, first_index)
	            ret, frame = self.video_cap.read()

	            if not ret:
	                continue

	            # Boxes are looked up for the first frame of the range.
	            boxes = subtitle_frame_dict.get(start_frame, [])

	            texts = self.extract_text_from_frame(frame, boxes)
	            combined_text = " ".join(texts).strip()

	            if not combined_text:
	                continue

	            start_time = first_index / self.fps
	            end_time = end_frame / self.fps

	            same_as_previous = (
	                current_subtitle is not None
	                and current_subtitle['text'] == combined_text
	                and abs(start_time - current_subtitle['end']) < self.MERGE_GAP_SECONDS
	            )
	            if same_as_previous:
	                # Same line continues: just extend the end time.
	                current_subtitle['end'] = end_time
	            else:
	                if current_subtitle:
	                    subtitles.append(current_subtitle)
	                current_subtitle = {
	                    'start': start_time,
	                    'end': end_time,
	                    'text': combined_text
	                }

	            if progress_callback:
	                # Some containers report a frame count of 0; avoid dividing by it.
	                progress = end_frame / self.frame_count if self.frame_count > 0 else 0.0
	                progress_callback(progress, f"Extracting subtitles: {len(subtitles)+1} found")

	        # The entry still being built has not been stored yet.
	        if current_subtitle:
	            subtitles.append(current_subtitle)

	        print(f"[Subtitle Extractor] Extracted {len(subtitles)} subtitle segments")
	        return subtitles

	    def generate_srt(self, subtitles, output_path):
	        """
	        Generate SRT file from subtitles

	        Each entry is written as a 1-based index line, a
	        ``start --> end`` timestamp line, the text, and a blank line.

	        Args:
	            subtitles: List of subtitle dictionaries with 'start', 'end'
	                (seconds) and 'text' keys
	            output_path: Path to save SRT file

	        Returns:
	            Path to generated SRT file (``output_path`` as passed in)
	        """
	        print(f"[Subtitle Extractor] Generating SRT file: {output_path}")

	        entries = []
	        for i, sub in enumerate(subtitles, 1):
	            start_ts = self.format_timestamp(sub['start'])
	            end_ts = self.format_timestamp(sub['end'])
	            entries.append(f"{i}\n{start_ts} --> {end_ts}\n{sub['text']}\n\n")

	        with open(output_path, 'w', encoding='utf-8') as f:
	            f.writelines(entries)

	        print(f"[Subtitle Extractor] SRT file saved: {output_path}")
	        return output_path

	    def extract_to_srt(self, output_path=None, progress_callback=None):
	        """
	        Complete extraction pipeline: detect -> OCR -> generate SRT

	        Args:
	            output_path: Optional custom output path for SRT file. Defaults
	                to ``<video stem>_subtitles.srt`` next to the video.
	            progress_callback: Optional callback for progress updates

	        Returns:
	            Path to generated SRT file, as a string. When nothing was
	            detected the file contains only a "# No subtitles detected" line.
	        """
	        if output_path is None:
	            video = Path(self.video_path)
	            output_path = video.parent / f"{video.stem}_subtitles.srt"

	        subtitles = self.extract_subtitles(progress_callback)

	        if not subtitles:
	            # Still produce a file so callers always get a path that exists.
	            with open(output_path, 'w', encoding='utf-8') as f:
	                f.write("# No subtitles detected\n")
	            return str(output_path)

	        return self.generate_srt(subtitles, str(output_path))


	# Command-line entry point: extract subtitles from the given video and
	# write them to the default SRT path next to it.
	if __name__ == '__main__':
	    if len(sys.argv) < 2:
	        print("Usage: python subtitle_extractor.py <video_path>")
	        sys.exit(1)

	    video_path = sys.argv[1]
	    extractor = SubtitleExtractor(video_path)
	    try:
	        srt_path = extractor.extract_to_srt()
	    finally:
	        # Release the OpenCV capture handle even if extraction fails.
	        extractor.video_cap.release()
	    print(f"Subtitles extracted to: {srt_path}")