Spaces:

SemiAutomat1c
/

philverify-api

Running

Ryan Christian D. Deniega

feat: add video frame OCR — extract on-screen text alongside Whisper ASR

7097cb7 13 days ago

4.42 kB

	"""
	PhilVerify — Video Frame OCR Module
	Extracts on-screen text from video files by sampling frames with ffmpeg
	and running Tesseract OCR on each frame.

	Strategy:
	- Extract 1 frame every FRAME_INTERVAL seconds using ffmpeg (already in Docker)
	- Run existing Tesseract OCR on each frame
	- Deduplicate consecutive near-identical frames (static lower-thirds, etc.)
	- Return unique on-screen text joined by newlines
	"""
	import asyncio
	import logging
	import os
	import subprocess
	import tempfile
	from difflib import SequenceMatcher

	from inputs.ocr import extract_text_from_image

	logger = logging.getLogger(__name__)

	# Sample 1 frame every N seconds — good balance for news/social media clips
	FRAME_INTERVAL = 3
	# Similarity threshold — skip frame if >80% similar to previous (avoids repeating static text)
	SIMILARITY_THRESHOLD = 0.80
	# Minimum meaningful OCR text length per frame
	MIN_FRAME_CHARS = 8


	def _similarity(a: str, b: str) -> float:
	"""Return similarity ratio between two strings (0.0 – 1.0)."""
	return SequenceMatcher(None, a.strip(), b.strip()).ratio()


	def _extract_frames_with_ffmpeg(video_path: str, output_dir: str) -> list[str]:
	"""
	Use ffmpeg to extract 1 frame every FRAME_INTERVAL seconds as JPEG files.
	Returns list of frame file paths. Returns [] on failure.
	"""
	pattern = os.path.join(output_dir, "frame_%04d.jpg")
	cmd = [
	"ffmpeg", "-i", video_path,
	"-vf", f"fps=1/{FRAME_INTERVAL}",
	"-q:v", "2", # high quality JPEG
	"-frames:v", "300", # safety cap: max 300 frames (~15 min @ 3s interval)
	pattern,
	"-y", # overwrite
	"-loglevel", "error", # suppress noise
	]
	try:
	result = subprocess.run(cmd, capture_output=True, timeout=120)
	if result.returncode != 0:
	logger.warning("ffmpeg frame extraction failed: %s", result.stderr.decode())
	return []
	frames = sorted(f for f in os.listdir(output_dir) if f.endswith(".jpg"))
	logger.info("ffmpeg extracted %d frames from video", len(frames))
	return [os.path.join(output_dir, f) for f in frames]
	except FileNotFoundError:
	logger.warning("ffmpeg not found — video OCR unavailable")
	return []
	except subprocess.TimeoutExpired:
	logger.warning("ffmpeg frame extraction timed out")
	return []
	except Exception as e:
	logger.error("ffmpeg error: %s", e)
	return []


	async def extract_text_from_video_frames(media_bytes: bytes, filename: str = "upload.mp4") -> str:
	"""
	Extract on-screen text from a video by sampling frames with ffmpeg
	and running Tesseract OCR on each frame.

	Returns deduplicated on-screen text, or empty string if no text found
	or ffmpeg/tesseract unavailable.
	"""
	suffix = os.path.splitext(filename)[-1] or ".mp4"

	with tempfile.TemporaryDirectory() as tmpdir:
	# Write video bytes to temp file
	video_path = os.path.join(tmpdir, f"input{suffix}")
	with open(video_path, "wb") as f:
	f.write(media_bytes)

	frames_dir = os.path.join(tmpdir, "frames")
	os.makedirs(frames_dir)

	# Extract frames (blocking — run in executor to avoid blocking event loop)
	loop = asyncio.get_event_loop()
	frame_paths = await loop.run_in_executor(
	None, _extract_frames_with_ffmpeg, video_path, frames_dir
	)

	if not frame_paths:
	logger.info("No frames extracted — skipping video OCR")
	return ""

	# Run OCR on each frame, deduplicate consecutive similar text
	unique_texts: list[str] = []
	last_text = ""

	for frame_path in frame_paths:
	with open(frame_path, "rb") as f:
	frame_bytes = f.read()

	text = await extract_text_from_image(frame_bytes)
	text = text.strip()

	if len(text) < MIN_FRAME_CHARS:
	continue # mostly blank frame

	if last_text and _similarity(text, last_text) > SIMILARITY_THRESHOLD:
	continue # too similar to previous — static overlay, skip

	unique_texts.append(text)
	last_text = text

	result = "\n".join(unique_texts).strip()
	logger.info("Video OCR: %d unique text segments, %d total chars", len(unique_texts), len(result))
	return result