Spaces:
Running
Running
Ryan Christian D. Deniega
feat: add video frame OCR — extract on-screen text alongside Whisper ASR
7097cb7 | """ | |
| PhilVerify — Whisper ASR Module | |
| Transcribes video/audio files using OpenAI Whisper. | |
| Also provides combined ASR + frame OCR for full video text extraction. | |
| Recommended model: large-v3 (best Filipino speech accuracy). | |
| """ | |
| import asyncio | |
| import logging | |
| import tempfile | |
| import os | |
| logger = logging.getLogger(__name__) | |
async def transcribe_video(media_bytes: bytes, filename: str = "upload") -> str:
    """
    Transcribe audio/video bytes using Whisper.

    The bytes are written to a temporary file because the model is handed a
    file path, not bytes. Model loading and transcription are blocking calls,
    so they run in the event loop's default executor; this keeps the loop
    responsive and lets callers that ``asyncio.gather`` this coroutine with
    other work actually overlap with it.

    Args:
        media_bytes: Raw bytes of the uploaded audio/video file.
        filename: Original upload name. Only its extension is used, as the
            temp-file suffix (".mp4" when the name has no extension).

    Returns:
        The stripped transcript, or "" when the Whisper/config imports fail
        or transcription raises. Failures are logged, never raised.
    """

    def _transcribe_blocking() -> str:
        # Imported lazily so this module still imports without openai-whisper.
        import whisper
        from config import get_settings

        model_size = get_settings().whisper_model_size
        logger.info("Loading Whisper model: %s", model_size)
        model = whisper.load_model(model_size)

        suffix = os.path.splitext(filename)[-1] or ".mp4"
        tmp_path = None
        try:
            # delete=False: the file must outlive the `with` so Whisper can
            # reopen it by path; it is removed in the `finally` below.
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                tmp_path = tmp.name  # recorded before writing so a failed write is still cleaned up
                tmp.write(media_bytes)

            result = model.transcribe(tmp_path, language=None)  # Auto-detect language
            transcript = (result.get("text") or "").strip()
            logger.info(
                "Whisper transcribed %d chars (lang=%s)",
                len(transcript),
                result.get("language"),
            )
            return transcript
        finally:
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError as cleanup_err:
                    # A cleanup failure must not discard a good transcript.
                    logger.warning("Could not remove temp file %s: %s", tmp_path, cleanup_err)

    try:
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _transcribe_blocking)
    except ImportError:
        logger.warning("openai-whisper not installed β ASR unavailable")
        return ""
    except Exception as e:
        # logger.exception keeps the original message and adds the traceback.
        logger.exception("Whisper transcription failed: %s", e)
        return ""
async def transcribe_and_ocr_video(media_bytes: bytes, filename: str = "upload") -> str:
    """
    Full video text extraction: Whisper ASR plus frame OCR, merged.

    Both extractors are awaited together via ``asyncio.gather``. If one of
    them raises, the error is logged and that source is treated as having
    produced no text, so the other source's result is still returned rather
    than being thrown away.

    Cases handled:
    - Audio only (no on-screen text) -> returns speech transcript alone
    - On-screen text only (silent)   -> returns OCR text alone
    - Both                           -> returns labelled combination
    - Neither                        -> returns empty string (caller raises 422)

    A source counts as present only when its stripped text is at least
    10 characters long.

    Args:
        media_bytes: Raw bytes of the uploaded video.
        filename: Original upload name, forwarded to both extractors.

    Returns:
        The speech transcript, the OCR text, a "[SPEECH]"/"[ON-SCREEN TEXT]"
        labelled combination of both, or "" when neither source is usable.
    """
    from inputs.video_ocr import extract_text_from_video_frames

    min_chars = 10  # shorter results are treated as "no text" for that source

    # return_exceptions=True so one extractor failing cannot discard the
    # other extractor's text.
    outcomes = await asyncio.gather(
        transcribe_video(media_bytes, filename=filename),
        extract_text_from_video_frames(media_bytes, filename=filename),
        return_exceptions=True,
    )

    cleaned = []
    for source, outcome in zip(("Whisper ASR", "Frame OCR"), outcomes):
        if isinstance(outcome, asyncio.CancelledError):
            # Cancellation is not an extraction failure; let it propagate.
            raise outcome
        if isinstance(outcome, BaseException):
            logger.error("%s failed during video text extraction: %s", source, outcome)
            outcome = ""
        cleaned.append((outcome or "").strip())
    speech_text, ocr_text = cleaned

    has_speech = len(speech_text) >= min_chars
    has_ocr = len(ocr_text) >= min_chars

    if has_speech and has_ocr:
        logger.info("Video has both speech (%d chars) and on-screen text (%d chars) β combining", len(speech_text), len(ocr_text))
        return f"[SPEECH]\n{speech_text}\n\n[ON-SCREEN TEXT]\n{ocr_text}"
    if has_speech:
        logger.info("Video has speech only (%d chars)", len(speech_text))
        return speech_text
    if has_ocr:
        logger.info("Video has on-screen text only (%d chars)", len(ocr_text))
        return ocr_text

    logger.warning("Video yielded no usable text from either ASR or frame OCR")
    return ""