File size: 3,286 Bytes
6c9b8f1
 
 
7097cb7
6c9b8f1
 
7097cb7
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7097cb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
PhilVerify β€” Whisper ASR Module
Transcribes video/audio files using OpenAI Whisper.
Also provides combined ASR + frame OCR for full video text extraction.
Recommended model: large-v3 (best Filipino speech accuracy).
"""
import asyncio
import logging
import tempfile
import os

logger = logging.getLogger(__name__)


async def transcribe_video(media_bytes: bytes, filename: str = "upload") -> str:
    """
    Transcribe audio/video bytes using Whisper.
    Saves bytes to a temp file (Whisper requires file path, not bytes).
    Returns the transcript string.
    """
    try:
        import whisper
        from config import get_settings
        settings = get_settings()

        model_size = settings.whisper_model_size
        logger.info("Loading Whisper model: %s", model_size)

        model = whisper.load_model(model_size)

        # Whisper needs a file path β€” write bytes to temp file
        suffix = os.path.splitext(filename)[-1] or ".mp4"
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(media_bytes)
            tmp_path = tmp.name

        try:
            result = model.transcribe(tmp_path, language=None)  # Auto-detect language
            transcript = result.get("text", "").strip()
            logger.info("Whisper transcribed %d chars (lang=%s)", len(transcript), result.get("language"))
            return transcript
        finally:
            os.unlink(tmp_path)  # Clean up temp file

    except ImportError:
        logger.warning("openai-whisper not installed β€” ASR unavailable")
        return ""
    except Exception as e:
        logger.error("Whisper transcription failed: %s", e)
        return ""


async def transcribe_and_ocr_video(media_bytes: bytes, filename: str = "upload") -> str:
    """
    Full video text extraction: runs Whisper ASR and frame OCR in parallel,
    then merges results based on what was found.

    Cases handled:
      - Audio only (no on-screen text)  β†’ returns speech transcript alone
      - On-screen text only (silent)    β†’ returns OCR text alone
      - Both                            β†’ returns labelled combination
      - Neither                         β†’ returns empty string (caller raises 422)
    """
    from inputs.video_ocr import extract_text_from_video_frames

    # Run Whisper ASR and frame OCR concurrently
    speech_text, ocr_text = await asyncio.gather(
        transcribe_video(media_bytes, filename=filename),
        extract_text_from_video_frames(media_bytes, filename=filename),
    )

    speech_text = (speech_text or "").strip()
    ocr_text = (ocr_text or "").strip()

    has_speech = len(speech_text) >= 10
    has_ocr = len(ocr_text) >= 10

    if has_speech and has_ocr:
        logger.info("Video has both speech (%d chars) and on-screen text (%d chars) β€” combining", len(speech_text), len(ocr_text))
        return f"[SPEECH]\n{speech_text}\n\n[ON-SCREEN TEXT]\n{ocr_text}"

    if has_speech:
        logger.info("Video has speech only (%d chars)", len(speech_text))
        return speech_text

    if has_ocr:
        logger.info("Video has on-screen text only (%d chars)", len(ocr_text))
        return ocr_text

    logger.warning("Video yielded no usable text from either ASR or frame OCR")
    return ""