Spaces:

TheBug95
/

OphtalmoCapture

Sleeping

File size: 3,536 Bytes

"""OphthalmoCapture — Whisper Transcription Service

Encapsulates all Whisper-related logic: model loading, transcription,
and segment-level timestamps.  Temporary files are ALWAYS cleaned up.
"""

import os
import shutil
import tempfile
import streamlit as st
import whisper

# ── Ensure ffmpeg is available ───────────────────────────────────────────────
# Whisper invokes an executable literally named "ffmpeg".  If none is on PATH,
# fall back to the binary bundled with imageio-ffmpeg.  This whole section is
# best-effort: no failure here may prevent the module from importing; a
# missing ffmpeg surfaces later as an error when Whisper tries to run it.
if shutil.which("ffmpeg") is None:
    try:
        import imageio_ffmpeg
        _ffmpeg_real = imageio_ffmpeg.get_ffmpeg_exe()
    except (ImportError, RuntimeError):
        # imageio-ffmpeg is not installed, or it could not locate a binary
        # (get_ffmpeg_exe signals that with RuntimeError).
        pass  # Will fail later with a clear Whisper error
    else:
        _ffmpeg_dir = os.path.dirname(_ffmpeg_real)
        # The bundled binary's file name may differ from plain "ffmpeg", so
        # create an alias next to it under the name Whisper calls.
        _alias_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg"
        _ffmpeg_alias = os.path.join(_ffmpeg_dir, _alias_name)
        if not os.path.exists(_ffmpeg_alias):
            try:
                os.link(_ffmpeg_real, _ffmpeg_alias)   # hard link (no admin)
            except OSError:
                try:
                    shutil.copy2(_ffmpeg_real, _ffmpeg_alias)  # fallback: copy
                except OSError:
                    # Directory not writable (or similar): keep going so the
                    # import succeeds; Whisper will report the missing ffmpeg.
                    pass
        # Prepend so the alias wins over anything else later on PATH.
        os.environ["PATH"] = (
            _ffmpeg_dir + os.pathsep + os.environ.get("PATH", "")
        )


@st.cache_resource
def load_whisper_model(model_size: str):
    """Return the Whisper model for ``model_size``.

    The function is wrapped in ``st.cache_resource``; the loading message is
    printed each time the body actually runs.
    """
    announcement = f"Loading Whisper model: {model_size}..."
    print(announcement)
    loaded_model = whisper.load_model(model_size)
    return loaded_model


def transcribe_audio(model, audio_bytes: bytes, language: str = "es") -> str:
    """Transcribe raw WAV bytes and return plain text.

    The bytes are written to a temporary ``.wav`` file whose path is handed
    to ``model.transcribe``.  The temporary file is **always** deleted, even
    when writing it or transcribing it fails.

    Args:
        model: Object exposing ``transcribe(path, language=...)`` that returns
            a mapping with an optional ``"text"`` entry (e.g. the result of
            ``load_whisper_model``).
        audio_bytes: Audio content to transcribe.
        language: Language code forwarded to ``model.transcribe``.

    Returns:
        The transcribed text with surrounding whitespace removed, or ``""``
        if anything went wrong (the error is reported through ``st.error``).
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            # Record the path *before* writing: if the write fails, the
            # finally-block must still know which file to remove.
            tmp_path = tmp.name
            tmp.write(audio_bytes)

        # The file is closed at this point, so the model can open it by path.
        result = model.transcribe(tmp_path, language=language)
        return result.get("text", "").strip()
    except Exception as e:
        st.error(f"Error de transcripción: {e}")
        return ""
    finally:
        if tmp_path is not None:
            # EAFP: avoids the check-then-delete race of exists() + unlink().
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass


def transcribe_audio_with_timestamps(
    model, audio_bytes: bytes, language: str = "es"
) -> tuple[str, list[dict]]:
    """Transcribe raw WAV bytes and return (plain_text, segments).

    Each segment dict contains:
        {"start": float, "end": float, "text": str}
    with ``start``/``end`` rounded to two decimals and ``text`` stripped.

    Useful for syncing transcript highlights with audio playback.

    The bytes are written to a temporary ``.wav`` file that is always
    deleted, even when writing it or transcribing it fails.

    Args:
        model: Object exposing ``transcribe(path, language=...)`` that returns
            a mapping with optional ``"text"`` and ``"segments"`` entries.
        audio_bytes: Audio content to transcribe.
        language: Language code forwarded to ``model.transcribe``.

    Returns:
        ``(text, segments)``; on any failure ``("", [])`` is returned and the
        error is reported through ``st.error``.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            # Record the path *before* writing: if the write fails, the
            # finally-block must still know which file to remove.
            tmp_path = tmp.name
            tmp.write(audio_bytes)

        # The file is closed at this point, so the model can open it by path.
        result = model.transcribe(tmp_path, language=language)
        text = result.get("text", "").strip()

        # Keep only the fields the UI needs, normalised for display.
        segments = [
            {
                "start": round(seg["start"], 2),
                "end": round(seg["end"], 2),
                "text": seg["text"].strip(),
            }
            for seg in result.get("segments", [])
        ]

        return text, segments
    except Exception as e:
        st.error(f"Error de transcripción: {e}")
        return "", []
    finally:
        if tmp_path is not None:
            # EAFP: avoids the check-then-delete race of exists() + unlink().
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass


def format_timestamp(seconds: float) -> str:
    """Render a duration in seconds as a zero-padded ``MM:SS`` string.

    The fractional part is truncated by ``int()``.  Minutes are not wrapped
    into hours, so one hour renders as ``60:00``.
    """
    whole_seconds = int(seconds)
    minutes = whole_seconds // 60
    remainder = whole_seconds % 60
    return f"{minutes:02d}:{remainder:02d}"