# NOTE(review): stripped non-Python export artifacts (file-size banner, commit hashes, line-number gutter)
"""
Audio Transcription Tool - Whisper speech-to-text
Author: @mangubee
Date: 2026-01-13
Provides audio transcription using OpenAI Whisper:
- Supports MP3, WAV, M4A, and other audio formats
- ZeroGPU acceleration via @spaces.GPU decorator
- Model caching for efficient repeated use
- Unified tool for Phase 1 (YouTube fallback) and Phase 2 (MP3 files)
Requirements:
- openai-whisper: pip install openai-whisper
- ZeroGPU: @spaces.GPU decorator required for HF Spaces
"""
import logging
import os
import tempfile
from typing import Dict, Any
from pathlib import Path
# ============================================================================
# CONFIG
# ============================================================================
WHISPER_MODEL = "small" # Whisper checkpoint size: tiny, base, small, medium, large (bigger = slower, more accurate)
WHISPER_LANGUAGE = "en" # Force English; set to None to let Whisper auto-detect the language
AUDIO_FORMATS = [".mp3", ".wav", ".m4a", ".ogg", ".flac", ".aac"]  # extensions accepted by transcribe_audio (lower-case, with dot)
# ============================================================================
# Logging Setup
# ============================================================================
logger = logging.getLogger(__name__)  # module-level logger; configuration is left to the host app
# ============================================================================
# Global Model Cache
# ============================================================================
_MODEL = None  # lazily-loaded Whisper model, shared across calls; reset via cleanup()
# ============================================================================
# ZeroGPU Import (conditional)
# ============================================================================
try:
    # On HF Spaces, spaces.GPU marks the function for ZeroGPU allocation.
    from spaces import GPU
    ZERO_GPU_AVAILABLE = True
except ImportError:
    ZERO_GPU_AVAILABLE = False

    def GPU(func):
        """No-op stand-in for spaces.GPU when not running on HF Spaces."""
        return func

    logger.info("ZeroGPU not available, running in CPU mode")
# ============================================================================
# Transcription Function
# =============================================================================
@GPU # Required for ZeroGPU - tells HF Spaces to allocate GPU
def transcribe_audio(file_path: str) -> Dict[str, Any]:
    """
    Transcribe audio file using Whisper (ZeroGPU accelerated).
    Args:
        file_path: Path to audio file (MP3, WAV, M4A, etc.)
    Returns:
        Dict with structure: {
            "text": str,          # Transcribed text
            "file_path": str,     # Original file path
            "success": bool,      # True if transcription succeeded
            "error": str or None  # Error message if failed
        }
    Note:
        This function never raises: all failures (missing file, unsupported
        format, transcription errors) are reported via the returned dict.
    Examples:
        >>> transcribe_audio("audio.mp3")
        {"text": "Hello world", "file_path": "audio.mp3", "success": True, "error": None}
    """
    global _MODEL

    def _failure(path_str: str, message: str) -> Dict[str, Any]:
        # Uniform error payload so callers only need to branch on "success".
        logger.error(message)
        return {
            "text": "",
            "file_path": path_str,
            "success": False,
            "error": message
        }

    # Validate file path
    if not file_path:
        return _failure("", "Empty file path provided")
    path = Path(file_path)
    if not path.exists():
        return _failure(str(path), f"File not found: {path}")
    # Check file extension
    if path.suffix.lower() not in AUDIO_FORMATS:
        return _failure(
            str(path),
            f"Unsupported audio format: {path.suffix}. Supported: {AUDIO_FORMATS}"
        )
    logger.info(f"Transcribing audio: {path}")
    try:
        # Lazy imports: both are heavy, so defer until the first real call.
        import whisper
        import torch
        # Load model (cached globally across calls)
        if _MODEL is None:
            # ZERO_GPU_AVAILABLE only means the `spaces` package imported; it
            # does NOT guarantee a CUDA device is attached. Ask torch directly
            # so CPU-only environments don't crash loading onto "cuda".
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Loading Whisper model: {WHISPER_MODEL}")
            _MODEL = whisper.load_model(WHISPER_MODEL, device=device)
            logger.info(f"Whisper model loaded on {device}")
        # Transcribe audio
        result = _MODEL.transcribe(
            str(path),
            language=WHISPER_LANGUAGE,
            fp16=False # Use fp32 for compatibility
        )
        text = result["text"].strip()
        logger.info(f"Transcription successful: {len(text)} characters")
        return {
            "text": text,
            "file_path": str(path),
            "success": True,
            "error": None
        }
    except FileNotFoundError:
        # File disappeared between the exists() check and transcription.
        return _failure(str(path), f"Audio file not found: {path}")
    except Exception as e:
        # Catch-all boundary: transcription must never crash the caller.
        return _failure(str(path), f"Transcription failed: {str(e)}")
# ============================================================================
# Cleanup Function
# =============================================================================
def cleanup():
    """Drop the cached Whisper model so the next call reloads it from scratch.

    Primarily useful in tests that need a fresh model state.
    """
    global _MODEL
    _MODEL = None
    logger.info("Whisper model cache cleared")
|