"""
Audio Transcription Tool - Whisper speech-to-text
Author: @mangubee
Date: 2026-01-13

Provides audio transcription using OpenAI Whisper:
- Supports MP3, WAV, M4A, and other audio formats
- ZeroGPU acceleration via @spaces.GPU decorator
- Model caching for efficient repeated use
- Unified tool for Phase 1 (YouTube fallback) and Phase 2 (MP3 files)

Requirements:
- openai-whisper: pip install openai-whisper
- ZeroGPU: @spaces.GPU decorator required for HF Spaces
"""

import logging
import os
import tempfile
from typing import Dict, Any
from pathlib import Path

# ============================================================================
# CONFIG
# ============================================================================

WHISPER_MODEL = "small"  # tiny, base, small, medium, large
WHISPER_LANGUAGE = "en"  # English (auto-detect if None)
AUDIO_FORMATS = [".mp3", ".wav", ".m4a", ".ogg", ".flac", ".aac"]

# ============================================================================
# Logging Setup
# ============================================================================

logger = logging.getLogger(__name__)

# ============================================================================
# Global Model Cache
# ============================================================================

# Loaded lazily by transcribe_audio() and reused by later calls; reset by cleanup().
_MODEL = None

# ============================================================================
# ZeroGPU Import (conditional)
# ============================================================================

try:
    from spaces import GPU
    ZERO_GPU_AVAILABLE = True
except ImportError:
    # Not on HF Spaces, use dummy decorator
    def GPU(func=None, **_options):
        """No-op stand-in for ``spaces.GPU``.

        Works both as a bare decorator (``@GPU``) and in the parameterised
        form (``@GPU(duration=60)``); any options are ignored and the
        decorated function is returned unchanged.
        """
        if func is None:
            return lambda wrapped: wrapped
        return func

    ZERO_GPU_AVAILABLE = False
    logger.info("ZeroGPU not available, running in CPU mode")


# ============================================================================
# Transcription Function
# ============================================================================

def _failure(path_text: str, message: str) -> Dict[str, Any]:
    """Build the result dict returned for any failed transcription."""
    return {
        "text": "",
        "file_path": path_text,
        "success": False,
        "error": message,
    }


@GPU  # Required for ZeroGPU - tells HF Spaces to allocate GPU
def transcribe_audio(file_path: str) -> Dict[str, Any]:
    """
    Transcribe an audio file using Whisper (ZeroGPU accelerated).

    Failures are reported through the returned dict rather than raised:
    an empty or invalid path, a missing file, an unsupported extension,
    a path that is not a regular file, and any error raised while loading
    the model or transcribing all yield ``success=False`` with ``error``
    describing the problem.

    Args:
        file_path: Path to audio file (MP3, WAV, M4A, etc.)

    Returns:
        Dict with structure: {
            "text": str,           # Transcribed text ("" on failure)
            "file_path": str,      # Original file path
            "success": bool,       # True if transcription succeeded
            "error": str or None   # Error message if failed
        }

    Examples:
        >>> transcribe_audio("audio.mp3")  # doctest: +SKIP
        {"text": "Hello world", "file_path": "audio.mp3", "success": True, "error": None}
    """
    global _MODEL

    # Validate file path
    if not file_path:
        logger.error("Empty file path provided")
        return _failure("", "Empty file path provided")

    # Keep the caller's argument untouched; work with a separate Path object.
    try:
        audio_path = Path(file_path)
    except TypeError:
        # Non path-like input (e.g. an int): report it instead of raising.
        logger.error("Invalid file path: %r", file_path)
        return _failure(str(file_path), f"Invalid file path: {file_path!r}")

    path_text = str(audio_path)

    if not audio_path.exists():
        logger.error("File not found: %s", audio_path)
        return _failure(path_text, f"File not found: {audio_path}")

    # Check file extension
    suffix = audio_path.suffix
    if suffix.lower() not in AUDIO_FORMATS:
        logger.error("Unsupported audio format: %s", suffix)
        return _failure(
            path_text,
            f"Unsupported audio format: {suffix}. Supported: {AUDIO_FORMATS}",
        )

    # A directory named like "clip.mp3" passes both checks above; reject it
    # here instead of letting the decoder fail with an opaque error.
    if not audio_path.is_file():
        logger.error("Path is not a file: %s", audio_path)
        return _failure(path_text, f"Path is not a file: {audio_path}")

    logger.info("Transcribing audio: %s", audio_path)

    try:
        # Lazy import Whisper (only when function is called)
        import whisper

        # Load model (cached globally)
        if _MODEL is None:
            logger.info("Loading Whisper model: %s", WHISPER_MODEL)
            # NOTE(review): assumes CUDA is usable whenever the `spaces`
            # package imports - confirm for local installs without a GPU.
            device = "cuda" if ZERO_GPU_AVAILABLE else "cpu"
            _MODEL = whisper.load_model(WHISPER_MODEL, device=device)
            logger.info("Whisper model loaded on %s", device)

        # Transcribe audio
        result = _MODEL.transcribe(
            path_text,
            language=WHISPER_LANGUAGE,
            fp16=False,  # Use fp32 for compatibility
        )
        text = result["text"].strip()
    except FileNotFoundError:
        logger.error("Audio file not found: %s", audio_path)
        return _failure(path_text, f"Audio file not found: {audio_path}")
    except Exception as exc:
        # Boundary handler: keep the traceback in the log, return the summary.
        logger.exception("Transcription failed: %s", exc)
        return _failure(path_text, f"Transcription failed: {str(exc)}")

    logger.info("Transcription successful: %d characters", len(text))
    return {
        "text": text,
        "file_path": path_text,
        "success": True,
        "error": None,
    }


# ============================================================================
# Cleanup Function
# ============================================================================

def cleanup():
    """Reset global model cache (useful for testing)."""
    global _MODEL
    _MODEL = None
    logger.info("Whisper model cache cleared")