# agentbee / src/tools/audio.py
# Provenance: Hugging Face Space file view, commit e7b4937
# ("fix: correct author name formatting in multiple files" by @mangubee)
"""
Audio Transcription Tool - Whisper speech-to-text
Author: @mangubee
Date: 2026-01-13
Provides audio transcription using OpenAI Whisper:
- Supports MP3, WAV, M4A, and other audio formats
- ZeroGPU acceleration via @spaces.GPU decorator
- Model caching for efficient repeated use
- Unified tool for Phase 1 (YouTube fallback) and Phase 2 (MP3 files)
Requirements:
- openai-whisper: pip install openai-whisper
- ZeroGPU: @spaces.GPU decorator required for HF Spaces
"""
import logging
import os
import tempfile
from typing import Dict, Any
from pathlib import Path
# ============================================================================
# CONFIG
# ============================================================================
# Whisper checkpoint size: larger checkpoints are more accurate but slower
# to download, load, and run.
WHISPER_MODEL = "small" # tiny, base, small, medium, large
WHISPER_LANGUAGE = "en" # English (auto-detect if None)
# Accepted file extensions; compared lowercased against Path.suffix, so
# ".MP3" etc. are accepted too.
AUDIO_FORMATS = [".mp3", ".wav", ".m4a", ".ogg", ".flac", ".aac"]
# ============================================================================
# Logging Setup
# ============================================================================
# Module-level logger; handler/level configuration is left to the application.
logger = logging.getLogger(__name__)
# ============================================================================
# Global Model Cache
# ============================================================================
# Lazily-populated Whisper model shared across transcribe_audio() calls;
# reset via cleanup().
_MODEL = None
# ============================================================================
# ZeroGPU Import (conditional)
# ============================================================================
try:
    from spaces import GPU
except ImportError:
    # Not running on HF Spaces: substitute an identity decorator so that
    # @GPU-decorated functions still work, just without GPU allocation.
    def GPU(func):
        return func
    ZERO_GPU_AVAILABLE = False
    logger.info("ZeroGPU not available, running in CPU mode")
else:
    # Import succeeded: ZeroGPU decorator is available for this process.
    ZERO_GPU_AVAILABLE = True
# ============================================================================
# Transcription Function
# =============================================================================
@GPU  # Required for ZeroGPU - tells HF Spaces to allocate GPU
def transcribe_audio(file_path: str) -> Dict[str, Any]:
    """
    Transcribe audio file using Whisper (ZeroGPU accelerated).

    Expected failures (empty/bad path, unsupported format, transcription
    errors) never raise: they are reported through the returned dict so
    callers always receive a uniform result shape.

    Args:
        file_path: Path to audio file (MP3, WAV, M4A, etc.)

    Returns:
        Dict with structure: {
            "text": str,           # Transcribed text ("" on failure)
            "file_path": str,      # Original file path
            "success": bool,       # True if transcription succeeded
            "error": str or None   # Error message if failed
        }

    Examples:
        >>> transcribe_audio("audio.mp3")
        {"text": "Hello world", "file_path": "audio.mp3", "success": True, "error": None}
    """
    global _MODEL
    # Validate file path
    if not file_path:
        logger.error("Empty file path provided")
        return {
            "text": "",
            "file_path": "",
            "success": False,
            "error": "Empty file path provided"
        }
    file_path = Path(file_path)
    if not file_path.exists():
        logger.error("File not found: %s", file_path)
        return {
            "text": "",
            "file_path": str(file_path),
            "success": False,
            "error": f"File not found: {file_path}"
        }
    # Check file extension (lowercased, so ".MP3" is accepted too)
    if file_path.suffix.lower() not in AUDIO_FORMATS:
        logger.error("Unsupported audio format: %s", file_path.suffix)
        return {
            "text": "",
            "file_path": str(file_path),
            "success": False,
            "error": f"Unsupported audio format: {file_path.suffix}. Supported: {AUDIO_FORMATS}"
        }
    logger.info("Transcribing audio: %s", file_path)
    try:
        # Lazy imports: whisper (and its torch dependency) are heavy, so only
        # pay the import cost when transcription is actually requested.
        import torch
        import whisper
        # Load model once; cached globally for repeated calls
        if _MODEL is None:
            logger.info("Loading Whisper model: %s", WHISPER_MODEL)
            # Probe the runtime for CUDA instead of trusting ZERO_GPU_AVAILABLE:
            # the `spaces` package can import successfully on a CPU-only
            # machine, where loading on "cuda" would crash.
            device = "cuda" if torch.cuda.is_available() else "cpu"
            _MODEL = whisper.load_model(WHISPER_MODEL, device=device)
            logger.info("Whisper model loaded on %s", device)
        # Transcribe audio
        result = _MODEL.transcribe(
            str(file_path),
            language=WHISPER_LANGUAGE,
            fp16=False  # Use fp32 for compatibility
        )
        text = result["text"].strip()
        logger.info("Transcription successful: %d characters", len(text))
        return {
            "text": text,
            "file_path": str(file_path),
            "success": True,
            "error": None
        }
    except FileNotFoundError:
        # Raised by whisper/ffmpeg if the file vanished between the existence
        # check above and the actual read (or ffmpeg itself is missing).
        logger.error("Audio file not found: %s", file_path)
        return {
            "text": "",
            "file_path": str(file_path),
            "success": False,
            "error": f"Audio file not found: {file_path}"
        }
    except Exception as e:
        # Catch-all boundary: surface the failure in the result dict (with
        # traceback in the log) rather than crashing the calling agent.
        logger.exception("Transcription failed: %s", e)
        return {
            "text": "",
            "file_path": str(file_path),
            "success": False,
            "error": f"Transcription failed: {str(e)}"
        }
# ============================================================================
# Cleanup Function
# =============================================================================
def cleanup():
    """Drop the cached Whisper model so the next transcription reloads it.

    Primarily intended for tests that need a fresh model state.
    """
    global _MODEL
    _MODEL = None
    logger.info("Whisper model cache cleared")