# NOTE(review): stripped non-Python export artifacts (file-size banner, commit hashes, line-number gutter)
"""
Audio Transcription Tool - Whisper speech-to-text
Author: @mangubee
Date: 2026-01-13
Provides audio transcription using OpenAI Whisper:
- Supports MP3, WAV, M4A, and other audio formats
- ZeroGPU acceleration via @spaces.GPU decorator
- Model caching for efficient repeated use
- Unified tool for Phase 1 (YouTube fallback) and Phase 2 (MP3 files)
Requirements:
- openai-whisper: pip install openai-whisper
- ZeroGPU: @spaces.GPU decorator required for HF Spaces
"""
import logging
import os
import tempfile
from typing import Dict, Any
from pathlib import Path
# ============================================================================
# CONFIG
# ============================================================================
WHISPER_MODEL = "small" # Whisper checkpoint size: tiny, base, small, medium, large (bigger = slower, more accurate)
WHISPER_LANGUAGE = "en" # Force English; set to None to let Whisper auto-detect the language
AUDIO_FORMATS = [".mp3", ".wav", ".m4a", ".ogg", ".flac", ".aac"]  # extensions accepted by transcribe_audio (lower-case, with dot)
# ============================================================================
# Logging Setup
# ============================================================================
logger = logging.getLogger(__name__)  # module-level logger; configuration is left to the host app
# ============================================================================
# Global Model Cache
# ============================================================================
_MODEL = None  # lazily-loaded Whisper model, shared across calls; reset via cleanup()
# ============================================================================
# ZeroGPU Import (conditional)
# ============================================================================
try:
    # On HF Spaces, spaces.GPU marks the function for ZeroGPU allocation.
    from spaces import GPU
    ZERO_GPU_AVAILABLE = True
except ImportError:
    ZERO_GPU_AVAILABLE = False

    def GPU(func):
        """No-op stand-in for spaces.GPU when not running on HF Spaces."""
        return func

    logger.info("ZeroGPU not available, running in CPU mode")
# ============================================================================
# Transcription Function
# =============================================================================
@GPU # Required for ZeroGPU - tells HF Spaces to allocate GPU
def transcribe_audio(file_path: str) -> Dict[str, Any]:
    """
    Transcribe audio file using Whisper (ZeroGPU accelerated).
    Args:
        file_path: Path to audio file (MP3, WAV, M4A, etc.)
    Returns:
        Dict with structure: {
            "text": str,          # Transcribed text
            "file_path": str,     # Original file path
            "success": bool,      # True if transcription succeeded
            "error": str or None  # Error message if failed
        }
    Note:
        This function never raises: all failures (missing file, unsupported
        format, transcription errors) are reported via the returned dict.
    Examples:
        >>> transcribe_audio("audio.mp3")
        {"text": "Hello world", "file_path": "audio.mp3", "success": True, "error": None}
    """
    global _MODEL

    def _failure(path_str: str, message: str) -> Dict[str, Any]:
        # Uniform error payload so callers only need to branch on "success".
        logger.error(message)
        return {
            "text": "",
            "file_path": path_str,
            "success": False,
            "error": message
        }

    # Validate file path
    if not file_path:
        return _failure("", "Empty file path provided")
    path = Path(file_path)
    if not path.exists():
        return _failure(str(path), f"File not found: {path}")
    # Check file extension
    if path.suffix.lower() not in AUDIO_FORMATS:
        return _failure(
            str(path),
            f"Unsupported audio format: {path.suffix}. Supported: {AUDIO_FORMATS}"
        )
    logger.info(f"Transcribing audio: {path}")
    try:
        # Lazy imports: both are heavy, so defer until the first real call.
        import whisper
        import torch
        # Load model (cached globally across calls)
        if _MODEL is None:
            # ZERO_GPU_AVAILABLE only means the `spaces` package imported; it
            # does NOT guarantee a CUDA device is attached. Ask torch directly
            # so CPU-only environments don't crash loading onto "cuda".
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Loading Whisper model: {WHISPER_MODEL}")
            _MODEL = whisper.load_model(WHISPER_MODEL, device=device)
            logger.info(f"Whisper model loaded on {device}")
        # Transcribe audio
        result = _MODEL.transcribe(
            str(path),
            language=WHISPER_LANGUAGE,
            fp16=False # Use fp32 for compatibility
        )
        text = result["text"].strip()
        logger.info(f"Transcription successful: {len(text)} characters")
        return {
            "text": text,
            "file_path": str(path),
            "success": True,
            "error": None
        }
    except FileNotFoundError:
        # File disappeared between the exists() check and transcription.
        return _failure(str(path), f"Audio file not found: {path}")
    except Exception as e:
        # Catch-all boundary: transcription must never crash the caller.
        return _failure(str(path), f"Transcription failed: {str(e)}")
# ============================================================================
# Cleanup Function
# =============================================================================
def cleanup():
    """Drop the cached Whisper model so the next call reloads it from scratch.

    Primarily useful in tests that need a fresh model state.
    """
    global _MODEL
    _MODEL = None
    logger.info("Whisper model cache cleared")
|