|
|
""" |
|
|
Audio Transcription Tool - Whisper speech-to-text |
|
|
Author: @mangubee |
|
|
Date: 2026-01-13 |
|
|
|
|
|
Provides audio transcription using OpenAI Whisper: |
|
|
- Supports MP3, WAV, M4A, and other audio formats |
|
|
- ZeroGPU acceleration via @spaces.GPU decorator |
|
|
- Model caching for efficient repeated use |
|
|
- Unified tool for Phase 1 (YouTube fallback) and Phase 2 (MP3 files) |
|
|
|
|
|
Requirements: |
|
|
- openai-whisper: pip install openai-whisper |
|
|
- ZeroGPU: @spaces.GPU decorator required for HF Spaces |
|
|
""" |
|
|
|
|
|
import logging |
|
|
import os |
|
|
import tempfile |
|
|
from typing import Dict, Any |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Whisper model size to load ("tiny"/"base"/"small"/...); "small" trades some
# accuracy for lower memory — presumably chosen for Spaces hardware (TODO confirm).
WHISPER_MODEL = "small"

# Language hint passed to Whisper's transcribe() (ISO 639-1 code).
WHISPER_LANGUAGE = "en"

# File extensions accepted by transcribe_audio(); compared case-insensitively
# against Path.suffix.
AUDIO_FORMATS = [".mp3", ".wav", ".m4a", ".ogg", ".flac", ".aac"]

# Module-level logger; handlers/levels are configured by the host application.
logger = logging.getLogger(__name__)

# Lazily-loaded Whisper model, cached across transcribe_audio() calls.
# Reset to None via cleanup().
_MODEL = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ZeroGPU support: on Hugging Face Spaces, `spaces.GPU` decorates a function so
# it is scheduled on a ZeroGPU worker.  Outside Spaces the import fails, so we
# install a no-op decorator and record that we are running in CPU mode.
try:
    from spaces import GPU

    ZERO_GPU_AVAILABLE = True
except ImportError:

    def GPU(func):
        # No-op stand-in for spaces.GPU: returns the function unchanged.
        return func

    ZERO_GPU_AVAILABLE = False
    logger.info("ZeroGPU not available, running in CPU mode")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _error_result(file_path: str, message: str) -> Dict[str, Any]:
    """Log *message* and build a failed-transcription result dict.

    Centralizes the error-dict shape so every failure path in
    transcribe_audio() returns an identical structure.
    """
    logger.error(message)
    return {
        "text": "",
        "file_path": file_path,
        "success": False,
        "error": message,
    }


@GPU
def transcribe_audio(file_path: str) -> Dict[str, Any]:
    """
    Transcribe an audio file using Whisper (ZeroGPU accelerated when available).

    Args:
        file_path: Path to audio file (MP3, WAV, M4A, etc.; see AUDIO_FORMATS)

    Returns:
        Dict with structure: {
            "text": str,          # Transcribed text ("" on failure)
            "file_path": str,     # Original file path
            "success": bool,      # True if transcription succeeded
            "error": str or None  # Error message if failed
        }

        This function never raises: missing files, unsupported formats, and
        transcription errors are all reported via success=False / error.

    Examples:
        >>> transcribe_audio("audio.mp3")
        {"text": "Hello world", "file_path": "audio.mp3", "success": True, "error": None}
    """
    global _MODEL

    # --- Input validation: each failure returns an error dict, never raises ---
    if not file_path:
        return _error_result("", "Empty file path provided")

    path = Path(file_path)

    if not path.exists():
        return _error_result(str(path), f"File not found: {path}")

    if path.suffix.lower() not in AUDIO_FORMATS:
        return _error_result(
            str(path),
            f"Unsupported audio format: {path.suffix}. Supported: {AUDIO_FORMATS}",
        )

    logger.info(f"Transcribing audio: {path}")

    try:
        # Imported lazily so the module loads even when openai-whisper is
        # not installed; the ImportError surfaces as a failed result below.
        import whisper

        # Load the model once and cache it in the module global for reuse.
        if _MODEL is None:
            logger.info(f"Loading Whisper model: {WHISPER_MODEL}")
            # NOTE(review): device selection keys off ZeroGPU availability,
            # not torch.cuda.is_available() — assumes Spaces always has CUDA.
            device = "cuda" if ZERO_GPU_AVAILABLE else "cpu"
            _MODEL = whisper.load_model(WHISPER_MODEL, device=device)
            logger.info(f"Whisper model loaded on {device}")

        result = _MODEL.transcribe(
            str(path),
            language=WHISPER_LANGUAGE,
            fp16=False,  # full precision; avoids fp16 warnings on CPU
        )

        text = result["text"].strip()
        logger.info(f"Transcription successful: {len(text)} characters")

        return {
            "text": text,
            "file_path": str(path),
            "success": True,
            "error": None,
        }

    except FileNotFoundError:
        # Raised by whisper/ffmpeg even after our exists() check (e.g. the
        # file vanished, or ffmpeg itself is missing).
        return _error_result(str(path), f"Audio file not found: {path}")
    except Exception as e:
        # Catch-all boundary: any model/decoding failure becomes an error dict.
        return _error_result(str(path), f"Transcription failed: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def cleanup():
    """Drop the cached Whisper model so the next call reloads it from scratch.

    Primarily intended for test isolation; safe to call at any time.
    """
    global _MODEL
    _MODEL = None
    logger.info("Whisper model cache cleared")
|
|
|