File size: 5,521 Bytes
38cc8e4
 
e7b4937
38cc8e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7bb1d7c
38cc8e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
"""
Audio Transcription Tool - Whisper speech-to-text
Author: @mangubee
Date: 2026-01-13

Provides audio transcription using OpenAI Whisper:
- Supports MP3, WAV, M4A, and other audio formats
- ZeroGPU acceleration via @spaces.GPU decorator
- Model caching for efficient repeated use
- Unified tool for Phase 1 (YouTube fallback) and Phase 2 (MP3 files)

Requirements:
- openai-whisper: pip install openai-whisper
- ZeroGPU: @spaces.GPU decorator required for HF Spaces
"""

import logging
import os
import tempfile
from typing import Dict, Any
from pathlib import Path

# ============================================================================
# CONFIG
# ============================================================================
WHISPER_MODEL = "small"  # tiny, base, small, medium, large
WHISPER_LANGUAGE = "en"   # English (auto-detect if None)
AUDIO_FORMATS = [".mp3", ".wav", ".m4a", ".ogg", ".flac", ".aac"]

# ============================================================================
# Logging Setup
# ============================================================================
logger = logging.getLogger(__name__)

# ============================================================================
# Global Model Cache
# ============================================================================
# Whisper model instance, loaded lazily on first use and reused across calls.
_MODEL = None


# ============================================================================
# ZeroGPU Import (conditional)
# ============================================================================
try:
    from spaces import GPU
    ZERO_GPU_AVAILABLE = True
except ImportError:
    # Not on HF Spaces: provide a no-op stand-in that mirrors the real
    # spaces.GPU decorator, which supports both the bare form (@GPU) and
    # the parameterized form (@GPU(duration=...)).
    def GPU(func=None, **kwargs):
        if func is None:
            # Called as @GPU(...): return a decorator that leaves the
            # function unchanged, ignoring ZeroGPU-only keyword args.
            return lambda f: f
        # Called as @GPU: return the function unchanged.
        return func
    ZERO_GPU_AVAILABLE = False
    logger.info("ZeroGPU not available, running in CPU mode")


# ============================================================================
# Transcription Function
# =============================================================================

@GPU  # Required for ZeroGPU - tells HF Spaces to allocate GPU
def transcribe_audio(file_path: str) -> Dict[str, Any]:
    """
    Transcribe audio file using Whisper (ZeroGPU accelerated).

    Args:
        file_path: Path to audio file (MP3, WAV, M4A, etc.)

    Returns:
        Dict with structure: {
            "text": str,           # Transcribed text ("" on failure)
            "file_path": str,      # Original file path
            "success": bool,       # True if transcription succeeded
            "error": str or None   # Error message if failed
        }

    Note:
        This function never raises: every failure mode (empty path,
        missing file, unsupported format, transcription error) is
        reported through the returned dict's "success"/"error" fields.

    Examples:
        >>> transcribe_audio("audio.mp3")
        {"text": "Hello world", "file_path": "audio.mp3", "success": True, "error": None}
    """
    global _MODEL

    def _failure(path_str: str, message: str) -> Dict[str, Any]:
        # Uniform error payload; logs once so callers don't have to.
        logger.error(message)
        return {
            "text": "",
            "file_path": path_str,
            "success": False,
            "error": message,
        }

    # Validate file path
    if not file_path:
        return _failure("", "Empty file path provided")

    # Keep the str parameter intact; work with a Path locally.
    path = Path(file_path)

    if not path.exists():
        return _failure(str(path), f"File not found: {path}")

    # Check file extension
    if path.suffix.lower() not in AUDIO_FORMATS:
        return _failure(
            str(path),
            f"Unsupported audio format: {path.suffix}. Supported: {AUDIO_FORMATS}",
        )

    logger.info(f"Transcribing audio: {path}")

    try:
        # Lazy imports: keep module import cheap. Whisper depends on
        # torch, so torch is guaranteed to be importable alongside it.
        import torch
        import whisper

        # Load model (cached globally across calls)
        if _MODEL is None:
            # Probe the actual runtime instead of trusting the `spaces`
            # import: ZeroGPU only attaches a GPU inside @GPU-decorated
            # calls, and a local install of `spaces` has no GPU at all.
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Loading Whisper model: {WHISPER_MODEL}")
            _MODEL = whisper.load_model(WHISPER_MODEL, device=device)
            logger.info(f"Whisper model loaded on {device}")

        # Transcribe audio
        result = _MODEL.transcribe(
            str(path),
            language=WHISPER_LANGUAGE,
            fp16=False  # Use fp32 for compatibility
        )

        text = result["text"].strip()
        logger.info(f"Transcription successful: {len(text)} characters")

        return {
            "text": text,
            "file_path": str(path),
            "success": True,
            "error": None
        }

    except FileNotFoundError:
        # Raised by ffmpeg/whisper if the file vanished between the
        # existence check above and the actual read.
        return _failure(str(path), f"Audio file not found: {path}")
    except Exception as e:
        # Keep the traceback in the log; return a clean message to callers.
        logger.exception("Transcription failed")
        return _failure(str(path), f"Transcription failed: {str(e)}")


# ============================================================================
# Cleanup Function
# =============================================================================

def cleanup():
    """Drop the cached Whisper model so the next call reloads it fresh.

    Intended for tests and for releasing model memory between runs.
    """
    global _MODEL
    # Discard the cached instance; transcribe_audio() lazily reloads it.
    _MODEL = None
    logger.info("Whisper model cache cleared")