|
|
""" |
|
|
Speech to Text Service |
|
|
Wrapper untuk Whisper STT |
|
|
""" |
|
|
|
|
|
import os
import warnings
from typing import Dict, Optional

import torch
import whisper

from app.core.device import get_device, optimize_for_device
|
|
warnings.filterwarnings('ignore') |
|
|
|
|
|
|
|
|
class SpeechToTextService:
    """Speech-to-Text service wrapping OpenAI Whisper.

    Loads a Whisper model once at construction time (preferring a model
    pre-downloaded into the cache directory during the image build) and
    exposes :meth:`transcribe` for turning audio files into text.
    """

    def __init__(self, model_name: str = "medium", device: Optional[str] = None, language: str = "id"):
        """Initialize the Whisper model.

        Args:
            model_name: Whisper model size to load (e.g. "base", "medium").
            device: Torch device string ("cuda" / "cpu"). None or "auto"
                selects automatically via ``get_device()``.
            language: ISO 639-1 language code for transcription
                (default "id", Indonesian).
        """
        print("🎙️ Initializing Speech-to-Text service")
        print(f"📦 Loading Whisper model: {model_name}")

        # Resolve the compute device; device-specific tuning is applied only
        # when the device is auto-detected (matching the original behavior).
        if device is None or device == "auto":
            self.device = get_device()
            optimize_for_device(self.device)
        else:
            self.device = device
        print(f"💻 Using device: {self.device}")

        # Models are pre-downloaded here during the build; WHISPER_CACHE
        # overrides the default location.
        cache_dir = os.environ.get('WHISPER_CACHE', '/data/.cache')

        try:
            self.model = self._load_model(model_name, cache_dir)
        except Exception as e:
            # Best-effort fallback: a broken/missing requested model should
            # not abort startup — degrade to the small "base" model instead.
            print(f"❌ Failed to load model '{model_name}': {e}")
            print("⚠️ Falling back to 'base' model...")
            self.model = self._load_model("base", cache_dir)

        self.language = language

    def _load_model(self, model_name: str, cache_dir: str):
        """Load one Whisper model, reporting whether it came from cache.

        Args:
            model_name: Whisper model size to load.
            cache_dir: Directory Whisper uses as its download root.

        Returns:
            The loaded Whisper model object.
        """
        if os.path.exists(os.path.join(cache_dir, f'{model_name}.pt')):
            print("✅ Loading from cache (pre-downloaded during build)")
        else:
            print(f"📥 Model not in cache, downloading '{model_name}'...")
            print("   This may take 1-2 minutes...")
        model = whisper.load_model(model_name, device=self.device, download_root=cache_dir)
        print("✅ Whisper model ready!\n")
        return model

    def transcribe(self, audio_path: str, **kwargs) -> Dict:
        """
        Transcribe an audio file to text.

        Args:
            audio_path: Path to the audio file.
            **kwargs: Additional parameters forwarded to Whisper's
                ``transcribe()``.

        Returns:
            Dict: ``{'text': str, 'segments': list, 'language': str}``
        """
        print(f"🎧 Transcribing: {audio_path}")

        # fp16 inference is only valid on GPU; Whisper warns/falls back on CPU.
        fp16 = self.device == "cuda"
        # Options shared by both attempts. condition_on_previous_text=False
        # reduces repetition loops on long audio.
        options = dict(
            language=self.language,
            task="transcribe",
            condition_on_previous_text=False,
            fp16=fp16,
            **kwargs,
        )

        try:
            # Prefer word-level timestamps; the alignment pass can fail on
            # some audio/model combinations, handled by the retry below.
            result = self.model.transcribe(audio_path, word_timestamps=True, **options)
        except Exception as e:
            print(f"⚠️ Transcription with word_timestamps failed: {e}")
            print("🔄 Retrying without word_timestamps...")
            result = self.model.transcribe(audio_path, **options)

        print("✅ Transcription complete!\n")

        return {
            'text': result['text'],
            'segments': result.get('segments', []),
            'language': result.get('language', self.language)
        }
|
|
|