|
|
""" |
|
|
Speech to Text Service |
|
|
Wrapper untuk Whisper STT |
|
|
""" |
|
|
|
|
|
import os
import warnings
from typing import Dict, Optional

import torch
import whisper

from app.core.device import get_device, optimize_for_device
|
|
warnings.filterwarnings('ignore') |
|
|
|
|
|
|
|
|
class SpeechToTextService:
    """Speech-to-Text service wrapping OpenAI Whisper.

    Loads a Whisper model once at construction time (preferring a model
    pre-downloaded into the cache directory during the image build) and
    exposes :meth:`transcribe` for turning audio files into text.
    """

    def __init__(self, model_name: str = "medium", device: Optional[str] = None, language: str = "id"):
        """Initialize the Whisper model.

        Args:
            model_name: Whisper model size to load (e.g. "base", "medium").
            device: Torch device string ("cuda" / "cpu"). None or "auto"
                selects automatically via ``get_device()``.
            language: ISO 639-1 language code for transcription
                (default "id", Indonesian).
        """
        print("🎙️ Initializing Speech-to-Text service")
        print(f"📦 Loading Whisper model: {model_name}")

        # Resolve the compute device; device-specific tuning is applied only
        # when the device is auto-detected (matching the original behavior).
        if device is None or device == "auto":
            self.device = get_device()
            optimize_for_device(self.device)
        else:
            self.device = device
        print(f"💻 Using device: {self.device}")

        # Models are pre-downloaded here during the build; WHISPER_CACHE
        # overrides the default location.
        cache_dir = os.environ.get('WHISPER_CACHE', '/data/.cache')

        try:
            self.model = self._load_model(model_name, cache_dir)
        except Exception as e:
            # Best-effort fallback: a broken/missing requested model should
            # not abort startup — degrade to the small "base" model instead.
            print(f"❌ Failed to load model '{model_name}': {e}")
            print("⚠️ Falling back to 'base' model...")
            self.model = self._load_model("base", cache_dir)

        self.language = language

    def _load_model(self, model_name: str, cache_dir: str):
        """Load one Whisper model, reporting whether it came from cache.

        Args:
            model_name: Whisper model size to load.
            cache_dir: Directory Whisper uses as its download root.

        Returns:
            The loaded Whisper model object.
        """
        if os.path.exists(os.path.join(cache_dir, f'{model_name}.pt')):
            print("✅ Loading from cache (pre-downloaded during build)")
        else:
            print(f"📥 Model not in cache, downloading '{model_name}'...")
            print("   This may take 1-2 minutes...")
        model = whisper.load_model(model_name, device=self.device, download_root=cache_dir)
        print("✅ Whisper model ready!\n")
        return model

    def transcribe(self, audio_path: str, **kwargs) -> Dict:
        """
        Transcribe an audio file to text.

        Args:
            audio_path: Path to the audio file.
            **kwargs: Additional parameters forwarded to Whisper's
                ``transcribe()``.

        Returns:
            Dict: ``{'text': str, 'segments': list, 'language': str}``
        """
        print(f"🎧 Transcribing: {audio_path}")

        # fp16 inference is only valid on GPU; Whisper warns/falls back on CPU.
        fp16 = self.device == "cuda"
        # Options shared by both attempts. condition_on_previous_text=False
        # reduces repetition loops on long audio.
        options = dict(
            language=self.language,
            task="transcribe",
            condition_on_previous_text=False,
            fp16=fp16,
            **kwargs,
        )

        try:
            # Prefer word-level timestamps; the alignment pass can fail on
            # some audio/model combinations, handled by the retry below.
            result = self.model.transcribe(audio_path, word_timestamps=True, **options)
        except Exception as e:
            print(f"⚠️ Transcription with word_timestamps failed: {e}")
            print("🔄 Retrying without word_timestamps...")
            result = self.model.transcribe(audio_path, **options)

        print("✅ Transcription complete!\n")

        return {
            'text': result['text'],
            'segments': result.get('segments', []),
            'language': result.get('language', self.language)
        }
|
|
|