stt / handler.py
SoSolaris's picture
Update handler.py
42ab4b8 verified
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import io
import base64
from typing import Dict, Any
class EndpointHandler:
    """Inference handler that transcribes French speech with a Whisper model.

    The constructor loads the model and processor once; calling the instance
    with a request payload returns ``{"transcription": <text>}`` on success
    or ``{"error": <message>}`` on failure (request errors are never raised).
    """

    # Audio is resampled to this rate when loaded and when fed to the processor.
    SAMPLE_RATE = 16000
    # Only the first MAX_AUDIO_SECONDS of each file are loaded; the rest is ignored.
    MAX_AUDIO_SECONDS = 30
    # Upper bound on the size of the decoded audio payload.
    MAX_AUDIO_BYTES = 25 * 1024 * 1024

    def __init__(self, path=""):
        """Load the Whisper model and processor found at ``path``.

        Args:
            path: Location handed to ``from_pretrained`` for both the model
                and the processor.

        Raises:
            Exception: Any loading failure is printed and then re-raised
                unchanged.
        """
        print("Loading Whisper model...")
        try:
            try:
                self.model = WhisperForConditionalGeneration.from_pretrained(
                    path,
                    torch_dtype=torch.bfloat16,
                    device_map={"": 0},
                    attn_implementation="flash_attention_2",
                )
                print("✅ Flash Attention 2 activated!")
            except (ImportError, ValueError):
                # transformers signals an unusable flash-attention setup with
                # ImportError (package missing) or ValueError (unsupported
                # configuration); both should trigger the fallback load.
                # No attn_implementation is passed here, so the library picks
                # its own default attention backend.
                print("⚠️ Flash Attention not available, fallback to eager")
                self.model = WhisperForConditionalGeneration.from_pretrained(
                    path,
                    torch_dtype=torch.float16,
                    device_map="auto",
                )

            self.processor = WhisperProcessor.from_pretrained(path)
            self.model.eval()

            # NOTE(review): torch.compile wraps the module lazily, so problems
            # may only surface on the first forward pass rather than here, and
            # generate() may not go through the compiled wrapper - confirm the
            # fallback below is reachable and that compilation has an effect.
            if hasattr(torch, "compile"):
                try:
                    self.model = torch.compile(self.model, mode="max-autotune")
                    print("Model compiled with max-autotune!")
                except Exception as e:
                    print(f"Max-autotune compilation failed: {e}")
                    try:
                        self.model = torch.compile(self.model, mode="reduce-overhead")
                        print("Model compiled with reduce-overhead!")
                    except Exception as e2:
                        print(f"Compilation failed: {e2}")

            # Precompute the decoder prompt for French transcription once so
            # every request can reuse it. The tensor lives on the model's own
            # device rather than on whatever "cuda" currently resolves to.
            # NOTE(review): the prompt holds only the ids returned by
            # get_decoder_prompt_ids; verify against the installed transformers
            # version whether a start-of-transcript token must be prepended.
            forced_ids = self.processor.get_decoder_prompt_ids(
                language="french", task="transcribe"
            )
            self.french_decoder_input_ids = torch.tensor(
                [[tok_id for _, tok_id in forced_ids]],
                device=self.model.device,
            )
            print("Model loaded and optimized successfully!")
        except Exception as e:
            print(f"Error loading model: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
        """Transcribe one audio payload.

        Args:
            data: Request mapping. ``"inputs"`` holds the audio either as a
                base64 string or as raw bytes. ``"parameters"`` is an optional
                mapping that may override ``max_length`` (default 256),
                ``num_beams`` (default 6) and ``temperature`` (default 0.0).

        Returns:
            ``{"transcription": text}`` on success, otherwise
            ``{"error": message}``. Exceptions are converted to an error
            mapping instead of propagating.
        """
        try:
            inputs = data.get("inputs", "")

            # Decode audio
            if isinstance(inputs, str):
                try:
                    audio_bytes = base64.b64decode(inputs)
                except ValueError:
                    # binascii.Error (bad padding) and the non-ASCII check are
                    # both ValueError subclasses/instances.
                    return {"error": "Invalid base64 encoded audio"}
            elif isinstance(inputs, (bytes, bytearray)):
                audio_bytes = bytes(inputs)
            else:
                return {"error": "Invalid input format. Expected base64 string or bytes"}

            # Reject empty payloads up front instead of letting the audio
            # decoder fail with an opaque message.
            if not audio_bytes:
                return {"error": "Invalid or empty audio file"}
            if len(audio_bytes) > self.MAX_AUDIO_BYTES:
                return {"error": "File too large (max 25MB)"}

            # An explicit `"parameters": null` must behave like a missing key;
            # anything else that is not a mapping is rejected before the
            # (comparatively expensive) audio decoding starts.
            parameters = data.get("parameters") or {}
            if not isinstance(parameters, dict):
                return {"error": "Invalid parameters. Expected a JSON object"}

            # Load audio: mono, resampled, truncated to MAX_AUDIO_SECONDS.
            audio_array, _ = librosa.load(
                io.BytesIO(audio_bytes),
                sr=self.SAMPLE_RATE,
                mono=True,
                duration=self.MAX_AUDIO_SECONDS,
            )
            if len(audio_array) == 0:
                return {"error": "Invalid or empty audio file"}

            # Process audio WITHOUT language/task specification; the decoder
            # prompt is supplied explicitly at generation time instead.
            features = self.processor(
                audio_array,
                sampling_rate=self.SAMPLE_RATE,
                return_tensors="pt",
            )

            # Move tensors to the model's device. Floating-point tensors are
            # cast to the model's own dtype so they match whichever precision
            # (bfloat16 or float16) the model was loaded with.
            device = self.model.device
            dtype = self.model.dtype
            model_inputs = {}
            for name, tensor in features.items():
                if name == "forced_decoder_ids":
                    # Never forward forced ids; the explicit prompt is used.
                    continue
                if tensor.is_floating_point():
                    model_inputs[name] = tensor.to(device=device, dtype=dtype)
                else:
                    model_inputs[name] = tensor.to(device)

            # Parameters
            max_length = parameters.get("max_length", 256)
            num_beams = parameters.get("num_beams", 6)
            temperature = parameters.get("temperature", 0.0)

            # Generate with explicit decoder_input_ids. Autocast follows the
            # model's device/dtype and is only enabled on CUDA.
            with torch.inference_mode(), torch.autocast(
                device_type=device.type,
                dtype=dtype,
                enabled=device.type == "cuda",
            ):
                predicted_ids = self.model.generate(
                    **model_inputs,
                    decoder_input_ids=self.french_decoder_input_ids,
                    max_length=max_length,
                    num_beams=num_beams,
                    temperature=temperature,
                    do_sample=False,
                    early_stopping=True,
                    no_repeat_ngram_size=3,
                    repetition_penalty=1.1,
                    length_penalty=1.0,
                    use_cache=True,
                    pad_token_id=self.processor.tokenizer.eos_token_id,
                )

            transcription = self.processor.batch_decode(
                predicted_ids, skip_special_tokens=True
            )
            return {"transcription": transcription[0]}
        except Exception as e:
            # Request boundary: report the failure to the caller and leave a
            # trace in the server output rather than crashing the endpoint.
            print(f"Transcription error: {e!r}")
            return {"error": f"Transcription error: {str(e)}"}