import base64
import contextlib
import io
from typing import Any, Dict

import librosa
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# Largest decoded audio payload the handler accepts.
_MAX_AUDIO_BYTES = 25 * 1024 * 1024
# Sample rate the audio is resampled to before feature extraction.
_SAMPLE_RATE = 16000
# Only this many leading seconds of audio are loaded; the rest is ignored.
_MAX_AUDIO_SECONDS = 30
# (torch.compile mode, label printed if torch.compile raises for that mode).
_COMPILE_ATTEMPTS = (
    ("max-autotune", "Max-autotune compilation failed"),
    ("reduce-overhead", "Compilation failed"),
)


class EndpointHandler:
    """Callable endpoint handler that transcribes French speech with Whisper.

    The constructor loads the model and processor from ``path``, optionally
    wraps the model with ``torch.compile`` and precomputes the decoder prompt
    for French transcription. Calling the instance with a request dict
    returns either ``{"transcription": text}`` or ``{"error": message}``.
    """

    def __init__(self, path=""):
        """Load the Whisper model and processor found at ``path``.

        Args:
            path: Model directory or identifier forwarded to
                ``from_pretrained``.

        Raises:
            Exception: Any error raised while loading or preparing the model
                is printed and re-raised unchanged.
        """
        print("Loading Whisper model...")
        try:
            self.model = self._load_model(path)
            self.processor = WhisperProcessor.from_pretrained(path)
            self.model.eval()

            # Remember where and in which precision the weights live so that
            # request tensors can be made to match them exactly. Captured
            # before the model is (possibly) wrapped by torch.compile.
            self._device = self.model.device
            self._dtype = self.model.dtype

            self.model = self._compile_model(self.model)

            # Precompute decoder_input_ids for French transcription.
            # NOTE(review): get_decoder_prompt_ids presumably yields
            # (position, token_id) pairs that do not include the
            # start-of-transcript token; confirm that generate() prepends it
            # for the installed transformers version.
            forced_ids = self.processor.get_decoder_prompt_ids(
                language="french", task="transcribe"
            )
            self.french_decoder_input_ids = torch.tensor(
                [[tok_id for _, tok_id in forced_ids]],
                device=self._device,
            )

            print("Model loaded and optimized successfully!")
        except Exception as e:
            print(f"Error loading model: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    @staticmethod
    def _load_model(path):
        """Load the model with Flash Attention 2, else with default attention."""
        try:
            model = WhisperForConditionalGeneration.from_pretrained(
                path,
                torch_dtype=torch.bfloat16,
                device_map={"": 0},
                attn_implementation="flash_attention_2",
            )
        except (ImportError, ValueError):
            # ImportError: flash_attn missing or too old. ValueError is also
            # caught because transformers reportedly uses it when Flash
            # Attention 2 cannot be enabled (e.g. no usable GPU) -- hedged,
            # version dependent. If the cause was something else, the plain
            # load below fails again and the error propagates.
            print("⚠️ Flash Attention not available, fallback to eager")
            return WhisperForConditionalGeneration.from_pretrained(
                path,
                torch_dtype=torch.float16,
                device_map="auto",
            )
        print("✅ Flash Attention 2 activated!")
        return model

    @staticmethod
    def _compile_model(model):
        """Wrap ``model`` with torch.compile when possible, else return it as is."""
        if not hasattr(torch, "compile"):
            return model
        # NOTE: torch.compile generally defers the real compilation to the
        # first forward pass, so this only handles errors raised eagerly by
        # the torch.compile() call itself.
        for mode, failure_label in _COMPILE_ATTEMPTS:
            try:
                compiled = torch.compile(model, mode=mode)
            except Exception as exc:
                print(f"{failure_label}: {exc}")
                continue
            print(f"Model compiled with {mode}!")
            return compiled
        return model

    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
        """Transcribe the audio contained in a request.

        Args:
            data: Request dict. ``"inputs"`` holds the audio as a base64
                string or raw bytes. ``"parameters"`` (optional) may override
                ``max_length`` (default 256), ``num_beams`` (default 6) and
                ``temperature`` (default 0.0).

        Returns:
            ``{"transcription": text}`` on success, otherwise
            ``{"error": message}``. Exceptions are never propagated; they are
            reported through the ``"error"`` key.
        """
        try:
            inputs = data.get("inputs", "")
            # "parameters" may be present but null in the request payload.
            parameters = data.get("parameters") or {}

            # Decode audio
            if isinstance(inputs, str):
                try:
                    audio_bytes = base64.b64decode(inputs)
                except ValueError:
                    # binascii.Error is a ValueError subclass.
                    return {"error": "Invalid base64 encoded audio"}
            elif isinstance(inputs, (bytes, bytearray)):
                audio_bytes = bytes(inputs)
            else:
                return {"error": "Invalid input format. Expected base64 string or bytes"}

            if len(audio_bytes) > _MAX_AUDIO_BYTES:
                return {"error": "File too large (max 25MB)"}
            if not audio_bytes:
                # Fail with a clear message instead of an opaque decoder error.
                return {"error": "Invalid or empty audio file"}

            # Load audio: resampled, mixed down to mono, truncated to the
            # first _MAX_AUDIO_SECONDS seconds.
            audio_array, _ = librosa.load(
                io.BytesIO(audio_bytes),
                sr=_SAMPLE_RATE,
                mono=True,
                duration=_MAX_AUDIO_SECONDS,
            )
            if len(audio_array) == 0:
                return {"error": "Invalid or empty audio file"}

            # Process audio WITHOUT language/task specification; the decoder
            # prompt is supplied explicitly at generation time instead.
            features = self.processor(
                audio_array,
                sampling_rate=_SAMPLE_RATE,
                return_tensors="pt",
            )

            device, dtype = self._device, self._dtype
            # Move to the model's device; floating-point tensors are cast to
            # the model's own dtype (bfloat16 or float16 depending on which
            # load path succeeded). Any forced_decoder_ids entry is dropped.
            model_inputs = {
                name: (
                    tensor.to(device, dtype=dtype)
                    if tensor.is_floating_point()
                    else tensor.to(device)
                )
                for name, tensor in features.items()
                if name != "forced_decoder_ids"
            }

            # Parameters
            max_length = parameters.get("max_length", 256)
            num_beams = parameters.get("num_beams", 6)
            temperature = parameters.get("temperature", 0.0)

            # Autocast is only applied on CUDA and follows the model dtype.
            if device.type == "cuda":
                autocast = torch.autocast(device_type="cuda", dtype=dtype)
            else:
                autocast = contextlib.nullcontext()

            # Generate with explicit decoder_input_ids; inference_mode already
            # disables gradient tracking.
            with torch.inference_mode(), autocast:
                predicted_ids = self.model.generate(
                    **model_inputs,
                    decoder_input_ids=self.french_decoder_input_ids,
                    max_length=max_length,
                    num_beams=num_beams,
                    temperature=temperature,
                    do_sample=False,
                    early_stopping=True,
                    no_repeat_ngram_size=3,
                    repetition_penalty=1.1,
                    length_penalty=1.0,
                    use_cache=True,
                    pad_token_id=self.processor.tokenizer.eos_token_id,
                )

            transcription = self.processor.batch_decode(
                predicted_ids, skip_special_tokens=True
            )
            return {"transcription": transcription[0]}

        except Exception as e:
            # Top-level boundary: report every failure to the caller as data.
            return {"error": f"Transcription error: {e}"}