import base64
import io
from typing import Dict, Any

import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
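

# Custom handler for a Hugging Face Inference Endpoints deployment: the service
# instantiates EndpointHandler(path) once at startup and calls __call__(data)
# for each incoming request.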
class EndpointHandler:
    def __init__(self, path=""):
        print("Loading Whisper model...")
        try:
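            # Try Flash Attention 2 first (requires the flash-attn package and
            # a supported GPU); from_pretrained raises ImportError when the
            # package is missing, which triggers the fallback below.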
            try:
                self.model = WhisperForConditionalGeneration.from_pretrained(
                    path,
                    torch_dtype=torch.bfloat16,
                    device_map={"": 0},
                    attn_implementation="flash_attention_2",
                )
                print("✅ Flash Attention 2 activated!")
            except ImportError:
                print("⚠️ Flash Attention not available, falling back to the default attention implementation")
                self.model = WhisperForConditionalGeneration.from_pretrained(
                    path,
                    torch_dtype=torch.float16,
                    device_map="auto",
                )

            self.processor = WhisperProcessor.from_pretrained(path)
            self.model.eval()

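            # torch.compile specializes on input shapes, so the first request
            # after startup typically pays a one-time compilation cost.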
            if hasattr(torch, "compile"):
                try:
                    self.model = torch.compile(self.model, mode="max-autotune")
                    print("Model compiled with max-autotune!")
                except Exception as e:
                    print(f"Max-autotune compilation failed: {e}")
                    try:
                        self.model = torch.compile(self.model, mode="reduce-overhead")
                        print("Model compiled with reduce-overhead!")
                    except Exception as e2:
                        print(f"Compilation failed: {e2}")

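            # get_decoder_prompt_ids returns (position, token_id) pairs for the
            # language/task prompt; keep only the token ids and reuse them as a
            # fixed decoder prompt that forces French transcription.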
            forced_ids = self.processor.get_decoder_prompt_ids(language="french", task="transcribe")
            self.french_decoder_input_ids = torch.tensor(
                [[tok_id for _, tok_id in forced_ids]],
                device=self.model.device,
            )

            print("Model loaded and optimized successfully!")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
        try:
            inputs = data.get("inputs", "")
            parameters = data.get("parameters", {})

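            # The payload may carry audio either as a base64 string (JSON
            # requests) or as raw bytes (binary requests).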
            if isinstance(inputs, str):
                try:
                    audio_bytes = base64.b64decode(inputs)
                except Exception:
                    return {"error": "Invalid base64-encoded audio"}
            elif isinstance(inputs, bytes):
                audio_bytes = inputs
            else:
                return {"error": "Invalid input format. Expected base64 string or bytes"}

            if len(audio_bytes) > 25 * 1024 * 1024:
                return {"error": "File too large (max 25 MB)"}

            audio_array, _ = librosa.load(
                io.BytesIO(audio_bytes),
                sr=16000,
                mono=True,
                duration=30,
            )
            if len(audio_array) == 0:
                return {"error": "Invalid or empty audio file"}

            model_inputs = self.processor(
                audio_array,
                sampling_rate=16000,
                return_tensors="pt",
            )

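            # Defensive: drop any forced_decoder_ids the processor may have
            # attached, since an explicit decoder prompt is passed below.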
            if "forced_decoder_ids" in model_inputs:
                del model_inputs["forced_decoder_ids"]

            # Move tensors to the model's device and cast float32 features to
            # the model's dtype (bfloat16 on the Flash Attention path, float16
            # on the fallback); a blanket .half() would mismatch a bfloat16 model.
            model_inputs = {
                k: v.to(self.model.device, dtype=self.model.dtype) if v.dtype == torch.float32 else v.to(self.model.device)
                for k, v in model_inputs.items()
            }

            max_length = parameters.get("max_length", 256)
            num_beams = parameters.get("num_beams", 6)
            # A "temperature" parameter would be ignored by beam search with
            # do_sample=False, so it is not forwarded to generate().

            # inference_mode subsumes no_grad, and an explicit float16 autocast
            # is unnecessary (and fails on CPU-only hosts) since weights and
            # features already share a reduced-precision dtype.
            with torch.inference_mode():
                predicted_ids = self.model.generate(
                    **model_inputs,
                    decoder_input_ids=self.french_decoder_input_ids,
                    max_length=max_length,
                    num_beams=num_beams,
                    do_sample=False,
                    early_stopping=True,
                    no_repeat_ngram_size=3,
                    repetition_penalty=1.1,
                    length_penalty=1.0,
                    use_cache=True,
                    pad_token_id=self.processor.tokenizer.eos_token_id,
                )

            transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
            return {"transcription": transcription[0]}
        except Exception as e:
            return {"error": f"Transcription error: {str(e)}"}