Update handler.py
handler.py  +29 -61
@@ -3,18 +3,12 @@ import torch
 import librosa
 import io
 import base64
-from typing import Dict,
-import json
+from typing import Dict, Any
 
 class EndpointHandler:
     def __init__(self, path=""):
-        """
-        Initialize the handler for Hugging Face Inference Endpoints
-        """
         print("Loading Whisper model...")
-
         try:
-            # Try Flash Attention 2 first
             try:
                 self.model = WhisperForConditionalGeneration.from_pretrained(
                     path,
@@ -30,55 +24,41 @@ class EndpointHandler:
                     torch_dtype=torch.float16,
                     device_map="auto"
                 )
-
+
             self.processor = WhisperProcessor.from_pretrained(path)
-
-            # Set to evaluation mode
             self.model.eval()
-
-            # Compile model for optimization
+
             if hasattr(torch, 'compile'):
                 try:
                     self.model = torch.compile(self.model, mode="max-autotune")
                     print("Model compiled with max-autotune!")
                 except Exception as e:
-                    print(f"Max-autotune compilation failed
+                    print(f"Max-autotune compilation failed: {e}")
                     try:
                         self.model = torch.compile(self.model, mode="reduce-overhead")
                         print("Model compiled with reduce-overhead!")
                     except Exception as e2:
                         print(f"Compilation failed: {e2}")
-
-            #
-            self.french_decoder_ids =
-
-
+
+            # pre-compute decoder ids for french
+            self.french_decoder_ids = torch.tensor(
+                self.processor.get_decoder_prompt_ids(
+                    language="french", task="transcribe"
+                ),
+                device="cuda" if torch.cuda.is_available() else "cpu"
             )
-
+
             print("Model loaded and optimized successfully!")
-
         except Exception as e:
             print(f"Error loading model: {e}")
             raise e
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
-        """
-        Process the request
-        Args:
-            data (Dict): The request payload containing:
-                - "inputs": base64 encoded audio file or audio bytes
-                - "parameters": optional parameters for generation
-        Returns:
-            Dict: The transcription result
-        """
         try:
-            # Extract inputs
             inputs = data.get("inputs", "")
             parameters = data.get("parameters", {})
-
-            # Handle different input formats
+
             if isinstance(inputs, str):
-                # Assume base64 encoded audio
                 try:
                     audio_bytes = base64.b64decode(inputs)
                 except Exception:
@@ -87,48 +67,39 @@ class EndpointHandler:
                 audio_bytes = inputs
             else:
                 return {"error": "Invalid input format. Expected base64 string or bytes"}
-
-            # Validate file size (max 25MB)
+
             if len(audio_bytes) > 25 * 1024 * 1024:
                 return {"error": "File too large (max 25MB)"}
-
-
-
-                io.BytesIO(audio_bytes),
+
+            audio_array, _ = librosa.load(
+                io.BytesIO(audio_bytes),
                 sr=16000,
                 mono=True,
-                duration=30
+                duration=30
             )
-
-            # Validate audio
+
             if len(audio_array) == 0:
                 return {"error": "Invalid or empty audio file"}
-
-            # Process audio for the model
+
             model_inputs = self.processor(
-                audio_array,
-                sampling_rate=16000,
+                audio_array,
+                sampling_rate=16000,
                 return_tensors="pt"
             )
-
-            # Move inputs to same device and dtype as model
+
             model_inputs = {
-                k: v.to(self.model.device).half() if v.dtype == torch.float32 else v.to(self.model.device)
+                k: v.to(self.model.device).half() if v.dtype == torch.float32 else v.to(self.model.device)
                 for k, v in model_inputs.items()
             }
-
-            # Extract generation parameters
+
             max_length = parameters.get("max_length", 256)
             num_beams = parameters.get("num_beams", 6)
            temperature = parameters.get("temperature", 0.0)
-
-            # Generate transcription with anti-hallucination parameters
+
             with torch.no_grad(), torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16):
-                # Add language forcing to inputs instead of generation params
-                model_inputs.update(self.processor.get_decoder_prompt_ids(language="french", task="transcribe"))
-
                 predicted_ids = self.model.generate(
                     **model_inputs,
+                    decoder_input_ids=self.french_decoder_ids,
                     max_length=max_length,
                     num_beams=num_beams,
                     temperature=temperature,
@@ -142,11 +113,8 @@ class EndpointHandler:
                     suppress_tokens=[],
                     begin_suppress_tokens=[]
                 )
-
-            # Decode the transcription
+
             transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
-
             return {"transcription": transcription[0]}
-
         except Exception as e:
-            return {"error": f"Transcription error: {str(e)}"}
+            return {"error": f"Transcription error: {str(e)}"}
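A note on the new decoder-prompt handling: `WhisperProcessor.get_decoder_prompt_ids` returns a list of `(position, token_id)` pairs, so wrapping it in `torch.tensor` produces a 2-D tensor of pairs rather than a row of decoder token ids, and passing that to `generate` as `decoder_input_ids` may not force French the way this commit intends. The conventional Transformers pattern is to hand the pairs to `generate` unmodified via `forced_decoder_ids`. A minimal sketch of that alternative, assuming `model`, `processor`, and `model_inputs` are the objects this handler builds:

    # Sketch only, not the committed code: the standard forced_decoder_ids
    # pattern for Whisper. The pairs go to generate() as-is, no tensor needed.
    forced_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")
    # forced_ids looks like [(1, lang_token_id), (2, task_token_id)]

    predicted_ids = model.generate(
        model_inputs["input_features"],   # the processor's feature tensor, as in __call__
        forced_decoder_ids=forced_ids,    # list of (position, token_id) pairs
        max_length=256,
        num_beams=6,
    )

Depending on the transformers version, `model.generate(..., language="french", task="transcribe")` may also be accepted directly and is the newer spelling.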
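For reference, a request that satisfies the `__call__` contract looks like the following from the client side; the endpoint URL and token are placeholders, and the payload keys mirror exactly what the handler reads:

    # Sketch of a client call matching this handler's payload contract.
    # <ENDPOINT_URL> and <HF_TOKEN> are placeholders, not values from this repo.
    import base64
    import requests

    with open("sample.wav", "rb") as f:
        payload = {
            "inputs": base64.b64encode(f.read()).decode("utf-8"),  # <= 25MB decoded
            "parameters": {"max_length": 256, "num_beams": 6, "temperature": 0.0},
        }

    response = requests.post(
        "https://<ENDPOINT_URL>",                        # placeholder
        headers={"Authorization": "Bearer <HF_TOKEN>"},  # placeholder
        json=payload,
    )
    print(response.json())  # {"transcription": "..."} or {"error": "..."}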