SoSolaris committed on
Commit
f7c80df
·
verified ·
1 Parent(s): 2bcb732

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +9 -8
handler.py CHANGED
@@ -40,12 +40,9 @@ class EndpointHandler:
40
  except Exception as e2:
41
  print(f"Compilation failed: {e2}")
42
 
43
- # pre-compute decoder ids for french
44
- self.french_decoder_ids = torch.tensor(
45
- self.processor.get_decoder_prompt_ids(
46
- language="french", task="transcribe"
47
- ),
48
- device="cuda" if torch.cuda.is_available() else "cpu"
49
  )
50
 
51
  print("Model loaded and optimized successfully!")
@@ -58,6 +55,7 @@ class EndpointHandler:
58
  inputs = data.get("inputs", "")
59
  parameters = data.get("parameters", {})
60
 
 
61
  if isinstance(inputs, str):
62
  try:
63
  audio_bytes = base64.b64decode(inputs)
@@ -68,9 +66,11 @@ class EndpointHandler:
68
  else:
69
  return {"error": "Invalid input format. Expected base64 string or bytes"}
70
 
 
71
  if len(audio_bytes) > 25 * 1024 * 1024:
72
  return {"error": "File too large (max 25MB)"}
73
 
 
74
  audio_array, _ = librosa.load(
75
  io.BytesIO(audio_bytes),
76
  sr=16000,
@@ -86,20 +86,20 @@ class EndpointHandler:
86
  sampling_rate=16000,
87
  return_tensors="pt"
88
  )
89
-
90
  model_inputs = {
91
  k: v.to(self.model.device).half() if v.dtype == torch.float32 else v.to(self.model.device)
92
  for k, v in model_inputs.items()
93
  }
94
 
 
95
  max_length = parameters.get("max_length", 256)
96
  num_beams = parameters.get("num_beams", 6)
97
  temperature = parameters.get("temperature", 0.0)
98
 
 
99
  with torch.no_grad(), torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16):
100
  predicted_ids = self.model.generate(
101
  **model_inputs,
102
- decoder_input_ids=self.french_decoder_ids,
103
  max_length=max_length,
104
  num_beams=num_beams,
105
  temperature=temperature,
@@ -110,6 +110,7 @@ class EndpointHandler:
110
  length_penalty=1.0,
111
  use_cache=True,
112
  pad_token_id=self.processor.tokenizer.eos_token_id,
 
113
  suppress_tokens=[],
114
  begin_suppress_tokens=[]
115
  )
 
40
  except Exception as e2:
41
  print(f"Compilation failed: {e2}")
42
 
43
+ # forced_decoder_ids pour français (comme fastapi)
44
+ self.french_decoder_ids = self.processor.get_decoder_prompt_ids(
45
+ language="french", task="transcribe"
 
 
 
46
  )
47
 
48
  print("Model loaded and optimized successfully!")
 
55
  inputs = data.get("inputs", "")
56
  parameters = data.get("parameters", {})
57
 
58
+ # decode audio (base64 string or bytes)
59
  if isinstance(inputs, str):
60
  try:
61
  audio_bytes = base64.b64decode(inputs)
 
66
  else:
67
  return {"error": "Invalid input format. Expected base64 string or bytes"}
68
 
69
+ # check size
70
  if len(audio_bytes) > 25 * 1024 * 1024:
71
  return {"error": "File too large (max 25MB)"}
72
 
73
+ # load audio
74
  audio_array, _ = librosa.load(
75
  io.BytesIO(audio_bytes),
76
  sr=16000,
 
86
  sampling_rate=16000,
87
  return_tensors="pt"
88
  )
 
89
  model_inputs = {
90
  k: v.to(self.model.device).half() if v.dtype == torch.float32 else v.to(self.model.device)
91
  for k, v in model_inputs.items()
92
  }
93
 
94
+ # params
95
  max_length = parameters.get("max_length", 256)
96
  num_beams = parameters.get("num_beams", 6)
97
  temperature = parameters.get("temperature", 0.0)
98
 
99
+ # generate
100
  with torch.no_grad(), torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16):
101
  predicted_ids = self.model.generate(
102
  **model_inputs,
 
103
  max_length=max_length,
104
  num_beams=num_beams,
105
  temperature=temperature,
 
110
  length_penalty=1.0,
111
  use_cache=True,
112
  pad_token_id=self.processor.tokenizer.eos_token_id,
113
+ forced_decoder_ids=self.french_decoder_ids, # ✅ identique à fastapi
114
  suppress_tokens=[],
115
  begin_suppress_tokens=[]
116
  )