SoSolaris
/

stt

Safetensors

whisper

Model card | Files and versions

xet

Community

SoSolaris committed on Aug 20, 2025

Commit

42ab4b8

verified ·

1 Parent(s): bb23501

Update handler.py

Browse files

Files changed (1) hide show

handler.py +12 -11

handler.py CHANGED Viewed

@@ -40,7 +40,7 @@ class EndpointHandler:
                     except Exception as e2:
                         print(f"Compilation failed: {e2}")
-            # precompute decoder_input_ids for French transcription
             forced_ids = self.processor.get_decoder_prompt_ids(language="french", task="transcribe")
             self.french_decoder_input_ids = torch.tensor(
                 [[tok_id for _, tok_id in forced_ids]],
@@ -57,7 +57,7 @@ class EndpointHandler:
             inputs = data.get("inputs", "")
             parameters = data.get("parameters", {})
-            # decode audio
             if isinstance(inputs, str):
                 try:
                     audio_bytes = base64.b64decode(inputs)
@@ -71,7 +71,7 @@ class EndpointHandler:
             if len(audio_bytes) > 25 * 1024 * 1024:
                 return {"error": "File too large (max 25MB)"}
-            # load audio
             audio_array, _ = librosa.load(
                 io.BytesIO(audio_bytes),
                 sr=16000,
@@ -81,30 +81,33 @@ class EndpointHandler:
             if len(audio_array) == 0:
                 return {"error": "Invalid or empty audio file"}
-            # processor injecte forced_decoder_ids -> on les enlève
             model_inputs = self.processor(
                 audio_array,
                 sampling_rate=16000,
                 return_tensors="pt"
             )
             if "forced_decoder_ids" in model_inputs:
                 del model_inputs["forced_decoder_ids"]
             model_inputs = {
                 k: v.to(self.model.device).half() if v.dtype == torch.float32 else v.to(self.model.device)
                 for k, v in model_inputs.items()
             }
-            # params
             max_length = parameters.get("max_length", 256)
             num_beams = parameters.get("num_beams", 6)
             temperature = parameters.get("temperature", 0.0)
-            # generate
             with torch.no_grad(), torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16):
                 predicted_ids = self.model.generate(
                     **model_inputs,
-                    decoder_input_ids=self.french_decoder_input_ids,  # ✅ seul forçage langue
                     max_length=max_length,
                     num_beams=num_beams,
                     temperature=temperature,
@@ -114,12 +117,10 @@ class EndpointHandler:
                     repetition_penalty=1.1,
                     length_penalty=1.0,
                     use_cache=True,
-                    pad_token_id=self.processor.tokenizer.eos_token_id,
-                    suppress_tokens=[],
-                    begin_suppress_tokens=[]
                 )
             transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
             return {"transcription": transcription[0]}
         except Exception as e:
-            return {"error": f"Transcription error: {str(e)}"}

                     except Exception as e2:
                         print(f"Compilation failed: {e2}")
+            # Precompute decoder_input_ids for French transcription
             forced_ids = self.processor.get_decoder_prompt_ids(language="french", task="transcribe")
             self.french_decoder_input_ids = torch.tensor(
                 [[tok_id for _, tok_id in forced_ids]],
             inputs = data.get("inputs", "")
             parameters = data.get("parameters", {})
+            # Decode audio
             if isinstance(inputs, str):
                 try:
                     audio_bytes = base64.b64decode(inputs)
             if len(audio_bytes) > 25 * 1024 * 1024:
                 return {"error": "File too large (max 25MB)"}
+            # Load audio
             audio_array, _ = librosa.load(
                 io.BytesIO(audio_bytes),
                 sr=16000,
             if len(audio_array) == 0:
                 return {"error": "Invalid or empty audio file"}
+            # Process audio WITHOUT language/task specification to avoid forced_decoder_ids
             model_inputs = self.processor(
                 audio_array,
                 sampling_rate=16000,
                 return_tensors="pt"
             )
+            # Remove any forced_decoder_ids that might have been added
             if "forced_decoder_ids" in model_inputs:
                 del model_inputs["forced_decoder_ids"]
+            # Move to device and convert dtype
             model_inputs = {
                 k: v.to(self.model.device).half() if v.dtype == torch.float32 else v.to(self.model.device)
                 for k, v in model_inputs.items()
             }
+            # Parameters
             max_length = parameters.get("max_length", 256)
             num_beams = parameters.get("num_beams", 6)
             temperature = parameters.get("temperature", 0.0)
+            # Generate with explicit decoder_input_ids
             with torch.no_grad(), torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16):
                 predicted_ids = self.model.generate(
                     **model_inputs,
+                    decoder_input_ids=self.french_decoder_input_ids,
                     max_length=max_length,
                     num_beams=num_beams,
                     temperature=temperature,
                     repetition_penalty=1.1,
                     length_penalty=1.0,
                     use_cache=True,
+                    pad_token_id=self.processor.tokenizer.eos_token_id
                 )
             transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)
             return {"transcription": transcription[0]}
         except Exception as e:
+            return {"error": f"Transcription error: {str(e)}"}