Spaces:

MoHamdyy
/

Translation_Stack

Sleeping

MoHamdyy committed on Jul 13, 2025

Commit

403dd60

1 Parent(s): 574f683

Fix: Correctly pass GenerationConfig to Whisper model

Files changed (1) hide show

app.py CHANGED Viewed

@@ -403,10 +403,18 @@ def full_speech_translation_pipeline(audio_input_path: str):
         print("STT: Extracting features and transcribing...")
         inputs = stt_processor(audio_array_stt, sampling_rate=target_sr_stt, return_tensors="pt").input_features.to(DEVICE)
         forced_ids = stt_processor.get_decoder_prompt_ids(language="arabic", task="transcribe")
         with torch.no_grad():
-            generated_ids = stt_model.generate(inputs, forced_decoder_ids=forced_ids, max_length=448)
-        arabic_transcript = stt_processor.decode(generated_ids[0], skip_special_tokens=True).strip()
         print(f"STT Output: {arabic_transcript}")
     except Exception as e:
         print(f"STT Error: {e}")
@@ -439,14 +447,6 @@ def full_speech_translation_pipeline(audio_input_path: str):
             if generated_mel is not None and generated_mel.numel() > 0:
                 mel_for_vocoder = generated_mel.detach().squeeze(0).transpose(0, 1)
                 audio_tensor = inverse_mel_spec_to_wav(mel_for_vocoder)
-                synthesized_audio_np = audio_tensor.cpu().numpy()
-                print(f"TTS: Synthesized audio shape: {synthesized_audio_np.shape}")
-        except Exception as e:
-            print(f"TTS Error: {e}")
-    print(f"--- PIPELINE END ---")
-    return arabic_transcript, english_translation, (hp.sr, synthesized_audio_np)
 # --- Part 4: Gradio Interface Definition ---
 # (Same as before)

         print("STT: Extracting features and transcribing...")
         inputs = stt_processor(audio_array_stt, sampling_rate=target_sr_stt, return_tensors="pt").input_features.to(DEVICE)
         forced_ids = stt_processor.get_decoder_prompt_ids(language="arabic", task="transcribe")
         with torch.no_grad():
+            # Pass forced_decoder_ids via a GenerationConfig to avoid unused kwargs error
+            generation_config = stt_model.generation_config.copy()
+            generation_config.forced_decoder_ids = forced_ids
+            generated_ids = stt_model.generate(inputs, generation_config=generation_config, max_length=448)
+        # Use batch_decode for robustness
+        arabic_transcript = stt_processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
         print(f"STT Output: {arabic_transcript}")
     except Exception as e:
         print(f"STT Error: {e}")
             if generated_mel is not None and generated_mel.numel() > 0:
                 mel_for_vocoder = generated_mel.detach().squeeze(0).transpose(0, 1)
                 audio_tensor = inverse_mel_spec_to_wav(mel_for_vocoder)
 # --- Part 4: Gradio Interface Definition ---
 # (Same as before)