Update app.py
Browse files
app.py
CHANGED
|
@@ -17,10 +17,16 @@ def speech_to_text(audio):
|
|
| 17 |
return "No audio provided."
|
| 18 |
|
| 19 |
# Load the audio file
|
| 20 |
-
input_audio,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
# Process the audio for ASR
|
| 23 |
-
input_audio = asr_processor(input_audio, sampling_rate=
|
| 24 |
logits = asr_model(input_audio.input_values).logits
|
| 25 |
predicted_ids = torch.argmax(logits, dim=-1)
|
| 26 |
transcription = asr_processor.decode(predicted_ids[0])
|
|
|
|
| 17 |
return "No audio provided."
|
| 18 |
|
| 19 |
# Load the audio file
|
| 20 |
+
input_audio, original_sample_rate = torchaudio.load(audio)
|
| 21 |
+
|
| 22 |
+
# Resample the audio to 16,000 Hz if necessary
|
| 23 |
+
target_sample_rate = 16000
|
| 24 |
+
if original_sample_rate != target_sample_rate:
|
| 25 |
+
resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=target_sample_rate)
|
| 26 |
+
input_audio = resampler(input_audio)
|
| 27 |
|
| 28 |
# Process the audio for ASR
|
| 29 |
+
input_audio = asr_processor(input_audio, sampling_rate=target_sample_rate, return_tensors="pt", padding=True)
|
| 30 |
logits = asr_model(input_audio.input_values).logits
|
| 31 |
predicted_ids = torch.argmax(logits, dim=-1)
|
| 32 |
transcription = asr_processor.decode(predicted_ids[0])
|