Shikhar committed on
Commit
1a1b90c
·
1 Parent(s): c2c9a4c

Use soundfile for audio loading (no torchcodec/ffmpeg needed)

Browse files
Files changed (2) hide show
  1. app.py +5 -2
  2. requirements.txt +1 -0
app.py CHANGED
@@ -7,6 +7,7 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
7
  import gradio as gr
8
  import torch
9
  import torchaudio
 
10
  from huggingface_hub import hf_hub_download
11
 
12
  from src.model.xeusphoneme.builders import build_xeus_pr_inference
@@ -43,10 +44,12 @@ def transcribe(audio_path):
43
  if inference is None:
44
  inference = load_model()
45
 
46
- waveform, sr = torchaudio.load(audio_path)
 
 
 
47
  if sr != SAMPLE_RATE:
48
  waveform = torchaudio.functional.resample(waveform, sr, SAMPLE_RATE)
49
- waveform = waveform.mean(dim=0) # mono
50
  waveform = waveform[: SAMPLE_RATE * MAX_SECONDS]
51
 
52
  if waveform.numel() == 0:
 
7
  import gradio as gr
8
  import torch
9
  import torchaudio
10
+ import soundfile as sf
11
  from huggingface_hub import hf_hub_download
12
 
13
  from src.model.xeusphoneme.builders import build_xeus_pr_inference
 
44
  if inference is None:
45
  inference = load_model()
46
 
47
+ data, sr = sf.read(audio_path, dtype="float32")
48
+ waveform = torch.from_numpy(data)
49
+ if waveform.dim() == 2:
50
+ waveform = waveform.mean(dim=1)
51
  if sr != SAMPLE_RATE:
52
  waveform = torchaudio.functional.resample(waveform, sr, SAMPLE_RATE)
 
53
  waveform = waveform[: SAMPLE_RATE * MAX_SECONDS]
54
 
55
  if waveform.numel() == 0:
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  torch
2
  torchaudio
 
3
  huggingface_hub
4
  pyyaml
5
  typeguard
 
1
  torch
2
  torchaudio
3
+ soundfile
4
  huggingface_hub
5
  pyyaml
6
  typeguard