inayatarshad committed on
Commit
249e156
·
1 Parent(s): bc14aa5

Use soundfile for audio waveform loading

Browse files
Files changed (2) hide show
  1. app.py +6 -11
  2. requirements.txt +1 -0
app.py CHANGED
@@ -8,13 +8,13 @@ from pathlib import Path
8
  from typing import Optional
9
 
10
  import numpy as np
 
11
  from fastapi import FastAPI
12
  from fastapi.middleware.cors import CORSMiddleware
13
  from huggingface_hub import hf_hub_download
14
  from pydantic import BaseModel
15
  import torch
16
  import torch.nn as nn
17
- import torchaudio
18
  from transformers import AutoModelForTokenClassification, AutoTokenizer, Wav2Vec2Model, Wav2Vec2Processor, pipeline
19
 
20
 
@@ -388,20 +388,15 @@ def predict_audio(audio_payload: str) -> dict:
388
  },
389
  }
390
 
391
- waveform, sample_rate = torchaudio.load(wav_path)
392
- if waveform.shape[0] > 1:
393
- waveform = waveform.mean(dim=0, keepdim=True)
394
 
395
- if sample_rate != 16000:
396
- resampler = torchaudio.transforms.Resample(sample_rate, 16000)
397
- waveform = resampler(waveform)
398
-
399
- waveform = waveform.squeeze()
400
- if waveform.numel() > MAX_AUDIO_LENGTH:
401
  waveform = waveform[:MAX_AUDIO_LENGTH]
402
 
403
  inputs = processor(
404
- waveform.cpu().numpy(),
405
  sampling_rate=16000,
406
  return_tensors="pt",
407
  padding=True,
 
8
  from typing import Optional
9
 
10
  import numpy as np
11
+ import soundfile as sf
12
  from fastapi import FastAPI
13
  from fastapi.middleware.cors import CORSMiddleware
14
  from huggingface_hub import hf_hub_download
15
  from pydantic import BaseModel
16
  import torch
17
  import torch.nn as nn
 
18
  from transformers import AutoModelForTokenClassification, AutoTokenizer, Wav2Vec2Model, Wav2Vec2Processor, pipeline
19
 
20
 
 
388
  },
389
  }
390
 
391
+ waveform, sample_rate = sf.read(wav_path, dtype="float32")
392
+ if waveform.ndim > 1:
393
+ waveform = waveform.mean(axis=1)
394
 
395
+ if waveform.shape[0] > MAX_AUDIO_LENGTH:
 
 
 
 
 
396
  waveform = waveform[:MAX_AUDIO_LENGTH]
397
 
398
  inputs = processor(
399
+ waveform,
400
  sampling_rate=16000,
401
  return_tensors="pt",
402
  padding=True,
requirements.txt CHANGED
@@ -5,3 +5,4 @@ huggingface_hub==0.30.2
5
  transformers==4.51.3
6
  safetensors==0.5.3
7
  numpy==2.2.6
 
 
5
  transformers==4.51.3
6
  safetensors==0.5.3
7
  numpy==2.2.6
8
+ soundfile==0.13.1