Spaces:
Sleeping
Sleeping
Commit ·
249e156
1
Parent(s): bc14aa5
Use soundfile for audio waveform loading
Browse files
- app.py +6 -11
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -8,13 +8,13 @@ from pathlib import Path
|
|
| 8 |
from typing import Optional
|
| 9 |
|
| 10 |
import numpy as np
|
|
|
|
| 11 |
from fastapi import FastAPI
|
| 12 |
from fastapi.middleware.cors import CORSMiddleware
|
| 13 |
from huggingface_hub import hf_hub_download
|
| 14 |
from pydantic import BaseModel
|
| 15 |
import torch
|
| 16 |
import torch.nn as nn
|
| 17 |
-
import torchaudio
|
| 18 |
from transformers import AutoModelForTokenClassification, AutoTokenizer, Wav2Vec2Model, Wav2Vec2Processor, pipeline
|
| 19 |
|
| 20 |
|
|
@@ -388,20 +388,15 @@ def predict_audio(audio_payload: str) -> dict:
|
|
| 388 |
},
|
| 389 |
}
|
| 390 |
|
| 391 |
-
waveform, sample_rate =
|
| 392 |
-
if waveform.
|
| 393 |
-
waveform = waveform.mean(
|
| 394 |
|
| 395 |
-
if
|
| 396 |
-
resampler = torchaudio.transforms.Resample(sample_rate, 16000)
|
| 397 |
-
waveform = resampler(waveform)
|
| 398 |
-
|
| 399 |
-
waveform = waveform.squeeze()
|
| 400 |
-
if waveform.numel() > MAX_AUDIO_LENGTH:
|
| 401 |
waveform = waveform[:MAX_AUDIO_LENGTH]
|
| 402 |
|
| 403 |
inputs = processor(
|
| 404 |
-
waveform
|
| 405 |
sampling_rate=16000,
|
| 406 |
return_tensors="pt",
|
| 407 |
padding=True,
|
|
|
|
| 8 |
from typing import Optional
|
| 9 |
|
| 10 |
import numpy as np
|
| 11 |
+
import soundfile as sf
|
| 12 |
from fastapi import FastAPI
|
| 13 |
from fastapi.middleware.cors import CORSMiddleware
|
| 14 |
from huggingface_hub import hf_hub_download
|
| 15 |
from pydantic import BaseModel
|
| 16 |
import torch
|
| 17 |
import torch.nn as nn
|
|
|
|
| 18 |
from transformers import AutoModelForTokenClassification, AutoTokenizer, Wav2Vec2Model, Wav2Vec2Processor, pipeline
|
| 19 |
|
| 20 |
|
|
|
|
| 388 |
},
|
| 389 |
}
|
| 390 |
|
| 391 |
+
waveform, sample_rate = sf.read(wav_path, dtype="float32")
|
| 392 |
+
if waveform.ndim > 1:
|
| 393 |
+
waveform = waveform.mean(axis=1)
|
| 394 |
|
| 395 |
+
if waveform.shape[0] > MAX_AUDIO_LENGTH:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
waveform = waveform[:MAX_AUDIO_LENGTH]
|
| 397 |
|
| 398 |
inputs = processor(
|
| 399 |
+
waveform,
|
| 400 |
sampling_rate=16000,
|
| 401 |
return_tensors="pt",
|
| 402 |
padding=True,
|
requirements.txt
CHANGED
|
@@ -5,3 +5,4 @@ huggingface_hub==0.30.2
|
|
| 5 |
transformers==4.51.3
|
| 6 |
safetensors==0.5.3
|
| 7 |
numpy==2.2.6
|
|
|
|
|
|
| 5 |
transformers==4.51.3
|
| 6 |
safetensors==0.5.3
|
| 7 |
numpy==2.2.6
|
| 8 |
+
soundfile==0.13.1
|