Upload 4 files
- Dockerfile +22 -0
- README.md +140 -6
- app.py +528 -0
- requirements.txt +12 -0
Dockerfile
ADDED
@@ -0,0 +1,22 @@
+FROM python:3.10-slim
+
+WORKDIR /app
+
+# Install system packages
+RUN apt-get update && apt-get install -y \
+    ffmpeg \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python packages
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the app
+COPY app.py .
+
+# Expose the port
+EXPOSE 7860
+
+# Run the server
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,11 +1,145 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Speechlib API
+emoji: 🎤
+colorFrom: blue
+colorTo: purple
 sdk: docker
+app_file: app.py
 pinned: false
-license: mit
 ---
 
-
+# Speechlib REST API (ECAPA-TDNN)
+
+Speaker diarization + speaker identification + speech recognition (STT) REST API
+
+## Features
+
+- **Speaker diarization**: separates multiple speakers with pyannote/speaker-diarization-3.1
+- **Speaker identification**: identifies registered speakers with speechbrain ECAPA-TDNN (high precision)
+- **Speech recognition**: STT using faster-whisper (large-v3-turbo)
+
+## API Endpoints
+
+### GET /
+API status check
+
+### GET /health
+Health check
+
+### POST /transcribe
+STT + speaker diarization only (no speaker identification)
+
+**Parameters (multipart/form-data):**
+- `audio`: audio file (required)
+- `language`: language code (default: ko)
+- `hf_token`: HuggingFace token (required)
+
+### POST /process
+Full pipeline: speaker diarization + speaker identification + STT
+
+**Parameters (multipart/form-data):**
+- `audio`: audio file to analyze (required)
+- `voice_sample`: speaker sample file (optional)
+- `speaker_name`: name of the speaker to identify (default: speaker)
+- `language`: language code (default: ko)
+- `hf_token`: HuggingFace token (required)
+
+## Usage Example
+
+### cURL
+
+```bash
+# STT only
+curl -X POST "https://YOUR_SPACE.hf.space/transcribe" \
+  -F "audio=@audio.wav" \
+  -F "language=ko" \
+  -F "hf_token=hf_YOUR_TOKEN"
+
+# With speaker identification
+curl -X POST "https://YOUR_SPACE.hf.space/process" \
+  -F "audio=@conversation.wav" \
+  -F "voice_sample=@speaker_sample.wav" \
+  -F "speaker_name=홍길동" \
+  -F "language=ko" \
+  -F "hf_token=hf_YOUR_TOKEN"
+```
+
+### Python
+
+```python
+import requests
+
+# STT only
+response = requests.post(
+    "https://YOUR_SPACE.hf.space/transcribe",
+    files={"audio": open("audio.wav", "rb")},
+    data={"language": "ko", "hf_token": "hf_YOUR_TOKEN"}
+)
+print(response.json())
+
+# With speaker identification
+response = requests.post(
+    "https://YOUR_SPACE.hf.space/process",
+    files={
+        "audio": open("conversation.wav", "rb"),
+        "voice_sample": open("speaker_sample.wav", "rb")
+    },
+    data={
+        "speaker_name": "홍길동",
+        "language": "ko",
+        "hf_token": "hf_YOUR_TOKEN"
+    }
+)
+print(response.json())
+```
+
+### JavaScript/Node.js
+
+```javascript
+const FormData = require('form-data');
+const fs = require('fs');
+const axios = require('axios');
+
+const form = new FormData();
+form.append('audio', fs.createReadStream('audio.wav'));
+form.append('language', 'ko');
+form.append('hf_token', 'hf_YOUR_TOKEN');
+
+const response = await axios.post(
+  'https://YOUR_SPACE.hf.space/transcribe',
+  form,
+  { headers: form.getHeaders() }
+);
+console.log(response.data);
+```
+
+## Response Format
+
+```json
+{
+  "success": true,
+  "segments": [
+    {
+      "start": 0.0,
+      "end": 2.5,
+      "text": "안녕하세요",
+      "speaker": "홍길동",
+      "similarity": 85.3
+    }
+  ],
+  "speaker_stats": {
+    "홍길동": {
+      "count": 10,
+      "duration": 45.5
+    }
+  },
+  "total_segments": 20
+}
+```
+
+## Notes
+
+- ECAPA-TDNN matches a speaker only when the similarity is at or above the 25% threshold (see the sketch after this diff)
+- The GPU is used automatically when one is available
+- Supported audio formats: wav, mp3, m4a, ogg, flac, aac
+- API docs: https://YOUR_SPACE.hf.space/docs
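The 25% figure in the Notes above is a cosine-similarity cutoff: ECAPA-TDNN embeddings are L2-normalized, so the dot product of a segment embedding with a registered reference embedding is their cosine similarity, and a registered name is assigned only when that value reaches the 0.25 `similarity_threshold` default in app.py. The following is a minimal illustrative sketch only, not part of the committed files; the helper `match_speaker` and the toy 3-dimensional vectors are invented for the example (real ECAPA-TDNN embeddings are 192-dimensional):

```python
import numpy as np

SIMILARITY_THRESHOLD = 0.25  # same default as similarity_threshold in app.py

def match_speaker(segment_emb, registered):
    """Return (name, similarity) of the best registered speaker, or (None, best) below threshold."""
    seg = segment_emb / np.linalg.norm(segment_emb)   # L2-normalize the segment embedding
    best_name, best_sim = None, -1.0
    for name, ref in registered.items():
        ref = ref / np.linalg.norm(ref)               # references are stored normalized as well
        sim = float(np.dot(seg, ref))                 # cosine similarity in [-1, 1]
        if sim > best_sim:
            best_name, best_sim = name, sim
    if best_sim >= SIMILARITY_THRESHOLD:
        return best_name, best_sim                    # reported as a percentage (e.g. 85.3) in responses
    return None, best_sim                             # below threshold: keep the diarization label

# Toy example with made-up vectors:
registered = {"홍길동": np.array([0.9, 0.1, 0.0])}
print(match_speaker(np.array([0.8, 0.2, 0.1]), registered))
```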
app.py
ADDED
@@ -0,0 +1,528 @@
+"""
+Speechlib REST API - HuggingFace Spaces (ECAPA-TDNN version)
+Speaker diarization + speaker identification + STT
+"""
+import os
+import tempfile
+import json
+import numpy as np
+import shutil
+from typing import List, Dict, Optional
+from contextlib import asynccontextmanager
+
+# Environment settings
+os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
+os.environ["HF_HUB_DISABLE_SYMLINKS"] = "1"
+
+import torch
+
+# PyTorch compatibility patch (branches by version)
+if hasattr(torch.serialization, 'add_safe_globals'):
+    torch.serialization.add_safe_globals([torch.torch_version.TorchVersion])
+    from pyannote.audio.core import task as pyannote_task
+    from pyannote.audio.core.io import Audio
+    torch.serialization.add_safe_globals([
+        pyannote_task.Specifications,
+        pyannote_task.Problem,
+        pyannote_task.Resolution,
+        Audio
+    ])
+
+# Patch torch.load to default to weights_only=False
+original_load = torch.load
+def patched_load(*args, **kwargs):
+    if 'weights_only' not in kwargs:
+        kwargs['weights_only'] = False
+    return original_load(*args, **kwargs)
+torch.load = patched_load
+
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+from fastapi.responses import JSONResponse
+import uvicorn
+import torchaudio
+from pydub import AudioSegment
+
+
+class SpeakerPipelineECAPA:
+    """
+    Speaker identification pipeline using ECAPA-TDNN embeddings
+    """
+
+    def __init__(
+        self,
+        hf_token: str,
+        whisper_model: str = "large-v3-turbo",
+        similarity_threshold: float = 0.25,
+        device: str = None
+    ):
+        self.hf_token = hf_token
+        self.whisper_model_size = whisper_model
+        self.similarity_threshold = similarity_threshold
+
+        # Use the GPU if available, otherwise the CPU
+        if device is None:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        else:
+            self.device = device
+
+        self.registered_speakers: Dict[str, np.ndarray] = {}
+
+        # Models (lazy loading)
+        self._diarization_pipeline = None
+        self._ecapa_model = None
+        self._whisper_model = None
+
+        print("[SpeakerPipeline ECAPA-TDNN] initialized")
+        print(f"  - Device: {self.device}")
+        print(f"  - Similarity threshold: {similarity_threshold}")
+
+    @property
+    def diarization_pipeline(self):
+        if self._diarization_pipeline is None:
+            print("[Loading] pyannote/speaker-diarization-3.1...")
+            from pyannote.audio import Pipeline
+            self._diarization_pipeline = Pipeline.from_pretrained(
+                "pyannote/speaker-diarization-3.1",
+                use_auth_token=self.hf_token
+            )
+            self._diarization_pipeline.to(torch.device(self.device))
+        return self._diarization_pipeline
+
+    @property
+    def ecapa_model(self):
+        if self._ecapa_model is None:
+            print("[Loading] speechbrain ECAPA-TDNN...")
+            from speechbrain.inference.speaker import EncoderClassifier
+            self._ecapa_model = EncoderClassifier.from_hparams(
+                source="speechbrain/spkrec-ecapa-voxceleb",
+                savedir="pretrained_models/spkrec-ecapa-voxceleb",
+                run_opts={"device": self.device}
+            )
+        return self._ecapa_model
+
+    @property
+    def whisper_model(self):
+        if self._whisper_model is None:
+            print(f"[Loading] faster-whisper {self.whisper_model_size}...")
+            from faster_whisper import WhisperModel
+            compute_type = "float16" if self.device == "cuda" else "int8"
+            self._whisper_model = WhisperModel(
+                self.whisper_model_size,
+                device=self.device,
+                compute_type=compute_type
+            )
+        return self._whisper_model
+
+    def _load_audio(self, audio_path: str) -> tuple:
+        """Load and preprocess audio"""
+        ext = os.path.splitext(audio_path)[1].lower()
+
+        if ext in ['.m4a', '.mp4', '.aac', '.ogg', '.flac', '.mp3']:
+            audio = AudioSegment.from_file(audio_path)
+            audio = audio.set_channels(1).set_frame_rate(16000)
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
+                tmp_path = tmp.name
+            audio.export(tmp_path, format='wav')
+            waveform, sample_rate = torchaudio.load(tmp_path)
+            os.unlink(tmp_path)
+        else:
+            waveform, sample_rate = torchaudio.load(audio_path)
+
+        if waveform.shape[0] > 1:
+            waveform = waveform.mean(dim=0, keepdim=True)
+
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+            waveform = resampler(waveform)
+            sample_rate = 16000
+
+        return waveform, sample_rate
+
+    def get_embedding_ecapa(self, waveform: torch.Tensor) -> np.ndarray:
+        """Extract an embedding with ECAPA-TDNN"""
+        if waveform.dim() == 2:
+            waveform = waveform.squeeze(0)
+
+        waveform = waveform.to(self.device)
+
+        with torch.no_grad():
+            embedding = self.ecapa_model.encode_batch(waveform.unsqueeze(0))
+
+        return embedding.squeeze().cpu().numpy()
+
+    def register_speaker(self, name: str, audio_paths: List[str]) -> None:
+        """Register a speaker"""
+        print(f"\n[Register speaker] {name} ({len(audio_paths)} samples)")
+        embeddings = []
+
+        for path in audio_paths:
+            if not os.path.exists(path):
+                continue
+
+            try:
+                waveform, sr = self._load_audio(path)
+                emb = self.get_embedding_ecapa(waveform)
+                emb = emb / np.linalg.norm(emb)
+                embeddings.append(emb)
+                print(f"  OK: {os.path.basename(path)}")
+            except Exception as e:
+                print(f"  Error ({os.path.basename(path)}): {e}")
+
+        if not embeddings:
+            print("  [Warning] No valid samples!")
+            return
+
+        avg_embedding = np.mean(embeddings, axis=0)
+        avg_embedding = avg_embedding / np.linalg.norm(avg_embedding)
+        self.registered_speakers[name] = avg_embedding
+        print(f"[Register speaker] {name} done!")
+
+    def process(self, audio_path: str, language: str = "ko") -> List[Dict]:
+        """Main processing function"""
+        print(f"\n[Processing] {os.path.basename(audio_path)}")
+
+        waveform, sample_rate = self._load_audio(audio_path)
+        audio_dict = {"waveform": waveform, "sample_rate": sample_rate}
+
+        # 1. Speaker diarization
+        print("[1/3] Running speaker diarization...")
+        raw_diarization = self.diarization_pipeline(audio_dict)
+
+        diarization = None
+        if hasattr(raw_diarization, "itertracks"):
+            diarization = raw_diarization
+        else:
+            for attr in dir(raw_diarization):
+                if attr.startswith("_"): continue
+                try:
+                    val = getattr(raw_diarization, attr)
+                    if hasattr(val, "itertracks"):
+                        diarization = val
+                        break
+                except: pass
+
+        if diarization is None:
+            raise RuntimeError("Could not parse the speaker diarization result.")
+
+        segments = []
+        for turn, _, speaker in diarization.itertracks(yield_label=True):
+            segments.append({
+                "start": turn.start,
+                "end": turn.end,
+                "diarization_speaker": speaker
+            })
+        print(f"  {len(segments)} segments detected")
+
+        # 2. Speaker identification (ECAPA-TDNN)
+        if self.registered_speakers:
+            print("[2/3] Identifying speakers (ECAPA-TDNN)...")
+
+            speaker_embeddings = {}
+            speakers_found = set(seg["diarization_speaker"] for seg in segments)
+
+            for spk in speakers_found:
+                spk_embs = []
+                for seg in segments:
+                    if seg["diarization_speaker"] != spk:
+                        continue
+
+                    duration = seg["end"] - seg["start"]
+                    if duration < 0.5:
+                        continue
+
+                    try:
+                        start_sample = int(seg["start"] * sample_rate)
+                        end_sample = int(seg["end"] * sample_rate)
+                        end_sample = min(end_sample, waveform.shape[1])
+                        seg_waveform = waveform[:, start_sample:end_sample]
+
+                        if seg_waveform.shape[1] < sample_rate * 0.3:
+                            continue
+
+                        emb = self.get_embedding_ecapa(seg_waveform)
+                        emb = emb / np.linalg.norm(emb)
+                        spk_embs.append(emb)
+                    except:
+                        pass
+
+                if spk_embs:
+                    speaker_embeddings[spk] = spk_embs
+
+            # Speaker mapping
+            speaker_mapping = {}
+            speaker_scores = {}
+
+            for spk, embs in speaker_embeddings.items():
+                avg_emb = np.mean(embs, axis=0)
+                avg_emb = avg_emb / np.linalg.norm(avg_emb)
+
+                speaker_scores[spk] = {}
+                for name, ref_emb in self.registered_speakers.items():
+                    sim = np.dot(avg_emb, ref_emb)
+                    speaker_scores[spk][name] = sim
+
+            # Competitive matching
+            for reg_name in self.registered_speakers.keys():
+                best_spk = None
+                best_sim = -1
+
+                for spk in speaker_scores.keys():
+                    if spk in [m[0] for m in speaker_mapping.values() if m[0] != spk]:
+                        continue
+
+                    sim = speaker_scores[spk].get(reg_name, -1)
+                    if sim > best_sim:
+                        best_sim = sim
+                        best_spk = spk
+
+                if best_spk and best_sim >= self.similarity_threshold:
+                    speaker_mapping[best_spk] = (reg_name, best_sim)
+
+            for spk in speaker_scores.keys():
+                if spk not in speaker_mapping:
+                    speaker_mapping[spk] = (spk, 0.0)
+
+            for seg in segments:
+                d_spk = seg["diarization_speaker"]
+                if d_spk in speaker_mapping:
+                    seg["speaker"], seg["similarity"] = speaker_mapping[d_spk]
+                else:
+                    seg["speaker"] = d_spk
+                    seg["similarity"] = 0.0
+        else:
+            for seg in segments:
+                seg["speaker"] = seg["diarization_speaker"]
+                seg["similarity"] = 0.0
+
+        # 3. STT
+        print("[3/3] Running speech recognition (STT)...")
+        whisper_segs, _ = self.whisper_model.transcribe(
+            audio_path, language=language, beam_size=5, vad_filter=True
+        )
+        whisper_results = [{"start": s.start, "end": s.end, "text": s.text.strip()} for s in whisper_segs]
+
+        # 4. Merge diarization and transcription
+        final_results = []
+        for w_seg in whisper_results:
+            best_speaker = "Unknown"
+            best_overlap = 0
+            best_sim = 0.0
+
+            for d_seg in segments:
+                overlap = max(0, min(w_seg["end"], d_seg["end"]) - max(w_seg["start"], d_seg["start"]))
+                if overlap > best_overlap:
+                    best_overlap = overlap
+                    best_speaker = d_seg["speaker"]
+                    best_sim = d_seg.get("similarity", 0.0)
+
+            final_results.append({
+                "start": w_seg["start"],
+                "end": w_seg["end"],
+                "text": w_seg["text"],
+                "speaker": best_speaker,
+                "similarity": round(best_sim * 100, 1)
+            })
+
+        return final_results
+
+
+# Global pipeline instance
+_pipeline: Optional[SpeakerPipelineECAPA] = None
+
+
+def get_pipeline(hf_token: str) -> SpeakerPipelineECAPA:
+    """Return the pipeline singleton instance"""
+    global _pipeline
+    if _pipeline is None:
+        _pipeline = SpeakerPipelineECAPA(hf_token=hf_token)
+    return _pipeline
+
+
+# FastAPI app
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    # On startup
+    print("Speechlib API server starting")
+    yield
+    # On shutdown
+    print("Speechlib API server stopping")
+
+
+app = FastAPI(
+    title="Speechlib API",
+    description="Speaker diarization + speaker identification + STT REST API (ECAPA-TDNN)",
+    version="1.0.0",
+    lifespan=lifespan
+)
+
+
+@app.get("/")
+async def root():
+    """API status check"""
+    return {
+        "status": "ok",
+        "message": "Speechlib API (ECAPA-TDNN)",
+        "endpoints": {
+            "/transcribe": "POST - STT + speaker diarization only",
+            "/process": "POST - full pipeline (with speaker identification)"
+        }
+    }
+
+
+@app.get("/health")
+async def health_check():
+    """Health check"""
+    return {"status": "healthy", "device": "cuda" if torch.cuda.is_available() else "cpu"}
+
+
+@app.post("/transcribe")
+async def transcribe(
+    audio: UploadFile = File(..., description="Audio file"),
+    language: str = Form(default="ko", description="Language code (ko, en, ja, zh)"),
+    hf_token: str = Form(..., description="HuggingFace token")
+):
+    """
+    STT + speaker diarization only (no speaker identification)
+    """
+    temp_dir = None
+    try:
+        # Save the upload to a temporary file
+        temp_dir = tempfile.mkdtemp()
+        audio_path = os.path.join(temp_dir, audio.filename)
+
+        with open(audio_path, "wb") as f:
+            content = await audio.read()
+            f.write(content)
+
+        # Run the pipeline
+        pipeline = get_pipeline(hf_token)
+        pipeline.registered_speakers.clear()  # no speaker identification
+
+        results = pipeline.process(audio_path, language=language)
+
+        # Format the results
+        segments = []
+        speaker_stats = {}
+
+        for r in results:
+            segments.append({
+                "start": round(r["start"], 2),
+                "end": round(r["end"], 2),
+                "text": r["text"],
+                "speaker": r["speaker"]
+            })
+
+            speaker = r["speaker"]
+            if speaker not in speaker_stats:
+                speaker_stats[speaker] = {"count": 0, "duration": 0}
+            speaker_stats[speaker]["count"] += 1
+            speaker_stats[speaker]["duration"] += r["end"] - r["start"]
+
+        for speaker in speaker_stats:
+            speaker_stats[speaker]["duration"] = round(speaker_stats[speaker]["duration"], 2)
+
+        return JSONResponse(content={
+            "success": True,
+            "segments": segments,
+            "speaker_stats": speaker_stats,
+            "total_segments": len(segments)
+        })
+
+    except Exception as e:
+        import traceback
+        return JSONResponse(
+            status_code=500,
+            content={
+                "success": False,
+                "error": str(e),
+                "traceback": traceback.format_exc()
+            }
+        )
+    finally:
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+@app.post("/process")
+async def process_audio(
+    audio: UploadFile = File(..., description="Audio file to analyze"),
+    voice_sample: UploadFile = File(default=None, description="Speaker sample file (optional)"),
+    speaker_name: str = Form(default="speaker", description="Name of the speaker to identify"),
+    language: str = Form(default="ko", description="Language code (ko, en, ja, zh)"),
+    hf_token: str = Form(..., description="HuggingFace token")
+):
+    """
+    Full pipeline: speaker diarization + speaker identification + STT
+    """
+    temp_dir = None
+    try:
+        # Create a temporary directory
+        temp_dir = tempfile.mkdtemp()
+
+        # Save the main audio
+        audio_path = os.path.join(temp_dir, audio.filename)
+        with open(audio_path, "wb") as f:
+            content = await audio.read()
+            f.write(content)
+
+        # Get the pipeline
+        pipeline = get_pipeline(hf_token)
+        pipeline.registered_speakers.clear()
+
+        # Register the speaker if a voice sample was provided
+        if voice_sample and voice_sample.filename:
+            sample_path = os.path.join(temp_dir, voice_sample.filename)
+            with open(sample_path, "wb") as f:
+                sample_content = await voice_sample.read()
+                f.write(sample_content)
+            pipeline.register_speaker(speaker_name, [sample_path])
+
+        # Process
+        results = pipeline.process(audio_path, language=language)
+
+        # Format the results
+        segments = []
+        speaker_stats = {}
+
+        for r in results:
+            segments.append({
+                "start": round(r["start"], 2),
+                "end": round(r["end"], 2),
+                "text": r["text"],
+                "speaker": r["speaker"],
+                "similarity": r["similarity"]
+            })
+
+            speaker = r["speaker"]
+            if speaker not in speaker_stats:
+                speaker_stats[speaker] = {"count": 0, "duration": 0}
+            speaker_stats[speaker]["count"] += 1
+            speaker_stats[speaker]["duration"] += r["end"] - r["start"]
+
+        for speaker in speaker_stats:
+            speaker_stats[speaker]["duration"] = round(speaker_stats[speaker]["duration"], 2)
+
+        return JSONResponse(content={
+            "success": True,
+            "segments": segments,
+            "speaker_stats": speaker_stats,
+            "total_segments": len(segments)
+        })
+
+    except Exception as e:
+        import traceback
+        return JSONResponse(
+            status_code=500,
+            content={
+                "success": False,
+                "error": str(e),
+                "traceback": traceback.format_exc()
+            }
+        )
+    finally:
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)
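For local experimentation, the pipeline class above can also be driven directly, without the FastAPI layer. This is a rough sketch under assumptions not stated in the commit: app.py is importable from the working directory, a valid HuggingFace token is exported as HF_TOKEN, and the WAV file names are placeholders; the first run downloads the pyannote, speechbrain, and faster-whisper models.

```python
import os
from app import SpeakerPipelineECAPA  # the class added in this commit

pipeline = SpeakerPipelineECAPA(hf_token=os.environ["HF_TOKEN"])

# Optional enrollment: register a reference sample for the speaker to identify.
pipeline.register_speaker("홍길동", ["speaker_sample.wav"])

# Diarize, identify, and transcribe a conversation (Korean by default).
results = pipeline.process("conversation.wav", language="ko")

for seg in results:
    print(f'{seg["start"]:.2f}-{seg["end"]:.2f} {seg["speaker"]} ({seg["similarity"]}%): {seg["text"]}')
```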
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+fastapi>=0.100.0
+uvicorn>=0.23.0
+python-multipart>=0.0.6
+torch==2.4.0
+torchaudio==2.4.0
+pyannote.audio==3.3.2
+speechbrain==1.0.0
+faster-whisper>=1.0.0
+pydub>=0.25.1
+numpy<2.0.0
+ffmpeg-python
+soundfile