- README.md +19 -12
- fixed_app.py +11 -15
- startup.sh +1 -1
README.md
CHANGED
|
@@ -35,13 +35,19 @@ docker run -p 7860:7860 -v /your/model/dir:/models whisper-api
|
|
| 35 |
|
| 36 |
## API 使用
|
| 37 |
|
| 38 |
-
### 转录接口(流式SSE输出)
|
| 39 |
|
| 40 |
```bash
|
| 41 |
POST /transcribe
|
| 42 |
```
|
| 43 |
-
- 支持参数:
|
| 44 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
### 示例请求
|
| 47 |
|
|
@@ -49,18 +55,19 @@ POST /transcribe
|
|
| 49 |
{
|
| 50 |
"audio": "data:audio/wav;base64,...",
|
| 51 |
"model": "base",
|
| 52 |
-
"language": "zh"
|
|
|
|
|
|
|
| 53 |
}
|
| 54 |
```
|
| 55 |
|
| 56 |
-
###
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
...
|
| 64 |
```
|
| 65 |
|
| 66 |
## 主要特性
|
|
|
|
| 35 |
|
| 36 |
## API 使用
|
| 37 |
|
|
|
|
| 38 |
|
| 39 |
```bash
|
| 40 |
POST /transcribe
|
| 41 |
```
|
| 42 |
+
- 支持参数:
|
| 43 |
+
- audio(base64,必填)
|
| 44 |
+
- model(如 tiny/base/small/...,可选,默认 tiny)
|
| 45 |
+
- language(如 zh/en/auto,可选,默认 zh)
|
| 46 |
+
- temperature(float,可选,默认0.0)
|
| 47 |
+
- beam_size(int,可选,默认1)
|
| 48 |
+
- fast_mode(bool,可选,默认true,极致加速)
|
| 49 |
+
- vad(bool,可选,默认false,启用端点检测)
|
| 50 |
+
- threads(int,可选,默认4,推理线程数)
- atempo(float,可选,默认1.0,音频变速倍率;仅在音频需要先转换为WAV时生效)
|
| 51 |
|
| 52 |
### 示例请求
|
| 53 |
|
|
|
|
| 55 |
{
|
| 56 |
"audio": "data:audio/wav;base64,...",
|
| 57 |
"model": "base",
|
| 58 |
+
"language": "zh",
|
| 59 |
+
"vad": true,
|
| 60 |
+
"threads": 8
|
| 61 |
}
|
| 62 |
```
|
| 63 |
|
| 64 |
+
### 返回内容包含:
|
| 65 |
+
```json
|
| 66 |
+
{
|
| 67 |
+
"full_text": "完整转写文本",
|
| 68 |
+
"processing_time": "处理耗时(秒)",
|
| 69 |
+
"cmd": "实际调用的whisper命令行"
|
| 70 |
+
}
|
|
|
|
| 71 |
```
|
| 72 |
|
| 73 |
## 主要特性
|
fixed_app.py
CHANGED
|
@@ -30,6 +30,7 @@ class AudioRequest(BaseModel):
|
|
| 30 |
fast_mode: Optional[bool] = True # 快速模式
|
| 31 |
vad: Optional[bool] = False
|
| 32 |
threads: Optional[int] = 4
|
|
|
|
| 33 |
|
| 34 |
def load_model(model_name: str):
|
| 35 |
"""确保模型文件存在,返回模型路径"""
|
|
@@ -61,24 +62,19 @@ def load_model(model_name: str):
|
|
| 61 |
logger.error(f"找不到模型 {model_name},请确保模型文件存在")
|
| 62 |
raise HTTPException(status_code=500, detail=f"Model {model_name} not found")
|
| 63 |
|
| 64 |
-
async def convert_audio_to_wav(input_file: str) -> str:
|
| 65 |
-
"""使用ffmpeg将音频文件转换为WAV
|
| 66 |
try:
|
| 67 |
# 创建输出文件路径
|
| 68 |
output_file = input_file.rsplit('.', 1)[0] + '_converted.wav'
|
| 69 |
|
| 70 |
-
# 构建ffmpeg命令
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
'-ac', '1', # 声道数:单声道
|
| 76 |
-
'-acodec', 'pcm_s16le', # 音频编码器:16位PCM
|
| 77 |
-
'-y', # 覆盖输出文件
|
| 78 |
-
output_file
|
| 79 |
-
]
|
| 80 |
|
| 81 |
-
logger.info(f"开始音频转换: {
|
| 82 |
|
| 83 |
# 执行ffmpeg命令
|
| 84 |
proc = await asyncio.create_subprocess_exec(
|
|
@@ -102,7 +98,7 @@ async def convert_audio_to_wav(input_file: str) -> str:
|
|
| 102 |
if os.path.exists(input_file):
|
| 103 |
os.unlink(input_file)
|
| 104 |
|
| 105 |
-
logger.info(f"音频转换成功: {output_file}, 大小: {os.path.getsize(output_file)}
|
| 106 |
return output_file
|
| 107 |
|
| 108 |
except HTTPException:
|
|
@@ -244,7 +240,7 @@ async def transcribe_audio(request: AudioRequest):
|
|
| 244 |
supported_formats = ('.wav', '.flac', '.mp3', '.ogg')
|
| 245 |
if not audio_file.endswith(supported_formats):
|
| 246 |
logger.info(f"音频格式不直接支持,将转换为WAV: {audio_file}")
|
| 247 |
-
audio_file = await convert_audio_to_wav(audio_file)
|
| 248 |
|
| 249 |
# 创建临时目录用于输出
|
| 250 |
temp_dir = tempfile.mkdtemp()
|
|
|
|
| 30 |
fast_mode: Optional[bool] = True # 快速模式
|
| 31 |
vad: Optional[bool] = False
|
| 32 |
threads: Optional[int] = 4
|
| 33 |
+
atempo: Optional[float] = 1.0
|
| 34 |
|
| 35 |
def load_model(model_name: str):
|
| 36 |
"""确保模型文件存在,返回模型路径"""
|
|
|
|
| 62 |
logger.error(f"找不到模型 {model_name},请确保模型文件存在")
|
| 63 |
raise HTTPException(status_code=500, detail=f"Model {model_name} not found")
|
| 64 |
|
| 65 |
+
async def convert_audio_to_wav(input_file: str, atempo: float = 1.0) -> str:
|
| 66 |
+
"""使用ffmpeg将音频文件转换为WAV格式,支持atempo变速"""
|
| 67 |
try:
|
| 68 |
# 创建输出文件路径
|
| 69 |
output_file = input_file.rsplit('.', 1)[0] + '_converted.wav'
|
| 70 |
|
| 71 |
+
# 构建ffmpeg命令 采样率:16kHz 单声道 音频编码器:16位PCM
|
| 72 |
+
if atempo != 1.0: # 使用ffmpeg的atempo滤镜进行变速
|
| 73 |
+
cmd = f"ffmpeg -i {input_file} -ar 16000 -ac 1 -c:a pcm_s16le -filter:a \"atempo={atempo}\" -y {output_file}"
|
| 74 |
+
else: # 如果atempo为1.0,则直接转换为WAV格式
|
| 75 |
+
cmd = f"ffmpeg -i {input_file} -ar 16000 -ac 1 -c:a pcm_s16le -y {output_file}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
+
logger.info(f"开始音频转换: {cmd}")
|
| 78 |
|
| 79 |
# 执行ffmpeg命令
|
| 80 |
proc = await asyncio.create_subprocess_exec(
|
|
|
|
| 98 |
if os.path.exists(input_file):
|
| 99 |
os.unlink(input_file)
|
| 100 |
|
| 101 |
+
logger.info(f"音频转换成功: {output_file}, 大小: {os.path.getsize(output_file)} 字节, 时长: {os.path.getsize(output_file) / 16000} 秒")
|
| 102 |
return output_file
|
| 103 |
|
| 104 |
except HTTPException:
|
|
|
|
| 240 |
supported_formats = ('.wav', '.flac', '.mp3', '.ogg')
|
| 241 |
if not audio_file.endswith(supported_formats):
|
| 242 |
logger.info(f"音频格式不直接支持,将转换为WAV: {audio_file}")
|
| 243 |
+
audio_file = await convert_audio_to_wav(audio_file, request.atempo)
|
| 244 |
|
| 245 |
# 创建临时目录用于输出
|
| 246 |
temp_dir = tempfile.mkdtemp()
|
startup.sh
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
# 显示环境信息
|
| 4 |
-
echo "=== Whisper API Startup 0.
|
| 5 |
echo "Python version: $(python3 --version)"
|
| 6 |
echo "Current directory: $(pwd)"
|
| 7 |
# echo "Files in /app:"
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
# 显示环境信息
|
| 4 |
+
echo "=== Whisper API Startup 0.9==="
|
| 5 |
echo "Python version: $(python3 --version)"
|
| 6 |
echo "Current directory: $(pwd)"
|
| 7 |
# echo "Files in /app:"
|