Spaces:
Running
Running
File size: 18,727 Bytes
1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 463162b eea7b6a efd572d 463162b efd572d 463162b efd572d a132dcd efd572d 463162b efd572d 463162b efd572d 463162b efd572d 463162b efd572d 463162b efd572d 463162b efd572d 463162b efd572d 463162b efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 1b687b0 efd572d 501ca3e efd572d 501ca3e efd572d 501ca3e efd572d 501ca3e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 |
"""
StepFun API Client for audio editing and voice cloning
"""
import os
import time
import json
import logging
import tempfile
import requests
import base64
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# API Configuration
# Root URL that every endpoint path in this module is appended to.
API_BASE_URL = "https://api.stepfun.com/v1"
# Pause between two consecutive file-status polls.
POLL_INTERVAL = 2 # seconds
# Total time to keep polling before giving up on an uploaded file.
POLL_TIMEOUT = 60 # seconds
def get_api_token() -> str:
    """Read the StepFun API token from the environment.

    Returns:
        The value of ``STEPFUN_API_TOKEN``, or an empty string when the
        variable is unset or empty (a warning is logged in that case).
    """
    api_token = os.environ.get('STEPFUN_API_TOKEN', '')
    if api_token:
        return api_token
    # Missing token is not fatal here; callers decide how to react.
    logger.warning("⚠️ STEPFUN_API_TOKEN not set in environment variables")
    return api_token
class StepFunAPIClient:
"""StepFun API Client for audio editing"""
@property
def token(self) -> str:
"""Get token dynamically (allows runtime env var changes)"""
return get_api_token()
@property
def base_headers(self) -> dict:
"""Get headers with current token"""
return {"Authorization": f"Bearer {self.token}"}
def upload_file(self, file_path: str) -> str | None:
"""Upload audio file and return file_id"""
url = f"{API_BASE_URL}/files"
file_name = os.path.basename(file_path)
# Determine audio type
if file_path.endswith('.wav'):
audio_type = 'audio/wav'
elif file_path.endswith('.mp3'):
audio_type = 'audio/mpeg'
else:
audio_type = 'audio/wav'
files = [
('file', (file_name, open(file_path, 'rb'), audio_type))
]
payload = {'purpose': 'storage'}
headers = self.base_headers.copy()
try:
response = requests.post(url, headers=headers, data=payload, files=files, timeout=60)
response.raise_for_status()
result = response.json()
file_id = result.get("id")
logger.info(f"✅ Uploaded file {file_name}, file_id: {file_id}")
return file_id
except Exception as e:
logger.error(f"❌ Failed to upload file {file_name}: {e}")
return None
def query_file_status(self, file_id: str) -> bool:
"""Query file status, return True if success"""
url = f"{API_BASE_URL}/files/{file_id}"
headers = {
**self.base_headers,
"Content-Type": "application/json"
}
try:
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
result = response.json()
status = result.get("status", "").lower()
logger.debug(f"File {file_id} status: {status}")
return status == "success"
except Exception as e:
logger.error(f"❌ Failed to query file status: {e}")
return False
def wait_for_file_ready(self, file_id: str) -> bool:
"""Wait for file to be ready (poll status)"""
start_time = time.time()
while time.time() - start_time < POLL_TIMEOUT:
if self.query_file_status(file_id):
return True
time.sleep(POLL_INTERVAL)
logger.error(f"⏰ File {file_id} status query timeout ({POLL_TIMEOUT}s)")
return False
def audio_edit(self, file_id: str, sample_text: str, target_text: str,
edit_type: str = "clone", edit_info: str = "") -> bytes | None:
"""Call audio edit API and return audio bytes"""
url = f"{API_BASE_URL}/audio/edit"
headers = {
**self.base_headers,
"Content-Type": "application/json"
}
payload = {
"model": "step-tts-edit",
"file_id": file_id,
"sample_text": target_text, # target text from UI
"text": sample_text, # prompt text from UI
"edit_type": edit_type,
"edit_info": edit_info or "",
"response_format": "wav"
}
logger.info(f"🎯 Calling audio edit API: edit_type={edit_type}, edit_info={edit_info}")
try:
response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=120)
# Check if response is audio or error
content_type = response.headers.get('Content-Type', '')
if 'audio' in content_type or response.status_code == 200:
if len(response.content) > 1000: # Likely audio data
logger.info(f"✅ Audio edit successful, received {len(response.content)} bytes")
return response.content
else:
# Might be an error message
try:
error_data = response.json()
logger.error(f"❌ API error: {error_data}")
except:
logger.error(f"❌ API error: {response.text}")
return None
else:
logger.error(f"❌ API error: {response.status_code} - {response.text}")
return None
except Exception as e:
logger.error(f"❌ Audio edit request failed: {e}")
return None
def transcribe_audio_sse(self, audio_path: str, progress_callback=None, streaming=False):
"""
使用ASR SSE接口转录音频文件
Args:
audio_path: 音频文件路径
progress_callback: 可选的回调函数,用于处理增量文本更新 callback(delta_text)
streaming: 是否返回生成器进行流式更新
Returns:
如果streaming=False: 完整的转录文本
如果streaming=True: 生成器,产生增量更新和最终文本
Raises:
Exception: 如果转录失败
"""
if streaming:
return self._transcribe_audio_sse_streaming(audio_path, progress_callback)
else:
return self._transcribe_audio_sse_sync(audio_path, progress_callback)
def _transcribe_audio_sse_sync(self, audio_path: str, progress_callback=None) -> str:
"""
同步ASR转录,返回最终文本
"""
url = f"{API_BASE_URL}/audio/asr/sse"
# 读取音频文件并转换为base64
try:
with open(audio_path, 'rb') as audio_file:
audio_data = base64.b64encode(audio_file.read()).decode('utf-8')
except Exception as e:
logger.error(f"❌ Failed to read audio file: {e}")
raise Exception(f"Failed to read audio file: {e}")
# 构建请求payload
payload = {
"audio": {
"data": audio_data,
"input": {
"transcription": {
"language": "zh",
"prompt": "请记录下你所听到的语音内容。",
"model": "step-asr",
"full_rerun_on_commit": True,
"enable_itn": True
},
"format": {
"type": "pcm",
"codec": "pcm_s16le",
"rate": 16000,
"bits": 16,
"channel": 1
}
}
}
}
headers = {
**self.base_headers,
"Content-Type": "application/json",
"Accept": "text/event-stream"
}
logger.info("🎙️ Starting ASR transcription...")
try:
response = requests.post(url, headers=headers, data=json.dumps(payload), stream=True, timeout=120)
response.raise_for_status()
final_text = ""
accumulated_text = ""
for line in response.iter_lines(decode_unicode=True):
if line:
line = line.strip()
if line.startswith("data: "):
try:
# 解析SSE数据
data_str = line[6:] # 去掉 "data: " 前缀
if data_str == "[DONE]":
break
event_data = json.loads(data_str)
event_type = event_data.get("type")
if event_type == "transcript.text.delta":
delta = event_data.get("delta", "")
accumulated_text += delta
logger.debug(f"📝 ASR delta: {delta}")
# 处理增量更新回调
if progress_callback:
progress_callback(accumulated_text)
elif event_type == "transcript.text.done":
final_text = event_data.get("text", accumulated_text)
logger.info(f"✅ ASR transcription complete: {final_text}")
break
elif event_type == "error":
error_msg = event_data.get("message", "Unknown ASR error")
logger.error(f"❌ ASR error: {error_msg}")
raise Exception(f"ASR API error: {error_msg}")
except json.JSONDecodeError as e:
logger.warning(f"⚠️ Failed to parse SSE line: {line}, error: {e}")
continue
except Exception as e:
logger.error(f"❌ Error processing SSE event: {e}")
raise
# 如果没有获得final_text,使用accumulated_text
if not final_text:
final_text = accumulated_text
if not final_text:
raise Exception("No transcription result received")
logger.info(f"🎯 Final transcription: {final_text}")
return final_text
except Exception as e:
logger.error(f"❌ ASR transcription failed: {e}")
raise Exception(f"ASR transcription failed: {e}")
def _transcribe_audio_sse_streaming(self, audio_path: str, progress_callback=None):
"""
流式ASR转录,返回生成器
"""
url = f"{API_BASE_URL}/audio/asr"
# 读取音频文件并转换为base64
try:
with open(audio_path, 'rb') as audio_file:
audio_data = base64.b64encode(audio_file.read()).decode('utf-8')
except Exception as e:
logger.error(f"❌ Failed to read audio file: {e}")
yield f"[读取文件失败: {e}]"
return
# 构建请求payload
payload = {
"audio": {
"data": audio_data,
"input": {
"transcription": {
"language": "zh",
"prompt": "请记录下你所听到的语音内容。",
"model": "step-asr",
"full_rerun_on_commit": True,
"enable_itn": True
},
"format": {
"type": "pcm",
"codec": "pcm_s16le",
"rate": 16000,
"bits": 16,
"channel": 1
}
}
}
}
headers = {
**self.base_headers,
"Content-Type": "application/json",
"Accept": "text/event-stream"
}
logger.info("🎙️ Starting ASR transcription...")
try:
response = requests.post(url, headers=headers, data=json.dumps(payload), stream=True, timeout=120)
response.raise_for_status()
final_text = ""
accumulated_text = ""
for line in response.iter_lines(decode_unicode=True):
if line:
line = line.strip()
if line.startswith("data: "):
try:
# 解析SSE数据
data_str = line[6:] # 去掉 "data: " 前缀
if data_str == "[DONE]":
break
event_data = json.loads(data_str)
event_type = event_data.get("type")
if event_type == "transcript.text.delta":
delta = event_data.get("delta", "")
accumulated_text += delta
logger.debug(f"📝 ASR delta: {delta}")
# 流式更新
yield accumulated_text
if progress_callback:
progress_callback(accumulated_text)
elif event_type == "transcript.text.done":
final_text = event_data.get("text", accumulated_text)
logger.info(f"✅ ASR transcription complete: {final_text}")
yield final_text
return
elif event_type == "error":
error_msg = event_data.get("message", "Unknown ASR error")
logger.error(f"❌ ASR error: {error_msg}")
yield f"[ASR错误: {error_msg}]"
return
except json.JSONDecodeError as e:
logger.warning(f"⚠️ Failed to parse SSE line: {line}, error: {e}")
continue
except Exception as e:
logger.error(f"❌ Error processing SSE event: {e}")
yield f"[处理错误: {str(e)}]"
return
# 如果没有获得final_text,使用accumulated_text
if not final_text:
final_text = accumulated_text
if not final_text:
yield f"[转录无结果]"
return
logger.info(f"🎯 Final transcription: {final_text}")
except Exception as e:
logger.error(f"❌ ASR transcription failed: {e}")
yield f"[转录失败: {str(e)}]"
# Single shared client for the whole module, created once at import time.
_client = StepFunAPIClient()


def get_client() -> StepFunAPIClient:
    """Return the module-wide shared ``StepFunAPIClient`` instance."""
    return _client
def process_audio(audio_input: str, prompt_text: str, target_text: str,
                  edit_type: str, edit_info: str = None) -> str:
    """Run the full upload / wait / edit pipeline for one audio file.

    Args:
        audio_input: Path to the input audio file.
        prompt_text: Text content of the input audio.
        target_text: Target text for generation.
        edit_type: Type of edit to request.
        edit_info: Optional extra edit information; ``None`` is sent as "".

    Returns:
        Path of a temporary ``.wav`` file holding the returned audio. The
        file is not deleted automatically.

    Raises:
        ValueError: If the API token is not configured.
        RuntimeError: If the upload fails, the file does not become ready
            in time, or the edit call returns no data.
    """
    api = get_client()
    if not api.token:
        raise ValueError("API token not configured. Please set STEPFUN_API_TOKEN environment variable.")

    # Step 1: upload the source audio.
    logger.info("📤 Uploading audio file...")
    uploaded_id = api.upload_file(audio_input)
    if not uploaded_id:
        raise RuntimeError("Failed to upload audio file")

    # Step 2: block until the uploaded file is reported ready.
    logger.info("⏳ Waiting for file to be ready...")
    is_ready = api.wait_for_file_ready(uploaded_id)
    if not is_ready:
        raise RuntimeError("File processing timeout")

    # Step 3: request the edit itself.
    logger.info("🎤 Calling audio edit API...")
    edit_kwargs = {
        "file_id": uploaded_id,
        "sample_text": prompt_text,
        "target_text": target_text,
        "edit_type": edit_type,
        "edit_info": edit_info or "",
    }
    edited_audio = api.audio_edit(**edit_kwargs)
    if not edited_audio:
        raise RuntimeError("Audio edit API returned no data")

    # Step 4: persist the bytes; delete=False keeps the file for the caller.
    out_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    with out_file:
        out_file.write(edited_audio)
    output_path = out_file.name
    logger.info(f"✅ Audio saved to: {output_path}")
    return output_path
def transcribe_audio(audio_path: str, progress_callback=None, streaming=False):
    """Transcribe an audio file through the ASR SSE interface.

    Args:
        audio_path: Path of the audio file.
        progress_callback: Optional callable forwarded to the client, which
            invokes it with the text accumulated so far.
        streaming: Whether to return a generator of updates instead of the
            final text.

    Returns:
        With ``streaming=False``, the complete transcription text.
        With ``streaming=True``, a generator yielding incremental updates
        and the final text; failures are yielded as bracketed messages.

    Raises:
        ValueError: In non-streaming mode, if the API token is not configured.
        RuntimeError: In non-streaming mode, if transcription fails.
    """
    # Choose the implementation first, then make one call with shared arguments.
    handler = transcribe_audio_streaming if streaming else transcribe_audio_sync
    return handler(audio_path, progress_callback)
def transcribe_audio_sync(audio_path: str, progress_callback=None) -> str:
    """Transcribe an audio file and return the final text.

    Args:
        audio_path: Path of the audio file.
        progress_callback: Optional callable forwarded to the client, which
            invokes it with the text accumulated so far.

    Returns:
        The complete transcription text.

    Raises:
        ValueError: If the API token is not configured.
        RuntimeError: If transcription fails; the underlying error is
            attached as ``__cause__``.
    """
    client = get_client()
    if not client.token:
        raise ValueError("API token not configured. Please set STEPFUN_API_TOKEN environment variable.")
    try:
        return client.transcribe_audio_sse(audio_path, progress_callback, streaming=False)
    except Exception as e:
        # Chain explicitly so the original failure stays visible in tracebacks.
        raise RuntimeError(f"Transcription failed: {e}") from e
def transcribe_audio_streaming(audio_path: str, progress_callback=None):
    """Transcribe an audio file as a generator of text updates.

    Args:
        audio_path: Path of the audio file.
        progress_callback: Optional callable forwarded to the client, which
            invokes it with the text accumulated so far.

    Yields:
        Incremental updates followed by the final text. Errors are not
        raised: a missing API token or a failure while streaming is
        yielded as a bracketed message instead.
    """
    api = get_client()
    if api.token:
        try:
            updates = api.transcribe_audio_sse(audio_path, progress_callback, streaming=True)
            for partial_text in updates:
                yield partial_text
        except Exception as exc:
            yield f"[转录失败: {str(exc)}]"
    else:
        yield f"[错误: API token not configured]"
|