# doc_alive/generation/gen_audio.py
# Add audio generation (commit 5c9f0d9, by slxhere)
import os, json, uuid
from pathlib import Path
from typing import Dict, Optional
from openai import OpenAI
# ============ Utility Functions ============
def _build_openai_tts_prompt(text: str,
style: Optional[str] = None,
speed: Optional[float] = None) -> str:
"""Merge text, style, and other options into a single TTS input string."""
parts = [text.strip()]
if style:
parts.append(f"Style: {style.strip()}")
if speed:
parts.append(f"Speaking speed: {speed}")
return " ".join([p for p in parts if p])
# ============ Generator Wrapper ============
class OpenAIAudioGenerator:
    """
    Generate speech audio using the OpenAI Audio Speech API (gpt-4o-mini-tts).

    The audio is streamed to a transient file inside ``out_dir``, read back
    into memory, and the file is removed; callers receive raw bytes plus a
    metadata dict.
    """
    def __init__(self, client: Optional[OpenAI] = None, out_dir: str = "outputs"):
        """
        Args:
            client: Pre-configured OpenAI client. When omitted, a default
                client is created (it reads OPENAI_API_KEY from the
                environment).
            out_dir: Directory for the transient audio file; created if
                missing.
        """
        self.client = client or OpenAI()
        self.out_dir = Path(out_dir)
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def generate_from_spec(self,
                           audio_spec: Dict,
                           filename_prefix: str = "speech",
                           save_meta: bool = False) -> Dict:
        """
        Synthesize speech from an LLM-produced audio spec.

        audio_spec example:
            {
                "text": "Hello, world!",
                "voice": "alloy",
                "speed": 1.0,
            }
        Optional keys also read: "style", "format" (default "mp3"),
        "model" (default "gpt-4o-mini-tts").

        Args:
            audio_spec: Spec dict as above.
            filename_prefix: Prefix for the transient file name.
            save_meta: When True, write the metadata dict as a sidecar
                ``.json`` file in ``out_dir``.

        Returns:
            {"audio_bytes": <bytes>, "meta": <dict>}

        Raises:
            RuntimeError: If the API call or file handling fails.
        """
        prompt_text = _build_openai_tts_prompt(
            audio_spec.get("text", ""),
            audio_spec.get("style"),
            audio_spec.get("speed")
        )
        voice = audio_spec.get("voice", "alloy")
        fmt = audio_spec.get("format", "mp3")
        model = audio_spec.get("model", "gpt-4o-mini-tts")
        filename = f"{filename_prefix}_{uuid.uuid4().hex[:8]}.{fmt}"
        file_path = self.out_dir / filename
        try:
            with self.client.audio.speech.with_streaming_response.create(
                model=model,
                voice=voice,
                input=prompt_text,
                # BUG FIX: previously `fmt` only named the file while the API
                # defaulted to mp3, so e.g. fmt="wav" produced mp3 bytes with
                # a .wav extension. Request the format explicitly.
                response_format=fmt,
            ) as response:
                response.stream_to_file(file_path)
            # Read the bytes back, then delete the transient file.
            audio_bytes = file_path.read_bytes()
            os.remove(file_path)
        except Exception as e:
            # Don't leak a partially written file on failure.
            if file_path.exists():
                os.remove(file_path)
            raise RuntimeError(f"OpenAI Audio generation failed: {e}") from e
        meta = {
            "model": model,
            "voice": voice,
            "format": fmt,
            "prompt_sent": prompt_text,
            "llm_audio_spec": audio_spec
        }
        if save_meta:
            # Sidecar metadata file, e.g. outputs/speech_ab12cd34.json.
            meta_file = file_path.with_suffix(".json")
            meta_file.write_text(json.dumps(meta, indent=2, ensure_ascii=False))
        return {"audio_bytes": audio_bytes, "meta": meta}
# ============ Integration with Main Pipeline (Example) ============
def generate_audio_with_openai_from_llm_spec(spec: Dict,
out_dir: str = "outputs",
openai_key=None) -> Dict:
    """
    Directly feed the spec returned by call_llm_structured:
        spec = {
            "image": {...},
            "audio": {...},
            "debug": {...}
        }
    """
    # Build a generator backed by a client bound to the given key, then
    # synthesize from the spec's "audio" section.
    generator = OpenAIAudioGenerator(
        client=OpenAI(api_key=openai_key),
        out_dir=out_dir,
    )
    return generator.generate_from_spec(spec["audio"], filename_prefix="gptaudio")