import json
import os
import uuid
from pathlib import Path
from typing import Dict, Optional

from openai import OpenAI


# ============ Utility Functions ============

def _build_openai_tts_prompt(text: str,
                             style: Optional[str] = None,
                             speed: Optional[float] = None) -> str:
    """Merge text, style, and other options into a single TTS input string.

    Args:
        text: The main text to be spoken.
        style: Optional delivery-style hint, appended as "Style: ...".
        speed: Optional speaking-speed hint, appended as "Speaking speed: ...".
            NOTE: a falsy speed (0 / 0.0 / None) is silently skipped.

    Returns:
        A single space-joined prompt string with empty parts removed.
    """
    parts = [text.strip()]
    if style:
        parts.append(f"Style: {style.strip()}")
    if speed:
        parts.append(f"Speaking speed: {speed}")
    return " ".join(p for p in parts if p)


# ============ Generator Wrapper ============

class OpenAIAudioGenerator:
    """
    Generate speech audio using the OpenAI Audio Speech API (gpt-4o-mini-tts).
    """

    def __init__(self, client: Optional[OpenAI] = None, out_dir: str = "outputs"):
        """Create the generator.

        Args:
            client: Pre-configured OpenAI client; a default one is created
                (reading OPENAI_API_KEY from the environment) when omitted.
            out_dir: Directory for temporary audio files and optional metadata;
                created if it does not exist.
        """
        self.client = client or OpenAI()
        self.out_dir = Path(out_dir)
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def generate_from_spec(self, audio_spec: Dict,
                           filename_prefix: str = "speech",
                           save_meta: bool = False) -> Dict:
        """Synthesize speech for one spec and return the audio bytes in memory.

        audio_spec example:
            {
                "text": "Hello, world!",
                "voice": "alloy",
                "speed": 1.0,
            }

        Args:
            audio_spec: Dict with keys "text" (required), and optional
                "style", "speed", "voice", "format", "model".
            filename_prefix: Prefix for the temporary (and metadata) filename.
            save_meta: When True, persist the request metadata as a JSON file
                next to where the audio was written (the audio file itself is
                always deleted after its bytes are read).

        Returns:
            {"audio_bytes": <bytes>, "meta": <dict>}

        Raises:
            RuntimeError: If the API call or file handling fails.
        """
        prompt_text = _build_openai_tts_prompt(
            audio_spec.get("text", ""),
            audio_spec.get("style"),
            audio_spec.get("speed"),
        )
        voice = audio_spec.get("voice", "alloy")
        fmt = audio_spec.get("format", "mp3")
        model = audio_spec.get("model", "gpt-4o-mini-tts")

        filename = f"{filename_prefix}_{uuid.uuid4().hex[:8]}.{fmt}"
        file_path = self.out_dir / filename

        try:
            with self.client.audio.speech.with_streaming_response.create(
                model=model,
                voice=voice,
                input=prompt_text,
                # BUG FIX: previously the requested format was only used for the
                # filename extension; the API then returned its default (mp3)
                # regardless, so e.g. a ".wav" file could contain mp3 bytes.
                response_format=fmt,
            ) as response:
                response.stream_to_file(file_path)
            # Read the bytes into memory; the file on disk is only a staging area.
            audio_bytes = file_path.read_bytes()
        except Exception as e:
            # Chain the original cause so callers can inspect the API error.
            raise RuntimeError(f"OpenAI Audio generation failed: {e}") from e
        finally:
            # Always remove the temp file, even if streaming/reading failed
            # partway (the original leaked the file on a read error).
            file_path.unlink(missing_ok=True)

        meta = {
            "model": model,
            "voice": voice,
            "format": fmt,
            "prompt_sent": prompt_text,
            "llm_audio_spec": audio_spec,
        }
        if save_meta:
            meta_file = file_path.with_suffix(".json")
            meta_file.write_text(json.dumps(meta, indent=2, ensure_ascii=False))

        return {"audio_bytes": audio_bytes, "meta": meta}


# ============ Integration with Main Pipeline (Example) ============

def generate_audio_with_openai_from_llm_spec(spec: Dict,
                                             out_dir: str = "outputs",
                                             openai_key=None) -> Dict:
    """
    Directly feed the spec returned by call_llm_structured:

        spec = {
            "image": {...},
            "audio": {...},
            "debug": {...}
        }

    Args:
        spec: Full structured LLM output; only spec["audio"] is used here.
        out_dir: Output directory passed through to OpenAIAudioGenerator.
        openai_key: Optional API key; None falls back to the environment.

    Returns:
        {"audio_bytes": <bytes>, "meta": <dict>} from generate_from_spec.
    """
    client = OpenAI(api_key=openai_key)
    gen = OpenAIAudioGenerator(out_dir=out_dir, client=client)
    return gen.generate_from_spec(spec["audio"], filename_prefix="gptaudio")