|
|
import os, json, uuid |
|
|
from pathlib import Path |
|
|
from typing import Dict, Optional |
|
|
from openai import OpenAI |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_openai_tts_prompt(text: str, |
|
|
style: Optional[str] = None, |
|
|
speed: Optional[float] = None) -> str: |
|
|
"""Merge text, style, and other options into a single TTS input string.""" |
|
|
parts = [text.strip()] |
|
|
if style: |
|
|
parts.append(f"Style: {style.strip()}") |
|
|
if speed: |
|
|
parts.append(f"Speaking speed: {speed}") |
|
|
return " ".join([p for p in parts if p]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class OpenAIAudioGenerator:
    """
    Generate speech audio using the OpenAI Audio Speech API (gpt-4o-mini-tts).

    Audio is streamed to a temporary file in ``out_dir``, read back into
    memory, and the temporary file is deleted (even on failure).
    """

    def __init__(self, client: Optional[OpenAI] = None, out_dir: str = "outputs"):
        """
        Args:
            client: Pre-configured OpenAI client. When omitted, a default
                client is created (it reads credentials from the environment).
            out_dir: Directory for temporary audio files and optional metadata.
                Created (with parents) if it does not exist.
        """
        self.client = client or OpenAI()
        self.out_dir = Path(out_dir)
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def generate_from_spec(self,
                           audio_spec: Dict,
                           filename_prefix: str = "speech",
                           save_meta: bool = False) -> Dict:
        """
        Synthesize speech from an LLM-produced audio spec.

        audio_spec example:
            {
                "text": "Hello, world!",
                "voice": "alloy",
                "speed": 1.0,
            }
        Optional keys also honored: "style", "format" (default "mp3"),
        "model" (default "gpt-4o-mini-tts").

        Args:
            audio_spec: Spec dict as above; "text" is the spoken content.
            filename_prefix: Prefix for the temporary audio file name.
            save_meta: When True, write a JSON metadata file alongside the
                (already-removed) audio file's path, with a .json suffix.

        Returns:
            {"audio_bytes": <bytes>, "meta": <dict>}

        Raises:
            RuntimeError: If the API call or the file streaming fails; the
                original exception is chained as __cause__.
        """
        prompt_text = _build_openai_tts_prompt(
            audio_spec.get("text", ""),
            audio_spec.get("style"),
            audio_spec.get("speed"),
        )

        voice = audio_spec.get("voice", "alloy")
        fmt = audio_spec.get("format", "mp3")
        model = audio_spec.get("model", "gpt-4o-mini-tts")

        # Random hex suffix avoids collisions across concurrent generations.
        filename = f"{filename_prefix}_{uuid.uuid4().hex[:8]}.{fmt}"
        file_path = self.out_dir / filename

        try:
            # BUG FIX: pass response_format so the returned audio container
            # actually matches `fmt`; previously the extension said e.g.
            # ".wav" while the API returned its default (mp3) encoding.
            with self.client.audio.speech.with_streaming_response.create(
                model=model,
                voice=voice,
                input=prompt_text,
                response_format=fmt,
            ) as response:
                response.stream_to_file(file_path)

            audio_bytes = file_path.read_bytes()
        except Exception as e:
            # Chain the cause so the underlying API error stays debuggable.
            raise RuntimeError(f"OpenAI Audio generation failed: {e}") from e
        finally:
            # BUG FIX: always remove the temp file, including when streaming
            # failed part-way (the original leaked the partial file).
            file_path.unlink(missing_ok=True)

        meta = {
            "model": model,
            "voice": voice,
            "format": fmt,
            "prompt_sent": prompt_text,
            "llm_audio_spec": audio_spec
        }

        if save_meta:
            meta_file = file_path.with_suffix(".json")
            meta_file.write_text(json.dumps(meta, indent=2, ensure_ascii=False))

        return {"audio_bytes": audio_bytes, "meta": meta}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_audio_with_openai_from_llm_spec(spec: Dict,
                                             out_dir: str = "outputs",
                                             openai_key: Optional[str] = None) -> Dict:
    """
    Directly feed the spec returned by call_llm_structured:
        spec = {
            "image": {...},
            "audio": {...},
            "debug": {...}
        }
    Only the "audio" sub-dict is used.

    Args:
        spec: Structured LLM output containing an "audio" entry.
        out_dir: Directory for temporary audio files.
        openai_key: Explicit API key; when None the client falls back to its
            environment-based configuration.

    Returns:
        {"audio_bytes": <bytes>, "meta": <dict>} from generate_from_spec.

    Raises:
        KeyError: If ``spec`` has no "audio" entry.
        RuntimeError: If audio generation fails.
    """
    client = OpenAI(api_key=openai_key)
    gen = OpenAIAudioGenerator(out_dir=out_dir, client=client)
    return gen.generate_from_spec(spec["audio"], filename_prefix="gptaudio")
|
|
|