File size: 3,308 Bytes
5c9f0d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os, json, uuid
from pathlib import Path
from typing import Dict, Optional
from openai import OpenAI


# ============ Utility Functions ============

def _build_openai_tts_prompt(text: str,
                             style: Optional[str] = None,
                             speed: Optional[float] = None) -> str:
    """Merge text, style, and other options into a single TTS input string."""
    parts = [text.strip()]
    if style:
        parts.append(f"Style: {style.strip()}")
    if speed:
        parts.append(f"Speaking speed: {speed}")
    return " ".join([p for p in parts if p])


# ============ Generator Wrapper ============

class OpenAIAudioGenerator:
    """
    Generate speech audio using the OpenAI Audio Speech API (gpt-4o-mini-tts).

    The synthesized audio is streamed to a temporary file inside ``out_dir``,
    read back into memory, and the temporary file is removed; callers receive
    the raw bytes plus a metadata dict.
    """

    def __init__(self, client: Optional["OpenAI"] = None, out_dir: str = "outputs"):
        """
        Args:
            client: Pre-configured OpenAI client; when omitted a default one
                is created (credentials come from the environment).
            out_dir: Directory for the temporary audio file (and optional
                metadata file); created if it does not exist.
        """
        self.client = client or OpenAI()
        self.out_dir = Path(out_dir)
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def generate_from_spec(self,
                           audio_spec: Dict,
                           filename_prefix: str = "speech",
                           save_meta: bool = False) -> Dict:
        """
        Synthesize speech from a spec dict produced by the LLM stage.

        audio_spec example:
        {
            "text": "Hello, world!",
            "voice": "alloy",
            "speed": 1.0,
        }
        Recognized keys: text, style, speed (merged into the prompt),
        voice (default "alloy"), format (default "mp3"),
        model (default "gpt-4o-mini-tts").

        Args:
            audio_spec: Spec dict as above.
            filename_prefix: Prefix for the temporary output filename.
            save_meta: When True, write the metadata dict to a sibling
                ``.json`` file in ``out_dir``.

        Returns:
            {"audio_bytes": <bytes>, "meta": <dict>}

        Raises:
            RuntimeError: If the API call or file handling fails; the
                original exception is chained as ``__cause__``.
        """
        prompt_text = _build_openai_tts_prompt(
            audio_spec.get("text", ""),
            audio_spec.get("style"),
            audio_spec.get("speed"),
        )

        voice = audio_spec.get("voice", "alloy")
        fmt = audio_spec.get("format", "mp3")
        model = audio_spec.get("model", "gpt-4o-mini-tts")

        filename = f"{filename_prefix}_{uuid.uuid4().hex[:8]}.{fmt}"
        file_path = self.out_dir / filename

        try:
            with self.client.audio.speech.with_streaming_response.create(
                model=model,
                voice=voice,
                input=prompt_text,
                # Fix: previously the requested format only named the file
                # extension and was never sent to the API, so a "wav" spec
                # produced a .wav file containing default-format audio.
                response_format=fmt,
            ) as response:
                response.stream_to_file(file_path)

            # Read the bytes back; the temp file is removed in `finally`.
            audio_bytes = file_path.read_bytes()
        except Exception as e:
            # Chain the cause so callers can inspect the underlying error.
            raise RuntimeError(f"OpenAI Audio generation failed: {e}") from e
        finally:
            # Never leave a partial/temporary audio file behind, even when
            # streaming or the read above fails.
            file_path.unlink(missing_ok=True)

        meta = {
            "model": model,
            "voice": voice,
            "format": fmt,
            "prompt_sent": prompt_text,
            "llm_audio_spec": audio_spec
        }

        if save_meta:
            meta_file = file_path.with_suffix(".json")
            meta_file.write_text(json.dumps(meta, indent=2, ensure_ascii=False))

        return {"audio_bytes": audio_bytes, "meta": meta}


# ============ Integration with Main Pipeline (Example) ============

def generate_audio_with_openai_from_llm_spec(spec: Dict,
                                             out_dir: str = "outputs",
                                             openai_key=None) -> Dict:
    """
    Convenience wrapper: feed the structured spec returned by
    call_llm_structured straight into the audio generator.

    Expected spec layout:
    {
        "image": {...},
        "audio": {...},
        "debug": {...}
    }
    Only the "audio" section is consumed here.
    """
    generator = OpenAIAudioGenerator(
        client=OpenAI(api_key=openai_key),
        out_dir=out_dir,
    )
    return generator.generate_from_spec(spec["audio"], filename_prefix="gptaudio")