|
|
import os, json, uuid |
|
|
from pathlib import Path |
|
|
from typing import Dict, Optional |
|
|
from openai import OpenAI |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_openai_tts_prompt(text: str, |
|
|
style: Optional[str] = None, |
|
|
speed: Optional[float] = None) -> str: |
|
|
"""Merge text, style, and other options into a single TTS input string.""" |
|
|
parts = [text.strip()] |
|
|
if style: |
|
|
parts.append(f"Style: {style.strip()}") |
|
|
if speed: |
|
|
parts.append(f"Speaking speed: {speed}") |
|
|
return " ".join([p for p in parts if p]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class OpenAIAudioGenerator:
    """
    Generate speech audio using the OpenAI Audio Speech API (gpt-4o-mini-tts).

    Audio is streamed to a temporary file in ``out_dir``, read back into
    memory, and the temporary file is deleted (even on failure).
    """

    def __init__(self, client: Optional[OpenAI] = None, out_dir: str = "outputs"):
        """
        Args:
            client: Pre-configured OpenAI client. When omitted, a default
                client is created (it reads credentials from the environment).
            out_dir: Directory for temporary audio files and optional metadata.
                Created (with parents) if it does not exist.
        """
        self.client = client or OpenAI()
        self.out_dir = Path(out_dir)
        self.out_dir.mkdir(parents=True, exist_ok=True)

    def generate_from_spec(self,
                           audio_spec: Dict,
                           filename_prefix: str = "speech",
                           save_meta: bool = False) -> Dict:
        """
        Synthesize speech from an LLM-produced audio spec.

        audio_spec example:
            {
                "text": "Hello, world!",
                "voice": "alloy",
                "speed": 1.0,
            }
        Optional keys also honored: "style", "format" (default "mp3"),
        "model" (default "gpt-4o-mini-tts").

        Args:
            audio_spec: Spec dict as above; "text" is the spoken content.
            filename_prefix: Prefix for the temporary audio file name.
            save_meta: When True, write a JSON metadata file alongside the
                (already-removed) audio file's path, with a .json suffix.

        Returns:
            {"audio_bytes": <bytes>, "meta": <dict>}

        Raises:
            RuntimeError: If the API call or the file streaming fails; the
                original exception is chained as __cause__.
        """
        prompt_text = _build_openai_tts_prompt(
            audio_spec.get("text", ""),
            audio_spec.get("style"),
            audio_spec.get("speed"),
        )

        voice = audio_spec.get("voice", "alloy")
        fmt = audio_spec.get("format", "mp3")
        model = audio_spec.get("model", "gpt-4o-mini-tts")

        # Random hex suffix avoids collisions across concurrent generations.
        filename = f"{filename_prefix}_{uuid.uuid4().hex[:8]}.{fmt}"
        file_path = self.out_dir / filename

        try:
            # BUG FIX: pass response_format so the returned audio container
            # actually matches `fmt`; previously the extension said e.g.
            # ".wav" while the API returned its default (mp3) encoding.
            with self.client.audio.speech.with_streaming_response.create(
                model=model,
                voice=voice,
                input=prompt_text,
                response_format=fmt,
            ) as response:
                response.stream_to_file(file_path)

            audio_bytes = file_path.read_bytes()
        except Exception as e:
            # Chain the cause so the underlying API error stays debuggable.
            raise RuntimeError(f"OpenAI Audio generation failed: {e}") from e
        finally:
            # BUG FIX: always remove the temp file, including when streaming
            # failed part-way (the original leaked the partial file).
            file_path.unlink(missing_ok=True)

        meta = {
            "model": model,
            "voice": voice,
            "format": fmt,
            "prompt_sent": prompt_text,
            "llm_audio_spec": audio_spec
        }

        if save_meta:
            meta_file = file_path.with_suffix(".json")
            meta_file.write_text(json.dumps(meta, indent=2, ensure_ascii=False))

        return {"audio_bytes": audio_bytes, "meta": meta}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_audio_with_openai_from_llm_spec(spec: Dict,
                                             out_dir: str = "outputs",
                                             openai_key: Optional[str] = None) -> Dict:
    """
    Directly feed the spec returned by call_llm_structured:
        spec = {
            "image": {...},
            "audio": {...},
            "debug": {...}
        }
    Only the "audio" sub-dict is used.

    Args:
        spec: Structured LLM output containing an "audio" entry.
        out_dir: Directory for temporary audio files.
        openai_key: Explicit API key; when None the client falls back to its
            environment-based configuration.

    Returns:
        {"audio_bytes": <bytes>, "meta": <dict>} from generate_from_spec.

    Raises:
        KeyError: If ``spec`` has no "audio" entry.
        RuntimeError: If audio generation fails.
    """
    client = OpenAI(api_key=openai_key)
    gen = OpenAIAudioGenerator(out_dir=out_dir, client=client)
    return gen.generate_from_spec(spec["audio"], filename_prefix="gptaudio")
|
|
|