sozo-api / audio_gen.py
rairo's picture
Update audio_gen.py
08a5383 verified
raw
history blame
3.93 kB
# -----------------------
# Audio Generation Function
# -----------------------
import os
import re
import time
import tempfile
import requests
import json
import io
import base64
import cv2
import logging
import uuid
import subprocess
from pathlib import Path
import urllib.parse
from io import BytesIO
from PIL import Image
def generate_audio(text, voice_model, audio_model="deepgram"):
    """
    Generate speech audio from text using DeepGram or Pollinations OpenAI-Audio.

    Args:
        text (str): The text to convert to speech.
        voice_model (str): The voice/model to use.
            - For DeepGram, e.g., "aura-asteria-en" or "aura-helios-en".
            - For Pollinations, e.g., "sage" (female) or "echo" (male).
        audio_model (str): Which TTS service to use ("deepgram" or "openai-audio").

    Returns:
        str or None: Path to the generated .mp3 file, or None on any failure
        (missing API key, HTTP error, or unsupported audio_model).
    """
    # NOTE(review): the original called st.error() here, but streamlit is never
    # imported in this module, so every error path raised NameError. Errors are
    # now reported via logging, matching edit_section_text() below.
    if audio_model == "deepgram":
        deepgram_api_key = os.getenv("DeepGram")
        if not deepgram_api_key:
            logging.error("Deepgram API Key is missing.")
            return None
        headers_tts = {
            "Authorization": f"Token {deepgram_api_key}",
            "Content-Type": "text/plain"
        }
        url = f"https://api.deepgram.com/v1/speak?model={voice_model}"
        # timeout prevents an unresponsive API from hanging the caller forever
        response = requests.post(url, headers=headers_tts, data=text, timeout=60)
        if response.status_code == 200:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
                temp_file.write(response.content)
            return temp_file.name
        logging.error(f"DeepGram TTS error: {response.status_code}")
        return None
    elif audio_model == "openai-audio":
        # URL-encode the text and call the Pollinations TTS endpoint
        encoded_text = urllib.parse.quote(text)
        url = f"https://text.pollinations.ai/{encoded_text}?model=openai-audio&voice={voice_model}"
        response = requests.get(url, timeout=60)
        if response.status_code == 200:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
                temp_file.write(response.content)
            return temp_file.name
        logging.error(f"OpenAI Audio TTS error: {response.status_code}")
        return None
    else:
        logging.error("Unsupported audio model selected.")
        return None
def get_audio_duration(audio_file):
    """
    Return the duration of an audio file in seconds using ffprobe.

    Args:
        audio_file (str): Path to the audio file.

    Returns:
        float: Duration in seconds, or 5.0 as a fallback default when ffprobe
        is unavailable, fails, or its output cannot be parsed.
    """
    # Fallback used whenever the real duration cannot be determined;
    # downstream code apparently treats 5 seconds as a safe default.
    fallback = 5.0
    try:
        # subprocess is already imported at module level; the original
        # re-imported it locally, which was redundant.
        cmd = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
               '-of', 'default=noprint_wrappers=1:nokey=1', audio_file]
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if result.returncode != 0:
            return fallback
        return float(result.stdout.strip())
    except Exception:
        # Covers ffprobe not being installed (FileNotFoundError) and
        # unparseable output (ValueError from float()).
        return fallback
#edit text and audio
def edit_section_text(original_text: str, new_text: str, voice_model: str, audio_model: str):
    """
    Replace a section's text with new_text and regenerate its audio.

    Args:
        original_text (str): The section's previous text (currently unused;
            kept for interface compatibility with existing callers).
        new_text (str): The replacement text.
        voice_model (str): Voice identifier passed through to generate_audio.
        audio_model (str): TTS backend passed through to generate_audio
            ("deepgram" or "openai-audio").

    Returns:
        tuple: (updated_text, audio_file_path), or (None, None) if new_text
        is empty/whitespace or an error occurs.
    """
    # NOTE(review): the original did `from audio_gen import generate_audio`,
    # a self-import of this very module. generate_audio is defined above in
    # this file and is already in scope, so the import was removed.
    try:
        updated_text = new_text.strip()
        if not updated_text:
            return None, None
        # Strip HTML/XML-style tags so markup is not read aloud by the TTS.
        updated_audio_text = re.sub(r"<.*?>", "", updated_text)
        audio_file_path = generate_audio(updated_audio_text, voice_model, audio_model=audio_model)
        return updated_text, audio_file_path
    except Exception as e:
        logging.error(f"Error editing section text/audio: {e}")
        return None, None