# Source: Hugging Face Space "rairo/sozo-api" (status: Sleeping).
# File size: 3,928 bytes.

# -----------------------
# Audio Generation Function
# -----------------------
import os
import re
import time
import tempfile
import requests
import json
import io
import base64
import cv2
import logging
import uuid
import subprocess
from pathlib import Path
import urllib.parse
from io import BytesIO
from PIL import Image

def _save_audio_to_temp_file(audio_bytes):
    """Write *audio_bytes* to a new ``.mp3`` temp file and return its path."""
    # delete=False: the path is handed back to the caller, who owns cleanup.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
        temp_file.write(audio_bytes)
    return temp_file.name


def generate_audio(text, voice_model, audio_model="deepgram"):
    """
    Generate audio from text using either DeepGram or Pollinations OpenAI-Audio.

    Failures (missing API key, empty text, unsupported service, network error,
    non-200 response) are logged through ``logging`` and reported by returning
    None rather than by raising.

    Args:
        text (str): The text to convert to speech.
        voice_model (str): The voice/model to use.
            - For DeepGram, e.g., "aura-asteria-en" or "aura-helios-en".
            - For Pollinations, e.g., "sage" (female) or "echo" (male).
        audio_model (str): Which audio generation service to use ("deepgram" or "openai-audio").
    Returns:
        str or None: The path to the generated audio file, or None if generation failed.
            The file is created with ``delete=False``; the caller is responsible
            for removing it.
    """
    # (connect, read) seconds. Without a timeout a stalled server would block
    # the caller forever; the read limit is generous because synthesis is slow.
    request_timeout = (10, 120)

    if not text or not text.strip():
        logging.error("No text provided for audio generation.")
        return None

    if audio_model == "deepgram":
        # The API key is read from the "DeepGram" environment variable.
        deepgram_api_key = os.getenv("DeepGram")
        if not deepgram_api_key:
            logging.error("Deepgram API Key is missing.")
            return None
        service_label = "DeepGram TTS"
        headers_tts = {
            "Authorization": f"Token {deepgram_api_key}",
            "Content-Type": "text/plain",
        }
        try:
            response = requests.post(
                "https://api.deepgram.com/v1/speak",
                params={"model": voice_model},
                headers=headers_tts,
                # Encode explicitly: a str body may otherwise be sent as
                # Latin-1, which fails on characters outside that range.
                data=text.encode("utf-8"),
                timeout=request_timeout,
            )
        except requests.RequestException as exc:
            logging.error("%s request failed: %s", service_label, exc)
            return None
    elif audio_model == "openai-audio":
        service_label = "OpenAI Audio TTS"
        # URL encode the text and call Pollinations TTS endpoint for openai-audio
        encoded_text = urllib.parse.quote(text)
        try:
            response = requests.get(
                f"https://text.pollinations.ai/{encoded_text}",
                params={"model": "openai-audio", "voice": voice_model},
                timeout=request_timeout,
            )
        except requests.RequestException as exc:
            logging.error("%s request failed: %s", service_label, exc)
            return None
    else:
        logging.error("Unsupported audio model selected.")
        return None

    if response.status_code != 200:
        logging.error("%s error: %s", service_label, response.status_code)
        return None
    return _save_audio_to_temp_file(response.content)

def get_audio_duration(audio_file):
    """
    Return the duration of *audio_file* in seconds as reported by ``ffprobe``.

    Best-effort: if ``ffprobe`` cannot be run, exits with a non-zero status,
    or prints something that is not a float, 5.0 is returned instead.
    """
    fallback_seconds = 5.0
    probe_args = [
        'ffprobe',
        '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        audio_file,
    ]
    try:
        probe = subprocess.run(probe_args, capture_output=True, text=True)
        if probe.returncode == 0:
            # With these flags ffprobe prints just the number on stdout.
            return float(probe.stdout.strip())
        return fallback_seconds
    except Exception:
        return fallback_seconds

# Edit a section's text and regenerate its audio

def edit_section_text(original_text: str, new_text: str, voice_model: str, audio_model: str):
    """
    Replace a section's text with new_text and re-generate its audio.

    The audio is produced by the ``generate_audio`` function defined in this
    same file, from a copy of the text with ``<...>`` markup removed.

    Args:
        original_text (str): The previous text. Not used by this function;
            kept so existing callers keep working.
        new_text (str): The replacement text. Surrounding whitespace is stripped.
        voice_model (str): Passed through to ``generate_audio``.
        audio_model (str): Passed through to ``generate_audio``.
    Returns:
        tuple: ``(updated_text, new_audio_path)``. ``(None, None)`` is returned
            when the new text is empty after stripping or when any error occurs.
            ``new_audio_path`` is whatever ``generate_audio`` returned.
    """
    try:
        # 1) The new text is just new_text
        updated_text = new_text.strip()
        if not updated_text:
            return None, None

        # 2) Re-generate the audio for the new text; markup tags are removed
        #    first so they are not read aloud.
        updated_audio_text = re.sub(r"<.*?>", "", updated_text)
        audio_file_path = generate_audio(updated_audio_text, voice_model, audio_model=audio_model)
        return updated_text, audio_file_path

    except Exception as e:
        # Best-effort boundary: log with traceback and signal failure to the caller.
        logging.exception("Error editing section text/audio: %s", e)
        return None, None