Spaces:
Paused
Paused
File size: 5,407 Bytes
964514c 026b176 964514c 026b176 964514c 1dc4889 964514c 1dc4889 964514c 1dc4889 964514c 1dc4889 964514c 1dc4889 964514c 1dc4889 964514c 026b176 23bc671 026b176 23bc671 026b176 23bc671 026b176 23bc671 026b176 23bc671 026b176 23bc671 964514c 23bc671 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
#coding: utf-8
import os
import tempfile
#from typing import Any
#from typing import Dict
#from typing import IO
#from typing import List
from typing import Optional
from typing import Tuple
#from typing import Union
from base64 import b64encode
from openai import OpenAI
from pydub import AudioSegment
import streamlit as st
#from dotenv import load_dotenv
# Charger les variables d'environnement depuis le fichier .env
#load_dotenv()
class openai_tts(object):
    """Small wrapper around the OpenAI text-to-speech endpoint.

    The instance stores the voice, model, output format and speed, and
    exposes chainable setters (each returns ``self``) that validate the
    value before storing it.
    """

    # Defaults applied when the matching constructor argument is None/falsy,
    # so that every attribute read by text_to_speech() always exists.
    DEFAULT_VOICE = "nova"
    DEFAULT_MODEL = "tts-1"
    DEFAULT_FORMAT = "mp3"
    DEFAULT_SPEED = 1.0

    VALID_VOICES = ("alloy", "echo", "fable", "onyx", "nova", "shimmer")
    VALID_MODELS = ("tts-1", "tts-1-hd")
    SUPPORTED_FORMATS = ("mp3", "opus", "aac", "flac", "wav", "pcm")

    # Layout of the headerless "pcm" output as documented by OpenAI:
    # 24 kHz, 16-bit signed little-endian samples, single channel.
    PCM_FRAME_RATE = 24000
    PCM_SAMPLE_WIDTH = 2
    PCM_CHANNELS = 1

    def __init__(self,
                 tts_voice: Optional[str] = "nova",
                 tts_model: Optional[str] = "tts-1",
                 response_format: Optional[str] = "mp3",
                 speed: Optional[float] = 1.0):
        """Create the API client and store the validated TTS settings.

        Args:
            tts_voice: Voice name; ``None``/empty falls back to "nova".
            tts_model: Model name; ``None``/empty falls back to "tts-1".
            response_format: Audio format; ``None``/empty falls back to "mp3".
            speed: Playback speed; ``None``/0 falls back to 1.0.

        Raises:
            ValueError: If a provided (truthy) value fails validation.
        """
        self.client = None
        self.init_supported_formats__()
        self.init_api_client()
        # Falsy arguments used to skip the setter entirely, which left the
        # attribute undefined and broke text_to_speech() later on. Falling
        # back to the default keeps the constructor as lenient as before
        # while guaranteeing a usable instance.
        self.set_response_format(response_format or self.DEFAULT_FORMAT)
        self.set_tts_voice(tts_voice or self.DEFAULT_VOICE)
        self.set_tts_model(tts_model or self.DEFAULT_MODEL)
        self.set_tts_speed(speed or self.DEFAULT_SPEED)

    def set_tts_speed(self, speed):
        """Store the speech speed; it must lie within [0.25, 4.0]."""
        if not (0.25 <= speed <= 4.0):
            raise ValueError(f"[TTS] - Speed must be between 0.25 and 4.0. Provided value: {speed}")
        self.speed = speed
        return self

    def set_tts_voice(self, voice):
        """Store the voice name after checking it against VALID_VOICES."""
        if voice not in self.VALID_VOICES:
            raise ValueError(f"[TTS] - Invalid TTS voice: {voice}. Valid voices are: {', '.join(self.VALID_VOICES)}.")
        self.tts_voice = voice
        return self

    def set_tts_model(self, model):
        """Store the model name after checking it against VALID_MODELS."""
        if model not in self.VALID_MODELS:
            raise ValueError(f"[TTS] - Invalid TTS model: {model}. Valid models are 'tts-1' and 'tts-1-hd'.")
        self.tts_model = model
        return self

    def init_supported_formats__(self):
        """(Re)initialise the per-instance list of accepted output formats."""
        self.supported_formats = list(self.SUPPORTED_FORMATS)
        return self

    def set_response_format(self, format: str):
        """Store the output format after checking ``self.supported_formats``."""
        if format not in self.supported_formats:
            raise ValueError(f"[TTS] - Unsupported format: {format}. Supported formats are: {', '.join(self.supported_formats)}")
        self.response_format = format
        return self

    def init_api_client(self):
        """Create the OpenAI client once, using the OPENAI_API_KEY env var."""
        # getattr() keeps this safe when called before __init__ set the field.
        if not getattr(self, "client", None):
            # OpenAI client configuration with API key
            self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        return self

    def text_to_speech(self, input_text: str) -> dict:
        """Convert text to speech through the OpenAI API.

        Args:
            input_text: The text to convert to speech.

        Returns:
            A dictionary with two keys:
              - 'audio_duration' (float): length of the audio in seconds.
              - 'data_bytes' (str): the audio payload, base64-encoded.
        """
        response = self.client.audio.speech.create(
            model=self.tts_model,
            voice=self.tts_voice,
            input=input_text,
            response_format=self.response_format,
            speed=self.speed,
        )
        data_output = response.read()
        return {
            "audio_duration": self._audio_duration(data_output),
            "data_bytes": b64encode(data_output).decode(),
        }

    def _audio_duration(self, audio_bytes: bytes) -> float:
        """Return the duration in seconds of audio encoded as ``self.response_format``."""
        if self.response_format == "pcm":
            # Headerless PCM cannot be probed by a decoder, so derive the
            # duration directly from the byte count and the known layout.
            bytes_per_second = (
                self.PCM_FRAME_RATE * self.PCM_SAMPLE_WIDTH * self.PCM_CHANNELS
            )
            return len(audio_bytes) / bytes_per_second

        # The context manager closes the temp file even if decoding fails.
        with tempfile.TemporaryFile() as tmp_file:
            tmp_file.write(audio_bytes)
            tmp_file.seek(0)
            audio = AudioSegment.from_file(tmp_file, format=self.response_format)
        # len() of a pydub segment is its length in milliseconds.
        return len(audio) / 1000
def process_tts_message(text_response: str) -> Tuple[Optional[str], Optional[float]]:
    """Convert a text reply to speech with the OpenAI TTS API.

    Any failure is reported to the user through ``st.error`` instead of
    being raised, so callers only need to check for ``None``.

    Args:
        text_response: The text to convert to speech.

    Returns:
        A ``(audio_base64, duration_seconds)`` tuple, where the audio is a
        base64-encoded string, or ``(None, None)`` when the input is
        invalid or the conversion fails.
    """
    if not text_response or not isinstance(text_response, str):
        st.error("Erreur : Le texte à convertir est invalide ou vide")
        return None, None

    try:
        tts = openai_tts(
            tts_voice=st.session_state.tts_voice,
            tts_model="tts-1",
            response_format="mp3",
            speed=1.0,
        )
        tts_output = tts.text_to_speech(text_response)
        return tts_output["data_bytes"], tts_output["audio_duration"]
    except ValueError as ve:
        # Validation errors (invalid voice, unsupported format, ...)
        st.error(f"Erreur de configuration TTS : {ve}")
        return None, None
    except (KeyError, AttributeError):
        # Missing session-state entries or result keys
        st.error("Erreur : Configuration TTS incomplète ou invalide")
        return None, None
    except Exception as e:
        # Anything else (network, API, ...): surface it in the UI rather
        # than crashing the Streamlit script run.
        st.error(f"Une erreur s'est produite lors de la conversion texte-parole : {e}")
        return None, None
if __name__ == "__main__":
    # Manual smoke test: build a client with the default settings and
    # synthesise one short sentence (the result is discarded).
    demo_tts = openai_tts()
    demo_tts.text_to_speech("Hello, I am an AI assistant. How can I help you?")
|