Spaces:
Sleeping
Sleeping
| # Code copied from https://elevenlabs.io/docs/cookbooks/text-to-speech/streaming | |
| import base64 | |
| from io import BytesIO | |
| from typing import IO | |
| import yaml | |
| from elevenlabs import VoiceSettings | |
| from elevenlabs.client import ElevenLabs | |
| from hackathon.config import settings | |
# Module-wide ElevenLabs client, authenticated with the API key from the
# project settings object.
client = ElevenLabs(api_key=settings.ELEVENLABS_API_KEY)

# Speaker name -> ElevenLabs voice ID. text_to_speech_stream looks its
# ``voice`` argument up in this mapping.
voices = {
    "politician1": "ohZqJahxofk8dkPKmd9F",
    "politician2": "v7sy7EHXxN3ToffFQfvr",
}
# Spare voice ID kept for reference (currently the same ID as "politician1"):
# voice_id: "ohZqJahxofk8dkPKmd9F"
def read_audio_config(yaml_path: str) -> dict:
    """Load the audio configuration stored in a YAML file.

    Args:
        yaml_path: Path of the YAML file to read.

    Returns:
        The document as parsed by ``yaml.safe_load``. The parsed value is
        returned as-is; it is not checked to be a mapping.

    Raises:
        FileNotFoundError: If ``yaml_path`` does not exist. The original
            OS-level error is attached as ``__cause__``.
        ValueError: If the file content cannot be parsed as YAML. The
            underlying ``yaml.YAMLError`` is attached as ``__cause__``.
    """
    try:
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's locale encoding.
        with open(yaml_path, "r", encoding="utf-8") as file:
            return yaml.safe_load(file)
    except FileNotFoundError as e:
        # Chain the original error so errno/filename stay available.
        raise FileNotFoundError(
            f"The file at path '{yaml_path}' does not exist."
        ) from e
    except yaml.YAMLError as e:
        raise ValueError(f"Error parsing YAML file: {e}") from e
def read_audio_file(audio_path: str):
    """Return the contents of the file at *audio_path* as base64 text.

    The file is read in binary mode, base64-encoded, and the encoded bytes
    are decoded as UTF-8 so the result is a ``str``.
    """
    with open(audio_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")
def text_to_speech_file(
    text: str,
    voice_id: str,
    stability=0.5,
    similarity=1.0,
    style=0.3,
    base_path="audio_store",
) -> str:
    """Synthesize *text* with ElevenLabs and return the MP3 audio as base64.

    Despite the name, nothing is written to disk: the audio chunks yielded
    by the API call are collected in memory and returned base64-encoded.

    Args:
        text: The text to convert to speech.
        voice_id: ElevenLabs voice ID, passed to the API unchanged (it is
            not looked up in the module-level ``voices`` mapping).
        stability: Forwarded as ``VoiceSettings.stability``.
        similarity: Forwarded as ``VoiceSettings.similarity_boost``.
        style: Forwarded as ``VoiceSettings.style``.
        base_path: Unused; kept so existing callers that pass it keep
            working.

    Returns:
        The base64 encoding of the returned audio bytes, as a ``str``.
    """
    response = client.text_to_speech.convert(
        voice_id=voice_id,
        output_format="mp3_44100_32",
        text=text,
        model_id="eleven_turbo_v2_5",  # use the turbo model for low latency
        voice_settings=VoiceSettings(
            # Honor the caller's tuning values. They were previously
            # ignored in favor of literals equal to the defaults.
            stability=stability,
            similarity_boost=similarity,
            style=style,
            use_speaker_boost=True,
        ),
    )
    # Consume the response chunk by chunk, skipping falsy (empty) chunks.
    audio_bytes = b"".join(chunk for chunk in response if chunk)
    return base64.b64encode(audio_bytes).decode("utf-8")
def text_to_speech_stream(
    text: str, voice: str, stability=0.5, similarity=1.0, style=0.3
) -> IO[bytes]:
    """Synthesize *text* and return the MP3 audio as an in-memory stream.

    ``voice`` is a key of the module-level ``voices`` mapping
    ("politician1" or "politician2"); an unknown key raises ``KeyError``.

    NOTE(review): ``stability``, ``similarity`` and ``style`` are accepted
    but not forwarded. The request always uses stability=0.0,
    similarity_boost=1.0 and style=0.0, which differ from the signature
    defaults. Confirm which values are intended before wiring them up.

    Returns:
        A ``BytesIO`` holding the complete audio, positioned at offset 0.
    """
    selected_voice_id = voices[voice]
    fixed_settings = VoiceSettings(
        stability=0.0,
        similarity_boost=1.0,
        style=0.0,
        use_speaker_boost=True,
    )
    audio_chunks = client.text_to_speech.convert(
        voice_id=selected_voice_id,
        output_format="mp3_22050_32",
        text=text,
        model_id="eleven_multilingual_v2",
        voice_settings=fixed_settings,
    )
    # Gather every non-empty chunk; a BytesIO built from initial bytes
    # already starts at position 0, so no explicit rewind is needed.
    return BytesIO(b"".join(piece for piece in audio_chunks if piece))