Spaces:

PineSearch
/

generateAudio

Paused

App Files Files Community

SAUL19 committed on Jun 25, 2023

Commit

11eb5d0

1 Parent(s): f00a1a5

Create old_app

Browse files

Files changed (1) hide show

old_app +110 -0

old_app ADDED Viewed

	@@ -0,0 +1,110 @@

+import gradio as gr
+from gradio.inputs import Textbox
+import re
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+import torch
+import random
+import string
+import soundfile as sf
+import boto3
+from io import BytesIO
+import os
+AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+S3_BUCKET_NAME = os.getenv("BUCKET_NAME")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# load the processor
+processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+# load the model
+model = SpeechT5ForTextToSpeech.from_pretrained(
+    "microsoft/speecht5_tts").to(device)
+# load the vocoder, that is the voice encoder
+vocoder = SpeechT5HifiGan.from_pretrained(
+    "microsoft/speecht5_hifigan").to(device)
+# we load this dataset to get the speaker embeddings
+embeddings_dataset = load_dataset(
+    "Matthijs/cmu-arctic-xvectors", split="validation")
+# speaker ids from the embeddings dataset
+speakers = {
+    'awb': 0,     # Scottish male
+    'bdl': 1138,  # US male
+    'clb': 2271,  # US female
+    'jmk': 3403,  # Canadian male
+    'ksp': 4535,  # Indian male
+    'rms': 5667,  # US male
+    'slt': 6799   # US female
+}
+def generateAudio(text_to_audio, s3_save_as, key_id):
+    if AWS_ACCESS_KEY_ID != key_id:
+        return "not permition"
+    s3_save_as = '-'.join(s3_save_as.split()) + ".wav"
+    def cut_text(text, max_tokens=500):
+        # Remove non-alphanumeric characters, except periods and commas
+        text = re.sub(r"[^\w\s.,]", "", text)
+        # Replace multiple spaces with a single space
+        text = re.sub(r"\s{2,}", " ", text)
+        # Remove line breaks
+        text = re.sub(r"\n", " ", text)
+        return text
+    def save_audio_to_s3(audio):
+        # Create an instance of the S3 client
+        s3 = boto3.client('s3',
+                          aws_access_key_id=AWS_ACCESS_KEY_ID,
+                          aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
+        # Full path of the file in the bucket
+        s3_key = "public/" + s3_save_as
+        # Upload the audio file to the S3 bucket
+        s3.upload_fileobj(audio, S3_BUCKET_NAME, s3_key)
+    def save_text_to_speech(text, speaker=None):
+        # Preprocess text and recortar
+        text = cut_text(text, max_tokens=500)
+        # Divide el texto en segmentos de 30 palabras
+        palabras = text.split()
+        segmentos = [' '.join(palabras[i:i+30]) for i in range(0, len(palabras), 30)]
+        # Generar audio para cada segmento y combinarlos
+        audio_segments = []
+        for segment in segmentos:
+            inputs = processor(text=segment, return_tensors="pt").to(device)
+            if speaker is not None:
+                speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
+            else:
+                speaker_embeddings = torch.randn((1, 512)).to(device)
+            speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+            audio_segments.append(speech)
+        combined_audio = torch.cat(audio_segments, dim=0)
+        # Crear objeto BytesIO para almacenar el audio
+        audio_buffer = BytesIO()
+        sf.write(audio_buffer, combined_audio.cpu().numpy(), samplerate=16000, format='WAV')
+        audio_buffer.seek(0)
+        # Guardar el audio combinado en S3
+        save_audio_to_s3(audio_buffer)
+    save_text_to_speech(text_to_audio, 2271)
+    return s3_save_as
+iface = gr.Interface(fn=generateAudio, inputs=[Textbox(label="text_to_audio"), Textbox(label="S3url"), Textbox(label="aws_key_id")], outputs="text", title="Text-to-Audio")
+iface.launch()