|
|
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan |
|
|
import torch |
|
|
import soundfile as sf |
|
|
import gradio as gr |
|
|
from datasets import load_dataset |
|
|
from runware import Runware, IImageInference |
|
|
import asyncio |
|
|
from dotenv import load_dotenv |
|
|
import os |
|
|
|
|
|
|
|
|
# Pull environment variables from a local .env file, if one exists.
load_dotenv()

# The Runware credential is mandatory — fail fast at startup rather than
# on the first image request.
RUNWARE_API_KEY = os.getenv("RUNWARE_API_KEY")
if not RUNWARE_API_KEY:
    raise ValueError(
        "API key no encontrada. Asegúrate de configurarla en la variable de entorno 'RUNWARE_API_KEY'."
    )
|
|
|
|
|
|
|
|
# Load the SpeechT5 text-to-speech stack once at import time (downloads the
# weights from the Hugging Face Hub on first run).
# processor: turns raw text into model-ready input tensors.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")


# model: generates speech from the tokenized text plus a speaker embedding.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")


# vocoder: HiFi-GAN that renders the model's spectrogram output as a waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
|
|
|
|
|
|
|
async def generar_imagen_desde_texto(texto):
    """Generate a single 512x512 image from *texto* via the Runware API.

    Returns the URL of the first generated image. On validation failure or
    an empty result, returns a (Spanish) status string instead — callers
    display whatever comes back.
    """
    # Same length bounds that text_to_speech enforces on its input.
    longitud = len(texto)
    if longitud < 3 or longitud > 2000:
        return "Error: El texto debe tener entre 3 y 2000 caracteres."

    # NOTE(review): the connection is never explicitly closed; if the SDK
    # exposes a disconnect/close, it should be called in a finally — confirm
    # against the Runware client API.
    cliente = Runware(api_key=RUNWARE_API_KEY)
    await cliente.connect()

    solicitud = IImageInference(
        positivePrompt=texto,
        model="civitai:36520@76907",
        numberResults=1,
        negativePrompt="cloudy, rainy",
        height=512,
        width=512,
    )

    resultados = await cliente.imageInference(requestImage=solicitud)
    if not resultados:
        return "No se generó ninguna imagen."
    return resultados[0].imageURL
|
|
|
|
|
|
|
|
# Cache for the speaker x-vector: the original implementation re-downloaded /
# re-loaded the whole "Matthijs/cmu-arctic-xvectors" dataset on EVERY call,
# which is very expensive. Load it once and reuse it.
_speaker_embeddings = None


def _get_speaker_embeddings():
    """Return the speaker embedding tensor, loading the dataset on first use.

    Uses x-vector index 7306 from the CMU ARCTIC validation split — the same
    fixed voice the original code selected.
    """
    global _speaker_embeddings
    if _speaker_embeddings is None:
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        _speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    return _speaker_embeddings


def text_to_speech(text):
    """Synthesize *text* to speech and generate a matching image.

    Parameters:
        text: input text; must be between 3 and 2000 characters.

    Returns:
        (audio_path, imagen_url): path of the written WAV file and the URL
        of the generated image. On validation failure returns an error
        message string and None (kept for compatibility with the Gradio
        output wiring).
    """
    if not (3 <= len(text) <= 2000):
        return "Error: El texto debe tener entre 3 y 2000 caracteres.", None

    # Tokenize the input for SpeechT5.
    inputs = processor(text=text, return_tensors="pt")

    # Cached across calls — see _get_speaker_embeddings.
    speaker_embeddings = _get_speaker_embeddings()

    # Inference only: no gradients needed.
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Write the waveform to disk; 16 kHz matches the SpeechT5/HiFi-GAN output rate.
    audio_path = "speech.wav"
    sf.write(audio_path, speech.numpy(), samplerate=16000)

    # Generate the companion image (blocks on the async Runware call).
    imagen_url = asyncio.run(generar_imagen_desde_texto(text))
    print(f"URL de la imagen generada: {imagen_url}")

    return audio_path, imagen_url
|
|
|
|
|
|
|
|
# Gradio UI: one text box in; synthesized audio plus generated image out.
# FIX: live=True was removed — it re-ran the full TTS pipeline AND a paid
# Runware image-generation request on every keystroke. With the default
# submit-button behavior the pipeline runs once per explicit request.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label="Escribe tu texto aquí"),
    outputs=[
        gr.Audio(label="Escucha el audio generado"),
        gr.Image(label="Imagen generada"),
    ],
    title="Generación de texto a voz e imagen según texto",
)

# Start the local Gradio server (blocks until stopped).
iface.launch()