import gradio as gr
import spaces
import subprocess
import os
from PIL import Image
import ffmpeg
from pydub import AudioSegment
import numpy as np
import soundfile as sf
from gtts import gTTS


def save_audio_mp3(audio_tuple, filename):
    # Convert a (sampling_rate, samples) tuple, as produced by gr.Audio,
    # into an MP3 file. Assumes mono, 16-bit integer samples.
    sampling_rate, audio_data = audio_tuple

    # Validate the sampling rate
    if not isinstance(sampling_rate, int) or sampling_rate <= 0:
        raise ValueError("The sampling rate must be a positive integer.")

    # Validate the audio data
    if not isinstance(audio_data, np.ndarray):
        raise ValueError("The audio data must be a numpy array.")

    # Convert the audio samples to raw bytes
    audio_bytes = np.array(audio_data, dtype=np.int16).tobytes()

    # Build a mono, 16-bit pydub segment
    audio_segment = AudioSegment(
        data=audio_bytes,
        sample_width=2,
        frame_rate=sampling_rate,
        channels=1
    )

    # Export the segment as an MP3 file
    audio_segment.export(filename, format="mp3")
    return f"Audio saved successfully as {filename}"


def audio_video():
    # Mux the lip-synced video produced by inference.py with the uploaded audio.
    input_video = ffmpeg.input('results/result_voice.mp4')
    input_audio = ffmpeg.input('sample_data/uploaded_audio.mp3')

    # Remove any previous output so ffmpeg does not stop to ask about overwriting
    if os.path.exists('results/final_output.mp4'):
        os.remove('results/final_output.mp4')

    ffmpeg.concat(input_video, input_audio, v=1, a=1).output('results/final_output.mp4').run()
    return "results/final_output.mp4"


@spaces.GPU
def run_inference(input_image, input_audio=None, input_text=None):
    pil_image = Image.fromarray(input_image.astype(np.uint8))

    save_dir = "sample_data"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Save the input image
    filename = os.path.join(save_dir, "uploaded_image.png")
    pil_image.save(filename)

    # Save the input audio: synthesize speech from text if text was given,
    # otherwise convert the uploaded recording
    if input_text:
        tts = gTTS(input_text, lang='en', tld='com.au')
        tts.save("sample_data/uploaded_audio.mp3")
    else:
        save_audio_mp3(input_audio, "sample_data/uploaded_audio.mp3")

    # Run Wav2Lip inference on the saved image and audio
    command = 'python3 inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face sample_data/uploaded_image.png --audio sample_data/uploaded_audio.mp3'
    process = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
    output, error = process.communicate()

    return audio_video()


def run():
    custom_css = """
    * {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif !important;
    }
    body {
        background-color: #ffffff !important;
    }
    .gradio-container {
        background-color: #ffffff !important;
    }
    button.primary {
        background: linear-gradient(90deg, #a8a8a8, #7d7d7d) !important;
        color: #ffffff !important;
        border: none !important;
    }
    button.primary:hover {
        background: linear-gradient(90deg, #b3b3b3, #8a8a8a) !important;
    }
    #container {
        background-color: #ffffff;
        border-radius: 15px;
        padding: 20px 30px;
        box-shadow: 0 8px 20px rgba(0, 0, 0, 0.08);
        margin: 20px auto;
        max-width: 800px;
    }
    div.svelte-iyf88w {
        background: #ffffff !important;
    }
    h1 {
        color: #2c3e50;
        font-size: 2.4em;
        font-weight: 700;
        margin-bottom: 10px;
    }
    .subtitle {
        color: #7f8c8d;
        font-size: 1.2em;
    }
    .generate-btn {
        font-size: 1.1em;
        font-weight: 600;
        padding: 14px 32px;
        border-radius: 10px;
        transition: all 0.25s ease;
        margin-top: 15px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    }
    .video-small video {
        max-width: 400px !important;  /* maximum width */
        height: auto !important;
        border-radius: 10px;
        display: block;
        margin: 0 auto;
    }
    """

    with gr.Blocks(css=custom_css) as demo:
        with gr.Group():
            gr.Markdown(""" # Talkie