|
|
import gradio as gr |
|
|
import spaces |
|
|
import subprocess |
|
|
import os |
|
|
from PIL import Image |
|
|
import ffmpeg |
|
|
from pydub import AudioSegment |
|
|
|
|
|
import numpy as np |
|
|
import soundfile as sf |
|
|
from gtts import gTTS |
|
|
|
|
|
|
|
|
def save_audio_mp3(audio_tuple, filename):
    """Save a Gradio-style ``(sampling_rate, samples)`` audio tuple as an MP3.

    Args:
        audio_tuple: Tuple of ``(sampling_rate, audio_data)`` where
            ``sampling_rate`` is a positive int and ``audio_data`` is a
            numpy array of samples — 1-D mono or 2-D ``(n, channels)``.
        filename: Destination path for the MP3 file.

    Returns:
        A confirmation message naming the saved file.

    Raises:
        ValueError: If the sampling rate is not a positive int or the
            audio data is not a numpy array.
    """
    sampling_rate, audio_data = audio_tuple

    if not isinstance(sampling_rate, int) or sampling_rate <= 0:
        raise ValueError("La tasa de muestreo debe ser un entero positivo.")

    if not isinstance(audio_data, np.ndarray):
        raise ValueError("Los datos de audio deben ser un array de numpy.")

    # Gradio may deliver float samples in [-1.0, 1.0]; rescale them so the
    # int16 cast below does not truncate everything to (near) zero.
    if np.issubdtype(audio_data.dtype, np.floating):
        audio_data = (audio_data * np.iinfo(np.int16).max).astype(np.int16)

    # Support stereo (n, channels) arrays instead of assuming mono.
    channels = 1 if audio_data.ndim == 1 else audio_data.shape[1]

    audio_bytes = np.array(audio_data, dtype=np.int16).tobytes()

    audio_segment = AudioSegment(
        data=audio_bytes,
        sample_width=2,  # int16 -> 2 bytes per sample
        frame_rate=sampling_rate,
        channels=channels
    )

    audio_segment.export(filename, format="mp3")

    # Bug fix: the message previously read "... as (unknown)" and never
    # interpolated the actual filename.
    return f"Audio saved successfully as {filename}"
|
|
|
|
|
|
|
|
def audio_video():
    """Mux the Wav2Lip output video with the uploaded audio track.

    Reads the fixed paths written by run_inference() and produces
    results/final_output.mp4, overwriting any previous output.

    Returns:
        Path to the final muxed video file.
    """
    input_video = ffmpeg.input('results/result_voice.mp4')
    input_audio = ffmpeg.input('sample_data/uploaded_audio.mp3')

    # Remove a stale output with os.remove instead of shelling out to
    # `rm -rf` via os.system — portable and no shell needed for one file.
    output_path = 'results/final_output.mp4'
    if os.path.exists(output_path):
        os.remove(output_path)

    ffmpeg.concat(input_video, input_audio, v=1, a=1).output(output_path).run()

    return output_path
|
|
|
|
|
@spaces.GPU
def run_inference(input_image, input_audio=None, input_text=None):
    """Run Wav2Lip lip-sync inference on an image with audio or TTS text.

    Saves the inputs to sample_data/, invokes the inference.py script with
    the wav2lip_gan checkpoint, then muxes the result with the audio.

    Args:
        input_image: Image as a numpy array (as delivered by gr.Image).
        input_audio: Optional ``(sampling_rate, samples)`` tuple from
            gr.Audio; used only when ``input_text`` is empty.
        input_text: Optional text to synthesize with gTTS (takes priority
            over ``input_audio``).

    Returns:
        Path to the final output video produced by audio_video().

    Raises:
        ValueError: If neither audio nor text is provided.
    """
    pil_image = Image.fromarray(input_image.astype(np.uint8))

    save_dir = "sample_data"
    # exist_ok avoids the separate os.path.exists() check (race-free).
    os.makedirs(save_dir, exist_ok=True)

    filename = os.path.join(save_dir, "uploaded_image.png")
    pil_image.save(filename)

    if input_text:
        tts = gTTS(input_text, lang='en', tld='com.au')
        tts.save("sample_data/uploaded_audio.mp3")
    else:
        # Fail loudly instead of crashing on unpacking None downstream.
        if input_audio is None:
            raise ValueError("Provide either an audio clip or some text.")
        save_audio_mp3(input_audio, "sample_data/uploaded_audio.mp3")

    # Argument list with shell=False (default) — avoids shell parsing and
    # injection risks of the previous shell=True string command.
    command = [
        "python3", "inference.py",
        "--checkpoint_path", "checkpoints/wav2lip_gan.pth",
        "--face", "sample_data/uploaded_image.png",
        "--audio", "sample_data/uploaded_audio.mp3",
    ]
    subprocess.run(command, stdout=subprocess.PIPE)

    return audio_video()
|
|
|
|
|
def run():
    """Build and return the Gradio Blocks UI for the Talkie lip-sync demo.

    Wires the image/audio/text inputs through run_inference() and displays
    the resulting video. Returns the demo without launching it.
    """
    # CSS injected verbatim into gr.Blocks; keep selectors in sync with the
    # elem_classes used below (e.g. .generate-btn).
    custom_css = """
    * {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif !important;
    }
    body {
        background-color: #ffffff !important;
    }
    .gradio-container {
        background-color: #ffffff !important;
    }
    button.primary {
        background: linear-gradient(90deg, #a8a8a8, #7d7d7d) !important;
        color: #ffffff !important;
        border: none !important;
    }
    button.primary:hover {
        background: linear-gradient(90deg, #b3b3b3, #8a8a8a) !important;
    }
    #container {
        background-color: #ffffff;
        border-radius: 15px;
        padding: 20px 30px;
        box-shadow: 0 8px 20px rgba(0, 0, 0, 0.08);
        margin: 20px auto;
        max-width: 800px;
    }

    div.svelte-iyf88w {
        background: #ffffff !important;
    }

    h1 {
        color: #2c3e50;
        font-size: 2.4em;
        font-weight: 700;
        margin-bottom: 10px;
    }
    .subtitle {
        color: #7f8c8d;
        font-size: 1.2em;
    }
    .generate-btn {
        font-size: 1.1em;
        font-weight: 600;
        padding: 14px 32px;
        border-radius: 10px;
        transition: all 0.25s ease;
        margin-top: 15px;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    }

    .video-small video {
        max-width: 400px !important; /* ancho máximo */
        height: auto !important;
        border-radius: 10px;
        display: block;
        margin: 0 auto;
    }
    """

    with gr.Blocks(css=custom_css) as demo:
        # Header: title plus subtitle rendered as raw HTML in Markdown.
        with gr.Group():
            gr.Markdown("""
            # Talkie
            <div class='subtitle'>Upload an image, add some audio or text, and watch the magic happen! ✨</div>
            """)

        with gr.Row():
            input_image = gr.Image(label="📸 Your image")

        # Audio and text are alternatives; run_inference prefers text.
        with gr.Row():
            input_audio = gr.Audio(label="🎵 Your audio (Optional)")
            input_text = gr.Textbox(label="💭 Your text", placeholder="Type your text here...")

        with gr.Row():
            btn = gr.Button("GENERATE", elem_classes=["generate-btn"])

        with gr.Row():
            video_out = gr.Video(label="🎥 Your video", show_label=True)

        btn.click(run_inference, inputs=[input_image, input_audio, input_text], outputs=video_out)

    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Build the UI, enable request queuing, and serve on all interfaces.
    app = run()
    app.queue()
    app.launch(server_name="0.0.0.0", server_port=7860)