# talky/app.py — Gradio demo: lip-sync an uploaded face image to audio or
# text (Wav2Lip inference; text input is converted to speech with gTTS).
import gradio as gr
import spaces
import subprocess
import os
from PIL import Image
import ffmpeg
from pydub import AudioSegment
import numpy as np
import soundfile as sf
from gtts import gTTS
def save_audio_mp3(audio_tuple, filename):
    """Write a ``(sampling_rate, samples)`` audio tuple to an MP3 file.

    Parameters
    ----------
    audio_tuple : tuple[int, numpy.ndarray]
        Pair of (sampling_rate, audio_data), as produced by a Gradio
        ``Audio`` component. Samples are converted to 16-bit PCM; mono
        output is assumed (``channels=1``).
    filename : str
        Destination path of the MP3 file.

    Returns
    -------
    str
        Confirmation message containing the output filename.

    Raises
    ------
    ValueError
        If the sampling rate is not a positive ``int`` or the samples
        are not a numpy array.
    """
    sampling_rate, audio_data = audio_tuple
    # Validate the sampling rate before handing it to pydub.
    if not isinstance(sampling_rate, int) or sampling_rate <= 0:
        raise ValueError("La tasa de muestreo debe ser un entero positivo.")
    # Validate the audio samples.
    if not isinstance(audio_data, np.ndarray):
        raise ValueError("Los datos de audio deben ser un array de numpy.")
    # Convert the samples to raw 16-bit PCM bytes (sample_width=2 below).
    audio_bytes = np.array(audio_data, dtype=np.int16).tobytes()
    # Build a mono pydub segment from the raw PCM data.
    audio_segment = AudioSegment(
        data=audio_bytes,
        sample_width=2,
        frame_rate=sampling_rate,
        channels=1,
    )
    # Export the segment as MP3 (requires ffmpeg on PATH for pydub).
    audio_segment.export(filename, format="mp3")
    # BUG FIX: the message previously hard-coded "(unknown)" in an f-string
    # with no placeholder; interpolate the actual output filename.
    return f"Audio saved successfully as {filename}"
def audio_video():
    """Mux the Wav2Lip output video with the uploaded audio track.

    Reads ``results/result_voice.mp4`` and ``sample_data/uploaded_audio.mp3``,
    removes any stale ``results/final_output.mp4`` from a previous run, and
    writes the combined stream there.

    Returns
    -------
    str
        Path to the final output video, ``results/final_output.mp4``.
    """
    input_video = ffmpeg.input('results/result_voice.mp4')
    input_audio = ffmpeg.input('sample_data/uploaded_audio.mp3')
    # Delete a stale result if present; os.remove is safer and more
    # portable than shelling out to `rm -rf` via os.system.
    try:
        os.remove('results/final_output.mp4')
    except FileNotFoundError:
        pass
    # v=1, a=1: take one video and one audio stream into the output.
    ffmpeg.concat(input_video, input_audio, v=1, a=1).output('results/final_output.mp4').run()
    return "results/final_output.mp4"
@spaces.GPU
def run_inference(input_image, input_audio=None, input_text=None):
    """Run the full lip-sync pipeline for one request.

    Saves the uploaded image, produces an MP3 soundtrack (gTTS when text
    is given, otherwise the uploaded audio), runs Wav2Lip inference as a
    subprocess, and muxes the result with the audio.

    Parameters
    ----------
    input_image : numpy.ndarray
        Image array from the Gradio ``Image`` component.
    input_audio : tuple or None
        ``(sampling_rate, samples)`` tuple from the ``Audio`` component;
        used only when ``input_text`` is empty.
    input_text : str or None
        Text to synthesize with gTTS; takes priority over ``input_audio``.

    Returns
    -------
    str
        Path to the final output video.
    """
    pil_image = Image.fromarray(input_image.astype(np.uint8))
    save_dir = "sample_data"
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(save_dir, exist_ok=True)
    # Save the input image where inference.py expects it.
    image_path = os.path.join(save_dir, "uploaded_image.png")
    pil_image.save(image_path)
    # Soundtrack: text-to-speech wins over uploaded audio when both given.
    if input_text:
        tts = gTTS(input_text, lang='en', tld='com.au')
        tts.save("sample_data/uploaded_audio.mp3")
    else:
        save_audio_mp3(input_audio, "sample_data/uploaded_audio.mp3")
    # Run Wav2Lip inference. List arguments with the default shell=False
    # avoid shell injection and quoting issues of the old shell=True string.
    command = [
        "python3", "inference.py",
        "--checkpoint_path", "checkpoints/wav2lip_gan.pth",
        "--face", "sample_data/uploaded_image.png",
        "--audio", "sample_data/uploaded_audio.mp3",
    ]
    subprocess.run(command, stdout=subprocess.PIPE)
    return audio_video()
def run():
    """Build and return the Gradio Blocks UI for the Talkie demo.

    Lays out an image input, optional audio/text inputs, a GENERATE
    button, and a video output, wiring the button to ``run_inference``.

    Returns
    -------
    gr.Blocks
        The constructed (not yet launched) Gradio application.
    """
    # Custom CSS: white background, grey gradient primary buttons,
    # rounded card container, and a size-capped output video.
    custom_css = """
    * {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif !important;
    }
    body {
    background-color: #ffffff !important;
    }
    .gradio-container {
    background-color: #ffffff !important;
    }
    button.primary {
    background: linear-gradient(90deg, #a8a8a8, #7d7d7d) !important;
    color: #ffffff !important;
    border: none !important;
    }
    button.primary:hover {
    background: linear-gradient(90deg, #b3b3b3, #8a8a8a) !important;
    }
    #container {
    background-color: #ffffff;
    border-radius: 15px;
    padding: 20px 30px;
    box-shadow: 0 8px 20px rgba(0, 0, 0, 0.08);
    margin: 20px auto;
    max-width: 800px;
    }
    div.svelte-iyf88w {
    background: #ffffff !important;
    }
    h1 {
    color: #2c3e50;
    font-size: 2.4em;
    font-weight: 700;
    margin-bottom: 10px;
    }
    .subtitle {
    color: #7f8c8d;
    font-size: 1.2em;
    }
    .generate-btn {
    font-size: 1.1em;
    font-weight: 600;
    padding: 14px 32px;
    border-radius: 10px;
    transition: all 0.25s ease;
    margin-top: 15px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    }
    .video-small video {
    max-width: 400px !important; /* ancho máximo */
    height: auto !important;
    border-radius: 10px;
    display: block;
    margin: 0 auto;
    }
    """
    # NOTE(review): the original file's indentation was lost in extraction;
    # the Group is assumed to wrap only the header Markdown — confirm layout
    # against the deployed Space.
    with gr.Blocks(css=custom_css) as demo:
        with gr.Group():
            # Header: title plus a subtitle styled by `.subtitle` above.
            gr.Markdown("""
            # Talkie
            <div class='subtitle'>Upload an image, add some audio or text, and watch the magic happen! ✨</div>
            """)
        with gr.Row():
            input_image = gr.Image(label="📸 Your image")
        with gr.Row():
            # Audio and text side by side; text takes priority in run_inference.
            input_audio = gr.Audio(label="🎵 Your audio (Optional)")
            input_text = gr.Textbox(label="💭 Your text", placeholder="Type your text here...")
        with gr.Row():
            btn = gr.Button("GENERATE", elem_classes=["generate-btn"])
        with gr.Row():
            video_out = gr.Video(label="🎥 Your video", show_label=True)
        # Clicking GENERATE runs the full pipeline and displays the result.
        btn.click(run_inference, inputs=[input_image, input_audio, input_text], outputs=video_out)
    return demo
if __name__ == "__main__":
    # Build the UI, enable request queuing, and serve on all interfaces.
    application = run()
    application.queue()
    application.launch(server_name="0.0.0.0", server_port=7860)