import gradio as gr
import shutil
import os
import subprocess
import sys

# One-time Wav2Lip environment setup. Use the interpreter running this
# script (sys.executable) rather than whatever "python" resolves to on
# PATH — on many systems there is no "python", only "python3".
_setup = subprocess.run([sys.executable, "src/setup_wav2lip.py"])
if _setup.returncode != 0:
    # Best-effort: warn but keep going, matching the original behavior of
    # not aborting the app when setup reports a failure.
    print(f"Advertencia: setup_wav2lip.py salió con código {_setup.returncode}")

# Make the project's src/ modules importable before the local imports below.
sys.path.append(os.path.abspath("./src"))

from whisper_audio_transcriber import transcribe_audio, guardar_transcripcion
from call_openai_api import moni as rtff

# Filesystem layout shared by the whole pipeline (all paths absolute).
AUDIO_RECORD_PATH = os.path.abspath("./assets/audio/grabacion_gradio.wav")   # copy of the user's recording
VIDEO_PATH = os.path.abspath("./assets/video/data_video_sun.mp4")            # base avatar video
TRANSCRIPTION_TEXT_PATH = os.path.abspath("./results/transcripcion.txt")     # Whisper output
RESULT_AUDIO_TEMP_PATH = os.path.abspath("./results/audiov2.wav")            # raw TTS output
RESULT_AUDIO_FINAL_PATH = os.path.abspath("./assets/audio/audio.wav")        # staged TTS audio for inference
RESULT_VIDEO_PATH = os.path.abspath("./results/result_voice.mp4")            # lip-synced result
TEXT_TO_SPEECH_PATH = os.path.abspath("./src/text_to_speech.py")
RUN_INFERENCE_PATH = os.path.abspath("./src/run_inference.py")
|
def transcribir_con_progreso(audio_path, progreso=None):
    """Transcribe *audio_path* with Whisper while reporting UI progress.

    Args:
        audio_path: Path to the WAV file to transcribe.
        progreso: Optional ``gr.Progress`` tracker. Created on demand when
            omitted, so existing single-argument callers keep working.

    Returns:
        The transcription text produced by ``transcribe_audio``.
    """
    if progreso is None:
        progreso = gr.Progress()
    # gr.Progress expects a fraction in [0, 1]; the old 0/25/75/100 values
    # were percentages and fell outside the expected range.
    progreso(0.0, desc="Iniciando transcripción...")
    model_name = "openai/whisper-large"
    progreso(0.25, desc="Cargando modelo Whisper...")
    transcripcion = transcribe_audio(audio_path, model_name)
    progreso(0.75, desc="Guardando transcripción...")
    guardar_transcripcion(transcripcion, filename=TRANSCRIPTION_TEXT_PATH)
    progreso(1.0, desc="Transcripción completada.")
    return transcripcion
|
|
|
|
def generar_audio_desde_texto():
    """Run the text-to-speech script and stage its output for inference.

    Returns:
        The final audio path on success, or ``None`` when the script
        finished but produced no temporary audio file.

    Raises:
        RuntimeError: If the TTS script exits with a non-zero status.
    """
    print("Ejecutando text_to_speech...")
    proceso = subprocess.run(
        [sys.executable, TEXT_TO_SPEECH_PATH],
        capture_output=True,
        text=True,
    )
    print("stdout:", proceso.stdout)
    print("stderr:", proceso.stderr)

    if proceso.returncode != 0:
        raise RuntimeError(f"Error ejecutando text_to_speech.py: {proceso.stderr}")

    # Guard clause: nothing to stage if the script left no temp file behind.
    if not os.path.exists(RESULT_AUDIO_TEMP_PATH):
        print("Audio temporal no encontrado")
        return None

    os.makedirs(os.path.dirname(RESULT_AUDIO_FINAL_PATH), exist_ok=True)
    shutil.copy(RESULT_AUDIO_TEMP_PATH, RESULT_AUDIO_FINAL_PATH)
    print(f"Audio copiado a: {RESULT_AUDIO_FINAL_PATH}")
    return RESULT_AUDIO_FINAL_PATH
|
|
|
|
def procesar_video_audio():
    """Run Wav2Lip inference to lip-sync the staged audio onto the base video.

    Returns:
        The resulting video path, or ``None`` if inference produced no file.

    Raises:
        RuntimeError: If the inference script exits with a non-zero status.
    """
    print("Iniciando procesamiento de video...")
    print("Audio de entrada:", RESULT_AUDIO_FINAL_PATH)
    print("Video de entrada:", VIDEO_PATH)

    result = subprocess.run(
        [sys.executable, RUN_INFERENCE_PATH, "--audio", RESULT_AUDIO_FINAL_PATH, "--video", VIDEO_PATH],
        capture_output=True,
        text=True,
    )

    print("stdout:", result.stdout)
    print("stderr:", result.stderr)

    # Fail loudly on a bad exit code (consistent with generar_audio_desde_texto)
    # instead of silently reporting that no video was generated.
    if result.returncode != 0:
        raise RuntimeError(f"Error ejecutando run_inference.py: {result.stderr}")

    if os.path.exists(RESULT_VIDEO_PATH):
        print("Video generado:", RESULT_VIDEO_PATH)
        return RESULT_VIDEO_PATH
    else:
        print("No se generó el video")
        return None
|
|
|
|
def flujo_completo(audio_file_path):
    """End-to-end pipeline: record → transcribe → OpenAI → TTS → lip-sync.

    Args:
        audio_file_path: Filepath of the audio just recorded in the UI.

    Returns:
        A 5-tuple matching the Gradio outputs: (status message, recorded
        audio path, transcription text, TTS audio path, result video path).
        On any failure the error message fills the status and text slots
        and the remaining slots are ``None``.
    """
    try:
        # Stage the recording where the rest of the pipeline expects it.
        os.makedirs(os.path.dirname(AUDIO_RECORD_PATH), exist_ok=True)
        shutil.copy(audio_file_path, AUDIO_RECORD_PATH)
        print("Audio grabado copiado a:", AUDIO_RECORD_PATH)

        transcripcion = transcribir_con_progreso(AUDIO_RECORD_PATH)
        print("Texto transcrito:", transcripcion)

        respuesta_openai = rtff(TRANSCRIPTION_TEXT_PATH)
        print("Respuesta de OpenAI:", respuesta_openai)

        audio_generado = generar_audio_desde_texto()
        video_path = procesar_video_audio()

        return "Grabación recibida", AUDIO_RECORD_PATH, transcripcion, audio_generado, video_path
    except Exception as e:
        # UI boundary: report the failure in the status/text fields rather
        # than crashing the Gradio event handler.
        mensaje = str(e)
        return (
            f"Error durante el flujo completo: {mensaje}",
            None,
            f"Error: {mensaje}",
            None,
            None,
        )
|
|
|
|
def interfaz():
    """Build the Gradio Blocks UI and wire the recorder to the pipeline."""
    with gr.Blocks() as demo:
        with gr.Row():
            # Left column: looping avatar video plus microphone input.
            with gr.Column():
                gr.Video(VIDEO_PATH, loop=True, autoplay=True, height=300, width=500)
                audio_input = gr.Audio(label="Graba tu voz", type="filepath", format="wav")
                estado_grabacion = gr.Textbox(label="Estado", interactive=False)

            # Right column: every artifact the pipeline produces.
            with gr.Column():
                output_audio = gr.Audio(label="Audio grabado", interactive=False)
                output_audio_speech = gr.Audio(label="Audio TTS", interactive=False)
                video_resultado = gr.Video(label="Video procesado", interactive=False)
                texto_transcripcion = gr.Textbox(label="Texto transcrito")

        # A new recording kicks off the whole pipeline.
        audio_input.change(
            fn=flujo_completo,
            inputs=audio_input,
            outputs=[
                estado_grabacion,
                output_audio,
                texto_transcripcion,
                output_audio_speech,
                video_resultado,
            ],
        )

    return demo
|
|
|
|
if __name__ == "__main__":
    # Launch the UI only when executed as a script, not on import.
    interfaz().launch()
|
|
|
|