# app.py — Gradio demo: speech-to-text → LLM reply → TTS → FantasyTalking talking-head video.
import subprocess
import os
import sys
import shutil
from pathlib import Path
import argparse
import gradio as gr
from STT.sst import speech_to_text
from LLM.llm import generate_reply
from TTS_X.tts import generate_voice
from FantasyTalking.infer import load_models, main
# --- one-time startup setup ------------------------------------------------
# Download model weights on first run; the FantasyTalking checkpoint acts as
# the sentinel file for "models already downloaded".
if not os.path.exists("./models/fantasytalking_model.ckpt"):
    # Use the current interpreter (sys.executable) rather than whatever
    # "python" resolves to on PATH, so the download runs in the same
    # environment/venv. check=True stops us from calling load_models()
    # below on missing weights if the download fails.
    subprocess.run([sys.executable, "download_models.py"], check=True)

# Template of inference arguments. The per-request fields (image_path,
# audio_path, prompt, output_dir) are blank here and filled in by
# generate_video() for each call.
args_template = argparse.Namespace(
    fantasytalking_model_path="./models/fantasytalking_model.ckpt",
    wav2vec_model_dir="./models/wav2vec2-base-960h",
    wan_model_dir="./models/Wan2.1-I2V-14B-720P",
    image_path="",
    audio_path="",
    prompt="",
    output_dir="./output",
    image_size=512,
    audio_scale=1.0,
    prompt_cfg_scale=5.0,
    audio_cfg_scale=5.0,
    max_num_frames=81,
    inference_steps=20,
    fps=23,
    num_persistent_param_in_dit=None,
    seed=1111,
)

# Load all models once at import time so every request reuses them.
pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args_template)
print("✅")  # repaired mojibake: original bytes were UTF-8 "✅" mis-decoded
def generate_video(image_path, audio_path, prompt, output_dir="./output"):
    """Run FantasyTalking inference for one request.

    Clones the module-level ``args_template``, overrides the per-request
    fields, and delegates to ``FantasyTalking.infer.main`` with the models
    loaded at startup. Returns whatever ``main`` returns (the video path).
    """
    # Copy the template so the shared Namespace is never mutated.
    request_args = argparse.Namespace(**vars(args_template))
    request_args.image_path = image_path
    request_args.audio_path = audio_path
    request_args.prompt = prompt
    request_args.output_dir = output_dir
    return main(request_args, pipe, fantasytalking, wav2vec_processor, wav2vec)
def full_pipeline(user_audio, user_image):
    """End-to-end avatar pipeline.

    Transcribes the user's audio, generates an LLM reply, synthesizes the
    reply as speech, then renders a talking-head video of the given image
    speaking that reply.

    Returns a 4-tuple: (transcript, reply text, reply audio path, video path).
    """
    transcript = speech_to_text(user_audio)
    assistant_reply = generate_reply(transcript)
    spoken_reply_path = generate_voice(assistant_reply)

    # Make sure the inference output directory exists before rendering.
    Path("./output").mkdir(parents=True, exist_ok=True)

    rendered_video = generate_video(
        image_path=user_image,
        audio_path=spoken_reply_path,
        prompt=assistant_reply,
    )
    return transcript, assistant_reply, spoken_reply_path, rendered_video
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(" Realtime Interactive Avatar 🎭")

    with gr.Row():
        # Left column: user-supplied inputs.
        with gr.Column():
            audio_input = gr.Audio(label="Upload Voice", type="filepath")
            image_input = gr.Image(label="Upload Image", type="filepath")
            btn = gr.Button("Generate")

        # Right column: each stage's output of the pipeline.
        with gr.Column():
            user_text = gr.Textbox(label="Transcribed Text (Speech to Text)")
            reply_text = gr.Textbox(label="Assistant Response (LLM)")
            reply_audio = gr.Audio(label="Spoken Response (Text to Speech)")
            video_output = gr.Video(label="Final Generated Video")

    btn.click(
        fn=full_pipeline,
        inputs=[audio_input, image_input],
        outputs=[user_text, reply_text, reply_audio, video_output],
    )

demo.launch(inbrowser=True, share=True)