# app.py — Gradio demo: speech-to-text → LLM reply → TTS → FantasyTalking talking-head video.
import subprocess
import os
import sys
import shutil
from pathlib import Path
import argparse
import gradio as gr
from STT.sst import speech_to_text
from LLM.llm import generate_reply
from TTS_X.tts import generate_voice
from FantasyTalking.infer import load_models, main
# --- one-time startup setup ------------------------------------------------
# Download model weights on first run; the FantasyTalking checkpoint acts as
# the sentinel file for "models already downloaded".
if not os.path.exists("./models/fantasytalking_model.ckpt"):
    # Use the current interpreter (sys.executable) rather than whatever
    # "python" resolves to on PATH, so the download runs in the same
    # environment/venv. check=True stops us from calling load_models()
    # below on missing weights if the download fails.
    subprocess.run([sys.executable, "download_models.py"], check=True)

# Template of inference arguments. The per-request fields (image_path,
# audio_path, prompt, output_dir) are blank here and filled in by
# generate_video() for each call.
args_template = argparse.Namespace(
    fantasytalking_model_path="./models/fantasytalking_model.ckpt",
    wav2vec_model_dir="./models/wav2vec2-base-960h",
    wan_model_dir="./models/Wan2.1-I2V-14B-720P",
    image_path="",
    audio_path="",
    prompt="",
    output_dir="./output",
    image_size=512,
    audio_scale=1.0,
    prompt_cfg_scale=5.0,
    audio_cfg_scale=5.0,
    max_num_frames=81,
    inference_steps=20,
    fps=23,
    num_persistent_param_in_dit=None,
    seed=1111,
)

# Load all models once at import time so every request reuses them.
pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args_template)
print("✅")  # repaired mojibake: original bytes were UTF-8 "✅" mis-decoded
def generate_video(image_path, audio_path, prompt, output_dir="./output"):
    """Run FantasyTalking inference for one request.

    Clones the module-level ``args_template``, overrides the per-request
    fields, and delegates to ``FantasyTalking.infer.main`` with the models
    loaded at startup. Returns whatever ``main`` returns (the video path).
    """
    # Copy the template so the shared Namespace is never mutated.
    request_args = argparse.Namespace(**vars(args_template))
    request_args.image_path = image_path
    request_args.audio_path = audio_path
    request_args.prompt = prompt
    request_args.output_dir = output_dir
    return main(request_args, pipe, fantasytalking, wav2vec_processor, wav2vec)
def full_pipeline(user_audio, user_image):
    """End-to-end avatar pipeline.

    Transcribes the user's audio, generates an LLM reply, synthesizes the
    reply as speech, then renders a talking-head video of the given image
    speaking that reply.

    Returns a 4-tuple: (transcript, reply text, reply audio path, video path).
    """
    transcript = speech_to_text(user_audio)
    assistant_reply = generate_reply(transcript)
    spoken_reply_path = generate_voice(assistant_reply)

    # Make sure the inference output directory exists before rendering.
    Path("./output").mkdir(parents=True, exist_ok=True)

    rendered_video = generate_video(
        image_path=user_image,
        audio_path=spoken_reply_path,
        prompt=assistant_reply,
    )
    return transcript, assistant_reply, spoken_reply_path, rendered_video
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(" Realtime Interactive Avatar 🎭")

    with gr.Row():
        # Left column: user-supplied inputs.
        with gr.Column():
            audio_input = gr.Audio(label="Upload Voice", type="filepath")
            image_input = gr.Image(label="Upload Image", type="filepath")
            btn = gr.Button("Generate")

        # Right column: each stage's output of the pipeline.
        with gr.Column():
            user_text = gr.Textbox(label="Transcribed Text (Speech to Text)")
            reply_text = gr.Textbox(label="Assistant Response (LLM)")
            reply_audio = gr.Audio(label="Spoken Response (Text to Speech)")
            video_output = gr.Video(label="Final Generated Video")

    btn.click(
        fn=full_pipeline,
        inputs=[audio_input, image_input],
        outputs=[user_text, reply_text, reply_audio, video_output],
    )

demo.launch(inbrowser=True, share=True)