import gradio as gr
from pathlib import Path
import argparse,os
from datetime import datetime
import librosa
from infer import load_models,main
import spaces

# Warm up the GPU at import time so the first request does not pay the CUDA
# initialization cost; failure here is non-fatal (CPU-only environments).
try:
    import torch
    if torch.cuda.is_available():
        _ = torch.tensor([0.0]).to('cuda')
except Exception as e:
    print(f"GPU warmup failed: {e}")

# Keep Gradio's temporary uploads inside the working directory.
os.environ["GRADIO_TEMP_DIR"] = "./tmp"

try:
    # Stand-in for the argparse.Namespace that load_models expects: it must
    # expose every attribute the loader reads, with the same defaults as the
    # CLI arguments defined in create_args below.
    class DummyArgs:
        # Core model paths
        wan_model_dir = "./models/Wan2.1-I2V-14B-720P"
        fantasytalking_model_path = "./models/fantasytalking_model.ckpt"
        wav2vec_model_dir = "./models/wav2vec2-base-960h"

        # Inference-related parameters (placeholders; real values are passed
        # per-request through create_args)
        image_path = "./assets/images/woman.png"
        audio_path = "./assets/audios/woman.wav"
        prompt = "A woman is talking."
        output_dir = "./output"
        image_size = 512
        audio_scale = 1.0
        prompt_cfg_scale = 5.0
        audio_cfg_scale = 5.0
        max_num_frames = 81
        inference_steps = 20
        fps = 23
        seed = 1111
        # Required by load_models — omitting this attribute previously raised
        # an AttributeError. Caps parameters kept resident in VRAM.
        num_persistent_param_in_dit = 7 * 10**9  # lower to reduce VRAM usage

    # Preload the models once at import time so each request reuses them.
    print("🔄 Loading models into memory...")
    args = DummyArgs()
    pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args)
    print("✅ Models loaded successfully.")
except Exception as e:
    print(f"❌ Error loading models: {e}")
    pipe = fantasytalking = wav2vec_processor = wav2vec = None
    raise e  # fail fast if model load fails


@spaces.GPU(duration=1200)
def generate_video(
    image_path,
    audio_path,
    prompt,
    prompt_cfg_scale,
    audio_cfg_scale,
    audio_weight,
    image_size,
    max_num_frames,
    inference_steps,
    seed,
):
    """Generate a talking-portrait video from an image and a driving audio clip.

    Builds an argparse.Namespace via create_args and runs inference with the
    models preloaded at module import time. Returns the path of the saved
    video, or None if generation fails.
    """
    try:
        output_dir = Path("./output")
        output_dir.mkdir(parents=True, exist_ok=True)
        # Normalize user-supplied paths to absolute POSIX form for the pipeline.
        image_path = Path(image_path).absolute().as_posix()
        audio_path = Path(audio_path).absolute().as_posix()
        args = create_args(
            image_path=image_path,
            audio_path=audio_path,
            prompt=prompt or "A person is talking.",
            output_dir=str(output_dir),
            audio_weight=audio_weight,
            prompt_cfg_scale=prompt_cfg_scale,
            audio_cfg_scale=audio_cfg_scale,
            image_size=image_size,
            max_num_frames=max_num_frames,
            inference_steps=inference_steps,
            seed=seed,
        )

        # Run inference with the models preloaded at import time.
        save_path = main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)
        print(f"✅ Video saved at {save_path}")
        return save_path
    except Exception as e:
        # Swallow errors and return None so the Gradio UI shows an empty
        # result instead of crashing the app.
        print(f"❌ Error generating video: {e}")
        return None


def create_args(
    image_path: str,
    audio_path: str,
    prompt: str,
    output_dir: str,
    audio_weight: float,
    prompt_cfg_scale: float,
    audio_cfg_scale: float,
    image_size: int,
    max_num_frames: int,
    inference_steps: int,
    seed: int,
) -> argparse.Namespace:
    """Build the argparse.Namespace consumed by infer.main.

    Reuses the CLI parser so the per-request values are merged with the same
    defaults (model paths, fps, num_persistent_param_in_dit, ...) that the
    command-line entry point uses. audio_weight is forwarded as the
    --audio_scale argument.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--wan_model_dir",
        type=str,
        default="./models/Wan2.1-I2V-14B-720P",
        required=False,
        help="The dir of the Wan I2V 14B model.",
    )
    parser.add_argument(
        "--fantasytalking_model_path",
        type=str,
        default="./models/fantasytalking_model.ckpt",
        required=False,
        help="The .ckpt path of fantasytalking model.",
    )
    parser.add_argument(
        "--wav2vec_model_dir",
        type=str,
        default="./models/wav2vec2-base-960h",
        required=False,
        help="The dir of wav2vec model.",
    )
    parser.add_argument(
        "--image_path",
        type=str,
        default="./assets/images/woman.png",
        required=False,
        help="The path of the image.",
    )
    parser.add_argument(
        "--audio_path",
        type=str,
        default="./assets/audios/woman.wav",
        required=False,
        help="The path of the audio.",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="A woman is talking.",
        required=False,
        help="prompt.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="./output",
        help="Dir to save the video.",
    )
    parser.add_argument(
        "--image_size",
        type=int,
        default=512,
        help="The image will be resized proportionally to this size.",
    )
    # NOTE(review): help text says "Image width." but this flag is the audio
    # conditioning scale — likely a copy-paste error in the help string.
    parser.add_argument(
        "--audio_scale",
        type=float,
        default=1.0,
        help="Image width.",
    )
    parser.add_argument(
        "--prompt_cfg_scale",
        type=float,
        default=5.0,
        required=False,
        help="prompt cfg scale",
    )
    parser.add_argument(
        "--audio_cfg_scale",
        type=float,
        default=5.0,
        required=False,
        help="audio cfg scale",
    )
    parser.add_argument(
        "--max_num_frames",
        type=int,
        default=81,
        required=False,
        help="The maximum frames for generating videos, the audio part exceeding max_num_frames/fps will be truncated.",
    )
    parser.add_argument(
        "--inference_steps",
        type=int,
        default=20,
        required=False,
    )
    parser.add_argument(
        "--fps",
        type=int,
        default=23,
        required=False,
    )
    parser.add_argument(
        "--num_persistent_param_in_dit",
        type=int,
        default=7*10**9,
        required=False,
        help="Maximum parameter quantity retained in video memory, small number to reduce VRAM required"
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=1111,
        required=False,
    )
    # Parse a synthetic argv built from the function parameters; flags not
    # listed here (model dirs, fps, num_persistent_param_in_dit) keep their
    # defaults. Note audio_weight is passed as --audio_scale.
    args = parser.parse_args(
        [
            "--image_path",
            image_path,
            "--audio_path",
            audio_path,
            "--prompt",
            prompt,
            "--output_dir",
            output_dir,
            "--image_size",
            str(image_size),
            "--audio_scale",
            str(audio_weight),
            "--prompt_cfg_scale",
            str(prompt_cfg_scale),
            "--audio_cfg_scale",
            str(audio_cfg_scale),
            "--max_num_frames",
            str(max_num_frames),
            "--inference_steps",
            str(inference_steps),
            "--seed",
            str(seed),
        ]
    )
    print(args)
    return args


# Create the Gradio interface (definition continues beyond this chunk).
with gr.Blocks(title="FantasyTalking Video Generation") as demo:
    gr.Markdown(
        """
    # FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis