| {"train_file": "", "validation_file": "", "test_file": "", "num_examples": -1, "text_encoder_name": "./mt5-large", "scheduler_name": "stabilityai/stable-diffusion-2-1", "unet_model_name": "./saved/unet/pretrain_ft", "unet_model_config": null, "hf_model": null, "snr_gamma": 5.0, "freeze_text_encoder": true, "text_column": "caption", "audio_column": "audio_location", "video_column": "frame_pt", "feature_column": "cavp_feature_location", "augment": false, "uncondition": true, "prefix": null, "per_device_train_batch_size": 20, "per_device_eval_batch_size": 20, "learning_rate": 3e-05, "weight_decay": 1e-08, "num_train_epochs": 40, "max_train_steps": null, "gradient_accumulation_steps": 2, "lr_scheduler_type": "linear", "num_warmup_steps": 0, "adam_beta1": 0.9, "adam_beta2": 0.999, "adam_weight_decay": 0.01, "adam_epsilon": 1e-08, "output_dir": "saved/STA-V2A", "seed": null, "checkpointing_steps": "best", "save_every": 1, "resume_from_checkpoint": null, "resume_diff_from_checkpoint": null, "vae_model": "audioldm-s-full", "sample_rate": 16000, "with_tracking": false, "var_len": false, "report_to": "all", "video_fps": 40, "fraze_unet": false, "predict_onset_model": null, "has_global_video_feature": true, "use_feature_window": true, "guidance_free_rate": 0.2, "Onset_weight": 1.0} | |