autorf-query_predictor / config.json
{
"architectures": [
"QueryPredictorForRegression"
],
"camera_motion_conditioning_drop_prob": 0.1,
"canonical_track_rate": null,
"consistency_delta_time": 0.001,
"consistency_loss_weight": 1.0,
"consistency_velocity_match_alpha": 1e-05,
"conv_block_hidden_size": 352,
"denoising_predictor_config": null,
"depth_track_multiplier": 1.0,
"drop_path_prob": 0.0,
"encoder_image_size": 512,
"freeze_image_model": true,
"freeze_text_encoder": true,
"height": 256,
"image_encoder_hidden_size": 352,
"image_model_name": "facebook/sam2.1-hiera-tiny",
"max_track_length": 50,
"model_type": "query_predictor",
"movement_inference_multiplier": 1.0,
"movement_mean": 2.5,
"movement_scalar": 1.0,
"movement_std": 12.0,
"noise_schedule_type": "cos",
"num_blocks": 4,
"out_channels": 1,
"predict_visible_ratios": false,
"predictor_hidden_size": 512,
"predictor_type": "gmlp",
"prepend_query_points": true,
"rectified_flow_ema_beta": 0.9999,
"rectified_flow_ema_update_every": 100,
"rectified_flow_use_consistency": true,
"text_conditioning_drop_prob": 0.1,
"text_encoder_max_seq_length": 64,
"text_encoder_name": "openai/clip-vit-base-patch32",
"torch_dtype": "float32",
"track_dimensionality": 2,
"track_subsample_count": 1000,
"transformers_version": "4.50.3",
"use_camera_motion_conditioning": false,
"width": 256
}
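
A minimal sketch of fetching and inspecting this config with huggingface_hub. The repo id "jxie/autorf-query_predictor" is inferred from this page and may differ; and since "query_predictor" is not a built-in transformers model type, actually loading the weights would depend on custom modeling code shipped with the repo (an assumption here), typically via trust_remote_code=True.

    # Hypothetical usage sketch, not the repo's documented API.
    # Assumes the repo id is "jxie/autorf-query_predictor" (inferred from this page).
    import json
    from huggingface_hub import hf_hub_download

    config_path = hf_hub_download(
        repo_id="jxie/autorf-query_predictor",  # assumed repo id
        filename="config.json",
    )
    with open(config_path) as f:
        cfg = json.load(f)

    # Spot-check a few fields from the file above.
    assert cfg["architectures"] == ["QueryPredictorForRegression"]
    print(cfg["image_model_name"])   # facebook/sam2.1-hiera-tiny (frozen image encoder)
    print(cfg["text_encoder_name"])  # openai/clip-vit-base-patch32 (frozen text encoder)
    print(cfg["height"], cfg["width"], cfg["track_dimensionality"])  # 256 256 2

    # Loading the model itself would rely on the repo shipping custom code
    # (an assumption; "query_predictor" is not registered in transformers):
    # from transformers import AutoModel
    # model = AutoModel.from_pretrained(
    #     "jxie/autorf-query_predictor", trust_remote_code=True
    # )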