Spaces:

AdamYao
/

Diving-into-the-Fusion-of-Monocular-Priors-for-Generalized-Stereo-Matching

Running on Zero

App Files Files Community

Diving-into-the-Fusion-of-Monocular-Priors-for-Generalized-Stereo-Matching / app.py

AdamYao

Update

a04da58 verified 5 months ago

raw

history blame contribute delete

6.28 kB

	from __future__ import print_function, division
	import spaces
	import sys
	sys.path.insert(0,'core')
	sys.path.append('core/utils')

	import os
	import argparse
	import gradio as gr
	import cv2
	from core.raft_stereo_depthbeta_refine import RAFTStereoDepthBetaRefine
	import torch
	import torch.nn as nn
	from core.utils.utils import InputPadder
	import matplotlib.pyplot as plt
	from huggingface_hub import hf_hub_download

	# Command-line options consumed by the model constructor and by predict().
	# The resulting namespace is passed as-is to RAFTStereoDepthBetaRefine(args).
	parser = argparse.ArgumentParser()
	parser.add_argument('--root', help="dataset root", default=None)
	parser.add_argument('--sv_root', help="visualization root", default=None)
	parser.add_argument('--test_exp_name', default='', help="name your experiment in testing")
	parser.add_argument('--mast3r_model_path', default='MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth', help="pretrained model path for MaSt3R")
	parser.add_argument('--depthany_model_dir', default='./dav2_models', help="directory of pretrained model path for DepthAnything")
	parser.add_argument('--restore_ckpt', help="restore checkpoint", default="./ckpts/diving_stereo.pth")
	parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision')
	# valid_iters is the value passed as `iters=` to the model in predict().
	parser.add_argument('--valid_iters', type=int, default=32, help='number of flow-field updates during forward pass')
	parser.add_argument('--eval', action='store_true', help='evaluation mode')
	parser.add_argument('--is_test', action='store_true', help='on testing')

	# Architecture choices
	parser.add_argument('--hidden_dims', nargs='+', type=int, default=[128]*3, help="hidden state and context dimensions")
	parser.add_argument('--corr_implementation', choices=["reg", "alt", "reg_cuda", "alt_cuda"], default="reg", help="correlation volume implementation")
	parser.add_argument('--shared_backbone', action='store_true', help="use a single backbone for the context and feature encoders")
	parser.add_argument('--corr_levels', type=int, default=4, help="number of levels in the correlation pyramid")
	parser.add_argument('--corr_radius', type=int, default=4, help="width of the correlation pyramid")
	parser.add_argument('--n_downsample', type=int, default=2, help="resolution of the disparity field (1/2^K)")
	parser.add_argument('--context_norm', type=str, default="batch", choices=['group', 'batch', 'instance', 'none'], help="normalization of context encoder")
	parser.add_argument('--slow_fast_gru', action='store_true', help="iterate the low-res GRUs more frequently")
	parser.add_argument('--n_gru_layers', type=int, default=3, help="number of hidden GRU levels")

	# LBP encoder, modulation and refinement options
	# NOTE: lbp_neighbor_offsets is kept as a raw string here; any parsing into
	# offset pairs happens outside this file.
	parser.add_argument('--lbp_neighbor_offsets', default='(-5,-5), (5,5), (5,-5), (-5,5), (-3,0), (3,0), (0,-3), (0,3)', help="determine the neighbors used in LBP encoder")
	parser.add_argument('--modulation_ratio', type=float, default=1., help="hyperparameters for modulation")
	parser.add_argument('--modulation_alg', choices=["linear", "sigmoid"], default="linear", help="rescale modulation")
	parser.add_argument('--conf_from_fea', action='store_true', help="confidence in refinement not only from cost volume but also from other features")
	parser.add_argument('--refine_pool', action='store_true', help="use pooling in refinement")
	parser.add_argument('--refine_unet', action='store_true', help="use EfficientUnet in refinement")

	# Visualization switches
	parser.add_argument('--improvement', action='store_true', help="visualize improvement map (error_map[i] - error_map[i-1])")
	parser.add_argument('--movement', action='store_true', help="visualize movement map (flow_pr[i] - flow_pr[i-1])")
	parser.add_argument('--acceleration', action='store_true', help="visualize acceleration map (movement_map[i] - movement_map[i-1])")
	parser.add_argument('--mask', action='store_true', help="visualize mask")
	parser.add_argument('--binary_thold', type=float, default=0.5, help="visualize binary mask")

	# Parse sys.argv, then force these two flags on regardless of what was
	# passed on the command line.
	args = parser.parse_args()
	args.conf_from_fea = True
	args.eval = True

	# Build the network from the parsed options and wrap it in DataParallel on
	# device 0. The wrapper is applied before the weights are loaded, so the
	# checkpoint keys must match the wrapped module's key names.
	model = RAFTStereoDepthBetaRefine(args)
	model = torch.nn.DataParallel(model, device_ids=[0])

	# Fetch the pretrained weights from the Hugging Face Hub.
	checkpoint_path = hf_hub_download(
	    repo_id="BFZD/Diving-into-the-Fusion-of-Monocular-Priors-for-Generalized-Stereo-Matching",
	    filename="ckpts/diving_stereo.pth",
	)

	# NOTE(review): torch.load unpickles the downloaded file. If the checkpoint
	# is a plain state dict and the installed torch supports it, consider
	# passing weights_only=True — confirm before changing.
	checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))

	# Skip every entry whose key mentions "lbp_encoder.lbp_conv"; presumably the
	# model provides those weights itself — TODO confirm.
	new_state_dict = {
	    key: value
	    for key, value in checkpoint.items()
	    if "lbp_encoder.lbp_conv" not in key
	}

	# strict=False is required because the skipped keys are reported as missing.
	# Inspect the result so that any OTHER mismatch is not silently ignored.
	load_result = model.load_state_dict(new_state_dict, strict=False)
	other_missing_keys = [
	    key for key in load_result.missing_keys
	    if "lbp_encoder.lbp_conv" not in key
	]
	if other_missing_keys or load_result.unexpected_keys:
	    print(
	        "Warning: checkpoint/model key mismatch.",
	        "missing:", other_missing_keys,
	        "unexpected:", list(load_result.unexpected_keys),
	    )

	model.cuda()
	model.eval()


	@spaces.GPU
	def predict(image1, image2):
	    """Predict disparity for a stereo pair and return it as a colour image.

	    Args:
	        image1: Left image as a NumPy array; indexed as (H, W, C) by the
	            ``permute(2, 0, 1)`` call below. Only the first three channels
	            are used.
	        image2: Right image, handled the same way as ``image1``.

	    Returns:
	        The min-max normalised absolute disparity mapped through matplotlib's
	        ``jet`` colormap, with the alpha channel dropped (last axis has
	        size 3). Assumes the unpadded disparity squeezes to 2-D — TODO confirm.

	    Raises:
	        gr.Error: If either image is missing (e.g. nothing was uploaded).
	    """
	    # Gradio passes None for an empty image component; fail with a readable
	    # message instead of an opaque TypeError from torch.from_numpy.
	    if image1 is None or image2 is None:
	        raise gr.Error("Please provide both the left and the right image.")

	    with torch.no_grad():
	        # HWC array -> 1 x 3 x H x W float tensor on the GPU.
	        image1 = torch.from_numpy(image1).permute(2, 0, 1).float()
	        image2 = torch.from_numpy(image2).permute(2, 0, 1).float()
	        image1 = image1[None][:, :3, :, :].cuda()
	        image2 = image2[None][:, :3, :, :].cuda()

	        # The padder is built from image1's shape and applied to both inputs.
	        padder = InputPadder(image1.shape, divis_by=32)
	        image1, image2 = padder.pad(image1, image2)

	        _, disp = model(image1, image2, iters=args.valid_iters, test_mode=True, vis_mode=True)
	        output = disp.abs().cpu().numpy()

	    disp = padder.unpad(output)
	    disp = disp.squeeze()

	    # Min-max normalise. A constant disparity map has zero range; dividing
	    # would produce NaNs, so leave the (all-zero) shifted map as it is.
	    shifted_disp = disp - disp.min()
	    disp_range = shifted_disp.max()
	    if disp_range > 0:
	        normalized_disp = shifted_disp / disp_range
	    else:
	        normalized_disp = shifted_disp

	    cmap = plt.get_cmap('jet')
	    colored_disp = cmap(normalized_disp)[:, :, :3]  # Get RGB channels

	    return colored_disp
	# Gradio UI: a title banner, two input images, one output image and a button
	# that runs predict() on the inputs.
	with gr.Blocks() as demo:
	gr.HTML('''
	<div align="center">
	<h1> [ICCV25 Oral] Diving into the Fusion of Monocular Priors for Generalized Stereo Matching
	<a title="Github" href="https://github.com/YaoChengTang/Diving-into-the-Fusion-of-Monocular-Priors-for-Generalized-Stereo-Matching" target="_blank" rel="noopener noreferrer" style="display: inline-block;"> <img src="https://img.shields.io/github/stars/YaoChengTang/Diving-into-the-Fusion-of-Monocular-Priors-for-Generalized-Stereo-Matching?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="GitHub Stars"> </a></h1>
	</div>
	''')

	# NOTE(review): presumably all three images belong inside this Row and the
	# button sits below it — confirm the intended nesting.
	with gr.Row():
	left_img = gr.Image(label="Left Image")
	right_img = gr.Image(label="Right Image")
	output_img = gr.Image(label="Disparity Map")

	# Clicking the button calls predict(left_img, right_img) and shows the
	# returned array in output_img.
	btn = gr.Button("Submit")
	btn.click(
	fn=predict,
	inputs=[left_img, right_img],
	outputs=output_img
	)
	# Start the web server for the demo.
	demo.launch()